From 1538e33995eaf3f315cbb5506019b9f913ed8555 Mon Sep 17 00:00:00 2001 From: David Vernet Date: Tue, 18 Jun 2024 10:09:18 -1000 Subject: sched_ext: Print sched_ext info when dumping stack It would be useful to see what the sched_ext scheduler state is, and what scheduler is running, when we're dumping a task's stack. This patch therefore adds a new print_scx_info() function that's called in the same context as print_worker_info() and print_stop_info(). An example dump follows. BUG: kernel NULL pointer dereference, address: 0000000000000999 #PF: supervisor write access in kernel mode #PF: error_code(0x0002) - not-present page PGD 0 P4D 0 Oops: 0002 [#1] PREEMPT SMP CPU: 13 PID: 2047 Comm: insmod Tainted: G O 6.6.0-work-10323-gb58d4cae8e99-dirty #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS unknown 2/2/2022 Sched_ext: qmap (enabled+all), task: runnable_at=-17ms RIP: 0010:init_module+0x9/0x1000 [test_module] ... v3: - scx_ops_enable_state_str[] definition moved to an earlier patch as it's now used by core implementation. - Convert jiffy delta to msecs using jiffies_to_msecs() instead of multiplying by (HZ / MSEC_PER_SEC). The conversion is implemented in jiffies_delta_msecs(). v2: - We are now using scx_ops_enable_state_str[] outside CONFIG_SCHED_DEBUG. Move it outside of CONFIG_SCHED_DEBUG and to the top. This was reported by Changwoo and Andrea. Signed-off-by: David Vernet Reported-by: Changwoo Min Reported-by: Andrea Righi Signed-off-by: Tejun Heo --- lib/dump_stack.c | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/dump_stack.c b/lib/dump_stack.c index 222c6d6c8281..9581ef4efec5 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -68,6 +68,7 @@ void dump_stack_print_info(const char *log_lvl) print_worker_info(log_lvl, current); print_stop_info(log_lvl, current); + print_scx_info(log_lvl, current); } /** -- cgit v1.2.3 From 5ac79730324c6f37106ce397586020ffe6e8e234 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 18 Jul 2024 14:05:04 -0700 Subject: platform: Add test managed platform_device/driver APIs Introduce KUnit resource wrappers around platform_driver_register(), platform_device_alloc(), and platform_device_add() so that test authors can register platform drivers/devices from their tests and have the drivers/devices automatically be unregistered when the test is done. This makes test setup code simpler when a platform driver or platform device is needed. Add a few test cases at the same time to make sure the APIs work as intended. Cc: Brendan Higgins Reviewed-by: David Gow Cc: Rae Moar Reviewed-by: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Signed-off-by: Stephen Boyd Link: https://lore.kernel.org/r/20240718210513.3801024-6-sboyd@kernel.org --- Documentation/dev-tools/kunit/api/index.rst | 5 + .../dev-tools/kunit/api/platformdevice.rst | 10 + drivers/base/dd.c | 1 + include/kunit/platform_device.h | 20 ++ lib/kunit/Makefile | 4 +- lib/kunit/platform-test.c | 224 +++++++++++++++ lib/kunit/platform.c | 302 +++++++++++++++++++++ 7 files changed, 565 insertions(+), 1 deletion(-) create mode 100644 Documentation/dev-tools/kunit/api/platformdevice.rst create mode 100644 include/kunit/platform_device.h create mode 100644 lib/kunit/platform-test.c create mode 100644 lib/kunit/platform.c (limited to 'lib') diff --git a/Documentation/dev-tools/kunit/api/index.rst b/Documentation/dev-tools/kunit/api/index.rst index 282befa17edf..02b26f5e8750 100644 --- a/Documentation/dev-tools/kunit/api/index.rst +++ b/Documentation/dev-tools/kunit/api/index.rst @@ -10,6 +10,7 @@ API Reference resource functionredirection of + platformdevice This page documents the KUnit kernel testing API. It is divided into the @@ -36,3 +37,7 @@ Driver KUnit API Documentation/dev-tools/kunit/api/of.rst - Documents the KUnit device tree (OF) API + +Documentation/dev-tools/kunit/api/platformdevice.rst + + - Documents the KUnit platform device API diff --git a/Documentation/dev-tools/kunit/api/platformdevice.rst b/Documentation/dev-tools/kunit/api/platformdevice.rst new file mode 100644 index 000000000000..49ddd5729003 --- /dev/null +++ b/Documentation/dev-tools/kunit/api/platformdevice.rst @@ -0,0 +1,10 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=================== +Platform Device API +=================== + +The KUnit platform device API is used to test platform devices. + +.. kernel-doc:: lib/kunit/platform.c + :export: diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 9b745ba54de1..964111361497 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -393,6 +393,7 @@ bool device_is_bound(struct device *dev) { return dev->p && klist_node_attached(&dev->p->knode_driver); } +EXPORT_SYMBOL_GPL(device_is_bound); static void driver_bound(struct device *dev) { diff --git a/include/kunit/platform_device.h b/include/kunit/platform_device.h new file mode 100644 index 000000000000..0fc0999d2420 --- /dev/null +++ b/include/kunit/platform_device.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _KUNIT_PLATFORM_DRIVER_H +#define _KUNIT_PLATFORM_DRIVER_H + +struct kunit; +struct platform_device; +struct platform_driver; + +struct platform_device * +kunit_platform_device_alloc(struct kunit *test, const char *name, int id); +int kunit_platform_device_add(struct kunit *test, struct platform_device *pdev); + +int kunit_platform_device_prepare_wait_for_probe(struct kunit *test, + struct platform_device *pdev, + struct completion *x); + +int kunit_platform_driver_register(struct kunit *test, + struct platform_driver *drv); + +#endif diff --git a/lib/kunit/Makefile b/lib/kunit/Makefile index 30f6bbf04a4a..5aa51978e456 100644 --- a/lib/kunit/Makefile +++ b/lib/kunit/Makefile @@ -9,7 +9,8 @@ kunit-objs += test.o \ try-catch.o \ executor.o \ attributes.o \ - device.o + device.o \ + platform.o ifeq ($(CONFIG_KUNIT_DEBUGFS),y) kunit-objs += debugfs.o @@ -19,6 +20,7 @@ endif obj-y += hooks.o obj-$(CONFIG_KUNIT_TEST) += kunit-test.o +obj-$(CONFIG_KUNIT_TEST) += platform-test.o # string-stream-test compiles built-in only. ifeq ($(CONFIG_KUNIT_TEST),y) diff --git a/lib/kunit/platform-test.c b/lib/kunit/platform-test.c new file mode 100644 index 000000000000..e3debb8fbcef --- /dev/null +++ b/lib/kunit/platform-test.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KUnit test for KUnit platform driver infrastructure. + */ + +#include + +#include +#include + +/* + * Test that kunit_platform_device_alloc() creates a platform device. + */ +static void kunit_platform_device_alloc_test(struct kunit *test) +{ + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, + kunit_platform_device_alloc(test, "kunit-platform", 1)); +} + +/* + * Test that kunit_platform_device_add() registers a platform device on the + * platform bus with the proper name and id. + */ +static void kunit_platform_device_add_test(struct kunit *test) +{ + struct platform_device *pdev; + const char *name = "kunit-platform-add"; + const int id = -1; + + pdev = kunit_platform_device_alloc(test, name, id); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pdev); + + KUNIT_EXPECT_EQ(test, 0, kunit_platform_device_add(test, pdev)); + KUNIT_EXPECT_TRUE(test, dev_is_platform(&pdev->dev)); + KUNIT_EXPECT_STREQ(test, pdev->name, name); + KUNIT_EXPECT_EQ(test, pdev->id, id); +} + +/* + * Test that kunit_platform_device_add() called twice with the same device name + * and id fails the second time and properly cleans up. + */ +static void kunit_platform_device_add_twice_fails_test(struct kunit *test) +{ + struct platform_device *pdev; + const char *name = "kunit-platform-add-2"; + const int id = -1; + + pdev = kunit_platform_device_alloc(test, name, id); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pdev); + KUNIT_ASSERT_EQ(test, 0, kunit_platform_device_add(test, pdev)); + + pdev = kunit_platform_device_alloc(test, name, id); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pdev); + + KUNIT_EXPECT_NE(test, 0, kunit_platform_device_add(test, pdev)); +} + +static int kunit_platform_device_find_by_name(struct device *dev, const void *data) +{ + return strcmp(dev_name(dev), data) == 0; +} + +/* + * Test that kunit_platform_device_add() cleans up by removing the platform + * device when the test finishes. */ +static void kunit_platform_device_add_cleans_up(struct kunit *test) +{ + struct platform_device *pdev; + const char *name = "kunit-platform-clean"; + const int id = -1; + struct kunit fake; + struct device *dev; + + kunit_init_test(&fake, "kunit_platform_device_add_fake_test", NULL); + KUNIT_ASSERT_EQ(test, fake.status, KUNIT_SUCCESS); + + pdev = kunit_platform_device_alloc(&fake, name, id); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pdev); + KUNIT_ASSERT_EQ(test, 0, kunit_platform_device_add(&fake, pdev)); + dev = bus_find_device(&platform_bus_type, NULL, name, + kunit_platform_device_find_by_name); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, dev); + put_device(dev); + + /* Remove pdev */ + kunit_cleanup(&fake); + + /* + * Failing to migrate the kunit_resource would lead to an extra + * put_device() call on the platform device. The best we can do here is + * make sure the device no longer exists on the bus, but if something + * is wrong we'll see a refcount underflow here. We can't test for a + * refcount underflow because the kref matches the lifetime of the + * device which should already be freed and could be used by something + * else. + */ + dev = bus_find_device(&platform_bus_type, NULL, name, + kunit_platform_device_find_by_name); + KUNIT_EXPECT_PTR_EQ(test, NULL, dev); + put_device(dev); +} + +/* + * Test suite for struct platform_device kunit APIs + */ +static struct kunit_case kunit_platform_device_test_cases[] = { + KUNIT_CASE(kunit_platform_device_alloc_test), + KUNIT_CASE(kunit_platform_device_add_test), + KUNIT_CASE(kunit_platform_device_add_twice_fails_test), + KUNIT_CASE(kunit_platform_device_add_cleans_up), + {} +}; + +static struct kunit_suite kunit_platform_device_suite = { + .name = "kunit_platform_device", + .test_cases = kunit_platform_device_test_cases, +}; + +struct kunit_platform_driver_test_context { + struct platform_driver pdrv; + const char *data; +}; + +static const char * const test_data = "test data"; + +static inline struct kunit_platform_driver_test_context * +to_test_context(struct platform_device *pdev) +{ + return container_of(to_platform_driver(pdev->dev.driver), + struct kunit_platform_driver_test_context, + pdrv); +} + +static int kunit_platform_driver_probe(struct platform_device *pdev) +{ + struct kunit_platform_driver_test_context *ctx; + + ctx = to_test_context(pdev); + ctx->data = test_data; + + return 0; +} + +/* Test that kunit_platform_driver_register() registers a driver that probes. */ +static void kunit_platform_driver_register_test(struct kunit *test) +{ + struct platform_device *pdev; + struct kunit_platform_driver_test_context *ctx; + DECLARE_COMPLETION_ONSTACK(comp); + const char *name = "kunit-platform-register"; + + ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + + pdev = kunit_platform_device_alloc(test, name, -1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pdev); + KUNIT_ASSERT_EQ(test, 0, kunit_platform_device_add(test, pdev)); + + ctx->pdrv.probe = kunit_platform_driver_probe; + ctx->pdrv.driver.name = name; + ctx->pdrv.driver.owner = THIS_MODULE; + + KUNIT_ASSERT_EQ(test, 0, kunit_platform_device_prepare_wait_for_probe(test, pdev, &comp)); + + KUNIT_EXPECT_EQ(test, 0, kunit_platform_driver_register(test, &ctx->pdrv)); + KUNIT_EXPECT_NE(test, 0, wait_for_completion_timeout(&comp, 3 * HZ)); + KUNIT_EXPECT_STREQ(test, ctx->data, test_data); +} + +/* + * Test that kunit_platform_device_prepare_wait_for_probe() completes the completion + * when the device is already probed. + */ +static void kunit_platform_device_prepare_wait_for_probe_completes_when_already_probed(struct kunit *test) +{ + struct platform_device *pdev; + struct kunit_platform_driver_test_context *ctx; + DECLARE_COMPLETION_ONSTACK(comp); + const char *name = "kunit-platform-wait"; + + ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + + pdev = kunit_platform_device_alloc(test, name, -1); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pdev); + KUNIT_ASSERT_EQ(test, 0, kunit_platform_device_add(test, pdev)); + + ctx->pdrv.probe = kunit_platform_driver_probe; + ctx->pdrv.driver.name = name; + ctx->pdrv.driver.owner = THIS_MODULE; + + /* Make sure driver has actually probed */ + KUNIT_ASSERT_EQ(test, 0, kunit_platform_device_prepare_wait_for_probe(test, pdev, &comp)); + KUNIT_ASSERT_EQ(test, 0, kunit_platform_driver_register(test, &ctx->pdrv)); + KUNIT_ASSERT_NE(test, 0, wait_for_completion_timeout(&comp, 3 * HZ)); + + reinit_completion(&comp); + KUNIT_ASSERT_EQ(test, 0, kunit_platform_device_prepare_wait_for_probe(test, pdev, &comp)); + + KUNIT_EXPECT_NE(test, 0, wait_for_completion_timeout(&comp, HZ)); +} + +static struct kunit_case kunit_platform_driver_test_cases[] = { + KUNIT_CASE(kunit_platform_driver_register_test), + KUNIT_CASE(kunit_platform_device_prepare_wait_for_probe_completes_when_already_probed), + {} +}; + +/* + * Test suite for struct platform_driver kunit APIs + */ +static struct kunit_suite kunit_platform_driver_suite = { + .name = "kunit_platform_driver", + .test_cases = kunit_platform_driver_test_cases, +}; + +kunit_test_suites( + &kunit_platform_device_suite, + &kunit_platform_driver_suite, +); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("KUnit test for KUnit platform driver infrastructure"); diff --git a/lib/kunit/platform.c b/lib/kunit/platform.c new file mode 100644 index 000000000000..0b518de26065 --- /dev/null +++ b/lib/kunit/platform.c @@ -0,0 +1,302 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test managed platform driver + */ + +#include +#include +#include +#include + +#include +#include + +struct kunit_platform_device_alloc_params { + const char *name; + int id; +}; + +static int kunit_platform_device_alloc_init(struct kunit_resource *res, void *context) +{ + struct kunit_platform_device_alloc_params *params = context; + struct platform_device *pdev; + + pdev = platform_device_alloc(params->name, params->id); + if (!pdev) + return -ENOMEM; + + res->data = pdev; + + return 0; +} + +static void kunit_platform_device_alloc_exit(struct kunit_resource *res) +{ + struct platform_device *pdev = res->data; + + platform_device_put(pdev); +} + +/** + * kunit_platform_device_alloc() - Allocate a KUnit test managed platform device + * @test: test context + * @name: device name of platform device to alloc + * @id: identifier of platform device to alloc. + * + * Allocate a test managed platform device. The device is put when the test completes. + * + * Return: Allocated platform device on success, NULL on failure. + */ +struct platform_device * +kunit_platform_device_alloc(struct kunit *test, const char *name, int id) +{ + struct kunit_platform_device_alloc_params params = { + .name = name, + .id = id, + }; + + return kunit_alloc_resource(test, + kunit_platform_device_alloc_init, + kunit_platform_device_alloc_exit, + GFP_KERNEL, ¶ms); +} +EXPORT_SYMBOL_GPL(kunit_platform_device_alloc); + +static void kunit_platform_device_add_exit(struct kunit_resource *res) +{ + struct platform_device *pdev = res->data; + + platform_device_unregister(pdev); +} + +static bool +kunit_platform_device_alloc_match(struct kunit *test, + struct kunit_resource *res, void *match_data) +{ + struct platform_device *pdev = match_data; + + return res->data == pdev && res->free == kunit_platform_device_alloc_exit; +} + +KUNIT_DEFINE_ACTION_WRAPPER(platform_device_unregister_wrapper, + platform_device_unregister, struct platform_device *); +/** + * kunit_platform_device_add() - Register a KUnit test managed platform device + * @test: test context + * @pdev: platform device to add + * + * Register a test managed platform device. The device is unregistered when the + * test completes. + * + * Return: 0 on success, negative errno on failure. + */ +int kunit_platform_device_add(struct kunit *test, struct platform_device *pdev) +{ + struct kunit_resource *res; + int ret; + + ret = platform_device_add(pdev); + if (ret) + return ret; + + res = kunit_find_resource(test, kunit_platform_device_alloc_match, pdev); + if (res) { + /* + * Transfer the reference count of the platform device if it + * was allocated with kunit_platform_device_alloc(). In this + * case, calling platform_device_put() when the test exits from + * kunit_platform_device_alloc_exit() would lead to reference + * count underflow because platform_device_unregister_wrapper() + * calls platform_device_unregister() which also calls + * platform_device_put(). + * + * Usually callers transfer the refcount initialized in + * platform_device_alloc() to platform_device_add() by calling + * platform_device_unregister() when platform_device_add() + * succeeds or platform_device_put() when it fails. KUnit has to + * keep this straight by redirecting the free routine for the + * resource to the right function. Luckily this only has to + * account for the success scenario. + */ + res->free = kunit_platform_device_add_exit; + kunit_put_resource(res); + } else { + ret = kunit_add_action_or_reset(test, platform_device_unregister_wrapper, pdev); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(kunit_platform_device_add); + +struct kunit_platform_device_probe_nb { + struct completion *x; + struct device *dev; + struct notifier_block nb; +}; + +static int kunit_platform_device_probe_notify(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct kunit_platform_device_probe_nb *knb; + struct device *dev = data; + + knb = container_of(nb, struct kunit_platform_device_probe_nb, nb); + if (event != BUS_NOTIFY_BOUND_DRIVER || knb->dev != dev) + return NOTIFY_DONE; + + complete(knb->x); + + return NOTIFY_OK; +} + +static void kunit_platform_device_probe_nb_remove(void *nb) +{ + bus_unregister_notifier(&platform_bus_type, nb); +} + +/** + * kunit_platform_device_prepare_wait_for_probe() - Prepare a completion + * variable to wait for a platform device to probe + * @test: test context + * @pdev: platform device to prepare to wait for probe of + * @x: completion variable completed when @dev has probed + * + * Prepare a completion variable @x to wait for @pdev to probe. Waiting on the + * completion forces a preemption, allowing the platform driver to probe. + * + * Example + * + * .. code-block:: c + * + * static int kunit_platform_driver_probe(struct platform_device *pdev) + * { + * return 0; + * } + * + * static void kunit_platform_driver_test(struct kunit *test) + * { + * struct platform_device *pdev; + * struct platform_driver *pdrv; + * DECLARE_COMPLETION_ONSTACK(comp); + * + * pdev = kunit_platform_device_alloc(test, "kunit-platform", -1); + * KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pdev); + * KUNIT_ASSERT_EQ(test, 0, kunit_platform_device_add(test, pdev)); + * + * pdrv = kunit_kzalloc(test, sizeof(*pdrv), GFP_KERNEL); + * KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pdrv); + * + * pdrv->probe = kunit_platform_driver_probe; + * pdrv->driver.name = "kunit-platform"; + * pdrv->driver.owner = THIS_MODULE; + * + * KUNIT_ASSERT_EQ(test, 0, kunit_platform_device_prepare_wait_for_probe(test, pdev, &comp)); + * KUNIT_ASSERT_EQ(test, 0, kunit_platform_driver_register(test, pdrv)); + * + * KUNIT_EXPECT_NE(test, 0, wait_for_completion_timeout(&comp, 3 * HZ)); + * } + * + * Return: 0 on success, negative errno on failure. + */ +int kunit_platform_device_prepare_wait_for_probe(struct kunit *test, + struct platform_device *pdev, + struct completion *x) +{ + struct device *dev = &pdev->dev; + struct kunit_platform_device_probe_nb *knb; + bool bound; + + knb = kunit_kzalloc(test, sizeof(*knb), GFP_KERNEL); + if (!knb) + return -ENOMEM; + + knb->nb.notifier_call = kunit_platform_device_probe_notify; + knb->dev = dev; + knb->x = x; + + device_lock(dev); + bound = device_is_bound(dev); + if (bound) { + device_unlock(dev); + complete(x); + kunit_kfree(test, knb); + return 0; + } + + bus_register_notifier(&platform_bus_type, &knb->nb); + device_unlock(&pdev->dev); + + return kunit_add_action_or_reset(test, kunit_platform_device_probe_nb_remove, &knb->nb); +} +EXPORT_SYMBOL_GPL(kunit_platform_device_prepare_wait_for_probe); + +KUNIT_DEFINE_ACTION_WRAPPER(platform_driver_unregister_wrapper, + platform_driver_unregister, struct platform_driver *); +/** + * kunit_platform_driver_register() - Register a KUnit test managed platform driver + * @test: test context + * @drv: platform driver to register + * + * Register a test managed platform driver. This allows callers to embed the + * @drv in a container structure and use container_of() in the probe function + * to pass information to KUnit tests. + * + * Example + * + * .. code-block:: c + * + * struct kunit_test_context { + * struct platform_driver pdrv; + * const char *data; + * }; + * + * static inline struct kunit_test_context * + * to_test_context(struct platform_device *pdev) + * { + * return container_of(to_platform_driver(pdev->dev.driver), + * struct kunit_test_context, + * pdrv); + * } + * + * static int kunit_platform_driver_probe(struct platform_device *pdev) + * { + * struct kunit_test_context *ctx; + * + * ctx = to_test_context(pdev); + * ctx->data = "test data"; + * + * return 0; + * } + * + * static void kunit_platform_driver_test(struct kunit *test) + * { + * struct kunit_test_context *ctx; + * + * ctx = kunit_kzalloc(test, sizeof(*ctx), GFP_KERNEL); + * KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ctx); + * + * ctx->pdrv.probe = kunit_platform_driver_probe; + * ctx->pdrv.driver.name = "kunit-platform"; + * ctx->pdrv.driver.owner = THIS_MODULE; + * + * KUNIT_EXPECT_EQ(test, 0, kunit_platform_driver_register(test, &ctx->pdrv)); + * <... wait for driver to probe ...> + * KUNIT_EXPECT_STREQ(test, ctx->data, "test data"); + * } + * + * Return: 0 on success, negative errno on failure. + */ +int kunit_platform_driver_register(struct kunit *test, + struct platform_driver *drv) +{ + int ret; + + ret = platform_driver_register(drv); + if (ret) + return ret; + + return kunit_add_action_or_reset(test, platform_driver_unregister_wrapper, drv); +} +EXPORT_SYMBOL_GPL(kunit_platform_driver_register); -- cgit v1.2.3 From 93c8332c8373fee415bd79f08d5ba4ba7ca5ad15 Mon Sep 17 00:00:00 2001 From: Xavier Date: Thu, 4 Jul 2024 14:24:43 +0800 Subject: Union-Find: add a new module in kernel library This patch implements a union-find data structure in the kernel library, which includes operations for allocating nodes, freeing nodes, finding the root of a node, and merging two nodes. Signed-off-by: Xavier Signed-off-by: Tejun Heo --- Documentation/core-api/union_find.rst | 102 +++++++++++++++++++++ .../translations/zh_CN/core-api/union_find.rst | 87 ++++++++++++++++++ MAINTAINERS | 9 ++ include/linux/union_find.h | 41 +++++++++ lib/Makefile | 2 +- lib/union_find.c | 49 ++++++++++ 6 files changed, 289 insertions(+), 1 deletion(-) create mode 100644 Documentation/core-api/union_find.rst create mode 100644 Documentation/translations/zh_CN/core-api/union_find.rst create mode 100644 include/linux/union_find.h create mode 100644 lib/union_find.c (limited to 'lib') diff --git a/Documentation/core-api/union_find.rst b/Documentation/core-api/union_find.rst new file mode 100644 index 000000000000..2bf0290c9184 --- /dev/null +++ b/Documentation/core-api/union_find.rst @@ -0,0 +1,102 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================== +Union-Find in Linux +==================== + + +:Date: June 21, 2024 +:Author: Xavier + +What is union-find, and what is it used for? +------------------------------------------------ + +Union-find is a data structure used to handle the merging and querying +of disjoint sets. The primary operations supported by union-find are: + + Initialization: Resetting each element as an individual set, with + each set's initial parent node pointing to itself. + Find: Determine which set a particular element belongs to, usually by + returning a “representative element” of that set. This operation + is used to check if two elements are in the same set. + Union: Merge two sets into one. + +As a data structure used to maintain sets (groups), union-find is commonly +utilized to solve problems related to offline queries, dynamic connectivity, +and graph theory. It is also a key component in Kruskal's algorithm for +computing the minimum spanning tree, which is crucial in scenarios like +network routing. Consequently, union-find is widely referenced. Additionally, +union-find has applications in symbolic computation, register allocation, +and more. + +Space Complexity: O(n), where n is the number of nodes. + +Time Complexity: Using path compression can reduce the time complexity of +the find operation, and using union by rank can reduce the time complexity +of the union operation. These optimizations reduce the average time +complexity of each find and union operation to O(α(n)), where α(n) is the +inverse Ackermann function. This can be roughly considered a constant time +complexity for practical purposes. + +This document covers use of the Linux union-find implementation. For more +information on the nature and implementation of union-find, see: + + Wikipedia entry on union-find + https://en.wikipedia.org/wiki/Disjoint-set_data_structure + +Linux implementation of union-find +----------------------------------- + +Linux's union-find implementation resides in the file "lib/union_find.c". +To use it, "#include ". + +The union-find data structure is defined as follows:: + + struct uf_node { + struct uf_node *parent; + unsigned int rank; + }; + +In this structure, parent points to the parent node of the current node. +The rank field represents the height of the current tree. During a union +operation, the tree with the smaller rank is attached under the tree with the +larger rank to maintain balance. + +Initializing union-find +-------------------- + +You can complete the initialization using either static or initialization +interface. Initialize the parent pointer to point to itself and set the rank +to 0. +Example:: + + struct uf_node my_node = UF_INIT_NODE(my_node); +or + uf_node_init(&my_node); + +Find the Root Node of union-find +-------------------------------- + +This operation is mainly used to determine whether two nodes belong to the same +set in the union-find. If they have the same root, they are in the same set. +During the find operation, path compression is performed to improve the +efficiency of subsequent find operations. +Example:: + + int connected; + struct uf_node *root1 = uf_find(&node_1); + struct uf_node *root2 = uf_find(&node_2); + if (root1 == root2) + connected = 1; + else + connected = 0; + +Union Two Sets in union-find +---------------------------- + +To union two sets in the union-find, you first find their respective root nodes +and then link the smaller node to the larger node based on the rank of the root +nodes. +Example:: + + uf_union(&node_1, &node_2); diff --git a/Documentation/translations/zh_CN/core-api/union_find.rst b/Documentation/translations/zh_CN/core-api/union_find.rst new file mode 100644 index 000000000000..a56de57147e9 --- /dev/null +++ b/Documentation/translations/zh_CN/core-api/union_find.rst @@ -0,0 +1,87 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. include:: ../disclaimer-zh_CN.rst + +:Original: Documentation/core-api/union_find.rst + +=========================== +Linux中的并查集(Union-Find) +=========================== + + +:日期: 2024年6月21日 +:作者: Xavier + +何为并查集,它有什么用? +--------------------- + +并查集是一种数据结构,用于处理一些不交集的合并及查询问题。并查集支持的主要操作: + 初始化:将每个元素初始化为单独的集合,每个集合的初始父节点指向自身 + 查询:查询某个元素属于哪个集合,通常是返回集合中的一个“代表元素”。这个操作是为 + 了判断两个元素是否在同一个集合之中。 + 合并:将两个集合合并为一个。 + +并查集作为一种用于维护集合(组)的数据结构,它通常用于解决一些离线查询、动态连通性和 +图论等相关问题,同时也是用于计算最小生成树的克鲁斯克尔算法中的关键,由于最小生成树在 +网络路由等场景下十分重要,并查集也得到了广泛的引用。此外,并查集在符号计算,寄存器分 +配等方面也有应用。 + +空间复杂度: O(n),n为节点数。 + +时间复杂度:使用路径压缩可以减少查找操作的时间复杂度,使用按秩合并可以减少合并操作的 +时间复杂度,使得并查集每个查询和合并操作的平均时间复杂度仅为O(α(n)),其中α(n)是反阿 +克曼函数,可以粗略地认为并查集的操作有常数的时间复杂度。 + +本文档涵盖了对Linux并查集实现的使用方法。更多关于并查集的性质和实现的信息,参见: + + 维基百科并查集词条 + https://en.wikipedia.org/wiki/Disjoint-set_data_structure + +并查集的Linux实现 +---------------- + +Linux的并查集实现在文件“lib/union_find.c”中。要使用它,需要 +“#include ”。 + +并查集的数据结构定义如下:: + + struct uf_node { + struct uf_node *parent; + unsigned int rank; + }; +其中parent为当前节点的父节点,rank为当前树的高度,在合并时将rank小的节点接到rank大 +的节点下面以增加平衡性。 + +初始化并查集 +--------- + +可以采用静态或初始化接口完成初始化操作。初始化时,parent 指针指向自身,rank 设置 +为 0。 +示例:: + + struct uf_node my_node = UF_INIT_NODE(my_node); +或 + uf_node_init(&my_node); + +查找并查集的根节点 +---------------- + +主要用于判断两个并查集是否属于一个集合,如果根相同,那么他们就是一个集合。在查找过程中 +会对路径进行压缩,提高后续查找效率。 +示例:: + + int connected; + struct uf_node *root1 = uf_find(&node_1); + struct uf_node *root2 = uf_find(&node_2); + if (root1 == root2) + connected = 1; + else + connected = 0; + +合并两个并查集 +------------- + +对于两个相交的并查集进行合并,会首先查找它们各自的根节点,然后根据根节点秩大小,将小的 +节点连接到大的节点下面。 +示例:: + + uf_union(&node_1, &node_2); diff --git a/MAINTAINERS b/MAINTAINERS index 42decde38320..82e3924816d2 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -23458,6 +23458,15 @@ F: drivers/cdrom/cdrom.c F: include/linux/cdrom.h F: include/uapi/linux/cdrom.h +UNION-FIND +M: Xavier +L: linux-kernel@vger.kernel.org +S: Maintained +F: Documentation/core-api/union_find.rst +F: Documentation/translations/zh_CN/core-api/union_find.rst +F: include/linux/union_find.h +F: lib/union_find.c + UNIVERSAL FLASH STORAGE HOST CONTROLLER DRIVER R: Alim Akhtar R: Avri Altman diff --git a/include/linux/union_find.h b/include/linux/union_find.h new file mode 100644 index 000000000000..cfd49263c138 --- /dev/null +++ b/include/linux/union_find.h @@ -0,0 +1,41 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_UNION_FIND_H +#define __LINUX_UNION_FIND_H +/** + * union_find.h - union-find data structure implementation + * + * This header provides functions and structures to implement the union-find + * data structure. The union-find data structure is used to manage disjoint + * sets and supports efficient union and find operations. + * + * See Documentation/core-api/union_find.rst for documentation and samples. + */ + +struct uf_node { + struct uf_node *parent; + unsigned int rank; +}; + +/* This macro is used for static initialization of a union-find node. */ +#define UF_INIT_NODE(node) {.parent = &node, .rank = 0} + +/** + * uf_node_init - Initialize a union-find node + * @node: pointer to the union-find node to be initialized + * + * This function sets the parent of the node to itself and + * initializes its rank to 0. + */ +static inline void uf_node_init(struct uf_node *node) +{ + node->parent = node; + node->rank = 0; +} + +/* find the root of a node */ +struct uf_node *uf_find(struct uf_node *node); + +/* Merge two intersecting nodes */ +void uf_union(struct uf_node *node1, struct uf_node *node2); + +#endif /* __LINUX_UNION_FIND_H */ diff --git a/lib/Makefile b/lib/Makefile index 322bb127b4dc..a5e3c1d5b6f9 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -34,7 +34,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ nmi_backtrace.o win_minmax.o memcat_p.o \ - buildid.o objpool.o + buildid.o objpool.o union_find.o lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/union_find.c b/lib/union_find.c new file mode 100644 index 000000000000..413b0f8adf7a --- /dev/null +++ b/lib/union_find.c @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +/** + * uf_find - Find the root of a node and perform path compression + * @node: the node to find the root of + * + * This function returns the root of the node by following the parent + * pointers. It also performs path compression, making the tree shallower. + * + * Returns the root node of the set containing node. + */ +struct uf_node *uf_find(struct uf_node *node) +{ + struct uf_node *parent; + + while (node->parent != node) { + parent = node->parent; + node->parent = parent->parent; + node = parent; + } + return node; +} + +/** + * uf_union - Merge two sets, using union by rank + * @node1: the first node + * @node2: the second node + * + * This function merges the sets containing node1 and node2, by comparing + * the ranks to keep the tree balanced. + */ +void uf_union(struct uf_node *node1, struct uf_node *node2) +{ + struct uf_node *root1 = uf_find(node1); + struct uf_node *root2 = uf_find(node2); + + if (root1 == root2) + return; + + if (root1->rank < root2->rank) { + root1->parent = root2; + } else if (root1->rank > root2->rank) { + root2->parent = root1; + } else { + root2->parent = root1; + root1->rank++; + } +} -- cgit v1.2.3 From 7424fc6b86c8980a87169e005f5cd4438d18efe6 Mon Sep 17 00:00:00 2001 From: Gatlin Newhouse Date: Wed, 24 Jul 2024 00:01:55 +0000 Subject: x86/traps: Enable UBSAN traps on x86 Currently ARM64 extracts which specific sanitizer has caused a trap via encoded data in the trap instruction. Clang on x86 currently encodes the same data in the UD1 instruction but x86 handle_bug() and is_valid_bugaddr() currently only look at UD2. Bring x86 to parity with ARM64, similar to commit 25b84002afb9 ("arm64: Support Clang UBSAN trap codes for better reporting"). See the llvm links for information about the code generation. Enable the reporting of UBSAN sanitizer details on x86 compiled with clang when CONFIG_UBSAN_TRAP=y by analysing UD1 and retrieving the type immediate which is encoded by the compiler after the UD1. [ tglx: Simplified it by moving the printk() into handle_bug() ] Signed-off-by: Gatlin Newhouse Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: Kees Cook Link: https://lore.kernel.org/all/20240724000206.451425-1-gatlin.newhouse@gmail.com Link: https://github.com/llvm/llvm-project/commit/c5978f42ec8e9#diff-bb68d7cd885f41cfc35843998b0f9f534adb60b415f647109e597ce448e92d9f Link: https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/X86/X86InstrSystem.td#L27 --- arch/x86/include/asm/bug.h | 12 ++++++++++ arch/x86/kernel/traps.c | 59 ++++++++++++++++++++++++++++++++++++++++++---- include/linux/ubsan.h | 5 ++++ lib/Kconfig.ubsan | 4 ++-- 4 files changed, 73 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index a3ec87d198ac..806649c7f23d 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -13,6 +13,18 @@ #define INSN_UD2 0x0b0f #define LEN_UD2 2 +/* + * In clang we have UD1s reporting UBSAN failures on X86, 64 and 32bit. + */ +#define INSN_ASOP 0x67 +#define OPCODE_ESCAPE 0x0f +#define SECOND_BYTE_OPCODE_UD1 0xb9 +#define SECOND_BYTE_OPCODE_UD2 0x0b + +#define BUG_NONE 0xffff +#define BUG_UD1 0xfffe +#define BUG_UD2 0xfffd + #ifdef CONFIG_GENERIC_BUG #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4fa0b17e5043..415881607c5d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -91,6 +92,47 @@ __always_inline int is_valid_bugaddr(unsigned long addr) return *(unsigned short *)addr == INSN_UD2; } +/* + * Check for UD1 or UD2, accounting for Address Size Override Prefixes. + * If it's a UD1, get the ModRM byte to pass along to UBSan. + */ +__always_inline int decode_bug(unsigned long addr, u32 *imm) +{ + u8 v; + + if (addr < TASK_SIZE_MAX) + return BUG_NONE; + + v = *(u8 *)(addr++); + if (v == INSN_ASOP) + v = *(u8 *)(addr++); + if (v != OPCODE_ESCAPE) + return BUG_NONE; + + v = *(u8 *)(addr++); + if (v == SECOND_BYTE_OPCODE_UD2) + return BUG_UD2; + + if (!IS_ENABLED(CONFIG_UBSAN_TRAP) || v != SECOND_BYTE_OPCODE_UD1) + return BUG_NONE; + + /* Retrieve the immediate (type value) for the UBSAN UD1 */ + v = *(u8 *)(addr++); + if (X86_MODRM_RM(v) == 4) + addr++; + + *imm = 0; + if (X86_MODRM_MOD(v) == 1) + *imm = *(u8 *)addr; + else if (X86_MODRM_MOD(v) == 2) + *imm = *(u32 *)addr; + else + WARN_ONCE(1, "Unexpected MODRM_MOD: %u\n", X86_MODRM_MOD(v)); + + return BUG_UD1; +} + + static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, const char *str, struct pt_regs *regs, long error_code) @@ -216,6 +258,8 @@ static inline void handle_invalid_op(struct pt_regs *regs) static noinstr bool handle_bug(struct pt_regs *regs) { bool handled = false; + int ud_type; + u32 imm; /* * Normally @regs are unpoisoned by irqentry_enter(), but handle_bug() @@ -223,7 +267,8 @@ static noinstr bool handle_bug(struct pt_regs *regs) * irqentry_enter(). */ kmsan_unpoison_entry_regs(regs); - if (!is_valid_bugaddr(regs->ip)) + ud_type = decode_bug(regs->ip, &imm); + if (ud_type == BUG_NONE) return handled; /* @@ -236,10 +281,14 @@ static noinstr bool handle_bug(struct pt_regs *regs) */ if (regs->flags & X86_EFLAGS_IF) raw_local_irq_enable(); - if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || - handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { - regs->ip += LEN_UD2; - handled = true; + if (ud_type == BUG_UD2) { + if (report_bug(regs->ip, regs) == BUG_TRAP_TYPE_WARN || + handle_cfi_failure(regs) == BUG_TRAP_TYPE_WARN) { + regs->ip += LEN_UD2; + handled = true; + } + } else if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { + pr_crit("%s at %pS\n", report_ubsan_failure(regs, imm), (void *)regs->ip); } if (regs->flags & X86_EFLAGS_IF) raw_local_irq_disable(); diff --git a/include/linux/ubsan.h b/include/linux/ubsan.h index bff7445498de..d8219cbe09ff 100644 --- a/include/linux/ubsan.h +++ b/include/linux/ubsan.h @@ -4,6 +4,11 @@ #ifdef CONFIG_UBSAN_TRAP const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type); +#else +static inline const char *report_ubsan_failure(struct pt_regs *regs, u32 check_type) +{ + return NULL; +} #endif #endif diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index bdda600f8dfb..1d4aa7a83b3a 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -29,8 +29,8 @@ config UBSAN_TRAP Also note that selecting Y will cause your kernel to Oops with an "illegal instruction" error with no further details - when a UBSAN violation occurs. (Except on arm64, which will - report which Sanitizer failed.) This may make it hard to + when a UBSAN violation occurs. (Except on arm64 and x86, which + will report which Sanitizer failed.) This may make it hard to determine whether an Oops was caused by UBSAN or to figure out the details of a UBSAN violation. It makes the kernel log output less useful for bug reports. -- cgit v1.2.3 From 6cd0dd934b03d4ee4094ac474108723e2f2ed7d6 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 11 Jun 2024 09:50:31 +0200 Subject: kcov: Add interrupt handling self test Add a boot self test that can catch sprious coverage from interrupts. The coverage callback filters out interrupt code, but only after the handler updates preempt count. Some code periodically leaks out of that section and leads to spurious coverage. Add a best-effort (but simple) test that is likely to catch such bugs. If the test is enabled on CI systems that use KCOV, they should catch any issues fast. Signed-off-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Reviewed-by: Alexander Potapenko Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Link: https://lore.kernel.org/all/7662127c97e29da1a748ad1c1539dd7b65b737b2.1718092070.git.dvyukov@google.com --- kernel/kcov.c | 31 +++++++++++++++++++++++++++++++ lib/Kconfig.debug | 8 ++++++++ 2 files changed, 39 insertions(+) (limited to 'lib') diff --git a/kernel/kcov.c b/kernel/kcov.c index f0a69d402066..d9d4a0c04185 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -1058,6 +1059,32 @@ u64 kcov_common_handle(void) } EXPORT_SYMBOL(kcov_common_handle); +#ifdef CONFIG_KCOV_SELFTEST +static void __init selftest(void) +{ + unsigned long start; + + pr_err("running self test\n"); + /* + * Test that interrupts don't produce spurious coverage. + * The coverage callback filters out interrupt code, but only + * after the handler updates preempt count. Some code periodically + * leaks out of that section and leads to spurious coverage. + * It's hard to call the actual interrupt handler directly, + * so we just loop here for a bit waiting for a timer interrupt. + * We set kcov_mode to enable tracing, but don't setup the area, + * so any attempt to trace will crash. Note: we must not call any + * potentially traced functions in this region. + */ + start = jiffies; + current->kcov_mode = KCOV_MODE_TRACE_PC; + while ((jiffies - start) * MSEC_PER_SEC / HZ < 300) + ; + current->kcov_mode = 0; + pr_err("done running self test\n"); +} +#endif + static int __init kcov_init(void) { int cpu; @@ -1077,6 +1104,10 @@ static int __init kcov_init(void) */ debugfs_create_file_unsafe("kcov", 0600, NULL, NULL, &kcov_fops); +#ifdef CONFIG_KCOV_SELFTEST + selftest(); +#endif + return 0; } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a30c03a66172..270e367b3e6f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2173,6 +2173,14 @@ config KCOV_IRQ_AREA_SIZE soft interrupts. This specifies the size of those areas in the number of unsigned long words. +config KCOV_SELFTEST + bool "Perform short selftests on boot" + depends on KCOV + help + Run short KCOV coverage collection selftests on boot. + On test failure, causes the kernel to panic. Recommended to be + enabled, ensuring critical functionality works as intended. + menuconfig RUNTIME_TESTING_MENU bool "Runtime Testing" default y -- cgit v1.2.3 From da4fe6815aca25603944a64b0965310512e867d0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 2 Aug 2024 14:09:00 +0800 Subject: Revert "lib/mpi: Introduce ec implementation to MPI library" This reverts commit d58bb7e55a8a65894cc02f27c3e2bf9403e7c40f. It's no longer needed since sm2 has been removed. Signed-off-by: Herbert Xu --- include/linux/mpi.h | 105 ---- lib/crypto/mpi/Makefile | 1 - lib/crypto/mpi/ec.c | 1507 ----------------------------------------------- 3 files changed, 1613 deletions(-) delete mode 100644 lib/crypto/mpi/ec.c (limited to 'lib') diff --git a/include/linux/mpi.h b/include/linux/mpi.h index eb0d1c1db208..89b720893e12 100644 --- a/include/linux/mpi.h +++ b/include/linux/mpi.h @@ -157,111 +157,6 @@ void mpi_fdiv_q(MPI quot, MPI dividend, MPI divisor); /*-- mpi-inv.c --*/ int mpi_invm(MPI x, MPI a, MPI n); -/*-- ec.c --*/ - -/* Object to represent a point in projective coordinates */ -struct gcry_mpi_point { - MPI x; - MPI y; - MPI z; -}; - -typedef struct gcry_mpi_point *MPI_POINT; - -/* Models describing an elliptic curve */ -enum gcry_mpi_ec_models { - /* The Short Weierstrass equation is - * y^2 = x^3 + ax + b - */ - MPI_EC_WEIERSTRASS = 0, - /* The Montgomery equation is - * by^2 = x^3 + ax^2 + x - */ - MPI_EC_MONTGOMERY, - /* The Twisted Edwards equation is - * ax^2 + y^2 = 1 + bx^2y^2 - * Note that we use 'b' instead of the commonly used 'd'. - */ - MPI_EC_EDWARDS -}; - -/* Dialects used with elliptic curves */ -enum ecc_dialects { - ECC_DIALECT_STANDARD = 0, - ECC_DIALECT_ED25519, - ECC_DIALECT_SAFECURVE -}; - -/* This context is used with all our EC functions. */ -struct mpi_ec_ctx { - enum gcry_mpi_ec_models model; /* The model describing this curve. */ - enum ecc_dialects dialect; /* The ECC dialect used with the curve. */ - int flags; /* Public key flags (not always used). */ - unsigned int nbits; /* Number of bits. */ - - /* Domain parameters. Note that they may not all be set and if set - * the MPIs may be flagged as constant. - */ - MPI p; /* Prime specifying the field GF(p). */ - MPI a; /* First coefficient of the Weierstrass equation. */ - MPI b; /* Second coefficient of the Weierstrass equation. */ - MPI_POINT G; /* Base point (generator). */ - MPI n; /* Order of G. */ - unsigned int h; /* Cofactor. */ - - /* The actual key. May not be set. */ - MPI_POINT Q; /* Public key. */ - MPI d; /* Private key. */ - - const char *name; /* Name of the curve. */ - - /* This structure is private to mpi/ec.c! */ - struct { - struct { - unsigned int a_is_pminus3:1; - unsigned int two_inv_p:1; - } valid; /* Flags to help setting the helper vars below. */ - - int a_is_pminus3; /* True if A = P - 3. */ - - MPI two_inv_p; - - mpi_barrett_t p_barrett; - - /* Scratch variables. */ - MPI scratch[11]; - - /* Helper for fast reduction. */ - /* int nist_nbits; /\* If this is a NIST curve, the # of bits. *\/ */ - /* MPI s[10]; */ - /* MPI c; */ - } t; - - /* Curve specific computation routines for the field. */ - void (*addm)(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx); - void (*subm)(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ec); - void (*mulm)(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx); - void (*pow2)(MPI w, const MPI b, struct mpi_ec_ctx *ctx); - void (*mul2)(MPI w, MPI u, struct mpi_ec_ctx *ctx); -}; - -void mpi_ec_init(struct mpi_ec_ctx *ctx, enum gcry_mpi_ec_models model, - enum ecc_dialects dialect, - int flags, MPI p, MPI a, MPI b); -void mpi_ec_deinit(struct mpi_ec_ctx *ctx); -MPI_POINT mpi_point_new(unsigned int nbits); -void mpi_point_release(MPI_POINT p); -void mpi_point_init(MPI_POINT p); -void mpi_point_free_parts(MPI_POINT p); -int mpi_ec_get_affine(MPI x, MPI y, MPI_POINT point, struct mpi_ec_ctx *ctx); -void mpi_ec_add_points(MPI_POINT result, - MPI_POINT p1, MPI_POINT p2, - struct mpi_ec_ctx *ctx); -void mpi_ec_mul_point(MPI_POINT result, - MPI scalar, MPI_POINT point, - struct mpi_ec_ctx *ctx); -int mpi_ec_curve_point(MPI_POINT point, struct mpi_ec_ctx *ctx); - /* inline functions */ /** diff --git a/lib/crypto/mpi/Makefile b/lib/crypto/mpi/Makefile index 6e6ef9a34fe1..477debd7ed50 100644 --- a/lib/crypto/mpi/Makefile +++ b/lib/crypto/mpi/Makefile @@ -13,7 +13,6 @@ mpi-y = \ generic_mpih-rshift.o \ generic_mpih-sub1.o \ generic_mpih-add1.o \ - ec.o \ mpicoder.o \ mpi-add.o \ mpi-bit.o \ diff --git a/lib/crypto/mpi/ec.c b/lib/crypto/mpi/ec.c deleted file mode 100644 index 4781f00982ef..000000000000 --- a/lib/crypto/mpi/ec.c +++ /dev/null @@ -1,1507 +0,0 @@ -/* ec.c - Elliptic Curve functions - * Copyright (C) 2007 Free Software Foundation, Inc. - * Copyright (C) 2013 g10 Code GmbH - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . - */ - -#include "mpi-internal.h" -#include "longlong.h" - -#define point_init(a) mpi_point_init((a)) -#define point_free(a) mpi_point_free_parts((a)) - -#define log_error(fmt, ...) pr_err(fmt, ##__VA_ARGS__) -#define log_fatal(fmt, ...) pr_err(fmt, ##__VA_ARGS__) - -#define DIM(v) (sizeof(v)/sizeof((v)[0])) - - -/* Create a new point option. NBITS gives the size in bits of one - * coordinate; it is only used to pre-allocate some resources and - * might also be passed as 0 to use a default value. - */ -MPI_POINT mpi_point_new(unsigned int nbits) -{ - MPI_POINT p; - - (void)nbits; /* Currently not used. */ - - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (p) - mpi_point_init(p); - return p; -} -EXPORT_SYMBOL_GPL(mpi_point_new); - -/* Release the point object P. P may be NULL. */ -void mpi_point_release(MPI_POINT p) -{ - if (p) { - mpi_point_free_parts(p); - kfree(p); - } -} -EXPORT_SYMBOL_GPL(mpi_point_release); - -/* Initialize the fields of a point object. gcry_mpi_point_free_parts - * may be used to release the fields. - */ -void mpi_point_init(MPI_POINT p) -{ - p->x = mpi_new(0); - p->y = mpi_new(0); - p->z = mpi_new(0); -} -EXPORT_SYMBOL_GPL(mpi_point_init); - -/* Release the parts of a point object. */ -void mpi_point_free_parts(MPI_POINT p) -{ - mpi_free(p->x); p->x = NULL; - mpi_free(p->y); p->y = NULL; - mpi_free(p->z); p->z = NULL; -} -EXPORT_SYMBOL_GPL(mpi_point_free_parts); - -/* Set the value from S into D. */ -static void point_set(MPI_POINT d, MPI_POINT s) -{ - mpi_set(d->x, s->x); - mpi_set(d->y, s->y); - mpi_set(d->z, s->z); -} - -static void point_resize(MPI_POINT p, struct mpi_ec_ctx *ctx) -{ - size_t nlimbs = ctx->p->nlimbs; - - mpi_resize(p->x, nlimbs); - p->x->nlimbs = nlimbs; - mpi_resize(p->z, nlimbs); - p->z->nlimbs = nlimbs; - - if (ctx->model != MPI_EC_MONTGOMERY) { - mpi_resize(p->y, nlimbs); - p->y->nlimbs = nlimbs; - } -} - -static void point_swap_cond(MPI_POINT d, MPI_POINT s, unsigned long swap, - struct mpi_ec_ctx *ctx) -{ - mpi_swap_cond(d->x, s->x, swap); - if (ctx->model != MPI_EC_MONTGOMERY) - mpi_swap_cond(d->y, s->y, swap); - mpi_swap_cond(d->z, s->z, swap); -} - - -/* W = W mod P. */ -static void ec_mod(MPI w, struct mpi_ec_ctx *ec) -{ - if (ec->t.p_barrett) - mpi_mod_barrett(w, w, ec->t.p_barrett); - else - mpi_mod(w, w, ec->p); -} - -static void ec_addm(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx) -{ - mpi_add(w, u, v); - ec_mod(w, ctx); -} - -static void ec_subm(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ec) -{ - mpi_sub(w, u, v); - while (w->sign) - mpi_add(w, w, ec->p); - /*ec_mod(w, ec);*/ -} - -static void ec_mulm(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx) -{ - mpi_mul(w, u, v); - ec_mod(w, ctx); -} - -/* W = 2 * U mod P. */ -static void ec_mul2(MPI w, MPI u, struct mpi_ec_ctx *ctx) -{ - mpi_lshift(w, u, 1); - ec_mod(w, ctx); -} - -static void ec_powm(MPI w, const MPI b, const MPI e, - struct mpi_ec_ctx *ctx) -{ - mpi_powm(w, b, e, ctx->p); - /* mpi_abs(w); */ -} - -/* Shortcut for - * ec_powm(B, B, mpi_const(MPI_C_TWO), ctx); - * for easier optimization. - */ -static void ec_pow2(MPI w, const MPI b, struct mpi_ec_ctx *ctx) -{ - /* Using mpi_mul is slightly faster (at least on amd64). */ - /* mpi_powm(w, b, mpi_const(MPI_C_TWO), ctx->p); */ - ec_mulm(w, b, b, ctx); -} - -/* Shortcut for - * ec_powm(B, B, mpi_const(MPI_C_THREE), ctx); - * for easier optimization. - */ -static void ec_pow3(MPI w, const MPI b, struct mpi_ec_ctx *ctx) -{ - mpi_powm(w, b, mpi_const(MPI_C_THREE), ctx->p); -} - -static void ec_invm(MPI x, MPI a, struct mpi_ec_ctx *ctx) -{ - if (!mpi_invm(x, a, ctx->p)) - log_error("ec_invm: inverse does not exist:\n"); -} - -static void mpih_set_cond(mpi_ptr_t wp, mpi_ptr_t up, - mpi_size_t usize, unsigned long set) -{ - mpi_size_t i; - mpi_limb_t mask = ((mpi_limb_t)0) - set; - mpi_limb_t x; - - for (i = 0; i < usize; i++) { - x = mask & (wp[i] ^ up[i]); - wp[i] = wp[i] ^ x; - } -} - -/* Routines for 2^255 - 19. */ - -#define LIMB_SIZE_25519 ((256+BITS_PER_MPI_LIMB-1)/BITS_PER_MPI_LIMB) - -static void ec_addm_25519(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx) -{ - mpi_ptr_t wp, up, vp; - mpi_size_t wsize = LIMB_SIZE_25519; - mpi_limb_t n[LIMB_SIZE_25519]; - mpi_limb_t borrow; - - if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize) - log_bug("addm_25519: different sizes\n"); - - memset(n, 0, sizeof(n)); - up = u->d; - vp = v->d; - wp = w->d; - - mpihelp_add_n(wp, up, vp, wsize); - borrow = mpihelp_sub_n(wp, wp, ctx->p->d, wsize); - mpih_set_cond(n, ctx->p->d, wsize, (borrow != 0UL)); - mpihelp_add_n(wp, wp, n, wsize); - wp[LIMB_SIZE_25519-1] &= ~((mpi_limb_t)1 << (255 % BITS_PER_MPI_LIMB)); -} - -static void ec_subm_25519(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx) -{ - mpi_ptr_t wp, up, vp; - mpi_size_t wsize = LIMB_SIZE_25519; - mpi_limb_t n[LIMB_SIZE_25519]; - mpi_limb_t borrow; - - if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize) - log_bug("subm_25519: different sizes\n"); - - memset(n, 0, sizeof(n)); - up = u->d; - vp = v->d; - wp = w->d; - - borrow = mpihelp_sub_n(wp, up, vp, wsize); - mpih_set_cond(n, ctx->p->d, wsize, (borrow != 0UL)); - mpihelp_add_n(wp, wp, n, wsize); - wp[LIMB_SIZE_25519-1] &= ~((mpi_limb_t)1 << (255 % BITS_PER_MPI_LIMB)); -} - -static void ec_mulm_25519(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx) -{ - mpi_ptr_t wp, up, vp; - mpi_size_t wsize = LIMB_SIZE_25519; - mpi_limb_t n[LIMB_SIZE_25519*2]; - mpi_limb_t m[LIMB_SIZE_25519+1]; - mpi_limb_t cy; - int msb; - - (void)ctx; - if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize) - log_bug("mulm_25519: different sizes\n"); - - up = u->d; - vp = v->d; - wp = w->d; - - mpihelp_mul_n(n, up, vp, wsize); - memcpy(wp, n, wsize * BYTES_PER_MPI_LIMB); - wp[LIMB_SIZE_25519-1] &= ~((mpi_limb_t)1 << (255 % BITS_PER_MPI_LIMB)); - - memcpy(m, n+LIMB_SIZE_25519-1, (wsize+1) * BYTES_PER_MPI_LIMB); - mpihelp_rshift(m, m, LIMB_SIZE_25519+1, (255 % BITS_PER_MPI_LIMB)); - - memcpy(n, m, wsize * BYTES_PER_MPI_LIMB); - cy = mpihelp_lshift(m, m, LIMB_SIZE_25519, 4); - m[LIMB_SIZE_25519] = cy; - cy = mpihelp_add_n(m, m, n, wsize); - m[LIMB_SIZE_25519] += cy; - cy = mpihelp_add_n(m, m, n, wsize); - m[LIMB_SIZE_25519] += cy; - cy = mpihelp_add_n(m, m, n, wsize); - m[LIMB_SIZE_25519] += cy; - - cy = mpihelp_add_n(wp, wp, m, wsize); - m[LIMB_SIZE_25519] += cy; - - memset(m, 0, wsize * BYTES_PER_MPI_LIMB); - msb = (wp[LIMB_SIZE_25519-1] >> (255 % BITS_PER_MPI_LIMB)); - m[0] = (m[LIMB_SIZE_25519] * 2 + msb) * 19; - wp[LIMB_SIZE_25519-1] &= ~((mpi_limb_t)1 << (255 % BITS_PER_MPI_LIMB)); - mpihelp_add_n(wp, wp, m, wsize); - - m[0] = 0; - cy = mpihelp_sub_n(wp, wp, ctx->p->d, wsize); - mpih_set_cond(m, ctx->p->d, wsize, (cy != 0UL)); - mpihelp_add_n(wp, wp, m, wsize); -} - -static void ec_mul2_25519(MPI w, MPI u, struct mpi_ec_ctx *ctx) -{ - ec_addm_25519(w, u, u, ctx); -} - -static void ec_pow2_25519(MPI w, const MPI b, struct mpi_ec_ctx *ctx) -{ - ec_mulm_25519(w, b, b, ctx); -} - -/* Routines for 2^448 - 2^224 - 1. */ - -#define LIMB_SIZE_448 ((448+BITS_PER_MPI_LIMB-1)/BITS_PER_MPI_LIMB) -#define LIMB_SIZE_HALF_448 ((LIMB_SIZE_448+1)/2) - -static void ec_addm_448(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx) -{ - mpi_ptr_t wp, up, vp; - mpi_size_t wsize = LIMB_SIZE_448; - mpi_limb_t n[LIMB_SIZE_448]; - mpi_limb_t cy; - - if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize) - log_bug("addm_448: different sizes\n"); - - memset(n, 0, sizeof(n)); - up = u->d; - vp = v->d; - wp = w->d; - - cy = mpihelp_add_n(wp, up, vp, wsize); - mpih_set_cond(n, ctx->p->d, wsize, (cy != 0UL)); - mpihelp_sub_n(wp, wp, n, wsize); -} - -static void ec_subm_448(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx) -{ - mpi_ptr_t wp, up, vp; - mpi_size_t wsize = LIMB_SIZE_448; - mpi_limb_t n[LIMB_SIZE_448]; - mpi_limb_t borrow; - - if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize) - log_bug("subm_448: different sizes\n"); - - memset(n, 0, sizeof(n)); - up = u->d; - vp = v->d; - wp = w->d; - - borrow = mpihelp_sub_n(wp, up, vp, wsize); - mpih_set_cond(n, ctx->p->d, wsize, (borrow != 0UL)); - mpihelp_add_n(wp, wp, n, wsize); -} - -static void ec_mulm_448(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx) -{ - mpi_ptr_t wp, up, vp; - mpi_size_t wsize = LIMB_SIZE_448; - mpi_limb_t n[LIMB_SIZE_448*2]; - mpi_limb_t a2[LIMB_SIZE_HALF_448]; - mpi_limb_t a3[LIMB_SIZE_HALF_448]; - mpi_limb_t b0[LIMB_SIZE_HALF_448]; - mpi_limb_t b1[LIMB_SIZE_HALF_448]; - mpi_limb_t cy; - int i; -#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2) - mpi_limb_t b1_rest, a3_rest; -#endif - - if (w->nlimbs != wsize || u->nlimbs != wsize || v->nlimbs != wsize) - log_bug("mulm_448: different sizes\n"); - - up = u->d; - vp = v->d; - wp = w->d; - - mpihelp_mul_n(n, up, vp, wsize); - - for (i = 0; i < (wsize + 1) / 2; i++) { - b0[i] = n[i]; - b1[i] = n[i+wsize/2]; - a2[i] = n[i+wsize]; - a3[i] = n[i+wsize+wsize/2]; - } - -#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2) - b0[LIMB_SIZE_HALF_448-1] &= ((mpi_limb_t)1UL << 32)-1; - a2[LIMB_SIZE_HALF_448-1] &= ((mpi_limb_t)1UL << 32)-1; - - b1_rest = 0; - a3_rest = 0; - - for (i = (wsize + 1) / 2 - 1; i >= 0; i--) { - mpi_limb_t b1v, a3v; - b1v = b1[i]; - a3v = a3[i]; - b1[i] = (b1_rest << 32) | (b1v >> 32); - a3[i] = (a3_rest << 32) | (a3v >> 32); - b1_rest = b1v & (((mpi_limb_t)1UL << 32)-1); - a3_rest = a3v & (((mpi_limb_t)1UL << 32)-1); - } -#endif - - cy = mpihelp_add_n(b0, b0, a2, LIMB_SIZE_HALF_448); - cy += mpihelp_add_n(b0, b0, a3, LIMB_SIZE_HALF_448); - for (i = 0; i < (wsize + 1) / 2; i++) - wp[i] = b0[i]; -#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2) - wp[LIMB_SIZE_HALF_448-1] &= (((mpi_limb_t)1UL << 32)-1); -#endif - -#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2) - cy = b0[LIMB_SIZE_HALF_448-1] >> 32; -#endif - - cy = mpihelp_add_1(b1, b1, LIMB_SIZE_HALF_448, cy); - cy += mpihelp_add_n(b1, b1, a2, LIMB_SIZE_HALF_448); - cy += mpihelp_add_n(b1, b1, a3, LIMB_SIZE_HALF_448); - cy += mpihelp_add_n(b1, b1, a3, LIMB_SIZE_HALF_448); -#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2) - b1_rest = 0; - for (i = (wsize + 1) / 2 - 1; i >= 0; i--) { - mpi_limb_t b1v = b1[i]; - b1[i] = (b1_rest << 32) | (b1v >> 32); - b1_rest = b1v & (((mpi_limb_t)1UL << 32)-1); - } - wp[LIMB_SIZE_HALF_448-1] |= (b1_rest << 32); -#endif - for (i = 0; i < wsize / 2; i++) - wp[i+(wsize + 1) / 2] = b1[i]; - -#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2) - cy = b1[LIMB_SIZE_HALF_448-1]; -#endif - - memset(n, 0, wsize * BYTES_PER_MPI_LIMB); - -#if (LIMB_SIZE_HALF_448 > LIMB_SIZE_448/2) - n[LIMB_SIZE_HALF_448-1] = cy << 32; -#else - n[LIMB_SIZE_HALF_448] = cy; -#endif - n[0] = cy; - mpihelp_add_n(wp, wp, n, wsize); - - memset(n, 0, wsize * BYTES_PER_MPI_LIMB); - cy = mpihelp_sub_n(wp, wp, ctx->p->d, wsize); - mpih_set_cond(n, ctx->p->d, wsize, (cy != 0UL)); - mpihelp_add_n(wp, wp, n, wsize); -} - -static void ec_mul2_448(MPI w, MPI u, struct mpi_ec_ctx *ctx) -{ - ec_addm_448(w, u, u, ctx); -} - -static void ec_pow2_448(MPI w, const MPI b, struct mpi_ec_ctx *ctx) -{ - ec_mulm_448(w, b, b, ctx); -} - -struct field_table { - const char *p; - - /* computation routines for the field. */ - void (*addm)(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx); - void (*subm)(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx); - void (*mulm)(MPI w, MPI u, MPI v, struct mpi_ec_ctx *ctx); - void (*mul2)(MPI w, MPI u, struct mpi_ec_ctx *ctx); - void (*pow2)(MPI w, const MPI b, struct mpi_ec_ctx *ctx); -}; - -static const struct field_table field_table[] = { - { - "0x7FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFED", - ec_addm_25519, - ec_subm_25519, - ec_mulm_25519, - ec_mul2_25519, - ec_pow2_25519 - }, - { - "0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFE" - "FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF", - ec_addm_448, - ec_subm_448, - ec_mulm_448, - ec_mul2_448, - ec_pow2_448 - }, - { NULL, NULL, NULL, NULL, NULL, NULL }, -}; - -/* Force recomputation of all helper variables. */ -static void mpi_ec_get_reset(struct mpi_ec_ctx *ec) -{ - ec->t.valid.a_is_pminus3 = 0; - ec->t.valid.two_inv_p = 0; -} - -/* Accessor for helper variable. */ -static int ec_get_a_is_pminus3(struct mpi_ec_ctx *ec) -{ - MPI tmp; - - if (!ec->t.valid.a_is_pminus3) { - ec->t.valid.a_is_pminus3 = 1; - tmp = mpi_alloc_like(ec->p); - mpi_sub_ui(tmp, ec->p, 3); - ec->t.a_is_pminus3 = !mpi_cmp(ec->a, tmp); - mpi_free(tmp); - } - - return ec->t.a_is_pminus3; -} - -/* Accessor for helper variable. */ -static MPI ec_get_two_inv_p(struct mpi_ec_ctx *ec) -{ - if (!ec->t.valid.two_inv_p) { - ec->t.valid.two_inv_p = 1; - if (!ec->t.two_inv_p) - ec->t.two_inv_p = mpi_alloc(0); - ec_invm(ec->t.two_inv_p, mpi_const(MPI_C_TWO), ec); - } - return ec->t.two_inv_p; -} - -static const char *const curve25519_bad_points[] = { - "0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffed", - "0x0000000000000000000000000000000000000000000000000000000000000000", - "0x0000000000000000000000000000000000000000000000000000000000000001", - "0x00b8495f16056286fdb1329ceb8d09da6ac49ff1fae35616aeb8413b7c7aebe0", - "0x57119fd0dd4e22d8868e1c58c45c44045bef839c55b1d0b1248c50a3bc959c5f", - "0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffec", - "0x7fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffee", - NULL -}; - -static const char *const curve448_bad_points[] = { - "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffe" - "ffffffffffffffffffffffffffffffffffffffffffffffffffffffff", - "0x00000000000000000000000000000000000000000000000000000000" - "00000000000000000000000000000000000000000000000000000000", - "0x00000000000000000000000000000000000000000000000000000000" - "00000000000000000000000000000000000000000000000000000001", - "0xfffffffffffffffffffffffffffffffffffffffffffffffffffffffe" - "fffffffffffffffffffffffffffffffffffffffffffffffffffffffe", - "0xffffffffffffffffffffffffffffffffffffffffffffffffffffffff" - "00000000000000000000000000000000000000000000000000000000", - NULL -}; - -static const char *const *bad_points_table[] = { - curve25519_bad_points, - curve448_bad_points, -}; - -static void mpi_ec_coefficient_normalize(MPI a, MPI p) -{ - if (a->sign) { - mpi_resize(a, p->nlimbs); - mpihelp_sub_n(a->d, p->d, a->d, p->nlimbs); - a->nlimbs = p->nlimbs; - a->sign = 0; - } -} - -/* This function initialized a context for elliptic curve based on the - * field GF(p). P is the prime specifying this field, A is the first - * coefficient. CTX is expected to be zeroized. - */ -void mpi_ec_init(struct mpi_ec_ctx *ctx, enum gcry_mpi_ec_models model, - enum ecc_dialects dialect, - int flags, MPI p, MPI a, MPI b) -{ - int i; - static int use_barrett = -1 /* TODO: 1 or -1 */; - - mpi_ec_coefficient_normalize(a, p); - mpi_ec_coefficient_normalize(b, p); - - /* Fixme: Do we want to check some constraints? e.g. a < p */ - - ctx->model = model; - ctx->dialect = dialect; - ctx->flags = flags; - if (dialect == ECC_DIALECT_ED25519) - ctx->nbits = 256; - else - ctx->nbits = mpi_get_nbits(p); - ctx->p = mpi_copy(p); - ctx->a = mpi_copy(a); - ctx->b = mpi_copy(b); - - ctx->d = NULL; - ctx->t.two_inv_p = NULL; - - ctx->t.p_barrett = use_barrett > 0 ? mpi_barrett_init(ctx->p, 0) : NULL; - - mpi_ec_get_reset(ctx); - - if (model == MPI_EC_MONTGOMERY) { - for (i = 0; i < DIM(bad_points_table); i++) { - MPI p_candidate = mpi_scanval(bad_points_table[i][0]); - int match_p = !mpi_cmp(ctx->p, p_candidate); - int j; - - mpi_free(p_candidate); - if (!match_p) - continue; - - for (j = 0; i < DIM(ctx->t.scratch) && bad_points_table[i][j]; j++) - ctx->t.scratch[j] = mpi_scanval(bad_points_table[i][j]); - } - } else { - /* Allocate scratch variables. */ - for (i = 0; i < DIM(ctx->t.scratch); i++) - ctx->t.scratch[i] = mpi_alloc_like(ctx->p); - } - - ctx->addm = ec_addm; - ctx->subm = ec_subm; - ctx->mulm = ec_mulm; - ctx->mul2 = ec_mul2; - ctx->pow2 = ec_pow2; - - for (i = 0; field_table[i].p; i++) { - MPI f_p; - - f_p = mpi_scanval(field_table[i].p); - if (!f_p) - break; - - if (!mpi_cmp(p, f_p)) { - ctx->addm = field_table[i].addm; - ctx->subm = field_table[i].subm; - ctx->mulm = field_table[i].mulm; - ctx->mul2 = field_table[i].mul2; - ctx->pow2 = field_table[i].pow2; - mpi_free(f_p); - - mpi_resize(ctx->a, ctx->p->nlimbs); - ctx->a->nlimbs = ctx->p->nlimbs; - - mpi_resize(ctx->b, ctx->p->nlimbs); - ctx->b->nlimbs = ctx->p->nlimbs; - - for (i = 0; i < DIM(ctx->t.scratch) && ctx->t.scratch[i]; i++) - ctx->t.scratch[i]->nlimbs = ctx->p->nlimbs; - - break; - } - - mpi_free(f_p); - } -} -EXPORT_SYMBOL_GPL(mpi_ec_init); - -void mpi_ec_deinit(struct mpi_ec_ctx *ctx) -{ - int i; - - mpi_barrett_free(ctx->t.p_barrett); - - /* Domain parameter. */ - mpi_free(ctx->p); - mpi_free(ctx->a); - mpi_free(ctx->b); - mpi_point_release(ctx->G); - mpi_free(ctx->n); - - /* The key. */ - mpi_point_release(ctx->Q); - mpi_free(ctx->d); - - /* Private data of ec.c. */ - mpi_free(ctx->t.two_inv_p); - - for (i = 0; i < DIM(ctx->t.scratch); i++) - mpi_free(ctx->t.scratch[i]); -} -EXPORT_SYMBOL_GPL(mpi_ec_deinit); - -/* Compute the affine coordinates from the projective coordinates in - * POINT. Set them into X and Y. If one coordinate is not required, - * X or Y may be passed as NULL. CTX is the usual context. Returns: 0 - * on success or !0 if POINT is at infinity. - */ -int mpi_ec_get_affine(MPI x, MPI y, MPI_POINT point, struct mpi_ec_ctx *ctx) -{ - if (!mpi_cmp_ui(point->z, 0)) - return -1; - - switch (ctx->model) { - case MPI_EC_WEIERSTRASS: /* Using Jacobian coordinates. */ - { - MPI z1, z2, z3; - - z1 = mpi_new(0); - z2 = mpi_new(0); - ec_invm(z1, point->z, ctx); /* z1 = z^(-1) mod p */ - ec_mulm(z2, z1, z1, ctx); /* z2 = z^(-2) mod p */ - - if (x) - ec_mulm(x, point->x, z2, ctx); - - if (y) { - z3 = mpi_new(0); - ec_mulm(z3, z2, z1, ctx); /* z3 = z^(-3) mod p */ - ec_mulm(y, point->y, z3, ctx); - mpi_free(z3); - } - - mpi_free(z2); - mpi_free(z1); - } - return 0; - - case MPI_EC_MONTGOMERY: - { - if (x) - mpi_set(x, point->x); - - if (y) { - log_fatal("%s: Getting Y-coordinate on %s is not supported\n", - "mpi_ec_get_affine", "Montgomery"); - return -1; - } - } - return 0; - - case MPI_EC_EDWARDS: - { - MPI z; - - z = mpi_new(0); - ec_invm(z, point->z, ctx); - - mpi_resize(z, ctx->p->nlimbs); - z->nlimbs = ctx->p->nlimbs; - - if (x) { - mpi_resize(x, ctx->p->nlimbs); - x->nlimbs = ctx->p->nlimbs; - ctx->mulm(x, point->x, z, ctx); - } - if (y) { - mpi_resize(y, ctx->p->nlimbs); - y->nlimbs = ctx->p->nlimbs; - ctx->mulm(y, point->y, z, ctx); - } - - mpi_free(z); - } - return 0; - - default: - return -1; - } -} -EXPORT_SYMBOL_GPL(mpi_ec_get_affine); - -/* RESULT = 2 * POINT (Weierstrass version). */ -static void dup_point_weierstrass(MPI_POINT result, - MPI_POINT point, struct mpi_ec_ctx *ctx) -{ -#define x3 (result->x) -#define y3 (result->y) -#define z3 (result->z) -#define t1 (ctx->t.scratch[0]) -#define t2 (ctx->t.scratch[1]) -#define t3 (ctx->t.scratch[2]) -#define l1 (ctx->t.scratch[3]) -#define l2 (ctx->t.scratch[4]) -#define l3 (ctx->t.scratch[5]) - - if (!mpi_cmp_ui(point->y, 0) || !mpi_cmp_ui(point->z, 0)) { - /* P_y == 0 || P_z == 0 => [1:1:0] */ - mpi_set_ui(x3, 1); - mpi_set_ui(y3, 1); - mpi_set_ui(z3, 0); - } else { - if (ec_get_a_is_pminus3(ctx)) { - /* Use the faster case. */ - /* L1 = 3(X - Z^2)(X + Z^2) */ - /* T1: used for Z^2. */ - /* T2: used for the right term. */ - ec_pow2(t1, point->z, ctx); - ec_subm(l1, point->x, t1, ctx); - ec_mulm(l1, l1, mpi_const(MPI_C_THREE), ctx); - ec_addm(t2, point->x, t1, ctx); - ec_mulm(l1, l1, t2, ctx); - } else { - /* Standard case. */ - /* L1 = 3X^2 + aZ^4 */ - /* T1: used for aZ^4. */ - ec_pow2(l1, point->x, ctx); - ec_mulm(l1, l1, mpi_const(MPI_C_THREE), ctx); - ec_powm(t1, point->z, mpi_const(MPI_C_FOUR), ctx); - ec_mulm(t1, t1, ctx->a, ctx); - ec_addm(l1, l1, t1, ctx); - } - /* Z3 = 2YZ */ - ec_mulm(z3, point->y, point->z, ctx); - ec_mul2(z3, z3, ctx); - - /* L2 = 4XY^2 */ - /* T2: used for Y2; required later. */ - ec_pow2(t2, point->y, ctx); - ec_mulm(l2, t2, point->x, ctx); - ec_mulm(l2, l2, mpi_const(MPI_C_FOUR), ctx); - - /* X3 = L1^2 - 2L2 */ - /* T1: used for L2^2. */ - ec_pow2(x3, l1, ctx); - ec_mul2(t1, l2, ctx); - ec_subm(x3, x3, t1, ctx); - - /* L3 = 8Y^4 */ - /* T2: taken from above. */ - ec_pow2(t2, t2, ctx); - ec_mulm(l3, t2, mpi_const(MPI_C_EIGHT), ctx); - - /* Y3 = L1(L2 - X3) - L3 */ - ec_subm(y3, l2, x3, ctx); - ec_mulm(y3, y3, l1, ctx); - ec_subm(y3, y3, l3, ctx); - } - -#undef x3 -#undef y3 -#undef z3 -#undef t1 -#undef t2 -#undef t3 -#undef l1 -#undef l2 -#undef l3 -} - -/* RESULT = 2 * POINT (Montgomery version). */ -static void dup_point_montgomery(MPI_POINT result, - MPI_POINT point, struct mpi_ec_ctx *ctx) -{ - (void)result; - (void)point; - (void)ctx; - log_fatal("%s: %s not yet supported\n", - "mpi_ec_dup_point", "Montgomery"); -} - -/* RESULT = 2 * POINT (Twisted Edwards version). */ -static void dup_point_edwards(MPI_POINT result, - MPI_POINT point, struct mpi_ec_ctx *ctx) -{ -#define X1 (point->x) -#define Y1 (point->y) -#define Z1 (point->z) -#define X3 (result->x) -#define Y3 (result->y) -#define Z3 (result->z) -#define B (ctx->t.scratch[0]) -#define C (ctx->t.scratch[1]) -#define D (ctx->t.scratch[2]) -#define E (ctx->t.scratch[3]) -#define F (ctx->t.scratch[4]) -#define H (ctx->t.scratch[5]) -#define J (ctx->t.scratch[6]) - - /* Compute: (X_3 : Y_3 : Z_3) = 2( X_1 : Y_1 : Z_1 ) */ - - /* B = (X_1 + Y_1)^2 */ - ctx->addm(B, X1, Y1, ctx); - ctx->pow2(B, B, ctx); - - /* C = X_1^2 */ - /* D = Y_1^2 */ - ctx->pow2(C, X1, ctx); - ctx->pow2(D, Y1, ctx); - - /* E = aC */ - if (ctx->dialect == ECC_DIALECT_ED25519) - ctx->subm(E, ctx->p, C, ctx); - else - ctx->mulm(E, ctx->a, C, ctx); - - /* F = E + D */ - ctx->addm(F, E, D, ctx); - - /* H = Z_1^2 */ - ctx->pow2(H, Z1, ctx); - - /* J = F - 2H */ - ctx->mul2(J, H, ctx); - ctx->subm(J, F, J, ctx); - - /* X_3 = (B - C - D) · J */ - ctx->subm(X3, B, C, ctx); - ctx->subm(X3, X3, D, ctx); - ctx->mulm(X3, X3, J, ctx); - - /* Y_3 = F · (E - D) */ - ctx->subm(Y3, E, D, ctx); - ctx->mulm(Y3, Y3, F, ctx); - - /* Z_3 = F · J */ - ctx->mulm(Z3, F, J, ctx); - -#undef X1 -#undef Y1 -#undef Z1 -#undef X3 -#undef Y3 -#undef Z3 -#undef B -#undef C -#undef D -#undef E -#undef F -#undef H -#undef J -} - -/* RESULT = 2 * POINT */ -static void -mpi_ec_dup_point(MPI_POINT result, MPI_POINT point, struct mpi_ec_ctx *ctx) -{ - switch (ctx->model) { - case MPI_EC_WEIERSTRASS: - dup_point_weierstrass(result, point, ctx); - break; - case MPI_EC_MONTGOMERY: - dup_point_montgomery(result, point, ctx); - break; - case MPI_EC_EDWARDS: - dup_point_edwards(result, point, ctx); - break; - } -} - -/* RESULT = P1 + P2 (Weierstrass version).*/ -static void add_points_weierstrass(MPI_POINT result, - MPI_POINT p1, MPI_POINT p2, - struct mpi_ec_ctx *ctx) -{ -#define x1 (p1->x) -#define y1 (p1->y) -#define z1 (p1->z) -#define x2 (p2->x) -#define y2 (p2->y) -#define z2 (p2->z) -#define x3 (result->x) -#define y3 (result->y) -#define z3 (result->z) -#define l1 (ctx->t.scratch[0]) -#define l2 (ctx->t.scratch[1]) -#define l3 (ctx->t.scratch[2]) -#define l4 (ctx->t.scratch[3]) -#define l5 (ctx->t.scratch[4]) -#define l6 (ctx->t.scratch[5]) -#define l7 (ctx->t.scratch[6]) -#define l8 (ctx->t.scratch[7]) -#define l9 (ctx->t.scratch[8]) -#define t1 (ctx->t.scratch[9]) -#define t2 (ctx->t.scratch[10]) - - if ((!mpi_cmp(x1, x2)) && (!mpi_cmp(y1, y2)) && (!mpi_cmp(z1, z2))) { - /* Same point; need to call the duplicate function. */ - mpi_ec_dup_point(result, p1, ctx); - } else if (!mpi_cmp_ui(z1, 0)) { - /* P1 is at infinity. */ - mpi_set(x3, p2->x); - mpi_set(y3, p2->y); - mpi_set(z3, p2->z); - } else if (!mpi_cmp_ui(z2, 0)) { - /* P2 is at infinity. */ - mpi_set(x3, p1->x); - mpi_set(y3, p1->y); - mpi_set(z3, p1->z); - } else { - int z1_is_one = !mpi_cmp_ui(z1, 1); - int z2_is_one = !mpi_cmp_ui(z2, 1); - - /* l1 = x1 z2^2 */ - /* l2 = x2 z1^2 */ - if (z2_is_one) - mpi_set(l1, x1); - else { - ec_pow2(l1, z2, ctx); - ec_mulm(l1, l1, x1, ctx); - } - if (z1_is_one) - mpi_set(l2, x2); - else { - ec_pow2(l2, z1, ctx); - ec_mulm(l2, l2, x2, ctx); - } - /* l3 = l1 - l2 */ - ec_subm(l3, l1, l2, ctx); - /* l4 = y1 z2^3 */ - ec_powm(l4, z2, mpi_const(MPI_C_THREE), ctx); - ec_mulm(l4, l4, y1, ctx); - /* l5 = y2 z1^3 */ - ec_powm(l5, z1, mpi_const(MPI_C_THREE), ctx); - ec_mulm(l5, l5, y2, ctx); - /* l6 = l4 - l5 */ - ec_subm(l6, l4, l5, ctx); - - if (!mpi_cmp_ui(l3, 0)) { - if (!mpi_cmp_ui(l6, 0)) { - /* P1 and P2 are the same - use duplicate function. */ - mpi_ec_dup_point(result, p1, ctx); - } else { - /* P1 is the inverse of P2. */ - mpi_set_ui(x3, 1); - mpi_set_ui(y3, 1); - mpi_set_ui(z3, 0); - } - } else { - /* l7 = l1 + l2 */ - ec_addm(l7, l1, l2, ctx); - /* l8 = l4 + l5 */ - ec_addm(l8, l4, l5, ctx); - /* z3 = z1 z2 l3 */ - ec_mulm(z3, z1, z2, ctx); - ec_mulm(z3, z3, l3, ctx); - /* x3 = l6^2 - l7 l3^2 */ - ec_pow2(t1, l6, ctx); - ec_pow2(t2, l3, ctx); - ec_mulm(t2, t2, l7, ctx); - ec_subm(x3, t1, t2, ctx); - /* l9 = l7 l3^2 - 2 x3 */ - ec_mul2(t1, x3, ctx); - ec_subm(l9, t2, t1, ctx); - /* y3 = (l9 l6 - l8 l3^3)/2 */ - ec_mulm(l9, l9, l6, ctx); - ec_powm(t1, l3, mpi_const(MPI_C_THREE), ctx); /* fixme: Use saved value*/ - ec_mulm(t1, t1, l8, ctx); - ec_subm(y3, l9, t1, ctx); - ec_mulm(y3, y3, ec_get_two_inv_p(ctx), ctx); - } - } - -#undef x1 -#undef y1 -#undef z1 -#undef x2 -#undef y2 -#undef z2 -#undef x3 -#undef y3 -#undef z3 -#undef l1 -#undef l2 -#undef l3 -#undef l4 -#undef l5 -#undef l6 -#undef l7 -#undef l8 -#undef l9 -#undef t1 -#undef t2 -} - -/* RESULT = P1 + P2 (Montgomery version).*/ -static void add_points_montgomery(MPI_POINT result, - MPI_POINT p1, MPI_POINT p2, - struct mpi_ec_ctx *ctx) -{ - (void)result; - (void)p1; - (void)p2; - (void)ctx; - log_fatal("%s: %s not yet supported\n", - "mpi_ec_add_points", "Montgomery"); -} - -/* RESULT = P1 + P2 (Twisted Edwards version).*/ -static void add_points_edwards(MPI_POINT result, - MPI_POINT p1, MPI_POINT p2, - struct mpi_ec_ctx *ctx) -{ -#define X1 (p1->x) -#define Y1 (p1->y) -#define Z1 (p1->z) -#define X2 (p2->x) -#define Y2 (p2->y) -#define Z2 (p2->z) -#define X3 (result->x) -#define Y3 (result->y) -#define Z3 (result->z) -#define A (ctx->t.scratch[0]) -#define B (ctx->t.scratch[1]) -#define C (ctx->t.scratch[2]) -#define D (ctx->t.scratch[3]) -#define E (ctx->t.scratch[4]) -#define F (ctx->t.scratch[5]) -#define G (ctx->t.scratch[6]) -#define tmp (ctx->t.scratch[7]) - - point_resize(result, ctx); - - /* Compute: (X_3 : Y_3 : Z_3) = (X_1 : Y_1 : Z_1) + (X_2 : Y_2 : Z_3) */ - - /* A = Z1 · Z2 */ - ctx->mulm(A, Z1, Z2, ctx); - - /* B = A^2 */ - ctx->pow2(B, A, ctx); - - /* C = X1 · X2 */ - ctx->mulm(C, X1, X2, ctx); - - /* D = Y1 · Y2 */ - ctx->mulm(D, Y1, Y2, ctx); - - /* E = d · C · D */ - ctx->mulm(E, ctx->b, C, ctx); - ctx->mulm(E, E, D, ctx); - - /* F = B - E */ - ctx->subm(F, B, E, ctx); - - /* G = B + E */ - ctx->addm(G, B, E, ctx); - - /* X_3 = A · F · ((X_1 + Y_1) · (X_2 + Y_2) - C - D) */ - ctx->addm(tmp, X1, Y1, ctx); - ctx->addm(X3, X2, Y2, ctx); - ctx->mulm(X3, X3, tmp, ctx); - ctx->subm(X3, X3, C, ctx); - ctx->subm(X3, X3, D, ctx); - ctx->mulm(X3, X3, F, ctx); - ctx->mulm(X3, X3, A, ctx); - - /* Y_3 = A · G · (D - aC) */ - if (ctx->dialect == ECC_DIALECT_ED25519) { - ctx->addm(Y3, D, C, ctx); - } else { - ctx->mulm(Y3, ctx->a, C, ctx); - ctx->subm(Y3, D, Y3, ctx); - } - ctx->mulm(Y3, Y3, G, ctx); - ctx->mulm(Y3, Y3, A, ctx); - - /* Z_3 = F · G */ - ctx->mulm(Z3, F, G, ctx); - - -#undef X1 -#undef Y1 -#undef Z1 -#undef X2 -#undef Y2 -#undef Z2 -#undef X3 -#undef Y3 -#undef Z3 -#undef A -#undef B -#undef C -#undef D -#undef E -#undef F -#undef G -#undef tmp -} - -/* Compute a step of Montgomery Ladder (only use X and Z in the point). - * Inputs: P1, P2, and x-coordinate of DIF = P1 - P1. - * Outputs: PRD = 2 * P1 and SUM = P1 + P2. - */ -static void montgomery_ladder(MPI_POINT prd, MPI_POINT sum, - MPI_POINT p1, MPI_POINT p2, MPI dif_x, - struct mpi_ec_ctx *ctx) -{ - ctx->addm(sum->x, p2->x, p2->z, ctx); - ctx->subm(p2->z, p2->x, p2->z, ctx); - ctx->addm(prd->x, p1->x, p1->z, ctx); - ctx->subm(p1->z, p1->x, p1->z, ctx); - ctx->mulm(p2->x, p1->z, sum->x, ctx); - ctx->mulm(p2->z, prd->x, p2->z, ctx); - ctx->pow2(p1->x, prd->x, ctx); - ctx->pow2(p1->z, p1->z, ctx); - ctx->addm(sum->x, p2->x, p2->z, ctx); - ctx->subm(p2->z, p2->x, p2->z, ctx); - ctx->mulm(prd->x, p1->x, p1->z, ctx); - ctx->subm(p1->z, p1->x, p1->z, ctx); - ctx->pow2(sum->x, sum->x, ctx); - ctx->pow2(sum->z, p2->z, ctx); - ctx->mulm(prd->z, p1->z, ctx->a, ctx); /* CTX->A: (a-2)/4 */ - ctx->mulm(sum->z, sum->z, dif_x, ctx); - ctx->addm(prd->z, p1->x, prd->z, ctx); - ctx->mulm(prd->z, prd->z, p1->z, ctx); -} - -/* RESULT = P1 + P2 */ -void mpi_ec_add_points(MPI_POINT result, - MPI_POINT p1, MPI_POINT p2, - struct mpi_ec_ctx *ctx) -{ - switch (ctx->model) { - case MPI_EC_WEIERSTRASS: - add_points_weierstrass(result, p1, p2, ctx); - break; - case MPI_EC_MONTGOMERY: - add_points_montgomery(result, p1, p2, ctx); - break; - case MPI_EC_EDWARDS: - add_points_edwards(result, p1, p2, ctx); - break; - } -} -EXPORT_SYMBOL_GPL(mpi_ec_add_points); - -/* Scalar point multiplication - the main function for ECC. If takes - * an integer SCALAR and a POINT as well as the usual context CTX. - * RESULT will be set to the resulting point. - */ -void mpi_ec_mul_point(MPI_POINT result, - MPI scalar, MPI_POINT point, - struct mpi_ec_ctx *ctx) -{ - MPI x1, y1, z1, k, h, yy; - unsigned int i, loops; - struct gcry_mpi_point p1, p2, p1inv; - - if (ctx->model == MPI_EC_EDWARDS) { - /* Simple left to right binary method. Algorithm 3.27 from - * {author={Hankerson, Darrel and Menezes, Alfred J. and Vanstone, Scott}, - * title = {Guide to Elliptic Curve Cryptography}, - * year = {2003}, isbn = {038795273X}, - * url = {http://www.cacr.math.uwaterloo.ca/ecc/}, - * publisher = {Springer-Verlag New York, Inc.}} - */ - unsigned int nbits; - int j; - - if (mpi_cmp(scalar, ctx->p) >= 0) - nbits = mpi_get_nbits(scalar); - else - nbits = mpi_get_nbits(ctx->p); - - mpi_set_ui(result->x, 0); - mpi_set_ui(result->y, 1); - mpi_set_ui(result->z, 1); - point_resize(point, ctx); - - point_resize(result, ctx); - point_resize(point, ctx); - - for (j = nbits-1; j >= 0; j--) { - mpi_ec_dup_point(result, result, ctx); - if (mpi_test_bit(scalar, j)) - mpi_ec_add_points(result, result, point, ctx); - } - return; - } else if (ctx->model == MPI_EC_MONTGOMERY) { - unsigned int nbits; - int j; - struct gcry_mpi_point p1_, p2_; - MPI_POINT q1, q2, prd, sum; - unsigned long sw; - mpi_size_t rsize; - - /* Compute scalar point multiplication with Montgomery Ladder. - * Note that we don't use Y-coordinate in the points at all. - * RESULT->Y will be filled by zero. - */ - - nbits = mpi_get_nbits(scalar); - point_init(&p1); - point_init(&p2); - point_init(&p1_); - point_init(&p2_); - mpi_set_ui(p1.x, 1); - mpi_free(p2.x); - p2.x = mpi_copy(point->x); - mpi_set_ui(p2.z, 1); - - point_resize(&p1, ctx); - point_resize(&p2, ctx); - point_resize(&p1_, ctx); - point_resize(&p2_, ctx); - - mpi_resize(point->x, ctx->p->nlimbs); - point->x->nlimbs = ctx->p->nlimbs; - - q1 = &p1; - q2 = &p2; - prd = &p1_; - sum = &p2_; - - for (j = nbits-1; j >= 0; j--) { - sw = mpi_test_bit(scalar, j); - point_swap_cond(q1, q2, sw, ctx); - montgomery_ladder(prd, sum, q1, q2, point->x, ctx); - point_swap_cond(prd, sum, sw, ctx); - swap(q1, prd); - swap(q2, sum); - } - - mpi_clear(result->y); - sw = (nbits & 1); - point_swap_cond(&p1, &p1_, sw, ctx); - - rsize = p1.z->nlimbs; - MPN_NORMALIZE(p1.z->d, rsize); - if (rsize == 0) { - mpi_set_ui(result->x, 1); - mpi_set_ui(result->z, 0); - } else { - z1 = mpi_new(0); - ec_invm(z1, p1.z, ctx); - ec_mulm(result->x, p1.x, z1, ctx); - mpi_set_ui(result->z, 1); - mpi_free(z1); - } - - point_free(&p1); - point_free(&p2); - point_free(&p1_); - point_free(&p2_); - return; - } - - x1 = mpi_alloc_like(ctx->p); - y1 = mpi_alloc_like(ctx->p); - h = mpi_alloc_like(ctx->p); - k = mpi_copy(scalar); - yy = mpi_copy(point->y); - - if (mpi_has_sign(k)) { - k->sign = 0; - ec_invm(yy, yy, ctx); - } - - if (!mpi_cmp_ui(point->z, 1)) { - mpi_set(x1, point->x); - mpi_set(y1, yy); - } else { - MPI z2, z3; - - z2 = mpi_alloc_like(ctx->p); - z3 = mpi_alloc_like(ctx->p); - ec_mulm(z2, point->z, point->z, ctx); - ec_mulm(z3, point->z, z2, ctx); - ec_invm(z2, z2, ctx); - ec_mulm(x1, point->x, z2, ctx); - ec_invm(z3, z3, ctx); - ec_mulm(y1, yy, z3, ctx); - mpi_free(z2); - mpi_free(z3); - } - z1 = mpi_copy(mpi_const(MPI_C_ONE)); - - mpi_mul(h, k, mpi_const(MPI_C_THREE)); /* h = 3k */ - loops = mpi_get_nbits(h); - if (loops < 2) { - /* If SCALAR is zero, the above mpi_mul sets H to zero and thus - * LOOPs will be zero. To avoid an underflow of I in the main - * loop we set LOOP to 2 and the result to (0,0,0). - */ - loops = 2; - mpi_clear(result->x); - mpi_clear(result->y); - mpi_clear(result->z); - } else { - mpi_set(result->x, point->x); - mpi_set(result->y, yy); - mpi_set(result->z, point->z); - } - mpi_free(yy); yy = NULL; - - p1.x = x1; x1 = NULL; - p1.y = y1; y1 = NULL; - p1.z = z1; z1 = NULL; - point_init(&p2); - point_init(&p1inv); - - /* Invert point: y = p - y mod p */ - point_set(&p1inv, &p1); - ec_subm(p1inv.y, ctx->p, p1inv.y, ctx); - - for (i = loops-2; i > 0; i--) { - mpi_ec_dup_point(result, result, ctx); - if (mpi_test_bit(h, i) == 1 && mpi_test_bit(k, i) == 0) { - point_set(&p2, result); - mpi_ec_add_points(result, &p2, &p1, ctx); - } - if (mpi_test_bit(h, i) == 0 && mpi_test_bit(k, i) == 1) { - point_set(&p2, result); - mpi_ec_add_points(result, &p2, &p1inv, ctx); - } - } - - point_free(&p1); - point_free(&p2); - point_free(&p1inv); - mpi_free(h); - mpi_free(k); -} -EXPORT_SYMBOL_GPL(mpi_ec_mul_point); - -/* Return true if POINT is on the curve described by CTX. */ -int mpi_ec_curve_point(MPI_POINT point, struct mpi_ec_ctx *ctx) -{ - int res = 0; - MPI x, y, w; - - x = mpi_new(0); - y = mpi_new(0); - w = mpi_new(0); - - /* Check that the point is in range. This needs to be done here and - * not after conversion to affine coordinates. - */ - if (mpi_cmpabs(point->x, ctx->p) >= 0) - goto leave; - if (mpi_cmpabs(point->y, ctx->p) >= 0) - goto leave; - if (mpi_cmpabs(point->z, ctx->p) >= 0) - goto leave; - - switch (ctx->model) { - case MPI_EC_WEIERSTRASS: - { - MPI xxx; - - if (mpi_ec_get_affine(x, y, point, ctx)) - goto leave; - - xxx = mpi_new(0); - - /* y^2 == x^3 + a·x + b */ - ec_pow2(y, y, ctx); - - ec_pow3(xxx, x, ctx); - ec_mulm(w, ctx->a, x, ctx); - ec_addm(w, w, ctx->b, ctx); - ec_addm(w, w, xxx, ctx); - - if (!mpi_cmp(y, w)) - res = 1; - - mpi_free(xxx); - } - break; - - case MPI_EC_MONTGOMERY: - { -#define xx y - /* With Montgomery curve, only X-coordinate is valid. */ - if (mpi_ec_get_affine(x, NULL, point, ctx)) - goto leave; - - /* The equation is: b * y^2 == x^3 + a · x^2 + x */ - /* We check if right hand is quadratic residue or not by - * Euler's criterion. - */ - /* CTX->A has (a-2)/4 and CTX->B has b^-1 */ - ec_mulm(w, ctx->a, mpi_const(MPI_C_FOUR), ctx); - ec_addm(w, w, mpi_const(MPI_C_TWO), ctx); - ec_mulm(w, w, x, ctx); - ec_pow2(xx, x, ctx); - ec_addm(w, w, xx, ctx); - ec_addm(w, w, mpi_const(MPI_C_ONE), ctx); - ec_mulm(w, w, x, ctx); - ec_mulm(w, w, ctx->b, ctx); -#undef xx - /* Compute Euler's criterion: w^(p-1)/2 */ -#define p_minus1 y - ec_subm(p_minus1, ctx->p, mpi_const(MPI_C_ONE), ctx); - mpi_rshift(p_minus1, p_minus1, 1); - ec_powm(w, w, p_minus1, ctx); - - res = !mpi_cmp_ui(w, 1); -#undef p_minus1 - } - break; - - case MPI_EC_EDWARDS: - { - if (mpi_ec_get_affine(x, y, point, ctx)) - goto leave; - - mpi_resize(w, ctx->p->nlimbs); - w->nlimbs = ctx->p->nlimbs; - - /* a · x^2 + y^2 - 1 - b · x^2 · y^2 == 0 */ - ctx->pow2(x, x, ctx); - ctx->pow2(y, y, ctx); - if (ctx->dialect == ECC_DIALECT_ED25519) - ctx->subm(w, ctx->p, x, ctx); - else - ctx->mulm(w, ctx->a, x, ctx); - ctx->addm(w, w, y, ctx); - ctx->mulm(x, x, y, ctx); - ctx->mulm(x, x, ctx->b, ctx); - ctx->subm(w, w, x, ctx); - if (!mpi_cmp_ui(w, 1)) - res = 1; - } - break; - } - -leave: - mpi_free(w); - mpi_free(x); - mpi_free(y); - - return res; -} -EXPORT_SYMBOL_GPL(mpi_ec_curve_point); -- cgit v1.2.3 From ac9d45544cd571decca395715d0b0a3b617d02f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 1 Jul 2024 13:33:58 -0700 Subject: locking/csd_lock: Provide an indication of ongoing CSD-lock stall If a CSD-lock stall goes on long enough, it will cause an RCU CPU stall warning. This additional warning provides much additional console-log traffic and little additional information. Therefore, provide a new csd_lock_is_stuck() function that returns true if there is an ongoing CSD-lock stall. This function will be used by the RCU CPU stall warnings to provide a one-line indication of the stall when this function returns true. [ neeraj.upadhyay: Apply Rik van Riel feedback. ] [ neeraj.upadhyay: Apply kernel test robot feedback. ] Signed-off-by: Paul E. McKenney Cc: Imran Khan Cc: Ingo Molnar Cc: Leonardo Bras Cc: "Peter Zijlstra (Intel)" Cc: Rik van Riel Signed-off-by: Neeraj Upadhyay --- include/linux/smp.h | 6 ++++++ kernel/smp.c | 16 ++++++++++++++++ lib/Kconfig.debug | 1 + 3 files changed, 23 insertions(+) (limited to 'lib') diff --git a/include/linux/smp.h b/include/linux/smp.h index fcd61dfe2af3..3871bd32018f 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -294,4 +294,10 @@ int smpcfd_prepare_cpu(unsigned int cpu); int smpcfd_dead_cpu(unsigned int cpu); int smpcfd_dying_cpu(unsigned int cpu); +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG +bool csd_lock_is_stuck(void); +#else +static inline bool csd_lock_is_stuck(void) { return false; } +#endif + #endif /* __LINUX_SMP_H */ diff --git a/kernel/smp.c b/kernel/smp.c index e87953729230..202cda4d2a55 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -208,6 +208,19 @@ static int csd_lock_wait_getcpu(call_single_data_t *csd) return -1; } +static atomic_t n_csd_lock_stuck; + +/** + * csd_lock_is_stuck - Has a CSD-lock acquisition been stuck too long? + * + * Returns @true if a CSD-lock acquisition is stuck and has been stuck + * long enough for a "non-responsive CSD lock" message to be printed. + */ +bool csd_lock_is_stuck(void) +{ + return !!atomic_read(&n_csd_lock_stuck); +} + /* * Complain if too much time spent waiting. Note that only * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, @@ -229,6 +242,7 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in cpu = csd_lock_wait_getcpu(csd); pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", *bug_id, raw_smp_processor_id(), cpu); + atomic_dec(&n_csd_lock_stuck); return true; } @@ -252,6 +266,8 @@ static bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, in pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %lld ns for CPU#%02d %pS(%ps).\n", firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), (s64)ts_delta, cpu, csd->func, csd->info); + if (firsttime) + atomic_inc(&n_csd_lock_stuck); /* * If the CSD lock is still stuck after 5 minutes, it is unlikely * to become unstuck. Use a signed comparison to avoid triggering diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a30c03a66172..4e5f61cba8e4 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1614,6 +1614,7 @@ config SCF_TORTURE_TEST config CSD_LOCK_WAIT_DEBUG bool "Debugging for csd_lock_wait(), called from smp_call_function*()" depends on DEBUG_KERNEL + depends on SMP depends on 64BIT default n help -- cgit v1.2.3 From 4e9903b0861c9df3464b82db4a7025863bac1897 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 28 Jul 2024 00:02:36 +0900 Subject: fortify: refactor test_fortify Makefile to fix some build problems There are some issues in the test_fortify Makefile code. Problem 1: cc-disable-warning invokes compiler dozens of times To see how many times the cc-disable-warning is evaluated, change this code: $(call cc-disable-warning,fortify-source) to: $(call cc-disable-warning,$(shell touch /tmp/fortify-$$$$)fortify-source) Then, build the kernel with CONFIG_FORTIFY_SOURCE=y. You will see a large number of '/tmp/fortify-' files created: $ ls -1 /tmp/fortify-* | wc 80 80 1600 This means the compiler was invoked 80 times just for checking the -Wno-fortify-source flag support. $(call cc-disable-warning,fortify-source) should be added to a simple variable instead of a recursive variable. Problem 2: do not recompile string.o when the test code is updated The test cases are independent of the kernel. However, when the test code is updated, $(obj)/string.o is rebuilt and vmlinux is relinked due to this dependency: $(obj)/string.o: $(obj)/$(TEST_FORTIFY_LOG) always-y is suitable for building the log files. Problem 3: redundant code clean-files += $(addsuffix .o, $(TEST_FORTIFY_LOGS)) ... is unneeded because the top Makefile globally cleans *.o files. This commit fixes these issues and makes the code readable. Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20240727150302.1823750-2-masahiroy@kernel.org Signed-off-by: Kees Cook --- lib/.gitignore | 2 -- lib/Makefile | 38 +------------------------------------- lib/test_fortify/.gitignore | 2 ++ lib/test_fortify/Makefile | 28 ++++++++++++++++++++++++++++ scripts/remove-stale-files | 2 ++ 5 files changed, 33 insertions(+), 39 deletions(-) create mode 100644 lib/test_fortify/.gitignore create mode 100644 lib/test_fortify/Makefile (limited to 'lib') diff --git a/lib/.gitignore b/lib/.gitignore index 54596b634ecb..101a4aa92fb5 100644 --- a/lib/.gitignore +++ b/lib/.gitignore @@ -5,5 +5,3 @@ /gen_crc32table /gen_crc64table /oid_registry_data.c -/test_fortify.log -/test_fortify/*.log diff --git a/lib/Makefile b/lib/Makefile index 322bb127b4dc..4df3c28b23b4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -393,40 +393,4 @@ obj-$(CONFIG_GENERIC_LIB_DEVMEM_IS_ALLOWED) += devmem_is_allowed.o obj-$(CONFIG_FIRMWARE_TABLE) += fw_table.o -# FORTIFY_SOURCE compile-time behavior tests -TEST_FORTIFY_SRCS = $(wildcard $(src)/test_fortify/*-*.c) -TEST_FORTIFY_LOGS = $(patsubst $(src)/%.c, %.log, $(TEST_FORTIFY_SRCS)) -TEST_FORTIFY_LOG = test_fortify.log - -quiet_cmd_test_fortify = TEST $@ - cmd_test_fortify = $(CONFIG_SHELL) $(srctree)/scripts/test_fortify.sh \ - $< $@ "$(NM)" $(CC) $(c_flags) \ - $(call cc-disable-warning,fortify-source) \ - -DKBUILD_EXTRA_WARN1 - -targets += $(TEST_FORTIFY_LOGS) -clean-files += $(TEST_FORTIFY_LOGS) -clean-files += $(addsuffix .o, $(TEST_FORTIFY_LOGS)) -$(obj)/test_fortify/%.log: $(src)/test_fortify/%.c \ - $(src)/test_fortify/test_fortify.h \ - $(srctree)/include/linux/fortify-string.h \ - $(srctree)/scripts/test_fortify.sh \ - FORCE - $(call if_changed,test_fortify) - -quiet_cmd_gen_fortify_log = GEN $@ - cmd_gen_fortify_log = cat /dev/null > $@ || true - -targets += $(TEST_FORTIFY_LOG) -clean-files += $(TEST_FORTIFY_LOG) -$(obj)/$(TEST_FORTIFY_LOG): $(addprefix $(obj)/, $(TEST_FORTIFY_LOGS)) FORCE - $(call if_changed,gen_fortify_log) - -# Fake dependency to trigger the fortify tests. -ifeq ($(CONFIG_FORTIFY_SOURCE),y) -$(obj)/string.o: $(obj)/$(TEST_FORTIFY_LOG) -endif - -# Some architectures define __NO_FORTIFY if __SANITIZE_ADDRESS__ is undefined. -# Pass CFLAGS_KASAN to avoid warnings. -$(foreach x, $(patsubst %.log,%.o,$(TEST_FORTIFY_LOGS)), $(eval KASAN_SANITIZE_$(x) := y)) +subdir-$(CONFIG_FORTIFY_SOURCE) += test_fortify diff --git a/lib/test_fortify/.gitignore b/lib/test_fortify/.gitignore new file mode 100644 index 000000000000..c1ba37d14b50 --- /dev/null +++ b/lib/test_fortify/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +/*.log diff --git a/lib/test_fortify/Makefile b/lib/test_fortify/Makefile new file mode 100644 index 000000000000..3907a2242ef9 --- /dev/null +++ b/lib/test_fortify/Makefile @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: GPL-2.0 + +ccflags-y := $(call cc-disable-warning,fortify-source) + +quiet_cmd_test_fortify = TEST $@ + cmd_test_fortify = $(CONFIG_SHELL) $(srctree)/scripts/test_fortify.sh \ + $< $@ "$(NM)" $(CC) $(c_flags) -DKBUILD_EXTRA_WARN1 + +$(obj)/%.log: $(src)/%.c $(srctree)/scripts/test_fortify.sh \ + $(src)/test_fortify.h \ + $(srctree)/include/linux/fortify-string.h \ + FORCE + $(call if_changed,test_fortify) + +logs = $(patsubst $(src)/%.c, %.log, $(wildcard $(src)/*-*.c)) +targets += $(logs) + +quiet_cmd_gen_fortify_log = CAT $@ + cmd_gen_fortify_log = cat $(or $(real-prereqs),/dev/null) > $@ + +$(obj)/test_fortify.log: $(addprefix $(obj)/, $(logs)) FORCE + $(call if_changed,gen_fortify_log) + +always-y += test_fortify.log + +# Some architectures define __NO_FORTIFY if __SANITIZE_ADDRESS__ is undefined. +# Pass CFLAGS_KASAN to avoid warnings. +KASAN_SANITIZE := y diff --git a/scripts/remove-stale-files b/scripts/remove-stale-files index f38d26b78c2a..8fc55a749ccc 100755 --- a/scripts/remove-stale-files +++ b/scripts/remove-stale-files @@ -21,3 +21,5 @@ set -e # then will be really dead and removed from the code base entirely. rm -f *.spec + +rm -f lib/test_fortify.log -- cgit v1.2.3 From 5a8d0c46c9e024bed4805a9335fe6124d8a78d3a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 28 Jul 2024 00:02:37 +0900 Subject: fortify: move test_fortify.sh to lib/test_fortify/ This script is only used in lib/test_fortify/. There is no reason to keep it in scripts/. Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20240727150302.1823750-3-masahiroy@kernel.org Signed-off-by: Kees Cook --- MAINTAINERS | 1 - lib/test_fortify/Makefile | 4 +-- lib/test_fortify/test_fortify.sh | 66 ++++++++++++++++++++++++++++++++++++++++ scripts/test_fortify.sh | 66 ---------------------------------------- 4 files changed, 68 insertions(+), 69 deletions(-) create mode 100644 lib/test_fortify/test_fortify.sh delete mode 100644 scripts/test_fortify.sh (limited to 'lib') diff --git a/MAINTAINERS b/MAINTAINERS index 8766f3e5e87e..36c0af94cf08 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8772,7 +8772,6 @@ F: include/linux/fortify-string.h F: lib/fortify_kunit.c F: lib/memcpy_kunit.c F: lib/test_fortify/* -F: scripts/test_fortify.sh K: \b__NO_FORTIFY\b FPGA DFL DRIVERS diff --git a/lib/test_fortify/Makefile b/lib/test_fortify/Makefile index 3907a2242ef9..1826172c32d4 100644 --- a/lib/test_fortify/Makefile +++ b/lib/test_fortify/Makefile @@ -3,10 +3,10 @@ ccflags-y := $(call cc-disable-warning,fortify-source) quiet_cmd_test_fortify = TEST $@ - cmd_test_fortify = $(CONFIG_SHELL) $(srctree)/scripts/test_fortify.sh \ + cmd_test_fortify = $(CONFIG_SHELL) $(src)/test_fortify.sh \ $< $@ "$(NM)" $(CC) $(c_flags) -DKBUILD_EXTRA_WARN1 -$(obj)/%.log: $(src)/%.c $(srctree)/scripts/test_fortify.sh \ +$(obj)/%.log: $(src)/%.c $(src)/test_fortify.sh \ $(src)/test_fortify.h \ $(srctree)/include/linux/fortify-string.h \ FORCE diff --git a/lib/test_fortify/test_fortify.sh b/lib/test_fortify/test_fortify.sh new file mode 100644 index 000000000000..c2688ab8281d --- /dev/null +++ b/lib/test_fortify/test_fortify.sh @@ -0,0 +1,66 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-only +set -e + +# Argument 1: Source file to build. +IN="$1" +shift +# Extract just the filename for error messages below. +FILE="${IN##*/}" +# Extract the function name for error messages below. +FUNC="${FILE#*-}" +FUNC="${FUNC%%-*}" +FUNC="${FUNC%%.*}" +# Extract the symbol to test for in build/symbol test below. +WANT="__${FILE%%-*}" + +# Argument 2: Where to write the build log. +OUT="$1" +shift +TMP="${OUT}.tmp" + +# Argument 3: Path to "nm" tool. +NM="$1" +shift + +# Remaining arguments are: $(CC) $(c_flags) + +# Clean up temporary file at exit. +__cleanup() { + rm -f "$TMP" +} +trap __cleanup EXIT + +# Function names in warnings are wrapped in backticks under UTF-8 locales. +# Run the commands with LANG=C so that grep output will not change. +export LANG=C + +status= +# Attempt to build a source that is expected to fail with a specific warning. +if "$@" -Werror -c "$IN" -o "$OUT".o 2> "$TMP" ; then + # If the build succeeds, either the test has failed or the + # warning may only happen at link time (Clang). In that case, + # make sure the expected symbol is unresolved in the symbol list. + # If so, FORTIFY is working for this case. + if ! $NM -A "$OUT".o | grep -m1 "\bU ${WANT}$" >>"$TMP" ; then + status="warning: unsafe ${FUNC}() usage lacked '$WANT' symbol in $IN" + fi +else + # If the build failed, check for the warning in the stderr. + # GCC: + # ./include/linux/fortify-string.h:316:25: error: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning] + # Clang 14: + # ./include/linux/fortify-string.h:316:4: error: call to __write_overflow_field declared with 'warning' attribute: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror,-Wattribute-warning] + if ! grep -Eq -m1 "error: call to .?\b${WANT}\b.?" "$TMP" ; then + status="warning: unsafe ${FUNC}() usage lacked '$WANT' warning in $IN" + fi +fi + +if [ -n "$status" ]; then + # Report on failure results, including compilation warnings. + echo "$status" | tee "$OUT" >&2 +else + # Report on good results, and save any compilation output to log. + echo "ok: unsafe ${FUNC}() usage correctly detected with '$WANT' in $IN" >"$OUT" +fi +cat "$TMP" >>"$OUT" diff --git a/scripts/test_fortify.sh b/scripts/test_fortify.sh deleted file mode 100644 index c2688ab8281d..000000000000 --- a/scripts/test_fortify.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/sh -# SPDX-License-Identifier: GPL-2.0-only -set -e - -# Argument 1: Source file to build. -IN="$1" -shift -# Extract just the filename for error messages below. -FILE="${IN##*/}" -# Extract the function name for error messages below. -FUNC="${FILE#*-}" -FUNC="${FUNC%%-*}" -FUNC="${FUNC%%.*}" -# Extract the symbol to test for in build/symbol test below. -WANT="__${FILE%%-*}" - -# Argument 2: Where to write the build log. -OUT="$1" -shift -TMP="${OUT}.tmp" - -# Argument 3: Path to "nm" tool. -NM="$1" -shift - -# Remaining arguments are: $(CC) $(c_flags) - -# Clean up temporary file at exit. -__cleanup() { - rm -f "$TMP" -} -trap __cleanup EXIT - -# Function names in warnings are wrapped in backticks under UTF-8 locales. -# Run the commands with LANG=C so that grep output will not change. -export LANG=C - -status= -# Attempt to build a source that is expected to fail with a specific warning. -if "$@" -Werror -c "$IN" -o "$OUT".o 2> "$TMP" ; then - # If the build succeeds, either the test has failed or the - # warning may only happen at link time (Clang). In that case, - # make sure the expected symbol is unresolved in the symbol list. - # If so, FORTIFY is working for this case. - if ! $NM -A "$OUT".o | grep -m1 "\bU ${WANT}$" >>"$TMP" ; then - status="warning: unsafe ${FUNC}() usage lacked '$WANT' symbol in $IN" - fi -else - # If the build failed, check for the warning in the stderr. - # GCC: - # ./include/linux/fortify-string.h:316:25: error: call to '__write_overflow_field' declared with attribute warning: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror=attribute-warning] - # Clang 14: - # ./include/linux/fortify-string.h:316:4: error: call to __write_overflow_field declared with 'warning' attribute: detected write beyond size of field (1st parameter); maybe use struct_group()? [-Werror,-Wattribute-warning] - if ! grep -Eq -m1 "error: call to .?\b${WANT}\b.?" "$TMP" ; then - status="warning: unsafe ${FUNC}() usage lacked '$WANT' warning in $IN" - fi -fi - -if [ -n "$status" ]; then - # Report on failure results, including compilation warnings. - echo "$status" | tee "$OUT" >&2 -else - # Report on good results, and save any compilation output to log. - echo "ok: unsafe ${FUNC}() usage correctly detected with '$WANT' in $IN" >"$OUT" -fi -cat "$TMP" >>"$OUT" -- cgit v1.2.3 From 9c6b7fbbd7a2c2772a592adf9b2835371927a1d3 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sun, 28 Jul 2024 00:02:38 +0900 Subject: fortify: use if_changed_dep to record header dependency in *.cmd files After building with CONFIG_FORTIFY_SOURCE=y, many .*.d files are left in lib/test_fortify/ because the compiler outputs header dependencies into *.d without fixdep being invoked. When compiling C files, if_changed_dep should be used so that the auto-generated header dependencies are recorded in .*.cmd files. Currently, if_changed is incorrectly used, and only two headers are hard-coded in lib/Makefile. In the previous patch version, the kbuild test robot detected new errors on GCC 7. GCC 7 or older does not produce test.d with the following test code: $ echo 'void b(void) __attribute__((__error__(""))); void a(void) { b(); }' | gcc -Wp,-MMD,test.d -c -o /dev/null -x c - Perhaps, this was a bug that existed in older GCC versions. Skip the tests for GCC<=7 for now, as this will be eventually solved when we bump the minimal supported GCC version. Link: https://lore.kernel.org/oe-kbuild-all/CAK7LNARmJcyyzL-jVJfBPi3W684LTDmuhMf1koF0TXoCpKTmcw@mail.gmail.com/T/#m13771bf78ae21adff22efc4d310c973fb4bcaf67 Signed-off-by: Masahiro Yamada Link: https://lore.kernel.org/r/20240727150302.1823750-4-masahiroy@kernel.org Signed-off-by: Kees Cook --- lib/test_fortify/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/test_fortify/Makefile b/lib/test_fortify/Makefile index 1826172c32d4..1c3f82ad8bb2 100644 --- a/lib/test_fortify/Makefile +++ b/lib/test_fortify/Makefile @@ -6,11 +6,8 @@ quiet_cmd_test_fortify = TEST $@ cmd_test_fortify = $(CONFIG_SHELL) $(src)/test_fortify.sh \ $< $@ "$(NM)" $(CC) $(c_flags) -DKBUILD_EXTRA_WARN1 -$(obj)/%.log: $(src)/%.c $(src)/test_fortify.sh \ - $(src)/test_fortify.h \ - $(srctree)/include/linux/fortify-string.h \ - FORCE - $(call if_changed,test_fortify) +$(obj)/%.log: $(src)/%.c $(src)/test_fortify.sh FORCE + $(call if_changed_dep,test_fortify) logs = $(patsubst $(src)/%.c, %.log, $(wildcard $(src)/*-*.c)) targets += $(logs) @@ -21,7 +18,10 @@ quiet_cmd_gen_fortify_log = CAT $@ $(obj)/test_fortify.log: $(addprefix $(obj)/, $(logs)) FORCE $(call if_changed,gen_fortify_log) -always-y += test_fortify.log +# GCC<=7 does not always produce *.d files. +# Run the tests only for GCC>=8 or Clang. +always-$(call gcc-min-version, 80000) += test_fortify.log +always-$(CONFIG_CC_IS_CLANG) += test_fortify.log # Some architectures define __NO_FORTIFY if __SANITIZE_ADDRESS__ is undefined. # Pass CFLAGS_KASAN to avoid warnings. -- cgit v1.2.3 From bbf3c7ff9dfa45be51500d23a1276991a7cd8c6e Mon Sep 17 00:00:00 2001 From: Justin Stitt Date: Thu, 8 Aug 2024 14:43:56 -0700 Subject: lib/string_helpers: rework overflow-dependent code When @size is 0, the desired behavior is to allow unlimited bytes to be parsed. Currently, this relies on some intentional arithmetic overflow where --size gives us SIZE_MAX when size is 0. Explicitly spell out the desired behavior without relying on intentional overflow/underflow. Signed-off-by: Justin Stitt Link: https://lore.kernel.org/r/20240808-b4-string_helpers_caa133-v1-1-686a455167c4@google.com Signed-off-by: Kees Cook --- lib/string_helpers.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'lib') diff --git a/lib/string_helpers.c b/lib/string_helpers.c index 69ba49b853c7..4f887aa62fa0 100644 --- a/lib/string_helpers.c +++ b/lib/string_helpers.c @@ -321,6 +321,9 @@ int string_unescape(char *src, char *dst, size_t size, unsigned int flags) { char *out = dst; + if (!size) + size = SIZE_MAX; + while (*src && --size) { if (src[0] == '\\' && src[1] != '\0' && size > 1) { src++; -- cgit v1.2.3 From fca5cb4dd2b4a9423cb6d112cc71c33899955a1f Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 10 Aug 2024 14:20:55 +0800 Subject: Revert "lib/mpi: Extend the MPI library" This partially reverts commit a8ea8bdd9df92a0e5db5b43900abb7a288b8a53e. Most of it is no longer needed since sm2 has been removed. However, the following functions have been kept as they have developed other uses: mpi_copy mpi_mod mpi_test_bit mpi_set_bit mpi_rshift mpi_add mpi_sub mpi_addm mpi_subm mpi_mul mpi_mulm mpi_tdiv_r mpi_fdiv_r Signed-off-by: Herbert Xu --- include/linux/mpi.h | 65 -------- lib/crypto/mpi/Makefile | 1 - lib/crypto/mpi/mpi-add.c | 51 ------- lib/crypto/mpi/mpi-bit.c | 143 ------------------ lib/crypto/mpi/mpi-cmp.c | 46 ++---- lib/crypto/mpi/mpi-div.c | 29 ---- lib/crypto/mpi/mpi-internal.h | 10 -- lib/crypto/mpi/mpi-inv.c | 143 ------------------ lib/crypto/mpi/mpi-mod.c | 144 ------------------ lib/crypto/mpi/mpicoder.c | 336 ------------------------------------------ lib/crypto/mpi/mpih-mul.c | 25 ---- lib/crypto/mpi/mpiutil.c | 182 ----------------------- 12 files changed, 10 insertions(+), 1165 deletions(-) delete mode 100644 lib/crypto/mpi/mpi-inv.c (limited to 'lib') diff --git a/include/linux/mpi.h b/include/linux/mpi.h index 89b720893e12..e081428b91ef 100644 --- a/include/linux/mpi.h +++ b/include/linux/mpi.h @@ -40,87 +40,33 @@ struct gcry_mpi { typedef struct gcry_mpi *MPI; #define mpi_get_nlimbs(a) ((a)->nlimbs) -#define mpi_has_sign(a) ((a)->sign) /*-- mpiutil.c --*/ MPI mpi_alloc(unsigned nlimbs); -void mpi_clear(MPI a); void mpi_free(MPI a); int mpi_resize(MPI a, unsigned nlimbs); -static inline MPI mpi_new(unsigned int nbits) -{ - return mpi_alloc((nbits + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB); -} - MPI mpi_copy(MPI a); -MPI mpi_alloc_like(MPI a); -void mpi_snatch(MPI w, MPI u); -MPI mpi_set(MPI w, MPI u); -MPI mpi_set_ui(MPI w, unsigned long u); -MPI mpi_alloc_set_ui(unsigned long u); -void mpi_swap_cond(MPI a, MPI b, unsigned long swap); - -/* Constants used to return constant MPIs. See mpi_init if you - * want to add more constants. - */ -#define MPI_NUMBER_OF_CONSTANTS 6 -enum gcry_mpi_constants { - MPI_C_ZERO, - MPI_C_ONE, - MPI_C_TWO, - MPI_C_THREE, - MPI_C_FOUR, - MPI_C_EIGHT -}; - -MPI mpi_const(enum gcry_mpi_constants no); /*-- mpicoder.c --*/ - -/* Different formats of external big integer representation. */ -enum gcry_mpi_format { - GCRYMPI_FMT_NONE = 0, - GCRYMPI_FMT_STD = 1, /* Twos complement stored without length. */ - GCRYMPI_FMT_PGP = 2, /* As used by OpenPGP (unsigned only). */ - GCRYMPI_FMT_SSH = 3, /* As used by SSH (like STD but with length). */ - GCRYMPI_FMT_HEX = 4, /* Hex format. */ - GCRYMPI_FMT_USG = 5, /* Like STD but unsigned. */ - GCRYMPI_FMT_OPAQUE = 8 /* Opaque format (some functions only). */ -}; - MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes); MPI mpi_read_from_buffer(const void *buffer, unsigned *ret_nread); -int mpi_fromstr(MPI val, const char *str); -MPI mpi_scanval(const char *string); MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len); void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign); int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes, int *sign); int mpi_write_to_sgl(MPI a, struct scatterlist *sg, unsigned nbytes, int *sign); -int mpi_print(enum gcry_mpi_format format, unsigned char *buffer, - size_t buflen, size_t *nwritten, MPI a); /*-- mpi-mod.c --*/ void mpi_mod(MPI rem, MPI dividend, MPI divisor); -/* Context used with Barrett reduction. */ -struct barrett_ctx_s; -typedef struct barrett_ctx_s *mpi_barrett_t; - -mpi_barrett_t mpi_barrett_init(MPI m, int copy); -void mpi_barrett_free(mpi_barrett_t ctx); -void mpi_mod_barrett(MPI r, MPI x, mpi_barrett_t ctx); -void mpi_mul_barrett(MPI w, MPI u, MPI v, mpi_barrett_t ctx); - /*-- mpi-pow.c --*/ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod); /*-- mpi-cmp.c --*/ int mpi_cmp_ui(MPI u, ulong v); int mpi_cmp(MPI u, MPI v); -int mpi_cmpabs(MPI u, MPI v); /*-- mpi-sub-ui.c --*/ int mpi_sub_ui(MPI w, MPI u, unsigned long vval); @@ -130,16 +76,9 @@ void mpi_normalize(MPI a); unsigned mpi_get_nbits(MPI a); int mpi_test_bit(MPI a, unsigned int n); void mpi_set_bit(MPI a, unsigned int n); -void mpi_set_highbit(MPI a, unsigned int n); -void mpi_clear_highbit(MPI a, unsigned int n); -void mpi_clear_bit(MPI a, unsigned int n); -void mpi_rshift_limbs(MPI a, unsigned int count); void mpi_rshift(MPI x, MPI a, unsigned int n); -void mpi_lshift_limbs(MPI a, unsigned int count); -void mpi_lshift(MPI x, MPI a, unsigned int n); /*-- mpi-add.c --*/ -void mpi_add_ui(MPI w, MPI u, unsigned long v); void mpi_add(MPI w, MPI u, MPI v); void mpi_sub(MPI w, MPI u, MPI v); void mpi_addm(MPI w, MPI u, MPI v, MPI m); @@ -152,10 +91,6 @@ void mpi_mulm(MPI w, MPI u, MPI v, MPI m); /*-- mpi-div.c --*/ void mpi_tdiv_r(MPI rem, MPI num, MPI den); void mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor); -void mpi_fdiv_q(MPI quot, MPI dividend, MPI divisor); - -/*-- mpi-inv.c --*/ -int mpi_invm(MPI x, MPI a, MPI n); /* inline functions */ diff --git a/lib/crypto/mpi/Makefile b/lib/crypto/mpi/Makefile index 477debd7ed50..9ad84079025a 100644 --- a/lib/crypto/mpi/Makefile +++ b/lib/crypto/mpi/Makefile @@ -19,7 +19,6 @@ mpi-y = \ mpi-cmp.o \ mpi-sub-ui.o \ mpi-div.o \ - mpi-inv.o \ mpi-mod.o \ mpi-mul.o \ mpih-cmp.o \ diff --git a/lib/crypto/mpi/mpi-add.c b/lib/crypto/mpi/mpi-add.c index 9056fc5167fc..b47c8c35f5fe 100644 --- a/lib/crypto/mpi/mpi-add.c +++ b/lib/crypto/mpi/mpi-add.c @@ -13,57 +13,6 @@ #include "mpi-internal.h" -/**************** - * Add the unsigned integer V to the mpi-integer U and store the - * result in W. U and V may be the same. - */ -void mpi_add_ui(MPI w, MPI u, unsigned long v) -{ - mpi_ptr_t wp, up; - mpi_size_t usize, wsize; - int usign, wsign; - - usize = u->nlimbs; - usign = u->sign; - wsign = 0; - - /* If not space for W (and possible carry), increase space. */ - wsize = usize + 1; - if (w->alloced < wsize) - mpi_resize(w, wsize); - - /* These must be after realloc (U may be the same as W). */ - up = u->d; - wp = w->d; - - if (!usize) { /* simple */ - wp[0] = v; - wsize = v ? 1:0; - } else if (!usign) { /* mpi is not negative */ - mpi_limb_t cy; - cy = mpihelp_add_1(wp, up, usize, v); - wp[usize] = cy; - wsize = usize + cy; - } else { - /* The signs are different. Need exact comparison to determine - * which operand to subtract from which. - */ - if (usize == 1 && up[0] < v) { - wp[0] = v - up[0]; - wsize = 1; - } else { - mpihelp_sub_1(wp, up, usize, v); - /* Size can decrease with at most one limb. */ - wsize = usize - (wp[usize-1] == 0); - wsign = 1; - } - } - - w->nlimbs = wsize; - w->sign = wsign; -} - - void mpi_add(MPI w, MPI u, MPI v) { mpi_ptr_t wp, up, vp; diff --git a/lib/crypto/mpi/mpi-bit.c b/lib/crypto/mpi/mpi-bit.c index e08fc202ea5c..c29b85362664 100644 --- a/lib/crypto/mpi/mpi-bit.c +++ b/lib/crypto/mpi/mpi-bit.c @@ -32,7 +32,6 @@ void mpi_normalize(MPI a) for (; a->nlimbs && !a->d[a->nlimbs - 1]; a->nlimbs--) ; } -EXPORT_SYMBOL_GPL(mpi_normalize); /**************** * Return the number of bits in A. @@ -93,85 +92,6 @@ void mpi_set_bit(MPI a, unsigned int n) a->d[limbno] |= (A_LIMB_1<= a->nlimbs) { - for (i = a->nlimbs; i < a->alloced; i++) - a->d[i] = 0; - mpi_resize(a, limbno+1); - a->nlimbs = limbno+1; - } - a->d[limbno] |= (A_LIMB_1<d[limbno] &= ~(A_LIMB_1 << bitno); - a->nlimbs = limbno+1; -} -EXPORT_SYMBOL_GPL(mpi_set_highbit); - -/**************** - * clear bit N of A and all bits above - */ -void mpi_clear_highbit(MPI a, unsigned int n) -{ - unsigned int limbno, bitno; - - limbno = n / BITS_PER_MPI_LIMB; - bitno = n % BITS_PER_MPI_LIMB; - - if (limbno >= a->nlimbs) - return; /* not allocated, therefore no need to clear bits :-) */ - - for ( ; bitno < BITS_PER_MPI_LIMB; bitno++) - a->d[limbno] &= ~(A_LIMB_1 << bitno); - a->nlimbs = limbno+1; -} - -/**************** - * Clear bit N of A. - */ -void mpi_clear_bit(MPI a, unsigned int n) -{ - unsigned int limbno, bitno; - - limbno = n / BITS_PER_MPI_LIMB; - bitno = n % BITS_PER_MPI_LIMB; - - if (limbno >= a->nlimbs) - return; /* Don't need to clear this bit, it's far too left. */ - a->d[limbno] &= ~(A_LIMB_1 << bitno); -} -EXPORT_SYMBOL_GPL(mpi_clear_bit); - - -/**************** - * Shift A by COUNT limbs to the right - * This is used only within the MPI library - */ -void mpi_rshift_limbs(MPI a, unsigned int count) -{ - mpi_ptr_t ap = a->d; - mpi_size_t n = a->nlimbs; - unsigned int i; - - if (count >= n) { - a->nlimbs = 0; - return; - } - - for (i = 0; i < n - count; i++) - ap[i] = ap[i+count]; - ap[i] = 0; - a->nlimbs -= count; -} - /* * Shift A by N bits to the right. */ @@ -241,66 +161,3 @@ void mpi_rshift(MPI x, MPI a, unsigned int n) MPN_NORMALIZE(x->d, x->nlimbs); } EXPORT_SYMBOL_GPL(mpi_rshift); - -/**************** - * Shift A by COUNT limbs to the left - * This is used only within the MPI library - */ -void mpi_lshift_limbs(MPI a, unsigned int count) -{ - mpi_ptr_t ap; - int n = a->nlimbs; - int i; - - if (!count || !n) - return; - - RESIZE_IF_NEEDED(a, n+count); - - ap = a->d; - for (i = n-1; i >= 0; i--) - ap[i+count] = ap[i]; - for (i = 0; i < count; i++) - ap[i] = 0; - a->nlimbs += count; -} - -/* - * Shift A by N bits to the left. - */ -void mpi_lshift(MPI x, MPI a, unsigned int n) -{ - unsigned int nlimbs = (n/BITS_PER_MPI_LIMB); - unsigned int nbits = (n%BITS_PER_MPI_LIMB); - - if (x == a && !n) - return; /* In-place shift with an amount of zero. */ - - if (x != a) { - /* Copy A to X. */ - unsigned int alimbs = a->nlimbs; - int asign = a->sign; - mpi_ptr_t xp, ap; - - RESIZE_IF_NEEDED(x, alimbs+nlimbs+1); - xp = x->d; - ap = a->d; - MPN_COPY(xp, ap, alimbs); - x->nlimbs = alimbs; - x->flags = a->flags; - x->sign = asign; - } - - if (nlimbs && !nbits) { - /* Shift a full number of limbs. */ - mpi_lshift_limbs(x, nlimbs); - } else if (n) { - /* We use a very dump approach: Shift left by the number of - * limbs plus one and than fix it up by an rshift. - */ - mpi_lshift_limbs(x, nlimbs+1); - mpi_rshift(x, x, BITS_PER_MPI_LIMB - nbits); - } - - MPN_NORMALIZE(x->d, x->nlimbs); -} diff --git a/lib/crypto/mpi/mpi-cmp.c b/lib/crypto/mpi/mpi-cmp.c index 0835b6213235..ceaebe181cd7 100644 --- a/lib/crypto/mpi/mpi-cmp.c +++ b/lib/crypto/mpi/mpi-cmp.c @@ -45,54 +45,28 @@ int mpi_cmp_ui(MPI u, unsigned long v) } EXPORT_SYMBOL_GPL(mpi_cmp_ui); -static int do_mpi_cmp(MPI u, MPI v, int absmode) +int mpi_cmp(MPI u, MPI v) { - mpi_size_t usize; - mpi_size_t vsize; - int usign; - int vsign; + mpi_size_t usize, vsize; int cmp; mpi_normalize(u); mpi_normalize(v); - usize = u->nlimbs; vsize = v->nlimbs; - usign = absmode ? 0 : u->sign; - vsign = absmode ? 0 : v->sign; - - /* Compare sign bits. */ - - if (!usign && vsign) + if (!u->sign && v->sign) return 1; - if (usign && !vsign) + if (u->sign && !v->sign) return -1; - - /* U and V are either both positive or both negative. */ - - if (usize != vsize && !usign && !vsign) + if (usize != vsize && !u->sign && !v->sign) return usize - vsize; - if (usize != vsize && usign && vsign) - return vsize + usize; + if (usize != vsize && u->sign && v->sign) + return vsize - usize; if (!usize) return 0; cmp = mpihelp_cmp(u->d, v->d, usize); - if (!cmp) - return 0; - if ((cmp < 0?1:0) == (usign?1:0)) - return 1; - - return -1; -} - -int mpi_cmp(MPI u, MPI v) -{ - return do_mpi_cmp(u, v, 0); + if (u->sign) + return -cmp; + return cmp; } EXPORT_SYMBOL_GPL(mpi_cmp); - -int mpi_cmpabs(MPI u, MPI v) -{ - return do_mpi_cmp(u, v, 1); -} -EXPORT_SYMBOL_GPL(mpi_cmpabs); diff --git a/lib/crypto/mpi/mpi-div.c b/lib/crypto/mpi/mpi-div.c index 45beab8b9e9e..2ff0ebd74fd7 100644 --- a/lib/crypto/mpi/mpi-div.c +++ b/lib/crypto/mpi/mpi-div.c @@ -15,7 +15,6 @@ #include "longlong.h" void mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den); -void mpi_fdiv_qr(MPI quot, MPI rem, MPI dividend, MPI divisor); void mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor) { @@ -40,34 +39,6 @@ void mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor) mpi_free(temp_divisor); } -void mpi_fdiv_q(MPI quot, MPI dividend, MPI divisor) -{ - MPI tmp = mpi_alloc(mpi_get_nlimbs(quot)); - mpi_fdiv_qr(quot, tmp, dividend, divisor); - mpi_free(tmp); -} - -void mpi_fdiv_qr(MPI quot, MPI rem, MPI dividend, MPI divisor) -{ - int divisor_sign = divisor->sign; - MPI temp_divisor = NULL; - - if (quot == divisor || rem == divisor) { - temp_divisor = mpi_copy(divisor); - divisor = temp_divisor; - } - - mpi_tdiv_qr(quot, rem, dividend, divisor); - - if ((divisor_sign ^ dividend->sign) && rem->nlimbs) { - mpi_sub_ui(quot, quot, 1); - mpi_add(rem, rem, divisor); - } - - if (temp_divisor) - mpi_free(temp_divisor); -} - /* If den == quot, den needs temporary storage. * If den == rem, den needs temporary storage. * If num == quot, num needs temporary storage. diff --git a/lib/crypto/mpi/mpi-internal.h b/lib/crypto/mpi/mpi-internal.h index 554002182db1..b6fbb43afbc8 100644 --- a/lib/crypto/mpi/mpi-internal.h +++ b/lib/crypto/mpi/mpi-internal.h @@ -66,14 +66,6 @@ typedef int mpi_size_t; /* (must be a signed type) */ (d)[_i] = (s)[_i]; \ } while (0) -#define MPN_COPY_INCR(d, s, n) \ - do { \ - mpi_size_t _i; \ - for (_i = 0; _i < (n); _i++) \ - (d)[_i] = (s)[_i]; \ - } while (0) - - #define MPN_COPY_DECR(d, s, n) \ do { \ mpi_size_t _i; \ @@ -181,8 +173,6 @@ int mpihelp_mul(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, void mpih_sqr_n_basecase(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size); void mpih_sqr_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size, mpi_ptr_t tspace); -void mpihelp_mul_n(mpi_ptr_t prodp, - mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size); int mpihelp_mul_karatsuba_case(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, diff --git a/lib/crypto/mpi/mpi-inv.c b/lib/crypto/mpi/mpi-inv.c deleted file mode 100644 index 61e37d18f793..000000000000 --- a/lib/crypto/mpi/mpi-inv.c +++ /dev/null @@ -1,143 +0,0 @@ -/* mpi-inv.c - MPI functions - * Copyright (C) 1998, 2001, 2002, 2003 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, see . - */ - -#include "mpi-internal.h" - -/**************** - * Calculate the multiplicative inverse X of A mod N - * That is: Find the solution x for - * 1 = (a*x) mod n - */ -int mpi_invm(MPI x, MPI a, MPI n) -{ - /* Extended Euclid's algorithm (See TAOCP Vol II, 4.5.2, Alg X) - * modified according to Michael Penk's solution for Exercise 35 - * with further enhancement - */ - MPI u, v, u1, u2 = NULL, u3, v1, v2 = NULL, v3, t1, t2 = NULL, t3; - unsigned int k; - int sign; - int odd; - - if (!mpi_cmp_ui(a, 0)) - return 0; /* Inverse does not exists. */ - if (!mpi_cmp_ui(n, 1)) - return 0; /* Inverse does not exists. */ - - u = mpi_copy(a); - v = mpi_copy(n); - - for (k = 0; !mpi_test_bit(u, 0) && !mpi_test_bit(v, 0); k++) { - mpi_rshift(u, u, 1); - mpi_rshift(v, v, 1); - } - odd = mpi_test_bit(v, 0); - - u1 = mpi_alloc_set_ui(1); - if (!odd) - u2 = mpi_alloc_set_ui(0); - u3 = mpi_copy(u); - v1 = mpi_copy(v); - if (!odd) { - v2 = mpi_alloc(mpi_get_nlimbs(u)); - mpi_sub(v2, u1, u); /* U is used as const 1 */ - } - v3 = mpi_copy(v); - if (mpi_test_bit(u, 0)) { /* u is odd */ - t1 = mpi_alloc_set_ui(0); - if (!odd) { - t2 = mpi_alloc_set_ui(1); - t2->sign = 1; - } - t3 = mpi_copy(v); - t3->sign = !t3->sign; - goto Y4; - } else { - t1 = mpi_alloc_set_ui(1); - if (!odd) - t2 = mpi_alloc_set_ui(0); - t3 = mpi_copy(u); - } - - do { - do { - if (!odd) { - if (mpi_test_bit(t1, 0) || mpi_test_bit(t2, 0)) { - /* one is odd */ - mpi_add(t1, t1, v); - mpi_sub(t2, t2, u); - } - mpi_rshift(t1, t1, 1); - mpi_rshift(t2, t2, 1); - mpi_rshift(t3, t3, 1); - } else { - if (mpi_test_bit(t1, 0)) - mpi_add(t1, t1, v); - mpi_rshift(t1, t1, 1); - mpi_rshift(t3, t3, 1); - } -Y4: - ; - } while (!mpi_test_bit(t3, 0)); /* while t3 is even */ - - if (!t3->sign) { - mpi_set(u1, t1); - if (!odd) - mpi_set(u2, t2); - mpi_set(u3, t3); - } else { - mpi_sub(v1, v, t1); - sign = u->sign; u->sign = !u->sign; - if (!odd) - mpi_sub(v2, u, t2); - u->sign = sign; - sign = t3->sign; t3->sign = !t3->sign; - mpi_set(v3, t3); - t3->sign = sign; - } - mpi_sub(t1, u1, v1); - if (!odd) - mpi_sub(t2, u2, v2); - mpi_sub(t3, u3, v3); - if (t1->sign) { - mpi_add(t1, t1, v); - if (!odd) - mpi_sub(t2, t2, u); - } - } while (mpi_cmp_ui(t3, 0)); /* while t3 != 0 */ - /* mpi_lshift( u3, k ); */ - mpi_set(x, u1); - - mpi_free(u1); - mpi_free(v1); - mpi_free(t1); - if (!odd) { - mpi_free(u2); - mpi_free(v2); - mpi_free(t2); - } - mpi_free(u3); - mpi_free(v3); - mpi_free(t3); - - mpi_free(u); - mpi_free(v); - return 1; -} -EXPORT_SYMBOL_GPL(mpi_invm); diff --git a/lib/crypto/mpi/mpi-mod.c b/lib/crypto/mpi/mpi-mod.c index 54fcc01564d9..691bbdc52fc6 100644 --- a/lib/crypto/mpi/mpi-mod.c +++ b/lib/crypto/mpi/mpi-mod.c @@ -5,153 +5,9 @@ * This file is part of Libgcrypt. */ - #include "mpi-internal.h" -#include "longlong.h" - -/* Context used with Barrett reduction. */ -struct barrett_ctx_s { - MPI m; /* The modulus - may not be modified. */ - int m_copied; /* If true, M needs to be released. */ - int k; - MPI y; - MPI r1; /* Helper MPI. */ - MPI r2; /* Helper MPI. */ - MPI r3; /* Helper MPI allocated on demand. */ -}; - - void mpi_mod(MPI rem, MPI dividend, MPI divisor) { mpi_fdiv_r(rem, dividend, divisor); } - -/* This function returns a new context for Barrett based operations on - * the modulus M. This context needs to be released using - * _gcry_mpi_barrett_free. If COPY is true M will be transferred to - * the context and the user may change M. If COPY is false, M may not - * be changed until gcry_mpi_barrett_free has been called. - */ -mpi_barrett_t mpi_barrett_init(MPI m, int copy) -{ - mpi_barrett_t ctx; - MPI tmp; - - mpi_normalize(m); - ctx = kcalloc(1, sizeof(*ctx), GFP_KERNEL); - if (!ctx) - return NULL; - - if (copy) { - ctx->m = mpi_copy(m); - ctx->m_copied = 1; - } else - ctx->m = m; - - ctx->k = mpi_get_nlimbs(m); - tmp = mpi_alloc(ctx->k + 1); - - /* Barrett precalculation: y = floor(b^(2k) / m). */ - mpi_set_ui(tmp, 1); - mpi_lshift_limbs(tmp, 2 * ctx->k); - mpi_fdiv_q(tmp, tmp, m); - - ctx->y = tmp; - ctx->r1 = mpi_alloc(2 * ctx->k + 1); - ctx->r2 = mpi_alloc(2 * ctx->k + 1); - - return ctx; -} - -void mpi_barrett_free(mpi_barrett_t ctx) -{ - if (ctx) { - mpi_free(ctx->y); - mpi_free(ctx->r1); - mpi_free(ctx->r2); - if (ctx->r3) - mpi_free(ctx->r3); - if (ctx->m_copied) - mpi_free(ctx->m); - kfree(ctx); - } -} - - -/* R = X mod M - * - * Using Barrett reduction. Before using this function - * _gcry_mpi_barrett_init must have been called to do the - * precalculations. CTX is the context created by this precalculation - * and also conveys M. If the Barret reduction could no be done a - * straightforward reduction method is used. - * - * We assume that these conditions are met: - * Input: x =(x_2k-1 ...x_0)_b - * m =(m_k-1 ....m_0)_b with m_k-1 != 0 - * Output: r = x mod m - */ -void mpi_mod_barrett(MPI r, MPI x, mpi_barrett_t ctx) -{ - MPI m = ctx->m; - int k = ctx->k; - MPI y = ctx->y; - MPI r1 = ctx->r1; - MPI r2 = ctx->r2; - int sign; - - mpi_normalize(x); - if (mpi_get_nlimbs(x) > 2*k) { - mpi_mod(r, x, m); - return; - } - - sign = x->sign; - x->sign = 0; - - /* 1. q1 = floor( x / b^k-1) - * q2 = q1 * y - * q3 = floor( q2 / b^k+1 ) - * Actually, we don't need qx, we can work direct on r2 - */ - mpi_set(r2, x); - mpi_rshift_limbs(r2, k-1); - mpi_mul(r2, r2, y); - mpi_rshift_limbs(r2, k+1); - - /* 2. r1 = x mod b^k+1 - * r2 = q3 * m mod b^k+1 - * r = r1 - r2 - * 3. if r < 0 then r = r + b^k+1 - */ - mpi_set(r1, x); - if (r1->nlimbs > k+1) /* Quick modulo operation. */ - r1->nlimbs = k+1; - mpi_mul(r2, r2, m); - if (r2->nlimbs > k+1) /* Quick modulo operation. */ - r2->nlimbs = k+1; - mpi_sub(r, r1, r2); - - if (mpi_has_sign(r)) { - if (!ctx->r3) { - ctx->r3 = mpi_alloc(k + 2); - mpi_set_ui(ctx->r3, 1); - mpi_lshift_limbs(ctx->r3, k + 1); - } - mpi_add(r, r, ctx->r3); - } - - /* 4. while r >= m do r = r - m */ - while (mpi_cmp(r, m) >= 0) - mpi_sub(r, r, m); - - x->sign = sign; -} - - -void mpi_mul_barrett(MPI w, MPI u, MPI v, mpi_barrett_t ctx) -{ - mpi_mul(w, u, v); - mpi_mod_barrett(w, w, ctx); -} diff --git a/lib/crypto/mpi/mpicoder.c b/lib/crypto/mpi/mpicoder.c index 3cb6bd148fa9..dde01030807d 100644 --- a/lib/crypto/mpi/mpicoder.c +++ b/lib/crypto/mpi/mpicoder.c @@ -25,7 +25,6 @@ #include #include "mpi-internal.h" -#define MAX_EXTERN_SCAN_BYTES (16*1024*1024) #define MAX_EXTERN_MPI_BITS 16384 /** @@ -110,112 +109,6 @@ MPI mpi_read_from_buffer(const void *xbuffer, unsigned *ret_nread) } EXPORT_SYMBOL_GPL(mpi_read_from_buffer); -/**************** - * Fill the mpi VAL from the hex string in STR. - */ -int mpi_fromstr(MPI val, const char *str) -{ - int sign = 0; - int prepend_zero = 0; - int i, j, c, c1, c2; - unsigned int nbits, nbytes, nlimbs; - mpi_limb_t a; - - if (*str == '-') { - sign = 1; - str++; - } - - /* Skip optional hex prefix. */ - if (*str == '0' && str[1] == 'x') - str += 2; - - nbits = strlen(str); - if (nbits > MAX_EXTERN_SCAN_BYTES) { - mpi_clear(val); - return -EINVAL; - } - nbits *= 4; - if ((nbits % 8)) - prepend_zero = 1; - - nbytes = (nbits+7) / 8; - nlimbs = (nbytes+BYTES_PER_MPI_LIMB-1) / BYTES_PER_MPI_LIMB; - - if (val->alloced < nlimbs) - mpi_resize(val, nlimbs); - - i = BYTES_PER_MPI_LIMB - (nbytes % BYTES_PER_MPI_LIMB); - i %= BYTES_PER_MPI_LIMB; - j = val->nlimbs = nlimbs; - val->sign = sign; - for (; j > 0; j--) { - a = 0; - for (; i < BYTES_PER_MPI_LIMB; i++) { - if (prepend_zero) { - c1 = '0'; - prepend_zero = 0; - } else - c1 = *str++; - - if (!c1) { - mpi_clear(val); - return -EINVAL; - } - c2 = *str++; - if (!c2) { - mpi_clear(val); - return -EINVAL; - } - if (c1 >= '0' && c1 <= '9') - c = c1 - '0'; - else if (c1 >= 'a' && c1 <= 'f') - c = c1 - 'a' + 10; - else if (c1 >= 'A' && c1 <= 'F') - c = c1 - 'A' + 10; - else { - mpi_clear(val); - return -EINVAL; - } - c <<= 4; - if (c2 >= '0' && c2 <= '9') - c |= c2 - '0'; - else if (c2 >= 'a' && c2 <= 'f') - c |= c2 - 'a' + 10; - else if (c2 >= 'A' && c2 <= 'F') - c |= c2 - 'A' + 10; - else { - mpi_clear(val); - return -EINVAL; - } - a <<= 8; - a |= c; - } - i = 0; - val->d[j-1] = a; - } - - return 0; -} -EXPORT_SYMBOL_GPL(mpi_fromstr); - -MPI mpi_scanval(const char *string) -{ - MPI a; - - a = mpi_alloc(0); - if (!a) - return NULL; - - if (mpi_fromstr(a, string)) { - mpi_free(a); - return NULL; - } - mpi_normalize(a); - return a; -} -EXPORT_SYMBOL_GPL(mpi_scanval); - static int count_lzeros(MPI a) { mpi_limb_t alimb; @@ -521,232 +414,3 @@ MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int nbytes) return val; } EXPORT_SYMBOL_GPL(mpi_read_raw_from_sgl); - -/* Perform a two's complement operation on buffer P of size N bytes. */ -static void twocompl(unsigned char *p, unsigned int n) -{ - int i; - - for (i = n-1; i >= 0 && !p[i]; i--) - ; - if (i >= 0) { - if ((p[i] & 0x01)) - p[i] = (((p[i] ^ 0xfe) | 0x01) & 0xff); - else if ((p[i] & 0x02)) - p[i] = (((p[i] ^ 0xfc) | 0x02) & 0xfe); - else if ((p[i] & 0x04)) - p[i] = (((p[i] ^ 0xf8) | 0x04) & 0xfc); - else if ((p[i] & 0x08)) - p[i] = (((p[i] ^ 0xf0) | 0x08) & 0xf8); - else if ((p[i] & 0x10)) - p[i] = (((p[i] ^ 0xe0) | 0x10) & 0xf0); - else if ((p[i] & 0x20)) - p[i] = (((p[i] ^ 0xc0) | 0x20) & 0xe0); - else if ((p[i] & 0x40)) - p[i] = (((p[i] ^ 0x80) | 0x40) & 0xc0); - else - p[i] = 0x80; - - for (i--; i >= 0; i--) - p[i] ^= 0xff; - } -} - -int mpi_print(enum gcry_mpi_format format, unsigned char *buffer, - size_t buflen, size_t *nwritten, MPI a) -{ - unsigned int nbits = mpi_get_nbits(a); - size_t len; - size_t dummy_nwritten; - int negative; - - if (!nwritten) - nwritten = &dummy_nwritten; - - /* Libgcrypt does no always care to set clear the sign if the value - * is 0. For printing this is a bit of a surprise, in particular - * because if some of the formats don't support negative numbers but - * should be able to print a zero. Thus we need this extra test - * for a negative number. - */ - if (a->sign && mpi_cmp_ui(a, 0)) - negative = 1; - else - negative = 0; - - len = buflen; - *nwritten = 0; - if (format == GCRYMPI_FMT_STD) { - unsigned char *tmp; - int extra = 0; - unsigned int n; - - tmp = mpi_get_buffer(a, &n, NULL); - if (!tmp) - return -EINVAL; - - if (negative) { - twocompl(tmp, n); - if (!(*tmp & 0x80)) { - /* Need to extend the sign. */ - n++; - extra = 2; - } - } else if (n && (*tmp & 0x80)) { - /* Positive but the high bit of the returned buffer is set. - * Thus we need to print an extra leading 0x00 so that the - * output is interpreted as a positive number. - */ - n++; - extra = 1; - } - - if (buffer && n > len) { - /* The provided buffer is too short. */ - kfree(tmp); - return -E2BIG; - } - if (buffer) { - unsigned char *s = buffer; - - if (extra == 1) - *s++ = 0; - else if (extra) - *s++ = 0xff; - memcpy(s, tmp, n-!!extra); - } - kfree(tmp); - *nwritten = n; - return 0; - } else if (format == GCRYMPI_FMT_USG) { - unsigned int n = (nbits + 7)/8; - - /* Note: We ignore the sign for this format. */ - /* FIXME: for performance reasons we should put this into - * mpi_aprint because we can then use the buffer directly. - */ - - if (buffer && n > len) - return -E2BIG; - if (buffer) { - unsigned char *tmp; - - tmp = mpi_get_buffer(a, &n, NULL); - if (!tmp) - return -EINVAL; - memcpy(buffer, tmp, n); - kfree(tmp); - } - *nwritten = n; - return 0; - } else if (format == GCRYMPI_FMT_PGP) { - unsigned int n = (nbits + 7)/8; - - /* The PGP format can only handle unsigned integers. */ - if (negative) - return -EINVAL; - - if (buffer && n+2 > len) - return -E2BIG; - - if (buffer) { - unsigned char *tmp; - unsigned char *s = buffer; - - s[0] = nbits >> 8; - s[1] = nbits; - - tmp = mpi_get_buffer(a, &n, NULL); - if (!tmp) - return -EINVAL; - memcpy(s+2, tmp, n); - kfree(tmp); - } - *nwritten = n+2; - return 0; - } else if (format == GCRYMPI_FMT_SSH) { - unsigned char *tmp; - int extra = 0; - unsigned int n; - - tmp = mpi_get_buffer(a, &n, NULL); - if (!tmp) - return -EINVAL; - - if (negative) { - twocompl(tmp, n); - if (!(*tmp & 0x80)) { - /* Need to extend the sign. */ - n++; - extra = 2; - } - } else if (n && (*tmp & 0x80)) { - n++; - extra = 1; - } - - if (buffer && n+4 > len) { - kfree(tmp); - return -E2BIG; - } - - if (buffer) { - unsigned char *s = buffer; - - *s++ = n >> 24; - *s++ = n >> 16; - *s++ = n >> 8; - *s++ = n; - if (extra == 1) - *s++ = 0; - else if (extra) - *s++ = 0xff; - memcpy(s, tmp, n-!!extra); - } - kfree(tmp); - *nwritten = 4+n; - return 0; - } else if (format == GCRYMPI_FMT_HEX) { - unsigned char *tmp; - int i; - int extra = 0; - unsigned int n = 0; - - tmp = mpi_get_buffer(a, &n, NULL); - if (!tmp) - return -EINVAL; - if (!n || (*tmp & 0x80)) - extra = 2; - - if (buffer && 2*n + extra + negative + 1 > len) { - kfree(tmp); - return -E2BIG; - } - if (buffer) { - unsigned char *s = buffer; - - if (negative) - *s++ = '-'; - if (extra) { - *s++ = '0'; - *s++ = '0'; - } - - for (i = 0; i < n; i++) { - unsigned int c = tmp[i]; - - *s++ = (c >> 4) < 10 ? '0'+(c>>4) : 'A'+(c>>4)-10; - c &= 15; - *s++ = c < 10 ? '0'+c : 'A'+c-10; - } - *s++ = 0; - *nwritten = s - buffer; - } else { - *nwritten = 2*n + extra + negative + 1; - } - kfree(tmp); - return 0; - } else - return -EINVAL; -} -EXPORT_SYMBOL_GPL(mpi_print); diff --git a/lib/crypto/mpi/mpih-mul.c b/lib/crypto/mpi/mpih-mul.c index e5f1c84e3c48..a93647564054 100644 --- a/lib/crypto/mpi/mpih-mul.c +++ b/lib/crypto/mpi/mpih-mul.c @@ -317,31 +317,6 @@ mpih_sqr_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size, mpi_ptr_t tspace) } } - -void mpihelp_mul_n(mpi_ptr_t prodp, - mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size) -{ - if (up == vp) { - if (size < KARATSUBA_THRESHOLD) - mpih_sqr_n_basecase(prodp, up, size); - else { - mpi_ptr_t tspace; - tspace = mpi_alloc_limb_space(2 * size); - mpih_sqr_n(prodp, up, size, tspace); - mpi_free_limb_space(tspace); - } - } else { - if (size < KARATSUBA_THRESHOLD) - mul_n_basecase(prodp, up, vp, size); - else { - mpi_ptr_t tspace; - tspace = mpi_alloc_limb_space(2 * size); - mul_n(prodp, up, vp, size, tspace); - mpi_free_limb_space(tspace); - } - } -} - int mpihelp_mul_karatsuba_case(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, diff --git a/lib/crypto/mpi/mpiutil.c b/lib/crypto/mpi/mpiutil.c index aa8c46544af8..d57fd8afef64 100644 --- a/lib/crypto/mpi/mpiutil.c +++ b/lib/crypto/mpi/mpiutil.c @@ -20,63 +20,6 @@ #include "mpi-internal.h" -/* Constants allocated right away at startup. */ -static MPI constants[MPI_NUMBER_OF_CONSTANTS]; - -/* Initialize the MPI subsystem. This is called early and allows to - * do some initialization without taking care of threading issues. - */ -static int __init mpi_init(void) -{ - int idx; - unsigned long value; - - for (idx = 0; idx < MPI_NUMBER_OF_CONSTANTS; idx++) { - switch (idx) { - case MPI_C_ZERO: - value = 0; - break; - case MPI_C_ONE: - value = 1; - break; - case MPI_C_TWO: - value = 2; - break; - case MPI_C_THREE: - value = 3; - break; - case MPI_C_FOUR: - value = 4; - break; - case MPI_C_EIGHT: - value = 8; - break; - default: - pr_err("MPI: invalid mpi_const selector %d\n", idx); - return -EFAULT; - } - constants[idx] = mpi_alloc_set_ui(value); - constants[idx]->flags = (16|32); - } - - return 0; -} -postcore_initcall(mpi_init); - -/* Return a constant MPI descripbed by NO which is one of the - * MPI_C_xxx macros. There is no need to copy this returned value; it - * may be used directly. - */ -MPI mpi_const(enum gcry_mpi_constants no) -{ - if ((int)no < 0 || no > MPI_NUMBER_OF_CONSTANTS) - pr_err("MPI: invalid mpi_const selector %d\n", no); - if (!constants[no]) - pr_err("MPI: MPI subsystem not initialized\n"); - return constants[no]; -} -EXPORT_SYMBOL_GPL(mpi_const); - /**************** * Note: It was a bad idea to use the number of limbs to allocate * because on a alpha the limbs are large but we normally need @@ -163,15 +106,6 @@ int mpi_resize(MPI a, unsigned nlimbs) return 0; } -void mpi_clear(MPI a) -{ - if (!a) - return; - a->nlimbs = 0; - a->flags = 0; -} -EXPORT_SYMBOL_GPL(mpi_clear); - void mpi_free(MPI a) { if (!a) @@ -210,121 +144,5 @@ MPI mpi_copy(MPI a) return b; } -/**************** - * This function allocates an MPI which is optimized to hold - * a value as large as the one given in the argument and allocates it - * with the same flags as A. - */ -MPI mpi_alloc_like(MPI a) -{ - MPI b; - - if (a) { - b = mpi_alloc(a->nlimbs); - b->nlimbs = 0; - b->sign = 0; - b->flags = a->flags; - } else - b = NULL; - - return b; -} - - -/* Set U into W and release U. If W is NULL only U will be released. */ -void mpi_snatch(MPI w, MPI u) -{ - if (w) { - mpi_assign_limb_space(w, u->d, u->alloced); - w->nlimbs = u->nlimbs; - w->sign = u->sign; - w->flags = u->flags; - u->alloced = 0; - u->nlimbs = 0; - u->d = NULL; - } - mpi_free(u); -} - - -MPI mpi_set(MPI w, MPI u) -{ - mpi_ptr_t wp, up; - mpi_size_t usize = u->nlimbs; - int usign = u->sign; - - if (!w) - w = mpi_alloc(mpi_get_nlimbs(u)); - RESIZE_IF_NEEDED(w, usize); - wp = w->d; - up = u->d; - MPN_COPY(wp, up, usize); - w->nlimbs = usize; - w->flags = u->flags; - w->flags &= ~(16|32); /* Reset the immutable and constant flags. */ - w->sign = usign; - return w; -} -EXPORT_SYMBOL_GPL(mpi_set); - -MPI mpi_set_ui(MPI w, unsigned long u) -{ - if (!w) - w = mpi_alloc(1); - /* FIXME: If U is 0 we have no need to resize and thus possible - * allocating the limbs. - */ - RESIZE_IF_NEEDED(w, 1); - w->d[0] = u; - w->nlimbs = u ? 1 : 0; - w->sign = 0; - w->flags = 0; - return w; -} -EXPORT_SYMBOL_GPL(mpi_set_ui); - -MPI mpi_alloc_set_ui(unsigned long u) -{ - MPI w = mpi_alloc(1); - w->d[0] = u; - w->nlimbs = u ? 1 : 0; - w->sign = 0; - return w; -} - -/**************** - * Swap the value of A and B, when SWAP is 1. - * Leave the value when SWAP is 0. - * This implementation should be constant-time regardless of SWAP. - */ -void mpi_swap_cond(MPI a, MPI b, unsigned long swap) -{ - mpi_size_t i; - mpi_size_t nlimbs; - mpi_limb_t mask = ((mpi_limb_t)0) - swap; - mpi_limb_t x; - - if (a->alloced > b->alloced) - nlimbs = b->alloced; - else - nlimbs = a->alloced; - if (a->nlimbs > nlimbs || b->nlimbs > nlimbs) - return; - - for (i = 0; i < nlimbs; i++) { - x = mask & (a->d[i] ^ b->d[i]); - a->d[i] = a->d[i] ^ x; - b->d[i] = b->d[i] ^ x; - } - - x = mask & (a->nlimbs ^ b->nlimbs); - a->nlimbs = a->nlimbs ^ x; - b->nlimbs = b->nlimbs ^ x; - - x = mask & (a->sign ^ b->sign); - a->sign = a->sign ^ x; - b->sign = b->sign ^ x; -} - MODULE_DESCRIPTION("Multiprecision maths library"); MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 8e3a67f2de87ee94ac11ea69beb4edc2993b17a0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 10 Aug 2024 14:20:57 +0800 Subject: crypto: lib/mpi - Add error checks to extension The remaining functions added by commit a8ea8bdd9df92a0e5db5b43900abb7a288b8a53e did not check for memory allocation errors. Add the checks and change the API to allow errors to be returned. Fixes: a8ea8bdd9df9 ("lib/mpi: Extend the MPI library") Signed-off-by: Herbert Xu --- include/linux/mpi.h | 22 ++++++++--------- lib/crypto/mpi/mpi-add.c | 38 ++++++++++++++++++++---------- lib/crypto/mpi/mpi-bit.c | 25 ++++++++++++++------ lib/crypto/mpi/mpi-div.c | 55 +++++++++++++++++++++++++++++++------------ lib/crypto/mpi/mpi-internal.h | 11 +++++---- lib/crypto/mpi/mpi-mod.c | 4 ++-- lib/crypto/mpi/mpi-mul.c | 29 ++++++++++++++++++----- lib/crypto/mpi/mpiutil.c | 2 ++ 8 files changed, 128 insertions(+), 58 deletions(-) (limited to 'lib') diff --git a/include/linux/mpi.h b/include/linux/mpi.h index e081428b91ef..47be46f36435 100644 --- a/include/linux/mpi.h +++ b/include/linux/mpi.h @@ -59,7 +59,7 @@ int mpi_write_to_sgl(MPI a, struct scatterlist *sg, unsigned nbytes, int *sign); /*-- mpi-mod.c --*/ -void mpi_mod(MPI rem, MPI dividend, MPI divisor); +int mpi_mod(MPI rem, MPI dividend, MPI divisor); /*-- mpi-pow.c --*/ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod); @@ -75,22 +75,22 @@ int mpi_sub_ui(MPI w, MPI u, unsigned long vval); void mpi_normalize(MPI a); unsigned mpi_get_nbits(MPI a); int mpi_test_bit(MPI a, unsigned int n); -void mpi_set_bit(MPI a, unsigned int n); -void mpi_rshift(MPI x, MPI a, unsigned int n); +int mpi_set_bit(MPI a, unsigned int n); +int mpi_rshift(MPI x, MPI a, unsigned int n); /*-- mpi-add.c --*/ -void mpi_add(MPI w, MPI u, MPI v); -void mpi_sub(MPI w, MPI u, MPI v); -void mpi_addm(MPI w, MPI u, MPI v, MPI m); -void mpi_subm(MPI w, MPI u, MPI v, MPI m); +int mpi_add(MPI w, MPI u, MPI v); +int mpi_sub(MPI w, MPI u, MPI v); +int mpi_addm(MPI w, MPI u, MPI v, MPI m); +int mpi_subm(MPI w, MPI u, MPI v, MPI m); /*-- mpi-mul.c --*/ -void mpi_mul(MPI w, MPI u, MPI v); -void mpi_mulm(MPI w, MPI u, MPI v, MPI m); +int mpi_mul(MPI w, MPI u, MPI v); +int mpi_mulm(MPI w, MPI u, MPI v, MPI m); /*-- mpi-div.c --*/ -void mpi_tdiv_r(MPI rem, MPI num, MPI den); -void mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor); +int mpi_tdiv_r(MPI rem, MPI num, MPI den); +int mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor); /* inline functions */ diff --git a/lib/crypto/mpi/mpi-add.c b/lib/crypto/mpi/mpi-add.c index b47c8c35f5fe..3015140d4860 100644 --- a/lib/crypto/mpi/mpi-add.c +++ b/lib/crypto/mpi/mpi-add.c @@ -13,11 +13,12 @@ #include "mpi-internal.h" -void mpi_add(MPI w, MPI u, MPI v) +int mpi_add(MPI w, MPI u, MPI v) { mpi_ptr_t wp, up, vp; mpi_size_t usize, vsize, wsize; int usign, vsign, wsign; + int err; if (u->nlimbs < v->nlimbs) { /* Swap U and V. */ usize = v->nlimbs; @@ -25,7 +26,9 @@ void mpi_add(MPI w, MPI u, MPI v) vsize = u->nlimbs; vsign = u->sign; wsize = usize + 1; - RESIZE_IF_NEEDED(w, wsize); + err = RESIZE_IF_NEEDED(w, wsize); + if (err) + return err; /* These must be after realloc (u or v may be the same as w). */ up = v->d; vp = u->d; @@ -35,7 +38,9 @@ void mpi_add(MPI w, MPI u, MPI v) vsize = v->nlimbs; vsign = v->sign; wsize = usize + 1; - RESIZE_IF_NEEDED(w, wsize); + err = RESIZE_IF_NEEDED(w, wsize); + if (err) + return err; /* These must be after realloc (u or v may be the same as w). */ up = u->d; vp = v->d; @@ -77,28 +82,37 @@ void mpi_add(MPI w, MPI u, MPI v) w->nlimbs = wsize; w->sign = wsign; + return 0; } EXPORT_SYMBOL_GPL(mpi_add); -void mpi_sub(MPI w, MPI u, MPI v) +int mpi_sub(MPI w, MPI u, MPI v) { - MPI vv = mpi_copy(v); + int err; + MPI vv; + + vv = mpi_copy(v); + if (!vv) + return -ENOMEM; + vv->sign = !vv->sign; - mpi_add(w, u, vv); + err = mpi_add(w, u, vv); mpi_free(vv); + + return err; } EXPORT_SYMBOL_GPL(mpi_sub); -void mpi_addm(MPI w, MPI u, MPI v, MPI m) +int mpi_addm(MPI w, MPI u, MPI v, MPI m) { - mpi_add(w, u, v); - mpi_mod(w, w, m); + return mpi_add(w, u, v) ?: + mpi_mod(w, w, m); } EXPORT_SYMBOL_GPL(mpi_addm); -void mpi_subm(MPI w, MPI u, MPI v, MPI m) +int mpi_subm(MPI w, MPI u, MPI v, MPI m) { - mpi_sub(w, u, v); - mpi_mod(w, w, m); + return mpi_sub(w, u, v) ?: + mpi_mod(w, w, m); } EXPORT_SYMBOL_GPL(mpi_subm); diff --git a/lib/crypto/mpi/mpi-bit.c b/lib/crypto/mpi/mpi-bit.c index c29b85362664..835a2f0622a0 100644 --- a/lib/crypto/mpi/mpi-bit.c +++ b/lib/crypto/mpi/mpi-bit.c @@ -76,9 +76,10 @@ EXPORT_SYMBOL_GPL(mpi_test_bit); /**************** * Set bit N of A. */ -void mpi_set_bit(MPI a, unsigned int n) +int mpi_set_bit(MPI a, unsigned int n) { unsigned int i, limbno, bitno; + int err; limbno = n / BITS_PER_MPI_LIMB; bitno = n % BITS_PER_MPI_LIMB; @@ -86,27 +87,31 @@ void mpi_set_bit(MPI a, unsigned int n) if (limbno >= a->nlimbs) { for (i = a->nlimbs; i < a->alloced; i++) a->d[i] = 0; - mpi_resize(a, limbno+1); + err = mpi_resize(a, limbno+1); + if (err) + return err; a->nlimbs = limbno+1; } a->d[limbno] |= (A_LIMB_1<= x->nlimbs) { x->nlimbs = 0; - return; + return 0; } if (nlimbs) { @@ -121,7 +126,9 @@ void mpi_rshift(MPI x, MPI a, unsigned int n) /* Copy and shift by more or equal bits than in a limb. */ xsize = a->nlimbs; x->sign = a->sign; - RESIZE_IF_NEEDED(x, xsize); + err = RESIZE_IF_NEEDED(x, xsize); + if (err) + return err; x->nlimbs = xsize; for (i = 0; i < a->nlimbs; i++) x->d[i] = a->d[i]; @@ -129,7 +136,7 @@ void mpi_rshift(MPI x, MPI a, unsigned int n) if (nlimbs >= x->nlimbs) { x->nlimbs = 0; - return; + return 0; } for (i = 0; i < x->nlimbs - nlimbs; i++) @@ -143,7 +150,9 @@ void mpi_rshift(MPI x, MPI a, unsigned int n) /* Copy and shift by less than bits in a limb. */ xsize = a->nlimbs; x->sign = a->sign; - RESIZE_IF_NEEDED(x, xsize); + err = RESIZE_IF_NEEDED(x, xsize); + if (err) + return err; x->nlimbs = xsize; if (xsize) { @@ -159,5 +168,7 @@ void mpi_rshift(MPI x, MPI a, unsigned int n) } } MPN_NORMALIZE(x->d, x->nlimbs); + + return 0; } EXPORT_SYMBOL_GPL(mpi_rshift); diff --git a/lib/crypto/mpi/mpi-div.c b/lib/crypto/mpi/mpi-div.c index 2ff0ebd74fd7..6e5044e72595 100644 --- a/lib/crypto/mpi/mpi-div.c +++ b/lib/crypto/mpi/mpi-div.c @@ -14,12 +14,13 @@ #include "mpi-internal.h" #include "longlong.h" -void mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den); +int mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den); -void mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor) +int mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor) { int divisor_sign = divisor->sign; MPI temp_divisor = NULL; + int err; /* We need the original value of the divisor after the remainder has been * preliminary calculated. We have to copy it to temporary space if it's @@ -27,16 +28,22 @@ void mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor) */ if (rem == divisor) { temp_divisor = mpi_copy(divisor); + if (!temp_divisor) + return -ENOMEM; divisor = temp_divisor; } - mpi_tdiv_r(rem, dividend, divisor); + err = mpi_tdiv_r(rem, dividend, divisor); + if (err) + goto free_temp_divisor; if (((divisor_sign?1:0) ^ (dividend->sign?1:0)) && rem->nlimbs) - mpi_add(rem, rem, divisor); + err = mpi_add(rem, rem, divisor); - if (temp_divisor) - mpi_free(temp_divisor); +free_temp_divisor: + mpi_free(temp_divisor); + + return err; } /* If den == quot, den needs temporary storage. @@ -46,12 +53,12 @@ void mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor) * i.e no extra storage should be allocated. */ -void mpi_tdiv_r(MPI rem, MPI num, MPI den) +int mpi_tdiv_r(MPI rem, MPI num, MPI den) { - mpi_tdiv_qr(NULL, rem, num, den); + return mpi_tdiv_qr(NULL, rem, num, den); } -void mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den) +int mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den) { mpi_ptr_t np, dp; mpi_ptr_t qp, rp; @@ -64,13 +71,16 @@ void mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den) mpi_limb_t q_limb; mpi_ptr_t marker[5]; int markidx = 0; + int err; /* Ensure space is enough for quotient and remainder. * We need space for an extra limb in the remainder, because it's * up-shifted (normalized) below. */ rsize = nsize + 1; - mpi_resize(rem, rsize); + err = mpi_resize(rem, rsize); + if (err) + return err; qsize = rsize - dsize; /* qsize cannot be bigger than this. */ if (qsize <= 0) { @@ -86,11 +96,14 @@ void mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den) quot->nlimbs = 0; quot->sign = 0; } - return; + return 0; } - if (quot) - mpi_resize(quot, qsize); + if (quot) { + err = mpi_resize(quot, qsize); + if (err) + return err; + } /* Read pointers here, when reallocation is finished. */ np = num->d; @@ -112,10 +125,10 @@ void mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den) rsize = rlimb != 0?1:0; rem->nlimbs = rsize; rem->sign = sign_remainder; - return; + return 0; } - + err = -ENOMEM; if (quot) { qp = quot->d; /* Make sure QP and NP point to different objects. Otherwise the @@ -123,6 +136,8 @@ void mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den) */ if (qp == np) { /* Copy NP object to temporary space. */ np = marker[markidx++] = mpi_alloc_limb_space(nsize); + if (!np) + goto out_free_marker; MPN_COPY(np, qp, nsize); } } else /* Put quotient at top of remainder. */ @@ -143,6 +158,8 @@ void mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den) * the original contents of the denominator. */ tp = marker[markidx++] = mpi_alloc_limb_space(dsize); + if (!tp) + goto out_free_marker; mpihelp_lshift(tp, dp, dsize, normalization_steps); dp = tp; @@ -164,6 +181,8 @@ void mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den) mpi_ptr_t tp; tp = marker[markidx++] = mpi_alloc_limb_space(dsize); + if (!tp) + goto out_free_marker; MPN_COPY(tp, dp, dsize); dp = tp; } @@ -198,8 +217,14 @@ void mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den) rem->nlimbs = rsize; rem->sign = sign_remainder; + + err = 0; + +out_free_marker: while (markidx) { markidx--; mpi_free_limb_space(marker[markidx]); } + + return err; } diff --git a/lib/crypto/mpi/mpi-internal.h b/lib/crypto/mpi/mpi-internal.h index b6fbb43afbc8..8a4f49e3043c 100644 --- a/lib/crypto/mpi/mpi-internal.h +++ b/lib/crypto/mpi/mpi-internal.h @@ -52,11 +52,12 @@ typedef mpi_limb_t *mpi_ptr_t; /* pointer to a limb */ typedef int mpi_size_t; /* (must be a signed type) */ -#define RESIZE_IF_NEEDED(a, b) \ - do { \ - if ((a)->alloced < (b)) \ - mpi_resize((a), (b)); \ - } while (0) +static inline int RESIZE_IF_NEEDED(MPI a, unsigned b) +{ + if (a->alloced < b) + return mpi_resize(a, b); + return 0; +} /* Copy N limbs from S to D. */ #define MPN_COPY(d, s, n) \ diff --git a/lib/crypto/mpi/mpi-mod.c b/lib/crypto/mpi/mpi-mod.c index 691bbdc52fc6..d5fdaec3d0b6 100644 --- a/lib/crypto/mpi/mpi-mod.c +++ b/lib/crypto/mpi/mpi-mod.c @@ -7,7 +7,7 @@ #include "mpi-internal.h" -void mpi_mod(MPI rem, MPI dividend, MPI divisor) +int mpi_mod(MPI rem, MPI dividend, MPI divisor) { - mpi_fdiv_r(rem, dividend, divisor); + return mpi_fdiv_r(rem, dividend, divisor); } diff --git a/lib/crypto/mpi/mpi-mul.c b/lib/crypto/mpi/mpi-mul.c index 7f4eda8560dc..892a246216b9 100644 --- a/lib/crypto/mpi/mpi-mul.c +++ b/lib/crypto/mpi/mpi-mul.c @@ -13,7 +13,7 @@ #include "mpi-internal.h" -void mpi_mul(MPI w, MPI u, MPI v) +int mpi_mul(MPI w, MPI u, MPI v) { mpi_size_t usize, vsize, wsize; mpi_ptr_t up, vp, wp; @@ -21,6 +21,7 @@ void mpi_mul(MPI w, MPI u, MPI v) int usign, vsign, sign_product; int assign_wp = 0; mpi_ptr_t tmp_limb = NULL; + int err; if (u->nlimbs < v->nlimbs) { /* Swap U and V. */ @@ -46,15 +47,21 @@ void mpi_mul(MPI w, MPI u, MPI v) if (w->alloced < wsize) { if (wp == up || wp == vp) { wp = mpi_alloc_limb_space(wsize); + if (!wp) + return -ENOMEM; assign_wp = 1; } else { - mpi_resize(w, wsize); + err = mpi_resize(w, wsize); + if (err) + return err; wp = w->d; } } else { /* Make U and V not overlap with W. */ if (wp == up) { /* W and U are identical. Allocate temporary space for U. */ up = tmp_limb = mpi_alloc_limb_space(usize); + if (!up) + return -ENOMEM; /* Is V identical too? Keep it identical with U. */ if (wp == vp) vp = up; @@ -63,6 +70,8 @@ void mpi_mul(MPI w, MPI u, MPI v) } else if (wp == vp) { /* W and V are identical. Allocate temporary space for V. */ vp = tmp_limb = mpi_alloc_limb_space(vsize); + if (!vp) + return -ENOMEM; /* Copy to the temporary space. */ MPN_COPY(vp, wp, vsize); } @@ -71,7 +80,12 @@ void mpi_mul(MPI w, MPI u, MPI v) if (!vsize) wsize = 0; else { - mpihelp_mul(wp, up, usize, vp, vsize, &cy); + err = mpihelp_mul(wp, up, usize, vp, vsize, &cy); + if (err) { + if (assign_wp) + mpi_free_limb_space(wp); + goto free_tmp_limb; + } wsize -= cy ? 0:1; } @@ -79,14 +93,17 @@ void mpi_mul(MPI w, MPI u, MPI v) mpi_assign_limb_space(w, wp, wsize); w->nlimbs = wsize; w->sign = sign_product; + +free_tmp_limb: if (tmp_limb) mpi_free_limb_space(tmp_limb); + return err; } EXPORT_SYMBOL_GPL(mpi_mul); -void mpi_mulm(MPI w, MPI u, MPI v, MPI m) +int mpi_mulm(MPI w, MPI u, MPI v, MPI m) { - mpi_mul(w, u, v); - mpi_tdiv_r(w, w, m); + return mpi_mul(w, u, v) ?: + mpi_tdiv_r(w, w, m); } EXPORT_SYMBOL_GPL(mpi_mulm); diff --git a/lib/crypto/mpi/mpiutil.c b/lib/crypto/mpi/mpiutil.c index d57fd8afef64..979ece5a81d2 100644 --- a/lib/crypto/mpi/mpiutil.c +++ b/lib/crypto/mpi/mpiutil.c @@ -133,6 +133,8 @@ MPI mpi_copy(MPI a) if (a) { b = mpi_alloc(a->nlimbs); + if (!b) + return NULL; b->nlimbs = a->nlimbs; b->sign = a->sign; b->flags = a->flags; -- cgit v1.2.3 From 2865baf54077aa98fcdb478cefe6a42c417b9374 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Apr 2024 20:04:58 -0700 Subject: x86: support user address masking instead of non-speculative conditional The Spectre-v1 mitigations made "access_ok()" much more expensive, since it has to serialize execution with the test for a valid user address. All the normal user copy routines avoid this by just masking the user address with a data-dependent mask instead, but the fast "unsafe_user_read()" kind of patterms that were supposed to be a fast case got slowed down. This introduces a notion of using src = masked_user_access_begin(src); to do the user address sanity using a data-dependent mask instead of the more traditional conditional if (user_read_access_begin(src, len)) { model. This model only works for dense accesses that start at 'src' and on architectures that have a guard region that is guaranteed to fault in between the user space and the kernel space area. With this, the user access doesn't need to be manually checked, because a bad address is guaranteed to fault (by some architecture masking trick: on x86-64 this involves just turning an invalid user address into all ones, since we don't map the top of address space). This only converts a couple of examples for now. Example x86-64 code generation for loading two words from user space: stac mov %rax,%rcx sar $0x3f,%rcx or %rax,%rcx mov (%rcx),%r13 mov 0x8(%rcx),%r14 clac where all the error handling and -EFAULT is now purely handled out of line by the exception path. Of course, if the micro-architecture does badly at 'clac' and 'stac', the above is still pitifully slow. But at least we did as well as we could. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_64.h | 8 ++++++++ fs/select.c | 4 +++- include/linux/uaccess.h | 7 +++++++ lib/strncpy_from_user.c | 9 +++++++++ lib/strnlen_user.c | 9 +++++++++ 5 files changed, 36 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 04789f45ab2b..a10149a96d9e 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -53,6 +53,14 @@ static inline unsigned long __untagged_addr_remote(struct mm_struct *mm, */ #define valid_user_address(x) ((__force long)(x) >= 0) +/* + * Masking the user address is an alternative to a conditional + * user_access_begin that can avoid the fencing. This only works + * for dense accesses starting at the address. + */ +#define mask_user_address(x) ((typeof(x))((long)(x)|((long)(x)>>63))) +#define masked_user_access_begin(x) ({ __uaccess_begin(); mask_user_address(x); }) + /* * User pointers can have tag bits on x86-64. This scheme tolerates * arbitrary values in those bits rather then masking them off. diff --git a/fs/select.c b/fs/select.c index 9515c3fa1a03..bc185d111436 100644 --- a/fs/select.c +++ b/fs/select.c @@ -780,7 +780,9 @@ static inline int get_sigset_argpack(struct sigset_argpack *to, { // the path is hot enough for overhead of copy_from_user() to matter if (from) { - if (!user_read_access_begin(from, sizeof(*from))) + if (can_do_masked_user_access()) + from = masked_user_access_begin(from); + else if (!user_read_access_begin(from, sizeof(*from))) return -EFAULT; unsafe_get_user(to->p, &from->p, Efault); unsafe_get_user(to->size, &from->size, Efault); diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 3064314f4832..f18371f6cf36 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -32,6 +32,13 @@ }) #endif +#ifdef masked_user_access_begin + #define can_do_masked_user_access() 1 +#else + #define can_do_masked_user_access() 0 + #define masked_user_access_begin(src) NULL +#endif + /* * Architectures should provide two primitives (raw_copy_{to,from}_user()) * and get rid of their private instances of copy_{to,from}_user() and diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 6432b8c3e431..989a12a67872 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -120,6 +120,15 @@ long strncpy_from_user(char *dst, const char __user *src, long count) if (unlikely(count <= 0)) return 0; + if (can_do_masked_user_access()) { + long retval; + + src = masked_user_access_begin(src); + retval = do_strncpy_from_user(dst, src, count, count); + user_read_access_end(); + return retval; + } + max_addr = TASK_SIZE_MAX; src_addr = (unsigned long)untagged_addr(src); if (likely(src_addr < max_addr)) { diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index feeb935a2299..6e489f9e90f1 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -96,6 +96,15 @@ long strnlen_user(const char __user *str, long count) if (unlikely(count <= 0)) return 0; + if (can_do_masked_user_access()) { + long retval; + + str = masked_user_access_begin(str); + retval = do_strnlen_user(str, count, count); + user_read_access_end(); + return retval; + } + max_addr = TASK_SIZE_MAX; src_addr = (unsigned long)untagged_addr(str); if (likely(src_addr < max_addr)) { -- cgit v1.2.3 From e68ac2b488495fa4d127d6105ce633849859957a Mon Sep 17 00:00:00 2001 From: Caleb Sander Mateos Date: Thu, 15 Aug 2024 11:15:40 -0600 Subject: softirq: Remove unused 'action' parameter from action callback When soft interrupt actions are called, they are passed a pointer to the struct softirq action which contains the action's function pointer. This pointer isn't useful, as the action callback already knows what function it is. And since each callback handles a specific soft interrupt, the callback also knows which soft interrupt number is running. No soft interrupt action callback actually uses this parameter, so remove it from the function pointer signature. This clarifies that soft interrupt actions are global routines and makes it slightly cheaper to call them. Signed-off-by: Caleb Sander Mateos Signed-off-by: Thomas Gleixner Reviewed-by: Jens Axboe Link: https://lore.kernel.org/all/20240815171549.3260003-1-csander@purestorage.com --- block/blk-mq.c | 2 +- include/linux/interrupt.h | 4 ++-- kernel/rcu/tiny.c | 2 +- kernel/rcu/tree.c | 2 +- kernel/sched/fair.c | 2 +- kernel/softirq.c | 15 +++++++-------- kernel/time/hrtimer.c | 2 +- kernel/time/timer.c | 2 +- lib/irq_poll.c | 2 +- net/core/dev.c | 4 ++-- 10 files changed, 18 insertions(+), 19 deletions(-) (limited to 'lib') diff --git a/block/blk-mq.c b/block/blk-mq.c index e3c3c0c21b55..aa28157b1aaf 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -1128,7 +1128,7 @@ static void blk_complete_reqs(struct llist_head *list) rq->q->mq_ops->complete(rq); } -static __latent_entropy void blk_done_softirq(struct softirq_action *h) +static __latent_entropy void blk_done_softirq(void) { blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); } diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 3f30c88e0b4c..694de61e0b38 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -594,7 +594,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS]; struct softirq_action { - void (*action)(struct softirq_action *); + void (*action)(void); }; asmlinkage void do_softirq(void); @@ -609,7 +609,7 @@ static inline void do_softirq_post_smp_call_flush(unsigned int unused) } #endif -extern void open_softirq(int nr, void (*action)(struct softirq_action *)); +extern void open_softirq(int nr, void (*action)(void)); extern void softirq_init(void); extern void __raise_softirq_irqoff(unsigned int nr); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4402d6f5f857..b3b3ce34df63 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -105,7 +105,7 @@ static inline bool rcu_reclaim_tiny(struct rcu_head *head) } /* Invoke the RCU callbacks whose grace period has elapsed. */ -static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) +static __latent_entropy void rcu_process_callbacks(void) { struct rcu_head *next, *list; unsigned long flags; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e641cc681901..93bd665637c0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2855,7 +2855,7 @@ static __latent_entropy void rcu_core(void) queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work); } -static void rcu_core_si(struct softirq_action *h) +static void rcu_core_si(void) { rcu_core(); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9057584ec06d..8dc9385f6da4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -12483,7 +12483,7 @@ out: * - indirectly from a remote scheduler_tick() for NOHZ idle balancing * through the SMP cross-call nohz_csd_func() */ -static __latent_entropy void sched_balance_softirq(struct softirq_action *h) +static __latent_entropy void sched_balance_softirq(void) { struct rq *this_rq = this_rq(); enum cpu_idle_type idle = this_rq->idle_balance; diff --git a/kernel/softirq.c b/kernel/softirq.c index 02582017759a..d082e7840f88 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -551,7 +551,7 @@ restart: kstat_incr_softirqs_this_cpu(vec_nr); trace_softirq_entry(vec_nr); - h->action(h); + h->action(); trace_softirq_exit(vec_nr); if (unlikely(prev_count != preempt_count())) { pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", @@ -700,7 +700,7 @@ void __raise_softirq_irqoff(unsigned int nr) or_softirq_pending(1UL << nr); } -void open_softirq(int nr, void (*action)(struct softirq_action *)) +void open_softirq(int nr, void (*action)(void)) { softirq_vec[nr].action = action; } @@ -760,8 +760,7 @@ static bool tasklet_clear_sched(struct tasklet_struct *t) return false; } -static void tasklet_action_common(struct softirq_action *a, - struct tasklet_head *tl_head, +static void tasklet_action_common(struct tasklet_head *tl_head, unsigned int softirq_nr) { struct tasklet_struct *list; @@ -805,16 +804,16 @@ static void tasklet_action_common(struct softirq_action *a, } } -static __latent_entropy void tasklet_action(struct softirq_action *a) +static __latent_entropy void tasklet_action(void) { workqueue_softirq_action(false); - tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); + tasklet_action_common(this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); } -static __latent_entropy void tasklet_hi_action(struct softirq_action *a) +static __latent_entropy void tasklet_hi_action(void) { workqueue_softirq_action(true); - tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); + tasklet_action_common(this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); } void tasklet_setup(struct tasklet_struct *t, diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index b8ee320208d4..836157e09e25 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1757,7 +1757,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, } } -static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) +static __latent_entropy void hrtimer_run_softirq(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 64b0d8a0aa0f..760bbeb1f331 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -2440,7 +2440,7 @@ static void run_timer_base(int index) /* * This function runs timers and the timer-tq in bottom half context. */ -static __latent_entropy void run_timer_softirq(struct softirq_action *h) +static __latent_entropy void run_timer_softirq(void) { run_timer_base(BASE_LOCAL); if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) { diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 2d5329a42105..08b242bbdbdf 100644 --- a/lib/irq_poll.c +++ b/lib/irq_poll.c @@ -75,7 +75,7 @@ void irq_poll_complete(struct irq_poll *iop) } EXPORT_SYMBOL(irq_poll_complete); -static void __latent_entropy irq_poll_softirq(struct softirq_action *h) +static void __latent_entropy irq_poll_softirq(void) { struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); int rearm = 0, budget = irq_poll_budget; diff --git a/net/core/dev.c b/net/core/dev.c index 6ea1d20676fb..e24a3bcb496d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5247,7 +5247,7 @@ int netif_rx(struct sk_buff *skb) } EXPORT_SYMBOL(netif_rx); -static __latent_entropy void net_tx_action(struct softirq_action *h) +static __latent_entropy void net_tx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); @@ -6920,7 +6920,7 @@ static int napi_threaded_poll(void *data) return 0; } -static __latent_entropy void net_rx_action(struct softirq_action *h) +static __latent_entropy void net_rx_action(void) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); unsigned long time_limit = jiffies + -- cgit v1.2.3 From 4e1c44b3db79ba910adec32e2e1b920a0e34890a Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 7 Aug 2024 12:31:20 +0200 Subject: kunit, slub: add test_kfree_rcu() and test_leak_destroy() Add a test that will create cache, allocate one object, kfree_rcu() it and attempt to destroy it. As long as the usage of kvfree_rcu_barrier() in kmem_cache_destroy() works correctly, there should be no warnings in dmesg and the test should pass. Additionally add a test_leak_destroy() test that leaks an object on purpose and verifies that kmem_cache_destroy() catches it. Signed-off-by: Vlastimil Babka --- lib/slub_kunit.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'lib') diff --git a/lib/slub_kunit.c b/lib/slub_kunit.c index e6667a28c014..6e3a1e5a7142 100644 --- a/lib/slub_kunit.c +++ b/lib/slub_kunit.c @@ -5,6 +5,7 @@ #include #include #include +#include #include "../mm/slab.h" static struct kunit_resource resource; @@ -157,6 +158,34 @@ static void test_kmalloc_redzone_access(struct kunit *test) kmem_cache_destroy(s); } +struct test_kfree_rcu_struct { + struct rcu_head rcu; +}; + +static void test_kfree_rcu(struct kunit *test) +{ + struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu", + sizeof(struct test_kfree_rcu_struct), + SLAB_NO_MERGE); + struct test_kfree_rcu_struct *p = kmem_cache_alloc(s, GFP_KERNEL); + + kfree_rcu(p, rcu); + kmem_cache_destroy(s); + + KUNIT_EXPECT_EQ(test, 0, slab_errors); +} + +static void test_leak_destroy(struct kunit *test) +{ + struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu", + 64, SLAB_NO_MERGE); + kmem_cache_alloc(s, GFP_KERNEL); + + kmem_cache_destroy(s); + + KUNIT_EXPECT_EQ(test, 1, slab_errors); +} + static int test_init(struct kunit *test) { slab_errors = 0; @@ -177,6 +206,8 @@ static struct kunit_case test_cases[] = { KUNIT_CASE(test_clobber_redzone_free), KUNIT_CASE(test_kmalloc_redzone_access), + KUNIT_CASE(test_kfree_rcu), + KUNIT_CASE(test_leak_destroy), {} }; -- cgit v1.2.3 From d7bcc37436c7d373a715d48a7cd74cd3b1724a68 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 22 Aug 2024 10:18:53 +0530 Subject: lib/test_bits.c: Add tests for GENMASK_U128() This adds GENMASK_U128() tests although currently only 64 bit wide masks are being tested. Cc: Andrew Morton Cc: linux-kernel@vger.kernel.org Reviewed-by: Arnd Bergmann Signed-off-by: Anshuman Khandual Signed-off-by: Yury Norov --- lib/test_bits.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'lib') diff --git a/lib/test_bits.c b/lib/test_bits.c index 01313980f175..c7b38d91e1f1 100644 --- a/lib/test_bits.c +++ b/lib/test_bits.c @@ -39,6 +39,36 @@ static void genmask_ull_test(struct kunit *test) #endif } +static void genmask_u128_test(struct kunit *test) +{ +#ifdef CONFIG_ARCH_SUPPORTS_INT128 + /* Below 64 bit masks */ + KUNIT_EXPECT_EQ(test, 0x0000000000000001ull, GENMASK_U128(0, 0)); + KUNIT_EXPECT_EQ(test, 0x0000000000000003ull, GENMASK_U128(1, 0)); + KUNIT_EXPECT_EQ(test, 0x0000000000000006ull, GENMASK_U128(2, 1)); + KUNIT_EXPECT_EQ(test, 0x00000000ffffffffull, GENMASK_U128(31, 0)); + KUNIT_EXPECT_EQ(test, 0x000000ffffe00000ull, GENMASK_U128(39, 21)); + KUNIT_EXPECT_EQ(test, 0xffffffffffffffffull, GENMASK_U128(63, 0)); + + /* Above 64 bit masks - only 64 bit portion can be validated once */ + KUNIT_EXPECT_EQ(test, 0xffffffffffffffffull, GENMASK_U128(64, 0) >> 1); + KUNIT_EXPECT_EQ(test, 0x00000000ffffffffull, GENMASK_U128(81, 50) >> 50); + KUNIT_EXPECT_EQ(test, 0x0000000000ffffffull, GENMASK_U128(87, 64) >> 64); + KUNIT_EXPECT_EQ(test, 0x0000000000ff0000ull, GENMASK_U128(87, 80) >> 64); + + KUNIT_EXPECT_EQ(test, 0xffffffffffffffffull, GENMASK_U128(127, 0) >> 64); + KUNIT_EXPECT_EQ(test, 0xffffffffffffffffull, (u64)GENMASK_U128(127, 0)); + KUNIT_EXPECT_EQ(test, 0x0000000000000003ull, GENMASK_U128(127, 126) >> 126); + KUNIT_EXPECT_EQ(test, 0x0000000000000001ull, GENMASK_U128(127, 127) >> 127); +#ifdef TEST_GENMASK_FAILURES + /* these should fail compilation */ + GENMASK_U128(0, 1); + GENMASK_U128(0, 10); + GENMASK_U128(9, 10); +#endif /* TEST_GENMASK_FAILURES */ +#endif /* CONFIG_ARCH_SUPPORTS_INT128 */ +} + static void genmask_input_check_test(struct kunit *test) { unsigned int x, y; @@ -56,12 +86,16 @@ static void genmask_input_check_test(struct kunit *test) /* Valid input */ KUNIT_EXPECT_EQ(test, 0, GENMASK_INPUT_CHECK(1, 1)); KUNIT_EXPECT_EQ(test, 0, GENMASK_INPUT_CHECK(39, 21)); + KUNIT_EXPECT_EQ(test, 0, GENMASK_INPUT_CHECK(100, 80)); + KUNIT_EXPECT_EQ(test, 0, GENMASK_INPUT_CHECK(110, 65)); + KUNIT_EXPECT_EQ(test, 0, GENMASK_INPUT_CHECK(127, 0)); } static struct kunit_case bits_test_cases[] = { KUNIT_CASE(genmask_test), KUNIT_CASE(genmask_ull_test), + KUNIT_CASE(genmask_u128_test), KUNIT_CASE(genmask_input_check_test), {} }; -- cgit v1.2.3 From 590b9d576caec6b4c46bba49ed36223a399c3fc5 Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 22 Jul 2024 18:29:24 +0200 Subject: mm: kvmalloc: align kvrealloc() with krealloc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Besides the obvious (and desired) difference between krealloc() and kvrealloc(), there is some inconsistency in their function signatures and behavior: - krealloc() frees the memory when the requested size is zero, whereas kvrealloc() simply returns a pointer to the existing allocation. - krealloc() behaves like kmalloc() if a NULL pointer is passed, whereas kvrealloc() does not accept a NULL pointer at all and, if passed, would fault instead. - krealloc() is self-contained, whereas kvrealloc() relies on the caller to provide the size of the previous allocation. Inconsistent behavior throughout allocation APIs is error prone, hence make kvrealloc() behave like krealloc(), which seems superior in all mentioned aspects. Besides that, implementing kvrealloc() by making use of krealloc() and vrealloc() provides oppertunities to grow (and shrink) allocations more efficiently. For instance, vrealloc() can be optimized to allocate and map additional pages to grow the allocation or unmap and free unused pages to shrink the allocation. [dakr@kernel.org: document concurrency restrictions] Link: https://lkml.kernel.org/r/20240725125442.4957-1-dakr@kernel.org [dakr@kernel.org: disable KASAN when switching to vmalloc] Link: https://lkml.kernel.org/r/20240730185049.6244-2-dakr@kernel.org [dakr@kernel.org: properly document __GFP_ZERO behavior] Link: https://lkml.kernel.org/r/20240730185049.6244-5-dakr@kernel.org Link: https://lkml.kernel.org/r/20240722163111.4766-3-dakr@kernel.org Signed-off-by: Danilo Krummrich Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Chandan Babu R Cc: Christian König Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Joonsoo Kim Cc: Kees Cook Cc: Marc Zyngier Cc: Michael Ellerman Cc: Miguel Ojeda Cc: Oliver Upton Cc: Pekka Enberg Cc: Roman Gushchin Cc: Uladzislau Rezki Cc: Wedson Almeida Filho Signed-off-by: Andrew Morton --- arch/arm64/kvm/nested.c | 1 - arch/powerpc/platforms/pseries/papr-vpd.c | 5 +- drivers/gpu/drm/drm_exec.c | 3 +- fs/xfs/xfs_log_recover.c | 2 +- include/linux/slab.h | 4 +- kernel/resource.c | 3 +- lib/fortify_kunit.c | 3 +- mm/util.c | 100 +++++++++++++++++++++--------- 8 files changed, 77 insertions(+), 44 deletions(-) (limited to 'lib') diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c index bab27f9d8cc6..8632c05e7997 100644 --- a/arch/arm64/kvm/nested.c +++ b/arch/arm64/kvm/nested.c @@ -62,7 +62,6 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu) */ num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU; tmp = kvrealloc(kvm->arch.nested_mmus, - size_mul(sizeof(*kvm->arch.nested_mmus), kvm->arch.nested_mmus_size), size_mul(sizeof(*kvm->arch.nested_mmus), num_mmus), GFP_KERNEL_ACCOUNT | __GFP_ZERO); if (!tmp) diff --git a/arch/powerpc/platforms/pseries/papr-vpd.c b/arch/powerpc/platforms/pseries/papr-vpd.c index c29e85db5f35..1574176e3ffc 100644 --- a/arch/powerpc/platforms/pseries/papr-vpd.c +++ b/arch/powerpc/platforms/pseries/papr-vpd.c @@ -156,10 +156,7 @@ static int vpd_blob_extend(struct vpd_blob *blob, const char *data, size_t len) const char *old_ptr = blob->data; char *new_ptr; - new_ptr = old_ptr ? - kvrealloc(old_ptr, old_len, new_len, GFP_KERNEL_ACCOUNT) : - kvmalloc(len, GFP_KERNEL_ACCOUNT); - + new_ptr = kvrealloc(old_ptr, new_len, GFP_KERNEL_ACCOUNT); if (!new_ptr) return -ENOMEM; diff --git a/drivers/gpu/drm/drm_exec.c b/drivers/gpu/drm/drm_exec.c index 2da094bdf8a4..18e366cc4993 100644 --- a/drivers/gpu/drm/drm_exec.c +++ b/drivers/gpu/drm/drm_exec.c @@ -145,8 +145,7 @@ static int drm_exec_obj_locked(struct drm_exec *exec, size_t size = exec->max_objects * sizeof(void *); void *tmp; - tmp = kvrealloc(exec->objects, size, size + PAGE_SIZE, - GFP_KERNEL); + tmp = kvrealloc(exec->objects, size + PAGE_SIZE, GFP_KERNEL); if (!tmp) return -ENOMEM; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4423dd344239..1997981827fb 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2128,7 +2128,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL); + ptr = kvrealloc(old_ptr, len + old_len, GFP_KERNEL); if (!ptr) return -ENOMEM; memcpy(&ptr[old_len], dp, len); diff --git a/include/linux/slab.h b/include/linux/slab.h index eb2bf4629157..c9cb42203183 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -841,8 +841,8 @@ kvmalloc_array_node_noprof(size_t n, size_t size, gfp_t flags, int node) #define kvcalloc_node(...) alloc_hooks(kvcalloc_node_noprof(__VA_ARGS__)) #define kvcalloc(...) alloc_hooks(kvcalloc_noprof(__VA_ARGS__)) -extern void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) - __realloc_size(3); +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) + __realloc_size(2); #define kvrealloc(...) alloc_hooks(kvrealloc_noprof(__VA_ARGS__)) extern void kvfree(const void *addr); diff --git a/kernel/resource.c b/kernel/resource.c index 14777afb0a99..9f747bb7cd03 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -450,8 +450,7 @@ int walk_system_ram_res_rev(u64 start, u64 end, void *arg, /* re-alloc */ struct resource *rams_new; - rams_new = kvrealloc(rams, rams_size * sizeof(struct resource), - (rams_size + 16) * sizeof(struct resource), + rams_new = kvrealloc(rams, (rams_size + 16) * sizeof(struct resource), GFP_KERNEL); if (!rams_new) goto out; diff --git a/lib/fortify_kunit.c b/lib/fortify_kunit.c index f9ad60a9c7bd..ecb638d4cde1 100644 --- a/lib/fortify_kunit.c +++ b/lib/fortify_kunit.c @@ -306,8 +306,7 @@ DEFINE_ALLOC_SIZE_TEST_PAIR(vmalloc) orig = kvmalloc(prev_size, gfp); \ KUNIT_EXPECT_TRUE(test, orig != NULL); \ checker(((expected_pages) * PAGE_SIZE) * 2, \ - kvrealloc(orig, prev_size, \ - ((alloc_pages) * PAGE_SIZE) * 2, gfp), \ + kvrealloc(orig, ((alloc_pages) * PAGE_SIZE) * 2, gfp), \ kvfree(p)); \ } while (0) DEFINE_ALLOC_SIZE_TEST_PAIR(kvmalloc) diff --git a/mm/util.c b/mm/util.c index bd283e2132e0..ac01925a4179 100644 --- a/mm/util.c +++ b/mm/util.c @@ -608,6 +608,28 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size) +{ + /* + * We want to attempt a large physically contiguous block first because + * it is less likely to fragment multiple larger blocks and therefore + * contribute to a long term fragmentation less than vmalloc fallback. + * However make sure that larger requests are not too disruptive - no + * OOM killer and no allocation failure warnings as we have a fallback. + */ + if (size > PAGE_SIZE) { + flags |= __GFP_NOWARN; + + if (!(flags & __GFP_RETRY_MAYFAIL)) + flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + flags &= ~__GFP_NOFAIL; + } + + return flags; +} + /** * __kvmalloc_node - attempt to allocate physically contiguous memory, but upon * failure, fall back to non-contiguous (vmalloc) allocation. @@ -627,32 +649,15 @@ EXPORT_SYMBOL(vm_mmap); */ void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node) { - gfp_t kmalloc_flags = flags; void *ret; - /* - * We want to attempt a large physically contiguous block first because - * it is less likely to fragment multiple larger blocks and therefore - * contribute to a long term fragmentation less than vmalloc fallback. - * However make sure that larger requests are not too disruptive - no - * OOM killer and no allocation failure warnings as we have a fallback. - */ - if (size > PAGE_SIZE) { - kmalloc_flags |= __GFP_NOWARN; - - if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) - kmalloc_flags |= __GFP_NORETRY; - - /* nofail semantic is implemented by the vmalloc fallback */ - kmalloc_flags &= ~__GFP_NOFAIL; - } - - ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), kmalloc_flags, node); - /* * It doesn't really make sense to fallback to vmalloc for sub page * requests */ + ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), + kmalloc_gfp_adjust(flags, size), + node); if (ret || size <= PAGE_SIZE) return ret; @@ -715,18 +720,53 @@ void kvfree_sensitive(const void *addr, size_t len) } EXPORT_SYMBOL(kvfree_sensitive); -void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags) +/** + * kvrealloc - reallocate memory; contents remain unchanged + * @p: object to reallocate memory for + * @size: the size to reallocate + * @flags: the flags for the page level allocator + * + * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0 + * and @p is not a %NULL pointer, the object pointed to is freed. + * + * If __GFP_ZERO logic is requested, callers must ensure that, starting with the + * initial memory allocation, every subsequent call to this API for the same + * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that + * __GFP_ZERO is not fully honored by this API. + * + * In any case, the contents of the object pointed to are preserved up to the + * lesser of the new and old sizes. + * + * This function must not be called concurrently with itself or kvfree() for the + * same memory allocation. + * + * Return: pointer to the allocated memory or %NULL in case of error + */ +void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags) { - void *newp; + void *n; - if (oldsize >= newsize) - return (void *)p; - newp = kvmalloc_noprof(newsize, flags); - if (!newp) - return NULL; - memcpy(newp, p, oldsize); - kvfree(p); - return newp; + if (is_vmalloc_addr(p)) + return vrealloc_noprof(p, size, flags); + + n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size)); + if (!n) { + /* We failed to krealloc(), fall back to kvmalloc(). */ + n = kvmalloc_noprof(size, flags); + if (!n) + return NULL; + + if (p) { + /* We already know that `p` is not a vmalloc address. */ + kasan_disable_current(); + memcpy(n, kasan_reset_tag(p), ksize(p)); + kasan_enable_current(); + + kfree(p); + } + } + + return n; } EXPORT_SYMBOL(kvrealloc_noprof); -- cgit v1.2.3 From 592c9330e3692c9636adddfe17004870ae8dc434 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 26 Jul 2024 15:12:46 +0200 Subject: lib: test_hmm: use min() to improve dmirror_exclusive() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use min() to simplify the dmirror_exclusive() function and improve its readability. Link: https://lkml.kernel.org/r/20240726131245.161695-1-thorsten.blum@toblux.com Signed-off-by: Thorsten Blum Reviewed-by: David Hildenbrand Cc: Jérôme Glisse Signed-off-by: Andrew Morton --- lib/test_hmm.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/test_hmm.c b/lib/test_hmm.c index ee20e1f9bae9..056f2e411d7b 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -799,10 +799,7 @@ static int dmirror_exclusive(struct dmirror *dmirror, unsigned long mapped = 0; int i; - if (end < addr + (ARRAY_SIZE(pages) << PAGE_SHIFT)) - next = end; - else - next = addr + (ARRAY_SIZE(pages) << PAGE_SHIFT); + next = min(end, addr + (ARRAY_SIZE(pages) << PAGE_SHIFT)); ret = make_device_exclusive_range(mm, addr, next, pages, NULL); /* -- cgit v1.2.3 From e1b8b883bb838339eec728de5d02e2f252a7d3d9 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Mon, 12 Aug 2024 15:05:42 -0400 Subject: maple_tree: reset mas->index and mas->last on write retries The following scenario can result in a race condition: Consider a node with the following indices and values a<------->b<----------->c<--------->d 0xA NULL 0xB CPU 1 CPU 2 --------- --------- mas_set_range(a,b) mas_erase() -> range is expanded (a,c) because of null expansion mas_nomem() mas_unlock() mas_store_range(b,c,0xC) The node now looks like: a<------->b<----------->c<--------->d 0xA 0xC 0xB mas_lock() mas_erase() <------ range of erase is still (a,c) The node is now NULL from (a,c) but the write from CPU 2 should have been retained and range (b,c) should still have 0xC as its value. We can fix this by re-intializing to the original index and last. This does not need a cc: Stable as there are no users of the maple tree which use internal locking and this condition is only possible with internal locking. Link: https://lkml.kernel.org/r/20240812190543.71967-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/maple_tree.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index aa3a5df15b8e..b547ff211ac7 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5451,14 +5451,19 @@ EXPORT_SYMBOL_GPL(mas_store); */ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) { + unsigned long index = mas->index; + unsigned long last = mas->last; MA_WR_STATE(wr_mas, mas, entry); mas_wr_store_setup(&wr_mas); trace_ma_write(__func__, mas, 0, entry); retry: mas_wr_store_entry(&wr_mas); - if (unlikely(mas_nomem(mas, gfp))) + if (unlikely(mas_nomem(mas, gfp))) { + if (!entry) + __mas_set_range(mas, index, last); goto retry; + } if (unlikely(mas_is_err(mas))) return xa_err(mas->node); @@ -6245,23 +6250,26 @@ EXPORT_SYMBOL_GPL(mas_find_range_rev); void *mas_erase(struct ma_state *mas) { void *entry; + unsigned long index = mas->index; MA_WR_STATE(wr_mas, mas, NULL); if (!mas_is_active(mas) || !mas_is_start(mas)) mas->status = ma_start; - /* Retry unnecessary when holding the write lock. */ +write_retry: entry = mas_state_walk(mas); if (!entry) return NULL; -write_retry: /* Must reset to ensure spanning writes of last slot are detected */ mas_reset(mas); mas_wr_store_setup(&wr_mas); mas_wr_store_entry(&wr_mas); - if (mas_nomem(mas, GFP_KERNEL)) + if (mas_nomem(mas, GFP_KERNEL)) { + /* in case the range of entry changed when unlocked */ + mas->index = mas->last = index; goto write_retry; + } return entry; } -- cgit v1.2.3 From 617f8e4d76b81361884db852005f519012ddd244 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Mon, 12 Aug 2024 15:05:43 -0400 Subject: maple_tree: add test to replicate low memory race conditions Add new callback fields to the userspace implementation of struct kmem_cache. This allows for executing callback functions in order to further test low memory scenarios where node allocation is retried. This callback can help test race conditions by calling a function when a low memory event is tested. Link: https://lkml.kernel.org/r/20240812190543.71967-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/maple_tree.c | 13 +++++++++ tools/testing/radix-tree/maple.c | 63 ++++++++++++++++++++++++++++++++++++++++ tools/testing/shared/linux.c | 26 ++++++++++++++++- 3 files changed, 101 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b547ff211ac7..14d7864b8d53 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7005,6 +7005,19 @@ void mt_set_non_kernel(unsigned int val) kmem_cache_set_non_kernel(maple_node_cache, val); } +extern void kmem_cache_set_callback(struct kmem_cache *cachep, + void (*callback)(void *)); +void mt_set_callback(void (*callback)(void *)) +{ + kmem_cache_set_callback(maple_node_cache, callback); +} + +extern void kmem_cache_set_private(struct kmem_cache *cachep, void *private); +void mt_set_private(void *private) +{ + kmem_cache_set_private(maple_node_cache, private); +} + extern unsigned long kmem_cache_get_alloc(struct kmem_cache *); unsigned long mt_get_alloc_size(void) { diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index cd1cf05503b4..ef5b83cf94ea 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36224,6 +36224,65 @@ static noinline void __init check_mtree_dup(struct maple_tree *mt) extern void test_kmem_cache_bulk(void); +/* callback function used for check_nomem_writer_race() */ +static void writer2(void *maple_tree) +{ + struct maple_tree *mt = (struct maple_tree *)maple_tree; + MA_STATE(mas, mt, 6, 10); + + mtree_lock(mas.tree); + mas_store(&mas, xa_mk_value(0xC)); + mas_destroy(&mas); + mtree_unlock(mas.tree); +} + +/* + * check_nomem_writer_race() - test a possible race in the mas_nomem() path + * @mt: The tree to build. + * + * There is a possible race condition in low memory conditions when mas_nomem() + * gives up its lock. A second writer can chagne the entry that the primary + * writer executing the mas_nomem() path is modifying. This test recreates this + * scenario to ensure we are handling it correctly. + */ +static void check_nomem_writer_race(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, 5); + + mt_set_non_kernel(0); + /* setup root with 2 values with NULL in between */ + mtree_store_range(mt, 0, 5, xa_mk_value(0xA), GFP_KERNEL); + mtree_store_range(mt, 6, 10, NULL, GFP_KERNEL); + mtree_store_range(mt, 11, 15, xa_mk_value(0xB), GFP_KERNEL); + + /* setup writer 2 that will trigger the race condition */ + mt_set_private(mt); + mt_set_callback(writer2); + + mtree_lock(mt); + /* erase 0-5 */ + mas_erase(&mas); + + /* index 6-10 should retain the value from writer 2 */ + check_load(mt, 6, xa_mk_value(0xC)); + mtree_unlock(mt); + + /* test for the same race but with mas_store_gfp() */ + mtree_store_range(mt, 0, 5, xa_mk_value(0xA), GFP_KERNEL); + mtree_store_range(mt, 6, 10, NULL, GFP_KERNEL); + + mas_set_range(&mas, 0, 5); + mtree_lock(mt); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + + /* ensure write made by writer 2 is retained */ + check_load(mt, 6, xa_mk_value(0xC)); + + mt_set_private(NULL); + mt_set_callback(NULL); + mtree_unlock(mt); +} + void farmer_tests(void) { struct maple_node *node; @@ -36257,6 +36316,10 @@ void farmer_tests(void) check_dfs_preorder(&tree); mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU); + check_nomem_writer_race(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_prealloc(&tree); mtree_destroy(&tree); diff --git a/tools/testing/shared/linux.c b/tools/testing/shared/linux.c index 4eb442206d01..17263696b5d8 100644 --- a/tools/testing/shared/linux.c +++ b/tools/testing/shared/linux.c @@ -26,8 +26,21 @@ struct kmem_cache { unsigned int non_kernel; unsigned long nr_allocated; unsigned long nr_tallocated; + bool exec_callback; + void (*callback)(void *); + void *private; }; +void kmem_cache_set_callback(struct kmem_cache *cachep, void (*callback)(void *)) +{ + cachep->callback = callback; +} + +void kmem_cache_set_private(struct kmem_cache *cachep, void *private) +{ + cachep->private = private; +} + void kmem_cache_set_non_kernel(struct kmem_cache *cachep, unsigned int val) { cachep->non_kernel = val; @@ -58,9 +71,17 @@ void *kmem_cache_alloc_lru(struct kmem_cache *cachep, struct list_lru *lru, { void *p; + if (cachep->exec_callback) { + if (cachep->callback) + cachep->callback(cachep->private); + cachep->exec_callback = false; + } + if (!(gfp & __GFP_DIRECT_RECLAIM)) { - if (!cachep->non_kernel) + if (!cachep->non_kernel) { + cachep->exec_callback = true; return NULL; + } cachep->non_kernel--; } @@ -223,6 +244,9 @@ kmem_cache_create(const char *name, unsigned int size, unsigned int align, ret->objs = NULL; ret->ctor = ctor; ret->non_kernel = 0; + ret->exec_callback = false; + ret->callback = NULL; + ret->private = NULL; return ret; } -- cgit v1.2.3 From 7a0529d0c2aa552498cc9b5c116f6e2ab8c2ac43 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 12 Aug 2024 15:09:24 +0000 Subject: maple_tree: fix comment typo of ma_root In comment of mas_start(), we lists the return value for different cases. In case of a single entry, we set mas->status to ma_root, while the comment uses mas_root, which is not a maple_status. Fix the typo according to the code. Link: https://lkml.kernel.org/r/20240812150925.31551-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 14d7864b8d53..b3a3d5a00577 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1346,7 +1346,7 @@ static void mas_node_count(struct ma_state *mas, int count) * Return: * - If mas->node is an error or not mas_start, return NULL. * - If it's an empty tree: NULL & mas->status == ma_none - * - If it's a single entry: The entry & mas->status == mas_root + * - If it's a single entry: The entry & mas->status == ma_root * - If it's a tree: NULL & mas->status == safe root node. */ static inline struct maple_enode *mas_start(struct ma_state *mas) -- cgit v1.2.3 From c64d66153b3413095c57805e66b8b6e2ff8633c0 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 12 Aug 2024 15:09:25 +0000 Subject: maple_tree: fix comment typo with corresponding maple_status In comment of function mas_start(), we list the return value of different cases. According to the comment context, tell the maple_status here is more consistent with others. Let's correct it with ma_active in the case it's a tree. Link: https://lkml.kernel.org/r/20240812150925.31551-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b3a3d5a00577..884e2d130876 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1347,7 +1347,7 @@ static void mas_node_count(struct ma_state *mas, int count) * - If mas->node is an error or not mas_start, return NULL. * - If it's an empty tree: NULL & mas->status == ma_none * - If it's a single entry: The entry & mas->status == ma_root - * - If it's a tree: NULL & mas->status == safe root node. + * - If it's a tree: NULL & mas->status == ma_active */ static inline struct maple_enode *mas_start(struct ma_state *mas) { -- cgit v1.2.3 From 19138a2cc1ad82b74fb720411a2054db0522be2d Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:29 -0400 Subject: maple_tree: introduce mas_wr_prealloc_setup() Introduce a helper function, mas_wr_prealoc_setup(), that will set up a maple write state in order to start a walk of a maple tree. Link: https://lkml.kernel.org/r/20240814161944.55347-3-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 884e2d130876..407c0be6e42f 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5399,6 +5399,13 @@ reset: mas_reset(wr_mas->mas); } +static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) +{ + struct ma_state *mas = wr_mas->mas; + + mas_wr_store_setup(wr_mas); + wr_mas->content = mas_start(mas); +} /* Interface */ /** @@ -5509,8 +5516,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) if (unlikely(!mas->index && mas->last == ULONG_MAX)) goto ask_now; - mas_wr_store_setup(&wr_mas); - wr_mas.content = mas_start(mas); + mas_wr_prealloc_setup(&wr_mas); /* Root expand */ if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) goto ask_now; -- cgit v1.2.3 From 3cc6f42a53f79f8da52b252859145b6bf8dfe12e Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:30 -0400 Subject: maple_tree: move up mas_wr_store_setup() and mas_wr_prealloc_setup() Subsequent patches require these definitions to be higher, no functional changes intended. Link: https://lkml.kernel.org/r/20240814161944.55347-4-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 96 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 48 insertions(+), 48 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 407c0be6e42f..de4a91ced8ca 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4227,6 +4227,54 @@ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) mas_wr_modify(wr_mas); } +static void mas_wr_store_setup(struct ma_wr_state *wr_mas) +{ + if (!mas_is_active(wr_mas->mas)) { + if (mas_is_start(wr_mas->mas)) + return; + + if (unlikely(mas_is_paused(wr_mas->mas))) + goto reset; + + if (unlikely(mas_is_none(wr_mas->mas))) + goto reset; + + if (unlikely(mas_is_overflow(wr_mas->mas))) + goto reset; + + if (unlikely(mas_is_underflow(wr_mas->mas))) + goto reset; + } + + /* + * A less strict version of mas_is_span_wr() where we allow spanning + * writes within this node. This is to stop partial walks in + * mas_prealloc() from being reset. + */ + if (wr_mas->mas->last > wr_mas->mas->max) + goto reset; + + if (wr_mas->entry) + return; + + if (mte_is_leaf(wr_mas->mas->node) && + wr_mas->mas->last == wr_mas->mas->max) + goto reset; + + return; + +reset: + mas_reset(wr_mas->mas); +} + +static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) +{ + struct ma_state *mas = wr_mas->mas; + + mas_wr_store_setup(wr_mas); + wr_mas->content = mas_start(mas); +} + /** * mas_insert() - Internal call to insert a value * @mas: The maple state @@ -5358,54 +5406,6 @@ static inline void mte_destroy_walk(struct maple_enode *enode, mt_destroy_walk(enode, mt, true); } } - -static void mas_wr_store_setup(struct ma_wr_state *wr_mas) -{ - if (!mas_is_active(wr_mas->mas)) { - if (mas_is_start(wr_mas->mas)) - return; - - if (unlikely(mas_is_paused(wr_mas->mas))) - goto reset; - - if (unlikely(mas_is_none(wr_mas->mas))) - goto reset; - - if (unlikely(mas_is_overflow(wr_mas->mas))) - goto reset; - - if (unlikely(mas_is_underflow(wr_mas->mas))) - goto reset; - } - - /* - * A less strict version of mas_is_span_wr() where we allow spanning - * writes within this node. This is to stop partial walks in - * mas_prealloc() from being reset. - */ - if (wr_mas->mas->last > wr_mas->mas->max) - goto reset; - - if (wr_mas->entry) - return; - - if (mte_is_leaf(wr_mas->mas->node) && - wr_mas->mas->last == wr_mas->mas->max) - goto reset; - - return; - -reset: - mas_reset(wr_mas->mas); -} - -static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) -{ - struct ma_state *mas = wr_mas->mas; - - mas_wr_store_setup(wr_mas); - wr_mas->content = mas_start(mas); -} /* Interface */ /** -- cgit v1.2.3 From 5d659bbb52a24f91cc6c0cf546ffcc0faa7e8f1a Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:31 -0400 Subject: maple_tree: introduce mas_wr_store_type() Introduce mas_wr_store_type() which will set the correct store type based on a walk of the tree. In mas_wr_node_store() the <= min_slots condition is changed to < as if new_end is = to mt_min_slots then there is not enough room. mas_prealloc_calc() is also introduced to abstract the calculation used to determine the number of nodes needed for a store operation. In this change a call to mas_reset() is removed in the error case of mas_prealloc(). This is only needed in the MA_STATE_REBALANCE case of mas_destroy(). We can move the call to mas_reset() directly to mas_destroy(). Also, add a test case to validate the order that we check the store type in is correct. This test models a vma expanding and then shrinking which is part of the boot process. Link: https://lkml.kernel.org/r/20240814161944.55347-5-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 217 ++++++++++++++++++++++++++++----------- tools/testing/radix-tree/maple.c | 36 +++++++ 2 files changed, 193 insertions(+), 60 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index de4a91ced8ca..d0b9b3795b96 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1372,9 +1372,9 @@ retry: return NULL; } + mas->node = NULL; /* empty tree */ if (unlikely(!root)) { - mas->node = NULL; mas->status = ma_none; mas->offset = MAPLE_NODE_SLOTS; return NULL; @@ -3890,7 +3890,7 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, bool in_rcu = mt_in_rcu(mas->tree); /* Check if there is enough data. The room is enough. */ - if (!mte_is_root(mas->node) && (new_end <= mt_min_slots[wr_mas->type]) && + if (!mte_is_root(mas->node) && (new_end < mt_min_slots[wr_mas->type]) && !(mas->mas_flags & MA_STATE_BULK)) return false; @@ -4275,6 +4275,146 @@ static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) wr_mas->content = mas_start(mas); } +/** + * mas_prealloc_calc() - Calculate number of nodes needed for a + * given store oepration + * @mas: The maple state + * @entry: The entry to store into the tree + * + * Return: Number of nodes required for preallocation. + */ +static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) +{ + int ret = mas_mt_height(mas) * 3 + 1; + + switch (mas->store_type) { + case wr_invalid: + WARN_ON_ONCE(1); + break; + case wr_new_root: + ret = 1; + break; + case wr_store_root: + if (likely((mas->last != 0) || (mas->index != 0))) + ret = 1; + else if (((unsigned long) (entry) & 3) == 2) + ret = 1; + else + ret = 0; + break; + case wr_spanning_store: + ret = mas_mt_height(mas) * 3 + 1; + break; + case wr_split_store: + ret = mas_mt_height(mas) * 2 + 1; + break; + case wr_rebalance: + ret = mas_mt_height(mas) * 2 - 1; + break; + case wr_node_store: + ret = mt_in_rcu(mas->tree) ? 1 : 0; + break; + case wr_append: + case wr_exact_fit: + case wr_slot_store: + ret = 0; + } + + return ret; +} + +/* + * mas_wr_store_type() - Set the store type for a given + * store operation. + * @wr_mas: The maple write state + */ +static inline void mas_wr_store_type(struct ma_wr_state *wr_mas) +{ + struct ma_state *mas = wr_mas->mas; + unsigned char new_end; + + if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) { + mas->store_type = wr_store_root; + return; + } + + if (unlikely(!mas_wr_walk(wr_mas))) { + mas->store_type = wr_spanning_store; + return; + } + + /* At this point, we are at the leaf node that needs to be altered. */ + mas_wr_end_piv(wr_mas); + if (!wr_mas->entry) + mas_wr_extend_null(wr_mas); + + new_end = mas_wr_new_end(wr_mas); + if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) { + mas->store_type = wr_exact_fit; + return; + } + + if (unlikely(!mas->index && mas->last == ULONG_MAX)) { + mas->store_type = wr_new_root; + return; + } + + /* Potential spanning rebalance collapsing a node */ + if (new_end < mt_min_slots[wr_mas->type]) { + if (!mte_is_root(mas->node)) { + mas->store_type = wr_rebalance; + return; + } + mas->store_type = wr_node_store; + return; + } + + if (new_end >= mt_slots[wr_mas->type]) { + mas->store_type = wr_split_store; + return; + } + + if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) { + mas->store_type = wr_append; + return; + } + + if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) || + (wr_mas->offset_end - mas->offset == 1))) { + mas->store_type = wr_slot_store; + return; + } + + if (mte_is_root(mas->node) || (new_end >= mt_min_slots[wr_mas->type]) || + (mas->mas_flags & MA_STATE_BULK)) { + mas->store_type = wr_node_store; + return; + } + + mas->store_type = wr_invalid; + MAS_WARN_ON(mas, 1); +} + +/** + * mas_wr_preallocate() - Preallocate enough nodes for a store operation + * @wr_mas: The maple write state + * @entry: The entry that will be stored + * + */ +static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) +{ + struct ma_state *mas = wr_mas->mas; + int request; + + mas_wr_prealloc_setup(wr_mas); + mas_wr_store_type(wr_mas); + request = mas_prealloc_calc(mas, entry); + if (!request) + return; + + mas_node_count(mas, request); +} + /** * mas_insert() - Internal call to insert a value * @mas: The maple state @@ -5508,69 +5648,25 @@ EXPORT_SYMBOL_GPL(mas_store_prealloc); int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) { MA_WR_STATE(wr_mas, mas, entry); - unsigned char node_size; - int request = 1; - int ret; - - - if (unlikely(!mas->index && mas->last == ULONG_MAX)) - goto ask_now; + int ret = 0; + int request; mas_wr_prealloc_setup(&wr_mas); - /* Root expand */ - if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) - goto ask_now; - - if (unlikely(!mas_wr_walk(&wr_mas))) { - /* Spanning store, use worst case for now */ - request = 1 + mas_mt_height(mas) * 3; - goto ask_now; - } - - /* At this point, we are at the leaf node that needs to be altered. */ - /* Exact fit, no nodes needed. */ - if (wr_mas.r_min == mas->index && wr_mas.r_max == mas->last) - return 0; - - mas_wr_end_piv(&wr_mas); - node_size = mas_wr_new_end(&wr_mas); - - /* Slot store, does not require additional nodes */ - if (node_size == mas->end) { - /* reuse node */ - if (!mt_in_rcu(mas->tree)) - return 0; - /* shifting boundary */ - if (wr_mas.offset_end - mas->offset == 1) - return 0; - } + mas_wr_store_type(&wr_mas); + request = mas_prealloc_calc(mas, entry); + if (!request) + return ret; - if (node_size >= mt_slots[wr_mas.type]) { - /* Split, worst case for now. */ - request = 1 + mas_mt_height(mas) * 2; - goto ask_now; + mas_node_count_gfp(mas, request, gfp); + if (mas_is_err(mas)) { + mas_set_alloc_req(mas, 0); + ret = xa_err(mas->node); + mas_destroy(mas); + mas_reset(mas); + return ret; } - /* New root needs a single node */ - if (unlikely(mte_is_root(mas->node))) - goto ask_now; - - /* Potential spanning rebalance collapsing a node, use worst-case */ - if (node_size - 1 <= mt_min_slots[wr_mas.type]) - request = mas_mt_height(mas) * 2 - 1; - - /* node store, slot store needs one node */ -ask_now: - mas_node_count_gfp(mas, request, gfp); mas->mas_flags |= MA_STATE_PREALLOC; - if (likely(!mas_is_err(mas))) - return 0; - - mas_set_alloc_req(mas, 0); - ret = xa_err(mas->node); - mas_reset(mas); - mas_destroy(mas); - mas_reset(mas); return ret; } EXPORT_SYMBOL_GPL(mas_preallocate); @@ -5596,7 +5692,8 @@ void mas_destroy(struct ma_state *mas) */ if (mas->mas_flags & MA_STATE_REBALANCE) { unsigned char end; - + if (mas_is_err(mas)) + mas_reset(mas); mas_start(mas); mtree_range_walk(mas); end = mas->end + 1; diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index ef5b83cf94ea..ad42a36231fb 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -36283,6 +36283,38 @@ static void check_nomem_writer_race(struct maple_tree *mt) mtree_unlock(mt); } + /* test to simulate expanding a vma from [0x7fffffffe000, 0x7ffffffff000) + * to [0x7ffde4ca1000, 0x7ffffffff000) and then shrinking the vma to + * [0x7ffde4ca1000, 0x7ffde4ca2000) + */ +static inline int check_vma_modification(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, 0); + + mtree_lock(mt); + /* vma with old start and old end */ + __mas_set_range(&mas, 0x7fffffffe000, 0x7ffffffff000 - 1); + mas_preallocate(&mas, xa_mk_value(1), GFP_KERNEL); + mas_store_prealloc(&mas, xa_mk_value(1)); + + /* next write occurs partly in previous range [0, 0x7fffffffe000)*/ + mas_prev_range(&mas, 0); + /* expand vma to {0x7ffde4ca1000, 0x7ffffffff000) */ + __mas_set_range(&mas, 0x7ffde4ca1000, 0x7ffffffff000 - 1); + mas_preallocate(&mas, xa_mk_value(1), GFP_KERNEL); + mas_store_prealloc(&mas, xa_mk_value(1)); + + /* shrink vma to [0x7ffde4ca1000, 7ffde4ca2000) */ + __mas_set_range(&mas, 0x7ffde4ca2000, 0x7ffffffff000 - 1); + mas_preallocate(&mas, NULL, GFP_KERNEL); + mas_store_prealloc(&mas, NULL); + mt_dump(mt, mt_dump_hex); + + mas_destroy(&mas); + mtree_unlock(mt); + return 0; +} + void farmer_tests(void) { struct maple_node *node; @@ -36290,6 +36322,10 @@ void farmer_tests(void) mt_dump(&tree, mt_dump_dec); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | MT_FLAGS_USE_RCU); + check_vma_modification(&tree); + mtree_destroy(&tree); + tree.ma_root = xa_mk_value(0); mt_dump(&tree, mt_dump_dec); -- cgit v1.2.3 From 3cd9e92e009de9a036cbf9ec27bad33367fca6f3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:32 -0400 Subject: maple_tree: remove mas_destroy() from mas_nomem() Separate call to mas_destroy() from mas_nomem() so we can check for no memory errors without destroying the current maple state in mas_store_gfp(). We then add calls to mas_destroy() to callers of mas_nomem(). Link: https://lkml.kernel.org/r/20240814161944.55347-6-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 37 ++++++++++++++++++++++++------------- tools/testing/radix-tree/maple.c | 10 ++++++---- 2 files changed, 30 insertions(+), 17 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d0b9b3795b96..58985107cf00 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4519,6 +4519,7 @@ int mas_alloc_cyclic(struct ma_state *mas, unsigned long *startp, if (*next == 0) mas->tree->ma_flags |= MT_FLAGS_ALLOC_WRAPPED; + mas_destroy(mas); return ret; } EXPORT_SYMBOL(mas_alloc_cyclic); @@ -5601,21 +5602,25 @@ int mas_store_gfp(struct ma_state *mas, void *entry, gfp_t gfp) unsigned long index = mas->index; unsigned long last = mas->last; MA_WR_STATE(wr_mas, mas, entry); + int ret = 0; - mas_wr_store_setup(&wr_mas); - trace_ma_write(__func__, mas, 0, entry); retry: - mas_wr_store_entry(&wr_mas); + mas_wr_preallocate(&wr_mas, entry); if (unlikely(mas_nomem(mas, gfp))) { if (!entry) __mas_set_range(mas, index, last); goto retry; } - if (unlikely(mas_is_err(mas))) - return xa_err(mas->node); + if (mas_is_err(mas)) { + ret = xa_err(mas->node); + goto out; + } - return 0; + mas_wr_store_entry(&wr_mas); +out: + mas_destroy(mas); + return ret; } EXPORT_SYMBOL_GPL(mas_store_gfp); @@ -6374,6 +6379,7 @@ write_retry: goto write_retry; } + mas_destroy(mas); return entry; } EXPORT_SYMBOL_GPL(mas_erase); @@ -6388,10 +6394,8 @@ EXPORT_SYMBOL_GPL(mas_erase); bool mas_nomem(struct ma_state *mas, gfp_t gfp) __must_hold(mas->tree->ma_lock) { - if (likely(mas->node != MA_ERROR(-ENOMEM))) { - mas_destroy(mas); + if (likely(mas->node != MA_ERROR(-ENOMEM))) return false; - } if (gfpflags_allow_blocking(gfp) && !mt_external_lock(mas->tree)) { mtree_unlock(mas->tree); @@ -6469,6 +6473,7 @@ int mtree_store_range(struct maple_tree *mt, unsigned long index, { MA_STATE(mas, mt, index, last); MA_WR_STATE(wr_mas, &mas, entry); + int ret = 0; trace_ma_write(__func__, &mas, 0, entry); if (WARN_ON_ONCE(xa_is_advanced(entry))) @@ -6484,10 +6489,12 @@ retry: goto retry; mtree_unlock(mt); + if (mas_is_err(&mas)) - return xa_err(mas.node); + ret = xa_err(mas.node); - return 0; + mas_destroy(&mas); + return ret; } EXPORT_SYMBOL(mtree_store_range); @@ -6523,6 +6530,7 @@ int mtree_insert_range(struct maple_tree *mt, unsigned long first, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(ms, mt, first, last); + int ret = 0; if (WARN_ON_ONCE(xa_is_advanced(entry))) return -EINVAL; @@ -6538,9 +6546,10 @@ retry: mtree_unlock(mt); if (mas_is_err(&ms)) - return xa_err(ms.node); + ret = xa_err(ms.node); - return 0; + mas_destroy(&ms); + return ret; } EXPORT_SYMBOL(mtree_insert_range); @@ -6595,6 +6604,7 @@ retry: unlock: mtree_unlock(mt); + mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_range); @@ -6676,6 +6686,7 @@ retry: unlock: mtree_unlock(mt); + mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_alloc_rrange); diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index ad42a36231fb..c5b00aca9def 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -120,7 +120,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas.alloc->slot[0] == NULL); mas_push_node(&mas, mn); mas_reset(&mas); - mas_nomem(&mas, GFP_KERNEL); /* free */ + mas_destroy(&mas); mtree_unlock(mt); @@ -144,7 +144,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) mn->parent = ma_parent_ptr(mn); ma_free_rcu(mn); mas.status = ma_start; - mas_nomem(&mas, GFP_KERNEL); + mas_destroy(&mas); /* Allocate 3 nodes, will fail. */ mas_node_count(&mas, 3); /* Drop the lock and allocate 3 nodes. */ @@ -161,7 +161,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != 3); /* Free. */ mas_reset(&mas); - mas_nomem(&mas, GFP_KERNEL); + mas_destroy(&mas); /* Set allocation request to 1. */ mas_set_alloc_req(&mas, 1); @@ -277,6 +277,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) } mas_reset(&mas); MT_BUG_ON(mt, mas_nomem(&mas, GFP_KERNEL)); + mas_destroy(&mas); } @@ -299,7 +300,7 @@ static noinline void __init check_new_node(struct maple_tree *mt) } MT_BUG_ON(mt, mas_allocated(&mas) != total); mas_reset(&mas); - mas_nomem(&mas, GFP_KERNEL); /* Free. */ + mas_destroy(&mas); /* Free. */ MT_BUG_ON(mt, mas_allocated(&mas) != 0); for (i = 1; i < 128; i++) { @@ -35847,6 +35848,7 @@ static noinline void __init check_nomem(struct maple_tree *mt) mas_store(&ms, &ms); /* insert 1 -> &ms */ mas_nomem(&ms, GFP_KERNEL); /* Node allocated in here. */ mtree_unlock(mt); + mas_destroy(&ms); mtree_destroy(mt); } -- cgit v1.2.3 From 7e093834ed8c71361bae556f2c1efc2bf2e33971 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:33 -0400 Subject: maple_tree: preallocate nodes in mas_erase() Use mas_wr_preallocate() in mas_erase() to preallocate enough nodes to complete the erase. Add error handling by skipping the store if the preallocation lead to some error besides no memory. Link: https://lkml.kernel.org/r/20240814161944.55347-7-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 58985107cf00..8ba52ffa778e 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6371,14 +6371,18 @@ write_retry: /* Must reset to ensure spanning writes of last slot are detected */ mas_reset(mas); - mas_wr_store_setup(&wr_mas); - mas_wr_store_entry(&wr_mas); + mas_wr_preallocate(&wr_mas, NULL); if (mas_nomem(mas, GFP_KERNEL)) { /* in case the range of entry changed when unlocked */ mas->index = mas->last = index; goto write_retry; } + if (mas_is_err(mas)) + goto out; + + mas_wr_store_entry(&wr_mas); +out: mas_destroy(mas); return entry; } -- cgit v1.2.3 From 85db8f241707778b9755618f350915616d1d861c Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:34 -0400 Subject: maple_tree: use mas_store_gfp() in mtree_store_range() Refactor mtree_store_range() to use mas_store_gfp() which will abstract the store, memory allocation, and error handling. Link: https://lkml.kernel.org/r/20240814161944.55347-8-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 8ba52ffa778e..e01e05be6301 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -6476,7 +6476,6 @@ int mtree_store_range(struct maple_tree *mt, unsigned long index, unsigned long last, void *entry, gfp_t gfp) { MA_STATE(mas, mt, index, last); - MA_WR_STATE(wr_mas, &mas, entry); int ret = 0; trace_ma_write(__func__, &mas, 0, entry); @@ -6487,17 +6486,9 @@ int mtree_store_range(struct maple_tree *mt, unsigned long index, return -EINVAL; mtree_lock(mt); -retry: - mas_wr_store_entry(&wr_mas); - if (mas_nomem(&mas, gfp)) - goto retry; - + ret = mas_store_gfp(&mas, entry, gfp); mtree_unlock(mt); - if (mas_is_err(&mas)) - ret = xa_err(mas.node); - - mas_destroy(&mas); return ret; } EXPORT_SYMBOL(mtree_store_range); -- cgit v1.2.3 From 23e217a848b3b9e1b0cb8c41d731d7968bcc77a5 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:35 -0400 Subject: maple_tree: print store type in mas_dump() Knowing the store type of the maple state could be helpful for debugging. Have mas_dump() print mas->store_type. Link: https://lkml.kernel.org/r/20240814161944.55347-9-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index e01e05be6301..a1689fc6227b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7760,6 +7760,40 @@ void mas_dump(const struct ma_state *mas) break; } + pr_err("Store Type: "); + switch (mas->store_type) { + case wr_invalid: + pr_err("invalid store type\n"); + break; + case wr_new_root: + pr_err("new_root\n"); + break; + case wr_store_root: + pr_err("store_root\n"); + break; + case wr_exact_fit: + pr_err("exact_fit\n"); + break; + case wr_split_store: + pr_err("split_store\n"); + break; + case wr_slot_store: + pr_err("slot_store\n"); + break; + case wr_append: + pr_err("append\n"); + break; + case wr_node_store: + pr_err("node_store\n"); + break; + case wr_spanning_store: + pr_err("spanning_store\n"); + break; + case wr_rebalance: + pr_err("rebalance\n"); + break; + } + pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); pr_err(" min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n", -- cgit v1.2.3 From 580fcbd67ce2cdf9d70c7a8eda0789b9eba1c3af Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:36 -0400 Subject: maple_tree: use store type in mas_wr_store_entry() When storing an entry, we can read the store type that was set from a previous partial walk of the tree. Now that the type of store is known, select the correct write helper function to use to complete the store. Also noinline mas_wr_spanning_store() to limit stack frame usage in mas_wr_store_entry() as it allocates a maple_big_node on the stack. Link: https://lkml.kernel.org/r/20240814161944.55347-10-sidhartha.kumar@oracle.com Reviewed-by: Liam R. Howlett Signed-off-by: Sidhartha Kumar Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 90 ++++++++++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 38 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index a1689fc6227b..2242e07a46dc 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3780,7 +3780,7 @@ done: * * Return: 0 on error, positive on success. */ -static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) +static noinline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) { struct maple_subtree_state mast; struct maple_big_node b_node; @@ -4206,43 +4206,62 @@ slow_path: static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; + unsigned char new_end = mas_wr_new_end(wr_mas); - wr_mas->content = mas_start(mas); - if (mas_is_none(mas) || mas_is_ptr(mas)) { - mas_store_root(mas, wr_mas->entry); + switch (mas->store_type) { + case wr_invalid: + MT_BUG_ON(mas->tree, 1); return; - } - - if (unlikely(!mas_wr_walk(wr_mas))) { + case wr_new_root: + mas_new_root(mas, wr_mas->entry); + break; + case wr_store_root: + mas_store_root(mas, wr_mas->entry); + break; + case wr_exact_fit: + rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); + if (!!wr_mas->entry ^ !!wr_mas->content) + mas_update_gap(mas); + break; + case wr_append: + mas_wr_append(wr_mas, new_end); + break; + case wr_slot_store: + mas_wr_slot_store(wr_mas); + break; + case wr_node_store: + mas_wr_node_store(wr_mas, new_end); + break; + case wr_spanning_store: mas_wr_spanning_store(wr_mas); - return; + break; + case wr_split_store: + case wr_rebalance: + mas_wr_bnode(wr_mas); + break; } - /* At this point, we are at the leaf node that needs to be altered. */ - mas_wr_end_piv(wr_mas); - /* New root for a single pointer */ - if (unlikely(!mas->index && mas->last == ULONG_MAX)) - mas_new_root(mas, wr_mas->entry); - else - mas_wr_modify(wr_mas); + return; } -static void mas_wr_store_setup(struct ma_wr_state *wr_mas) +static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) { - if (!mas_is_active(wr_mas->mas)) { - if (mas_is_start(wr_mas->mas)) - return; + struct ma_state *mas = wr_mas->mas; + + if (!mas_is_active(mas)) { + if (mas_is_start(mas)) + goto set_content; - if (unlikely(mas_is_paused(wr_mas->mas))) + if (unlikely(mas_is_paused(mas))) goto reset; - if (unlikely(mas_is_none(wr_mas->mas))) + if (unlikely(mas_is_none(mas))) goto reset; - if (unlikely(mas_is_overflow(wr_mas->mas))) + if (unlikely(mas_is_overflow(mas))) goto reset; - if (unlikely(mas_is_underflow(wr_mas->mas))) + if (unlikely(mas_is_underflow(mas))) goto reset; } @@ -4251,27 +4270,20 @@ static void mas_wr_store_setup(struct ma_wr_state *wr_mas) * writes within this node. This is to stop partial walks in * mas_prealloc() from being reset. */ - if (wr_mas->mas->last > wr_mas->mas->max) + if (mas->last > mas->max) goto reset; if (wr_mas->entry) - return; + goto set_content; - if (mte_is_leaf(wr_mas->mas->node) && - wr_mas->mas->last == wr_mas->mas->max) + if (mte_is_leaf(mas->node) && mas->last == mas->max) goto reset; - return; + goto set_content; reset: - mas_reset(wr_mas->mas); -} - -static inline void mas_wr_prealloc_setup(struct ma_wr_state *wr_mas) -{ - struct ma_state *mas = wr_mas->mas; - - mas_wr_store_setup(wr_mas); + mas_reset(mas); +set_content: wr_mas->content = mas_start(mas); } @@ -5582,7 +5594,8 @@ void *mas_store(struct ma_state *mas, void *entry) * want to examine what happens if a single store operation was to * overwrite multiple entries within a self-balancing B-Tree. */ - mas_wr_store_setup(&wr_mas); + mas_wr_prealloc_setup(&wr_mas); + mas_wr_store_type(&wr_mas); mas_wr_store_entry(&wr_mas); return wr_mas.content; } @@ -5634,7 +5647,8 @@ void mas_store_prealloc(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); - mas_wr_store_setup(&wr_mas); + mas_wr_prealloc_setup(&wr_mas); + mas_wr_store_type(&wr_mas); trace_ma_write(__func__, mas, 0, entry); mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); -- cgit v1.2.3 From 1fd7c4f3228e9775c97b25a6e5e2420df8cf0e76 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:37 -0400 Subject: maple_tree: convert mas_insert() to preallocate nodes By setting the store type in mas_insert(), we no longer need to use mas_wr_modify() to determine the correct store function to use. Instead, set the store type and call mas_wr_store_entry(). Also, pass in the requested gfp flags to mas_insert() so they can be passed to the call to mas_wr_preallocate(). Link: https://lkml.kernel.org/r/20240814161944.55347-11-sidhartha.kumar@oracle.com Reviewed-by: Liam R. Howlett Signed-off-by: Sidhartha Kumar Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 2242e07a46dc..bb940f61d713 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4457,26 +4457,24 @@ static inline void *mas_insert(struct ma_state *mas, void *entry) if (wr_mas.content) goto exists; - if (mas_is_none(mas) || mas_is_ptr(mas)) { - mas_store_root(mas, entry); + mas_wr_preallocate(&wr_mas, entry); + if (mas_is_err(mas)) return NULL; - } /* spanning writes always overwrite something */ - if (!mas_wr_walk(&wr_mas)) + if (mas->store_type == wr_spanning_store) goto exists; /* At this point, we are at the leaf node that needs to be altered. */ - wr_mas.offset_end = mas->offset; - wr_mas.end_piv = wr_mas.r_max; - - if (wr_mas.content || (mas->last > wr_mas.r_max)) - goto exists; + if (mas->store_type != wr_new_root && mas->store_type != wr_store_root) { + wr_mas.offset_end = mas->offset; + wr_mas.end_piv = wr_mas.r_max; - if (!entry) - return NULL; + if (wr_mas.content || (mas->last > wr_mas.r_max)) + goto exists; + } - mas_wr_modify(&wr_mas); + mas_wr_store_entry(&wr_mas); return wr_mas.content; exists: -- cgit v1.2.3 From 62c7b2b9842c541a09e1cf824ed738ee30069e6f Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:38 -0400 Subject: maple_tree: simplify mas_commit_b_node() The only callers of mas_commit_b_node() are those with store type of wr_rebalance and wr_split_store. Use mas->store_type to dispatch to the correct helper function. This allows the removal of mas_reuse_node() as it is no longer used. Link: https://lkml.kernel.org/r/20240814161944.55347-12-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 64 +++++++------------------------------------------------- 1 file changed, 7 insertions(+), 57 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index bb940f61d713..0314e9b52621 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3395,72 +3395,22 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) return 1; } -/* - * mas_reuse_node() - Reuse the node to store the data. - * @wr_mas: The maple write state - * @bn: The maple big node - * @end: The end of the data. - * - * Will always return false in RCU mode. - * - * Return: True if node was reused, false otherwise. - */ -static inline bool mas_reuse_node(struct ma_wr_state *wr_mas, - struct maple_big_node *bn, unsigned char end) -{ - /* Need to be rcu safe. */ - if (mt_in_rcu(wr_mas->mas->tree)) - return false; - - if (end > bn->b_end) { - int clear = mt_slots[wr_mas->type] - bn->b_end; - - memset(wr_mas->slots + bn->b_end, 0, sizeof(void *) * clear--); - memset(wr_mas->pivots + bn->b_end, 0, sizeof(void *) * clear); - } - mab_mas_cp(bn, 0, bn->b_end, wr_mas->mas, false); - return true; -} - /* * mas_commit_b_node() - Commit the big node into the tree. * @wr_mas: The maple write state * @b_node: The maple big node - * @end: The end of the data. */ static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, - struct maple_big_node *b_node, unsigned char end) + struct maple_big_node *b_node) { - struct maple_node *node; - struct maple_enode *old_enode; - unsigned char b_end = b_node->b_end; - enum maple_type b_type = b_node->type; - - old_enode = wr_mas->mas->node; - if ((b_end < mt_min_slots[b_type]) && - (!mte_is_root(old_enode)) && - (mas_mt_height(wr_mas->mas) > 1)) - return mas_rebalance(wr_mas->mas, b_node); - - if (b_end >= mt_slots[b_type]) - return mas_split(wr_mas->mas, b_node); + enum store_type type = wr_mas->mas->store_type; - if (mas_reuse_node(wr_mas, b_node, end)) - goto reuse_node; + WARN_ON_ONCE(type != wr_rebalance && type != wr_split_store); - mas_node_count(wr_mas->mas, 1); - if (mas_is_err(wr_mas->mas)) - return 0; + if (type == wr_rebalance) + return mas_rebalance(wr_mas->mas, b_node); - node = mas_pop_node(wr_mas->mas); - node->parent = mas_mn(wr_mas->mas)->parent; - wr_mas->mas->node = mt_mk_node(node, b_type); - mab_mas_cp(b_node, 0, b_end, wr_mas->mas, false); - mas_replace_node(wr_mas->mas, old_enode); -reuse_node: - mas_update_gap(wr_mas->mas); - wr_mas->mas->end = b_end; - return 1; + return mas_split(wr_mas->mas, b_node); } /* @@ -4155,7 +4105,7 @@ static void mas_wr_bnode(struct ma_wr_state *wr_mas) trace_ma_write(__func__, wr_mas->mas, 0, wr_mas->entry); memset(&b_node, 0, sizeof(struct maple_big_node)); mas_store_b_node(wr_mas, &b_node, wr_mas->offset_end); - mas_commit_b_node(wr_mas, &b_node, wr_mas->mas->end); + mas_commit_b_node(wr_mas, &b_node); } static inline void mas_wr_modify(struct ma_wr_state *wr_mas) -- cgit v1.2.3 From 7987d027799cf3a300ace1a22c5e61da878b759a Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:39 -0400 Subject: maple_tree: remove mas_wr_modify() There are no more users of the function, safely remove it. Link: https://lkml.kernel.org/r/20240814161944.55347-13-sidhartha.kumar@oracle.com Reviewed-by: Liam R. Howlett Signed-off-by: Sidhartha Kumar Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 38 -------------------------------------- 1 file changed, 38 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 0314e9b52621..6640ca775808 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4108,44 +4108,6 @@ static void mas_wr_bnode(struct ma_wr_state *wr_mas) mas_commit_b_node(wr_mas, &b_node); } -static inline void mas_wr_modify(struct ma_wr_state *wr_mas) -{ - struct ma_state *mas = wr_mas->mas; - unsigned char new_end; - - /* Direct replacement */ - if (wr_mas->r_min == mas->index && wr_mas->r_max == mas->last) { - rcu_assign_pointer(wr_mas->slots[mas->offset], wr_mas->entry); - if (!!wr_mas->entry ^ !!wr_mas->content) - mas_update_gap(mas); - return; - } - - /* - * new_end exceeds the size of the maple node and cannot enter the fast - * path. - */ - new_end = mas_wr_new_end(wr_mas); - if (new_end >= mt_slots[wr_mas->type]) - goto slow_path; - - /* Attempt to append */ - if (mas_wr_append(wr_mas, new_end)) - return; - - if (new_end == mas->end && mas_wr_slot_store(wr_mas)) - return; - - if (mas_wr_node_store(wr_mas, new_end)) - return; - - if (mas_is_err(mas)) - return; - -slow_path: - mas_wr_bnode(wr_mas); -} - /* * mas_wr_store_entry() - Internal call to store a value * @mas: The maple state -- cgit v1.2.3 From 4037d44f548fe1f9ca4ad002c39a0eb84d79de60 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:40 -0400 Subject: maple_tree: have mas_store() allocate nodes if needed Not all users of mas_store() enter with nodes already preallocated. Check for the MA_STATE_PREALLOC flag to decide whether to preallocate nodes within mas_store() rather than relying on future write helper functions to perform the allocations. This allows the write helper functions to be simplified as they do not have to do checks to make sure there are enough allocated nodes to perform the write. Link: https://lkml.kernel.org/r/20240814161944.55347-14-sidhartha.kumar@oracle.com Reviewed-by: Liam R. Howlett Signed-off-by: Sidhartha Kumar Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 6640ca775808..d5e020dd93fa 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -5477,13 +5477,12 @@ static inline void mte_destroy_walk(struct maple_enode *enode, * @entry: The entry to store. * * The @mas->index and @mas->last is used to set the range for the @entry. - * Note: The @mas should have pre-allocated entries to ensure there is memory to - * store the entry. Please see mas_expected_entries()/mas_destroy() for more details. * * Return: the first entry between mas->index and mas->last or %NULL. */ void *mas_store(struct ma_state *mas, void *entry) { + int request; MA_WR_STATE(wr_mas, mas, entry); trace_ma_write(__func__, mas, 0, entry); @@ -5506,7 +5505,23 @@ void *mas_store(struct ma_state *mas, void *entry) */ mas_wr_prealloc_setup(&wr_mas); mas_wr_store_type(&wr_mas); + if (mas->mas_flags & MA_STATE_PREALLOC) { + mas_wr_store_entry(&wr_mas); + MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); + return wr_mas.content; + } + + request = mas_prealloc_calc(mas, entry); + if (!request) + goto store; + + mas_node_count(mas, request); + if (mas_is_err(mas)) + return NULL; + +store: mas_wr_store_entry(&wr_mas); + mas_destroy(mas); return wr_mas.content; } EXPORT_SYMBOL_GPL(mas_store); -- cgit v1.2.3 From 9155e8433498cc8be2fdfaf81a31393d3e40781c Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:41 -0400 Subject: maple_tree: remove node allocations from various write helper functions These write helper functions are all called from store paths which preallocate enough nodes that will be needed for the write. There is no more need to allocate within the functions themselves. Link: https://lkml.kernel.org/r/20240814161944.55347-15-sidhartha.kumar@oracle.com Reviewed-by: Liam R. Howlett Signed-off-by: Sidhartha Kumar Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index d5e020dd93fa..5f79be184377 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2976,9 +2976,6 @@ static inline int mas_rebalance(struct ma_state *mas, * tries to combine the data in the same way. If one node contains the * entire range of the tree, then that node is used as a new root node. */ - mas_node_count(mas, empty_count * 2 - 1); - if (mas_is_err(mas)) - return 0; mast.orig_l = &l_mas; mast.orig_r = &r_mas; @@ -3029,11 +3026,6 @@ static inline void mas_destroy_rebalance(struct ma_state *mas, unsigned char end /* set up node. */ if (in_rcu) { - /* Allocate for both left and right as well as parent. */ - mas_node_count(mas, 3); - if (mas_is_err(mas)) - return; - newnode = mas_pop_node(mas); } else { newnode = &reuse; @@ -3341,10 +3333,6 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) trace_ma_op(__func__, mas); mas->depth = mas_mt_height(mas); - /* Allocation failures will happen early. */ - mas_node_count(mas, 1 + mas->depth * 2); - if (mas_is_err(mas)) - return 0; mast.l = &l_mas; mast.r = &r_mas; @@ -3427,10 +3415,6 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry) unsigned long *pivots; int slot = 0; - mas_node_count(mas, 1); - if (unlikely(mas_is_err(mas))) - return 0; - node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); @@ -3699,10 +3683,6 @@ static inline int mas_new_root(struct ma_state *mas, void *entry) goto done; } - mas_node_count(mas, 1); - if (mas_is_err(mas)) - return 0; - node = mas_pop_node(mas); pivots = ma_pivots(node, type); slots = ma_slots(node, type); @@ -3765,9 +3745,6 @@ static noinline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) * entries per level plus a new root. */ height = mas_mt_height(mas); - mas_node_count(mas, 1 + height * 3); - if (mas_is_err(mas)) - return 0; /* * Set up right side. Need to get to the next offset after the spanning @@ -3851,10 +3828,6 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, /* set up node. */ if (in_rcu) { - mas_node_count(mas, 1); - if (mas_is_err(mas)) - return false; - newnode = mas_pop_node(mas); } else { memset(&reuse, 0, sizeof(struct maple_node)); -- cgit v1.2.3 From add60ea5f6d84aa73171d4a52b02f565b0b80a90 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:42 -0400 Subject: maple_tree: remove repeated sanity checks from write helper functions These sanity checks are now redundant as they are already checked in mas_wr_store_type(). We can remove them from mas_wr_append() and mas_wr_node_store(). Link: https://lkml.kernel.org/r/20240814161944.55347-16-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 5f79be184377..8c1a1a483395 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3816,11 +3816,6 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, unsigned char copy_size, node_pivots = mt_pivots[wr_mas->type]; bool in_rcu = mt_in_rcu(mas->tree); - /* Check if there is enough data. The room is enough. */ - if (!mte_is_root(mas->node) && (new_end < mt_min_slots[wr_mas->type]) && - !(mas->mas_flags & MA_STATE_BULK)) - return false; - if (mas->last == wr_mas->end_piv) offset_end++; /* don't copy this offset */ else if (unlikely(wr_mas->r_max == ULONG_MAX)) @@ -4018,17 +4013,9 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) static inline bool mas_wr_append(struct ma_wr_state *wr_mas, unsigned char new_end) { - struct ma_state *mas; + struct ma_state *mas = wr_mas->mas; void __rcu **slots; - unsigned char end; - - mas = wr_mas->mas; - if (mt_in_rcu(mas->tree)) - return false; - - end = mas->end; - if (mas->offset != end) - return false; + unsigned char end = mas->end; if (new_end < mt_pivots[wr_mas->type]) { wr_mas->pivots[new_end] = wr_mas->pivots[end]; -- cgit v1.2.3 From c27e6183c654c729bfb7248a9fa938ce74def4be Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:43 -0400 Subject: maple_tree: remove unneeded mas_wr_walk() in mas_store_prealloc() Users of mas_store_prealloc() enter this function with nodes already preallocated. This means the store type must be already set. We can then remove the call to mas_wr_store_type() and initialize the write state to continue the partial walk that was done when determining the store type. Link: https://lkml.kernel.org/r/20240814161944.55347-17-sidhartha.kumar@oracle.com Reviewed-by: Liam R. Howlett Signed-off-by: Sidhartha Kumar Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 8c1a1a483395..73ce63d9c3a0 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3979,9 +3979,6 @@ static inline void mas_wr_end_piv(struct ma_wr_state *wr_mas) wr_mas->end_piv = wr_mas->pivots[wr_mas->offset_end]; else wr_mas->end_piv = wr_mas->mas->max; - - if (!wr_mas->entry) - mas_wr_extend_null(wr_mas); } static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) @@ -5532,8 +5529,19 @@ void mas_store_prealloc(struct ma_state *mas, void *entry) { MA_WR_STATE(wr_mas, mas, entry); - mas_wr_prealloc_setup(&wr_mas); - mas_wr_store_type(&wr_mas); + if (mas->store_type == wr_store_root) { + mas_wr_prealloc_setup(&wr_mas); + goto store; + } + + mas_wr_walk_descend(&wr_mas); + if (mas->store_type != wr_spanning_store) { + /* set wr_mas->content to current slot */ + wr_mas.content = mas_slot_locked(mas, wr_mas.slots, mas->offset); + mas_wr_end_piv(&wr_mas); + } + +store: trace_ma_write(__func__, mas, 0, entry); mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); -- cgit v1.2.3 From ed4dfd9aa1b1eb18f5bd3a07e7002da41ce20817 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Aug 2024 12:19:44 -0400 Subject: maple_tree: make write helper functions void The return value of various write helper functions are not checked. We can safely change the return type of these functions to be void. Link: https://lkml.kernel.org/r/20240814161944.55347-18-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- lib/maple_tree.c | 47 ++++++++++++++++------------------------------- 1 file changed, 16 insertions(+), 31 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 73ce63d9c3a0..755ba8b18e14 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2823,10 +2823,8 @@ dead_node: * orig_l_mas->last is used in mas_consume to find the slots that will need to * be either freed or destroyed. orig_l_mas->depth keeps track of the height of * the new sub-tree in case the sub-tree becomes the full tree. - * - * Return: the number of elements in b_node during the last loop. */ -static int mas_spanning_rebalance(struct ma_state *mas, +static void mas_spanning_rebalance(struct ma_state *mas, struct maple_subtree_state *mast, unsigned char count) { unsigned char split, mid_split; @@ -2942,7 +2940,7 @@ new_root: mas->offset = l_mas.offset; mas_wmb_replace(mas, old_enode); mtree_range_walk(mas); - return mast->bn->b_end; + return; } /* @@ -2952,10 +2950,8 @@ new_root: * * Rebalance two nodes into a single node or two new nodes that are sufficient. * Continue upwards until tree is sufficient. - * - * Return: the number of elements in b_node during the last loop. */ -static inline int mas_rebalance(struct ma_state *mas, +static inline void mas_rebalance(struct ma_state *mas, struct maple_big_node *b_node) { char empty_count = mas_mt_height(mas); @@ -3300,9 +3296,8 @@ static inline bool mas_push_data(struct ma_state *mas, int height, * mas_split() - Split data that is too big for one node into two. * @mas: The maple state * @b_node: The maple big node - * Return: 1 on success, 0 on failure. */ -static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) +static void mas_split(struct ma_state *mas, struct maple_big_node *b_node) { struct maple_subtree_state mast; int height = 0; @@ -3380,7 +3375,7 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) mas->node = l_mas.node; mas_wmb_replace(mas, old); mtree_range_walk(mas); - return 1; + return; } /* @@ -3388,7 +3383,7 @@ static int mas_split(struct ma_state *mas, struct maple_big_node *b_node) * @wr_mas: The maple write state * @b_node: The maple big node */ -static noinline_for_kasan int mas_commit_b_node(struct ma_wr_state *wr_mas, +static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas, struct maple_big_node *b_node) { enum store_type type = wr_mas->mas->store_type; @@ -3664,10 +3659,8 @@ static void mte_destroy_walk(struct maple_enode *, struct maple_tree *); * @entry: The entry to store. * * Only valid when the index == 0 and the last == ULONG_MAX - * - * Return 0 on error, 1 on success. */ -static inline int mas_new_root(struct ma_state *mas, void *entry) +static inline void mas_new_root(struct ma_state *mas, void *entry) { struct maple_enode *root = mas_root_locked(mas); enum maple_type type = maple_leaf_64; @@ -3699,7 +3692,7 @@ done: if (xa_is_node(root)) mte_destroy_walk(root, mas->tree); - return 1; + return; } /* * mas_wr_spanning_store() - Create a subtree with the store operation completed @@ -3707,10 +3700,8 @@ done: * Note that mas is expected to point to the node which caused the store to * span. * @wr_mas: The maple write state - * - * Return: 0 on error, positive on success. */ -static noinline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) +static noinline void mas_wr_spanning_store(struct ma_wr_state *wr_mas) { struct maple_subtree_state mast; struct maple_big_node b_node; @@ -3802,10 +3793,8 @@ static noinline int mas_wr_spanning_store(struct ma_wr_state *wr_mas) * @wr_mas: The maple write state * * Attempts to reuse the node, but may allocate. - * - * Return: True if stored, false otherwise */ -static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas, +static inline void mas_wr_node_store(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; @@ -3878,16 +3867,14 @@ done: trace_ma_write(__func__, mas, 0, wr_mas->entry); mas_update_gap(mas); mas->end = new_end; - return true; + return; } /* * mas_wr_slot_store: Attempt to store a value in a slot. * @wr_mas: the maple write state - * - * Return: True if stored, false otherwise */ -static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas) +static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char offset = mas->offset; @@ -3919,7 +3906,7 @@ static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas) wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ } else { - return false; + return; } trace_ma_write(__func__, mas, 0, wr_mas->entry); @@ -3930,7 +3917,7 @@ static inline bool mas_wr_slot_store(struct ma_wr_state *wr_mas) if (!wr_mas->entry || gap) mas_update_gap(mas); - return true; + return; } static inline void mas_wr_extend_null(struct ma_wr_state *wr_mas) @@ -4004,10 +3991,8 @@ static inline unsigned char mas_wr_new_end(struct ma_wr_state *wr_mas) * This is currently unsafe in rcu mode since the end of the node may be cached * by readers while the node contents may be updated which could result in * inaccurate information. - * - * Return: True if appended, false otherwise */ -static inline bool mas_wr_append(struct ma_wr_state *wr_mas, +static inline void mas_wr_append(struct ma_wr_state *wr_mas, unsigned char new_end) { struct ma_state *mas = wr_mas->mas; @@ -4046,7 +4031,7 @@ static inline bool mas_wr_append(struct ma_wr_state *wr_mas, mas->end = new_end; trace_ma_write(__func__, mas, new_end, wr_mas->entry); - return true; + return; } /* -- cgit v1.2.3 From b29a62d87cc0af3e9d134e9e0863b2cb053070b8 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Sun, 7 Jul 2024 15:05:19 -0400 Subject: mul_u64_u64_div_u64: make it precise always MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mul_u64_u64_div_u64: new implementation", v3. This provides an implementation for mul_u64_u64_div_u64() that always produces exact results. This patch (of 2): Library facilities must always return exact results. If the caller may be contented with approximations then it should do the approximation on its own. In this particular case the comment in the code says "the algorithm ... below might lose some precision". Well, if you try it with e.g.: a = 18446462598732840960 b = 18446462598732840960 c = 18446462598732840961 then the produced answer is 0 whereas the exact answer should be 18446462598732840959. This is _some_ precision lost indeed! Let's reimplement this function so it always produces the exact result regardless of its inputs while preserving existing fast paths when possible. Uwe said: : My personal interest is to get the calculations in pwm drivers right. : This function is used in several drivers below drivers/pwm/ . With the : errors in mul_u64_u64_div_u64(), pwm consumers might not get the : settings they request. Although I have to admit that I'm not aware it : breaks real use cases (because typically the periods used are too short : to make the involved multiplications overflow), but I pretty sure am : not aware of all usages and it breaks testing. : : Another justification is commits like : https://git.kernel.org/tip/77baa5bafcbe1b2a15ef9c37232c21279c95481c, : where people start to work around the precision shortcomings of : mul_u64_u64_div_u64(). Link: https://lkml.kernel.org/r/20240707190648.1982714-1-nico@fluxnic.net Link: https://lkml.kernel.org/r/20240707190648.1982714-2-nico@fluxnic.net Signed-off-by: Nicolas Pitre Tested-by: Uwe Kleine-König Reviewed-by: Uwe Kleine-König Tested-by: Biju Das Signed-off-by: Andrew Morton --- lib/math/div64.c | 108 +++++++++++++++++++++++++++++++++---------------------- 1 file changed, 65 insertions(+), 43 deletions(-) (limited to 'lib') diff --git a/lib/math/div64.c b/lib/math/div64.c index 191761b1b623..b7fc75246399 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -186,55 +186,77 @@ EXPORT_SYMBOL(iter_div_u64_rem); #ifndef mul_u64_u64_div_u64 u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c) { - u64 res = 0, div, rem; - int shift; + if (ilog2(a) + ilog2(b) <= 62) + return div64_u64(a * b, c); - /* can a * b overflow ? */ - if (ilog2(a) + ilog2(b) > 62) { - /* - * Note that the algorithm after the if block below might lose - * some precision and the result is more exact for b > a. So - * exchange a and b if a is bigger than b. - * - * For example with a = 43980465100800, b = 100000000, c = 1000000000 - * the below calculation doesn't modify b at all because div == 0 - * and then shift becomes 45 + 26 - 62 = 9 and so the result - * becomes 4398035251080. However with a and b swapped the exact - * result is calculated (i.e. 4398046510080). - */ - if (a > b) - swap(a, b); +#if defined(__SIZEOF_INT128__) + + /* native 64x64=128 bits multiplication */ + u128 prod = (u128)a * b; + u64 n_lo = prod, n_hi = prod >> 64; + +#else + + /* perform a 64x64=128 bits multiplication manually */ + u32 a_lo = a, a_hi = a >> 32, b_lo = b, b_hi = b >> 32; + u64 x, y, z; + + x = (u64)a_lo * b_lo; + y = (u64)a_lo * b_hi + (u32)(x >> 32); + z = (u64)a_hi * b_hi + (u32)(y >> 32); + y = (u64)a_hi * b_lo + (u32)y; + z += (u32)(y >> 32); + x = (y << 32) + (u32)x; + + u64 n_lo = x, n_hi = z; + +#endif + + int shift = __builtin_ctzll(c); + /* try reducing the fraction in case the dividend becomes <= 64 bits */ + if ((n_hi >> shift) == 0) { + u64 n = (n_lo >> shift) | (n_hi << (64 - shift)); + + return div64_u64(n, c >> shift); /* - * (b * a) / c is equal to - * - * (b / c) * a + - * (b % c) * a / c - * - * if nothing overflows. Can the 1st multiplication - * overflow? Yes, but we do not care: this can only - * happen if the end result can't fit in u64 anyway. - * - * So the code below does - * - * res = (b / c) * a; - * b = b % c; + * The remainder value if needed would be: + * res = div64_u64_rem(n, c >> shift, &rem); + * rem = (rem << shift) + (n_lo - (n << shift)); */ - div = div64_u64_rem(b, c, &rem); - res = div * a; - b = rem; - - shift = ilog2(a) + ilog2(b) - 62; - if (shift > 0) { - /* drop precision */ - b >>= shift; - c >>= shift; - if (!c) - return res; - } } - return res + div64_u64(a * b, c); + if (n_hi >= c) { + /* overflow: result is unrepresentable in a u64 */ + return -1; + } + + /* Do the full 128 by 64 bits division */ + + shift = __builtin_clzll(c); + c <<= shift; + + int p = 64 + shift; + u64 res = 0; + bool carry; + + do { + carry = n_hi >> 63; + shift = carry ? 1 : __builtin_clzll(n_hi); + if (p < shift) + break; + p -= shift; + n_hi <<= shift; + n_hi |= n_lo >> (64 - shift); + n_lo <<= shift; + if (carry || (n_hi >= c)) { + n_hi -= c; + res |= 1ULL << p; + } + } while (n_hi); + /* The remainder value if needed would be n_hi << p */ + + return res; } EXPORT_SYMBOL(mul_u64_u64_div_u64); #endif -- cgit v1.2.3 From 1635e62e75a7bbb1c6274f6b43911cedfe0da60a Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Sun, 7 Jul 2024 15:05:20 -0400 Subject: mul_u64_u64_div_u64: basic sanity test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Verify that edge cases produce proper results, and some more. [npitre@baylibre.com: avoid undefined shift value] Link: https://lkml.kernel.org/r/7rrs9pn1-n266-3013-9q6n-1osp8r8s0rrn@syhkavp.arg Link: https://lkml.kernel.org/r/20240707190648.1982714-3-nico@fluxnic.net Signed-off-by: Nicolas Pitre Reviewed-by: Uwe Kleine-König Cc: Biju Das Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 10 ++++ lib/math/Makefile | 1 + lib/math/div64.c | 9 +++- lib/math/test_mul_u64_u64_div_u64.c | 99 +++++++++++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 lib/math/test_mul_u64_u64_div_u64.c (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a30c03a66172..bf0995d328b3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2280,6 +2280,16 @@ config TEST_DIV64 If unsure, say N. +config TEST_MULDIV64 + tristate "mul_u64_u64_div_u64() test" + depends on DEBUG_KERNEL || m + help + Enable this to turn on 'mul_u64_u64_div_u64()' function test. + This test is executed only once during system boot (so affects + only boot time), or at module load time. + + If unsure, say N. + config TEST_IOV_ITER tristate "Test iov_iter operation" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/lib/math/Makefile b/lib/math/Makefile index 91fcdb0c9efe..981a26127e08 100644 --- a/lib/math/Makefile +++ b/lib/math/Makefile @@ -6,4 +6,5 @@ obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o obj-$(CONFIG_RATIONAL) += rational.o obj-$(CONFIG_TEST_DIV64) += test_div64.o +obj-$(CONFIG_TEST_MULDIV64) += test_mul_u64_u64_div_u64.o obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o diff --git a/lib/math/div64.c b/lib/math/div64.c index b7fc75246399..5faa29208bdb 100644 --- a/lib/math/div64.c +++ b/lib/math/div64.c @@ -212,11 +212,18 @@ u64 mul_u64_u64_div_u64(u64 a, u64 b, u64 c) #endif + /* make sure c is not zero, trigger exception otherwise */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdiv-by-zero" + if (unlikely(c == 0)) + return 1/0; +#pragma GCC diagnostic pop + int shift = __builtin_ctzll(c); /* try reducing the fraction in case the dividend becomes <= 64 bits */ if ((n_hi >> shift) == 0) { - u64 n = (n_lo >> shift) | (n_hi << (64 - shift)); + u64 n = shift ? (n_lo >> shift) | (n_hi << (64 - shift)) : n_lo; return div64_u64(n, c >> shift); /* diff --git a/lib/math/test_mul_u64_u64_div_u64.c b/lib/math/test_mul_u64_u64_div_u64.c new file mode 100644 index 000000000000..58d058de4e73 --- /dev/null +++ b/lib/math/test_mul_u64_u64_div_u64.c @@ -0,0 +1,99 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2024 BayLibre SAS + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include + +typedef struct { u64 a; u64 b; u64 c; u64 result; } test_params; + +static test_params test_values[] = { +/* this contains many edge values followed by a couple random values */ +{ 0xb, 0x7, 0x3, 0x19 }, +{ 0xffff0000, 0xffff0000, 0xf, 0x1110eeef00000000 }, +{ 0xffffffff, 0xffffffff, 0x1, 0xfffffffe00000001 }, +{ 0xffffffff, 0xffffffff, 0x2, 0x7fffffff00000000 }, +{ 0x1ffffffff, 0xffffffff, 0x2, 0xfffffffe80000000 }, +{ 0x1ffffffff, 0xffffffff, 0x3, 0xaaaaaaa9aaaaaaab }, +{ 0x1ffffffff, 0x1ffffffff, 0x4, 0xffffffff00000000 }, +{ 0xffff000000000000, 0xffff000000000000, 0xffff000000000001, 0xfffeffffffffffff }, +{ 0x3333333333333333, 0x3333333333333333, 0x5555555555555555, 0x1eb851eb851eb851 }, +{ 0x7fffffffffffffff, 0x2, 0x3, 0x5555555555555554 }, +{ 0xffffffffffffffff, 0x2, 0x8000000000000000, 0x3 }, +{ 0xffffffffffffffff, 0x2, 0xc000000000000000, 0x2 }, +{ 0xffffffffffffffff, 0x4000000000000004, 0x8000000000000000, 0x8000000000000007 }, +{ 0xffffffffffffffff, 0x4000000000000001, 0x8000000000000000, 0x8000000000000001 }, +{ 0xffffffffffffffff, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000001 }, +{ 0xfffffffffffffffe, 0x8000000000000001, 0xffffffffffffffff, 0x8000000000000000 }, +{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffe, 0x8000000000000001 }, +{ 0xffffffffffffffff, 0x8000000000000001, 0xfffffffffffffffd, 0x8000000000000002 }, +{ 0x7fffffffffffffff, 0xffffffffffffffff, 0xc000000000000000, 0xaaaaaaaaaaaaaaa8 }, +{ 0xffffffffffffffff, 0x7fffffffffffffff, 0xa000000000000000, 0xccccccccccccccca }, +{ 0xffffffffffffffff, 0x7fffffffffffffff, 0x9000000000000000, 0xe38e38e38e38e38b }, +{ 0x7fffffffffffffff, 0x7fffffffffffffff, 0x5000000000000000, 0xccccccccccccccc9 }, +{ 0xffffffffffffffff, 0xfffffffffffffffe, 0xffffffffffffffff, 0xfffffffffffffffe }, +{ 0xe6102d256d7ea3ae, 0x70a77d0be4c31201, 0xd63ec35ab3220357, 0x78f8bf8cc86c6e18 }, +{ 0xf53bae05cb86c6e1, 0x3847b32d2f8d32e0, 0xcfd4f55a647f403c, 0x42687f79d8998d35 }, +{ 0x9951c5498f941092, 0x1f8c8bfdf287a251, 0xa3c8dc5f81ea3fe2, 0x1d887cb25900091f }, +{ 0x374fee9daa1bb2bb, 0x0d0bfbff7b8ae3ef, 0xc169337bd42d5179, 0x03bb2dbaffcbb961 }, +{ 0xeac0d03ac10eeaf0, 0x89be05dfa162ed9b, 0x92bb1679a41f0e4b, 0xdc5f5cc9e270d216 }, +}; + +/* + * The above table can be verified with the following shell script: + * + * #!/bin/sh + * sed -ne 's/^{ \+\(.*\), \+\(.*\), \+\(.*\), \+\(.*\) },$/\1 \2 \3 \4/p' \ + * lib/math/test_mul_u64_u64_div_u64.c | + * while read a b c r; do + * expected=$( printf "obase=16; ibase=16; %X * %X / %X\n" $a $b $c | bc ) + * given=$( printf "%X\n" $r ) + * if [ "$expected" = "$given" ]; then + * echo "$a * $b / $c = $r OK" + * else + * echo "$a * $b / $c = $r is wrong" >&2 + * echo "should be equivalent to 0x$expected" >&2 + * exit 1 + * fi + * done + */ + +static int __init test_init(void) +{ + int i; + + pr_info("Starting mul_u64_u64_div_u64() test\n"); + + for (i = 0; i < ARRAY_SIZE(test_values); i++) { + u64 a = test_values[i].a; + u64 b = test_values[i].b; + u64 c = test_values[i].c; + u64 expected_result = test_values[i].result; + u64 result = mul_u64_u64_div_u64(a, b, c); + + if (result != expected_result) { + pr_err("ERROR: 0x%016llx * 0x%016llx / 0x%016llx\n", a, b, c); + pr_err("ERROR: expected result: %016llx\n", expected_result); + pr_err("ERROR: obtained result: %016llx\n", result); + } + } + + pr_info("Completed mul_u64_u64_div_u64() test\n"); + return 0; +} + +static void __exit test_exit(void) +{ +} + +module_init(test_init); +module_exit(test_exit); + +MODULE_AUTHOR("Nicolas Pitre"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("mul_u64_u64_div_u64() test module"); -- cgit v1.2.3 From 053a5e4cbba88625ac6b53dea6371006237c34ba Mon Sep 17 00:00:00 2001 From: Jeff Johnson Date: Mon, 15 Jul 2024 07:18:56 -0700 Subject: lib: test_objpool: add missing MODULE_DESCRIPTION() macro make allmodconfig && make W=1 C=1 reports: WARNING: modpost: missing MODULE_DESCRIPTION() in lib/test_objpool.o Add the missing invocation of the MODULE_DESCRIPTION() macro. Link: https://lkml.kernel.org/r/20240715-md-lib-test_objpool-v2-1-5a2b9369c37e@quicinc.com Signed-off-by: Jeff Johnson Reviewed-by: Matt Wu Cc: Masami Hiramatsu Signed-off-by: Andrew Morton --- lib/test_objpool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test_objpool.c b/lib/test_objpool.c index bfdb81599832..5a3f6961a70f 100644 --- a/lib/test_objpool.c +++ b/lib/test_objpool.c @@ -687,4 +687,5 @@ static void __exit ot_mod_exit(void) module_init(ot_mod_init); module_exit(ot_mod_exit); -MODULE_LICENSE("GPL"); \ No newline at end of file +MODULE_DESCRIPTION("Test module for lockless object pool"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From e24f4de8a72b50b67ea116b38152bb98360f81b3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 23 Jul 2024 00:37:26 +0200 Subject: kcov: don't instrument lib/find_bit.c This file produces large amounts of flaky coverage not useful for the KCOV's intended use case (guiding the fuzzing process). Link: https://lkml.kernel.org/r/20240722223726.194658-1-andrey.konovalov@linux.dev Signed-off-by: Andrey Konovalov Reviewed-by: Dmitry Vyukov Cc: Aleksandr Nogikh Cc: Alexander Potapenko Cc: Marco Elver Cc: Rasmus Villemoes Cc: Yury Norov Signed-off-by: Andrew Morton --- lib/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/Makefile b/lib/Makefile index 322bb127b4dc..0fde1c360f32 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -14,6 +14,7 @@ KCOV_INSTRUMENT_list_debug.o := n KCOV_INSTRUMENT_debugobjects.o := n KCOV_INSTRUMENT_dynamic_debug.o := n KCOV_INSTRUMENT_fault-inject.o := n +KCOV_INSTRUMENT_find_bit.o := n # string.o implements standard library functions like memset/memcpy etc. # Use -ffreestanding to ensure that the compiler does not try to "optimize" -- cgit v1.2.3 From 836d13a6ef8a2eb0eab2bd2de06f2deabc62b060 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Sun, 21 Jul 2024 16:36:18 +0300 Subject: xz: switch from public domain to BSD Zero Clause License (0BSD) Remove the public domain notices and add SPDX license identifiers. Change MODULE_LICENSE from "GPL" to "Dual BSD/GPL" because 0BSD should count as a BSD license variant here. The switch to 0BSD was done in the upstream XZ Embedded project because public domain has (real or perceived) legal issues in some jurisdictions. Link: https://lkml.kernel.org/r/20240721133633.47721-4-lasse.collin@tukaani.org Signed-off-by: Lasse Collin Reviewed-by: Sam James Cc: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Albert Ou Cc: Catalin Marinas Cc: Emil Renner Berthing Cc: Herbert Xu Cc: Joel Stanley Cc: Jonathan Corbet Cc: Jubin Zhong Cc: Jules Maselbas Cc: Krzysztof Kozlowski Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Randy Dunlap Cc: Rui Li Cc: Simon Glass Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/decompress/unxz.h | 5 ++--- include/linux/xz.h | 5 ++--- lib/decompress_unxz.c | 5 ++--- lib/xz/xz_crc32.c | 5 ++--- lib/xz/xz_dec_bcj.c | 5 ++--- lib/xz/xz_dec_lzma2.c | 5 ++--- lib/xz/xz_dec_stream.c | 5 ++--- lib/xz/xz_dec_syms.c | 12 +++--------- lib/xz/xz_dec_test.c | 12 +++--------- lib/xz/xz_lzma2.h | 5 ++--- lib/xz/xz_private.h | 5 ++--- lib/xz/xz_stream.h | 5 ++--- scripts/xz_wrap.sh | 5 +---- 13 files changed, 27 insertions(+), 52 deletions(-) (limited to 'lib') diff --git a/include/linux/decompress/unxz.h b/include/linux/decompress/unxz.h index f764e2a7201e..3dd2658a9dab 100644 --- a/include/linux/decompress/unxz.h +++ b/include/linux/decompress/unxz.h @@ -1,10 +1,9 @@ +/* SPDX-License-Identifier: 0BSD */ + /* * Wrapper for decompressing XZ-compressed kernel, initramfs, and initrd * * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ #ifndef DECOMPRESS_UNXZ_H diff --git a/include/linux/xz.h b/include/linux/xz.h index 7285ca5d56e9..5728d57aecc0 100644 --- a/include/linux/xz.h +++ b/include/linux/xz.h @@ -1,11 +1,10 @@ +/* SPDX-License-Identifier: 0BSD */ + /* * XZ decompressor * * Authors: Lasse Collin * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ #ifndef XZ_H diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c index 842894158944..34bb7efc0412 100644 --- a/lib/decompress_unxz.c +++ b/lib/decompress_unxz.c @@ -1,10 +1,9 @@ +// SPDX-License-Identifier: 0BSD + /* * Wrapper for decompressing XZ-compressed kernel, initramfs, and initrd * * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ /* diff --git a/lib/xz/xz_crc32.c b/lib/xz/xz_crc32.c index 88a2c35e1b59..30b8a27110b1 100644 --- a/lib/xz/xz_crc32.c +++ b/lib/xz/xz_crc32.c @@ -1,11 +1,10 @@ +// SPDX-License-Identifier: 0BSD + /* * CRC32 using the polynomial from IEEE-802.3 * * Authors: Lasse Collin * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ /* diff --git a/lib/xz/xz_dec_bcj.c b/lib/xz/xz_dec_bcj.c index ef449e97d1a1..ab9237ed6db8 100644 --- a/lib/xz/xz_dec_bcj.c +++ b/lib/xz/xz_dec_bcj.c @@ -1,11 +1,10 @@ +// SPDX-License-Identifier: 0BSD + /* * Branch/Call/Jump (BCJ) filter decoders * * Authors: Lasse Collin * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ #include "xz_private.h" diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c index 27ce34520e78..613939f5dd6c 100644 --- a/lib/xz/xz_dec_lzma2.c +++ b/lib/xz/xz_dec_lzma2.c @@ -1,11 +1,10 @@ +// SPDX-License-Identifier: 0BSD + /* * LZMA2 decoder * * Authors: Lasse Collin * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ #include "xz_private.h" diff --git a/lib/xz/xz_dec_stream.c b/lib/xz/xz_dec_stream.c index 683570b93a8c..0058406ccd17 100644 --- a/lib/xz/xz_dec_stream.c +++ b/lib/xz/xz_dec_stream.c @@ -1,10 +1,9 @@ +// SPDX-License-Identifier: 0BSD + /* * .xz Stream decoder * * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ #include "xz_private.h" diff --git a/lib/xz/xz_dec_syms.c b/lib/xz/xz_dec_syms.c index 61098c67a413..495d2cc2e6e8 100644 --- a/lib/xz/xz_dec_syms.c +++ b/lib/xz/xz_dec_syms.c @@ -1,10 +1,9 @@ +// SPDX-License-Identifier: 0BSD + /* * XZ decoder module information * * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ #include @@ -25,9 +24,4 @@ EXPORT_SYMBOL(xz_dec_microlzma_end); MODULE_DESCRIPTION("XZ decompressor"); MODULE_VERSION("1.1"); MODULE_AUTHOR("Lasse Collin and Igor Pavlov"); - -/* - * This code is in the public domain, but in Linux it's simplest to just - * say it's GPL and consider the authors as the copyright holders. - */ -MODULE_LICENSE("GPL"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/lib/xz/xz_dec_test.c b/lib/xz/xz_dec_test.c index da28a19d6c98..53d3600f2ddb 100644 --- a/lib/xz/xz_dec_test.c +++ b/lib/xz/xz_dec_test.c @@ -1,10 +1,9 @@ +// SPDX-License-Identifier: 0BSD + /* * XZ decoder tester * * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ #include @@ -212,9 +211,4 @@ module_exit(xz_dec_test_exit); MODULE_DESCRIPTION("XZ decompressor tester"); MODULE_VERSION("1.0"); MODULE_AUTHOR("Lasse Collin "); - -/* - * This code is in the public domain, but in Linux it's simplest to just - * say it's GPL and consider the authors as the copyright holders. - */ -MODULE_LICENSE("GPL"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/lib/xz/xz_lzma2.h b/lib/xz/xz_lzma2.h index 92d852d4f87a..d2632b7dfb9c 100644 --- a/lib/xz/xz_lzma2.h +++ b/lib/xz/xz_lzma2.h @@ -1,11 +1,10 @@ +/* SPDX-License-Identifier: 0BSD */ + /* * LZMA2 definitions * * Authors: Lasse Collin * Igor Pavlov - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ #ifndef XZ_LZMA2_H diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h index bf1e94ec7873..2412a5d54801 100644 --- a/lib/xz/xz_private.h +++ b/lib/xz/xz_private.h @@ -1,10 +1,9 @@ +/* SPDX-License-Identifier: 0BSD */ + /* * Private includes and definitions * * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ #ifndef XZ_PRIVATE_H diff --git a/lib/xz/xz_stream.h b/lib/xz/xz_stream.h index 430bb3a0d195..55f9f6f94b78 100644 --- a/lib/xz/xz_stream.h +++ b/lib/xz/xz_stream.h @@ -1,10 +1,9 @@ +/* SPDX-License-Identifier: 0BSD */ + /* * Definitions for handling the .xz file format * * Author: Lasse Collin - * - * This file has been put into the public domain. - * You can do whatever you want with this file. */ #ifndef XZ_STREAM_H diff --git a/scripts/xz_wrap.sh b/scripts/xz_wrap.sh index d06baf626abe..bb760b721b2c 100755 --- a/scripts/xz_wrap.sh +++ b/scripts/xz_wrap.sh @@ -1,13 +1,10 @@ #!/bin/sh +# SPDX-License-Identifier: 0BSD # # This is a wrapper for xz to compress the kernel image using appropriate # compression options depending on the architecture. # # Author: Lasse Collin -# -# This file has been put into the public domain. -# You can do whatever you want with this file. -# BCJ= LZMA2OPTS= -- cgit v1.2.3 From ff221153aafa08159f3dcc187c6f3a7a837e1c3d Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Sun, 21 Jul 2024 16:36:19 +0300 Subject: xz: fix comments and coding style - Fix comments that were no longer in sync with the code below them. - Fix language errors. - Fix coding style. Link: https://lkml.kernel.org/r/20240721133633.47721-5-lasse.collin@tukaani.org Signed-off-by: Lasse Collin Reviewed-by: Sam James Cc: Albert Ou Cc: Catalin Marinas Cc: Emil Renner Berthing Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Joel Stanley Cc: Jonathan Corbet Cc: Jubin Zhong Cc: Jules Maselbas Cc: Krzysztof Kozlowski Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Randy Dunlap Cc: Rui Li Cc: Simon Glass Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- lib/decompress_unxz.c | 20 ++++++++++---------- lib/xz/Kconfig | 3 ++- scripts/Makefile.lib | 13 ++++++++----- 3 files changed, 20 insertions(+), 16 deletions(-) (limited to 'lib') diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c index 34bb7efc0412..46aa3be13fc5 100644 --- a/lib/decompress_unxz.c +++ b/lib/decompress_unxz.c @@ -102,7 +102,7 @@ #ifdef STATIC # define XZ_PREBOOT #else -#include +# include #endif #ifdef __KERNEL__ # include @@ -219,7 +219,7 @@ void *memmove(void *dest, const void *src, size_t size) #endif /* - * Since we need memmove anyway, would use it as memcpy too. + * Since we need memmove anyway, we could use it as memcpy too. * Commented out for now to avoid breaking things. */ /* @@ -389,17 +389,17 @@ error_alloc_state: } /* - * This macro is used by architecture-specific files to decompress + * This function is used by architecture-specific files to decompress * the kernel image. */ #ifdef XZ_PREBOOT -STATIC int INIT __decompress(unsigned char *buf, long len, - long (*fill)(void*, unsigned long), - long (*flush)(void*, unsigned long), - unsigned char *out_buf, long olen, - long *pos, - void (*error)(char *x)) +STATIC int INIT __decompress(unsigned char *in, long in_size, + long (*fill)(void *dest, unsigned long size), + long (*flush)(void *src, unsigned long size), + unsigned char *out, long out_size, + long *in_used, + void (*error)(char *x)) { - return unxz(buf, len, fill, flush, out_buf, pos, error); + return unxz(in, in_size, fill, flush, out, in_used, error); } #endif diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig index aef086a6bf2f..6b80453d8f54 100644 --- a/lib/xz/Kconfig +++ b/lib/xz/Kconfig @@ -5,7 +5,8 @@ config XZ_DEC help LZMA2 compression algorithm and BCJ filters are supported using the .xz file format as the container. For integrity checking, - CRC32 is supported. See Documentation/staging/xz.rst for more information. + CRC32 is supported. See Documentation/staging/xz.rst for more + information. if XZ_DEC diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 207325eaf1d1..dae2089e7bc6 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -530,14 +530,17 @@ quiet_cmd_fit = FIT $@ # XZ # --------------------------------------------------------------------------- -# Use xzkern to compress the kernel image and xzmisc to compress other things. +# Use xzkern or xzkern_with_size to compress the kernel image and xzmisc to +# compress other things. # # xzkern uses a big LZMA2 dictionary since it doesn't increase memory usage # of the kernel decompressor. A BCJ filter is used if it is available for -# the target architecture. xzkern also appends uncompressed size of the data -# using size_append. The .xz format has the size information available at -# the end of the file too, but it's in more complex format and it's good to -# avoid changing the part of the boot code that reads the uncompressed size. +# the target architecture. +# +# xzkern_with_size also appends uncompressed size of the data using +# size_append. The .xz format has the size information available at the end +# of the file too, but it's in more complex format and it's good to avoid +# changing the part of the boot code that reads the uncompressed size. # Note that the bytes added by size_append will make the xz tool think that # the file is corrupt. This is expected. # -- cgit v1.2.3 From 2ee96abef214550d9e92f5143ee3ac1fd1323e67 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Sun, 21 Jul 2024 16:36:24 +0300 Subject: xz: cleanup CRC32 edits from 2018 In 2018, a dependency on was added to avoid duplicating the same constant in multiple files. Two months later it was found to be a bad idea and the definition of CRC32_POLY_LE macro was moved into xz_private.h to avoid including . xz_private.h is a wrong place for it too. Revert back to the upstream version which has the poly in xz_crc32_init() in xz_crc32.c. Link: https://lkml.kernel.org/r/20240721133633.47721-10-lasse.collin@tukaani.org Fixes: faa16bc404d7 ("lib: Use existing define with polynomial") Fixes: 242cdad873a7 ("lib/xz: Put CRC32_POLY_LE in xz_private.h") Signed-off-by: Lasse Collin Reviewed-by: Sam James Tested-by: Michael Ellerman (powerpc) Cc: Krzysztof Kozlowski Cc: Herbert Xu Cc: Joel Stanley Cc: Albert Ou Cc: Catalin Marinas Cc: Emil Renner Berthing Cc: Greg Kroah-Hartman Cc: Jonathan Corbet Cc: Jubin Zhong Cc: Jules Maselbas Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Randy Dunlap Cc: Rui Li Cc: Simon Glass Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- lib/xz/xz_crc32.c | 2 +- lib/xz/xz_private.h | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/xz/xz_crc32.c b/lib/xz/xz_crc32.c index 30b8a27110b1..effdf34ec48d 100644 --- a/lib/xz/xz_crc32.c +++ b/lib/xz/xz_crc32.c @@ -28,7 +28,7 @@ STATIC_RW_DATA uint32_t xz_crc32_table[256]; XZ_EXTERN void xz_crc32_init(void) { - const uint32_t poly = CRC32_POLY_LE; + const uint32_t poly = 0xEDB88320; uint32_t i; uint32_t j; diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h index 2412a5d54801..811add814ae4 100644 --- a/lib/xz/xz_private.h +++ b/lib/xz/xz_private.h @@ -104,10 +104,6 @@ # endif #endif -#ifndef CRC32_POLY_LE -#define CRC32_POLY_LE 0xedb88320 -#endif - /* * Allocate memory for LZMA2 decoder. xz_dec_lzma2_reset() must be used * before calling xz_dec_lzma2_run(). -- cgit v1.2.3 From bdfc0411717d52b9d2f00e48c452a61389814693 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Sun, 21 Jul 2024 16:36:25 +0300 Subject: xz: optimize for-loop conditions in the BCJ decoders Compilers cannot optimize the addition "i + 4" away since theoretically it could overflow. Link: https://lkml.kernel.org/r/20240721133633.47721-11-lasse.collin@tukaani.org Signed-off-by: Lasse Collin Reviewed-by: Sam James Cc: Albert Ou Cc: Catalin Marinas Cc: Emil Renner Berthing Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Joel Stanley Cc: Jonathan Corbet Cc: Jubin Zhong Cc: Jules Maselbas Cc: Krzysztof Kozlowski Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Randy Dunlap Cc: Rui Li Cc: Simon Glass Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- lib/xz/xz_dec_bcj.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/xz/xz_dec_bcj.c b/lib/xz/xz_dec_bcj.c index ab9237ed6db8..e0b4bf4999c0 100644 --- a/lib/xz/xz_dec_bcj.c +++ b/lib/xz/xz_dec_bcj.c @@ -161,7 +161,9 @@ static size_t bcj_powerpc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) size_t i; uint32_t instr; - for (i = 0; i + 4 <= size; i += 4) { + size &= ~(size_t)3; + + for (i = 0; i < size; i += 4) { instr = get_unaligned_be32(buf + i); if ((instr & 0xFC000003) == 0x48000001) { instr &= 0x03FFFFFC; @@ -218,7 +220,9 @@ static size_t bcj_ia64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) /* Instruction normalized with bit_res for easier manipulation */ uint64_t norm; - for (i = 0; i + 16 <= size; i += 16) { + size &= ~(size_t)15; + + for (i = 0; i < size; i += 16) { mask = branch_table[buf[i] & 0x1F]; for (slot = 0, bit_pos = 5; slot < 3; ++slot, bit_pos += 41) { if (((mask >> slot) & 1) == 0) @@ -266,7 +270,9 @@ static size_t bcj_arm(struct xz_dec_bcj *s, uint8_t *buf, size_t size) size_t i; uint32_t addr; - for (i = 0; i + 4 <= size; i += 4) { + size &= ~(size_t)3; + + for (i = 0; i < size; i += 4) { if (buf[i + 3] == 0xEB) { addr = (uint32_t)buf[i] | ((uint32_t)buf[i + 1] << 8) | ((uint32_t)buf[i + 2] << 16); @@ -289,7 +295,12 @@ static size_t bcj_armthumb(struct xz_dec_bcj *s, uint8_t *buf, size_t size) size_t i; uint32_t addr; - for (i = 0; i + 4 <= size; i += 2) { + if (size < 4) + return 0; + + size -= 4; + + for (i = 0; i <= size; i += 2) { if ((buf[i + 1] & 0xF8) == 0xF0 && (buf[i + 3] & 0xF8) == 0xF8) { addr = (((uint32_t)buf[i + 1] & 0x07) << 19) @@ -317,7 +328,9 @@ static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) size_t i; uint32_t instr; - for (i = 0; i + 4 <= size; i += 4) { + size &= ~(size_t)3; + + for (i = 0; i < size; i += 4) { instr = get_unaligned_be32(buf + i); if ((instr >> 22) == 0x100 || (instr >> 22) == 0x1FF) { instr <<= 2; -- cgit v1.2.3 From 4b62813f5e7d44a33ebd74f03da041712c702bf0 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Sun, 21 Jul 2024 16:36:26 +0300 Subject: xz: Add ARM64 BCJ filter Also omit a duplicated check for XZ_DEC_ARM in xz_private.h. A later commit updates lib/decompress_unxz.c to enable this filter for kernel decompression. lib/decompress_unxz.c is already used if CONFIG_EFI_ZBOOT=y && CONFIG_KERNEL_XZ=y. This filter can be used by Squashfs without modifications to the Squashfs kernel code (only needs support in userspace Squashfs-tools). Link: https://lkml.kernel.org/r/20240721133633.47721-12-lasse.collin@tukaani.org Signed-off-by: Lasse Collin Reviewed-by: Sam James Cc: Albert Ou Cc: Catalin Marinas Cc: Emil Renner Berthing Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Joel Stanley Cc: Jonathan Corbet Cc: Jubin Zhong Cc: Jules Maselbas Cc: Krzysztof Kozlowski Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Randy Dunlap Cc: Rui Li Cc: Simon Glass Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- lib/xz/Kconfig | 5 +++++ lib/xz/xz_dec_bcj.c | 52 +++++++++++++++++++++++++++++++++++++++++++++++++++- lib/xz/xz_private.h | 7 +++++-- 3 files changed, 61 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig index 6b80453d8f54..1166627a87dc 100644 --- a/lib/xz/Kconfig +++ b/lib/xz/Kconfig @@ -30,6 +30,11 @@ config XZ_DEC_ARMTHUMB default y select XZ_DEC_BCJ +config XZ_DEC_ARM64 + bool "ARM64 BCJ filter decoder" if EXPERT + default y + select XZ_DEC_BCJ + config XZ_DEC_SPARC bool "SPARC BCJ filter decoder" if EXPERT default y diff --git a/lib/xz/xz_dec_bcj.c b/lib/xz/xz_dec_bcj.c index e0b4bf4999c0..941198a8a55b 100644 --- a/lib/xz/xz_dec_bcj.c +++ b/lib/xz/xz_dec_bcj.c @@ -23,7 +23,8 @@ struct xz_dec_bcj { BCJ_IA64 = 6, /* Big or little endian */ BCJ_ARM = 7, /* Little endian only */ BCJ_ARMTHUMB = 8, /* Little endian only */ - BCJ_SPARC = 9 /* Big or little endian */ + BCJ_SPARC = 9, /* Big or little endian */ + BCJ_ARM64 = 10 /* AArch64 */ } type; /* @@ -346,6 +347,47 @@ static size_t bcj_sparc(struct xz_dec_bcj *s, uint8_t *buf, size_t size) } #endif +#ifdef XZ_DEC_ARM64 +static size_t bcj_arm64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t instr; + uint32_t addr; + + size &= ~(size_t)3; + + for (i = 0; i < size; i += 4) { + instr = get_unaligned_le32(buf + i); + + if ((instr >> 26) == 0x25) { + /* BL instruction */ + addr = instr - ((s->pos + (uint32_t)i) >> 2); + instr = 0x94000000 | (addr & 0x03FFFFFF); + put_unaligned_le32(instr, buf + i); + + } else if ((instr & 0x9F000000) == 0x90000000) { + /* ADRP instruction */ + addr = ((instr >> 29) & 3) | ((instr >> 3) & 0x1FFFFC); + + /* Only convert values in the range +/-512 MiB. */ + if ((addr + 0x020000) & 0x1C0000) + continue; + + addr -= (s->pos + (uint32_t)i) >> 12; + + instr &= 0x9000001F; + instr |= (addr & 3) << 29; + instr |= (addr & 0x03FFFC) << 3; + instr |= (0U - (addr & 0x020000)) & 0xE00000; + + put_unaligned_le32(instr, buf + i); + } + } + + return i; +} +#endif + /* * Apply the selected BCJ filter. Update *pos and s->pos to match the amount * of data that got filtered. @@ -392,6 +434,11 @@ static void bcj_apply(struct xz_dec_bcj *s, case BCJ_SPARC: filtered = bcj_sparc(s, buf, size); break; +#endif +#ifdef XZ_DEC_ARM64 + case BCJ_ARM64: + filtered = bcj_arm64(s, buf, size); + break; #endif default: /* Never reached but silence compiler warnings. */ @@ -565,6 +612,9 @@ XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) #endif #ifdef XZ_DEC_SPARC case BCJ_SPARC: +#endif +#ifdef XZ_DEC_ARM64 + case BCJ_ARM64: #endif break; diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h index 811add814ae4..307e0de8c260 100644 --- a/lib/xz/xz_private.h +++ b/lib/xz/xz_private.h @@ -36,6 +36,9 @@ # ifdef CONFIG_XZ_DEC_SPARC # define XZ_DEC_SPARC # endif +# ifdef CONFIG_XZ_DEC_ARM64 +# define XZ_DEC_ARM64 +# endif # ifdef CONFIG_XZ_DEC_MICROLZMA # define XZ_DEC_MICROLZMA # endif @@ -97,9 +100,9 @@ */ #ifndef XZ_DEC_BCJ # if defined(XZ_DEC_X86) || defined(XZ_DEC_POWERPC) \ - || defined(XZ_DEC_IA64) || defined(XZ_DEC_ARM) \ + || defined(XZ_DEC_IA64) \ || defined(XZ_DEC_ARM) || defined(XZ_DEC_ARMTHUMB) \ - || defined(XZ_DEC_SPARC) + || defined(XZ_DEC_SPARC) || defined(XZ_DEC_ARM64) # define XZ_DEC_BCJ # endif #endif -- cgit v1.2.3 From 93d09773d1a5339160e23906c68c42644e13e3d8 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Sun, 21 Jul 2024 16:36:27 +0300 Subject: xz: add RISC-V BCJ filter A later commit updates lib/decompress_unxz.c to enable this filter for kernel decompression. lib/decompress_unxz.c is already used if CONFIG_EFI_ZBOOT=y && CONFIG_KERNEL_XZ=y. This filter can be used by Squashfs without modifications to the Squashfs kernel code (only needs support in userspace Squashfs-tools). Link: https://lkml.kernel.org/r/20240721133633.47721-13-lasse.collin@tukaani.org Signed-off-by: Lasse Collin Reviewed-by: Sam James Cc: Albert Ou Cc: Catalin Marinas Cc: Emil Renner Berthing Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Joel Stanley Cc: Jonathan Corbet Cc: Jubin Zhong Cc: Jules Maselbas Cc: Krzysztof Kozlowski Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Randy Dunlap Cc: Rui Li Cc: Simon Glass Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- lib/xz/Kconfig | 5 +++ lib/xz/xz_dec_bcj.c | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++- lib/xz/xz_dec_syms.c | 2 +- lib/xz/xz_private.h | 6 ++- 4 files changed, 114 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig index 1166627a87dc..20aa459bfb3e 100644 --- a/lib/xz/Kconfig +++ b/lib/xz/Kconfig @@ -40,6 +40,11 @@ config XZ_DEC_SPARC default y select XZ_DEC_BCJ +config XZ_DEC_RISCV + bool "RISC-V BCJ filter decoder" if EXPERT + default y + select XZ_DEC_BCJ + config XZ_DEC_MICROLZMA bool "MicroLZMA decoder" default n diff --git a/lib/xz/xz_dec_bcj.c b/lib/xz/xz_dec_bcj.c index 941198a8a55b..42d7f268726f 100644 --- a/lib/xz/xz_dec_bcj.c +++ b/lib/xz/xz_dec_bcj.c @@ -24,7 +24,8 @@ struct xz_dec_bcj { BCJ_ARM = 7, /* Little endian only */ BCJ_ARMTHUMB = 8, /* Little endian only */ BCJ_SPARC = 9, /* Big or little endian */ - BCJ_ARM64 = 10 /* AArch64 */ + BCJ_ARM64 = 10, /* AArch64 */ + BCJ_RISCV = 11 /* RV32GQC_Zfh, RV64GQC_Zfh */ } type; /* @@ -388,6 +389,99 @@ static size_t bcj_arm64(struct xz_dec_bcj *s, uint8_t *buf, size_t size) } #endif +#ifdef XZ_DEC_RISCV +static size_t bcj_riscv(struct xz_dec_bcj *s, uint8_t *buf, size_t size) +{ + size_t i; + uint32_t b1; + uint32_t b2; + uint32_t b3; + uint32_t instr; + uint32_t instr2; + uint32_t instr2_rs1; + uint32_t addr; + + if (size < 8) + return 0; + + size -= 8; + + for (i = 0; i <= size; i += 2) { + instr = buf[i]; + + if (instr == 0xEF) { + /* JAL */ + b1 = buf[i + 1]; + if ((b1 & 0x0D) != 0) + continue; + + b2 = buf[i + 2]; + b3 = buf[i + 3]; + + addr = ((b1 & 0xF0) << 13) | (b2 << 9) | (b3 << 1); + addr -= s->pos + (uint32_t)i; + + buf[i + 1] = (uint8_t)((b1 & 0x0F) + | ((addr >> 8) & 0xF0)); + + buf[i + 2] = (uint8_t)(((addr >> 16) & 0x0F) + | ((addr >> 7) & 0x10) + | ((addr << 4) & 0xE0)); + + buf[i + 3] = (uint8_t)(((addr >> 4) & 0x7F) + | ((addr >> 13) & 0x80)); + + i += 4 - 2; + + } else if ((instr & 0x7F) == 0x17) { + /* AUIPC */ + instr |= (uint32_t)buf[i + 1] << 8; + instr |= (uint32_t)buf[i + 2] << 16; + instr |= (uint32_t)buf[i + 3] << 24; + + if (instr & 0xE80) { + /* AUIPC's rd doesn't equal x0 or x2. */ + instr2 = get_unaligned_le32(buf + i + 4); + + if (((instr << 8) ^ (instr2 - 3)) & 0xF8003) { + i += 6 - 2; + continue; + } + + addr = (instr & 0xFFFFF000) + (instr2 >> 20); + + instr = 0x17 | (2 << 7) | (instr2 << 12); + instr2 = addr; + } else { + /* AUIPC's rd equals x0 or x2. */ + instr2_rs1 = instr >> 27; + + if ((uint32_t)((instr - 0x3117) << 18) + >= (instr2_rs1 & 0x1D)) { + i += 4 - 2; + continue; + } + + addr = get_unaligned_be32(buf + i + 4); + addr -= s->pos + (uint32_t)i; + + instr2 = (instr >> 12) | (addr << 20); + + instr = 0x17 | (instr2_rs1 << 7) + | ((addr + 0x800) & 0xFFFFF000); + } + + put_unaligned_le32(instr, buf + i); + put_unaligned_le32(instr2, buf + i + 4); + + i += 8 - 2; + } + } + + return i; +} +#endif + /* * Apply the selected BCJ filter. Update *pos and s->pos to match the amount * of data that got filtered. @@ -439,6 +533,11 @@ static void bcj_apply(struct xz_dec_bcj *s, case BCJ_ARM64: filtered = bcj_arm64(s, buf, size); break; +#endif +#ifdef XZ_DEC_RISCV + case BCJ_RISCV: + filtered = bcj_riscv(s, buf, size); + break; #endif default: /* Never reached but silence compiler warnings. */ @@ -615,6 +714,9 @@ XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) #endif #ifdef XZ_DEC_ARM64 case BCJ_ARM64: +#endif +#ifdef XZ_DEC_RISCV + case BCJ_RISCV: #endif break; diff --git a/lib/xz/xz_dec_syms.c b/lib/xz/xz_dec_syms.c index 495d2cc2e6e8..f40817d65897 100644 --- a/lib/xz/xz_dec_syms.c +++ b/lib/xz/xz_dec_syms.c @@ -22,6 +22,6 @@ EXPORT_SYMBOL(xz_dec_microlzma_end); #endif MODULE_DESCRIPTION("XZ decompressor"); -MODULE_VERSION("1.1"); +MODULE_VERSION("1.2"); MODULE_AUTHOR("Lasse Collin and Igor Pavlov"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h index 307e0de8c260..a8b1cbe8d21d 100644 --- a/lib/xz/xz_private.h +++ b/lib/xz/xz_private.h @@ -39,6 +39,9 @@ # ifdef CONFIG_XZ_DEC_ARM64 # define XZ_DEC_ARM64 # endif +# ifdef CONFIG_XZ_DEC_RISCV +# define XZ_DEC_RISCV +# endif # ifdef CONFIG_XZ_DEC_MICROLZMA # define XZ_DEC_MICROLZMA # endif @@ -102,7 +105,8 @@ # if defined(XZ_DEC_X86) || defined(XZ_DEC_POWERPC) \ || defined(XZ_DEC_IA64) \ || defined(XZ_DEC_ARM) || defined(XZ_DEC_ARMTHUMB) \ - || defined(XZ_DEC_SPARC) || defined(XZ_DEC_ARM64) + || defined(XZ_DEC_SPARC) || defined(XZ_DEC_ARM64) \ + || defined(XZ_DEC_RISCV) # define XZ_DEC_BCJ # endif #endif -- cgit v1.2.3 From 7472ff8adad8655f38b060a602f66e59c93c4793 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Sun, 21 Jul 2024 16:36:29 +0300 Subject: xz: adjust arch-specific options for better kernel compression Use LZMA2 options that match the arch-specific alignment of instructions. This change reduces compressed kernel size 0-2 % depending on the arch. On 1-byte-aligned x86 it makes no difference and on 4-byte-aligned archs it helps the most. Use the ARM-Thumb filter for ARM-Thumb2 kernels. This reduces compressed kernel size about 5 %.[1] Previously such kernels were compressed using the ARM filter which didn't do anything useful with ARM-Thumb2 code. Add BCJ filter support for ARM64 and RISC-V. Compared to unfiltered XZ or plain LZMA, the compressed kernel size is reduced about 5 % on ARM64 and 7 % on RISC-V. A new enough version of the xz tool is required: 5.4.0 for ARM64 and 5.6.0 for RISC-V. With an old xz version, a message is printed to standard error and the kernel is compressed without the filter. Update lib/decompress_unxz.c to match the changes to xz_wrap.sh. Update the CONFIG_KERNEL_XZ help text in init/Kconfig: - Add the RISC-V and ARM64 filters. - Clarify that the PowerPC filter is for big endian only. - Omit IA-64. Link: https://lore.kernel.org/lkml/1637379771-39449-1-git-send-email-zhongjubin@huawei.com/ [1] Link: https://lkml.kernel.org/r/20240721133633.47721-15-lasse.collin@tukaani.org Signed-off-by: Lasse Collin Reviewed-by: Sam James Cc: Simon Glass Cc: Catalin Marinas Cc: Will Deacon Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Jubin Zhong Cc: Jules Maselbas Cc: Emil Renner Berthing Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Joel Stanley Cc: Jonathan Corbet Cc: Krzysztof Kozlowski Cc: Michael Ellerman Cc: Randy Dunlap Cc: Rui Li Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- init/Kconfig | 5 +- lib/decompress_unxz.c | 14 ++++- scripts/xz_wrap.sh | 142 ++++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 152 insertions(+), 9 deletions(-) (limited to 'lib') diff --git a/init/Kconfig b/init/Kconfig index 5783a0b87517..583cb07176a9 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -310,8 +310,9 @@ config KERNEL_XZ BCJ filters which can improve compression ratio of executable code. The size of the kernel is about 30% smaller with XZ in comparison to gzip. On architectures for which there is a BCJ - filter (i386, x86_64, ARM, IA-64, PowerPC, and SPARC), XZ - will create a few percent smaller kernel than plain LZMA. + filter (i386, x86_64, ARM, ARM64, RISC-V, big endian PowerPC, + and SPARC), XZ will create a few percent smaller kernel than + plain LZMA. The speed is about the same as with LZMA: The decompression speed of XZ is better than that of bzip2 but worse than gzip diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c index 46aa3be13fc5..cae00395d7a6 100644 --- a/lib/decompress_unxz.c +++ b/lib/decompress_unxz.c @@ -126,11 +126,21 @@ #ifdef CONFIG_X86 # define XZ_DEC_X86 #endif -#ifdef CONFIG_PPC +#if defined(CONFIG_PPC) && defined(CONFIG_CPU_BIG_ENDIAN) # define XZ_DEC_POWERPC #endif #ifdef CONFIG_ARM -# define XZ_DEC_ARM +# ifdef CONFIG_THUMB2_KERNEL +# define XZ_DEC_ARMTHUMB +# else +# define XZ_DEC_ARM +# endif +#endif +#ifdef CONFIG_ARM64 +# define XZ_DEC_ARM64 +#endif +#ifdef CONFIG_RISCV +# define XZ_DEC_RISCV #endif #ifdef CONFIG_SPARC # define XZ_DEC_SPARC diff --git a/scripts/xz_wrap.sh b/scripts/xz_wrap.sh index c8c36441ab70..f19369687030 100755 --- a/scripts/xz_wrap.sh +++ b/scripts/xz_wrap.sh @@ -6,14 +6,146 @@ # # Author: Lasse Collin +# This has specialized settings for the following archs. However, +# XZ-compressed kernel isn't currently supported on every listed arch. +# +# Arch Align Notes +# arm 2/4 ARM and ARM-Thumb2 +# arm64 4 +# csky 2 +# loongarch 4 +# mips 2/4 MicroMIPS is 2-byte aligned +# parisc 4 +# powerpc 4 Uses its own wrapper for compressors instead of this. +# riscv 2/4 +# s390 2 +# sh 2 +# sparc 4 +# x86 1 + +# A few archs use 2-byte or 4-byte aligned instructions depending on +# the kernel config. This function is used to check if the relevant +# config option is set to "y". +is_enabled() +{ + grep -q "^$1=y$" include/config/auto.conf +} + +# XZ_VERSION is needed to disable features that aren't available in +# old XZ Utils versions. +XZ_VERSION=$($XZ --robot --version) || exit +XZ_VERSION=$(printf '%s\n' "$XZ_VERSION" | sed -n 's/^XZ_VERSION=//p') + +# Assume that no BCJ filter is available. BCJ= -LZMA2OPTS= +# Set the instruction alignment to 1, 2, or 4 bytes. +# +# Set the BCJ filter if one is available. +# It must match the #ifdef usage in lib/decompress_unxz.c. case $SRCARCH in - x86) BCJ=--x86 ;; - powerpc) BCJ=--powerpc ;; - arm) BCJ=--arm ;; - sparc) BCJ=--sparc ;; + arm) + if is_enabled CONFIG_THUMB2_KERNEL; then + ALIGN=2 + BCJ=--armthumb + else + ALIGN=4 + BCJ=--arm + fi + ;; + + arm64) + ALIGN=4 + + # ARM64 filter was added in XZ Utils 5.4.0. + if [ "$XZ_VERSION" -ge 50040002 ]; then + BCJ=--arm64 + else + echo "$0: Upgrading to xz >= 5.4.0" \ + "would enable the ARM64 filter" \ + "for better compression" >&2 + fi + ;; + + csky) + ALIGN=2 + ;; + + loongarch) + ALIGN=4 + ;; + + mips) + if is_enabled CONFIG_CPU_MICROMIPS; then + ALIGN=2 + else + ALIGN=4 + fi + ;; + + parisc) + ALIGN=4 + ;; + + powerpc) + ALIGN=4 + + # The filter is only for big endian instruction encoding. + if is_enabled CONFIG_CPU_BIG_ENDIAN; then + BCJ=--powerpc + fi + ;; + + riscv) + if is_enabled CONFIG_RISCV_ISA_C; then + ALIGN=2 + else + ALIGN=4 + fi + + # RISC-V filter was added in XZ Utils 5.6.0. + if [ "$XZ_VERSION" -ge 50060002 ]; then + BCJ=--riscv + else + echo "$0: Upgrading to xz >= 5.6.0" \ + "would enable the RISC-V filter" \ + "for better compression" >&2 + fi + ;; + + s390) + ALIGN=2 + ;; + + sh) + ALIGN=2 + ;; + + sparc) + ALIGN=4 + BCJ=--sparc + ;; + + x86) + ALIGN=1 + BCJ=--x86 + ;; + + *) + echo "$0: Arch-specific tuning is missing for '$SRCARCH'" >&2 + + # Guess 2-byte-aligned instructions. Guessing too low + # should hurt less than guessing too high. + ALIGN=2 + ;; +esac + +# Select the LZMA2 options matching the instruction alignment. +case $ALIGN in + 1) LZMA2OPTS= ;; + 2) LZMA2OPTS=lp=1 ;; + 4) LZMA2OPTS=lp=2,lc=2 ;; + *) echo "$0: ALIGN wrong or missing" >&2; exit 1 ;; esac # Use single-threaded mode because it compresses a little better -- cgit v1.2.3 From c6f371bab25edccd39caa5dd452b50d9dfdf4ff0 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Wed, 24 Jul 2024 14:05:41 +0300 Subject: xz: remove XZ_EXTERN and extern from functions XZ_EXTERN was used to make internal functions static in the preboot code. However, in other decompressors this hasn't been done. On x86-64, this makes no difference to the kernel image size. Omit XZ_EXTERN and let some of the internal functions be extern in the preboot code. Omitting XZ_EXTERN from include/linux/xz.h fixes warnings in "make htmldocs" and makes the intradocument links to xz_dec functions work in Documentation/staging/xz.rst. The alternative would have been to add "XZ_EXTERN" to c_id_attributes in Documentation/conf.py but omitting XZ_EXTERN seemed cleaner. Link: https://lore.kernel.org/lkml/20240723205437.3c0664b0@kaneli/ Link: https://lkml.kernel.org/r/20240724110544.16430-1-lasse.collin@tukaani.org Signed-off-by: Lasse Collin Tested-by: Michael Ellerman (powerpc) Cc: Jonathan Corbet Cc: Sam James Cc: Albert Ou Cc: Catalin Marinas Cc: Emil Renner Berthing Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Joel Stanley Cc: Jubin Zhong Cc: Jules Maselbas Cc: Krzysztof Kozlowski Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Randy Dunlap Cc: Rui Li Cc: Simon Glass Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton --- Documentation/staging/xz.rst | 3 --- arch/powerpc/boot/xz_config.h | 3 --- include/linux/xz.h | 35 ++++++++++++----------------------- lib/decompress_unxz.c | 1 - lib/xz/xz_crc32.c | 4 ++-- lib/xz/xz_dec_bcj.c | 9 ++++----- lib/xz/xz_dec_lzma2.c | 10 ++++------ lib/xz/xz_dec_stream.c | 8 ++++---- lib/xz/xz_private.h | 20 ++++++++------------ 9 files changed, 34 insertions(+), 59 deletions(-) (limited to 'lib') diff --git a/Documentation/staging/xz.rst b/Documentation/staging/xz.rst index e1054e9a8e65..6953a189e5f2 100644 --- a/Documentation/staging/xz.rst +++ b/Documentation/staging/xz.rst @@ -95,7 +95,4 @@ xz_dec API This is available with ``#include ``. -``XZ_EXTERN`` is a macro used in the preboot code. Ignore it when -reading this documentation. - .. kernel-doc:: include/linux/xz.h diff --git a/arch/powerpc/boot/xz_config.h b/arch/powerpc/boot/xz_config.h index ebfadd39e192..9506a96ebbcc 100644 --- a/arch/powerpc/boot/xz_config.h +++ b/arch/powerpc/boot/xz_config.h @@ -50,11 +50,8 @@ static inline void put_unaligned_be32(u32 val, void *p) /* prevent the inclusion of the xz-preboot MM headers */ #define DECOMPR_MM_H #define memmove memmove -#define XZ_EXTERN static /* xz.h needs to be included directly since we need enum xz_mode */ #include "../../../include/linux/xz.h" -#undef XZ_EXTERN - #endif diff --git a/include/linux/xz.h b/include/linux/xz.h index 701d62c02b9a..58ae1d746c6f 100644 --- a/include/linux/xz.h +++ b/include/linux/xz.h @@ -18,11 +18,6 @@ # include #endif -/* In Linux, this is used to make extern functions static when needed. */ -#ifndef XZ_EXTERN -# define XZ_EXTERN extern -#endif - /** * enum xz_mode - Operation mode * @@ -190,7 +185,7 @@ struct xz_dec; * ready to be used with xz_dec_run(). If memory allocation fails, * xz_dec_init() returns NULL. */ -XZ_EXTERN struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max); +struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max); /** * xz_dec_run() - Run the XZ decoder @@ -210,7 +205,7 @@ XZ_EXTERN struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max); * get that amount valid data from the beginning of the stream. You must use * the multi-call decoder if you don't want to uncompress the whole stream. */ -XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b); +enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b); /** * xz_dec_reset() - Reset an already allocated decoder state @@ -223,14 +218,14 @@ XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b); * xz_dec_run(). Thus, explicit call to xz_dec_reset() is useful only in * multi-call mode. */ -XZ_EXTERN void xz_dec_reset(struct xz_dec *s); +void xz_dec_reset(struct xz_dec *s); /** * xz_dec_end() - Free the memory allocated for the decoder state * @s: Decoder state allocated using xz_dec_init(). If s is NULL, * this function does nothing. */ -XZ_EXTERN void xz_dec_end(struct xz_dec *s); +void xz_dec_end(struct xz_dec *s); /** * DOC: MicroLZMA decompressor @@ -244,10 +239,6 @@ XZ_EXTERN void xz_dec_end(struct xz_dec *s); * 3/0/2, the first byte is 0xA2. This way the first byte can never be 0x00. * Just like with LZMA2, lc + lp <= 4 must be true. The LZMA end-of-stream * marker must not be used. The unused values are reserved for future use. - * - * These functions aren't used or available in preboot code and thus aren't - * marked with XZ_EXTERN. This avoids warnings about static functions that - * are never defined. */ /* @@ -272,8 +263,8 @@ struct xz_dec_microlzma; * struct xz_dec_microlzma. If memory allocation fails or * dict_size is invalid, NULL is returned. */ -extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, - uint32_t dict_size); +struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, + uint32_t dict_size); /** * xz_dec_microlzma_reset() - Reset the MicroLZMA decoder state @@ -289,9 +280,8 @@ extern struct xz_dec_microlzma *xz_dec_microlzma_alloc(enum xz_mode mode, * requiring stdbool.h. This should normally be set to true. * When this is set to false, error detection is weaker. */ -extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, - uint32_t comp_size, uint32_t uncomp_size, - int uncomp_size_is_exact); +void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, uint32_t comp_size, + uint32_t uncomp_size, int uncomp_size_is_exact); /** * xz_dec_microlzma_run() - Run the MicroLZMA decoder @@ -329,15 +319,14 @@ extern void xz_dec_microlzma_reset(struct xz_dec_microlzma *s, * may be changed normally like with XZ_PREALLOC. This way input data can be * provided from non-contiguous memory. */ -extern enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s, - struct xz_buf *b); +enum xz_ret xz_dec_microlzma_run(struct xz_dec_microlzma *s, struct xz_buf *b); /** * xz_dec_microlzma_end() - Free the memory allocated for the decoder state * @s: Decoder state allocated using xz_dec_microlzma_alloc(). * If s is NULL, this function does nothing. */ -extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s); +void xz_dec_microlzma_end(struct xz_dec_microlzma *s); /* * Standalone build (userspace build or in-kernel build for boot time use) @@ -358,13 +347,13 @@ extern void xz_dec_microlzma_end(struct xz_dec_microlzma *s); * This must be called before any other xz_* function to initialize * the CRC32 lookup table. */ -XZ_EXTERN void xz_crc32_init(void); +void xz_crc32_init(void); /* * Update CRC32 value using the polynomial from IEEE-802.3. To start a new * calculation, the third argument must be zero. To continue the calculation, * the previously returned value is passed as the third argument. */ -XZ_EXTERN uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc); +uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc); #endif #endif diff --git a/lib/decompress_unxz.c b/lib/decompress_unxz.c index cae00395d7a6..32138bb8ef77 100644 --- a/lib/decompress_unxz.c +++ b/lib/decompress_unxz.c @@ -107,7 +107,6 @@ #ifdef __KERNEL__ # include #endif -#define XZ_EXTERN STATIC #ifndef XZ_PREBOOT # include diff --git a/lib/xz/xz_crc32.c b/lib/xz/xz_crc32.c index effdf34ec48d..6a7906a328ba 100644 --- a/lib/xz/xz_crc32.c +++ b/lib/xz/xz_crc32.c @@ -26,7 +26,7 @@ STATIC_RW_DATA uint32_t xz_crc32_table[256]; -XZ_EXTERN void xz_crc32_init(void) +void xz_crc32_init(void) { const uint32_t poly = 0xEDB88320; @@ -45,7 +45,7 @@ XZ_EXTERN void xz_crc32_init(void) return; } -XZ_EXTERN uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc) +uint32_t xz_crc32(const uint8_t *buf, size_t size, uint32_t crc) { crc = ~crc; diff --git a/lib/xz/xz_dec_bcj.c b/lib/xz/xz_dec_bcj.c index 42d7f268726f..8237db17eee3 100644 --- a/lib/xz/xz_dec_bcj.c +++ b/lib/xz/xz_dec_bcj.c @@ -572,9 +572,8 @@ static void bcj_flush(struct xz_dec_bcj *s, struct xz_buf *b) * data in chunks of 1-16 bytes. To hide this issue, this function does * some buffering. */ -XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, - struct xz_dec_lzma2 *lzma2, - struct xz_buf *b) +enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, struct xz_dec_lzma2 *lzma2, + struct xz_buf *b) { size_t out_start; @@ -682,7 +681,7 @@ XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, return s->ret; } -XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call) +struct xz_dec_bcj *xz_dec_bcj_create(bool single_call) { struct xz_dec_bcj *s = kmalloc(sizeof(*s), GFP_KERNEL); if (s != NULL) @@ -691,7 +690,7 @@ XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call) return s; } -XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) +enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id) { switch (id) { #ifdef XZ_DEC_X86 diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c index 613939f5dd6c..83bb66b6016d 100644 --- a/lib/xz/xz_dec_lzma2.c +++ b/lib/xz/xz_dec_lzma2.c @@ -960,8 +960,7 @@ static bool lzma2_lzma(struct xz_dec_lzma2 *s, struct xz_buf *b) * Take care of the LZMA2 control layer, and forward the job of actual LZMA * decoding or copying of uncompressed chunks to other functions. */ -XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, - struct xz_buf *b) +enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, struct xz_buf *b) { uint32_t tmp; @@ -1137,8 +1136,7 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, return XZ_OK; } -XZ_EXTERN struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, - uint32_t dict_max) +struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, uint32_t dict_max) { struct xz_dec_lzma2 *s = kmalloc(sizeof(*s), GFP_KERNEL); if (s == NULL) @@ -1161,7 +1159,7 @@ XZ_EXTERN struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, return s; } -XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) +enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) { /* This limits dictionary size to 3 GiB to keep parsing simpler. */ if (props > 39) @@ -1197,7 +1195,7 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props) return XZ_OK; } -XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s) +void xz_dec_lzma2_end(struct xz_dec_lzma2 *s) { if (DEC_IS_MULTI(s->dict.mode)) vfree(s->dict.buf); diff --git a/lib/xz/xz_dec_stream.c b/lib/xz/xz_dec_stream.c index 0058406ccd17..f9d003684d56 100644 --- a/lib/xz/xz_dec_stream.c +++ b/lib/xz/xz_dec_stream.c @@ -746,7 +746,7 @@ static enum xz_ret dec_main(struct xz_dec *s, struct xz_buf *b) * actually succeeds (that's the price to pay of using the output buffer as * the workspace). */ -XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b) +enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b) { size_t in_start; size_t out_start; @@ -782,7 +782,7 @@ XZ_EXTERN enum xz_ret xz_dec_run(struct xz_dec *s, struct xz_buf *b) return ret; } -XZ_EXTERN struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max) +struct xz_dec *xz_dec_init(enum xz_mode mode, uint32_t dict_max) { struct xz_dec *s = kmalloc(sizeof(*s), GFP_KERNEL); if (s == NULL) @@ -812,7 +812,7 @@ error_bcj: return NULL; } -XZ_EXTERN void xz_dec_reset(struct xz_dec *s) +void xz_dec_reset(struct xz_dec *s) { s->sequence = SEQ_STREAM_HEADER; s->allow_buf_error = false; @@ -824,7 +824,7 @@ XZ_EXTERN void xz_dec_reset(struct xz_dec *s) s->temp.size = STREAM_HEADER_SIZE; } -XZ_EXTERN void xz_dec_end(struct xz_dec *s) +void xz_dec_end(struct xz_dec *s) { if (s != NULL) { xz_dec_lzma2_end(s->lzma2); diff --git a/lib/xz/xz_private.h b/lib/xz/xz_private.h index a8b1cbe8d21d..5f1294a1408c 100644 --- a/lib/xz/xz_private.h +++ b/lib/xz/xz_private.h @@ -115,8 +115,7 @@ * Allocate memory for LZMA2 decoder. xz_dec_lzma2_reset() must be used * before calling xz_dec_lzma2_run(). */ -XZ_EXTERN struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, - uint32_t dict_max); +struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, uint32_t dict_max); /* * Decode the LZMA2 properties (one byte) and reset the decoder. Return @@ -124,22 +123,20 @@ XZ_EXTERN struct xz_dec_lzma2 *xz_dec_lzma2_create(enum xz_mode mode, * big enough, and XZ_OPTIONS_ERROR if props indicates something that this * decoder doesn't support. */ -XZ_EXTERN enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, - uint8_t props); +enum xz_ret xz_dec_lzma2_reset(struct xz_dec_lzma2 *s, uint8_t props); /* Decode raw LZMA2 stream from b->in to b->out. */ -XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, - struct xz_buf *b); +enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, struct xz_buf *b); /* Free the memory allocated for the LZMA2 decoder. */ -XZ_EXTERN void xz_dec_lzma2_end(struct xz_dec_lzma2 *s); +void xz_dec_lzma2_end(struct xz_dec_lzma2 *s); #ifdef XZ_DEC_BCJ /* * Allocate memory for BCJ decoders. xz_dec_bcj_reset() must be used before * calling xz_dec_bcj_run(). */ -XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call); +struct xz_dec_bcj *xz_dec_bcj_create(bool single_call); /* * Decode the Filter ID of a BCJ filter. This implementation doesn't @@ -147,16 +144,15 @@ XZ_EXTERN struct xz_dec_bcj *xz_dec_bcj_create(bool single_call); * is needed. Returns XZ_OK if the given Filter ID is supported. * Otherwise XZ_OPTIONS_ERROR is returned. */ -XZ_EXTERN enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id); +enum xz_ret xz_dec_bcj_reset(struct xz_dec_bcj *s, uint8_t id); /* * Decode raw BCJ + LZMA2 stream. This must be used only if there actually is * a BCJ filter in the chain. If the chain has only LZMA2, xz_dec_lzma2_run() * must be called directly. */ -XZ_EXTERN enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, - struct xz_dec_lzma2 *lzma2, - struct xz_buf *b); +enum xz_ret xz_dec_bcj_run(struct xz_dec_bcj *s, struct xz_dec_lzma2 *lzma2, + struct xz_buf *b); /* Free the memory allocated for the BCJ filters. */ #define xz_dec_bcj_end(s) kfree(s) -- cgit v1.2.3 From 7b76689a021d19a016310bd5da35450641b67966 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 14 Jul 2024 19:33:07 +0200 Subject: dyndbg: use seq_putc() in ddebug_proc_show() Single characters should be put into a sequence. Thus use the corresponding function "seq_putc". This issue was transformed by using the Coccinelle software. Link: https://lkml.kernel.org/r/375b5b4b-6295-419e-bae9-da724a7a682d@web.de Signed-off-by: Markus Elfring Cc: Jason Baron Cc: Jim Cromie Signed-off-by: Andrew Morton --- lib/dynamic_debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index f2c5e7910bb1..5a007952f7f2 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -1147,7 +1147,7 @@ static int ddebug_proc_show(struct seq_file *m, void *p) iter->table->mod_name, dp->function, ddebug_describe_flags(dp->flags, &flags)); seq_escape_str(m, dp->format, ESCAPE_SPACE, "\t\r\n\""); - seq_puts(m, "\""); + seq_putc(m, '"'); if (dp->class_id != _DPRINTK_CLASS_DFLT) { class = ddebug_class_name(iter, dp); @@ -1156,7 +1156,7 @@ static int ddebug_proc_show(struct seq_file *m, void *p) else seq_printf(m, " class unknown, _id:%d", dp->class_id); } - seq_puts(m, "\n"); + seq_putc(m, '\n'); return 0; } -- cgit v1.2.3 From fbe617af697c336db7630762158127eaa5a1d223 Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Sun, 14 Jul 2024 19:15:09 +0200 Subject: closures: use seq_putc() in debug_show() A single line break should be put into a sequence. Thus use the corresponding function "seq_putc". This issue was transformed by using the Coccinelle software. Link: https://lkml.kernel.org/r/e7faa2c4-9590-44b4-8669-69ef810277b1@web.de Signed-off-by: Markus Elfring Cc: Kent Overstreet Signed-off-by: Andrew Morton --- lib/closure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/closure.c b/lib/closure.c index 116afae2eed9..2bfe7d2a0048 100644 --- a/lib/closure.c +++ b/lib/closure.c @@ -278,7 +278,7 @@ static int debug_show(struct seq_file *f, void *data) seq_printf(f, " W %pS\n", (void *) cl->waiting_on); - seq_puts(f, "\n"); + seq_putc(f, '\n'); } spin_unlock_irq(&closure_list_lock); -- cgit v1.2.3 From 9a42bfd255b288dad2d1a9df0a1fe58394d5da12 Mon Sep 17 00:00:00 2001 From: Deshan Zhang Date: Thu, 25 Jul 2024 17:30:45 +0800 Subject: lib/lru_cache: fix spelling mistake "colision"->"collision" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a spelling mistake in a literal string and in cariable names. Fix these. Link: https://lkml.kernel.org/r/20240725093044.1742842-1-deshan@nfschina.com Signed-off-by: Deshan Zhang Cc: Christoph Böhmwalder Cc: Lars Ellenberg Cc: Philipp Reisner Signed-off-by: Andrew Morton --- include/linux/lru_cache.h | 4 ++-- lib/lru_cache.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index c9afcdd9324c..ff82ef85a084 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -119,7 +119,7 @@ write intent log information, three of which are mentioned here. */ /* this defines an element in a tracked set - * .colision is for hash table lookup. + * .collision is for hash table lookup. * When we process a new IO request, we know its sector, thus can deduce the * region number (label) easily. To do the label -> object lookup without a * full list walk, we use a simple hash table. @@ -145,7 +145,7 @@ write intent log information, three of which are mentioned here. * But it avoids high order page allocations in kmalloc. */ struct lc_element { - struct hlist_node colision; + struct hlist_node collision; struct list_head list; /* LRU list or free list */ unsigned refcnt; /* back "pointer" into lc_cache->element[index], diff --git a/lib/lru_cache.c b/lib/lru_cache.c index b3d9187611de..9e0d469c7658 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -243,7 +243,7 @@ static struct lc_element *__lc_find(struct lru_cache *lc, unsigned int enr, BUG_ON(!lc); BUG_ON(!lc->nr_elements); - hlist_for_each_entry(e, lc_hash_slot(lc, enr), colision) { + hlist_for_each_entry(e, lc_hash_slot(lc, enr), collision) { /* "about to be changed" elements, pending transaction commit, * are hashed by their "new number". "Normal" elements have * lc_number == lc_new_number. */ @@ -303,7 +303,7 @@ void lc_del(struct lru_cache *lc, struct lc_element *e) BUG_ON(e->refcnt); e->lc_number = e->lc_new_number = LC_FREE; - hlist_del_init(&e->colision); + hlist_del_init(&e->collision); list_move(&e->list, &lc->free); RETURN(); } @@ -324,9 +324,9 @@ static struct lc_element *lc_prepare_for_change(struct lru_cache *lc, unsigned n PARANOIA_LC_ELEMENT(lc, e); e->lc_new_number = new_number; - if (!hlist_unhashed(&e->colision)) - __hlist_del(&e->colision); - hlist_add_head(&e->colision, lc_hash_slot(lc, new_number)); + if (!hlist_unhashed(&e->collision)) + __hlist_del(&e->collision); + hlist_add_head(&e->collision, lc_hash_slot(lc, new_number)); list_move(&e->list, &lc->to_be_changed); return e; -- cgit v1.2.3 From b6e21b71208f289a796d786bd695ec25eae4ed9a Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Fri, 26 Jul 2024 17:49:46 +0200 Subject: lib: checksum: use ARRAY_SIZE() to improve assert_setup_correct() Use ARRAY_SIZE() to simplify the assert_setup_correct() function and improve its readability. Link: https://lkml.kernel.org/r/20240726154946.230928-1-thorsten.blum@toblux.com Signed-off-by: Thorsten Blum Signed-off-by: Andrew Morton --- lib/checksum_kunit.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/checksum_kunit.c b/lib/checksum_kunit.c index 4e4d081a1d3b..be04aa42125c 100644 --- a/lib/checksum_kunit.c +++ b/lib/checksum_kunit.c @@ -468,12 +468,9 @@ static __wsum to_wsum(u32 x) static void assert_setup_correct(struct kunit *test) { - CHECK_EQ(sizeof(random_buf) / sizeof(random_buf[0]), MAX_LEN); - CHECK_EQ(sizeof(expected_results) / sizeof(expected_results[0]), - MAX_LEN); - CHECK_EQ(sizeof(init_sums_no_overflow) / - sizeof(init_sums_no_overflow[0]), - MAX_LEN); + CHECK_EQ(ARRAY_SIZE(random_buf), MAX_LEN); + CHECK_EQ(ARRAY_SIZE(expected_results), MAX_LEN); + CHECK_EQ(ARRAY_SIZE(init_sums_no_overflow), MAX_LEN); } /* -- cgit v1.2.3 From e0ba72e3a4422c4255fa80191a637d7c65ef4c59 Mon Sep 17 00:00:00 2001 From: "J. R. Okajima" Date: Tue, 23 Jul 2024 16:40:17 +0000 Subject: lockdep: upper limit LOCKDEP_CHAINS_BITS CONFIG_LOCKDEP_CHAINS_BITS value decides the size of chain_hlocks[] in kernel/locking/lockdep.c, and it is checked by add_chain_cache() with BUILD_BUG_ON((1UL << 24) <= ARRAY_SIZE(chain_hlocks)); This patch is just to silence BUILD_BUG_ON(). See also https://lore.kernel.org/all/30795.1620913191@jrobl/ [cmllamas@google.com: fix minor checkpatch issues in commit log] Link: https://lkml.kernel.org/r/20240723164018.2489615-1-cmllamas@google.com Signed-off-by: J. R. Okajima Signed-off-by: Carlos Llamas Acked-by: Tetsuo Handa Cc: Peter Zijlstra Cc: Boqun Feng Cc: Ingo Molnar Cc: Waiman Long Cc: Will Deacon Signed-off-by: Andrew Morton --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index bf0995d328b3..a81d452941ce 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1515,7 +1515,7 @@ config LOCKDEP_BITS config LOCKDEP_CHAINS_BITS int "Bitsize for MAX_LOCKDEP_CHAINS" depends on LOCKDEP && !LOCKDEP_SMALL - range 10 30 + range 10 21 default 16 help Try increasing this value if you hit "BUG: MAX_LOCKDEP_CHAINS too low!" message. -- cgit v1.2.3 From a15bec6a8f2f177e6c1388f23d02436e27994299 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 6 Aug 2024 08:39:27 -0700 Subject: lib/rhashtable: cleanup fallback check in bucket_table_alloc() Upon allocation failure, the current check with the nofail bits is unnecessary, and further stands in the way of discouraging direct use of __GFP_NOFAIL. Remove this and replace with the proper way of determining if doing a non-blocking allocation for the nested table case. Link: https://lkml.kernel.org/r/20240806153927.184515-1-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Suggested-by: Michal Hocko Cc: Davidlohr Bueso Cc: Herbert Xu Signed-off-by: Andrew Morton --- lib/rhashtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/rhashtable.c b/lib/rhashtable.c index dbbed19f8fff..6c902639728b 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -189,7 +189,7 @@ static struct bucket_table *bucket_table_alloc(struct rhashtable *ht, size = nbuckets; - if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) { + if (tbl == NULL && !gfpflags_allow_blocking(gfp)) { tbl = nested_bucket_table_alloc(ht, nbuckets, gfp); nbuckets = 0; } -- cgit v1.2.3 From 6ce2082fd3a25d5a8c756120959237cace0379f1 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 13 Aug 2024 15:12:35 +0300 Subject: fault-inject: improve build for CONFIG_FAULT_INJECTION=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fault-inject.h users across the kernel need to add a lot of #ifdef CONFIG_FAULT_INJECTION to cater for shortcomings in the header. Make fault-inject.h self-contained for CONFIG_FAULT_INJECTION=n, and add stubs for DECLARE_FAULT_ATTR(), setup_fault_attr(), should_fail_ex(), and should_fail() to allow removal of conditional compilation. [akpm@linux-foundation.org: repair fallout from no longer including debugfs.h into fault-inject.h] [akpm@linux-foundation.org: fix drivers/misc/xilinx_tmr_inject.c] [akpm@linux-foundation.org: Add debugfs.h inclusion to more files, per Stephen] Link: https://lkml.kernel.org/r/20240813121237.2382534-1-jani.nikula@intel.com Fixes: 6ff1cb355e62 ("[PATCH] fault-injection capabilities infrastructure") Signed-off-by: Jani Nikula Cc: Akinobu Mita Cc: Abhinav Kumar Cc: Dmitry Baryshkov Cc: Himal Prasad Ghimiray Cc: Lucas De Marchi Cc: Rob Clark Cc: Rodrigo Vivi Cc: Thomas Hellström Cc: Stephen Rothwell Signed-off-by: Andrew Morton --- drivers/gpu/drm/msm/msm_drv.c | 1 + drivers/iommu/iommufd/selftest.c | 1 + drivers/misc/xilinx_tmr_inject.c | 1 + drivers/nvme/host/fault_inject.c | 1 + drivers/ufs/core/ufs-fault-injection.c | 1 + include/linux/fault-inject.h | 36 +++++++++++++++++++++++++++------- include/linux/mmc/host.h | 1 + include/ufs/ufshcd.h | 1 + kernel/futex/core.c | 1 + lib/fault-inject.c | 1 + mm/fail_page_alloc.c | 1 + mm/failslab.c | 1 + 12 files changed, 40 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c index 9c33f4e3f822..e018bc79e188 100644 --- a/drivers/gpu/drm/msm/msm_drv.c +++ b/drivers/gpu/drm/msm/msm_drv.c @@ -7,6 +7,7 @@ #include #include +#include #include #include diff --git a/drivers/iommu/iommufd/selftest.c b/drivers/iommu/iommufd/selftest.c index 222cfc11ebfd..db4032feccee 100644 --- a/drivers/iommu/iommufd/selftest.c +++ b/drivers/iommu/iommufd/selftest.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/misc/xilinx_tmr_inject.c b/drivers/misc/xilinx_tmr_inject.c index 73c6da7d0963..734fdfac19ef 100644 --- a/drivers/misc/xilinx_tmr_inject.c +++ b/drivers/misc/xilinx_tmr_inject.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/drivers/nvme/host/fault_inject.c b/drivers/nvme/host/fault_inject.c index 1d1b6441a339..105d6cb41c72 100644 --- a/drivers/nvme/host/fault_inject.c +++ b/drivers/nvme/host/fault_inject.c @@ -6,6 +6,7 @@ */ #include +#include #include "nvme.h" static DECLARE_FAULT_ATTR(fail_default_attr); diff --git a/drivers/ufs/core/ufs-fault-injection.c b/drivers/ufs/core/ufs-fault-injection.c index 169540417079..55db38e75cc4 100644 --- a/drivers/ufs/core/ufs-fault-injection.c +++ b/drivers/ufs/core/ufs-fault-injection.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include "ufs-fault-injection.h" diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 354413950d34..8c829d28dcf3 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -2,13 +2,17 @@ #ifndef _LINUX_FAULT_INJECT_H #define _LINUX_FAULT_INJECT_H +#include +#include + +struct dentry; +struct kmem_cache; + #ifdef CONFIG_FAULT_INJECTION -#include -#include +#include #include #include -#include /* * For explanation of the elements of this struct, see @@ -51,6 +55,28 @@ int setup_fault_attr(struct fault_attr *attr, char *str); bool should_fail_ex(struct fault_attr *attr, ssize_t size, int flags); bool should_fail(struct fault_attr *attr, ssize_t size); +#else /* CONFIG_FAULT_INJECTION */ + +struct fault_attr { +}; + +#define DECLARE_FAULT_ATTR(name) struct fault_attr name = {} + +static inline int setup_fault_attr(struct fault_attr *attr, char *str) +{ + return 0; /* Note: 0 means error for __setup() handlers! */ +} +static inline bool should_fail_ex(struct fault_attr *attr, ssize_t size, int flags) +{ + return false; +} +static inline bool should_fail(struct fault_attr *attr, ssize_t size) +{ + return false; +} + +#endif /* CONFIG_FAULT_INJECTION */ + #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS struct dentry *fault_create_debugfs_attr(const char *name, @@ -87,10 +113,6 @@ static inline void fault_config_init(struct fault_config *config, #endif /* CONFIG_FAULT_INJECTION_CONFIGFS */ -#endif /* CONFIG_FAULT_INJECTION */ - -struct kmem_cache; - #ifdef CONFIG_FAIL_PAGE_ALLOC bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order); #else diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 88c6a76042ee..49470188fca7 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h index 0fd2aebac728..3f68ae3e4330 100644 --- a/include/ufs/ufshcd.h +++ b/include/ufs/ufshcd.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 06a1f091be81..136768ae2637 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/lib/fault-inject.c b/lib/fault-inject.c index d608f9b48c10..52eb6ba29698 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/fail_page_alloc.c b/mm/fail_page_alloc.c index 532851ce5132..7647096170e9 100644 --- a/mm/fail_page_alloc.c +++ b/mm/fail_page_alloc.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include +#include #include #include diff --git a/mm/failslab.c b/mm/failslab.c index af16c2ed578f..c3901b136498 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include "slab.h" -- cgit v1.2.3 From cbf164cd44e06c78938b4a4a4479d3541779c319 Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Tue, 13 Aug 2024 01:02:29 +0800 Subject: lib/bcd: optimize _bin2bcd() for improved performance The original _bin2bcd() function used / 10 and % 10 operations for conversion. Although GCC optimizes these operations and does not generate division or modulus instructions, the new implementation reduces the number of mov instructions in the generated code for both x86-64 and ARM architectures. This optimization calculates the tens digit using (val * 103) >> 10, which is accurate for values of 'val' in the range [0, 178]. Given that the valid input range is [0, 99], this method ensures correctness while simplifying the generated code. Link: https://lkml.kernel.org/r/20240812170229.229380-1-visitorckw@gmail.com Signed-off-by: Kuan-Wei Chiu Cc: Ching-Chun (Jim) Huang Signed-off-by: Andrew Morton --- lib/bcd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/bcd.c b/lib/bcd.c index 7e4750b6e801..c5e79ba9cd7b 100644 --- a/lib/bcd.c +++ b/lib/bcd.c @@ -10,6 +10,8 @@ EXPORT_SYMBOL(_bcd2bin); unsigned char _bin2bcd(unsigned val) { - return ((val / 10) << 4) + val % 10; + const unsigned int t = (val * 103) >> 10; + + return (t << 4) | (val - t * 10); } EXPORT_SYMBOL(_bin2bcd); -- cgit v1.2.3 From 16d9691ad4b562ea19271f0788738f649c02cf3c Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 14 Aug 2024 08:44:13 +0200 Subject: lib/percpu_counter: add missing __percpu qualifier to a cast Add missing __percpu qualifier to a (void *) cast to fix percpu_counter.c:212:36: warning: cast removes address space '__percpu' of expression percpu_counter.c:212:33: warning: incorrect type in assignment (different address spaces) percpu_counter.c:212:33: expected signed int [noderef] [usertype] __percpu *counters percpu_counter.c:212:33: got void * sparse warnings. Found by GCC's named address space checks. There were no changes in the resulting object file. Link: https://lkml.kernel.org/r/20240814064437.940162-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Signed-off-by: Andrew Morton --- lib/percpu_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 51bc5246986d..2891f94a11c6 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -209,7 +209,7 @@ int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, INIT_LIST_HEAD(&fbc[i].list); #endif fbc[i].count = amount; - fbc[i].counters = (void *)counters + (i * counter_size); + fbc[i].counters = (void __percpu *)counters + i * counter_size; debug_percpu_counter_activate(&fbc[i]); } -- cgit v1.2.3 From fb54ea1ee84534cab6a15515c73a0811bdcbc973 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 21 Aug 2024 18:51:04 +0300 Subject: dimlib: use *-y instead of *-objs in Makefile *-objs suffix is reserved rather for (user-space) host programs while usually *-y suffix is used for kernel drivers (although *-objs works for that purpose for now). Let's correct the old usages of *-objs in Makefiles. Link: https://lkml.kernel.org/r/20240821155140.611514-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Florian Fainelli Reviewed-by: Alexander Lobakin Cc: Rasmus Villemoes Cc: Tal Gilboa Signed-off-by: Andrew Morton --- lib/dim/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/dim/Makefile b/lib/dim/Makefile index c4cc4026c451..5b9bfaac7ac1 100644 --- a/lib/dim/Makefile +++ b/lib/dim/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_DIMLIB) += dimlib.o -dimlib-objs := dim.o net_dim.o rdma_dim.o +dimlib-y := dim.o net_dim.o rdma_dim.o -- cgit v1.2.3 From 38676d9e33133c0c39951b812b19cc5b9ff1978a Mon Sep 17 00:00:00 2001 From: Yang Ruibin <11162571@vivo.com> Date: Wed, 21 Aug 2024 03:34:40 -0400 Subject: lib: fix the NULL vs IS_ERR() bug for debugfs_create_dir() debugfs_create_dir() returns error pointers. It never returns NULL. So use IS_ERR() to check it. Link: https://lkml.kernel.org/r/20240821073441.9701-1-11162571@vivo.com Signed-off-by: Yang Ruibin <11162571@vivo.com> Signed-off-by: Andrew Morton --- lib/test_fpu_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/test_fpu_glue.c b/lib/test_fpu_glue.c index 074f30301f29..c0596426370a 100644 --- a/lib/test_fpu_glue.c +++ b/lib/test_fpu_glue.c @@ -42,7 +42,7 @@ static int __init test_fpu_init(void) return -EINVAL; selftest_dir = debugfs_create_dir("selftest_helpers", NULL); - if (!selftest_dir) + if (IS_ERR(selftest_dir)) return -ENOMEM; debugfs_create_file_unsafe("test_fpu", 0444, selftest_dir, NULL, -- cgit v1.2.3 From 00d066a4d4edbe559ba6c35153da71d4b2b8a383 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 29 Aug 2024 14:33:37 +0200 Subject: netdev_features: convert NETIF_F_LLTX to dev->lltx NETIF_F_LLTX can't be changed via Ethtool and is not a feature, rather an attribute, very similar to IFF_NO_QUEUE (and hot). Free one netdev_features_t bit and make it a "hot" private flag. Signed-off-by: Alexander Lobakin Signed-off-by: Paolo Abeni --- Documentation/networking/net_cachelines/net_device.rst | 1 + Documentation/networking/netdev-features.rst | 8 -------- Documentation/networking/netdevices.rst | 4 ++-- drivers/net/amt.c | 2 +- drivers/net/bareudp.c | 2 +- drivers/net/bonding/bond_main.c | 2 +- drivers/net/dummy.c | 3 ++- drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 3 ++- drivers/net/ethernet/freescale/dpaa/dpaa_eth.c | 3 ++- drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c | 3 ++- drivers/net/ethernet/mellanox/mlxsw/spectrum.c | 3 ++- drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 3 +-- drivers/net/ethernet/pasemi/pasemi_mac.c | 5 +++-- drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c | 2 +- drivers/net/ethernet/sfc/ef100_rep.c | 4 ++-- drivers/net/ethernet/tehuti/tehuti.c | 4 ++-- drivers/net/ethernet/tehuti/tehuti.h | 2 +- drivers/net/ethernet/toshiba/spider_net.c | 3 ++- drivers/net/geneve.c | 2 +- drivers/net/gtp.c | 2 +- drivers/net/hamradio/bpqether.c | 2 +- drivers/net/ipvlan/ipvlan_main.c | 3 ++- drivers/net/loopback.c | 2 +- drivers/net/macsec.c | 4 ++-- drivers/net/macvlan.c | 3 ++- drivers/net/net_failover.c | 2 +- drivers/net/netkit.c | 3 ++- drivers/net/nlmon.c | 4 ++-- drivers/net/ppp/ppp_generic.c | 2 +- drivers/net/rionet.c | 2 +- drivers/net/team/team_core.c | 2 +- drivers/net/tun.c | 5 +++-- drivers/net/veth.c | 2 +- drivers/net/vrf.c | 2 +- drivers/net/vsockmon.c | 4 ++-- drivers/net/vxlan/vxlan_core.c | 2 +- drivers/net/wireguard/device.c | 2 +- drivers/staging/octeon/ethernet.c | 2 +- include/linux/netdev_features.h | 6 ++---- include/linux/netdevice.h | 10 +++++++--- lib/test_bpf.c | 3 +-- net/8021q/vlan_dev.c | 4 ++-- net/batman-adv/soft-interface.c | 2 +- net/bridge/br_device.c | 3 ++- net/core/net-sysfs.c | 3 +-- net/dsa/user.c | 3 ++- net/ethtool/common.c | 1 - net/hsr/hsr_device.c | 4 ++-- net/ipv4/ip_gre.c | 4 +++- net/ipv4/ip_vti.c | 2 +- net/ipv4/ipip.c | 2 +- net/ipv6/ip6_gre.c | 4 +++- net/ipv6/ip6_tunnel.c | 2 +- net/ipv6/sit.c | 2 +- net/l2tp/l2tp_eth.c | 2 +- net/openvswitch/vport-internal_dev.c | 9 +++++---- net/xfrm/xfrm_interface_core.c | 2 +- 57 files changed, 93 insertions(+), 84 deletions(-) (limited to 'lib') diff --git a/Documentation/networking/net_cachelines/net_device.rst b/Documentation/networking/net_cachelines/net_device.rst index 1fe3cdfe3582..e55319277074 100644 --- a/Documentation/networking/net_cachelines/net_device.rst +++ b/Documentation/networking/net_cachelines/net_device.rst @@ -8,6 +8,7 @@ net_device struct fast path usage breakdown Type Name fastpath_tx_access fastpath_rx_access Comments ..struct ..net_device unsigned_long:32 priv_flags read_mostly - __dev_queue_xmit(tx) +unsigned_long:1 lltx read_mostly - HARD_TX_LOCK,HARD_TX_TRYLOCK,HARD_TX_UNLOCK(tx) char name[16] - - struct_netdev_name_node* name_node struct_dev_ifalias* ifalias diff --git a/Documentation/networking/netdev-features.rst b/Documentation/networking/netdev-features.rst index d7b15bb64deb..f29d982ebf5d 100644 --- a/Documentation/networking/netdev-features.rst +++ b/Documentation/networking/netdev-features.rst @@ -139,14 +139,6 @@ chained skbs (skb->next/prev list). Features contained in NETIF_F_SOFT_FEATURES are features of networking stack. Driver should not change behaviour based on them. - * LLTX driver (deprecated for hardware drivers) - -NETIF_F_LLTX is meant to be used by drivers that don't need locking at all, -e.g. software tunnels. - -This is also used in a few legacy drivers that implement their -own locking, don't use it for new (hardware) drivers. - * netns-local device NETIF_F_NETNS_LOCAL is set for devices that are not allowed to move between diff --git a/Documentation/networking/netdevices.rst b/Documentation/networking/netdevices.rst index c2476917a6c3..857c9784f87e 100644 --- a/Documentation/networking/netdevices.rst +++ b/Documentation/networking/netdevices.rst @@ -258,11 +258,11 @@ ndo_get_stats: ndo_start_xmit: Synchronization: __netif_tx_lock spinlock. - When the driver sets NETIF_F_LLTX in dev->features this will be + When the driver sets dev->lltx this will be called without holding netif_tx_lock. In this case the driver has to lock by itself when needed. The locking there should also properly protect against - set_rx_mode. WARNING: use of NETIF_F_LLTX is deprecated. + set_rx_mode. WARNING: use of dev->lltx is deprecated. Don't use it for new drivers. Context: Process with BHs disabled or BH (timer), diff --git a/drivers/net/amt.c b/drivers/net/amt.c index 6d15ab3bfbbc..921bbfd72a38 100644 --- a/drivers/net/amt.c +++ b/drivers/net/amt.c @@ -3098,7 +3098,7 @@ static void amt_link_setup(struct net_device *dev) dev->hard_header_len = 0; dev->addr_len = 0; dev->priv_flags |= IFF_NO_QUEUE; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->features |= NETIF_F_GSO_SOFTWARE; dev->features |= NETIF_F_NETNS_LOCAL; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM; diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c index d5c56ca91b77..6f4de883e872 100644 --- a/drivers/net/bareudp.c +++ b/drivers/net/bareudp.c @@ -553,7 +553,6 @@ static void bareudp_setup(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &bareudp_type); dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; - dev->features |= NETIF_F_LLTX; dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->hw_features |= NETIF_F_RXCSUM; @@ -566,6 +565,7 @@ static void bareudp_setup(struct net_device *dev) dev->type = ARPHRD_NONE; netif_keep_dst(dev); dev->priv_flags |= IFF_NO_QUEUE; + dev->lltx = true; dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; } diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index cb6a694313d5..8ae887cdc231 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -5928,7 +5928,7 @@ void bond_setup(struct net_device *bond_dev) #endif /* CONFIG_XFRM_OFFLOAD */ /* don't acquire bond device's netif_tx_lock when transmitting */ - bond_dev->features |= NETIF_F_LLTX; + bond_dev->lltx = true; /* By default, we declare the bond to be fully * VLAN hardware accelerated capable. Special diff --git a/drivers/net/dummy.c b/drivers/net/dummy.c index d29b5d7af0d7..e9c5e1e11fa0 100644 --- a/drivers/net/dummy.c +++ b/drivers/net/dummy.c @@ -109,9 +109,10 @@ static void dummy_setup(struct net_device *dev) dev->flags |= IFF_NOARP; dev->flags &= ~IFF_MULTICAST; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; + dev->lltx = true; dev->features |= NETIF_F_SG | NETIF_F_FRAGLIST; dev->features |= NETIF_F_GSO_SOFTWARE; - dev->features |= NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_LLTX; + dev->features |= NETIF_F_HW_CSUM | NETIF_F_HIGHDMA; dev->features |= NETIF_F_GSO_ENCAP_ALL; dev->hw_features |= dev->features; dev->hw_enc_features |= dev->features; diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c index 7d7d3e0098df..3b7068832f95 100644 --- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c @@ -1034,7 +1034,8 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) netdev->hw_features |= NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM; netdev->features |= NETIF_F_SG | NETIF_F_IP_CSUM | - NETIF_F_RXCSUM | NETIF_F_LLTX | NETIF_F_HIGHDMA; + NETIF_F_RXCSUM | NETIF_F_HIGHDMA; + netdev->lltx = true; if (vlan_tso_capable(adapter)) { netdev->features |= diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c index 5d99cfb4e994..1d0208b5db7d 100644 --- a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c +++ b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.c @@ -229,7 +229,7 @@ static int dpaa_netdev_init(struct net_device *net_dev, net_dev->max_mtu = dpaa_get_max_mtu(); net_dev->hw_features |= (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | - NETIF_F_LLTX | NETIF_F_RXHASH); + NETIF_F_RXHASH); net_dev->hw_features |= NETIF_F_SG | NETIF_F_HIGHDMA; /* The kernels enables GSO automatically, if we declare NETIF_F_SG. @@ -239,6 +239,7 @@ static int dpaa_netdev_init(struct net_device *net_dev, net_dev->features |= NETIF_F_RXCSUM; net_dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + net_dev->lltx = true; /* we do not want shared skbs on TX */ net_dev->priv_flags &= ~IFF_TX_SKB_SHARING; diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 6866807973da..29886a8ba73f 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -4594,12 +4594,13 @@ static int dpaa2_eth_netdev_init(struct net_device *net_dev) net_dev->priv_flags |= supported; net_dev->priv_flags &= ~not_supported; + net_dev->lltx = true; /* Features */ net_dev->features = NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA | - NETIF_F_LLTX | NETIF_F_HW_TC | NETIF_F_TSO; + NETIF_F_HW_TC | NETIF_F_TSO; net_dev->gso_max_segs = DPAA2_ETH_ENQUEUE_MAX_FDS; net_dev->hw_features = net_dev->features; net_dev->xdp_features = NETDEV_XDP_ACT_BASIC | diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index f064789f3240..44d6e125bd6f 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -1676,9 +1676,10 @@ static int mlxsw_sp_port_create(struct mlxsw_sp *mlxsw_sp, u16 local_port, netif_carrier_off(dev); - dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_LLTX | NETIF_F_SG | + dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_SG | NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_TC; dev->hw_features |= NETIF_F_HW_TC | NETIF_F_LOOPBACK; + dev->lltx = true; dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = MLXSW_PORT_MAX_MTU - MLXSW_PORT_ETH_FRAME_HDR; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index eee0bfc41074..227e7a5d712e 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -248,7 +248,6 @@ nfp_repr_fix_features(struct net_device *netdev, netdev_features_t features) features = netdev_intersect_features(features, lower_features); features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_HW_TC); - features |= NETIF_F_LLTX; return features; } @@ -386,7 +385,7 @@ int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, netif_set_tso_max_segs(netdev, NFP_NET_LSO_MAX_SEGS); netdev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL; - netdev->features |= NETIF_F_LLTX; + netdev->lltx = true; if (nfp_app_has_tc(app)) { netdev->features |= NETIF_F_HW_TC; diff --git a/drivers/net/ethernet/pasemi/pasemi_mac.c b/drivers/net/ethernet/pasemi/pasemi_mac.c index 62ba269da902..cb4e12df7719 100644 --- a/drivers/net/ethernet/pasemi/pasemi_mac.c +++ b/drivers/net/ethernet/pasemi/pasemi_mac.c @@ -1699,8 +1699,9 @@ pasemi_mac_probe(struct pci_dev *pdev, const struct pci_device_id *ent) netif_napi_add(dev, &mac->napi, pasemi_mac_poll); - dev->features = NETIF_F_IP_CSUM | NETIF_F_LLTX | NETIF_F_SG | - NETIF_F_HIGHDMA | NETIF_F_GSO; + dev->features = NETIF_F_IP_CSUM | NETIF_F_SG | NETIF_F_HIGHDMA | + NETIF_F_GSO; + dev->lltx = true; mac->dma_pdev = pci_get_device(PCI_VENDOR_ID_PASEMI, 0xa007, NULL); if (!mac->dma_pdev) { diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c index f1e40aade127..4f0ddcedfa97 100644 --- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c @@ -286,7 +286,7 @@ void rmnet_vnd_setup(struct net_device *rmnet_dev) rmnet_dev->needs_free_netdev = true; rmnet_dev->ethtool_ops = &rmnet_ethtool_ops; - rmnet_dev->features |= NETIF_F_LLTX; + rmnet_dev->lltx = true; /* This perm addr will be used as interface identifier by IPv6 */ rmnet_dev->addr_assign_type = NET_ADDR_RANDOM; diff --git a/drivers/net/ethernet/sfc/ef100_rep.c b/drivers/net/ethernet/sfc/ef100_rep.c index 0b3083ef0ead..e923e1796369 100644 --- a/drivers/net/ethernet/sfc/ef100_rep.c +++ b/drivers/net/ethernet/sfc/ef100_rep.c @@ -233,8 +233,8 @@ static struct efx_rep *efx_ef100_rep_create_netdev(struct efx_nic *efx, net_dev->ethtool_ops = &efx_ef100_rep_ethtool_ops; net_dev->min_mtu = EFX_MIN_MTU; net_dev->max_mtu = EFX_MAX_MTU; - net_dev->features |= NETIF_F_LLTX; - net_dev->hw_features |= NETIF_F_LLTX; + net_dev->lltx = true; + return efv; fail1: free_netdev(net_dev); diff --git a/drivers/net/ethernet/tehuti/tehuti.c b/drivers/net/ethernet/tehuti/tehuti.c index ede5f7890fb4..fc77f424f90b 100644 --- a/drivers/net/ethernet/tehuti/tehuti.c +++ b/drivers/net/ethernet/tehuti/tehuti.c @@ -1671,7 +1671,7 @@ static netdev_tx_t bdx_tx_transmit(struct sk_buff *skb, #endif #ifdef BDX_LLTX - netif_trans_update(ndev); /* NETIF_F_LLTX driver :( */ + netif_trans_update(ndev); /* dev->lltx driver :( */ #endif ndev->stats.tx_packets++; ndev->stats.tx_bytes += skb->len; @@ -2019,7 +2019,7 @@ bdx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) * set multicast list callback has to use priv->tx_lock. */ #ifdef BDX_LLTX - ndev->features |= NETIF_F_LLTX; + ndev->lltx = true; #endif /* MTU range: 60 - 16384 */ ndev->min_mtu = ETH_ZLEN; diff --git a/drivers/net/ethernet/tehuti/tehuti.h b/drivers/net/ethernet/tehuti/tehuti.h index 909e7296cecf..47a2d3e5f8ed 100644 --- a/drivers/net/ethernet/tehuti/tehuti.h +++ b/drivers/net/ethernet/tehuti/tehuti.h @@ -260,7 +260,7 @@ struct bdx_priv { int tx_update_mark; int tx_noupd; #endif - spinlock_t tx_lock; /* NETIF_F_LLTX mode */ + spinlock_t tx_lock; /* dev->lltx mode */ /* rarely used */ u8 port; diff --git a/drivers/net/ethernet/toshiba/spider_net.c b/drivers/net/ethernet/toshiba/spider_net.c index 87e67121477c..a4937c18d7cb 100644 --- a/drivers/net/ethernet/toshiba/spider_net.c +++ b/drivers/net/ethernet/toshiba/spider_net.c @@ -2277,10 +2277,11 @@ spider_net_setup_netdev(struct spider_net_card *card) netdev->hw_features = NETIF_F_RXCSUM | NETIF_F_IP_CSUM; if (SPIDER_NET_RX_CSUM_DEFAULT) netdev->features |= NETIF_F_RXCSUM; - netdev->features |= NETIF_F_IP_CSUM | NETIF_F_LLTX; + netdev->features |= NETIF_F_IP_CSUM; /* some time: NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | * NETIF_F_HW_VLAN_CTAG_FILTER */ + netdev->lltx = true; /* MTU range: 64 - 2294 */ netdev->min_mtu = SPIDER_NET_MIN_MTU; diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 838e85ddec67..7f611c74eb62 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1194,7 +1194,6 @@ static void geneve_setup(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &geneve_type); - dev->features |= NETIF_F_LLTX; dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; @@ -1215,6 +1214,7 @@ static void geneve_setup(struct net_device *dev) netif_keep_dst(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; + dev->lltx = true; eth_hw_addr_random(dev); } diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 2e94d10348cc..a60bfb1abb7f 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -1356,7 +1356,7 @@ static void gtp_link_setup(struct net_device *dev) dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; dev->priv_flags |= IFF_NO_QUEUE; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; netif_keep_dst(dev); dev->needed_headroom = LL_MAX_HEADER + GTP_IPV4_MAXLEN; diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index 83a16d10eedb..bac1bb69d63a 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -458,7 +458,7 @@ static void bpq_setup(struct net_device *dev) dev->needs_free_netdev = true; dev->flags = 0; - dev->features = NETIF_F_LLTX; /* Allow recursion */ + dev->lltx = true; /* Allow recursion */ #if IS_ENABLED(CONFIG_AX25) dev->header_ops = &ax25_header_ops; diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 094f44dac5c8..ee2c3cf4df36 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -114,7 +114,7 @@ static void ipvlan_port_destroy(struct net_device *dev) NETIF_F_GSO_ROBUST | NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL) #define IPVLAN_ALWAYS_ON \ - (IPVLAN_ALWAYS_ON_OFLOADS | NETIF_F_LLTX | NETIF_F_VLAN_CHALLENGED) + (IPVLAN_ALWAYS_ON_OFLOADS | NETIF_F_VLAN_CHALLENGED) #define IPVLAN_FEATURES \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ @@ -141,6 +141,7 @@ static int ipvlan_init(struct net_device *dev) dev->vlan_features = phy_dev->vlan_features & IPVLAN_FEATURES; dev->vlan_features |= IPVLAN_ALWAYS_ON_OFLOADS; dev->hw_enc_features |= dev->features; + dev->lltx = true; netif_inherit_tso_max(dev, phy_dev); dev->hard_header_len = phy_dev->hard_header_len; diff --git a/drivers/net/loopback.c b/drivers/net/loopback.c index 2b486e7c749c..bf857782be0f 100644 --- a/drivers/net/loopback.c +++ b/drivers/net/loopback.c @@ -171,6 +171,7 @@ static void gen_lo_setup(struct net_device *dev, dev->type = ARPHRD_LOOPBACK; /* 0x0001*/ dev->flags = IFF_LOOPBACK; dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE; + dev->lltx = true; netif_keep_dst(dev); dev->hw_features = NETIF_F_GSO_SOFTWARE; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST @@ -179,7 +180,6 @@ static void gen_lo_setup(struct net_device *dev, | NETIF_F_RXCSUM | NETIF_F_SCTP_CRC | NETIF_F_HIGHDMA - | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL | NETIF_F_VLAN_CHALLENGED | NETIF_F_LOOPBACK; diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 2da70bc3dd86..12d1b205f6d1 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -3550,7 +3550,8 @@ static int macsec_dev_init(struct net_device *dev) return err; dev->features = real_dev->features & MACSEC_FEATURES; - dev->features |= NETIF_F_LLTX | NETIF_F_GSO_SOFTWARE; + dev->features |= NETIF_F_GSO_SOFTWARE; + dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; macsec_set_head_tail_room(dev); @@ -3581,7 +3582,6 @@ static netdev_features_t macsec_fix_features(struct net_device *dev, features &= (real_dev->features & MACSEC_FEATURES) | NETIF_F_GSO_SOFTWARE | NETIF_F_SOFT_FEATURES; - features |= NETIF_F_LLTX; return features; } diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index b45f137f365e..cf18e66de142 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -900,7 +900,7 @@ static struct lock_class_key macvlan_netdev_addr_lock_key; (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | \ NETIF_F_GSO_ROBUST | NETIF_F_GSO_ENCAP_ALL) -#define ALWAYS_ON_FEATURES (ALWAYS_ON_OFFLOADS | NETIF_F_LLTX) +#define ALWAYS_ON_FEATURES ALWAYS_ON_OFFLOADS #define MACVLAN_FEATURES \ (NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ @@ -932,6 +932,7 @@ static int macvlan_init(struct net_device *dev) dev->vlan_features = lowerdev->vlan_features & MACVLAN_FEATURES; dev->vlan_features |= ALWAYS_ON_OFFLOADS; dev->hw_enc_features |= dev->features; + dev->lltx = true; netif_inherit_tso_max(dev, lowerdev); dev->hard_header_len = lowerdev->hard_header_len; macvlan_set_lockdep_class(dev); diff --git a/drivers/net/net_failover.c b/drivers/net/net_failover.c index 963d8b4af28d..06728385a35f 100644 --- a/drivers/net/net_failover.c +++ b/drivers/net/net_failover.c @@ -731,7 +731,7 @@ struct failover *net_failover_create(struct net_device *standby_dev) IFF_TX_SKB_SHARING); /* don't acquire failover netdev's netif_tx_lock when transmitting */ - failover_dev->features |= NETIF_F_LLTX; + failover_dev->lltx = true; /* Don't allow failover devices to change network namespaces. */ failover_dev->features |= NETIF_F_NETNS_LOCAL; diff --git a/drivers/net/netkit.c b/drivers/net/netkit.c index 16789cd446e9..79232f5cc088 100644 --- a/drivers/net/netkit.c +++ b/drivers/net/netkit.c @@ -255,11 +255,12 @@ static void netkit_setup(struct net_device *dev) dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; dev->priv_flags |= IFF_PHONY_HEADROOM; dev->priv_flags |= IFF_NO_QUEUE; + dev->lltx = true; dev->ethtool_ops = &netkit_ethtool_ops; dev->netdev_ops = &netkit_netdev_ops; - dev->features |= netkit_features | NETIF_F_LLTX; + dev->features |= netkit_features; dev->hw_features = netkit_features; dev->hw_enc_features = netkit_features; dev->mpls_features = NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE; diff --git a/drivers/net/nlmon.c b/drivers/net/nlmon.c index e5a0987a263e..8bfd4ee5a8c4 100644 --- a/drivers/net/nlmon.c +++ b/drivers/net/nlmon.c @@ -63,13 +63,13 @@ static void nlmon_setup(struct net_device *dev) { dev->type = ARPHRD_NETLINK; dev->priv_flags |= IFF_NO_QUEUE; + dev->lltx = true; dev->netdev_ops = &nlmon_ops; dev->ethtool_ops = &nlmon_ethtool_ops; dev->needs_free_netdev = true; - dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | - NETIF_F_HIGHDMA | NETIF_F_LLTX; + dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; dev->flags = IFF_NOARP; dev->pcpu_stat_type = NETDEV_PCPU_STAT_LSTATS; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index eb9acfcaeb09..4b2971e2bf48 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1631,7 +1631,7 @@ static void ppp_setup(struct net_device *dev) dev->netdev_ops = &ppp_netdev_ops; SET_NETDEV_DEVTYPE(dev, &ppp_type); - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->hard_header_len = PPP_HDRLEN; dev->mtu = PPP_MRU; diff --git a/drivers/net/rionet.c b/drivers/net/rionet.c index 4eececc94513..318a0ef1af50 100644 --- a/drivers/net/rionet.c +++ b/drivers/net/rionet.c @@ -515,7 +515,7 @@ static int rionet_setup_netdev(struct rio_mport *mport, struct net_device *ndev) /* MTU range: 68 - 4082 */ ndev->min_mtu = ETH_MIN_MTU; ndev->max_mtu = RIONET_MAX_MTU; - ndev->features = NETIF_F_LLTX; + ndev->lltx = true; SET_NETDEV_DEV(ndev, &mport->dev); ndev->ethtool_ops = &rionet_ethtool_ops; diff --git a/drivers/net/team/team_core.c b/drivers/net/team/team_core.c index ab1935a4aa2c..1d1bad3cedc2 100644 --- a/drivers/net/team/team_core.c +++ b/drivers/net/team/team_core.c @@ -2189,8 +2189,8 @@ static void team_setup(struct net_device *dev) * Let this up to underlay drivers. */ dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE; + dev->lltx = true; - dev->features |= NETIF_F_LLTX; dev->features |= NETIF_F_GRO; /* Don't allow team devices to change network namespaces. */ diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 1d06c560c5e6..a645c36f2cee 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -990,10 +990,11 @@ static int tun_net_init(struct net_device *dev) dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST | TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - dev->features = dev->hw_features | NETIF_F_LLTX; + dev->features = dev->hw_features; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX); + dev->lltx = true; tun->flags = (tun->flags & ~TUN_FEATURES) | (ifr->ifr_flags & TUN_FEATURES); @@ -1129,7 +1130,7 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) goto drop; } - /* NETIF_F_LLTX requires to do our own update of trans_start */ + /* dev->lltx requires to do our own update of trans_start */ queue = netdev_get_tx_queue(dev, txq); txq_trans_cond_update(queue); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 34499b91a8bd..18148e068aa0 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1697,11 +1697,11 @@ static void veth_setup(struct net_device *dev) dev->priv_flags |= IFF_NO_QUEUE; dev->priv_flags |= IFF_PHONY_HEADROOM; dev->priv_flags |= IFF_DISABLE_NETPOLL; + dev->lltx = true; dev->netdev_ops = &veth_netdev_ops; dev->xdp_metadata_ops = &veth_xdp_metadata_ops; dev->ethtool_ops = &veth_ethtool_ops; - dev->features |= NETIF_F_LLTX; dev->features |= VETH_FEATURES; dev->vlan_features = dev->features & ~(NETIF_F_HW_VLAN_CTAG_TX | diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index a900908eb24a..b1d4ef538995 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -1635,7 +1635,7 @@ static void vrf_setup(struct net_device *dev) eth_hw_addr_random(dev); /* don't acquire vrf device's netif_tx_lock when transmitting */ - dev->features |= NETIF_F_LLTX; + dev->lltx = true; /* don't allow vrf devices to change network namespaces. */ dev->features |= NETIF_F_NETNS_LOCAL; diff --git a/drivers/net/vsockmon.c b/drivers/net/vsockmon.c index 4c260074c091..53fb76d574c6 100644 --- a/drivers/net/vsockmon.c +++ b/drivers/net/vsockmon.c @@ -83,13 +83,13 @@ static void vsockmon_setup(struct net_device *dev) { dev->type = ARPHRD_VSOCKMON; dev->priv_flags |= IFF_NO_QUEUE; + dev->lltx = true; dev->netdev_ops = &vsockmon_ops; dev->ethtool_ops = &vsockmon_ethtool_ops; dev->needs_free_netdev = true; - dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | - NETIF_F_HIGHDMA | NETIF_F_LLTX; + dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA; dev->flags = IFF_NOARP; diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 4a54dd1950c1..53dcb9fffc04 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3321,7 +3321,6 @@ static void vxlan_setup(struct net_device *dev) dev->needs_free_netdev = true; SET_NETDEV_DEVTYPE(dev, &vxlan_type); - dev->features |= NETIF_F_LLTX; dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST; dev->features |= NETIF_F_RXCSUM; dev->features |= NETIF_F_GSO_SOFTWARE; @@ -3333,6 +3332,7 @@ static void vxlan_setup(struct net_device *dev) netif_keep_dst(dev); dev->priv_flags |= IFF_NO_QUEUE; dev->change_proto_down = true; + dev->lltx = true; /* MTU range: 68 - 65535 */ dev->min_mtu = ETH_MIN_MTU; diff --git a/drivers/net/wireguard/device.c b/drivers/net/wireguard/device.c index 3feb36ee5bfb..45e9b908dbfb 100644 --- a/drivers/net/wireguard/device.c +++ b/drivers/net/wireguard/device.c @@ -289,7 +289,7 @@ static void wg_setup(struct net_device *dev) dev->type = ARPHRD_NONE; dev->flags = IFF_POINTOPOINT | IFF_NOARP; dev->priv_flags |= IFF_NO_QUEUE; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->features |= WG_NETDEV_FEATURES; dev->hw_features |= WG_NETDEV_FEATURES; dev->hw_enc_features |= WG_NETDEV_FEATURES; diff --git a/drivers/staging/octeon/ethernet.c b/drivers/staging/octeon/ethernet.c index 9eee28f2940c..a5e99cc78a45 100644 --- a/drivers/staging/octeon/ethernet.c +++ b/drivers/staging/octeon/ethernet.c @@ -425,7 +425,7 @@ int cvm_oct_common_init(struct net_device *dev) dev->features |= NETIF_F_SG | NETIF_F_IP_CSUM; /* We do our own locking, Linux doesn't need to */ - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->ethtool_ops = &cvm_oct_ethtool_ops; cvm_oct_set_mac_filter(dev); diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 7c2d77d75a88..a2e20b517584 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -24,8 +24,7 @@ enum { NETIF_F_HW_VLAN_CTAG_FILTER_BIT,/* Receive filtering on VLAN CTAGs */ NETIF_F_VLAN_CHALLENGED_BIT, /* Device cannot handle VLAN packets */ NETIF_F_GSO_BIT, /* Enable software GSO. */ - NETIF_F_LLTX_BIT, /* LockLess TX - deprecated. Please */ - /* do not use LLTX in new drivers */ + __UNUSED_NETIF_F_12, NETIF_F_NETNS_LOCAL_BIT, /* Does not change network namespaces */ NETIF_F_GRO_BIT, /* Generic receive offload */ NETIF_F_LRO_BIT, /* large receive offload */ @@ -120,7 +119,6 @@ enum { #define NETIF_F_HW_VLAN_CTAG_TX __NETIF_F(HW_VLAN_CTAG_TX) #define NETIF_F_IP_CSUM __NETIF_F(IP_CSUM) #define NETIF_F_IPV6_CSUM __NETIF_F(IPV6_CSUM) -#define NETIF_F_LLTX __NETIF_F(LLTX) #define NETIF_F_LOOPBACK __NETIF_F(LOOPBACK) #define NETIF_F_LRO __NETIF_F(LRO) #define NETIF_F_NETNS_LOCAL __NETIF_F(NETNS_LOCAL) @@ -193,7 +191,7 @@ static inline int find_next_netdev_feature(u64 feature, unsigned long start) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ #define NETIF_F_NEVER_CHANGE (NETIF_F_VLAN_CHALLENGED | \ - NETIF_F_LLTX | NETIF_F_NETNS_LOCAL) + NETIF_F_NETNS_LOCAL) /* remember that ((t)1 << t_BITS) is undefined in C99 */ #define NETIF_F_ETHTOOL_BITS ((__NETIF_F_BIT(NETDEV_FEATURE_COUNT - 1) | \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index d6f35c9d8580..e22ca5e07490 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1720,6 +1720,9 @@ enum netdev_reg_state { * * @priv_flags: flags invisible to userspace defined as bits, see * enum netdev_priv_flags for the definitions + * @lltx: device supports lockless Tx. Deprecated for real HW + * drivers. Mainly used by logical interfaces, such as + * bonding and tunnels * * @name: This is the first field of the "visible" part of this structure * (i.e. as seen by users in the "Space.c" file). It is the name @@ -2018,6 +2021,7 @@ struct net_device { __cacheline_group_begin(net_device_read_tx); struct_group(priv_flags_fast, unsigned long priv_flags:32; + unsigned long lltx:1; ); const struct net_device_ops *netdev_ops; const struct header_ops *header_ops; @@ -4433,7 +4437,7 @@ static inline void netif_tx_unlock_bh(struct net_device *dev) } #define HARD_TX_LOCK(dev, txq, cpu) { \ - if ((dev->features & NETIF_F_LLTX) == 0) { \ + if (!(dev)->lltx) { \ __netif_tx_lock(txq, cpu); \ } else { \ __netif_tx_acquire(txq); \ @@ -4441,12 +4445,12 @@ static inline void netif_tx_unlock_bh(struct net_device *dev) } #define HARD_TX_TRYLOCK(dev, txq) \ - (((dev->features & NETIF_F_LLTX) == 0) ? \ + (!(dev)->lltx ? \ __netif_tx_trylock(txq) : \ __netif_tx_acquire(txq)) #define HARD_TX_UNLOCK(dev, txq) { \ - if ((dev->features & NETIF_F_LLTX) == 0) { \ + if (!(dev)->lltx) { \ __netif_tx_unlock(txq); \ } else { \ __netif_tx_release(txq); \ diff --git a/lib/test_bpf.c b/lib/test_bpf.c index ca4b0eea81a2..fa5edd6ef7f7 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -15077,8 +15077,7 @@ static struct skb_segment_test skb_segment_tests[] __initconst = { .build_skb = build_test_skb_linear_no_head_frag, .features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_GSO | - NETIF_F_LLTX | NETIF_F_GRO | - NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM | + NETIF_F_GRO | NETIF_F_IPV6_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_STAG_TX } }; diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 217be32426b5..3ca485537d77 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -569,7 +569,8 @@ static int vlan_dev_init(struct net_device *dev) if (real_dev->vlan_features & NETIF_F_HW_MACSEC) dev->hw_features |= NETIF_F_HW_MACSEC; - dev->features |= dev->hw_features | NETIF_F_LLTX; + dev->features |= dev->hw_features; + dev->lltx = true; netif_inherit_tso_max(dev, real_dev); if (dev->features & NETIF_F_VLAN_FEATURES) netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n"); @@ -655,7 +656,6 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev, lower_features |= NETIF_F_HW_CSUM; features = netdev_intersect_features(features, lower_features); features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE); - features |= NETIF_F_LLTX; return features; } diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 30ecbc2ef1fd..e791a73ef901 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1021,8 +1021,8 @@ static void batadv_softif_init_early(struct net_device *dev) dev->needs_free_netdev = true; dev->priv_destructor = batadv_softif_free; dev->features |= NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_NETNS_LOCAL; - dev->features |= NETIF_F_LLTX; dev->priv_flags |= IFF_NO_QUEUE; + dev->lltx = true; /* can't call min_mtu, because the needed variables * have not been initialized yet diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index fb1115857e49..a6d25113dfb1 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -487,8 +487,9 @@ void br_dev_setup(struct net_device *dev) dev->ethtool_ops = &br_ethtool_ops; SET_NETDEV_DEVTYPE(dev, &br_type); dev->priv_flags = IFF_EBRIDGE | IFF_NO_QUEUE; + dev->lltx = true; - dev->features = COMMON_FEATURES | NETIF_F_LLTX | NETIF_F_NETNS_LOCAL | + dev->features = COMMON_FEATURES | NETIF_F_NETNS_LOCAL | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; dev->hw_features = COMMON_FEATURES | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 444f23e74f8e..01a9fb54ef42 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1764,8 +1764,7 @@ static const struct kobj_type netdev_queue_ktype = { static bool netdev_uses_bql(const struct net_device *dev) { - if (dev->features & NETIF_F_LLTX || - dev->priv_flags & IFF_NO_QUEUE) + if (dev->lltx || (dev->priv_flags & IFF_NO_QUEUE)) return false; return IS_ENABLED(CONFIG_BQL); diff --git a/net/dsa/user.c b/net/dsa/user.c index f5adfa1d978a..74eda9b30608 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -2642,11 +2642,12 @@ void dsa_user_setup_tagger(struct net_device *user) user->features = conduit->vlan_features | NETIF_F_HW_TC; user->hw_features |= NETIF_F_HW_TC; - user->features |= NETIF_F_LLTX; if (user->needed_tailroom) user->features &= ~(NETIF_F_SG | NETIF_F_FRAGLIST); if (ds->needs_standalone_vlan_filtering) user->features |= NETIF_F_HW_VLAN_CTAG_FILTER; + + user->lltx = true; } int dsa_user_suspend(struct net_device *user_dev) diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 7257ae272296..447fc80e1b00 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -25,7 +25,6 @@ const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = { [NETIF_F_HW_VLAN_STAG_FILTER_BIT] = "rx-vlan-stag-filter", [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged", [NETIF_F_GSO_BIT] = "tx-generic-segmentation", - [NETIF_F_LLTX_BIT] = "tx-lockless", [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", [NETIF_F_GRO_BIT] = "rx-gro", [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c index e4cc6b78dcfc..d4c783076662 100644 --- a/net/hsr/hsr_device.c +++ b/net/hsr/hsr_device.c @@ -554,6 +554,8 @@ void hsr_dev_setup(struct net_device *dev) dev->netdev_ops = &hsr_device_ops; SET_NETDEV_DEVTYPE(dev, &hsr_type); dev->priv_flags |= IFF_NO_QUEUE | IFF_DISABLE_NETPOLL; + /* Prevent recursive tx locking */ + dev->lltx = true; dev->needs_free_netdev = true; @@ -563,8 +565,6 @@ void hsr_dev_setup(struct net_device *dev) dev->features = dev->hw_features; - /* Prevent recursive tx locking */ - dev->features |= NETIF_F_LLTX; /* VLAN on top of HSR needs testing and probably some work on * hsr_header_create() etc. */ diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index ba205473522e..b54c41f3ae3c 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -996,7 +996,7 @@ static void __gre_tunnel_init(struct net_device *dev) tunnel->hlen = tunnel->tun_hlen + tunnel->encap_hlen; dev->needed_headroom = tunnel->hlen + sizeof(tunnel->parms.iph); - dev->features |= GRE_FEATURES | NETIF_F_LLTX; + dev->features |= GRE_FEATURES; dev->hw_features |= GRE_FEATURES; /* TCP offload with GRE SEQ is not supported, nor can we support 2 @@ -1010,6 +1010,8 @@ static void __gre_tunnel_init(struct net_device *dev) dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; + + dev->lltx = true; } static int ipgre_tunnel_init(struct net_device *dev) diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 14536da9f5dc..f0b4419cef34 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -443,7 +443,7 @@ static int vti_tunnel_init(struct net_device *dev) dev->flags = IFF_NOARP; dev->addr_len = 4; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; netif_keep_dst(dev); return ip_tunnel_init(dev); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 923a2ef68c2f..dc0db5895e0e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -378,7 +378,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->type = ARPHRD_TUNNEL; dev->flags = IFF_NOARP; dev->addr_len = 4; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; netif_keep_dst(dev); dev->features |= IPIP_FEATURES; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 3942bd2ade78..08beab638bda 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1471,7 +1471,7 @@ static void ip6gre_tnl_init_features(struct net_device *dev) { struct ip6_tnl *nt = netdev_priv(dev); - dev->features |= GRE6_FEATURES | NETIF_F_LLTX; + dev->features |= GRE6_FEATURES; dev->hw_features |= GRE6_FEATURES; /* TCP offload with GRE SEQ is not supported, nor can we support 2 @@ -1485,6 +1485,8 @@ static void ip6gre_tnl_init_features(struct net_device *dev) dev->features |= NETIF_F_GSO_SOFTWARE; dev->hw_features |= NETIF_F_GSO_SOFTWARE; + + dev->lltx = true; } static int ip6gre_tunnel_init_common(struct net_device *dev) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 87dfb565a9f8..cbef0fcece71 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1849,7 +1849,7 @@ static void ip6_tnl_dev_setup(struct net_device *dev) dev->type = ARPHRD_TUNNEL6; dev->flags |= IFF_NOARP; dev->addr_len = sizeof(struct in6_addr); - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; netif_keep_dst(dev); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 3b2eed7fc765..58306522fb1e 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1436,7 +1436,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->flags = IFF_NOARP; netif_keep_dst(dev); dev->addr_len = 4; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->features |= SIT_FEATURES; dev->hw_features |= SIT_FEATURES; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 15a862f33f76..d692b902e120 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -97,7 +97,7 @@ static void l2tp_eth_dev_setup(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &l2tpeth_type); ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->netdev_ops = &l2tp_eth_netdev_ops; dev->needs_free_netdev = true; dev->pcpu_stat_type = NETDEV_PCPU_STAT_TSTATS; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 4b33133cbdff..3a369a31c5cc 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -102,19 +102,20 @@ static void do_setup(struct net_device *netdev) netdev->priv_flags &= ~IFF_TX_SKB_SHARING; netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH | IFF_NO_QUEUE; + netdev->lltx = true; netdev->needs_free_netdev = true; netdev->priv_destructor = NULL; netdev->ethtool_ops = &internal_dev_ethtool_ops; netdev->rtnl_link_ops = &internal_dev_link_ops; - netdev->features = NETIF_F_LLTX | NETIF_F_SG | NETIF_F_FRAGLIST | - NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | - NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ENCAP_ALL; + netdev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | + NETIF_F_HW_CSUM | NETIF_F_GSO_SOFTWARE | + NETIF_F_GSO_ENCAP_ALL; netdev->vlan_features = netdev->features; netdev->hw_enc_features = netdev->features; netdev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; - netdev->hw_features = netdev->features & ~NETIF_F_LLTX; + netdev->hw_features = netdev->features; eth_hw_addr_random(netdev); } diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index e50e4bf993fa..98f1e2b67c76 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -769,7 +769,7 @@ static int xfrmi_dev_init(struct net_device *dev) if (err) return err; - dev->features |= NETIF_F_LLTX; + dev->lltx = true; dev->features |= XFRMI_FEATURES; dev->hw_features |= XFRMI_FEATURES; -- cgit v1.2.3 From e27ad6560e4b5993315b56d6884ca5a4652468f4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 21 Aug 2024 18:39:09 +0100 Subject: printf: remove %pGt support Patch series "Increase the number of bits available in page_type". Kent wants more than 16 bits in page_type, so I resurrected this old patch and expanded it a bit. It's a bit more efficient than our current scheme (1 4-byte insn vs 3 insns of 13 bytes total) to test a single page type. This patch (of 4): An upcoming patch will convert page type from being a bitfield to a single byte, so we will not be able to use %pG to print the page type any more. The printing of the symbolic name will be restored in that patch. Link: https://lkml.kernel.org/r/20240821173914.2270383-1-willy@infradead.org Link: https://lkml.kernel.org/r/20240821173914.2270383-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com> Cc: Kent Overstreet Signed-off-by: Andrew Morton --- Documentation/core-api/printk-formats.rst | 4 +--- include/trace/events/mmflags.h | 10 ---------- lib/test_printf.c | 26 -------------------------- lib/vsprintf.c | 21 --------------------- mm/debug.c | 2 +- mm/internal.h | 1 - 6 files changed, 2 insertions(+), 62 deletions(-) (limited to 'lib') diff --git a/Documentation/core-api/printk-formats.rst b/Documentation/core-api/printk-formats.rst index 4451ef501936..14e093da3ccd 100644 --- a/Documentation/core-api/printk-formats.rst +++ b/Documentation/core-api/printk-formats.rst @@ -576,13 +576,12 @@ The field width is passed by value, the bitmap is passed by reference. Helper macros cpumask_pr_args() and nodemask_pr_args() are available to ease printing cpumask and nodemask. -Flags bitfields such as page flags, page_type, gfp_flags +Flags bitfields such as page flags and gfp_flags -------------------------------------------------------- :: %pGp 0x17ffffc0002036(referenced|uptodate|lru|active|private|node=0|zone=2|lastcpupid=0x1fffff) - %pGt 0xffffff7f(buddy) %pGg GFP_USER|GFP_DMA32|GFP_NOWARN %pGv read|exec|mayread|maywrite|mayexec|denywrite @@ -591,7 +590,6 @@ would construct the value. The type of flags is given by the third character. Currently supported are: - p - [p]age flags, expects value of type (``unsigned long *``) - - t - page [t]ype, expects value of type (``unsigned int *``) - v - [v]ma_flags, expects value of type (``unsigned long *``) - g - [g]fp_flags, expects value of type (``gfp_t *``) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index b5c4da370a50..c151cc21d367 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -130,16 +130,6 @@ IF_HAVE_PG_ARCH_X(arch_3) __def_pageflag_names \ ) : "none" -#define DEF_PAGETYPE_NAME(_name) { PG_##_name, __stringify(_name) } - -#define __def_pagetype_names \ - DEF_PAGETYPE_NAME(slab), \ - DEF_PAGETYPE_NAME(hugetlb), \ - DEF_PAGETYPE_NAME(offline), \ - DEF_PAGETYPE_NAME(guard), \ - DEF_PAGETYPE_NAME(table), \ - DEF_PAGETYPE_NAME(buddy) - #if defined(CONFIG_X86) #define __VM_ARCH_SPECIFIC_1 {VM_PAT, "pat" } #elif defined(CONFIG_PPC) diff --git a/lib/test_printf.c b/lib/test_printf.c index 965cb6f28527..8448b6d02bd9 100644 --- a/lib/test_printf.c +++ b/lib/test_printf.c @@ -641,26 +641,12 @@ page_flags_test(int section, int node, int zone, int last_cpupid, test(cmp_buf, "%pGp", &flags); } -static void __init page_type_test(unsigned int page_type, const char *name, - char *cmp_buf) -{ - unsigned long size; - - size = scnprintf(cmp_buf, BUF_SIZE, "%#x(", page_type); - if (page_type_has_type(page_type)) - size += scnprintf(cmp_buf + size, BUF_SIZE - size, "%s", name); - - snprintf(cmp_buf + size, BUF_SIZE - size, ")"); - test(cmp_buf, "%pGt", &page_type); -} - static void __init flags(void) { unsigned long flags; char *cmp_buffer; gfp_t gfp; - unsigned int page_type; cmp_buffer = kmalloc(BUF_SIZE, GFP_KERNEL); if (!cmp_buffer) @@ -700,18 +686,6 @@ flags(void) gfp |= __GFP_HIGH; test(cmp_buffer, "%pGg", &gfp); - page_type = ~0; - page_type_test(page_type, "", cmp_buffer); - - page_type = 10; - page_type_test(page_type, "", cmp_buffer); - - page_type = ~PG_buddy; - page_type_test(page_type, "buddy", cmp_buffer); - - page_type = ~(PG_table | PG_buddy); - page_type_test(page_type, "table|buddy", cmp_buffer); - kfree(cmp_buffer); } diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 2d71b1115916..09f022ba1c05 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -2054,25 +2054,6 @@ char *format_page_flags(char *buf, char *end, unsigned long flags) return buf; } -static -char *format_page_type(char *buf, char *end, unsigned int page_type) -{ - buf = number(buf, end, page_type, default_flag_spec); - - if (buf < end) - *buf = '('; - buf++; - - if (page_type_has_type(page_type)) - buf = format_flags(buf, end, ~page_type, pagetype_names); - - if (buf < end) - *buf = ')'; - buf++; - - return buf; -} - static noinline_for_stack char *flags_string(char *buf, char *end, void *flags_ptr, struct printf_spec spec, const char *fmt) @@ -2086,8 +2067,6 @@ char *flags_string(char *buf, char *end, void *flags_ptr, switch (fmt[1]) { case 'p': return format_page_flags(buf, end, *(unsigned long *)flags_ptr); - case 't': - return format_page_type(buf, end, *(unsigned int *)flags_ptr); case 'v': flags = *(unsigned long *)flags_ptr; names = vmaflag_names; diff --git a/mm/debug.c b/mm/debug.c index 69e524c3e601..9f8e34537957 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -92,7 +92,7 @@ static void __dump_folio(struct folio *folio, struct page *page, pr_warn("%sflags: %pGp%s\n", type, &folio->flags, is_migrate_cma_folio(folio, pfn) ? " CMA" : ""); if (page_has_type(&folio->page)) - pr_warn("page_type: %pGt\n", &folio->page.page_type); + pr_warn("page_type: %x\n", folio->page.page_type); print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32, sizeof(unsigned long), page, diff --git a/mm/internal.h b/mm/internal.h index 82fb9f1b0cd4..5e6f2abcea28 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1144,7 +1144,6 @@ static inline void flush_tlb_batched_pending(struct mm_struct *mm) #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ extern const struct trace_print_flags pageflag_names[]; -extern const struct trace_print_flags pagetype_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; -- cgit v1.2.3 From bd7c8ff9fef4b21a97f9b30a7364845ee6eaaf23 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Wed, 4 Sep 2024 15:04:53 +0200 Subject: treewide: Fix wrong singular form of jiffies in comments There are several comments all over the place, which uses a wrong singular form of jiffies. Replace 'jiffie' by 'jiffy'. No functional change. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Thomas Gleixner Acked-by: Geert Uytterhoeven # m68k Link: https://lore.kernel.org/all/20240904-devel-anna-maria-b4-timers-flseep-v1-3-e98760256370@linutronix.de --- Documentation/admin-guide/media/vivid.rst | 2 +- Documentation/timers/timers-howto.rst | 2 +- .../translations/sp_SP/scheduler/sched-design-CFS.rst | 2 +- arch/arm/mach-versatile/spc.c | 2 +- arch/m68k/q40/q40ints.c | 2 +- arch/x86/kernel/cpu/mce/dev-mcelog.c | 2 +- drivers/char/ipmi/ipmi_ssif.c | 2 +- drivers/dma-buf/st-dma-fence.c | 2 +- drivers/gpu/drm/i915/gem/i915_gem_wait.c | 2 +- drivers/gpu/drm/i915/gt/selftest_execlists.c | 4 ++-- drivers/gpu/drm/i915/i915_utils.c | 2 +- drivers/gpu/drm/v3d/v3d_bo.c | 2 +- drivers/isdn/mISDN/dsp_cmx.c | 2 +- drivers/net/ethernet/marvell/mvmdio.c | 2 +- fs/xfs/xfs_buf.h | 2 +- include/linux/jiffies.h | 2 +- include/linux/timekeeper_internal.h | 2 +- kernel/time/alarmtimer.c | 2 +- kernel/time/clockevents.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/posix-timers.c | 4 ++-- kernel/time/timer.c | 12 ++++++------ lib/Kconfig.debug | 2 +- net/batman-adv/types.h | 2 +- 24 files changed, 31 insertions(+), 31 deletions(-) (limited to 'lib') diff --git a/Documentation/admin-guide/media/vivid.rst b/Documentation/admin-guide/media/vivid.rst index 1306f19ecb5a..c9d301ab46a3 100644 --- a/Documentation/admin-guide/media/vivid.rst +++ b/Documentation/admin-guide/media/vivid.rst @@ -328,7 +328,7 @@ and an HDMI input, one input for each input type. Those are described in more detail below. Special attention has been given to the rate at which new frames become -available. The jitter will be around 1 jiffie (that depends on the HZ +available. The jitter will be around 1 jiffy (that depends on the HZ configuration of your kernel, so usually 1/100, 1/250 or 1/1000 of a second), but the long-term behavior is exactly following the framerate. So a framerate of 59.94 Hz is really different from 60 Hz. If the framerate diff --git a/Documentation/timers/timers-howto.rst b/Documentation/timers/timers-howto.rst index 5c169e3d29a8..ef7a4652ccc9 100644 --- a/Documentation/timers/timers-howto.rst +++ b/Documentation/timers/timers-howto.rst @@ -19,7 +19,7 @@ it really need to delay in atomic context?" If so... ATOMIC CONTEXT: You must use the `*delay` family of functions. These - functions use the jiffie estimation of clock speed + functions use the jiffy estimation of clock speed and will busy wait for enough loop cycles to achieve the desired delay: diff --git a/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst b/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst index 90a153cad4e8..731c266beb1a 100644 --- a/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst +++ b/Documentation/translations/sp_SP/scheduler/sched-design-CFS.rst @@ -109,7 +109,7 @@ para que se ejecute, y la tarea en ejecución es interrumpida. ================================== CFS usa una granularidad de nanosegundos y no depende de ningún -jiffie o detalles como HZ. De este modo, el gestor de tareas CFS no tiene +jiffy o detalles como HZ. De este modo, el gestor de tareas CFS no tiene noción de "ventanas de tiempo" de la forma en que tenía el gestor de tareas previo, y tampoco tiene heurísticos. Únicamente hay un parámetro central ajustable (se ha de cambiar en CONFIG_SCHED_DEBUG): diff --git a/arch/arm/mach-versatile/spc.c b/arch/arm/mach-versatile/spc.c index 5e44170e1a9a..790092734cf6 100644 --- a/arch/arm/mach-versatile/spc.c +++ b/arch/arm/mach-versatile/spc.c @@ -73,7 +73,7 @@ /* * Even though the SPC takes max 3-5 ms to complete any OPP/COMMS - * operation, the operation could start just before jiffie is about + * operation, the operation could start just before jiffy is about * to be incremented. So setting timeout value of 20ms = 2jiffies@100Hz */ #define TIMEOUT_US 20000 diff --git a/arch/m68k/q40/q40ints.c b/arch/m68k/q40/q40ints.c index 10f1f294e91f..14b774b9d308 100644 --- a/arch/m68k/q40/q40ints.c +++ b/arch/m68k/q40/q40ints.c @@ -106,7 +106,7 @@ void __init q40_init_IRQ(void) * this stuff doesn't really belong here.. */ -int ql_ticks; /* 200Hz ticks since last jiffie */ +int ql_ticks; /* 200Hz ticks since last jiffy */ static int sound_ticks; #define SVOL 45 diff --git a/arch/x86/kernel/cpu/mce/dev-mcelog.c b/arch/x86/kernel/cpu/mce/dev-mcelog.c index a05ac0716ecf..a3aa0199222e 100644 --- a/arch/x86/kernel/cpu/mce/dev-mcelog.c +++ b/arch/x86/kernel/cpu/mce/dev-mcelog.c @@ -314,7 +314,7 @@ static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, /* * Need to give user space some time to set everything up, - * so do it a jiffie or two later everywhere. + * so do it a jiffy or two later everywhere. */ schedule_timeout(2); diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c index 96ad571d041a..e093028391af 100644 --- a/drivers/char/ipmi/ipmi_ssif.c +++ b/drivers/char/ipmi/ipmi_ssif.c @@ -980,7 +980,7 @@ static void msg_written_handler(struct ssif_info *ssif_info, int result, ipmi_ssif_unlock_cond(ssif_info, flags); start_get(ssif_info); } else { - /* Wait a jiffie then request the next message */ + /* Wait a jiffy then request the next message */ ssif_info->waiting_alert = true; ssif_info->retries_left = SSIF_RECV_RETRIES; if (!ssif_info->stopping) diff --git a/drivers/dma-buf/st-dma-fence.c b/drivers/dma-buf/st-dma-fence.c index 6a1bfcd0cc21..cf2ce3744ce6 100644 --- a/drivers/dma-buf/st-dma-fence.c +++ b/drivers/dma-buf/st-dma-fence.c @@ -402,7 +402,7 @@ static int test_wait_timeout(void *arg) if (dma_fence_wait_timeout(wt.f, false, 2) == -ETIME) { if (timer_pending(&wt.timer)) { - pr_notice("Timer did not fire within the jiffie!\n"); + pr_notice("Timer did not fire within the jiffy!\n"); err = 0; /* not our fault! */ } else { pr_err("Wait reported incomplete after timeout\n"); diff --git a/drivers/gpu/drm/i915/gem/i915_gem_wait.c b/drivers/gpu/drm/i915/gem/i915_gem_wait.c index d4b918fb11ce..1f55e62044a4 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_wait.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_wait.c @@ -266,7 +266,7 @@ i915_gem_wait_ioctl(struct drm_device *dev, void *data, struct drm_file *file) if (ret == -ETIME && !nsecs_to_jiffies(args->timeout_ns)) args->timeout_ns = 0; - /* Asked to wait beyond the jiffie/scheduler precision? */ + /* Asked to wait beyond the jiffy/scheduler precision? */ if (ret == -ETIME && args->timeout_ns) ret = -EAGAIN; } diff --git a/drivers/gpu/drm/i915/gt/selftest_execlists.c b/drivers/gpu/drm/i915/gt/selftest_execlists.c index 4202df5b8c12..222ca7c44951 100644 --- a/drivers/gpu/drm/i915/gt/selftest_execlists.c +++ b/drivers/gpu/drm/i915/gt/selftest_execlists.c @@ -93,7 +93,7 @@ static int wait_for_reset(struct intel_engine_cs *engine, return -EINVAL; } - /* Give the request a jiffie to complete after flushing the worker */ + /* Give the request a jiffy to complete after flushing the worker */ if (i915_request_wait(rq, 0, max(0l, (long)(timeout - jiffies)) + 1) < 0) { pr_err("%s: hanging request %llx:%lld did not complete\n", @@ -3426,7 +3426,7 @@ static int live_preempt_timeout(void *arg) cpu_relax(); saved_timeout = engine->props.preempt_timeout_ms; - engine->props.preempt_timeout_ms = 1; /* in ms, -> 1 jiffie */ + engine->props.preempt_timeout_ms = 1; /* in ms, -> 1 jiffy */ i915_request_get(rq); i915_request_add(rq); diff --git a/drivers/gpu/drm/i915/i915_utils.c b/drivers/gpu/drm/i915/i915_utils.c index 6f9e7b354b54..f2ba51c20e97 100644 --- a/drivers/gpu/drm/i915/i915_utils.c +++ b/drivers/gpu/drm/i915/i915_utils.c @@ -110,7 +110,7 @@ void set_timer_ms(struct timer_list *t, unsigned long timeout) * Paranoia to make sure the compiler computes the timeout before * loading 'jiffies' as jiffies is volatile and may be updated in * the background by a timer tick. All to reduce the complexity - * of the addition and reduce the risk of losing a jiffie. + * of the addition and reduce the risk of losing a jiffy. */ barrier(); diff --git a/drivers/gpu/drm/v3d/v3d_bo.c b/drivers/gpu/drm/v3d/v3d_bo.c index a165cbcdd27b..9eafe53a8f41 100644 --- a/drivers/gpu/drm/v3d/v3d_bo.c +++ b/drivers/gpu/drm/v3d/v3d_bo.c @@ -279,7 +279,7 @@ v3d_wait_bo_ioctl(struct drm_device *dev, void *data, else args->timeout_ns = 0; - /* Asked to wait beyond the jiffie/scheduler precision? */ + /* Asked to wait beyond the jiffy/scheduler precision? */ if (ret == -ETIME && args->timeout_ns) ret = -EAGAIN; diff --git a/drivers/isdn/mISDN/dsp_cmx.c b/drivers/isdn/mISDN/dsp_cmx.c index 61cb45c5d0d8..53fad9487574 100644 --- a/drivers/isdn/mISDN/dsp_cmx.c +++ b/drivers/isdn/mISDN/dsp_cmx.c @@ -82,7 +82,7 @@ * - has multiple clocks. * - has no usable clock due to jitter or packet loss (VoIP). * In this case the system's clock is used. The clock resolution depends on - * the jiffie resolution. + * the jiffy resolution. * * If a member joins a conference: * diff --git a/drivers/net/ethernet/marvell/mvmdio.c b/drivers/net/ethernet/marvell/mvmdio.c index 9190eff6c0bb..e1d003fdbc2e 100644 --- a/drivers/net/ethernet/marvell/mvmdio.c +++ b/drivers/net/ethernet/marvell/mvmdio.c @@ -104,7 +104,7 @@ static int orion_mdio_wait_ready(const struct orion_mdio_ops *ops, return 0; } else { /* wait_event_timeout does not guarantee a delay of at - * least one whole jiffie, so timeout must be no less + * least one whole jiffy, so timeout must be no less * than two. */ timeout = max(usecs_to_jiffies(MVMDIO_SMI_TIMEOUT), 2); diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index b1580644501f..209a389f2abc 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -210,7 +210,7 @@ struct xfs_buf { * success the write is considered to be failed permanently and the * iodone handler will take appropriate action. * - * For retry timeouts, we record the jiffie of the first failure. This + * For retry timeouts, we record the jiffy of the first failure. This * means that we can change the retry timeout for buffers already under * I/O and thus avoid getting stuck in a retry loop with a long timeout. * diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index d9f1435a5a13..1220f0fbe5bf 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -418,7 +418,7 @@ extern unsigned long preset_lpj; #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) /* - * The maximum jiffie value is (MAX_INT >> 1). Here we translate that + * The maximum jiffy value is (MAX_INT >> 1). Here we translate that * into seconds. The 64-bit case will overflow if we are not careful, * so use the messy SH_DIV macro to do it. Still all constants. */ diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 84ff2844df2a..902c20ef495a 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -73,7 +73,7 @@ struct tk_read_base { * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) * * Note: For timespec(64) based interfaces wall_to_monotonic is what - * we need to add to xtime (or xtime corrected for sub jiffie times) + * we need to add to xtime (or xtime corrected for sub jiffy times) * to get to monotonic time. Monotonic is pegged at zero at system * boot time, so wall_to_monotonic will be negative, however, we will * ALWAYS keep the tv_nsec part positive so we can use the usual diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 76bd4fda3472..8bf888641694 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -493,7 +493,7 @@ static u64 __alarm_forward_now(struct alarm *alarm, ktime_t interval, bool throt * promised in the context of posix_timer_fn() never * materialized, but someone should really work on it. * - * To prevent DOS fake @now to be 1 jiffie out which keeps + * To prevent DOS fake @now to be 1 jiffy out which keeps * the overrun accounting correct but creates an * inconsistency vs. timer_gettime(2). */ diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 60a6484831b1..78c7bd64d0dd 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -190,7 +190,7 @@ int clockevents_tick_resume(struct clock_event_device *dev) #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST -/* Limit min_delta to a jiffie */ +/* Limit min_delta to a jiffy */ #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) /** diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index a023946f8558..e834b2bd83df 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1177,7 +1177,7 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution - * (i.e. one jiffie) to prevent short timeouts. + * (i.e. one jiffy) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 1cc830ef93a7..4576aaed13b2 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -339,14 +339,14 @@ static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) * change to the signal handling code. * * For now let timers with an interval less than a - * jiffie expire every jiffie and recheck for a + * jiffy expire every jiffy and recheck for a * valid signal handler. * * This avoids interrupt starvation in case of a * very small interval, which would expire the * timer immediately again. * - * Moving now ahead of time by one jiffie tricks + * Moving now ahead of time by one jiffy tricks * hrtimer_forward() to expire the timer later, * while it still maintains the overrun accuracy * for the price of a slight inconsistency in the diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 5e021a2d8d61..2b38f3035a3e 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -365,7 +365,7 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu, rem = j % HZ; /* - * If the target jiffie is just after a whole second (which can happen + * If the target jiffy is just after a whole second (which can happen * due to delays of the timer irq, long irq off times etc etc) then * we should round down to the whole second, not up. Use 1/4th second * as cutoff for this rounding as an extreme upper bound for this. @@ -1930,7 +1930,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) * bits are zero, we look at the next level as is. If not we * need to advance it by one because that's going to be the * next expiring bucket in that level. base->clk is the next - * expiring jiffie. So in case of: + * expiring jiffy. So in case of: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 0 @@ -1995,7 +1995,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) return basem; /* - * Round up to the next jiffie. High resolution timers are + * Round up to the next jiffy. High resolution timers are * off, so the hrtimers are expired in the tick and we need to * make sure that this tick really expires the timer to avoid * a ping pong of the nohz stop code. @@ -2254,7 +2254,7 @@ static inline u64 __get_next_timer_interrupt(unsigned long basej, u64 basem, base_global, &tevt); /* - * If the next event is only one jiffie ahead there is no need to call + * If the next event is only one jiffy ahead there is no need to call * timer migration hierarchy related functions. The value for the next * global timer in @tevt struct equals then KTIME_MAX. This is also * true, when the timer base is idle. @@ -2486,11 +2486,11 @@ static void run_local_timers(void) * updated. When this update is missed, this isn't a * problem, as an IPI is executed nevertheless when the CPU * was idle before. When the CPU wasn't idle but the update - * is missed, then the timer would expire one jiffie late - + * is missed, then the timer would expire one jiffy late - * bad luck. * * Those unlikely corner cases where the worst outcome is only a - * one jiffie delay or a superfluous raise of the softirq are + * one jiffy delay or a superfluous raise of the softirq are * not that expensive as doing the check always while holding * the lock. * diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a30c03a66172..a40aa606cd04 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -97,7 +97,7 @@ config BOOT_PRINTK_DELAY using "boot_delay=N". It is likely that you would also need to use "lpj=M" to preset - the "loops per jiffie" value. + the "loops per jiffy" value. See a previous boot log for the "lpj" value to use for your system, and then set "lpj=M" before setting "boot_delay=N". NOTE: Using this option may adversely affect SMP systems. diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 00840d5784fe..04f6398b3a40 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -287,7 +287,7 @@ struct batadv_frag_table_entry { /** @lock: lock to protect the list of fragments */ spinlock_t lock; - /** @timestamp: time (jiffie) of last received fragment */ + /** @timestamp: time (jiffy) of last received fragment */ unsigned long timestamp; /** @seqno: sequence number of the fragments in the list */ -- cgit v1.2.3 From f6594633817374a64a5fb31007bd810cad1425ff Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 17 Jun 2024 19:00:33 -0400 Subject: lib/generic-radix-tree.c: genradix_ptr_inlined() Provide an inlined fast path Signed-off-by: Kent Overstreet --- include/linux/generic-radix-tree.h | 75 ++++++++++++++++++++++++++++++++++++++ lib/generic-radix-tree.c | 64 +------------------------------- 2 files changed, 76 insertions(+), 63 deletions(-) (limited to 'lib') diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index f3512fddf3d7..8a3e1e886d1c 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -48,6 +48,49 @@ struct genradix_root; #define GENRADIX_NODE_SHIFT 9 #define GENRADIX_NODE_SIZE (1U << GENRADIX_NODE_SHIFT) +#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *)) +#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) + +/* depth that's needed for a genradix that can address up to ULONG_MAX: */ +#define GENRADIX_MAX_DEPTH \ + DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT) + +#define GENRADIX_DEPTH_MASK \ + ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) + +static inline int genradix_depth_shift(unsigned depth) +{ + return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth; +} + +/* + * Returns size (of data, in bytes) that a tree of a given depth holds: + */ +static inline size_t genradix_depth_size(unsigned depth) +{ + return 1UL << genradix_depth_shift(depth); +} + +static inline unsigned genradix_root_to_depth(struct genradix_root *r) +{ + return (unsigned long) r & GENRADIX_DEPTH_MASK; +} + +static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) +{ + return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); +} + +struct genradix_node { + union { + /* Interior node: */ + struct genradix_node *children[GENRADIX_ARY]; + + /* Leaf: */ + u8 data[GENRADIX_NODE_SIZE]; + }; +}; + struct __genradix { struct genradix_root *root; }; @@ -128,6 +171,30 @@ static inline size_t __idx_to_offset(size_t idx, size_t obj_size) #define __genradix_idx_to_offset(_radix, _idx) \ __idx_to_offset(_idx, __genradix_obj_size(_radix)) +static inline void *__genradix_ptr_inlined(struct __genradix *radix, size_t offset) +{ + struct genradix_root *r = READ_ONCE(radix->root); + struct genradix_node *n = genradix_root_to_node(r); + unsigned level = genradix_root_to_depth(r); + unsigned shift = genradix_depth_shift(level); + + if (unlikely(ilog2(offset) >= genradix_depth_shift(level))) + return NULL; + + while (n && shift > GENRADIX_NODE_SHIFT) { + shift -= GENRADIX_ARY_SHIFT; + n = n->children[offset >> shift]; + offset &= (1UL << shift) - 1; + } + + return n ? &n->data[offset] : NULL; +} + +#define genradix_ptr_inlined(_radix, _idx) \ + (__genradix_cast(_radix) \ + __genradix_ptr_inlined(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx))) + void *__genradix_ptr(struct __genradix *, size_t); /** @@ -144,6 +211,14 @@ void *__genradix_ptr(struct __genradix *, size_t); void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); +#define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \ + (__genradix_cast(_radix) \ + (__genradix_ptr_inlined(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx)) ?: \ + __genradix_ptr_alloc(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx), \ + _gfp))) + /** * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it * if necessary diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index fa692c86f069..4efae0663049 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -5,75 +5,13 @@ #include #include -#define GENRADIX_ARY (GENRADIX_NODE_SIZE / sizeof(struct genradix_node *)) -#define GENRADIX_ARY_SHIFT ilog2(GENRADIX_ARY) - -struct genradix_node { - union { - /* Interior node: */ - struct genradix_node *children[GENRADIX_ARY]; - - /* Leaf: */ - u8 data[GENRADIX_NODE_SIZE]; - }; -}; - -static inline int genradix_depth_shift(unsigned depth) -{ - return GENRADIX_NODE_SHIFT + GENRADIX_ARY_SHIFT * depth; -} - -/* - * Returns size (of data, in bytes) that a tree of a given depth holds: - */ -static inline size_t genradix_depth_size(unsigned depth) -{ - return 1UL << genradix_depth_shift(depth); -} - -/* depth that's needed for a genradix that can address up to ULONG_MAX: */ -#define GENRADIX_MAX_DEPTH \ - DIV_ROUND_UP(BITS_PER_LONG - GENRADIX_NODE_SHIFT, GENRADIX_ARY_SHIFT) - -#define GENRADIX_DEPTH_MASK \ - ((unsigned long) (roundup_pow_of_two(GENRADIX_MAX_DEPTH + 1) - 1)) - -static inline unsigned genradix_root_to_depth(struct genradix_root *r) -{ - return (unsigned long) r & GENRADIX_DEPTH_MASK; -} - -static inline struct genradix_node *genradix_root_to_node(struct genradix_root *r) -{ - return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); -} - /* * Returns pointer to the specified byte @offset within @radix, or NULL if not * allocated */ void *__genradix_ptr(struct __genradix *radix, size_t offset) { - struct genradix_root *r = READ_ONCE(radix->root); - struct genradix_node *n = genradix_root_to_node(r); - unsigned level = genradix_root_to_depth(r); - - if (ilog2(offset) >= genradix_depth_shift(level)) - return NULL; - - while (1) { - if (!n) - return NULL; - if (!level) - break; - - level--; - - n = n->children[offset >> genradix_depth_shift(level)]; - offset &= genradix_depth_size(level) - 1; - } - - return &n->data[offset]; + return __genradix_ptr_inlined(radix, offset); } EXPORT_SYMBOL(__genradix_ptr); -- cgit v1.2.3 From b3f9da79e7783ca4f1c1d4214af976c548629c1d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 10 Aug 2024 23:14:44 -0400 Subject: lib/generic-radix-tree.c: add preallocation Signed-off-by: Kent Overstreet --- include/linux/generic-radix-tree.h | 38 ++++++++++++++++++++++++++++++++------ lib/generic-radix-tree.c | 16 +++++----------- 2 files changed, 37 insertions(+), 17 deletions(-) (limited to 'lib') diff --git a/include/linux/generic-radix-tree.h b/include/linux/generic-radix-tree.h index 8a3e1e886d1c..5b51c3d582d6 100644 --- a/include/linux/generic-radix-tree.h +++ b/include/linux/generic-radix-tree.h @@ -41,6 +41,7 @@ #include #include #include +#include #include struct genradix_root; @@ -81,6 +82,10 @@ static inline struct genradix_node *genradix_root_to_node(struct genradix_root * return (void *) ((unsigned long) r & ~GENRADIX_DEPTH_MASK); } +struct __genradix { + struct genradix_root *root; +}; + struct genradix_node { union { /* Interior node: */ @@ -91,9 +96,15 @@ struct genradix_node { }; }; -struct __genradix { - struct genradix_root *root; -}; +static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) +{ + return kzalloc(GENRADIX_NODE_SIZE, gfp_mask); +} + +static inline void genradix_free_node(struct genradix_node *node) +{ + kfree(node); +} /* * NOTE: currently, sizeof(_type) must not be larger than GENRADIX_NODE_SIZE: @@ -209,7 +220,8 @@ void *__genradix_ptr(struct __genradix *, size_t); __genradix_ptr(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx))) -void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); +void *__genradix_ptr_alloc(struct __genradix *, size_t, + struct genradix_node **, gfp_t); #define genradix_ptr_alloc_inlined(_radix, _idx, _gfp) \ (__genradix_cast(_radix) \ @@ -217,7 +229,15 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); __genradix_idx_to_offset(_radix, _idx)) ?: \ __genradix_ptr_alloc(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx), \ - _gfp))) + NULL, _gfp))) + +#define genradix_ptr_alloc_preallocated_inlined(_radix, _idx, _new_node, _gfp)\ + (__genradix_cast(_radix) \ + (__genradix_ptr_inlined(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx)) ?: \ + __genradix_ptr_alloc(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx), \ + _new_node, _gfp))) /** * genradix_ptr_alloc - get a pointer to a genradix entry, allocating it @@ -232,7 +252,13 @@ void *__genradix_ptr_alloc(struct __genradix *, size_t, gfp_t); (__genradix_cast(_radix) \ __genradix_ptr_alloc(&(_radix)->tree, \ __genradix_idx_to_offset(_radix, _idx), \ - _gfp)) + NULL, _gfp)) + +#define genradix_ptr_alloc_preallocated(_radix, _idx, _new_node, _gfp)\ + (__genradix_cast(_radix) \ + __genradix_ptr_alloc(&(_radix)->tree, \ + __genradix_idx_to_offset(_radix, _idx), \ + _new_node, _gfp)) struct genradix_iter { size_t offset; diff --git a/lib/generic-radix-tree.c b/lib/generic-radix-tree.c index 4efae0663049..79e067b51488 100644 --- a/lib/generic-radix-tree.c +++ b/lib/generic-radix-tree.c @@ -15,27 +15,21 @@ void *__genradix_ptr(struct __genradix *radix, size_t offset) } EXPORT_SYMBOL(__genradix_ptr); -static inline struct genradix_node *genradix_alloc_node(gfp_t gfp_mask) -{ - return kzalloc(GENRADIX_NODE_SIZE, gfp_mask); -} - -static inline void genradix_free_node(struct genradix_node *node) -{ - kfree(node); -} - /* * Returns pointer to the specified byte @offset within @radix, allocating it if * necessary - newly allocated slots are always zeroed out: */ void *__genradix_ptr_alloc(struct __genradix *radix, size_t offset, + struct genradix_node **preallocated, gfp_t gfp_mask) { struct genradix_root *v = READ_ONCE(radix->root); struct genradix_node *n, *new_node = NULL; unsigned level; + if (preallocated) + swap(new_node, *preallocated); + /* Increase tree depth if necessary: */ while (1) { struct genradix_root *r = v, *new_root; @@ -219,7 +213,7 @@ int __genradix_prealloc(struct __genradix *radix, size_t size, size_t offset; for (offset = 0; offset < size; offset += GENRADIX_NODE_SIZE) - if (!__genradix_ptr_alloc(radix, offset, gfp_mask)) + if (!__genradix_ptr_alloc(radix, offset, NULL, gfp_mask)) return -ENOMEM; return 0; -- cgit v1.2.3 From e4757c710ba27a1d499cf5941fc3950e9e542731 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 4 Sep 2024 21:39:39 +0800 Subject: debugobjects: Fix the compilation attributes of some global variables 1. Both debug_objects_pool_min_level and debug_objects_pool_size are read-only after initialization, change attribute '__read_mostly' to '__ro_after_init', and remove '__data_racy'. 2. Many global variables are read in the debug_stats_show() function, but didn't mask KCSAN's detection. Add '__data_racy' for them. Suggested-by: Thomas Gleixner Signed-off-by: Zhen Lei Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240904133944.2124-2-thunder.leizhen@huawei.com --- lib/debugobjects.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 7cea91e193a8..7226fdb5e129 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -70,10 +70,10 @@ static HLIST_HEAD(obj_to_free); * made at debug_stats_show(). Both obj_pool_min_free and obj_pool_max_used * can be off. */ -static int obj_pool_min_free = ODEBUG_POOL_SIZE; -static int obj_pool_free = ODEBUG_POOL_SIZE; +static int __data_racy obj_pool_min_free = ODEBUG_POOL_SIZE; +static int __data_racy obj_pool_free = ODEBUG_POOL_SIZE; static int obj_pool_used; -static int obj_pool_max_used; +static int __data_racy obj_pool_max_used; static bool obj_freeing; /* The number of objs on the global free list */ static int obj_nr_tofree; @@ -84,9 +84,9 @@ static int __data_racy debug_objects_fixups __read_mostly; static int __data_racy debug_objects_warnings __read_mostly; static int __data_racy debug_objects_enabled __read_mostly = CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT; -static int __data_racy debug_objects_pool_size __read_mostly +static int debug_objects_pool_size __ro_after_init = ODEBUG_POOL_SIZE; -static int __data_racy debug_objects_pool_min_level __read_mostly +static int debug_objects_pool_min_level __ro_after_init = ODEBUG_POOL_MIN_LEVEL; static const struct debug_obj_descr *descr_test __read_mostly; @@ -95,8 +95,8 @@ static struct kmem_cache *obj_cache __ro_after_init; /* * Track numbers of kmem_cache_alloc()/free() calls done. */ -static int debug_objects_allocated; -static int debug_objects_freed; +static int __data_racy debug_objects_allocated; +static int __data_racy debug_objects_freed; static void free_obj_work(struct work_struct *work); static DECLARE_DELAYED_WORK(debug_obj_work, free_obj_work); -- cgit v1.2.3 From 684d28feb8546d1e9597aa363c3bfcf52fe250b7 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 4 Sep 2024 21:39:40 +0800 Subject: debugobjects: Fix conditions in fill_pool() fill_pool() uses 'obj_pool_min_free' to decide whether objects should be handed back to the kmem cache. But 'obj_pool_min_free' records the lowest historical value of the number of objects in the object pool and not the minimum number of objects which should be kept in the pool. Use 'debug_objects_pool_min_level' instead, which holds the minimum number which was scaled to the number of CPUs at boot time. [ tglx: Massage change log ] Fixes: d26bf5056fc0 ("debugobjects: Reduce number of pool_lock acquisitions in fill_pool()") Fixes: 36c4ead6f6df ("debugobjects: Add global free list and the counter") Signed-off-by: Zhen Lei Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20240904133944.2124-3-thunder.leizhen@huawei.com --- lib/debugobjects.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 7226fdb5e129..6329a86edcf1 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -142,13 +142,14 @@ static void fill_pool(void) * READ_ONCE()s pair with the WRITE_ONCE()s in pool_lock critical * sections. */ - while (READ_ONCE(obj_nr_tofree) && (READ_ONCE(obj_pool_free) < obj_pool_min_free)) { + while (READ_ONCE(obj_nr_tofree) && + READ_ONCE(obj_pool_free) < debug_objects_pool_min_level) { raw_spin_lock_irqsave(&pool_lock, flags); /* * Recheck with the lock held as the worker thread might have * won the race and freed the global free list already. */ - while (obj_nr_tofree && (obj_pool_free < obj_pool_min_free)) { + while (obj_nr_tofree && (obj_pool_free < debug_objects_pool_min_level)) { obj = hlist_entry(obj_to_free.first, typeof(*obj), node); hlist_del(&obj->node); WRITE_ONCE(obj_nr_tofree, obj_nr_tofree - 1); -- cgit v1.2.3 From 63a4a9b52c3c7f86351710739011717a36652b72 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 4 Sep 2024 21:39:41 +0800 Subject: debugobjects: Remove redundant checks in fill_pool() fill_pool() checks locklessly at the beginning whether the pool has to be refilled. After that it checks locklessly in a loop whether the free list contains objects and repeats the refill check. If both conditions are true, it acquires the pool lock and tries to move objects from the free list to the pool repeating the same checks again. There are two redundant issues with that: 1) The repeated check for the fill condition 2) The loop processing The repeated check is pointless as it was just established that fill is required. The condition has to be re-evaluated under the lock anyway. The loop processing is not required either because there is practically zero chance that a repeated attempt will succeed if the checks under the lock terminate the moving of objects. Remove the redundant check and replace the loop with a simple if condition. [ tglx: Massaged change log ] Signed-off-by: Zhen Lei Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20240904133944.2124-4-thunder.leizhen@huawei.com --- lib/debugobjects.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 6329a86edcf1..5ce473ad499b 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -135,15 +135,13 @@ static void fill_pool(void) return; /* - * Reuse objs from the global free list; they will be reinitialized - * when allocating. + * Reuse objs from the global obj_to_free list; they will be + * reinitialized when allocating. * - * Both obj_nr_tofree and obj_pool_free are checked locklessly; the - * READ_ONCE()s pair with the WRITE_ONCE()s in pool_lock critical - * sections. + * obj_nr_tofree is checked locklessly; the READ_ONCE() pairs with + * the WRITE_ONCE() in pool_lock critical sections. */ - while (READ_ONCE(obj_nr_tofree) && - READ_ONCE(obj_pool_free) < debug_objects_pool_min_level) { + if (READ_ONCE(obj_nr_tofree)) { raw_spin_lock_irqsave(&pool_lock, flags); /* * Recheck with the lock held as the worker thread might have -- cgit v1.2.3 From 81528310698726e8f79b9c3de01ebe02567119dd Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 26 Aug 2024 01:24:21 +0000 Subject: maple_tree: arange64 node is not a leaf node mt_dump_arange64() only applies to an entry whose type is maple_arange_64, in which mte_is_leaf() must return false. Since mte_is_leaf() here is always false, we can remove this condition check. Link: https://lkml.kernel.org/r/20240826012422.29935-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/maple_tree.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 755ba8b18e14..f7a971c462eb 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7203,7 +7203,6 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, enum mt_dump_format format) { struct maple_arange_64 *node = &mte_to_node(entry)->ma64; - bool leaf = mte_is_leaf(entry); unsigned long first = min; int i; @@ -7237,10 +7236,7 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, break; if (last == 0 && i > 0) break; - if (leaf) - mt_dump_entry(mt_slot(mt, node->slot, i), - first, last, depth + 1, format); - else if (node->slot[i]) + if (node->slot[i]) mt_dump_node(mt, mt_slot(mt, node->slot, i), first, last, depth + 1, format); -- cgit v1.2.3 From 21a449bedf3f01ea30ed657deeb2e8942ad45936 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Mon, 26 Aug 2024 01:24:22 +0000 Subject: maple_tree: dump error message based on format Just do what mt_dump_range64() does. Dump the error message based on format. Link: https://lkml.kernel.org/r/20240826012422.29935-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Cc: Liam R. Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/maple_tree.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index f7a971c462eb..8a3c2c344f62 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7243,9 +7243,15 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, if (last == max) break; if (last > max) { - pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + switch(format) { + case mt_dump_hex: + pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); - break; + break; + case mt_dump_dec: + pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + node, last, max, i); + } } first = last + 1; } -- cgit v1.2.3 From 96ae4c9019c5651ded92e97828e341529d5863bc Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 30 Aug 2024 22:04:00 +0000 Subject: maple_tree: cleanup function descriptions This patch tries to cleanup some function description: * function name mismatch * parameter name mismatch * parameter all end up with ':' * not prefix '*' if parameter is a pointer There is still some missing description of parameters, I didn't add them since I am not sure the exact meaning. Link: https://lkml.kernel.org/r/20240830220400.2007-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 105 +++++++++++++++++++++++++------------------------------ 1 file changed, 47 insertions(+), 58 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 8a3c2c344f62..2058452e5ff3 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -474,6 +474,7 @@ enum maple_type mas_parent_type(struct ma_state *mas, struct maple_enode *enode) /* * mas_set_parent() - Set the parent node and encode the slot + * @mas: The maple state * @enode: The encoded maple node. * @parent: The encoded maple node that is the parent of @enode. * @slot: The slot that @enode resides in @parent. @@ -534,7 +535,7 @@ unsigned int mte_parent_slot(const struct maple_enode *enode) /* * mte_parent() - Get the parent of @node. - * @node: The encoded maple node. + * @enode: The encoded maple node. * * Return: The parent maple node. */ @@ -641,8 +642,8 @@ static inline unsigned int mas_alloc_req(const struct ma_state *mas) /* * ma_pivots() - Get a pointer to the maple node pivots. - * @node - the maple node - * @type - the node type + * @node: the maple node + * @type: the node type * * In the event of a dead node, this array may be %NULL * @@ -665,8 +666,8 @@ static inline unsigned long *ma_pivots(struct maple_node *node, /* * ma_gaps() - Get a pointer to the maple node gaps. - * @node - the maple node - * @type - the node type + * @node: the maple node + * @type: the node type * * Return: A pointer to the maple node gaps */ @@ -880,8 +881,6 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt, * @mt: The maple tree * @mn: The maple node * @type: The maple node type - * @offset: The offset of the highest sub-gap in this node. - * @end: The end of the data in this node. */ static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn, enum maple_type type) @@ -939,7 +938,7 @@ static inline unsigned char ma_meta_gap(struct maple_node *mn) /* * ma_set_meta_gap() - Set the largest gap location in a nodes metadata * @mn: The maple node - * @mn: The maple node type + * @mt: The maple node type * @offset: The location of the largest gap. */ static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt, @@ -953,8 +952,8 @@ static inline void ma_set_meta_gap(struct maple_node *mn, enum maple_type mt, /* * mat_add() - Add a @dead_enode to the ma_topiary of a list of dead nodes. - * @mat - the ma_topiary, a linked list of dead nodes. - * @dead_enode - the node to be marked as dead and added to the tail of the list + * @mat: the ma_topiary, a linked list of dead nodes. + * @dead_enode: the node to be marked as dead and added to the tail of the list * * Add the @dead_enode to the linked list in @mat. */ @@ -977,8 +976,8 @@ static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt, bool free); /* * mas_mat_destroy() - Free all nodes and subtrees in a dead list. - * @mas - the maple state - * @mat - the ma_topiary linked list of dead nodes to free. + * @mas: the maple state + * @mat: the ma_topiary linked list of dead nodes to free. * * Destroy walk a dead list. */ @@ -999,7 +998,7 @@ static void mas_mat_destroy(struct ma_state *mas, struct ma_topiary *mat) } /* * mas_descend() - Descend into the slot stored in the ma_state. - * @mas - the maple state. + * @mas: the maple state. * * Note: Not RCU safe, only use in write side or debug code. */ @@ -1462,7 +1461,7 @@ static inline unsigned char mas_data_end(struct ma_state *mas) /* * mas_leaf_max_gap() - Returns the largest gap in a leaf node - * @mas - the maple state + * @mas: the maple state * * Return: The maximum gap in the leaf. */ @@ -1544,7 +1543,7 @@ static unsigned long mas_leaf_max_gap(struct ma_state *mas) * @node: The maple node * @gaps: The pointer to the gaps * @mt: The maple node type - * @*off: Pointer to store the offset location of the gap. + * @off: Pointer to store the offset location of the gap. * * Uses the metadata data end to scan backwards across set gaps. * @@ -1651,7 +1650,7 @@ ascend: /* * mas_update_gap() - Update a nodes gaps and propagate up if necessary. - * @mas - the maple state. + * @mas: the maple state. */ static inline void mas_update_gap(struct ma_state *mas) { @@ -1678,8 +1677,8 @@ static inline void mas_update_gap(struct ma_state *mas) /* * mas_adopt_children() - Set the parent pointer of all nodes in @parent to * @parent with the slot encoded. - * @mas - the maple state (for the tree) - * @parent - the maple encoded node containing the children. + * @mas: the maple state (for the tree) + * @parent: the maple encoded node containing the children. */ static inline void mas_adopt_children(struct ma_state *mas, struct maple_enode *parent) @@ -1701,8 +1700,8 @@ static inline void mas_adopt_children(struct ma_state *mas, /* * mas_put_in_tree() - Put a new node in the tree, smp_wmb(), and mark the old * node as dead. - * @mas - the maple state with the new node - * @old_enode - The old maple encoded node to replace. + * @mas: the maple state with the new node + * @old_enode: The old maple encoded node to replace. */ static inline void mas_put_in_tree(struct ma_state *mas, struct maple_enode *old_enode) @@ -1730,8 +1729,8 @@ static inline void mas_put_in_tree(struct ma_state *mas, * mas_replace_node() - Replace a node by putting it in the tree, marking it * dead, and freeing it. * the parent encoding to locate the maple node in the tree. - * @mas - the ma_state with @mas->node pointing to the new node. - * @old_enode - The old maple encoded node. + * @mas: the ma_state with @mas->node pointing to the new node. + * @old_enode: The old maple encoded node. */ static inline void mas_replace_node(struct ma_state *mas, struct maple_enode *old_enode) @@ -1796,7 +1795,6 @@ static inline void mab_shift_right(struct maple_big_node *b_node, /* * mab_middle_node() - Check if a middle node is needed (unlikely) * @b_node: the maple_big_node that contains the data. - * @size: the amount of data in the b_node * @split: the potential split location * @slot_count: the size that can be stored in a single node being considered. * @@ -1844,6 +1842,7 @@ static inline int mab_no_null_split(struct maple_big_node *b_node, /* * mab_calc_split() - Calculate the split location and if there needs to be two * splits. + * @mas: The maple state * @bn: The maple_big_node with the data * @mid_split: The second split, if required. 0 otherwise. * @@ -2177,7 +2176,8 @@ static inline bool mas_next_sibling(struct ma_state *mas) } /* - * mte_node_or_none() - Set the enode and state. + * mas_node_or_none() - Set the enode and state. + * @mas: the maple state * @enode: The encoded maple node. * * Set the node to the enode and the status. @@ -2228,7 +2228,6 @@ static inline void mas_wr_node_walk(struct ma_wr_state *wr_mas) /* * mast_rebalance_next() - Rebalance against the next node * @mast: The maple subtree state - * @old_r: The encoded maple node to the right (next node). */ static inline void mast_rebalance_next(struct maple_subtree_state *mast) { @@ -2242,7 +2241,6 @@ static inline void mast_rebalance_next(struct maple_subtree_state *mast) /* * mast_rebalance_prev() - Rebalance against the previous node * @mast: The maple subtree state - * @old_l: The encoded maple node to the left (previous node) */ static inline void mast_rebalance_prev(struct maple_subtree_state *mast) { @@ -2393,9 +2391,9 @@ static inline unsigned char mas_mab_to_node(struct ma_state *mas, /* * mab_set_b_end() - Add entry to b_node at b_node->b_end and increment the end * pointer. - * @b_node - the big node to add the entry - * @mas - the maple state to get the pivot (mas->max) - * @entry - the entry to add, if NULL nothing happens. + * @b_node: the big node to add the entry + * @mas: the maple state to get the pivot (mas->max) + * @entry: the entry to add, if NULL nothing happens. */ static inline void mab_set_b_end(struct maple_big_node *b_node, struct ma_state *mas, @@ -2414,11 +2412,11 @@ static inline void mab_set_b_end(struct maple_big_node *b_node, * mas_set_split_parent() - combine_then_separate helper function. Sets the parent * of @mas->node to either @left or @right, depending on @slot and @split * - * @mas - the maple state with the node that needs a parent - * @left - possible parent 1 - * @right - possible parent 2 - * @slot - the slot the mas->node was placed - * @split - the split location between @left and @right + * @mas: the maple state with the node that needs a parent + * @left: possible parent 1 + * @right: possible parent 2 + * @slot: the slot the mas->node was placed + * @split: the split location between @left and @right */ static inline void mas_set_split_parent(struct ma_state *mas, struct maple_enode *left, @@ -2438,11 +2436,11 @@ static inline void mas_set_split_parent(struct ma_state *mas, /* * mte_mid_split_check() - Check if the next node passes the mid-split - * @**l: Pointer to left encoded maple node. - * @**m: Pointer to middle encoded maple node. - * @**r: Pointer to right encoded maple node. + * @l: Pointer to left encoded maple node. + * @m: Pointer to middle encoded maple node. + * @r: Pointer to right encoded maple node. * @slot: The offset - * @*split: The split location. + * @split: The split location. * @mid_split: The middle split. */ static inline void mte_mid_split_check(struct maple_enode **l, @@ -2466,10 +2464,10 @@ static inline void mte_mid_split_check(struct maple_enode **l, /* * mast_set_split_parents() - Helper function to set three nodes parents. Slot * is taken from @mast->l. - * @mast - the maple subtree state - * @left - the left node - * @right - the right node - * @split - the split location. + * @mast: the maple subtree state + * @left: the left node + * @right: the right node + * @split: the split location. */ static inline void mast_set_split_parents(struct maple_subtree_state *mast, struct maple_enode *left, @@ -2503,7 +2501,6 @@ static inline void mast_set_split_parents(struct maple_subtree_state *mast, /* * mas_topiary_node() - Dispose of a single node * @mas: The maple state for pushing nodes - * @enode: The encoded maple node * @in_rcu: If the tree is in rcu mode * * The node will either be RCU freed or pushed back on the maple state. @@ -2635,7 +2632,7 @@ static inline void mas_topiary_replace(struct ma_state *mas, /* * mas_wmb_replace() - Write memory barrier and replace * @mas: The maple state - * @old: The old maple encoded node that is being replaced. + * @old_enode: The old maple encoded node that is being replaced. * * Updates gap as necessary. */ @@ -3455,10 +3452,7 @@ static inline void mas_store_root(struct ma_state *mas, void *entry) /* * mas_is_span_wr() - Check if the write needs to be treated as a write that * spans the node. - * @mas: The maple state - * @piv: The pivot value being written - * @type: The maple node type - * @entry: The data to write + * @wr_mas: The maple write state * * Spanning writes are writes that start in one node and end in another OR if * the write of a %NULL will cause the node to end with a %NULL. @@ -4052,10 +4046,7 @@ static void mas_wr_bnode(struct ma_wr_state *wr_mas) /* * mas_wr_store_entry() - Internal call to store a value - * @mas: The maple state - * @entry: The entry to store. - * - * Return: The contents that was stored at the index. + * @wr_mas: The maple write state */ static inline void mas_wr_store_entry(struct ma_wr_state *wr_mas) { @@ -4493,9 +4484,8 @@ no_entry: * mas_prev_slot() - Get the entry in the previous slot * * @mas: The maple state - * @max: The minimum starting range + * @min: The minimum starting range * @empty: Can be empty - * @set_underflow: Set the @mas->node to underflow state on limit. * * Return: The entry in the previous slot which is possibly NULL */ @@ -4578,6 +4568,7 @@ underflow: /* * mas_next_node() - Get the next node at the same level in the tree. * @mas: The maple state + * @node: The maple node * @max: The maximum pivot value to check. * * The next value will be mas->node[mas->offset] or the status will have @@ -4668,8 +4659,6 @@ overflow: * @mas: The maple state * @max: The maximum starting range * @empty: Can be empty - * @set_overflow: Should @mas->node be set to overflow when the limit is - * reached. * * Return: The entry in the next slot which is possibly NULL */ @@ -5203,9 +5192,9 @@ EXPORT_SYMBOL_GPL(mas_empty_area_rev); /* * mte_dead_leaves() - Mark all leaves of a node as dead. - * @mas: The maple state + * @enode: the encoded node + * @mt: the maple tree * @slots: Pointer to the slot array - * @type: The maple node type * * Must hold the write lock. * -- cgit v1.2.3 From 4fc4187984e5dbd7ad63c3acf0b228fa9bf298d3 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 2 Sep 2024 19:55:49 +0900 Subject: lib: zstd: export API needed for dictionary support Patch series "zram: introduce custom comp backends API", v7. This series introduces support for run-time compression algorithms tuning, so users, for instance, can adjust compression/acceleration levels and provide pre-trained compression/decompression dictionaries which certain algorithms support. At this point we stop supporting (old/deprecated) comp API. We may add new acomp API support in the future, but before that zram needs to undergo some major rework (we are not ready for async compression). Some benchmarks for reference (look at column #2) *** init zstd /sys/block/zram0/mm_stat 1750659072 504622188 514355200 0 514355200 1 0 34204 34204 *** init zstd dict=/home/ss/zstd-dict-amd64 /sys/block/zram0/mm_stat 1750650880 465908890 475398144 0 475398144 1 0 34185 34185 *** init zstd level=8 dict=/home/ss/zstd-dict-amd64 /sys/block/zram0/mm_stat 1750654976 430803319 439873536 0 439873536 1 0 34185 34185 *** init lz4 /sys/block/zram0/mm_stat 1750646784 664266564 677060608 0 677060608 1 0 34288 34288 *** init lz4 dict=/home/ss/lz4-dict-amd64 /sys/block/zram0/mm_stat 1750650880 619990300 632102912 0 632102912 1 0 34278 34278 *** init lz4hc /sys/block/zram0/mm_stat 1750630400 609023822 621232128 0 621232128 1 0 34288 34288 *** init lz4hc dict=/home/ss/lz4-dict-amd64 /sys/block/zram0/mm_stat 1750659072 505133172 515231744 0 515231744 1 0 34278 34278 Recompress init zram zstd (prio=0), zstd level=5 (prio 1), zstd with dict (prio 2) *** zstd /sys/block/zram0/mm_stat 1750982656 504630584 514269184 0 514269184 1 0 34204 34204 *** idle recompress priority=1 (zstd level=5) /sys/block/zram0/mm_stat 1750982656 488645601 525438976 0 514269184 1 0 34204 34204 *** idle recompress priority=2 (zstd dict) /sys/block/zram0/mm_stat 1750982656 460869640 517914624 0 514269184 1 0 34185 34204 This patch (of 24): We need to export a number of API functions that enable advanced zstd usage - C/D dictionaries, dictionaries sharing between contexts, etc. Link: https://lkml.kernel.org/r/20240902105656.1383858-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20240902105656.1383858-2-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Nick Terrell Cc: Minchan Kim Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton --- include/linux/zstd.h | 167 ++++++++++++++++++++++++++++++++++++++ lib/zstd/zstd_compress_module.c | 49 +++++++++++ lib/zstd/zstd_decompress_module.c | 36 ++++++++ 3 files changed, 252 insertions(+) (limited to 'lib') diff --git a/include/linux/zstd.h b/include/linux/zstd.h index 113408eef6ec..b2c7cf310c8f 100644 --- a/include/linux/zstd.h +++ b/include/linux/zstd.h @@ -77,6 +77,30 @@ int zstd_min_clevel(void); */ int zstd_max_clevel(void); +/** + * zstd_default_clevel() - default compression level + * + * Return: Default compression level. + */ +int zstd_default_clevel(void); + +/** + * struct zstd_custom_mem - custom memory allocation + */ +typedef ZSTD_customMem zstd_custom_mem; + +/** + * struct zstd_dict_load_method - Dictionary load method. + * See zstd_lib.h. + */ +typedef ZSTD_dictLoadMethod_e zstd_dict_load_method; + +/** + * struct zstd_dict_content_type - Dictionary context type. + * See zstd_lib.h. + */ +typedef ZSTD_dictContentType_e zstd_dict_content_type; + /* ====== Parameter Selection ====== */ /** @@ -136,6 +160,19 @@ typedef ZSTD_parameters zstd_parameters; zstd_parameters zstd_get_params(int level, unsigned long long estimated_src_size); + +/** + * zstd_get_cparams() - returns zstd_compression_parameters for selected level + * @level: The compression level + * @estimated_src_size: The estimated source size to compress or 0 + * if unknown. + * @dict_size: Dictionary size. + * + * Return: The selected zstd_compression_parameters. + */ +zstd_compression_parameters zstd_get_cparams(int level, + unsigned long long estimated_src_size, size_t dict_size); + /* ====== Single-pass Compression ====== */ typedef ZSTD_CCtx zstd_cctx; @@ -180,6 +217,71 @@ zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size); size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity, const void *src, size_t src_size, const zstd_parameters *parameters); +/** + * zstd_create_cctx_advanced() - Create compression context + * @custom_mem: Custom allocator. + * + * Return: NULL on error, pointer to compression context otherwise. + */ +zstd_cctx *zstd_create_cctx_advanced(zstd_custom_mem custom_mem); + +/** + * zstd_free_cctx() - Free compression context + * @cdict: Pointer to compression context. + * + * Return: Always 0. + */ +size_t zstd_free_cctx(zstd_cctx* cctx); + +/** + * struct zstd_cdict - Compression dictionary. + * See zstd_lib.h. + */ +typedef ZSTD_CDict zstd_cdict; + +/** + * zstd_create_cdict_byreference() - Create compression dictionary + * @dict: Pointer to dictionary buffer. + * @dict_size: Size of the dictionary buffer. + * @dict_load_method: Dictionary load method. + * @dict_content_type: Dictionary content type. + * @custom_mem: Memory allocator. + * + * Note, this uses @dict by reference (ZSTD_dlm_byRef), so it should be + * free before zstd_cdict is destroyed. + * + * Return: NULL on error, pointer to compression dictionary + * otherwise. + */ +zstd_cdict *zstd_create_cdict_byreference(const void *dict, size_t dict_size, + zstd_compression_parameters cparams, + zstd_custom_mem custom_mem); + +/** + * zstd_free_cdict() - Free compression dictionary + * @cdict: Pointer to compression dictionary. + * + * Return: Always 0. + */ +size_t zstd_free_cdict(zstd_cdict* cdict); + +/** + * zstd_compress_using_cdict() - compress src into dst using a dictionary + * @cctx: The context. Must have been initialized with zstd_init_cctx(). + * @dst: The buffer to compress src into. + * @dst_capacity: The size of the destination buffer. May be any size, but + * ZSTD_compressBound(srcSize) is guaranteed to be large enough. + * @src: The data to compress. + * @src_size: The size of the data to compress. + * @cdict: The dictionary to be used. + * + * Return: The compressed size or an error, which can be checked using + * zstd_is_error(). + */ +size_t zstd_compress_using_cdict(zstd_cctx *cctx, void *dst, + size_t dst_capacity, const void *src, size_t src_size, + const zstd_cdict *cdict); + /* ====== Single-pass Decompression ====== */ typedef ZSTD_DCtx zstd_dctx; @@ -220,6 +322,71 @@ zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size); size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity, const void *src, size_t src_size); +/** + * struct zstd_ddict - Decompression dictionary. + * See zstd_lib.h. + */ +typedef ZSTD_DDict zstd_ddict; + +/** + * zstd_create_ddict_byreference() - Create decompression dictionary + * @dict: Pointer to dictionary buffer. + * @dict_size: Size of the dictionary buffer. + * @dict_load_method: Dictionary load method. + * @dict_content_type: Dictionary content type. + * @custom_mem: Memory allocator. + * + * Note, this uses @dict by reference (ZSTD_dlm_byRef), so it should be + * free before zstd_ddict is destroyed. + * + * Return: NULL on error, pointer to decompression dictionary + * otherwise. + */ +zstd_ddict *zstd_create_ddict_byreference(const void *dict, size_t dict_size, + zstd_custom_mem custom_mem); +/** + * zstd_free_ddict() - Free decompression dictionary + * @dict: Pointer to the dictionary. + * + * Return: Always 0. + */ +size_t zstd_free_ddict(zstd_ddict *ddict); + +/** + * zstd_create_dctx_advanced() - Create decompression context + * @custom_mem: Custom allocator. + * + * Return: NULL on error, pointer to decompression context otherwise. + */ +zstd_dctx *zstd_create_dctx_advanced(zstd_custom_mem custom_mem); + +/** + * zstd_free_dctx() -- Free decompression context + * @dctx: Pointer to decompression context. + * Return: Always 0. + */ +size_t zstd_free_dctx(zstd_dctx *dctx); + +/** + * zstd_decompress_using_ddict() - decompress src into dst using a dictionary + * @dctx: The decompression context. + * @dst: The buffer to decompress src into. + * @dst_capacity: The size of the destination buffer. Must be at least as large + * as the decompressed size. If the caller cannot upper bound the + * decompressed size, then it's better to use the streaming API. + * @src: The zstd compressed data to decompress. Multiple concatenated + * frames and skippable frames are allowed. + * @src_size: The exact size of the data to decompress. + * @ddict: The dictionary to be used. + * + * Return: The decompressed size or an error, which can be checked using + * zstd_is_error(). + */ +size_t zstd_decompress_using_ddict(zstd_dctx *dctx, + void *dst, size_t dst_capacity, const void *src, size_t src_size, + const zstd_ddict *ddict); + + /* ====== Streaming Buffers ====== */ /** diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c index 04e1b5c01d9b..bd8784449b31 100644 --- a/lib/zstd/zstd_compress_module.c +++ b/lib/zstd/zstd_compress_module.c @@ -66,6 +66,12 @@ int zstd_max_clevel(void) } EXPORT_SYMBOL(zstd_max_clevel); +int zstd_default_clevel(void) +{ + return ZSTD_defaultCLevel(); +} +EXPORT_SYMBOL(zstd_default_clevel); + size_t zstd_compress_bound(size_t src_size) { return ZSTD_compressBound(src_size); @@ -79,6 +85,13 @@ zstd_parameters zstd_get_params(int level, } EXPORT_SYMBOL(zstd_get_params); +zstd_compression_parameters zstd_get_cparams(int level, + unsigned long long estimated_src_size, size_t dict_size) +{ + return ZSTD_getCParams(level, estimated_src_size, dict_size); +} +EXPORT_SYMBOL(zstd_get_cparams); + size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *cparams) { return ZSTD_estimateCCtxSize_usingCParams(*cparams); @@ -93,6 +106,33 @@ zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size) } EXPORT_SYMBOL(zstd_init_cctx); +zstd_cctx *zstd_create_cctx_advanced(zstd_custom_mem custom_mem) +{ + return ZSTD_createCCtx_advanced(custom_mem); +} +EXPORT_SYMBOL(zstd_create_cctx_advanced); + +size_t zstd_free_cctx(zstd_cctx *cctx) +{ + return ZSTD_freeCCtx(cctx); +} +EXPORT_SYMBOL(zstd_free_cctx); + +zstd_cdict *zstd_create_cdict_byreference(const void *dict, size_t dict_size, + zstd_compression_parameters cparams, + zstd_custom_mem custom_mem) +{ + return ZSTD_createCDict_advanced(dict, dict_size, ZSTD_dlm_byRef, + ZSTD_dct_auto, cparams, custom_mem); +} +EXPORT_SYMBOL(zstd_create_cdict_byreference); + +size_t zstd_free_cdict(zstd_cdict *cdict) +{ + return ZSTD_freeCDict(cdict); +} +EXPORT_SYMBOL(zstd_free_cdict); + size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity, const void *src, size_t src_size, const zstd_parameters *parameters) { @@ -101,6 +141,15 @@ size_t zstd_compress_cctx(zstd_cctx *cctx, void *dst, size_t dst_capacity, } EXPORT_SYMBOL(zstd_compress_cctx); +size_t zstd_compress_using_cdict(zstd_cctx *cctx, void *dst, + size_t dst_capacity, const void *src, size_t src_size, + const ZSTD_CDict *cdict) +{ + return ZSTD_compress_usingCDict(cctx, dst, dst_capacity, + src, src_size, cdict); +} +EXPORT_SYMBOL(zstd_compress_using_cdict); + size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams) { return ZSTD_estimateCStreamSize_usingCParams(*cparams); diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c index f4ed952ed485..469fc3059be0 100644 --- a/lib/zstd/zstd_decompress_module.c +++ b/lib/zstd/zstd_decompress_module.c @@ -44,6 +44,33 @@ size_t zstd_dctx_workspace_bound(void) } EXPORT_SYMBOL(zstd_dctx_workspace_bound); +zstd_dctx *zstd_create_dctx_advanced(zstd_custom_mem custom_mem) +{ + return ZSTD_createDCtx_advanced(custom_mem); +} +EXPORT_SYMBOL(zstd_create_dctx_advanced); + +size_t zstd_free_dctx(zstd_dctx *dctx) +{ + return ZSTD_freeDCtx(dctx); +} +EXPORT_SYMBOL(zstd_free_dctx); + +zstd_ddict *zstd_create_ddict_byreference(const void *dict, size_t dict_size, + zstd_custom_mem custom_mem) +{ + return ZSTD_createDDict_advanced(dict, dict_size, ZSTD_dlm_byRef, + ZSTD_dct_auto, custom_mem); + +} +EXPORT_SYMBOL(zstd_create_ddict_byreference); + +size_t zstd_free_ddict(zstd_ddict *ddict) +{ + return ZSTD_freeDDict(ddict); +} +EXPORT_SYMBOL(zstd_free_ddict); + zstd_dctx *zstd_init_dctx(void *workspace, size_t workspace_size) { if (workspace == NULL) @@ -59,6 +86,15 @@ size_t zstd_decompress_dctx(zstd_dctx *dctx, void *dst, size_t dst_capacity, } EXPORT_SYMBOL(zstd_decompress_dctx); +size_t zstd_decompress_using_ddict(zstd_dctx *dctx, + void *dst, size_t dst_capacity, const void* src, size_t src_size, + const zstd_ddict* ddict) +{ + return ZSTD_decompress_usingDDict(dctx, dst, dst_capacity, src, + src_size, ddict); +} +EXPORT_SYMBOL(zstd_decompress_using_ddict); + size_t zstd_dstream_workspace_bound(size_t max_window_size) { return ZSTD_estimateDStreamSize(max_window_size); -- cgit v1.2.3 From 751884743025deeb52f3a2ad39acaf0f57e6d496 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 2 Sep 2024 19:55:50 +0900 Subject: lib: lz4hc: export LZ4_resetStreamHC symbol This symbol is needed to enable lz4hc dictionary support. Link: https://lkml.kernel.org/r/20240902105656.1383858-3-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Nick Terrell Cc: Minchan Kim Signed-off-by: Andrew Morton --- lib/lz4/lz4hc_compress.c | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c index e7ac8694b797..bc45594ad2a8 100644 --- a/lib/lz4/lz4hc_compress.c +++ b/lib/lz4/lz4hc_compress.c @@ -621,6 +621,7 @@ void LZ4_resetStreamHC(LZ4_streamHC_t *LZ4_streamHCPtr, int compressionLevel) LZ4_streamHCPtr->internal_donotuse.base = NULL; LZ4_streamHCPtr->internal_donotuse.compressionLevel = (unsigned int)compressionLevel; } +EXPORT_SYMBOL(LZ4_resetStreamHC); int LZ4_loadDictHC(LZ4_streamHC_t *LZ4_streamHCPtr, const char *dictionary, -- cgit v1.2.3 From f3c11cf5cae044668f888a50abb37b29600ca197 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Mon, 2 Sep 2024 19:55:51 +0900 Subject: lib: zstd: fix null-deref in ZSTD_createCDict_advanced2() ZSTD_createCDict_advanced2() must ensure that ZSTD_createCDict_advanced_internal() has successfully allocated cdict. customMalloc() may be called under low memory condition and may be unable to allocate workspace for cdict. Link: https://lkml.kernel.org/r/20240902105656.1383858-4-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Nick Terrell Cc: Minchan Kim Signed-off-by: Andrew Morton --- lib/zstd/compress/zstd_compress.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'lib') diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c index f620cafca633..16bb995bc6c4 100644 --- a/lib/zstd/compress/zstd_compress.c +++ b/lib/zstd/compress/zstd_compress.c @@ -4810,6 +4810,8 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( dictLoadMethod, cctxParams.cParams, cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, customMem); + if (!cdict) + return NULL; if (ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, -- cgit v1.2.3 From 1930c6ad93ad01f82bb7965bbc04eb5a763f856d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 6 Sep 2024 22:15:06 -0400 Subject: maple_tree: mark three functions as __maybe_unused People keep trying to remove three functions that are going to be used in a feature that is being developed. Dropping the functions entirely may end up with people trying to use the bit for other uses, as people have tried in the past. Adding __maybe_unused stops compilers complaining about the unused functions so they can be silently optimised out of the compiled code and people won't try to claim the bit for another use. Link: https://lore.kernel.org/all/20230726080916.17454-2-zhangpeng.00@bytedance.com/ Link: https://lore.kernel.org/all/202408310728.S7EE59BN-lkp@intel.com/ Link: https://lkml.kernel.org/r/20240907021506.4018676-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Lorenzo Stoakes Reviewed-by: Kuan-Wei Chiu Signed-off-by: Andrew Morton --- lib/maple_tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 2058452e5ff3..2a57e489b206 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -348,17 +348,17 @@ static inline void *mte_safe_root(const struct maple_enode *node) return (void *)((unsigned long)node & ~MAPLE_ROOT_NODE); } -static inline void *mte_set_full(const struct maple_enode *node) +static inline void __maybe_unused *mte_set_full(const struct maple_enode *node) { return (void *)((unsigned long)node & ~MAPLE_ENODE_NULL); } -static inline void *mte_clear_full(const struct maple_enode *node) +static inline void __maybe_unused *mte_clear_full(const struct maple_enode *node) { return (void *)((unsigned long)node | MAPLE_ENODE_NULL); } -static inline bool mte_has_null(const struct maple_enode *node) +static inline bool __maybe_unused mte_has_null(const struct maple_enode *node) { return (unsigned long)node & MAPLE_ENODE_NULL; } -- cgit v1.2.3 From 7b0a5b666959719043123a8882bae49ec699d948 Mon Sep 17 00:00:00 2001 From: Alok Swaminathan Date: Mon, 26 Aug 2024 11:57:09 -0400 Subject: lib: glob.c: added null check for character class Add null check for character class. Previously, an inverted character class could result in a nul byte being matched and lead to the function reading past the end of the inputted string. Link: https://lkml.kernel.org/r/20240826155709.12383-1-swaminathanalok@gmail.com Signed-off-by: Alok Swaminathan Signed-off-by: Andrew Morton --- lib/glob.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'lib') diff --git a/lib/glob.c b/lib/glob.c index 15b73f490720..aa57900d2062 100644 --- a/lib/glob.c +++ b/lib/glob.c @@ -68,6 +68,8 @@ bool __pure glob_match(char const *pat, char const *str) back_str = --str; /* Allow zero-length match */ break; case '[': { /* Character class */ + if (c == '\0') /* No possible match */ + return false; bool match = false, inverted = (*pat == '!'); char const *class = pat + inverted; unsigned char a = *class++; -- cgit v1.2.3 From 905415ff3ffb1d7e5afa62bacabd79776bd24606 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 29 Aug 2024 10:42:23 -0700 Subject: lib/buildid: harden build ID parsing logic Harden build ID parsing logic, adding explicit READ_ONCE() where it's important to have a consistent value read and validated just once. Also, as pointed out by Andi Kleen, we need to make sure that entire ELF note is within a page bounds, so move the overflow check up and add an extra note_size boundaries validation. Fixes tag below points to the code that moved this code into lib/buildid.c, and then subsequently was used in perf subsystem, making this code exposed to perf_event_open() users in v5.12+. Cc: stable@vger.kernel.org Reviewed-by: Eduard Zingerman Reviewed-by: Jann Horn Suggested-by: Andi Kleen Fixes: bd7525dacd7e ("bpf: Move stack_map_get_build_id into lib") Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240829174232.3133883-2-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- lib/buildid.c | 76 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 32 deletions(-) (limited to 'lib') diff --git a/lib/buildid.c b/lib/buildid.c index e02b5507418b..26007cc99a38 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -18,31 +18,37 @@ static int parse_build_id_buf(unsigned char *build_id, const void *note_start, Elf32_Word note_size) { - Elf32_Word note_offs = 0, new_offs; - - while (note_offs + sizeof(Elf32_Nhdr) < note_size) { - Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); + const char note_name[] = "GNU"; + const size_t note_name_sz = sizeof(note_name); + u64 note_off = 0, new_off, name_sz, desc_sz; + const char *data; + + while (note_off + sizeof(Elf32_Nhdr) < note_size && + note_off + sizeof(Elf32_Nhdr) > note_off /* overflow */) { + Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_off); + + name_sz = READ_ONCE(nhdr->n_namesz); + desc_sz = READ_ONCE(nhdr->n_descsz); + + new_off = note_off + sizeof(Elf32_Nhdr); + if (check_add_overflow(new_off, ALIGN(name_sz, 4), &new_off) || + check_add_overflow(new_off, ALIGN(desc_sz, 4), &new_off) || + new_off > note_size) + break; if (nhdr->n_type == BUILD_ID && - nhdr->n_namesz == sizeof("GNU") && - !strcmp((char *)(nhdr + 1), "GNU") && - nhdr->n_descsz > 0 && - nhdr->n_descsz <= BUILD_ID_SIZE_MAX) { - memcpy(build_id, - note_start + note_offs + - ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - nhdr->n_descsz); - memset(build_id + nhdr->n_descsz, 0, - BUILD_ID_SIZE_MAX - nhdr->n_descsz); + name_sz == note_name_sz && + memcmp(nhdr + 1, note_name, note_name_sz) == 0 && + desc_sz > 0 && desc_sz <= BUILD_ID_SIZE_MAX) { + data = note_start + note_off + ALIGN(note_name_sz, 4); + memcpy(build_id, data, desc_sz); + memset(build_id + desc_sz, 0, BUILD_ID_SIZE_MAX - desc_sz); if (size) - *size = nhdr->n_descsz; + *size = desc_sz; return 0; } - new_offs = note_offs + sizeof(Elf32_Nhdr) + - ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); - if (new_offs <= note_offs) /* overflow */ - break; - note_offs = new_offs; + + note_off = new_off; } return -EINVAL; @@ -71,7 +77,7 @@ static int get_build_id_32(const void *page_addr, unsigned char *build_id, { Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; Elf32_Phdr *phdr; - int i; + __u32 i, phnum; /* * FIXME @@ -80,18 +86,19 @@ static int get_build_id_32(const void *page_addr, unsigned char *build_id, */ if (ehdr->e_phoff != sizeof(Elf32_Ehdr)) return -EINVAL; + + phnum = READ_ONCE(ehdr->e_phnum); /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) + if (phnum > (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) return -EINVAL; phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); - for (i = 0; i < ehdr->e_phnum; ++i) { + for (i = 0; i < phnum; ++i) { if (phdr[i].p_type == PT_NOTE && !parse_build_id(page_addr, build_id, size, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) + page_addr + READ_ONCE(phdr[i].p_offset), + READ_ONCE(phdr[i].p_filesz))) return 0; } return -EINVAL; @@ -103,7 +110,7 @@ static int get_build_id_64(const void *page_addr, unsigned char *build_id, { Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; Elf64_Phdr *phdr; - int i; + __u32 i, phnum; /* * FIXME @@ -112,18 +119,19 @@ static int get_build_id_64(const void *page_addr, unsigned char *build_id, */ if (ehdr->e_phoff != sizeof(Elf64_Ehdr)) return -EINVAL; + + phnum = READ_ONCE(ehdr->e_phnum); /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + if (phnum > (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) return -EINVAL; phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); - for (i = 0; i < ehdr->e_phnum; ++i) { + for (i = 0; i < phnum; ++i) { if (phdr[i].p_type == PT_NOTE && !parse_build_id(page_addr, build_id, size, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) + page_addr + READ_ONCE(phdr[i].p_offset), + READ_ONCE(phdr[i].p_filesz))) return 0; } return -EINVAL; @@ -152,6 +160,10 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, page = find_get_page(vma->vm_file->f_mapping, 0); if (!page) return -EFAULT; /* page not mapped */ + if (!PageUptodate(page)) { + put_page(page); + return -EFAULT; + } ret = -EINVAL; page_addr = kmap_local_page(page); -- cgit v1.2.3 From de3ec364c3c37971dbba1e37a55ae5b646c6f24e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 29 Aug 2024 10:42:24 -0700 Subject: lib/buildid: add single folio-based file reader abstraction Add freader abstraction that transparently manages fetching and local mapping of the underlying file page(s) and provides a simple direct data access interface. freader_fetch() is the only and single interface necessary. It accepts file offset and desired number of bytes that should be accessed, and will return a kernel mapped pointer that caller can use to dereference data up to requested size. Requested size can't be bigger than the size of the extra buffer provided during initialization (because, worst case, all requested data has to be copied into it, so it's better to flag wrongly sized buffer unconditionally, regardless if requested data range is crossing page boundaries or not). If folio is not paged in, or some of the conditions are not satisfied, NULL is returned and more detailed error code can be accessed through freader->err field. This approach makes the usage of freader_fetch() cleaner. To accommodate accessing file data that crosses folio boundaries, user has to provide an extra buffer that will be used to make a local copy, if necessary. This is done to maintain a simple linear pointer data access interface. We switch existing build ID parsing logic to it, without changing or lifting any of the existing constraints, yet. This will be done separately. Given existing code was written with the assumption that it's always working with a single (first) page of the underlying ELF file, logic passes direct pointers around, which doesn't really work well with freader approach and would be limiting when removing the single page (folio) limitation. So we adjust all the logic to work in terms of file offsets. There is also a memory buffer-based version (freader_init_from_mem()) for cases when desired data is already available in kernel memory. This is used for parsing vmlinux's own build ID note. In this mode assumption is that provided data starts at "file offset" zero, which works great when parsing ELF notes sections, as all the parsing logic is relative to note section's start. Reviewed-by: Eduard Zingerman Reviewed-by: Shakeel Butt Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240829174232.3133883-3-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- lib/buildid.c | 263 ++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 210 insertions(+), 53 deletions(-) (limited to 'lib') diff --git a/lib/buildid.c b/lib/buildid.c index 26007cc99a38..bfe00b66b1e8 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -8,24 +8,155 @@ #define BUILD_ID 3 +struct freader { + void *buf; + u32 buf_sz; + int err; + union { + struct { + struct address_space *mapping; + struct folio *folio; + void *addr; + loff_t folio_off; + }; + struct { + const char *data; + u64 data_sz; + }; + }; +}; + +static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz, + struct address_space *mapping) +{ + memset(r, 0, sizeof(*r)); + r->buf = buf; + r->buf_sz = buf_sz; + r->mapping = mapping; +} + +static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz) +{ + memset(r, 0, sizeof(*r)); + r->data = data; + r->data_sz = data_sz; +} + +static void freader_put_folio(struct freader *r) +{ + if (!r->folio) + return; + kunmap_local(r->addr); + folio_put(r->folio); + r->folio = NULL; +} + +static int freader_get_folio(struct freader *r, loff_t file_off) +{ + /* check if we can just reuse current folio */ + if (r->folio && file_off >= r->folio_off && + file_off < r->folio_off + folio_size(r->folio)) + return 0; + + freader_put_folio(r); + + r->folio = filemap_get_folio(r->mapping, file_off >> PAGE_SHIFT); + if (IS_ERR(r->folio) || !folio_test_uptodate(r->folio)) { + if (!IS_ERR(r->folio)) + folio_put(r->folio); + r->folio = NULL; + return -EFAULT; + } + + r->folio_off = folio_pos(r->folio); + r->addr = kmap_local_folio(r->folio, 0); + + return 0; +} + +static const void *freader_fetch(struct freader *r, loff_t file_off, size_t sz) +{ + size_t folio_sz; + + /* provided internal temporary buffer should be sized correctly */ + if (WARN_ON(r->buf && sz > r->buf_sz)) { + r->err = -E2BIG; + return NULL; + } + + if (unlikely(file_off + sz < file_off)) { + r->err = -EOVERFLOW; + return NULL; + } + + /* working with memory buffer is much more straightforward */ + if (!r->buf) { + if (file_off + sz > r->data_sz) { + r->err = -ERANGE; + return NULL; + } + return r->data + file_off; + } + + /* fetch or reuse folio for given file offset */ + r->err = freader_get_folio(r, file_off); + if (r->err) + return NULL; + + /* if requested data is crossing folio boundaries, we have to copy + * everything into our local buffer to keep a simple linear memory + * access interface + */ + folio_sz = folio_size(r->folio); + if (file_off + sz > r->folio_off + folio_sz) { + int part_sz = r->folio_off + folio_sz - file_off; + + /* copy the part that resides in the current folio */ + memcpy(r->buf, r->addr + (file_off - r->folio_off), part_sz); + + /* fetch next folio */ + r->err = freader_get_folio(r, r->folio_off + folio_sz); + if (r->err) + return NULL; + + /* copy the rest of requested data */ + memcpy(r->buf + part_sz, r->addr, sz - part_sz); + + return r->buf; + } + + /* if data fits in a single folio, just return direct pointer */ + return r->addr + (file_off - r->folio_off); +} + +static void freader_cleanup(struct freader *r) +{ + if (!r->buf) + return; /* non-file-backed mode */ + + freader_put_folio(r); +} + /* * Parse build id from the note segment. This logic can be shared between * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are * identical. */ -static int parse_build_id_buf(unsigned char *build_id, - __u32 *size, - const void *note_start, - Elf32_Word note_size) +static int parse_build_id_buf(struct freader *r, + unsigned char *build_id, __u32 *size, + loff_t note_off, Elf32_Word note_size) { const char note_name[] = "GNU"; const size_t note_name_sz = sizeof(note_name); - u64 note_off = 0, new_off, name_sz, desc_sz; + u32 build_id_off, new_off, note_end, name_sz, desc_sz; + const Elf32_Nhdr *nhdr; const char *data; - while (note_off + sizeof(Elf32_Nhdr) < note_size && - note_off + sizeof(Elf32_Nhdr) > note_off /* overflow */) { - Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_off); + note_end = note_off + note_size; + while (note_end - note_off > sizeof(Elf32_Nhdr) + note_name_sz) { + nhdr = freader_fetch(r, note_off, sizeof(Elf32_Nhdr) + note_name_sz); + if (!nhdr) + return r->err; name_sz = READ_ONCE(nhdr->n_namesz); desc_sz = READ_ONCE(nhdr->n_descsz); @@ -33,14 +164,20 @@ static int parse_build_id_buf(unsigned char *build_id, new_off = note_off + sizeof(Elf32_Nhdr); if (check_add_overflow(new_off, ALIGN(name_sz, 4), &new_off) || check_add_overflow(new_off, ALIGN(desc_sz, 4), &new_off) || - new_off > note_size) + new_off > note_end) break; if (nhdr->n_type == BUILD_ID && name_sz == note_name_sz && memcmp(nhdr + 1, note_name, note_name_sz) == 0 && desc_sz > 0 && desc_sz <= BUILD_ID_SIZE_MAX) { - data = note_start + note_off + ALIGN(note_name_sz, 4); + build_id_off = note_off + sizeof(Elf32_Nhdr) + ALIGN(note_name_sz, 4); + + /* freader_fetch() will invalidate nhdr pointer */ + data = freader_fetch(r, build_id_off, desc_sz); + if (!data) + return r->err; + memcpy(build_id, data, desc_sz); memset(build_id + desc_sz, 0, BUILD_ID_SIZE_MAX - desc_sz); if (size) @@ -54,30 +191,33 @@ static int parse_build_id_buf(unsigned char *build_id, return -EINVAL; } -static inline int parse_build_id(const void *page_addr, +static inline int parse_build_id(struct freader *r, unsigned char *build_id, __u32 *size, - const void *note_start, + loff_t note_start_off, Elf32_Word note_size) { /* check for overflow */ - if (note_start < page_addr || note_start + note_size < note_start) + if (note_start_off + note_size < note_start_off) return -EINVAL; /* only supports note that fits in the first page */ - if (note_start + note_size > page_addr + PAGE_SIZE) + if (note_start_off + note_size > PAGE_SIZE) return -EINVAL; - return parse_build_id_buf(build_id, size, note_start, note_size); + return parse_build_id_buf(r, build_id, size, note_start_off, note_size); } /* Parse build ID from 32-bit ELF */ -static int get_build_id_32(const void *page_addr, unsigned char *build_id, - __u32 *size) +static int get_build_id_32(struct freader *r, unsigned char *build_id, __u32 *size) { - Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; - Elf32_Phdr *phdr; - __u32 i, phnum; + const Elf32_Ehdr *ehdr; + const Elf32_Phdr *phdr; + __u32 phnum, i; + + ehdr = freader_fetch(r, 0, sizeof(Elf32_Ehdr)); + if (!ehdr) + return r->err; /* * FIXME @@ -87,30 +227,35 @@ static int get_build_id_32(const void *page_addr, unsigned char *build_id, if (ehdr->e_phoff != sizeof(Elf32_Ehdr)) return -EINVAL; + /* subsequent freader_fetch() calls invalidate pointers, so remember locally */ phnum = READ_ONCE(ehdr->e_phnum); /* only supports phdr that fits in one page */ if (phnum > (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) return -EINVAL; - phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); - for (i = 0; i < phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !parse_build_id(page_addr, build_id, size, - page_addr + READ_ONCE(phdr[i].p_offset), - READ_ONCE(phdr[i].p_filesz))) + phdr = freader_fetch(r, i * sizeof(Elf32_Phdr), sizeof(Elf32_Phdr)); + if (!phdr) + return r->err; + + if (phdr->p_type == PT_NOTE && + !parse_build_id(r, build_id, size, READ_ONCE(phdr->p_offset), + READ_ONCE(phdr->p_filesz))) return 0; } return -EINVAL; } /* Parse build ID from 64-bit ELF */ -static int get_build_id_64(const void *page_addr, unsigned char *build_id, - __u32 *size) +static int get_build_id_64(struct freader *r, unsigned char *build_id, __u32 *size) { - Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; - Elf64_Phdr *phdr; - __u32 i, phnum; + const Elf64_Ehdr *ehdr; + const Elf64_Phdr *phdr; + __u32 phnum, i; + + ehdr = freader_fetch(r, 0, sizeof(Elf64_Ehdr)); + if (!ehdr) + return r->err; /* * FIXME @@ -120,23 +265,29 @@ static int get_build_id_64(const void *page_addr, unsigned char *build_id, if (ehdr->e_phoff != sizeof(Elf64_Ehdr)) return -EINVAL; + /* subsequent freader_fetch() calls invalidate pointers, so remember locally */ phnum = READ_ONCE(ehdr->e_phnum); /* only supports phdr that fits in one page */ if (phnum > (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) return -EINVAL; - phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); - for (i = 0; i < phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !parse_build_id(page_addr, build_id, size, - page_addr + READ_ONCE(phdr[i].p_offset), - READ_ONCE(phdr[i].p_filesz))) + phdr = freader_fetch(r, i * sizeof(Elf64_Phdr), sizeof(Elf64_Phdr)); + if (!phdr) + return r->err; + + if (phdr->p_type == PT_NOTE && + !parse_build_id(r, build_id, size, READ_ONCE(phdr->p_offset), + READ_ONCE(phdr->p_filesz))) return 0; } + return -EINVAL; } +/* enough for Elf64_Ehdr, Elf64_Phdr, and all the smaller requests */ +#define MAX_FREADER_BUF_SZ 64 + /* * Parse build ID of ELF file mapped to vma * @vma: vma object @@ -148,26 +299,25 @@ static int get_build_id_64(const void *page_addr, unsigned char *build_id, int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size) { - Elf32_Ehdr *ehdr; - struct page *page; - void *page_addr; + const Elf32_Ehdr *ehdr; + struct freader r; + char buf[MAX_FREADER_BUF_SZ]; int ret; /* only works for page backed storage */ if (!vma->vm_file) return -EINVAL; - page = find_get_page(vma->vm_file->f_mapping, 0); - if (!page) - return -EFAULT; /* page not mapped */ - if (!PageUptodate(page)) { - put_page(page); - return -EFAULT; + freader_init_from_file(&r, buf, sizeof(buf), vma->vm_file->f_mapping); + + /* fetch first 18 bytes of ELF header for checks */ + ehdr = freader_fetch(&r, 0, offsetofend(Elf32_Ehdr, e_type)); + if (!ehdr) { + ret = r.err; + goto out; } ret = -EINVAL; - page_addr = kmap_local_page(page); - ehdr = (Elf32_Ehdr *)page_addr; /* compare magic x7f "ELF" */ if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) @@ -178,12 +328,11 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, goto out; if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) - ret = get_build_id_32(page_addr, build_id, size); + ret = get_build_id_32(&r, build_id, size); else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) - ret = get_build_id_64(page_addr, build_id, size); + ret = get_build_id_64(&r, build_id, size); out: - kunmap_local(page_addr); - put_page(page); + freader_cleanup(&r); return ret; } @@ -197,7 +346,15 @@ out: */ int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size) { - return parse_build_id_buf(build_id, NULL, buf, buf_size); + struct freader r; + int err; + + freader_init_from_mem(&r, buf, buf_size); + + err = parse_build_id(&r, build_id, NULL, 0, buf_size); + + freader_cleanup(&r); + return err; } #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) -- cgit v1.2.3 From d4deb82423416e3ace7889816eea630af81fe702 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 29 Aug 2024 10:42:25 -0700 Subject: lib/buildid: take into account e_phoff when fetching program headers Current code assumption is that program (segment) headers are following ELF header immediately. This is a common case, but is not guaranteed. So take into account e_phoff field of the ELF header when accessing program headers. Reviewed-by: Eduard Zingerman Reported-by: Alexey Dobriyan Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240829174232.3133883-4-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- lib/buildid.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'lib') diff --git a/lib/buildid.c b/lib/buildid.c index bfe00b66b1e8..7fb08a1d98bd 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -213,28 +213,26 @@ static int get_build_id_32(struct freader *r, unsigned char *build_id, __u32 *si { const Elf32_Ehdr *ehdr; const Elf32_Phdr *phdr; - __u32 phnum, i; + __u32 phnum, phoff, i; ehdr = freader_fetch(r, 0, sizeof(Elf32_Ehdr)); if (!ehdr) return r->err; - /* - * FIXME - * Neither ELF spec nor ELF loader require that program headers - * start immediately after ELF header. - */ - if (ehdr->e_phoff != sizeof(Elf32_Ehdr)) - return -EINVAL; - /* subsequent freader_fetch() calls invalidate pointers, so remember locally */ phnum = READ_ONCE(ehdr->e_phnum); + phoff = READ_ONCE(ehdr->e_phoff); + /* only supports phdr that fits in one page */ if (phnum > (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) return -EINVAL; + /* check that phoff is not large enough to cause an overflow */ + if (phoff + phnum * sizeof(Elf32_Phdr) < phoff) + return -EINVAL; + for (i = 0; i < phnum; ++i) { - phdr = freader_fetch(r, i * sizeof(Elf32_Phdr), sizeof(Elf32_Phdr)); + phdr = freader_fetch(r, phoff + i * sizeof(Elf32_Phdr), sizeof(Elf32_Phdr)); if (!phdr) return r->err; @@ -252,27 +250,26 @@ static int get_build_id_64(struct freader *r, unsigned char *build_id, __u32 *si const Elf64_Ehdr *ehdr; const Elf64_Phdr *phdr; __u32 phnum, i; + __u64 phoff; ehdr = freader_fetch(r, 0, sizeof(Elf64_Ehdr)); if (!ehdr) return r->err; - /* - * FIXME - * Neither ELF spec nor ELF loader require that program headers - * start immediately after ELF header. - */ - if (ehdr->e_phoff != sizeof(Elf64_Ehdr)) - return -EINVAL; - /* subsequent freader_fetch() calls invalidate pointers, so remember locally */ phnum = READ_ONCE(ehdr->e_phnum); + phoff = READ_ONCE(ehdr->e_phoff); + /* only supports phdr that fits in one page */ if (phnum > (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) return -EINVAL; + /* check that phoff is not large enough to cause an overflow */ + if (phoff + phnum * sizeof(Elf64_Phdr) < phoff) + return -EINVAL; + for (i = 0; i < phnum; ++i) { - phdr = freader_fetch(r, i * sizeof(Elf64_Phdr), sizeof(Elf64_Phdr)); + phdr = freader_fetch(r, phoff + i * sizeof(Elf64_Phdr), sizeof(Elf64_Phdr)); if (!phdr) return r->err; -- cgit v1.2.3 From 4e9d360c4cdf2dc11a30fd5caf39e8c31f0896cb Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 29 Aug 2024 10:42:26 -0700 Subject: lib/buildid: remove single-page limit for PHDR search Now that freader allows to access multiple pages transparently, there is no need to limit program headers to the very first ELF file page. Remove this limitation, but still put some sane limit on amount of program headers that we are willing to iterate over (set arbitrarily to 256). Reviewed-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240829174232.3133883-5-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- lib/buildid.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/lib/buildid.c b/lib/buildid.c index 7fb08a1d98bd..e8fc4aeb01f2 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -8,6 +8,8 @@ #define BUILD_ID 3 +#define MAX_PHDR_CNT 256 + struct freader { void *buf; u32 buf_sz; @@ -223,9 +225,9 @@ static int get_build_id_32(struct freader *r, unsigned char *build_id, __u32 *si phnum = READ_ONCE(ehdr->e_phnum); phoff = READ_ONCE(ehdr->e_phoff); - /* only supports phdr that fits in one page */ - if (phnum > (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) - return -EINVAL; + /* set upper bound on amount of segments (phdrs) we iterate */ + if (phnum > MAX_PHDR_CNT) + phnum = MAX_PHDR_CNT; /* check that phoff is not large enough to cause an overflow */ if (phoff + phnum * sizeof(Elf32_Phdr) < phoff) @@ -260,9 +262,9 @@ static int get_build_id_64(struct freader *r, unsigned char *build_id, __u32 *si phnum = READ_ONCE(ehdr->e_phnum); phoff = READ_ONCE(ehdr->e_phoff); - /* only supports phdr that fits in one page */ - if (phnum > (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) - return -EINVAL; + /* set upper bound on amount of segments (phdrs) we iterate */ + if (phnum > MAX_PHDR_CNT) + phnum = MAX_PHDR_CNT; /* check that phoff is not large enough to cause an overflow */ if (phoff + phnum * sizeof(Elf64_Phdr) < phoff) -- cgit v1.2.3 From 45b8fc3096542a53bfd245a9ad8ef870384b4897 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 29 Aug 2024 10:42:27 -0700 Subject: lib/buildid: rename build_id_parse() into build_id_parse_nofault() Make it clear that build_id_parse() assumes that it can take no page fault by renaming it and current few users to build_id_parse_nofault(). Also add build_id_parse() stub which for now falls back to non-sleepable implementation, but will be changed in subsequent patches to take advantage of sleepable context. PROCMAP_QUERY ioctl() on /proc//maps file is using build_id_parse() and will automatically take advantage of more reliable sleepable context implementation. Reviewed-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240829174232.3133883-6-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/buildid.h | 4 ++-- kernel/bpf/stackmap.c | 2 +- kernel/events/core.c | 2 +- lib/buildid.c | 25 ++++++++++++++++++++++--- 4 files changed, 26 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 20aa3c2d89f7..014a88c41073 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -7,8 +7,8 @@ #define BUILD_ID_SIZE_MAX 20 struct vm_area_struct; -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, - __u32 *size); +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); +int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size); int build_id_parse_buf(const void *buf, unsigned char *build_id, u32 buf_size); #if IS_ENABLED(CONFIG_STACKTRACE_BUILD_ID) || IS_ENABLED(CONFIG_VMCORE_INFO) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index c99f8e5234ac..770ae8e88016 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -156,7 +156,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, goto build_id_valid; } vma = find_vma(current->mm, ips[i]); - if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { + if (!vma || build_id_parse_nofault(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; diff --git a/kernel/events/core.c b/kernel/events/core.c index c973e3c11e03..c78a77f9dce4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8851,7 +8851,7 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; if (atomic_read(&nr_build_id_events)) - build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size); + build_id_parse_nofault(vma, mmap_event->build_id, &mmap_event->build_id_size); perf_iterate_sb(perf_event_mmap_output, mmap_event, diff --git a/lib/buildid.c b/lib/buildid.c index e8fc4aeb01f2..c1cbd34f3685 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -293,10 +293,12 @@ static int get_build_id_64(struct freader *r, unsigned char *build_id, __u32 *si * @build_id: buffer to store build id, at least BUILD_ID_SIZE long * @size: returns actual build id size in case of success * - * Return: 0 on success, -EINVAL otherwise + * Assumes no page fault can be taken, so if relevant portions of ELF file are + * not already paged in, fetching of build ID fails. + * + * Return: 0 on success; negative error, otherwise */ -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, - __u32 *size) +int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size) { const Elf32_Ehdr *ehdr; struct freader r; @@ -335,6 +337,23 @@ out: return ret; } +/* + * Parse build ID of ELF file mapped to VMA + * @vma: vma object + * @build_id: buffer to store build id, at least BUILD_ID_SIZE long + * @size: returns actual build id size in case of success + * + * Assumes faultable context and can cause page faults to bring in file data + * into page cache. + * + * Return: 0 on success; negative error, otherwise + */ +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size) +{ + /* fallback to non-faultable version for now */ + return build_id_parse_nofault(vma, build_id, size); +} + /** * build_id_parse_buf - Get build ID from a buffer * @buf: ELF note section(s) to parse -- cgit v1.2.3 From ad41251c290dfe3c01472c94d2439a59de23fe97 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 29 Aug 2024 10:42:28 -0700 Subject: lib/buildid: implement sleepable build_id_parse() API Extend freader with a flag specifying whether it's OK to cause page fault to fetch file data that is not already physically present in memory. With this, it's now easy to wait for data if the caller is running in sleepable (faultable) context. We utilize read_cache_folio() to bring the desired folio into page cache, after which the rest of the logic works just the same at folio level. Suggested-by: Omar Sandoval Cc: Shakeel Butt Cc: Johannes Weiner Reviewed-by: Eduard Zingerman Reviewed-by: Shakeel Butt Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240829174232.3133883-7-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- lib/buildid.c | 54 +++++++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 19 deletions(-) (limited to 'lib') diff --git a/lib/buildid.c b/lib/buildid.c index c1cbd34f3685..18ef55812c64 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -16,10 +16,11 @@ struct freader { int err; union { struct { - struct address_space *mapping; + struct file *file; struct folio *folio; void *addr; loff_t folio_off; + bool may_fault; }; struct { const char *data; @@ -29,12 +30,13 @@ struct freader { }; static void freader_init_from_file(struct freader *r, void *buf, u32 buf_sz, - struct address_space *mapping) + struct file *file, bool may_fault) { memset(r, 0, sizeof(*r)); r->buf = buf; r->buf_sz = buf_sz; - r->mapping = mapping; + r->file = file; + r->may_fault = may_fault; } static void freader_init_from_mem(struct freader *r, const char *data, u64 data_sz) @@ -62,7 +64,16 @@ static int freader_get_folio(struct freader *r, loff_t file_off) freader_put_folio(r); - r->folio = filemap_get_folio(r->mapping, file_off >> PAGE_SHIFT); + r->folio = filemap_get_folio(r->file->f_mapping, file_off >> PAGE_SHIFT); + + /* if sleeping is allowed, wait for the page, if necessary */ + if (r->may_fault && (IS_ERR(r->folio) || !folio_test_uptodate(r->folio))) { + filemap_invalidate_lock_shared(r->file->f_mapping); + r->folio = read_cache_folio(r->file->f_mapping, file_off >> PAGE_SHIFT, + NULL, r->file); + filemap_invalidate_unlock_shared(r->file->f_mapping); + } + if (IS_ERR(r->folio) || !folio_test_uptodate(r->folio)) { if (!IS_ERR(r->folio)) folio_put(r->folio); @@ -287,18 +298,8 @@ static int get_build_id_64(struct freader *r, unsigned char *build_id, __u32 *si /* enough for Elf64_Ehdr, Elf64_Phdr, and all the smaller requests */ #define MAX_FREADER_BUF_SZ 64 -/* - * Parse build ID of ELF file mapped to vma - * @vma: vma object - * @build_id: buffer to store build id, at least BUILD_ID_SIZE long - * @size: returns actual build id size in case of success - * - * Assumes no page fault can be taken, so if relevant portions of ELF file are - * not already paged in, fetching of build ID fails. - * - * Return: 0 on success; negative error, otherwise - */ -int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size) +static int __build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, + __u32 *size, bool may_fault) { const Elf32_Ehdr *ehdr; struct freader r; @@ -309,7 +310,7 @@ int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, if (!vma->vm_file) return -EINVAL; - freader_init_from_file(&r, buf, sizeof(buf), vma->vm_file->f_mapping); + freader_init_from_file(&r, buf, sizeof(buf), vma->vm_file, may_fault); /* fetch first 18 bytes of ELF header for checks */ ehdr = freader_fetch(&r, 0, offsetofend(Elf32_Ehdr, e_type)); @@ -337,6 +338,22 @@ out: return ret; } +/* + * Parse build ID of ELF file mapped to vma + * @vma: vma object + * @build_id: buffer to store build id, at least BUILD_ID_SIZE long + * @size: returns actual build id size in case of success + * + * Assumes no page fault can be taken, so if relevant portions of ELF file are + * not already paged in, fetching of build ID fails. + * + * Return: 0 on success; negative error, otherwise + */ +int build_id_parse_nofault(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size) +{ + return __build_id_parse(vma, build_id, size, false /* !may_fault */); +} + /* * Parse build ID of ELF file mapped to VMA * @vma: vma object @@ -350,8 +367,7 @@ out: */ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, __u32 *size) { - /* fallback to non-faultable version for now */ - return build_id_parse_nofault(vma, build_id, size); + return __build_id_parse(vma, build_id, size, true /* may_fault */); } /** -- cgit v1.2.3 From cdbb44f9a74fe7d01090ae492672e89cf7d83ce5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 29 Aug 2024 10:42:29 -0700 Subject: lib/buildid: don't limit .note.gnu.build-id to the first page in ELF With freader we don't need to restrict ourselves to a single page, so let's allow ELF notes to be at any valid position with the file. We also merge parse_build_id() and parse_build_id_buf() as now the only difference between them is note offset overflow, which makes sense to check in all situations. Reviewed-by: Eduard Zingerman Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/r/20240829174232.3133883-8-andrii@kernel.org Signed-off-by: Alexei Starovoitov --- lib/buildid.c | 26 +++++--------------------- 1 file changed, 5 insertions(+), 21 deletions(-) (limited to 'lib') diff --git a/lib/buildid.c b/lib/buildid.c index 18ef55812c64..290641d92ac1 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -155,9 +155,8 @@ static void freader_cleanup(struct freader *r) * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are * identical. */ -static int parse_build_id_buf(struct freader *r, - unsigned char *build_id, __u32 *size, - loff_t note_off, Elf32_Word note_size) +static int parse_build_id(struct freader *r, unsigned char *build_id, __u32 *size, + loff_t note_off, Elf32_Word note_size) { const char note_name[] = "GNU"; const size_t note_name_sz = sizeof(note_name); @@ -165,7 +164,9 @@ static int parse_build_id_buf(struct freader *r, const Elf32_Nhdr *nhdr; const char *data; - note_end = note_off + note_size; + if (check_add_overflow(note_off, note_size, ¬e_end)) + return -EINVAL; + while (note_end - note_off > sizeof(Elf32_Nhdr) + note_name_sz) { nhdr = freader_fetch(r, note_off, sizeof(Elf32_Nhdr) + note_name_sz); if (!nhdr) @@ -204,23 +205,6 @@ static int parse_build_id_buf(struct freader *r, return -EINVAL; } -static inline int parse_build_id(struct freader *r, - unsigned char *build_id, - __u32 *size, - loff_t note_start_off, - Elf32_Word note_size) -{ - /* check for overflow */ - if (note_start_off + note_size < note_start_off) - return -EINVAL; - - /* only supports note that fits in the first page */ - if (note_start_off + note_size > PAGE_SIZE) - return -EINVAL; - - return parse_build_id_buf(r, build_id, size, note_start_off, note_size); -} - /* Parse build ID from 32-bit ELF */ static int get_build_id_32(struct freader *r, unsigned char *build_id, __u32 *size) { -- cgit v1.2.3 From db0aa2e9566fda2d23dc8f6c102856ead95578a4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 19 Jun 2024 00:20:42 +0100 Subject: mm: Define struct folio_queue and ITER_FOLIOQ to handle a sequence of folios Define a data structure, struct folio_queue, to represent a sequence of folios and a kernel-internal I/O iterator type, ITER_FOLIOQ, to allow a list of folio_queue structures to be used to provide a buffer to iov_iter-taking functions, such as sendmsg and recvmsg. The folio_queue structure looks like: struct folio_queue { struct folio_batch vec; u8 orders[PAGEVEC_SIZE]; struct folio_queue *next; struct folio_queue *prev; unsigned long marks; unsigned long marks2; }; It does not use a list_head so that next and/or prev can be set to NULL at the ends of the list, allowing iov_iter-handling routines to determine that they *are* the ends without needing to store a head pointer in the iov_iter struct. A folio_batch struct is used to hold the folio pointers which allows the batch to be passed to batch handling functions. Two mark bits are available per slot. The intention is to use at least one of them to mark folios that need putting, but that might not be ultimately necessary. Accessor functions are used to access the slots to do the masking and an additional accessor function is used to indicate the size of the array. The order of each folio is also stored in the structure to avoid the need for iov_iter_advance() and iov_iter_revert() to have to query each folio to find its size. With careful barriering, this can be used as an extending buffer with new folios inserted and new folio_queue structs added without the need for a lock. Further, provided we always keep at least one struct in the buffer, we can also remove consumed folios and consumed structs from the head end as we without the need for locks. [Questions/thoughts] (1) To manage this, I need a head pointer, a tail pointer, a tail slot number (assuming insertion happens at the tail end and the next pointers point from head to tail). Should I put these into a struct of their own, say "folio_queue_head" or "rolling_buffer"? I will end up with two of these in netfs_io_request eventually, one keeping track of the pagecache I'm dealing with for buffered I/O and the other to hold a bounce buffer when we need one. (2) Should I make the slots {folio,off,len} or bio_vec? (3) This is intended to replace ITER_XARRAY eventually. Using an xarray in I/O iteration requires the taking of the RCU read lock, doing copying under the RCU read lock, walking the xarray (which may change under us), handling retries and dealing with special values. The advantage of ITER_XARRAY is that when we're dealing with the pagecache directly, we don't need any allocation - but if we're doing encrypted comms, there's a good chance we'd be using a bounce buffer anyway. This will require afs, erofs, cifs, orangefs and fscache to be converted to not use this. afs still uses it for dirs and symlinks; some of erofs usages should be easy to change, but there's one which won't be so easy; ceph's use via fscache can be fixed by porting ceph to netfslib; cifs is using xarray as a bounce buffer - that can be moved to use sheaves instead; and orangefs has a similar problem to erofs - maybe orangefs could use netfslib? Signed-off-by: David Howells cc: Matthew Wilcox cc: Jeff Layton cc: Steve French cc: Ilya Dryomov cc: Gao Xiang cc: Mike Marshall cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org cc: linux-mm@kvack.org cc: linux-afs@lists.infradead.org cc: linux-cifs@vger.kernel.org cc: ceph-devel@vger.kernel.org cc: linux-erofs@lists.ozlabs.org cc: devel@lists.orangefs.org Link: https://lore.kernel.org/r/20240814203850.2240469-13-dhowells@redhat.com/ # v2 Signed-off-by: Christian Brauner --- include/linux/folio_queue.h | 138 +++++++++++++++++++++++ include/linux/iov_iter.h | 57 ++++++++++ include/linux/uio.h | 12 ++ lib/iov_iter.c | 240 +++++++++++++++++++++++++++++++++++++++- lib/kunit_iov_iter.c | 259 ++++++++++++++++++++++++++++++++++++++++++++ lib/scatterlist.c | 69 +++++++++++- 6 files changed, 771 insertions(+), 4 deletions(-) create mode 100644 include/linux/folio_queue.h (limited to 'lib') diff --git a/include/linux/folio_queue.h b/include/linux/folio_queue.h new file mode 100644 index 000000000000..52773613bf23 --- /dev/null +++ b/include/linux/folio_queue.h @@ -0,0 +1,138 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* Queue of folios definitions + * + * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#ifndef _LINUX_FOLIO_QUEUE_H +#define _LINUX_FOLIO_QUEUE_H + +#include + +/* + * Segment in a queue of running buffers. Each segment can hold a number of + * folios and a portion of the queue can be referenced with the ITER_FOLIOQ + * iterator. The possibility exists of inserting non-folio elements into the + * queue (such as gaps). + * + * Explicit prev and next pointers are used instead of a list_head to make it + * easier to add segments to tail and remove them from the head without the + * need for a lock. + */ +struct folio_queue { + struct folio_batch vec; /* Folios in the queue segment */ + u8 orders[PAGEVEC_SIZE]; /* Order of each folio */ + struct folio_queue *next; /* Next queue segment or NULL */ + struct folio_queue *prev; /* Previous queue segment of NULL */ + unsigned long marks; /* 1-bit mark per folio */ + unsigned long marks2; /* Second 1-bit mark per folio */ +#if PAGEVEC_SIZE > BITS_PER_LONG +#error marks is not big enough +#endif +}; + +static inline void folioq_init(struct folio_queue *folioq) +{ + folio_batch_init(&folioq->vec); + folioq->next = NULL; + folioq->prev = NULL; + folioq->marks = 0; + folioq->marks2 = 0; +} + +static inline unsigned int folioq_nr_slots(const struct folio_queue *folioq) +{ + return PAGEVEC_SIZE; +} + +static inline unsigned int folioq_count(struct folio_queue *folioq) +{ + return folio_batch_count(&folioq->vec); +} + +static inline bool folioq_full(struct folio_queue *folioq) +{ + //return !folio_batch_space(&folioq->vec); + return folioq_count(folioq) >= folioq_nr_slots(folioq); +} + +static inline bool folioq_is_marked(const struct folio_queue *folioq, unsigned int slot) +{ + return test_bit(slot, &folioq->marks); +} + +static inline void folioq_mark(struct folio_queue *folioq, unsigned int slot) +{ + set_bit(slot, &folioq->marks); +} + +static inline void folioq_unmark(struct folio_queue *folioq, unsigned int slot) +{ + clear_bit(slot, &folioq->marks); +} + +static inline bool folioq_is_marked2(const struct folio_queue *folioq, unsigned int slot) +{ + return test_bit(slot, &folioq->marks2); +} + +static inline void folioq_mark2(struct folio_queue *folioq, unsigned int slot) +{ + set_bit(slot, &folioq->marks2); +} + +static inline void folioq_unmark2(struct folio_queue *folioq, unsigned int slot) +{ + clear_bit(slot, &folioq->marks2); +} + +static inline unsigned int __folio_order(struct folio *folio) +{ + if (!folio_test_large(folio)) + return 0; + return folio->_flags_1 & 0xff; +} + +static inline unsigned int folioq_append(struct folio_queue *folioq, struct folio *folio) +{ + unsigned int slot = folioq->vec.nr++; + + folioq->vec.folios[slot] = folio; + folioq->orders[slot] = __folio_order(folio); + return slot; +} + +static inline unsigned int folioq_append_mark(struct folio_queue *folioq, struct folio *folio) +{ + unsigned int slot = folioq->vec.nr++; + + folioq->vec.folios[slot] = folio; + folioq->orders[slot] = __folio_order(folio); + folioq_mark(folioq, slot); + return slot; +} + +static inline struct folio *folioq_folio(const struct folio_queue *folioq, unsigned int slot) +{ + return folioq->vec.folios[slot]; +} + +static inline unsigned int folioq_folio_order(const struct folio_queue *folioq, unsigned int slot) +{ + return folioq->orders[slot]; +} + +static inline size_t folioq_folio_size(const struct folio_queue *folioq, unsigned int slot) +{ + return PAGE_SIZE << folioq_folio_order(folioq, slot); +} + +static inline void folioq_clear(struct folio_queue *folioq, unsigned int slot) +{ + folioq->vec.folios[slot] = NULL; + folioq_unmark(folioq, slot); + folioq_unmark2(folioq, slot); +} + +#endif /* _LINUX_FOLIO_QUEUE_H */ diff --git a/include/linux/iov_iter.h b/include/linux/iov_iter.h index 270454a6703d..a223370a59a7 100644 --- a/include/linux/iov_iter.h +++ b/include/linux/iov_iter.h @@ -10,6 +10,7 @@ #include #include +#include typedef size_t (*iov_step_f)(void *iter_base, size_t progress, size_t len, void *priv, void *priv2); @@ -140,6 +141,60 @@ size_t iterate_bvec(struct iov_iter *iter, size_t len, void *priv, void *priv2, return progress; } +/* + * Handle ITER_FOLIOQ. + */ +static __always_inline +size_t iterate_folioq(struct iov_iter *iter, size_t len, void *priv, void *priv2, + iov_step_f step) +{ + const struct folio_queue *folioq = iter->folioq; + unsigned int slot = iter->folioq_slot; + size_t progress = 0, skip = iter->iov_offset; + + if (slot == folioq_nr_slots(folioq)) { + /* The iterator may have been extended. */ + folioq = folioq->next; + slot = 0; + } + + do { + struct folio *folio = folioq_folio(folioq, slot); + size_t part, remain, consumed; + size_t fsize; + void *base; + + if (!folio) + break; + + fsize = folioq_folio_size(folioq, slot); + base = kmap_local_folio(folio, skip); + part = umin(len, PAGE_SIZE - skip % PAGE_SIZE); + remain = step(base, progress, part, priv, priv2); + kunmap_local(base); + consumed = part - remain; + len -= consumed; + progress += consumed; + skip += consumed; + if (skip >= fsize) { + skip = 0; + slot++; + if (slot == folioq_nr_slots(folioq) && folioq->next) { + folioq = folioq->next; + slot = 0; + } + } + if (remain) + break; + } while (len); + + iter->folioq_slot = slot; + iter->folioq = folioq; + iter->iov_offset = skip; + iter->count -= progress; + return progress; +} + /* * Handle ITER_XARRAY. */ @@ -249,6 +304,8 @@ size_t iterate_and_advance2(struct iov_iter *iter, size_t len, void *priv, return iterate_bvec(iter, len, priv, priv2, step); if (iov_iter_is_kvec(iter)) return iterate_kvec(iter, len, priv, priv2, step); + if (iov_iter_is_folioq(iter)) + return iterate_folioq(iter, len, priv, priv2, step); if (iov_iter_is_xarray(iter)) return iterate_xarray(iter, len, priv, priv2, step); return iterate_discard(iter, len, priv, priv2, step); diff --git a/include/linux/uio.h b/include/linux/uio.h index 7020adedfa08..845d110acadc 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -11,6 +11,7 @@ #include struct page; +struct folio_queue; typedef unsigned int __bitwise iov_iter_extraction_t; @@ -25,6 +26,7 @@ enum iter_type { ITER_IOVEC, ITER_BVEC, ITER_KVEC, + ITER_FOLIOQ, ITER_XARRAY, ITER_DISCARD, }; @@ -66,6 +68,7 @@ struct iov_iter { const struct iovec *__iov; const struct kvec *kvec; const struct bio_vec *bvec; + const struct folio_queue *folioq; struct xarray *xarray; void __user *ubuf; }; @@ -74,6 +77,7 @@ struct iov_iter { }; union { unsigned long nr_segs; + u8 folioq_slot; loff_t xarray_start; }; }; @@ -126,6 +130,11 @@ static inline bool iov_iter_is_discard(const struct iov_iter *i) return iov_iter_type(i) == ITER_DISCARD; } +static inline bool iov_iter_is_folioq(const struct iov_iter *i) +{ + return iov_iter_type(i) == ITER_FOLIOQ; +} + static inline bool iov_iter_is_xarray(const struct iov_iter *i) { return iov_iter_type(i) == ITER_XARRAY; @@ -273,6 +282,9 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); +void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, + const struct folio_queue *folioq, + unsigned int first_slot, unsigned int offset, size_t count); void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray, loff_t start, size_t count); ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages, diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 4a6a9f419bd7..97003155bfac 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -527,6 +527,39 @@ static void iov_iter_iovec_advance(struct iov_iter *i, size_t size) i->__iov = iov; } +static void iov_iter_folioq_advance(struct iov_iter *i, size_t size) +{ + const struct folio_queue *folioq = i->folioq; + unsigned int slot = i->folioq_slot; + + if (!i->count) + return; + i->count -= size; + + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + slot = 0; + } + + size += i->iov_offset; /* From beginning of current segment. */ + do { + size_t fsize = folioq_folio_size(folioq, slot); + + if (likely(size < fsize)) + break; + size -= fsize; + slot++; + if (slot >= folioq_nr_slots(folioq) && folioq->next) { + folioq = folioq->next; + slot = 0; + } + } while (size); + + i->iov_offset = size; + i->folioq_slot = slot; + i->folioq = folioq; +} + void iov_iter_advance(struct iov_iter *i, size_t size) { if (unlikely(i->count < size)) @@ -539,12 +572,40 @@ void iov_iter_advance(struct iov_iter *i, size_t size) iov_iter_iovec_advance(i, size); } else if (iov_iter_is_bvec(i)) { iov_iter_bvec_advance(i, size); + } else if (iov_iter_is_folioq(i)) { + iov_iter_folioq_advance(i, size); } else if (iov_iter_is_discard(i)) { i->count -= size; } } EXPORT_SYMBOL(iov_iter_advance); +static void iov_iter_folioq_revert(struct iov_iter *i, size_t unroll) +{ + const struct folio_queue *folioq = i->folioq; + unsigned int slot = i->folioq_slot; + + for (;;) { + size_t fsize; + + if (slot == 0) { + folioq = folioq->prev; + slot = folioq_nr_slots(folioq); + } + slot--; + + fsize = folioq_folio_size(folioq, slot); + if (unroll <= fsize) { + i->iov_offset = fsize - unroll; + break; + } + unroll -= fsize; + } + + i->folioq_slot = slot; + i->folioq = folioq; +} + void iov_iter_revert(struct iov_iter *i, size_t unroll) { if (!unroll) @@ -576,6 +637,9 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) } unroll -= n; } + } else if (iov_iter_is_folioq(i)) { + i->iov_offset = 0; + iov_iter_folioq_revert(i, unroll); } else { /* same logics for iovec and kvec */ const struct iovec *iov = iter_iov(i); while (1) { @@ -603,6 +667,9 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i) if (iov_iter_is_bvec(i)) return min(i->count, i->bvec->bv_len - i->iov_offset); } + if (unlikely(iov_iter_is_folioq(i))) + return !i->count ? 0 : + umin(folioq_folio_size(i->folioq, i->folioq_slot), i->count); return i->count; } EXPORT_SYMBOL(iov_iter_single_seg_count); @@ -639,6 +706,36 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction, } EXPORT_SYMBOL(iov_iter_bvec); +/** + * iov_iter_folio_queue - Initialise an I/O iterator to use the folios in a folio queue + * @i: The iterator to initialise. + * @direction: The direction of the transfer. + * @folioq: The starting point in the folio queue. + * @first_slot: The first slot in the folio queue to use + * @offset: The offset into the folio in the first slot to start at + * @count: The size of the I/O buffer in bytes. + * + * Set up an I/O iterator to either draw data out of the pages attached to an + * inode or to inject data into those pages. The pages *must* be prevented + * from evaporation, either by taking a ref on them or locking them by the + * caller. + */ +void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction, + const struct folio_queue *folioq, unsigned int first_slot, + unsigned int offset, size_t count) +{ + BUG_ON(direction & ~1); + *i = (struct iov_iter) { + .iter_type = ITER_FOLIOQ, + .data_source = direction, + .folioq = folioq, + .folioq_slot = first_slot, + .count = count, + .iov_offset = offset, + }; +} +EXPORT_SYMBOL(iov_iter_folio_queue); + /** * iov_iter_xarray - Initialise an I/O iterator to use the pages in an xarray * @i: The iterator to initialise. @@ -765,12 +862,19 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask, if (iov_iter_is_bvec(i)) return iov_iter_aligned_bvec(i, addr_mask, len_mask); + /* With both xarray and folioq types, we're dealing with whole folios. */ if (iov_iter_is_xarray(i)) { if (i->count & len_mask) return false; if ((i->xarray_start + i->iov_offset) & addr_mask) return false; } + if (iov_iter_is_folioq(i)) { + if (i->count & len_mask) + return false; + if (i->iov_offset & addr_mask) + return false; + } return true; } @@ -835,6 +939,9 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) if (iov_iter_is_bvec(i)) return iov_iter_alignment_bvec(i); + /* With both xarray and folioq types, we're dealing with whole folios. */ + if (iov_iter_is_folioq(i)) + return i->iov_offset | i->count; if (iov_iter_is_xarray(i)) return (i->xarray_start + i->iov_offset) | i->count; @@ -887,6 +994,62 @@ static int want_pages_array(struct page ***res, size_t size, return count; } +static ssize_t iter_folioq_get_pages(struct iov_iter *iter, + struct page ***ppages, size_t maxsize, + unsigned maxpages, size_t *_start_offset) +{ + const struct folio_queue *folioq = iter->folioq; + struct page **pages; + unsigned int slot = iter->folioq_slot; + size_t extracted = 0, count = iter->count, iov_offset = iter->iov_offset; + + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + slot = 0; + if (WARN_ON(iov_offset != 0)) + return -EIO; + } + + maxpages = want_pages_array(ppages, maxsize, iov_offset & ~PAGE_MASK, maxpages); + if (!maxpages) + return -ENOMEM; + *_start_offset = iov_offset & ~PAGE_MASK; + pages = *ppages; + + for (;;) { + struct folio *folio = folioq_folio(folioq, slot); + size_t offset = iov_offset, fsize = folioq_folio_size(folioq, slot); + size_t part = PAGE_SIZE - offset % PAGE_SIZE; + + part = umin(part, umin(maxsize - extracted, fsize - offset)); + count -= part; + iov_offset += part; + extracted += part; + + *pages = folio_page(folio, offset / PAGE_SIZE); + get_page(*pages); + pages++; + maxpages--; + if (maxpages == 0 || extracted >= maxsize) + break; + + if (offset >= fsize) { + iov_offset = 0; + slot++; + if (slot == folioq_nr_slots(folioq) && folioq->next) { + folioq = folioq->next; + slot = 0; + } + } + } + + iter->count = count; + iter->iov_offset = iov_offset; + iter->folioq = folioq; + iter->folioq_slot = slot; + return extracted; +} + static ssize_t iter_xarray_populate_pages(struct page **pages, struct xarray *xa, pgoff_t index, unsigned int nr_pages) { @@ -1034,6 +1197,8 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i, } return maxsize; } + if (iov_iter_is_folioq(i)) + return iter_folioq_get_pages(i, pages, maxsize, maxpages, start); if (iov_iter_is_xarray(i)) return iter_xarray_get_pages(i, pages, maxsize, maxpages, start); return -EFAULT; @@ -1118,6 +1283,11 @@ int iov_iter_npages(const struct iov_iter *i, int maxpages) return iov_npages(i, maxpages); if (iov_iter_is_bvec(i)) return bvec_npages(i, maxpages); + if (iov_iter_is_folioq(i)) { + unsigned offset = i->iov_offset % PAGE_SIZE; + int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); + return min(npages, maxpages); + } if (iov_iter_is_xarray(i)) { unsigned offset = (i->xarray_start + i->iov_offset) % PAGE_SIZE; int npages = DIV_ROUND_UP(offset + i->count, PAGE_SIZE); @@ -1398,6 +1568,68 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state) i->nr_segs = state->nr_segs; } +/* + * Extract a list of contiguous pages from an ITER_FOLIOQ iterator. This does + * not get references on the pages, nor does it get a pin on them. + */ +static ssize_t iov_iter_extract_folioq_pages(struct iov_iter *i, + struct page ***pages, size_t maxsize, + unsigned int maxpages, + iov_iter_extraction_t extraction_flags, + size_t *offset0) +{ + const struct folio_queue *folioq = i->folioq; + struct page **p; + unsigned int nr = 0; + size_t extracted = 0, offset, slot = i->folioq_slot; + + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + slot = 0; + if (WARN_ON(i->iov_offset != 0)) + return -EIO; + } + + offset = i->iov_offset & ~PAGE_MASK; + *offset0 = offset; + + maxpages = want_pages_array(pages, maxsize, offset, maxpages); + if (!maxpages) + return -ENOMEM; + p = *pages; + + for (;;) { + struct folio *folio = folioq_folio(folioq, slot); + size_t offset = i->iov_offset, fsize = folioq_folio_size(folioq, slot); + size_t part = PAGE_SIZE - offset % PAGE_SIZE; + + if (offset < fsize) { + part = umin(part, umin(maxsize - extracted, fsize - offset)); + i->count -= part; + i->iov_offset += part; + extracted += part; + + p[nr++] = folio_page(folio, offset / PAGE_SIZE); + } + + if (nr >= maxpages || extracted >= maxsize) + break; + + if (i->iov_offset >= fsize) { + i->iov_offset = 0; + slot++; + if (slot == folioq_nr_slots(folioq) && folioq->next) { + folioq = folioq->next; + slot = 0; + } + } + } + + i->folioq = folioq; + i->folioq_slot = slot; + return extracted; +} + /* * Extract a list of contiguous pages from an ITER_XARRAY iterator. This does not * get references on the pages, nor does it get a pin on them. @@ -1618,8 +1850,8 @@ static ssize_t iov_iter_extract_user_pages(struct iov_iter *i, * added to the pages, but refs will not be taken. * iov_iter_extract_will_pin() will return true. * - * (*) If the iterator is ITER_KVEC, ITER_BVEC or ITER_XARRAY, the pages are - * merely listed; no extra refs or pins are obtained. + * (*) If the iterator is ITER_KVEC, ITER_BVEC, ITER_FOLIOQ or ITER_XARRAY, the + * pages are merely listed; no extra refs or pins are obtained. * iov_iter_extract_will_pin() will return 0. * * Note also: @@ -1654,6 +1886,10 @@ ssize_t iov_iter_extract_pages(struct iov_iter *i, return iov_iter_extract_bvec_pages(i, pages, maxsize, maxpages, extraction_flags, offset0); + if (iov_iter_is_folioq(i)) + return iov_iter_extract_folioq_pages(i, pages, maxsize, + maxpages, extraction_flags, + offset0); if (iov_iter_is_xarray(i)) return iov_iter_extract_xarray_pages(i, pages, maxsize, maxpages, extraction_flags, diff --git a/lib/kunit_iov_iter.c b/lib/kunit_iov_iter.c index 27e0c8ee71d8..13e15687675a 100644 --- a/lib/kunit_iov_iter.c +++ b/lib/kunit_iov_iter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include MODULE_DESCRIPTION("iov_iter testing"); @@ -62,6 +63,9 @@ static void *__init iov_kunit_create_buffer(struct kunit *test, KUNIT_ASSERT_EQ(test, got, npages); } + for (int i = 0; i < npages; i++) + pages[i]->index = i; + buffer = vmap(pages, npages, VM_MAP | VM_MAP_PUT_PAGES, PAGE_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buffer); @@ -362,6 +366,179 @@ stop: KUNIT_SUCCEED(test); } +static void iov_kunit_destroy_folioq(void *data) +{ + struct folio_queue *folioq, *next; + + for (folioq = data; folioq; folioq = next) { + next = folioq->next; + for (int i = 0; i < folioq_nr_slots(folioq); i++) + if (folioq_folio(folioq, i)) + folio_put(folioq_folio(folioq, i)); + kfree(folioq); + } +} + +static void __init iov_kunit_load_folioq(struct kunit *test, + struct iov_iter *iter, int dir, + struct folio_queue *folioq, + struct page **pages, size_t npages) +{ + struct folio_queue *p = folioq; + size_t size = 0; + int i; + + for (i = 0; i < npages; i++) { + if (folioq_full(p)) { + p->next = kzalloc(sizeof(struct folio_queue), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, p->next); + folioq_init(p->next); + p->next->prev = p; + p = p->next; + } + folioq_append(p, page_folio(pages[i])); + size += PAGE_SIZE; + } + iov_iter_folio_queue(iter, dir, folioq, 0, 0, size); +} + +static struct folio_queue *iov_kunit_create_folioq(struct kunit *test) +{ + struct folio_queue *folioq; + + folioq = kzalloc(sizeof(struct folio_queue), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, folioq); + kunit_add_action_or_reset(test, iov_kunit_destroy_folioq, folioq); + folioq_init(folioq); + return folioq; +} + +/* + * Test copying to a ITER_FOLIOQ-type iterator. + */ +static void __init iov_kunit_copy_to_folioq(struct kunit *test) +{ + const struct kvec_test_range *pr; + struct iov_iter iter; + struct folio_queue *folioq; + struct page **spages, **bpages; + u8 *scratch, *buffer; + size_t bufsize, npages, size, copied; + int i, patt; + + bufsize = 0x100000; + npages = bufsize / PAGE_SIZE; + + folioq = iov_kunit_create_folioq(test); + + scratch = iov_kunit_create_buffer(test, &spages, npages); + for (i = 0; i < bufsize; i++) + scratch[i] = pattern(i); + + buffer = iov_kunit_create_buffer(test, &bpages, npages); + memset(buffer, 0, bufsize); + + iov_kunit_load_folioq(test, &iter, READ, folioq, bpages, npages); + + i = 0; + for (pr = kvec_test_ranges; pr->from >= 0; pr++) { + size = pr->to - pr->from; + KUNIT_ASSERT_LE(test, pr->to, bufsize); + + iov_iter_folio_queue(&iter, READ, folioq, 0, 0, pr->to); + iov_iter_advance(&iter, pr->from); + copied = copy_to_iter(scratch + i, size, &iter); + + KUNIT_EXPECT_EQ(test, copied, size); + KUNIT_EXPECT_EQ(test, iter.count, 0); + KUNIT_EXPECT_EQ(test, iter.iov_offset, pr->to % PAGE_SIZE); + i += size; + if (test->status == KUNIT_FAILURE) + goto stop; + } + + /* Build the expected image in the scratch buffer. */ + patt = 0; + memset(scratch, 0, bufsize); + for (pr = kvec_test_ranges; pr->from >= 0; pr++) + for (i = pr->from; i < pr->to; i++) + scratch[i] = pattern(patt++); + + /* Compare the images */ + for (i = 0; i < bufsize; i++) { + KUNIT_EXPECT_EQ_MSG(test, buffer[i], scratch[i], "at i=%x", i); + if (buffer[i] != scratch[i]) + return; + } + +stop: + KUNIT_SUCCEED(test); +} + +/* + * Test copying from a ITER_FOLIOQ-type iterator. + */ +static void __init iov_kunit_copy_from_folioq(struct kunit *test) +{ + const struct kvec_test_range *pr; + struct iov_iter iter; + struct folio_queue *folioq; + struct page **spages, **bpages; + u8 *scratch, *buffer; + size_t bufsize, npages, size, copied; + int i, j; + + bufsize = 0x100000; + npages = bufsize / PAGE_SIZE; + + folioq = iov_kunit_create_folioq(test); + + buffer = iov_kunit_create_buffer(test, &bpages, npages); + for (i = 0; i < bufsize; i++) + buffer[i] = pattern(i); + + scratch = iov_kunit_create_buffer(test, &spages, npages); + memset(scratch, 0, bufsize); + + iov_kunit_load_folioq(test, &iter, READ, folioq, bpages, npages); + + i = 0; + for (pr = kvec_test_ranges; pr->from >= 0; pr++) { + size = pr->to - pr->from; + KUNIT_ASSERT_LE(test, pr->to, bufsize); + + iov_iter_folio_queue(&iter, WRITE, folioq, 0, 0, pr->to); + iov_iter_advance(&iter, pr->from); + copied = copy_from_iter(scratch + i, size, &iter); + + KUNIT_EXPECT_EQ(test, copied, size); + KUNIT_EXPECT_EQ(test, iter.count, 0); + KUNIT_EXPECT_EQ(test, iter.iov_offset, pr->to % PAGE_SIZE); + i += size; + } + + /* Build the expected image in the main buffer. */ + i = 0; + memset(buffer, 0, bufsize); + for (pr = kvec_test_ranges; pr->from >= 0; pr++) { + for (j = pr->from; j < pr->to; j++) { + buffer[i++] = pattern(j); + if (i >= bufsize) + goto stop; + } + } +stop: + + /* Compare the images */ + for (i = 0; i < bufsize; i++) { + KUNIT_EXPECT_EQ_MSG(test, scratch[i], buffer[i], "at i=%x", i); + if (scratch[i] != buffer[i]) + return; + } + + KUNIT_SUCCEED(test); +} + static void iov_kunit_destroy_xarray(void *data) { struct xarray *xarray = data; @@ -677,6 +854,85 @@ stop: KUNIT_SUCCEED(test); } +/* + * Test the extraction of ITER_FOLIOQ-type iterators. + */ +static void __init iov_kunit_extract_pages_folioq(struct kunit *test) +{ + const struct kvec_test_range *pr; + struct folio_queue *folioq; + struct iov_iter iter; + struct page **bpages, *pagelist[8], **pages = pagelist; + ssize_t len; + size_t bufsize, size = 0, npages; + int i, from; + + bufsize = 0x100000; + npages = bufsize / PAGE_SIZE; + + folioq = iov_kunit_create_folioq(test); + + iov_kunit_create_buffer(test, &bpages, npages); + iov_kunit_load_folioq(test, &iter, READ, folioq, bpages, npages); + + for (pr = kvec_test_ranges; pr->from >= 0; pr++) { + from = pr->from; + size = pr->to - from; + KUNIT_ASSERT_LE(test, pr->to, bufsize); + + iov_iter_folio_queue(&iter, WRITE, folioq, 0, 0, pr->to); + iov_iter_advance(&iter, from); + + do { + size_t offset0 = LONG_MAX; + + for (i = 0; i < ARRAY_SIZE(pagelist); i++) + pagelist[i] = (void *)(unsigned long)0xaa55aa55aa55aa55ULL; + + len = iov_iter_extract_pages(&iter, &pages, 100 * 1024, + ARRAY_SIZE(pagelist), 0, &offset0); + KUNIT_EXPECT_GE(test, len, 0); + if (len < 0) + break; + KUNIT_EXPECT_LE(test, len, size); + KUNIT_EXPECT_EQ(test, iter.count, size - len); + if (len == 0) + break; + size -= len; + KUNIT_EXPECT_GE(test, (ssize_t)offset0, 0); + KUNIT_EXPECT_LT(test, offset0, PAGE_SIZE); + + for (i = 0; i < ARRAY_SIZE(pagelist); i++) { + struct page *p; + ssize_t part = min_t(ssize_t, len, PAGE_SIZE - offset0); + int ix; + + KUNIT_ASSERT_GE(test, part, 0); + ix = from / PAGE_SIZE; + KUNIT_ASSERT_LT(test, ix, npages); + p = bpages[ix]; + KUNIT_EXPECT_PTR_EQ(test, pagelist[i], p); + KUNIT_EXPECT_EQ(test, offset0, from % PAGE_SIZE); + from += part; + len -= part; + KUNIT_ASSERT_GE(test, len, 0); + if (len == 0) + break; + offset0 = 0; + } + + if (test->status == KUNIT_FAILURE) + goto stop; + } while (iov_iter_count(&iter) > 0); + + KUNIT_EXPECT_EQ(test, size, 0); + KUNIT_EXPECT_EQ(test, iter.count, 0); + } + +stop: + KUNIT_SUCCEED(test); +} + /* * Test the extraction of ITER_XARRAY-type iterators. */ @@ -761,10 +1017,13 @@ static struct kunit_case __refdata iov_kunit_cases[] = { KUNIT_CASE(iov_kunit_copy_from_kvec), KUNIT_CASE(iov_kunit_copy_to_bvec), KUNIT_CASE(iov_kunit_copy_from_bvec), + KUNIT_CASE(iov_kunit_copy_to_folioq), + KUNIT_CASE(iov_kunit_copy_from_folioq), KUNIT_CASE(iov_kunit_copy_to_xarray), KUNIT_CASE(iov_kunit_copy_from_xarray), KUNIT_CASE(iov_kunit_extract_pages_kvec), KUNIT_CASE(iov_kunit_extract_pages_bvec), + KUNIT_CASE(iov_kunit_extract_pages_folioq), KUNIT_CASE(iov_kunit_extract_pages_xarray), {} }; diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 7bc2220fea80..473b2646f71c 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -11,6 +11,7 @@ #include #include #include +#include /** * sg_next - return the next scatterlist entry in a list @@ -1261,6 +1262,67 @@ static ssize_t extract_kvec_to_sg(struct iov_iter *iter, return ret; } +/* + * Extract up to sg_max folios from an FOLIOQ-type iterator and add them to + * the scatterlist. The pages are not pinned. + */ +static ssize_t extract_folioq_to_sg(struct iov_iter *iter, + ssize_t maxsize, + struct sg_table *sgtable, + unsigned int sg_max, + iov_iter_extraction_t extraction_flags) +{ + const struct folio_queue *folioq = iter->folioq; + struct scatterlist *sg = sgtable->sgl + sgtable->nents; + unsigned int slot = iter->folioq_slot; + ssize_t ret = 0; + size_t offset = iter->iov_offset; + + BUG_ON(!folioq); + + if (slot >= folioq_nr_slots(folioq)) { + folioq = folioq->next; + if (WARN_ON_ONCE(!folioq)) + return 0; + slot = 0; + } + + do { + struct folio *folio = folioq_folio(folioq, slot); + size_t fsize = folioq_folio_size(folioq, slot); + + if (offset < fsize) { + size_t part = umin(maxsize - ret, fsize - offset); + + sg_set_page(sg, folio_page(folio, 0), part, offset); + sgtable->nents++; + sg++; + sg_max--; + offset += part; + ret += part; + } + + if (offset >= fsize) { + offset = 0; + slot++; + if (slot >= folioq_nr_slots(folioq)) { + if (!folioq->next) { + WARN_ON_ONCE(ret < iter->count); + break; + } + folioq = folioq->next; + slot = 0; + } + } + } while (sg_max > 0 && ret < maxsize); + + iter->folioq = folioq; + iter->folioq_slot = slot; + iter->iov_offset = offset; + iter->count -= ret; + return ret; +} + /* * Extract up to sg_max folios from an XARRAY-type iterator and add them to * the scatterlist. The pages are not pinned. @@ -1323,8 +1385,8 @@ static ssize_t extract_xarray_to_sg(struct iov_iter *iter, * addition of @sg_max elements. * * The pages referred to by UBUF- and IOVEC-type iterators are extracted and - * pinned; BVEC-, KVEC- and XARRAY-type are extracted but aren't pinned; PIPE- - * and DISCARD-type are not supported. + * pinned; BVEC-, KVEC-, FOLIOQ- and XARRAY-type are extracted but aren't + * pinned; DISCARD-type is not supported. * * No end mark is placed on the scatterlist; that's left to the caller. * @@ -1356,6 +1418,9 @@ ssize_t extract_iter_to_sg(struct iov_iter *iter, size_t maxsize, case ITER_KVEC: return extract_kvec_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); + case ITER_FOLIOQ: + return extract_folioq_to_sg(iter, maxsize, sgtable, sg_max, + extraction_flags); case ITER_XARRAY: return extract_xarray_to_sg(iter, maxsize, sgtable, sg_max, extraction_flags); -- cgit v1.2.3 From 7fcc9b53216cd87f73cc6dbb404220350ddc93b8 Mon Sep 17 00:00:00 2001 From: Luis Felipe Hernandez Date: Mon, 9 Sep 2024 21:10:34 -0400 Subject: lib/math: Add int_pow test suite Adds test suite for integer based power function which performs integer exponentiation. The test suite is designed to verify that the implementation of int_pow correctly computes the power of a given base raised to a given exponent. The tests check various scenarios and edge cases to ensure the accuracy and reliability of the exponentiation function. Updated commit with test information at commit time: Shuah Khan Signed-off-by: Luis Felipe Hernandez Reviewed-by: David Gow Signed-off-by: Shuah Khan --- lib/Kconfig.debug | 16 +++++++++++++ lib/math/Makefile | 1 + lib/math/tests/Makefile | 3 +++ lib/math/tests/int_pow_kunit.c | 52 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+) create mode 100644 lib/math/tests/Makefile create mode 100644 lib/math/tests/int_pow_kunit.c (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a30c03a66172..b5696659f027 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -3051,3 +3051,19 @@ config RUST_KERNEL_DOCTESTS endmenu # "Rust" endmenu # Kernel hacking + +config INT_POW_TEST + tristate "Integer exponentiation (int_pow) test" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + This option enables the KUnit test suite for the int_pow function, + which performs integer exponentiation. The test suite is designed to + verify that the implementation of int_pow correctly computes the power + of a given base raised to a given exponent. + + Enabling this option will include tests that check various scenarios + and edge cases to ensure the accuracy and reliability of the exponentiation + function. + + If unsure, say N diff --git a/lib/math/Makefile b/lib/math/Makefile index 91fcdb0c9efe..3c1f92a7459d 100644 --- a/lib/math/Makefile +++ b/lib/math/Makefile @@ -5,5 +5,6 @@ obj-$(CONFIG_CORDIC) += cordic.o obj-$(CONFIG_PRIME_NUMBERS) += prime_numbers.o obj-$(CONFIG_RATIONAL) += rational.o +obj-$(CONFIG_INT_POW_TEST) += tests/int_pow_kunit.o obj-$(CONFIG_TEST_DIV64) += test_div64.o obj-$(CONFIG_RATIONAL_KUNIT_TEST) += rational-test.o diff --git a/lib/math/tests/Makefile b/lib/math/tests/Makefile new file mode 100644 index 000000000000..6a169123320a --- /dev/null +++ b/lib/math/tests/Makefile @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_INT_POW_TEST) += int_pow_kunit.o diff --git a/lib/math/tests/int_pow_kunit.c b/lib/math/tests/int_pow_kunit.c new file mode 100644 index 000000000000..34b33677d458 --- /dev/null +++ b/lib/math/tests/int_pow_kunit.c @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include + +struct test_case_params { + u64 base; + unsigned int exponent; + u64 expected_result; + const char *name; +}; + +static const struct test_case_params params[] = { + { 64, 0, 1, "Power of zero" }, + { 64, 1, 64, "Power of one"}, + { 0, 5, 0, "Base zero" }, + { 1, 64, 1, "Base one" }, + { 2, 2, 4, "Two squared"}, + { 2, 3, 8, "Two cubed"}, + { 5, 5, 3125, "Five raised to the fifth power" }, + { U64_MAX, 1, U64_MAX, "Max base" }, + { 2, 63, 9223372036854775808ULL, "Large result"}, +}; + +static void get_desc(const struct test_case_params *tc, char *desc) +{ + strscpy(desc, tc->name, KUNIT_PARAM_DESC_SIZE); +} + +KUNIT_ARRAY_PARAM(int_pow, params, get_desc); + +static void int_pow_test(struct kunit *test) +{ + const struct test_case_params *tc = (const struct test_case_params *)test->param_value; + + KUNIT_EXPECT_EQ(test, tc->expected_result, int_pow(tc->base, tc->exponent)); +} + +static struct kunit_case math_int_pow_test_cases[] = { + KUNIT_CASE_PARAM(int_pow_test, int_pow_gen_params), + {} +}; + +static struct kunit_suite int_pow_test_suite = { + .name = "math-int_pow", + .test_cases = math_int_pow_test_cases, +}; + +kunit_test_suites(&int_pow_test_suite); + +MODULE_DESCRIPTION("math.int_pow KUnit test suite"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 81c6896049b6ca69a9e737656ac33b3fd96a277c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 27 Aug 2024 09:31:50 +0200 Subject: random: vDSO: don't use 64-bit atomics on 32-bit architectures Performing SMP atomic operations on u64 fails on powerpc32: CC drivers/char/random.o In file included from : drivers/char/random.c: In function 'crng_reseed': ././include/linux/compiler_types.h:510:45: error: call to '__compiletime_assert_391' declared with attribute error: Need native word sized stores/loads for atomicity. 510 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^ ././include/linux/compiler_types.h:491:25: note: in definition of macro '__compiletime_assert' 491 | prefix ## suffix(); \ | ^~~~~~ ././include/linux/compiler_types.h:510:9: note: in expansion of macro '_compiletime_assert' 510 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^~~~~~~~~~~~~~~~~~~ ././include/linux/compiler_types.h:513:9: note: in expansion of macro 'compiletime_assert' 513 | compiletime_assert(__native_word(t), \ | ^~~~~~~~~~~~~~~~~~ ./arch/powerpc/include/asm/barrier.h:74:9: note: in expansion of macro 'compiletime_assert_atomic_type' 74 | compiletime_assert_atomic_type(*p); \ | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ./include/asm-generic/barrier.h:172:55: note: in expansion of macro '__smp_store_release' 172 | #define smp_store_release(p, v) do { kcsan_release(); __smp_store_release(p, v); } while (0) | ^~~~~~~~~~~~~~~~~~~ drivers/char/random.c:286:9: note: in expansion of macro 'smp_store_release' 286 | smp_store_release(&__arch_get_k_vdso_rng_data()->generation, next_gen + 1); | ^~~~~~~~~~~~~~~~~ The kernel-side generation counter in the random driver is handled as an unsigned long, not as a u64, in base_crng and struct crng. But on the vDSO side, it needs to be an u64, not just an unsigned long, in order to support a 32-bit vDSO atop a 64-bit kernel. On kernel side, however, it is an unsigned long, hence a 32-bit value on 32-bit architectures, so just cast it to unsigned long for the smp_store_release(). A side effect is that on big endian architectures the store will be performed in the upper 32 bits. It is not an issue on its own because the vDSO site doesn't mind the value, as it only checks differences. Just make sure that the vDSO side checks the full 64 bits. For that, the local current_generation has to be u64 as well. Signed-off-by: Christophe Leroy Suggested-by: Thomas Gleixner Signed-off-by: Jason A. Donenfeld --- drivers/char/random.c | 9 ++++++++- lib/vdso/getrandom.c | 2 +- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/drivers/char/random.c b/drivers/char/random.c index 87fe61295ea1..c12f906e05d6 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -281,8 +281,15 @@ static void crng_reseed(struct work_struct *work) * former to arrive at the latter. Use smp_store_release so that this * is ordered with the write above to base_crng.generation. Pairs with * the smp_rmb() before the syscall in the vDSO code. + * + * Cast to unsigned long for 32-bit architectures, since atomic 64-bit + * operations are not supported on those architectures. This is safe + * because base_crng.generation is a 32-bit value. On big-endian + * architectures it will be stored in the upper 32 bits, but that's okay + * because the vDSO side only checks whether the value changed, without + * actually using or interpreting the value. */ - smp_store_release(&_vdso_rng_data.generation, next_gen + 1); + smp_store_release((unsigned long *)&_vdso_rng_data.generation, next_gen + 1); #endif if (!static_branch_likely(&crng_is_ready)) crng_init = CRNG_READY; diff --git a/lib/vdso/getrandom.c b/lib/vdso/getrandom.c index e1db228bc4f0..bacf19dbb6a1 100644 --- a/lib/vdso/getrandom.c +++ b/lib/vdso/getrandom.c @@ -68,8 +68,8 @@ __cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_ struct vgetrandom_state *state = opaque_state; size_t batch_len, nblocks, orig_len = len; bool in_use, have_retried = false; - unsigned long current_generation; void *orig_buffer = buffer; + u64 current_generation; u32 counter[2] = { 0 }; if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) { -- cgit v1.2.3 From 81723e3ac388271cf1fe9d9ca8f4e9c74689ea0e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 22 Aug 2024 09:13:12 +0200 Subject: random: vDSO: add missing c-getrandom-y in Makefile Same as for the gettimeofday CVDSO implementation, add c-getrandom-y to ease the inclusion of lib/vdso/getrandom.c in architectures' VDSO builds. Signed-off-by: Christophe Leroy Signed-off-by: Jason A. Donenfeld --- lib/vdso/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/vdso/Makefile b/lib/vdso/Makefile index 9f031eafc465..cedbf15f8087 100644 --- a/lib/vdso/Makefile +++ b/lib/vdso/Makefile @@ -4,6 +4,7 @@ GENERIC_VDSO_MK_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) GENERIC_VDSO_DIR := $(dir $(GENERIC_VDSO_MK_PATH)) c-gettimeofday-$(CONFIG_GENERIC_GETTIMEOFDAY) := $(addprefix $(GENERIC_VDSO_DIR), gettimeofday.c) +c-getrandom-$(CONFIG_VDSO_GETRANDOM) := $(addprefix $(GENERIC_VDSO_DIR), getrandom.c) # This cmd checks that the vdso library does not contain dynamic relocations. # It has to be called after the linking of the vdso library and requires it -- cgit v1.2.3 From b7bad082e113640fc81200ff869e5c2d7a9c29a2 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 22 Aug 2024 09:13:13 +0200 Subject: random: vDSO: avoid call to out of line memset() With the current implementation, __cvdso_getrandom_data() calls memset() on certain architectures, which is unexpected in the VDSO. Rather than providing a memset(), simply rewrite opaque data initialization to avoid memset(). Signed-off-by: Christophe Leroy Acked-by: Ard Biesheuvel Signed-off-by: Jason A. Donenfeld --- lib/vdso/getrandom.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/vdso/getrandom.c b/lib/vdso/getrandom.c index bacf19dbb6a1..1281fa3546c2 100644 --- a/lib/vdso/getrandom.c +++ b/lib/vdso/getrandom.c @@ -3,6 +3,7 @@ * Copyright (C) 2022-2024 Jason A. Donenfeld . All Rights Reserved. */ +#include #include #include #include @@ -73,11 +74,12 @@ __cvdso_getrandom_data(const struct vdso_rng_data *rng_info, void *buffer, size_ u32 counter[2] = { 0 }; if (unlikely(opaque_len == ~0UL && !buffer && !len && !flags)) { - *(struct vgetrandom_opaque_params *)opaque_state = (struct vgetrandom_opaque_params) { - .size_of_opaque_state = sizeof(*state), - .mmap_prot = PROT_READ | PROT_WRITE, - .mmap_flags = MAP_DROPPABLE | MAP_ANONYMOUS - }; + struct vgetrandom_opaque_params *params = opaque_state; + params->size_of_opaque_state = sizeof(*state); + params->mmap_prot = PROT_READ | PROT_WRITE; + params->mmap_flags = MAP_DROPPABLE | MAP_ANONYMOUS; + for (size_t i = 0; i < ARRAY_SIZE(params->reserved); ++i) + params->reserved[i] = 0; return 0; } -- cgit v1.2.3 From 7f053812dab3946cb704520b72c381f605ecdf95 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Tue, 27 Aug 2024 09:31:47 +0200 Subject: random: vDSO: minimize and simplify header includes Depending on the architecture, building a 32-bit vDSO on a 64-bit kernel is problematic when some system headers are included. Minimise the amount of headers by moving needed items, such as __{get,put}_unaligned_t, into dedicated common headers and in general use more specific headers, similar to what was done in commit 8165b57bca21 ("linux/const.h: Extract common header for vDSO") and commit 8c59ab839f52 ("lib/vdso: Enable common headers"). On some architectures this results in missing PAGE_SIZE, as was described by commit 8b3843ae3634 ("vdso/datapage: Quick fix - use asm/page-def.h for ARM64"), so define this if necessary, in the same way as done prior by commit cffaefd15a8f ("vdso: Use CONFIG_PAGE_SHIFT in vdso/datapage.h"). Removing linux/time64.h leads to missing 'struct timespec64' in x86's asm/pvclock.h. Add a forward declaration of that struct in that file. Signed-off-by: Christophe Leroy Signed-off-by: Jason A. Donenfeld --- arch/x86/include/asm/pvclock.h | 1 + include/asm-generic/unaligned.h | 11 +---------- include/vdso/helpers.h | 1 + include/vdso/unaligned.h | 15 +++++++++++++++ lib/vdso/getrandom.c | 13 ++++++++----- 5 files changed, 26 insertions(+), 15 deletions(-) create mode 100644 include/vdso/unaligned.h (limited to 'lib') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 0c92db84469d..6e4f8fae3ce9 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -5,6 +5,7 @@ #include #include +struct timespec64; /* some helper functions for xen and kvm pv clock sources */ u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); u64 pvclock_clocksource_read_nowd(struct pvclock_vcpu_time_info *src); diff --git a/include/asm-generic/unaligned.h b/include/asm-generic/unaligned.h index a84c64e5f11e..95acdd70b3b2 100644 --- a/include/asm-generic/unaligned.h +++ b/include/asm-generic/unaligned.h @@ -8,16 +8,7 @@ */ #include #include - -#define __get_unaligned_t(type, ptr) ({ \ - const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \ - __pptr->x; \ -}) - -#define __put_unaligned_t(type, val, ptr) do { \ - struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \ - __pptr->x = (val); \ -} while (0) +#include #define get_unaligned(ptr) __get_unaligned_t(typeof(*(ptr)), (ptr)) #define put_unaligned(val, ptr) __put_unaligned_t(typeof(*(ptr)), (val), (ptr)) diff --git a/include/vdso/helpers.h b/include/vdso/helpers.h index 73501149439d..3ddb03bb05cb 100644 --- a/include/vdso/helpers.h +++ b/include/vdso/helpers.h @@ -4,6 +4,7 @@ #ifndef __ASSEMBLY__ +#include #include static __always_inline u32 vdso_read_begin(const struct vdso_data *vd) diff --git a/include/vdso/unaligned.h b/include/vdso/unaligned.h new file mode 100644 index 000000000000..eee3d2a4dbe4 --- /dev/null +++ b/include/vdso/unaligned.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __VDSO_UNALIGNED_H +#define __VDSO_UNALIGNED_H + +#define __get_unaligned_t(type, ptr) ({ \ + const struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \ + __pptr->x; \ +}) + +#define __put_unaligned_t(type, val, ptr) do { \ + struct { type x; } __packed *__pptr = (typeof(__pptr))(ptr); \ + __pptr->x = (val); \ +} while (0) + +#endif /* __VDSO_UNALIGNED_H */ diff --git a/lib/vdso/getrandom.c b/lib/vdso/getrandom.c index 1281fa3546c2..938ca539aaa6 100644 --- a/lib/vdso/getrandom.c +++ b/lib/vdso/getrandom.c @@ -4,15 +4,18 @@ */ #include -#include -#include -#include +#include #include #include +#include #include -#include -#include #include +#include + +#undef PAGE_SIZE +#undef PAGE_MASK +#define PAGE_SIZE (1UL << CONFIG_PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE - 1)) #define MEMCPY_AND_ZERO_SRC(type, dst, src, len) do { \ while (len >= sizeof(type)) { \ -- cgit v1.2.3 From 42450f7a9086cf38e97c3aeeaabd229af7abbbad Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 14 Sep 2024 02:37:53 +0900 Subject: btf: move pahole check in scripts/link-vmlinux.sh to lib/Kconfig.debug When DEBUG_INFO_DWARF5 is selected, pahole 1.21+ is required to enable DEBUG_INFO_BTF. When DEBUG_INFO_DWARF4 or DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT is selected, DEBUG_INFO_BTF can be enabled without pahole installed, but a build error will occur in scripts/link-vmlinux.sh: LD .tmp_vmlinux1 BTF: .tmp_vmlinux1: pahole (pahole) is not available Failed to generate BTF for vmlinux Try to disable CONFIG_DEBUG_INFO_BTF We did not guard DEBUG_INFO_BTF by PAHOLE_VERSION when previously discussed [1]. However, commit 613fe1692377 ("kbuild: Add CONFIG_PAHOLE_VERSION") added CONFIG_PAHOLE_VERSION after all. Now several CONFIG options, as well as the combination of DEBUG_INFO_BTF and DEBUG_INFO_DWARF5, are guarded by PAHOLE_VERSION. The remaining compile-time check in scripts/link-vmlinux.sh now appears to be an awkward inconsistency. This commit adopts Nathan's original work. [1]: https://lore.kernel.org/lkml/20210111180609.713998-1-natechancellor@gmail.com/ Signed-off-by: Masahiro Yamada Reviewed-by: Alan Maguire Acked-by: Andrii Nakryiko Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240913173759.1316390-2-masahiroy@kernel.org Signed-off-by: Alexei Starovoitov --- lib/Kconfig.debug | 6 ++++-- scripts/link-vmlinux.sh | 12 ------------ 2 files changed, 4 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a30c03a66172..f627147d6c25 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -379,13 +379,15 @@ config DEBUG_INFO_BTF depends on !DEBUG_INFO_SPLIT && !DEBUG_INFO_REDUCED depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST depends on BPF_SYSCALL + depends on PAHOLE_VERSION >= 116 depends on !DEBUG_INFO_DWARF5 || PAHOLE_VERSION >= 121 # pahole uses elfutils, which does not have support for Hexagon relocations depends on !HEXAGON help Generate deduplicated BTF type information from DWARF debug info. - Turning this on expects presence of pahole tool, which will convert - DWARF type info into equivalent deduplicated BTF type info. + Turning this on requires pahole v1.16 or later (v1.21 or later to + support DWARF 5), which will convert DWARF type info into equivalent + deduplicated BTF type info. config PAHOLE_HAS_SPLIT_BTF def_bool PAHOLE_VERSION >= 119 diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 8a06907fea32..069d34112e6a 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -107,20 +107,8 @@ vmlinux_link() # ${1} - vmlinux image gen_btf() { - local pahole_ver local btf_data=${1}.btf.o - if ! [ -x "$(command -v ${PAHOLE})" ]; then - echo >&2 "BTF: ${1}: pahole (${PAHOLE}) is not available" - return 1 - fi - - pahole_ver=$(${PAHOLE} --version | sed -E 's/v([0-9]+)\.([0-9]+)/\1\2/') - if [ "${pahole_ver}" -lt "116" ]; then - echo >&2 "BTF: ${1}: pahole version $(${PAHOLE} --version) is too old, need at least v1.16" - return 1 - fi - info BTF "${btf_data}" LLVM_OBJCOPY="${OBJCOPY}" ${PAHOLE} -J ${PAHOLE_FLAGS} ${1} -- cgit v1.2.3 From 5277d130947ba8c0d54c16eed89eb97f0b6d2e5a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 14 Sep 2024 02:37:54 +0900 Subject: btf: require pahole 1.21+ for DEBUG_INFO_BTF with default DWARF version As described in commit 42d9b379e3e1 ("lib/Kconfig.debug: Allow BTF + DWARF5 with pahole 1.21+"), the combination of CONFIG_DEBUG_INFO_BTF and CONFIG_DEBUG_INFO_DWARF5 requires pahole 1.21+. GCC 11+ and Clang 14+ default to DWARF 5 when the -g flag is passed. For the same reason, the combination of CONFIG_DEBUG_INFO_BTF and CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT is also likely to require pahole 1.21+ these days. (At least, it is uncertain whether the actual requirement is pahole 1.16+ or 1.21+.) Signed-off-by: Masahiro Yamada Reviewed-by: Alan Maguire Acked-by: Andrii Nakryiko Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20240913173759.1316390-3-masahiroy@kernel.org Signed-off-by: Alexei Starovoitov --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f627147d6c25..478d9ac7ead4 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -380,7 +380,7 @@ config DEBUG_INFO_BTF depends on !GCC_PLUGIN_RANDSTRUCT || COMPILE_TEST depends on BPF_SYSCALL depends on PAHOLE_VERSION >= 116 - depends on !DEBUG_INFO_DWARF5 || PAHOLE_VERSION >= 121 + depends on DEBUG_INFO_DWARF4 || PAHOLE_VERSION >= 121 # pahole uses elfutils, which does not have support for Hexagon relocations depends on !HEXAGON help -- cgit v1.2.3 From 99185c10d5d9214d0d0c8b7866660203e344ee3b Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 6 Sep 2024 11:07:13 +0800 Subject: resource, kunit: add test case for region_intersects() Patch series "resource: Fix region_intersects() vs add_memory_driver_managed()", v3. The patchset fixes a bug of region_intersects() for systems with CXL memory. The details of the bug can be found in [1/3]. To avoid similar bugs in the future. A kunit test case for region_intersects() is added in [3/3]. [2/3] is a preparation patch for [3/3]. This patch (of 3): region_intersects() is important because it's used for /dev/mem permission checking. To avoid possible bug of region_intersects() in the future, a kunit test case for region_intersects() is added. Link: https://lkml.kernel.org/r/20240906030713.204292-1-ying.huang@intel.com Link: https://lkml.kernel.org/r/20240906030713.204292-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Dan Williams Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: Jonathan Cameron Cc: Dave Jiang Cc: Alison Schofield Cc: Vishal Verma Cc: Ira Weiny Cc: Alistair Popple Cc: Andy Shevchenko Cc: Bjorn Helgaas Cc: Baoquan He Signed-off-by: Andrew Morton --- kernel/resource.c | 20 +++++-- kernel/resource_kunit.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 1 + 3 files changed, 158 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/kernel/resource.c b/kernel/resource.c index 5e76b98c9771..4c9cb0363f9a 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1817,7 +1817,17 @@ EXPORT_SYMBOL(resource_list_free); #ifdef CONFIG_GET_FREE_REGION #define GFR_DESCENDING (1UL << 0) #define GFR_REQUEST_REGION (1UL << 1) -#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT) +#ifdef PA_SECTION_SHIFT +#define GFR_DEFAULT_ALIGN (1UL << PA_SECTION_SHIFT) +#else +#define GFR_DEFAULT_ALIGN PAGE_SIZE +#endif + +#ifdef MAX_PHYSMEM_BITS +#define MAX_PHYS_ADDR ((1ULL << MAX_PHYSMEM_BITS) - 1) +#else +#define MAX_PHYS_ADDR (-1ULL) +#endif static resource_size_t gfr_start(struct resource *base, resource_size_t size, resource_size_t align, unsigned long flags) @@ -1825,8 +1835,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size, if (flags & GFR_DESCENDING) { resource_size_t end; - end = min_t(resource_size_t, base->end, - (1ULL << MAX_PHYSMEM_BITS) - 1); + end = min_t(resource_size_t, base->end, MAX_PHYS_ADDR); return end - size + 1; } @@ -1843,8 +1852,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr, * @size did not wrap 0. */ return addr > addr - size && - addr <= min_t(resource_size_t, base->end, - (1ULL << MAX_PHYSMEM_BITS) - 1); + addr <= min_t(resource_size_t, base->end, MAX_PHYS_ADDR); } static resource_size_t gfr_next(resource_size_t addr, resource_size_t size, @@ -2005,7 +2013,7 @@ struct resource *alloc_free_mem_region(struct resource *base, return get_free_mem_region(NULL, base, size, align, name, IORES_DESC_NONE, flags); } -EXPORT_SYMBOL_NS_GPL(alloc_free_mem_region, CXL); +EXPORT_SYMBOL_GPL(alloc_free_mem_region); #endif /* CONFIG_GET_FREE_REGION */ static int __init strict_iomem(char *str) diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c index 0e509985a44a..42d2d8d20f5d 100644 --- a/kernel/resource_kunit.c +++ b/kernel/resource_kunit.c @@ -7,6 +7,8 @@ #include #include #include +#include +#include #define R0_START 0x0000 #define R0_END 0xffff @@ -137,9 +139,150 @@ static void resource_test_intersection(struct kunit *test) } while (++i < ARRAY_SIZE(results_for_intersection)); } +/* + * The test resource tree for region_intersects() test: + * + * BASE-BASE+1M-1 : Test System RAM 0 + * # hole 0 (BASE+1M-BASE+2M) + * BASE+2M-BASE+3M-1 : Test CXL Window 0 + * BASE+3M-BASE+4M-1 : Test System RAM 1 + * BASE+4M-BASE+7M-1 : Test CXL Window 1 + * BASE+4M-BASE+5M-1 : Test System RAM 2 + * BASE+4M+128K-BASE+4M+256K-1: Test Code + * BASE+5M-BASE+6M-1 : Test System RAM 3 + */ +#define RES_TEST_RAM0_OFFSET 0 +#define RES_TEST_RAM0_SIZE SZ_1M +#define RES_TEST_HOLE0_OFFSET (RES_TEST_RAM0_OFFSET + RES_TEST_RAM0_SIZE) +#define RES_TEST_HOLE0_SIZE SZ_1M +#define RES_TEST_WIN0_OFFSET (RES_TEST_HOLE0_OFFSET + RES_TEST_HOLE0_SIZE) +#define RES_TEST_WIN0_SIZE SZ_1M +#define RES_TEST_RAM1_OFFSET (RES_TEST_WIN0_OFFSET + RES_TEST_WIN0_SIZE) +#define RES_TEST_RAM1_SIZE SZ_1M +#define RES_TEST_WIN1_OFFSET (RES_TEST_RAM1_OFFSET + RES_TEST_RAM1_SIZE) +#define RES_TEST_WIN1_SIZE (SZ_1M * 3) +#define RES_TEST_RAM2_OFFSET RES_TEST_WIN1_OFFSET +#define RES_TEST_RAM2_SIZE SZ_1M +#define RES_TEST_CODE_OFFSET (RES_TEST_RAM2_OFFSET + SZ_128K) +#define RES_TEST_CODE_SIZE SZ_128K +#define RES_TEST_RAM3_OFFSET (RES_TEST_RAM2_OFFSET + RES_TEST_RAM2_SIZE) +#define RES_TEST_RAM3_SIZE SZ_1M +#define RES_TEST_TOTAL_SIZE ((RES_TEST_WIN1_OFFSET + RES_TEST_WIN1_SIZE)) + +static void remove_free_resource(void *ctx) +{ + struct resource *res = (struct resource *)ctx; + + remove_resource(res); + kfree(res); +} + +static void resource_test_request_region(struct kunit *test, struct resource *parent, + resource_size_t start, resource_size_t size, + const char *name, unsigned long flags) +{ + struct resource *res; + + res = __request_region(parent, start, size, name, flags); + KUNIT_ASSERT_NOT_NULL(test, res); + kunit_add_action_or_reset(test, remove_free_resource, res); +} + +static void resource_test_insert_resource(struct kunit *test, struct resource *parent, + resource_size_t start, resource_size_t size, + const char *name, unsigned long flags) +{ + struct resource *res; + + res = kzalloc(sizeof(*res), GFP_KERNEL); + KUNIT_ASSERT_NOT_NULL(test, res); + + res->name = name; + res->start = start; + res->end = start + size - 1; + res->flags = flags; + if (insert_resource(parent, res)) { + kfree(res); + KUNIT_FAIL_AND_ABORT(test, "Fail to insert resource %pR\n", res); + } + + kunit_add_action_or_reset(test, remove_free_resource, res); +} + +static void resource_test_region_intersects(struct kunit *test) +{ + unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + struct resource *parent; + resource_size_t start; + + /* Find an iomem_resource hole to hold test resources */ + parent = alloc_free_mem_region(&iomem_resource, RES_TEST_TOTAL_SIZE, SZ_1M, + "test resources"); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, parent); + start = parent->start; + kunit_add_action_or_reset(test, remove_free_resource, parent); + + resource_test_request_region(test, parent, start + RES_TEST_RAM0_OFFSET, + RES_TEST_RAM0_SIZE, "Test System RAM 0", flags); + resource_test_insert_resource(test, parent, start + RES_TEST_WIN0_OFFSET, + RES_TEST_WIN0_SIZE, "Test CXL Window 0", + IORESOURCE_MEM); + resource_test_request_region(test, parent, start + RES_TEST_RAM1_OFFSET, + RES_TEST_RAM1_SIZE, "Test System RAM 1", flags); + resource_test_insert_resource(test, parent, start + RES_TEST_WIN1_OFFSET, + RES_TEST_WIN1_SIZE, "Test CXL Window 1", + IORESOURCE_MEM); + resource_test_request_region(test, parent, start + RES_TEST_RAM2_OFFSET, + RES_TEST_RAM2_SIZE, "Test System RAM 2", flags); + resource_test_insert_resource(test, parent, start + RES_TEST_CODE_OFFSET, + RES_TEST_CODE_SIZE, "Test Code", flags); + resource_test_request_region(test, parent, start + RES_TEST_RAM3_OFFSET, + RES_TEST_RAM3_SIZE, "Test System RAM 3", flags); + kunit_release_action(test, remove_free_resource, parent); + + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM0_OFFSET, PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM0_OFFSET + + RES_TEST_RAM0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_DISJOINT, + region_intersects(start + RES_TEST_HOLE0_OFFSET, PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_DISJOINT, + region_intersects(start + RES_TEST_HOLE0_OFFSET + + RES_TEST_HOLE0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_MIXED, + region_intersects(start + RES_TEST_WIN0_OFFSET + + RES_TEST_WIN0_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM1_OFFSET + + RES_TEST_RAM1_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM2_OFFSET + + RES_TEST_RAM2_SIZE - PAGE_SIZE, 2 * PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_CODE_OFFSET, PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_INTERSECTS, + region_intersects(start + RES_TEST_RAM2_OFFSET, + RES_TEST_RAM2_SIZE + PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); + KUNIT_EXPECT_EQ(test, REGION_MIXED, + region_intersects(start + RES_TEST_RAM3_OFFSET, + RES_TEST_RAM3_SIZE + PAGE_SIZE, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE)); +} + static struct kunit_case resource_test_cases[] = { KUNIT_CASE(resource_test_union), KUNIT_CASE(resource_test_intersection), + KUNIT_CASE(resource_test_region_intersects), {} }; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a30c03a66172..383453cbf4af 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2616,6 +2616,7 @@ config RESOURCE_KUNIT_TEST tristate "KUnit test for resource API" if !KUNIT_ALL_TESTS depends on KUNIT default KUNIT_ALL_TESTS + select GET_FREE_REGION help This builds the resource API unit test. Tests the logic of API provided by resource.c and ioport.h. -- cgit v1.2.3 From e620799c414a035dea1208bcb51c869744931dbb Mon Sep 17 00:00:00 2001 From: I Hsin Cheng Date: Tue, 10 Sep 2024 12:35:31 +0800 Subject: list: test: fix tests for list_cut_position() Fix test for list_cut_position*() for the missing check of integer "i" after the second loop. The variable should be checked for second time to make sure both lists after the cut operation are formed as expected. Link: https://lkml.kernel.org/r/20240910043531.71343-1-richard120310@gmail.com Signed-off-by: I Hsin Cheng Cc: David Gow Signed-off-by: Andrew Morton --- lib/list-test.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'lib') diff --git a/lib/list-test.c b/lib/list-test.c index 37cbc33e9fdb..8d1d47a9fe9e 100644 --- a/lib/list-test.c +++ b/lib/list-test.c @@ -404,10 +404,13 @@ static void list_test_list_cut_position(struct kunit *test) KUNIT_EXPECT_EQ(test, i, 2); + i = 0; list_for_each(cur, &list1) { KUNIT_EXPECT_PTR_EQ(test, cur, &entries[i]); i++; } + + KUNIT_EXPECT_EQ(test, i, 1); } static void list_test_list_cut_before(struct kunit *test) @@ -432,10 +435,13 @@ static void list_test_list_cut_before(struct kunit *test) KUNIT_EXPECT_EQ(test, i, 1); + i = 0; list_for_each(cur, &list1) { KUNIT_EXPECT_PTR_EQ(test, cur, &entries[i]); i++; } + + KUNIT_EXPECT_EQ(test, i, 2); } static void list_test_list_splice(struct kunit *test) -- cgit v1.2.3 From 5e06e08939df1cafef97a8e04f4b88c2806b538a Mon Sep 17 00:00:00 2001 From: I Hsin Cheng Date: Tue, 10 Sep 2024 12:08:18 +0800 Subject: list: test: increase coverage of list_test_list_replace*() Increase the test coverage of list_test_list_replace*() by adding the checks to compare the pointer of "a_new.next" and "a_new.prev" to make sure a perfect circular doubly linked list is formed after the replacement. Link: https://lkml.kernel.org/r/20240910040818.65723-1-richard120310@gmail.com Signed-off-by: I Hsin Cheng Cc: David Gow Signed-off-by: Andrew Morton --- lib/list-test.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'lib') diff --git a/lib/list-test.c b/lib/list-test.c index 8d1d47a9fe9e..4f3dc75baec1 100644 --- a/lib/list-test.c +++ b/lib/list-test.c @@ -102,6 +102,8 @@ static void list_test_list_replace(struct kunit *test) /* now: [list] -> a_new -> b */ KUNIT_EXPECT_PTR_EQ(test, list.next, &a_new); KUNIT_EXPECT_PTR_EQ(test, b.prev, &a_new); + KUNIT_EXPECT_PTR_EQ(test, a_new.next, &b); + KUNIT_EXPECT_PTR_EQ(test, a_new.prev, &list); } static void list_test_list_replace_init(struct kunit *test) @@ -118,6 +120,8 @@ static void list_test_list_replace_init(struct kunit *test) /* now: [list] -> a_new -> b */ KUNIT_EXPECT_PTR_EQ(test, list.next, &a_new); KUNIT_EXPECT_PTR_EQ(test, b.prev, &a_new); + KUNIT_EXPECT_PTR_EQ(test, a_new.next, &b); + KUNIT_EXPECT_PTR_EQ(test, a_new.prev, &list); /* check a_old is empty (initialized) */ KUNIT_EXPECT_TRUE(test, list_empty_careful(&a_old)); -- cgit v1.2.3 From 5f5e7344322f0b0676579af054c787ed57d1c1df Mon Sep 17 00:00:00 2001 From: Kris Van Hees Date: Fri, 6 Sep 2024 10:45:03 -0400 Subject: kbuild: generate offset range data for builtin modules Create file module.builtin.ranges that can be used to find where built-in modules are located by their addresses. This will be useful for tracing tools to find what functions are for various built-in modules. The offset range data for builtin modules is generated using: - modules.builtin: associates object files with module names - vmlinux.map: provides load order of sections and offset of first member per section - vmlinux.o.map: provides offset of object file content per section - .*.cmd: build cmd file with KBUILD_MODFILE The generated data will look like: .text 00000000-00000000 = _text .text 0000baf0-0000cb10 amd_uncore .text 0009bd10-0009c8e0 iosf_mbi ... .text 00b9f080-00ba011a intel_skl_int3472_discrete .text 00ba0120-00ba03c0 intel_skl_int3472_discrete intel_skl_int3472_tps68470 .text 00ba03c0-00ba08d6 intel_skl_int3472_tps68470 ... .data 00000000-00000000 = _sdata .data 0000f020-0000f680 amd_uncore For each ELF section, it lists the offset of the first symbol. This can be used to determine the base address of the section at runtime. Next, it lists (in strict ascending order) offset ranges in that section that cover the symbols of one or more builtin modules. Multiple ranges can apply to a single module, and ranges can be shared between modules. The CONFIG_BUILTIN_MODULE_RANGES option controls whether offset range data is generated for kernel modules that are built into the kernel image. How it works: 1. The modules.builtin file is parsed to obtain a list of built-in module names and their associated object names (the .ko file that the module would be in if it were a loadable module, hereafter referred to as ). This object name can be used to identify objects in the kernel compile because any C or assembler code that ends up into a built-in module will have the option -DKBUILD_MODFILE= present in its build command, and those can be found in the ..cmd file in the kernel build tree. If an object is part of multiple modules, they will all be listed in the KBUILD_MODFILE option argument. This allows us to conclusively determine whether an object in the kernel build belong to any modules, and which. 2. The vmlinux.map is parsed next to determine the base address of each top level section so that all addresses into the section can be turned into offsets. This makes it possible to handle sections getting loaded at different addresses at system boot. We also determine an 'anchor' symbol at the beginning of each section to make it possible to calculate the true base address of a section at runtime (i.e. symbol address - symbol offset). We collect start addresses of sections that are included in the top level section. This is used when vmlinux is linked using vmlinux.o, because in that case, we need to look at the vmlinux.o linker map to know what object a symbol is found in. And finally, we process each symbol that is listed in vmlinux.map (or vmlinux.o.map) based on the following structure: vmlinux linked from vmlinux.a: vmlinux.map: -- might be same as top level section) -- built-in association known -- belongs to module(s) object belongs to ... vmlinux linked from vmlinux.o: vmlinux.map: -- might be same as top level section) vmlinux.o -- need to use vmlinux.o.map -- ignored ... vmlinux.o.map:
-- built-in association known -- belongs to module(s) object belongs to ... 3. As sections, objects, and symbols are processed, offset ranges are constructed in a straight-forward way: - If the symbol belongs to one or more built-in modules: - If we were working on the same module(s), extend the range to include this object - If we were working on another module(s), close that range, and start the new one - If the symbol does not belong to any built-in modules: - If we were working on a module(s) range, close that range Signed-off-by: Kris Van Hees Reviewed-by: Nick Alcock Reviewed-by: Alan Maguire Reviewed-by: Steven Rostedt (Google) Tested-by: Sam James Reviewed-by: Sami Tolvanen Tested-by: Sami Tolvanen Signed-off-by: Masahiro Yamada --- .gitignore | 1 + Documentation/dontdiff | 1 + Documentation/kbuild/kbuild.rst | 5 + Documentation/process/changes.rst | 7 + Makefile | 1 + lib/Kconfig.debug | 15 ++ scripts/Makefile.vmlinux | 18 ++ scripts/Makefile.vmlinux_o | 3 + scripts/generate_builtin_ranges.awk | 508 ++++++++++++++++++++++++++++++++++++ 9 files changed, 559 insertions(+) create mode 100755 scripts/generate_builtin_ranges.awk (limited to 'lib') diff --git a/.gitignore b/.gitignore index c06a3ef6d6c6..625bf59ad845 100644 --- a/.gitignore +++ b/.gitignore @@ -69,6 +69,7 @@ modules.order /Module.markers /modules.builtin /modules.builtin.modinfo +/modules.builtin.ranges /modules.nsdeps # diff --git a/Documentation/dontdiff b/Documentation/dontdiff index 3c399f132e2d..a867aea95c40 100644 --- a/Documentation/dontdiff +++ b/Documentation/dontdiff @@ -180,6 +180,7 @@ modpost modules-only.symvers modules.builtin modules.builtin.modinfo +modules.builtin.ranges modules.nsdeps modules.order modversions.h* diff --git a/Documentation/kbuild/kbuild.rst b/Documentation/kbuild/kbuild.rst index 9c8d1d046ea5..a0f7726c46f8 100644 --- a/Documentation/kbuild/kbuild.rst +++ b/Documentation/kbuild/kbuild.rst @@ -22,6 +22,11 @@ modules.builtin.modinfo This file contains modinfo from all modules that are built into the kernel. Unlike modinfo of a separate module, all fields are prefixed with module name. +modules.builtin.ranges +---------------------- +This file contains address offset ranges (per ELF section) for all modules +that are built into the kernel. Together with System.map, it can be used +to associate module names with symbols. Environment variables ===================== diff --git a/Documentation/process/changes.rst b/Documentation/process/changes.rst index 3fc63f27c226..00f1ed7c59c3 100644 --- a/Documentation/process/changes.rst +++ b/Documentation/process/changes.rst @@ -64,6 +64,7 @@ GNU tar 1.28 tar --version gtags (optional) 6.6.5 gtags --version mkimage (optional) 2017.01 mkimage --version Python (optional) 3.5.x python3 --version +GNU AWK (optional) 5.1.0 gawk --version ====================== =============== ======================================== .. [#f1] Sphinx is needed only to build the Kernel documentation @@ -192,6 +193,12 @@ platforms. The tool is available via the ``u-boot-tools`` package or can be built from the U-Boot source code. See the instructions at https://docs.u-boot.org/en/latest/build/tools.html#building-tools-for-linux +GNU AWK +------- + +GNU AWK is needed if you want kernel builds to generate address range data for +builtin modules (CONFIG_BUILTIN_MODULE_RANGES). + System utilities **************** diff --git a/Makefile b/Makefile index 145112bf281a..b4a941a30c73 100644 --- a/Makefile +++ b/Makefile @@ -1482,6 +1482,7 @@ endif # CONFIG_MODULES # Directories & files removed with 'make clean' CLEAN_FILES += vmlinux.symvers modules-only.symvers \ modules.builtin modules.builtin.modinfo modules.nsdeps \ + modules.builtin.ranges vmlinux.o.map \ compile_commands.json rust/test \ rust-project.json .vmlinux.objs .vmlinux.export.c diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a30c03a66172..5e2f30921cb2 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -571,6 +571,21 @@ config VMLINUX_MAP pieces of code get eliminated with CONFIG_LD_DEAD_CODE_DATA_ELIMINATION. +config BUILTIN_MODULE_RANGES + bool "Generate address range information for builtin modules" + depends on !LTO + depends on VMLINUX_MAP + help + When modules are built into the kernel, there will be no module name + associated with its symbols in /proc/kallsyms. Tracers may want to + identify symbols by module name and symbol name regardless of whether + the module is configured as loadable or not. + + This option generates modules.builtin.ranges in the build tree with + offset ranges (per ELF section) for the module(s) they belong to. + It also records an anchor symbol to determine the load address of the + section. + config DEBUG_FORCE_WEAK_PER_CPU bool "Force weak per-cpu definitions" depends on DEBUG_KERNEL diff --git a/scripts/Makefile.vmlinux b/scripts/Makefile.vmlinux index 5ceecbed31eb..1284f05555b9 100644 --- a/scripts/Makefile.vmlinux +++ b/scripts/Makefile.vmlinux @@ -33,6 +33,24 @@ targets += vmlinux vmlinux: scripts/link-vmlinux.sh vmlinux.o $(KBUILD_LDS) FORCE +$(call if_changed_dep,link_vmlinux) +# module.builtin.ranges +# --------------------------------------------------------------------------- +ifdef CONFIG_BUILTIN_MODULE_RANGES +__default: modules.builtin.ranges + +quiet_cmd_modules_builtin_ranges = GEN $@ + cmd_modules_builtin_ranges = gawk -f $(real-prereqs) > $@ + +targets += modules.builtin.ranges +modules.builtin.ranges: $(srctree)/scripts/generate_builtin_ranges.awk \ + modules.builtin vmlinux.map vmlinux.o.map FORCE + $(call if_changed,modules_builtin_ranges) + +vmlinux.map: vmlinux + @: + +endif + # Add FORCE to the prerequisites of a target to force it to be always rebuilt. # --------------------------------------------------------------------------- diff --git a/scripts/Makefile.vmlinux_o b/scripts/Makefile.vmlinux_o index d64070b6b4bc..0b6e2ebf60dc 100644 --- a/scripts/Makefile.vmlinux_o +++ b/scripts/Makefile.vmlinux_o @@ -45,9 +45,12 @@ objtool-args = $(vmlinux-objtool-args-y) --link # Link of vmlinux.o used for section mismatch analysis # --------------------------------------------------------------------------- +vmlinux-o-ld-args-$(CONFIG_BUILTIN_MODULE_RANGES) += -Map=$@.map + quiet_cmd_ld_vmlinux.o = LD $@ cmd_ld_vmlinux.o = \ $(LD) ${KBUILD_LDFLAGS} -r -o $@ \ + $(vmlinux-o-ld-args-y) \ $(addprefix -T , $(initcalls-lds)) \ --whole-archive vmlinux.a --no-whole-archive \ --start-group $(KBUILD_VMLINUX_LIBS) --end-group \ diff --git a/scripts/generate_builtin_ranges.awk b/scripts/generate_builtin_ranges.awk new file mode 100755 index 000000000000..b9ec761b3bef --- /dev/null +++ b/scripts/generate_builtin_ranges.awk @@ -0,0 +1,508 @@ +#!/usr/bin/gawk -f +# SPDX-License-Identifier: GPL-2.0 +# generate_builtin_ranges.awk: Generate address range data for builtin modules +# Written by Kris Van Hees +# +# Usage: generate_builtin_ranges.awk modules.builtin vmlinux.map \ +# vmlinux.o.map > modules.builtin.ranges +# + +# Return the module name(s) (if any) associated with the given object. +# +# If we have seen this object before, return information from the cache. +# Otherwise, retrieve it from the corresponding .cmd file. +# +function get_module_info(fn, mod, obj, s) { + if (fn in omod) + return omod[fn]; + + if (match(fn, /\/[^/]+$/) == 0) + return ""; + + obj = fn; + mod = ""; + fn = substr(fn, 1, RSTART) "." substr(fn, RSTART + 1) ".cmd"; + if (getline s 0) { + mod = substr(s, RSTART + 16, RLENGTH - 16); + gsub(/['"]/, "", mod); + } else if (match(s, /RUST_MODFILE=[^ ]+/) > 0) + mod = substr(s, RSTART + 13, RLENGTH - 13); + } + close(fn); + + # A single module (common case) also reflects objects that are not part + # of a module. Some of those objects have names that are also a module + # name (e.g. core). We check the associated module file name, and if + # they do not match, the object is not part of a module. + if (mod !~ / /) { + if (!(mod in mods)) + mod = ""; + } + + gsub(/([^/ ]*\/)+/, "", mod); + gsub(/-/, "_", mod); + + # At this point, mod is a single (valid) module name, or a list of + # module names (that do not need validation). + omod[obj] = mod; + + return mod; +} + +# Update the ranges entry for the given module 'mod' in section 'osect'. +# +# We use a modified absolute start address (soff + base) as index because we +# may need to insert an anchor record later that must be at the start of the +# section data, and the first module may very well start at the same address. +# So, we use (addr << 1) + 1 to allow a possible anchor record to be placed at +# (addr << 1). This is safe because the index is only used to sort the entries +# before writing them out. +# +function update_entry(osect, mod, soff, eoff, sect, idx) { + sect = sect_in[osect]; + idx = sprintf("%016x", (soff + sect_base[osect]) * 2 + 1); + entries[idx] = sprintf("%s %08x-%08x %s", sect, soff, eoff, mod); + count[sect]++; +} + +# (1) Build a lookup map of built-in module names. +# +# The first file argument is used as input (modules.builtin). +# +# Lines will be like: +# kernel/crypto/lzo-rle.ko +# and we record the object name "crypto/lzo-rle". +# +ARGIND == 1 { + sub(/kernel\//, ""); # strip off "kernel/" prefix + sub(/\.ko$/, ""); # strip off .ko suffix + + mods[$1] = 1; + next; +} + +# (2) Collect address information for each section. +# +# The second file argument is used as input (vmlinux.map). +# +# We collect the base address of the section in order to convert all addresses +# in the section into offset values. +# +# We collect the address of the anchor (or first symbol in the section if there +# is no explicit anchor) to allow users of the range data to calculate address +# ranges based on the actual load address of the section in the running kernel. +# +# We collect the start address of any sub-section (section included in the top +# level section being processed). This is needed when the final linking was +# done using vmlinux.a because then the list of objects contained in each +# section is to be obtained from vmlinux.o.map. The offset of the sub-section +# is recorded here, to be used as an addend when processing vmlinux.o.map +# later. +# + +# Both GNU ld and LLVM lld linker map format are supported by converting LLVM +# lld linker map records into equivalent GNU ld linker map records. +# +# The first record of the vmlinux.map file provides enough information to know +# which format we are dealing with. +# +ARGIND == 2 && FNR == 1 && NF == 7 && $1 == "VMA" && $7 == "Symbol" { + map_is_lld = 1; + if (dbg) + printf "NOTE: %s uses LLVM lld linker map format\n", FILENAME >"/dev/stderr"; + next; +} + +# (LLD) Convert a section record fronm lld format to ld format. +# +# lld: ffffffff82c00000 2c00000 2493c0 8192 .data +# -> +# ld: .data 0xffffffff82c00000 0x2493c0 load address 0x0000000002c00000 +# +ARGIND == 2 && map_is_lld && NF == 5 && /[0-9] [^ ]+$/ { + $0 = $5 " 0x"$1 " 0x"$3 " load address 0x"$2; +} + +# (LLD) Convert an anchor record from lld format to ld format. +# +# lld: ffffffff81000000 1000000 0 1 _text = . +# -> +# ld: 0xffffffff81000000 _text = . +# +ARGIND == 2 && map_is_lld && !anchor && NF == 7 && raw_addr == "0x"$1 && $6 == "=" && $7 == "." { + $0 = " 0x"$1 " " $5 " = ."; +} + +# (LLD) Convert an object record from lld format to ld format. +# +# lld: 11480 11480 1f07 16 vmlinux.a(arch/x86/events/amd/uncore.o):(.text) +# -> +# ld: .text 0x0000000000011480 0x1f07 arch/x86/events/amd/uncore.o +# +ARGIND == 2 && map_is_lld && NF == 5 && $5 ~ /:\(/ { + gsub(/\)/, ""); + sub(/ vmlinux\.a\(/, " "); + sub(/:\(/, " "); + $0 = " "$6 " 0x"$1 " 0x"$3 " " $5; +} + +# (LLD) Convert a symbol record from lld format to ld format. +# +# We only care about these while processing a section for which no anchor has +# been determined yet. +# +# lld: ffffffff82a859a4 2a859a4 0 1 btf_ksym_iter_id +# -> +# ld: 0xffffffff82a859a4 btf_ksym_iter_id +# +ARGIND == 2 && map_is_lld && sect && !anchor && NF == 5 && $5 ~ /^[_A-Za-z][_A-Za-z0-9]*$/ { + $0 = " 0x"$1 " " $5; +} + +# (LLD) We do not need any other ldd linker map records. +# +ARGIND == 2 && map_is_lld && /^[0-9a-f]{16} / { + next; +} + +# (LD) Section records with just the section name at the start of the line +# need to have the next line pulled in to determine whether it is a +# loadable section. If it is, the next line will contains a hex value +# as first and second items. +# +ARGIND == 2 && !map_is_lld && NF == 1 && /^[^ ]/ { + s = $0; + getline; + if ($1 !~ /^0x/ || $2 !~ /^0x/) + next; + + $0 = s " " $0; +} + +# (LD) Object records with just the section name denote records with a long +# section name for which the remainder of the record can be found on the +# next line. +# +# (This is also needed for vmlinux.o.map, when used.) +# +ARGIND >= 2 && !map_is_lld && NF == 1 && /^ [^ \*]/ { + s = $0; + getline; + $0 = s " " $0; +} + +# Beginning a new section - done with the previous one (if any). +# +ARGIND == 2 && /^[^ ]/ { + sect = 0; +} + +# Process a loadable section (we only care about .-sections). +# +# Record the section name and its base address. +# We also record the raw (non-stripped) address of the section because it can +# be used to identify an anchor record. +# +# Note: +# Since some AWK implementations cannot handle large integers, we strip off the +# first 4 hex digits from the address. This is safe because the kernel space +# is not large enough for addresses to extend into those digits. The portion +# to strip off is stored in addr_prefix as a regexp, so further clauses can +# perform a simple substitution to do the address stripping. +# +ARGIND == 2 && /^\./ { + # Explicitly ignore a few sections that are not relevant here. + if ($1 ~ /^\.orc_/ || $1 ~ /_sites$/ || $1 ~ /\.percpu/) + next; + + # Sections with a 0-address can be ignored as well. + if ($2 ~ /^0x0+$/) + next; + + raw_addr = $2; + addr_prefix = "^" substr($2, 1, 6); + base = $2; + sub(addr_prefix, "0x", base); + base = strtonum(base); + sect = $1; + anchor = 0; + sect_base[sect] = base; + sect_size[sect] = strtonum($3); + + if (dbg) + printf "[%s] BASE %016x\n", sect, base >"/dev/stderr"; + + next; +} + +# If we are not in a section we care about, we ignore the record. +# +ARGIND == 2 && !sect { + next; +} + +# Record the first anchor symbol for the current section. +# +# An anchor record for the section bears the same raw address as the section +# record. +# +ARGIND == 2 && !anchor && NF == 4 && raw_addr == $1 && $3 == "=" && $4 == "." { + anchor = sprintf("%s %08x-%08x = %s", sect, 0, 0, $2); + sect_anchor[sect] = anchor; + + if (dbg) + printf "[%s] ANCHOR %016x = %s (.)\n", sect, 0, $2 >"/dev/stderr"; + + next; +} + +# If no anchor record was found for the current section, use the first symbol +# in the section as anchor. +# +ARGIND == 2 && !anchor && NF == 2 && $1 ~ /^0x/ && $2 !~ /^0x/ { + addr = $1; + sub(addr_prefix, "0x", addr); + addr = strtonum(addr) - base; + anchor = sprintf("%s %08x-%08x = %s", sect, addr, addr, $2); + sect_anchor[sect] = anchor; + + if (dbg) + printf "[%s] ANCHOR %016x = %s\n", sect, addr, $2 >"/dev/stderr"; + + next; +} + +# The first occurrence of a section name in an object record establishes the +# addend (often 0) for that section. This information is needed to handle +# sections that get combined in the final linking of vmlinux (e.g. .head.text +# getting included at the start of .text). +# +# If the section does not have a base yet, use the base of the encapsulating +# section. +# +ARGIND == 2 && sect && NF == 4 && /^ [^ \*]/ && !($1 in sect_addend) { + if (!($1 in sect_base)) { + sect_base[$1] = base; + + if (dbg) + printf "[%s] BASE %016x\n", $1, base >"/dev/stderr"; + } + + addr = $2; + sub(addr_prefix, "0x", addr); + addr = strtonum(addr); + sect_addend[$1] = addr - sect_base[$1]; + sect_in[$1] = sect; + + if (dbg) + printf "[%s] ADDEND %016x - %016x = %016x\n", $1, addr, base, sect_addend[$1] >"/dev/stderr"; + + # If the object is vmlinux.o then we will need vmlinux.o.map to get the + # actual offsets of objects. + if ($4 == "vmlinux.o") + need_o_map = 1; +} + +# (3) Collect offset ranges (relative to the section base address) for built-in +# modules. +# +# If the final link was done using the actual objects, vmlinux.map contains all +# the information we need (see section (3a)). +# If linking was done using vmlinux.a as intermediary, we will need to process +# vmlinux.o.map (see section (3b)). + +# (3a) Determine offset range info using vmlinux.map. +# +# Since we are already processing vmlinux.map, the top level section that is +# being processed is already known. If we do not have a base address for it, +# we do not need to process records for it. +# +# Given the object name, we determine the module(s) (if any) that the current +# object is associated with. +# +# If we were already processing objects for a (list of) module(s): +# - If the current object belongs to the same module(s), update the range data +# to include the current object. +# - Otherwise, ensure that the end offset of the range is valid. +# +# If the current object does not belong to a built-in module, ignore it. +# +# If it does, we add a new built-in module offset range record. +# +ARGIND == 2 && !need_o_map && /^ [^ ]/ && NF == 4 && $3 != "0x0" { + if (!(sect in sect_base)) + next; + + # Turn the address into an offset from the section base. + soff = $2; + sub(addr_prefix, "0x", soff); + soff = strtonum(soff) - sect_base[sect]; + eoff = soff + strtonum($3); + + # Determine which (if any) built-in modules the object belongs to. + mod = get_module_info($4); + + # If we are processing a built-in module: + # - If the current object is within the same module, we update its + # entry by extending the range and move on + # - Otherwise: + # + If we are still processing within the same main section, we + # validate the end offset against the start offset of the + # current object (e.g. .rodata.str1.[18] objects are often + # listed with an incorrect size in the linker map) + # + Otherwise, we validate the end offset against the section + # size + if (mod_name) { + if (mod == mod_name) { + mod_eoff = eoff; + update_entry(mod_sect, mod_name, mod_soff, eoff); + + next; + } else if (sect == sect_in[mod_sect]) { + if (mod_eoff > soff) + update_entry(mod_sect, mod_name, mod_soff, soff); + } else { + v = sect_size[sect_in[mod_sect]]; + if (mod_eoff > v) + update_entry(mod_sect, mod_name, mod_soff, v); + } + } + + mod_name = mod; + + # If we encountered an object that is not part of a built-in module, we + # do not need to record any data. + if (!mod) + next; + + # At this point, we encountered the start of a new built-in module. + mod_name = mod; + mod_soff = soff; + mod_eoff = eoff; + mod_sect = $1; + update_entry($1, mod, soff, mod_eoff); + + next; +} + +# If we do not need to parse the vmlinux.o.map file, we are done. +# +ARGIND == 3 && !need_o_map { + if (dbg) + printf "Note: %s is not needed.\n", FILENAME >"/dev/stderr"; + exit; +} + +# (3) Collect offset ranges (relative to the section base address) for built-in +# modules. +# + +# (LLD) Convert an object record from lld format to ld format. +# +ARGIND == 3 && map_is_lld && NF == 5 && $5 ~ /:\(/ { + gsub(/\)/, ""); + sub(/:\(/, " "); + + sect = $6; + if (!(sect in sect_addend)) + next; + + sub(/ vmlinux\.a\(/, " "); + $0 = " "sect " 0x"$1 " 0x"$3 " " $5; +} + +# (3b) Determine offset range info using vmlinux.o.map. +# +# If we do not know an addend for the object's section, we are interested in +# anything within that section. +# +# Determine the top-level section that the object's section was included in +# during the final link. This is the section name offset range data will be +# associated with for this object. +# +# The remainder of the processing of the current object record follows the +# procedure outlined in (3a). +# +ARGIND == 3 && /^ [^ ]/ && NF == 4 && $3 != "0x0" { + osect = $1; + if (!(osect in sect_addend)) + next; + + # We need to work with the main section. + sect = sect_in[osect]; + + # Turn the address into an offset from the section base. + soff = $2; + sub(addr_prefix, "0x", soff); + soff = strtonum(soff) + sect_addend[osect]; + eoff = soff + strtonum($3); + + # Determine which (if any) built-in modules the object belongs to. + mod = get_module_info($4); + + # If we are processing a built-in module: + # - If the current object is within the same module, we update its + # entry by extending the range and move on + # - Otherwise: + # + If we are still processing within the same main section, we + # validate the end offset against the start offset of the + # current object (e.g. .rodata.str1.[18] objects are often + # listed with an incorrect size in the linker map) + # + Otherwise, we validate the end offset against the section + # size + if (mod_name) { + if (mod == mod_name) { + mod_eoff = eoff; + update_entry(mod_sect, mod_name, mod_soff, eoff); + + next; + } else if (sect == sect_in[mod_sect]) { + if (mod_eoff > soff) + update_entry(mod_sect, mod_name, mod_soff, soff); + } else { + v = sect_size[sect_in[mod_sect]]; + if (mod_eoff > v) + update_entry(mod_sect, mod_name, mod_soff, v); + } + } + + mod_name = mod; + + # If we encountered an object that is not part of a built-in module, we + # do not need to record any data. + if (!mod) + next; + + # At this point, we encountered the start of a new built-in module. + mod_name = mod; + mod_soff = soff; + mod_eoff = eoff; + mod_sect = osect; + update_entry(osect, mod, soff, mod_eoff); + + next; +} + +# (4) Generate the output. +# +# Anchor records are added for each section that contains offset range data +# records. They are added at an adjusted section base address (base << 1) to +# ensure they come first in the second records (see update_entry() above for +# more information). +# +# All entries are sorted by (adjusted) address to ensure that the output can be +# parsed in strict ascending address order. +# +END { + for (sect in count) { + if (sect in sect_anchor) { + idx = sprintf("%016x", sect_base[sect] * 2); + entries[idx] = sect_anchor[sect]; + } + } + + n = asorti(entries, indices); + for (i = 1; i <= n; i++) + print entries[indices[i]]; +} -- cgit v1.2.3 From 65f666c6203600053478ce8e34a1db269a8701c9 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 19 Sep 2024 10:17:09 +0800 Subject: lib/sbitmap: define swap_lock as raw_spinlock_t When called from sbitmap_queue_get(), sbitmap_deferred_clear() may be run with preempt disabled. In RT kernel, spin_lock() can sleep, then warning of "BUG: sleeping function called from invalid context" can be triggered. Fix it by replacing it with raw_spin_lock. Cc: Yang Yang Fixes: 72d04bdcf3f7 ("sbitmap: fix io hung due to race on sbitmap_word::cleared") Signed-off-by: Ming Lei Reviewed-by: Yang Yang Link: https://lore.kernel.org/r/20240919021709.511329-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- include/linux/sbitmap.h | 2 +- lib/sbitmap.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index c09cdcc99471..189140bf11fc 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -40,7 +40,7 @@ struct sbitmap_word { /** * @swap_lock: serializes simultaneous updates of ->word and ->cleared */ - spinlock_t swap_lock; + raw_spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 5e2e93307f0d..d3412984170c 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -65,7 +65,7 @@ static inline bool sbitmap_deferred_clear(struct sbitmap_word *map, { unsigned long mask, word_mask; - guard(spinlock_irqsave)(&map->swap_lock); + guard(raw_spinlock_irqsave)(&map->swap_lock); if (!map->cleared) { if (depth == 0) @@ -136,7 +136,7 @@ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, } for (i = 0; i < sb->map_nr; i++) - spin_lock_init(&sb->map[i].swap_lock); + raw_spin_lock_init(&sb->map[i].swap_lock); return 0; } -- cgit v1.2.3 From c509f67df398f985717601563db577e91e67787f Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Sun, 22 Sep 2024 08:05:07 -0700 Subject: Revert "list: test: fix tests for list_cut_position()" This reverts commit e620799c414a035dea1208bcb51c869744931dbb. The commit introduces unit test failures. Expected cur == &entries[i], but cur == 0000037fffadfd80 &entries[i] == 0000037fffadfd60 # list_test_list_cut_position: pass:0 fail:1 skip:0 total:1 not ok 21 list_test_list_cut_position # list_test_list_cut_before: EXPECTATION FAILED at lib/list-test.c:444 Expected cur == &entries[i], but cur == 0000037fffa9fd70 &entries[i] == 0000037fffa9fd60 # list_test_list_cut_before: EXPECTATION FAILED at lib/list-test.c:444 Expected cur == &entries[i], but cur == 0000037fffa9fd80 &entries[i] == 0000037fffa9fd70 Revert it. Link: https://lkml.kernel.org/r/20240922150507.553814-1-linux@roeck-us.net Fixes: e620799c414a ("list: test: fix tests for list_cut_position()") Signed-off-by: Guenter Roeck Cc: I Hsin Cheng Cc: David Gow Signed-off-by: Andrew Morton --- lib/list-test.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'lib') diff --git a/lib/list-test.c b/lib/list-test.c index 4f3dc75baec1..e207c4c98d70 100644 --- a/lib/list-test.c +++ b/lib/list-test.c @@ -408,13 +408,10 @@ static void list_test_list_cut_position(struct kunit *test) KUNIT_EXPECT_EQ(test, i, 2); - i = 0; list_for_each(cur, &list1) { KUNIT_EXPECT_PTR_EQ(test, cur, &entries[i]); i++; } - - KUNIT_EXPECT_EQ(test, i, 1); } static void list_test_list_cut_before(struct kunit *test) @@ -439,13 +436,10 @@ static void list_test_list_cut_before(struct kunit *test) KUNIT_EXPECT_EQ(test, i, 1); - i = 0; list_for_each(cur, &list1) { KUNIT_EXPECT_PTR_EQ(test, cur, &entries[i]); i++; } - - KUNIT_EXPECT_EQ(test, i, 2); } static void list_test_list_splice(struct kunit *test) -- cgit v1.2.3