Diffstat (limited to 'tools/testing')
117 files changed, 6902 insertions, 1762 deletions
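Note on the new cxl_translate test module added below: it accepts address-translation test vectors through a "table" module parameter, each vector being seven space-separated decimal values in the order dpa, pos, r_eiw, r_eig, hb_ways, math (0 = modulo, 1 = XOR), and the expected HPA offset; loading it with no parameters runs only its internal validation tests. A minimal usage sketch follows, assuming the module is built from this series and that the expected HPA offset is derived from the standard CXL power-of-2 modulo decode; the exact shell quoting needed to keep spaces inside one module parameter may vary with your tooling.

  # Load with no parameters: runs the built-in parameter and random translation checks
  modprobe cxl_translate

  # Load with an explicit vector: "dpa pos r_eiw r_eig hb_ways math expect_hpa"
  # Example (illustrative values): 2-way interleave (eiw=1), 256B granularity (eig=0),
  # modulo math; dpa offset 0 at position 1 is expected to decode to HPA offset 256.
  # Additional vectors would be comma-separated per module_param_array().
  insmod cxl_translate.ko 'table="0 1 1 0 2 0 256"'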
diff --git a/tools/testing/cxl/Kbuild b/tools/testing/cxl/Kbuild index 0d5ce4b74b9f..0e151d0572d1 100644 --- a/tools/testing/cxl/Kbuild +++ b/tools/testing/cxl/Kbuild @@ -4,13 +4,12 @@ ldflags-y += --wrap=is_acpi_device_node ldflags-y += --wrap=acpi_evaluate_integer ldflags-y += --wrap=acpi_pci_find_root ldflags-y += --wrap=nvdimm_bus_register -ldflags-y += --wrap=devm_cxl_port_enumerate_dports ldflags-y += --wrap=cxl_await_media_ready ldflags-y += --wrap=devm_cxl_add_rch_dport -ldflags-y += --wrap=cxl_rcd_component_reg_phys ldflags-y += --wrap=cxl_endpoint_parse_cdat ldflags-y += --wrap=cxl_dport_init_ras_reporting ldflags-y += --wrap=devm_cxl_endpoint_decoders_setup +ldflags-y += --wrap=hmat_get_extended_linear_cache_size DRIVERS := ../../../drivers CXL_SRC := $(DRIVERS)/cxl diff --git a/tools/testing/cxl/test/Kbuild b/tools/testing/cxl/test/Kbuild index 6b1927897856..af50972c8b6d 100644 --- a/tools/testing/cxl/test/Kbuild +++ b/tools/testing/cxl/test/Kbuild @@ -4,6 +4,7 @@ ccflags-y := -I$(srctree)/drivers/cxl/ -I$(srctree)/drivers/cxl/core obj-m += cxl_test.o obj-m += cxl_mock.o obj-m += cxl_mock_mem.o +obj-m += cxl_translate.o cxl_test-y := cxl.o cxl_mock-y := mock.o diff --git a/tools/testing/cxl/test/cxl.c b/tools/testing/cxl/test/cxl.c index 2d135ca533d0..81e2aef3627a 100644 --- a/tools/testing/cxl/test/cxl.c +++ b/tools/testing/cxl/test/cxl.c @@ -15,6 +15,7 @@ #include "mock.h" static int interleave_arithmetic; +static bool extended_linear_cache; #define FAKE_QTG_ID 42 @@ -26,6 +27,9 @@ static int interleave_arithmetic; #define NR_CXL_PORT_DECODERS 8 #define NR_BRIDGES (NR_CXL_HOST_BRIDGES + NR_CXL_SINGLE_HOST + NR_CXL_RCH) +#define MOCK_AUTO_REGION_SIZE_DEFAULT SZ_512M +static int mock_auto_region_size = MOCK_AUTO_REGION_SIZE_DEFAULT; + static struct platform_device *cxl_acpi; static struct platform_device *cxl_host_bridge[NR_CXL_HOST_BRIDGES]; #define NR_MULTI_ROOT (NR_CXL_HOST_BRIDGES * NR_CXL_ROOT_PORTS) @@ -426,6 +430,22 @@ static struct cxl_mock_res *alloc_mock_res(resource_size_t size, int align) return res; } +/* Only update CFMWS0 as this is used by the auto region. 
*/ +static void cfmws_elc_update(struct acpi_cedt_cfmws *window, int index) +{ + if (!extended_linear_cache) + return; + + if (index != 0) + return; + + /* + * The window size should be 2x of the CXL region size where half is + * DRAM and half is CXL + */ + window->window_size = mock_auto_region_size * 2; +} + static int populate_cedt(void) { struct cxl_mock_res *res; @@ -450,6 +470,7 @@ static int populate_cedt(void) for (i = cfmws_start; i <= cfmws_end; i++) { struct acpi_cedt_cfmws *window = mock_cfmws[i]; + cfmws_elc_update(window, i); res = alloc_mock_res(window->window_size, SZ_256M); if (!res) return -ENOMEM; @@ -591,6 +612,25 @@ mock_acpi_evaluate_integer(acpi_handle handle, acpi_string pathname, return AE_OK; } +static int +mock_hmat_get_extended_linear_cache_size(struct resource *backing_res, + int nid, resource_size_t *cache_size) +{ + struct acpi_cedt_cfmws *window = mock_cfmws[0]; + struct resource cfmws0_res = + DEFINE_RES_MEM(window->base_hpa, window->window_size); + + if (!extended_linear_cache || + !resource_contains(&cfmws0_res, backing_res)) { + return hmat_get_extended_linear_cache_size(backing_res, + nid, cache_size); + } + + *cache_size = mock_auto_region_size; + + return 0; +} + static struct pci_bus mock_pci_bus[NR_BRIDGES]; static struct acpi_pci_root mock_pci_root[ARRAY_SIZE(mock_pci_bus)] = { [0] = { @@ -738,7 +778,6 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) struct cxl_endpoint_decoder *cxled; struct cxl_switch_decoder *cxlsd; struct cxl_port *port, *iter; - const int size = SZ_512M; struct cxl_memdev *cxlmd; struct cxl_dport *dport; struct device *dev; @@ -781,9 +820,11 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) } base = window->base_hpa; + if (extended_linear_cache) + base += mock_auto_region_size; cxld->hpa_range = (struct range) { .start = base, - .end = base + size - 1, + .end = base + mock_auto_region_size - 1, }; cxld->interleave_ways = 2; @@ -792,7 +833,8 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) cxld->flags = CXL_DECODER_F_ENABLE; cxled->state = CXL_DECODER_STATE_AUTO; port->commit_end = cxld->id; - devm_cxl_dpa_reserve(cxled, 0, size / cxld->interleave_ways, 0); + devm_cxl_dpa_reserve(cxled, 0, + mock_auto_region_size / cxld->interleave_ways, 0); cxld->commit = mock_decoder_commit; cxld->reset = mock_decoder_reset; @@ -841,7 +883,7 @@ static void mock_init_hdm_decoder(struct cxl_decoder *cxld) cxld->interleave_granularity = 4096; cxld->hpa_range = (struct range) { .start = base, - .end = base + size - 1, + .end = base + mock_auto_region_size - 1, }; put_device(dev); } @@ -995,37 +1037,6 @@ static int get_port_array(struct cxl_port *port, return 0; } -static int mock_cxl_port_enumerate_dports(struct cxl_port *port) -{ - struct platform_device **array; - int i, array_size; - int rc; - - rc = get_port_array(port, &array, &array_size); - if (rc) - return rc; - - for (i = 0; i < array_size; i++) { - struct platform_device *pdev = array[i]; - struct cxl_dport *dport; - - if (pdev->dev.parent != port->uport_dev) { - dev_dbg(&port->dev, "%s: mismatch parent %s\n", - dev_name(port->uport_dev), - dev_name(pdev->dev.parent)); - continue; - } - - dport = devm_cxl_add_dport(port, &pdev->dev, pdev->id, - CXL_RESOURCE_NONE); - - if (IS_ERR(dport)) - return PTR_ERR(dport); - } - - return 0; -} - static struct cxl_dport *mock_cxl_add_dport_by_dev(struct cxl_port *port, struct device *dport_dev) { @@ -1114,9 +1125,10 @@ static struct cxl_mock_ops cxl_mock_ops = { .acpi_pci_find_root = mock_acpi_pci_find_root, 
.devm_cxl_switch_port_decoders_setup = mock_cxl_switch_port_decoders_setup, .devm_cxl_endpoint_decoders_setup = mock_cxl_endpoint_decoders_setup, - .devm_cxl_port_enumerate_dports = mock_cxl_port_enumerate_dports, .cxl_endpoint_parse_cdat = mock_cxl_endpoint_parse_cdat, .devm_cxl_add_dport_by_dev = mock_cxl_add_dport_by_dev, + .hmat_get_extended_linear_cache_size = + mock_hmat_get_extended_linear_cache_size, .list = LIST_HEAD_INIT(cxl_mock_ops.list), }; @@ -1606,6 +1618,8 @@ static __exit void cxl_test_exit(void) module_param(interleave_arithmetic, int, 0444); MODULE_PARM_DESC(interleave_arithmetic, "Modulo:0, XOR:1"); +module_param(extended_linear_cache, bool, 0444); +MODULE_PARM_DESC(extended_linear_cache, "Enable extended linear cache support"); module_init(cxl_test_init); module_exit(cxl_test_exit); MODULE_LICENSE("GPL v2"); diff --git a/tools/testing/cxl/test/cxl_translate.c b/tools/testing/cxl/test/cxl_translate.c new file mode 100644 index 000000000000..2200ae21795c --- /dev/null +++ b/tools/testing/cxl/test/cxl_translate.c @@ -0,0 +1,445 @@ +// SPDX-License-Identifier: GPL-2.0-only +// Copyright(c) 2025 Intel Corporation. All rights reserved. + +/* Preface all log entries with "cxl_translate" */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/moduleparam.h> +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/acpi.h> +#include <cxlmem.h> +#include <cxl.h> + +/* Maximum number of test vectors and entry length */ +#define MAX_TABLE_ENTRIES 128 +#define MAX_ENTRY_LEN 128 + +/* Expected number of parameters in each test vector */ +#define EXPECTED_PARAMS 7 + +/* Module parameters for test vectors */ +static char *table[MAX_TABLE_ENTRIES]; +static int table_num; + +/* Interleave Arithmetic */ +#define MODULO_MATH 0 +#define XOR_MATH 1 + +/* + * XOR mapping configuration + * The test data sets all use the same set of xormaps. When additional + * data sets arrive for validation, this static setup will need to + * be changed to accept xormaps as additional parameters. 
+ */ +struct cxl_cxims_data *cximsd; +static u64 xormaps[] = { + 0x2020900, + 0x4041200, + 0x1010400, + 0x800, +}; + +static int nr_maps = ARRAY_SIZE(xormaps); + +#define HBIW_TO_NR_MAPS_SIZE (CXL_DECODER_MAX_INTERLEAVE + 1) +static const int hbiw_to_nr_maps[HBIW_TO_NR_MAPS_SIZE] = { + [1] = 0, [2] = 1, [3] = 0, [4] = 2, [6] = 1, [8] = 3, [12] = 2, [16] = 4 +}; + +/** + * to_hpa - calculate an HPA offset from a DPA offset and position + * + * dpa_offset: device physical address offset + * pos: devices position in interleave + * r_eiw: region encoded interleave ways + * r_eig: region encoded interleave granularity + * hb_ways: host bridge interleave ways + * math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * + * Returns: host physical address offset + */ +static u64 to_hpa(u64 dpa_offset, int pos, u8 r_eiw, u16 r_eig, u8 hb_ways, + u8 math) +{ + u64 hpa_offset; + + /* Calculate base HPA offset from DPA and position */ + hpa_offset = cxl_calculate_hpa_offset(dpa_offset, pos, r_eiw, r_eig); + + if (math == XOR_MATH) { + cximsd->nr_maps = hbiw_to_nr_maps[hb_ways]; + if (cximsd->nr_maps) + return cxl_do_xormap_calc(cximsd, hpa_offset, hb_ways); + } + return hpa_offset; +} + +/** + * to_dpa - translate an HPA offset to DPA offset + * + * hpa_offset: host physical address offset + * r_eiw: region encoded interleave ways + * r_eig: region encoded interleave granularity + * hb_ways: host bridge interleave ways + * math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * + * Returns: device physical address offset + */ +static u64 to_dpa(u64 hpa_offset, u8 r_eiw, u16 r_eig, u8 hb_ways, u8 math) +{ + u64 offset = hpa_offset; + + if (math == XOR_MATH) { + cximsd->nr_maps = hbiw_to_nr_maps[hb_ways]; + if (cximsd->nr_maps) + offset = + cxl_do_xormap_calc(cximsd, hpa_offset, hb_ways); + } + return cxl_calculate_dpa_offset(offset, r_eiw, r_eig); +} + +/** + * to_pos - extract an interleave position from an HPA offset + * + * hpa_offset: host physical address offset + * r_eiw: region encoded interleave ways + * r_eig: region encoded interleave granularity + * hb_ways: host bridge interleave ways + * math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * + * Returns: devices position in region interleave + */ +static u64 to_pos(u64 hpa_offset, u8 r_eiw, u16 r_eig, u8 hb_ways, u8 math) +{ + u64 offset = hpa_offset; + + /* Reverse XOR mapping if specified */ + if (math == XOR_MATH) + offset = cxl_do_xormap_calc(cximsd, hpa_offset, hb_ways); + + return cxl_calculate_position(offset, r_eiw, r_eig); +} + +/** + * run_translation_test - execute forward and reverse translations + * + * @dpa: device physical address + * @pos: expected position in region interleave + * @r_eiw: region encoded interleave ways + * @r_eig: region encoded interleave granularity + * @hb_ways: host bridge interleave ways + * @math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * @expect_spa: expected system physical address + * + * Returns: 0 on success, -1 on failure + */ +static int run_translation_test(u64 dpa, int pos, u8 r_eiw, u16 r_eig, + u8 hb_ways, int math, u64 expect_hpa) +{ + u64 translated_spa, reverse_dpa; + int reverse_pos; + + /* Test Device to Host translation: DPA + POS -> SPA */ + translated_spa = to_hpa(dpa, pos, r_eiw, r_eig, hb_ways, math); + if (translated_spa != expect_hpa) { + pr_err("Device to host failed: expected HPA %llu, got %llu\n", + expect_hpa, translated_spa); + return -1; + } + + /* Test Host to Device DPA translation: SPA -> DPA */ + reverse_dpa = to_dpa(translated_spa, r_eiw, r_eig, 
hb_ways, math); + if (reverse_dpa != dpa) { + pr_err("Host to Device DPA failed: expected %llu, got %llu\n", + dpa, reverse_dpa); + return -1; + } + + /* Test Host to Device Position translation: SPA -> POS */ + reverse_pos = to_pos(translated_spa, r_eiw, r_eig, hb_ways, math); + if (reverse_pos != pos) { + pr_err("Position lookup failed: expected %d, got %d\n", pos, + reverse_pos); + return -1; + } + + return 0; +} + +/** + * parse_test_vector - parse a single test vector string + * + * entry: test vector string to parse + * dpa: device physical address + * pos: expected position in region interleave + * r_eiw: region encoded interleave ways + * r_eig: region encoded interleave granularity + * hb_ways: host bridge interleave ways + * math: interleave arithmetic (MODULO_MATH or XOR_MATH) + * expect_spa: expected system physical address + * + * Returns: 0 on success, negative error code on failure + */ +static int parse_test_vector(const char *entry, u64 *dpa, int *pos, u8 *r_eiw, + u16 *r_eig, u8 *hb_ways, int *math, + u64 *expect_hpa) +{ + unsigned int tmp_r_eiw, tmp_r_eig, tmp_hb_ways; + int parsed; + + parsed = sscanf(entry, "%llu %d %u %u %u %d %llu", dpa, pos, &tmp_r_eiw, + &tmp_r_eig, &tmp_hb_ways, math, expect_hpa); + + if (parsed != EXPECTED_PARAMS) { + pr_err("Parse error: expected %d parameters, got %d in '%s'\n", + EXPECTED_PARAMS, parsed, entry); + return -EINVAL; + } + if (tmp_r_eiw > U8_MAX || tmp_r_eig > U16_MAX || tmp_hb_ways > U8_MAX) { + pr_err("Parameter overflow in entry: '%s'\n", entry); + return -ERANGE; + } + if (*math != MODULO_MATH && *math != XOR_MATH) { + pr_err("Invalid math type %d in entry: '%s'\n", *math, entry); + return -EINVAL; + } + *r_eiw = tmp_r_eiw; + *r_eig = tmp_r_eig; + *hb_ways = tmp_hb_ways; + + return 0; +} + +/* + * setup_xor_mapping - Initialize XOR mapping data structure + * + * The test data sets all use the same HBIG so we can use one set + * of xormaps, and set the number to apply based on HBIW before + * calling cxl_do_xormap_calc(). + * + * When additional data sets arrive for validation with different + * HBIG's this static setup will need to be updated. 
+ * + * Returns: 0 on success, negative error code on failure + */ +static int setup_xor_mapping(void) +{ + if (nr_maps <= 0) + return -EINVAL; + + cximsd = kzalloc(struct_size(cximsd, xormaps, nr_maps), GFP_KERNEL); + if (!cximsd) + return -ENOMEM; + + memcpy(cximsd->xormaps, xormaps, nr_maps * sizeof(*cximsd->xormaps)); + cximsd->nr_maps = nr_maps; + + return 0; +} + +static int test_random_params(void) +{ + u8 valid_eiws[] = { 0, 1, 2, 3, 4, 8, 9, 10 }; + u16 valid_eigs[] = { 0, 1, 2, 3, 4, 5, 6 }; + int i, ways, pos, reverse_pos; + u64 dpa, hpa, reverse_dpa; + int iterations = 10000; + int failures = 0; + + for (i = 0; i < iterations; i++) { + /* Generate valid random parameters for eiw, eig, pos, dpa */ + u8 eiw = valid_eiws[get_random_u32() % ARRAY_SIZE(valid_eiws)]; + u16 eig = valid_eigs[get_random_u32() % ARRAY_SIZE(valid_eigs)]; + + eiw_to_ways(eiw, &ways); + pos = get_random_u32() % ways; + dpa = get_random_u64() >> 12; + + hpa = cxl_calculate_hpa_offset(dpa, pos, eiw, eig); + reverse_dpa = cxl_calculate_dpa_offset(hpa, eiw, eig); + reverse_pos = cxl_calculate_position(hpa, eiw, eig); + + if (reverse_dpa != dpa || reverse_pos != pos) { + pr_err("test random iter %d FAIL hpa=%llu, dpa=%llu reverse_dpa=%llu, pos=%d reverse_pos=%d eiw=%u eig=%u\n", + i, hpa, dpa, reverse_dpa, pos, reverse_pos, eiw, + eig); + + if (failures++ > 10) { + pr_err("test random too many failures, stop\n"); + break; + } + } + } + pr_info("..... test random: PASS %d FAIL %d\n", i - failures, failures); + + if (failures) + return -EINVAL; + + return 0; +} + +struct param_test { + u8 eiw; + u16 eig; + int pos; + bool expect; /* true: expect pass, false: expect fail */ + const char *desc; +}; + +static struct param_test param_tests[] = { + { 0x0, 0, 0, true, "1-way, min eig=0, pos=0" }, + { 0x0, 3, 0, true, "1-way, mid eig=3, pos=0" }, + { 0x0, 6, 0, true, "1-way, max eig=6, pos=0" }, + { 0x1, 0, 0, true, "2-way, eig=0, pos=0" }, + { 0x1, 3, 1, true, "2-way, eig=3, max pos=1" }, + { 0x1, 6, 1, true, "2-way, eig=6, max pos=1" }, + { 0x2, 0, 0, true, "4-way, eig=0, pos=0" }, + { 0x2, 3, 3, true, "4-way, eig=3, max pos=3" }, + { 0x2, 6, 3, true, "4-way, eig=6, max pos=3" }, + { 0x3, 0, 0, true, "8-way, eig=0, pos=0" }, + { 0x3, 3, 7, true, "8-way, eig=3, max pos=7" }, + { 0x3, 6, 7, true, "8-way, eig=6, max pos=7" }, + { 0x4, 0, 0, true, "16-way, eig=0, pos=0" }, + { 0x4, 3, 15, true, "16-way, eig=3, max pos=15" }, + { 0x4, 6, 15, true, "16-way, eig=6, max pos=15" }, + { 0x8, 0, 0, true, "3-way, eig=0, pos=0" }, + { 0x8, 3, 2, true, "3-way, eig=3, max pos=2" }, + { 0x8, 6, 2, true, "3-way, eig=6, max pos=2" }, + { 0x9, 0, 0, true, "6-way, eig=0, pos=0" }, + { 0x9, 3, 5, true, "6-way, eig=3, max pos=5" }, + { 0x9, 6, 5, true, "6-way, eig=6, max pos=5" }, + { 0xA, 0, 0, true, "12-way, eig=0, pos=0" }, + { 0xA, 3, 11, true, "12-way, eig=3, max pos=11" }, + { 0xA, 6, 11, true, "12-way, eig=6, max pos=11" }, + { 0x5, 0, 0, false, "invalid eiw=5" }, + { 0x7, 0, 0, false, "invalid eiw=7" }, + { 0xB, 0, 0, false, "invalid eiw=0xB" }, + { 0xFF, 0, 0, false, "invalid eiw=0xFF" }, + { 0x1, 7, 0, false, "invalid eig=7 (out of range)" }, + { 0x2, 0x10, 0, false, "invalid eig=0x10" }, + { 0x3, 0xFFFF, 0, false, "invalid eig=0xFFFF" }, + { 0x1, 0, -1, false, "pos < 0" }, + { 0x1, 0, 2, false, "2-way, pos=2 (>= ways)" }, + { 0x2, 0, 4, false, "4-way, pos=4 (>= ways)" }, + { 0x3, 0, 8, false, "8-way, pos=8 (>= ways)" }, + { 0x4, 0, 16, false, "16-way, pos=16 (>= ways)" }, + { 0x8, 0, 3, false, "3-way, pos=3 (>= ways)" }, + { 0x9, 
0, 6, false, "6-way, pos=6 (>= ways)" }, + { 0xA, 0, 12, false, "12-way, pos=12 (>= ways)" }, +}; + +static int test_cxl_validate_translation_params(void) +{ + int i, rc, failures = 0; + bool valid; + + for (i = 0; i < ARRAY_SIZE(param_tests); i++) { + struct param_test *t = ¶m_tests[i]; + + rc = cxl_validate_translation_params(t->eiw, t->eig, t->pos); + valid = (rc == 0); + + if (valid != t->expect) { + pr_err("test params failed: %s\n", t->desc); + failures++; + } + } + pr_info("..... test params: PASS %d FAIL %d\n", i - failures, failures); + + if (failures) + return -EINVAL; + + return 0; +} + +/* + * cxl_translate_init + * + * Run the internal validation tests when no params are passed. + * Otherwise, parse the parameters (test vectors), and kick off + * the translation test. + * + * Returns: 0 on success, negative error code on failure + */ +static int __init cxl_translate_init(void) +{ + int rc, i; + + /* If no tables are passed, validate module params only */ + if (table_num == 0) { + pr_info("Internal validation test start...\n"); + rc = test_cxl_validate_translation_params(); + if (rc) + return rc; + + rc = test_random_params(); + if (rc) + return rc; + + pr_info("Internal validation test completed successfully\n"); + + return 0; + } + + pr_info("CXL translate test module loaded with %d test vectors\n", + table_num); + + rc = setup_xor_mapping(); + if (rc) + return rc; + + /* Process each test vector */ + for (i = 0; i < table_num; i++) { + u64 dpa, expect_spa; + int pos, math; + u8 r_eiw, hb_ways; + u16 r_eig; + + pr_debug("Processing test vector %d: '%s'\n", i, table[i]); + + /* Parse the test vector */ + rc = parse_test_vector(table[i], &dpa, &pos, &r_eiw, &r_eig, + &hb_ways, &math, &expect_spa); + if (rc) { + pr_err("CXL Translate Test %d: FAIL\n" + " Failed to parse test vector '%s'\n", + i, table[i]); + continue; + } + /* Run the translation test */ + rc = run_translation_test(dpa, pos, r_eiw, r_eig, hb_ways, math, + expect_spa); + if (rc) { + pr_err("CXL Translate Test %d: FAIL\n" + " dpa=%llu pos=%d r_eiw=%u r_eig=%u hb_ways=%u math=%s expect_spa=%llu\n", + i, dpa, pos, r_eiw, r_eig, hb_ways, + (math == XOR_MATH) ? "XOR" : "MODULO", + expect_spa); + } else { + pr_info("CXL Translate Test %d: PASS\n", i); + } + } + + kfree(cximsd); + pr_info("CXL translate test completed\n"); + + return 0; +} + +static void __exit cxl_translate_exit(void) +{ + pr_info("CXL translate test module unloaded\n"); +} + +module_param_array(table, charp, &table_num, 0444); +MODULE_PARM_DESC(table, "Test vectors as space-separated decimal strings"); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("cxl_test: cxl address translation test module"); +MODULE_IMPORT_NS("CXL"); + +module_init(cxl_translate_init); +module_exit(cxl_translate_exit); diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index d533481672b7..176dcde570cd 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -250,22 +250,21 @@ static void mes_add_event(struct mock_event_store *mes, * Vary the number of events returned to simulate events occuring while the * logs are being read. 
*/ -static int ret_limit = 0; +static atomic_t event_counter = ATOMIC_INIT(0); static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) { struct cxl_get_event_payload *pl; struct mock_event_log *log; - u16 nr_overflow; + int ret_limit; u8 log_type; int i; if (cmd->size_in != sizeof(log_type)) return -EINVAL; - ret_limit = (ret_limit + 1) % CXL_TEST_EVENT_RET_MAX; - if (!ret_limit) - ret_limit = 1; + /* Vary return limit from 1 to CXL_TEST_EVENT_RET_MAX */ + ret_limit = (atomic_inc_return(&event_counter) % CXL_TEST_EVENT_RET_MAX) + 1; if (cmd->size_out < struct_size(pl, records, ret_limit)) return -EINVAL; @@ -299,7 +298,7 @@ static int mock_get_event(struct device *dev, struct cxl_mbox_cmd *cmd) u64 ns; pl->flags |= CXL_GET_EVENT_FLAG_OVERFLOW; - pl->overflow_err_count = cpu_to_le16(nr_overflow); + pl->overflow_err_count = cpu_to_le16(log->nr_overflow); ns = ktime_get_real_ns(); ns -= 5000000000; /* 5s ago */ pl->first_overflow_timestamp = cpu_to_le64(ns); diff --git a/tools/testing/cxl/test/mock.c b/tools/testing/cxl/test/mock.c index 995269a75cbd..44bce80ef3ff 100644 --- a/tools/testing/cxl/test/mock.c +++ b/tools/testing/cxl/test/mock.c @@ -111,6 +111,26 @@ acpi_status __wrap_acpi_evaluate_integer(acpi_handle handle, } EXPORT_SYMBOL(__wrap_acpi_evaluate_integer); +int __wrap_hmat_get_extended_linear_cache_size(struct resource *backing_res, + int nid, + resource_size_t *cache_size) +{ + int index, rc; + struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); + + if (ops) + rc = ops->hmat_get_extended_linear_cache_size(backing_res, nid, + cache_size); + else + rc = hmat_get_extended_linear_cache_size(backing_res, nid, + cache_size); + + put_cxl_mock_ops(index); + + return rc; +} +EXPORT_SYMBOL_GPL(__wrap_hmat_get_extended_linear_cache_size); + struct acpi_pci_root *__wrap_acpi_pci_find_root(acpi_handle handle) { int index; @@ -172,21 +192,6 @@ int __wrap_devm_cxl_endpoint_decoders_setup(struct cxl_port *port) } EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_endpoint_decoders_setup, "CXL"); -int __wrap_devm_cxl_port_enumerate_dports(struct cxl_port *port) -{ - int rc, index; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - - if (ops && ops->is_mock_port(port->uport_dev)) - rc = ops->devm_cxl_port_enumerate_dports(port); - else - rc = devm_cxl_port_enumerate_dports(port); - put_cxl_mock_ops(index); - - return rc; -} -EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_port_enumerate_dports, "CXL"); - int __wrap_cxl_await_media_ready(struct cxl_dev_state *cxlds) { int rc, index; @@ -226,23 +231,6 @@ struct cxl_dport *__wrap_devm_cxl_add_rch_dport(struct cxl_port *port, } EXPORT_SYMBOL_NS_GPL(__wrap_devm_cxl_add_rch_dport, "CXL"); -resource_size_t __wrap_cxl_rcd_component_reg_phys(struct device *dev, - struct cxl_dport *dport) -{ - int index; - resource_size_t component_reg_phys; - struct cxl_mock_ops *ops = get_cxl_mock_ops(&index); - - if (ops && ops->is_mock_port(dev)) - component_reg_phys = CXL_RESOURCE_NONE; - else - component_reg_phys = cxl_rcd_component_reg_phys(dev, dport); - put_cxl_mock_ops(index); - - return component_reg_phys; -} -EXPORT_SYMBOL_NS_GPL(__wrap_cxl_rcd_component_reg_phys, "CXL"); - void __wrap_cxl_endpoint_parse_cdat(struct cxl_port *port) { int index; diff --git a/tools/testing/cxl/test/mock.h b/tools/testing/cxl/test/mock.h index 4ed932e76aae..2684b89c8aa2 100644 --- a/tools/testing/cxl/test/mock.h +++ b/tools/testing/cxl/test/mock.h @@ -19,12 +19,14 @@ struct cxl_mock_ops { bool (*is_mock_bus)(struct pci_bus *bus); bool (*is_mock_port)(struct device *dev); bool 
(*is_mock_dev)(struct device *dev); - int (*devm_cxl_port_enumerate_dports)(struct cxl_port *port); int (*devm_cxl_switch_port_decoders_setup)(struct cxl_port *port); int (*devm_cxl_endpoint_decoders_setup)(struct cxl_port *port); void (*cxl_endpoint_parse_cdat)(struct cxl_port *port); struct cxl_dport *(*devm_cxl_add_dport_by_dev)(struct cxl_port *port, struct device *dport_dev); + int (*hmat_get_extended_linear_cache_size)(struct resource *backing_res, + int nid, + resource_size_t *cache_size); }; void register_cxl_mock_ops(struct cxl_mock_ops *ops); diff --git a/tools/testing/ktest/config-bisect.pl b/tools/testing/ktest/config-bisect.pl index 6fd864935319..bee7cb34a289 100755 --- a/tools/testing/ktest/config-bisect.pl +++ b/tools/testing/ktest/config-bisect.pl @@ -741,9 +741,9 @@ if ($start) { die "Can not find file $bad\n"; } if ($val eq "good") { - run_command "cp $output_config $good" or die "failed to copy $config to $good\n"; + run_command "cp $output_config $good" or die "failed to copy $output_config to $good\n"; } elsif ($val eq "bad") { - run_command "cp $output_config $bad" or die "failed to copy $config to $bad\n"; + run_command "cp $output_config $bad" or die "failed to copy $output_config to $bad\n"; } } diff --git a/tools/testing/selftests/damon/_damon_sysfs.py b/tools/testing/selftests/damon/_damon_sysfs.py index a0e6290833fb..748778b563cd 100644 --- a/tools/testing/selftests/damon/_damon_sysfs.py +++ b/tools/testing/selftests/damon/_damon_sysfs.py @@ -475,12 +475,14 @@ class Damos: class DamonTarget: pid = None + obsolete = None # todo: Support target regions if test is made idx = None context = None - def __init__(self, pid): + def __init__(self, pid, obsolete=False): self.pid = pid + self.obsolete = obsolete def sysfs_dir(self): return os.path.join( @@ -491,8 +493,13 @@ class DamonTarget: os.path.join(self.sysfs_dir(), 'regions', 'nr_regions'), '0') if err is not None: return err - return write_file( + err = write_file( os.path.join(self.sysfs_dir(), 'pid_target'), self.pid) + if err is not None: + return err + return write_file( + os.path.join(self.sysfs_dir(), 'obsolete_target'), + 'Y' if self.obsolete else 'N') class IntervalsGoal: access_bp = None diff --git a/tools/testing/selftests/damon/drgn_dump_damon_status.py b/tools/testing/selftests/damon/drgn_dump_damon_status.py index 7233369a3a44..5374d18d1fa8 100755 --- a/tools/testing/selftests/damon/drgn_dump_damon_status.py +++ b/tools/testing/selftests/damon/drgn_dump_damon_status.py @@ -73,6 +73,7 @@ def target_to_dict(target): ['pid', int], ['nr_regions', int], ['regions_list', regions_to_list], + ['obsolete', bool], ]) def targets_to_list(targets): @@ -174,11 +175,11 @@ def scheme_to_dict(scheme): ['target_nid', int], ['migrate_dests', damos_migrate_dests_to_dict], ]) - filters = [] + core_filters = [] for f in list_for_each_entry( - 'struct damos_filter', scheme.filters.address_of_(), 'list'): - filters.append(damos_filter_to_dict(f)) - dict_['filters'] = filters + 'struct damos_filter', scheme.core_filters.address_of_(), 'list'): + core_filters.append(damos_filter_to_dict(f)) + dict_['core_filters'] = core_filters ops_filters = [] for f in list_for_each_entry( 'struct damos_filter', scheme.ops_filters.address_of_(), 'list'): diff --git a/tools/testing/selftests/damon/sysfs.py b/tools/testing/selftests/damon/sysfs.py index 2666c6f0f1a5..9cca71eb0325 100755 --- a/tools/testing/selftests/damon/sysfs.py +++ b/tools/testing/selftests/damon/sysfs.py @@ -132,7 +132,7 @@ def assert_scheme_committed(scheme, dump): 
assert_watermarks_committed(scheme.watermarks, dump['wmarks']) # TODO: test filters directory for idx, f in enumerate(scheme.core_filters.filters): - assert_filter_committed(f, dump['filters'][idx]) + assert_filter_committed(f, dump['core_filters'][idx]) for idx, f in enumerate(scheme.ops_filters.filters): assert_filter_committed(f, dump['ops_filters'][idx]) @@ -164,6 +164,16 @@ def assert_monitoring_attrs_committed(attrs, dump): assert_true(dump['max_nr_regions'] == attrs.max_nr_regions, 'max_nr_regions', dump) +def assert_monitoring_target_committed(target, dump): + # target.pid is the pid "number", while dump['pid'] is 'struct pid' + # pointer, and hence cannot be compared. + assert_true(dump['obsolete'] == target.obsolete, 'target obsolete', dump) + +def assert_monitoring_targets_committed(targets, dump): + assert_true(len(targets) == len(dump), 'len_targets', dump) + for idx, target in enumerate(targets): + assert_monitoring_target_committed(target, dump[idx]) + def assert_ctx_committed(ctx, dump): ops_val = { 'vaddr': 0, @@ -172,9 +182,18 @@ def assert_ctx_committed(ctx, dump): } assert_true(dump['ops']['id'] == ops_val[ctx.ops], 'ops_id', dump) assert_monitoring_attrs_committed(ctx.monitoring_attrs, dump['attrs']) + assert_monitoring_targets_committed(ctx.targets, dump['adaptive_targets']) assert_schemes_committed(ctx.schemes, dump['schemes']) -def assert_ctxs_committed(ctxs, dump): +def assert_ctxs_committed(kdamonds): + status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) + if err is not None: + print(err) + kdamonds.stop() + exit(1) + + ctxs = kdamonds.kdamonds[0].contexts + dump = status['contexts'] assert_true(len(ctxs) == len(dump), 'ctxs length', dump) for idx, ctx in enumerate(ctxs): assert_ctx_committed(ctx, dump[idx]) @@ -191,13 +210,7 @@ def main(): print('kdamond start failed: %s' % err) exit(1) - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - kdamonds.stop() - exit(1) - - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + assert_ctxs_committed(kdamonds) context = _damon_sysfs.DamonCtx( monitoring_attrs=_damon_sysfs.DamonAttrs( @@ -245,12 +258,7 @@ def main(): kdamonds.kdamonds[0].contexts = [context] kdamonds.kdamonds[0].commit() - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - exit(1) - - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + assert_ctxs_committed(kdamonds) # test online commitment of minimum context. context = _damon_sysfs.DamonCtx() @@ -259,13 +267,36 @@ def main(): kdamonds.kdamonds[0].contexts = [context] kdamonds.kdamonds[0].commit() - status, err = dump_damon_status_dict(kdamonds.kdamonds[0].pid) - if err is not None: - print(err) - exit(1) + assert_ctxs_committed(kdamonds) - assert_ctxs_committed(kdamonds.kdamonds[0].contexts, status['contexts']) + kdamonds.stop() + # test obsolete_target. 
+ proc1 = subprocess.Popen(['sh'], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + proc2 = subprocess.Popen(['sh'], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + proc3 = subprocess.Popen(['sh'], stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + kdamonds = _damon_sysfs.Kdamonds( + [_damon_sysfs.Kdamond( + contexts=[_damon_sysfs.DamonCtx( + ops='vaddr', + targets=[ + _damon_sysfs.DamonTarget(pid=proc1.pid), + _damon_sysfs.DamonTarget(pid=proc2.pid), + _damon_sysfs.DamonTarget(pid=proc3.pid), + ], + schemes=[_damon_sysfs.Damos()], + )])]) + err = kdamonds.start() + if err is not None: + print('kdamond start failed: %s' % err) + exit(1) + kdamonds.kdamonds[0].contexts[0].targets[1].obsolete = True + kdamonds.kdamonds[0].commit() + del kdamonds.kdamonds[0].contexts[0].targets[1] + assert_ctxs_committed(kdamonds) kdamonds.stop() if __name__ == '__main__': diff --git a/tools/testing/selftests/ftrace/ftracetest b/tools/testing/selftests/ftrace/ftracetest index cce72f8b03dc..3230bd54dba8 100755 --- a/tools/testing/selftests/ftrace/ftracetest +++ b/tools/testing/selftests/ftrace/ftracetest @@ -22,6 +22,7 @@ echo " --fail-unresolved Treat UNRESOLVED as a failure" echo " -d|--debug Debug mode (trace all shell commands)" echo " -l|--logdir <dir> Save logs on the <dir>" echo " If <dir> is -, all logs output in console only" +echo " --rv Run RV selftests instead of ftrace ones" exit $1 } @@ -133,6 +134,10 @@ parse_opts() { # opts LINK_PTR= shift 2 ;; + --rv) + RV_TEST=1 + shift 1 + ;; *.tc) if [ -f "$1" ]; then OPT_TEST_CASES="$OPT_TEST_CASES `abspath $1`" @@ -152,9 +157,13 @@ parse_opts() { # opts ;; esac done - if [ ! -z "$OPT_TEST_CASES" ]; then + if [ -n "$OPT_TEST_CASES" ]; then TEST_CASES=$OPT_TEST_CASES fi + if [ -n "$OPT_TEST_DIR" -a -f "$OPT_TEST_DIR"/test.d/functions ]; then + TOP_DIR=$OPT_TEST_DIR + TEST_DIR=$TOP_DIR/test.d + fi } # Parameters @@ -190,10 +199,6 @@ fi TOP_DIR=`absdir $0` TEST_DIR=$TOP_DIR/test.d TEST_CASES=`find_testcases $TEST_DIR` -LOG_TOP_DIR=$TOP_DIR/logs -LOG_DATE=`date +%Y%m%d-%H%M%S` -LOG_DIR=$LOG_TOP_DIR/$LOG_DATE/ -LINK_PTR=$LOG_TOP_DIR/latest KEEP_LOG=0 KTAP=0 DEBUG=0 @@ -201,14 +206,23 @@ VERBOSE=0 UNSUPPORTED_RESULT=0 UNRESOLVED_RESULT=0 STOP_FAILURE=0 +RV_TEST=0 # Parse command-line options parse_opts $* +LOG_TOP_DIR=$TOP_DIR/logs +LOG_DATE=`date +%Y%m%d-%H%M%S` +LOG_DIR=$LOG_TOP_DIR/$LOG_DATE/ +LINK_PTR=$LOG_TOP_DIR/latest + [ $DEBUG -ne 0 ] && set -x -# Verify parameters -if [ -z "$TRACING_DIR" -o ! -d "$TRACING_DIR" ]; then - errexit "No ftrace directory found" +if [ $RV_TEST -ne 0 ]; then + TRACING_DIR=$TRACING_DIR/rv + if [ ! -d "$TRACING_DIR" ]; then + err_ret=$err_skip + errexit "rv is not configured in this kernel" + fi fi # Preparing logs @@ -419,7 +433,7 @@ trap 'SIG_RESULT=$XFAIL' $SIG_XFAIL __run_test() { # testfile # setup PID and PPID, $$ is not updated. (cd $TRACING_DIR; read PID _ < /proc/self/stat; set -e; set -x; - checkreq $1; initialize_ftrace; . $1) + checkreq $1; initialize_system; . $1) [ $? 
-ne 0 ] && kill -s $SIG_FAIL $SIG_PID } @@ -496,7 +510,7 @@ for t in $TEST_CASES; do exit 1 fi done -(cd $TRACING_DIR; finish_ftrace) # for cleanup +(cd $TRACING_DIR; finish_system) # for cleanup prlog "" prlog "# of passed: " `echo $PASSED_CASES | wc -w` diff --git a/tools/testing/selftests/ftrace/test.d/00basic/mount_options.tc b/tools/testing/selftests/ftrace/test.d/00basic/mount_options.tc index 8a7ce647a60d..318939451caf 100644 --- a/tools/testing/selftests/ftrace/test.d/00basic/mount_options.tc +++ b/tools/testing/selftests/ftrace/test.d/00basic/mount_options.tc @@ -28,7 +28,7 @@ unmount_tracefs() { local mount_point="$1" # Need to make sure the mount isn't busy so that we can umount it - (cd $mount_point; finish_ftrace;) + (cd $mount_point; finish_system;) cleanup } diff --git a/tools/testing/selftests/ftrace/test.d/functions b/tools/testing/selftests/ftrace/test.d/functions index a1052bf460fc..e8e718139294 100644 --- a/tools/testing/selftests/ftrace/test.d/functions +++ b/tools/testing/selftests/ftrace/test.d/functions @@ -104,7 +104,7 @@ clear_dynamic_events() { # reset all current dynamic events done } -initialize_ftrace() { # Reset ftrace to initial-state +initialize_system() { # Reset ftrace to initial-state # As the initial state, ftrace will be set to nop tracer, # no events, no triggers, no filters, no function filters, # no probes, and tracing on. @@ -134,8 +134,8 @@ initialize_ftrace() { # Reset ftrace to initial-state enable_tracing } -finish_ftrace() { - initialize_ftrace +finish_system() { + initialize_system # And recover it to default. [ -f options/pause-on-trace ] && echo 0 > options/pause-on-trace } diff --git a/tools/testing/selftests/hid/tests/test_tablet.py b/tools/testing/selftests/hid/tests/test_tablet.py index 50d5699812bb..5b9abb616db4 100644 --- a/tools/testing/selftests/hid/tests/test_tablet.py +++ b/tools/testing/selftests/hid/tests/test_tablet.py @@ -452,6 +452,7 @@ class Pen(object): def __init__(self, x, y): self.x = x self.y = y + self.distance = -10 self.tipswitch = False self.tippressure = 15 self.azimuth = 0 @@ -473,6 +474,7 @@ class Pen(object): for i in [ "x", "y", + "distance", "tippressure", "azimuth", "width", @@ -554,6 +556,7 @@ class PenDigitizer(base.UHIDTestDevice): pen.tipswitch = False pen.tippressure = 0 pen.azimuth = 0 + pen.distance = 0 pen.inrange = False pen.width = 0 pen.height = 0 @@ -868,6 +871,29 @@ class BaseTest: state machine.""" self._test_states(state_list, scribble, allow_intermediate_states=True) + @pytest.mark.skip_if_uhdev( + lambda uhdev: "Z" not in uhdev.fields, + "Device not compatible, missing Z usage", + ) + @pytest.mark.parametrize("scribble", [True, False], ids=["scribble", "static"]) + @pytest.mark.parametrize( + "state_list", + [pytest.param(v, id=k) for k, v in PenState.legal_transitions().items()], + ) + def test_z_reported_as_distance(self, state_list, scribble): + """Verify stylus Z values are reported as ABS_DISTANCE.""" + self._test_states(state_list, scribble, allow_intermediate_states=False) + + uhdev = self.uhdev + evdev = uhdev.get_evdev() + p = Pen(0, 0) + uhdev.move_to(p, PenState.PEN_IS_OUT_OF_RANGE, None) + p = Pen(100, 200) + uhdev.move_to(p, PenState.PEN_IS_IN_RANGE, None) + p.distance = -1 + events = self.post(uhdev, p, None) + assert evdev.value[libevdev.EV_ABS.ABS_DISTANCE] == -1 + class GXTP_pen(PenDigitizer): def event(self, pen, test_button): @@ -1292,6 +1318,35 @@ class Huion_Kamvas_Pro_19_256c_006b(PenDigitizer): return rs +class Wacom_2d1f_014b(PenDigitizer): + """ + Pen that reports 
distance values with HID_GD_Z usage. + """ + def __init__( + self, + name, + rdesc_str=None, + rdesc=None, + application="Pen", + physical="Stylus", + input_info=(BusType.USB, 0x2D1F, 0x014B), + evdev_name_suffix=None, + ): + super().__init__( + name, rdesc_str, rdesc, application, physical, input_info, evdev_name_suffix + ) + + def match_evdev_rule(self, application, evdev): + # there are 2 nodes created by the device, only one matters + return evdev.name.endswith("Stylus") + + def event(self, pen, test_button): + # this device reports the distance through Z + pen.z = pen.distance + + return super().event(pen, test_button) + + ################################################################################ # # Windows 7 compatible devices @@ -1504,3 +1559,19 @@ class TestHuion_Kamvas_Pro_19_256c_006b(BaseTest.TestTablet): rdesc="05 0d 09 02 a1 01 85 0a 09 20 a1 01 09 42 09 44 09 43 09 3c 09 45 15 00 25 01 75 01 95 06 81 02 09 32 75 01 95 01 81 02 81 03 05 01 09 30 09 31 55 0d 65 33 26 ff 7f 35 00 46 00 08 75 10 95 02 81 02 05 0d 09 30 26 ff 3f 75 10 95 01 81 02 09 3d 09 3e 15 a6 25 5a 75 08 95 02 81 02 c0 c0 05 0d 09 04 a1 01 85 04 09 22 a1 02 05 0d 95 01 75 06 09 51 15 00 25 3f 81 02 09 42 25 01 75 01 95 01 81 02 75 01 95 01 81 03 05 01 75 10 55 0e 65 11 09 30 26 ff 7f 35 00 46 15 0c 81 42 09 31 26 ff 7f 46 cb 06 81 42 05 0d 09 30 26 ff 1f 75 10 95 01 81 02 c0 05 0d 09 22 a1 02 05 0d 95 01 75 06 09 51 15 00 25 3f 81 02 09 42 25 01 75 01 95 01 81 02 75 01 95 01 81 03 05 01 75 10 55 0e 65 11 09 30 26 ff 7f 35 00 46 15 0c 81 42 09 31 26 ff 7f 46 cb 06 81 42 05 0d 09 30 26 ff 1f 75 10 95 01 81 02 c0 05 0d 09 56 55 00 65 00 27 ff ff ff 7f 95 01 75 20 81 02 09 54 25 7f 95 01 75 08 81 02 75 08 95 08 81 03 85 05 09 55 25 0a 75 08 95 01 b1 02 06 00 ff 09 c5 85 06 15 00 26 ff 00 75 08 96 00 01 b1 02 c0", input_info=(BusType.USB, 0x256C, 0x006B), ) + + +################################################################################ +# +# Devices Reporting Distance +# +################################################################################ + + +class TestWacom_2d1f_014b(BaseTest.TestTablet): + def create_device(self): + return Wacom_2d1f_014b( + "uhid test Wacom 2d1f_014b", + rdesc="05 0d 09 02 a1 01 85 02 09 20 a1 00 09 42 09 44 09 45 09 3c 09 5a 09 32 15 00 25 01 75 01 95 06 81 02 95 02 81 03 05 01 09 30 26 88 3e 46 88 3e 65 11 55 0d 75 10 95 01 81 02 09 31 26 60 53 46 60 53 81 02 05 0d 09 30 26 ff 0f 45 00 65 00 55 00 81 02 06 00 ff 09 04 75 08 26 ff 00 46 ff 00 65 11 55 0e 81 02 05 0d 09 3d 75 10 16 d8 dc 26 28 23 36 d8 dc 46 28 23 65 14 81 02 09 3e 81 02 05 01 09 32 16 01 ff 25 00 36 01 ff 45 00 65 11 81 02 05 0d 09 56 15 00 27 ff ff 00 00 35 00 47 ff ff 00 00 66 01 10 55 0c 81 02 45 00 65 00 55 00 c0 09 00 75 08 26 ff 00 b1 12 85 03 09 00 95 12 b1 12 85 05 09 00 95 04 b1 02 85 06 09 00 95 24 b1 02 85 16 09 00 15 00 26 ff 00 95 06 b1 02 85 17 09 00 95 0c b1 02 85 19 09 00 95 01 b1 02 85 0a 09 00 75 08 95 01 15 10 26 ff 00 b1 02 85 1e 09 00 95 10 b1 02 c0 06 00 ff 09 00 a1 01 85 09 05 0d 09 20 a1 00 09 00 15 00 26 ff 00 75 08 95 10 81 02 c0 09 00 95 03 b1 12 c0 06 00 ff 09 02 a1 01 85 07 09 00 96 09 01 b1 02 85 08 09 00 95 03 81 02 09 00 b1 02 85 0e 09 00 96 0a 01 b1 02 c0 05 0d 09 02 a1 01 85 1a 09 20 a1 00 09 42 09 44 09 45 09 3c 09 5a 09 32 15 00 25 01 75 01 95 06 81 02 09 38 25 03 75 02 95 01 81 02 05 01 09 30 26 88 3e 46 88 3e 65 11 55 0d 75 10 95 01 81 02 09 31 26 60 53 46 60 53 81 02 05 0d 09 30 26 ff 0f 46 b0 0f 66 11 e1 55 02 81 02 06 00 ff 09 04 75 08 26 ff 00 46 ff 00 65 11 
55 0e 81 02 05 0d 09 3d 75 10 16 d8 dc 26 28 23 36 d8 dc 46 28 23 65 14 81 02 09 3e 81 02 05 01 09 32 16 01 ff 25 00 36 01 ff 45 00 65 11 81 02 05 0d 09 56 15 00 27 ff ff 00 00 35 00 47 ff ff 00 00 66 01 10 55 0c 81 02 45 00 65 00 55 00 c0 c0 06 00 ff 09 00 a1 01 85 1b 05 0d 09 20 a1 00 09 00 26 ff 00 75 08 95 10 81 02 c0 c0", + input_info=(BusType.USB, 0x2D1F, 0x014B), + ) diff --git a/tools/testing/selftests/iommu/iommufd.c b/tools/testing/selftests/iommu/iommufd.c index bb4d33dde3c8..10e051b6f592 100644 --- a/tools/testing/selftests/iommu/iommufd.c +++ b/tools/testing/selftests/iommu/iommufd.c @@ -13,9 +13,6 @@ static unsigned long HUGEPAGE_SIZE; -#define MOCK_PAGE_SIZE (PAGE_SIZE / 2) -#define MOCK_HUGE_PAGE_SIZE (512 * MOCK_PAGE_SIZE) - static unsigned long get_huge_page_size(void) { char buf[80]; @@ -1574,6 +1571,49 @@ TEST_F(iommufd_ioas, copy_sweep) test_ioctl_destroy(dst_ioas_id); } +TEST_F(iommufd_ioas, dmabuf_simple) +{ + size_t buf_size = PAGE_SIZE*4; + __u64 iova; + int dfd; + + test_cmd_get_dmabuf(buf_size, &dfd); + test_err_ioctl_ioas_map_file(EINVAL, dfd, 0, 0, &iova); + test_err_ioctl_ioas_map_file(EINVAL, dfd, buf_size, buf_size, &iova); + test_err_ioctl_ioas_map_file(EINVAL, dfd, 0, buf_size + 1, &iova); + test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova); + + close(dfd); +} + +TEST_F(iommufd_ioas, dmabuf_revoke) +{ + size_t buf_size = PAGE_SIZE*4; + __u32 hwpt_id; + __u64 iova; + __u64 iova2; + int dfd; + + test_cmd_get_dmabuf(buf_size, &dfd); + test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova); + test_cmd_revoke_dmabuf(dfd, true); + + if (variant->mock_domains) + test_cmd_hwpt_alloc(self->device_id, self->ioas_id, 0, + &hwpt_id); + + test_err_ioctl_ioas_map_file(ENODEV, dfd, 0, buf_size, &iova2); + + test_cmd_revoke_dmabuf(dfd, false); + test_ioctl_ioas_map_file(dfd, 0, buf_size, &iova2); + + /* Restore the iova back */ + test_ioctl_ioas_unmap(iova, buf_size); + test_ioctl_ioas_map_fixed_file(dfd, 0, buf_size, iova); + + close(dfd); +} + FIXTURE(iommufd_mock_domain) { int fd; @@ -2058,6 +2098,12 @@ FIXTURE_VARIANT(iommufd_dirty_tracking) FIXTURE_SETUP(iommufd_dirty_tracking) { + struct iommu_option cmd = { + .size = sizeof(cmd), + .option_id = IOMMU_OPTION_HUGE_PAGES, + .op = IOMMU_OPTION_OP_SET, + .val64 = 0, + }; size_t mmap_buffer_size; unsigned long size; int mmap_flags; @@ -2066,7 +2112,7 @@ FIXTURE_SETUP(iommufd_dirty_tracking) if (variant->buffer_size < MOCK_PAGE_SIZE) { SKIP(return, - "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%lu", + "Skipping buffer_size=%lu, less than MOCK_PAGE_SIZE=%u", variant->buffer_size, MOCK_PAGE_SIZE); } @@ -2114,16 +2160,18 @@ FIXTURE_SETUP(iommufd_dirty_tracking) assert((uintptr_t)self->bitmap % PAGE_SIZE == 0); test_ioctl_ioas_alloc(&self->ioas_id); - /* Enable 1M mock IOMMU hugepages */ - if (variant->hugepages) { - test_cmd_mock_domain_flags(self->ioas_id, - MOCK_FLAGS_DEVICE_HUGE_IOVA, - &self->stdev_id, &self->hwpt_id, - &self->idev_id); - } else { - test_cmd_mock_domain(self->ioas_id, &self->stdev_id, - &self->hwpt_id, &self->idev_id); - } + + /* + * For dirty testing it is important that the page size fed into + * the iommu page tables matches the size the dirty logic + * expects, or set_dirty can touch too much stuff. 
+ */ + cmd.object_id = self->ioas_id; + if (!variant->hugepages) + ASSERT_EQ(0, ioctl(self->fd, IOMMU_OPTION, &cmd)); + + test_cmd_mock_domain(self->ioas_id, &self->stdev_id, &self->hwpt_id, + &self->idev_id); } FIXTURE_TEARDOWN(iommufd_dirty_tracking) @@ -2248,18 +2296,23 @@ TEST_F(iommufd_dirty_tracking, device_dirty_capability) TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) { uint32_t page_size = MOCK_PAGE_SIZE; + uint32_t ioas_id = self->ioas_id; uint32_t hwpt_id; - uint32_t ioas_id; if (variant->hugepages) page_size = MOCK_HUGE_PAGE_SIZE; - test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); - test_cmd_hwpt_alloc(self->idev_id, ioas_id, - IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + if (variant->hugepages) + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_HUGE, &hwpt_id); + else + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_DEFAULT, &hwpt_id); test_cmd_set_dirty_tracking(hwpt_id, true); @@ -2285,18 +2338,24 @@ TEST_F(iommufd_dirty_tracking, get_dirty_bitmap) TEST_F(iommufd_dirty_tracking, get_dirty_bitmap_no_clear) { uint32_t page_size = MOCK_PAGE_SIZE; + uint32_t ioas_id = self->ioas_id; uint32_t hwpt_id; - uint32_t ioas_id; if (variant->hugepages) page_size = MOCK_HUGE_PAGE_SIZE; - test_ioctl_ioas_alloc(&ioas_id); test_ioctl_ioas_map_fixed_id(ioas_id, self->buffer, variant->buffer_size, MOCK_APERTURE_START); - test_cmd_hwpt_alloc(self->idev_id, ioas_id, - IOMMU_HWPT_ALLOC_DIRTY_TRACKING, &hwpt_id); + + if (variant->hugepages) + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_HUGE, &hwpt_id); + else + test_cmd_hwpt_alloc_iommupt(self->idev_id, ioas_id, + IOMMU_HWPT_ALLOC_DIRTY_TRACKING, + MOCK_IOMMUPT_DEFAULT, &hwpt_id); test_cmd_set_dirty_tracking(hwpt_id, true); diff --git a/tools/testing/selftests/iommu/iommufd_utils.h b/tools/testing/selftests/iommu/iommufd_utils.h index 9f472c20c190..0a0ff6f7926d 100644 --- a/tools/testing/selftests/iommu/iommufd_utils.h +++ b/tools/testing/selftests/iommu/iommufd_utils.h @@ -215,6 +215,18 @@ static int _test_cmd_hwpt_alloc(int fd, __u32 device_id, __u32 pt_id, __u32 ft_i ASSERT_EQ(0, _test_cmd_hwpt_alloc(self->fd, device_id, pt_id, 0, flags, \ hwpt_id, IOMMU_HWPT_DATA_NONE, NULL, \ 0)) +#define test_cmd_hwpt_alloc_iommupt(device_id, pt_id, flags, iommupt_type, \ + hwpt_id) \ + ({ \ + struct iommu_hwpt_selftest user_cfg = { \ + .pagetable_type = iommupt_type \ + }; \ + \ + ASSERT_EQ(0, _test_cmd_hwpt_alloc( \ + self->fd, device_id, pt_id, 0, flags, \ + hwpt_id, IOMMU_HWPT_DATA_SELFTEST, \ + &user_cfg, sizeof(user_cfg))); \ + }) #define test_err_hwpt_alloc(_errno, device_id, pt_id, flags, hwpt_id) \ EXPECT_ERRNO(_errno, _test_cmd_hwpt_alloc( \ self->fd, device_id, pt_id, 0, flags, \ @@ -548,6 +560,39 @@ static int _test_cmd_destroy_access_pages(int fd, unsigned int access_id, EXPECT_ERRNO(_errno, _test_cmd_destroy_access_pages( \ self->fd, access_id, access_pages_id)) +static int _test_cmd_get_dmabuf(int fd, size_t len, int *out_fd) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_DMABUF_GET, + .dmabuf_get = { .length = len, .open_flags = O_CLOEXEC }, + }; + + *out_fd = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (*out_fd < 0) + return -1; + return 0; +} +#define test_cmd_get_dmabuf(len, out_fd) \ + ASSERT_EQ(0, _test_cmd_get_dmabuf(self->fd, len, out_fd)) + +static int 
_test_cmd_revoke_dmabuf(int fd, int dmabuf_fd, bool revoked) +{ + struct iommu_test_cmd cmd = { + .size = sizeof(cmd), + .op = IOMMU_TEST_OP_DMABUF_REVOKE, + .dmabuf_revoke = { .dmabuf_fd = dmabuf_fd, .revoked = revoked }, + }; + int ret; + + ret = ioctl(fd, IOMMU_TEST_CMD, &cmd); + if (ret < 0) + return -1; + return 0; +} +#define test_cmd_revoke_dmabuf(dmabuf_fd, revoke) \ + ASSERT_EQ(0, _test_cmd_revoke_dmabuf(self->fd, dmabuf_fd, revoke)) + static int _test_ioctl_destroy(int fd, unsigned int id) { struct iommu_destroy cmd = { @@ -718,6 +763,17 @@ static int _test_ioctl_ioas_map_file(int fd, unsigned int ioas_id, int mfd, self->fd, ioas_id, mfd, start, length, iova_p, \ IOMMU_IOAS_MAP_WRITEABLE | IOMMU_IOAS_MAP_READABLE)) +#define test_ioctl_ioas_map_fixed_file(mfd, start, length, iova) \ + ({ \ + __u64 __iova = iova; \ + ASSERT_EQ(0, _test_ioctl_ioas_map_file( \ + self->fd, self->ioas_id, mfd, start, \ + length, &__iova, \ + IOMMU_IOAS_MAP_FIXED_IOVA | \ + IOMMU_IOAS_MAP_WRITEABLE | \ + IOMMU_IOAS_MAP_READABLE)); \ + }) + static int _test_ioctl_set_temp_memory_limit(int fd, unsigned int limit) { struct iommu_test_cmd memlimit_cmd = { diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index d9fffe06d3ea..f2b223072b62 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -6,7 +6,7 @@ ARCH ?= $(SUBARCH) ifeq ($(ARCH),$(filter $(ARCH),arm64 s390 riscv x86 x86_64 loongarch)) # Top-level selftests allows ARCH=x86_64 :-( ifeq ($(ARCH),x86_64) - ARCH := x86 + override ARCH := x86 endif include Makefile.kvm else diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm index 148d427ff24b..ba5c2b643efa 100644 --- a/tools/testing/selftests/kvm/Makefile.kvm +++ b/tools/testing/selftests/kvm/Makefile.kvm @@ -88,8 +88,12 @@ TEST_GEN_PROGS_x86 += x86/kvm_pv_test TEST_GEN_PROGS_x86 += x86/kvm_buslock_test TEST_GEN_PROGS_x86 += x86/monitor_mwait_test TEST_GEN_PROGS_x86 += x86/msrs_test +TEST_GEN_PROGS_x86 += x86/nested_close_kvm_test TEST_GEN_PROGS_x86 += x86/nested_emulation_test TEST_GEN_PROGS_x86 += x86/nested_exceptions_test +TEST_GEN_PROGS_x86 += x86/nested_invalid_cr3_test +TEST_GEN_PROGS_x86 += x86/nested_tsc_adjust_test +TEST_GEN_PROGS_x86 += x86/nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/platform_info_test TEST_GEN_PROGS_x86 += x86/pmu_counters_test TEST_GEN_PROGS_x86 += x86/pmu_event_filter_test @@ -111,14 +115,12 @@ TEST_GEN_PROGS_x86 += x86/ucna_injection_test TEST_GEN_PROGS_x86 += x86/userspace_io_test TEST_GEN_PROGS_x86 += x86/userspace_msr_exit_test TEST_GEN_PROGS_x86 += x86/vmx_apic_access_test -TEST_GEN_PROGS_x86 += x86/vmx_close_while_nested_test TEST_GEN_PROGS_x86 += x86/vmx_dirty_log_test TEST_GEN_PROGS_x86 += x86/vmx_exception_with_invalid_guest_state TEST_GEN_PROGS_x86 += x86/vmx_msrs_test TEST_GEN_PROGS_x86 += x86/vmx_invalid_nested_guest_state +TEST_GEN_PROGS_x86 += x86/vmx_nested_la57_state_test TEST_GEN_PROGS_x86 += x86/vmx_set_nested_state_test -TEST_GEN_PROGS_x86 += x86/vmx_tsc_adjust_test -TEST_GEN_PROGS_x86 += x86/vmx_nested_tsc_scaling_test TEST_GEN_PROGS_x86 += x86/apic_bus_clock_test TEST_GEN_PROGS_x86 += x86/xapic_ipi_test TEST_GEN_PROGS_x86 += x86/xapic_state_test @@ -156,6 +158,7 @@ TEST_GEN_PROGS_EXTENDED_x86 += x86/nx_huge_pages_test TEST_GEN_PROGS_arm64 = $(TEST_GEN_PROGS_COMMON) TEST_GEN_PROGS_arm64 += arm64/aarch32_id_regs TEST_GEN_PROGS_arm64 += arm64/arch_timer_edge_cases +TEST_GEN_PROGS_arm64 += arm64/at TEST_GEN_PROGS_arm64 += 
arm64/debug-exceptions TEST_GEN_PROGS_arm64 += arm64/hello_el2 TEST_GEN_PROGS_arm64 += arm64/host_sve @@ -163,6 +166,7 @@ TEST_GEN_PROGS_arm64 += arm64/hypercalls TEST_GEN_PROGS_arm64 += arm64/external_aborts TEST_GEN_PROGS_arm64 += arm64/page_fault_test TEST_GEN_PROGS_arm64 += arm64/psci_test +TEST_GEN_PROGS_arm64 += arm64/sea_to_user TEST_GEN_PROGS_arm64 += arm64/set_id_regs TEST_GEN_PROGS_arm64 += arm64/smccc_filter TEST_GEN_PROGS_arm64 += arm64/vcpu_width_config @@ -194,6 +198,7 @@ TEST_GEN_PROGS_s390 += s390/debug_test TEST_GEN_PROGS_s390 += s390/cpumodel_subfuncs_test TEST_GEN_PROGS_s390 += s390/shared_zeropage_test TEST_GEN_PROGS_s390 += s390/ucontrol_test +TEST_GEN_PROGS_s390 += s390/user_operexec TEST_GEN_PROGS_s390 += rseq_test TEST_GEN_PROGS_riscv = $(TEST_GEN_PROGS_COMMON) @@ -210,6 +215,7 @@ TEST_GEN_PROGS_riscv += mmu_stress_test TEST_GEN_PROGS_riscv += rseq_test TEST_GEN_PROGS_riscv += steal_time +TEST_GEN_PROGS_loongarch = arch_timer TEST_GEN_PROGS_loongarch += coalesced_io_test TEST_GEN_PROGS_loongarch += demand_paging_test TEST_GEN_PROGS_loongarch += dirty_log_perf_test diff --git a/tools/testing/selftests/kvm/arm64/at.c b/tools/testing/selftests/kvm/arm64/at.c new file mode 100644 index 000000000000..c8ee6f520734 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/at.c @@ -0,0 +1,166 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * at - Test for KVM's AT emulation in the EL2&0 and EL1&0 translation regimes. + */ +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" +#include "ucall.h" + +#include <asm/sysreg.h> + +#define TEST_ADDR 0x80000000 + +enum { + CLEAR_ACCESS_FLAG, + TEST_ACCESS_FLAG, +}; + +static u64 *ptep_hva; + +#define copy_el2_to_el1(reg) \ + write_sysreg_s(read_sysreg_s(SYS_##reg##_EL1), SYS_##reg##_EL12) + +/* Yes, this is an ugly hack */ +#define __at(op, addr) write_sysreg_s(addr, op) + +#define test_at_insn(op, expect_fault) \ +do { \ + u64 par, fsc; \ + bool fault; \ + \ + GUEST_SYNC(CLEAR_ACCESS_FLAG); \ + \ + __at(OP_AT_##op, TEST_ADDR); \ + isb(); \ + par = read_sysreg(par_el1); \ + \ + fault = par & SYS_PAR_EL1_F; \ + fsc = FIELD_GET(SYS_PAR_EL1_FST, par); \ + \ + __GUEST_ASSERT((expect_fault) == fault, \ + "AT "#op": %sexpected fault (par: %lx)1", \ + (expect_fault) ? 
"" : "un", par); \ + if ((expect_fault)) { \ + __GUEST_ASSERT(fsc == ESR_ELx_FSC_ACCESS_L(3), \ + "AT "#op": expected access flag fault (par: %lx)", \ + par); \ + } else { \ + GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_ATTR, par), MAIR_ATTR_NORMAL); \ + GUEST_ASSERT_EQ(FIELD_GET(SYS_PAR_EL1_SH, par), PTE_SHARED >> 8); \ + GUEST_ASSERT_EQ(par & SYS_PAR_EL1_PA, TEST_ADDR); \ + GUEST_SYNC(TEST_ACCESS_FLAG); \ + } \ +} while (0) + +static void test_at(bool expect_fault) +{ + test_at_insn(S1E2R, expect_fault); + test_at_insn(S1E2W, expect_fault); + + /* Reuse the stage-1 MMU context from EL2 at EL1 */ + copy_el2_to_el1(SCTLR); + copy_el2_to_el1(MAIR); + copy_el2_to_el1(TCR); + copy_el2_to_el1(TTBR0); + copy_el2_to_el1(TTBR1); + + /* Disable stage-2 translation and enter a non-host context */ + write_sysreg(0, vtcr_el2); + write_sysreg(0, vttbr_el2); + sysreg_clear_set(hcr_el2, HCR_EL2_TGE | HCR_EL2_VM, 0); + isb(); + + test_at_insn(S1E1R, expect_fault); + test_at_insn(S1E1W, expect_fault); +} + +static void guest_code(void) +{ + sysreg_clear_set(tcr_el1, TCR_HA, 0); + isb(); + + test_at(true); + + if (!SYS_FIELD_GET(ID_AA64MMFR1_EL1, HAFDBS, read_sysreg(id_aa64mmfr1_el1))) + GUEST_DONE(); + + /* + * KVM's software PTW makes the implementation choice that the AT + * instruction sets the access flag. + */ + sysreg_clear_set(tcr_el1, 0, TCR_HA); + isb(); + test_at(false); + + GUEST_DONE(); +} + +static void handle_sync(struct kvm_vcpu *vcpu, struct ucall *uc) +{ + switch (uc->args[1]) { + case CLEAR_ACCESS_FLAG: + /* + * Delete + reinstall the memslot to invalidate stage-2 + * mappings of the stage-1 page tables, forcing KVM to + * use the 'slow' AT emulation path. + * + * This and clearing the access flag from host userspace + * ensures that the access flag cannot be set speculatively + * and is reliably cleared at the time of the AT instruction. + */ + clear_bit(__ffs(PTE_AF), ptep_hva); + vm_mem_region_reload(vcpu->vm, vcpu->vm->memslots[MEM_REGION_PT]); + break; + case TEST_ACCESS_FLAG: + TEST_ASSERT(test_bit(__ffs(PTE_AF), ptep_hva), + "Expected access flag to be set (desc: %lu)", *ptep_hva); + break; + default: + TEST_FAIL("Unexpected SYNC arg: %lu", uc->args[1]); + } +} + +static void run_test(struct kvm_vcpu *vcpu) +{ + struct ucall uc; + + while (true) { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_DONE: + return; + case UCALL_SYNC: + handle_sync(vcpu, &uc); + continue; + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + return; + default: + TEST_FAIL("Unexpected ucall: %lu", uc.cmd); + } + } +} + +int main(void) +{ + struct kvm_vcpu_init init; + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + + TEST_REQUIRE(kvm_check_cap(KVM_CAP_ARM_EL2)); + + vm = vm_create(1); + + kvm_get_default_vcpu_target(vm, &init); + init.features[0] |= BIT(KVM_ARM_VCPU_HAS_EL2); + vcpu = aarch64_vcpu_add(vm, 0, &init, guest_code); + kvm_arch_vm_finalize_vcpus(vm); + + virt_map(vm, TEST_ADDR, TEST_ADDR, 1); + ptep_hva = virt_get_pte_hva_at_level(vm, TEST_ADDR, 3); + run_test(vcpu); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/sea_to_user.c b/tools/testing/selftests/kvm/arm64/sea_to_user.c new file mode 100644 index 000000000000..573dd790aeb8 --- /dev/null +++ b/tools/testing/selftests/kvm/arm64/sea_to_user.c @@ -0,0 +1,331 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Test KVM returns to userspace with KVM_EXIT_ARM_SEA if host APEI fails + * to handle SEA and userspace has opt-ed in KVM_CAP_ARM_SEA_TO_USER. 
+ * + * After reaching userspace with expected arm_sea info, also test userspace + * injecting a synchronous external data abort into the guest. + * + * This test utilizes EINJ to generate a REAL synchronous external data + * abort by consuming a recoverable uncorrectable memory error. Therefore + * the device under test must support EINJ in both firmware and host kernel, + * including the notrigger feature. Otherwise the test will be skipped. + * The under-test platform's APEI should be unable to claim SEA. Otherwise + * the test will also be skipped. + */ + +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "guest_modes.h" + +#define PAGE_PRESENT (1ULL << 63) +#define PAGE_PHYSICAL 0x007fffffffffffffULL +#define PAGE_ADDR_MASK (~(0xfffULL)) + +/* Group ISV and ISS[23:14]. */ +#define ESR_ELx_INST_SYNDROME ((ESR_ELx_ISV) | (ESR_ELx_SAS) | \ + (ESR_ELx_SSE) | (ESR_ELx_SRT_MASK) | \ + (ESR_ELx_SF) | (ESR_ELx_AR)) + +#define EINJ_ETYPE "/sys/kernel/debug/apei/einj/error_type" +#define EINJ_ADDR "/sys/kernel/debug/apei/einj/param1" +#define EINJ_MASK "/sys/kernel/debug/apei/einj/param2" +#define EINJ_FLAGS "/sys/kernel/debug/apei/einj/flags" +#define EINJ_NOTRIGGER "/sys/kernel/debug/apei/einj/notrigger" +#define EINJ_DOIT "/sys/kernel/debug/apei/einj/error_inject" +/* Memory Uncorrectable non-fatal. */ +#define ERROR_TYPE_MEMORY_UER 0x10 +/* Memory address and mask valid (param1 and param2). */ +#define MASK_MEMORY_UER 0b10 + +/* Guest virtual address region = [2G, 3G). */ +#define START_GVA 0x80000000UL +#define VM_MEM_SIZE 0x40000000UL +/* Note: EINJ_OFFSET must < VM_MEM_SIZE. */ +#define EINJ_OFFSET 0x01234badUL +#define EINJ_GVA ((START_GVA) + (EINJ_OFFSET)) + +static vm_paddr_t einj_gpa; +static void *einj_hva; +static uint64_t einj_hpa; +static bool far_invalid; + +static uint64_t translate_to_host_paddr(unsigned long vaddr) +{ + uint64_t pinfo; + int64_t offset = vaddr / getpagesize() * sizeof(pinfo); + int fd; + uint64_t page_addr; + uint64_t paddr; + + fd = open("/proc/self/pagemap", O_RDONLY); + if (fd < 0) + ksft_exit_fail_perror("Failed to open /proc/self/pagemap"); + if (pread(fd, &pinfo, sizeof(pinfo), offset) != sizeof(pinfo)) { + close(fd); + ksft_exit_fail_perror("Failed to read /proc/self/pagemap"); + } + + close(fd); + + if ((pinfo & PAGE_PRESENT) == 0) + ksft_exit_fail_perror("Page not present"); + + page_addr = (pinfo & PAGE_PHYSICAL) << MIN_PAGE_SHIFT; + paddr = page_addr + (vaddr & (getpagesize() - 1)); + return paddr; +} + +static void write_einj_entry(const char *einj_path, uint64_t val) +{ + char cmd[256] = {0}; + FILE *cmdfile = NULL; + + sprintf(cmd, "echo %#lx > %s", val, einj_path); + cmdfile = popen(cmd, "r"); + + if (pclose(cmdfile) == 0) + ksft_print_msg("echo %#lx > %s - done\n", val, einj_path); + else + ksft_exit_fail_perror("Failed to write EINJ entry"); +} + +static void inject_uer(uint64_t paddr) +{ + if (access("/sys/firmware/acpi/tables/EINJ", R_OK) == -1) + ksft_test_result_skip("EINJ table no available in firmware"); + + if (access(EINJ_ETYPE, R_OK | W_OK) == -1) + ksft_test_result_skip("EINJ module probably not loaded?"); + + write_einj_entry(EINJ_ETYPE, ERROR_TYPE_MEMORY_UER); + write_einj_entry(EINJ_FLAGS, MASK_MEMORY_UER); + write_einj_entry(EINJ_ADDR, paddr); + write_einj_entry(EINJ_MASK, ~0x0UL); + write_einj_entry(EINJ_NOTRIGGER, 1); + write_einj_entry(EINJ_DOIT, 1); +} + +/* + * When host APEI successfully claims the SEA caused 
by guest_code, kernel + * will send SIGBUS signal with BUS_MCEERR_AR to test thread. + * + * We set up this SIGBUS handler to skip the test for that case. + */ +static void sigbus_signal_handler(int sig, siginfo_t *si, void *v) +{ + ksft_print_msg("SIGBUS (%d) received, dumping siginfo...\n", sig); + ksft_print_msg("si_signo=%d, si_errno=%d, si_code=%d, si_addr=%p\n", + si->si_signo, si->si_errno, si->si_code, si->si_addr); + if (si->si_code == BUS_MCEERR_AR) + ksft_test_result_skip("SEA is claimed by host APEI\n"); + else + ksft_test_result_fail("Exit with signal unhandled\n"); + + exit(0); +} + +static void setup_sigbus_handler(void) +{ + struct sigaction act; + + memset(&act, 0, sizeof(act)); + sigemptyset(&act.sa_mask); + act.sa_sigaction = sigbus_signal_handler; + act.sa_flags = SA_SIGINFO; + TEST_ASSERT(sigaction(SIGBUS, &act, NULL) == 0, + "Failed to setup SIGBUS handler"); +} + +static void guest_code(void) +{ + uint64_t guest_data; + + /* Consumes error will cause a SEA. */ + guest_data = *(uint64_t *)EINJ_GVA; + + GUEST_FAIL("Poison not protected by SEA: gva=%#lx, guest_data=%#lx\n", + EINJ_GVA, guest_data); +} + +static void expect_sea_handler(struct ex_regs *regs) +{ + u64 esr = read_sysreg(esr_el1); + u64 far = read_sysreg(far_el1); + bool expect_far_invalid = far_invalid; + + GUEST_PRINTF("Handling Guest SEA\n"); + GUEST_PRINTF("ESR_EL1=%#lx, FAR_EL1=%#lx\n", esr, far); + + GUEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_CUR); + GUEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT); + + if (expect_far_invalid) { + GUEST_ASSERT_EQ(esr & ESR_ELx_FnV, ESR_ELx_FnV); + GUEST_PRINTF("Guest observed garbage value in FAR\n"); + } else { + GUEST_ASSERT_EQ(esr & ESR_ELx_FnV, 0); + GUEST_ASSERT_EQ(far, EINJ_GVA); + } + + GUEST_DONE(); +} + +static void vcpu_inject_sea(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_events events = {}; + + events.exception.ext_dabt_pending = true; + vcpu_events_set(vcpu, &events); +} + +static void run_vm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) +{ + struct ucall uc; + bool guest_done = false; + struct kvm_run *run = vcpu->run; + u64 esr; + + /* Resume the vCPU after error injection to consume the error. */ + vcpu_run(vcpu); + + ksft_print_msg("Dump kvm_run info about KVM_EXIT_%s\n", + exit_reason_str(run->exit_reason)); + ksft_print_msg("kvm_run.arm_sea: esr=%#llx, flags=%#llx\n", + run->arm_sea.esr, run->arm_sea.flags); + ksft_print_msg("kvm_run.arm_sea: gva=%#llx, gpa=%#llx\n", + run->arm_sea.gva, run->arm_sea.gpa); + + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_ARM_SEA); + + esr = run->arm_sea.esr; + TEST_ASSERT_EQ(ESR_ELx_EC(esr), ESR_ELx_EC_DABT_LOW); + TEST_ASSERT_EQ(esr & ESR_ELx_FSC_TYPE, ESR_ELx_FSC_EXTABT); + TEST_ASSERT_EQ(ESR_ELx_ISS2(esr), 0); + TEST_ASSERT_EQ((esr & ESR_ELx_INST_SYNDROME), 0); + TEST_ASSERT_EQ(esr & ESR_ELx_VNCR, 0); + + if (!(esr & ESR_ELx_FnV)) { + ksft_print_msg("Expect gva to match given FnV bit is 0\n"); + TEST_ASSERT_EQ(run->arm_sea.gva, EINJ_GVA); + } + + if (run->arm_sea.flags & KVM_EXIT_ARM_SEA_FLAG_GPA_VALID) { + ksft_print_msg("Expect gpa to match given KVM_EXIT_ARM_SEA_FLAG_GPA_VALID is set\n"); + TEST_ASSERT_EQ(run->arm_sea.gpa, einj_gpa & PAGE_ADDR_MASK); + } + + far_invalid = esr & ESR_ELx_FnV; + + /* Inject a SEA into guest and expect handled in SEA handler. */ + vcpu_inject_sea(vcpu); + + /* Expect the guest to reach GUEST_DONE gracefully. 
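+ * The ucall loop below forwards guest PRINTF output and reports a guest assertion (UCALL_ABORT) as a test failure.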
*/ + do { + vcpu_run(vcpu); + switch (get_ucall(vcpu, &uc)) { + case UCALL_PRINTF: + ksft_print_msg("From guest: %s", uc.buffer); + break; + case UCALL_DONE: + ksft_print_msg("Guest done gracefully!\n"); + guest_done = 1; + break; + case UCALL_ABORT: + ksft_print_msg("Guest aborted!\n"); + guest_done = 1; + REPORT_GUEST_ASSERT(uc); + break; + default: + TEST_FAIL("Unexpected ucall: %lu\n", uc.cmd); + } + } while (!guest_done); +} + +static struct kvm_vm *vm_create_with_sea_handler(struct kvm_vcpu **vcpu) +{ + size_t backing_page_size; + size_t guest_page_size; + size_t alignment; + uint64_t num_guest_pages; + vm_paddr_t start_gpa; + enum vm_mem_backing_src_type src_type = VM_MEM_SRC_ANONYMOUS_HUGETLB_1GB; + struct kvm_vm *vm; + + backing_page_size = get_backing_src_pagesz(src_type); + guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; + alignment = max(backing_page_size, guest_page_size); + num_guest_pages = VM_MEM_SIZE / guest_page_size; + + vm = __vm_create_with_one_vcpu(vcpu, num_guest_pages, guest_code); + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(*vcpu); + + vm_install_sync_handler(vm, + /*vector=*/VECTOR_SYNC_CURRENT, + /*ec=*/ESR_ELx_EC_DABT_CUR, + /*handler=*/expect_sea_handler); + + start_gpa = (vm->max_gfn - num_guest_pages) * guest_page_size; + start_gpa = align_down(start_gpa, alignment); + + vm_userspace_mem_region_add( + /*vm=*/vm, + /*src_type=*/src_type, + /*guest_paddr=*/start_gpa, + /*slot=*/1, + /*npages=*/num_guest_pages, + /*flags=*/0); + + virt_map(vm, START_GVA, start_gpa, num_guest_pages); + + ksft_print_msg("Mapped %#lx pages: gva=%#lx to gpa=%#lx\n", + num_guest_pages, START_GVA, start_gpa); + return vm; +} + +static void vm_inject_memory_uer(struct kvm_vm *vm) +{ + uint64_t guest_data; + + einj_gpa = addr_gva2gpa(vm, EINJ_GVA); + einj_hva = addr_gva2hva(vm, EINJ_GVA); + + /* Populate certain data before injecting UER. */ + *(uint64_t *)einj_hva = 0xBAADCAFE; + guest_data = *(uint64_t *)einj_hva; + ksft_print_msg("Before EINJect: data=%#lx\n", + guest_data); + + einj_hpa = translate_to_host_paddr((unsigned long)einj_hva); + + ksft_print_msg("EINJ_GVA=%#lx, einj_gpa=%#lx, einj_hva=%p, einj_hpa=%#lx\n", + EINJ_GVA, einj_gpa, einj_hva, einj_hpa); + + inject_uer(einj_hpa); + ksft_print_msg("Memory UER EINJected\n"); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_ARM_SEA_TO_USER)); + + setup_sigbus_handler(); + + vm = vm_create_with_sea_handler(&vcpu); + vm_enable_cap(vm, KVM_CAP_ARM_SEA_TO_USER, 0); + vm_inject_memory_uer(vm); + run_vm(vm, vcpu); + kvm_vm_free(vm); + + return 0; +} diff --git a/tools/testing/selftests/kvm/arm64/vgic_irq.c b/tools/testing/selftests/kvm/arm64/vgic_irq.c index 6338f5bbdb70..2fb2c7939fe9 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_irq.c +++ b/tools/testing/selftests/kvm/arm64/vgic_irq.c @@ -29,6 +29,7 @@ struct test_args { bool level_sensitive; /* 1 is level, 0 is edge */ int kvm_max_routes; /* output of KVM_CAP_IRQ_ROUTING */ bool kvm_supports_irqfd; /* output of KVM_CAP_IRQFD */ + uint32_t shared_data; }; /* @@ -205,7 +206,7 @@ static void kvm_inject_call(kvm_inject_cmd cmd, uint32_t first_intid, do { \ uint32_t _intid; \ _intid = gic_get_and_ack_irq(); \ - GUEST_ASSERT(_intid == 0 || _intid == IAR_SPURIOUS); \ + GUEST_ASSERT(_intid == IAR_SPURIOUS); \ } while (0) #define CAT_HELPER(a, b) a ## b @@ -359,8 +360,9 @@ static uint32_t wait_for_and_activate_irq(void) * interrupts for the whole test. 
*/ static void test_inject_preemption(struct test_args *args, - uint32_t first_intid, int num, - kvm_inject_cmd cmd) + uint32_t first_intid, int num, + const unsigned long *exclude, + kvm_inject_cmd cmd) { uint32_t intid, prio, step = KVM_PRIO_STEPS; int i; @@ -379,6 +381,10 @@ static void test_inject_preemption(struct test_args *args, for (i = 0; i < num; i++) { uint32_t tmp; intid = i + first_intid; + + if (exclude && test_bit(i, exclude)) + continue; + KVM_INJECT(cmd, intid); /* Each successive IRQ will preempt the previous one. */ tmp = wait_for_and_activate_irq(); @@ -390,15 +396,33 @@ static void test_inject_preemption(struct test_args *args, /* finish handling the IRQs starting with the highest priority one. */ for (i = 0; i < num; i++) { intid = num - i - 1 + first_intid; + + if (exclude && test_bit(intid - first_intid, exclude)) + continue; + gic_set_eoi(intid); - if (args->eoi_split) - gic_set_dir(intid); + } + + if (args->eoi_split) { + for (i = 0; i < num; i++) { + intid = i + first_intid; + + if (exclude && test_bit(i, exclude)) + continue; + + if (args->eoi_split) + gic_set_dir(intid); + } } local_irq_enable(); - for (i = 0; i < num; i++) + for (i = 0; i < num; i++) { + if (exclude && test_bit(i, exclude)) + continue; + GUEST_ASSERT(!gic_irq_get_active(i + first_intid)); + } GUEST_ASSERT_EQ(gic_read_ap1r0(), 0); GUEST_ASSERT_IAR_EMPTY(); @@ -436,33 +460,32 @@ static void test_injection_failure(struct test_args *args, static void test_preemption(struct test_args *args, struct kvm_inject_desc *f) { - /* - * Test up to 4 levels of preemption. The reason is that KVM doesn't - * currently implement the ability to have more than the number-of-LRs - * number of concurrently active IRQs. The number of LRs implemented is - * IMPLEMENTATION DEFINED, however, it seems that most implement 4. - */ + /* Timer PPIs cannot be injected from userspace */ + static const unsigned long ppi_exclude = (BIT(27 - MIN_PPI) | + BIT(30 - MIN_PPI) | + BIT(28 - MIN_PPI) | + BIT(26 - MIN_PPI)); + if (f->sgi) - test_inject_preemption(args, MIN_SGI, 4, f->cmd); + test_inject_preemption(args, MIN_SGI, 16, NULL, f->cmd); if (f->ppi) - test_inject_preemption(args, MIN_PPI, 4, f->cmd); + test_inject_preemption(args, MIN_PPI, 16, &ppi_exclude, f->cmd); if (f->spi) - test_inject_preemption(args, MIN_SPI, 4, f->cmd); + test_inject_preemption(args, MIN_SPI, 31, NULL, f->cmd); } static void test_restore_active(struct test_args *args, struct kvm_inject_desc *f) { - /* Test up to 4 active IRQs. Same reason as in test_preemption. 
*/ if (f->sgi) - guest_restore_active(args, MIN_SGI, 4, f->cmd); + guest_restore_active(args, MIN_SGI, 16, f->cmd); if (f->ppi) - guest_restore_active(args, MIN_PPI, 4, f->cmd); + guest_restore_active(args, MIN_PPI, 16, f->cmd); if (f->spi) - guest_restore_active(args, MIN_SPI, 4, f->cmd); + guest_restore_active(args, MIN_SPI, 31, f->cmd); } static void guest_code(struct test_args *args) @@ -473,12 +496,12 @@ static void guest_code(struct test_args *args) gic_init(GIC_V3, 1); - for (i = 0; i < nr_irqs; i++) - gic_irq_enable(i); - for (i = MIN_SPI; i < nr_irqs; i++) gic_irq_set_config(i, !level_sensitive); + for (i = 0; i < nr_irqs; i++) + gic_irq_enable(i); + gic_set_eoi_split(args->eoi_split); reset_priorities(args); @@ -636,7 +659,7 @@ static void kvm_routing_and_irqfd_check(struct kvm_vm *vm, } for (f = 0, i = intid; i < (uint64_t)intid + num; i++, f++) - close(fd[f]); + kvm_close(fd[f]); } /* handles the valid case: intid=0xffffffff num=1 */ @@ -779,6 +802,221 @@ done: kvm_vm_free(vm); } +static void guest_code_asym_dir(struct test_args *args, int cpuid) +{ + gic_init(GIC_V3, 2); + + gic_set_eoi_split(1); + gic_set_priority_mask(CPU_PRIO_MASK); + + if (cpuid == 0) { + uint32_t intid; + + local_irq_disable(); + + gic_set_priority(MIN_PPI, IRQ_DEFAULT_PRIO); + gic_irq_enable(MIN_SPI); + gic_irq_set_pending(MIN_SPI); + + intid = wait_for_and_activate_irq(); + GUEST_ASSERT_EQ(intid, MIN_SPI); + + gic_set_eoi(intid); + isb(); + + WRITE_ONCE(args->shared_data, MIN_SPI); + dsb(ishst); + + do { + dsb(ishld); + } while (READ_ONCE(args->shared_data) == MIN_SPI); + GUEST_ASSERT(!gic_irq_get_active(MIN_SPI)); + } else { + do { + dsb(ishld); + } while (READ_ONCE(args->shared_data) != MIN_SPI); + + gic_set_dir(MIN_SPI); + isb(); + + WRITE_ONCE(args->shared_data, 0); + dsb(ishst); + } + + GUEST_DONE(); +} + +static void guest_code_group_en(struct test_args *args, int cpuid) +{ + uint32_t intid; + + gic_init(GIC_V3, 2); + + gic_set_eoi_split(0); + gic_set_priority_mask(CPU_PRIO_MASK); + /* SGI0 is G0, which is disabled */ + gic_irq_set_group(0, 0); + + /* Configure all SGIs with decreasing priority */ + for (intid = 0; intid < MIN_PPI; intid++) { + gic_set_priority(intid, (intid + 1) * 8); + gic_irq_enable(intid); + gic_irq_set_pending(intid); + } + + /* Ack and EOI all G1 interrupts */ + for (int i = 1; i < MIN_PPI; i++) { + intid = wait_for_and_activate_irq(); + + GUEST_ASSERT(intid < MIN_PPI); + gic_set_eoi(intid); + isb(); + } + + /* + * Check that SGI0 is still pending, inactive, and that we cannot + * ack anything. 
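+ * Both the Group-1 and Group-0 IAR reads below are expected to return the spurious INTID while Group 0 remains disabled.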
+ */ + GUEST_ASSERT(gic_irq_get_pending(0)); + GUEST_ASSERT(!gic_irq_get_active(0)); + GUEST_ASSERT_IAR_EMPTY(); + GUEST_ASSERT(read_sysreg_s(SYS_ICC_IAR0_EL1) == IAR_SPURIOUS); + + /* Open the G0 gates, and verify we can ack SGI0 */ + write_sysreg_s(1, SYS_ICC_IGRPEN0_EL1); + isb(); + + do { + intid = read_sysreg_s(SYS_ICC_IAR0_EL1); + } while (intid == IAR_SPURIOUS); + + GUEST_ASSERT(intid == 0); + GUEST_DONE(); +} + +static void guest_code_timer_spi(struct test_args *args, int cpuid) +{ + uint32_t intid; + u64 val; + + gic_init(GIC_V3, 2); + + gic_set_eoi_split(1); + gic_set_priority_mask(CPU_PRIO_MASK); + + /* Add a pending SPI so that KVM starts trapping DIR */ + gic_set_priority(MIN_SPI + cpuid, IRQ_DEFAULT_PRIO); + gic_irq_set_pending(MIN_SPI + cpuid); + + /* Configure the timer with a higher priority, make it pending */ + gic_set_priority(27, IRQ_DEFAULT_PRIO - 8); + + isb(); + val = read_sysreg(cntvct_el0); + write_sysreg(val, cntv_cval_el0); + write_sysreg(1, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(gic_irq_get_pending(27)); + + /* Enable both interrupts */ + gic_irq_enable(MIN_SPI + cpuid); + gic_irq_enable(27); + + /* The timer must fire */ + intid = wait_for_and_activate_irq(); + GUEST_ASSERT(intid == 27); + + /* Check that we can deassert it */ + write_sysreg(0, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(!gic_irq_get_pending(27)); + + /* + * Priority drop, deactivation -- we expect that the host + * deactivation will have been effective + */ + gic_set_eoi(27); + gic_set_dir(27); + + GUEST_ASSERT(!gic_irq_get_active(27)); + + /* Do it one more time */ + isb(); + val = read_sysreg(cntvct_el0); + write_sysreg(val, cntv_cval_el0); + write_sysreg(1, cntv_ctl_el0); + isb(); + + GUEST_ASSERT(gic_irq_get_pending(27)); + + /* The timer must fire again */ + intid = wait_for_and_activate_irq(); + GUEST_ASSERT(intid == 27); + + GUEST_DONE(); +} + +static void *test_vcpu_run(void *arg) +{ + struct kvm_vcpu *vcpu = arg; + struct ucall uc; + + while (1) { + vcpu_run(vcpu); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + break; + case UCALL_DONE: + return NULL; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + + return NULL; +} + +static void test_vgic_two_cpus(void *gcode) +{ + pthread_t thr[2]; + struct kvm_vcpu *vcpus[2]; + struct test_args args = {}; + struct kvm_vm *vm; + vm_vaddr_t args_gva; + int gic_fd, ret; + + vm = vm_create_with_vcpus(2, gcode, vcpus); + + vm_init_descriptor_tables(vm); + vcpu_init_descriptor_tables(vcpus[0]); + vcpu_init_descriptor_tables(vcpus[1]); + + /* Setup the guest args page (so it gets the args). 
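+ * The same args GVA is handed to both vCPUs so the test cases can communicate through test_args::shared_data.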
*/ + args_gva = vm_vaddr_alloc_page(vm); + memcpy(addr_gva2hva(vm, args_gva), &args, sizeof(args)); + vcpu_args_set(vcpus[0], 2, args_gva, 0); + vcpu_args_set(vcpus[1], 2, args_gva, 1); + + gic_fd = vgic_v3_setup(vm, 2, 64); + + ret = pthread_create(&thr[0], NULL, test_vcpu_run, vcpus[0]); + if (ret) + TEST_FAIL("Can't create thread for vcpu 0 (%d)\n", ret); + ret = pthread_create(&thr[1], NULL, test_vcpu_run, vcpus[1]); + if (ret) + TEST_FAIL("Can't create thread for vcpu 1 (%d)\n", ret); + + pthread_join(thr[0], NULL); + pthread_join(thr[1], NULL); + + close(gic_fd); + kvm_vm_free(vm); +} + static void help(const char *name) { printf( @@ -835,6 +1073,9 @@ int main(int argc, char **argv) test_vgic(nr_irqs, false /* level */, true /* eoi_split */); test_vgic(nr_irqs, true /* level */, false /* eoi_split */); test_vgic(nr_irqs, true /* level */, true /* eoi_split */); + test_vgic_two_cpus(guest_code_asym_dir); + test_vgic_two_cpus(guest_code_group_en); + test_vgic_two_cpus(guest_code_timer_spi); } else { test_vgic(nr_irqs, level_sensitive, eoi_split); } diff --git a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c index 687d04463983..e857a605f577 100644 --- a/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c +++ b/tools/testing/selftests/kvm/arm64/vgic_lpi_stress.c @@ -118,6 +118,10 @@ static void guest_setup_gic(void) guest_setup_its_mappings(); guest_invalidate_all_rdists(); + + /* SYNC to ensure ITS setup is complete */ + for (cpuid = 0; cpuid < test_data.nr_cpus; cpuid++) + its_send_sync_cmd(test_data.cmdq_base_va, cpuid); } static void guest_code(size_t nr_lpis) diff --git a/tools/testing/selftests/kvm/guest_memfd_test.c b/tools/testing/selftests/kvm/guest_memfd_test.c index e7d9aeb418d3..618c937f3c90 100644 --- a/tools/testing/selftests/kvm/guest_memfd_test.c +++ b/tools/testing/selftests/kvm/guest_memfd_test.c @@ -19,6 +19,7 @@ #include <sys/stat.h> #include "kvm_util.h" +#include "numaif.h" #include "test_util.h" #include "ucall_common.h" @@ -75,6 +76,101 @@ static void test_mmap_supported(int fd, size_t total_size) kvm_munmap(mem, total_size); } +static void test_mbind(int fd, size_t total_size) +{ + const unsigned long nodemask_0 = 1; /* nid: 0 */ + unsigned long nodemask = 0; + unsigned long maxnode = 8; + int policy; + char *mem; + int ret; + + if (!is_multi_numa_node_system()) + return; + + mem = kvm_mmap(total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); + + /* Test MPOL_INTERLEAVE policy */ + kvm_mbind(mem, page_size * 2, MPOL_INTERLEAVE, &nodemask_0, maxnode, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_INTERLEAVE && nodemask == nodemask_0, + "Wanted MPOL_INTERLEAVE (%u) and nodemask 0x%lx, got %u and 0x%lx", + MPOL_INTERLEAVE, nodemask_0, policy, nodemask); + + /* Test basic MPOL_BIND policy */ + kvm_mbind(mem + page_size * 2, page_size * 2, MPOL_BIND, &nodemask_0, maxnode, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem + page_size * 2, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_BIND && nodemask == nodemask_0, + "Wanted MPOL_BIND (%u) and nodemask 0x%lx, got %u and 0x%lx", + MPOL_BIND, nodemask_0, policy, nodemask); + + /* Test MPOL_DEFAULT policy */ + kvm_mbind(mem, total_size, MPOL_DEFAULT, NULL, 0, 0); + kvm_get_mempolicy(&policy, &nodemask, maxnode, mem, MPOL_F_ADDR); + TEST_ASSERT(policy == MPOL_DEFAULT && !nodemask, + "Wanted MPOL_DEFAULT (%u) and nodemask 0x0, got %u and 0x%lx", + MPOL_DEFAULT, policy, nodemask); + + /* Test with invalid 
policy */ + ret = mbind(mem, page_size, 999, &nodemask_0, maxnode, 0); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "mbind with invalid policy should fail with EINVAL"); + + kvm_munmap(mem, total_size); +} + +static void test_numa_allocation(int fd, size_t total_size) +{ + unsigned long node0_mask = 1; /* Node 0 */ + unsigned long node1_mask = 2; /* Node 1 */ + unsigned long maxnode = 8; + void *pages[4]; + int status[4]; + char *mem; + int i; + + if (!is_multi_numa_node_system()) + return; + + mem = kvm_mmap(total_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd); + + for (i = 0; i < 4; i++) + pages[i] = (char *)mem + page_size * i; + + /* Set NUMA policy after allocation */ + memset(mem, 0xaa, page_size); + kvm_mbind(pages[0], page_size, MPOL_BIND, &node0_mask, maxnode, 0); + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, page_size); + + /* Set NUMA policy before allocation */ + kvm_mbind(pages[0], page_size * 2, MPOL_BIND, &node1_mask, maxnode, 0); + kvm_mbind(pages[2], page_size * 2, MPOL_BIND, &node0_mask, maxnode, 0); + memset(mem, 0xaa, total_size); + + /* Validate if pages are allocated on specified NUMA nodes */ + kvm_move_pages(0, 4, pages, NULL, status, 0); + TEST_ASSERT(status[0] == 1, "Expected page 0 on node 1, got it on node %d", status[0]); + TEST_ASSERT(status[1] == 1, "Expected page 1 on node 1, got it on node %d", status[1]); + TEST_ASSERT(status[2] == 0, "Expected page 2 on node 0, got it on node %d", status[2]); + TEST_ASSERT(status[3] == 0, "Expected page 3 on node 0, got it on node %d", status[3]); + + /* Punch hole for all pages */ + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, total_size); + + /* Change NUMA policy nodes and reallocate */ + kvm_mbind(pages[0], page_size * 2, MPOL_BIND, &node0_mask, maxnode, 0); + kvm_mbind(pages[2], page_size * 2, MPOL_BIND, &node1_mask, maxnode, 0); + memset(mem, 0xaa, total_size); + + kvm_move_pages(0, 4, pages, NULL, status, 0); + TEST_ASSERT(status[0] == 0, "Expected page 0 on node 0, got it on node %d", status[0]); + TEST_ASSERT(status[1] == 0, "Expected page 1 on node 0, got it on node %d", status[1]); + TEST_ASSERT(status[2] == 1, "Expected page 2 on node 1, got it on node %d", status[2]); + TEST_ASSERT(status[3] == 1, "Expected page 3 on node 1, got it on node %d", status[3]); + + kvm_munmap(mem, total_size); +} + static void test_fault_sigbus(int fd, size_t accessible_size, size_t map_size) { const char val = 0xaa; @@ -273,11 +369,13 @@ static void __test_guest_memfd(struct kvm_vm *vm, uint64_t flags) if (flags & GUEST_MEMFD_FLAG_INIT_SHARED) { gmem_test(mmap_supported, vm, flags); gmem_test(fault_overflow, vm, flags); + gmem_test(numa_allocation, vm, flags); } else { gmem_test(fault_private, vm, flags); } gmem_test(mmap_cow, vm, flags); + gmem_test(mbind, vm, flags); } else { gmem_test(mmap_not_supported, vm, flags); } diff --git a/tools/testing/selftests/kvm/include/arm64/gic.h b/tools/testing/selftests/kvm/include/arm64/gic.h index baeb3c859389..cc7a7f34ed37 100644 --- a/tools/testing/selftests/kvm/include/arm64/gic.h +++ b/tools/testing/selftests/kvm/include/arm64/gic.h @@ -57,6 +57,7 @@ void gic_irq_set_pending(unsigned int intid); void gic_irq_clear_pending(unsigned int intid); bool gic_irq_get_pending(unsigned int intid); void gic_irq_set_config(unsigned int intid, bool is_edge); +void gic_irq_set_group(unsigned int intid, bool group); void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, vm_paddr_t pend_table); diff --git 
a/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h index 3722ed9c8f96..58feef3eb386 100644 --- a/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h +++ b/tools/testing/selftests/kvm/include/arm64/gic_v3_its.h @@ -15,5 +15,6 @@ void its_send_mapc_cmd(void *cmdq_base, u32 vcpu_id, u32 collection_id, bool val void its_send_mapti_cmd(void *cmdq_base, u32 device_id, u32 event_id, u32 collection_id, u32 intid); void its_send_invall_cmd(void *cmdq_base, u32 collection_id); +void its_send_sync_cmd(void *cmdq_base, u32 vcpu_id); #endif // __SELFTESTS_GIC_V3_ITS_H__ diff --git a/tools/testing/selftests/kvm/include/kvm_syscalls.h b/tools/testing/selftests/kvm/include/kvm_syscalls.h new file mode 100644 index 000000000000..d4e613162bba --- /dev/null +++ b/tools/testing/selftests/kvm/include/kvm_syscalls.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTEST_KVM_SYSCALLS_H +#define SELFTEST_KVM_SYSCALLS_H + +#include <sys/syscall.h> + +#define MAP_ARGS0(m,...) +#define MAP_ARGS1(m,t,a,...) m(t,a) +#define MAP_ARGS2(m,t,a,...) m(t,a), MAP_ARGS1(m,__VA_ARGS__) +#define MAP_ARGS3(m,t,a,...) m(t,a), MAP_ARGS2(m,__VA_ARGS__) +#define MAP_ARGS4(m,t,a,...) m(t,a), MAP_ARGS3(m,__VA_ARGS__) +#define MAP_ARGS5(m,t,a,...) m(t,a), MAP_ARGS4(m,__VA_ARGS__) +#define MAP_ARGS6(m,t,a,...) m(t,a), MAP_ARGS5(m,__VA_ARGS__) +#define MAP_ARGS(n,...) MAP_ARGS##n(__VA_ARGS__) + +#define __DECLARE_ARGS(t, a) t a +#define __UNPACK_ARGS(t, a) a + +#define DECLARE_ARGS(nr_args, args...) MAP_ARGS(nr_args, __DECLARE_ARGS, args) +#define UNPACK_ARGS(nr_args, args...) MAP_ARGS(nr_args, __UNPACK_ARGS, args) + +#define __KVM_SYSCALL_ERROR(_name, _ret) \ + "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) + +/* Define a kvm_<syscall>() API to assert success. */ +#define __KVM_SYSCALL_DEFINE(name, nr_args, args...) \ +static inline void kvm_##name(DECLARE_ARGS(nr_args, args)) \ +{ \ + int r; \ + \ + r = name(UNPACK_ARGS(nr_args, args)); \ + TEST_ASSERT(!r, __KVM_SYSCALL_ERROR(#name, r)); \ +} + +/* + * Macro to define syscall APIs, either because KVM selftests doesn't link to + * the standard library, e.g. libnuma, or because there is no library that yet + * provides the syscall. These + */ +#define KVM_SYSCALL_DEFINE(name, nr_args, args...) \ +static inline long name(DECLARE_ARGS(nr_args, args)) \ +{ \ + return syscall(__NR_##name, UNPACK_ARGS(nr_args, args)); \ +} \ +__KVM_SYSCALL_DEFINE(name, nr_args, args) + +/* + * Special case mmap(), as KVM selftest rarely/never specific an address, + * rarely specify an offset, and because the unique return code requires + * special handling anyways. 
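+ * (On failure mmap() returns MAP_FAILED rather than a negative value, hence the dedicated TEST_ASSERT below.)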
+ */ +static inline void *__kvm_mmap(size_t size, int prot, int flags, int fd, + off_t offset) +{ + void *mem; + + mem = mmap(NULL, size, prot, flags, fd, offset); + TEST_ASSERT(mem != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", + (int)(unsigned long)MAP_FAILED)); + return mem; +} + +static inline void *kvm_mmap(size_t size, int prot, int flags, int fd) +{ + return __kvm_mmap(size, prot, flags, fd, 0); +} + +static inline int kvm_dup(int fd) +{ + int new_fd = dup(fd); + + TEST_ASSERT(new_fd >= 0, __KVM_SYSCALL_ERROR("dup()", new_fd)); + return new_fd; +} + +__KVM_SYSCALL_DEFINE(munmap, 2, void *, mem, size_t, size); +__KVM_SYSCALL_DEFINE(close, 1, int, fd); +__KVM_SYSCALL_DEFINE(fallocate, 4, int, fd, int, mode, loff_t, offset, loff_t, len); +__KVM_SYSCALL_DEFINE(ftruncate, 2, unsigned int, fd, off_t, length); + +#endif /* SELFTEST_KVM_SYSCALLS_H */ diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index d3f3e455c031..81f4355ff28a 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -23,6 +23,7 @@ #include <pthread.h> +#include "kvm_syscalls.h" #include "kvm_util_arch.h" #include "kvm_util_types.h" #include "sparsebit.h" @@ -177,7 +178,7 @@ enum vm_guest_mode { VM_MODE_P40V48_4K, VM_MODE_P40V48_16K, VM_MODE_P40V48_64K, - VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */ + VM_MODE_PXXVYY_4K, /* For 48-bit or 57-bit VA, depending on host support */ VM_MODE_P47V64_4K, VM_MODE_P44V64_4K, VM_MODE_P36V48_4K, @@ -219,7 +220,7 @@ extern enum vm_guest_mode vm_mode_default; #elif defined(__x86_64__) -#define VM_MODE_DEFAULT VM_MODE_PXXV48_4K +#define VM_MODE_DEFAULT VM_MODE_PXXVYY_4K #define MIN_PAGE_SHIFT 12U #define ptes_per_page(page_size) ((page_size) / 8) @@ -283,34 +284,6 @@ static inline bool kvm_has_cap(long cap) return kvm_check_cap(cap); } -#define __KVM_SYSCALL_ERROR(_name, _ret) \ - "%s failed, rc: %i errno: %i (%s)", (_name), (_ret), errno, strerror(errno) - -static inline void *__kvm_mmap(size_t size, int prot, int flags, int fd, - off_t offset) -{ - void *mem; - - mem = mmap(NULL, size, prot, flags, fd, offset); - TEST_ASSERT(mem != MAP_FAILED, __KVM_SYSCALL_ERROR("mmap()", - (int)(unsigned long)MAP_FAILED)); - - return mem; -} - -static inline void *kvm_mmap(size_t size, int prot, int flags, int fd) -{ - return __kvm_mmap(size, prot, flags, fd, 0); -} - -static inline void kvm_munmap(void *mem, size_t size) -{ - int ret; - - ret = munmap(mem, size); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("munmap()", ret)); -} - /* * Use the "inner", double-underscore macro when reporting errors from within * other macros so that the name of ioctl() and not its literal numeric value @@ -700,12 +673,12 @@ int __vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flag uint32_t guest_memfd, uint64_t guest_memfd_offset); void vm_userspace_mem_region_add(struct kvm_vm *vm, - enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags); + enum vm_mem_backing_src_type src_type, + uint64_t gpa, uint32_t slot, uint64_t npages, + uint32_t flags); void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags, int guest_memfd_fd, uint64_t guest_memfd_offset); + uint64_t gpa, uint32_t slot, uint64_t npages, uint32_t flags, + int guest_memfd_fd, uint64_t guest_memfd_offset); #ifndef vm_arch_has_protected_memory static inline bool 
vm_arch_has_protected_memory(struct kvm_vm *vm) @@ -715,6 +688,7 @@ static inline bool vm_arch_has_protected_memory(struct kvm_vm *vm) #endif void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags); +void vm_mem_region_reload(struct kvm_vm *vm, uint32_t slot); void vm_mem_region_move(struct kvm_vm *vm, uint32_t slot, uint64_t new_gpa); void vm_mem_region_delete(struct kvm_vm *vm, uint32_t slot); struct kvm_vcpu *__vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpu_id); @@ -1230,6 +1204,7 @@ void virt_arch_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr); static inline void virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr) { virt_arch_pg_map(vm, vaddr, paddr); + sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); } diff --git a/tools/testing/selftests/kvm/include/loongarch/arch_timer.h b/tools/testing/selftests/kvm/include/loongarch/arch_timer.h new file mode 100644 index 000000000000..2ed106b32c81 --- /dev/null +++ b/tools/testing/selftests/kvm/include/loongarch/arch_timer.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * LoongArch Constant Timer specific interface + */ +#ifndef SELFTEST_KVM_ARCH_TIMER_H +#define SELFTEST_KVM_ARCH_TIMER_H + +#include "processor.h" + +/* LoongArch timer frequency is constant 100MHZ */ +#define TIMER_FREQ (100UL << 20) +#define msec_to_cycles(msec) (TIMER_FREQ * (unsigned long)(msec) / 1000) +#define usec_to_cycles(usec) (TIMER_FREQ * (unsigned long)(usec) / 1000000) +#define cycles_to_usec(cycles) ((unsigned long)(cycles) * 1000000 / TIMER_FREQ) + +static inline unsigned long timer_get_cycles(void) +{ + unsigned long val = 0; + + __asm__ __volatile__( + "rdtime.d %0, $zero\n\t" + : "=r"(val) + : + ); + + return val; +} + +static inline unsigned long timer_get_cfg(void) +{ + return csr_read(LOONGARCH_CSR_TCFG); +} + +static inline unsigned long timer_get_val(void) +{ + return csr_read(LOONGARCH_CSR_TVAL); +} + +static inline void disable_timer(void) +{ + csr_write(0, LOONGARCH_CSR_TCFG); +} + +static inline void timer_irq_enable(void) +{ + unsigned long val; + + val = csr_read(LOONGARCH_CSR_ECFG); + val |= ECFGF_TIMER; + csr_write(val, LOONGARCH_CSR_ECFG); +} + +static inline void timer_irq_disable(void) +{ + unsigned long val; + + val = csr_read(LOONGARCH_CSR_ECFG); + val &= ~ECFGF_TIMER; + csr_write(val, LOONGARCH_CSR_ECFG); +} + +static inline void timer_set_next_cmp_ms(unsigned int msec, bool period) +{ + unsigned long val; + + val = msec_to_cycles(msec) & CSR_TCFG_VAL; + val |= CSR_TCFG_EN; + if (period) + val |= CSR_TCFG_PERIOD; + csr_write(val, LOONGARCH_CSR_TCFG); +} + +static inline void __delay(uint64_t cycles) +{ + uint64_t start = timer_get_cycles(); + + while ((timer_get_cycles() - start) < cycles) + cpu_relax(); +} + +static inline void udelay(unsigned long usec) +{ + __delay(usec_to_cycles(usec)); +} +#endif /* SELFTEST_KVM_ARCH_TIMER_H */ diff --git a/tools/testing/selftests/kvm/include/loongarch/processor.h b/tools/testing/selftests/kvm/include/loongarch/processor.h index 6427a3275e6a..76840ddda57d 100644 --- a/tools/testing/selftests/kvm/include/loongarch/processor.h +++ b/tools/testing/selftests/kvm/include/loongarch/processor.h @@ -83,7 +83,14 @@ #define LOONGARCH_CSR_PRMD 0x1 #define LOONGARCH_CSR_EUEN 0x2 #define LOONGARCH_CSR_ECFG 0x4 +#define ECFGB_TIMER 11 +#define ECFGF_TIMER (BIT_ULL(ECFGB_TIMER)) #define LOONGARCH_CSR_ESTAT 0x5 /* Exception status */ +#define CSR_ESTAT_EXC_SHIFT 16 +#define CSR_ESTAT_EXC_WIDTH 6 +#define CSR_ESTAT_EXC (0x3f << CSR_ESTAT_EXC_SHIFT) 
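+/* Ecode field of ESTAT; route_exception() uses it to index the guest exception handler table. */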
+#define EXCCODE_INT 0 /* Interrupt */ +#define INT_TI 11 /* Timer interrupt*/ #define LOONGARCH_CSR_ERA 0x6 /* ERA */ #define LOONGARCH_CSR_BADV 0x7 /* Bad virtual address */ #define LOONGARCH_CSR_EENTRY 0xc @@ -106,6 +113,14 @@ #define LOONGARCH_CSR_KS1 0x31 #define LOONGARCH_CSR_TMID 0x40 #define LOONGARCH_CSR_TCFG 0x41 +#define CSR_TCFG_VAL (BIT_ULL(48) - BIT_ULL(2)) +#define CSR_TCFG_PERIOD_SHIFT 1 +#define CSR_TCFG_PERIOD (0x1UL << CSR_TCFG_PERIOD_SHIFT) +#define CSR_TCFG_EN (0x1UL) +#define LOONGARCH_CSR_TVAL 0x42 +#define LOONGARCH_CSR_TINTCLR 0x44 /* Timer interrupt clear */ +#define CSR_TINTCLR_TI_SHIFT 0 +#define CSR_TINTCLR_TI (1 << CSR_TINTCLR_TI_SHIFT) /* TLB refill exception entry */ #define LOONGARCH_CSR_TLBRENTRY 0x88 #define LOONGARCH_CSR_TLBRSAVE 0x8b @@ -113,6 +128,28 @@ #define CSR_TLBREHI_PS_SHIFT 0 #define CSR_TLBREHI_PS (0x3fUL << CSR_TLBREHI_PS_SHIFT) +#define csr_read(csr) \ +({ \ + register unsigned long __v; \ + __asm__ __volatile__( \ + "csrrd %[val], %[reg]\n\t" \ + : [val] "=r" (__v) \ + : [reg] "i" (csr) \ + : "memory"); \ + __v; \ +}) + +#define csr_write(v, csr) \ +({ \ + register unsigned long __v = v; \ + __asm__ __volatile__ ( \ + "csrwr %[val], %[reg]\n\t" \ + : [val] "+r" (__v) \ + : [reg] "i" (csr) \ + : "memory"); \ + __v; \ +}) + #define EXREGS_GPRS (32) #ifndef __ASSEMBLER__ @@ -124,18 +161,60 @@ struct ex_regs { unsigned long pc; unsigned long estat; unsigned long badv; + unsigned long prmd; }; #define PC_OFFSET_EXREGS offsetof(struct ex_regs, pc) #define ESTAT_OFFSET_EXREGS offsetof(struct ex_regs, estat) #define BADV_OFFSET_EXREGS offsetof(struct ex_regs, badv) +#define PRMD_OFFSET_EXREGS offsetof(struct ex_regs, prmd) #define EXREGS_SIZE sizeof(struct ex_regs) +#define VECTOR_NUM 64 + +typedef void(*handler_fn)(struct ex_regs *); + +struct handlers { + handler_fn exception_handlers[VECTOR_NUM]; +}; + +void vm_init_descriptor_tables(struct kvm_vm *vm); +void vm_install_exception_handler(struct kvm_vm *vm, int vector, handler_fn handler); + +static inline void cpu_relax(void) +{ + asm volatile("nop" ::: "memory"); +} + +static inline void local_irq_enable(void) +{ + unsigned int flags = CSR_CRMD_IE; + register unsigned int mask asm("$t0") = CSR_CRMD_IE; + + __asm__ __volatile__( + "csrxchg %[val], %[mask], %[reg]\n\t" + : [val] "+r" (flags) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) + : "memory"); +} + +static inline void local_irq_disable(void) +{ + unsigned int flags = 0; + register unsigned int mask asm("$t0") = CSR_CRMD_IE; + + __asm__ __volatile__( + "csrxchg %[val], %[mask], %[reg]\n\t" + : [val] "+r" (flags) + : [mask] "r" (mask), [reg] "i" (LOONGARCH_CSR_CRMD) + : "memory"); +} #else #define PC_OFFSET_EXREGS ((EXREGS_GPRS + 0) * 8) #define ESTAT_OFFSET_EXREGS ((EXREGS_GPRS + 1) * 8) #define BADV_OFFSET_EXREGS ((EXREGS_GPRS + 2) * 8) -#define EXREGS_SIZE ((EXREGS_GPRS + 3) * 8) +#define PRMD_OFFSET_EXREGS ((EXREGS_GPRS + 3) * 8) +#define EXREGS_SIZE ((EXREGS_GPRS + 4) * 8) #endif #endif /* SELFTEST_KVM_PROCESSOR_H */ diff --git a/tools/testing/selftests/kvm/include/numaif.h b/tools/testing/selftests/kvm/include/numaif.h index b020547403fd..29572a6d789c 100644 --- a/tools/testing/selftests/kvm/include/numaif.h +++ b/tools/testing/selftests/kvm/include/numaif.h @@ -1,55 +1,83 @@ /* SPDX-License-Identifier: GPL-2.0-only */ -/* - * tools/testing/selftests/kvm/include/numaif.h - * - * Copyright (C) 2020, Google LLC. - * - * This work is licensed under the terms of the GNU GPL, version 2. 
- * - * Header file that provides access to NUMA API functions not explicitly - * exported to user space. - */ +/* Copyright (C) 2020, Google LLC. */ #ifndef SELFTEST_KVM_NUMAIF_H #define SELFTEST_KVM_NUMAIF_H -#define __NR_get_mempolicy 239 -#define __NR_migrate_pages 256 +#include <dirent.h> -/* System calls */ -long get_mempolicy(int *policy, const unsigned long *nmask, - unsigned long maxnode, void *addr, int flags) +#include <linux/mempolicy.h> + +#include "kvm_syscalls.h" + +KVM_SYSCALL_DEFINE(get_mempolicy, 5, int *, policy, const unsigned long *, nmask, + unsigned long, maxnode, void *, addr, int, flags); + +KVM_SYSCALL_DEFINE(set_mempolicy, 3, int, mode, const unsigned long *, nmask, + unsigned long, maxnode); + +KVM_SYSCALL_DEFINE(set_mempolicy_home_node, 4, unsigned long, start, + unsigned long, len, unsigned long, home_node, + unsigned long, flags); + +KVM_SYSCALL_DEFINE(migrate_pages, 4, int, pid, unsigned long, maxnode, + const unsigned long *, frommask, const unsigned long *, tomask); + +KVM_SYSCALL_DEFINE(move_pages, 6, int, pid, unsigned long, count, void *, pages, + const int *, nodes, int *, status, int, flags); + +KVM_SYSCALL_DEFINE(mbind, 6, void *, addr, unsigned long, size, int, mode, + const unsigned long *, nodemask, unsigned long, maxnode, + unsigned int, flags); + +static inline int get_max_numa_node(void) { - return syscall(__NR_get_mempolicy, policy, nmask, - maxnode, addr, flags); + struct dirent *de; + int max_node = 0; + DIR *d; + + /* + * Assume there's a single node if the kernel doesn't support NUMA, + * or if no nodes are found. + */ + d = opendir("/sys/devices/system/node"); + if (!d) + return 0; + + while ((de = readdir(d)) != NULL) { + int node_id; + char *endptr; + + if (strncmp(de->d_name, "node", 4) != 0) + continue; + + node_id = strtol(de->d_name + 4, &endptr, 10); + if (*endptr != '\0') + continue; + + if (node_id > max_node) + max_node = node_id; + } + closedir(d); + + return max_node; } -long migrate_pages(int pid, unsigned long maxnode, - const unsigned long *frommask, - const unsigned long *tomask) +static bool is_numa_available(void) { - return syscall(__NR_migrate_pages, pid, maxnode, frommask, tomask); + /* + * Probe for NUMA by doing a dummy get_mempolicy(). If the syscall + * fails with ENOSYS, then the kernel was built without NUMA support. + * if the syscall fails with EPERM, then the process/user lacks the + * necessary capabilities (CAP_SYS_NICE). + */ + return !get_mempolicy(NULL, NULL, 0, NULL, 0) || + (errno != ENOSYS && errno != EPERM); } -/* Policies */ -#define MPOL_DEFAULT 0 -#define MPOL_PREFERRED 1 -#define MPOL_BIND 2 -#define MPOL_INTERLEAVE 3 - -#define MPOL_MAX MPOL_INTERLEAVE - -/* Flags for get_mem_policy */ -#define MPOL_F_NODE (1<<0) /* return next il node or node of address */ - /* Warning: MPOL_F_NODE is unsupported and - * subject to change. Don't use. 
- */ -#define MPOL_F_ADDR (1<<1) /* look up vma using address */ -#define MPOL_F_MEMS_ALLOWED (1<<2) /* query nodes allowed in cpuset */ - -/* Flags for mbind */ -#define MPOL_MF_STRICT (1<<0) /* Verify existing pages in the mapping */ -#define MPOL_MF_MOVE (1<<1) /* Move pages owned by this process to conform to mapping */ -#define MPOL_MF_MOVE_ALL (1<<2) /* Move every page to conform to mapping */ +static inline bool is_multi_numa_node_system(void) +{ + return is_numa_available() && get_max_numa_node() >= 1; +} #endif /* SELFTEST_KVM_NUMAIF_H */ diff --git a/tools/testing/selftests/kvm/include/x86/processor.h b/tools/testing/selftests/kvm/include/x86/processor.h index 51cd84b9ca66..57d62a425109 100644 --- a/tools/testing/selftests/kvm/include/x86/processor.h +++ b/tools/testing/selftests/kvm/include/x86/processor.h @@ -1441,7 +1441,7 @@ enum pg_level { PG_LEVEL_2M, PG_LEVEL_1G, PG_LEVEL_512G, - PG_LEVEL_NUM + PG_LEVEL_256T }; #define PG_LEVEL_SHIFT(_level) ((_level - 1) * 9 + 12) diff --git a/tools/testing/selftests/kvm/include/x86/vmx.h b/tools/testing/selftests/kvm/include/x86/vmx.h index edb3c391b982..96e2b4c630a9 100644 --- a/tools/testing/selftests/kvm/include/x86/vmx.h +++ b/tools/testing/selftests/kvm/include/x86/vmx.h @@ -568,8 +568,7 @@ void nested_map_memslot(struct vmx_pages *vmx, struct kvm_vm *vm, void nested_identity_map_1g(struct vmx_pages *vmx, struct kvm_vm *vm, uint64_t addr, uint64_t size); bool kvm_cpu_has_ept(void); -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot); +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm); void prepare_virtualize_apic_accesses(struct vmx_pages *vmx, struct kvm_vm *vm); #endif /* SELFTEST_KVM_VMX_H */ diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index f02355c3c4c2..b7dbde9c0843 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -239,14 +239,14 @@ int main(int argc, char *argv[]) * single stats file works and doesn't cause explosions. */ vm_stats_fds = vm_get_stats_fd(vms[i]); - stats_test(dup(vm_stats_fds)); + stats_test(kvm_dup(vm_stats_fds)); /* Verify userspace can instantiate multiple stats files. 
*/ stats_test(vm_get_stats_fd(vms[i])); for (j = 0; j < max_vcpu; ++j) { vcpu_stats_fds[j] = vcpu_get_stats_fd(vcpus[i * max_vcpu + j]); - stats_test(dup(vcpu_stats_fds[j])); + stats_test(kvm_dup(vcpu_stats_fds[j])); stats_test(vcpu_get_stats_fd(vcpus[i * max_vcpu + j])); } diff --git a/tools/testing/selftests/kvm/lib/arm64/gic.c b/tools/testing/selftests/kvm/lib/arm64/gic.c index 7abbf8866512..b023868fe0b8 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic.c @@ -155,3 +155,9 @@ void gic_irq_set_config(unsigned int intid, bool is_edge) GUEST_ASSERT(gic_common_ops); gic_common_ops->gic_irq_set_config(intid, is_edge); } + +void gic_irq_set_group(unsigned int intid, bool group) +{ + GUEST_ASSERT(gic_common_ops); + gic_common_ops->gic_irq_set_group(intid, group); +} diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_private.h b/tools/testing/selftests/kvm/lib/arm64/gic_private.h index d24e9ecc96c6..b6a7e30c3eb1 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_private.h +++ b/tools/testing/selftests/kvm/lib/arm64/gic_private.h @@ -25,6 +25,7 @@ struct gic_common_ops { void (*gic_irq_clear_pending)(uint32_t intid); bool (*gic_irq_get_pending)(uint32_t intid); void (*gic_irq_set_config)(uint32_t intid, bool is_edge); + void (*gic_irq_set_group)(uint32_t intid, bool group); }; extern const struct gic_common_ops gicv3_ops; diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c index 66d05506f78b..50754a27f493 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3.c @@ -293,17 +293,36 @@ static void gicv3_enable_redist(volatile void *redist_base) } } +static void gicv3_set_group(uint32_t intid, bool grp) +{ + uint32_t cpu_or_dist; + uint32_t val; + + cpu_or_dist = (get_intid_range(intid) == SPI_RANGE) ? 
DIST_BIT : guest_get_vcpuid(); + val = gicv3_reg_readl(cpu_or_dist, GICD_IGROUPR + (intid / 32) * 4); + if (grp) + val |= BIT(intid % 32); + else + val &= ~BIT(intid % 32); + gicv3_reg_writel(cpu_or_dist, GICD_IGROUPR + (intid / 32) * 4, val); +} + static void gicv3_cpu_init(unsigned int cpu) { volatile void *sgi_base; unsigned int i; volatile void *redist_base_cpu; + u64 typer; GUEST_ASSERT(cpu < gicv3_data.nr_cpus); redist_base_cpu = gicr_base_cpu(cpu); sgi_base = sgi_base_from_redist(redist_base_cpu); + /* Verify assumption that GICR_TYPER.Processor_number == cpu */ + typer = readq_relaxed(redist_base_cpu + GICR_TYPER); + GUEST_ASSERT_EQ(GICR_TYPER_CPU_NUMBER(typer), cpu); + gicv3_enable_redist(redist_base_cpu); /* @@ -328,6 +347,8 @@ static void gicv3_cpu_init(unsigned int cpu) /* Set a default priority threshold */ write_sysreg_s(ICC_PMR_DEF_PRIO, SYS_ICC_PMR_EL1); + /* Disable Group-0 interrupts */ + write_sysreg_s(ICC_IGRPEN0_EL1_MASK, SYS_ICC_IGRPEN1_EL1); /* Enable non-secure Group-1 interrupts */ write_sysreg_s(ICC_IGRPEN1_EL1_MASK, SYS_ICC_IGRPEN1_EL1); } @@ -400,6 +421,7 @@ const struct gic_common_ops gicv3_ops = { .gic_irq_clear_pending = gicv3_irq_clear_pending, .gic_irq_get_pending = gicv3_irq_get_pending, .gic_irq_set_config = gicv3_irq_set_config, + .gic_irq_set_group = gicv3_set_group, }; void gic_rdist_enable_lpis(vm_paddr_t cfg_table, size_t cfg_table_size, diff --git a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c index 0e2f8ed90f30..7f9fdcf42ae6 100644 --- a/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c +++ b/tools/testing/selftests/kvm/lib/arm64/gic_v3_its.c @@ -253,3 +253,13 @@ void its_send_invall_cmd(void *cmdq_base, u32 collection_id) its_send_cmd(cmdq_base, &cmd); } + +void its_send_sync_cmd(void *cmdq_base, u32 vcpu_id) +{ + struct its_cmd_block cmd = {}; + + its_encode_cmd(&cmd, GITS_CMD_SYNC); + its_encode_target(&cmd, procnum_to_rdbase(vcpu_id)); + + its_send_cmd(cmdq_base, &cmd); +} diff --git a/tools/testing/selftests/kvm/lib/arm64/processor.c b/tools/testing/selftests/kvm/lib/arm64/processor.c index 54f6d17c78f7..d46e4b13b92c 100644 --- a/tools/testing/selftests/kvm/lib/arm64/processor.c +++ b/tools/testing/selftests/kvm/lib/arm64/processor.c @@ -324,7 +324,7 @@ void aarch64_vcpu_setup(struct kvm_vcpu *vcpu, struct kvm_vcpu_init *init) /* Configure base granule size */ switch (vm->mode) { - case VM_MODE_PXXV48_4K: + case VM_MODE_PXXVYY_4K: TEST_FAIL("AArch64 does not support 4K sized pages " "with ANY-bit physical address ranges"); case VM_MODE_P52V48_64K: diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 1a93d6361671..8279b6ced8d2 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -201,7 +201,7 @@ const char *vm_guest_mode_string(uint32_t i) [VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages", [VM_MODE_P40V48_16K] = "PA-bits:40, VA-bits:48, 16K pages", [VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages", - [VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages", + [VM_MODE_PXXVYY_4K] = "PA-bits:ANY, VA-bits:48 or 57, 4K pages", [VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages", [VM_MODE_P44V64_4K] = "PA-bits:44, VA-bits:64, 4K pages", [VM_MODE_P36V48_4K] = "PA-bits:36, VA-bits:48, 4K pages", @@ -228,7 +228,7 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = { [VM_MODE_P40V48_4K] = { 40, 48, 0x1000, 12 }, [VM_MODE_P40V48_16K] = { 40, 48, 0x4000, 14 
}, [VM_MODE_P40V48_64K] = { 40, 48, 0x10000, 16 }, - [VM_MODE_PXXV48_4K] = { 0, 0, 0x1000, 12 }, + [VM_MODE_PXXVYY_4K] = { 0, 0, 0x1000, 12 }, [VM_MODE_P47V64_4K] = { 47, 64, 0x1000, 12 }, [VM_MODE_P44V64_4K] = { 44, 64, 0x1000, 12 }, [VM_MODE_P36V48_4K] = { 36, 48, 0x1000, 12 }, @@ -310,24 +310,26 @@ struct kvm_vm *____vm_create(struct vm_shape shape) case VM_MODE_P36V47_16K: vm->pgtable_levels = 3; break; - case VM_MODE_PXXV48_4K: + case VM_MODE_PXXVYY_4K: #ifdef __x86_64__ kvm_get_cpu_address_width(&vm->pa_bits, &vm->va_bits); kvm_init_vm_address_properties(vm); - /* - * Ignore KVM support for 5-level paging (vm->va_bits == 57), - * it doesn't take effect unless a CR4.LA57 is set, which it - * isn't for this mode (48-bit virtual address space). - */ - TEST_ASSERT(vm->va_bits == 48 || vm->va_bits == 57, - "Linear address width (%d bits) not supported", - vm->va_bits); + pr_debug("Guest physical address width detected: %d\n", vm->pa_bits); - vm->pgtable_levels = 4; - vm->va_bits = 48; + pr_debug("Guest virtual address width detected: %d\n", + vm->va_bits); + + if (vm->va_bits == 57) { + vm->pgtable_levels = 5; + } else { + TEST_ASSERT(vm->va_bits == 48, + "Unexpected guest virtual address width: %d", + vm->va_bits); + vm->pgtable_levels = 4; + } #else - TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms"); + TEST_FAIL("VM_MODE_PXXVYY_4K not supported on non-x86 platforms"); #endif break; case VM_MODE_P47V64_4K: @@ -704,8 +706,6 @@ userspace_mem_region_find(struct kvm_vm *vm, uint64_t start, uint64_t end) static void kvm_stats_release(struct kvm_binary_stats *stats) { - int ret; - if (stats->fd < 0) return; @@ -714,8 +714,7 @@ static void kvm_stats_release(struct kvm_binary_stats *stats) stats->desc = NULL; } - ret = close(stats->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + kvm_close(stats->fd); stats->fd = -1; } @@ -738,8 +737,6 @@ __weak void vcpu_arch_free(struct kvm_vcpu *vcpu) */ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { - int ret; - if (vcpu->dirty_gfns) { kvm_munmap(vcpu->dirty_gfns, vm->dirty_ring_size); vcpu->dirty_gfns = NULL; @@ -747,9 +744,7 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) kvm_munmap(vcpu->run, vcpu_mmap_sz()); - ret = close(vcpu->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); - + kvm_close(vcpu->fd); kvm_stats_release(&vcpu->stats); list_del(&vcpu->list); @@ -761,16 +756,12 @@ static void vm_vcpu_rm(struct kvm_vm *vm, struct kvm_vcpu *vcpu) void kvm_vm_release(struct kvm_vm *vmp) { struct kvm_vcpu *vcpu, *tmp; - int ret; list_for_each_entry_safe(vcpu, tmp, &vmp->vcpus, list) vm_vcpu_rm(vmp, vcpu); - ret = close(vmp->fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); - - ret = close(vmp->kvm_fd); - TEST_ASSERT(!ret, __KVM_SYSCALL_ERROR("close()", ret)); + kvm_close(vmp->fd); + kvm_close(vmp->kvm_fd); /* Free cached stats metadata and close FD */ kvm_stats_release(&vmp->stats); @@ -828,7 +819,7 @@ void kvm_vm_free(struct kvm_vm *vmp) int kvm_memfd_alloc(size_t size, bool hugepages) { int memfd_flags = MFD_CLOEXEC; - int fd, r; + int fd; if (hugepages) memfd_flags |= MFD_HUGETLB; @@ -836,11 +827,8 @@ int kvm_memfd_alloc(size_t size, bool hugepages) fd = memfd_create("kvm_selftest", memfd_flags); TEST_ASSERT(fd != -1, __KVM_SYSCALL_ERROR("memfd_create()", fd)); - r = ftruncate(fd, size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("ftruncate()", r)); - - r = fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); - TEST_ASSERT(!r, 
__KVM_SYSCALL_ERROR("fallocate()", r)); + kvm_ftruncate(fd, size); + kvm_fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0, size); return fd; } @@ -957,8 +945,8 @@ void vm_set_user_memory_region2(struct kvm_vm *vm, uint32_t slot, uint32_t flags /* FIXME: This thing needs to be ripped apart and rewritten. */ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, uint64_t npages, - uint32_t flags, int guest_memfd, uint64_t guest_memfd_offset) + uint64_t gpa, uint32_t slot, uint64_t npages, uint32_t flags, + int guest_memfd, uint64_t guest_memfd_offset) { int ret; struct userspace_mem_region *region; @@ -972,30 +960,29 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, "Number of guest pages is not compatible with the host. " "Try npages=%d", vm_adjust_num_guest_pages(vm->mode, npages)); - TEST_ASSERT((guest_paddr % vm->page_size) == 0, "Guest physical " + TEST_ASSERT((gpa % vm->page_size) == 0, "Guest physical " "address not on a page boundary.\n" - " guest_paddr: 0x%lx vm->page_size: 0x%x", - guest_paddr, vm->page_size); - TEST_ASSERT((((guest_paddr >> vm->page_shift) + npages) - 1) + " gpa: 0x%lx vm->page_size: 0x%x", + gpa, vm->page_size); + TEST_ASSERT((((gpa >> vm->page_shift) + npages) - 1) <= vm->max_gfn, "Physical range beyond maximum " "supported physical address,\n" - " guest_paddr: 0x%lx npages: 0x%lx\n" + " gpa: 0x%lx npages: 0x%lx\n" " vm->max_gfn: 0x%lx vm->page_size: 0x%x", - guest_paddr, npages, vm->max_gfn, vm->page_size); + gpa, npages, vm->max_gfn, vm->page_size); /* * Confirm a mem region with an overlapping address doesn't * already exist. */ region = (struct userspace_mem_region *) userspace_mem_region_find( - vm, guest_paddr, (guest_paddr + npages * vm->page_size) - 1); + vm, gpa, (gpa + npages * vm->page_size) - 1); if (region != NULL) TEST_FAIL("overlapping userspace_mem_region already " "exists\n" - " requested guest_paddr: 0x%lx npages: 0x%lx " - "page_size: 0x%x\n" - " existing guest_paddr: 0x%lx size: 0x%lx", - guest_paddr, npages, vm->page_size, + " requested gpa: 0x%lx npages: 0x%lx page_size: 0x%x\n" + " existing gpa: 0x%lx size: 0x%lx", + gpa, npages, vm->page_size, (uint64_t) region->region.guest_phys_addr, (uint64_t) region->region.memory_size); @@ -1009,8 +996,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, "already exists.\n" " requested slot: %u paddr: 0x%lx npages: 0x%lx\n" " existing slot: %u paddr: 0x%lx size: 0x%lx", - slot, guest_paddr, npages, - region->region.slot, + slot, gpa, npages, region->region.slot, (uint64_t) region->region.guest_phys_addr, (uint64_t) region->region.memory_size); } @@ -1036,7 +1022,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, if (src_type == VM_MEM_SRC_ANONYMOUS_THP) alignment = max(backing_src_pagesz, alignment); - TEST_ASSERT_EQ(guest_paddr, align_up(guest_paddr, backing_src_pagesz)); + TEST_ASSERT_EQ(gpa, align_up(gpa, backing_src_pagesz)); /* Add enough memory to align up if necessary */ if (alignment > 1) @@ -1084,8 +1070,7 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, * needing to track if the fd is owned by the framework * or by the caller. 
*/ - guest_memfd = dup(guest_memfd); - TEST_ASSERT(guest_memfd >= 0, __KVM_SYSCALL_ERROR("dup()", guest_memfd)); + guest_memfd = kvm_dup(guest_memfd); } region->region.guest_memfd = guest_memfd; @@ -1097,20 +1082,18 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, region->unused_phy_pages = sparsebit_alloc(); if (vm_arch_has_protected_memory(vm)) region->protected_phy_pages = sparsebit_alloc(); - sparsebit_set_num(region->unused_phy_pages, - guest_paddr >> vm->page_shift, npages); + sparsebit_set_num(region->unused_phy_pages, gpa >> vm->page_shift, npages); region->region.slot = slot; region->region.flags = flags; - region->region.guest_phys_addr = guest_paddr; + region->region.guest_phys_addr = gpa; region->region.memory_size = npages * vm->page_size; region->region.userspace_addr = (uintptr_t) region->host_mem; ret = __vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION2 IOCTL failed,\n" " rc: %i errno: %i\n" " slot: %u flags: 0x%x\n" - " guest_phys_addr: 0x%lx size: 0x%lx guest_memfd: %d", - ret, errno, slot, flags, - guest_paddr, (uint64_t) region->region.memory_size, + " guest_phys_addr: 0x%lx size: 0x%llx guest_memfd: %d", + ret, errno, slot, flags, gpa, region->region.memory_size, region->region.guest_memfd); /* Add to quick lookup data structures */ @@ -1132,10 +1115,10 @@ void vm_mem_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, void vm_userspace_mem_region_add(struct kvm_vm *vm, enum vm_mem_backing_src_type src_type, - uint64_t guest_paddr, uint32_t slot, - uint64_t npages, uint32_t flags) + uint64_t gpa, uint32_t slot, uint64_t npages, + uint32_t flags) { - vm_mem_add(vm, src_type, guest_paddr, slot, npages, flags, -1, 0); + vm_mem_add(vm, src_type, gpa, slot, npages, flags, -1, 0); } /* @@ -1201,6 +1184,16 @@ void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags) ret, errno, slot, flags); } +void vm_mem_region_reload(struct kvm_vm *vm, uint32_t slot) +{ + struct userspace_mem_region *region = memslot2region(vm, slot); + struct kvm_userspace_memory_region2 tmp = region->region; + + tmp.memory_size = 0; + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, &tmp); + vm_ioctl(vm, KVM_SET_USER_MEMORY_REGION2, ®ion->region); +} + /* * VM Memory Region Move * @@ -1456,8 +1449,6 @@ static vm_vaddr_t ____vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, pages--, vaddr += vm->page_size, paddr += vm->page_size) { virt_pg_map(vm, vaddr, paddr); - - sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); } return vaddr_start; @@ -1571,7 +1562,6 @@ void virt_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, while (npages--) { virt_pg_map(vm, vaddr, paddr); - sparsebit_set(vm->vpages_mapped, vaddr >> vm->page_shift); vaddr += page_size; paddr += page_size; @@ -2025,6 +2015,7 @@ static struct exit_reason { KVM_EXIT_STRING(NOTIFY), KVM_EXIT_STRING(LOONGARCH_IOCSR), KVM_EXIT_STRING(MEMORY_FAULT), + KVM_EXIT_STRING(ARM_SEA), }; /* @@ -2305,11 +2296,35 @@ __weak void kvm_selftest_arch_init(void) { } +static void report_unexpected_signal(int signum) +{ +#define KVM_CASE_SIGNUM(sig) \ + case sig: TEST_FAIL("Unexpected " #sig " (%d)\n", signum) + + switch (signum) { + KVM_CASE_SIGNUM(SIGBUS); + KVM_CASE_SIGNUM(SIGSEGV); + KVM_CASE_SIGNUM(SIGILL); + KVM_CASE_SIGNUM(SIGFPE); + default: + TEST_FAIL("Unexpected signal %d\n", signum); + } +} + void __attribute((constructor)) kvm_selftest_init(void) { + struct sigaction sig_sa = { + .sa_handler = report_unexpected_signal, + }; + /* Tell 
stdout not to buffer its content. */ setbuf(stdout, NULL); + sigaction(SIGBUS, &sig_sa, NULL); + sigaction(SIGSEGV, &sig_sa, NULL); + sigaction(SIGILL, &sig_sa, NULL); + sigaction(SIGFPE, &sig_sa, NULL); + guest_random_seed = last_guest_seed = random(); pr_info("Random seed: 0x%x\n", guest_random_seed); diff --git a/tools/testing/selftests/kvm/lib/loongarch/exception.S b/tools/testing/selftests/kvm/lib/loongarch/exception.S index 88bfa505c6f5..3f1e4b67c5ae 100644 --- a/tools/testing/selftests/kvm/lib/loongarch/exception.S +++ b/tools/testing/selftests/kvm/lib/loongarch/exception.S @@ -51,9 +51,15 @@ handle_exception: st.d t0, sp, ESTAT_OFFSET_EXREGS csrrd t0, LOONGARCH_CSR_BADV st.d t0, sp, BADV_OFFSET_EXREGS + csrrd t0, LOONGARCH_CSR_PRMD + st.d t0, sp, PRMD_OFFSET_EXREGS or a0, sp, zero bl route_exception + ld.d t0, sp, PC_OFFSET_EXREGS + csrwr t0, LOONGARCH_CSR_ERA + ld.d t0, sp, PRMD_OFFSET_EXREGS + csrwr t0, LOONGARCH_CSR_PRMD restore_gprs sp csrrd sp, LOONGARCH_CSR_KS0 ertn diff --git a/tools/testing/selftests/kvm/lib/loongarch/processor.c b/tools/testing/selftests/kvm/lib/loongarch/processor.c index 0ac1abcb71cb..07c103369ddb 100644 --- a/tools/testing/selftests/kvm/lib/loongarch/processor.c +++ b/tools/testing/selftests/kvm/lib/loongarch/processor.c @@ -3,6 +3,7 @@ #include <assert.h> #include <linux/compiler.h> +#include <asm/kvm.h> #include "kvm_util.h" #include "processor.h" #include "ucall_common.h" @@ -11,6 +12,7 @@ #define LOONGARCH_GUEST_STACK_VADDR_MIN 0x200000 static vm_paddr_t invalid_pgtable[4]; +static vm_vaddr_t exception_handlers; static uint64_t virt_pte_index(struct kvm_vm *vm, vm_vaddr_t gva, int level) { @@ -183,7 +185,14 @@ void assert_on_unhandled_exception(struct kvm_vcpu *vcpu) void route_exception(struct ex_regs *regs) { + int vector; unsigned long pc, estat, badv; + struct handlers *handlers; + + handlers = (struct handlers *)exception_handlers; + vector = (regs->estat & CSR_ESTAT_EXC) >> CSR_ESTAT_EXC_SHIFT; + if (handlers && handlers->exception_handlers[vector]) + return handlers->exception_handlers[vector](regs); pc = regs->pc; badv = regs->badv; @@ -192,6 +201,32 @@ void route_exception(struct ex_regs *regs) while (1) ; } +void vm_init_descriptor_tables(struct kvm_vm *vm) +{ + void *addr; + + vm->handlers = __vm_vaddr_alloc(vm, sizeof(struct handlers), + LOONGARCH_GUEST_STACK_VADDR_MIN, MEM_REGION_DATA); + + addr = addr_gva2hva(vm, vm->handlers); + memset(addr, 0, vm->page_size); + exception_handlers = vm->handlers; + sync_global_to_guest(vm, exception_handlers); +} + +void vm_install_exception_handler(struct kvm_vm *vm, int vector, handler_fn handler) +{ + struct handlers *handlers = addr_gva2hva(vm, vm->handlers); + + assert(vector < VECTOR_NUM); + handlers->exception_handlers[vector] = handler; +} + +uint32_t guest_get_vcpuid(void) +{ + return csr_read(LOONGARCH_CSR_CPUID); +} + void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) { int i; @@ -211,6 +246,11 @@ void vcpu_args_set(struct kvm_vcpu *vcpu, unsigned int num, ...) 
vcpu_regs_set(vcpu, ®s); } +static void loongarch_set_reg(struct kvm_vcpu *vcpu, uint64_t id, uint64_t val) +{ + __vcpu_set_reg(vcpu, id, val); +} + static void loongarch_get_csr(struct kvm_vcpu *vcpu, uint64_t id, void *addr) { uint64_t csrid; @@ -242,8 +282,8 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) TEST_FAIL("Unknown guest mode, mode: 0x%x", vm->mode); } - /* user mode and page enable mode */ - val = PLV_USER | CSR_CRMD_PG; + /* kernel mode and page enable mode */ + val = PLV_KERN | CSR_CRMD_PG; loongarch_set_csr(vcpu, LOONGARCH_CSR_CRMD, val); loongarch_set_csr(vcpu, LOONGARCH_CSR_PRMD, val); loongarch_set_csr(vcpu, LOONGARCH_CSR_EUEN, 1); @@ -251,7 +291,10 @@ static void loongarch_vcpu_setup(struct kvm_vcpu *vcpu) loongarch_set_csr(vcpu, LOONGARCH_CSR_TCFG, 0); loongarch_set_csr(vcpu, LOONGARCH_CSR_ASID, 1); + /* time count start from 0 */ val = 0; + loongarch_set_reg(vcpu, KVM_REG_LOONGARCH_COUNTER, val); + width = vm->page_shift - 3; switch (vm->pgtable_levels) { diff --git a/tools/testing/selftests/kvm/lib/x86/memstress.c b/tools/testing/selftests/kvm/lib/x86/memstress.c index 7f5d62a65c68..0b1f288ad556 100644 --- a/tools/testing/selftests/kvm/lib/x86/memstress.c +++ b/tools/testing/selftests/kvm/lib/x86/memstress.c @@ -63,7 +63,7 @@ void memstress_setup_ept(struct vmx_pages *vmx, struct kvm_vm *vm) { uint64_t start, end; - prepare_eptp(vmx, vm, 0); + prepare_eptp(vmx, vm); /* * Identity map the first 4G and the test region with 1G pages so that diff --git a/tools/testing/selftests/kvm/lib/x86/processor.c b/tools/testing/selftests/kvm/lib/x86/processor.c index b418502c5ecc..36104d27f3d9 100644 --- a/tools/testing/selftests/kvm/lib/x86/processor.c +++ b/tools/testing/selftests/kvm/lib/x86/processor.c @@ -158,10 +158,10 @@ bool kvm_is_tdp_enabled(void) void virt_arch_pgd_alloc(struct kvm_vm *vm) { - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); - /* If needed, create page map l4 table. */ + /* If needed, create the top-level page table. */ if (!vm->pgd_created) { vm->pgd = vm_alloc_page_table(vm); vm->pgd_created = true; @@ -218,11 +218,11 @@ static uint64_t *virt_create_upper_pte(struct kvm_vm *vm, void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) { const uint64_t pg_size = PG_LEVEL_SIZE(level); - uint64_t *pml4e, *pdpe, *pde; - uint64_t *pte; + uint64_t *pte = &vm->pgd; + int current_level; - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, - "Unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); TEST_ASSERT((vaddr % pg_size) == 0, "Virtual address not aligned,\n" @@ -243,20 +243,17 @@ void __virt_pg_map(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, int level) * Allocate upper level page tables, if not already present. Return * early if a hugepage was created. 
*/ - pml4e = virt_create_upper_pte(vm, &vm->pgd, vaddr, paddr, PG_LEVEL_512G, level); - if (*pml4e & PTE_LARGE_MASK) - return; - - pdpe = virt_create_upper_pte(vm, pml4e, vaddr, paddr, PG_LEVEL_1G, level); - if (*pdpe & PTE_LARGE_MASK) - return; - - pde = virt_create_upper_pte(vm, pdpe, vaddr, paddr, PG_LEVEL_2M, level); - if (*pde & PTE_LARGE_MASK) - return; + for (current_level = vm->pgtable_levels; + current_level > PG_LEVEL_4K; + current_level--) { + pte = virt_create_upper_pte(vm, pte, vaddr, paddr, + current_level, level); + if (*pte & PTE_LARGE_MASK) + return; + } /* Fill in page table entry. */ - pte = virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); + pte = virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); TEST_ASSERT(!(*pte & PTE_PRESENT_MASK), "PTE already present for 4k page at vaddr: 0x%lx", vaddr); *pte = PTE_PRESENT_MASK | PTE_WRITABLE_MASK | (paddr & PHYSICAL_PAGE_MASK); @@ -289,6 +286,8 @@ void virt_map_level(struct kvm_vm *vm, uint64_t vaddr, uint64_t paddr, for (i = 0; i < nr_pages; i++) { __virt_pg_map(vm, vaddr, paddr, level); + sparsebit_set_num(vm->vpages_mapped, vaddr >> vm->page_shift, + nr_bytes / PAGE_SIZE); vaddr += pg_size; paddr += pg_size; @@ -310,40 +309,38 @@ static bool vm_is_target_pte(uint64_t *pte, int *level, int current_level) uint64_t *__vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr, int *level) { - uint64_t *pml4e, *pdpe, *pde; + int va_width = 12 + (vm->pgtable_levels) * 9; + uint64_t *pte = &vm->pgd; + int current_level; TEST_ASSERT(!vm->arch.is_pt_protected, "Walking page tables of protected guests is impossible"); - TEST_ASSERT(*level >= PG_LEVEL_NONE && *level < PG_LEVEL_NUM, + TEST_ASSERT(*level >= PG_LEVEL_NONE && *level <= vm->pgtable_levels, "Invalid PG_LEVEL_* '%d'", *level); - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); TEST_ASSERT(sparsebit_is_set(vm->vpages_valid, (vaddr >> vm->page_shift)), "Invalid virtual address, vaddr: 0x%lx", vaddr); /* - * Based on the mode check above there are 48 bits in the vaddr, so - * shift 16 to sign extend the last bit (bit-47), + * Check that the vaddr is a sign-extended va_width value. */ - TEST_ASSERT(vaddr == (((int64_t)vaddr << 16) >> 16), - "Canonical check failed. The virtual address is invalid."); - - pml4e = virt_get_pte(vm, &vm->pgd, vaddr, PG_LEVEL_512G); - if (vm_is_target_pte(pml4e, level, PG_LEVEL_512G)) - return pml4e; - - pdpe = virt_get_pte(vm, pml4e, vaddr, PG_LEVEL_1G); - if (vm_is_target_pte(pdpe, level, PG_LEVEL_1G)) - return pdpe; - - pde = virt_get_pte(vm, pdpe, vaddr, PG_LEVEL_2M); - if (vm_is_target_pte(pde, level, PG_LEVEL_2M)) - return pde; + TEST_ASSERT(vaddr == + (((int64_t)vaddr << (64 - va_width) >> (64 - va_width))), + "Canonical check failed. 
The virtual address is invalid."); + + for (current_level = vm->pgtable_levels; + current_level > PG_LEVEL_4K; + current_level--) { + pte = virt_get_pte(vm, pte, vaddr, current_level); + if (vm_is_target_pte(pte, level, current_level)) + return pte; + } - return virt_get_pte(vm, pde, vaddr, PG_LEVEL_4K); + return virt_get_pte(vm, pte, vaddr, PG_LEVEL_4K); } uint64_t *vm_get_page_table_entry(struct kvm_vm *vm, uint64_t vaddr) @@ -526,7 +523,8 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) { struct kvm_sregs sregs; - TEST_ASSERT_EQ(vm->mode, VM_MODE_PXXV48_4K); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); /* Set mode specific system register values. */ vcpu_sregs_get(vcpu, &sregs); @@ -540,6 +538,8 @@ static void vcpu_init_sregs(struct kvm_vm *vm, struct kvm_vcpu *vcpu) sregs.cr4 |= X86_CR4_PAE | X86_CR4_OSFXSR; if (kvm_cpu_has(X86_FEATURE_XSAVE)) sregs.cr4 |= X86_CR4_OSXSAVE; + if (vm->pgtable_levels == 5) + sregs.cr4 |= X86_CR4_LA57; sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX); kvm_seg_set_unusable(&sregs.ldt); diff --git a/tools/testing/selftests/kvm/lib/x86/vmx.c b/tools/testing/selftests/kvm/lib/x86/vmx.c index d4d1208dd023..29b082a58daa 100644 --- a/tools/testing/selftests/kvm/lib/x86/vmx.c +++ b/tools/testing/selftests/kvm/lib/x86/vmx.c @@ -401,11 +401,11 @@ void __nested_pg_map(struct vmx_pages *vmx, struct kvm_vm *vm, struct eptPageTableEntry *pt = vmx->eptp_hva, *pte; uint16_t index; - TEST_ASSERT(vm->mode == VM_MODE_PXXV48_4K, "Attempt to use " - "unknown or unsupported guest mode, mode: 0x%x", vm->mode); + TEST_ASSERT(vm->mode == VM_MODE_PXXVYY_4K, + "Unknown or unsupported guest mode: 0x%x", vm->mode); TEST_ASSERT((nested_paddr >> 48) == 0, - "Nested physical address 0x%lx requires 5-level paging", + "Nested physical address 0x%lx is > 48-bits and requires 5-level EPT", nested_paddr); TEST_ASSERT((nested_paddr % page_size) == 0, "Nested physical address not on page boundary,\n" @@ -534,8 +534,7 @@ bool kvm_cpu_has_ept(void) return ctrl & SECONDARY_EXEC_ENABLE_EPT; } -void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm, - uint32_t eptp_memslot) +void prepare_eptp(struct vmx_pages *vmx, struct kvm_vm *vm) { TEST_ASSERT(kvm_cpu_has_ept(), "KVM doesn't support nested EPT"); diff --git a/tools/testing/selftests/kvm/loongarch/arch_timer.c b/tools/testing/selftests/kvm/loongarch/arch_timer.c new file mode 100644 index 000000000000..355ecac30954 --- /dev/null +++ b/tools/testing/selftests/kvm/loongarch/arch_timer.c @@ -0,0 +1,200 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * The test validates periodic/one-shot constant timer IRQ using + * CSR.TCFG and CSR.TVAL registers. 
+ */ +#include "arch_timer.h" +#include "kvm_util.h" +#include "processor.h" +#include "timer_test.h" +#include "ucall_common.h" + +static void do_idle(void) +{ + unsigned int intid; + unsigned long estat; + + __asm__ __volatile__("idle 0" : : : "memory"); + + estat = csr_read(LOONGARCH_CSR_ESTAT); + intid = !!(estat & BIT(INT_TI)); + + /* Make sure pending timer IRQ arrived */ + GUEST_ASSERT_EQ(intid, 1); + csr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); +} + +static void guest_irq_handler(struct ex_regs *regs) +{ + unsigned int intid; + uint32_t cpu = guest_get_vcpuid(); + uint64_t xcnt, val, cfg, xcnt_diff_us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + intid = !!(regs->estat & BIT(INT_TI)); + + /* Make sure we are dealing with the correct timer IRQ */ + GUEST_ASSERT_EQ(intid, 1); + + cfg = timer_get_cfg(); + if (cfg & CSR_TCFG_PERIOD) { + WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter - 1); + if (shared_data->nr_iter == 0) + disable_timer(); + csr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); + return; + } + + /* + * On real machine, value of LOONGARCH_CSR_TVAL is BIT_ULL(48) - 1 + * On virtual machine, its value counts down from BIT_ULL(48) - 1 + */ + val = timer_get_val(); + xcnt = timer_get_cycles(); + xcnt_diff_us = cycles_to_usec(xcnt - shared_data->xcnt); + + /* Basic 'timer condition met' check */ + __GUEST_ASSERT(val > cfg, + "val = 0x%lx, cfg = 0x%lx, xcnt_diff_us = 0x%lx", + val, cfg, xcnt_diff_us); + + csr_write(CSR_TINTCLR_TI, LOONGARCH_CSR_TINTCLR); + WRITE_ONCE(shared_data->nr_iter, shared_data->nr_iter + 1); +} + +static void guest_test_period_timer(uint32_t cpu) +{ + uint32_t irq_iter, config_iter; + uint64_t us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + shared_data->nr_iter = test_args.nr_iter; + shared_data->xcnt = timer_get_cycles(); + us = msecs_to_usecs(test_args.timer_period_ms) + test_args.timer_err_margin_us; + timer_set_next_cmp_ms(test_args.timer_period_ms, true); + + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + /* Setup a timeout for the interrupt to arrive */ + udelay(us); + } + + irq_iter = READ_ONCE(shared_data->nr_iter); + __GUEST_ASSERT(irq_iter == 0, + "irq_iter = 0x%x.\n" + " Guest period timer interrupt was not triggered within the specified\n" + " interval, try to increase the error margin by [-e] option.\n", + irq_iter); +} + +static void guest_test_oneshot_timer(uint32_t cpu) +{ + uint32_t irq_iter, config_iter; + uint64_t us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + shared_data->nr_iter = 0; + shared_data->guest_stage = 0; + us = msecs_to_usecs(test_args.timer_period_ms) + test_args.timer_err_margin_us; + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + shared_data->xcnt = timer_get_cycles(); + + /* Setup the next interrupt */ + timer_set_next_cmp_ms(test_args.timer_period_ms, false); + /* Setup a timeout for the interrupt to arrive */ + udelay(us); + + irq_iter = READ_ONCE(shared_data->nr_iter); + __GUEST_ASSERT(config_iter + 1 == irq_iter, + "config_iter + 1 = 0x%x, irq_iter = 0x%x.\n" + " Guest timer interrupt was not triggered within the specified\n" + " interval, try to increase the error margin by [-e] option.\n", + config_iter + 1, irq_iter); + } +} + +static void guest_test_emulate_timer(uint32_t cpu) +{ + uint32_t config_iter; + uint64_t xcnt_diff_us, us; + struct test_vcpu_shared_data *shared_data = &vcpu_shared_data[cpu]; + + local_irq_disable(); + shared_data->nr_iter = 0; + us = 
msecs_to_usecs(test_args.timer_period_ms); + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + shared_data->xcnt = timer_get_cycles(); + + /* Setup the next interrupt */ + timer_set_next_cmp_ms(test_args.timer_period_ms, false); + do_idle(); + + xcnt_diff_us = cycles_to_usec(timer_get_cycles() - shared_data->xcnt); + __GUEST_ASSERT(xcnt_diff_us >= us, + "xcnt_diff_us = 0x%lx, us = 0x%lx.\n", + xcnt_diff_us, us); + } + local_irq_enable(); +} + +static void guest_time_count_test(uint32_t cpu) +{ + uint32_t config_iter; + unsigned long start, end, prev, us; + + /* Assuming that test case starts to run in 1 second */ + start = timer_get_cycles(); + us = msec_to_cycles(1000); + __GUEST_ASSERT(start <= us, + "start = 0x%lx, us = 0x%lx.\n", + start, us); + + us = msec_to_cycles(test_args.timer_period_ms); + for (config_iter = 0; config_iter < test_args.nr_iter; config_iter++) { + start = timer_get_cycles(); + end = start + us; + /* test time count growing up always */ + while (start < end) { + prev = start; + start = timer_get_cycles(); + __GUEST_ASSERT(prev <= start, + "prev = 0x%lx, start = 0x%lx.\n", + prev, start); + } + } +} + +static void guest_code(void) +{ + uint32_t cpu = guest_get_vcpuid(); + + /* must run at first */ + guest_time_count_test(cpu); + + timer_irq_enable(); + local_irq_enable(); + guest_test_period_timer(cpu); + guest_test_oneshot_timer(cpu); + guest_test_emulate_timer(cpu); + + GUEST_DONE(); +} + +struct kvm_vm *test_vm_create(void) +{ + struct kvm_vm *vm; + int nr_vcpus = test_args.nr_vcpus; + + vm = vm_create_with_vcpus(nr_vcpus, guest_code, vcpus); + vm_init_descriptor_tables(vm); + vm_install_exception_handler(vm, EXCCODE_INT, guest_irq_handler); + + /* Make all the test's cmdline args visible to the guest */ + sync_global_to_guest(vm, test_args); + + return vm; +} + +void test_vm_cleanup(struct kvm_vm *vm) +{ + kvm_vm_free(vm); +} diff --git a/tools/testing/selftests/kvm/mmu_stress_test.c b/tools/testing/selftests/kvm/mmu_stress_test.c index 37b7e6524533..51c070556f3e 100644 --- a/tools/testing/selftests/kvm/mmu_stress_test.c +++ b/tools/testing/selftests/kvm/mmu_stress_test.c @@ -263,8 +263,10 @@ static void calc_default_nr_vcpus(void) TEST_ASSERT(!r, "sched_getaffinity failed, errno = %d (%s)", errno, strerror(errno)); - nr_vcpus = CPU_COUNT(&possible_mask) * 3/4; + nr_vcpus = CPU_COUNT(&possible_mask); TEST_ASSERT(nr_vcpus > 0, "Uh, no CPUs?"); + if (nr_vcpus >= 2) + nr_vcpus = nr_vcpus * 3/4; } int main(int argc, char *argv[]) @@ -360,11 +362,9 @@ int main(int argc, char *argv[]) #ifdef __x86_64__ /* Identity map memory in the guest using 1gb pages. 
*/ - for (i = 0; i < slot_size; i += SZ_1G) - __virt_pg_map(vm, gpa + i, gpa + i, PG_LEVEL_1G); + virt_map_level(vm, gpa, gpa, slot_size, PG_LEVEL_1G); #else - for (i = 0; i < slot_size; i += vm->page_size) - virt_pg_map(vm, gpa + i, gpa + i); + virt_map(vm, gpa, gpa, slot_size >> vm->page_shift); #endif } diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c index f04768c1d2e4..93e603d91311 100644 --- a/tools/testing/selftests/kvm/pre_fault_memory_test.c +++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c @@ -17,13 +17,13 @@ #define TEST_NPAGES (TEST_SIZE / PAGE_SIZE) #define TEST_SLOT 10 -static void guest_code(uint64_t base_gpa) +static void guest_code(uint64_t base_gva) { volatile uint64_t val __used; int i; for (i = 0; i < TEST_NPAGES; i++) { - uint64_t *src = (uint64_t *)(base_gpa + i * PAGE_SIZE); + uint64_t *src = (uint64_t *)(base_gva + i * PAGE_SIZE); val = *src; } @@ -161,6 +161,7 @@ static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset, static void __test_pre_fault_memory(unsigned long vm_type, bool private) { + uint64_t gpa, gva, alignment, guest_page_size; const struct vm_shape shape = { .mode = VM_MODE_DEFAULT, .type = vm_type, @@ -170,35 +171,30 @@ static void __test_pre_fault_memory(unsigned long vm_type, bool private) struct kvm_vm *vm; struct ucall uc; - uint64_t guest_test_phys_mem; - uint64_t guest_test_virt_mem; - uint64_t alignment, guest_page_size; - vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code); alignment = guest_page_size = vm_guest_mode_params[VM_MODE_DEFAULT].page_size; - guest_test_phys_mem = (vm->max_gfn - TEST_NPAGES) * guest_page_size; + gpa = (vm->max_gfn - TEST_NPAGES) * guest_page_size; #ifdef __s390x__ alignment = max(0x100000UL, guest_page_size); #else alignment = SZ_2M; #endif - guest_test_phys_mem = align_down(guest_test_phys_mem, alignment); - guest_test_virt_mem = guest_test_phys_mem & ((1ULL << (vm->va_bits - 1)) - 1); + gpa = align_down(gpa, alignment); + gva = gpa & ((1ULL << (vm->va_bits - 1)) - 1); - vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, - guest_test_phys_mem, TEST_SLOT, TEST_NPAGES, - private ? KVM_MEM_GUEST_MEMFD : 0); - virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, TEST_NPAGES); + vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, gpa, TEST_SLOT, + TEST_NPAGES, private ? 
KVM_MEM_GUEST_MEMFD : 0); + virt_map(vm, gva, gpa, TEST_NPAGES); if (private) - vm_mem_set_private(vm, guest_test_phys_mem, TEST_SIZE); + vm_mem_set_private(vm, gpa, TEST_SIZE); - pre_fault_memory(vcpu, guest_test_phys_mem, 0, SZ_2M, 0, private); - pre_fault_memory(vcpu, guest_test_phys_mem, SZ_2M, PAGE_SIZE * 2, PAGE_SIZE, private); - pre_fault_memory(vcpu, guest_test_phys_mem, TEST_SIZE, PAGE_SIZE, PAGE_SIZE, private); + pre_fault_memory(vcpu, gpa, 0, SZ_2M, 0, private); + pre_fault_memory(vcpu, gpa, SZ_2M, PAGE_SIZE * 2, PAGE_SIZE, private); + pre_fault_memory(vcpu, gpa, TEST_SIZE, PAGE_SIZE, PAGE_SIZE, private); - vcpu_args_set(vcpu, 1, guest_test_virt_mem); + vcpu_args_set(vcpu, 1, gva); vcpu_run(vcpu); run = vcpu->run; diff --git a/tools/testing/selftests/kvm/riscv/get-reg-list.c b/tools/testing/selftests/kvm/riscv/get-reg-list.c index 705ab3d7778b..cb54a56990a0 100644 --- a/tools/testing/selftests/kvm/riscv/get-reg-list.c +++ b/tools/testing/selftests/kvm/riscv/get-reg-list.c @@ -133,6 +133,7 @@ bool filter_reg(__u64 reg) case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_SUSP: case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_STA: case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_FWFT: + case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_MPXY: case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_EXPERIMENTAL: case KVM_REG_RISCV_SBI_EXT | KVM_REG_RISCV_SBI_SINGLE | KVM_RISCV_SBI_EXT_VENDOR: return true; @@ -639,6 +640,7 @@ static const char *sbi_ext_single_id_to_str(__u64 reg_off) KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_SUSP), KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_STA), KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_FWFT), + KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_MPXY), KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_EXPERIMENTAL), KVM_SBI_EXT_ARR(KVM_RISCV_SBI_EXT_VENDOR), }; @@ -1142,6 +1144,7 @@ KVM_SBI_EXT_SUBLIST_CONFIG(sta, STA); KVM_SBI_EXT_SIMPLE_CONFIG(pmu, PMU); KVM_SBI_EXT_SIMPLE_CONFIG(dbcn, DBCN); KVM_SBI_EXT_SIMPLE_CONFIG(susp, SUSP); +KVM_SBI_EXT_SIMPLE_CONFIG(mpxy, MPXY); KVM_SBI_EXT_SUBLIST_CONFIG(fwft, FWFT); KVM_ISA_EXT_SUBLIST_CONFIG(aia, AIA); @@ -1222,6 +1225,7 @@ struct vcpu_reg_list *vcpu_configs[] = { &config_sbi_pmu, &config_sbi_dbcn, &config_sbi_susp, + &config_sbi_mpxy, &config_sbi_fwft, &config_aia, &config_fp_f, diff --git a/tools/testing/selftests/kvm/s390/user_operexec.c b/tools/testing/selftests/kvm/s390/user_operexec.c new file mode 100644 index 000000000000..714906c1d12a --- /dev/null +++ b/tools/testing/selftests/kvm/s390/user_operexec.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Test operation exception forwarding. + * + * Copyright IBM Corp. 
2025 + * + * Authors: + * Janosch Frank <frankja@linux.ibm.com> + */ +#include "kselftest.h" +#include "kvm_util.h" +#include "test_util.h" +#include "sie.h" + +#include <linux/kvm.h> + +static void guest_code_instr0(void) +{ + asm(".word 0x0000"); +} + +static void test_user_instr0(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_instr0); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0); + + kvm_vm_free(vm); +} + +static void guest_code_user_operexec(void) +{ + asm(".word 0x0807"); +} + +static void test_user_operexec(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); + + /* + * Since user_operexec is the superset it can be used for the + * 0 instruction. + */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code_instr0); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0); + + kvm_vm_free(vm); +} + +/* combine user_instr0 and user_operexec */ +static void test_user_operexec_combined(void) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + int rc; + + vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); + + /* Reverse enablement order */ + vm = vm_create_with_one_vcpu(&vcpu, guest_code_user_operexec); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_OPEREXEC, 0); + TEST_ASSERT_EQ(0, rc); + rc = __vm_enable_cap(vm, KVM_CAP_S390_USER_INSTR0, 0); + TEST_ASSERT_EQ(0, rc); + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_S390_SIEIC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.icptcode, ICPT_OPEREXC); + TEST_ASSERT_EQ(vcpu->run->s390_sieic.ipa, 0x0807); + + kvm_vm_free(vm); +} + +/* + * Run all tests above. + * + * Enablement after VCPU has been added is automatically tested since + * we enable the capability after VCPU creation. 
+ */ +static struct testdef { + const char *name; + void (*test)(void); +} testlist[] = { + { "instr0", test_user_instr0 }, + { "operexec", test_user_operexec }, + { "operexec_combined", test_user_operexec_combined}, +}; + +int main(int argc, char *argv[]) +{ + int idx; + + TEST_REQUIRE(kvm_has_cap(KVM_CAP_S390_USER_INSTR0)); + + ksft_print_header(); + ksft_set_plan(ARRAY_SIZE(testlist)); + for (idx = 0; idx < ARRAY_SIZE(testlist); idx++) { + testlist[idx].test(); + ksft_test_result_pass("%s\n", testlist[idx].name); + } + ksft_finished(); +} diff --git a/tools/testing/selftests/kvm/x86/hyperv_features.c b/tools/testing/selftests/kvm/x86/hyperv_features.c index 99d327084172..130b9ce7e5dd 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_features.c +++ b/tools/testing/selftests/kvm/x86/hyperv_features.c @@ -94,7 +94,7 @@ static void guest_hcall(vm_vaddr_t pgs_gpa, struct hcall_data *hcall) if (!(hcall->control & HV_HYPERCALL_FAST_BIT)) { input = pgs_gpa; - output = pgs_gpa + 4096; + output = pgs_gpa + PAGE_SIZE; } else { input = output = 0; } diff --git a/tools/testing/selftests/kvm/x86/hyperv_ipi.c b/tools/testing/selftests/kvm/x86/hyperv_ipi.c index 2b5b4bc6ef7e..ca61836c4e32 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_ipi.c +++ b/tools/testing/selftests/kvm/x86/hyperv_ipi.c @@ -102,7 +102,7 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) /* 'Slow' HvCallSendSyntheticClusterIpi to RECEIVER_VCPU_ID_1 */ ipi->vector = IPI_VECTOR; ipi->cpu_mask = 1 << RECEIVER_VCPU_ID_1; - hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + 4096); + hyperv_hypercall(HVCALL_SEND_IPI, pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); @@ -116,13 +116,13 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_1 */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; ipi_ex->vp_set.valid_bank_mask = 1 << 0; ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); + pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ipis_expected[1]); @@ -138,13 +138,13 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to RECEIVER_VCPU_ID_2 */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; ipi_ex->vp_set.valid_bank_mask = 1 << 1; ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_2 - 64); hyperv_hypercall(HVCALL_SEND_IPI_EX | (1 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); + pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); @@ -160,14 +160,14 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to both RECEIVER_VCPU_ID_{1,2} */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_SPARSE_4K; 
ipi_ex->vp_set.valid_bank_mask = 1 << 1 | 1; ipi_ex->vp_set.bank_contents[0] = BIT(RECEIVER_VCPU_ID_1); ipi_ex->vp_set.bank_contents[1] = BIT(RECEIVER_VCPU_ID_2 - 64); hyperv_hypercall(HVCALL_SEND_IPI_EX | (2 << HV_HYPERCALL_VARHEAD_OFFSET), - pgs_gpa, pgs_gpa + 4096); + pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); @@ -183,10 +183,10 @@ static void sender_guest_code(void *hcall_page, vm_vaddr_t pgs_gpa) GUEST_SYNC(stage++); /* 'Slow' HvCallSendSyntheticClusterIpiEx to HV_GENERIC_SET_ALL */ - memset(hcall_page, 0, 4096); + memset(hcall_page, 0, PAGE_SIZE); ipi_ex->vector = IPI_VECTOR; ipi_ex->vp_set.format = HV_GENERIC_SET_ALL; - hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + 4096); + hyperv_hypercall(HVCALL_SEND_IPI_EX, pgs_gpa, pgs_gpa + PAGE_SIZE); nop_loop(); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_1] == ++ipis_expected[0]); GUEST_ASSERT(ipis_rcvd[RECEIVER_VCPU_ID_2] == ++ipis_expected[1]); diff --git a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c index 077cd0ec3040..a3b7ce155981 100644 --- a/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c +++ b/tools/testing/selftests/kvm/x86/hyperv_tlb_flush.c @@ -621,7 +621,7 @@ int main(int argc, char *argv[]) for (i = 0; i < NTEST_PAGES; i++) { pte = vm_get_page_table_entry(vm, data->test_pages + i * PAGE_SIZE); gpa = addr_hva2gpa(vm, pte); - __virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK, PG_LEVEL_4K); + virt_pg_map(vm, gva + PAGE_SIZE * i, gpa & PAGE_MASK); data->test_pages_pte[i] = gva + (gpa & ~PAGE_MASK); } diff --git a/tools/testing/selftests/kvm/x86/vmx_close_while_nested_test.c b/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c index dad988351493..f001cb836bfa 100644 --- a/tools/testing/selftests/kvm/x86/vmx_close_while_nested_test.c +++ b/tools/testing/selftests/kvm/x86/nested_close_kvm_test.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vmx_close_while_nested - * * Copyright (C) 2019, Red Hat, Inc. * * Verify that nothing bad happens if a KVM user exits with open @@ -12,6 +10,7 @@ #include "kvm_util.h" #include "processor.h" #include "vmx.h" +#include "svm_util.h" #include <string.h> #include <sys/ioctl.h> @@ -22,6 +21,8 @@ enum { PORT_L0_EXIT = 0x2000, }; +#define L2_GUEST_STACK_SIZE 64 + static void l2_guest_code(void) { /* Exit to L0 */ @@ -29,9 +30,8 @@ static void l2_guest_code(void) : : [port] "d" (PORT_L0_EXIT) : "rax"); } -static void l1_guest_code(struct vmx_pages *vmx_pages) +static void l1_vmx_code(struct vmx_pages *vmx_pages) { -#define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); @@ -45,19 +45,43 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_ASSERT(0); } +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + /* Prepare the VMCB for L2 execution. 
*/ + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(0); +} + +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_gva; + vm_vaddr_t guest_gva; struct kvm_vcpu *vcpu; struct kvm_vm *vm; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - /* Allocate VMX pages and shared descriptors (vmx_pages). */ - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &guest_gva); + else + vcpu_alloc_svm(vm, &guest_gva); + + vcpu_args_set(vcpu, 1, guest_gva); for (;;) { volatile struct kvm_run *run = vcpu->run; diff --git a/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c new file mode 100644 index 000000000000..a6b6da9cf7fe --- /dev/null +++ b/tools/testing/selftests/kvm/x86/nested_invalid_cr3_test.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025, Google LLC. + * + * This test verifies that L1 fails to enter L2 with an invalid CR3, and + * succeeds otherwise. + */ +#include "kvm_util.h" +#include "vmx.h" +#include "svm_util.h" +#include "kselftest.h" + + +#define L2_GUEST_STACK_SIZE 64 + +static void l2_guest_code(void) +{ + vmcall(); +} + +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uintptr_t save_cr3; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* Try to run L2 with invalid CR3 and make sure it fails */ + save_cr3 = svm->vmcb->save.cr3; + svm->vmcb->save.cr3 = -1ull; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_ERR); + + /* Now restore CR3 and make sure L2 runs successfully */ + svm->vmcb->save.cr3 = save_cr3; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + + GUEST_DONE(); +} + +static void l1_vmx_code(struct vmx_pages *vmx_pages) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + uintptr_t save_cr3; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* Try to run L2 with invalid CR3 and make sure it fails */ + save_cr3 = vmreadz(GUEST_CR3); + vmwrite(GUEST_CR3, -1ull); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == + (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE)); + + /* Now restore CR3 and make sure L2 runs successfully */ + vmwrite(GUEST_CR3, save_cr3); + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + + GUEST_DONE(); +} + +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + +int main(int argc, char *argv[]) +{ + struct kvm_vcpu *vcpu; + struct kvm_vm *vm; + vm_vaddr_t guest_gva = 0; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); + + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &guest_gva); + else + vcpu_alloc_svm(vm, &guest_gva); + + vcpu_args_set(vcpu, 1, 
guest_gva); + + for (;;) { + struct ucall uc; + + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c b/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c index 2ceb5c78c442..2839f650e5c9 100644 --- a/tools/testing/selftests/kvm/x86/vmx_tsc_adjust_test.c +++ b/tools/testing/selftests/kvm/x86/nested_tsc_adjust_test.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * vmx_tsc_adjust_test - * * Copyright (C) 2018, Google LLC. * * IA32_TSC_ADJUST test @@ -22,6 +20,7 @@ #include "kvm_util.h" #include "processor.h" #include "vmx.h" +#include "svm_util.h" #include <string.h> #include <sys/ioctl.h> @@ -35,6 +34,8 @@ #define TSC_ADJUST_VALUE (1ll << 32) #define TSC_OFFSET_VALUE -(1ll << 48) +#define L2_GUEST_STACK_SIZE 64 + enum { PORT_ABORT = 0x1000, PORT_REPORT, @@ -72,42 +73,47 @@ static void l2_guest_code(void) __asm__ __volatile__("vmcall"); } -static void l1_guest_code(struct vmx_pages *vmx_pages) +static void l1_guest_code(void *data) { -#define L2_GUEST_STACK_SIZE 64 unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; - uint32_t control; - uintptr_t save_cr3; + /* Set TSC from L1 and make sure TSC_ADJUST is updated correctly */ GUEST_ASSERT(rdtsc() < TSC_ADJUST_VALUE); wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE); check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); - GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); - GUEST_ASSERT(load_vmcs(vmx_pages)); - - /* Prepare the VMCS for L2 execution. */ - prepare_vmcs(vmx_pages, l2_guest_code, - &l2_guest_stack[L2_GUEST_STACK_SIZE]); - control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); - control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; - vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); - vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); - - /* Jump into L2. First, test failure to load guest CR3. */ - save_cr3 = vmreadz(GUEST_CR3); - vmwrite(GUEST_CR3, -1ull); - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == - (EXIT_REASON_FAILED_VMENTRY | EXIT_REASON_INVALID_STATE)); - check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE); - vmwrite(GUEST_CR3, save_cr3); - - GUEST_ASSERT(!vmlaunch()); - GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + /* + * Run L2 with TSC_OFFSET. L2 will write to TSC, and L1 is not + * intercepting the write so it should update L1's TSC_ADJUST. 
+ */ + if (this_cpu_has(X86_FEATURE_VMX)) { + struct vmx_pages *vmx_pages = data; + uint32_t control; + + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + control = vmreadz(CPU_BASED_VM_EXEC_CONTROL); + control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETTING; + vmwrite(CPU_BASED_VM_EXEC_CONTROL, control); + vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE); + + GUEST_ASSERT(!vmlaunch()); + GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL); + } else { + struct svm_test_data *svm = data; + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + svm->vmcb->control.tsc_offset = TSC_OFFSET_VALUE; + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + } check_ia32_tsc_adjust(-2 * TSC_ADJUST_VALUE); - GUEST_DONE(); } @@ -119,16 +125,19 @@ static void report(int64_t val) int main(int argc, char *argv[]) { - vm_vaddr_t vmx_pages_gva; + vm_vaddr_t nested_gva; struct kvm_vcpu *vcpu; - TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); - vm = vm_create_with_one_vcpu(&vcpu, (void *) l1_guest_code); + vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &nested_gva); + else + vcpu_alloc_svm(vm, &nested_gva); - /* Allocate VMX pages and shared descriptors (vmx_pages). */ - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + vcpu_args_set(vcpu, 1, nested_gva); for (;;) { struct ucall uc; diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_tsc_scaling_test.c b/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c index 1759fa5cb3f2..4260c9e4f489 100644 --- a/tools/testing/selftests/kvm/x86/vmx_nested_tsc_scaling_test.c +++ b/tools/testing/selftests/kvm/x86/nested_tsc_scaling_test.c @@ -13,6 +13,7 @@ #include "kvm_util.h" #include "vmx.h" +#include "svm_util.h" #include "kselftest.h" /* L2 is scaled up (from L1's perspective) by this factor */ @@ -79,7 +80,30 @@ static void l2_guest_code(void) __asm__ __volatile__("vmcall"); } -static void l1_guest_code(struct vmx_pages *vmx_pages) +static void l1_svm_code(struct svm_test_data *svm) +{ + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + + /* check that L1's frequency looks alright before launching L2 */ + check_tsc_freq(UCHECK_L1); + + generic_svm_setup(svm, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* enable TSC scaling for L2 */ + wrmsr(MSR_AMD64_TSC_RATIO, L2_SCALE_FACTOR << 32); + + /* launch L2 */ + run_guest(svm->vmcb, svm->vmcb_gpa); + GUEST_ASSERT(svm->vmcb->control.exit_code == SVM_EXIT_VMMCALL); + + /* check that L1's frequency still looks good */ + check_tsc_freq(UCHECK_L1); + + GUEST_DONE(); +} + +static void l1_vmx_code(struct vmx_pages *vmx_pages) { unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; uint32_t control; @@ -116,11 +140,19 @@ static void l1_guest_code(struct vmx_pages *vmx_pages) GUEST_DONE(); } +static void l1_guest_code(void *data) +{ + if (this_cpu_has(X86_FEATURE_VMX)) + l1_vmx_code(data); + else + l1_svm_code(data); +} + int main(int argc, char *argv[]) { struct kvm_vcpu *vcpu; struct kvm_vm *vm; - vm_vaddr_t vmx_pages_gva; + vm_vaddr_t guest_gva = 0; uint64_t tsc_start, tsc_end; uint64_t tsc_khz; @@ -129,7 +161,8 @@ int main(int argc, char *argv[]) uint64_t l1_tsc_freq = 0; uint64_t l2_tsc_freq = 0; - 
TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX) || + kvm_cpu_has(X86_FEATURE_SVM)); TEST_REQUIRE(kvm_has_cap(KVM_CAP_TSC_CONTROL)); TEST_REQUIRE(sys_clocksource_is_based_on_tsc()); @@ -152,8 +185,13 @@ int main(int argc, char *argv[]) printf("real TSC frequency is around: %"PRIu64"\n", l0_tsc_freq); vm = vm_create_with_one_vcpu(&vcpu, l1_guest_code); - vcpu_alloc_vmx(vm, &vmx_pages_gva); - vcpu_args_set(vcpu, 1, vmx_pages_gva); + + if (kvm_cpu_has(X86_FEATURE_VMX)) + vcpu_alloc_vmx(vm, &guest_gva); + else + vcpu_alloc_svm(vm, &guest_gva); + + vcpu_args_set(vcpu, 1, guest_gva); tsc_khz = __vcpu_ioctl(vcpu, KVM_GET_TSC_KHZ, NULL); TEST_ASSERT(tsc_khz != -1, "vcpu ioctl KVM_GET_TSC_KHZ failed"); diff --git a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c index 82a8d88b5338..1969f4ab9b28 100644 --- a/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c +++ b/tools/testing/selftests/kvm/x86/private_mem_conversions_test.c @@ -380,7 +380,7 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; pthread_t threads[KVM_MAX_VCPUS]; struct kvm_vm *vm; - int memfd, i, r; + int memfd, i; const struct vm_shape shape = { .mode = VM_MODE_DEFAULT, @@ -428,11 +428,8 @@ static void test_mem_conversions(enum vm_mem_backing_src_type src_type, uint32_t * should prevent the VM from being fully destroyed until the last * reference to the guest_memfd is also put. */ - r = fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); - - r = fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); - TEST_ASSERT(!r, __KVM_SYSCALL_ERROR("fallocate()", r)); + kvm_fallocate(memfd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, 0, memfd_size); + kvm_fallocate(memfd, FALLOC_FL_KEEP_SIZE, 0, memfd_size); close(memfd); } diff --git a/tools/testing/selftests/kvm/x86/sev_smoke_test.c b/tools/testing/selftests/kvm/x86/sev_smoke_test.c index 77256c89bb8d..86ad1c7d068f 100644 --- a/tools/testing/selftests/kvm/x86/sev_smoke_test.c +++ b/tools/testing/selftests/kvm/x86/sev_smoke_test.c @@ -104,7 +104,7 @@ static void test_sync_vmsa(uint32_t type, uint64_t policy) vm_sev_launch(vm, policy, NULL); /* This page is shared, so make it decrypted. 
*/ - memset(hva, 0, 4096); + memset(hva, 0, PAGE_SIZE); vcpu_run(vcpu); diff --git a/tools/testing/selftests/kvm/x86/state_test.c b/tools/testing/selftests/kvm/x86/state_test.c index 141b7fc0c965..f2c7a1c297e3 100644 --- a/tools/testing/selftests/kvm/x86/state_test.c +++ b/tools/testing/selftests/kvm/x86/state_test.c @@ -141,7 +141,7 @@ static void __attribute__((__flatten__)) guest_code(void *arg) if (this_cpu_has(X86_FEATURE_XSAVE)) { uint64_t supported_xcr0 = this_cpu_supported_xcr0(); - uint8_t buffer[4096]; + uint8_t buffer[PAGE_SIZE]; memset(buffer, 0xcc, sizeof(buffer)); diff --git a/tools/testing/selftests/kvm/x86/userspace_io_test.c b/tools/testing/selftests/kvm/x86/userspace_io_test.c index 9481cbcf284f..be7d72f3c029 100644 --- a/tools/testing/selftests/kvm/x86/userspace_io_test.c +++ b/tools/testing/selftests/kvm/x86/userspace_io_test.c @@ -85,7 +85,7 @@ int main(int argc, char *argv[]) regs.rcx = 1; if (regs.rcx == 3) regs.rcx = 8192; - memset((void *)run + run->io.data_offset, 0xaa, 4096); + memset((void *)run + run->io.data_offset, 0xaa, PAGE_SIZE); vcpu_regs_set(vcpu, ®s); } diff --git a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c index fa512d033205..98cb6bdab3e6 100644 --- a/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c +++ b/tools/testing/selftests/kvm/x86/vmx_dirty_log_test.c @@ -120,17 +120,17 @@ static void test_vmx_dirty_log(bool enable_ept) * GPAs as the EPT enabled case. */ if (enable_ept) { - prepare_eptp(vmx, vm, 0); + prepare_eptp(vmx, vm); nested_map_memslot(vmx, vm, 0); - nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, 4096); - nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, 4096); + nested_map(vmx, vm, NESTED_TEST_MEM1, GUEST_TEST_MEM, PAGE_SIZE); + nested_map(vmx, vm, NESTED_TEST_MEM2, GUEST_TEST_MEM, PAGE_SIZE); } bmap = bitmap_zalloc(TEST_MEM_PAGES); host_test_mem = addr_gpa2hva(vm, GUEST_TEST_MEM); while (!done) { - memset(host_test_mem, 0xaa, TEST_MEM_PAGES * 4096); + memset(host_test_mem, 0xaa, TEST_MEM_PAGES * PAGE_SIZE); vcpu_run(vcpu); TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); @@ -153,9 +153,9 @@ static void test_vmx_dirty_log(bool enable_ept) } TEST_ASSERT(!test_bit(1, bmap), "Page 1 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[4096 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); + TEST_ASSERT(host_test_mem[PAGE_SIZE / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 1 written by guest"); TEST_ASSERT(!test_bit(2, bmap), "Page 2 incorrectly reported dirty"); - TEST_ASSERT(host_test_mem[8192 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); + TEST_ASSERT(host_test_mem[PAGE_SIZE*2 / 8] == 0xaaaaaaaaaaaaaaaaULL, "Page 2 written by guest"); break; case UCALL_DONE: done = true; diff --git a/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c new file mode 100644 index 000000000000..cf1d2d1f2a8f --- /dev/null +++ b/tools/testing/selftests/kvm/x86/vmx_nested_la57_state_test.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2025, Google LLC. + * + * Test KVM's ability to save and restore nested state when the L1 guest + * is using 5-level paging and the L2 guest is using 4-level paging. + * + * This test would have failed prior to commit 9245fd6b8531 ("KVM: x86: + * model canonical checks more precisely"). 
+ */ +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "vmx.h" + +#define LA57_GS_BASE 0xff2bc0311fb00000ull + +static void l2_guest_code(void) +{ + /* + * Sync with L0 to trigger save/restore. After + * resuming, execute VMCALL to exit back to L1. + */ + GUEST_SYNC(1); + vmcall(); +} + +static void l1_guest_code(struct vmx_pages *vmx_pages) +{ +#define L2_GUEST_STACK_SIZE 64 + unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE]; + u64 guest_cr4; + vm_paddr_t pml5_pa, pml4_pa; + u64 *pml5; + u64 exit_reason; + + /* Set GS_BASE to a value that is only canonical with LA57. */ + wrmsr(MSR_GS_BASE, LA57_GS_BASE); + GUEST_ASSERT(rdmsr(MSR_GS_BASE) == LA57_GS_BASE); + + GUEST_ASSERT(vmx_pages->vmcs_gpa); + GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages)); + GUEST_ASSERT(load_vmcs(vmx_pages)); + + prepare_vmcs(vmx_pages, l2_guest_code, + &l2_guest_stack[L2_GUEST_STACK_SIZE]); + + /* + * Set up L2 with a 4-level page table by pointing its CR3 to + * L1's first PML4 table and clearing CR4.LA57. This creates + * the CR4.LA57 mismatch that exercises the bug. + */ + pml5_pa = get_cr3() & PHYSICAL_PAGE_MASK; + pml5 = (u64 *)pml5_pa; + pml4_pa = pml5[0] & PHYSICAL_PAGE_MASK; + vmwrite(GUEST_CR3, pml4_pa); + + guest_cr4 = vmreadz(GUEST_CR4); + guest_cr4 &= ~X86_CR4_LA57; + vmwrite(GUEST_CR4, guest_cr4); + + GUEST_ASSERT(!vmlaunch()); + + exit_reason = vmreadz(VM_EXIT_REASON); + GUEST_ASSERT(exit_reason == EXIT_REASON_VMCALL); +} + +void guest_code(struct vmx_pages *vmx_pages) +{ + l1_guest_code(vmx_pages); + GUEST_DONE(); +} + +int main(int argc, char *argv[]) +{ + vm_vaddr_t vmx_pages_gva = 0; + struct kvm_vm *vm; + struct kvm_vcpu *vcpu; + struct kvm_x86_state *state; + struct ucall uc; + int stage; + + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_VMX)); + TEST_REQUIRE(kvm_cpu_has(X86_FEATURE_LA57)); + TEST_REQUIRE(kvm_has_cap(KVM_CAP_NESTED_STATE)); + + vm = vm_create_with_one_vcpu(&vcpu, guest_code); + + /* + * L1 needs to read its own PML5 table to set up L2. Identity map + * the PML5 table to facilitate this. + */ + virt_map(vm, vm->pgd, vm->pgd, 1); + + vcpu_alloc_vmx(vm, &vmx_pages_gva); + vcpu_args_set(vcpu, 1, vmx_pages_gva); + + for (stage = 1;; stage++) { + vcpu_run(vcpu); + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO); + + switch (get_ucall(vcpu, &uc)) { + case UCALL_ABORT: + REPORT_GUEST_ASSERT(uc); + /* NOT REACHED */ + case UCALL_SYNC: + break; + case UCALL_DONE: + goto done; + default: + TEST_FAIL("Unknown ucall %lu", uc.cmd); + } + + TEST_ASSERT(uc.args[1] == stage, + "Expected stage %d, got stage %lu", stage, (ulong)uc.args[1]); + if (stage == 1) { + pr_info("L2 is active; performing save/restore.\n"); + state = vcpu_save_state(vcpu); + + kvm_vm_release(vm); + + /* Restore state in a new VM. 
*/ + vcpu = vm_recreate_with_one_vcpu(vm); + vcpu_load_state(vcpu, state); + kvm_x86_state_cleanup(state); + } + } + +done: + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/x86/xapic_ipi_test.c b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c index 35cb9de54a82..ae4a4b6c05ca 100644 --- a/tools/testing/selftests/kvm/x86/xapic_ipi_test.c +++ b/tools/testing/selftests/kvm/x86/xapic_ipi_test.c @@ -256,7 +256,7 @@ void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, int nodes = 0; time_t start_time, last_update, now; time_t interval_secs = 1; - int i, r; + int i; int from, to; unsigned long bit; uint64_t hlt_count; @@ -267,9 +267,8 @@ void do_migrations(struct test_data_page *data, int run_secs, int delay_usecs, delay_usecs); /* Get set of first 64 numa nodes available */ - r = get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, + kvm_get_mempolicy(NULL, &nodemask, sizeof(nodemask) * 8, 0, MPOL_F_MEMS_ALLOWED); - TEST_ASSERT(r == 0, "get_mempolicy failed errno=%d", errno); fprintf(stderr, "Numa nodes found amongst first %lu possible nodes " "(each 1-bit indicates node is present): %#lx\n", diff --git a/tools/testing/selftests/mm/guard-regions.c b/tools/testing/selftests/mm/guard-regions.c index 8dd81c0a4a5a..795bf3f39f44 100644 --- a/tools/testing/selftests/mm/guard-regions.c +++ b/tools/testing/selftests/mm/guard-regions.c @@ -94,6 +94,7 @@ static void *mmap_(FIXTURE_DATA(guard_regions) * self, case ANON_BACKED: flags |= MAP_PRIVATE | MAP_ANON; fd = -1; + offset = 0; break; case SHMEM_BACKED: case LOCAL_FILE_BACKED: @@ -260,6 +261,54 @@ static bool is_buf_eq(char *buf, size_t size, char chr) return true; } +/* + * Some file systems have issues with merging due to changing merge-sensitive + * parameters in the .mmap callback, and prior to .mmap_prepare being + * implemented everywhere this will now result in an unexpected failure to + * merge (e.g. - overlayfs). + * + * Perform a simple test to see if the local file system suffers from this, if + * it does then we can skip test logic that assumes local file system merging is + * sane. + */ +static bool local_fs_has_sane_mmap(FIXTURE_DATA(guard_regions) * self, + const FIXTURE_VARIANT(guard_regions) * variant) +{ + const unsigned long page_size = self->page_size; + char *ptr, *ptr2; + struct procmap_fd procmap; + + if (variant->backing != LOCAL_FILE_BACKED) + return true; + + /* Map 10 pages. */ + ptr = mmap_(self, variant, NULL, 10 * page_size, PROT_READ | PROT_WRITE, 0, 0); + if (ptr == MAP_FAILED) + return false; + /* Unmap the middle. */ + munmap(&ptr[5 * page_size], page_size); + + /* Map again. */ + ptr2 = mmap_(self, variant, &ptr[5 * page_size], page_size, PROT_READ | PROT_WRITE, + MAP_FIXED, 5 * page_size); + + if (ptr2 == MAP_FAILED) + return false; + + /* Now make sure they all merged. 
*/ + if (open_self_procmap(&procmap) != 0) + return false; + if (!find_vma_procmap(&procmap, ptr)) + return false; + if (procmap.query.vma_start != (unsigned long)ptr) + return false; + if (procmap.query.vma_end != (unsigned long)ptr + 10 * page_size) + return false; + close_procmap(&procmap); + + return true; +} + FIXTURE_SETUP(guard_regions) { self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); @@ -2138,4 +2187,140 @@ TEST_F(guard_regions, pagemap_scan) ASSERT_EQ(munmap(ptr, 10 * page_size), 0); } +TEST_F(guard_regions, collapse) +{ + const unsigned long page_size = self->page_size; + const unsigned long size = 2 * HPAGE_SIZE; + const unsigned long num_pages = size / page_size; + char *ptr; + int i; + + /* Need file to be correct size for tests for non-anon. */ + if (variant->backing != ANON_BACKED) + ASSERT_EQ(ftruncate(self->fd, size), 0); + + /* + * We must close and re-open local-file backed as read-only for + * CONFIG_READ_ONLY_THP_FOR_FS to work. + */ + if (variant->backing == LOCAL_FILE_BACKED) { + ASSERT_EQ(close(self->fd), 0); + + self->fd = open(self->path, O_RDONLY); + ASSERT_GE(self->fd, 0); + } + + ptr = mmap_(self, variant, NULL, size, PROT_READ, 0, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Prevent being faulted-in as huge. */ + ASSERT_EQ(madvise(ptr, size, MADV_NOHUGEPAGE), 0); + /* Fault in. */ + ASSERT_EQ(madvise(ptr, size, MADV_POPULATE_READ), 0); + + /* Install guard regions in ever other page. */ + for (i = 0; i < num_pages; i += 2) { + char *ptr_page = &ptr[i * page_size]; + + ASSERT_EQ(madvise(ptr_page, page_size, MADV_GUARD_INSTALL), 0); + /* Accesses should now fail. */ + ASSERT_FALSE(try_read_buf(ptr_page)); + } + + /* Allow huge page throughout region. */ + ASSERT_EQ(madvise(ptr, size, MADV_HUGEPAGE), 0); + + /* + * Now collapse the entire region. This should fail in all cases. + * + * The madvise() call will also fail if CONFIG_READ_ONLY_THP_FOR_FS is + * not set for the local file case, but we can't differentiate whether + * this occurred or if the collapse was rightly rejected. + */ + EXPECT_NE(madvise(ptr, size, MADV_COLLAPSE), 0); + + /* + * If we introduce a bug that causes the collapse to succeed, gather + * data on whether guard regions are at least preserved. The test will + * fail at this point in any case. + */ + for (i = 0; i < num_pages; i += 2) { + char *ptr_page = &ptr[i * page_size]; + + /* Accesses should still fail. */ + ASSERT_FALSE(try_read_buf(ptr_page)); + } +} + +TEST_F(guard_regions, smaps) +{ + const unsigned long page_size = self->page_size; + struct procmap_fd procmap; + char *ptr, *ptr2; + int i; + + /* Map a region. */ + ptr = mmap_(self, variant, NULL, 10 * page_size, PROT_READ | PROT_WRITE, 0, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* We shouldn't yet see a guard flag. */ + ASSERT_FALSE(check_vmflag_guard(ptr)); + + /* Install a single guard region. */ + ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0); + + /* Now we should see a guard flag. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); + + /* + * Removing the guard region should not change things because we simply + * cannot accurately track whether a given VMA has had all of its guard + * regions removed. + */ + ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_REMOVE), 0); + ASSERT_TRUE(check_vmflag_guard(ptr)); + + /* Install guard regions throughout. */ + for (i = 0; i < 10; i++) { + ASSERT_EQ(madvise(&ptr[i * page_size], page_size, MADV_GUARD_INSTALL), 0); + /* We should always see the guard region flag. 
*/ + ASSERT_TRUE(check_vmflag_guard(ptr)); + } + + /* Split into two VMAs. */ + ASSERT_EQ(munmap(&ptr[4 * page_size], page_size), 0); + + /* Both VMAs should have the guard flag set. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); + ASSERT_TRUE(check_vmflag_guard(&ptr[5 * page_size])); + + /* + * If the local file system is unable to merge VMAs due to having + * unusual characteristics, there is no point in asserting merge + * behaviour. + */ + if (!local_fs_has_sane_mmap(self, variant)) { + TH_LOG("local filesystem does not support sane merging skipping merge test"); + return; + } + + /* Map a fresh VMA between the two split VMAs. */ + ptr2 = mmap_(self, variant, &ptr[4 * page_size], page_size, + PROT_READ | PROT_WRITE, MAP_FIXED, 4 * page_size); + ASSERT_NE(ptr2, MAP_FAILED); + + /* + * Check the procmap to ensure that this VMA merged with the adjacent + * two. The guard region flag is 'sticky' so should not preclude + * merging. + */ + ASSERT_EQ(open_self_procmap(&procmap), 0); + ASSERT_TRUE(find_vma_procmap(&procmap, ptr)); + ASSERT_EQ(procmap.query.vma_start, (unsigned long)ptr); + ASSERT_EQ(procmap.query.vma_end, (unsigned long)ptr + 10 * page_size); + ASSERT_EQ(close_procmap(&procmap), 0); + /* And, of course, this VMA should have the guard flag set. */ + ASSERT_TRUE(check_vmflag_guard(ptr)); +} + TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/gup_test.c b/tools/testing/selftests/mm/gup_test.c index 8900b840c17a..40c1538a17b4 100644 --- a/tools/testing/selftests/mm/gup_test.c +++ b/tools/testing/selftests/mm/gup_test.c @@ -17,9 +17,8 @@ #define MB (1UL << 20) -/* Just the flags we need, copied from mm.h: */ +/* Just the flags we need, copied from the kernel internals. */ #define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ #define GUP_TEST_FILE "/sys/kernel/debug/gup_test" @@ -93,7 +92,7 @@ int main(int argc, char **argv) { struct gup_test gup = { 0 }; int filed, i, opt, nr_pages = 1, thp = -1, write = 1, nthreads = 1, ret; - int flags = MAP_PRIVATE, touch = 0; + int flags = MAP_PRIVATE; char *file = "/dev/zero"; pthread_t *tid; char *p; @@ -170,10 +169,6 @@ int main(int argc, char **argv) case 'H': flags |= (MAP_HUGETLB | MAP_ANONYMOUS); break; - case 'z': - /* fault pages in gup, do not fault in userland */ - touch = 1; - break; default: ksft_exit_fail_msg("Wrong argument\n"); } @@ -244,18 +239,9 @@ int main(int argc, char **argv) else if (thp == 0) madvise(p, size, MADV_NOHUGEPAGE); - /* - * FOLL_TOUCH, in gup_test, is used as an either/or case: either - * fault pages in from the kernel via FOLL_TOUCH, or fault them - * in here, from user space. This allows comparison of performance - * between those two cases. - */ - if (touch) { - gup.gup_flags |= FOLL_TOUCH; - } else { - for (; (unsigned long)p < gup.addr + size; p += psize()) - p[0] = 0; - } + /* Fault them in here, from user space. 
*/ + for (; (unsigned long)p < gup.addr + size; p += psize()) + p[0] = 0; tid = malloc(sizeof(pthread_t) * nthreads); assert(tid); diff --git a/tools/testing/selftests/mm/hmm-tests.c b/tools/testing/selftests/mm/hmm-tests.c index 15aadaf24a66..5a1525f72daa 100644 --- a/tools/testing/selftests/mm/hmm-tests.c +++ b/tools/testing/selftests/mm/hmm-tests.c @@ -25,6 +25,7 @@ #include <sys/stat.h> #include <sys/mman.h> #include <sys/ioctl.h> +#include <sys/time.h> /* @@ -50,6 +51,8 @@ enum { HMM_COHERENCE_DEVICE_TWO, }; +#define ONEKB (1 << 10) +#define ONEMEG (1 << 20) #define TWOMEG (1 << 21) #define HMM_BUFFER_SIZE (1024 << 12) #define HMM_PATH_MAX 64 @@ -207,8 +210,10 @@ static void hmm_buffer_free(struct hmm_buffer *buffer) if (buffer == NULL) return; - if (buffer->ptr) + if (buffer->ptr) { munmap(buffer->ptr, buffer->size); + buffer->ptr = NULL; + } free(buffer->mirror); free(buffer); } @@ -525,6 +530,8 @@ TEST_F(hmm, anon_write_prot) /* * Check that a device writing an anonymous private mapping * will copy-on-write if a child process inherits the mapping. + * + * Also verifies after fork() memory the device can be read by child. */ TEST_F(hmm, anon_write_child) { @@ -532,72 +539,101 @@ TEST_F(hmm, anon_write_child) unsigned long npages; unsigned long size; unsigned long i; + void *old_ptr; + void *map; int *ptr; pid_t pid; int child_fd; - int ret; - - npages = ALIGN(HMM_BUFFER_SIZE, self->page_size) >> self->page_shift; - ASSERT_NE(npages, 0); - size = npages << self->page_shift; - - buffer = malloc(sizeof(*buffer)); - ASSERT_NE(buffer, NULL); - - buffer->fd = -1; - buffer->size = size; - buffer->mirror = malloc(size); - ASSERT_NE(buffer->mirror, NULL); - - buffer->ptr = mmap(NULL, size, - PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, - buffer->fd, 0); - ASSERT_NE(buffer->ptr, MAP_FAILED); - - /* Initialize buffer->ptr so we can tell if it is written. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ptr[i] = i; - - /* Initialize data that the device will write to buffer->ptr. */ - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ptr[i] = -i; + int ret, use_thp, migrate; + + for (migrate = 0; migrate < 2; ++migrate) { + for (use_thp = 0; use_thp < 2; ++use_thp) { + npages = ALIGN(use_thp ? TWOMEG : HMM_BUFFER_SIZE, + self->page_size) >> self->page_shift; + ASSERT_NE(npages, 0); + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size * 2; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + + buffer->ptr = mmap(NULL, size * 2, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + old_ptr = buffer->ptr; + if (use_thp) { + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + } + + /* Initialize buffer->ptr so we can tell if it is written. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Initialize data that the device will write to buffer->ptr. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ptr[i] = -i; + + if (migrate) { + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + } + + pid = fork(); + if (pid == -1) + ASSERT_EQ(pid, 0); + if (pid != 0) { + waitpid(pid, &ret, 0); + ASSERT_EQ(WIFEXITED(ret), 1); + + /* Check that the parent's buffer did not change. 
*/ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); + continue; + } + + /* Check that we see the parent's values. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + if (!migrate) { + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + } + + /* The child process needs its own mirror to its own mm. */ + child_fd = hmm_open(0); + ASSERT_GE(child_fd, 0); + + /* Simulate a device writing system memory. */ + ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + ASSERT_EQ(buffer->faults, 1); - pid = fork(); - if (pid == -1) - ASSERT_EQ(pid, 0); - if (pid != 0) { - waitpid(pid, &ret, 0); - ASSERT_EQ(WIFEXITED(ret), 1); + /* Check what the device wrote. */ + if (!migrate) { + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], -i); + } - /* Check that the parent's buffer did not change. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - return; + close(child_fd); + exit(0); + } } - - /* Check that we see the parent's values. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], i); - for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - /* The child process needs its own mirror to its own mm. */ - child_fd = hmm_open(0); - ASSERT_GE(child_fd, 0); - - /* Simulate a device writing system memory. */ - ret = hmm_dmirror_cmd(child_fd, HMM_DMIRROR_WRITE, buffer, npages); - ASSERT_EQ(ret, 0); - ASSERT_EQ(buffer->cpages, npages); - ASSERT_EQ(buffer->faults, 1); - - /* Check what the device wrote. */ - for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) - ASSERT_EQ(ptr[i], -i); - - close(child_fd); - exit(0); } /* @@ -2055,4 +2091,765 @@ TEST_F(hmm, hmm_cow_in_device) hmm_buffer_free(buffer); } + +/* + * Migrate private anonymous huge empty page. + */ +TEST_F(hmm, migrate_anon_huge_empty) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge zero page. 
+ */ +TEST_F(hmm, migrate_anon_huge_zero) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + int val; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize a read-only zero huge page. */ + val = *(int *)buffer->ptr; + ASSERT_EQ(val, 0); + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) { + ASSERT_EQ(ptr[i], 0); + /* If it asserts once, it probably will 500,000 times */ + if (ptr[i] != 0) + break; + } + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge page and free. + */ +TEST_F(hmm, migrate_anon_huge_free) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Try freeing it. */ + ret = madvise(map, size, MADV_FREE); + ASSERT_EQ(ret, 0); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge page and fault back to sysmem. 
+ */ +TEST_F(hmm, migrate_anon_huge_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate memory and fault back to sysmem after partially unmapping. + */ +TEST_F(hmm, migrate_partial_unmap_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size = TWOMEG; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret, j, use_thp; + int offsets[] = { 0, 512 * ONEKB, ONEMEG }; + + for (use_thp = 0; use_thp < 2; ++use_thp) { + for (j = 0; j < ARRAY_SIZE(offsets); ++j) { + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, 2 * size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + if (use_thp) + ret = madvise(map, size, MADV_HUGEPAGE); + else + ret = madvise(map, size, MADV_NOHUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + munmap(buffer->ptr + offsets[j], ONEMEG); + + /* Fault pages back to system memory and check them. 
*/ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + if (i * sizeof(int) < offsets[j] || + i * sizeof(int) >= offsets[j] + ONEMEG) + ASSERT_EQ(ptr[i], i); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); + } + } +} + +TEST_F(hmm, migrate_remap_fault) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size = TWOMEG; + unsigned long i; + void *old_ptr, *new_ptr = NULL; + void *map; + int *ptr; + int ret, j, use_thp, dont_unmap, before; + int offsets[] = { 0, 512 * ONEKB, ONEMEG }; + + for (before = 0; before < 2; ++before) { + for (dont_unmap = 0; dont_unmap < 2; ++dont_unmap) { + for (use_thp = 0; use_thp < 2; ++use_thp) { + for (j = 0; j < ARRAY_SIZE(offsets); ++j) { + int flags = MREMAP_MAYMOVE | MREMAP_FIXED; + + if (dont_unmap) + flags |= MREMAP_DONTUNMAP; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 8 * size; + buffer->mirror = malloc(size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, size); + + buffer->ptr = mmap(NULL, buffer->size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, + buffer->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)buffer->ptr, size); + if (use_thp) + ret = madvise(map, size, MADV_HUGEPAGE); + else + ret = madvise(map, size, MADV_NOHUGEPAGE); + ASSERT_EQ(ret, 0); + old_ptr = buffer->ptr; + munmap(map + size, size * 2); + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; + i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + if (before) { + new_ptr = mremap((void *)map, size, size, flags, + map + size + offsets[j]); + ASSERT_NE(new_ptr, MAP_FAILED); + buffer->ptr = new_ptr; + } + + /* Migrate memory to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; + i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + if (!before) { + new_ptr = mremap((void *)map, size, size, flags, + map + size + offsets[j]); + ASSERT_NE(new_ptr, MAP_FAILED); + buffer->ptr = new_ptr; + } + + /* Fault pages back to system memory and check them. */ + for (i = 0, ptr = buffer->ptr; + i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], i); + + munmap(new_ptr, size); + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); + } + } + } + } +} + +/* + * Migrate private anonymous huge page with allocation errors. + */ +TEST_F(hmm, migrate_anon_huge_err) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(2 * size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, 2 * size); + + old_ptr = mmap(NULL, 2 * size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate memory to device but force a THP allocation error. 
*/ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) { + ASSERT_EQ(ptr[i], i); + if (ptr[i] != i) + break; + } + + /* Try faulting back a single (PAGE_SIZE) page. */ + ptr = buffer->ptr; + ASSERT_EQ(ptr[2048], 2048); + + /* unmap and remap the region to reset things. */ + ret = munmap(old_ptr, 2 * size); + ASSERT_EQ(ret, 0); + old_ptr = mmap(NULL, 2 * size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Initialize buffer in system memory. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ptr[i] = i; + + /* Migrate THP to device. */ + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* + * Force an allocation error when faulting back a THP resident in the + * device. + */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + + ret = hmm_migrate_dev_to_sys(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ptr = buffer->ptr; + ASSERT_EQ(ptr[2048], 2048); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +/* + * Migrate private anonymous huge zero page with allocation errors. + */ +TEST_F(hmm, migrate_anon_huge_zero_err) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned long i; + void *old_ptr; + void *map; + int *ptr; + int ret; + + size = TWOMEG; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = 2 * size; + buffer->mirror = malloc(2 * size); + ASSERT_NE(buffer->mirror, NULL); + memset(buffer->mirror, 0xFF, 2 * size); + + old_ptr = mmap(NULL, 2 * size, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + + npages = size >> self->page_shift; + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Migrate memory to device but force a THP allocation error. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device read. */ + for (i = 0, ptr = buffer->mirror; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + /* Try faulting back a single (PAGE_SIZE) page. */ + ptr = buffer->ptr; + ASSERT_EQ(ptr[2048], 0); + + /* unmap and remap the region to reset things. */ + ret = munmap(old_ptr, 2 * size); + ASSERT_EQ(ret, 0); + old_ptr = mmap(NULL, 2 * size, PROT_READ, + MAP_PRIVATE | MAP_ANONYMOUS, buffer->fd, 0); + ASSERT_NE(old_ptr, MAP_FAILED); + map = (void *)ALIGN((uintptr_t)old_ptr, size); + ret = madvise(map, size, MADV_HUGEPAGE); + ASSERT_EQ(ret, 0); + buffer->ptr = map; + + /* Initialize buffer in system memory (zero THP page). */ + ret = ptr[0]; + ASSERT_EQ(ret, 0); + + /* Migrate memory to device but force a THP allocation error. 
*/ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_FLAGS, buffer, + HMM_DMIRROR_FLAG_FAIL_ALLOC); + ASSERT_EQ(ret, 0); + ret = hmm_migrate_sys_to_dev(self->fd, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Fault the device memory back and check it. */ + for (i = 0, ptr = buffer->ptr; i < size / sizeof(*ptr); ++i) + ASSERT_EQ(ptr[i], 0); + + buffer->ptr = old_ptr; + hmm_buffer_free(buffer); +} + +struct benchmark_results { + double sys_to_dev_time; + double dev_to_sys_time; + double throughput_s2d; + double throughput_d2s; +}; + +static double get_time_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000.0) + (tv.tv_usec / 1000.0); +} + +static inline struct hmm_buffer *hmm_buffer_alloc(unsigned long size) +{ + struct hmm_buffer *buffer; + + buffer = malloc(sizeof(*buffer)); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(size); + memset(buffer->mirror, 0xFF, size); + return buffer; +} + +static void print_benchmark_results(const char *test_name, size_t buffer_size, + struct benchmark_results *thp, + struct benchmark_results *regular) +{ + double s2d_improvement = ((regular->sys_to_dev_time - thp->sys_to_dev_time) / + regular->sys_to_dev_time) * 100.0; + double d2s_improvement = ((regular->dev_to_sys_time - thp->dev_to_sys_time) / + regular->dev_to_sys_time) * 100.0; + double throughput_s2d_improvement = ((thp->throughput_s2d - regular->throughput_s2d) / + regular->throughput_s2d) * 100.0; + double throughput_d2s_improvement = ((thp->throughput_d2s - regular->throughput_d2s) / + regular->throughput_d2s) * 100.0; + + printf("\n=== %s (%.1f MB) ===\n", test_name, buffer_size / (1024.0 * 1024.0)); + printf(" | With THP | Without THP | Improvement\n"); + printf("---------------------------------------------------------------------\n"); + printf("Sys->Dev Migration | %.3f ms | %.3f ms | %.1f%%\n", + thp->sys_to_dev_time, regular->sys_to_dev_time, s2d_improvement); + printf("Dev->Sys Migration | %.3f ms | %.3f ms | %.1f%%\n", + thp->dev_to_sys_time, regular->dev_to_sys_time, d2s_improvement); + printf("S->D Throughput | %.2f GB/s | %.2f GB/s | %.1f%%\n", + thp->throughput_s2d, regular->throughput_s2d, throughput_s2d_improvement); + printf("D->S Throughput | %.2f GB/s | %.2f GB/s | %.1f%%\n", + thp->throughput_d2s, regular->throughput_d2s, throughput_d2s_improvement); +} + +/* + * Run a single migration benchmark + * fd: file descriptor for hmm device + * use_thp: whether to use THP + * buffer_size: size of buffer to allocate + * iterations: number of iterations + * results: where to store results + */ +static inline int run_migration_benchmark(int fd, int use_thp, size_t buffer_size, + int iterations, struct benchmark_results *results) +{ + struct hmm_buffer *buffer; + unsigned long npages = buffer_size / sysconf(_SC_PAGESIZE); + double start, end; + double s2d_total = 0, d2s_total = 0; + int ret, i; + int *ptr; + + buffer = hmm_buffer_alloc(buffer_size); + + /* Map memory */ + buffer->ptr = mmap(NULL, buffer_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (!buffer->ptr) + return -1; + + /* Apply THP hint if requested */ + if (use_thp) + ret = madvise(buffer->ptr, buffer_size, MADV_HUGEPAGE); + else + ret = madvise(buffer->ptr, buffer_size, MADV_NOHUGEPAGE); + + if (ret) + return ret; + + /* Initialize memory to make sure pages are allocated */ + ptr = (int *)buffer->ptr; + for (i = 0; i < buffer_size / sizeof(int); i++) + ptr[i] = i & 0xFF; + + /* Warmup iteration */ + ret = 
hmm_migrate_sys_to_dev(fd, buffer, npages); + if (ret) + return ret; + + ret = hmm_migrate_dev_to_sys(fd, buffer, npages); + if (ret) + return ret; + + /* Benchmark iterations */ + for (i = 0; i < iterations; i++) { + /* System to device migration */ + start = get_time_ms(); + + ret = hmm_migrate_sys_to_dev(fd, buffer, npages); + if (ret) + return ret; + + end = get_time_ms(); + s2d_total += (end - start); + + /* Device to system migration */ + start = get_time_ms(); + + ret = hmm_migrate_dev_to_sys(fd, buffer, npages); + if (ret) + return ret; + + end = get_time_ms(); + d2s_total += (end - start); + } + + /* Calculate average times and throughput */ + results->sys_to_dev_time = s2d_total / iterations; + results->dev_to_sys_time = d2s_total / iterations; + results->throughput_s2d = (buffer_size / (1024.0 * 1024.0 * 1024.0)) / + (results->sys_to_dev_time / 1000.0); + results->throughput_d2s = (buffer_size / (1024.0 * 1024.0 * 1024.0)) / + (results->dev_to_sys_time / 1000.0); + + /* Cleanup */ + hmm_buffer_free(buffer); + return 0; +} + +/* + * Benchmark THP migration with different buffer sizes + */ +TEST_F_TIMEOUT(hmm, benchmark_thp_migration, 120) +{ + struct benchmark_results thp_results, regular_results; + size_t thp_size = 2 * 1024 * 1024; /* 2MB - typical THP size */ + int iterations = 5; + + printf("\nHMM THP Migration Benchmark\n"); + printf("---------------------------\n"); + printf("System page size: %ld bytes\n", sysconf(_SC_PAGESIZE)); + + /* Test different buffer sizes */ + size_t test_sizes[] = { + thp_size / 4, /* 512KB - smaller than THP */ + thp_size / 2, /* 1MB - half THP */ + thp_size, /* 2MB - single THP */ + thp_size * 2, /* 4MB - two THPs */ + thp_size * 4, /* 8MB - four THPs */ + thp_size * 8, /* 16MB - eight THPs */ + thp_size * 128, /* 256MB - one twenty eight THPs */ + }; + + static const char *const test_names[] = { + "Small Buffer (512KB)", + "Half THP Size (1MB)", + "Single THP Size (2MB)", + "Two THP Size (4MB)", + "Four THP Size (8MB)", + "Eight THP Size (16MB)", + "One twenty eight THP Size (256MB)" + }; + + int num_tests = ARRAY_SIZE(test_sizes); + + /* Run all tests */ + for (int i = 0; i < num_tests; i++) { + /* Test with THP */ + ASSERT_EQ(run_migration_benchmark(self->fd, 1, test_sizes[i], + iterations, &thp_results), 0); + + /* Test without THP */ + ASSERT_EQ(run_migration_benchmark(self->fd, 0, test_sizes[i], + iterations, &regular_results), 0); + + /* Print results */ + print_benchmark_results(test_names[i], test_sizes[i], + &thp_results, &regular_results); + } +} TEST_HARNESS_MAIN diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c index ac136f04b8d6..95afa5cfc062 100644 --- a/tools/testing/selftests/mm/ksm_functional_tests.c +++ b/tools/testing/selftests/mm/ksm_functional_tests.c @@ -38,6 +38,8 @@ enum ksm_merge_mode { }; static int mem_fd; +static int pages_to_scan_fd; +static int sleep_millisecs_fd; static int pagemap_fd; static size_t pagesize; @@ -493,6 +495,46 @@ static void test_prctl_fork(void) ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); } +static int start_ksmd_and_set_frequency(char *pages_to_scan, char *sleep_ms) +{ + int ksm_fd; + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + return -errno; + + if (write(ksm_fd, "1", 1) != 1) + return -errno; + + if (write(pages_to_scan_fd, pages_to_scan, strlen(pages_to_scan)) <= 0) + return -errno; + + if (write(sleep_millisecs_fd, sleep_ms, strlen(sleep_ms)) <= 0) + return -errno; + + return 0; +} +
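The helper above pairs with stop_ksmd_and_restore_frequency() just below, and a test is expected to bracket its work with the two calls. A minimal usage sketch (illustrative only, not part of the patch; it mirrors how test_prctl_fork_exec() drives these helpers later in this file):

	/* Hypothetical test body: speed ksmd up for the duration of the test. */
	if (start_ksmd_and_set_frequency("2000", "0"))
		ksft_test_result_fail("set ksmd's scanning frequency failed\n");

	/* ... exercise KSM merging behaviour here ... */

	/* Go back to the 100 pages_to_scan / 20 ms values the restore helper writes. */
	if (stop_ksmd_and_restore_frequency())
		ksft_test_result_fail("restore ksmd frequency failed\n");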
+static int stop_ksmd_and_restore_frequency(void) +{ + int ksm_fd; + + ksm_fd = open("/sys/kernel/mm/ksm/run", O_RDWR); + if (ksm_fd < 0) + return -errno; + + if (write(ksm_fd, "2", 1) != 1) + return -errno; + + if (write(pages_to_scan_fd, "100", 3) <= 0) + return -errno; + + if (write(sleep_millisecs_fd, "20", 2) <= 0) + return -errno; + + return 0; +} + static void test_prctl_fork_exec(void) { int ret, status; @@ -500,6 +542,9 @@ static void test_prctl_fork_exec(void) ksft_print_msg("[RUN] %s\n", __func__); + if (start_ksmd_and_set_frequency("2000", "0")) + ksft_test_result_fail("set ksmd's scanning frequency failed\n"); + ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0); if (ret < 0 && errno == EINVAL) { ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n"); @@ -542,6 +587,11 @@ static void test_prctl_fork_exec(void) return; } + if (stop_ksmd_and_restore_frequency()) { + ksft_test_result_fail("restore ksmd frequency failed\n"); + return; + } + ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n"); } @@ -656,6 +706,13 @@ static void init_global_file_handles(void) ksft_exit_skip("open(\"/proc/self/pagemap\") failed\n"); if (ksm_get_self_merging_pages() < 0) ksft_exit_skip("accessing \"/proc/self/ksm_merging_pages\") failed\n"); + + pages_to_scan_fd = open("/sys/kernel/mm/ksm/pages_to_scan", O_RDWR); + if (pages_to_scan_fd < 0) + ksft_exit_fail_msg("opening /sys/kernel/mm/ksm/pages_to_scan failed\n"); + sleep_millisecs_fd = open("/sys/kernel/mm/ksm/sleep_millisecs", O_RDWR); + if (sleep_millisecs_fd < 0) + ksft_exit_fail_msg("opening /sys/kernel/mm/ksm/sleep_millisecs failed\n"); } int main(int argc, char **argv) diff --git a/tools/testing/selftests/mm/mremap_test.c b/tools/testing/selftests/mm/mremap_test.c index bf2863b102e3..5f073504e0b1 100644 --- a/tools/testing/selftests/mm/mremap_test.c +++ b/tools/testing/selftests/mm/mremap_test.c @@ -994,7 +994,7 @@ static void mremap_move_multi_invalid_vmas(FILE *maps_fp, unsigned long page_siz static long long remap_region(struct config c, unsigned int threshold_mb, char *rand_addr) { - void *addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL; + void *addr, *tmp_addr, *src_addr, *dest_addr, *dest_preamble_addr = NULL; unsigned long long t, d; struct timespec t_start = {0, 0}, t_end = {0, 0}; long long start_ns, end_ns, align_mask, ret, offset; @@ -1032,7 +1032,8 @@ static long long remap_region(struct config c, unsigned int threshold_mb, /* Don't destroy existing mappings unless expected to overlap */ while (!is_remap_region_valid(addr, c.region_size) && !c.overlapping) { /* Check for unsigned overflow */ - if (addr + c.dest_alignment < addr) { + tmp_addr = addr + c.dest_alignment; + if (tmp_addr < addr) { ksft_print_msg("Couldn't find a valid region to remap to\n"); ret = -1; goto clean_up_src; diff --git a/tools/testing/selftests/mm/soft-dirty.c b/tools/testing/selftests/mm/soft-dirty.c index 4ee4db3750c1..c3a9585de98c 100644 --- a/tools/testing/selftests/mm/soft-dirty.c +++ b/tools/testing/selftests/mm/soft-dirty.c @@ -184,6 +184,130 @@ static void test_mprotect(int pagemap_fd, int pagesize, bool anon) close(test_fd); } +static void test_merge(int pagemap_fd, int pagesize) +{ + char *reserved, *map, *map2; + + /* + * Reserve space for tests: + * + * ---padding to --- + * | avoid adj. 
| + * v merge v + * |---|---|---|---|---| + * | | 1 | 2 | 3 | | + * |---|---|---|---|---| + */ + reserved = mmap(NULL, 5 * pagesize, PROT_NONE, + MAP_ANON | MAP_PRIVATE, -1, 0); + if (reserved == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + munmap(reserved, 4 * pagesize); + + /* + * Establish initial VMA: + * + * S/D + * |---|---|---|---|---| + * | | 1 | | | | + * |---|---|---|---|---| + */ + map = mmap(&reserved[pagesize], pagesize, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* This will clear VM_SOFTDIRTY too. */ + clear_softdirty(); + + /* + * Now place a new mapping which will be marked VM_SOFTDIRTY. Away from + * map: + * + * - S/D + * |---|---|---|---|---| + * | | 1 | | 2 | | + * |---|---|---|---|---| + */ + map2 = mmap(&reserved[3 * pagesize], pagesize, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* + * Now remap it immediately adjacent to map, if the merge correctly + * propagates VM_SOFTDIRTY, we should then observe the VMA as a whole + * being marked soft-dirty: + * + * merge + * S/D + * |---|-------|---|---| + * | | 1 | | | + * |---|-------|---|---| + */ + map2 = mremap(map2, pagesize, pagesize, MREMAP_FIXED | MREMAP_MAYMOVE, + &reserved[2 * pagesize]); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mremap failed\n"); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-anon soft-dirty after remap merge 1st pg\n", + __func__); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, + "Test %s-anon soft-dirty after remap merge 2nd pg\n", + __func__); + + munmap(map, 2 * pagesize); + + /* + * Now establish another VMA: + * + * S/D + * |---|---|---|---|---| + * | | 1 | | | | + * |---|---|---|---|---| + */ + map = mmap(&reserved[pagesize], pagesize, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* Clear VM_SOFTDIRTY... 
*/ + clear_softdirty(); + /* ...and establish incompatible adjacent VMA: + * + * - S/D + * |---|---|---|---|---| + * | | 1 | 2 | | | + * |---|---|---|---|---| + */ + map2 = mmap(&reserved[2 * pagesize], pagesize, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANON | MAP_PRIVATE | MAP_FIXED, -1, 0); + if (map2 == MAP_FAILED) + ksft_exit_fail_msg("mmap failed\n"); + + /* + * Now mprotect() VMA 1 so it's compatible with 2 and therefore merges: + * + * merge + * S/D + * |---|-------|---|---| + * | | 1 | | | + * |---|-------|---|---| + */ + if (mprotect(map, pagesize, PROT_READ | PROT_WRITE | PROT_EXEC)) + ksft_exit_fail_msg("mprotect failed\n"); + + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map) == 1, + "Test %s-anon soft-dirty after mprotect merge 1st pg\n", + __func__); + ksft_test_result(pagemap_is_softdirty(pagemap_fd, map2) == 1, + "Test %s-anon soft-dirty after mprotect merge 2nd pg\n", + __func__); + + munmap(map, 2 * pagesize); +} + static void test_mprotect_anon(int pagemap_fd, int pagesize) { test_mprotect(pagemap_fd, pagesize, true); @@ -204,7 +328,7 @@ int main(int argc, char **argv) if (!softdirty_supported()) ksft_exit_skip("soft-dirty is not support\n"); - ksft_set_plan(15); + ksft_set_plan(19); pagemap_fd = open(PAGEMAP_FILE_PATH, O_RDONLY); if (pagemap_fd < 0) ksft_exit_fail_msg("Failed to open %s\n", PAGEMAP_FILE_PATH); @@ -216,6 +340,7 @@ int main(int argc, char **argv) test_hugepage(pagemap_fd, pagesize); test_mprotect_anon(pagemap_fd, pagesize); test_mprotect_file(pagemap_fd, pagesize); + test_merge(pagemap_fd, pagesize); close(pagemap_fd); diff --git a/tools/testing/selftests/mm/uffd-common.c b/tools/testing/selftests/mm/uffd-common.c index 994fe8c03923..edd02328f77b 100644 --- a/tools/testing/selftests/mm/uffd-common.c +++ b/tools/testing/selftests/mm/uffd-common.c @@ -10,7 +10,6 @@ uffd_test_ops_t *uffd_test_ops; uffd_test_case_ops_t *uffd_test_case_ops; -#define BASE_PMD_ADDR ((void *)(1UL << 30)) /* pthread_mutex_t starts at page offset 0 */ pthread_mutex_t *area_mutex(char *area, unsigned long nr, uffd_global_test_opts_t *gopts) @@ -142,30 +141,37 @@ static int shmem_allocate_area(uffd_global_test_opts_t *gopts, void **alloc_area unsigned long offset = is_src ? 0 : bytes; char *p = NULL, *p_alias = NULL; int mem_fd = uffd_mem_fd_create(bytes * 2, false); + size_t region_size = bytes * 2 + hpage_size; - /* TODO: clean this up. 
Use a static addr is ugly */ - p = BASE_PMD_ADDR; - if (!is_src) - /* src map + alias + interleaved hpages */ - p += 2 * (bytes + hpage_size); + void *reserve = mmap(NULL, region_size, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, + -1, 0); + if (reserve == MAP_FAILED) { + close(mem_fd); + return -errno; + } + + p = reserve; p_alias = p; p_alias += bytes; p_alias += hpage_size; /* Prevent src/dst VMA merge */ - *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, mem_fd, offset); if (*alloc_area == MAP_FAILED) { *alloc_area = NULL; + munmap(reserve, region_size); + close(mem_fd); return -errno; } if (*alloc_area != p) err("mmap of memfd failed at %p", p); - area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED, mem_fd, offset); if (area_alias == MAP_FAILED) { - munmap(*alloc_area, bytes); *alloc_area = NULL; + munmap(reserve, region_size); + close(mem_fd); return -errno; } if (area_alias != p_alias) diff --git a/tools/testing/selftests/mm/uffd-stress.c b/tools/testing/selftests/mm/uffd-stress.c index b51c89e1cd1a..700fbaa18d44 100644 --- a/tools/testing/selftests/mm/uffd-stress.c +++ b/tools/testing/selftests/mm/uffd-stress.c @@ -241,7 +241,7 @@ static int stress(struct uffd_args *args) return 1; for (cpu = 0; cpu < gopts->nr_parallel; cpu++) { - char c; + char c = '\0'; if (bounces & BOUNCE_POLL) { if (write(gopts->pipefd[cpu*2+1], &c, 1) != 1) err("pipefd write error"); diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c index f917b4c4c943..f4807242c5b2 100644 --- a/tools/testing/selftests/mm/uffd-unit-tests.c +++ b/tools/testing/selftests/mm/uffd-unit-tests.c @@ -543,7 +543,7 @@ static void uffd_minor_test_common(uffd_global_test_opts_t *gopts, bool test_col { unsigned long p; pthread_t uffd_mon; - char c; + char c = '\0'; struct uffd_args args = { 0 }; args.gopts = gopts; @@ -759,7 +759,7 @@ static void uffd_sigbus_test_common(uffd_global_test_opts_t *gopts, bool wp) pthread_t uffd_mon; pid_t pid; int err; - char c; + char c = '\0'; struct uffd_args args = { 0 }; args.gopts = gopts; @@ -819,7 +819,7 @@ static void uffd_events_test_common(uffd_global_test_opts_t *gopts, bool wp) pthread_t uffd_mon; pid_t pid; int err; - char c; + char c = '\0'; struct uffd_args args = { 0 }; args.gopts = gopts; @@ -1125,7 +1125,7 @@ uffd_move_test_common(uffd_global_test_opts_t *gopts, { unsigned long nr; pthread_t uffd_mon; - char c; + char c = '\0'; unsigned long long count; struct uffd_args args = { 0 }; char *orig_area_src = NULL, *orig_area_dst = NULL; diff --git a/tools/testing/selftests/mm/vm_util.c b/tools/testing/selftests/mm/vm_util.c index e33cda301dad..605cb58ea5c3 100644 --- a/tools/testing/selftests/mm/vm_util.c +++ b/tools/testing/selftests/mm/vm_util.c @@ -449,6 +449,11 @@ bool check_vmflag_pfnmap(void *addr) return check_vmflag(addr, "pf"); } +bool check_vmflag_guard(void *addr) +{ + return check_vmflag(addr, "gu"); +} + bool softdirty_supported(void) { char *addr; diff --git a/tools/testing/selftests/mm/vm_util.h b/tools/testing/selftests/mm/vm_util.h index 26c30fdc0241..a8abdf414d46 100644 --- a/tools/testing/selftests/mm/vm_util.h +++ b/tools/testing/selftests/mm/vm_util.h @@ -98,6 +98,7 @@ int uffd_register_with_ioctls(int uffd, void *addr, uint64_t len, unsigned long get_free_hugepages(void); bool check_vmflag_io(void *addr); bool 
check_vmflag_pfnmap(void *addr); +bool check_vmflag_guard(void *addr); int open_procmap(pid_t pid, struct procmap_fd *procmap_out); int query_procmap(struct procmap_fd *procmap); bool find_vma_procmap(struct procmap_fd *procmap, void *address); diff --git a/tools/testing/selftests/riscv/hwprobe/cbo.c b/tools/testing/selftests/riscv/hwprobe/cbo.c index 5e96ef785d0d..6d99726aceac 100644 --- a/tools/testing/selftests/riscv/hwprobe/cbo.c +++ b/tools/testing/selftests/riscv/hwprobe/cbo.c @@ -15,24 +15,31 @@ #include <linux/compiler.h> #include <linux/kernel.h> #include <asm/ucontext.h> +#include <getopt.h> #include "hwprobe.h" #include "../../kselftest.h" #define MK_CBO(fn) le32_bswap((uint32_t)(fn) << 20 | 10 << 15 | 2 << 12 | 0 << 7 | 15) +#define MK_PREFETCH(fn) \ + le32_bswap(0 << 25 | (uint32_t)(fn) << 20 | 10 << 15 | 6 << 12 | 0 << 7 | 19) static char mem[4096] __aligned(4096) = { [0 ... 4095] = 0xa5 }; -static bool illegal_insn; +static bool got_fault; -static void sigill_handler(int sig, siginfo_t *info, void *context) +static void fault_handler(int sig, siginfo_t *info, void *context) { unsigned long *regs = (unsigned long *)&((ucontext_t *)context)->uc_mcontext; uint32_t insn = *(uint32_t *)regs[0]; - assert(insn == MK_CBO(regs[11])); + if (sig == SIGILL) + assert(insn == MK_CBO(regs[11])); - illegal_insn = true; + if (sig == SIGSEGV || sig == SIGBUS) + assert(insn == MK_PREFETCH(regs[11])); + + got_fault = true; regs[0] += 4; } @@ -45,39 +52,51 @@ static void sigill_handler(int sig, siginfo_t *info, void *context) : : "r" (base), "i" (fn), "i" (MK_CBO(fn)) : "a0", "a1", "memory"); \ }) +#define prefetch_insn(base, fn) \ +({ \ + asm volatile( \ + "mv a0, %0\n" \ + "li a1, %1\n" \ + ".4byte %2\n" \ + : : "r" (base), "i" (fn), "i" (MK_PREFETCH(fn)) : "a0", "a1"); \ +}) + static void cbo_inval(char *base) { cbo_insn(base, 0); } static void cbo_clean(char *base) { cbo_insn(base, 1); } static void cbo_flush(char *base) { cbo_insn(base, 2); } static void cbo_zero(char *base) { cbo_insn(base, 4); } +static void prefetch_i(char *base) { prefetch_insn(base, 0); } +static void prefetch_r(char *base) { prefetch_insn(base, 1); } +static void prefetch_w(char *base) { prefetch_insn(base, 3); } static void test_no_cbo_inval(void *arg) { ksft_print_msg("Testing cbo.inval instruction remain privileged\n"); - illegal_insn = false; + got_fault = false; cbo_inval(&mem[0]); - ksft_test_result(illegal_insn, "No cbo.inval\n"); + ksft_test_result(got_fault, "No cbo.inval\n"); } static void test_no_zicbom(void *arg) { ksft_print_msg("Testing Zicbom instructions remain privileged\n"); - illegal_insn = false; + got_fault = false; cbo_clean(&mem[0]); - ksft_test_result(illegal_insn, "No cbo.clean\n"); + ksft_test_result(got_fault, "No cbo.clean\n"); - illegal_insn = false; + got_fault = false; cbo_flush(&mem[0]); - ksft_test_result(illegal_insn, "No cbo.flush\n"); + ksft_test_result(got_fault, "No cbo.flush\n"); } static void test_no_zicboz(void *arg) { ksft_print_msg("No Zicboz, testing cbo.zero remains privileged\n"); - illegal_insn = false; + got_fault = false; cbo_zero(&mem[0]); - ksft_test_result(illegal_insn, "No cbo.zero\n"); + ksft_test_result(got_fault, "No cbo.zero\n"); } static bool is_power_of_2(__u64 n) @@ -85,6 +104,51 @@ static bool is_power_of_2(__u64 n) return n != 0 && (n & (n - 1)) == 0; } +static void test_zicbop(void *arg) +{ + struct riscv_hwprobe pair = { + .key = RISCV_HWPROBE_KEY_ZICBOP_BLOCK_SIZE, + }; + struct sigaction act = { + .sa_sigaction = &fault_handler, + .sa_flags = SA_SIGINFO 
+ }; + struct sigaction dfl = { + .sa_handler = SIG_DFL + }; + cpu_set_t *cpus = (cpu_set_t *)arg; + __u64 block_size; + long rc; + + rc = sigaction(SIGSEGV, &act, NULL); + assert(rc == 0); + rc = sigaction(SIGBUS, &act, NULL); + assert(rc == 0); + + rc = riscv_hwprobe(&pair, 1, sizeof(cpu_set_t), (unsigned long *)cpus, 0); + block_size = pair.value; + ksft_test_result(rc == 0 && pair.key == RISCV_HWPROBE_KEY_ZICBOP_BLOCK_SIZE && + is_power_of_2(block_size), "Zicbop block size\n"); + ksft_print_msg("Zicbop block size: %llu\n", block_size); + + got_fault = false; + prefetch_i(&mem[0]); + prefetch_r(&mem[0]); + prefetch_w(&mem[0]); + ksft_test_result(!got_fault, "Zicbop prefetch.* on valid address\n"); + + got_fault = false; + prefetch_i(NULL); + prefetch_r(NULL); + prefetch_w(NULL); + ksft_test_result(!got_fault, "Zicbop prefetch.* on NULL\n"); + + rc = sigaction(SIGBUS, &dfl, NULL); + assert(rc == 0); + rc = sigaction(SIGSEGV, &dfl, NULL); + assert(rc == 0); +} + static void test_zicbom(void *arg) { struct riscv_hwprobe pair = { @@ -100,13 +164,13 @@ static void test_zicbom(void *arg) is_power_of_2(block_size), "Zicbom block size\n"); ksft_print_msg("Zicbom block size: %llu\n", block_size); - illegal_insn = false; + got_fault = false; cbo_clean(&mem[block_size]); - ksft_test_result(!illegal_insn, "cbo.clean\n"); + ksft_test_result(!got_fault, "cbo.clean\n"); - illegal_insn = false; + got_fault = false; cbo_flush(&mem[block_size]); - ksft_test_result(!illegal_insn, "cbo.flush\n"); + ksft_test_result(!got_fault, "cbo.flush\n"); } static void test_zicboz(void *arg) @@ -125,11 +189,11 @@ static void test_zicboz(void *arg) is_power_of_2(block_size), "Zicboz block size\n"); ksft_print_msg("Zicboz block size: %llu\n", block_size); - illegal_insn = false; + got_fault = false; cbo_zero(&mem[block_size]); - ksft_test_result(!illegal_insn, "cbo.zero\n"); + ksft_test_result(!got_fault, "cbo.zero\n"); - if (illegal_insn || !is_power_of_2(block_size)) { + if (got_fault || !is_power_of_2(block_size)) { ksft_test_result_skip("cbo.zero check\n"); return; } @@ -177,7 +241,19 @@ static void check_no_zicbo_cpus(cpu_set_t *cpus, __u64 cbo) rc = riscv_hwprobe(&pair, 1, sizeof(cpu_set_t), (unsigned long *)&one_cpu, 0); assert(rc == 0 && pair.key == RISCV_HWPROBE_KEY_IMA_EXT_0); - cbostr = cbo == RISCV_HWPROBE_EXT_ZICBOZ ? 
"Zicboz" : "Zicbom"; + switch (cbo) { + case RISCV_HWPROBE_EXT_ZICBOZ: + cbostr = "Zicboz"; + break; + case RISCV_HWPROBE_EXT_ZICBOM: + cbostr = "Zicbom"; + break; + case RISCV_HWPROBE_EXT_ZICBOP: + cbostr = "Zicbop"; + break; + default: + ksft_exit_fail_msg("Internal error: invalid cbo %llu\n", cbo); + } if (pair.value & cbo) ksft_exit_fail_msg("%s is only present on a subset of harts.\n" @@ -194,6 +270,7 @@ enum { TEST_ZICBOM, TEST_NO_ZICBOM, TEST_NO_CBO_INVAL, + TEST_ZICBOP, }; static struct test_info { @@ -206,26 +283,51 @@ static struct test_info { [TEST_ZICBOM] = { .nr_tests = 3, test_zicbom }, [TEST_NO_ZICBOM] = { .nr_tests = 2, test_no_zicbom }, [TEST_NO_CBO_INVAL] = { .nr_tests = 1, test_no_cbo_inval }, + [TEST_ZICBOP] = { .nr_tests = 3, test_zicbop }, +}; + +static const struct option long_opts[] = { + {"zicbom-raises-sigill", no_argument, 0, 'm'}, + {"zicboz-raises-sigill", no_argument, 0, 'z'}, + {0, 0, 0, 0} }; int main(int argc, char **argv) { struct sigaction act = { - .sa_sigaction = &sigill_handler, + .sa_sigaction = &fault_handler, .sa_flags = SA_SIGINFO, }; struct riscv_hwprobe pair; unsigned int plan = 0; cpu_set_t cpus; long rc; - int i; - - if (argc > 1 && !strcmp(argv[1], "--sigill")) { - rc = sigaction(SIGILL, &act, NULL); - assert(rc == 0); - tests[TEST_NO_ZICBOZ].enabled = true; - tests[TEST_NO_ZICBOM].enabled = true; - tests[TEST_NO_CBO_INVAL].enabled = true; + int i, opt, long_index; + + long_index = 0; + + while ((opt = getopt_long(argc, argv, "mz", long_opts, &long_index)) != -1) { + switch (opt) { + case 'm': + tests[TEST_NO_ZICBOM].enabled = true; + tests[TEST_NO_CBO_INVAL].enabled = true; + rc = sigaction(SIGILL, &act, NULL); + assert(rc == 0); + break; + case 'z': + tests[TEST_NO_ZICBOZ].enabled = true; + tests[TEST_NO_CBO_INVAL].enabled = true; + rc = sigaction(SIGILL, &act, NULL); + assert(rc == 0); + break; + case '?': + fprintf(stderr, + "Usage: %s [--zicbom-raises-sigill|-m] [--zicboz-raises-sigill|-z]\n", + argv[0]); + exit(1); + default: + break; + } } rc = sched_getaffinity(0, sizeof(cpu_set_t), &cpus); @@ -253,6 +355,11 @@ int main(int argc, char **argv) check_no_zicbo_cpus(&cpus, RISCV_HWPROBE_EXT_ZICBOM); } + if (pair.value & RISCV_HWPROBE_EXT_ZICBOP) + tests[TEST_ZICBOP].enabled = true; + else + check_no_zicbo_cpus(&cpus, RISCV_HWPROBE_EXT_ZICBOP); + for (i = 0; i < ARRAY_SIZE(tests); ++i) plan += tests[i].enabled ? 
tests[i].nr_tests : 0; diff --git a/tools/testing/selftests/riscv/vector/Makefile b/tools/testing/selftests/riscv/vector/Makefile index 6f7497f4e7b3..2c2a33fc083e 100644 --- a/tools/testing/selftests/riscv/vector/Makefile +++ b/tools/testing/selftests/riscv/vector/Makefile @@ -2,7 +2,7 @@ # Copyright (C) 2021 ARM Limited # Originally tools/testing/arm64/abi/Makefile -TEST_GEN_PROGS := v_initval vstate_prctl +TEST_GEN_PROGS := v_initval vstate_prctl vstate_ptrace TEST_GEN_PROGS_EXTENDED := vstate_exec_nolibc v_exec_initval_nolibc include ../../lib.mk @@ -26,3 +26,6 @@ $(OUTPUT)/v_initval: v_initval.c $(OUTPUT)/sys_hwprobe.o $(OUTPUT)/v_helpers.o $(OUTPUT)/v_exec_initval_nolibc: v_exec_initval_nolibc.c $(CC) -nostdlib -static -include ../../../../include/nolibc/nolibc.h \ -Wall $(CFLAGS) $(LDFLAGS) $^ -o $@ -lgcc + +$(OUTPUT)/vstate_ptrace: vstate_ptrace.c $(OUTPUT)/sys_hwprobe.o $(OUTPUT)/v_helpers.o + $(CC) -static -o$@ $(CFLAGS) $(LDFLAGS) $^ diff --git a/tools/testing/selftests/riscv/vector/vstate_ptrace.c b/tools/testing/selftests/riscv/vector/vstate_ptrace.c new file mode 100644 index 000000000000..1479abc0c9cb --- /dev/null +++ b/tools/testing/selftests/riscv/vector/vstate_ptrace.c @@ -0,0 +1,134 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <stdio.h> +#include <stdlib.h> +#include <asm/ptrace.h> +#include <linux/elf.h> +#include <sys/ptrace.h> +#include <sys/uio.h> +#include <sys/wait.h> +#include "../../kselftest.h" +#include "v_helpers.h" + +int parent_set_val, child_set_val; + +static long do_ptrace(enum __ptrace_request op, pid_t pid, long type, size_t size, void *data) +{ + struct iovec v_iovec = { + .iov_len = size, + .iov_base = data + }; + + return ptrace(op, pid, type, &v_iovec); +} + +static int do_child(void) +{ + int out; + + if (ptrace(PTRACE_TRACEME, -1, NULL, NULL)) { + ksft_perror("PTRACE_TRACEME failed\n"); + return EXIT_FAILURE; + } + + asm volatile (".option push\n\t" + ".option arch, +v\n\t" + ".option norvc\n\t" + "vsetivli x0, 1, e32, m1, ta, ma\n\t" + "vmv.s.x v31, %[in]\n\t" + "ebreak\n\t" + "vmv.x.s %[out], v31\n\t" + ".option pop\n\t" + : [out] "=r" (out) + : [in] "r" (child_set_val)); + + if (out != parent_set_val) + return EXIT_FAILURE; + + return EXIT_SUCCESS; +} + +static void do_parent(pid_t child) +{ + int status; + void *data = NULL; + + /* Attach to the child */ + while (waitpid(child, &status, 0)) { + if (WIFEXITED(status)) { + ksft_test_result(WEXITSTATUS(status) == 0, "SETREGSET vector\n"); + goto out; + } else if (WIFSTOPPED(status) && (WSTOPSIG(status) == SIGTRAP)) { + size_t size; + void *data, *v31; + struct __riscv_v_regset_state *v_regset_hdr; + struct user_regs_struct *gpreg; + + size = sizeof(*v_regset_hdr); + data = malloc(size); + if (!data) + goto out; + v_regset_hdr = (struct __riscv_v_regset_state *)data; + + if (do_ptrace(PTRACE_GETREGSET, child, NT_RISCV_VECTOR, size, data)) + goto out; + + ksft_print_msg("vlenb %ld\n", v_regset_hdr->vlenb); + data = realloc(data, size + v_regset_hdr->vlenb * 32); + if (!data) + goto out; + v_regset_hdr = (struct __riscv_v_regset_state *)data; + v31 = (void *)(data + size + v_regset_hdr->vlenb * 31); + size += v_regset_hdr->vlenb * 32; + + if (do_ptrace(PTRACE_GETREGSET, child, NT_RISCV_VECTOR, size, data)) + goto out; + + ksft_test_result(*(int *)v31 == child_set_val, "GETREGSET vector\n"); + + *(int *)v31 = parent_set_val; + if (do_ptrace(PTRACE_SETREGSET, child, NT_RISCV_VECTOR, size, data)) + goto out; + + /* move the pc forward */ + size = sizeof(*gpreg); + data = realloc(data, size); 
+ gpreg = (struct user_regs_struct *)data; + + if (do_ptrace(PTRACE_GETREGSET, child, NT_PRSTATUS, size, data)) + goto out; + + gpreg->pc += 4; + if (do_ptrace(PTRACE_SETREGSET, child, NT_PRSTATUS, size, data)) + goto out; + } + + ptrace(PTRACE_CONT, child, NULL, NULL); + } + +out: + free(data); +} + +int main(void) +{ + pid_t child; + + ksft_set_plan(2); + if (!is_vector_supported() && !is_xtheadvector_supported()) + ksft_exit_skip("Vector not supported\n"); + + srandom(getpid()); + parent_set_val = rand(); + child_set_val = rand(); + + child = fork(); + if (child < 0) + ksft_exit_fail_msg("Fork failed %d\n", child); + + if (!child) + return do_child(); + + do_parent(child); + + ksft_finished(); +} diff --git a/tools/testing/selftests/tpm2/tpm2.py b/tools/testing/selftests/tpm2/tpm2.py index bba8cb54548e..3d130c30bc7c 100644 --- a/tools/testing/selftests/tpm2/tpm2.py +++ b/tools/testing/selftests/tpm2/tpm2.py @@ -437,7 +437,7 @@ class Client: def extend_pcr(self, i, dig, bank_alg = TPM2_ALG_SHA1): ds = get_digest_size(bank_alg) - assert(ds == len(dig)) + assert ds == len(dig) auth_cmd = AuthCommand() @@ -589,7 +589,7 @@ class Client: def seal(self, parent_key, data, auth_value, policy_dig, name_alg = TPM2_ALG_SHA1): ds = get_digest_size(name_alg) - assert(not policy_dig or ds == len(policy_dig)) + assert not policy_dig or ds == len(policy_dig) attributes = 0 if not policy_dig: diff --git a/tools/testing/selftests/verification/.gitignore b/tools/testing/selftests/verification/.gitignore new file mode 100644 index 000000000000..2659417cb2c7 --- /dev/null +++ b/tools/testing/selftests/verification/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +logs diff --git a/tools/testing/selftests/verification/Makefile b/tools/testing/selftests/verification/Makefile new file mode 100644 index 000000000000..aa8790c22a71 --- /dev/null +++ b/tools/testing/selftests/verification/Makefile @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +all: + +TEST_PROGS := verificationtest-ktap +TEST_FILES := test.d settings +EXTRA_CLEAN := $(OUTPUT)/logs/* + +include ../lib.mk diff --git a/tools/testing/selftests/verification/config b/tools/testing/selftests/verification/config new file mode 100644 index 000000000000..43072c1c38f4 --- /dev/null +++ b/tools/testing/selftests/verification/config @@ -0,0 +1 @@ +CONFIG_RV=y diff --git a/tools/testing/selftests/verification/settings b/tools/testing/selftests/verification/settings new file mode 100644 index 000000000000..e7b9417537fb --- /dev/null +++ b/tools/testing/selftests/verification/settings @@ -0,0 +1 @@ +timeout=0 diff --git a/tools/testing/selftests/verification/test.d/functions b/tools/testing/selftests/verification/test.d/functions new file mode 100644 index 000000000000..ec36a092f56e --- /dev/null +++ b/tools/testing/selftests/verification/test.d/functions @@ -0,0 +1,39 @@ +check_requires() { # Check required files, monitors and reactors + for i in "$@" ; do + p=${i%:program} + m=${i%:monitor} + r=${i%:reactor} + if [ $p != $i ]; then + if ! which $p ; then + echo "Required program $p is not found." + exit_unresolved + fi + elif [ $m != $i ]; then + if ! grep -wq $m available_monitors ; then + echo "Required monitor $m is not configured." + exit_unsupported + fi + elif [ $r != $i ]; then + if ! grep -wq $r available_reactors ; then + echo "Required reactor $r is not configured." + exit_unsupported + fi + elif [ ! -e $i ]; then + echo "Required feature interface $i doesn't exist." 
+ exit_unsupported + fi + done +} + +initialize_system() { # Reset RV to initial-state + echo > enabled_monitors + for m in monitors/*; do + echo nop > $m/reactors || true + done + echo 1 > monitoring_on + echo 1 > reacting_on || true +} + +finish_system() { + initialize_system +} diff --git a/tools/testing/selftests/verification/test.d/rv_monitor_enable_disable.tc b/tools/testing/selftests/verification/test.d/rv_monitor_enable_disable.tc new file mode 100644 index 000000000000..f29236defb5a --- /dev/null +++ b/tools/testing/selftests/verification/test.d/rv_monitor_enable_disable.tc @@ -0,0 +1,75 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Test monitor enable/disable + +test_simple_monitor() { + local monitor="$1" + local prefix="$2" # nested monitors + + echo 1 > "monitors/$prefix$monitor/enable" + grep -q "$monitor$" enabled_monitors + + echo 0 > "monitors/$prefix$monitor/enable" + ! grep -q "$monitor$" enabled_monitors + + echo "$monitor" >> enabled_monitors + grep -q 1 "monitors/$prefix$monitor/enable" + + echo "!$monitor" >> enabled_monitors + grep -q 0 "monitors/$prefix$monitor/enable" +} + +test_container_monitor() { + local monitor="$1" + local nested + + echo 1 > "monitors/$monitor/enable" + grep -q "^$monitor$" enabled_monitors + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + grep -q "^$monitor:$nested$" enabled_monitors + done + test -n "$nested" + + echo 0 > "monitors/$monitor/enable" + ! grep -q "^$monitor$" enabled_monitors + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + ! grep -q "^$monitor:$nested$" enabled_monitors + done + + echo "$monitor" >> enabled_monitors + grep -q 1 "monitors/$monitor/enable" + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + grep -q "^$monitor:$nested$" enabled_monitors + done + + echo "!$monitor" >> enabled_monitors + grep -q 0 "monitors/$monitor/enable" + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + test_simple_monitor "$nested" "$monitor/" + done +} + +for monitor_dir in monitors/*; do + monitor=$(basename "$monitor_dir") + + if find "$monitor_dir" -mindepth 1 -type d | grep -q .; then + test_container_monitor "$monitor" + else + test_simple_monitor "$monitor" + fi +done + +! echo non_existent_monitor > enabled_monitors +! 
grep -q "^non_existent_monitor$" enabled_monitors diff --git a/tools/testing/selftests/verification/test.d/rv_monitor_reactor.tc b/tools/testing/selftests/verification/test.d/rv_monitor_reactor.tc new file mode 100644 index 000000000000..2958bf849338 --- /dev/null +++ b/tools/testing/selftests/verification/test.d/rv_monitor_reactor.tc @@ -0,0 +1,68 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Test monitor reactor setting +# requires: available_reactors + +test_monitor_reactor() { + local monitor="$1" + local prefix="$2" # nested monitors + + while read -r reactor; do + [ "$reactor" = nop ] && continue + + echo "$reactor" > "monitors/$prefix$monitor/reactors" + grep -q "\\[$reactor\\]" "monitors/$prefix$monitor/reactors" + done < available_reactors + + echo nop > "monitors/$prefix$monitor/reactors" + grep -q "\\[nop\\]" "monitors/$prefix$monitor/reactors" +} + +test_container_monitor() { + local monitor="$1" + local nested + + while read -r reactor; do + [ "$reactor" = nop ] && continue + + echo "$reactor" > "monitors/$monitor/reactors" + grep -q "\\[$reactor\\]" "monitors/$monitor/reactors" + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + grep -q "\\[$reactor\\]" "monitors/$monitor/$nested/reactors" + done + done < available_reactors + test -n "$nested" + + echo nop > "monitors/$monitor/reactors" + grep -q "\\[nop\\]" "monitors/$monitor/reactors" + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + grep -q "\\[nop\\]" "monitors/$monitor/$nested/reactors" + done + + for nested_dir in "monitors/$monitor"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + test_monitor_reactor "$nested" "$monitor/" + done +} + +for monitor_dir in monitors/*; do + monitor=$(basename "$monitor_dir") + + if find "$monitor_dir" -mindepth 1 -type d | grep -q .; then + test_container_monitor "$monitor" + else + test_monitor_reactor "$monitor" + fi +done + +monitor=$(ls /sys/kernel/tracing/rv/monitors -1 | head -n 1) +test -f "monitors/$monitor/reactors" +! echo non_existent_reactor > "monitors/$monitor/reactors" +! grep -q "\\[non_existent_reactor\\]" "monitors/$monitor/reactors" diff --git a/tools/testing/selftests/verification/test.d/rv_monitors_available.tc b/tools/testing/selftests/verification/test.d/rv_monitors_available.tc new file mode 100644 index 000000000000..e6a4a1410690 --- /dev/null +++ b/tools/testing/selftests/verification/test.d/rv_monitors_available.tc @@ -0,0 +1,18 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Check available monitors + +for monitor_dir in monitors/*; do + monitor=$(basename "$monitor_dir") + + grep -q "^$monitor$" available_monitors + grep -q . "$monitor_dir"/desc + + for nested_dir in "$monitor_dir"/*; do + [ -d "$nested_dir" ] || continue + nested=$(basename "$nested_dir") + + grep -q "^$monitor:$nested$" available_monitors + grep -q . 
"$nested_dir"/desc + done +done diff --git a/tools/testing/selftests/verification/test.d/rv_wwnr_printk.tc b/tools/testing/selftests/verification/test.d/rv_wwnr_printk.tc new file mode 100644 index 000000000000..5a59432b1d93 --- /dev/null +++ b/tools/testing/selftests/verification/test.d/rv_wwnr_printk.tc @@ -0,0 +1,30 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later +# description: Test wwnr monitor with printk reactor +# requires: available_reactors wwnr:monitor printk:reactor stress-ng:program + +load() { # returns true if there was a reaction + local lines_before num + num=$((($(nproc) + 1) / 2)) + lines_before=$(dmesg | wc -l) + stress-ng --cpu-sched "$num" --timer "$num" -t 5 -q + dmesg | tail -n $((lines_before + 1)) | grep -q "rv: monitor wwnr does not allow event" +} + +echo 1 > monitors/wwnr/enable +echo printk > monitors/wwnr/reactors + +load + +echo 0 > monitoring_on +! load +echo 1 > monitoring_on + +load + +echo 0 > reacting_on +! load +echo 1 > reacting_on + +echo nop > monitors/wwnr/reactors +echo 0 > monitors/wwnr/enable diff --git a/tools/testing/selftests/verification/verificationtest-ktap b/tools/testing/selftests/verification/verificationtest-ktap new file mode 100755 index 000000000000..18f7fe324e2f --- /dev/null +++ b/tools/testing/selftests/verification/verificationtest-ktap @@ -0,0 +1,8 @@ +#!/bin/sh -e +# SPDX-License-Identifier: GPL-2.0-only +# +# ftracetest-ktap: Wrapper to integrate ftracetest with the kselftest runner +# +# Copyright (C) Arm Ltd., 2023 + +../ftrace/ftracetest -K -v --rv ../verification diff --git a/tools/testing/selftests/vfio/Makefile b/tools/testing/selftests/vfio/Makefile index 324ba0175a33..3c796ca99a50 100644 --- a/tools/testing/selftests/vfio/Makefile +++ b/tools/testing/selftests/vfio/Makefile @@ -2,8 +2,14 @@ CFLAGS = $(KHDR_INCLUDES) TEST_GEN_PROGS += vfio_dma_mapping_test TEST_GEN_PROGS += vfio_iommufd_setup_test TEST_GEN_PROGS += vfio_pci_device_test +TEST_GEN_PROGS += vfio_pci_device_init_perf_test TEST_GEN_PROGS += vfio_pci_driver_test -TEST_PROGS_EXTENDED := run.sh + +TEST_FILES += scripts/cleanup.sh +TEST_FILES += scripts/lib.sh +TEST_FILES += scripts/run.sh +TEST_FILES += scripts/setup.sh + include ../lib.mk include lib/libvfio.mk @@ -11,6 +17,8 @@ CFLAGS += -I$(top_srcdir)/tools/include CFLAGS += -MD CFLAGS += $(EXTRA_CFLAGS) +LDFLAGS += -pthread + $(TEST_GEN_PROGS): %: %.o $(LIBVFIO_O) $(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $< $(LIBVFIO_O) $(LDLIBS) -o $@ diff --git a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c index 0ca2cbc2a316..c75045bcab79 100644 --- a/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c +++ b/tools/testing/selftests/vfio/lib/drivers/dsa/dsa.c @@ -9,7 +9,7 @@ #include <linux/pci_ids.h> #include <linux/sizes.h> -#include <vfio_util.h> +#include <libvfio.h> #include "registers.h" @@ -70,7 +70,7 @@ static int dsa_probe(struct vfio_pci_device *device) return -EINVAL; if (dsa_int_handle_request_required(device)) { - printf("Device requires requesting interrupt handles\n"); + dev_err(device, "Device requires requesting interrupt handles\n"); return -EINVAL; } @@ -91,23 +91,23 @@ static void dsa_check_sw_err(struct vfio_pci_device *device) return; } - fprintf(stderr, "SWERR: 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", + dev_err(device, "SWERR: 0x%016lx 0x%016lx 0x%016lx 0x%016lx\n", err.bits[0], err.bits[1], err.bits[2], err.bits[3]); - fprintf(stderr, " valid: 0x%x\n", err.valid); - fprintf(stderr, " overflow: 0x%x\n", err.overflow); - 
fprintf(stderr, " desc_valid: 0x%x\n", err.desc_valid); - fprintf(stderr, " wq_idx_valid: 0x%x\n", err.wq_idx_valid); - fprintf(stderr, " batch: 0x%x\n", err.batch); - fprintf(stderr, " fault_rw: 0x%x\n", err.fault_rw); - fprintf(stderr, " priv: 0x%x\n", err.priv); - fprintf(stderr, " error: 0x%x\n", err.error); - fprintf(stderr, " wq_idx: 0x%x\n", err.wq_idx); - fprintf(stderr, " operation: 0x%x\n", err.operation); - fprintf(stderr, " pasid: 0x%x\n", err.pasid); - fprintf(stderr, " batch_idx: 0x%x\n", err.batch_idx); - fprintf(stderr, " invalid_flags: 0x%x\n", err.invalid_flags); - fprintf(stderr, " fault_addr: 0x%lx\n", err.fault_addr); + dev_err(device, " valid: 0x%x\n", err.valid); + dev_err(device, " overflow: 0x%x\n", err.overflow); + dev_err(device, " desc_valid: 0x%x\n", err.desc_valid); + dev_err(device, " wq_idx_valid: 0x%x\n", err.wq_idx_valid); + dev_err(device, " batch: 0x%x\n", err.batch); + dev_err(device, " fault_rw: 0x%x\n", err.fault_rw); + dev_err(device, " priv: 0x%x\n", err.priv); + dev_err(device, " error: 0x%x\n", err.error); + dev_err(device, " wq_idx: 0x%x\n", err.wq_idx); + dev_err(device, " operation: 0x%x\n", err.operation); + dev_err(device, " pasid: 0x%x\n", err.pasid); + dev_err(device, " batch_idx: 0x%x\n", err.batch_idx); + dev_err(device, " invalid_flags: 0x%x\n", err.invalid_flags); + dev_err(device, " fault_addr: 0x%lx\n", err.fault_addr); VFIO_FAIL("Software Error Detected!\n"); } @@ -256,7 +256,7 @@ static int dsa_completion_wait(struct vfio_pci_device *device, if (status == DSA_COMP_SUCCESS) return 0; - printf("Error detected during memcpy operation: 0x%x\n", status); + dev_err(device, "Error detected during memcpy operation: 0x%x\n", status); return -1; } diff --git a/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c index c3b91d9b1f59..a871b935542b 100644 --- a/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c +++ b/tools/testing/selftests/vfio/lib/drivers/ioat/ioat.c @@ -7,7 +7,7 @@ #include <linux/pci_ids.h> #include <linux/sizes.h> -#include <vfio_util.h> +#include <libvfio.h> #include "hw.h" #include "registers.h" @@ -51,7 +51,7 @@ static int ioat_probe(struct vfio_pci_device *device) r = 0; break; default: - printf("ioat: Unsupported version: 0x%x\n", version); + dev_err(device, "ioat: Unsupported version: 0x%x\n", version); r = -EINVAL; } return r; @@ -135,13 +135,13 @@ static void ioat_handle_error(struct vfio_pci_device *device) { void *registers = ioat_channel_registers(device); - printf("Error detected during memcpy operation!\n" - " CHANERR: 0x%x\n" - " CHANERR_INT: 0x%x\n" - " DMAUNCERRSTS: 0x%x\n", - readl(registers + IOAT_CHANERR_OFFSET), - vfio_pci_config_readl(device, IOAT_PCI_CHANERR_INT_OFFSET), - vfio_pci_config_readl(device, IOAT_PCI_DMAUNCERRSTS_OFFSET)); + dev_err(device, "Error detected during memcpy operation!\n" + " CHANERR: 0x%x\n" + " CHANERR_INT: 0x%x\n" + " DMAUNCERRSTS: 0x%x\n", + readl(registers + IOAT_CHANERR_OFFSET), + vfio_pci_config_readl(device, IOAT_PCI_CHANERR_INT_OFFSET), + vfio_pci_config_readl(device, IOAT_PCI_DMAUNCERRSTS_OFFSET)); ioat_reset(device); } diff --git a/tools/testing/selftests/vfio/lib/include/libvfio.h b/tools/testing/selftests/vfio/lib/include/libvfio.h new file mode 100644 index 000000000000..279ddcd70194 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H + 
+#include <libvfio/assert.h> +#include <libvfio/iommu.h> +#include <libvfio/iova_allocator.h> +#include <libvfio/vfio_pci_device.h> +#include <libvfio/vfio_pci_driver.h> + +/* + * Return the BDF string of the device that the test should use. + * + * If a BDF string is provided by the user on the command line (as the last + * element of argv[]), then this function will return that and decrement argc + * by 1. + * + * Otherwise this function will attempt to use the environment variable + * $VFIO_SELFTESTS_BDF. + * + * If BDF cannot be determined then the test will exit with KSFT_SKIP. + */ +const char *vfio_selftests_get_bdf(int *argc, char *argv[]); +char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs); + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/assert.h b/tools/testing/selftests/vfio/lib/include/libvfio/assert.h new file mode 100644 index 000000000000..f4ebd122d9b6 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/assert.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H + +#include <stdio.h> +#include <string.h> +#include <sys/ioctl.h> + +#include "../../../../kselftest.h" + +#define VFIO_LOG_AND_EXIT(...) do { \ + fprintf(stderr, " " __VA_ARGS__); \ + fprintf(stderr, "\n"); \ + exit(KSFT_FAIL); \ +} while (0) + +#define VFIO_ASSERT_OP(_lhs, _rhs, _op, ...) do { \ + typeof(_lhs) __lhs = (_lhs); \ + typeof(_rhs) __rhs = (_rhs); \ + \ + if (__lhs _op __rhs) \ + break; \ + \ + fprintf(stderr, "%s:%u: Assertion Failure\n\n", __FILE__, __LINE__); \ + fprintf(stderr, " Expression: " #_lhs " " #_op " " #_rhs "\n"); \ + fprintf(stderr, " Observed: %#lx %s %#lx\n", \ + (u64)__lhs, #_op, (u64)__rhs); \ + fprintf(stderr, " [errno: %d - %s]\n", errno, strerror(errno)); \ + VFIO_LOG_AND_EXIT(__VA_ARGS__); \ +} while (0) + +#define VFIO_ASSERT_EQ(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, ==, ##__VA_ARGS__) +#define VFIO_ASSERT_NE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, !=, ##__VA_ARGS__) +#define VFIO_ASSERT_LT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <, ##__VA_ARGS__) +#define VFIO_ASSERT_LE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <=, ##__VA_ARGS__) +#define VFIO_ASSERT_GT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >, ##__VA_ARGS__) +#define VFIO_ASSERT_GE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >=, ##__VA_ARGS__) +#define VFIO_ASSERT_TRUE(_a, ...) VFIO_ASSERT_NE(false, (_a), ##__VA_ARGS__) +#define VFIO_ASSERT_FALSE(_a, ...) VFIO_ASSERT_EQ(false, (_a), ##__VA_ARGS__) +#define VFIO_ASSERT_NULL(_a, ...) VFIO_ASSERT_EQ(NULL, _a, ##__VA_ARGS__) +#define VFIO_ASSERT_NOT_NULL(_a, ...) VFIO_ASSERT_NE(NULL, _a, ##__VA_ARGS__) + +#define VFIO_FAIL(_fmt, ...) 
do { \ + fprintf(stderr, "%s:%u: FAIL\n\n", __FILE__, __LINE__); \ + VFIO_LOG_AND_EXIT(_fmt, ##__VA_ARGS__); \ +} while (0) + +#define ioctl_assert(_fd, _op, _arg) do { \ + void *__arg = (_arg); \ + int __ret = ioctl((_fd), (_op), (__arg)); \ + VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ +} while (0) + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_ASSERT_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h new file mode 100644 index 000000000000..5c9b9dc6d993 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/iommu.h @@ -0,0 +1,76 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOMMU_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOMMU_H + +#include <linux/list.h> +#include <linux/types.h> + +#include <libvfio/assert.h> + +typedef u64 iova_t; + +struct iommu_mode { + const char *name; + const char *container_path; + unsigned long iommu_type; +}; + +extern const char *default_iommu_mode; + +struct dma_region { + struct list_head link; + void *vaddr; + iova_t iova; + u64 size; +}; + +struct iommu { + const struct iommu_mode *mode; + int container_fd; + int iommufd; + u32 ioas_id; + struct list_head dma_regions; +}; + +struct iommu *iommu_init(const char *iommu_mode); +void iommu_cleanup(struct iommu *iommu); + +int __iommu_map(struct iommu *iommu, struct dma_region *region); + +static inline void iommu_map(struct iommu *iommu, struct dma_region *region) +{ + VFIO_ASSERT_EQ(__iommu_map(iommu, region), 0); +} + +int __iommu_unmap(struct iommu *iommu, struct dma_region *region, u64 *unmapped); + +static inline void iommu_unmap(struct iommu *iommu, struct dma_region *region) +{ + VFIO_ASSERT_EQ(__iommu_unmap(iommu, region, NULL), 0); +} + +int __iommu_unmap_all(struct iommu *iommu, u64 *unmapped); + +static inline void iommu_unmap_all(struct iommu *iommu) +{ + VFIO_ASSERT_EQ(__iommu_unmap_all(iommu, NULL), 0); +} + +int __iommu_hva2iova(struct iommu *iommu, void *vaddr, iova_t *iova); +iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr); + +struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges); + +/* + * Generator for VFIO selftests fixture variants that replicate across all + * possible IOMMU modes. Tests must define FIXTURE_VARIANT_ADD_IOMMU_MODE() + * which should then use FIXTURE_VARIANT_ADD() to create the variant. + */ +#define FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(...) 
\ +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1v2, ##__VA_ARGS__); \ +FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd, ##__VA_ARGS__) + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOMMU_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h b/tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h new file mode 100644 index 000000000000..8f1d994e9ea2 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/iova_allocator.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOVA_ALLOCATOR_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOVA_ALLOCATOR_H + +#include <uapi/linux/types.h> +#include <linux/list.h> +#include <linux/types.h> +#include <linux/iommufd.h> + +#include <libvfio/iommu.h> + +struct iova_allocator { + struct iommu_iova_range *ranges; + u32 nranges; + u32 range_idx; + u64 range_offset; +}; + +struct iova_allocator *iova_allocator_init(struct iommu *iommu); +void iova_allocator_cleanup(struct iova_allocator *allocator); +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size); + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_IOVA_ALLOCATOR_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h new file mode 100644 index 000000000000..2858885a89bb --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_device.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H + +#include <fcntl.h> +#include <linux/vfio.h> +#include <linux/pci_regs.h> + +#include <libvfio/assert.h> +#include <libvfio/iommu.h> +#include <libvfio/vfio_pci_driver.h> + +struct vfio_pci_bar { + struct vfio_region_info info; + void *vaddr; +}; + +struct vfio_pci_device { + const char *bdf; + int fd; + int group_fd; + + struct iommu *iommu; + + struct vfio_device_info info; + struct vfio_region_info config_space; + struct vfio_pci_bar bars[PCI_STD_NUM_BARS]; + + struct vfio_irq_info msi_info; + struct vfio_irq_info msix_info; + + /* eventfds for MSI and MSI-x interrupts */ + int msi_eventfds[PCI_MSIX_FLAGS_QSIZE + 1]; + + struct vfio_pci_driver driver; +}; + +#define dev_info(_dev, _fmt, ...) printf("%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) +#define dev_err(_dev, _fmt, ...) 
fprintf(stderr, "%s: " _fmt, (_dev)->bdf, ##__VA_ARGS__) + +struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu); +void vfio_pci_device_cleanup(struct vfio_pci_device *device); + +void vfio_pci_device_reset(struct vfio_pci_device *device); + +void vfio_pci_config_access(struct vfio_pci_device *device, bool write, + size_t config, size_t size, void *data); + +#define vfio_pci_config_read(_device, _offset, _type) ({ \ + _type __data; \ + vfio_pci_config_access((_device), false, _offset, sizeof(__data), &__data); \ + __data; \ +}) + +#define vfio_pci_config_readb(_d, _o) vfio_pci_config_read(_d, _o, u8) +#define vfio_pci_config_readw(_d, _o) vfio_pci_config_read(_d, _o, u16) +#define vfio_pci_config_readl(_d, _o) vfio_pci_config_read(_d, _o, u32) + +#define vfio_pci_config_write(_device, _offset, _value, _type) do { \ + _type __data = (_value); \ + vfio_pci_config_access((_device), true, _offset, sizeof(_type), &__data); \ +} while (0) + +#define vfio_pci_config_writeb(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u8) +#define vfio_pci_config_writew(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u16) +#define vfio_pci_config_writel(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u32) + +void vfio_pci_irq_enable(struct vfio_pci_device *device, u32 index, + u32 vector, int count); +void vfio_pci_irq_disable(struct vfio_pci_device *device, u32 index); +void vfio_pci_irq_trigger(struct vfio_pci_device *device, u32 index, u32 vector); + +static inline void fcntl_set_nonblock(int fd) +{ + int r; + + r = fcntl(fd, F_GETFL, 0); + VFIO_ASSERT_NE(r, -1, "F_GETFL failed for fd %d\n", fd); + + r = fcntl(fd, F_SETFL, r | O_NONBLOCK); + VFIO_ASSERT_NE(r, -1, "F_SETFL O_NONBLOCK failed for fd %d\n", fd); +} + +static inline void vfio_pci_msi_enable(struct vfio_pci_device *device, + u32 vector, int count) +{ + vfio_pci_irq_enable(device, VFIO_PCI_MSI_IRQ_INDEX, vector, count); +} + +static inline void vfio_pci_msi_disable(struct vfio_pci_device *device) +{ + vfio_pci_irq_disable(device, VFIO_PCI_MSI_IRQ_INDEX); +} + +static inline void vfio_pci_msix_enable(struct vfio_pci_device *device, + u32 vector, int count) +{ + vfio_pci_irq_enable(device, VFIO_PCI_MSIX_IRQ_INDEX, vector, count); +} + +static inline void vfio_pci_msix_disable(struct vfio_pci_device *device) +{ + vfio_pci_irq_disable(device, VFIO_PCI_MSIX_IRQ_INDEX); +} + +static inline int __to_iova(struct vfio_pci_device *device, void *vaddr, iova_t *iova) +{ + return __iommu_hva2iova(device->iommu, vaddr, iova); +} + +static inline iova_t to_iova(struct vfio_pci_device *device, void *vaddr) +{ + return iommu_hva2iova(device->iommu, vaddr); +} + +static inline bool vfio_pci_device_match(struct vfio_pci_device *device, + u16 vendor_id, u16 device_id) +{ + return (vendor_id == vfio_pci_config_readw(device, PCI_VENDOR_ID)) && + (device_id == vfio_pci_config_readw(device, PCI_DEVICE_ID)); +} + +const char *vfio_pci_get_cdev_path(const char *bdf); + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DEVICE_H */ diff --git a/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h new file mode 100644 index 000000000000..e5ada209b1d1 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/include/libvfio/vfio_pci_driver.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +#ifndef SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DRIVER_H +#define SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DRIVER_H + +#include <libvfio/iommu.h> + +struct 
vfio_pci_device; + +struct vfio_pci_driver_ops { + const char *name; + + /** + * @probe() - Check if the driver supports the given device. + * + * Return: 0 on success, non-0 on failure. + */ + int (*probe)(struct vfio_pci_device *device); + + /** + * @init() - Initialize the driver for @device. + * + * Must be called after device->driver.region has been initialized. + */ + void (*init)(struct vfio_pci_device *device); + + /** + * remove() - Deinitialize the driver for @device. + */ + void (*remove)(struct vfio_pci_device *device); + + /** + * memcpy_start() - Kick off @count repeated memcpy operations from + * [@src, @src + @size) to [@dst, @dst + @size). + * + * Guarantees: + * - The device will attempt DMA reads on [src, src + size). + * - The device will attempt DMA writes on [dst, dst + size). + * - The device will not generate any interrupts. + * + * memcpy_start() returns immediately, it does not wait for the + * copies to complete. + */ + void (*memcpy_start)(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, u64 count); + + /** + * memcpy_wait() - Wait until the memcpy operations started by + * memcpy_start() have finished. + * + * Guarantees: + * - All in-flight DMAs initiated by memcpy_start() are fully complete + * before memcpy_wait() returns. + * + * Returns non-0 if the driver detects that an error occurred during the + * memcpy, 0 otherwise. + */ + int (*memcpy_wait)(struct vfio_pci_device *device); + + /** + * send_msi() - Make the device send the MSI device->driver.msi. + * + * Guarantees: + * - The device will send the MSI once. + */ + void (*send_msi)(struct vfio_pci_device *device); +}; + +struct vfio_pci_driver { + const struct vfio_pci_driver_ops *ops; + bool initialized; + bool memcpy_in_progress; + + /* Region to be used by the driver (e.g. for in-memory descriptors) */ + struct dma_region region; + + /* The maximum size that can be passed to memcpy_start(). */ + u64 max_memcpy_size; + + /* The maximum count that can be passed to memcpy_start(). */ + u64 max_memcpy_count; + + /* The MSI vector the device will signal in ops->send_msi(). */ + int msi; +}; + +void vfio_pci_driver_probe(struct vfio_pci_device *device); +void vfio_pci_driver_init(struct vfio_pci_device *device); +void vfio_pci_driver_remove(struct vfio_pci_device *device); +int vfio_pci_driver_memcpy(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size); +void vfio_pci_driver_memcpy_start(struct vfio_pci_device *device, + iova_t src, iova_t dst, u64 size, + u64 count); +int vfio_pci_driver_memcpy_wait(struct vfio_pci_device *device); +void vfio_pci_driver_send_msi(struct vfio_pci_device *device); + +#endif /* SELFTESTS_VFIO_LIB_INCLUDE_LIBVFIO_VFIO_PCI_DRIVER_H */ diff --git a/tools/testing/selftests/vfio/lib/include/vfio_util.h b/tools/testing/selftests/vfio/lib/include/vfio_util.h deleted file mode 100644 index 69ec0c856481..000000000000 --- a/tools/testing/selftests/vfio/lib/include/vfio_util.h +++ /dev/null @@ -1,331 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -#ifndef SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H -#define SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H - -#include <fcntl.h> -#include <string.h> - -#include <uapi/linux/types.h> -#include <linux/iommufd.h> -#include <linux/list.h> -#include <linux/pci_regs.h> -#include <linux/vfio.h> - -#include "../../../kselftest.h" - -#define VFIO_LOG_AND_EXIT(...) do { \ - fprintf(stderr, " " __VA_ARGS__); \ - fprintf(stderr, "\n"); \ - exit(KSFT_FAIL); \ -} while (0) - -#define VFIO_ASSERT_OP(_lhs, _rhs, _op, ...) 
do { \ - typeof(_lhs) __lhs = (_lhs); \ - typeof(_rhs) __rhs = (_rhs); \ - \ - if (__lhs _op __rhs) \ - break; \ - \ - fprintf(stderr, "%s:%u: Assertion Failure\n\n", __FILE__, __LINE__); \ - fprintf(stderr, " Expression: " #_lhs " " #_op " " #_rhs "\n"); \ - fprintf(stderr, " Observed: %#lx %s %#lx\n", \ - (u64)__lhs, #_op, (u64)__rhs); \ - fprintf(stderr, " [errno: %d - %s]\n", errno, strerror(errno)); \ - VFIO_LOG_AND_EXIT(__VA_ARGS__); \ -} while (0) - -#define VFIO_ASSERT_EQ(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, ==, ##__VA_ARGS__) -#define VFIO_ASSERT_NE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, !=, ##__VA_ARGS__) -#define VFIO_ASSERT_LT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <, ##__VA_ARGS__) -#define VFIO_ASSERT_LE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, <=, ##__VA_ARGS__) -#define VFIO_ASSERT_GT(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >, ##__VA_ARGS__) -#define VFIO_ASSERT_GE(_a, _b, ...) VFIO_ASSERT_OP(_a, _b, >=, ##__VA_ARGS__) -#define VFIO_ASSERT_TRUE(_a, ...) VFIO_ASSERT_NE(false, (_a), ##__VA_ARGS__) -#define VFIO_ASSERT_FALSE(_a, ...) VFIO_ASSERT_EQ(false, (_a), ##__VA_ARGS__) -#define VFIO_ASSERT_NULL(_a, ...) VFIO_ASSERT_EQ(NULL, _a, ##__VA_ARGS__) -#define VFIO_ASSERT_NOT_NULL(_a, ...) VFIO_ASSERT_NE(NULL, _a, ##__VA_ARGS__) - -#define VFIO_FAIL(_fmt, ...) do { \ - fprintf(stderr, "%s:%u: FAIL\n\n", __FILE__, __LINE__); \ - VFIO_LOG_AND_EXIT(_fmt, ##__VA_ARGS__); \ -} while (0) - -struct vfio_iommu_mode { - const char *name; - const char *container_path; - unsigned long iommu_type; -}; - -/* - * Generator for VFIO selftests fixture variants that replicate across all - * possible IOMMU modes. Tests must define FIXTURE_VARIANT_ADD_IOMMU_MODE() - * which should then use FIXTURE_VARIANT_ADD() to create the variant. - */ -#define FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(...) \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1v2, ##__VA_ARGS__); \ -FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd, ##__VA_ARGS__) - -struct vfio_pci_bar { - struct vfio_region_info info; - void *vaddr; -}; - -typedef u64 iova_t; - -#define INVALID_IOVA UINT64_MAX - -struct vfio_dma_region { - struct list_head link; - void *vaddr; - iova_t iova; - u64 size; -}; - -struct vfio_pci_device; - -struct vfio_pci_driver_ops { - const char *name; - - /** - * @probe() - Check if the driver supports the given device. - * - * Return: 0 on success, non-0 on failure. - */ - int (*probe)(struct vfio_pci_device *device); - - /** - * @init() - Initialize the driver for @device. - * - * Must be called after device->driver.region has been initialized. - */ - void (*init)(struct vfio_pci_device *device); - - /** - * remove() - Deinitialize the driver for @device. - */ - void (*remove)(struct vfio_pci_device *device); - - /** - * memcpy_start() - Kick off @count repeated memcpy operations from - * [@src, @src + @size) to [@dst, @dst + @size). - * - * Guarantees: - * - The device will attempt DMA reads on [src, src + size). - * - The device will attempt DMA writes on [dst, dst + size). - * - The device will not generate any interrupts. - * - * memcpy_start() returns immediately, it does not wait for the - * copies to complete. - */ - void (*memcpy_start)(struct vfio_pci_device *device, - iova_t src, iova_t dst, u64 size, u64 count); - - /** - * memcpy_wait() - Wait until the memcpy operations started by - * memcpy_start() have finished. 
- * - * Guarantees: - * - All in-flight DMAs initiated by memcpy_start() are fully complete - * before memcpy_wait() returns. - * - * Returns non-0 if the driver detects that an error occurred during the - * memcpy, 0 otherwise. - */ - int (*memcpy_wait)(struct vfio_pci_device *device); - - /** - * send_msi() - Make the device send the MSI device->driver.msi. - * - * Guarantees: - * - The device will send the MSI once. - */ - void (*send_msi)(struct vfio_pci_device *device); -}; - -struct vfio_pci_driver { - const struct vfio_pci_driver_ops *ops; - bool initialized; - bool memcpy_in_progress; - - /* Region to be used by the driver (e.g. for in-memory descriptors) */ - struct vfio_dma_region region; - - /* The maximum size that can be passed to memcpy_start(). */ - u64 max_memcpy_size; - - /* The maximum count that can be passed to memcpy_start(). */ - u64 max_memcpy_count; - - /* The MSI vector the device will signal in ops->send_msi(). */ - int msi; -}; - -struct vfio_pci_device { - int fd; - - const struct vfio_iommu_mode *iommu_mode; - int group_fd; - int container_fd; - - int iommufd; - u32 ioas_id; - - struct vfio_device_info info; - struct vfio_region_info config_space; - struct vfio_pci_bar bars[PCI_STD_NUM_BARS]; - - struct vfio_irq_info msi_info; - struct vfio_irq_info msix_info; - - struct list_head dma_regions; - - /* eventfds for MSI and MSI-x interrupts */ - int msi_eventfds[PCI_MSIX_FLAGS_QSIZE + 1]; - - struct vfio_pci_driver driver; -}; - -struct iova_allocator { - struct iommu_iova_range *ranges; - u32 nranges; - u32 range_idx; - u64 range_offset; -}; - -/* - * Return the BDF string of the device that the test should use. - * - * If a BDF string is provided by the user on the command line (as the last - * element of argv[]), then this function will return that and decrement argc - * by 1. - * - * Otherwise this function will attempt to use the environment variable - * $VFIO_SELFTESTS_BDF. - * - * If BDF cannot be determined then the test will exit with KSFT_SKIP. 
- */ -const char *vfio_selftests_get_bdf(int *argc, char *argv[]); -const char *vfio_pci_get_cdev_path(const char *bdf); - -extern const char *default_iommu_mode; - -struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_mode); -void vfio_pci_device_cleanup(struct vfio_pci_device *device); -void vfio_pci_device_reset(struct vfio_pci_device *device); - -struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, - u32 *nranges); - -struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device); -void iova_allocator_cleanup(struct iova_allocator *allocator); -iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size); - -int __vfio_pci_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region); -int __vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct vfio_dma_region *region, - u64 *unmapped); -int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped); - -static inline void vfio_pci_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region) -{ - VFIO_ASSERT_EQ(__vfio_pci_dma_map(device, region), 0); -} - -static inline void vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct vfio_dma_region *region) -{ - VFIO_ASSERT_EQ(__vfio_pci_dma_unmap(device, region, NULL), 0); -} - -static inline void vfio_pci_dma_unmap_all(struct vfio_pci_device *device) -{ - VFIO_ASSERT_EQ(__vfio_pci_dma_unmap_all(device, NULL), 0); -} - -void vfio_pci_config_access(struct vfio_pci_device *device, bool write, - size_t config, size_t size, void *data); - -#define vfio_pci_config_read(_device, _offset, _type) ({ \ - _type __data; \ - vfio_pci_config_access((_device), false, _offset, sizeof(__data), &__data); \ - __data; \ -}) - -#define vfio_pci_config_readb(_d, _o) vfio_pci_config_read(_d, _o, u8) -#define vfio_pci_config_readw(_d, _o) vfio_pci_config_read(_d, _o, u16) -#define vfio_pci_config_readl(_d, _o) vfio_pci_config_read(_d, _o, u32) - -#define vfio_pci_config_write(_device, _offset, _value, _type) do { \ - _type __data = (_value); \ - vfio_pci_config_access((_device), true, _offset, sizeof(_type), &__data); \ -} while (0) - -#define vfio_pci_config_writeb(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u8) -#define vfio_pci_config_writew(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u16) -#define vfio_pci_config_writel(_d, _o, _v) vfio_pci_config_write(_d, _o, _v, u32) - -void vfio_pci_irq_enable(struct vfio_pci_device *device, u32 index, - u32 vector, int count); -void vfio_pci_irq_disable(struct vfio_pci_device *device, u32 index); -void vfio_pci_irq_trigger(struct vfio_pci_device *device, u32 index, u32 vector); - -static inline void fcntl_set_nonblock(int fd) -{ - int r; - - r = fcntl(fd, F_GETFL, 0); - VFIO_ASSERT_NE(r, -1, "F_GETFL failed for fd %d\n", fd); - - r = fcntl(fd, F_SETFL, r | O_NONBLOCK); - VFIO_ASSERT_NE(r, -1, "F_SETFL O_NONBLOCK failed for fd %d\n", fd); -} - -static inline void vfio_pci_msi_enable(struct vfio_pci_device *device, - u32 vector, int count) -{ - vfio_pci_irq_enable(device, VFIO_PCI_MSI_IRQ_INDEX, vector, count); -} - -static inline void vfio_pci_msi_disable(struct vfio_pci_device *device) -{ - vfio_pci_irq_disable(device, VFIO_PCI_MSI_IRQ_INDEX); -} - -static inline void vfio_pci_msix_enable(struct vfio_pci_device *device, - u32 vector, int count) -{ - vfio_pci_irq_enable(device, VFIO_PCI_MSIX_IRQ_INDEX, vector, count); -} - -static inline void vfio_pci_msix_disable(struct vfio_pci_device *device) -{ - vfio_pci_irq_disable(device, 
VFIO_PCI_MSIX_IRQ_INDEX); -} - -iova_t __to_iova(struct vfio_pci_device *device, void *vaddr); -iova_t to_iova(struct vfio_pci_device *device, void *vaddr); - -static inline bool vfio_pci_device_match(struct vfio_pci_device *device, - u16 vendor_id, u16 device_id) -{ - return (vendor_id == vfio_pci_config_readw(device, PCI_VENDOR_ID)) && - (device_id == vfio_pci_config_readw(device, PCI_DEVICE_ID)); -} - -void vfio_pci_driver_probe(struct vfio_pci_device *device); -void vfio_pci_driver_init(struct vfio_pci_device *device); -void vfio_pci_driver_remove(struct vfio_pci_device *device); -int vfio_pci_driver_memcpy(struct vfio_pci_device *device, - iova_t src, iova_t dst, u64 size); -void vfio_pci_driver_memcpy_start(struct vfio_pci_device *device, - iova_t src, iova_t dst, u64 size, - u64 count); -int vfio_pci_driver_memcpy_wait(struct vfio_pci_device *device); -void vfio_pci_driver_send_msi(struct vfio_pci_device *device); - -#endif /* SELFTESTS_VFIO_LIB_INCLUDE_VFIO_UTIL_H */ diff --git a/tools/testing/selftests/vfio/lib/iommu.c b/tools/testing/selftests/vfio/lib/iommu.c new file mode 100644 index 000000000000..8079d43523f3 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/iommu.c @@ -0,0 +1,465 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <dirent.h> +#include <fcntl.h> +#include <libgen.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <sys/eventfd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <uapi/linux/types.h> +#include <linux/limits.h> +#include <linux/mman.h> +#include <linux/types.h> +#include <linux/vfio.h> +#include <linux/iommufd.h> + +#include "../../../kselftest.h" +#include <libvfio.h> + +const char *default_iommu_mode = "iommufd"; + +/* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). 
*/ +static const struct iommu_mode iommu_modes[] = { + { + .name = "vfio_type1_iommu", + .container_path = "/dev/vfio/vfio", + .iommu_type = VFIO_TYPE1_IOMMU, + }, + { + .name = "vfio_type1v2_iommu", + .container_path = "/dev/vfio/vfio", + .iommu_type = VFIO_TYPE1v2_IOMMU, + }, + { + .name = "iommufd_compat_type1", + .container_path = "/dev/iommu", + .iommu_type = VFIO_TYPE1_IOMMU, + }, + { + .name = "iommufd_compat_type1v2", + .container_path = "/dev/iommu", + .iommu_type = VFIO_TYPE1v2_IOMMU, + }, + { + .name = "iommufd", + }, +}; + +static const struct iommu_mode *lookup_iommu_mode(const char *iommu_mode) +{ + int i; + + if (!iommu_mode) + iommu_mode = default_iommu_mode; + + for (i = 0; i < ARRAY_SIZE(iommu_modes); i++) { + if (strcmp(iommu_mode, iommu_modes[i].name)) + continue; + + return &iommu_modes[i]; + } + + VFIO_FAIL("Unrecognized IOMMU mode: %s\n", iommu_mode); +} + +int __iommu_hva2iova(struct iommu *iommu, void *vaddr, iova_t *iova) +{ + struct dma_region *region; + + list_for_each_entry(region, &iommu->dma_regions, link) { + if (vaddr < region->vaddr) + continue; + + if (vaddr >= region->vaddr + region->size) + continue; + + if (iova) + *iova = region->iova + (vaddr - region->vaddr); + + return 0; + } + + return -ENOENT; +} + +iova_t iommu_hva2iova(struct iommu *iommu, void *vaddr) +{ + iova_t iova; + int ret; + + ret = __iommu_hva2iova(iommu, vaddr, &iova); + VFIO_ASSERT_EQ(ret, 0, "%p is not mapped into the iommu\n", vaddr); + + return iova; +} + +static int vfio_iommu_map(struct iommu *iommu, struct dma_region *region) +{ + struct vfio_iommu_type1_dma_map args = { + .argsz = sizeof(args), + .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, + .vaddr = (u64)region->vaddr, + .iova = region->iova, + .size = region->size, + }; + + if (ioctl(iommu->container_fd, VFIO_IOMMU_MAP_DMA, &args)) + return -errno; + + return 0; +} + +static int iommufd_map(struct iommu *iommu, struct dma_region *region) +{ + struct iommu_ioas_map args = { + .size = sizeof(args), + .flags = IOMMU_IOAS_MAP_READABLE | + IOMMU_IOAS_MAP_WRITEABLE | + IOMMU_IOAS_MAP_FIXED_IOVA, + .user_va = (u64)region->vaddr, + .iova = region->iova, + .length = region->size, + .ioas_id = iommu->ioas_id, + }; + + if (ioctl(iommu->iommufd, IOMMU_IOAS_MAP, &args)) + return -errno; + + return 0; +} + +int __iommu_map(struct iommu *iommu, struct dma_region *region) +{ + int ret; + + if (iommu->iommufd) + ret = iommufd_map(iommu, region); + else + ret = vfio_iommu_map(iommu, region); + + if (ret) + return ret; + + list_add(®ion->link, &iommu->dma_regions); + + return 0; +} + +static int __vfio_iommu_unmap(int fd, u64 iova, u64 size, u32 flags, u64 *unmapped) +{ + struct vfio_iommu_type1_dma_unmap args = { + .argsz = sizeof(args), + .iova = iova, + .size = size, + .flags = flags, + }; + + if (ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &args)) + return -errno; + + if (unmapped) + *unmapped = args.size; + + return 0; +} + +static int vfio_iommu_unmap(struct iommu *iommu, struct dma_region *region, + u64 *unmapped) +{ + return __vfio_iommu_unmap(iommu->container_fd, region->iova, + region->size, 0, unmapped); +} + +static int __iommufd_unmap(int fd, u64 iova, u64 length, u32 ioas_id, u64 *unmapped) +{ + struct iommu_ioas_unmap args = { + .size = sizeof(args), + .iova = iova, + .length = length, + .ioas_id = ioas_id, + }; + + if (ioctl(fd, IOMMU_IOAS_UNMAP, &args)) + return -errno; + + if (unmapped) + *unmapped = args.length; + + return 0; +} + +static int iommufd_unmap(struct iommu *iommu, struct dma_region *region, + u64 *unmapped) 
+{ + return __iommufd_unmap(iommu->iommufd, region->iova, region->size, + iommu->ioas_id, unmapped); +} + +int __iommu_unmap(struct iommu *iommu, struct dma_region *region, u64 *unmapped) +{ + int ret; + + if (iommu->iommufd) + ret = iommufd_unmap(iommu, region, unmapped); + else + ret = vfio_iommu_unmap(iommu, region, unmapped); + + if (ret) + return ret; + + list_del_init(®ion->link); + + return 0; +} + +int __iommu_unmap_all(struct iommu *iommu, u64 *unmapped) +{ + int ret; + struct dma_region *curr, *next; + + if (iommu->iommufd) + ret = __iommufd_unmap(iommu->iommufd, 0, UINT64_MAX, + iommu->ioas_id, unmapped); + else + ret = __vfio_iommu_unmap(iommu->container_fd, 0, 0, + VFIO_DMA_UNMAP_FLAG_ALL, unmapped); + + if (ret) + return ret; + + list_for_each_entry_safe(curr, next, &iommu->dma_regions, link) + list_del_init(&curr->link); + + return 0; +} + +static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz, + u32 *cap_offset) +{ + struct vfio_info_cap_header *hdr; + + if (!*cap_offset) + return NULL; + + VFIO_ASSERT_LT(*cap_offset, bufsz); + VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr)); + + hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset); + *cap_offset = hdr->next; + + return hdr; +} + +static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info, + u16 cap_id) +{ + struct vfio_info_cap_header *hdr; + u32 cap_offset = info->cap_offset; + u32 max_depth; + u32 depth = 0; + + if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) + return NULL; + + if (cap_offset) + VFIO_ASSERT_GE(cap_offset, sizeof(*info)); + + max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr); + + while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) { + depth++; + VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n"); + + if (hdr->id == cap_id) + return hdr; + } + + return NULL; +} + +/* Return buffer including capability chain, if present. Free with free() */ +static struct vfio_iommu_type1_info *vfio_iommu_get_info(int container_fd) +{ + struct vfio_iommu_type1_info *info; + + info = malloc(sizeof(*info)); + VFIO_ASSERT_NOT_NULL(info); + + *info = (struct vfio_iommu_type1_info) { + .argsz = sizeof(*info), + }; + + ioctl_assert(container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + info = realloc(info, info->argsz); + VFIO_ASSERT_NOT_NULL(info); + + ioctl_assert(container_fd, VFIO_IOMMU_GET_INFO, info); + VFIO_ASSERT_GE(info->argsz, sizeof(*info)); + + return info; +} + +/* + * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to + * report iommufd's iommu_iova_range. Free with free(). 
+ */ +static struct iommu_iova_range *vfio_iommu_iova_ranges(struct iommu *iommu, + u32 *nranges) +{ + struct vfio_iommu_type1_info_cap_iova_range *cap_range; + struct vfio_iommu_type1_info *info; + struct vfio_info_cap_header *hdr; + struct iommu_iova_range *ranges = NULL; + + info = vfio_iommu_get_info(iommu->container_fd); + hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); + VFIO_ASSERT_NOT_NULL(hdr); + + cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header); + VFIO_ASSERT_GT(cap_range->nr_iovas, 0); + + ranges = calloc(cap_range->nr_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + for (u32 i = 0; i < cap_range->nr_iovas; i++) { + ranges[i] = (struct iommu_iova_range){ + .start = cap_range->iova_ranges[i].start, + .last = cap_range->iova_ranges[i].end, + }; + } + + *nranges = cap_range->nr_iovas; + + free(info); + return ranges; +} + +/* Return iova ranges of the device's IOAS. Free with free() */ +static struct iommu_iova_range *iommufd_iova_ranges(struct iommu *iommu, + u32 *nranges) +{ + struct iommu_iova_range *ranges; + int ret; + + struct iommu_ioas_iova_ranges query = { + .size = sizeof(query), + .ioas_id = iommu->ioas_id, + }; + + ret = ioctl(iommu->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + VFIO_ASSERT_EQ(ret, -1); + VFIO_ASSERT_EQ(errno, EMSGSIZE); + VFIO_ASSERT_GT(query.num_iovas, 0); + + ranges = calloc(query.num_iovas, sizeof(*ranges)); + VFIO_ASSERT_NOT_NULL(ranges); + + query.allowed_iovas = (uintptr_t)ranges; + + ioctl_assert(iommu->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); + *nranges = query.num_iovas; + + return ranges; +} + +static int iova_range_comp(const void *a, const void *b) +{ + const struct iommu_iova_range *ra = a, *rb = b; + + if (ra->start < rb->start) + return -1; + + if (ra->start > rb->start) + return 1; + + return 0; +} + +/* Return sorted IOVA ranges of the device. Free with free(). */ +struct iommu_iova_range *iommu_iova_ranges(struct iommu *iommu, u32 *nranges) +{ + struct iommu_iova_range *ranges; + + if (iommu->iommufd) + ranges = iommufd_iova_ranges(iommu, nranges); + else + ranges = vfio_iommu_iova_ranges(iommu, nranges); + + if (!ranges) + return NULL; + + VFIO_ASSERT_GT(*nranges, 0); + + /* Sort and check that ranges are sane and non-overlapping */ + qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp); + VFIO_ASSERT_LT(ranges[0].start, ranges[0].last); + + for (u32 i = 1; i < *nranges; i++) { + VFIO_ASSERT_LT(ranges[i].start, ranges[i].last); + VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start); + } + + return ranges; +} + +static u32 iommufd_ioas_alloc(int iommufd) +{ + struct iommu_ioas_alloc args = { + .size = sizeof(args), + }; + + ioctl_assert(iommufd, IOMMU_IOAS_ALLOC, &args); + return args.out_ioas_id; +} + +struct iommu *iommu_init(const char *iommu_mode) +{ + const char *container_path; + struct iommu *iommu; + int version; + + iommu = calloc(1, sizeof(*iommu)); + VFIO_ASSERT_NOT_NULL(iommu); + + INIT_LIST_HEAD(&iommu->dma_regions); + + iommu->mode = lookup_iommu_mode(iommu_mode); + + container_path = iommu->mode->container_path; + if (container_path) { + iommu->container_fd = open(container_path, O_RDWR); + VFIO_ASSERT_GE(iommu->container_fd, 0, "open(%s) failed\n", container_path); + + version = ioctl(iommu->container_fd, VFIO_GET_API_VERSION); + VFIO_ASSERT_EQ(version, VFIO_API_VERSION, "Unsupported version: %d\n", version); + } else { + /* + * Require device->iommufd to be >0 so that a simple non-0 check can be + * used to check if iommufd is enabled. 
In practice open() will never + * return 0 unless stdin is closed. + */ + iommu->iommufd = open("/dev/iommu", O_RDWR); + VFIO_ASSERT_GT(iommu->iommufd, 0); + + iommu->ioas_id = iommufd_ioas_alloc(iommu->iommufd); + } + + return iommu; +} + +void iommu_cleanup(struct iommu *iommu) +{ + if (iommu->iommufd) + VFIO_ASSERT_EQ(close(iommu->iommufd), 0); + else + VFIO_ASSERT_EQ(close(iommu->container_fd), 0); + + free(iommu); +} diff --git a/tools/testing/selftests/vfio/lib/iova_allocator.c b/tools/testing/selftests/vfio/lib/iova_allocator.c new file mode 100644 index 000000000000..a12b0a51e9e6 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/iova_allocator.c @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <dirent.h> +#include <fcntl.h> +#include <libgen.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> + +#include <sys/eventfd.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <uapi/linux/types.h> +#include <linux/iommufd.h> +#include <linux/limits.h> +#include <linux/mman.h> +#include <linux/overflow.h> +#include <linux/types.h> +#include <linux/vfio.h> + +#include <libvfio.h> + +struct iova_allocator *iova_allocator_init(struct iommu *iommu) +{ + struct iova_allocator *allocator; + struct iommu_iova_range *ranges; + u32 nranges; + + ranges = iommu_iova_ranges(iommu, &nranges); + VFIO_ASSERT_NOT_NULL(ranges); + + allocator = malloc(sizeof(*allocator)); + VFIO_ASSERT_NOT_NULL(allocator); + + *allocator = (struct iova_allocator){ + .ranges = ranges, + .nranges = nranges, + .range_idx = 0, + .range_offset = 0, + }; + + return allocator; +} + +void iova_allocator_cleanup(struct iova_allocator *allocator) +{ + free(allocator->ranges); + free(allocator); +} + +iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size) +{ + VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n"); + VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n"); + + for (;;) { + struct iommu_iova_range *range; + iova_t iova, last; + + VFIO_ASSERT_LT(allocator->range_idx, allocator->nranges, + "IOVA allocator out of space\n"); + + range = &allocator->ranges[allocator->range_idx]; + iova = range->start + allocator->range_offset; + + /* Check for sufficient space at the current offset */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + /* Align iova to size */ + iova = last & ~(size - 1); + + /* Check for sufficient space at the aligned iova */ + if (check_add_overflow(iova, size - 1, &last) || + last > range->last) + goto next_range; + + if (last == range->last) { + allocator->range_idx++; + allocator->range_offset = 0; + } else { + allocator->range_offset = last - range->start + 1; + } + + return iova; + +next_range: + allocator->range_idx++; + allocator->range_offset = 0; + } +} + diff --git a/tools/testing/selftests/vfio/lib/libvfio.c b/tools/testing/selftests/vfio/lib/libvfio.c new file mode 100644 index 000000000000..a23a3cc5be69 --- /dev/null +++ b/tools/testing/selftests/vfio/lib/libvfio.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <stdio.h> +#include <stdlib.h> + +#include "../../../kselftest.h" +#include <libvfio.h> + +static bool is_bdf(const char *str) +{ + unsigned int s, b, d, f; + int length, count; + + count = sscanf(str, "%4x:%2x:%2x.%2x%n", &s, &b, &d, &f, &length); + return count == 4 && length == strlen(str); +} + +static char **get_bdfs_cmdline(int *argc, char *argv[], int *nr_bdfs) +{ + int i; + + for (i = *argc - 1; i > 0 && 
is_bdf(argv[i]); i--) + continue; + + i++; + *nr_bdfs = *argc - i; + *argc -= *nr_bdfs; + + return *nr_bdfs ? &argv[i] : NULL; +} + +static char *get_bdf_env(void) +{ + char *bdf; + + bdf = getenv("VFIO_SELFTESTS_BDF"); + if (!bdf) + return NULL; + + VFIO_ASSERT_TRUE(is_bdf(bdf), "Invalid BDF: %s\n", bdf); + return bdf; +} + +char **vfio_selftests_get_bdfs(int *argc, char *argv[], int *nr_bdfs) +{ + static char *env_bdf; + char **bdfs; + + bdfs = get_bdfs_cmdline(argc, argv, nr_bdfs); + if (bdfs) + return bdfs; + + env_bdf = get_bdf_env(); + if (env_bdf) { + *nr_bdfs = 1; + return &env_bdf; + } + + fprintf(stderr, "Unable to determine which device(s) to use, skipping test.\n"); + fprintf(stderr, "\n"); + fprintf(stderr, "To pass the device address via environment variable:\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " export VFIO_SELFTESTS_BDF=\"segment:bus:device.function\"\n"); + fprintf(stderr, " %s [options]\n", argv[0]); + fprintf(stderr, "\n"); + fprintf(stderr, "To pass the device address(es) via argv:\n"); + fprintf(stderr, "\n"); + fprintf(stderr, " %s [options] segment:bus:device.function ...\n", argv[0]); + fprintf(stderr, "\n"); + exit(KSFT_SKIP); +} + +const char *vfio_selftests_get_bdf(int *argc, char *argv[]) +{ + int nr_bdfs; + + return vfio_selftests_get_bdfs(argc, argv, &nr_bdfs)[0]; +} diff --git a/tools/testing/selftests/vfio/lib/libvfio.mk b/tools/testing/selftests/vfio/lib/libvfio.mk index 5d11c3a89a28..9f47bceed16f 100644 --- a/tools/testing/selftests/vfio/lib/libvfio.mk +++ b/tools/testing/selftests/vfio/lib/libvfio.mk @@ -1,24 +1,29 @@ include $(top_srcdir)/scripts/subarch.include ARCH ?= $(SUBARCH) -VFIO_DIR := $(selfdir)/vfio +LIBVFIO_SRCDIR := $(selfdir)/vfio/lib -LIBVFIO_C := lib/vfio_pci_device.c -LIBVFIO_C += lib/vfio_pci_driver.c +LIBVFIO_C := iommu.c +LIBVFIO_C += iova_allocator.c +LIBVFIO_C += libvfio.c +LIBVFIO_C += vfio_pci_device.c +LIBVFIO_C += vfio_pci_driver.c ifeq ($(ARCH:x86_64=x86),x86) -LIBVFIO_C += lib/drivers/ioat/ioat.c -LIBVFIO_C += lib/drivers/dsa/dsa.c +LIBVFIO_C += drivers/ioat/ioat.c +LIBVFIO_C += drivers/dsa/dsa.c endif -LIBVFIO_O := $(patsubst %.c, $(OUTPUT)/%.o, $(LIBVFIO_C)) +LIBVFIO_OUTPUT := $(OUTPUT)/libvfio + +LIBVFIO_O := $(patsubst %.c, $(LIBVFIO_OUTPUT)/%.o, $(LIBVFIO_C)) LIBVFIO_O_DIRS := $(shell dirname $(LIBVFIO_O) | uniq) $(shell mkdir -p $(LIBVFIO_O_DIRS)) -CFLAGS += -I$(VFIO_DIR)/lib/include +CFLAGS += -I$(LIBVFIO_SRCDIR)/include -$(LIBVFIO_O): $(OUTPUT)/%.o : $(VFIO_DIR)/%.c +$(LIBVFIO_O): $(LIBVFIO_OUTPUT)/%.o : $(LIBVFIO_SRCDIR)/%.c $(CC) $(CFLAGS) $(CPPFLAGS) $(TARGET_ARCH) -c $< -o $@ -EXTRA_CLEAN += $(LIBVFIO_O) +EXTRA_CLEAN += $(LIBVFIO_OUTPUT) diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_device.c b/tools/testing/selftests/vfio/lib/vfio_pci_device.c index b479a359da12..13fdb4b0b10f 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_device.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_device.c @@ -20,286 +20,10 @@ #include <linux/vfio.h> #include "../../../kselftest.h" -#include <vfio_util.h> +#include <libvfio.h> #define PCI_SYSFS_PATH "/sys/bus/pci/devices" -#define ioctl_assert(_fd, _op, _arg) do { \ - void *__arg = (_arg); \ - int __ret = ioctl((_fd), (_op), (__arg)); \ - VFIO_ASSERT_EQ(__ret, 0, "ioctl(%s, %s, %s) returned %d\n", #_fd, #_op, #_arg, __ret); \ -} while (0) - -static struct vfio_info_cap_header *next_cap_hdr(void *buf, u32 bufsz, - u32 *cap_offset) -{ - struct vfio_info_cap_header *hdr; - - if (!*cap_offset) - return NULL; - - VFIO_ASSERT_LT(*cap_offset, bufsz); - 
VFIO_ASSERT_GE(bufsz - *cap_offset, sizeof(*hdr)); - - hdr = (struct vfio_info_cap_header *)((u8 *)buf + *cap_offset); - *cap_offset = hdr->next; - - return hdr; -} - -static struct vfio_info_cap_header *vfio_iommu_info_cap_hdr(struct vfio_iommu_type1_info *info, - u16 cap_id) -{ - struct vfio_info_cap_header *hdr; - u32 cap_offset = info->cap_offset; - u32 max_depth; - u32 depth = 0; - - if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) - return NULL; - - if (cap_offset) - VFIO_ASSERT_GE(cap_offset, sizeof(*info)); - - max_depth = (info->argsz - sizeof(*info)) / sizeof(*hdr); - - while ((hdr = next_cap_hdr(info, info->argsz, &cap_offset))) { - depth++; - VFIO_ASSERT_LE(depth, max_depth, "Capability chain contains a cycle\n"); - - if (hdr->id == cap_id) - return hdr; - } - - return NULL; -} - -/* Return buffer including capability chain, if present. Free with free() */ -static struct vfio_iommu_type1_info *vfio_iommu_get_info(struct vfio_pci_device *device) -{ - struct vfio_iommu_type1_info *info; - - info = malloc(sizeof(*info)); - VFIO_ASSERT_NOT_NULL(info); - - *info = (struct vfio_iommu_type1_info) { - .argsz = sizeof(*info), - }; - - ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); - VFIO_ASSERT_GE(info->argsz, sizeof(*info)); - - info = realloc(info, info->argsz); - VFIO_ASSERT_NOT_NULL(info); - - ioctl_assert(device->container_fd, VFIO_IOMMU_GET_INFO, info); - VFIO_ASSERT_GE(info->argsz, sizeof(*info)); - - return info; -} - -/* - * Return iova ranges for the device's container. Normalize vfio_iommu_type1 to - * report iommufd's iommu_iova_range. Free with free(). - */ -static struct iommu_iova_range *vfio_iommu_iova_ranges(struct vfio_pci_device *device, - u32 *nranges) -{ - struct vfio_iommu_type1_info_cap_iova_range *cap_range; - struct vfio_iommu_type1_info *info; - struct vfio_info_cap_header *hdr; - struct iommu_iova_range *ranges = NULL; - - info = vfio_iommu_get_info(device); - hdr = vfio_iommu_info_cap_hdr(info, VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE); - VFIO_ASSERT_NOT_NULL(hdr); - - cap_range = container_of(hdr, struct vfio_iommu_type1_info_cap_iova_range, header); - VFIO_ASSERT_GT(cap_range->nr_iovas, 0); - - ranges = calloc(cap_range->nr_iovas, sizeof(*ranges)); - VFIO_ASSERT_NOT_NULL(ranges); - - for (u32 i = 0; i < cap_range->nr_iovas; i++) { - ranges[i] = (struct iommu_iova_range){ - .start = cap_range->iova_ranges[i].start, - .last = cap_range->iova_ranges[i].end, - }; - } - - *nranges = cap_range->nr_iovas; - - free(info); - return ranges; -} - -/* Return iova ranges of the device's IOAS. Free with free() */ -static struct iommu_iova_range *iommufd_iova_ranges(struct vfio_pci_device *device, - u32 *nranges) -{ - struct iommu_iova_range *ranges; - int ret; - - struct iommu_ioas_iova_ranges query = { - .size = sizeof(query), - .ioas_id = device->ioas_id, - }; - - ret = ioctl(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); - VFIO_ASSERT_EQ(ret, -1); - VFIO_ASSERT_EQ(errno, EMSGSIZE); - VFIO_ASSERT_GT(query.num_iovas, 0); - - ranges = calloc(query.num_iovas, sizeof(*ranges)); - VFIO_ASSERT_NOT_NULL(ranges); - - query.allowed_iovas = (uintptr_t)ranges; - - ioctl_assert(device->iommufd, IOMMU_IOAS_IOVA_RANGES, &query); - *nranges = query.num_iovas; - - return ranges; -} - -static int iova_range_comp(const void *a, const void *b) -{ - const struct iommu_iova_range *ra = a, *rb = b; - - if (ra->start < rb->start) - return -1; - - if (ra->start > rb->start) - return 1; - - return 0; -} - -/* Return sorted IOVA ranges of the device. Free with free(). 
*/ -struct iommu_iova_range *vfio_pci_iova_ranges(struct vfio_pci_device *device, - u32 *nranges) -{ - struct iommu_iova_range *ranges; - - if (device->iommufd) - ranges = iommufd_iova_ranges(device, nranges); - else - ranges = vfio_iommu_iova_ranges(device, nranges); - - if (!ranges) - return NULL; - - VFIO_ASSERT_GT(*nranges, 0); - - /* Sort and check that ranges are sane and non-overlapping */ - qsort(ranges, *nranges, sizeof(*ranges), iova_range_comp); - VFIO_ASSERT_LT(ranges[0].start, ranges[0].last); - - for (u32 i = 1; i < *nranges; i++) { - VFIO_ASSERT_LT(ranges[i].start, ranges[i].last); - VFIO_ASSERT_LT(ranges[i - 1].last, ranges[i].start); - } - - return ranges; -} - -struct iova_allocator *iova_allocator_init(struct vfio_pci_device *device) -{ - struct iova_allocator *allocator; - struct iommu_iova_range *ranges; - u32 nranges; - - ranges = vfio_pci_iova_ranges(device, &nranges); - VFIO_ASSERT_NOT_NULL(ranges); - - allocator = malloc(sizeof(*allocator)); - VFIO_ASSERT_NOT_NULL(allocator); - - *allocator = (struct iova_allocator){ - .ranges = ranges, - .nranges = nranges, - .range_idx = 0, - .range_offset = 0, - }; - - return allocator; -} - -void iova_allocator_cleanup(struct iova_allocator *allocator) -{ - free(allocator->ranges); - free(allocator); -} - -iova_t iova_allocator_alloc(struct iova_allocator *allocator, size_t size) -{ - VFIO_ASSERT_GT(size, 0, "Invalid size arg, zero\n"); - VFIO_ASSERT_EQ(size & (size - 1), 0, "Invalid size arg, non-power-of-2\n"); - - for (;;) { - struct iommu_iova_range *range; - iova_t iova, last; - - VFIO_ASSERT_LT(allocator->range_idx, allocator->nranges, - "IOVA allocator out of space\n"); - - range = &allocator->ranges[allocator->range_idx]; - iova = range->start + allocator->range_offset; - - /* Check for sufficient space at the current offset */ - if (check_add_overflow(iova, size - 1, &last) || - last > range->last) - goto next_range; - - /* Align iova to size */ - iova = last & ~(size - 1); - - /* Check for sufficient space at the aligned iova */ - if (check_add_overflow(iova, size - 1, &last) || - last > range->last) - goto next_range; - - if (last == range->last) { - allocator->range_idx++; - allocator->range_offset = 0; - } else { - allocator->range_offset = last - range->start + 1; - } - - return iova; - -next_range: - allocator->range_idx++; - allocator->range_offset = 0; - } -} - -iova_t __to_iova(struct vfio_pci_device *device, void *vaddr) -{ - struct vfio_dma_region *region; - - list_for_each_entry(region, &device->dma_regions, link) { - if (vaddr < region->vaddr) - continue; - - if (vaddr >= region->vaddr + region->size) - continue; - - return region->iova + (vaddr - region->vaddr); - } - - return INVALID_IOVA; -} - -iova_t to_iova(struct vfio_pci_device *device, void *vaddr) -{ - iova_t iova; - - iova = __to_iova(device, vaddr); - VFIO_ASSERT_NE(iova, INVALID_IOVA, "%p is not mapped into device.\n", vaddr); - - return iova; -} - static void vfio_pci_irq_set(struct vfio_pci_device *device, u32 index, u32 vector, u32 count, int *fds) { @@ -386,141 +110,6 @@ static void vfio_pci_irq_get(struct vfio_pci_device *device, u32 index, ioctl_assert(device->fd, VFIO_DEVICE_GET_IRQ_INFO, irq_info); } -static int vfio_iommu_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region) -{ - struct vfio_iommu_type1_dma_map args = { - .argsz = sizeof(args), - .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE, - .vaddr = (u64)region->vaddr, - .iova = region->iova, - .size = region->size, - }; - - if 
(ioctl(device->container_fd, VFIO_IOMMU_MAP_DMA, &args)) - return -errno; - - return 0; -} - -static int iommufd_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region) -{ - struct iommu_ioas_map args = { - .size = sizeof(args), - .flags = IOMMU_IOAS_MAP_READABLE | - IOMMU_IOAS_MAP_WRITEABLE | - IOMMU_IOAS_MAP_FIXED_IOVA, - .user_va = (u64)region->vaddr, - .iova = region->iova, - .length = region->size, - .ioas_id = device->ioas_id, - }; - - if (ioctl(device->iommufd, IOMMU_IOAS_MAP, &args)) - return -errno; - - return 0; -} - -int __vfio_pci_dma_map(struct vfio_pci_device *device, - struct vfio_dma_region *region) -{ - int ret; - - if (device->iommufd) - ret = iommufd_dma_map(device, region); - else - ret = vfio_iommu_dma_map(device, region); - - if (ret) - return ret; - - list_add(®ion->link, &device->dma_regions); - - return 0; -} - -static int vfio_iommu_dma_unmap(int fd, u64 iova, u64 size, u32 flags, - u64 *unmapped) -{ - struct vfio_iommu_type1_dma_unmap args = { - .argsz = sizeof(args), - .iova = iova, - .size = size, - .flags = flags, - }; - - if (ioctl(fd, VFIO_IOMMU_UNMAP_DMA, &args)) - return -errno; - - if (unmapped) - *unmapped = args.size; - - return 0; -} - -static int iommufd_dma_unmap(int fd, u64 iova, u64 length, u32 ioas_id, - u64 *unmapped) -{ - struct iommu_ioas_unmap args = { - .size = sizeof(args), - .iova = iova, - .length = length, - .ioas_id = ioas_id, - }; - - if (ioctl(fd, IOMMU_IOAS_UNMAP, &args)) - return -errno; - - if (unmapped) - *unmapped = args.length; - - return 0; -} - -int __vfio_pci_dma_unmap(struct vfio_pci_device *device, - struct vfio_dma_region *region, u64 *unmapped) -{ - int ret; - - if (device->iommufd) - ret = iommufd_dma_unmap(device->iommufd, region->iova, - region->size, device->ioas_id, - unmapped); - else - ret = vfio_iommu_dma_unmap(device->container_fd, region->iova, - region->size, 0, unmapped); - - if (ret) - return ret; - - list_del_init(®ion->link); - - return 0; -} - -int __vfio_pci_dma_unmap_all(struct vfio_pci_device *device, u64 *unmapped) -{ - int ret; - struct vfio_dma_region *curr, *next; - - if (device->iommufd) - ret = iommufd_dma_unmap(device->iommufd, 0, UINT64_MAX, - device->ioas_id, unmapped); - else - ret = vfio_iommu_dma_unmap(device->container_fd, 0, 0, - VFIO_DMA_UNMAP_FLAG_ALL, unmapped); - - if (ret) - return ret; - - list_for_each_entry_safe(curr, next, &device->dma_regions, link) - list_del_init(&curr->link); - - return 0; -} - static void vfio_pci_region_get(struct vfio_pci_device *device, int index, struct vfio_region_info *info) { @@ -627,28 +216,26 @@ static void vfio_pci_group_setup(struct vfio_pci_device *device, const char *bdf ioctl_assert(device->group_fd, VFIO_GROUP_GET_STATUS, &group_status); VFIO_ASSERT_TRUE(group_status.flags & VFIO_GROUP_FLAGS_VIABLE); - ioctl_assert(device->group_fd, VFIO_GROUP_SET_CONTAINER, &device->container_fd); + ioctl_assert(device->group_fd, VFIO_GROUP_SET_CONTAINER, &device->iommu->container_fd); } static void vfio_pci_container_setup(struct vfio_pci_device *device, const char *bdf) { - unsigned long iommu_type = device->iommu_mode->iommu_type; - const char *path = device->iommu_mode->container_path; - int version; + struct iommu *iommu = device->iommu; + unsigned long iommu_type = iommu->mode->iommu_type; int ret; - device->container_fd = open(path, O_RDWR); - VFIO_ASSERT_GE(device->container_fd, 0, "open(%s) failed\n", path); - - version = ioctl(device->container_fd, VFIO_GET_API_VERSION); - VFIO_ASSERT_EQ(version, VFIO_API_VERSION, "Unsupported 
version: %d\n", version); - vfio_pci_group_setup(device, bdf); - ret = ioctl(device->container_fd, VFIO_CHECK_EXTENSION, iommu_type); + ret = ioctl(iommu->container_fd, VFIO_CHECK_EXTENSION, iommu_type); VFIO_ASSERT_GT(ret, 0, "VFIO IOMMU type %lu not supported\n", iommu_type); - ioctl_assert(device->container_fd, VFIO_SET_IOMMU, (void *)iommu_type); + /* + * Allow multiple threads to race to set the IOMMU type on the + * container. The first will succeed and the rest should fail + * because the IOMMU type is already set. + */ + (void)ioctl(iommu->container_fd, VFIO_SET_IOMMU, (void *)iommu_type); device->fd = ioctl(device->group_fd, VFIO_GROUP_GET_DEVICE_FD, bdf); VFIO_ASSERT_GE(device->fd, 0); @@ -712,52 +299,6 @@ const char *vfio_pci_get_cdev_path(const char *bdf) return cdev_path; } -/* Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(). */ -static const struct vfio_iommu_mode iommu_modes[] = { - { - .name = "vfio_type1_iommu", - .container_path = "/dev/vfio/vfio", - .iommu_type = VFIO_TYPE1_IOMMU, - }, - { - .name = "vfio_type1v2_iommu", - .container_path = "/dev/vfio/vfio", - .iommu_type = VFIO_TYPE1v2_IOMMU, - }, - { - .name = "iommufd_compat_type1", - .container_path = "/dev/iommu", - .iommu_type = VFIO_TYPE1_IOMMU, - }, - { - .name = "iommufd_compat_type1v2", - .container_path = "/dev/iommu", - .iommu_type = VFIO_TYPE1v2_IOMMU, - }, - { - .name = "iommufd", - }, -}; - -const char *default_iommu_mode = "iommufd"; - -static const struct vfio_iommu_mode *lookup_iommu_mode(const char *iommu_mode) -{ - int i; - - if (!iommu_mode) - iommu_mode = default_iommu_mode; - - for (i = 0; i < ARRAY_SIZE(iommu_modes); i++) { - if (strcmp(iommu_mode, iommu_modes[i].name)) - continue; - - return &iommu_modes[i]; - } - - VFIO_FAIL("Unrecognized IOMMU mode: %s\n", iommu_mode); -} - static void vfio_device_bind_iommufd(int device_fd, int iommufd) { struct vfio_device_bind_iommufd args = { @@ -768,16 +309,6 @@ static void vfio_device_bind_iommufd(int device_fd, int iommufd) ioctl_assert(device_fd, VFIO_DEVICE_BIND_IOMMUFD, &args); } -static u32 iommufd_ioas_alloc(int iommufd) -{ - struct iommu_ioas_alloc args = { - .size = sizeof(args), - }; - - ioctl_assert(iommufd, IOMMU_IOAS_ALLOC, &args); - return args.out_ioas_id; -} - static void vfio_device_attach_iommufd_pt(int device_fd, u32 pt_id) { struct vfio_device_attach_iommufd_pt args = { @@ -796,31 +327,22 @@ static void vfio_pci_iommufd_setup(struct vfio_pci_device *device, const char *b VFIO_ASSERT_GE(device->fd, 0); free((void *)cdev_path); - /* - * Require device->iommufd to be >0 so that a simple non-0 check can be - * used to check if iommufd is enabled. In practice open() will never - * return 0 unless stdin is closed. 
- */ - device->iommufd = open("/dev/iommu", O_RDWR); - VFIO_ASSERT_GT(device->iommufd, 0); - - vfio_device_bind_iommufd(device->fd, device->iommufd); - device->ioas_id = iommufd_ioas_alloc(device->iommufd); - vfio_device_attach_iommufd_pt(device->fd, device->ioas_id); + vfio_device_bind_iommufd(device->fd, device->iommu->iommufd); + vfio_device_attach_iommufd_pt(device->fd, device->iommu->ioas_id); } -struct vfio_pci_device *vfio_pci_device_init(const char *bdf, const char *iommu_mode) +struct vfio_pci_device *vfio_pci_device_init(const char *bdf, struct iommu *iommu) { struct vfio_pci_device *device; device = calloc(1, sizeof(*device)); VFIO_ASSERT_NOT_NULL(device); - INIT_LIST_HEAD(&device->dma_regions); - - device->iommu_mode = lookup_iommu_mode(iommu_mode); + VFIO_ASSERT_NOT_NULL(iommu); + device->iommu = iommu; + device->bdf = bdf; - if (device->iommu_mode->container_path) + if (iommu->mode->container_path) vfio_pci_container_setup(device, bdf); else vfio_pci_iommufd_setup(device, bdf); @@ -849,48 +371,8 @@ void vfio_pci_device_cleanup(struct vfio_pci_device *device) VFIO_ASSERT_EQ(close(device->msi_eventfds[i]), 0); } - if (device->iommufd) { - VFIO_ASSERT_EQ(close(device->iommufd), 0); - } else { + if (device->group_fd) VFIO_ASSERT_EQ(close(device->group_fd), 0); - VFIO_ASSERT_EQ(close(device->container_fd), 0); - } free(device); } - -static bool is_bdf(const char *str) -{ - unsigned int s, b, d, f; - int length, count; - - count = sscanf(str, "%4x:%2x:%2x.%2x%n", &s, &b, &d, &f, &length); - return count == 4 && length == strlen(str); -} - -const char *vfio_selftests_get_bdf(int *argc, char *argv[]) -{ - char *bdf; - - if (*argc > 1 && is_bdf(argv[*argc - 1])) - return argv[--(*argc)]; - - bdf = getenv("VFIO_SELFTESTS_BDF"); - if (bdf) { - VFIO_ASSERT_TRUE(is_bdf(bdf), "Invalid BDF: %s\n", bdf); - return bdf; - } - - fprintf(stderr, "Unable to determine which device to use, skipping test.\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "To pass the device address via environment variable:\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " export VFIO_SELFTESTS_BDF=segment:bus:device.function\n"); - fprintf(stderr, " %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "To pass the device address via argv:\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " %s [options] segment:bus:device.function\n", argv[0]); - fprintf(stderr, "\n"); - exit(KSFT_SKIP); -} diff --git a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c index e5e8723ecb41..ca0e25efbfa1 100644 --- a/tools/testing/selftests/vfio/lib/vfio_pci_driver.c +++ b/tools/testing/selftests/vfio/lib/vfio_pci_driver.c @@ -1,8 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only -#include <stdio.h> - #include "../../../kselftest.h" -#include <vfio_util.h> +#include <libvfio.h> #ifdef __x86_64__ extern struct vfio_pci_driver_ops dsa_ops; @@ -29,7 +27,6 @@ void vfio_pci_driver_probe(struct vfio_pci_device *device) if (ops->probe(device)) continue; - printf("Driver found: %s\n", ops->name); device->driver.ops = ops; } } @@ -58,17 +55,6 @@ void vfio_pci_driver_init(struct vfio_pci_device *device) driver->ops->init(device); driver->initialized = true; - - printf("%s: region: vaddr %p, iova 0x%lx, size 0x%lx\n", - driver->ops->name, - driver->region.vaddr, - driver->region.iova, - driver->region.size); - - printf("%s: max_memcpy_size 0x%lx, max_memcpy_count 0x%lx\n", - driver->ops->name, - driver->max_memcpy_size, - driver->max_memcpy_count); } void 
vfio_pci_driver_remove(struct vfio_pci_device *device) diff --git a/tools/testing/selftests/vfio/run.sh b/tools/testing/selftests/vfio/run.sh deleted file mode 100755 index 0476b6d7adc3..000000000000 --- a/tools/testing/selftests/vfio/run.sh +++ /dev/null @@ -1,109 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-or-later - -# Global variables initialized in main() and then used during cleanup() when -# the script exits. -declare DEVICE_BDF -declare NEW_DRIVER -declare OLD_DRIVER -declare OLD_NUMVFS -declare DRIVER_OVERRIDE - -function write_to() { - # Unfortunately set -x does not show redirects so use echo to manually - # tell the user what commands are being run. - echo "+ echo \"${2}\" > ${1}" - echo "${2}" > ${1} -} - -function bind() { - write_to /sys/bus/pci/drivers/${2}/bind ${1} -} - -function unbind() { - write_to /sys/bus/pci/drivers/${2}/unbind ${1} -} - -function set_sriov_numvfs() { - write_to /sys/bus/pci/devices/${1}/sriov_numvfs ${2} -} - -function set_driver_override() { - write_to /sys/bus/pci/devices/${1}/driver_override ${2} -} - -function clear_driver_override() { - set_driver_override ${1} "" -} - -function cleanup() { - if [ "${NEW_DRIVER}" ]; then unbind ${DEVICE_BDF} ${NEW_DRIVER} ; fi - if [ "${DRIVER_OVERRIDE}" ]; then clear_driver_override ${DEVICE_BDF} ; fi - if [ "${OLD_DRIVER}" ]; then bind ${DEVICE_BDF} ${OLD_DRIVER} ; fi - if [ "${OLD_NUMVFS}" ]; then set_sriov_numvfs ${DEVICE_BDF} ${OLD_NUMVFS} ; fi -} - -function usage() { - echo "usage: $0 [-d segment:bus:device.function] [-s] [-h] [cmd ...]" >&2 - echo >&2 - echo " -d: The BDF of the device to use for the test (required)" >&2 - echo " -h: Show this help message" >&2 - echo " -s: Drop into a shell rather than running a command" >&2 - echo >&2 - echo " cmd: The command to run and arguments to pass to it." >&2 - echo " Required when not using -s. The SBDF will be " >&2 - echo " appended to the argument list." >&2 - exit 1 -} - -function main() { - local shell - - while getopts "d:hs" opt; do - case $opt in - d) DEVICE_BDF="$OPTARG" ;; - s) shell=true ;; - *) usage ;; - esac - done - - # Shift past all optional arguments. - shift $((OPTIND - 1)) - - # Check that the user passed in the command to run. - [ ! "${shell}" ] && [ $# = 0 ] && usage - - # Check that the user passed in a BDF. 
- [ "${DEVICE_BDF}" ] || usage - - trap cleanup EXIT - set -e - - test -d /sys/bus/pci/devices/${DEVICE_BDF} - - if [ -f /sys/bus/pci/devices/${DEVICE_BDF}/sriov_numvfs ]; then - OLD_NUMVFS=$(cat /sys/bus/pci/devices/${DEVICE_BDF}/sriov_numvfs) - set_sriov_numvfs ${DEVICE_BDF} 0 - fi - - if [ -L /sys/bus/pci/devices/${DEVICE_BDF}/driver ]; then - OLD_DRIVER=$(basename $(readlink -m /sys/bus/pci/devices/${DEVICE_BDF}/driver)) - unbind ${DEVICE_BDF} ${OLD_DRIVER} - fi - - set_driver_override ${DEVICE_BDF} vfio-pci - DRIVER_OVERRIDE=true - - bind ${DEVICE_BDF} vfio-pci - NEW_DRIVER=vfio-pci - - echo - if [ "${shell}" ]; then - echo "Dropping into ${SHELL} with VFIO_SELFTESTS_BDF=${DEVICE_BDF}" - VFIO_SELFTESTS_BDF=${DEVICE_BDF} ${SHELL} - else - "$@" ${DEVICE_BDF} - fi - echo -} - -main "$@" diff --git a/tools/testing/selftests/vfio/scripts/cleanup.sh b/tools/testing/selftests/vfio/scripts/cleanup.sh new file mode 100755 index 000000000000..69c922d8aafb --- /dev/null +++ b/tools/testing/selftests/vfio/scripts/cleanup.sh @@ -0,0 +1,41 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +source $(dirname -- "${BASH_SOURCE[0]}")/lib.sh + +function cleanup_devices() { + local device_bdf + local device_dir + + for device_bdf in "$@"; do + device_dir=${DEVICES_DIR}/${device_bdf} + + if [ -f ${device_dir}/vfio-pci ]; then + unbind ${device_bdf} vfio-pci + fi + + if [ -f ${device_dir}/driver_override ]; then + clear_driver_override ${device_bdf} + fi + + if [ -f ${device_dir}/driver ]; then + bind ${device_bdf} $(cat ${device_dir}/driver) + fi + + if [ -f ${device_dir}/sriov_numvfs ]; then + set_sriov_numvfs ${device_bdf} $(cat ${device_dir}/sriov_numvfs) + fi + + rm -rf ${device_dir} + done +} + +function main() { + if [ $# = 0 ]; then + cleanup_devices $(ls ${DEVICES_DIR}) + rmdir ${DEVICES_DIR} + else + cleanup_devices "$@" + fi +} + +main "$@" diff --git a/tools/testing/selftests/vfio/scripts/lib.sh b/tools/testing/selftests/vfio/scripts/lib.sh new file mode 100755 index 000000000000..9f05f29c7b86 --- /dev/null +++ b/tools/testing/selftests/vfio/scripts/lib.sh @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +readonly DEVICES_DIR="${TMPDIR:-/tmp}/vfio-selftests-devices" + +function write_to() { + # Unfortunately set -x does not show redirects so use echo to manually + # tell the user what commands are being run. 
+ echo "+ echo \"${2}\" > ${1}" + echo "${2}" > ${1} +} + +function get_driver() { + if [ -L /sys/bus/pci/devices/${1}/driver ]; then + basename $(readlink -m /sys/bus/pci/devices/${1}/driver) + fi +} + +function bind() { + write_to /sys/bus/pci/drivers/${2}/bind ${1} +} + +function unbind() { + write_to /sys/bus/pci/drivers/${2}/unbind ${1} +} + +function set_sriov_numvfs() { + write_to /sys/bus/pci/devices/${1}/sriov_numvfs ${2} +} + +function get_sriov_numvfs() { + if [ -f /sys/bus/pci/devices/${1}/sriov_numvfs ]; then + cat /sys/bus/pci/devices/${1}/sriov_numvfs + fi +} + +function set_driver_override() { + write_to /sys/bus/pci/devices/${1}/driver_override ${2} +} + +function clear_driver_override() { + set_driver_override ${1} "" +} diff --git a/tools/testing/selftests/vfio/scripts/run.sh b/tools/testing/selftests/vfio/scripts/run.sh new file mode 100755 index 000000000000..91fd38f9f6f6 --- /dev/null +++ b/tools/testing/selftests/vfio/scripts/run.sh @@ -0,0 +1,16 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +source $(dirname -- "${BASH_SOURCE[0]}")/lib.sh + +function main() { + local device_bdfs=$(ls ${DEVICES_DIR}) + + if [ -z "${device_bdfs}" ]; then + echo "No devices found, skipping." + exit 4 + fi + + "$@" ${device_bdfs} +} + +main "$@" diff --git a/tools/testing/selftests/vfio/scripts/setup.sh b/tools/testing/selftests/vfio/scripts/setup.sh new file mode 100755 index 000000000000..49a499e51cbe --- /dev/null +++ b/tools/testing/selftests/vfio/scripts/setup.sh @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: GPL-2.0-or-later +set -e + +source $(dirname -- "${BASH_SOURCE[0]}")/lib.sh + +function main() { + local device_bdf + local device_dir + local numvfs + local driver + + if [ $# = 0 ]; then + echo "usage: $0 segment:bus:device.function ..." >&2 + exit 1 + fi + + for device_bdf in "$@"; do + test -d /sys/bus/pci/devices/${device_bdf} + + device_dir=${DEVICES_DIR}/${device_bdf} + if [ -d "${device_dir}" ]; then + echo "${device_bdf} has already been set up, exiting." 
+ exit 0 + fi + + mkdir -p ${device_dir} + + numvfs=$(get_sriov_numvfs ${device_bdf}) + if [ "${numvfs}" ]; then + set_sriov_numvfs ${device_bdf} 0 + echo ${numvfs} > ${device_dir}/sriov_numvfs + fi + + driver=$(get_driver ${device_bdf}) + if [ "${driver}" ]; then + unbind ${device_bdf} ${driver} + echo ${driver} > ${device_dir}/driver + fi + + set_driver_override ${device_bdf} vfio-pci + touch ${device_dir}/driver_override + + bind ${device_bdf} vfio-pci + touch ${device_dir}/vfio-pci + done +} + +main "$@" diff --git a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c index 102603d4407d..5397822c3dd4 100644 --- a/tools/testing/selftests/vfio/vfio_dma_mapping_test.c +++ b/tools/testing/selftests/vfio/vfio_dma_mapping_test.c @@ -10,7 +10,7 @@ #include <linux/sizes.h> #include <linux/vfio.h> -#include <vfio_util.h> +#include <libvfio.h> #include "../kselftest_harness.h" @@ -94,6 +94,7 @@ static int iommu_mapping_get(const char *bdf, u64 iova, } FIXTURE(vfio_dma_mapping_test) { + struct iommu *iommu; struct vfio_pci_device *device; struct iova_allocator *iova_allocator; }; @@ -119,21 +120,23 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(anonymous_hugetlb_1gb, SZ_1G, MAP_HUGETLB | FIXTURE_SETUP(vfio_dma_mapping_test) { - self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); - self->iova_allocator = iova_allocator_init(self->device); + self->iommu = iommu_init(variant->iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); + self->iova_allocator = iova_allocator_init(self->iommu); } FIXTURE_TEARDOWN(vfio_dma_mapping_test) { iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); } TEST_F(vfio_dma_mapping_test, dma_map_unmap) { const u64 size = variant->size ?: getpagesize(); const int flags = variant->mmap_flags; - struct vfio_dma_region region; + struct dma_region region; struct iommu_mapping mapping; u64 mapping_size = size; u64 unmapped; @@ -150,7 +153,7 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) region.iova = iova_allocator_alloc(self->iova_allocator, size); region.size = size; - vfio_pci_dma_map(self->device, ®ion); + iommu_map(self->iommu, ®ion); printf("Mapped HVA %p (size 0x%lx) at IOVA 0x%lx\n", region.vaddr, size, region.iova); ASSERT_EQ(region.iova, to_iova(self->device, region.vaddr)); @@ -192,19 +195,20 @@ TEST_F(vfio_dma_mapping_test, dma_map_unmap) } unmap: - rc = __vfio_pci_dma_unmap(self->device, ®ion, &unmapped); + rc = __iommu_unmap(self->iommu, ®ion, &unmapped); ASSERT_EQ(rc, 0); ASSERT_EQ(unmapped, region.size); printf("Unmapped IOVA 0x%lx\n", region.iova); - ASSERT_EQ(INVALID_IOVA, __to_iova(self->device, region.vaddr)); + ASSERT_NE(0, __to_iova(self->device, region.vaddr, NULL)); ASSERT_NE(0, iommu_mapping_get(device_bdf, region.iova, &mapping)); ASSERT_TRUE(!munmap(region.vaddr, size)); } FIXTURE(vfio_dma_map_limit_test) { + struct iommu *iommu; struct vfio_pci_device *device; - struct vfio_dma_region region; + struct dma_region region; size_t mmap_size; }; @@ -223,7 +227,7 @@ FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); FIXTURE_SETUP(vfio_dma_map_limit_test) { - struct vfio_dma_region *region = &self->region; + struct dma_region *region = &self->region; struct iommu_iova_range *ranges; u64 region_size = getpagesize(); iova_t last_iova; @@ -235,12 +239,13 @@ FIXTURE_SETUP(vfio_dma_map_limit_test) */ self->mmap_size = 2 * region_size; - self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); + self->iommu = 
iommu_init(variant->iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); region->vaddr = mmap(NULL, self->mmap_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); ASSERT_NE(region->vaddr, MAP_FAILED); - ranges = vfio_pci_iova_ranges(self->device, &nranges); + ranges = iommu_iova_ranges(self->iommu, &nranges); VFIO_ASSERT_NOT_NULL(ranges); last_iova = ranges[nranges - 1].last; free(ranges); @@ -253,49 +258,50 @@ FIXTURE_SETUP(vfio_dma_map_limit_test) FIXTURE_TEARDOWN(vfio_dma_map_limit_test) { vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); ASSERT_EQ(munmap(self->region.vaddr, self->mmap_size), 0); } TEST_F(vfio_dma_map_limit_test, unmap_range) { - struct vfio_dma_region *region = &self->region; + struct dma_region *region = &self->region; u64 unmapped; int rc; - vfio_pci_dma_map(self->device, region); + iommu_map(self->iommu, region); ASSERT_EQ(region->iova, to_iova(self->device, region->vaddr)); - rc = __vfio_pci_dma_unmap(self->device, region, &unmapped); + rc = __iommu_unmap(self->iommu, region, &unmapped); ASSERT_EQ(rc, 0); ASSERT_EQ(unmapped, region->size); } TEST_F(vfio_dma_map_limit_test, unmap_all) { - struct vfio_dma_region *region = &self->region; + struct dma_region *region = &self->region; u64 unmapped; int rc; - vfio_pci_dma_map(self->device, region); + iommu_map(self->iommu, region); ASSERT_EQ(region->iova, to_iova(self->device, region->vaddr)); - rc = __vfio_pci_dma_unmap_all(self->device, &unmapped); + rc = __iommu_unmap_all(self->iommu, &unmapped); ASSERT_EQ(rc, 0); ASSERT_EQ(unmapped, region->size); } TEST_F(vfio_dma_map_limit_test, overflow) { - struct vfio_dma_region *region = &self->region; + struct dma_region *region = &self->region; int rc; region->iova = ~(iova_t)0 & ~(region->size - 1); region->size = self->mmap_size; - rc = __vfio_pci_dma_map(self->device, region); + rc = __iommu_map(self->iommu, region); ASSERT_EQ(rc, -EOVERFLOW); - rc = __vfio_pci_dma_unmap(self->device, region, NULL); + rc = __iommu_unmap(self->iommu, region, NULL); ASSERT_EQ(rc, -EOVERFLOW); } diff --git a/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c index 3655106b912d..caf1c6291f3d 100644 --- a/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c +++ b/tools/testing/selftests/vfio/vfio_iommufd_setup_test.c @@ -10,7 +10,7 @@ #include <sys/ioctl.h> #include <unistd.h> -#include <vfio_util.h> +#include <libvfio.h> #include "../kselftest_harness.h" static const char iommu_dev_path[] = "/dev/iommu"; diff --git a/tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c b/tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c new file mode 100644 index 000000000000..33b0c31fe2ed --- /dev/null +++ b/tools/testing/selftests/vfio/vfio_pci_device_init_perf_test.c @@ -0,0 +1,168 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <pthread.h> +#include <sys/ioctl.h> +#include <sys/mman.h> + +#include <linux/sizes.h> +#include <linux/time64.h> +#include <linux/vfio.h> + +#include <libvfio.h> + +#include "../kselftest_harness.h" + +static char **device_bdfs; +static int nr_devices; + +struct thread_args { + struct iommu *iommu; + int device_index; + struct timespec start; + struct timespec end; + pthread_barrier_t *barrier; +}; + +FIXTURE(vfio_pci_device_init_perf_test) { + pthread_t *threads; + pthread_barrier_t barrier; + struct thread_args *thread_args; + struct iommu *iommu; +}; + +FIXTURE_VARIANT(vfio_pci_device_init_perf_test) { + const char *iommu_mode; +}; + 
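For context: the FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES() invocation a few lines below is supplied by the shared libvfio headers and is not shown in this diff. Assuming it still mirrors the iommu_modes[] table removed earlier in this diff (the removed "Reminder: Keep in sync with FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES()" comment suggests as much), a sketch of its expansion for this test would be:

	/*
	 * Hypothetical expansion only -- the real macro lives in libvfio and
	 * may differ. Each invocation of the per-test wrapper defined just
	 * below registers one kselftest variant, so the init timing is
	 * reported once per IOMMU mode.
	 */
	FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1_iommu);
	FIXTURE_VARIANT_ADD_IOMMU_MODE(vfio_type1v2_iommu);
	FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1);
	FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd_compat_type1v2);
	FIXTURE_VARIANT_ADD_IOMMU_MODE(iommufd);
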
+#define FIXTURE_VARIANT_ADD_IOMMU_MODE(_iommu_mode) \ +FIXTURE_VARIANT_ADD(vfio_pci_device_init_perf_test, _iommu_mode) { \ + .iommu_mode = #_iommu_mode, \ +} + +FIXTURE_VARIANT_ADD_ALL_IOMMU_MODES(); + +FIXTURE_SETUP(vfio_pci_device_init_perf_test) +{ + int i; + + self->iommu = iommu_init(variant->iommu_mode); + self->threads = calloc(nr_devices, sizeof(self->threads[0])); + self->thread_args = calloc(nr_devices, sizeof(self->thread_args[0])); + + pthread_barrier_init(&self->barrier, NULL, nr_devices); + + for (i = 0; i < nr_devices; i++) { + self->thread_args[i].iommu = self->iommu; + self->thread_args[i].barrier = &self->barrier; + self->thread_args[i].device_index = i; + } +} + +FIXTURE_TEARDOWN(vfio_pci_device_init_perf_test) +{ + iommu_cleanup(self->iommu); + free(self->threads); + free(self->thread_args); +} + +static s64 to_ns(struct timespec ts) +{ + return (s64)ts.tv_nsec + NSEC_PER_SEC * (s64)ts.tv_sec; +} + +static struct timespec to_timespec(s64 ns) +{ + struct timespec ts = { + .tv_nsec = ns % NSEC_PER_SEC, + .tv_sec = ns / NSEC_PER_SEC, + }; + + return ts; +} + +static struct timespec timespec_sub(struct timespec a, struct timespec b) +{ + return to_timespec(to_ns(a) - to_ns(b)); +} + +static struct timespec timespec_min(struct timespec a, struct timespec b) +{ + return to_ns(a) < to_ns(b) ? a : b; +} + +static struct timespec timespec_max(struct timespec a, struct timespec b) +{ + return to_ns(a) > to_ns(b) ? a : b; +} + +static void *thread_main(void *__args) +{ + struct thread_args *args = __args; + struct vfio_pci_device *device; + + pthread_barrier_wait(args->barrier); + + clock_gettime(CLOCK_MONOTONIC, &args->start); + device = vfio_pci_device_init(device_bdfs[args->device_index], args->iommu); + clock_gettime(CLOCK_MONOTONIC, &args->end); + + pthread_barrier_wait(args->barrier); + + vfio_pci_device_cleanup(device); + return NULL; +} + +TEST_F(vfio_pci_device_init_perf_test, init) +{ + struct timespec start = to_timespec(INT64_MAX), end = {}; + struct timespec min = to_timespec(INT64_MAX); + struct timespec max = {}; + struct timespec avg = {}; + struct timespec wall_time; + s64 thread_ns = 0; + int i; + + for (i = 0; i < nr_devices; i++) { + pthread_create(&self->threads[i], NULL, thread_main, + &self->thread_args[i]); + } + + for (i = 0; i < nr_devices; i++) { + struct thread_args *args = &self->thread_args[i]; + struct timespec init_time; + + pthread_join(self->threads[i], NULL); + + start = timespec_min(start, args->start); + end = timespec_max(end, args->end); + + init_time = timespec_sub(args->end, args->start); + min = timespec_min(min, init_time); + max = timespec_max(max, init_time); + thread_ns += to_ns(init_time); + } + + avg = to_timespec(thread_ns / nr_devices); + wall_time = timespec_sub(end, start); + + printf("Wall time: %lu.%09lus\n", + wall_time.tv_sec, wall_time.tv_nsec); + printf("Min init time (per device): %lu.%09lus\n", + min.tv_sec, min.tv_nsec); + printf("Max init time (per device): %lu.%09lus\n", + max.tv_sec, max.tv_nsec); + printf("Avg init time (per device): %lu.%09lus\n", + avg.tv_sec, avg.tv_nsec); +} + +int main(int argc, char *argv[]) +{ + int i; + + device_bdfs = vfio_selftests_get_bdfs(&argc, argv, &nr_devices); + + printf("Testing parallel initialization of %d devices:\n", nr_devices); + for (i = 0; i < nr_devices; i++) + printf(" %s\n", device_bdfs[i]); + + return test_harness_run(argc, argv); +} diff --git a/tools/testing/selftests/vfio/vfio_pci_device_test.c b/tools/testing/selftests/vfio/vfio_pci_device_test.c index 
7a270698e4d2..ecbb669b3765 100644 --- a/tools/testing/selftests/vfio/vfio_pci_device_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_device_test.c @@ -10,7 +10,7 @@ #include <linux/sizes.h> #include <linux/vfio.h> -#include <vfio_util.h> +#include <libvfio.h> #include "../kselftest_harness.h" @@ -23,17 +23,20 @@ static const char *device_bdf; #define MAX_TEST_MSI 16U FIXTURE(vfio_pci_device_test) { + struct iommu *iommu; struct vfio_pci_device *device; }; FIXTURE_SETUP(vfio_pci_device_test) { - self->device = vfio_pci_device_init(device_bdf, default_iommu_mode); + self->iommu = iommu_init(default_iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); } FIXTURE_TEARDOWN(vfio_pci_device_test) { vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); } #define read_pci_id_from_sysfs(_file) ({ \ @@ -99,6 +102,7 @@ TEST_F(vfio_pci_device_test, validate_bars) } FIXTURE(vfio_pci_irq_test) { + struct iommu *iommu; struct vfio_pci_device *device; }; @@ -116,12 +120,14 @@ FIXTURE_VARIANT_ADD(vfio_pci_irq_test, msix) { FIXTURE_SETUP(vfio_pci_irq_test) { - self->device = vfio_pci_device_init(device_bdf, default_iommu_mode); + self->iommu = iommu_init(default_iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); } FIXTURE_TEARDOWN(vfio_pci_irq_test) { vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); } TEST_F(vfio_pci_irq_test, enable_trigger_disable) diff --git a/tools/testing/selftests/vfio/vfio_pci_driver_test.c b/tools/testing/selftests/vfio/vfio_pci_driver_test.c index f69eec8b928d..f0ca8310d6a8 100644 --- a/tools/testing/selftests/vfio/vfio_pci_driver_test.c +++ b/tools/testing/selftests/vfio/vfio_pci_driver_test.c @@ -5,7 +5,7 @@ #include <linux/sizes.h> #include <linux/vfio.h> -#include <vfio_util.h> +#include <libvfio.h> #include "../kselftest_harness.h" @@ -18,9 +18,9 @@ static const char *device_bdf; ASSERT_EQ(EAGAIN, errno); \ } while (0) -static void region_setup(struct vfio_pci_device *device, +static void region_setup(struct iommu *iommu, struct iova_allocator *iova_allocator, - struct vfio_dma_region *region, u64 size) + struct dma_region *region, u64 size) { const int flags = MAP_SHARED | MAP_ANONYMOUS; const int prot = PROT_READ | PROT_WRITE; @@ -33,20 +33,20 @@ static void region_setup(struct vfio_pci_device *device, region->iova = iova_allocator_alloc(iova_allocator, size); region->size = size; - vfio_pci_dma_map(device, region); + iommu_map(iommu, region); } -static void region_teardown(struct vfio_pci_device *device, - struct vfio_dma_region *region) +static void region_teardown(struct iommu *iommu, struct dma_region *region) { - vfio_pci_dma_unmap(device, region); + iommu_unmap(iommu, region); VFIO_ASSERT_EQ(munmap(region->vaddr, region->size), 0); } FIXTURE(vfio_pci_driver_test) { + struct iommu *iommu; struct vfio_pci_device *device; struct iova_allocator *iova_allocator; - struct vfio_dma_region memcpy_region; + struct dma_region memcpy_region; void *vaddr; int msi_fd; @@ -73,13 +73,14 @@ FIXTURE_SETUP(vfio_pci_driver_test) { struct vfio_pci_driver *driver; - self->device = vfio_pci_device_init(device_bdf, variant->iommu_mode); - self->iova_allocator = iova_allocator_init(self->device); + self->iommu = iommu_init(variant->iommu_mode); + self->device = vfio_pci_device_init(device_bdf, self->iommu); + self->iova_allocator = iova_allocator_init(self->iommu); driver = &self->device->driver; - region_setup(self->device, self->iova_allocator, &self->memcpy_region, SZ_1G); - region_setup(self->device, 
self->iova_allocator, &driver->region, SZ_2M); + region_setup(self->iommu, self->iova_allocator, &self->memcpy_region, SZ_1G); + region_setup(self->iommu, self->iova_allocator, &driver->region, SZ_2M); /* Any IOVA that doesn't overlap memcpy_region and driver->region. */ self->unmapped_iova = iova_allocator_alloc(self->iova_allocator, SZ_1G); @@ -108,11 +109,12 @@ FIXTURE_TEARDOWN(vfio_pci_driver_test) vfio_pci_driver_remove(self->device); - region_teardown(self->device, &self->memcpy_region); - region_teardown(self->device, &driver->region); + region_teardown(self->iommu, &self->memcpy_region); + region_teardown(self->iommu, &driver->region); iova_allocator_cleanup(self->iova_allocator); vfio_pci_device_cleanup(self->device); + iommu_cleanup(self->iommu); } TEST_F(vfio_pci_driver_test, init_remove) @@ -231,18 +233,31 @@ TEST_F_TIMEOUT(vfio_pci_driver_test, memcpy_storm, 60) ASSERT_NO_MSI(self->msi_fd); } -int main(int argc, char *argv[]) +static bool device_has_selftests_driver(const char *bdf) { struct vfio_pci_device *device; + struct iommu *iommu; + bool has_driver; + + iommu = iommu_init(default_iommu_mode); + device = vfio_pci_device_init(device_bdf, iommu); + + has_driver = !!device->driver.ops; + + vfio_pci_device_cleanup(device); + iommu_cleanup(iommu); + return has_driver; +} + +int main(int argc, char *argv[]) +{ device_bdf = vfio_selftests_get_bdf(&argc, argv); - device = vfio_pci_device_init(device_bdf, default_iommu_mode); - if (!device->driver.ops) { + if (!device_has_selftests_driver(device_bdf)) { fprintf(stderr, "No driver found for device %s\n", device_bdf); return KSFT_SKIP; } - vfio_pci_device_cleanup(device); return test_harness_run(argc, argv); } diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index 656e1c75b711..93d21bc7e112 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -48,6 +48,8 @@ static struct anon_vma dummy_anon_vma; #define ASSERT_EQ(_val1, _val2) ASSERT_TRUE((_val1) == (_val2)) #define ASSERT_NE(_val1, _val2) ASSERT_TRUE((_val1) != (_val2)) +#define IS_SET(_val, _flags) ((_val & _flags) == _flags) + static struct task_struct __current; struct task_struct *get_current(void) @@ -67,18 +69,18 @@ static struct vm_area_struct *alloc_vma(struct mm_struct *mm, pgoff_t pgoff, vm_flags_t vm_flags) { - struct vm_area_struct *ret = vm_area_alloc(mm); + struct vm_area_struct *vma = vm_area_alloc(mm); - if (ret == NULL) + if (vma == NULL) return NULL; - ret->vm_start = start; - ret->vm_end = end; - ret->vm_pgoff = pgoff; - ret->__vm_flags = vm_flags; - vma_assert_detached(ret); + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + vm_flags_reset(vma, vm_flags); + vma_assert_detached(vma); - return ret; + return vma; } /* Helper function to allocate a VMA and link it to the tree. */ @@ -339,6 +341,7 @@ static bool test_simple_modify(void) struct mm_struct mm = {}; struct vm_area_struct *init_vma = alloc_vma(&mm, 0, 0x3000, 0, vm_flags); VMA_ITERATOR(vmi, &mm, 0x1000); + vm_flags_t flags = VM_READ | VM_MAYREAD; ASSERT_FALSE(attach_vma(&mm, init_vma)); @@ -347,7 +350,7 @@ static bool test_simple_modify(void) * performs the merge/split only. */ vma = vma_modify_flags(&vmi, init_vma, init_vma, - 0x1000, 0x2000, VM_READ | VM_MAYREAD); + 0x1000, 0x2000, &flags); ASSERT_NE(vma, NULL); /* We modify the provided VMA, and on split allocate new VMAs. 
*/ ASSERT_EQ(vma, init_vma); @@ -441,7 +444,7 @@ static bool test_simple_shrink(void) return true; } -static bool test_merge_new(void) +static bool __test_merge_new(bool is_sticky, bool a_is_sticky, bool b_is_sticky, bool c_is_sticky) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; struct mm_struct mm = {}; @@ -469,23 +472,32 @@ static bool test_merge_new(void) struct vm_area_struct *vma, *vma_a, *vma_b, *vma_c, *vma_d; bool merged; + if (is_sticky) + vm_flags |= VM_STICKY; + /* * 0123456789abc * AA B CC */ vma_a = alloc_and_link_vma(&mm, 0, 0x2000, 0, vm_flags); ASSERT_NE(vma_a, NULL); + if (a_is_sticky) + vm_flags_set(vma_a, VM_STICKY); /* We give each VMA a single avc so we can test anon_vma duplication. */ INIT_LIST_HEAD(&vma_a->anon_vma_chain); list_add(&dummy_anon_vma_chain_a.same_vma, &vma_a->anon_vma_chain); vma_b = alloc_and_link_vma(&mm, 0x3000, 0x4000, 3, vm_flags); ASSERT_NE(vma_b, NULL); + if (b_is_sticky) + vm_flags_set(vma_b, VM_STICKY); INIT_LIST_HEAD(&vma_b->anon_vma_chain); list_add(&dummy_anon_vma_chain_b.same_vma, &vma_b->anon_vma_chain); vma_c = alloc_and_link_vma(&mm, 0xb000, 0xc000, 0xb, vm_flags); ASSERT_NE(vma_c, NULL); + if (c_is_sticky) + vm_flags_set(vma_c, VM_STICKY); INIT_LIST_HEAD(&vma_c->anon_vma_chain); list_add(&dummy_anon_vma_chain_c.same_vma, &vma_c->anon_vma_chain); @@ -520,6 +532,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); + if (is_sticky || a_is_sticky || b_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge to PREVIOUS VMA. @@ -537,6 +551,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); + if (is_sticky || a_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge to NEXT VMA. @@ -556,6 +572,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 3); + if (is_sticky) /* D uses is_sticky. */ + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge BOTH sides. @@ -574,6 +592,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); + if (is_sticky || a_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge to NEXT VMA. @@ -592,6 +612,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); + if (is_sticky || c_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Merge BOTH sides. @@ -609,6 +631,8 @@ static bool test_merge_new(void) ASSERT_EQ(vma->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 1); + if (is_sticky || a_is_sticky || c_is_sticky) + ASSERT_TRUE(IS_SET(vma->vm_flags, VM_STICKY)); /* * Final state. @@ -637,6 +661,20 @@ static bool test_merge_new(void) return true; } +static bool test_merge_new(void) +{ + int i, j, k, l; + + /* Generate every possible permutation of sticky flags. 
*/ + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + for (l = 0; l < 2; l++) + ASSERT_TRUE(__test_merge_new(i, j, k, l)); + + return true; +} + static bool test_vma_merge_special_flags(void) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; @@ -676,7 +714,7 @@ static bool test_vma_merge_special_flags(void) for (i = 0; i < ARRAY_SIZE(special_flags); i++) { vm_flags_t special_flag = special_flags[i]; - vma_left->__vm_flags = vm_flags | special_flag; + vm_flags_reset(vma_left, vm_flags | special_flag); vmg.vm_flags = vm_flags | special_flag; vma = merge_new(&vmg); ASSERT_EQ(vma, NULL); @@ -698,7 +736,7 @@ static bool test_vma_merge_special_flags(void) for (i = 0; i < ARRAY_SIZE(special_flags); i++) { vm_flags_t special_flag = special_flags[i]; - vma_left->__vm_flags = vm_flags | special_flag; + vm_flags_reset(vma_left, vm_flags | special_flag); vmg.vm_flags = vm_flags | special_flag; vma = merge_existing(&vmg); ASSERT_EQ(vma, NULL); @@ -973,9 +1011,11 @@ static bool test_vma_merge_new_with_close(void) return true; } -static bool test_merge_existing(void) +static bool __test_merge_existing(bool prev_is_sticky, bool middle_is_sticky, bool next_is_sticky) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; + vm_flags_t prev_flags = vm_flags; + vm_flags_t next_flags = vm_flags; struct mm_struct mm = {}; VMA_ITERATOR(vmi, &mm, 0); struct vm_area_struct *vma, *vma_prev, *vma_next; @@ -988,6 +1028,13 @@ static bool test_merge_existing(void) }; struct anon_vma_chain avc = {}; + if (prev_is_sticky) + prev_flags |= VM_STICKY; + if (middle_is_sticky) + vm_flags |= VM_STICKY; + if (next_is_sticky) + next_flags |= VM_STICKY; + /* * Merge right case - partial span. * @@ -1000,7 +1047,7 @@ static bool test_merge_existing(void) */ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags); vma->vm_ops = &vm_ops; /* This should have no impact. */ - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ vmg_set_range_anon_vma(&vmg, 0x3000, 0x6000, 3, vm_flags, &dummy_anon_vma); vmg.middle = vma; @@ -1018,6 +1065,8 @@ static bool test_merge_existing(void) ASSERT_TRUE(vma_write_started(vma)); ASSERT_TRUE(vma_write_started(vma_next)); ASSERT_EQ(mm.map_count, 2); + if (middle_is_sticky || next_is_sticky) + ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY)); /* Clear down and reset. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); @@ -1033,7 +1082,7 @@ static bool test_merge_existing(void) * NNNNNNN */ vma = alloc_and_link_vma(&mm, 0x2000, 0x6000, 2, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x6000, 0x9000, 6, next_flags); vma_next->vm_ops = &vm_ops; /* This should have no impact. */ vmg_set_range_anon_vma(&vmg, 0x2000, 0x6000, 2, vm_flags, &dummy_anon_vma); vmg.middle = vma; @@ -1046,6 +1095,8 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_next->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_next)); ASSERT_EQ(mm.map_count, 1); + if (middle_is_sticky || next_is_sticky) + ASSERT_TRUE(IS_SET(vma_next->vm_flags, VM_STICKY)); /* Clear down and reset. We should have deleted vma. 
*/ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1060,7 +1111,7 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPV */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); vma->vm_ops = &vm_ops; /* This should have no impact. */ @@ -1080,6 +1131,8 @@ static bool test_merge_existing(void) ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_TRUE(vma_write_started(vma)); ASSERT_EQ(mm.map_count, 2); + if (prev_is_sticky || middle_is_sticky) + ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); /* Clear down and reset. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 2); @@ -1094,7 +1147,7 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPP */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma); @@ -1109,6 +1162,8 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_EQ(mm.map_count, 1); + if (prev_is_sticky || middle_is_sticky) + ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); /* Clear down and reset. We should have deleted vma. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1123,10 +1178,10 @@ static bool test_merge_existing(void) * 0123456789 * PPPPPPPPPP */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma_prev->vm_ops = &vm_ops; /* This should have no impact. */ vma = alloc_and_link_vma(&mm, 0x3000, 0x7000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x7000, 0x9000, 7, next_flags); vmg_set_range_anon_vma(&vmg, 0x3000, 0x7000, 3, vm_flags, &dummy_anon_vma); vmg.prev = vma_prev; vmg.middle = vma; @@ -1139,6 +1194,8 @@ static bool test_merge_existing(void) ASSERT_EQ(vma_prev->anon_vma, &dummy_anon_vma); ASSERT_TRUE(vma_write_started(vma_prev)); ASSERT_EQ(mm.map_count, 1); + if (prev_is_sticky || middle_is_sticky || next_is_sticky) + ASSERT_TRUE(IS_SET(vma_prev->vm_flags, VM_STICKY)); /* Clear down and reset. We should have deleted prev and next. */ ASSERT_EQ(cleanup_mm(&mm, &vmi), 1); @@ -1158,9 +1215,9 @@ static bool test_merge_existing(void) * PPPVVVVVNNN */ - vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, vm_flags); + vma_prev = alloc_and_link_vma(&mm, 0, 0x3000, 0, prev_flags); vma = alloc_and_link_vma(&mm, 0x3000, 0x8000, 3, vm_flags); - vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, vm_flags); + vma_next = alloc_and_link_vma(&mm, 0x8000, 0xa000, 8, next_flags); vmg_set_range(&vmg, 0x4000, 0x5000, 4, vm_flags); vmg.prev = vma; @@ -1203,6 +1260,19 @@ static bool test_merge_existing(void) return true; } +static bool test_merge_existing(void) +{ + int i, j, k; + + /* Generate every possible permutation of sticky flags. 
*/ + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + ASSERT_TRUE(__test_merge_existing(i, j, k)); + + return true; +} + static bool test_anon_vma_non_mergeable(void) { vm_flags_t vm_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE; diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index dc976a285ad2..9f0a9f5ed0fe 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -46,42 +46,272 @@ extern unsigned long dac_mmap_min_addr; #define MMF_HAS_MDWE 28 +/* + * vm_flags in vm_area_struct, see mm_types.h. + * When changing, update also include/trace/events/mmflags.h + */ + #define VM_NONE 0x00000000 -#define VM_READ 0x00000001 -#define VM_WRITE 0x00000002 -#define VM_EXEC 0x00000004 -#define VM_SHARED 0x00000008 -#define VM_MAYREAD 0x00000010 -#define VM_MAYWRITE 0x00000020 -#define VM_MAYEXEC 0x00000040 -#define VM_GROWSDOWN 0x00000100 -#define VM_PFNMAP 0x00000400 -#define VM_LOCKED 0x00002000 -#define VM_IO 0x00004000 -#define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ -#define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ -#define VM_DONTEXPAND 0x00040000 -#define VM_LOCKONFAULT 0x00080000 -#define VM_ACCOUNT 0x00100000 -#define VM_NORESERVE 0x00200000 -#define VM_MIXEDMAP 0x10000000 -#define VM_STACK VM_GROWSDOWN -#define VM_SHADOW_STACK VM_NONE -#define VM_SOFTDIRTY 0 -#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ -#define VM_GROWSUP VM_NONE -#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/** + * typedef vma_flag_t - specifies an individual VMA flag by bit number. + * + * This value is made type safe by sparse to avoid passing invalid flag values + * around. + */ +typedef int __bitwise vma_flag_t; +#define DECLARE_VMA_BIT(name, bitnum) \ + VMA_ ## name ## _BIT = ((__force vma_flag_t)bitnum) +#define DECLARE_VMA_BIT_ALIAS(name, aliased) \ + VMA_ ## name ## _BIT = VMA_ ## aliased ## _BIT +enum { + DECLARE_VMA_BIT(READ, 0), + DECLARE_VMA_BIT(WRITE, 1), + DECLARE_VMA_BIT(EXEC, 2), + DECLARE_VMA_BIT(SHARED, 3), + /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ + DECLARE_VMA_BIT(MAYREAD, 4), /* limits for mprotect() etc. 
*/ + DECLARE_VMA_BIT(MAYWRITE, 5), + DECLARE_VMA_BIT(MAYEXEC, 6), + DECLARE_VMA_BIT(MAYSHARE, 7), + DECLARE_VMA_BIT(GROWSDOWN, 8), /* general info on the segment */ +#ifdef CONFIG_MMU + DECLARE_VMA_BIT(UFFD_MISSING, 9),/* missing pages tracking */ +#else + /* nommu: R/O MAP_PRIVATE mapping that might overlay a file mapping */ + DECLARE_VMA_BIT(MAYOVERLAY, 9), +#endif /* CONFIG_MMU */ + /* Page-ranges managed without "struct page", just pure PFN */ + DECLARE_VMA_BIT(PFNMAP, 10), + DECLARE_VMA_BIT(MAYBE_GUARD, 11), + DECLARE_VMA_BIT(UFFD_WP, 12), /* wrprotect pages tracking */ + DECLARE_VMA_BIT(LOCKED, 13), + DECLARE_VMA_BIT(IO, 14), /* Memory mapped I/O or similar */ + DECLARE_VMA_BIT(SEQ_READ, 15), /* App will access data sequentially */ + DECLARE_VMA_BIT(RAND_READ, 16), /* App will not benefit from clustered reads */ + DECLARE_VMA_BIT(DONTCOPY, 17), /* Do not copy this vma on fork */ + DECLARE_VMA_BIT(DONTEXPAND, 18),/* Cannot expand with mremap() */ + DECLARE_VMA_BIT(LOCKONFAULT, 19),/* Lock pages covered when faulted in */ + DECLARE_VMA_BIT(ACCOUNT, 20), /* Is a VM accounted object */ + DECLARE_VMA_BIT(NORESERVE, 21), /* should the VM suppress accounting */ + DECLARE_VMA_BIT(HUGETLB, 22), /* Huge TLB Page VM */ + DECLARE_VMA_BIT(SYNC, 23), /* Synchronous page faults */ + DECLARE_VMA_BIT(ARCH_1, 24), /* Architecture-specific flag */ + DECLARE_VMA_BIT(WIPEONFORK, 25),/* Wipe VMA contents in child. */ + DECLARE_VMA_BIT(DONTDUMP, 26), /* Do not include in the core dump */ + DECLARE_VMA_BIT(SOFTDIRTY, 27), /* NOT soft dirty clean area */ + DECLARE_VMA_BIT(MIXEDMAP, 28), /* Can contain struct page and pure PFN pages */ + DECLARE_VMA_BIT(HUGEPAGE, 29), /* MADV_HUGEPAGE marked this vma */ + DECLARE_VMA_BIT(NOHUGEPAGE, 30),/* MADV_NOHUGEPAGE marked this vma */ + DECLARE_VMA_BIT(MERGEABLE, 31), /* KSM may merge identical pages */ + /* These bits are reused, we define specific uses below. */ + DECLARE_VMA_BIT(HIGH_ARCH_0, 32), + DECLARE_VMA_BIT(HIGH_ARCH_1, 33), + DECLARE_VMA_BIT(HIGH_ARCH_2, 34), + DECLARE_VMA_BIT(HIGH_ARCH_3, 35), + DECLARE_VMA_BIT(HIGH_ARCH_4, 36), + DECLARE_VMA_BIT(HIGH_ARCH_5, 37), + DECLARE_VMA_BIT(HIGH_ARCH_6, 38), + /* + * This flag is used to connect VFIO to arch specific KVM code. It + * indicates that the memory under this VMA is safe for use with any + * non-cachable memory type inside KVM. Some VFIO devices, on some + * platforms, are thought to be unsafe and can cause machine crashes + * if KVM does not lock down the memory type. + */ + DECLARE_VMA_BIT(ALLOW_ANY_UNCACHED, 39), +#ifdef CONFIG_PPC32 + DECLARE_VMA_BIT_ALIAS(DROPPABLE, ARCH_1), +#else + DECLARE_VMA_BIT(DROPPABLE, 40), +#endif + DECLARE_VMA_BIT(UFFD_MINOR, 41), + DECLARE_VMA_BIT(SEALED, 42), + /* Flags that reuse flags above. */ + DECLARE_VMA_BIT_ALIAS(PKEY_BIT0, HIGH_ARCH_0), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT1, HIGH_ARCH_1), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT2, HIGH_ARCH_2), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT3, HIGH_ARCH_3), + DECLARE_VMA_BIT_ALIAS(PKEY_BIT4, HIGH_ARCH_4), +#if defined(CONFIG_X86_USER_SHADOW_STACK) + /* + * VM_SHADOW_STACK should not be set with VM_SHARED because of lack of + * support core mm. + * + * These VMAs will get a single end guard page. This helps userspace + * protect itself from attacks. A single page is enough for current + * shadow stack archs (x86). See the comments near alloc_shstk() in + * arch/x86/kernel/shstk.c for more details on the guard size. 
+ */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_5), +#elif defined(CONFIG_ARM64_GCS) + /* + * arm64's Guarded Control Stack implements similar functionality and + * has similar constraints to shadow stacks. + */ + DECLARE_VMA_BIT_ALIAS(SHADOW_STACK, HIGH_ARCH_6), +#endif + DECLARE_VMA_BIT_ALIAS(SAO, ARCH_1), /* Strong Access Ordering (powerpc) */ + DECLARE_VMA_BIT_ALIAS(GROWSUP, ARCH_1), /* parisc */ + DECLARE_VMA_BIT_ALIAS(SPARC_ADI, ARCH_1), /* sparc64 */ + DECLARE_VMA_BIT_ALIAS(ARM64_BTI, ARCH_1), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(ARCH_CLEAR, ARCH_1), /* sparc64, arm64 */ + DECLARE_VMA_BIT_ALIAS(MAPPED_COPY, ARCH_1), /* !CONFIG_MMU */ + DECLARE_VMA_BIT_ALIAS(MTE, HIGH_ARCH_4), /* arm64 */ + DECLARE_VMA_BIT_ALIAS(MTE_ALLOWED, HIGH_ARCH_5),/* arm64 */ #ifdef CONFIG_STACK_GROWSUP -#define VM_STACK VM_GROWSUP -#define VM_STACK_EARLY VM_GROWSDOWN + DECLARE_VMA_BIT_ALIAS(STACK, GROWSUP), + DECLARE_VMA_BIT_ALIAS(STACK_EARLY, GROWSDOWN), +#else + DECLARE_VMA_BIT_ALIAS(STACK, GROWSDOWN), +#endif +}; + +#define INIT_VM_FLAG(name) BIT((__force int) VMA_ ## name ## _BIT) +#define VM_READ INIT_VM_FLAG(READ) +#define VM_WRITE INIT_VM_FLAG(WRITE) +#define VM_EXEC INIT_VM_FLAG(EXEC) +#define VM_SHARED INIT_VM_FLAG(SHARED) +#define VM_MAYREAD INIT_VM_FLAG(MAYREAD) +#define VM_MAYWRITE INIT_VM_FLAG(MAYWRITE) +#define VM_MAYEXEC INIT_VM_FLAG(MAYEXEC) +#define VM_MAYSHARE INIT_VM_FLAG(MAYSHARE) +#define VM_GROWSDOWN INIT_VM_FLAG(GROWSDOWN) +#ifdef CONFIG_MMU +#define VM_UFFD_MISSING INIT_VM_FLAG(UFFD_MISSING) +#else +#define VM_UFFD_MISSING VM_NONE +#define VM_MAYOVERLAY INIT_VM_FLAG(MAYOVERLAY) +#endif +#define VM_PFNMAP INIT_VM_FLAG(PFNMAP) +#define VM_MAYBE_GUARD INIT_VM_FLAG(MAYBE_GUARD) +#define VM_UFFD_WP INIT_VM_FLAG(UFFD_WP) +#define VM_LOCKED INIT_VM_FLAG(LOCKED) +#define VM_IO INIT_VM_FLAG(IO) +#define VM_SEQ_READ INIT_VM_FLAG(SEQ_READ) +#define VM_RAND_READ INIT_VM_FLAG(RAND_READ) +#define VM_DONTCOPY INIT_VM_FLAG(DONTCOPY) +#define VM_DONTEXPAND INIT_VM_FLAG(DONTEXPAND) +#define VM_LOCKONFAULT INIT_VM_FLAG(LOCKONFAULT) +#define VM_ACCOUNT INIT_VM_FLAG(ACCOUNT) +#define VM_NORESERVE INIT_VM_FLAG(NORESERVE) +#define VM_HUGETLB INIT_VM_FLAG(HUGETLB) +#define VM_SYNC INIT_VM_FLAG(SYNC) +#define VM_ARCH_1 INIT_VM_FLAG(ARCH_1) +#define VM_WIPEONFORK INIT_VM_FLAG(WIPEONFORK) +#define VM_DONTDUMP INIT_VM_FLAG(DONTDUMP) +#ifdef CONFIG_MEM_SOFT_DIRTY +#define VM_SOFTDIRTY INIT_VM_FLAG(SOFTDIRTY) +#else +#define VM_SOFTDIRTY VM_NONE +#endif +#define VM_MIXEDMAP INIT_VM_FLAG(MIXEDMAP) +#define VM_HUGEPAGE INIT_VM_FLAG(HUGEPAGE) +#define VM_NOHUGEPAGE INIT_VM_FLAG(NOHUGEPAGE) +#define VM_MERGEABLE INIT_VM_FLAG(MERGEABLE) +#define VM_STACK INIT_VM_FLAG(STACK) +#ifdef CONFIG_STACK_GROWS_UP +#define VM_STACK_EARLY INIT_VM_FLAG(STACK_EARLY) +#else +#define VM_STACK_EARLY VM_NONE +#endif +#ifdef CONFIG_ARCH_HAS_PKEYS +#define VM_PKEY_SHIFT ((__force int)VMA_HIGH_ARCH_0_BIT) +/* Despite the naming, these are FLAGS not bits. 
*/ +#define VM_PKEY_BIT0 INIT_VM_FLAG(PKEY_BIT0) +#define VM_PKEY_BIT1 INIT_VM_FLAG(PKEY_BIT1) +#define VM_PKEY_BIT2 INIT_VM_FLAG(PKEY_BIT2) +#if CONFIG_ARCH_PKEY_BITS > 3 +#define VM_PKEY_BIT3 INIT_VM_FLAG(PKEY_BIT3) +#else +#define VM_PKEY_BIT3 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 3 */ +#if CONFIG_ARCH_PKEY_BITS > 4 +#define VM_PKEY_BIT4 INIT_VM_FLAG(PKEY_BIT4) +#else +#define VM_PKEY_BIT4 VM_NONE +#endif /* CONFIG_ARCH_PKEY_BITS > 4 */ +#endif /* CONFIG_ARCH_HAS_PKEYS */ +#if defined(CONFIG_X86_USER_SHADOW_STACK) || defined(CONFIG_ARM64_GCS) +#define VM_SHADOW_STACK INIT_VM_FLAG(SHADOW_STACK) +#else +#define VM_SHADOW_STACK VM_NONE +#endif +#if defined(CONFIG_PPC64) +#define VM_SAO INIT_VM_FLAG(SAO) +#elif defined(CONFIG_PARISC) +#define VM_GROWSUP INIT_VM_FLAG(GROWSUP) +#elif defined(CONFIG_SPARC64) +#define VM_SPARC_ADI INIT_VM_FLAG(SPARC_ADI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif defined(CONFIG_ARM64) +#define VM_ARM64_BTI INIT_VM_FLAG(ARM64_BTI) +#define VM_ARCH_CLEAR INIT_VM_FLAG(ARCH_CLEAR) +#elif !defined(CONFIG_MMU) +#define VM_MAPPED_COPY INIT_VM_FLAG(MAPPED_COPY) +#endif +#ifndef VM_GROWSUP +#define VM_GROWSUP VM_NONE +#endif +#ifdef CONFIG_ARM64_MTE +#define VM_MTE INIT_VM_FLAG(MTE) +#define VM_MTE_ALLOWED INIT_VM_FLAG(MTE_ALLOWED) +#else +#define VM_MTE VM_NONE +#define VM_MTE_ALLOWED VM_NONE +#endif +#ifdef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR +#define VM_UFFD_MINOR INIT_VM_FLAG(UFFD_MINOR) #else -#define VM_STACK VM_GROWSDOWN -#define VM_STACK_EARLY 0 +#define VM_UFFD_MINOR VM_NONE +#endif +#ifdef CONFIG_64BIT +#define VM_ALLOW_ANY_UNCACHED INIT_VM_FLAG(ALLOW_ANY_UNCACHED) +#define VM_SEALED INIT_VM_FLAG(SEALED) +#else +#define VM_ALLOW_ANY_UNCACHED VM_NONE +#define VM_SEALED VM_NONE +#endif +#if defined(CONFIG_64BIT) || defined(CONFIG_PPC32) +#define VM_DROPPABLE INIT_VM_FLAG(DROPPABLE) +#else +#define VM_DROPPABLE VM_NONE +#endif + +/* Bits set in the VMA until the stack is in its final location */ +#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) + +#define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) + +/* Common data flag combinations */ +#define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ + VM_MAYWRITE | VM_MAYEXEC) +#define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ +#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC +#endif + +#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ +#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif +#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) + +#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) + +/* VMA basic access permission flags */ +#define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) + +/* + * Special vmas that are non-mergable, non-mlock()able. 
+ */ +#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) + #define DEFAULT_MAP_WINDOW ((1UL << 47) - PAGE_SIZE) #define TASK_SIZE_LOW DEFAULT_MAP_WINDOW #define TASK_SIZE_MAX DEFAULT_MAP_WINDOW @@ -96,25 +326,58 @@ extern unsigned long dac_mmap_min_addr; #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) -#define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_TSK_EXEC - -#define VM_STARTGAP_FLAGS (VM_GROWSDOWN | VM_SHADOW_STACK) - -#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS -#define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) -#define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ | VM_STACK_EARLY) - #define RLIMIT_STACK 3 /* max stack size */ #define RLIMIT_MEMLOCK 8 /* max locked-in-memory address space */ #define CAP_IPC_LOCK 14 -#ifdef CONFIG_64BIT -#define VM_SEALED_BIT 42 -#define VM_SEALED BIT(VM_SEALED_BIT) -#else -#define VM_SEALED VM_NONE -#endif +/* + * Flags which should be 'sticky' on merge - that is, flags which, when one VMA + * possesses it but the other does not, the merged VMA should nonetheless have + * applied to it: + * + * VM_SOFTDIRTY - if a VMA is marked soft-dirty, that is has not had its + * references cleared via /proc/$pid/clear_refs, any merged VMA + * should be considered soft-dirty also as it operates at a VMA + * granularity. + */ +#define VM_STICKY (VM_SOFTDIRTY | VM_MAYBE_GUARD) + +/* + * VMA flags we ignore for the purposes of merge, i.e. one VMA possessing one + * of these flags and the other not does not preclude a merge. + * + * VM_STICKY - When merging VMAs, VMA flags must match, unless they are + * 'sticky'. If any sticky flags exist in either VMA, we simply + * set all of them on the merged VMA. + */ +#define VM_IGNORE_MERGE VM_STICKY + +/* + * Flags which should result in page tables being copied on fork. These are + * flags which indicate that the VMA maps page tables which cannot be + * reconsistuted upon page fault, so necessitate page table copying upon + * + * VM_PFNMAP / VM_MIXEDMAP - These contain kernel-mapped data which cannot be + * reasonably reconstructed on page fault. + * + * VM_UFFD_WP - Encodes metadata about an installed uffd + * write protect handler, which cannot be + * reconstructed on page fault. + * + * We always copy pgtables when dst_vma has uffd-wp + * enabled even if it's file-backed + * (e.g. shmem). Because when uffd-wp is enabled, + * pgtable contains uffd-wp protection information, + * that's something we can't retrieve from page cache, + * and skip copying will lose those info. + * + * VM_MAYBE_GUARD - Could contain page guard region markers which + * by design are a property of the page tables + * only and thus cannot be reconstructed on page + * fault. + */ +#define VM_COPY_ON_FORK (VM_PFNMAP | VM_MIXEDMAP | VM_UFFD_WP | VM_MAYBE_GUARD) #define FIRST_USER_ADDRESS 0UL #define USER_PGTABLES_CEILING 0UL @@ -163,6 +426,8 @@ typedef __bitwise unsigned int vm_fault_t; #define ASSERT_EXCLUSIVE_WRITER(x) +#define pgtable_supports_soft_dirty() 1 + /** * swap - swap values of @a and @b * @a: first value @@ -259,6 +524,15 @@ typedef struct { __private DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); } mm_flags_t; +/* + * Opaque type representing current VMA (vm_area_struct) flag state. Must be + * accessed via vma_flags_xxx() helper functions. 
+ */ +#define NUM_VMA_FLAG_BITS BITS_PER_LONG +typedef struct { + DECLARE_BITMAP(__vma_flags, NUM_VMA_FLAG_BITS); +} __private vma_flags_t; + struct mm_struct { struct maple_tree mm_mt; int map_count; /* number of VMAs */ @@ -275,6 +549,57 @@ struct mm_struct { struct vm_area_struct; + +/* What action should be taken after an .mmap_prepare call is complete? */ +enum mmap_action_type { + MMAP_NOTHING, /* Mapping is complete, no further action. */ + MMAP_REMAP_PFN, /* Remap PFN range. */ + MMAP_IO_REMAP_PFN, /* I/O remap PFN range. */ +}; + +/* + * Describes an action an mmap_prepare hook can instruct to be taken to complete + * the mapping of a VMA. Specified in vm_area_desc. + */ +struct mmap_action { + union { + /* Remap range. */ + struct { + unsigned long start; + unsigned long start_pfn; + unsigned long size; + pgprot_t pgprot; + } remap; + }; + enum mmap_action_type type; + + /* + * If specified, this hook is invoked after the selected action has been + * successfully completed. Note that the VMA write lock still held. + * + * The absolute minimum ought to be done here. + * + * Returns 0 on success, or an error code. + */ + int (*success_hook)(const struct vm_area_struct *vma); + + /* + * If specified, this hook is invoked when an error occurred when + * attempting the selection action. + * + * The hook can return an error code in order to filter the error, but + * it is not valid to clear the error here. + */ + int (*error_hook)(int err); + + /* + * This should be set in rare instances where the operation required + * that the rmap should not be able to access the VMA until + * completely set up. + */ + bool hide_from_rmap_until_complete :1; +}; + /* * Describes a VMA that is about to be mmap()'ed. Drivers may choose to * manipulate mutable fields which will cause those fields to be updated in the @@ -292,12 +617,18 @@ struct vm_area_desc { /* Mutable fields. Populated with initial state. */ pgoff_t pgoff; struct file *vm_file; - vm_flags_t vm_flags; + union { + vm_flags_t vm_flags; + vma_flags_t vma_flags; + }; pgprot_t page_prot; /* Write-only fields. */ const struct vm_operations_struct *vm_ops; void *private_data; + + /* Take further action? */ + struct mmap_action action; }; struct file_operations { @@ -335,7 +666,7 @@ struct vm_area_struct { */ union { const vm_flags_t vm_flags; - vm_flags_t __private __vm_flags; + vma_flags_t flags; }; #ifdef CONFIG_PER_VMA_LOCK @@ -794,8 +1125,7 @@ static inline void update_hiwater_vm(struct mm_struct *mm) static inline void unmap_vmas(struct mmu_gather *tlb, struct ma_state *mas, struct vm_area_struct *vma, unsigned long start_addr, - unsigned long end_addr, unsigned long tree_end, - bool mm_wr_locked) + unsigned long end_addr, unsigned long tree_end) { } @@ -844,6 +1174,14 @@ static inline void vma_start_write(struct vm_area_struct *vma) vma->vm_lock_seq++; } +static inline __must_check +int vma_start_write_killable(struct vm_area_struct *vma) +{ + /* Used to indicate to tests that a write operation has begun. 
*/ + vma->vm_lock_seq++; + return 0; +} + static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -1042,26 +1380,6 @@ static inline bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, return true; } -static inline void vm_flags_init(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma->__vm_flags = flags; -} - -static inline void vm_flags_set(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_start_write(vma); - vma->__vm_flags |= flags; -} - -static inline void vm_flags_clear(struct vm_area_struct *vma, - vm_flags_t flags) -{ - vma_start_write(vma); - vma->__vm_flags &= ~flags; -} - static inline int shmem_zero_setup(struct vm_area_struct *vma) { return 0; @@ -1218,13 +1536,118 @@ static inline void userfaultfd_unmap_complete(struct mm_struct *mm, { } -# define ACCESS_PRIVATE(p, member) ((p)->member) +#define ACCESS_PRIVATE(p, member) ((p)->member) + +#define bitmap_size(nbits) (ALIGN(nbits, BITS_PER_LONG) / BITS_PER_BYTE) + +static __always_inline void bitmap_zero(unsigned long *dst, unsigned int nbits) +{ + unsigned int len = bitmap_size(nbits); + + if (small_const_nbits(nbits)) + *dst = 0; + else + memset(dst, 0, len); +} static inline bool mm_flags_test(int flag, const struct mm_struct *mm) { return test_bit(flag, ACCESS_PRIVATE(&mm->flags, __mm_flags)); } +/* Clears all bits in the VMA flags bitmap, non-atomically. */ +static inline void vma_flags_clear_all(vma_flags_t *flags) +{ + bitmap_zero(ACCESS_PRIVATE(flags, __vma_flags), NUM_VMA_FLAG_BITS); +} + +/* + * Copy value to the first system word of VMA flags, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word(vma_flags_t *flags, unsigned long value) +{ + *ACCESS_PRIVATE(flags, __vma_flags) = value; +} + +/* + * Copy value to the first system word of VMA flags ONCE, non-atomically. + * + * IMPORTANT: This does not overwrite bytes past the first system word. The + * caller must account for this. + */ +static inline void vma_flags_overwrite_word_once(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + WRITE_ONCE(*bitmap, value); +} + +/* Update the first system word of VMA flags setting bits, non-atomically. */ +static inline void vma_flags_set_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap |= value; +} + +/* Update the first system word of VMA flags clearing bits, non-atomically. */ +static inline void vma_flags_clear_word(vma_flags_t *flags, unsigned long value) +{ + unsigned long *bitmap = ACCESS_PRIVATE(flags, __vma_flags); + + *bitmap &= ~value; +} + + +/* Use when VMA is not part of the VMA tree and needs no locking */ +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word(&vma->flags, flags); +} + +/* + * Use when VMA is part of the VMA tree and modifications need coordination + * Note: vm_flags_reset and vm_flags_reset_once do not lock the vma and + * it should be locked explicitly beforehand. 
+ */ +static inline void vm_flags_reset(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + vm_flags_init(vma, flags); +} + +static inline void vm_flags_reset_once(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_assert_write_locked(vma); + /* + * The user should only be interested in avoiding reordering of + * assignment to the first word. + */ + vma_flags_clear_all(&vma->flags); + vma_flags_overwrite_word_once(&vma->flags, flags); +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_set_word(&vma->flags, flags); +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma_flags_clear_word(&vma->flags, flags); +} + /* * Denies creating a writable executable mapping or gaining executable permissions. * @@ -1326,12 +1749,23 @@ static inline void free_anon_vma_name(struct vm_area_struct *vma) static inline void set_vma_from_desc(struct vm_area_struct *vma, struct vm_area_desc *desc); -static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, +static inline void mmap_action_prepare(struct mmap_action *action, + struct vm_area_desc *desc) +{ +} + +static inline int mmap_action_complete(struct mmap_action *action, + struct vm_area_struct *vma) +{ + return 0; +} + +static inline int __compat_vma_mmap(const struct file_operations *f_op, struct file *file, struct vm_area_struct *vma) { struct vm_area_desc desc = { .mm = vma->vm_mm, - .file = vma->vm_file, + .file = file, .start = vma->vm_start, .end = vma->vm_end, @@ -1339,21 +1773,24 @@ static inline int __compat_vma_mmap_prepare(const struct file_operations *f_op, .vm_file = vma->vm_file, .vm_flags = vma->vm_flags, .page_prot = vma->vm_page_prot, + + .action.type = MMAP_NOTHING, /* Default */ }; int err; err = f_op->mmap_prepare(&desc); if (err) return err; - set_vma_from_desc(vma, &desc); - return 0; + mmap_action_prepare(&desc.action, &desc); + set_vma_from_desc(vma, &desc); + return mmap_action_complete(&desc.action, vma); } -static inline int compat_vma_mmap_prepare(struct file *file, +static inline int compat_vma_mmap(struct file *file, struct vm_area_struct *vma) { - return __compat_vma_mmap_prepare(file->f_op, file, vma); + return __compat_vma_mmap(file->f_op, file, vma); } /* Did the driver provide valid mmap hook configuration? */ @@ -1374,7 +1811,7 @@ static inline bool can_mmap_file(struct file *file) static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma) { if (file->f_op->mmap_prepare) - return compat_vma_mmap_prepare(file, vma); + return compat_vma_mmap(file, vma); return file->f_op->mmap(file, vma); } @@ -1407,4 +1844,20 @@ static inline vm_flags_t ksm_vma_flags(const struct mm_struct *mm, return vm_flags; } +static inline void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn) +{ +} + +static inline int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr, + unsigned long pfn, unsigned long size, pgprot_t pgprot) +{ + return 0; +} + +static inline int do_munmap(struct mm_struct *, unsigned long, size_t, + struct list_head *uf) +{ + return 0; +} + #endif /* __MM_VMA_INTERNAL_H */ |
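For orientation, below is a minimal, illustrative sketch (not part of the patch) of how a driver-side .mmap_prepare hook would drive the desc->action plumbing that this test stub mirrors. The my_drv_mmap_prepare() and my_drv_base_pfn() names are invented for the example; the fields used are exactly those of struct mmap_action and struct vm_area_desc above.

static int my_drv_mmap_prepare(struct vm_area_desc *desc)
{
	/* Hypothetical driver helper, not defined in this patch. */
	unsigned long pfn = my_drv_base_pfn(desc->vm_file);

	/*
	 * Rather than remapping here, record the request; the mmap core
	 * (or, in this stub, mmap_action_complete()) acts on it once the
	 * VMA has been fully set up.
	 */
	desc->action.type = MMAP_REMAP_PFN;
	desc->action.remap.start = desc->start;
	desc->action.remap.start_pfn = pfn;
	desc->action.remap.size = desc->end - desc->start;
	desc->action.remap.pgprot = desc->page_prot;

	return 0;
}

Hooks that need no follow-up work can leave the MMAP_NOTHING default that __compat_vma_mmap() sets, in which case the stubbed mmap_action_complete() simply returns 0.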

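Similarly, a brief sketch (again invented for illustration, with example_update_flags() as a made-up caller) of how the vm_flags_*() wrappers compose with the vma_flags_*() bitmap helpers defined in this stub:

static void example_update_flags(struct vm_area_struct *vma, vm_flags_t initial)
{
	/* Detached VMA: zero the whole bitmap, then write the first word. */
	vm_flags_init(vma, initial);

	/*
	 * In-tree VMA: these call vma_start_write() before updating the
	 * first word via vma_flags_set_word()/vma_flags_clear_word().
	 */
	vm_flags_set(vma, VM_SOFTDIRTY);
	vm_flags_clear(vma, VM_WRITE);
}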