From 9bbce32a20d6a72c767a7f85fd6127babd1410ac Mon Sep 17 00:00:00 2001 From: Mans Rullgard Date: Thu, 28 Jan 2021 15:56:44 +0000 Subject: ARM: dts: am33xx: add aliases for mmc interfaces Without DT aliases, the numbering of mmc interfaces is unpredictable. Adding them makes it possible to refer to devices consistently. The popular suggestion to use UUIDs obviously doesn't work with a blank device fresh from the factory. See commit fa2d0aa96941 ("mmc: core: Allow setting slot index via device tree alias") for more discussion. Signed-off-by: Mans Rullgard Signed-off-by: Tony Lindgren --- arch/arm/boot/dts/am33xx.dtsi | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/am33xx.dtsi b/arch/arm/boot/dts/am33xx.dtsi index 5b213a1e68bb..5e33d0e88f5b 100644 --- a/arch/arm/boot/dts/am33xx.dtsi +++ b/arch/arm/boot/dts/am33xx.dtsi @@ -40,6 +40,9 @@ ethernet1 = &cpsw_emac1; spi0 = &spi0; spi1 = &spi1; + mmc0 = &mmc1; + mmc1 = &mmc2; + mmc2 = &mmc3; }; cpus { -- cgit v1.2.3 From fbfa463be8dc7957ee4f81556e9e1ea2a951807d Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Wed, 10 Feb 2021 10:53:48 +0200 Subject: ARM: OMAP2+: Fix smartreflex init regression after dropping legacy data When I dropped legacy data for omap4 and dra7 smartreflex in favor of device tree based data, it seems I only testd for the "SmartReflex Class3 initialized" line in dmesg. I missed the fact that there is also omap_devinit_smartreflex() that happens later, and now it produces an error on boot for "No Voltage table for the corresponding vdd. Cannot create debugfs entries for n-values". This happens as we no longer have the smartreflex instance legacy data, and have not yet moved completely to device tree based booting for the driver. Let's fix the issue by changing the smartreflex init to use names. This should all eventually go away in favor of doing the init in the driver based on devicetree compatible value. Note that dra7xx_init_early() is not calling any voltage domain init like omap54xx_voltagedomains_init(), or a dra7 specific voltagedomains init. This means that on dra7 smartreflex is still not fully initialized, and also seems to be missing the related devicetree nodes. Fixes: a6b1e717e942 ("ARM: OMAP2+: Drop legacy platform data for omap4 smartreflex") Fixes: e54740b4afe8 ("ARM: OMAP2+: Drop legacy platform data for dra7 smartreflex") Signed-off-by: Tony Lindgren --- arch/arm/mach-omap2/sr_device.c | 75 +++++++++++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-omap2/sr_device.c b/arch/arm/mach-omap2/sr_device.c index 62df666c2bd0..17b66f0d0dee 100644 --- a/arch/arm/mach-omap2/sr_device.c +++ b/arch/arm/mach-omap2/sr_device.c @@ -88,34 +88,26 @@ static void __init sr_set_nvalues(struct omap_volt_data *volt_data, extern struct omap_sr_data omap_sr_pdata[]; -static int __init sr_dev_init(struct omap_hwmod *oh, void *user) +static int __init sr_init_by_name(const char *name, const char *voltdm) { struct omap_sr_data *sr_data = NULL; struct omap_volt_data *volt_data; - struct omap_smartreflex_dev_attr *sr_dev_attr; static int i; - if (!strncmp(oh->name, "smartreflex_mpu_iva", 20) || - !strncmp(oh->name, "smartreflex_mpu", 16)) + if (!strncmp(name, "smartreflex_mpu_iva", 20) || + !strncmp(name, "smartreflex_mpu", 16)) sr_data = &omap_sr_pdata[OMAP_SR_MPU]; - else if (!strncmp(oh->name, "smartreflex_core", 17)) + else if (!strncmp(name, "smartreflex_core", 17)) sr_data = &omap_sr_pdata[OMAP_SR_CORE]; - else if (!strncmp(oh->name, "smartreflex_iva", 16)) + else if (!strncmp(name, "smartreflex_iva", 16)) sr_data = &omap_sr_pdata[OMAP_SR_IVA]; if (!sr_data) { - pr_err("%s: Unknown instance %s\n", __func__, oh->name); + pr_err("%s: Unknown instance %s\n", __func__, name); return -EINVAL; } - sr_dev_attr = (struct omap_smartreflex_dev_attr *)oh->dev_attr; - if (!sr_dev_attr || !sr_dev_attr->sensor_voltdm_name) { - pr_err("%s: No voltage domain specified for %s. Cannot initialize\n", - __func__, oh->name); - goto exit; - } - - sr_data->name = oh->name; + sr_data->name = name; if (cpu_is_omap343x()) sr_data->ip_type = 1; else @@ -136,10 +128,10 @@ static int __init sr_dev_init(struct omap_hwmod *oh, void *user) } } - sr_data->voltdm = voltdm_lookup(sr_dev_attr->sensor_voltdm_name); + sr_data->voltdm = voltdm_lookup(voltdm); if (!sr_data->voltdm) { pr_err("%s: Unable to get voltage domain pointer for VDD %s\n", - __func__, sr_dev_attr->sensor_voltdm_name); + __func__, voltdm); goto exit; } @@ -160,6 +152,20 @@ exit: return 0; } +static int __init sr_dev_init(struct omap_hwmod *oh, void *user) +{ + struct omap_smartreflex_dev_attr *sr_dev_attr; + + sr_dev_attr = (struct omap_smartreflex_dev_attr *)oh->dev_attr; + if (!sr_dev_attr || !sr_dev_attr->sensor_voltdm_name) { + pr_err("%s: No voltage domain specified for %s. Cannot initialize\n", + __func__, oh->name); + return 0; + } + + return sr_init_by_name(oh->name, sr_dev_attr->sensor_voltdm_name); +} + /* * API to be called from board files to enable smartreflex * autocompensation at init. @@ -169,7 +175,42 @@ void __init omap_enable_smartreflex_on_init(void) sr_enable_on_init = true; } +static const char * const omap4_sr_instances[] = { + "mpu", + "iva", + "core", +}; + +static const char * const dra7_sr_instances[] = { + "mpu", + "core", +}; + int __init omap_devinit_smartreflex(void) { + const char * const *sr_inst; + int i, nr_sr = 0; + + if (soc_is_omap44xx()) { + sr_inst = omap4_sr_instances; + nr_sr = ARRAY_SIZE(omap4_sr_instances); + + } else if (soc_is_dra7xx()) { + sr_inst = dra7_sr_instances; + nr_sr = ARRAY_SIZE(dra7_sr_instances); + } + + if (nr_sr) { + const char *name, *voltdm; + + for (i = 0; i < nr_sr; i++) { + name = kasprintf(GFP_KERNEL, "smartreflex_%s", sr_inst[i]); + voltdm = sr_inst[i]; + sr_init_by_name(name, voltdm); + } + + return 0; + } + return omap_hwmod_for_each_by_class("smartreflex", sr_dev_init, NULL); } -- cgit v1.2.3 From ab5eb336411f18fd449a1fb37d36a55ec422603f Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Thu, 25 Feb 2021 11:42:46 -0800 Subject: xtensa: move coprocessor_flush to the .text section coprocessor_flush is not a part of fast exception handlers, but it uses parts of fast coprocessor handling code that's why it's in the same source file. It uses call0 opcode to invoke those parts so there are no limitations on their relative location, but the rest of the code calls coprocessor_flush with call8 and that doesn't work when vectors are placed in a different gigabyte-aligned area than the rest of the kernel. Move coprocessor_flush from the .exception.text section to the .text so that it's reachable from the rest of the kernel with call8. Cc: stable@vger.kernel.org Signed-off-by: Max Filippov --- arch/xtensa/kernel/coprocessor.S | 64 +++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 31 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S index c426b846beef..45cc0ae0af6f 100644 --- a/arch/xtensa/kernel/coprocessor.S +++ b/arch/xtensa/kernel/coprocessor.S @@ -99,37 +99,6 @@ LOAD_CP_REGS_TAB(6) LOAD_CP_REGS_TAB(7) -/* - * coprocessor_flush(struct thread_info*, index) - * a2 a3 - * - * Save coprocessor registers for coprocessor 'index'. - * The register values are saved to or loaded from the coprocessor area - * inside the task_info structure. - * - * Note that this function doesn't update the coprocessor_owner information! - * - */ - -ENTRY(coprocessor_flush) - - /* reserve 4 bytes on stack to save a0 */ - abi_entry(4) - - s32i a0, a1, 0 - movi a0, .Lsave_cp_regs_jump_table - addx8 a3, a3, a0 - l32i a4, a3, 4 - l32i a3, a3, 0 - add a2, a2, a4 - beqz a3, 1f - callx0 a3 -1: l32i a0, a1, 0 - - abi_ret(4) - -ENDPROC(coprocessor_flush) - /* * Entry condition: * @@ -245,6 +214,39 @@ ENTRY(fast_coprocessor) ENDPROC(fast_coprocessor) + .text + +/* + * coprocessor_flush(struct thread_info*, index) + * a2 a3 + * + * Save coprocessor registers for coprocessor 'index'. + * The register values are saved to or loaded from the coprocessor area + * inside the task_info structure. + * + * Note that this function doesn't update the coprocessor_owner information! + * + */ + +ENTRY(coprocessor_flush) + + /* reserve 4 bytes on stack to save a0 */ + abi_entry(4) + + s32i a0, a1, 0 + movi a0, .Lsave_cp_regs_jump_table + addx8 a3, a3, a0 + l32i a4, a3, 4 + l32i a3, a3, 0 + add a2, a2, a4 + beqz a3, 1f + callx0 a3 +1: l32i a0, a1, 0 + + abi_ret(4) + +ENDPROC(coprocessor_flush) + .data ENTRY(coprocessor_owner) -- cgit v1.2.3 From de920fc64cbaa031f947e9be964bda05fd090380 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 8 Mar 2021 17:56:47 -0800 Subject: bpf, x86: Use kvmalloc_array instead kmalloc_array in bpf_jit_comp x86 bpf_jit_comp.c used kmalloc_array to store jited addresses for each bpf insn. With a large bpf program, we have see the following allocation failures in our production server: page allocation failure: order:5, mode:0x40cc0(GFP_KERNEL|__GFP_COMP), nodemask=(null),cpuset=/,mems_allowed=0" Call Trace: dump_stack+0x50/0x70 warn_alloc.cold.120+0x72/0xd2 ? __alloc_pages_direct_compact+0x157/0x160 __alloc_pages_slowpath+0xcdb/0xd00 ? get_page_from_freelist+0xe44/0x1600 ? vunmap_page_range+0x1ba/0x340 __alloc_pages_nodemask+0x2c9/0x320 kmalloc_order+0x18/0x80 kmalloc_order_trace+0x1d/0xa0 bpf_int_jit_compile+0x1e2/0x484 ? kmalloc_order_trace+0x1d/0xa0 bpf_prog_select_runtime+0xc3/0x150 bpf_prog_load+0x480/0x720 ? __mod_memcg_lruvec_state+0x21/0x100 __do_sys_bpf+0xc31/0x2040 ? close_pdeo+0x86/0xe0 do_syscall_64+0x42/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f2f300f7fa9 Code: Bad RIP value. Dumped assembly: ffffffff810b6d70 : ; { ffffffff810b6d70: e8 eb a5 b4 00 callq 0xffffffff81c01360 <__fentry__> ffffffff810b6d75: 41 57 pushq %r15 ... ffffffff810b6f39: e9 72 fe ff ff jmp 0xffffffff810b6db0 ; addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); ffffffff810b6f3e: 8b 45 0c movl 12(%rbp), %eax ; return __kmalloc(bytes, flags); ffffffff810b6f41: be c0 0c 00 00 movl $3264, %esi ; addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); ffffffff810b6f46: 8d 78 01 leal 1(%rax), %edi ; if (unlikely(check_mul_overflow(n, size, &bytes))) ffffffff810b6f49: 48 c1 e7 02 shlq $2, %rdi ; return __kmalloc(bytes, flags); ffffffff810b6f4d: e8 8e 0c 1d 00 callq 0xffffffff81287be0 <__kmalloc> ; if (!addrs) { ffffffff810b6f52: 48 85 c0 testq %rax, %rax Change kmalloc_array() to kvmalloc_array() to avoid potential allocation error for big bpf programs. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210309015647.3657852-1-yhs@fb.com --- arch/x86/net/bpf_jit_comp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 6926d0ca6c71..747bba0a584a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2225,7 +2225,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) padding = true; goto skip_init_addrs; } - addrs = kmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); + addrs = kvmalloc_array(prog->len + 1, sizeof(*addrs), GFP_KERNEL); if (!addrs) { prog = orig_prog; goto out_addrs; @@ -2317,7 +2317,7 @@ out_image: if (image) bpf_prog_fill_jited_linfo(prog, addrs + 1); out_addrs: - kfree(addrs); + kvfree(addrs); kfree(jit_data); prog->aux->jit_data = NULL; } -- cgit v1.2.3 From 221c3a09ddf70a0a51715e6c2878d8305e95c558 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 11 Apr 2018 19:05:03 +0300 Subject: ARM: dts: at91-sama5d27_som1: fix phy address to 7 Fix the phy address to 7 for Ethernet PHY on SAMA5D27 SOM1. No connection established if phy address 0 is used. The board uses the 24 pins version of the KSZ8081RNA part, KSZ8081RNA pin 16 REFCLK as PHYAD bit [2] has weak internal pull-down. But at reset, connected to PD09 of the MPU it's connected with an internal pull-up forming PHYAD[2:0] = 7. Signed-off-by: Claudiu Beznea Fixes: 2f61929eb10a ("ARM: dts: at91: at91-sama5d27_som1: fix PHY ID") Cc: Ludovic Desroches Signed-off-by: Nicolas Ferre Cc: # 4.14+ --- arch/arm/boot/dts/at91-sama5d27_som1.dtsi | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/at91-sama5d27_som1.dtsi b/arch/arm/boot/dts/at91-sama5d27_som1.dtsi index 1b1163858b1d..e3251f3e3eaa 100644 --- a/arch/arm/boot/dts/at91-sama5d27_som1.dtsi +++ b/arch/arm/boot/dts/at91-sama5d27_som1.dtsi @@ -84,8 +84,8 @@ pinctrl-0 = <&pinctrl_macb0_default>; phy-mode = "rmii"; - ethernet-phy@0 { - reg = <0x0>; + ethernet-phy@7 { + reg = <0x7>; interrupt-parent = <&pioA>; interrupts = ; pinctrl-names = "default"; -- cgit v1.2.3 From 664979bba8169d775959452def968d1a7c03901f Mon Sep 17 00:00:00 2001 From: Federico Pellegrin Date: Sun, 7 Feb 2021 06:00:22 +0100 Subject: ARM: dts: at91: sam9x60: fix mux-mask for PA7 so it can be set to A, B and C According to the datasheet PA7 can be set to either function A, B or C (see table 6-2 of DS60001579D). The previous value would permit just configuring with function C. Signed-off-by: Federico Pellegrin Fixes: 1e5f532c2737 ("ARM: dts: at91: sam9x60: add device tree for soc and board") Cc: # 5.6+ Cc: Sandeep Sheriker Mallikarjun Signed-off-by: Nicolas Ferre --- arch/arm/boot/dts/at91-sam9x60ek.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/at91-sam9x60ek.dts b/arch/arm/boot/dts/at91-sam9x60ek.dts index 73b6b1f89de9..4c40ae571154 100644 --- a/arch/arm/boot/dts/at91-sam9x60ek.dts +++ b/arch/arm/boot/dts/at91-sam9x60ek.dts @@ -336,7 +336,7 @@ &pinctrl { atmel,mux-mask = < /* A B C */ - 0xFFFFFE7F 0xC0E0397F 0xEF00019D /* pioA */ + 0xFFFFFEFF 0xC0E039FF 0xEF00019D /* pioA */ 0x03FFFFFF 0x02FC7E68 0x00780000 /* pioB */ 0xffffffff 0xF83FFFFF 0xB800F3FC /* pioC */ 0x003FFFFF 0x003F8000 0x00000000 /* pioD */ -- cgit v1.2.3 From 2c69c8a1736eace8de491d480e6e577a27c2087c Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Wed, 10 Mar 2021 16:20:06 +0100 Subject: ARM: dts: at91: sam9x60: fix mux-mask to match product's datasheet Fix the whole mux-mask table according to datasheet for the sam9x60 product. Too much functions for pins were disabled leading to misunderstandings when enabling more peripherals or taking this table as an example for another board. Take advantage of this fix to move the mux-mask in the SoC file where it belongs and use lower case letters for hex numbers like everywhere in the file. Signed-off-by: Nicolas Ferre Fixes: 1e5f532c2737 ("ARM: dts: at91: sam9x60: add device tree for soc and board") Cc: # 5.6+ Cc: Sandeep Sheriker Mallikarjun Reviewed-by: Tudor Ambarus Link: https://lore.kernel.org/r/20210310152006.15018-1-nicolas.ferre@microchip.com --- arch/arm/boot/dts/at91-sam9x60ek.dts | 8 -------- arch/arm/boot/dts/sam9x60.dtsi | 9 +++++++++ 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/at91-sam9x60ek.dts b/arch/arm/boot/dts/at91-sam9x60ek.dts index 4c40ae571154..775ceb3acb6c 100644 --- a/arch/arm/boot/dts/at91-sam9x60ek.dts +++ b/arch/arm/boot/dts/at91-sam9x60ek.dts @@ -334,14 +334,6 @@ }; &pinctrl { - atmel,mux-mask = < - /* A B C */ - 0xFFFFFEFF 0xC0E039FF 0xEF00019D /* pioA */ - 0x03FFFFFF 0x02FC7E68 0x00780000 /* pioB */ - 0xffffffff 0xF83FFFFF 0xB800F3FC /* pioC */ - 0x003FFFFF 0x003F8000 0x00000000 /* pioD */ - >; - adc { pinctrl_adc_default: adc_default { atmel,pins = ; diff --git a/arch/arm/boot/dts/sam9x60.dtsi b/arch/arm/boot/dts/sam9x60.dtsi index 84066c1298df..ec45ced3cde6 100644 --- a/arch/arm/boot/dts/sam9x60.dtsi +++ b/arch/arm/boot/dts/sam9x60.dtsi @@ -606,6 +606,15 @@ compatible = "microchip,sam9x60-pinctrl", "atmel,at91sam9x5-pinctrl", "atmel,at91rm9200-pinctrl", "simple-bus"; ranges = <0xfffff400 0xfffff400 0x800>; + /* mux-mask corresponding to sam9x60 SoC in TFBGA228L package */ + atmel,mux-mask = < + /* A B C */ + 0xffffffff 0xffe03fff 0xef00019d /* pioA */ + 0x03ffffff 0x02fc7e7f 0x00780000 /* pioB */ + 0xffffffff 0xffffffff 0xf83fffff /* pioC */ + 0x003fffff 0x003f8000 0x00000000 /* pioD */ + >; + pioA: gpio@fffff400 { compatible = "microchip,sam9x60-gpio", "atmel,at91sam9x5-gpio", "atmel,at91rm9200-gpio"; reg = <0xfffff400 0x200>; -- cgit v1.2.3 From e2c1b0ff38c961d49ce34efda48fa45eb1cb5f19 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Thu, 4 Feb 2021 17:38:46 -0800 Subject: ARM: imx: avic: Convert to using IRQCHIP_DECLARE Using IRQCHIP_DECLARE lets fw_devlink know that it should not wait for these interrupt controllers to be populated as struct devices. Without this change, fw_devlink=on will make the consumers of these interrupt controllers wait for the struct device to be added and thereby block the consumers' probes forever. Converting to IRQCHIP_DECLARE addresses boot issues on imx25 with fw_devlink=on that were reported by Martin. This also removes a lot of boilerplate code. Fixes: e590474768f1 ("driver core: Set fw_devlink=on by default") Reported-by: Martin Kaiser Signed-off-by: Saravana Kannan Tested-by: Martin Kaiser Reviewed-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/mach-imx/avic.c | 16 +++++++++++++++- arch/arm/mach-imx/common.h | 1 - arch/arm/mach-imx/mach-imx1.c | 11 ----------- arch/arm/mach-imx/mach-imx25.c | 12 ------------ arch/arm/mach-imx/mach-imx27.c | 12 ------------ arch/arm/mach-imx/mach-imx31.c | 1 - arch/arm/mach-imx/mach-imx35.c | 1 - arch/arm/mach-imx/mm-imx3.c | 24 ------------------------ 8 files changed, 15 insertions(+), 63 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-imx/avic.c b/arch/arm/mach-imx/avic.c index 322caa21bcb3..21bce4049cec 100644 --- a/arch/arm/mach-imx/avic.c +++ b/arch/arm/mach-imx/avic.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -162,7 +163,7 @@ static void __exception_irq_entry avic_handle_irq(struct pt_regs *regs) * interrupts. It registers the interrupt enable and disable functions * to the kernel for each interrupt source. */ -void __init mxc_init_irq(void __iomem *irqbase) +static void __init mxc_init_irq(void __iomem *irqbase) { struct device_node *np; int irq_base; @@ -220,3 +221,16 @@ void __init mxc_init_irq(void __iomem *irqbase) printk(KERN_INFO "MXC IRQ initialized\n"); } + +static int __init imx_avic_init(struct device_node *node, + struct device_node *parent) +{ + void __iomem *avic_base; + + avic_base = of_iomap(node, 0); + BUG_ON(!avic_base); + mxc_init_irq(avic_base); + return 0; +} + +IRQCHIP_DECLARE(imx_avic, "fsl,avic", imx_avic_init); diff --git a/arch/arm/mach-imx/common.h b/arch/arm/mach-imx/common.h index 2b004cc4f95e..474dedb73bc7 100644 --- a/arch/arm/mach-imx/common.h +++ b/arch/arm/mach-imx/common.h @@ -22,7 +22,6 @@ void mx35_map_io(void); void imx21_init_early(void); void imx31_init_early(void); void imx35_init_early(void); -void mxc_init_irq(void __iomem *); void mx31_init_irq(void); void mx35_init_irq(void); void mxc_set_cpu_type(unsigned int type); diff --git a/arch/arm/mach-imx/mach-imx1.c b/arch/arm/mach-imx/mach-imx1.c index 32df3b8012f9..8eca92d66a2e 100644 --- a/arch/arm/mach-imx/mach-imx1.c +++ b/arch/arm/mach-imx/mach-imx1.c @@ -17,16 +17,6 @@ static void __init imx1_init_early(void) mxc_set_cpu_type(MXC_CPU_MX1); } -static void __init imx1_init_irq(void) -{ - void __iomem *avic_addr; - - avic_addr = ioremap(MX1_AVIC_ADDR, SZ_4K); - WARN_ON(!avic_addr); - - mxc_init_irq(avic_addr); -} - static const char * const imx1_dt_board_compat[] __initconst = { "fsl,imx1", NULL @@ -34,7 +24,6 @@ static const char * const imx1_dt_board_compat[] __initconst = { DT_MACHINE_START(IMX1_DT, "Freescale i.MX1 (Device Tree Support)") .init_early = imx1_init_early, - .init_irq = imx1_init_irq, .dt_compat = imx1_dt_board_compat, .restart = mxc_restart, MACHINE_END diff --git a/arch/arm/mach-imx/mach-imx25.c b/arch/arm/mach-imx/mach-imx25.c index 95de48a1aa7d..51927bd08aef 100644 --- a/arch/arm/mach-imx/mach-imx25.c +++ b/arch/arm/mach-imx/mach-imx25.c @@ -22,17 +22,6 @@ static void __init imx25_dt_init(void) imx_aips_allow_unprivileged_access("fsl,imx25-aips"); } -static void __init mx25_init_irq(void) -{ - struct device_node *np; - void __iomem *avic_base; - - np = of_find_compatible_node(NULL, NULL, "fsl,avic"); - avic_base = of_iomap(np, 0); - BUG_ON(!avic_base); - mxc_init_irq(avic_base); -} - static const char * const imx25_dt_board_compat[] __initconst = { "fsl,imx25", NULL @@ -42,6 +31,5 @@ DT_MACHINE_START(IMX25_DT, "Freescale i.MX25 (Device Tree Support)") .init_early = imx25_init_early, .init_machine = imx25_dt_init, .init_late = imx25_pm_init, - .init_irq = mx25_init_irq, .dt_compat = imx25_dt_board_compat, MACHINE_END diff --git a/arch/arm/mach-imx/mach-imx27.c b/arch/arm/mach-imx/mach-imx27.c index 262422a9c196..e325c9468105 100644 --- a/arch/arm/mach-imx/mach-imx27.c +++ b/arch/arm/mach-imx/mach-imx27.c @@ -56,17 +56,6 @@ static void __init imx27_init_early(void) mxc_set_cpu_type(MXC_CPU_MX27); } -static void __init mx27_init_irq(void) -{ - void __iomem *avic_base; - struct device_node *np; - - np = of_find_compatible_node(NULL, NULL, "fsl,avic"); - avic_base = of_iomap(np, 0); - BUG_ON(!avic_base); - mxc_init_irq(avic_base); -} - static const char * const imx27_dt_board_compat[] __initconst = { "fsl,imx27", NULL @@ -75,7 +64,6 @@ static const char * const imx27_dt_board_compat[] __initconst = { DT_MACHINE_START(IMX27_DT, "Freescale i.MX27 (Device Tree Support)") .map_io = mx27_map_io, .init_early = imx27_init_early, - .init_irq = mx27_init_irq, .init_late = imx27_pm_init, .dt_compat = imx27_dt_board_compat, MACHINE_END diff --git a/arch/arm/mach-imx/mach-imx31.c b/arch/arm/mach-imx/mach-imx31.c index dc69dfe600df..e9a1092b6093 100644 --- a/arch/arm/mach-imx/mach-imx31.c +++ b/arch/arm/mach-imx/mach-imx31.c @@ -14,6 +14,5 @@ static const char * const imx31_dt_board_compat[] __initconst = { DT_MACHINE_START(IMX31_DT, "Freescale i.MX31 (Device Tree Support)") .map_io = mx31_map_io, .init_early = imx31_init_early, - .init_irq = mx31_init_irq, .dt_compat = imx31_dt_board_compat, MACHINE_END diff --git a/arch/arm/mach-imx/mach-imx35.c b/arch/arm/mach-imx/mach-imx35.c index ec5c3068715c..0fc08218b77d 100644 --- a/arch/arm/mach-imx/mach-imx35.c +++ b/arch/arm/mach-imx/mach-imx35.c @@ -27,6 +27,5 @@ DT_MACHINE_START(IMX35_DT, "Freescale i.MX35 (Device Tree Support)") .l2c_aux_mask = ~0, .map_io = mx35_map_io, .init_early = imx35_init_early, - .init_irq = mx35_init_irq, .dt_compat = imx35_dt_board_compat, MACHINE_END diff --git a/arch/arm/mach-imx/mm-imx3.c b/arch/arm/mach-imx/mm-imx3.c index 5056438e5b42..28db97289ee8 100644 --- a/arch/arm/mach-imx/mm-imx3.c +++ b/arch/arm/mach-imx/mm-imx3.c @@ -109,18 +109,6 @@ void __init imx31_init_early(void) mx3_ccm_base = of_iomap(np, 0); BUG_ON(!mx3_ccm_base); } - -void __init mx31_init_irq(void) -{ - void __iomem *avic_base; - struct device_node *np; - - np = of_find_compatible_node(NULL, NULL, "fsl,imx31-avic"); - avic_base = of_iomap(np, 0); - BUG_ON(!avic_base); - - mxc_init_irq(avic_base); -} #endif /* ifdef CONFIG_SOC_IMX31 */ #ifdef CONFIG_SOC_IMX35 @@ -158,16 +146,4 @@ void __init imx35_init_early(void) mx3_ccm_base = of_iomap(np, 0); BUG_ON(!mx3_ccm_base); } - -void __init mx35_init_irq(void) -{ - void __iomem *avic_base; - struct device_node *np; - - np = of_find_compatible_node(NULL, NULL, "fsl,imx35-avic"); - avic_base = of_iomap(np, 0); - BUG_ON(!avic_base); - - mxc_init_irq(avic_base); -} #endif /* ifdef CONFIG_SOC_IMX35 */ -- cgit v1.2.3 From 9c3a16f88385e671b63a0de7b82b85e604a80f42 Mon Sep 17 00:00:00 2001 From: Horia Geantă Date: Sun, 7 Mar 2021 22:47:35 +0200 Subject: arm64: dts: ls1046a: mark crypto engine dma coherent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Crypto engine (CAAM) on LS1046A platform is configured HW-coherent, mark accordingly the DT node. As reported by Greg and Sascha, and explained by Robin, lack of "dma-coherent" property for an IP that is configured HW-coherent can lead to problems, e.g. on v5.11: > kernel BUG at drivers/crypto/caam/jr.c:247! > Internal error: Oops - BUG: 0 [#1] PREEMPT SMP > Modules linked in: > CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.11.0-20210225-3-00039-g434215968816-dirty #12 > Hardware name: TQ TQMLS1046A SoM on Arkona AT1130 (C300) board (DT) > pstate: 60000005 (nZCv daif -PAN -UAO -TCO BTYPE=--) > pc : caam_jr_dequeue+0x98/0x57c > lr : caam_jr_dequeue+0x98/0x57c > sp : ffff800010003d50 > x29: ffff800010003d50 x28: ffff8000118d4000 > x27: ffff8000118d4328 x26: 00000000000001f0 > x25: ffff0008022be480 x24: ffff0008022c6410 > x23: 00000000000001f1 x22: ffff8000118d4329 > x21: 0000000000004d80 x20: 00000000000001f1 > x19: 0000000000000001 x18: 0000000000000020 > x17: 0000000000000000 x16: 0000000000000015 > x15: ffff800011690230 x14: 2e2e2e2e2e2e2e2e > x13: 2e2e2e2e2e2e2020 x12: 3030303030303030 > x11: ffff800011700a38 x10: 00000000fffff000 > x9 : ffff8000100ada30 x8 : ffff8000116a8a38 > x7 : 0000000000000001 x6 : 0000000000000000 > x5 : 0000000000000000 x4 : 0000000000000000 > x3 : 00000000ffffffff x2 : 0000000000000000 > x1 : 0000000000000000 x0 : 0000000000001800 > Call trace: > caam_jr_dequeue+0x98/0x57c > tasklet_action_common.constprop.0+0x164/0x18c > tasklet_action+0x44/0x54 > __do_softirq+0x160/0x454 > __irq_exit_rcu+0x164/0x16c > irq_exit+0x1c/0x30 > __handle_domain_irq+0xc0/0x13c > gic_handle_irq+0x5c/0xf0 > el1_irq+0xb4/0x180 > arch_cpu_idle+0x18/0x30 > default_idle_call+0x3c/0x1c0 > do_idle+0x23c/0x274 > cpu_startup_entry+0x34/0x70 > rest_init+0xdc/0xec > arch_call_rest_init+0x1c/0x28 > start_kernel+0x4ac/0x4e4 > Code: 91392021 912c2000 d377d8c6 97f24d96 (d4210000) Cc: # v4.10+ Fixes: 8126d88162a5 ("arm64: dts: add QorIQ LS1046A SoC support") Link: https://lore.kernel.org/linux-crypto/fe6faa24-d8f7-d18f-adfa-44fa0caa1598@arm.com Reported-by: Greg Ungerer Reported-by: Sascha Hauer Tested-by: Sascha Hauer Signed-off-by: Horia Geantă Acked-by: Greg Ungerer Acked-by: Li Yang Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi index 1d6dfd189c7f..39458305e333 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1046a.dtsi @@ -354,6 +354,7 @@ ranges = <0x0 0x00 0x1700000 0x100000>; reg = <0x00 0x1700000 0x0 0x100000>; interrupts = ; + dma-coherent; sec_jr0: jr@10000 { compatible = "fsl,sec-v5.4-job-ring", -- cgit v1.2.3 From 4fb3a074755b7737c4081cffe0ccfa08c2f2d29d Mon Sep 17 00:00:00 2001 From: Horia Geantă Date: Sun, 7 Mar 2021 22:47:36 +0200 Subject: arm64: dts: ls1043a: mark crypto engine dma coherent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Crypto engine (CAAM) on LS1043A platform is configured HW-coherent, mark accordingly the DT node. Lack of "dma-coherent" property for an IP that is configured HW-coherent can lead to problems, similar to what has been reported for LS1046A. Cc: # v4.8+ Fixes: 63dac35b58f4 ("arm64: dts: ls1043a: add crypto node") Link: https://lore.kernel.org/linux-crypto/fe6faa24-d8f7-d18f-adfa-44fa0caa1598@arm.com Signed-off-by: Horia Geantă Acked-by: Li Yang Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi index 5a8a1dc4262d..28c51e521cb2 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1043a.dtsi @@ -348,6 +348,7 @@ ranges = <0x0 0x00 0x1700000 0x100000>; reg = <0x00 0x1700000 0x0 0x100000>; interrupts = <0 75 0x4>; + dma-coherent; sec_jr0: jr@10000 { compatible = "fsl,sec-v5.4-job-ring", -- cgit v1.2.3 From ba8da03fa7dff59d9400250aebd38f94cde3cb0f Mon Sep 17 00:00:00 2001 From: Horia Geantă Date: Sun, 7 Mar 2021 22:47:37 +0200 Subject: arm64: dts: ls1012a: mark crypto engine dma coherent MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Crypto engine (CAAM) on LS1012A platform is configured HW-coherent, mark accordingly the DT node. Lack of "dma-coherent" property for an IP that is configured HW-coherent can lead to problems, similar to what has been reported for LS1046A. Cc: # v4.12+ Fixes: 85b85c569507 ("arm64: dts: ls1012a: add crypto node") Signed-off-by: Horia Geantă Acked-by: Li Yang Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi index 7de6b376d792..9058cfa4980f 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls1012a.dtsi @@ -198,6 +198,7 @@ ranges = <0x0 0x00 0x1700000 0x100000>; reg = <0x00 0x1700000 0x0 0x100000>; interrupts = ; + dma-coherent; sec_jr0: jr@10000 { compatible = "fsl,sec-v5.4-job-ring", -- cgit v1.2.3 From 412627f6ffe32211863e1dcd76dab98c90556fc7 Mon Sep 17 00:00:00 2001 From: Teresa Remmet Date: Thu, 11 Mar 2021 07:14:45 +0100 Subject: arm64: dts: imx8mp-phyboard-pollux-rdk: Add missing pinctrl entry Add missing pinctrl-names for i2c gpio recovery mode. Fixes: 88f7f6bcca37 ("arm64: dts: freescale: Add support for phyBOARD-Pollux-i.MX8MP") Signed-off-by: Teresa Remmet Signed-off-by: Shawn Guo --- arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts | 2 +- arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts b/arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts index 0e1a6d953389..122c95ddad30 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts +++ b/arch/arm64/boot/dts/freescale/imx8mp-phyboard-pollux-rdk.dts @@ -35,7 +35,7 @@ &i2c2 { clock-frequency = <400000>; - pinctrl-names = "default"; + pinctrl-names = "default", "gpio"; pinctrl-0 = <&pinctrl_i2c2>; pinctrl-1 = <&pinctrl_i2c2_gpio>; sda-gpios = <&gpio5 17 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>; diff --git a/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi b/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi index 44a8c2337cee..f3965ec5b31d 100644 --- a/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mp-phycore-som.dtsi @@ -67,7 +67,7 @@ &i2c1 { clock-frequency = <400000>; - pinctrl-names = "default"; + pinctrl-names = "default", "gpio"; pinctrl-0 = <&pinctrl_i2c1>; pinctrl-1 = <&pinctrl_i2c1_gpio>; sda-gpios = <&gpio5 15 (GPIO_ACTIVE_HIGH | GPIO_OPEN_DRAIN)>; -- cgit v1.2.3 From 69cbbf6be5d5c25deeddd9450ae538e769480dc3 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Fri, 12 Mar 2021 08:37:46 -0300 Subject: ARM: imx6ul-14x14-evk: Do not reset the Ethernet PHYs independently The imx6ul-evk board designer took the bad decision to tie the two Ethernet PHY reset lines together. This prevents one Ethernet interface to work while the other one is brought down. For example: # ifconfig eth0 down # [ 279.386551] fec 2188000.ethernet eth1: Link is Down Bringing eth0 interface down also causes eth1 to be down. The Ethernet reset lines comes from the IO expander and both come in logic level 0 by default. To fix this issue, remove the Ethernet PHY reset descriptions from its respective PHY nodes and force both Ethernet PHY lines to be at logic level 1 via gpio-hog. Fixes: 2db7e78bf02b ("ARM: dts: imx6ul-14x14-evk: Describe the KSZ8081 reset") Reported-by: Joakim Zhang Signed-off-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx6ul-14x14-evk.dtsi | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi b/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi index c593597b2119..5a1e10def6ef 100644 --- a/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi +++ b/arch/arm/boot/dts/imx6ul-14x14-evk.dtsi @@ -210,9 +210,6 @@ micrel,led-mode = <1>; clocks = <&clks IMX6UL_CLK_ENET_REF>; clock-names = "rmii-ref"; - reset-gpios = <&gpio_spi 1 GPIO_ACTIVE_LOW>; - reset-assert-us = <10000>; - reset-deassert-us = <100>; }; @@ -222,9 +219,6 @@ micrel,led-mode = <1>; clocks = <&clks IMX6UL_CLK_ENET2_REF>; clock-names = "rmii-ref"; - reset-gpios = <&gpio_spi 2 GPIO_ACTIVE_LOW>; - reset-assert-us = <10000>; - reset-deassert-us = <100>; }; }; }; @@ -243,6 +237,22 @@ status = "okay"; }; +&gpio_spi { + eth0-phy-hog { + gpio-hog; + gpios = <1 GPIO_ACTIVE_HIGH>; + output-high; + line-name = "eth0-phy"; + }; + + eth1-phy-hog { + gpio-hog; + gpios = <2 GPIO_ACTIVE_HIGH>; + output-high; + line-name = "eth1-phy"; + }; +}; + &i2c1 { clock-frequency = <100000>; pinctrl-names = "default"; -- cgit v1.2.3 From 0710442a88d1c646d37ac83c52de85f456e99171 Mon Sep 17 00:00:00 2001 From: Alex Elder Date: Sun, 14 Mar 2021 20:26:50 -0500 Subject: arm64: csum: cast to the proper type The last line of ip_fast_csum() calls csum_fold(), forcing the type of the argument passed to be u32. But csum_fold() takes a __wsum argument (which is __u32 __bitwise for arm64). As long as we're forcing the cast, cast it to the right type. Signed-off-by: Alex Elder Acked-by: Robin Murphy Link: https://lore.kernel.org/r/20210315012650.1221328-1-elder@linaro.org Signed-off-by: Will Deacon --- arch/arm64/include/asm/checksum.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h index 93a161b3bf3f..dc52b733675d 100644 --- a/arch/arm64/include/asm/checksum.h +++ b/arch/arm64/include/asm/checksum.h @@ -37,7 +37,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) } while (--n > 0); sum += ((sum >> 32) | (sum << 32)); - return csum_fold((__force u32)(sum >> 32)); + return csum_fold((__force __wsum)(sum >> 32)); } #define ip_fast_csum ip_fast_csum -- cgit v1.2.3 From e21aa341785c679dd409c8cb71f864c00fe6c463 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 16 Mar 2021 14:00:07 -0700 Subject: bpf: Fix fexit trampoline. The fexit/fmod_ret programs can be attached to kernel functions that can sleep. The synchronize_rcu_tasks() will not wait for such tasks to complete. In such case the trampoline image will be freed and when the task wakes up the return IP will point to freed memory causing the crash. Solve this by adding percpu_ref_get/put for the duration of trampoline and separate trampoline vs its image life times. The "half page" optimization has to be removed, since first_half->second_half->first_half transition cannot be guaranteed to complete in deterministic time. Every trampoline update becomes a new image. The image with fmod_ret or fexit progs will be freed via percpu_ref_kill and call_rcu_tasks. Together they will wait for the original function and trampoline asm to complete. The trampoline is patched from nop to jmp to skip fexit progs. They are freed independently from the trampoline. The image with fentry progs only will be freed via call_rcu_tasks_trace+call_rcu_tasks which will wait for both sleepable and non-sleepable progs to complete. Fixes: fec56f5890d9 ("bpf: Introduce BPF trampoline") Reported-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Paul E. McKenney # for RCU Link: https://lore.kernel.org/bpf/20210316210007.38949-1-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 26 +++++- include/linux/bpf.h | 24 ++++- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/core.c | 4 +- kernel/bpf/trampoline.c | 218 ++++++++++++++++++++++++++++++++++---------- 5 files changed, 213 insertions(+), 61 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 747bba0a584a..72b5a57e9e31 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1936,7 +1936,7 @@ static int invoke_bpf_mod_ret(const struct btf_func_model *m, u8 **pprog, * add rsp, 8 // skip eth_type_trans's frame * ret // return to its caller */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call) @@ -1975,6 +1975,15 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, save_regs(m, &prog, nr_args, stack_size); + if (flags & BPF_TRAMP_F_CALL_ORIG) { + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_enter, prog)) { + ret = -EINVAL; + goto cleanup; + } + } + if (fentry->nr_progs) if (invoke_bpf(m, &prog, fentry, stack_size)) return -EINVAL; @@ -1993,8 +2002,7 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } if (flags & BPF_TRAMP_F_CALL_ORIG) { - if (fentry->nr_progs || fmod_ret->nr_progs) - restore_regs(m, &prog, nr_args, stack_size); + restore_regs(m, &prog, nr_args, stack_size); /* call original function */ if (emit_call(&prog, orig_call, prog)) { @@ -2003,6 +2011,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, } /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + im->ip_after_call = prog; + emit_nops(&prog, 5); } if (fmod_ret->nr_progs) { @@ -2033,9 +2043,17 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, * the return value is only updated on the stack and still needs to be * restored to R0. */ - if (flags & BPF_TRAMP_F_CALL_ORIG) + if (flags & BPF_TRAMP_F_CALL_ORIG) { + im->ip_epilogue = prog; + /* arg1: mov rdi, im */ + emit_mov_imm64(&prog, BPF_REG_1, (long) im >> 32, (u32) (long) im); + if (emit_call(&prog, __bpf_tramp_exit, prog)) { + ret = -EINVAL; + goto cleanup; + } /* restore original return value back into RAX */ emit_ldx(&prog, BPF_DW, BPF_REG_0, BPF_REG_FP, -8); + } EMIT1(0x5B); /* pop rbx */ EMIT1(0xC9); /* leave */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d7e0f479a5b0..3625f019767d 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -21,6 +21,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -556,7 +557,8 @@ struct bpf_tramp_progs { * fentry = a set of program to run before calling original function * fexit = a set of program to run after original function */ -int arch_prepare_bpf_trampoline(void *image, void *image_end, +struct bpf_tramp_image; +int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call); @@ -565,6 +567,8 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog); void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start); +void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr); +void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr); struct bpf_ksym { unsigned long start; @@ -583,6 +587,18 @@ enum bpf_tramp_prog_type { BPF_TRAMP_REPLACE, /* more than MAX */ }; +struct bpf_tramp_image { + void *image; + struct bpf_ksym ksym; + struct percpu_ref pcref; + void *ip_after_call; + void *ip_epilogue; + union { + struct rcu_head rcu; + struct work_struct work; + }; +}; + struct bpf_trampoline { /* hlist for trampoline_table */ struct hlist_node hlist; @@ -605,9 +621,8 @@ struct bpf_trampoline { /* Number of attached programs. A counter per kind. */ int progs_cnt[BPF_TRAMP_MAX]; /* Executable image of trampoline */ - void *image; + struct bpf_tramp_image *cur_image; u64 selector; - struct bpf_ksym ksym; }; struct bpf_attach_target_info { @@ -691,6 +706,8 @@ void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym); void bpf_image_ksym_del(struct bpf_ksym *ksym); void bpf_ksym_add(struct bpf_ksym *ksym); void bpf_ksym_del(struct bpf_ksym *ksym); +int bpf_jit_charge_modmem(u32 pages); +void bpf_jit_uncharge_modmem(u32 pages); #else static inline int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr) @@ -787,7 +804,6 @@ struct bpf_prog_aux { bool func_proto_unreliable; bool sleepable; bool tail_call_reachable; - enum bpf_tramp_prog_type trampoline_prog_type; struct hlist_node tramp_hlist; /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */ const struct btf_type *attach_func_proto; diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 1a666a975416..70f6fd4fa305 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -430,7 +430,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, tprogs[BPF_TRAMP_FENTRY].progs[0] = prog; tprogs[BPF_TRAMP_FENTRY].nr_progs = 1; - err = arch_prepare_bpf_trampoline(image, + err = arch_prepare_bpf_trampoline(NULL, image, st_map->image + PAGE_SIZE, &st_ops->func_models[i], 0, tprogs, NULL); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3a283bf97f2f..75244ecb2389 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -827,7 +827,7 @@ static int __init bpf_jit_charge_init(void) } pure_initcall(bpf_jit_charge_init); -static int bpf_jit_charge_modmem(u32 pages) +int bpf_jit_charge_modmem(u32 pages) { if (atomic_long_add_return(pages, &bpf_jit_current) > (bpf_jit_limit >> PAGE_SHIFT)) { @@ -840,7 +840,7 @@ static int bpf_jit_charge_modmem(u32 pages) return 0; } -static void bpf_jit_uncharge_modmem(u32 pages) +void bpf_jit_uncharge_modmem(u32 pages) { atomic_long_sub(pages, &bpf_jit_current); } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 7bc3b3209224..1f3a4be4b175 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -57,19 +57,10 @@ void bpf_image_ksym_del(struct bpf_ksym *ksym) PAGE_SIZE, true, ksym->name); } -static void bpf_trampoline_ksym_add(struct bpf_trampoline *tr) -{ - struct bpf_ksym *ksym = &tr->ksym; - - snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", tr->key); - bpf_image_ksym_add(tr->image, ksym); -} - static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) { struct bpf_trampoline *tr; struct hlist_head *head; - void *image; int i; mutex_lock(&trampoline_mutex); @@ -84,14 +75,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) if (!tr) goto out; - /* is_root was checked earlier. No need for bpf_jit_charge_modmem() */ - image = bpf_jit_alloc_exec_page(); - if (!image) { - kfree(tr); - tr = NULL; - goto out; - } - tr->key = key; INIT_HLIST_NODE(&tr->hlist); hlist_add_head(&tr->hlist, head); @@ -99,9 +82,6 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) mutex_init(&tr->mutex); for (i = 0; i < BPF_TRAMP_MAX; i++) INIT_HLIST_HEAD(&tr->progs_hlist[i]); - tr->image = image; - INIT_LIST_HEAD_RCU(&tr->ksym.lnode); - bpf_trampoline_ksym_add(tr); out: mutex_unlock(&trampoline_mutex); return tr; @@ -185,10 +165,142 @@ bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total) return tprogs; } +static void __bpf_tramp_image_put_deferred(struct work_struct *work) +{ + struct bpf_tramp_image *im; + + im = container_of(work, struct bpf_tramp_image, work); + bpf_image_ksym_del(&im->ksym); + bpf_jit_free_exec(im->image); + bpf_jit_uncharge_modmem(1); + percpu_ref_exit(&im->pcref); + kfree_rcu(im, rcu); +} + +/* callback, fexit step 3 or fentry step 2 */ +static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu) +{ + struct bpf_tramp_image *im; + + im = container_of(rcu, struct bpf_tramp_image, rcu); + INIT_WORK(&im->work, __bpf_tramp_image_put_deferred); + schedule_work(&im->work); +} + +/* callback, fexit step 2. Called after percpu_ref_kill confirms. */ +static void __bpf_tramp_image_release(struct percpu_ref *pcref) +{ + struct bpf_tramp_image *im; + + im = container_of(pcref, struct bpf_tramp_image, pcref); + call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); +} + +/* callback, fexit or fentry step 1 */ +static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu) +{ + struct bpf_tramp_image *im; + + im = container_of(rcu, struct bpf_tramp_image, rcu); + if (im->ip_after_call) + /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */ + percpu_ref_kill(&im->pcref); + else + /* the case of fentry trampoline */ + call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu); +} + +static void bpf_tramp_image_put(struct bpf_tramp_image *im) +{ + /* The trampoline image that calls original function is using: + * rcu_read_lock_trace to protect sleepable bpf progs + * rcu_read_lock to protect normal bpf progs + * percpu_ref to protect trampoline itself + * rcu tasks to protect trampoline asm not covered by percpu_ref + * (which are few asm insns before __bpf_tramp_enter and + * after __bpf_tramp_exit) + * + * The trampoline is unreachable before bpf_tramp_image_put(). + * + * First, patch the trampoline to avoid calling into fexit progs. + * The progs will be freed even if the original function is still + * executing or sleeping. + * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait on + * first few asm instructions to execute and call into + * __bpf_tramp_enter->percpu_ref_get. + * Then use percpu_ref_kill to wait for the trampoline and the original + * function to finish. + * Then use call_rcu_tasks() to make sure few asm insns in + * the trampoline epilogue are done as well. + * + * In !PREEMPT case the task that got interrupted in the first asm + * insns won't go through an RCU quiescent state which the + * percpu_ref_kill will be waiting for. Hence the first + * call_rcu_tasks() is not necessary. + */ + if (im->ip_after_call) { + int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, + NULL, im->ip_epilogue); + WARN_ON(err); + if (IS_ENABLED(CONFIG_PREEMPTION)) + call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); + else + percpu_ref_kill(&im->pcref); + return; + } + + /* The trampoline without fexit and fmod_ret progs doesn't call original + * function and doesn't use percpu_ref. + * Use call_rcu_tasks_trace() to wait for sleepable progs to finish. + * Then use call_rcu_tasks() to wait for the rest of trampoline asm + * and normal progs. + */ + call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks); +} + +static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx) +{ + struct bpf_tramp_image *im; + struct bpf_ksym *ksym; + void *image; + int err = -ENOMEM; + + im = kzalloc(sizeof(*im), GFP_KERNEL); + if (!im) + goto out; + + err = bpf_jit_charge_modmem(1); + if (err) + goto out_free_im; + + err = -ENOMEM; + im->image = image = bpf_jit_alloc_exec_page(); + if (!image) + goto out_uncharge; + + err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL); + if (err) + goto out_free_image; + + ksym = &im->ksym; + INIT_LIST_HEAD_RCU(&ksym->lnode); + snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx); + bpf_image_ksym_add(image, ksym); + return im; + +out_free_image: + bpf_jit_free_exec(im->image); +out_uncharge: + bpf_jit_uncharge_modmem(1); +out_free_im: + kfree(im); +out: + return ERR_PTR(err); +} + static int bpf_trampoline_update(struct bpf_trampoline *tr) { - void *old_image = tr->image + ((tr->selector + 1) & 1) * PAGE_SIZE/2; - void *new_image = tr->image + (tr->selector & 1) * PAGE_SIZE/2; + struct bpf_tramp_image *im; struct bpf_tramp_progs *tprogs; u32 flags = BPF_TRAMP_F_RESTORE_REGS; int err, total; @@ -198,41 +310,42 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr) return PTR_ERR(tprogs); if (total == 0) { - err = unregister_fentry(tr, old_image); + err = unregister_fentry(tr, tr->cur_image->image); + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = NULL; tr->selector = 0; goto out; } + im = bpf_tramp_image_alloc(tr->key, tr->selector); + if (IS_ERR(im)) { + err = PTR_ERR(im); + goto out; + } + if (tprogs[BPF_TRAMP_FEXIT].nr_progs || tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs) flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; - /* Though the second half of trampoline page is unused a task could be - * preempted in the middle of the first half of trampoline and two - * updates to trampoline would change the code from underneath the - * preempted task. Hence wait for tasks to voluntarily schedule or go - * to userspace. - * The same trampoline can hold both sleepable and non-sleepable progs. - * synchronize_rcu_tasks_trace() is needed to make sure all sleepable - * programs finish executing. - * Wait for these two grace periods together. - */ - synchronize_rcu_mult(call_rcu_tasks, call_rcu_tasks_trace); - - err = arch_prepare_bpf_trampoline(new_image, new_image + PAGE_SIZE / 2, + err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE, &tr->func.model, flags, tprogs, tr->func.addr); if (err < 0) goto out; - if (tr->selector) + WARN_ON(tr->cur_image && tr->selector == 0); + WARN_ON(!tr->cur_image && tr->selector); + if (tr->cur_image) /* progs already running at this address */ - err = modify_fentry(tr, old_image, new_image); + err = modify_fentry(tr, tr->cur_image->image, im->image); else /* first time registering */ - err = register_fentry(tr, new_image); + err = register_fentry(tr, im->image); if (err) goto out; + if (tr->cur_image) + bpf_tramp_image_put(tr->cur_image); + tr->cur_image = im; tr->selector++; out: kfree(tprogs); @@ -364,17 +477,12 @@ void bpf_trampoline_put(struct bpf_trampoline *tr) goto out; if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[BPF_TRAMP_FEXIT]))) goto out; - bpf_image_ksym_del(&tr->ksym); - /* This code will be executed when all bpf progs (both sleepable and - * non-sleepable) went through - * bpf_prog_put()->call_rcu[_tasks_trace]()->bpf_prog_free_deferred(). - * Hence no need for another synchronize_rcu_tasks_trace() here, - * but synchronize_rcu_tasks() is still needed, since trampoline - * may not have had any sleepable programs and we need to wait - * for tasks to get out of trampoline code before freeing it. + /* This code will be executed even when the last bpf_tramp_image + * is alive. All progs are detached from the trampoline and the + * trampoline image is patched with jmp into epilogue to skip + * fexit progs. The fentry-only trampoline will be freed via + * multiple rcu callbacks. */ - synchronize_rcu_tasks(); - bpf_jit_free_exec(tr->image); hlist_del(&tr->hlist); kfree(tr); out: @@ -478,8 +586,18 @@ void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) rcu_read_unlock_trace(); } +void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) +{ + percpu_ref_get(&tr->pcref); +} + +void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) +{ + percpu_ref_put(&tr->pcref); +} + int __weak -arch_prepare_bpf_trampoline(void *image, void *image_end, +arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end, const struct btf_func_model *m, u32 flags, struct bpf_tramp_progs *tprogs, void *orig_call) -- cgit v1.2.3 From e4817a1b6b77db538bc0141c3b138f2df803ce87 Mon Sep 17 00:00:00 2001 From: dillon min Date: Wed, 17 Mar 2021 23:45:09 +0800 Subject: ARM: dts: imx6ull: fix ubi filesystem mount failed For NAND Ecc layout, there is a dependency from old kernel's nand driver setting and current. if old kernel use 4 bit ecc , we should use 4 bit in new kernel either. else will run into following error at filesystem mounting. So, enable fsl,use-minimum-ecc from device tree, to fix this mismatch [ 9.449265] ubi0: scanning is finished [ 9.463968] ubi0 warning: ubi_io_read: error -74 (ECC error) while reading 22528 bytes from PEB 513:4096, read only 22528 bytes, retry [ 9.486940] ubi0 warning: ubi_io_read: error -74 (ECC error) while reading 22528 bytes from PEB 513:4096, read only 22528 bytes, retry [ 9.509906] ubi0 warning: ubi_io_read: error -74 (ECC error) while reading 22528 bytes from PEB 513:4096, read only 22528 bytes, retry [ 9.532845] ubi0 error: ubi_io_read: error -74 (ECC error) while reading 22528 bytes from PEB 513:4096, read 22528 bytes Fixes: f9ecf10cb88c ("ARM: dts: imx6ull: add MYiR MYS-6ULX SBC") Signed-off-by: dillon min Reviewed-by: Fabio Estevam Signed-off-by: Shawn Guo --- arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts b/arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts index ecbb2cc5b9ab..79cc45728cd2 100644 --- a/arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts +++ b/arch/arm/boot/dts/imx6ull-myir-mys-6ulx-eval.dts @@ -14,5 +14,6 @@ }; &gpmi { + fsl,use-minimum-ecc; status = "okay"; }; -- cgit v1.2.3 From b9082970478009b778aa9b22d5561eef35b53b63 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 19 Mar 2021 17:00:01 -0700 Subject: bpf: Use NOP_ATOMIC5 instead of emit_nops(&prog, 5) for BPF_TRAMP_F_CALL_ORIG __bpf_arch_text_poke does rewrite only for atomic nop5, emit_nops(xxx, 5) emits non-atomic one which breaks fentry/fexit with k8 atomics: P6_NOP5 == P6_NOP5_ATOMIC (0f1f440000 == 0f1f440000) K8_NOP5 != K8_NOP5_ATOMIC (6666906690 != 6666666690) Can be reproduced by doing "ideal_nops = k8_nops" in "arch_init_ideal_nops() and running fexit_bpf2bpf selftest. Fixes: e21aa341785c ("bpf: Fix fexit trampoline.") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210320000001.915366-1-sdf@google.com --- arch/x86/net/bpf_jit_comp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 72b5a57e9e31..b35fc8023884 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -2012,7 +2012,8 @@ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *i /* remember return value in a stack for bpf prog to access */ emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); im->ip_after_call = prog; - emit_nops(&prog, 5); + memcpy(prog, ideal_nops[NOP_ATOMIC5], X86_PATCH_SIZE); + prog += X86_PATCH_SIZE; } if (fmod_ret->nr_progs) { -- cgit v1.2.3 From c607ab4f916d4d5259072eca34055d3f5a795c21 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 19 Mar 2021 18:41:06 +0000 Subject: arm64: stacktrace: don't trace arch_stack_walk() We recently converted arm64 to use arch_stack_walk() in commit: 5fc57df2f6fd ("arm64: stacktrace: Convert to ARCH_STACKWALK") The core stacktrace code expects that (when tracing the current task) arch_stack_walk() starts a trace at its caller, and does not include itself in the trace. However, arm64's arch_stack_walk() includes itself, and so traces include one more entry than callers expect. The core stacktrace code which calls arch_stack_walk() tries to skip a number of entries to prevent itself appearing in a trace, and the additional entry prevents skipping one of the core stacktrace functions, leaving this in the trace unexpectedly. We can fix this by having arm64's arch_stack_walk() begin the trace with its caller. The first value returned by the trace will be __builtin_return_address(0), i.e. the caller of arch_stack_walk(). The first frame record to be unwound will be __builtin_frame_address(1), i.e. the caller's frame record. To prevent surprises, arch_stack_walk() is also marked noinline. While __builtin_frame_address(1) is not safe in portable code, local GCC developers have confirmed that it is safe on arm64. To find the caller's frame record, the builtin can safely dereference the current function's frame record or (in theory) could stash the original FP into another GPR at function entry time, neither of which are problematic. Prior to this patch, the tracing code would unexpectedly show up in traces of the current task, e.g. | # cat /proc/self/stack | [<0>] stack_trace_save_tsk+0x98/0x100 | [<0>] proc_pid_stack+0xb4/0x130 | [<0>] proc_single_show+0x60/0x110 | [<0>] seq_read_iter+0x230/0x4d0 | [<0>] seq_read+0xdc/0x130 | [<0>] vfs_read+0xac/0x1e0 | [<0>] ksys_read+0x6c/0xfc | [<0>] __arm64_sys_read+0x20/0x30 | [<0>] el0_svc_common.constprop.0+0x60/0x120 | [<0>] do_el0_svc+0x24/0x90 | [<0>] el0_svc+0x2c/0x54 | [<0>] el0_sync_handler+0x1a4/0x1b0 | [<0>] el0_sync+0x170/0x180 After this patch, the tracing code will not show up in such traces: | # cat /proc/self/stack | [<0>] proc_pid_stack+0xb4/0x130 | [<0>] proc_single_show+0x60/0x110 | [<0>] seq_read_iter+0x230/0x4d0 | [<0>] seq_read+0xdc/0x130 | [<0>] vfs_read+0xac/0x1e0 | [<0>] ksys_read+0x6c/0xfc | [<0>] __arm64_sys_read+0x20/0x30 | [<0>] el0_svc_common.constprop.0+0x60/0x120 | [<0>] do_el0_svc+0x24/0x90 | [<0>] el0_svc+0x2c/0x54 | [<0>] el0_sync_handler+0x1a4/0x1b0 | [<0>] el0_sync+0x170/0x180 Erring on the side of caution, I've given this a spin with a bunch of toolchains, verifying the output of /proc/self/stack and checking that the assembly looked sound. For GCC (where we require version 5.1.0 or later) I tested with the kernel.org crosstool binares for versions 5.5.0, 6.4.0, 6.5.0, 7.3.0, 7.5.0, 8.1.0, 8.3.0, 8.4.0, 9.2.0, and 10.1.0. For clang (where we require version 10.0.1 or later) I tested with the llvm.org binary releases of 11.0.0, and 11.0.1. Fixes: 5fc57df2f6fd ("arm64: stacktrace: Convert to ARCH_STACKWALK") Signed-off-by: Mark Rutland Cc: Catalin Marinas Cc: Chen Jun Cc: Marco Elver Cc: Mark Brown Cc: Will Deacon Cc: # 5.10.x Reviewed-by: Catalin Marinas Reviewed-by: Mark Brown Link: https://lore.kernel.org/r/20210319184106.5688-1-mark.rutland@arm.com Signed-off-by: Will Deacon --- arch/arm64/kernel/stacktrace.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index ad20981dfda4..d55bdfb7789c 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -194,8 +194,9 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl) #ifdef CONFIG_STACKTRACE -void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, - struct task_struct *task, struct pt_regs *regs) +noinline void arch_stack_walk(stack_trace_consume_fn consume_entry, + void *cookie, struct task_struct *task, + struct pt_regs *regs) { struct stackframe frame; @@ -203,8 +204,8 @@ void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie, start_backtrace(&frame, regs->regs[29], regs->pc); else if (task == current) start_backtrace(&frame, - (unsigned long)__builtin_frame_address(0), - (unsigned long)arch_stack_walk); + (unsigned long)__builtin_frame_address(1), + (unsigned long)__builtin_return_address(0)); else start_backtrace(&frame, thread_saved_fp(task), thread_saved_pc(task)); -- cgit v1.2.3 From d1296f1265f7ebb66c2bfab387bc1a0f969a5968 Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Sat, 20 Mar 2021 03:58:48 +0530 Subject: arm64: cpuinfo: Fix a typo s/acurate/accurate/ Signed-off-by: Bhaskar Chowdhury Acked-by: Randy Dunlap Link: https://lore.kernel.org/r/20210319222848.29928-1-unixbhaskar@gmail.com Signed-off-by: Will Deacon --- arch/arm64/kernel/cpuinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index 77605aec25fe..51fcf99d5351 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -353,7 +353,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info) * with the CLIDR_EL1 fields to avoid triggering false warnings * when there is a mismatch across the CPUs. Keep track of the * effective value of the CTR_EL0 in our internal records for - * acurate sanity check and feature enablement. + * accurate sanity check and feature enablement. */ info->reg_ctr = read_cpuid_effective_cachetype(); info->reg_dczid = read_cpuid(DCZID_EL0); -- cgit v1.2.3 From 141f8202cfa4192c3af79b6cbd68e7760bb01b5a Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 19 Mar 2021 16:50:54 -0400 Subject: arm64: kdump: update ppos when reading elfcorehdr The ppos points to a position in the old kernel memory (and in case of arm64 in the crash kernel since elfcorehdr is passed as a segment). The function should update the ppos by the amount that was read. This bug is not exposed by accident, but other platforms update this value properly. So, fix it in ARM64 version of elfcorehdr_read() as well. Signed-off-by: Pavel Tatashin Fixes: e62aaeac426a ("arm64: kdump: provide /proc/vmcore file") Reviewed-by: Tyler Hicks Link: https://lore.kernel.org/r/20210319205054.743368-1-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/kernel/crash_dump.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kernel/crash_dump.c b/arch/arm64/kernel/crash_dump.c index e6e284265f19..58303a9ec32c 100644 --- a/arch/arm64/kernel/crash_dump.c +++ b/arch/arm64/kernel/crash_dump.c @@ -64,5 +64,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) { memcpy(buf, phys_to_virt((phys_addr_t)*ppos), count); + *ppos += count; + return count; } -- cgit v1.2.3 From ee7febce051945be28ad86d16a15886f878204de Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 16 Feb 2021 10:03:51 -0500 Subject: arm64: mm: correct the inside linear map range during hotplug check Memory hotplug may fail on systems with CONFIG_RANDOMIZE_BASE because the linear map range is not checked correctly. The start physical address that linear map covers can be actually at the end of the range because of randomization. Check that and if so reduce it to 0. This can be verified on QEMU with setting kaslr-seed to ~0ul: memstart_offset_seed = 0xffff START: __pa(_PAGE_OFFSET(vabits_actual)) = ffff9000c0000000 END: __pa(PAGE_END - 1) = 1000bfffffff Signed-off-by: Pavel Tatashin Fixes: 58284a901b42 ("arm64/mm: Validate hotplug range before creating linear mapping") Tested-by: Tyler Hicks Reviewed-by: Anshuman Khandual Link: https://lore.kernel.org/r/20210216150351.129018-2-pasha.tatashin@soleen.com Signed-off-by: Will Deacon --- arch/arm64/mm/mmu.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 7484ea4f6ba0..5d9550fdb9cf 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -1448,6 +1448,22 @@ static void __remove_pgd_mapping(pgd_t *pgdir, unsigned long start, u64 size) struct range arch_get_mappable_range(void) { struct range mhp_range; + u64 start_linear_pa = __pa(_PAGE_OFFSET(vabits_actual)); + u64 end_linear_pa = __pa(PAGE_END - 1); + + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + /* + * Check for a wrap, it is possible because of randomized linear + * mapping the start physical address is actually bigger than + * the end physical address. In this case set start to zero + * because [0, end_linear_pa] range must still be able to cover + * all addressable physical addresses. + */ + if (start_linear_pa > end_linear_pa) + start_linear_pa = 0; + } + + WARN_ON(start_linear_pa > end_linear_pa); /* * Linear mapping region is the range [PAGE_OFFSET..(PAGE_END - 1)] @@ -1455,8 +1471,9 @@ struct range arch_get_mappable_range(void) * range which can be mapped inside this linear mapping range, must * also be derived from its end points. */ - mhp_range.start = __pa(_PAGE_OFFSET(vabits_actual)); - mhp_range.end = __pa(PAGE_END - 1); + mhp_range.start = start_linear_pa; + mhp_range.end = end_linear_pa; + return mhp_range; } -- cgit v1.2.3 From e834df6cfc71d8e5ce2c27a0184145ea125c3f0f Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 15 Mar 2021 03:00:44 -0500 Subject: powerpc/pseries/mobility: use struct for shared state The atomic_t counter is the only shared state for the join/suspend sequence so far, but that will change. Contain it in a struct (pseries_suspend_info), and document its intended use. No functional change. Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210315080045.460331-2-nathanl@linux.ibm.com --- arch/powerpc/platforms/pseries/mobility.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index ea4d6a660e0d..a6739ce9feac 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -452,9 +452,21 @@ static int do_suspend(void) return ret; } +/** + * struct pseries_suspend_info - State shared between CPUs for join/suspend. + * @counter: Threads are to increment this upon resuming from suspend + * or if an error is received from H_JOIN. The thread which performs + * the first increment (i.e. sets it to 1) is responsible for + * waking the other threads. + */ +struct pseries_suspend_info { + atomic_t counter; +}; + static int do_join(void *arg) { - atomic_t *counter = arg; + struct pseries_suspend_info *info = arg; + atomic_t *counter = &info->counter; long hvrc; int ret; @@ -535,11 +547,15 @@ static int pseries_suspend(u64 handle) int ret; while (true) { - atomic_t counter = ATOMIC_INIT(0); + struct pseries_suspend_info info; unsigned long vasi_state; int vasi_err; - ret = stop_machine(do_join, &counter, cpu_online_mask); + info = (struct pseries_suspend_info) { + .counter = ATOMIC_INIT(0), + }; + + ret = stop_machine(do_join, &info, cpu_online_mask); if (ret == 0) break; /* -- cgit v1.2.3 From 274cb1ca2e7ce02cab56f5f4c61a74aeb566f931 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Mon, 15 Mar 2021 03:00:45 -0500 Subject: powerpc/pseries/mobility: handle premature return from H_JOIN The pseries join/suspend sequence in its current form was written with the assumption that it was the only user of H_PROD and that it needn't handle spurious successful returns from H_JOIN. That's wrong; powerpc's paravirt spinlock code uses H_PROD, and CPUs entering do_join() can be woken prematurely from H_JOIN with a status of H_SUCCESS as a result. This causes all CPUs to exit the sequence early, preventing suspend from occurring at all. Add a 'done' boolean flag to the pseries_suspend_info struct, and have the waking thread set it before waking the other threads. Threads which receive H_SUCCESS from H_JOIN retry if the 'done' flag is still unset. Fixes: 9327dc0aeef3 ("powerpc/pseries/mobility: use stop_machine for join/suspend") Signed-off-by: Nathan Lynch Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210315080045.460331-3-nathanl@linux.ibm.com --- arch/powerpc/platforms/pseries/mobility.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index a6739ce9feac..e83e0891272d 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -458,9 +458,12 @@ static int do_suspend(void) * or if an error is received from H_JOIN. The thread which performs * the first increment (i.e. sets it to 1) is responsible for * waking the other threads. + * @done: False if join/suspend is in progress. True if the operation is + * complete (successful or not). */ struct pseries_suspend_info { atomic_t counter; + bool done; }; static int do_join(void *arg) @@ -470,6 +473,7 @@ static int do_join(void *arg) long hvrc; int ret; +retry: /* Must ensure MSR.EE off for H_JOIN. */ hard_irq_disable(); hvrc = plpar_hcall_norets(H_JOIN); @@ -485,8 +489,20 @@ static int do_join(void *arg) case H_SUCCESS: /* * The suspend is complete and this cpu has received a - * prod. + * prod, or we've received a stray prod from unrelated + * code (e.g. paravirt spinlocks) and we need to join + * again. + * + * This barrier orders the return from H_JOIN above vs + * the load of info->done. It pairs with the barrier + * in the wakeup/prod path below. */ + smp_mb(); + if (READ_ONCE(info->done) == false) { + pr_info_ratelimited("premature return from H_JOIN on CPU %i, retrying", + smp_processor_id()); + goto retry; + } ret = 0; break; case H_BAD_MODE: @@ -500,6 +516,13 @@ static int do_join(void *arg) if (atomic_inc_return(counter) == 1) { pr_info("CPU %u waking all threads\n", smp_processor_id()); + WRITE_ONCE(info->done, true); + /* + * This barrier orders the store to info->done vs subsequent + * H_PRODs to wake the other CPUs. It pairs with the barrier + * in the H_SUCCESS case above. + */ + smp_mb(); prod_others(); } /* @@ -553,6 +576,7 @@ static int pseries_suspend(u64 handle) info = (struct pseries_suspend_info) { .counter = ATOMIC_INIT(0), + .done = false, }; ret = stop_machine(do_join, &info, cpu_online_mask); -- cgit v1.2.3 From 8249d17d3194eac064a8ca5bc5ca0abc86feecde Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Thu, 18 Mar 2021 13:26:57 -0700 Subject: x86/mem_encrypt: Correct physical address calculation in __set_clr_pte_enc() The pfn variable contains the page frame number as returned by the pXX_pfn() functions, shifted to the right by PAGE_SHIFT to remove the page bits. After page protection computations are done to it, it gets shifted back to the physical address using page_level_shift(). That is wrong, of course, because that function determines the shift length based on the level of the page in the page table but in all the cases, it was shifted by PAGE_SHIFT before. Therefore, shift it back using PAGE_SHIFT to get the correct physical address. [ bp: Rewrite commit message. ] Fixes: dfaaec9033b8 ("x86: Add support for changing memory encryption attribute in early boot") Signed-off-by: Isaku Yamahata Signed-off-by: Borislav Petkov Reviewed-by: Kirill A. Shutemov Reviewed-by: Tom Lendacky Cc: Link: https://lkml.kernel.org/r/81abbae1657053eccc535c16151f63cd049dcb97.1616098294.git.isaku.yamahata@intel.com --- arch/x86/mm/mem_encrypt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 4b01f7dbaf30..ae78cef79980 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -262,7 +262,7 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) if (pgprot_val(old_prot) == pgprot_val(new_prot)) return; - pa = pfn << page_level_shift(level); + pa = pfn << PAGE_SHIFT; size = page_level_size(level); /* -- cgit v1.2.3 From 9fcb51c14da2953de585c5c6e50697b8a6e91a7b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 23 Mar 2021 13:48:36 +0100 Subject: x86/build: Turn off -fcf-protection for realmode targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new Ubuntu GCC packages turn on -fcf-protection globally, which causes a build failure in the x86 realmode code: cc1: error: ‘-fcf-protection’ is not compatible with this target Turn it off explicitly on compilers that understand this option. Signed-off-by: Arnd Bergmann Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20210323124846.1584944-1-arnd@kernel.org --- arch/x86/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 2d6d5a28c3bf..9a85eae37b17 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -27,7 +27,7 @@ endif REALMODE_CFLAGS := -m16 -g -Os -DDISABLE_BRANCH_PROFILING \ -Wall -Wstrict-prototypes -march=i386 -mregparm=3 \ -fno-strict-aliasing -fomit-frame-pointer -fno-pic \ - -mno-mmx -mno-sse + -mno-mmx -mno-sse $(call cc-option,-fcf-protection=none) REALMODE_CFLAGS += -ffreestanding REALMODE_CFLAGS += -fno-stack-protector -- cgit v1.2.3 From 1d676673d665fd2162e7e466dcfbe5373bfdb73e Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 23 Mar 2021 12:06:29 +0000 Subject: KVM: arm64: Hide system instruction access to Trace registers Currently we advertise the ID_AA6DFR0_EL1.TRACEVER for the guest, when the trace register accesses are trapped (CPTR_EL2.TTA == 1). So, the guest will get an undefined instruction, if trusts the ID registers and access one of the trace registers. Lets be nice to the guest and hide the feature to avoid unexpected behavior. Even though this can be done at KVM sysreg emulation layer, we do this by removing the TRACEVER from the sanitised feature register field. This is fine as long as the ETM drivers can handle the individual trace units separately, even when there are differences among the CPUs. Cc: Will Deacon Cc: Catalin Marinas Cc: Mark Rutland Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210323120647.454211-2-suzuki.poulose@arm.com --- arch/arm64/kernel/cpufeature.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 066030717a4c..a4698f09bf32 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -383,7 +383,6 @@ static const struct arm64_ftr_bits ftr_id_aa64dfr0[] = { * of support. */ S_ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_EXACT, ID_AA64DFR0_PMUVER_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_TRACEVER_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64DFR0_DEBUGVER_SHIFT, 4, 0x6), ARM64_FTR_END, }; -- cgit v1.2.3 From a354a64d91eec3e0f8ef0eed575b480fd75b999c Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 23 Mar 2021 12:06:30 +0000 Subject: KVM: arm64: Disable guest access to trace filter controls Disable guest access to the Trace Filter control registers. We do not advertise the Trace filter feature to the guest (ID_AA64DFR0_EL1: TRACE_FILT is cleared) already, but the guest can still access the TRFCR_EL1 unless we trap it. This will also make sure that the guest cannot fiddle with the filtering controls set by a nvhe host. Cc: Marc Zyngier Cc: Will Deacon Cc: Mark Rutland Cc: Catalin Marinas Signed-off-by: Suzuki K Poulose Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20210323120647.454211-3-suzuki.poulose@arm.com --- arch/arm64/include/asm/kvm_arm.h | 1 + arch/arm64/kvm/debug.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index 4e90c2debf70..94d4025acc0b 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -278,6 +278,7 @@ #define CPTR_EL2_DEFAULT CPTR_EL2_RES1 /* Hyp Debug Configuration Register bits */ +#define MDCR_EL2_TTRF (1 << 19) #define MDCR_EL2_TPMS (1 << 14) #define MDCR_EL2_E2PB_MASK (UL(0x3)) #define MDCR_EL2_E2PB_SHIFT (UL(12)) diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index 7a7e425616b5..dbc890511631 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -89,6 +89,7 @@ void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) * - Debug ROM Address (MDCR_EL2_TDRA) * - OS related registers (MDCR_EL2_TDOSA) * - Statistical profiler (MDCR_EL2_TPMS/MDCR_EL2_E2PB) + * - Self-hosted Trace Filter controls (MDCR_EL2_TTRF) * * Additionally, KVM only traps guest accesses to the debug registers if * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY @@ -112,6 +113,7 @@ void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK; vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM | MDCR_EL2_TPMS | + MDCR_EL2_TTRF | MDCR_EL2_TPMCR | MDCR_EL2_TDRA | MDCR_EL2_TDOSA); -- cgit v1.2.3 From af22df997d71c32304d6835a8b690281063b8010 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 23 Mar 2021 16:08:18 +0000 Subject: KVM: arm64: Fix CPU interface MMIO compatibility detection In order to detect whether a GICv3 CPU interface is MMIO capable, we switch ICC_SRE_EL1.SRE to 0 and check whether it sticks. However, this is only possible if *ALL* of the HCR_EL2 interrupt overrides are set, and the CPU is perfectly allowed to ignore the write to ICC_SRE_EL1 otherwise. This leads KVM to pretend that a whole bunch of ARMv8.0 CPUs aren't MMIO-capable, and breaks VMs that should work correctly otherwise. Fix this by setting IMO/FMO/IMO before touching ICC_SRE_EL1, and clear them afterwards. This allows us to reliably detect the CPU interface capabilities. Tested-by: Shameerali Kolothum Thodi Fixes: 9739f6ef053f ("KVM: arm64: Workaround firmware wrongly advertising GICv2-on-v3 compatibility") Signed-off-by: Marc Zyngier --- arch/arm64/kvm/hyp/vgic-v3-sr.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c index ee3682b9873c..39f8f7f9227c 100644 --- a/arch/arm64/kvm/hyp/vgic-v3-sr.c +++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c @@ -429,6 +429,13 @@ u64 __vgic_v3_get_gic_config(void) if (has_vhe()) flags = local_daif_save(); + /* + * Table 11-2 "Permitted ICC_SRE_ELx.SRE settings" indicates + * that to be able to set ICC_SRE_EL1.SRE to 0, all the + * interrupt overrides must be set. You've got to love this. + */ + sysreg_clear_set(hcr_el2, 0, HCR_AMO | HCR_FMO | HCR_IMO); + isb(); write_gicreg(0, ICC_SRE_EL1); isb(); @@ -436,6 +443,8 @@ u64 __vgic_v3_get_gic_config(void) write_gicreg(sre, ICC_SRE_EL1); isb(); + sysreg_clear_set(hcr_el2, HCR_AMO | HCR_FMO | HCR_IMO, 0); + isb(); if (has_vhe()) local_daif_restore(flags); -- cgit v1.2.3 From 2b514ec72706a31bea0c3b97e622b81535b5323a Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Wed, 24 Mar 2021 13:24:23 +0100 Subject: xen/x86: make XEN_BALLOON_MEMORY_HOTPLUG_LIMIT depend on MEMORY_HOTPLUG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Xen memory hotplug limit should depend on the memory hotplug generic option, rather than the Xen balloon configuration. It's possible to have a kernel with generic memory hotplug enabled, but without Xen balloon enabled, at which point memory hotplug won't work correctly due to the size limitation of the p2m. Rename the option to XEN_MEMORY_HOTPLUG_LIMIT since it's no longer tied to ballooning. Fixes: 9e2369c06c8a18 ("xen: add helpers to allocate unpopulated memory") Signed-off-by: Roger Pau Monné Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210324122424.58685-2-roger.pau@citrix.com Signed-off-by: Boris Ostrovsky --- arch/x86/xen/p2m.c | 4 ++-- drivers/xen/Kconfig | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index 17d80f751fcb..a33902d05e45 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -98,8 +98,8 @@ EXPORT_SYMBOL_GPL(xen_p2m_size); unsigned long xen_max_p2m_pfn __read_mostly; EXPORT_SYMBOL_GPL(xen_max_p2m_pfn); -#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT -#define P2M_LIMIT CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT +#ifdef CONFIG_XEN_MEMORY_HOTPLUG_LIMIT +#define P2M_LIMIT CONFIG_XEN_MEMORY_HOTPLUG_LIMIT #else #define P2M_LIMIT 0 #endif diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 41645fe6ad48..ea0efd290c37 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -50,11 +50,11 @@ config XEN_BALLOON_MEMORY_HOTPLUG SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'" -config XEN_BALLOON_MEMORY_HOTPLUG_LIMIT +config XEN_MEMORY_HOTPLUG_LIMIT int "Hotplugged memory limit (in GiB) for a PV guest" default 512 depends on XEN_HAVE_PVMMU - depends on XEN_BALLOON_MEMORY_HOTPLUG + depends on MEMORY_HOTPLUG help Maxmium amount of memory (in GiB) that a PV guest can be expanded to when using memory hotplug. -- cgit v1.2.3 From af44a387e743ab7aa39d3fb5e29c0a973cf91bdc Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Wed, 24 Mar 2021 13:24:24 +0100 Subject: Revert "xen: fix p2m size in dom0 for disabled memory hotplug case" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This partially reverts commit 882213990d32 ("xen: fix p2m size in dom0 for disabled memory hotplug case") There's no need to special case XEN_UNPOPULATED_ALLOC anymore in order to correctly size the p2m. The generic memory hotplug option has already been tied together with the Xen hotplug limit, so enabling memory hotplug should already trigger a properly sized p2m on Xen PV. Note that XEN_UNPOPULATED_ALLOC depends on ZONE_DEVICE which pulls in MEMORY_HOTPLUG. Leave the check added to __set_phys_to_machine and the adjusted comment about EXTRA_MEM_RATIO. Signed-off-by: Roger Pau Monné Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210324122424.58685-3-roger.pau@citrix.com [boris: fixed formatting issues] Signed-off-by: Boris Ostrovsky --- arch/x86/include/asm/xen/page.h | 12 ------------ arch/x86/xen/p2m.c | 3 --- arch/x86/xen/setup.c | 16 ++++++++++++++-- 3 files changed, 14 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 7068e4bb057d..1a162e559753 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -86,18 +86,6 @@ clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, } #endif -/* - * The maximum amount of extra memory compared to the base size. The - * main scaling factor is the size of struct page. At extreme ratios - * of base:extra, all the base memory can be filled with page - * structures for the extra memory, leaving no space for anything - * else. - * - * 10x seems like a reasonable balance between scaling flexibility and - * leaving a practically usable system. - */ -#define XEN_EXTRA_MEM_RATIO (10) - /* * Helper functions to write or read unsigned long values to/from * memory, when the access may fault. diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index a33902d05e45..ac06ca32e9ef 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -416,9 +416,6 @@ void __init xen_vmalloc_p2m_tree(void) xen_p2m_last_pfn = xen_max_p2m_pfn; p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE; - if (!p2m_limit && IS_ENABLED(CONFIG_XEN_UNPOPULATED_ALLOC)) - p2m_limit = xen_start_info->nr_pages * XEN_EXTRA_MEM_RATIO; - vm.flags = VM_ALLOC; vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit), PMD_SIZE * PMDS_PER_MID_PAGE); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1a3b75652fa4..8bfc10330107 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -59,6 +59,18 @@ static struct { } xen_remap_buf __initdata __aligned(PAGE_SIZE); static unsigned long xen_remap_mfn __initdata = INVALID_P2M_ENTRY; +/* + * The maximum amount of extra memory compared to the base size. The + * main scaling factor is the size of struct page. At extreme ratios + * of base:extra, all the base memory can be filled with page + * structures for the extra memory, leaving no space for anything + * else. + * + * 10x seems like a reasonable balance between scaling flexibility and + * leaving a practically usable system. + */ +#define EXTRA_MEM_RATIO (10) + static bool xen_512gb_limit __initdata = IS_ENABLED(CONFIG_XEN_512GB); static void __init xen_parse_512gb(void) @@ -778,13 +790,13 @@ char * __init xen_memory_setup(void) extra_pages += max_pages - max_pfn; /* - * Clamp the amount of extra memory to a XEN_EXTRA_MEM_RATIO + * Clamp the amount of extra memory to a EXTRA_MEM_RATIO * factor the base size. * * Make sure we have no memory above max_pages, as this area * isn't handled by the p2m management. */ - extra_pages = min3(XEN_EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), + extra_pages = min3(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)), extra_pages, max_pages - max_pfn); i = 0; addr = xen_e820_table.entries[0].addr; -- cgit v1.2.3 From baa96377bc7b5aa7b8cf038db09cb99642321490 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Wed, 24 Mar 2021 12:24:58 +0530 Subject: arm64/process.c: fix Wmissing-prototypes build warnings Fix GCC warnings reported when building with "-Wmissing-prototypes": arch/arm64/kernel/process.c:261:6: warning: no previous prototype for '__show_regs' [-Wmissing-prototypes] 261 | void __show_regs(struct pt_regs *regs) | ^~~~~~~~~~~ arch/arm64/kernel/process.c:307:6: warning: no previous prototype for '__show_regs_alloc_free' [-Wmissing-prototypes] 307 | void __show_regs_alloc_free(struct pt_regs *regs) | ^~~~~~~~~~~~~~~~~~~~~~ arch/arm64/kernel/process.c:365:5: warning: no previous prototype for 'arch_dup_task_struct' [-Wmissing-prototypes] 365 | int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) | ^~~~~~~~~~~~~~~~~~~~ arch/arm64/kernel/process.c:546:41: warning: no previous prototype for '__switch_to' [-Wmissing-prototypes] 546 | __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, | ^~~~~~~~~~~ arch/arm64/kernel/process.c:710:25: warning: no previous prototype for 'arm64_preempt_schedule_irq' [-Wmissing-prototypes] 710 | asmlinkage void __sched arm64_preempt_schedule_irq(void) | ^~~~~~~~~~~~~~~~~~~~~~~~~~ Link: https://lore.kernel.org/lkml/202103192250.AennsfXM-lkp@intel.com Reported-by: kernel test robot Signed-off-by: Maninder Singh Link: https://lore.kernel.org/r/1616568899-986-1-git-send-email-maninder1.s@samsung.com Signed-off-by: Will Deacon --- arch/arm64/include/asm/processor.h | 2 ++ arch/arm64/include/asm/thread_info.h | 2 ++ arch/arm64/kernel/process.c | 2 ++ 3 files changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index ca2cd75d3286..efc10e9041a0 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -251,6 +251,8 @@ unsigned long get_wchan(struct task_struct *p); extern struct task_struct *cpu_switch_to(struct task_struct *prev, struct task_struct *next); +asmlinkage void arm64_preempt_schedule_irq(void); + #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 9f4e3b266f21..6623c99f0984 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -55,6 +55,8 @@ void arch_setup_new_exec(void); #define arch_setup_new_exec arch_setup_new_exec void arch_release_task_struct(struct task_struct *tsk); +int arch_dup_task_struct(struct task_struct *dst, + struct task_struct *src); #endif diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 325c83b1a24d..6e60aa3b5ea9 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -57,6 +57,8 @@ #include #include #include +#include +#include #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK) #include -- cgit v1.2.3 From 20109a859a9b514eb10c22b8a14b5704ffe93897 Mon Sep 17 00:00:00 2001 From: Rich Wiley Date: Tue, 23 Mar 2021 17:28:09 -0700 Subject: arm64: kernel: disable CNP on Carmel On NVIDIA Carmel cores, CNP behaves differently than it does on standard ARM cores. On Carmel, if two cores have CNP enabled and share an L2 TLB entry created by core0 for a specific ASID, a non-shareable TLBI from core1 may still see the shared entry. On standard ARM cores, that TLBI will invalidate the shared entry as well. This causes issues with patchsets that attempt to do local TLBIs based on cpumasks instead of broadcast TLBIs. Avoid these issues by disabling CNP support for NVIDIA Carmel cores. Signed-off-by: Rich Wiley Link: https://lore.kernel.org/r/20210324002809.30271-1-rwiley@nvidia.com [will: Fix pre-existing whitespace issue] Signed-off-by: Will Deacon --- Documentation/arm64/silicon-errata.rst | 3 +++ arch/arm64/Kconfig | 10 ++++++++++ arch/arm64/include/asm/cpucaps.h | 3 ++- arch/arm64/kernel/cpu_errata.c | 8 ++++++++ arch/arm64/kernel/cpufeature.c | 5 ++++- 5 files changed, 27 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/Documentation/arm64/silicon-errata.rst b/Documentation/arm64/silicon-errata.rst index 719510247292..d410a47ffa57 100644 --- a/Documentation/arm64/silicon-errata.rst +++ b/Documentation/arm64/silicon-errata.rst @@ -130,6 +130,9 @@ stable kernels. | Marvell | ARM-MMU-500 | #582743 | N/A | +----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ +| NVIDIA | Carmel Core | N/A | NVIDIA_CARMEL_CNP_ERRATUM | ++----------------+-----------------+-----------------+-----------------------------+ ++----------------+-----------------+-----------------+-----------------------------+ | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | +----------------+-----------------+-----------------+-----------------------------+ +----------------+-----------------+-----------------+-----------------------------+ diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 5656e7aacd69..e4e1b6550115 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -810,6 +810,16 @@ config QCOM_FALKOR_ERRATUM_E1041 If unsure, say Y. +config NVIDIA_CARMEL_CNP_ERRATUM + bool "NVIDIA Carmel CNP: CNP on Carmel semantically different than ARM cores" + default y + help + If CNP is enabled on Carmel cores, non-sharable TLBIs on a core will not + invalidate shared TLB entries installed by a different core, as it would + on standard ARM cores. + + If unsure, say Y. + config SOCIONEXT_SYNQUACER_PREITS bool "Socionext Synquacer: Workaround for GICv3 pre-ITS" default y diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index b77d997b173b..c40f2490cd7b 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -66,7 +66,8 @@ #define ARM64_WORKAROUND_1508412 58 #define ARM64_HAS_LDAPR 59 #define ARM64_KVM_PROTECTED_MODE 60 +#define ARM64_WORKAROUND_NVIDIA_CARMEL_CNP 61 -#define ARM64_NCAPS 61 +#define ARM64_NCAPS 62 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 506a1cd37973..e2c20c036442 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -525,6 +525,14 @@ const struct arm64_cpu_capabilities arm64_errata[] = { 0, 0, 1, 0), }, +#endif +#ifdef CONFIG_NVIDIA_CARMEL_CNP_ERRATUM + { + /* NVIDIA Carmel */ + .desc = "NVIDIA Carmel CNP erratum", + .capability = ARM64_WORKAROUND_NVIDIA_CARMEL_CNP, + ERRATA_MIDR_ALL_VERSIONS(MIDR_NVIDIA_CARMEL), + }, #endif { } diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 066030717a4c..2a5d9854d664 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -1321,7 +1321,10 @@ has_useable_cnp(const struct arm64_cpu_capabilities *entry, int scope) * may share TLB entries with a CPU stuck in the crashed * kernel. */ - if (is_kdump_kernel()) + if (is_kdump_kernel()) + return false; + + if (cpus_have_const_cap(ARM64_WORKAROUND_NVIDIA_CARMEL_CNP)) return false; return has_cpuid_feature(entry, scope); -- cgit v1.2.3 From f2a419cf495f95cac49ea289318b833477e1a0e2 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Wed, 24 Mar 2021 21:37:38 -0700 Subject: ia64: mca: allocate early mca with GFP_ATOMIC The sleep warning happens at early boot right at secondary CPU activation bootup: smp: Bringing up secondary CPUs ... BUG: sleeping function called from invalid context at mm/page_alloc.c:4942 in_atomic(): 0, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/1 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.12.0-rc2-00007-g79e228d0b611-dirty #99 .. Call Trace: show_stack+0x90/0xc0 dump_stack+0x150/0x1c0 ___might_sleep+0x1c0/0x2a0 __might_sleep+0xa0/0x160 __alloc_pages_nodemask+0x1a0/0x600 alloc_page_interleave+0x30/0x1c0 alloc_pages_current+0x2c0/0x340 __get_free_pages+0x30/0xa0 ia64_mca_cpu_init+0x2d0/0x3a0 cpu_init+0x8b0/0x1440 start_secondary+0x60/0x700 start_ap+0x750/0x780 Fixed BSP b0 value from CPU 1 As I understand interrupts are not enabled yet and system has a lot of memory. There is little chance to sleep and switch to GFP_ATOMIC should be a no-op. Link: https://lkml.kernel.org/r/20210315085045.204414-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/mca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index d4cae2fc69ca..adf6521525f4 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1824,7 +1824,7 @@ ia64_mca_cpu_init(void *cpu_data) data = mca_bootmem(); first_time = 0; } else - data = (void *)__get_free_pages(GFP_KERNEL, + data = (void *)__get_free_pages(GFP_ATOMIC, get_order(sz)); if (!data) panic("Could not allocate MCA memory for cpu %d\n", -- cgit v1.2.3 From 95d44a470a6814207d52dd6312203b0f4ef12710 Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Wed, 24 Mar 2021 21:37:41 -0700 Subject: ia64: fix format strings for err_inject Fix warning with %lx / u64 mismatch: arch/ia64/kernel/err_inject.c: In function 'show_resources': arch/ia64/kernel/err_inject.c:62:22: warning: format '%lx' expects argument of type 'long unsigned int', but argument 3 has type 'u64' {aka 'long long unsigned int'} 62 | return sprintf(buf, "%lx", name[cpu]); \ | ^~~~~~~ Link: https://lkml.kernel.org/r/20210313104312.1548232-1-slyfox@gentoo.org Signed-off-by: Sergei Trofimovich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/err_inject.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c index 8b5b8e6bc9d9..dd5bfed52031 100644 --- a/arch/ia64/kernel/err_inject.c +++ b/arch/ia64/kernel/err_inject.c @@ -59,7 +59,7 @@ show_##name(struct device *dev, struct device_attribute *attr, \ char *buf) \ { \ u32 cpu=dev->id; \ - return sprintf(buf, "%lx\n", name[cpu]); \ + return sprintf(buf, "%llx\n", name[cpu]); \ } #define store(name) \ @@ -86,9 +86,9 @@ store_call_start(struct device *dev, struct device_attribute *attr, #ifdef ERR_INJ_DEBUG printk(KERN_DEBUG "pal_mc_err_inject for cpu%d:\n", cpu); - printk(KERN_DEBUG "err_type_info=%lx,\n", err_type_info[cpu]); - printk(KERN_DEBUG "err_struct_info=%lx,\n", err_struct_info[cpu]); - printk(KERN_DEBUG "err_data_buffer=%lx, %lx, %lx.\n", + printk(KERN_DEBUG "err_type_info=%llx,\n", err_type_info[cpu]); + printk(KERN_DEBUG "err_struct_info=%llx,\n", err_struct_info[cpu]); + printk(KERN_DEBUG "err_data_buffer=%llx, %llx, %llx.\n", err_data_buffer[cpu].data1, err_data_buffer[cpu].data2, err_data_buffer[cpu].data3); @@ -117,8 +117,8 @@ store_call_start(struct device *dev, struct device_attribute *attr, #ifdef ERR_INJ_DEBUG printk(KERN_DEBUG "Returns: status=%d,\n", (int)status[cpu]); - printk(KERN_DEBUG "capabilities=%lx,\n", capabilities[cpu]); - printk(KERN_DEBUG "resources=%lx\n", resources[cpu]); + printk(KERN_DEBUG "capabilities=%llx,\n", capabilities[cpu]); + printk(KERN_DEBUG "resources=%llx\n", resources[cpu]); #endif return size; } @@ -131,7 +131,7 @@ show_virtual_to_phys(struct device *dev, struct device_attribute *attr, char *buf) { unsigned int cpu=dev->id; - return sprintf(buf, "%lx\n", phys_addr[cpu]); + return sprintf(buf, "%llx\n", phys_addr[cpu]); } static ssize_t @@ -145,7 +145,7 @@ store_virtual_to_phys(struct device *dev, struct device_attribute *attr, ret = get_user_pages_fast(virt_addr, 1, FOLL_WRITE, NULL); if (ret<=0) { #ifdef ERR_INJ_DEBUG - printk("Virtual address %lx is not existing.\n",virt_addr); + printk("Virtual address %llx is not existing.\n", virt_addr); #endif return -EINVAL; } @@ -163,7 +163,7 @@ show_err_data_buffer(struct device *dev, { unsigned int cpu=dev->id; - return sprintf(buf, "%lx, %lx, %lx\n", + return sprintf(buf, "%llx, %llx, %llx\n", err_data_buffer[cpu].data1, err_data_buffer[cpu].data2, err_data_buffer[cpu].data3); @@ -178,13 +178,13 @@ store_err_data_buffer(struct device *dev, int ret; #ifdef ERR_INJ_DEBUG - printk("write err_data_buffer=[%lx,%lx,%lx] on cpu%d\n", + printk("write err_data_buffer=[%llx,%llx,%llx] on cpu%d\n", err_data_buffer[cpu].data1, err_data_buffer[cpu].data2, err_data_buffer[cpu].data3, cpu); #endif - ret=sscanf(buf, "%lx, %lx, %lx", + ret = sscanf(buf, "%llx, %llx, %llx", &err_data_buffer[cpu].data1, &err_data_buffer[cpu].data2, &err_data_buffer[cpu].data3); -- cgit v1.2.3 From 72bbc226ed2ef0a46c165a482861fff00dd6d4e1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 23 Mar 2021 21:40:11 +0100 Subject: s390/vdso: copy tod_steering_delta value to vdso_data page When converting the vdso assembler code to C it was forgotten to actually copy the tod_steering_delta value to vdso_data page. Which in turn means that tod clock steering will not work correctly. Fix this by simply copying the value whenever it is updated. Fixes: 4bff8cb54502 ("s390: convert to GENERIC_VDSO") Cc: # 5.10 Signed-off-by: Heiko Carstens --- arch/s390/kernel/time.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 165da961f901..e37285a5101b 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -382,6 +382,7 @@ static void clock_sync_global(unsigned long delta) tod_steering_delta); tod_steering_end = now + (abs(tod_steering_delta) << 15); vdso_data->arch_data.tod_steering_end = tod_steering_end; + vdso_data->arch_data.tod_steering_delta = tod_steering_delta; /* Update LPAR offset. */ if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) -- cgit v1.2.3 From b24bacd67ffddd9192c4745500fd6f73dbfe565e Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 24 Mar 2021 20:22:42 +0100 Subject: s390/vdso: fix tod_steering_delta type The s390 specific vdso function __arch_get_hw_counter() is supposed to consider tod clock steering. If a tod clock steering event happens and the tod clock is set to a new value __arch_get_hw_counter() will not return the real tod clock value but slowly drift it from the old delta until the returned value finally matches the real tod clock value again. Unfortunately the type of tod_steering_delta unsigned while it is supposed to be signed. It depends on if tod_steering_delta is negative or positive in which direction the vdso code drifts the clock value. Worst case is now that instead of drifting the clock slowly it will jump into the opposite direction by a factor of two. Fix this by simply making tod_steering_delta signed. Fixes: 4bff8cb54502 ("s390: convert to GENERIC_VDSO") Cc: # 5.10 Signed-off-by: Heiko Carstens --- arch/s390/include/asm/vdso/data.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/s390/include/asm/vdso/data.h b/arch/s390/include/asm/vdso/data.h index 7b3cdb4a5f48..73ee89142666 100644 --- a/arch/s390/include/asm/vdso/data.h +++ b/arch/s390/include/asm/vdso/data.h @@ -6,7 +6,7 @@ #include struct arch_vdso_data { - __u64 tod_steering_delta; + __s64 tod_steering_delta; __u64 tod_steering_end; }; -- cgit v1.2.3 From 5b43bd184530af6b868d8273b0a743a138d37ee8 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 24 Mar 2021 20:23:55 +0100 Subject: s390/vdso: fix initializing and updating of vdso_data Li Wang reported that clock_gettime(CLOCK_MONOTONIC_RAW, ...) returns incorrect values when time is provided via vdso instead of system call: vdso_ts_nsec = 4484351380985507, vdso_ts.tv_sec = 4484351, vdso_ts.tv_nsec = 380985507 sys_ts_nsec = 1446923235377, sys_ts.tv_sec = 1446, sys_ts.tv_nsec = 923235377 Within the s390 specific vdso function __arch_get_hw_counter() reads tod clock steering values from the arch_data member of the passed in vdso_data structure. Problem is that only for the CS_HRES_COARSE vdso_data arch_data is initialized and gets updated. The CS_RAW specific vdso_data does not contain any valid tod_clock_steering information, which explains the different values. Fix this by initializing and updating all vdso_datas. Reported-by: Li Wang Tested-by: Li Wang Fixes: 1ba2d6c0fd4e ("s390/vdso: simplify __arch_get_hw_counter()") Link: https://lore.kernel.org/linux-s390/YFnxr1ZlMIOIqjfq@osiris Signed-off-by: Heiko Carstens --- arch/s390/kernel/time.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index e37285a5101b..326cb8f75f58 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -80,10 +80,12 @@ void __init time_early_init(void) { struct ptff_qto qto; struct ptff_qui qui; + int cs; /* Initialize TOD steering parameters */ tod_steering_end = tod_clock_base.tod; - vdso_data->arch_data.tod_steering_end = tod_steering_end; + for (cs = 0; cs < CS_BASES; cs++) + vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; if (!test_facility(28)) return; @@ -366,6 +368,7 @@ static void clock_sync_global(unsigned long delta) { unsigned long now, adj; struct ptff_qto qto; + int cs; /* Fixup the monotonic sched clock. */ tod_clock_base.eitod += delta; @@ -381,8 +384,10 @@ static void clock_sync_global(unsigned long delta) panic("TOD clock sync offset %li is too large to drift\n", tod_steering_delta); tod_steering_end = now + (abs(tod_steering_delta) << 15); - vdso_data->arch_data.tod_steering_end = tod_steering_end; - vdso_data->arch_data.tod_steering_delta = tod_steering_delta; + for (cs = 0; cs < CS_BASES; cs++) { + vdso_data[cs].arch_data.tod_steering_end = tod_steering_end; + vdso_data[cs].arch_data.tod_steering_delta = tod_steering_delta; + } /* Update LPAR offset. */ if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) -- cgit v1.2.3 From 53f1d31708f6240e4615b0927df31f182e389e2f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Fri, 26 Mar 2021 12:37:55 +0530 Subject: powerpc/mm/book3s64: Use the correct storage key value when calling H_PROTECT H_PROTECT expects the flag value to include flags: AVPN, pp0, pp1, pp2, key0-key4, Noexec, CMO Option flags This patch updates hpte_updatepp() to fetch the storage key value from the linux page table and use the same in H_PROTECT hcall. native_hpte_updatepp() is not updated because the kernel doesn't clear the existing storage key value there. The kernel also doesn't use hpte_updatepp() callback for updating storage keys. This fixes the below kernel crash observed with KUAP enabled. BUG: Unable to handle kernel data access on write at 0xc009fffffc440000 Faulting instruction address: 0xc0000000000b7030 Key fault AMR: 0xfcffffffffffffff IAMR: 0xc0000077bc498100 Found HPTE: v = 0x40070adbb6fffc05 r = 0x1ffffffffff1194 Oops: Kernel access of bad area, sig: 11 [#1] LE PAGE_SIZE=64K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries ... CFAR: c000000000010100 DAR: c009fffffc440000 DSISR: 02200000 IRQMASK: 0 ... NIP memset+0x68/0x104 LR pcpu_alloc+0x54c/0xb50 Call Trace: pcpu_alloc+0x55c/0xb50 (unreliable) blk_stat_alloc_callback+0x94/0x150 blk_mq_init_allocated_queue+0x64/0x560 blk_mq_init_queue+0x54/0xb0 scsi_mq_alloc_queue+0x30/0xa0 scsi_alloc_sdev+0x1cc/0x300 scsi_probe_and_add_lun+0xb50/0x1020 __scsi_scan_target+0x17c/0x790 scsi_scan_channel+0x90/0xe0 scsi_scan_host_selected+0x148/0x1f0 do_scan_async+0x2c/0x2a0 async_run_entry_fn+0x78/0x220 process_one_work+0x264/0x540 worker_thread+0xa8/0x600 kthread+0x190/0x1a0 ret_from_kernel_thread+0x5c/0x6c With KUAP enabled the kernel uses storage key 3 for all its translations. But as shown by the debug print, in this specific case we have the hash page table entry created with key value 0. Found HPTE: v = 0x40070adbb6fffc05 r = 0x1ffffffffff1194 and DSISR indicates a key fault. This can happen due to parallel fault on the same EA by different CPUs: CPU 0 CPU 1 fault on X H_PAGE_BUSY set fault on X finish fault handling and clear H_PAGE_BUSY check for H_PAGE_BUSY continue with fault handling. This implies CPU1 will end up calling hpte_updatepp for address X and the kernel updated the hash pte entry with key 0 Fixes: d94b827e89dc ("powerpc/book3s64/kuap: Use Key 3 for kernel mapping with hash translation") Reported-by: Murilo Opsfelder Araujo Signed-off-by: Aneesh Kumar K.V Debugged-by: Michael Ellerman Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210326070755.304625-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/platforms/pseries/lpar.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 764170fdb0f7..3805519a6469 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -887,7 +887,8 @@ static long pSeries_lpar_hpte_updatepp(unsigned long slot, want_v = hpte_encode_avpn(vpn, psize, ssize); - flags = (newpp & 7) | H_AVPN; + flags = (newpp & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO)) | H_AVPN; + flags |= (newpp & HPTE_R_KEY_HI) >> 48; if (mmu_has_feature(MMU_FTR_KERNEL_RO)) /* Move pp0 into bit 8 (IBM 55) */ flags |= (newpp & HPTE_R_PP0) >> 55; -- cgit v1.2.3 From 1a1c130ab7575498eed5bcf7220037ae09cd1f8a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 23 Mar 2021 20:26:52 +0100 Subject: ACPI: tables: x86: Reserve memory occupied by ACPI tables The following problem has been reported by George Kennedy: Since commit 7fef431be9c9 ("mm/page_alloc: place pages to tail in __free_pages_core()") the following use after free occurs intermittently when ACPI tables are accessed. BUG: KASAN: use-after-free in ibft_init+0x134/0xc49 Read of size 4 at addr ffff8880be453004 by task swapper/0/1 CPU: 3 PID: 1 Comm: swapper/0 Not tainted 5.12.0-rc1-7a7fd0d #1 Call Trace: dump_stack+0xf6/0x158 print_address_description.constprop.9+0x41/0x60 kasan_report.cold.14+0x7b/0xd4 __asan_report_load_n_noabort+0xf/0x20 ibft_init+0x134/0xc49 do_one_initcall+0xc4/0x3e0 kernel_init_freeable+0x5af/0x66b kernel_init+0x16/0x1d0 ret_from_fork+0x22/0x30 ACPI tables mapped via kmap() do not have their mapped pages reserved and the pages can be "stolen" by the buddy allocator. Apparently, on the affected system, the ACPI table in question is not located in "reserved" memory, like ACPI NVS or ACPI Data, that will not be used by the buddy allocator, so the memory occupied by that table has to be explicitly reserved to prevent the buddy allocator from using it. In order to address this problem, rearrange the initialization of the ACPI tables on x86 to locate the initial tables earlier and reserve the memory occupied by them. The other architectures using ACPI should not be affected by this change. Link: https://lore.kernel.org/linux-acpi/1614802160-29362-1-git-send-email-george.kennedy@oracle.com/ Reported-by: George Kennedy Tested-by: George Kennedy Signed-off-by: Rafael J. Wysocki Reviewed-by: Mike Rapoport Cc: 5.10+ # 5.10+ --- arch/x86/kernel/acpi/boot.c | 25 ++++++++++++------------- arch/x86/kernel/setup.c | 8 +++----- drivers/acpi/tables.c | 42 +++++++++++++++++++++++++++++++++++++++--- include/linux/acpi.h | 9 ++++++++- 4 files changed, 62 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 7bdc0239a943..14cd3186dc77 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1554,10 +1554,18 @@ void __init acpi_boot_table_init(void) /* * Initialize the ACPI boot-time table parser. */ - if (acpi_table_init()) { + if (acpi_locate_initial_tables()) disable_acpi(); - return; - } + else + acpi_reserve_initial_tables(); +} + +int __init early_acpi_boot_init(void) +{ + if (acpi_disabled) + return 1; + + acpi_table_init_complete(); acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1570,18 +1578,9 @@ void __init acpi_boot_table_init(void) } else { printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); disable_acpi(); - return; + return 1; } } -} - -int __init early_acpi_boot_init(void) -{ - /* - * If acpi_disabled, bail out - */ - if (acpi_disabled) - return 1; /* * Process the Multiple APIC Description Table (MADT), if present diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d883176ef2ce..5ecd69a48393 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1045,6 +1045,9 @@ void __init setup_arch(char **cmdline_p) cleanup_highmap(); + /* Look for ACPI tables and reserve memory occupied by them. */ + acpi_boot_table_init(); + memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); @@ -1136,11 +1139,6 @@ void __init setup_arch(char **cmdline_p) early_platform_quirks(); - /* - * Parse the ACPI tables for possible boot-time SMP configuration. - */ - acpi_boot_table_init(); - early_acpi_boot_init(); initmem_init(); diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index e48690a006a4..9d581045acff 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -780,7 +780,7 @@ acpi_status acpi_os_table_override(struct acpi_table_header *existing_table, } /* - * acpi_table_init() + * acpi_locate_initial_tables() * * find RSDP, find and checksum SDT/XSDT. * checksum all tables, print SDT/XSDT @@ -788,7 +788,7 @@ acpi_status acpi_os_table_override(struct acpi_table_header *existing_table, * result: sdt_entry[] is initialized */ -int __init acpi_table_init(void) +int __init acpi_locate_initial_tables(void) { acpi_status status; @@ -803,9 +803,45 @@ int __init acpi_table_init(void) status = acpi_initialize_tables(initial_tables, ACPI_MAX_TABLES, 0); if (ACPI_FAILURE(status)) return -EINVAL; - acpi_table_initrd_scan(); + return 0; +} + +void __init acpi_reserve_initial_tables(void) +{ + int i; + + for (i = 0; i < ACPI_MAX_TABLES; i++) { + struct acpi_table_desc *table_desc = &initial_tables[i]; + u64 start = table_desc->address; + u64 size = table_desc->length; + + if (!start || !size) + break; + + pr_info("Reserving %4s table memory at [mem 0x%llx-0x%llx]\n", + table_desc->signature.ascii, start, start + size - 1); + + memblock_reserve(start, size); + } +} + +void __init acpi_table_init_complete(void) +{ + acpi_table_initrd_scan(); check_multiple_madt(); +} + +int __init acpi_table_init(void) +{ + int ret; + + ret = acpi_locate_initial_tables(); + if (ret) + return ret; + + acpi_table_init_complete(); + return 0; } diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fcdaab723916..3bdcfc4401b7 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -222,10 +222,14 @@ void __iomem *__acpi_map_table(unsigned long phys, unsigned long size); void __acpi_unmap_table(void __iomem *map, unsigned long size); int early_acpi_boot_init(void); int acpi_boot_init (void); +void acpi_boot_table_prepare (void); void acpi_boot_table_init (void); int acpi_mps_check (void); int acpi_numa_init (void); +int acpi_locate_initial_tables (void); +void acpi_reserve_initial_tables (void); +void acpi_table_init_complete (void); int acpi_table_init (void); int acpi_table_parse(char *id, acpi_tbl_table_handler handler); int __init acpi_table_parse_entries(char *id, unsigned long table_size, @@ -814,9 +818,12 @@ static inline int acpi_boot_init(void) return 0; } +static inline void acpi_boot_table_prepare(void) +{ +} + static inline void acpi_boot_table_init(void) { - return; } static inline int acpi_mps_check(void) -- cgit v1.2.3 From 7b9acbb6aad4f54623dcd4bd4b1a60fe0c727b09 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Sun, 7 Feb 2021 04:57:58 -0800 Subject: xtensa: fix uaccess-related livelock in do_page_fault If a uaccess (e.g. get_user()) triggers a fault and there's a fault signal pending, the handler will return to the uaccess without having performed a uaccess fault fixup, and so the CPU will immediately execute the uaccess instruction again, whereupon it will livelock bouncing between that instruction and the fault handler. https://lore.kernel.org/lkml/20210121123140.GD48431@C02TD0UTHF1T.local/ Cc: stable@vger.kernel.org Reported-by: Mark Rutland Signed-off-by: Max Filippov --- arch/xtensa/mm/fault.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/xtensa/mm/fault.c b/arch/xtensa/mm/fault.c index 7666408ce12a..95a74890c7e9 100644 --- a/arch/xtensa/mm/fault.c +++ b/arch/xtensa/mm/fault.c @@ -112,8 +112,11 @@ good_area: */ fault = handle_mm_fault(vma, address, flags, regs); - if (fault_signal_pending(fault, regs)) + if (fault_signal_pending(fault, regs)) { + if (!user_mode(regs)) + goto bad_page_fault; return; + } if (unlikely(fault & VM_FAULT_ERROR)) { if (fault & VM_FAULT_OOM) -- cgit v1.2.3 From 9ae31e2ab293bf4d9c42e7079b156072f8a7f8ca Mon Sep 17 00:00:00 2001 From: Mauri Sandberg Date: Mon, 29 Mar 2021 15:31:36 +0300 Subject: MIPS: kernel: setup.c: fix compilation error With ath79_defconfig enabling CONFIG_MIPS_ELF_APPENDED_DTB gives a compilation error. This patch fixes it. Build log: ... CC kernel/locking/percpu-rwsem.o ../arch/mips/kernel/setup.c:46:39: error: conflicting types for '__appended_dtb' const char __section(".appended_dtb") __appended_dtb[0x100000]; ^~~~~~~~~~~~~~ In file included from ../arch/mips/kernel/setup.c:34: ../arch/mips/include/asm/bootinfo.h:118:13: note: previous declaration of '__appended_dtb' was here extern char __appended_dtb[]; ^~~~~~~~~~~~~~ CC fs/attr.o make[4]: *** [../scripts/Makefile.build:271: arch/mips/kernel/setup.o] Error 1 ... Root cause seems to be: Fixes: b83ba0b9df56 ("MIPS: of: Introduce helper function to get DTB") Signed-off-by: Mauri Sandberg Reviewed-by: Thomas Bogendoerfer Cc: trivial@kernel.org Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 279be0153f8b..23a140327a0b 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -43,7 +43,7 @@ #include #ifdef CONFIG_MIPS_ELF_APPENDED_DTB -const char __section(".appended_dtb") __appended_dtb[0x100000]; +char __section(".appended_dtb") __appended_dtb[0x100000]; #endif /* CONFIG_MIPS_ELF_APPENDED_DTB */ struct cpuinfo_mips cpu_data[NR_CPUS] __read_mostly; -- cgit v1.2.3 From d632826f26f2361e6ef18881611928036fac30e6 Mon Sep 17 00:00:00 2001 From: Haiwei Li Date: Sat, 13 Mar 2021 13:10:32 +0800 Subject: KVM: clean up the unused argument kvm_msr_ignored_check function never uses vcpu argument. Clean up the function and invokers. Signed-off-by: Haiwei Li Message-Id: <20210313051032.4171-1-lihaiwei.kernel@gmail.com> Reviewed-by: Keqian Zhu Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fe806e894212..f7d12fca397b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -271,8 +271,7 @@ static struct kmem_cache *x86_emulator_cache; * When called, it means the previous get/set msr reached an invalid msr. * Return true if we want to ignore/silent this failed msr access. */ -static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr, - u64 data, bool write) +static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write) { const char *op = write ? "wrmsr" : "rdmsr"; @@ -1445,7 +1444,7 @@ static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) if (r == KVM_MSR_RET_INVALID) { /* Unconditionally clear the output for simplicity */ *data = 0; - if (kvm_msr_ignored_check(vcpu, index, 0, false)) + if (kvm_msr_ignored_check(index, 0, false)) r = 0; } @@ -1620,7 +1619,7 @@ static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, int ret = __kvm_set_msr(vcpu, index, data, host_initiated); if (ret == KVM_MSR_RET_INVALID) - if (kvm_msr_ignored_check(vcpu, index, data, true)) + if (kvm_msr_ignored_check(index, data, true)) ret = 0; return ret; @@ -1658,7 +1657,7 @@ static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, if (ret == KVM_MSR_RET_INVALID) { /* Unconditionally clear *data for simplicity */ *data = 0; - if (kvm_msr_ignored_check(vcpu, index, 0, false)) + if (kvm_msr_ignored_check(index, 0, false)) ret = 0; } -- cgit v1.2.3 From ecaf088f53fcc893cd00c846f53042a536b9630d Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Fri, 26 Mar 2021 00:03:34 -0700 Subject: KVM: x86: remove unused declaration of kvm_write_tsc() kvm_write_tsc() was renamed and made static since commit 0c899c25d754 ("KVM: x86: do not attempt TSC synchronization on guest writes"). Remove its unused declaration. Signed-off-by: Dongli Zhang Message-Id: <20210326070334.12310-1-dongli.zhang@oracle.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 39eb04887141..9035e34aa156 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -250,7 +250,6 @@ static inline bool kvm_vcpu_latch_init(struct kvm_vcpu *vcpu) void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs); void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); -void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 get_kvmclock_ns(struct kvm *kvm); int kvm_read_guest_virt(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 1973cadd4cca08eaeca944f60598f04ab0d80682 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Tue, 23 Mar 2021 09:45:15 +0100 Subject: KVM: x86/vPMU: Forbid writing to MSR_F15H_PERF MSRs when guest doesn't have X86_FEATURE_PERFCTR_CORE MSR_F15H_PERF_CTL0-5, MSR_F15H_PERF_CTR0-5 MSRs are only available when X86_FEATURE_PERFCTR_CORE CPUID bit was exposed to the guest. KVM, however, allows these MSRs unconditionally because kvm_pmu_is_valid_msr() -> amd_msr_idx_to_pmc() check always passes and because kvm_pmu_set_msr() -> amd_pmu_set_msr() doesn't fail. In case of a counter (CTRn), no big harm is done as we only increase internal PMC's value but in case of an eventsel (CTLn), we go deep into perf internals with a non-existing counter. Note, kvm_get_msr_common() just returns '0' when these MSRs don't exist and this also seems to contradict architectural behavior which is #GP (I did check one old Opteron host) but changing this status quo is a bit scarier. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210323084515.1346540-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/pmu.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index 035da07500e8..fdf587f19c5f 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -98,6 +98,8 @@ static enum index msr_to_index(u32 msr) static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, enum pmu_type type) { + struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu); + switch (msr) { case MSR_F15H_PERF_CTL0: case MSR_F15H_PERF_CTL1: @@ -105,6 +107,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, case MSR_F15H_PERF_CTL3: case MSR_F15H_PERF_CTL4: case MSR_F15H_PERF_CTL5: + if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + return NULL; + fallthrough; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: if (type != PMU_TYPE_EVNTSEL) return NULL; @@ -115,6 +120,9 @@ static inline struct kvm_pmc *get_gp_pmc_amd(struct kvm_pmu *pmu, u32 msr, case MSR_F15H_PERF_CTR3: case MSR_F15H_PERF_CTR4: case MSR_F15H_PERF_CTR5: + if (!guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) + return NULL; + fallthrough; case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: if (type != PMU_TYPE_COUNTER) return NULL; -- cgit v1.2.3 From 6fb3084ab5d9331cfadf07c59cf4a0bd4059bf4a Mon Sep 17 00:00:00 2001 From: Siddharth Chandrasekaran Date: Wed, 24 Mar 2021 13:43:47 +0100 Subject: KVM: make: Fix out-of-source module builds Building kvm module out-of-source with, make -C $SRC O=$BIN M=arch/x86/kvm fails to find "irq.h" as the include dir passed to cflags-y does not prefix the source dir. Fix this by prefixing $(srctree) to the include dir path. Signed-off-by: Siddharth Chandrasekaran Message-Id: <20210324124347.18336-1-sidcha@amazon.de> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 1b4766fe1de2..eafc4d601f25 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 -ccflags-y += -Iarch/x86/kvm +ccflags-y += -I $(srctree)/arch/x86/kvm ccflags-$(CONFIG_KVM_WERROR) += -Werror ifeq ($(CONFIG_FRAME_POINTER),y) -- cgit v1.2.3 From a835429cda91621fca915d80672a157b47738afb Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 13:01:17 -0700 Subject: KVM: x86/mmu: Ensure TLBs are flushed when yielding during GFN range zap When flushing a range of GFNs across multiple roots, ensure any pending flush from a previous root is honored before yielding while walking the tables of the current root. Note, kvm_tdp_mmu_zap_gfn_range() now intentionally overwrites its local "flush" with the result to avoid redundant flushes. zap_gfn_range() preserves and return the incoming "flush", unless of course the flush was performed prior to yielding and no new flush was triggered. Fixes: 1af4a96025b3 ("KVM: x86/mmu: Yield in TDU MMU iter even if no SPTES changed") Cc: stable@vger.kernel.org Reviewed-by: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210325200119.1359384-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/tdp_mmu.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d78915019b08..f80648ac1d15 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -86,7 +86,7 @@ static inline struct kvm_mmu_page *tdp_mmu_next_root(struct kvm *kvm, list_for_each_entry(_root, &_kvm->arch.tdp_mmu_roots, link) static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield); + gfn_t start, gfn_t end, bool can_yield, bool flush); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) { @@ -99,7 +99,7 @@ void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root) list_del(&root->link); - zap_gfn_range(kvm, root, 0, max_gfn, false); + zap_gfn_range(kvm, root, 0, max_gfn, false, false); free_page((unsigned long)root->spt); kmem_cache_free(mmu_page_header_cache, root); @@ -678,20 +678,21 @@ static inline bool tdp_mmu_iter_cond_resched(struct kvm *kvm, * scheduler needs the CPU or there is contention on the MMU lock. If this * function cannot yield, it will not release the MMU lock or reschedule and * the caller must ensure it does not supply too large a GFN range, or the - * operation can cause a soft lockup. + * operation can cause a soft lockup. Note, in some use cases a flush may be + * required by prior actions. Ensure the pending flush is performed prior to + * yielding. */ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, - gfn_t start, gfn_t end, bool can_yield) + gfn_t start, gfn_t end, bool can_yield, bool flush) { struct tdp_iter iter; - bool flush_needed = false; rcu_read_lock(); tdp_root_for_each_pte(iter, root, start, end) { if (can_yield && - tdp_mmu_iter_cond_resched(kvm, &iter, flush_needed)) { - flush_needed = false; + tdp_mmu_iter_cond_resched(kvm, &iter, flush)) { + flush = false; continue; } @@ -709,11 +710,11 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, continue; tdp_mmu_set_spte(kvm, &iter, 0); - flush_needed = true; + flush = true; } rcu_read_unlock(); - return flush_needed; + return flush; } /* @@ -728,7 +729,7 @@ bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) bool flush = false; for_each_tdp_mmu_root_yield_safe(kvm, root) - flush |= zap_gfn_range(kvm, root, start, end, true); + flush = zap_gfn_range(kvm, root, start, end, true, flush); return flush; } @@ -940,7 +941,7 @@ static int zap_gfn_range_hva_wrapper(struct kvm *kvm, struct kvm_mmu_page *root, gfn_t start, gfn_t end, unsigned long unused) { - return zap_gfn_range(kvm, root, start, end, false); + return zap_gfn_range(kvm, root, start, end, false, false); } int kvm_tdp_mmu_zap_hva_range(struct kvm *kvm, unsigned long start, -- cgit v1.2.3 From 048f49809c526348775425420fb5b8e84fd9a133 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 13:01:18 -0700 Subject: KVM: x86/mmu: Ensure TLBs are flushed for TDP MMU during NX zapping Honor the "flush needed" return from kvm_tdp_mmu_zap_gfn_range(), which does the flush itself if and only if it yields (which it will never do in this particular scenario), and otherwise expects the caller to do the flush. If pages are zapped from the TDP MMU but not the legacy MMU, then no flush will occur. Fixes: 29cf0f5007a2 ("kvm: x86/mmu: NX largepage recovery for TDP MMU") Cc: stable@vger.kernel.org Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210325200119.1359384-3-seanjc@google.com> Reviewed-by: Ben Gardon Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index d75524bc8423..2705f9fa22b9 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5884,6 +5884,8 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) struct kvm_mmu_page *sp; unsigned int ratio; LIST_HEAD(invalid_list); + bool flush = false; + gfn_t gfn_end; ulong to_zap; rcu_idx = srcu_read_lock(&kvm->srcu); @@ -5905,19 +5907,20 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); if (is_tdp_mmu_page(sp)) { - kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, - sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level)); + gfn_end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); + flush = kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, gfn_end); } else { kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->lpage_disallowed); } if (need_resched() || rwlock_needbreak(&kvm->mmu_lock)) { - kvm_mmu_commit_zap_page(kvm, &invalid_list); + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); cond_resched_rwlock_write(&kvm->mmu_lock); + flush = false; } } - kvm_mmu_commit_zap_page(kvm, &invalid_list); + kvm_mmu_remote_flush_or_zap(kvm, &invalid_list, flush); write_unlock(&kvm->mmu_lock); srcu_read_unlock(&kvm->srcu, rcu_idx); -- cgit v1.2.3 From 33a3164161fc86b9cc238f7f2aa2ccb1d5559b1c Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 25 Mar 2021 13:01:19 -0700 Subject: KVM: x86/mmu: Don't allow TDP MMU to yield when recovering NX pages Prevent the TDP MMU from yielding when zapping a gfn range during NX page recovery. If a flush is pending from a previous invocation of the zapping helper, either in the TDP MMU or the legacy MMU, but the TDP MMU has not accumulated a flush for the current invocation, then yielding will release mmu_lock with stale TLB entries. That being said, this isn't technically a bug fix in the current code, as the TDP MMU will never yield in this case. tdp_mmu_iter_cond_resched() will yield if and only if it has made forward progress, as defined by the current gfn vs. the last yielded (or starting) gfn. Because zapping a single shadow page is guaranteed to (a) find that page and (b) step sideways at the level of the shadow page, the TDP iter will break its loop before getting a chance to yield. But that is all very, very subtle, and will break at the slightest sneeze, e.g. zapping while holding mmu_lock for read would break as the TDP MMU wouldn't be guaranteed to see the present shadow page, and thus could step sideways at a lower level. Cc: Ben Gardon Signed-off-by: Sean Christopherson Message-Id: <20210325200119.1359384-4-seanjc@google.com> [Add lockdep assertion. - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 4 +--- arch/x86/kvm/mmu/tdp_mmu.c | 5 +++-- arch/x86/kvm/mmu/tdp_mmu.h | 24 +++++++++++++++++++++++- 3 files changed, 27 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 2705f9fa22b9..486aa94ecf1d 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5885,7 +5885,6 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) unsigned int ratio; LIST_HEAD(invalid_list); bool flush = false; - gfn_t gfn_end; ulong to_zap; rcu_idx = srcu_read_lock(&kvm->srcu); @@ -5907,8 +5906,7 @@ static void kvm_recover_nx_lpages(struct kvm *kvm) lpage_disallowed_link); WARN_ON_ONCE(!sp->lpage_disallowed); if (is_tdp_mmu_page(sp)) { - gfn_end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); - flush = kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, gfn_end); + flush = kvm_tdp_mmu_zap_sp(kvm, sp); } else { kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); WARN_ON_ONCE(sp->lpage_disallowed); diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index f80648ac1d15..81352cd428fa 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -723,13 +723,14 @@ static bool zap_gfn_range(struct kvm *kvm, struct kvm_mmu_page *root, * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. */ -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end) +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, + bool can_yield) { struct kvm_mmu_page *root; bool flush = false; for_each_tdp_mmu_root_yield_safe(kvm, root) - flush = zap_gfn_range(kvm, root, start, end, true, flush); + flush = zap_gfn_range(kvm, root, start, end, can_yield, flush); return flush; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 3b761c111bff..31096ece9b14 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -8,7 +8,29 @@ hpa_t kvm_tdp_mmu_get_vcpu_root_hpa(struct kvm_vcpu *vcpu); void kvm_tdp_mmu_free_root(struct kvm *kvm, struct kvm_mmu_page *root); -bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end); +bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, gfn_t end, + bool can_yield); +static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, gfn_t start, + gfn_t end) +{ + return __kvm_tdp_mmu_zap_gfn_range(kvm, start, end, true); +} +static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + gfn_t end = sp->gfn + KVM_PAGES_PER_HPAGE(sp->role.level); + + /* + * Don't allow yielding, as the caller may have a flush pending. Note, + * if mmu_lock is held for write, zapping will never yield in this case, + * but explicitly disallow it for safety. The TDP MMU does not yield + * until it has made forward progress (steps sideways), and when zapping + * a single shadow page that it's guaranteed to see (thus the mmu_lock + * requirement), its "step sideways" will always step beyond the bounds + * of the shadow page's gfn range and stop iterating before yielding. + */ + lockdep_assert_held_write(&kvm->mmu_lock); + return __kvm_tdp_mmu_zap_gfn_range(kvm, sp->gfn, end, false); +} void kvm_tdp_mmu_zap_all(struct kvm *kvm); int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, -- cgit v1.2.3 From a58d9166a756a0f4a6618e4f593232593d6df134 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 Mar 2021 06:24:43 -0400 Subject: KVM: SVM: load control fields from VMCB12 before checking them Avoid races between check and use of the nested VMCB controls. This for example ensures that the VMRUN intercept is always reflected to the nested hypervisor, instead of being processed by the host. Without this patch, it is possible to end up with svm->nested.hsave pointing to the MSR permission bitmap for nested guests. This bug is CVE-2021-29657. Reported-by: Felix Wilhelm Cc: stable@vger.kernel.org Fixes: 2fcf4876ada ("KVM: nSVM: implement on demand allocation of the nested state") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 9e4c226dbf7d..061a00f91af5 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -225,7 +225,7 @@ static bool nested_vmcb_check_controls(struct vmcb_control_area *control) return true; } -static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) +static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { bool vmcb12_lma; @@ -257,7 +257,7 @@ static bool nested_vmcb_checks(struct vcpu_svm *svm, struct vmcb *vmcb12) if (kvm_valid_cr4(&svm->vcpu, vmcb12->save.cr4)) return false; - return nested_vmcb_check_controls(&vmcb12->control); + return true; } static void load_nested_vmcb_control(struct vcpu_svm *svm, @@ -440,7 +440,6 @@ int enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb12_gpa, int ret; svm->nested.vmcb12_gpa = vmcb12_gpa; - load_nested_vmcb_control(svm, &vmcb12->control); nested_prepare_vmcb_save(svm, vmcb12); nested_prepare_vmcb_control(svm); @@ -484,7 +483,10 @@ int nested_svm_vmrun(struct vcpu_svm *svm) if (WARN_ON_ONCE(!svm->nested.initialized)) return -EINVAL; - if (!nested_vmcb_checks(svm, vmcb12)) { + load_nested_vmcb_control(svm, &vmcb12->control); + + if (!nested_vmcb_check_save(svm, vmcb12) || + !nested_vmcb_check_controls(&svm->nested.ctl)) { vmcb12->control.exit_code = SVM_EXIT_ERR; vmcb12->control.exit_code_hi = 0; vmcb12->control.exit_info_1 = 0; -- cgit v1.2.3 From 3c346c0c60ab06a021d1c0884a0ef494bc4ee3a7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Wed, 31 Mar 2021 06:28:01 -0400 Subject: KVM: SVM: ensure that EFER.SVME is set when running nested guest or on nested vmexit Fixing nested_vmcb_check_save to avoid all TOC/TOU races is a bit harder in released kernels, so do the bare minimum by avoiding that EFER.SVME is cleared. This is problematic because svm_set_efer frees the data structures for nested virtualization if EFER.SVME is cleared. Also check that EFER.SVME remains set after a nested vmexit; clearing it could happen if the bit is zero in the save area that is passed to KVM_SET_NESTED_STATE (the save area of the nested state corresponds to the nested hypervisor's state and is restored on the next nested vmexit). Cc: stable@vger.kernel.org Fixes: 2fcf4876ada ("KVM: nSVM: implement on demand allocation of the nested state") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/nested.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index 061a00f91af5..c9e7b86350d6 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -229,6 +229,13 @@ static bool nested_vmcb_check_save(struct vcpu_svm *svm, struct vmcb *vmcb12) { bool vmcb12_lma; + /* + * FIXME: these should be done after copying the fields, + * to avoid TOC/TOU races. For these save area checks + * the possible damage is limited since kvm_set_cr0 and + * kvm_set_cr4 handle failure; EFER_SVME is an exception + * so it is force-set later in nested_prepare_vmcb_save. + */ if ((vmcb12->save.efer & EFER_SVME) == 0) return false; @@ -382,7 +389,14 @@ static void nested_prepare_vmcb_save(struct vcpu_svm *svm, struct vmcb *vmcb12) svm->vmcb->save.gdtr = vmcb12->save.gdtr; svm->vmcb->save.idtr = vmcb12->save.idtr; kvm_set_rflags(&svm->vcpu, vmcb12->save.rflags); - svm_set_efer(&svm->vcpu, vmcb12->save.efer); + + /* + * Force-set EFER_SVME even though it is checked earlier on the + * VMCB12, because the guest can flip the bit between the check + * and now. Clearing EFER_SVME would call svm_free_nested. + */ + svm_set_efer(&svm->vcpu, vmcb12->save.efer | EFER_SVME); + svm_set_cr0(&svm->vcpu, vmcb12->save.cr0); svm_set_cr4(&svm->vcpu, vmcb12->save.cr4); svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = vmcb12->save.cr2; @@ -1188,6 +1202,8 @@ static int svm_set_nested_state(struct kvm_vcpu *vcpu, */ if (!(save->cr0 & X86_CR0_PG)) goto out_free; + if (!(save->efer & EFER_SVME)) + goto out_free; /* * All checks done, we can enter guest mode. L1 control fields -- cgit v1.2.3 From c2c647f91aec192f45f0849c225f134183cf4e90 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 25 Mar 2021 14:05:11 -0400 Subject: KVM: x86: reduce pvclock_gtod_sync_lock critical sections There is no need to include changes to vcpu->requests into the pvclock_gtod_sync_lock critical section. The changes to the shared data structures (in pvclock_update_vm_gtod_copy) already occur under the lock. Cc: David Woodhouse Cc: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f7d12fca397b..2301623d3c53 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2561,10 +2561,12 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) kvm_hv_invalidate_tsc_page(kvm); - spin_lock(&ka->pvclock_gtod_sync_lock); kvm_make_mclock_inprogress_request(kvm); + /* no guest entries from this point */ + spin_lock(&ka->pvclock_gtod_sync_lock); pvclock_update_vm_gtod_copy(kvm); + spin_unlock(&ka->pvclock_gtod_sync_lock); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -2572,8 +2574,6 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); - - spin_unlock(&ka->pvclock_gtod_sync_lock); #endif } @@ -7739,16 +7739,14 @@ static void kvm_hyperv_tsc_notifier(void) struct kvm_arch *ka = &kvm->arch; spin_lock(&ka->pvclock_gtod_sync_lock); - pvclock_update_vm_gtod_copy(kvm); + spin_unlock(&ka->pvclock_gtod_sync_lock); kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); - - spin_unlock(&ka->pvclock_gtod_sync_lock); } mutex_unlock(&kvm_lock); } -- cgit v1.2.3 From a83829f56c7ce17d5d05370820e185d9a23d3090 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 25 Mar 2021 14:11:14 -0400 Subject: KVM: x86: disable interrupts while pvclock_gtod_sync_lock is taken pvclock_gtod_sync_lock can be taken with interrupts disabled if the preempt notifier calls get_kvmclock_ns to update the Xen runstate information: spin_lock include/linux/spinlock.h:354 [inline] get_kvmclock_ns+0x25/0x390 arch/x86/kvm/x86.c:2587 kvm_xen_update_runstate+0x3d/0x2c0 arch/x86/kvm/xen.c:69 kvm_xen_update_runstate_guest+0x74/0x320 arch/x86/kvm/xen.c:100 kvm_xen_runstate_set_preempted arch/x86/kvm/xen.h:96 [inline] kvm_arch_vcpu_put+0x2d8/0x5a0 arch/x86/kvm/x86.c:4062 So change the users of the spinlock to spin_lock_irqsave and spin_unlock_irqrestore. Reported-by: syzbot+b282b65c2c68492df769@syzkaller.appspotmail.com Fixes: 30b5c851af79 ("KVM: x86/xen: Add support for vCPU runstate information") Cc: David Woodhouse Cc: Marcelo Tosatti Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2301623d3c53..e1960eed921f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2328,7 +2328,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) kvm_vcpu_write_tsc_offset(vcpu, offset); raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); - spin_lock(&kvm->arch.pvclock_gtod_sync_lock); + spin_lock_irqsave(&kvm->arch.pvclock_gtod_sync_lock, flags); if (!matched) { kvm->arch.nr_vcpus_matched_tsc = 0; } else if (!already_matched) { @@ -2336,7 +2336,7 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data) } kvm_track_tsc_matching(vcpu); - spin_unlock(&kvm->arch.pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&kvm->arch.pvclock_gtod_sync_lock, flags); } static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, @@ -2558,15 +2558,16 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) int i; struct kvm_vcpu *vcpu; struct kvm_arch *ka = &kvm->arch; + unsigned long flags; kvm_hv_invalidate_tsc_page(kvm); kvm_make_mclock_inprogress_request(kvm); /* no guest entries from this point */ - spin_lock(&ka->pvclock_gtod_sync_lock); + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(i, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -2581,17 +2582,18 @@ u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; + unsigned long flags; u64 ret; - spin_lock(&ka->pvclock_gtod_sync_lock); + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); if (!ka->use_master_clock) { - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); return get_kvmclock_base_ns() + ka->kvmclock_offset; } hv_clock.tsc_timestamp = ka->master_cycle_now; hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* both __this_cpu_read() and rdtsc() should be on the same cpu */ get_cpu(); @@ -2685,13 +2687,13 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * If the host uses TSC clock, then passthrough TSC as stable * to the guest. */ - spin_lock(&ka->pvclock_gtod_sync_lock); + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); use_master_clock = ka->use_master_clock; if (use_master_clock) { host_tsc = ka->master_cycle_now; kernel_ns = ka->master_kernel_ns; } - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); @@ -7723,6 +7725,7 @@ static void kvm_hyperv_tsc_notifier(void) struct kvm *kvm; struct kvm_vcpu *vcpu; int cpu; + unsigned long flags; mutex_lock(&kvm_lock); list_for_each_entry(kvm, &vm_list, vm_list) @@ -7738,9 +7741,9 @@ static void kvm_hyperv_tsc_notifier(void) list_for_each_entry(kvm, &vm_list, vm_list) { struct kvm_arch *ka = &kvm->arch; - spin_lock(&ka->pvclock_gtod_sync_lock); + spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags); pvclock_update_vm_gtod_copy(kvm); - spin_unlock(&ka->pvclock_gtod_sync_lock); + spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags); kvm_for_each_vcpu(cpu, vcpu, kvm) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); -- cgit v1.2.3 From 77fcbe823f002ad18426545351fa2fb94f8d5e61 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 31 Mar 2021 14:41:29 +0200 Subject: KVM: x86: Prevent 'hv_clock->system_time' from going negative in kvm_guest_time_update() When guest time is reset with KVM_SET_CLOCK(0), it is possible for 'hv_clock->system_time' to become a small negative number. This happens because in KVM_SET_CLOCK handling we set 'kvm->arch.kvmclock_offset' based on get_kvmclock_ns(kvm) but when KVM_REQ_CLOCK_UPDATE is handled, kvm_guest_time_update() does (masterclock in use case): hv_clock.system_time = ka->master_kernel_ns + v->kvm->arch.kvmclock_offset; And 'master_kernel_ns' represents the last time when masterclock got updated, it can precede KVM_SET_CLOCK() call. Normally, this is not a problem, the difference is very small, e.g. I'm observing hv_clock.system_time = -70 ns. The issue comes from the fact that 'hv_clock.system_time' is stored as unsigned and 'system_time / 100' in compute_tsc_page_parameters() becomes a very big number. Use 'master_kernel_ns' instead of get_kvmclock_ns() when masterclock is in use and get_kvmclock_base_ns() when it's not to prevent 'system_time' from going negative. Signed-off-by: Vitaly Kuznetsov Message-Id: <20210331124130.337992-2-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e1960eed921f..eca63625aee4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5727,6 +5727,7 @@ set_pit2_out: } #endif case KVM_SET_CLOCK: { + struct kvm_arch *ka = &kvm->arch; struct kvm_clock_data user_ns; u64 now_ns; @@ -5745,8 +5746,22 @@ set_pit2_out: * pvclock_update_vm_gtod_copy(). */ kvm_gen_update_masterclock(kvm); - now_ns = get_kvmclock_ns(kvm); - kvm->arch.kvmclock_offset += user_ns.clock - now_ns; + + /* + * This pairs with kvm_guest_time_update(): when masterclock is + * in use, we use master_kernel_ns + kvmclock_offset to set + * unsigned 'system_time' so if we use get_kvmclock_ns() (which + * is slightly ahead) here we risk going negative on unsigned + * 'system_time' when 'user_ns.clock' is very small. + */ + spin_lock_irq(&ka->pvclock_gtod_sync_lock); + if (kvm->arch.use_master_clock) + now_ns = ka->master_kernel_ns; + else + now_ns = get_kvmclock_base_ns(); + ka->kvmclock_offset = user_ns.clock - now_ns; + spin_unlock_irq(&ka->pvclock_gtod_sync_lock); + kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE); break; } -- cgit v1.2.3 From 8cdddd182bd7befae6af49c5fd612893f55d6ccb Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 24 Mar 2021 16:22:19 +0100 Subject: ACPI: processor: Fix CPU0 wakeup in acpi_idle_play_dead() Commit 496121c02127 ("ACPI: processor: idle: Allow probing on platforms with one ACPI C-state") broke CPU0 hotplug on certain systems, e.g. I'm observing the following on AWS Nitro (e.g r5b.xlarge but other instance types are affected as well): # echo 0 > /sys/devices/system/cpu/cpu0/online # echo 1 > /sys/devices/system/cpu/cpu0/online <10 seconds delay> -bash: echo: write error: Input/output error In fact, the above mentioned commit only revealed the problem and did not introduce it. On x86, to wakeup CPU an NMI is being used and hlt_play_dead()/mwait_play_dead() loops are prepared to handle it: /* * If NMI wants to wake up CPU0, start CPU0. */ if (wakeup_cpu0()) start_cpu0(); cpuidle_play_dead() -> acpi_idle_play_dead() (which is now being called on systems where it wasn't called before the above mentioned commit) serves the same purpose but it doesn't have a path for CPU0. What happens now on wakeup is: - NMI is sent to CPU0 - wakeup_cpu0_nmi() works as expected - we get back to while (1) loop in acpi_idle_play_dead() - safe_halt() puts CPU0 to sleep again. The straightforward/minimal fix is add the special handling for CPU0 on x86 and that's what the patch is doing. Fixes: 496121c02127 ("ACPI: processor: idle: Allow probing on platforms with one ACPI C-state") Signed-off-by: Vitaly Kuznetsov Cc: 5.10+ # 5.10+ Signed-off-by: Rafael J. Wysocki --- arch/x86/include/asm/smp.h | 1 + arch/x86/kernel/smpboot.c | 2 +- drivers/acpi/processor_idle.c | 7 +++++++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index c0538f82c9a2..57ef2094af93 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -132,6 +132,7 @@ void native_play_dead(void); void play_dead_common(void); void wbinvd_on_cpu(int cpu); int wbinvd_on_all_cpus(void); +bool wakeup_cpu0(void); void native_smp_send_reschedule(int cpu); void native_send_call_func_ipi(const struct cpumask *mask); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 02813a7f3a7c..f877150a91da 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1659,7 +1659,7 @@ void play_dead_common(void) local_irq_disable(); } -static bool wakeup_cpu0(void) +bool wakeup_cpu0(void) { if (smp_processor_id() == 0 && enable_start_cpu0) return true; diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index d93e400940a3..768a6b4d2368 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -29,6 +29,7 @@ */ #ifdef CONFIG_X86 #include +#include #endif #define _COMPONENT ACPI_PROCESSOR_COMPONENT @@ -541,6 +542,12 @@ static int acpi_idle_play_dead(struct cpuidle_device *dev, int index) wait_for_freeze(); } else return -ENODEV; + +#if defined(CONFIG_X86) && defined(CONFIG_HOTPLUG_CPU) + /* If NMI wants to wake up CPU0, start CPU0. */ + if (wakeup_cpu0()) + start_cpu0(); +#endif } /* Never reached */ -- cgit v1.2.3 From 23c1075ae83adaf14ea3f727c40368799f80bccc Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 17 Mar 2021 23:08:38 +0800 Subject: riscv: Drop const annotation for sp The const annotation should not be used for 'sp', or it will become read only and lead to bad stack output. Fixes: dec822771b01 ("riscv: stacktrace: Move register keyword to beginning of declaration") Signed-off-by: Kefeng Wang Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 3f893c9d9d85..2b3e0cb90d78 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -14,7 +14,7 @@ #include -register const unsigned long sp_in_global __asm__("sp"); +register unsigned long sp_in_global __asm__("sp"); #ifdef CONFIG_FRAME_POINTER -- cgit v1.2.3 From 285a76bb2cf51b0c74c634f2aaccdb93e1f2a359 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Mon, 29 Mar 2021 10:57:49 +0100 Subject: riscv: evaluate put_user() arg before enabling user access The header has a problem with put_user(a, ptr) if the 'a' is not a simple variable, such as a function. This can lead to the compiler producing code as so: 1: enable_user_access() 2: evaluate 'a' into register 'r' 3: put 'r' to 'ptr' 4: disable_user_acess() The issue is that 'a' is now being evaluated with the user memory protections disabled. So we try and force the evaulation by assigning 'x' to __val at the start, and hoping the compiler barriers in enable_user_access() do the job of ordering step 2 before step 1. This has shown up in a bug where 'a' sleeps and thus schedules out and loses the SR_SUM flag. This isn't sufficient to fully fix, but should reduce the window of opportunity. The first instance of this we found is in scheudle_tail() where the code does: $ less -N kernel/sched/core.c 4263 if (current->set_child_tid) 4264 put_user(task_pid_vnr(current), current->set_child_tid); Here, the task_pid_vnr(current) is called within the block that has enabled the user memory access. This can be made worse with KASAN which makes task_pid_vnr() a rather large call with plenty of opportunity to sleep. Signed-off-by: Ben Dooks Reported-by: syzbot+e74b94fe601ab9552d69@syzkaller.appspotmail.com Suggested-by: Arnd Bergman -- Changes since v1: - fixed formatting and updated the patch description with more info Changes since v2: - fixed commenting on __put_user() (schwab@linux-m68k.org) Change since v3: - fixed RFC in patch title. Should be ready to merge. Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/uaccess.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/riscv/include/asm/uaccess.h b/arch/riscv/include/asm/uaccess.h index 824b2c9da75b..f944062c9d99 100644 --- a/arch/riscv/include/asm/uaccess.h +++ b/arch/riscv/include/asm/uaccess.h @@ -306,7 +306,9 @@ do { \ * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and @x must be assignable - * to the result of dereferencing @ptr. + * to the result of dereferencing @ptr. The value of @x is copied to avoid + * re-ordering where @x is evaluated inside the block that enables user-space + * access (thus bypassing user space protection if @x is a function). * * Caller must check the pointer with access_ok() before calling this * function. @@ -316,12 +318,13 @@ do { \ #define __put_user(x, ptr) \ ({ \ __typeof__(*(ptr)) __user *__gu_ptr = (ptr); \ + __typeof__(*__gu_ptr) __val = (x); \ long __pu_err = 0; \ \ __chk_user_ptr(__gu_ptr); \ \ __enable_user_access(); \ - __put_user_nocheck(x, __gu_ptr, __pu_err); \ + __put_user_nocheck(__val, __gu_ptr, __pu_err); \ __disable_user_access(); \ \ __pu_err; \ -- cgit v1.2.3 From ac8d0b901f0033b783156ab2dc1a0e73ec42409b Mon Sep 17 00:00:00 2001 From: Zihao Yu Date: Wed, 17 Mar 2021 16:17:25 +0800 Subject: riscv,entry: fix misaligned base for excp_vect_table In RV64, the size of each entry in excp_vect_table is 8 bytes. If the base of the table is not 8-byte aligned, loading an entry in the table will raise a misaligned exception. Although such exception will be handled by opensbi/bbl, this still causes performance degradation. Signed-off-by: Zihao Yu Reviewed-by: Anup Patel Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/entry.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 744f3209c48d..76274a4a1d8e 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -447,6 +447,7 @@ ENDPROC(__switch_to) #endif .section ".rodata" + .align LGREG /* Exception vector table */ ENTRY(excp_vect_table) RISCV_PTR do_trap_insn_misaligned -- cgit v1.2.3 From 9d8c7d92015ece9a2139a259cef781a41845d2c0 Mon Sep 17 00:00:00 2001 From: Yang Li Date: Mon, 22 Mar 2021 16:38:36 +0800 Subject: riscv: remove unneeded semicolon Eliminate the following coccicheck warning: ./arch/riscv/mm/kasan_init.c:219:2-3: Unneeded semicolon Reported-by: Abaci Robot Signed-off-by: Yang Li Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 4f85c6d0ddf8..937d13ce9ab8 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -216,7 +216,7 @@ void __init kasan_init(void) break; kasan_populate(kasan_mem_to_shadow(start), kasan_mem_to_shadow(end)); - }; + } for (i = 0; i < PTRS_PER_PTE; i++) set_pte(&kasan_early_shadow_pte[i], -- cgit v1.2.3 From 1adbc2941eee8acbe3c7dc6b51cdbc5a9bf19565 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Tue, 30 Mar 2021 21:25:31 +0800 Subject: riscv: Make NUMA depend on MMU NUMA is useless when NOMMU, and it leads some build error, make it depend on MMU. Reported-by: kernel test robot Signed-off-by: Kefeng Wang Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 87d7b52f278f..0d0cf67359cb 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -314,7 +314,7 @@ endchoice # Common NUMA Features config NUMA bool "NUMA Memory Allocation and Scheduler Support" - depends on SMP + depends on SMP && MMU select GENERIC_ARCH_NUMA select OF_NUMA select ARCH_SUPPORTS_NUMA_BALANCING -- cgit v1.2.3