From 6608b45ac5ecb56f9e171252229c39580cc85f0f Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Wed, 23 Oct 2019 12:19:51 +0200 Subject: x86/speculation/taa: Add sysfs reporting for TSX Async Abort Add the sysfs reporting file for TSX Async Abort. It exposes the vulnerability and the mitigation state similar to the existing files for the other hardware vulnerabilities. Sysfs file path is: /sys/devices/system/cpu/vulnerabilities/tsx_async_abort Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Tested-by: Neelima Krishnan Reviewed-by: Mark Gross Reviewed-by: Tony Luck Reviewed-by: Greg Kroah-Hartman Reviewed-by: Josh Poimboeuf --- include/linux/cpu.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d0633ebdaa9c..f35369f79771 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -59,6 +59,9 @@ extern ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_tsx_async_abort(struct device *dev, + struct device_attribute *attr, + char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3 From db4d30fbb71b47e4ecb11c4efa5d8aad4b03dfae Mon Sep 17 00:00:00 2001 From: Vineela Tummalapalli Date: Mon, 4 Nov 2019 12:22:01 +0100 Subject: x86/bugs: Add ITLB_MULTIHIT bug infrastructure Some processors may incur a machine check error possibly resulting in an unrecoverable CPU lockup when an instruction fetch encounters a TLB multi-hit in the instruction TLB. This can occur when the page size is changed along with either the physical address or cache type. The relevant erratum can be found here: https://bugzilla.kernel.org/show_bug.cgi?id=205195 There are other processors affected for which the erratum does not fully disclose the impact. This issue affects both bare-metal x86 page tables and EPT. It can be mitigated by either eliminating the use of large pages or by using careful TLB invalidations when changing the page size in the page tables. Just like Spectre, Meltdown, L1TF and MDS, a new bit has been allocated in MSR_IA32_ARCH_CAPABILITIES (PSCHANGE_MC_NO) and will be set on CPUs which are mitigated against this issue. Signed-off-by: Vineela Tummalapalli Co-developed-by: Pawan Gupta Signed-off-by: Pawan Gupta Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner --- Documentation/ABI/testing/sysfs-devices-system-cpu | 1 + arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 7 +++ arch/x86/kernel/cpu/bugs.c | 13 +++++ arch/x86/kernel/cpu/common.c | 65 ++++++++++++---------- drivers/base/cpu.c | 8 +++ include/linux/cpu.h | 2 + 7 files changed, 67 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 0e77569bd5e0..fc20cde63d1e 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -487,6 +487,7 @@ What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/l1tf /sys/devices/system/cpu/vulnerabilities/mds /sys/devices/system/cpu/vulnerabilities/tsx_async_abort + /sys/devices/system/cpu/vulnerabilities/itlb_multihit Date: January 2018 Contact: Linux kernel mailing list Description: Information about CPU vulnerabilities diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 989e03544f18..c4fbe379cc0b 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -400,5 +400,6 @@ #define X86_BUG_MSBDS_ONLY X86_BUG(20) /* CPU is only affected by the MSDBS variant of BUG_MDS */ #define X86_BUG_SWAPGS X86_BUG(21) /* CPU is affected by speculation through SWAPGS */ #define X86_BUG_TAA X86_BUG(22) /* CPU is affected by TSX Async Abort(TAA) */ +#define X86_BUG_ITLB_MULTIHIT X86_BUG(23) /* CPU may incur MCE during certain page attribute changes */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index b3a8bb2af0b6..6a3124664289 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -93,6 +93,13 @@ * Microarchitectural Data * Sampling (MDS) vulnerabilities. */ +#define ARCH_CAP_PSCHANGE_MC_NO BIT(6) /* + * The processor is not susceptible to a + * machine check error due to modifying the + * code page size along with either the + * physical address or cache type + * without TLB invalidation. + */ #define ARCH_CAP_TSX_CTRL_MSR BIT(7) /* MSR for TSX control is available. */ #define ARCH_CAP_TAA_NO BIT(8) /* * Not susceptible to diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 43c647e19439..5364beda8c61 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1419,6 +1419,11 @@ static ssize_t l1tf_show_state(char *buf) } #endif +static ssize_t itlb_multihit_show_state(char *buf) +{ + return sprintf(buf, "Processor vulnerable\n"); +} + static ssize_t mds_show_state(char *buf) { if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { @@ -1524,6 +1529,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_TAA: return tsx_async_abort_show_state(buf); + case X86_BUG_ITLB_MULTIHIT: + return itlb_multihit_show_state(buf); + default: break; } @@ -1565,4 +1573,9 @@ ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *at { return cpu_show_common(dev, attr, buf, X86_BUG_TAA); } + +ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITLB_MULTIHIT); +} #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f8b8afc8f5b5..d29b71ca3ca7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1016,13 +1016,14 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #endif } -#define NO_SPECULATION BIT(0) -#define NO_MELTDOWN BIT(1) -#define NO_SSB BIT(2) -#define NO_L1TF BIT(3) -#define NO_MDS BIT(4) -#define MSBDS_ONLY BIT(5) -#define NO_SWAPGS BIT(6) +#define NO_SPECULATION BIT(0) +#define NO_MELTDOWN BIT(1) +#define NO_SSB BIT(2) +#define NO_L1TF BIT(3) +#define NO_MDS BIT(4) +#define MSBDS_ONLY BIT(5) +#define NO_SWAPGS BIT(6) +#define NO_ITLB_MULTIHIT BIT(7) #define VULNWL(_vendor, _family, _model, _whitelist) \ { X86_VENDOR_##_vendor, _family, _model, X86_FEATURE_ANY, _whitelist } @@ -1043,27 +1044,27 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL(NSC, 5, X86_MODEL_ANY, NO_SPECULATION), /* Intel Family 6 */ - VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION), - VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION), - VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION), - - VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS), + VULNWL_INTEL(ATOM_SALTWELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_TABLET, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SALTWELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL, NO_SPECULATION | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_BONNELL_MID, NO_SPECULATION | NO_ITLB_MULTIHIT), + + VULNWL_INTEL(ATOM_SILVERMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_D, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_SILVERMONT_MID, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNL, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(XEON_PHI_KNM, NO_SSB | NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(CORE_YONAH, NO_SSB), - VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS), - VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_AIRMONT_MID, NO_L1TF | MSBDS_ONLY | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_AIRMONT_NP, NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS), + VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1074,14 +1075,14 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { */ /* AMD Family 0xf - 0x12 */ - VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x10, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x11, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_AMD(0x12, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), /* FAMILY_ANY must be last, otherwise 0x0f - 0x12 matches won't work */ - VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), - VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS), + VULNWL_AMD(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_HYGON(X86_FAMILY_ANY, NO_MELTDOWN | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), {} }; @@ -1106,6 +1107,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 ia32_cap = x86_read_arch_cap_msr(); + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ + if (!cpu_matches(NO_ITLB_MULTIHIT) && !(ia32_cap & ARCH_CAP_PSCHANGE_MC_NO)) + setup_force_cpu_bug(X86_BUG_ITLB_MULTIHIT); + if (cpu_matches(NO_SPECULATION)) return; diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 0fccd8c0312e..6265871a4af2 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -561,6 +561,12 @@ ssize_t __weak cpu_show_tsx_async_abort(struct device *dev, return sprintf(buf, "Not affected\n"); } +ssize_t __weak cpu_show_itlb_multihit(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); @@ -568,6 +574,7 @@ static DEVICE_ATTR(spec_store_bypass, 0444, cpu_show_spec_store_bypass, NULL); static DEVICE_ATTR(l1tf, 0444, cpu_show_l1tf, NULL); static DEVICE_ATTR(mds, 0444, cpu_show_mds, NULL); static DEVICE_ATTR(tsx_async_abort, 0444, cpu_show_tsx_async_abort, NULL); +static DEVICE_ATTR(itlb_multihit, 0444, cpu_show_itlb_multihit, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -577,6 +584,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_l1tf.attr, &dev_attr_mds.attr, &dev_attr_tsx_async_abort.attr, + &dev_attr_itlb_multihit.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f35369f79771..2a093434e975 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -62,6 +62,8 @@ extern ssize_t cpu_show_mds(struct device *dev, extern ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_itlb_multihit(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, -- cgit v1.2.3 From 731dc9df975a5da21237a18c3384f811a7a41cc6 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Mon, 4 Nov 2019 12:22:02 +0100 Subject: cpu/speculation: Uninline and export CPU mitigations helpers A kernel module may need to check the value of the "mitigations=" kernel command line parameter as part of its setup when the module needs to perform software mitigations for a CPU flaw. Uninline and export the helper functions surrounding the cpu_mitigations enum to allow for their usage from a module. Lastly, privatize the enum and cpu_mitigations variable since the value of cpu_mitigations can be checked with the exported helper functions. Signed-off-by: Tyler Hicks Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner --- include/linux/cpu.h | 25 ++----------------------- kernel/cpu.c | 27 ++++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 24 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 2a093434e975..bc6c879bd110 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -218,28 +218,7 @@ static inline int cpuhp_smt_enable(void) { return 0; } static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } #endif -/* - * These are used for a global "mitigations=" cmdline option for toggling - * optional CPU mitigations. - */ -enum cpu_mitigations { - CPU_MITIGATIONS_OFF, - CPU_MITIGATIONS_AUTO, - CPU_MITIGATIONS_AUTO_NOSMT, -}; - -extern enum cpu_mitigations cpu_mitigations; - -/* mitigations=off */ -static inline bool cpu_mitigations_off(void) -{ - return cpu_mitigations == CPU_MITIGATIONS_OFF; -} - -/* mitigations=auto,nosmt */ -static inline bool cpu_mitigations_auto_nosmt(void) -{ - return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; -} +extern bool cpu_mitigations_off(void); +extern bool cpu_mitigations_auto_nosmt(void); #endif /* _LINUX_CPU_H_ */ diff --git a/kernel/cpu.c b/kernel/cpu.c index fc28e17940e0..e2cad3ee2ead 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -2373,7 +2373,18 @@ void __init boot_cpu_hotplug_init(void) this_cpu_write(cpuhp_state.state, CPUHP_ONLINE); } -enum cpu_mitigations cpu_mitigations __ro_after_init = CPU_MITIGATIONS_AUTO; +/* + * These are used for a global "mitigations=" cmdline option for toggling + * optional CPU mitigations. + */ +enum cpu_mitigations { + CPU_MITIGATIONS_OFF, + CPU_MITIGATIONS_AUTO, + CPU_MITIGATIONS_AUTO_NOSMT, +}; + +static enum cpu_mitigations cpu_mitigations __ro_after_init = + CPU_MITIGATIONS_AUTO; static int __init mitigations_parse_cmdline(char *arg) { @@ -2390,3 +2401,17 @@ static int __init mitigations_parse_cmdline(char *arg) return 0; } early_param("mitigations", mitigations_parse_cmdline); + +/* mitigations=off */ +bool cpu_mitigations_off(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_OFF; +} +EXPORT_SYMBOL_GPL(cpu_mitigations_off); + +/* mitigations=auto,nosmt */ +bool cpu_mitigations_auto_nosmt(void) +{ + return cpu_mitigations == CPU_MITIGATIONS_AUTO_NOSMT; +} +EXPORT_SYMBOL_GPL(cpu_mitigations_auto_nosmt); -- cgit v1.2.3 From c57c80467f90e5504c8df9ad3555d2c78800bf94 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Mon, 4 Nov 2019 12:22:02 +0100 Subject: kvm: Add helper function for creating VM worker threads Add a function to create a kernel thread associated with a given VM. In particular, it ensures that the worker thread inherits the priority and cgroups of the calling thread. Signed-off-by: Junaid Shahid Signed-off-by: Paolo Bonzini Signed-off-by: Thomas Gleixner --- include/linux/kvm_host.h | 6 ++++ virt/kvm/kvm_main.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 719fc3e15ea4..52ed5f66e8f9 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1382,4 +1382,10 @@ static inline int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu) } #endif /* CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE */ +typedef int (*kvm_vm_thread_fn_t)(struct kvm *kvm, uintptr_t data); + +int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, + uintptr_t data, const char *name, + struct task_struct **thread_ptr); + #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d6f0696d98ef..8aed32b604d9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -4371,3 +4372,86 @@ void kvm_exit(void) kvm_vfio_ops_exit(); } EXPORT_SYMBOL_GPL(kvm_exit); + +struct kvm_vm_worker_thread_context { + struct kvm *kvm; + struct task_struct *parent; + struct completion init_done; + kvm_vm_thread_fn_t thread_fn; + uintptr_t data; + int err; +}; + +static int kvm_vm_worker_thread(void *context) +{ + /* + * The init_context is allocated on the stack of the parent thread, so + * we have to locally copy anything that is needed beyond initialization + */ + struct kvm_vm_worker_thread_context *init_context = context; + struct kvm *kvm = init_context->kvm; + kvm_vm_thread_fn_t thread_fn = init_context->thread_fn; + uintptr_t data = init_context->data; + int err; + + err = kthread_park(current); + /* kthread_park(current) is never supposed to return an error */ + WARN_ON(err != 0); + if (err) + goto init_complete; + + err = cgroup_attach_task_all(init_context->parent, current); + if (err) { + kvm_err("%s: cgroup_attach_task_all failed with err %d\n", + __func__, err); + goto init_complete; + } + + set_user_nice(current, task_nice(init_context->parent)); + +init_complete: + init_context->err = err; + complete(&init_context->init_done); + init_context = NULL; + + if (err) + return err; + + /* Wait to be woken up by the spawner before proceeding. */ + kthread_parkme(); + + if (!kthread_should_stop()) + err = thread_fn(kvm, data); + + return err; +} + +int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn, + uintptr_t data, const char *name, + struct task_struct **thread_ptr) +{ + struct kvm_vm_worker_thread_context init_context = {}; + struct task_struct *thread; + + *thread_ptr = NULL; + init_context.kvm = kvm; + init_context.parent = current; + init_context.thread_fn = thread_fn; + init_context.data = data; + init_completion(&init_context.init_done); + + thread = kthread_run(kvm_vm_worker_thread, &init_context, + "%s-%d", name, task_pid_nr(current)); + if (IS_ERR(thread)) + return PTR_ERR(thread); + + /* kthread_run is never supposed to return NULL */ + WARN_ON(thread == NULL); + + wait_for_completion(&init_context.init_done); + + if (!init_context.err) + *thread_ptr = thread; + + return init_context.err; +} -- cgit v1.2.3 From 4e7120d79edb31e4ee68e6f8421448e4603be1e9 Mon Sep 17 00:00:00 2001 From: Eric Auger Date: Fri, 8 Nov 2019 16:58:03 +0100 Subject: iommu/vt-d: Fix QI_DEV_IOTLB_PFSID and QI_DEV_EIOTLB_PFSID macros For both PASID-based-Device-TLB Invalidate Descriptor and Device-TLB Invalidate Descriptor, the Physical Function Source-ID value is split according to this layout: PFSID[3:0] is set at offset 12 and PFSID[15:4] is put at offset 52. Fix the part laid out at offset 52. Fixes: 0f725561e1684 ("iommu/vt-d: Add definitions for PFSID") Signed-off-by: Eric Auger Acked-by: Jacob Pan Cc: stable@vger.kernel.org # v4.19+ Acked-by: Lu Baolu Signed-off-by: Joerg Roedel --- include/linux/intel-iommu.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index ed11ef594378..6d8bf4bdf240 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -336,7 +336,8 @@ enum { #define QI_DEV_IOTLB_SID(sid) ((u64)((sid) & 0xffff) << 32) #define QI_DEV_IOTLB_QDEP(qdep) (((qdep) & 0x1f) << 16) #define QI_DEV_IOTLB_ADDR(addr) ((u64)(addr) & VTD_PAGE_MASK) -#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xfff) << 52)) +#define QI_DEV_IOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \ + ((u64)((pfsid >> 4) & 0xfff) << 52)) #define QI_DEV_IOTLB_SIZE 1 #define QI_DEV_IOTLB_MAX_INVS 32 @@ -360,7 +361,8 @@ enum { #define QI_DEV_EIOTLB_PASID(p) (((u64)p) << 32) #define QI_DEV_EIOTLB_SID(sid) ((u64)((sid) & 0xffff) << 16) #define QI_DEV_EIOTLB_QDEP(qd) ((u64)((qd) & 0x1f) << 4) -#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | ((u64)(pfsid & 0xfff) << 52)) +#define QI_DEV_EIOTLB_PFSID(pfsid) (((u64)(pfsid & 0xf) << 12) | \ + ((u64)((pfsid >> 4) & 0xfff) << 52)) #define QI_DEV_EIOTLB_MAX_INVS 32 /* Page group response descriptor QW0 */ -- cgit v1.2.3 From a78986aae9b2988f8493f9f65a587ee433e83bc3 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Mon, 11 Nov 2019 14:12:27 -0800 Subject: KVM: MMU: Do not treat ZONE_DEVICE pages as being reserved Explicitly exempt ZONE_DEVICE pages from kvm_is_reserved_pfn() and instead manually handle ZONE_DEVICE on a case-by-case basis. For things like page refcounts, KVM needs to treat ZONE_DEVICE pages like normal pages, e.g. put pages grabbed via gup(). But for flows such as setting A/D bits or shifting refcounts for transparent huge pages, KVM needs to to avoid processing ZONE_DEVICE pages as the flows in question lack the underlying machinery for proper handling of ZONE_DEVICE pages. This fixes a hang reported by Adam Borowski[*] in dev_pagemap_cleanup() when running a KVM guest backed with /dev/dax memory, as KVM straight up doesn't put any references to ZONE_DEVICE pages acquired by gup(). Note, Dan Williams proposed an alternative solution of doing put_page() on ZONE_DEVICE pages immediately after gup() in order to simplify the auditing needed to ensure is_zone_device_page() is called if and only if the backing device is pinned (via gup()). But that approach would break kvm_vcpu_{un}map() as KVM requires the page to be pinned from map() 'til unmap() when accessing guest memory, unlike KVM's secondary MMU, which coordinates with mmu_notifier invalidations to avoid creating stale page references, i.e. doesn't rely on pages being pinned. [*] http://lkml.kernel.org/r/20190919115547.GA17963@angband.pl Reported-by: Adam Borowski Analyzed-by: David Hildenbrand Acked-by: Dan Williams Cc: stable@vger.kernel.org Fixes: 3565fce3a659 ("mm, x86: get_user_pages() for dax mappings") Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu.c | 8 ++++---- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 26 +++++++++++++++++++++++--- 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 24c23c66b226..bf82b1f2e834 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3306,7 +3306,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, * here. */ if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && - level == PT_PAGE_TABLE_LEVEL && + !kvm_is_zone_device_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompoundMap(pfn_to_page(pfn)) && !mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; @@ -5914,9 +5914,9 @@ restart: * the guest, and the guest page table is using 4K page size * mapping if the indirect sp has level = 1. */ - if (sp->role.direct && - !kvm_is_reserved_pfn(pfn) && - PageTransCompoundMap(pfn_to_page(pfn))) { + if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && + !kvm_is_zone_device_pfn(pfn) && + PageTransCompoundMap(pfn_to_page(pfn))) { pte_list_remove(rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 719fc3e15ea4..290dbe353a47 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -966,6 +966,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); bool kvm_is_reserved_pfn(kvm_pfn_t pfn); +bool kvm_is_zone_device_pfn(kvm_pfn_t pfn); struct kvm_irq_ack_notifier { struct hlist_node link; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index e7a07132cd7f..0dac149ead16 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -149,10 +149,30 @@ __weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, return 0; } +bool kvm_is_zone_device_pfn(kvm_pfn_t pfn) +{ + /* + * The metadata used by is_zone_device_page() to determine whether or + * not a page is ZONE_DEVICE is guaranteed to be valid if and only if + * the device has been pinned, e.g. by get_user_pages(). WARN if the + * page_count() is zero to help detect bad usage of this helper. + */ + if (!pfn_valid(pfn) || WARN_ON_ONCE(!page_count(pfn_to_page(pfn)))) + return false; + + return is_zone_device_page(pfn_to_page(pfn)); +} + bool kvm_is_reserved_pfn(kvm_pfn_t pfn) { + /* + * ZONE_DEVICE pages currently set PG_reserved, but from a refcounting + * perspective they are "normal" pages, albeit with slightly different + * usage rules. + */ if (pfn_valid(pfn)) - return PageReserved(pfn_to_page(pfn)); + return PageReserved(pfn_to_page(pfn)) && + !kvm_is_zone_device_pfn(pfn); return true; } @@ -1857,7 +1877,7 @@ EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty); void kvm_set_pfn_dirty(kvm_pfn_t pfn) { - if (!kvm_is_reserved_pfn(pfn)) { + if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) { struct page *page = pfn_to_page(pfn); SetPageDirty(page); @@ -1867,7 +1887,7 @@ EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty); void kvm_set_pfn_accessed(kvm_pfn_t pfn) { - if (!kvm_is_reserved_pfn(pfn)) + if (!kvm_is_reserved_pfn(pfn) && !kvm_is_zone_device_pfn(pfn)) mark_page_accessed(pfn_to_page(pfn)); } EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); -- cgit v1.2.3 From 975987e7015bb12a482df7f14fd524417d2c8e8f Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Thu, 7 Nov 2019 11:55:42 +0100 Subject: can: af_can: export can_sock_destruct() In j1939 we need our own struct sock::sk_destruct callback. Export the generic af_can can_sock_destruct() that allows us to chain-call it. Fixes: 9d71dd0c7009 ("can: add support of SAE J1939 protocol") Signed-off-by: Oleksij Rempel --- include/linux/can/core.h | 1 + net/can/af_can.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/can/core.h b/include/linux/can/core.h index 8339071ab08b..e20a0cd09ba5 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -65,5 +65,6 @@ extern void can_rx_unregister(struct net *net, struct net_device *dev, void *data); extern int can_send(struct sk_buff *skb, int loop); +void can_sock_destruct(struct sock *sk); #endif /* !_CAN_CORE_H */ diff --git a/net/can/af_can.c b/net/can/af_can.c index 5518a7d9eed9..128d37a4c2e0 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -86,11 +86,12 @@ static atomic_t skbcounter = ATOMIC_INIT(0); /* af_can socket functions */ -static void can_sock_destruct(struct sock *sk) +void can_sock_destruct(struct sock *sk) { skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_error_queue); } +EXPORT_SYMBOL(can_sock_destruct); static const struct can_proto *can_get_proto(int protocol) { -- cgit v1.2.3 From 2c91f8fc6c999fe10185d8ad99fda1759f662f70 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 15 Nov 2019 17:34:57 -0800 Subject: mm/memory_hotplug: fix try_offline_node() try_offline_node() is pretty much broken right now: - The node span is updated when onlining memory, not when adding it. We ignore memory that was mever onlined. Bad. - We touch possible garbage memmaps. The pfn_to_nid(pfn) can easily trigger a kernel panic. Bad for memory that is offline but also bad for subsection hotadd with ZONE_DEVICE, whereby the memmap of the first PFN of a section might contain garbage. - Sections belonging to mixed nodes are not properly considered. As memory blocks might belong to multiple nodes, we would have to walk all pageblocks (or at least subsections) within present sections. However, we don't have a way to identify whether a memmap that is not online was initialized (relevant for ZONE_DEVICE). This makes things more complicated. Luckily, we can piggy pack on the node span and the nid stored in memory blocks. Currently, the node span is grown when calling move_pfn_range_to_zone() - e.g., when onlining memory, and shrunk when removing memory, before calling try_offline_node(). Sysfs links are created via link_mem_sections(), e.g., during boot or when adding memory. If the node still spans memory or if any memory block belongs to the nid, we don't set the node offline. As memory blocks that span multiple nodes cannot get offlined, the nid stored in memory blocks is reliable enough (for such online memory blocks, the node still spans the memory). Introduce for_each_memory_block() to efficiently walk all memory blocks. Note: We will soon stop shrinking the ZONE_DEVICE zone and the node span when removing ZONE_DEVICE memory to fix similar issues (access of garbage memmaps) - until we have a reliable way to identify whether these memmaps were properly initialized. This implies later, that once a node had ZONE_DEVICE memory, we won't be able to set a node offline - which should be acceptable. Since commit f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") memory that is added is not assoziated with a zone/node (memmap not initialized). The introducing commit 60a5a19e7419 ("memory-hotplug: remove sysfs file of node") already missed that we could have multiple nodes for a section and that the zone/node span is updated when onlining pages, not when adding them. I tested this by hotplugging two DIMMs to a memory-less and cpu-less NUMA node. The node is properly onlined when adding the DIMMs. When removing the DIMMs, the node is properly offlined. Masayoshi Mizuma reported: : Without this patch, memory hotplug fails as panic: : : BUG: kernel NULL pointer dereference, address: 0000000000000000 : ... : Call Trace: : remove_memory_block_devices+0x81/0xc0 : try_remove_memory+0xb4/0x130 : __remove_memory+0xa/0x20 : acpi_memory_device_remove+0x84/0x100 : acpi_bus_trim+0x57/0x90 : acpi_bus_trim+0x2e/0x90 : acpi_device_hotplug+0x2b2/0x4d0 : acpi_hotplug_work_fn+0x1a/0x30 : process_one_work+0x171/0x380 : worker_thread+0x49/0x3f0 : kthread+0xf8/0x130 : ret_from_fork+0x35/0x40 [david@redhat.com: v3] Link: http://lkml.kernel.org/r/20191102120221.7553-1-david@redhat.com Link: http://lkml.kernel.org/r/20191028105458.28320-1-david@redhat.com Fixes: 60a5a19e7419 ("memory-hotplug: remove sysfs file of node") Fixes: f1dd2cd13c4b ("mm, memory_hotplug: do not associate hotadded memory to zones until online") # visiable after d0dc12e86b319 Signed-off-by: David Hildenbrand Tested-by: Masayoshi Mizuma Cc: Tang Chen Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Keith Busch Cc: Jiri Olsa Cc: "Peter Zijlstra (Intel)" Cc: Jani Nikula Cc: Nayna Jain Cc: Michal Hocko Cc: Oscar Salvador Cc: Stephen Rothwell Cc: Dan Williams Cc: Pavel Tatashin Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/memory.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/memory.h | 1 + mm/memory_hotplug.c | 43 +++++++++++++++++++++++++++---------------- 3 files changed, 64 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 55907c27075b..84c4e1f72cbd 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -872,3 +872,39 @@ int walk_memory_blocks(unsigned long start, unsigned long size, } return ret; } + +struct for_each_memory_block_cb_data { + walk_memory_blocks_func_t func; + void *arg; +}; + +static int for_each_memory_block_cb(struct device *dev, void *data) +{ + struct memory_block *mem = to_memory_block(dev); + struct for_each_memory_block_cb_data *cb_data = data; + + return cb_data->func(mem, cb_data->arg); +} + +/** + * for_each_memory_block - walk through all present memory blocks + * + * @arg: argument passed to func + * @func: callback for each memory block walked + * + * This function walks through all present memory blocks, calling func on + * each memory block. + * + * In case func() returns an error, walking is aborted and the error is + * returned. + */ +int for_each_memory_block(void *arg, walk_memory_blocks_func_t func) +{ + struct for_each_memory_block_cb_data cb_data = { + .func = func, + .arg = arg, + }; + + return bus_for_each_dev(&memory_subsys, NULL, &cb_data, + for_each_memory_block_cb); +} diff --git a/include/linux/memory.h b/include/linux/memory.h index 0ebb105eb261..4c75dae8dd29 100644 --- a/include/linux/memory.h +++ b/include/linux/memory.h @@ -119,6 +119,7 @@ extern struct memory_block *find_memory_block(struct mem_section *); typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *); extern int walk_memory_blocks(unsigned long start, unsigned long size, void *arg, walk_memory_blocks_func_t func); +extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func); #define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<nid == nid ? -EEXIST : 0; +} + /** * try_offline_node * @nid: the node ID @@ -1658,25 +1670,24 @@ static int check_cpu_on_node(pg_data_t *pgdat) void try_offline_node(int nid) { pg_data_t *pgdat = NODE_DATA(nid); - unsigned long start_pfn = pgdat->node_start_pfn; - unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages; - unsigned long pfn; - - for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) { - unsigned long section_nr = pfn_to_section_nr(pfn); - - if (!present_section_nr(section_nr)) - continue; + int rc; - if (pfn_to_nid(pfn) != nid) - continue; + /* + * If the node still spans pages (especially ZONE_DEVICE), don't + * offline it. A node spans memory after move_pfn_range_to_zone(), + * e.g., after the memory block was onlined. + */ + if (pgdat->node_spanned_pages) + return; - /* - * some memory sections of this node are not removed, and we - * can't offline node now. - */ + /* + * Especially offline memory blocks might not be spanned by the + * node. They will get spanned by the node once they get onlined. + * However, they link to the node in sysfs and can get onlined later. + */ + rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb); + if (rc) return; - } if (check_cpu_on_node(pgdat)) return; -- cgit v1.2.3