From c54cdf141c40a5115774e91fc947c34e91df0259 Mon Sep 17 00:00:00 2001 From: Liang Chen Date: Wed, 16 Mar 2016 19:33:16 +0800 Subject: KVM: x86: optimize steal time calculation Since accumulate_steal_time is now only called in record_steal_time, it doesn't quite make sense to put the delta calculation in a separate function. The function could be called thousands of times before guest enables the steal time MSR (though the compiler may optimize out this function call). And after it's enabled, the MSR enable bit is tested twice every time. Removing the accumulate_steal_time function also avoids the necessity of having the accum_steal field. Signed-off-by: Liang Chen Signed-off-by: Gavin Guo Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index b7e394485a5f..c66e26280707 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -562,7 +562,6 @@ struct kvm_vcpu_arch { struct { u64 msr_val; u64 last_steal; - u64 accum_steal; struct gfn_to_hva_cache stime; struct kvm_steal_time steal; } st; -- cgit v1.2.3 From 775d054aba90a2c787d4c081d6369f1fddaae0f4 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 1 May 2016 22:11:59 +0200 Subject: intel_telemetry: Constify telemetry_core_ops structures The telemetry_core_ops structures are never modified, so declare them as const. Done with the help of Coccinelle. Signed-off-by: Julia Lawall Signed-off-by: Darren Hart --- arch/x86/include/asm/intel_telemetry.h | 2 +- drivers/platform/x86/intel_telemetry_core.c | 6 +++--- drivers/platform/x86/intel_telemetry_pltdrv.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/intel_telemetry.h b/arch/x86/include/asm/intel_telemetry.h index ed65fe701de5..85029b58d0cd 100644 --- a/arch/x86/include/asm/intel_telemetry.h +++ b/arch/x86/include/asm/intel_telemetry.h @@ -99,7 +99,7 @@ struct telemetry_core_ops { int (*reset_events)(void); }; -int telemetry_set_pltdata(struct telemetry_core_ops *ops, +int telemetry_set_pltdata(const struct telemetry_core_ops *ops, struct telemetry_plt_config *pltconfig); int telemetry_clear_pltdata(void); diff --git a/drivers/platform/x86/intel_telemetry_core.c b/drivers/platform/x86/intel_telemetry_core.c index a695a436a1c3..0d4c3808a6d8 100644 --- a/drivers/platform/x86/intel_telemetry_core.c +++ b/drivers/platform/x86/intel_telemetry_core.c @@ -25,7 +25,7 @@ struct telemetry_core_config { struct telemetry_plt_config *plt_config; - struct telemetry_core_ops *telem_ops; + const struct telemetry_core_ops *telem_ops; }; static struct telemetry_core_config telm_core_conf; @@ -95,7 +95,7 @@ static int telemetry_def_reset_events(void) return 0; } -static struct telemetry_core_ops telm_defpltops = { +static const struct telemetry_core_ops telm_defpltops = { .set_sampling_period = telemetry_def_set_sampling_period, .get_sampling_period = telemetry_def_get_sampling_period, .get_trace_verbosity = telemetry_def_get_trace_verbosity, @@ -332,7 +332,7 @@ EXPORT_SYMBOL_GPL(telemetry_set_trace_verbosity); * * Return: 0 success, < 0 for failure */ -int telemetry_set_pltdata(struct telemetry_core_ops *ops, +int telemetry_set_pltdata(const struct telemetry_core_ops *ops, struct telemetry_plt_config *pltconfig) { if (ops) diff --git a/drivers/platform/x86/intel_telemetry_pltdrv.c b/drivers/platform/x86/intel_telemetry_pltdrv.c index 397119f83e82..1347da642d44 100644 --- a/drivers/platform/x86/intel_telemetry_pltdrv.c +++ b/drivers/platform/x86/intel_telemetry_pltdrv.c @@ -1081,7 +1081,7 @@ out: return ret; } -static struct telemetry_core_ops telm_pltops = { +static const struct telemetry_core_ops telm_pltops = { .get_trace_verbosity = telemetry_plt_get_trace_verbosity, .set_trace_verbosity = telemetry_plt_set_trace_verbosity, .set_sampling_period = telemetry_plt_set_sampling_period, -- cgit v1.2.3 From 3491caf2755e9f312666712510d80b00c81ff247 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Fri, 13 May 2016 12:16:35 +0200 Subject: KVM: halt_polling: provide a way to qualify wakeups during poll MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some wakeups should not be considered a sucessful poll. For example on s390 I/O interrupts are usually floating, which means that _ALL_ CPUs would be considered runnable - letting all vCPUs poll all the time for transactional like workload, even if one vCPU would be enough. This can result in huge CPU usage for large guests. This patch lets architectures provide a way to qualify wakeups if they should be considered a good/bad wakeups in regard to polls. For s390 the implementation will fence of halt polling for anything but known good, single vCPU events. The s390 implementation for floating interrupts does a wakeup for one vCPU, but the interrupt will be delivered by whatever CPU checks first for a pending interrupt. We prefer the woken up CPU by marking the poll of this CPU as "good" poll. This code will also mark several other wakeup reasons like IPI or expired timers as "good". This will of course also mark some events as not sucessful. As KVM on z runs always as a 2nd level hypervisor, we prefer to not poll, unless we are really sure, though. This patch successfully limits the CPU usage for cases like uperf 1byte transactional ping pong workload or wakeup heavy workload like OLTP while still providing a proper speedup. This also introduced a new vcpu stat "halt_poll_no_tuning" that marks wakeups that are considered not good for polling. Signed-off-by: Christian Borntraeger Acked-by: Radim Krčmář (for an earlier version) Cc: David Matlack Cc: Wanpeng Li [Rename config symbol. - Paolo] Signed-off-by: Paolo Bonzini --- arch/arm/include/asm/kvm_host.h | 2 ++ arch/arm64/include/asm/kvm_host.h | 2 ++ arch/mips/include/asm/kvm_host.h | 2 ++ arch/mips/kvm/mips.c | 1 + arch/powerpc/include/asm/kvm_host.h | 2 ++ arch/powerpc/kvm/book3s.c | 1 + arch/powerpc/kvm/booke.c | 1 + arch/s390/include/asm/kvm_host.h | 3 +++ arch/s390/kvm/Kconfig | 1 + arch/s390/kvm/interrupt.c | 5 +++++ arch/s390/kvm/kvm-s390.c | 6 ++++++ arch/x86/include/asm/kvm_host.h | 2 ++ arch/x86/kvm/x86.c | 1 + include/linux/kvm_host.h | 15 +++++++++++++++ include/trace/events/kvm.h | 11 +++++++---- virt/kvm/Kconfig | 3 +++ virt/kvm/kvm_main.c | 8 ++++++-- 17 files changed, 60 insertions(+), 6 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 385070180c25..4cd8732796ab 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -187,6 +187,7 @@ struct kvm_vm_stat { struct kvm_vcpu_stat { u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; u32 hvc_exit_stat; u64 wfe_exit_stat; @@ -282,6 +283,7 @@ static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} static inline void kvm_arm_init_debug(void) {} static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index f5c6bd2541ef..d49399d9890d 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -293,6 +293,7 @@ struct kvm_vm_stat { struct kvm_vcpu_stat { u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; u32 hvc_exit_stat; u64 wfe_exit_stat; @@ -357,6 +358,7 @@ static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} void kvm_arm_init_debug(void); void kvm_arm_setup_debug(struct kvm_vcpu *vcpu); diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 942b8f6bf35b..9a37a1044032 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -122,6 +122,7 @@ struct kvm_vcpu_stat { u32 flush_dcache_exits; u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; }; @@ -812,5 +813,6 @@ static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} #endif /* __MIPS_KVM_HOST_H__ */ diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 23b209463238..dc052fb5c7a2 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -56,6 +56,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), KVM_STAT_VCPU }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid), KVM_STAT_VCPU }, { "halt_wakeup", VCPU_STAT(halt_wakeup), KVM_STAT_VCPU }, {NULL} }; diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index a07645c17818..ec35af34a3fb 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -116,6 +116,7 @@ struct kvm_vcpu_stat { u32 ext_intr_exits; u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; u32 dbell_exits; u32 gdbell_exits; @@ -727,5 +728,6 @@ static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {} static inline void kvm_arch_exit(void) {} static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} #endif /* __POWERPC_KVM_HOST_H__ */ diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index b34220d2aa42..47018fcbf7d6 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -54,6 +54,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "queue_intr", VCPU_STAT(queue_intr) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll), }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "pf_storage", VCPU_STAT(pf_storage) }, { "sp_storage", VCPU_STAT(sp_storage) }, diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 4d66f44a1657..4afae695899a 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -64,6 +64,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "ext_intr", VCPU_STAT(ext_intr_exits) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "doorbell", VCPU_STAT(dbell_exits) }, { "guest doorbell", VCPU_STAT(gdbell_exits) }, diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index 9282ccf1d136..53d794538067 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -247,6 +247,7 @@ struct kvm_vcpu_stat { u32 exit_instruction; u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; u32 instruction_lctl; u32 instruction_lctlg; @@ -696,4 +697,6 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm, static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu); + #endif diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig index 5ea5af3c7db7..b1900239b0ab 100644 --- a/arch/s390/kvm/Kconfig +++ b/arch/s390/kvm/Kconfig @@ -28,6 +28,7 @@ config KVM select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_INVALID_WAKEUPS select SRCU select KVM_VFIO ---help--- diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index e55040467eb5..5a80af740d3e 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -977,6 +977,11 @@ no_timer: void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu) { + /* + * We cannot move this into the if, as the CPU might be already + * in kvm_vcpu_block without having the waitqueue set (polling) + */ + vcpu->valid_wakeup = true; if (swait_active(&vcpu->wq)) { /* * The vcpu gave up the cpu voluntarily, mark it as a good diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index c597201a5ca9..6d8ec3ac9dd8 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -65,6 +65,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "instruction_lctlg", VCPU_STAT(instruction_lctlg) }, { "instruction_lctl", VCPU_STAT(instruction_lctl) }, @@ -2992,6 +2993,11 @@ static inline unsigned long nonhyp_mask(int i) return 0x0000ffffffffffffUL >> (nonhyp_fai << 4); } +void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) +{ + vcpu->valid_wakeup = false; +} + static int __init kvm_s390_init(void) { int i; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c66e26280707..c99494b4bdf7 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -803,6 +803,7 @@ struct kvm_vcpu_stat { u32 halt_exits; u32 halt_successful_poll; u32 halt_attempted_poll; + u32 halt_poll_invalid; u32 halt_wakeup; u32 request_irq_exits; u32 irq_exits; @@ -1342,5 +1343,6 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6c774cdf553c..bcef92fc41d8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -161,6 +161,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "halt_exits", VCPU_STAT(halt_exits) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, + { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, { "request_irq", VCPU_STAT(request_irq_exits) }, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 92a0229044fb..bbcd921d7cb0 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -229,6 +229,7 @@ struct kvm_vcpu { sigset_t sigset; struct kvm_vcpu_stat stat; unsigned int halt_poll_ns; + bool valid_wakeup; #ifdef CONFIG_HAS_IOMEM int mmio_needed; @@ -1196,4 +1197,18 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); #endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */ +#ifdef CONFIG_HAVE_KVM_INVALID_WAKEUPS +/* If we wakeup during the poll time, was it a sucessful poll? */ +static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) +{ + return vcpu->valid_wakeup; +} + +#else +static inline bool vcpu_valid_wakeup(struct kvm_vcpu *vcpu) +{ + return true; +} +#endif /* CONFIG_HAVE_KVM_INVALID_WAKEUPS */ + #endif diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h index aa69253ecc7d..526fb3d2e43a 100644 --- a/include/trace/events/kvm.h +++ b/include/trace/events/kvm.h @@ -38,22 +38,25 @@ TRACE_EVENT(kvm_userspace_exit, ); TRACE_EVENT(kvm_vcpu_wakeup, - TP_PROTO(__u64 ns, bool waited), - TP_ARGS(ns, waited), + TP_PROTO(__u64 ns, bool waited, bool valid), + TP_ARGS(ns, waited, valid), TP_STRUCT__entry( __field( __u64, ns ) __field( bool, waited ) + __field( bool, valid ) ), TP_fast_assign( __entry->ns = ns; __entry->waited = waited; + __entry->valid = valid; ), - TP_printk("%s time %lld ns", + TP_printk("%s time %lld ns, polling %s", __entry->waited ? "wait" : "poll", - __entry->ns) + __entry->ns, + __entry->valid ? "valid" : "invalid") ); #if defined(CONFIG_HAVE_KVM_IRQFD) diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig index 7a79b6853583..e5d6108f5e85 100644 --- a/virt/kvm/Kconfig +++ b/virt/kvm/Kconfig @@ -41,6 +41,9 @@ config KVM_VFIO config HAVE_KVM_ARCH_TLB_FLUSH_ALL bool +config HAVE_KVM_INVALID_WAKEUPS + bool + config KVM_GENERIC_DIRTYLOG_READ_PROTECT bool diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ed3d9bb18a56..21f6498d52e3 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -2028,6 +2028,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) */ if (kvm_vcpu_check_block(vcpu) < 0) { ++vcpu->stat.halt_successful_poll; + if (!vcpu_valid_wakeup(vcpu)) + ++vcpu->stat.halt_poll_invalid; goto out; } cur = ktime_get(); @@ -2057,7 +2059,8 @@ out: if (block_ns <= vcpu->halt_poll_ns) ; /* we had a long block, shrink polling */ - else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns) + else if (!vcpu_valid_wakeup(vcpu) || + (vcpu->halt_poll_ns && block_ns > halt_poll_ns)) shrink_halt_poll_ns(vcpu); /* we had a short halt and our poll time is too small */ else if (vcpu->halt_poll_ns < halt_poll_ns && @@ -2066,7 +2069,8 @@ out: } else vcpu->halt_poll_ns = 0; - trace_kvm_vcpu_wakeup(block_ns, waited); + trace_kvm_vcpu_wakeup(block_ns, waited, vcpu_valid_wakeup(vcpu)); + kvm_arch_vcpu_block_finish(vcpu); } EXPORT_SYMBOL_GPL(kvm_vcpu_block); -- cgit v1.2.3 From e8df1a95b685af84a81698199ee206e0e66a8b44 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 13 May 2016 15:13:28 -0700 Subject: x86/cpufeature, x86/mm/pkeys: Fix broken compile-time disabling of pkeys When I added support for the Memory Protection Keys processor feature, I had to reindent the REQUIRED/DISABLED_MASK macros, and also consult the later cpufeature words. I'm not quite sure how I bungled it, but I consulted the wrong word at the end. This only affected required or disabled cpu features in cpufeature words 14, 15 and 16. So, only Protection Keys itself was screwed over here. The result was that if you disabled pkeys in your .config, you might still see some code show up that should have been compiled out. There should be no functional problems, though. In verifying this patch I also realized that the DISABLE_PKU/OSPKE macros were defined backwards and that the cpu_has() check in setup_pku() was not doing the compile-time disabled checks. So also fix the macro for DISABLE_PKU/OSPKE and add a compile-time check for pkeys being enabled in setup_pku(). Signed-off-by: Dave Hansen Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Dave Hansen Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Fixes: dfb4a70f20c5 ("x86/cpufeature, x86/mm/pkeys: Add protection keys related CPUID definitions") Link: http://lkml.kernel.org/r/20160513221328.C200930B@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 12 ++++++------ arch/x86/include/asm/disabled-features.h | 6 +++--- arch/x86/kernel/cpu/common.c | 4 ++++ 3 files changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3636ec06c887..aeab47932933 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -63,9 +63,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; (((bit)>>5)==11 && (1UL<<((bit)&31) & REQUIRED_MASK11)) || \ (((bit)>>5)==12 && (1UL<<((bit)&31) & REQUIRED_MASK12)) || \ (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK13)) || \ - (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK14)) || \ - (((bit)>>5)==13 && (1UL<<((bit)&31) & REQUIRED_MASK15)) || \ - (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK16)) ) + (((bit)>>5)==14 && (1UL<<((bit)&31) & REQUIRED_MASK14)) || \ + (((bit)>>5)==15 && (1UL<<((bit)&31) & REQUIRED_MASK15)) || \ + (((bit)>>5)==16 && (1UL<<((bit)&31) & REQUIRED_MASK16)) ) #define DISABLED_MASK_BIT_SET(bit) \ ( (((bit)>>5)==0 && (1UL<<((bit)&31) & DISABLED_MASK0 )) || \ @@ -82,9 +82,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; (((bit)>>5)==11 && (1UL<<((bit)&31) & DISABLED_MASK11)) || \ (((bit)>>5)==12 && (1UL<<((bit)&31) & DISABLED_MASK12)) || \ (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK13)) || \ - (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK14)) || \ - (((bit)>>5)==13 && (1UL<<((bit)&31) & DISABLED_MASK15)) || \ - (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK16)) ) + (((bit)>>5)==14 && (1UL<<((bit)&31) & DISABLED_MASK14)) || \ + (((bit)>>5)==15 && (1UL<<((bit)&31) & DISABLED_MASK15)) || \ + (((bit)>>5)==16 && (1UL<<((bit)&31) & DISABLED_MASK16)) ) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 39343be7d4f4..911e9358ceb1 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -29,11 +29,11 @@ #endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS -# define DISABLE_PKU (1<<(X86_FEATURE_PKU)) -# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE)) -#else # define DISABLE_PKU 0 # define DISABLE_OSPKE 0 +#else +# define DISABLE_PKU (1<<(X86_FEATURE_PKU & 31)) +# define DISABLE_OSPKE (1<<(X86_FEATURE_OSPKE & 31)) #endif /* CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS */ /* diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8394b3d1f94f..f45a4b9d28c8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -310,6 +310,10 @@ static bool pku_disabled; static __always_inline void setup_pku(struct cpuinfo_x86 *c) { + /* check the boot processor, plus compile options for PKU: */ + if (!cpu_feature_enabled(X86_FEATURE_PKU)) + return; + /* checks the actual processor's cpuid bits: */ if (!cpu_has(c, X86_FEATURE_PKU)) return; if (pku_disabled) -- cgit v1.2.3 From 03543133cea646406307870823343912c1ef0a3a Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Wed, 4 May 2016 14:09:42 -0500 Subject: KVM: x86: Introducing kvm_x86_ops VM init/destroy hooks Adding function pointers in struct kvm_x86_ops for processor-specific layer to provide hooks for when KVM initialize and destroy VM. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/x86.c | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c99494b4bdf7..d644226737ee 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -848,6 +848,9 @@ struct kvm_x86_ops { bool (*cpu_has_high_real_mode_segbase)(void); void (*cpuid_update)(struct kvm_vcpu *vcpu); + int (*vm_init)(struct kvm *kvm); + void (*vm_destroy)(struct kvm *kvm); + /* Create, but do not attach this VCPU */ struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); void (*vcpu_free)(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bcef92fc41d8..7fcaed49b3c5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7756,6 +7756,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); + if (kvm_x86_ops->vm_init) + return kvm_x86_ops->vm_init(kvm); + return 0; } @@ -7877,6 +7880,8 @@ void kvm_arch_destroy_vm(struct kvm *kvm) x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0); x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); } + if (kvm_x86_ops->vm_destroy) + kvm_x86_ops->vm_destroy(kvm); kvm_iommu_unmap_guest(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); -- cgit v1.2.3 From d1ed092f77ef562edcc1b45ae331ff1683f50295 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Wed, 4 May 2016 14:09:43 -0500 Subject: KVM: x86: Introducing kvm_x86_ops VCPU blocking/unblocking hooks Adding new function pointer in struct kvm_x86_ops, and calling them from the kvm_arch_vcpu[blocking/unblocking]. Signed-off-by: Suravee Suthikulpanit Reviewed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index d644226737ee..2df5db6fb58b 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -993,6 +993,10 @@ struct kvm_x86_ops { */ int (*pre_block)(struct kvm_vcpu *vcpu); void (*post_block)(struct kvm_vcpu *vcpu); + + void (*vcpu_blocking)(struct kvm_vcpu *vcpu); + void (*vcpu_unblocking)(struct kvm_vcpu *vcpu); + int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); }; @@ -1344,8 +1348,18 @@ bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm_lapic_irq *irq); -static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {} -static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} +static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->vcpu_blocking) + kvm_x86_ops->vcpu_blocking(vcpu); +} + +static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) +{ + if (kvm_x86_ops->vcpu_unblocking) + kvm_x86_ops->vcpu_unblocking(vcpu); +} + static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} #endif /* _ASM_X86_KVM_HOST_H */ -- cgit v1.2.3 From 3d5615e5342781453633af7107d72afd601b36a7 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Wed, 4 May 2016 14:09:45 -0500 Subject: svm: Introduce new AVIC VMCB registers Introduce new AVIC VMCB registers. Signed-off-by: Suravee Suthikulpanit Reviewed-by: Paolo Bonzini Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/svm.h | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 6136d99f537b..4711fa4457f3 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -78,7 +78,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u32 exit_int_info; u32 exit_int_info_err; u64 nested_ctl; - u8 reserved_4[16]; + u64 avic_vapic_bar; + u8 reserved_4[8]; u32 event_inj; u32 event_inj_err; u64 nested_cr3; @@ -88,7 +89,11 @@ struct __attribute__ ((__packed__)) vmcb_control_area { u64 next_rip; u8 insn_len; u8 insn_bytes[15]; - u8 reserved_6[800]; + u64 avic_backing_page; /* Offset 0xe0 */ + u8 reserved_6[8]; /* Offset 0xe8 */ + u64 avic_logical_id; /* Offset 0xf0 */ + u64 avic_physical_id; /* Offset 0xf8 */ + u8 reserved_7[768]; }; -- cgit v1.2.3 From 44a95dae1d229a4967bbfcc56fb2116a9b4fe942 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Wed, 4 May 2016 14:09:46 -0500 Subject: KVM: x86: Detect and Initialize AVIC support This patch introduces AVIC-related data structure, and AVIC initialization code. There are three main data structures for AVIC: * Virtual APIC (vAPIC) backing page (per-VCPU) * Physical APIC ID table (per-VM) * Logical APIC ID table (per-VM) Currently, AVIC is disabled by default. Users can manually enable AVIC via kernel boot option kvm-amd.avic=1 or during kvm-amd module loading with parameter avic=1. Signed-off-by: Suravee Suthikulpanit [Avoid extra indentation (Boris). - Paolo] Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 4 + arch/x86/include/asm/svm.h | 3 + arch/x86/kvm/svm.c | 225 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 231 insertions(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2df5db6fb58b..337e13b0d01d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -773,6 +773,10 @@ struct kvm_arch { u8 nr_reserved_ioapic_pins; bool disabled_lapic_found; + + /* Struct members for AVIC */ + struct page *avic_logical_id_table_page; + struct page *avic_physical_id_table_page; }; struct kvm_vm_stat { diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index 4711fa4457f3..d0fe23ec7e98 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -116,6 +116,9 @@ struct __attribute__ ((__packed__)) vmcb_control_area { #define V_INTR_MASKING_SHIFT 24 #define V_INTR_MASKING_MASK (1 << V_INTR_MASKING_SHIFT) +#define AVIC_ENABLE_SHIFT 31 +#define AVIC_ENABLE_MASK (1 << AVIC_ENABLE_SHIFT) + #define SVM_INTERRUPT_SHADOW_MASK 1 #define SVM_IOIO_STR_SHIFT 2 diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 31346a3f20a5..192cad25d321 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -14,6 +14,9 @@ * the COPYING file in the top-level directory. * */ + +#define pr_fmt(fmt) "SVM: " fmt + #include #include "irq.h" @@ -78,6 +81,14 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); #define TSC_RATIO_MIN 0x0000000000000001ULL #define TSC_RATIO_MAX 0x000000ffffffffffULL +#define AVIC_HPA_MASK ~((0xFFFULL << 52) || 0xFFF) + +/* + * 0xff is broadcast, so the max index allowed for physical APIC ID + * table is 0xfe. APIC IDs above 0xff are reserved. + */ +#define AVIC_MAX_PHYSICAL_ID_COUNT 255 + static bool erratum_383_found __read_mostly; static const u32 host_save_user_msrs[] = { @@ -162,8 +173,19 @@ struct vcpu_svm { /* cached guest cpuid flags for faster access */ bool nrips_enabled : 1; + + struct page *avic_backing_page; + u64 *avic_physical_id_cache; }; +#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF) +#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31) + +#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL) +#define AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK (0xFFFFFFFFFFULL << 12) +#define AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK (1ULL << 62) +#define AVIC_PHYSICAL_ID_ENTRY_VALID_MASK (1ULL << 63) + static DEFINE_PER_CPU(u64, current_tsc_ratio); #define TSC_RATIO_DEFAULT 0x0100000000ULL @@ -205,6 +227,10 @@ module_param(npt, int, S_IRUGO); static int nested = true; module_param(nested, int, S_IRUGO); +/* enable / disable AVIC */ +static int avic; +module_param(avic, int, S_IRUGO); + static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); static void svm_flush_tlb(struct kvm_vcpu *vcpu); static void svm_complete_interrupts(struct vcpu_svm *svm); @@ -228,12 +254,18 @@ enum { VMCB_SEG, /* CS, DS, SS, ES, CPL */ VMCB_CR2, /* CR2 only */ VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */ + VMCB_AVIC, /* AVIC APIC_BAR, AVIC APIC_BACKING_PAGE, + * AVIC PHYSICAL_TABLE pointer, + * AVIC LOGICAL_TABLE pointer + */ VMCB_DIRTY_MAX, }; /* TPR and CR2 are always written before VMRUN */ #define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2)) +#define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL + static inline void mark_all_dirty(struct vmcb *vmcb) { vmcb->control.clean = 0; @@ -255,6 +287,12 @@ static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) return container_of(vcpu, struct vcpu_svm, vcpu); } +static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) +{ + svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; + mark_dirty(svm->vmcb, VMCB_AVIC); +} + static void recalc_intercepts(struct vcpu_svm *svm) { struct vmcb_control_area *c, *h; @@ -923,6 +961,12 @@ static __init int svm_hardware_setup(void) } else kvm_disable_tdp(); + if (avic && (!npt_enabled || !boot_cpu_has(X86_FEATURE_AVIC))) + avic = false; + + if (avic) + pr_info("AVIC enabled\n"); + return 0; err: @@ -1000,6 +1044,22 @@ static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } +static void avic_init_vmcb(struct vcpu_svm *svm) +{ + struct vmcb *vmcb = svm->vmcb; + struct kvm_arch *vm_data = &svm->vcpu.kvm->arch; + phys_addr_t bpa = page_to_phys(svm->avic_backing_page); + phys_addr_t lpa = page_to_phys(vm_data->avic_logical_id_table_page); + phys_addr_t ppa = page_to_phys(vm_data->avic_physical_id_table_page); + + vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; + vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; + vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; + vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; + vmcb->control.int_ctl |= AVIC_ENABLE_MASK; + svm->vcpu.arch.apicv_active = true; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -1110,9 +1170,131 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_PAUSE); } + if (avic) + avic_init_vmcb(svm); + mark_all_dirty(svm->vmcb); enable_gif(svm); + +} + +static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, int index) +{ + u64 *avic_physical_id_table; + struct kvm_arch *vm_data = &vcpu->kvm->arch; + + if (index >= AVIC_MAX_PHYSICAL_ID_COUNT) + return NULL; + + avic_physical_id_table = page_address(vm_data->avic_physical_id_table_page); + + return &avic_physical_id_table[index]; +} + +/** + * Note: + * AVIC hardware walks the nested page table to check permissions, + * but does not use the SPA address specified in the leaf page + * table entry since it uses address in the AVIC_BACKING_PAGE pointer + * field of the VMCB. Therefore, we set up the + * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. + */ +static int avic_init_access_page(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + int ret; + + if (kvm->arch.apic_access_page_done) + return 0; + + ret = x86_set_memory_region(kvm, + APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, + APIC_DEFAULT_PHYS_BASE, + PAGE_SIZE); + if (ret) + return ret; + + kvm->arch.apic_access_page_done = true; + return 0; +} + +static int avic_init_backing_page(struct kvm_vcpu *vcpu) +{ + int ret; + u64 *entry, new_entry; + int id = vcpu->vcpu_id; + struct vcpu_svm *svm = to_svm(vcpu); + + ret = avic_init_access_page(vcpu); + if (ret) + return ret; + + if (id >= AVIC_MAX_PHYSICAL_ID_COUNT) + return -EINVAL; + + if (!svm->vcpu.arch.apic->regs) + return -EINVAL; + + svm->avic_backing_page = virt_to_page(svm->vcpu.arch.apic->regs); + + /* Setting AVIC backing page address in the phy APIC ID table */ + entry = avic_get_physical_id_entry(vcpu, id); + if (!entry) + return -EINVAL; + + new_entry = READ_ONCE(*entry); + new_entry = (page_to_phys(svm->avic_backing_page) & + AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | + AVIC_PHYSICAL_ID_ENTRY_VALID_MASK; + WRITE_ONCE(*entry, new_entry); + + svm->avic_physical_id_cache = entry; + + return 0; +} + +static void avic_vm_destroy(struct kvm *kvm) +{ + struct kvm_arch *vm_data = &kvm->arch; + + if (vm_data->avic_logical_id_table_page) + __free_page(vm_data->avic_logical_id_table_page); + if (vm_data->avic_physical_id_table_page) + __free_page(vm_data->avic_physical_id_table_page); +} + +static int avic_vm_init(struct kvm *kvm) +{ + int err = -ENOMEM; + struct kvm_arch *vm_data = &kvm->arch; + struct page *p_page; + struct page *l_page; + + if (!avic) + return 0; + + /* Allocating physical APIC ID table (4KB) */ + p_page = alloc_page(GFP_KERNEL); + if (!p_page) + goto free_avic; + + vm_data->avic_physical_id_table_page = p_page; + clear_page(page_address(p_page)); + + /* Allocating logical APIC ID table (4KB) */ + l_page = alloc_page(GFP_KERNEL); + if (!l_page) + goto free_avic; + + vm_data->avic_logical_id_table_page = l_page; + clear_page(page_address(l_page)); + + return 0; + +free_avic: + avic_vm_destroy(kvm); + return err; } static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) @@ -1131,6 +1313,9 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy); kvm_register_write(vcpu, VCPU_REGS_RDX, eax); + + if (kvm_vcpu_apicv_active(vcpu) && !init_event) + avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) @@ -1169,6 +1354,12 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (!hsave_page) goto free_page3; + if (avic) { + err = avic_init_backing_page(&svm->vcpu); + if (err) + goto free_page4; + } + svm->nested.hsave = page_address(hsave_page); svm->msrpm = page_address(msrpm_pages); @@ -1187,6 +1378,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) return &svm->vcpu; +free_page4: + __free_page(hsave_page); free_page3: __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER); free_page2: @@ -3212,6 +3405,10 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_IGNNE: vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; + case MSR_IA32_APICBASE: + if (kvm_vcpu_apicv_active(vcpu)) + avic_update_vapic_bar(to_svm(vcpu), data); + /* Follow through */ default: return kvm_set_msr_common(vcpu, msr); } @@ -3375,10 +3572,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); + pr_err("%-20s%016llx\n", "avic_vapic_bar:", control->avic_vapic_bar); pr_err("%-20s%08x\n", "event_inj:", control->event_inj); pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); + pr_err("%-20s%016llx\n", "avic_backing_page:", control->avic_backing_page); + pr_err("%-20s%016llx\n", "avic_logical_id:", control->avic_logical_id); + pr_err("%-20s%016llx\n", "avic_physical_id:", control->avic_physical_id); pr_err("VMCB State Save Area:\n"); pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", "es:", @@ -3606,11 +3807,28 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set) static bool svm_get_enable_apicv(void) { - return false; + return avic; +} + +static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) +{ +} + +static void svm_hwapic_isr_update(struct kvm *kvm, int isr) +{ } +/* Note: Currently only used by Hyper-V. */ static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { + struct vcpu_svm *svm = to_svm(vcpu); + struct vmcb *vmcb = svm->vmcb; + + if (!avic) + return; + + vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK; + mark_dirty(vmcb, VMCB_INTR); } static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) @@ -4322,6 +4540,9 @@ static struct kvm_x86_ops svm_x86_ops = { .vcpu_free = svm_free_vcpu, .vcpu_reset = svm_vcpu_reset, + .vm_init = avic_vm_init, + .vm_destroy = avic_vm_destroy, + .prepare_guest_switch = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, @@ -4382,6 +4603,8 @@ static struct kvm_x86_ops svm_x86_ops = { .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .sync_pir_to_irr = svm_sync_pir_to_irr, + .hwapic_irr_update = svm_hwapic_irr_update, + .hwapic_isr_update = svm_hwapic_isr_update, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, -- cgit v1.2.3 From 18f40c53e10f8d1267dc47cce4487664eececd6d Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Wed, 4 May 2016 14:09:48 -0500 Subject: svm: Add VMEXIT handlers for AVIC This patch introduces VMEXIT handlers, avic_incomplete_ipi_interception() and avic_unaccelerated_access_interception() along with two trace points (trace_kvm_avic_incomplete_ipi and trace_kvm_avic_unaccelerated_access). Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/include/uapi/asm/svm.h | 9 +- arch/x86/kvm/lapic.h | 3 + arch/x86/kvm/svm.c | 279 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/trace.h | 57 ++++++++ arch/x86/kvm/x86.c | 2 + 6 files changed, 350 insertions(+), 1 deletion(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 337e13b0d01d..7aaa10891e2c 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -775,6 +775,7 @@ struct kvm_arch { bool disabled_lapic_found; /* Struct members for AVIC */ + u32 ldr_mode; struct page *avic_logical_id_table_page; struct page *avic_physical_id_table_page; }; diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index 8a4add8e4639..b9e9bb2c6089 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -73,6 +73,8 @@ #define SVM_EXIT_MWAIT_COND 0x08c #define SVM_EXIT_XSETBV 0x08d #define SVM_EXIT_NPF 0x400 +#define SVM_EXIT_AVIC_INCOMPLETE_IPI 0x401 +#define SVM_EXIT_AVIC_UNACCELERATED_ACCESS 0x402 #define SVM_EXIT_ERR -1 @@ -107,8 +109,10 @@ { SVM_EXIT_SMI, "smi" }, \ { SVM_EXIT_INIT, "init" }, \ { SVM_EXIT_VINTR, "vintr" }, \ + { SVM_EXIT_CR0_SEL_WRITE, "cr0_sel_write" }, \ { SVM_EXIT_CPUID, "cpuid" }, \ { SVM_EXIT_INVD, "invd" }, \ + { SVM_EXIT_PAUSE, "pause" }, \ { SVM_EXIT_HLT, "hlt" }, \ { SVM_EXIT_INVLPG, "invlpg" }, \ { SVM_EXIT_INVLPGA, "invlpga" }, \ @@ -127,7 +131,10 @@ { SVM_EXIT_MONITOR, "monitor" }, \ { SVM_EXIT_MWAIT, "mwait" }, \ { SVM_EXIT_XSETBV, "xsetbv" }, \ - { SVM_EXIT_NPF, "npf" } + { SVM_EXIT_NPF, "npf" }, \ + { SVM_EXIT_RSM, "rsm" }, \ + { SVM_EXIT_AVIC_INCOMPLETE_IPI, "avic_incomplete_ipi" }, \ + { SVM_EXIT_AVIC_UNACCELERATED_ACCESS, "avic_unaccelerated_access" } #endif /* _UAPI__SVM_H */ diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index bbe5d12018bb..891c6da7d4aa 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -9,6 +9,9 @@ #define KVM_APIC_SIPI 1 #define KVM_APIC_LVT_NUM 6 +#define KVM_APIC_SHORT_MASK 0xc0000 +#define KVM_APIC_DEST_MASK 0x800 + struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 95d9b5c4c431..db2c140ed079 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -91,6 +91,10 @@ MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id); */ #define AVIC_MAX_PHYSICAL_ID_COUNT 255 +#define AVIC_UNACCEL_ACCESS_WRITE_MASK 1 +#define AVIC_UNACCEL_ACCESS_OFFSET_MASK 0xFF0 +#define AVIC_UNACCEL_ACCESS_VECTOR_MASK 0xFFFFFFFF + static bool erratum_383_found __read_mostly; static const u32 host_save_user_msrs[] = { @@ -176,6 +180,7 @@ struct vcpu_svm { /* cached guest cpuid flags for faster access */ bool nrips_enabled : 1; + u32 ldr_reg; struct page *avic_backing_page; u64 *avic_physical_id_cache; }; @@ -3492,6 +3497,278 @@ static int mwait_interception(struct vcpu_svm *svm) return nop_interception(svm); } +enum avic_ipi_failure_cause { + AVIC_IPI_FAILURE_INVALID_INT_TYPE, + AVIC_IPI_FAILURE_TARGET_NOT_RUNNING, + AVIC_IPI_FAILURE_INVALID_TARGET, + AVIC_IPI_FAILURE_INVALID_BACKING_PAGE, +}; + +static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) +{ + u32 icrh = svm->vmcb->control.exit_info_1 >> 32; + u32 icrl = svm->vmcb->control.exit_info_1; + u32 id = svm->vmcb->control.exit_info_2 >> 32; + u32 index = svm->vmcb->control.exit_info_2 && 0xFF; + struct kvm_lapic *apic = svm->vcpu.arch.apic; + + trace_kvm_avic_incomplete_ipi(svm->vcpu.vcpu_id, icrh, icrl, id, index); + + switch (id) { + case AVIC_IPI_FAILURE_INVALID_INT_TYPE: + /* + * AVIC hardware handles the generation of + * IPIs when the specified Message Type is Fixed + * (also known as fixed delivery mode) and + * the Trigger Mode is edge-triggered. The hardware + * also supports self and broadcast delivery modes + * specified via the Destination Shorthand(DSH) + * field of the ICRL. Logical and physical APIC ID + * formats are supported. All other IPI types cause + * a #VMEXIT, which needs to emulated. + */ + kvm_lapic_reg_write(apic, APIC_ICR2, icrh); + kvm_lapic_reg_write(apic, APIC_ICR, icrl); + break; + case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { + int i; + struct kvm_vcpu *vcpu; + struct kvm *kvm = svm->vcpu.kvm; + struct kvm_lapic *apic = svm->vcpu.arch.apic; + + /* + * At this point, we expect that the AVIC HW has already + * set the appropriate IRR bits on the valid target + * vcpus. So, we just need to kick the appropriate vcpu. + */ + kvm_for_each_vcpu(i, vcpu, kvm) { + bool m = kvm_apic_match_dest(vcpu, apic, + icrl & KVM_APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & KVM_APIC_DEST_MASK); + + if (m && !avic_vcpu_is_running(vcpu)) + kvm_vcpu_wake_up(vcpu); + } + break; + } + case AVIC_IPI_FAILURE_INVALID_TARGET: + break; + case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: + WARN_ONCE(1, "Invalid backing page\n"); + break; + default: + pr_err("Unknown IPI interception\n"); + } + + return 1; +} + +static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) +{ + struct kvm_arch *vm_data = &vcpu->kvm->arch; + int index; + u32 *logical_apic_id_table; + int dlid = GET_APIC_LOGICAL_ID(ldr); + + if (!dlid) + return NULL; + + if (flat) { /* flat */ + index = ffs(dlid) - 1; + if (index > 7) + return NULL; + } else { /* cluster */ + int cluster = (dlid & 0xf0) >> 4; + int apic = ffs(dlid & 0x0f) - 1; + + if ((apic < 0) || (apic > 7) || + (cluster >= 0xf)) + return NULL; + index = (cluster << 2) + apic; + } + + logical_apic_id_table = (u32 *) page_address(vm_data->avic_logical_id_table_page); + + return &logical_apic_id_table[index]; +} + +static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr, + bool valid) +{ + bool flat; + u32 *entry, new_entry; + + flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT; + entry = avic_get_logical_id_entry(vcpu, ldr, flat); + if (!entry) + return -EINVAL; + + new_entry = READ_ONCE(*entry); + new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; + new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); + if (valid) + new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; + else + new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK; + WRITE_ONCE(*entry, new_entry); + + return 0; +} + +static int avic_handle_ldr_update(struct kvm_vcpu *vcpu) +{ + int ret; + struct vcpu_svm *svm = to_svm(vcpu); + u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); + + if (!ldr) + return 1; + + ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true); + if (ret && svm->ldr_reg) { + avic_ldr_write(vcpu, 0, svm->ldr_reg, false); + svm->ldr_reg = 0; + } else { + svm->ldr_reg = ldr; + } + return ret; +} + +static int avic_handle_apic_id_update(struct kvm_vcpu *vcpu) +{ + u64 *old, *new; + struct vcpu_svm *svm = to_svm(vcpu); + u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID); + u32 id = (apic_id_reg >> 24) & 0xff; + + if (vcpu->vcpu_id == id) + return 0; + + old = avic_get_physical_id_entry(vcpu, vcpu->vcpu_id); + new = avic_get_physical_id_entry(vcpu, id); + if (!new || !old) + return 1; + + /* We need to move physical_id_entry to new offset */ + *new = *old; + *old = 0ULL; + to_svm(vcpu)->avic_physical_id_cache = new; + + /* + * Also update the guest physical APIC ID in the logical + * APIC ID table entry if already setup the LDR. + */ + if (svm->ldr_reg) + avic_handle_ldr_update(vcpu); + + return 0; +} + +static int avic_handle_dfr_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + struct kvm_arch *vm_data = &vcpu->kvm->arch; + u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); + u32 mod = (dfr >> 28) & 0xf; + + /* + * We assume that all local APICs are using the same type. + * If this changes, we need to flush the AVIC logical + * APID id table. + */ + if (vm_data->ldr_mode == mod) + return 0; + + clear_page(page_address(vm_data->avic_logical_id_table_page)); + vm_data->ldr_mode = mod; + + if (svm->ldr_reg) + avic_handle_ldr_update(vcpu); + return 0; +} + +static int avic_unaccel_trap_write(struct vcpu_svm *svm) +{ + struct kvm_lapic *apic = svm->vcpu.arch.apic; + u32 offset = svm->vmcb->control.exit_info_1 & + AVIC_UNACCEL_ACCESS_OFFSET_MASK; + + switch (offset) { + case APIC_ID: + if (avic_handle_apic_id_update(&svm->vcpu)) + return 0; + break; + case APIC_LDR: + if (avic_handle_ldr_update(&svm->vcpu)) + return 0; + break; + case APIC_DFR: + avic_handle_dfr_update(&svm->vcpu); + break; + default: + break; + } + + kvm_lapic_reg_write(apic, offset, kvm_lapic_get_reg(apic, offset)); + + return 1; +} + +static bool is_avic_unaccelerated_access_trap(u32 offset) +{ + bool ret = false; + + switch (offset) { + case APIC_ID: + case APIC_EOI: + case APIC_RRR: + case APIC_LDR: + case APIC_DFR: + case APIC_SPIV: + case APIC_ESR: + case APIC_ICR: + case APIC_LVTT: + case APIC_LVTTHMR: + case APIC_LVTPC: + case APIC_LVT0: + case APIC_LVT1: + case APIC_LVTERR: + case APIC_TMICT: + case APIC_TDCR: + ret = true; + break; + default: + break; + } + return ret; +} + +static int avic_unaccelerated_access_interception(struct vcpu_svm *svm) +{ + int ret = 0; + u32 offset = svm->vmcb->control.exit_info_1 & + AVIC_UNACCEL_ACCESS_OFFSET_MASK; + u32 vector = svm->vmcb->control.exit_info_2 & + AVIC_UNACCEL_ACCESS_VECTOR_MASK; + bool write = (svm->vmcb->control.exit_info_1 >> 32) & + AVIC_UNACCEL_ACCESS_WRITE_MASK; + bool trap = is_avic_unaccelerated_access_trap(offset); + + trace_kvm_avic_unaccelerated_access(svm->vcpu.vcpu_id, offset, + trap, write, vector); + if (trap) { + /* Handling Trap */ + WARN_ONCE(!write, "svm: Handling trap read.\n"); + ret = avic_unaccel_trap_write(svm); + } else { + /* Handling Fault */ + ret = (emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE); + } + + return ret; +} + static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR0] = cr_interception, [SVM_EXIT_READ_CR3] = cr_interception, @@ -3555,6 +3832,8 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_XSETBV] = xsetbv_interception, [SVM_EXIT_NPF] = pf_interception, [SVM_EXIT_RSM] = emulate_on_interception, + [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, + [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, }; static void dump_vmcb(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 2f1ea2f61e1f..39f264cbda71 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -1292,6 +1292,63 @@ TRACE_EVENT(kvm_hv_stimer_cleanup, __entry->vcpu_id, __entry->timer_index) ); +/* + * Tracepoint for AMD AVIC + */ +TRACE_EVENT(kvm_avic_incomplete_ipi, + TP_PROTO(u32 vcpu, u32 icrh, u32 icrl, u32 id, u32 index), + TP_ARGS(vcpu, icrh, icrl, id, index), + + TP_STRUCT__entry( + __field(u32, vcpu) + __field(u32, icrh) + __field(u32, icrl) + __field(u32, id) + __field(u32, index) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->icrh = icrh; + __entry->icrl = icrl; + __entry->id = id; + __entry->index = index; + ), + + TP_printk("vcpu=%u, icrh:icrl=%#010x:%08x, id=%u, index=%u\n", + __entry->vcpu, __entry->icrh, __entry->icrl, + __entry->id, __entry->index) +); + +TRACE_EVENT(kvm_avic_unaccelerated_access, + TP_PROTO(u32 vcpu, u32 offset, bool ft, bool rw, u32 vec), + TP_ARGS(vcpu, offset, ft, rw, vec), + + TP_STRUCT__entry( + __field(u32, vcpu) + __field(u32, offset) + __field(bool, ft) + __field(bool, rw) + __field(u32, vec) + ), + + TP_fast_assign( + __entry->vcpu = vcpu; + __entry->offset = offset; + __entry->ft = ft; + __entry->rw = rw; + __entry->vec = vec; + ), + + TP_printk("vcpu=%u, offset=%#x(%s), %s, %s, vec=%#x\n", + __entry->vcpu, + __entry->offset, + __print_symbolic(__entry->offset, kvm_trace_symbol_apic), + __entry->ft ? "trap" : "fault", + __entry->rw ? "write" : "read", + __entry->vec) +); + #endif /* _TRACE_KVM_H */ #undef TRACE_INCLUDE_PATH diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 7fcaed49b3c5..a8c7ca34ee5d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8435,3 +8435,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full); EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access); +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi); -- cgit v1.2.3 From be8ca170edf342cc45ae4a6431ef192e5cbe211e Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Wed, 4 May 2016 14:09:49 -0500 Subject: KVM: x86: Introducing kvm_x86_ops.apicv_post_state_restore Adding kvm_x86_ops hooks to allow APICv to do post state restore. This is required to support VM save and restore feature. Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/lapic.c | 2 ++ arch/x86/kvm/svm.c | 10 ++++++++++ 3 files changed, 13 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 7aaa10891e2c..43e74384c606 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1004,6 +1004,7 @@ struct kvm_x86_ops { int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq, bool set); + void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 34c28c0ffe6a..539675c6daa2 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1961,6 +1961,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, 1 : count_vectors(apic->regs + APIC_ISR); apic->highest_isr_cache = -1; if (vcpu->arch.apicv_active) { + if (kvm_x86_ops->apicv_post_state_restore) + kvm_x86_ops->apicv_post_state_restore(vcpu); kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); kvm_x86_ops->hwapic_isr_update(vcpu->kvm, diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index db2c140ed079..8c85b8ee68b6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4834,6 +4834,15 @@ static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) { } +static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) +{ + if (avic_handle_apic_id_update(vcpu) != 0) + return; + if (avic_handle_dfr_update(vcpu) != 0) + return; + avic_handle_ldr_update(vcpu); +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -4914,6 +4923,7 @@ static struct kvm_x86_ops svm_x86_ops = { .sync_pir_to_irr = svm_sync_pir_to_irr, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, + .apicv_post_state_restore = avic_post_state_restore, .set_tss_addr = svm_set_tss_addr, .get_tdp_level = get_npt_level, -- cgit v1.2.3 From 67c9dddc95ac16a09db996e8e4dcacfd94cf2306 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 10 May 2016 17:01:23 +0200 Subject: KVM: x86: make hwapic_isr_update and hwapic_irr_update look the same MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Neither APICv nor AVIC actually need the first argument of hwapic_isr_update, but the vCPU makes more sense than passing the pointer to the whole virtual machine! In fact in the APICv case it's just happening that the vCPU is used implicitly, through the loaded VMCS. The second argument instead is named differently, make it consistent. Reviewed-by: Radim Krčmář Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/lapic.c | 6 +++--- arch/x86/kvm/svm.c | 2 +- arch/x86/kvm/vmx.c | 10 +++++----- 4 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 43e74384c606..e0fbe7e70dc1 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -922,7 +922,7 @@ struct kvm_x86_ops { bool (*get_enable_apicv)(void); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); - void (*hwapic_isr_update)(struct kvm *kvm, int isr); + void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set); void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 539675c6daa2..bbb5b283ff63 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -392,7 +392,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic) * just set SVI. */ if (unlikely(vcpu->arch.apicv_active)) - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec); + kvm_x86_ops->hwapic_isr_update(vcpu, vec); else { ++apic->isr_count; BUG_ON(apic->isr_count > MAX_APIC_VECTOR); @@ -440,7 +440,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic) * and must be left alone. */ if (unlikely(vcpu->arch.apicv_active)) - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + kvm_x86_ops->hwapic_isr_update(vcpu, apic_find_highest_isr(apic)); else { --apic->isr_count; @@ -1965,7 +1965,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu, kvm_x86_ops->apicv_post_state_restore(vcpu); kvm_x86_ops->hwapic_irr_update(vcpu, apic_find_highest_irr(apic)); - kvm_x86_ops->hwapic_isr_update(vcpu->kvm, + kvm_x86_ops->hwapic_isr_update(vcpu, apic_find_highest_isr(apic)); } kvm_make_request(KVM_REQ_EVENT, vcpu); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 7aeef57a093a..b0dd90338de7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4202,7 +4202,7 @@ static void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr) { } -static void svm_hwapic_isr_update(struct kvm *kvm, int isr) +static void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ab4a387b98b1..7ebf27bafe5c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8314,19 +8314,19 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa) vmcs_write64(APIC_ACCESS_ADDR, hpa); } -static void vmx_hwapic_isr_update(struct kvm *kvm, int isr) +static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr) { u16 status; u8 old; - if (isr == -1) - isr = 0; + if (max_isr == -1) + max_isr = 0; status = vmcs_read16(GUEST_INTR_STATUS); old = status >> 8; - if (isr != old) { + if (max_isr != old) { status &= 0xff; - status |= isr << 8; + status |= max_isr << 8; vmcs_write16(GUEST_INTR_STATUS, status); } } -- cgit v1.2.3 From fd8cfd3000191cb7f5b9ea8640bd46181f6b4b74 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 May 2016 17:13:00 -0700 Subject: arch: fix has_transparent_hugepage() I've just discovered that the useful-sounding has_transparent_hugepage() is actually an architecture-dependent minefield: on some arches it only builds if CONFIG_TRANSPARENT_HUGEPAGE=y, on others it's also there when not, but on some of those (arm and arm64) it then gives the wrong answer; and on mips alone it's marked __init, which would crash if called later (but so far it has not been called later). Straighten this out: make it available to all configs, with a sensible default in asm-generic/pgtable.h, removing its definitions from those arches (arc, arm, arm64, sparc, tile) which are served by the default, adding #define has_transparent_hugepage has_transparent_hugepage to those (mips, powerpc, s390, x86) which need to override the default at runtime, and removing the __init from mips (but maybe that kind of code should be avoided after init: set a static variable the first time it's called). Signed-off-by: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: Andres Lagar-Cavilla Cc: Yang Shi Cc: Ning Qu Cc: Mel Gorman Cc: Konstantin Khlebnikov Acked-by: David S. Miller Acked-by: Vineet Gupta [arch/arc] Acked-by: Gerald Schaefer [arch/s390] Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/include/asm/hugepage.h | 2 -- arch/arm/include/asm/pgtable-3level.h | 5 ----- arch/arm64/include/asm/pgtable.h | 5 ----- arch/mips/include/asm/pgtable.h | 1 + arch/mips/mm/tlb-r4k.c | 21 +++++++++++---------- arch/powerpc/include/asm/book3s/64/pgtable.h | 1 + arch/powerpc/include/asm/pgtable.h | 1 - arch/s390/include/asm/pgtable.h | 1 + arch/sparc/include/asm/pgtable_64.h | 2 -- arch/tile/include/asm/pgtable.h | 1 - arch/x86/include/asm/pgtable.h | 1 + include/asm-generic/pgtable.h | 8 ++++++++ 12 files changed, 23 insertions(+), 26 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h index 7afe3356b770..317ff773e1ca 100644 --- a/arch/arc/include/asm/hugepage.h +++ b/arch/arc/include/asm/hugepage.h @@ -61,8 +61,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); -#define has_transparent_hugepage() 1 - /* Generic variants assume pgtable_t is struct page *, hence need for these */ #define __HAVE_ARCH_PGTABLE_DEPOSIT extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index dc46398bc3a5..fa70db7c714b 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -281,11 +281,6 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, flush_pmd_entry(pmdp); } -static inline int has_transparent_hugepage(void) -{ - return 1; -} - #endif /* __ASSEMBLY__ */ #endif /* _ASM_PGTABLE_3LEVEL_H */ diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 2da46ae9c991..a7ac45a03dd0 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -314,11 +314,6 @@ static inline int pmd_protnone(pmd_t pmd) #define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)) -static inline int has_transparent_hugepage(void) -{ - return 1; -} - #define __pgprot_modify(prot,mask,bits) \ __pgprot((pgprot_val(prot) & ~(mask)) | (bits)) diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 9a4fe0133ff1..f53a7e3a4dd9 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -468,6 +468,7 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, #ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define has_transparent_hugepage has_transparent_hugepage extern int has_transparent_hugepage(void); static inline int pmd_trans_huge(pmd_t pmd) diff --git a/arch/mips/mm/tlb-r4k.c b/arch/mips/mm/tlb-r4k.c index c17d7627f872..2d93b63cf830 100644 --- a/arch/mips/mm/tlb-r4k.c +++ b/arch/mips/mm/tlb-r4k.c @@ -400,19 +400,20 @@ void add_wired_entry(unsigned long entrylo0, unsigned long entrylo1, #ifdef CONFIG_TRANSPARENT_HUGEPAGE -int __init has_transparent_hugepage(void) +int has_transparent_hugepage(void) { - unsigned int mask; - unsigned long flags; - - local_irq_save(flags); - write_c0_pagemask(PM_HUGE_MASK); - back_to_back_c0_hazard(); - mask = read_c0_pagemask(); - write_c0_pagemask(PM_DEFAULT_MASK); + static unsigned int mask = -1; - local_irq_restore(flags); + if (mask == -1) { /* first call comes during __init */ + unsigned long flags; + local_irq_save(flags); + write_c0_pagemask(PM_HUGE_MASK); + back_to_back_c0_hazard(); + mask = read_c0_pagemask(); + write_c0_pagemask(PM_DEFAULT_MASK); + local_irq_restore(flags); + } return mask == PM_HUGE_MASK; } diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 77d3ce05798e..8fe6f6b48aa5 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -219,6 +219,7 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd); extern void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); +#define has_transparent_hugepage has_transparent_hugepage extern int has_transparent_hugepage(void); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 47897a30982d..ee09e99097f0 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -65,7 +65,6 @@ extern int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr, struct page **pages, int *nr); #ifndef CONFIG_TRANSPARENT_HUGEPAGE #define pmd_large(pmd) 0 -#define has_transparent_hugepage() 0 #endif pte_t *__find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea, bool *is_thp, unsigned *shift); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2f66645587a2..18d2beb89340 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1223,6 +1223,7 @@ static inline int pmd_trans_huge(pmd_t pmd) return pmd_val(pmd) & _SEGMENT_ENTRY_LARGE; } +#define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { return MACHINE_HAS_HPAGE ? 1 : 0; diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index f089cfa249f3..93ce0ada3c63 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -681,8 +681,6 @@ static inline unsigned long pmd_trans_huge(pmd_t pmd) return pte_val(pte) & _PAGE_PMD_HUGE; } -#define has_transparent_hugepage() 1 - static inline pmd_t pmd_mkold(pmd_t pmd) { pte_t pte = __pte(pmd_val(pmd)); diff --git a/arch/tile/include/asm/pgtable.h b/arch/tile/include/asm/pgtable.h index 96cecf55522e..2a26cc4fefc2 100644 --- a/arch/tile/include/asm/pgtable.h +++ b/arch/tile/include/asm/pgtable.h @@ -487,7 +487,6 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE -#define has_transparent_hugepage() 1 #define pmd_trans_huge pmd_huge_page #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f86491a7bc9d..1a27396b6ea0 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -181,6 +181,7 @@ static inline int pmd_trans_huge(pmd_t pmd) return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE; } +#define has_transparent_hugepage has_transparent_hugepage static inline int has_transparent_hugepage(void) { return boot_cpu_has(X86_FEATURE_PSE); diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 9401f4819891..d4458b6dbfb4 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -806,4 +806,12 @@ static inline int pmd_clear_huge(pmd_t *pmd) #define io_remap_pfn_range remap_pfn_range #endif +#ifndef has_transparent_hugepage +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define has_transparent_hugepage() 1 +#else +#define has_transparent_hugepage() 0 +#endif +#endif + #endif /* _ASM_GENERIC_PGTABLE_H */ -- cgit v1.2.3 From 0f6ff2bce0d4c3e4ff34f5d2ffb7329025b30844 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 12 May 2016 15:04:00 -0700 Subject: x86/mm/mpx: Work around MPX erratum SKD046 This erratum essentially causes the CPU to forget which privilege level it is operating on (kernel vs. user) for the purposes of MPX. This erratum can only be triggered when a system is not using Supervisor Mode Execution Prevention (SMEP). Our workaround for the erratum is to ensure that MPX can only be used in cases where SMEP is present in the processor and is enabled. This erratum only affects Core processors. Atom is unaffected. But, there is no architectural way to determine Atom vs. Core. So, we just apply this workaround to all processors. It's possible that it will mistakenly disable MPX on some Atom processsors or future unaffected Core processors. There are currently no processors that have MPX and not SMEP. It would take something akin to a hypervisor masking SMEP out on an Atom processor for this to present itself on current hardware. More details can be found at: http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/desktop-6th-gen-core-family-spec-update.pdf " SKD046 Branch Instructions May Initialize MPX Bound Registers Incorrectly Problem: Depending on the current Intel MPX (Memory Protection Extensions) configuration, execution of certain branch instructions (near CALL, near RET, near JMP, and Jcc instructions) without a BND prefix (F2H) initialize the MPX bound registers. Due to this erratum, such a branch instruction that is executed both with CPL = 3 and with CPL < 3 may not use the correct MPX configuration register (BNDCFGU or BNDCFGS, respectively) for determining whether to initialize the bound registers; it may thus initialize the bound registers when it should not, or fail to initialize them when it should. Implication: A branch instruction that has executed both in user mode and in supervisor mode (from the same linear address) may cause a #BR (bound range fault) when it should not have or may not cause a #BR when it should have. Workaround An operating system can avoid this erratum by setting CR4.SMEP[bit 20] to enable supervisor-mode execution prevention (SMEP). When SMEP is enabled, no code can be executed both with CPL = 3 and with CPL < 3. " Signed-off-by: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160512220400.3B35F1BC@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bugs.h | 8 ++++++++ arch/x86/kernel/cpu/common.c | 3 +++ arch/x86/kernel/cpu/intel.c | 37 +++++++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h index 08abf639075f..5490bbaf71d5 100644 --- a/arch/x86/include/asm/bugs.h +++ b/arch/x86/include/asm/bugs.h @@ -1,8 +1,16 @@ #ifndef _ASM_X86_BUGS_H #define _ASM_X86_BUGS_H +#include + extern void check_bugs(void); +#if defined(CONFIG_CPU_SUP_INTEL) +void check_mpx_erratum(struct cpuinfo_x86 *c); +#else +static inline void check_mpx_erratum(struct cpuinfo_x86 *c) {} +#endif + #if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32) int ppro_with_ram_bug(void); #else diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f45a4b9d28c8..62ff5255ae16 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -270,6 +271,8 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) static __init int setup_disable_smep(char *arg) { setup_clear_cpu_cap(X86_FEATURE_SMEP); + /* Check for things that depend on SMEP being enabled: */ + check_mpx_erratum(&boot_cpu_data); return 1; } __setup("nosmep", setup_disable_smep); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index e4393bfc7f0d..b47df99dc5d2 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -25,6 +25,41 @@ #include #endif +/* + * Just in case our CPU detection goes bad, or you have a weird system, + * allow a way to override the automatic disabling of MPX. + */ +static int forcempx; + +static int __init forcempx_setup(char *__unused) +{ + forcempx = 1; + + return 1; +} +__setup("intel-skd-046-workaround=disable", forcempx_setup); + +void check_mpx_erratum(struct cpuinfo_x86 *c) +{ + if (forcempx) + return; + /* + * Turn off the MPX feature on CPUs where SMEP is not + * available or disabled. + * + * Works around Intel Erratum SKD046: "Branch Instructions + * May Initialize MPX Bound Registers Incorrectly". + * + * This might falsely disable MPX on systems without + * SMEP, like Atom processors without SMEP. But there + * is no such hardware known at the moment. + */ + if (cpu_has(c, X86_FEATURE_MPX) && !cpu_has(c, X86_FEATURE_SMEP)) { + setup_clear_cpu_cap(X86_FEATURE_MPX); + pr_warn("x86/mpx: Disabling MPX since SMEP not present\n"); + } +} + static void early_init_intel(struct cpuinfo_x86 *c) { u64 misc_enable; @@ -173,6 +208,8 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (edx & (1U << 28)) c->x86_coreid_bits = get_count_order((ebx >> 16) & 0xff); } + + check_mpx_erratum(c); } #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 1771c6e1a567ea0ba2cccc0a4ffe68a1419fd8ef Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 20 May 2016 16:59:31 -0700 Subject: x86/kasan: instrument user memory access API Exchange between user and kernel memory is coded in assembly language. Which means that such accesses won't be spotted by KASAN as a compiler instruments only C code. Add explicit KASAN checks to user memory access API to ensure that userspace writes to (or reads from) a valid kernel memory. Note: Unlike others strncpy_from_user() is written mostly in C and KASAN sees memory accesses in it. However, it makes sense to add explicit check for all @count bytes that *potentially* could be written to the kernel. [aryabinin@virtuozzo.com: move kasan check under the condition] Link: http://lkml.kernel.org/r/1462869209-21096-1-git-send-email-aryabinin@virtuozzo.com Link: http://lkml.kernel.org/r/1462538722-1574-4-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess.h | 5 +++++ arch/x86/include/asm/uaccess_64.h | 7 +++++++ lib/strncpy_from_user.c | 2 ++ 3 files changed, 14 insertions(+) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 12f9653bde8d..2982387ba817 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -5,6 +5,7 @@ */ #include #include +#include #include #include #include @@ -721,6 +722,8 @@ copy_from_user(void *to, const void __user *from, unsigned long n) might_fault(); + kasan_check_write(to, n); + /* * While we would like to have the compiler do the checking for us * even in the non-constant size case, any false positives there are @@ -754,6 +757,8 @@ copy_to_user(void __user *to, const void *from, unsigned long n) { int sz = __compiletime_object_size(from); + kasan_check_read(from, n); + might_fault(); /* See the comment in copy_from_user() above. */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 307698688fa1..2eac2aa3e37f 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -109,6 +110,7 @@ static __always_inline __must_check int __copy_from_user(void *dst, const void __user *src, unsigned size) { might_fault(); + kasan_check_write(dst, size); return __copy_from_user_nocheck(dst, src, size); } @@ -175,6 +177,7 @@ static __always_inline __must_check int __copy_to_user(void __user *dst, const void *src, unsigned size) { might_fault(); + kasan_check_read(src, size); return __copy_to_user_nocheck(dst, src, size); } @@ -242,12 +245,14 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size) static __must_check __always_inline int __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size) { + kasan_check_write(dst, size); return __copy_from_user_nocheck(dst, src, size); } static __must_check __always_inline int __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) { + kasan_check_read(src, size); return __copy_to_user_nocheck(dst, src, size); } @@ -258,6 +263,7 @@ static inline int __copy_from_user_nocache(void *dst, const void __user *src, unsigned size) { might_fault(); + kasan_check_write(dst, size); return __copy_user_nocache(dst, src, size, 1); } @@ -265,6 +271,7 @@ static inline int __copy_from_user_inatomic_nocache(void *dst, const void __user *src, unsigned size) { + kasan_check_write(dst, size); return __copy_user_nocache(dst, src, size, 0); } diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 33840324138c..33f655ef48cd 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -109,6 +110,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count) unsigned long max = max_addr - src_addr; long retval; + kasan_check_write(dst, count); user_access_begin(); retval = do_strncpy_from_user(dst, src, count, max); user_access_end(); -- cgit v1.2.3 From 5b09c3edecd37ec1a52fbd5ae97a19734edc7a77 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 May 2016 14:19:37 -0700 Subject: x86: remove pointless uaccess_32.h complexity I'm looking at trying to possibly merge the 32-bit and 64-bit versions of the x86 uaccess.h implementation, but first this needs to be cleaned up. For example, the 32-bit version of "__copy_to_user_inatomic()" is mostly the special cases for the constant size, and it's actually never relevant. Every user except for one aren't actually using a constant size anyway, and the one user that uses it is better off just using __put_user() instead. So get rid of the unnecessary complexity. [ The same cleanup should likely happen to __copy_from_user_inatomic() as well, but that one has a lot more users that I need to take a look at first ] Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_32.h | 36 ------------------------------ drivers/gpu/drm/i915/i915_gem_execbuffer.c | 4 +--- 2 files changed, 1 insertion(+), 39 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 3fe0eac59462..537cc883ea29 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -33,46 +33,10 @@ unsigned long __must_check __copy_from_user_ll_nocache_nozero * the specified block with access_ok() before calling this function. * The caller should also make sure he pins the user space address * so that we don't result in page fault and sleep. - * - * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault - * we return the initial request size (1, 2 or 4), as copy_*_user should do. - * If a store crosses a page boundary and gets a fault, the x86 will not write - * anything, so this is accurate. */ - static __always_inline unsigned long __must_check __copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) { - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - __uaccess_begin(); - __put_user_size(*(u8 *)from, (u8 __user *)to, - 1, ret, 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __put_user_size(*(u16 *)from, (u16 __user *)to, - 2, ret, 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __put_user_size(*(u32 *)from, (u32 __user *)to, - 4, ret, 4); - __uaccess_end(); - return ret; - case 8: - __uaccess_begin(); - __put_user_size(*(u64 *)from, (u64 __user *)to, - 8, ret, 8); - __uaccess_end(); - return ret; - } - } return __copy_to_user_ll(to, from, n); } diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c index a676eedb441b..d4d7c88ab595 100644 --- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c @@ -535,9 +535,7 @@ i915_gem_execbuffer_relocate_vma(struct i915_vma *vma, return ret; if (r->presumed_offset != offset && - __copy_to_user_inatomic(&user_relocs->presumed_offset, - &r->presumed_offset, - sizeof(r->presumed_offset))) { + __put_user(r->presumed_offset, &user_relocs->presumed_offset)) { return -EFAULT; } -- cgit v1.2.3 From bd28b14591b98f696bc9f94c5ba2e598ca487dfd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 22 May 2016 17:21:27 -0700 Subject: x86: remove more uaccess_32.h complexity I'm looking at trying to possibly merge the 32-bit and 64-bit versions of the x86 uaccess.h implementation, but first this needs to be cleaned up. For example, the 32-bit version of "__copy_from_user_inatomic()" is mostly the special cases for the constant size, and it's actually almost never relevant. Most users aren't actually using a constant size anyway, and the few cases that do small constant copies are better off just using __get_user() instead. So get rid of the unnecessary complexity. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/uaccess_32.h | 26 -------------------------- kernel/events/uprobes.c | 3 +-- kernel/futex.c | 2 +- mm/maccess.c | 3 +-- 4 files changed, 3 insertions(+), 31 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 537cc883ea29..4b32da24faaf 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -65,32 +65,6 @@ __copy_to_user(void __user *to, const void *from, unsigned long n) static __always_inline unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { - /* Avoid zeroing the tail if the copy fails.. - * If 'n' is constant and 1, 2, or 4, we do still zero on a failure, - * but as the zeroing behaviour is only significant when n is not - * constant, that shouldn't be a problem. - */ - if (__builtin_constant_p(n)) { - unsigned long ret; - - switch (n) { - case 1: - __uaccess_begin(); - __get_user_size(*(u8 *)to, from, 1, ret, 1); - __uaccess_end(); - return ret; - case 2: - __uaccess_begin(); - __get_user_size(*(u16 *)to, from, 2, ret, 2); - __uaccess_end(); - return ret; - case 4: - __uaccess_begin(); - __get_user_size(*(u32 *)to, from, 4, ret, 4); - __uaccess_end(); - return ret; - } - } return __copy_from_user_ll_nozero(to, from, n); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7edc95edfaee..c01f733ff2e1 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1694,8 +1694,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) int result; pagefault_disable(); - result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr, - sizeof(opcode)); + result = __get_user(opcode, (uprobe_opcode_t __user *)vaddr); pagefault_enable(); if (likely(result == 0)) diff --git a/kernel/futex.c b/kernel/futex.c index c20f06f38ef3..ee25f5ba4aca 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -729,7 +729,7 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from) int ret; pagefault_disable(); - ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); + ret = __get_user(*dest, from); pagefault_enable(); return ret ? -EFAULT : 0; diff --git a/mm/maccess.c b/mm/maccess.c index d159b1c96e48..78f9274dd49d 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -96,8 +96,7 @@ long strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count) pagefault_disable(); do { - ret = __copy_from_user_inatomic(dst++, - (const void __user __force *)src++, 1); + ret = __get_user(*dst++, (const char __user __force *)src++); } while (dst[-1] && ret == 0 && src - unsafe_addr < count); dst[-1] = '\0'; -- cgit v1.2.3 From b740d2e9233cb33626d3b62210bcfc6a34baa839 Mon Sep 17 00:00:00 2001 From: Rajneesh Bhardwaj Date: Thu, 26 May 2016 14:41:19 +0530 Subject: platform/x86: Add PMC Driver for Intel Core SoC This patch adds the Power Management Controller driver as a PCI driver for Intel Core SoC architecture. This driver can utilize debugging capabilities and supported features as exposed by the Power Management Controller. Please refer to the below specification for more details on PMC features. http://www.intel.in/content/www/in/en/chipsets/100-series-chipset-datasheet-vol-2.html The current version of this driver exposes SLP_S0_RESIDENCY counter. This counter can be used for detecting fragile SLP_S0 signal related failures and take corrective actions when PCH SLP_S0 signal is not asserted after kernel freeze as part of suspend to idle flow (echo freeze > /sys/power/state). Intel Platform Controller Hub (PCH) asserts SLP_S0 signal when it detects favorable conditions to enter its low power mode. As a pre-requisite the SoC should be in deepest possible Package C-State and devices should be in low power mode. For example, on Skylake SoC the deepest Package C-State is Package C10 or PC10. Suspend to idle flow generally leads to PC10 state but PC10 state may not be sufficient for realizing the platform wide power potential which SLP_S0 signal assertion can provide. SLP_S0 signal is often connected to the Embedded Controller (EC) and the Power Management IC (PMIC) for other platform power management related optimizations. In general, SLP_S0 assertion == PC10 + PCH low power mode + ModPhy Lanes power gated + PLL Idle. As part of this driver, a mechanism to read the SLP_S0_RESIDENCY is exposed as an API and also debugfs features are added to indicate SLP_S0 signal assertion residency in microseconds. echo freeze > /sys/power/state wake the system cat /sys/kernel/debug/pmc_core/slp_s0_residency_usec Signed-off-by: Rajneesh Bhardwaj Signed-off-by: Vishwanath Somayaji Reviewed-by: Andy Shevchenko Signed-off-by: Darren Hart --- MAINTAINERS | 8 ++ arch/x86/include/asm/pmc_core.h | 27 +++++ drivers/platform/x86/Kconfig | 12 ++ drivers/platform/x86/Makefile | 1 + drivers/platform/x86/intel_pmc_core.c | 200 ++++++++++++++++++++++++++++++++++ drivers/platform/x86/intel_pmc_core.h | 51 +++++++++ 6 files changed, 299 insertions(+) create mode 100644 arch/x86/include/asm/pmc_core.h create mode 100644 drivers/platform/x86/intel_pmc_core.c create mode 100644 drivers/platform/x86/intel_pmc_core.h (limited to 'arch/x86/include/asm') diff --git a/MAINTAINERS b/MAINTAINERS index 1c32f8a3d6c4..32a57926cd04 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5879,6 +5879,14 @@ S: Maintained F: arch/x86/include/asm/intel_telemetry.h F: drivers/platform/x86/intel_telemetry* +INTEL PMC CORE DRIVER +M: Rajneesh Bhardwaj +M: Vishwanath Somayaji +L: platform-driver-x86@vger.kernel.org +S: Maintained +F: arch/x86/include/asm/pmc_core.h +F: drivers/platform/x86/intel_pmc_core* + IOC3 ETHERNET DRIVER M: Ralf Baechle L: linux-mips@linux-mips.org diff --git a/arch/x86/include/asm/pmc_core.h b/arch/x86/include/asm/pmc_core.h new file mode 100644 index 000000000000..d4855f11136d --- /dev/null +++ b/arch/x86/include/asm/pmc_core.h @@ -0,0 +1,27 @@ +/* + * Intel Core SoC Power Management Controller Header File + * + * Copyright (c) 2016, Intel Corporation. + * All Rights Reserved. + * + * Authors: Rajneesh Bhardwaj + * Vishwanath Somayaji + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef _ASM_PMC_CORE_H +#define _ASM_PMC_CORE_H + +/* API to read SLP_S0_RESIDENCY counter */ +int intel_pmc_slp_s0_counter_read(u32 *data); + +#endif /* _ASM_PMC_CORE_H */ diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index ed2004be13cf..c06bb85c2839 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -846,6 +846,18 @@ config INTEL_IMR If you are running on a Galileo/Quark say Y here. +config INTEL_PMC_CORE + bool "Intel PMC Core driver" + depends on X86 && PCI + ---help--- + The Intel Platform Controller Hub for Intel Core SoCs provides access + to Power Management Controller registers via a PCI interface. This + driver can utilize debugging capabilities and supported features as + exposed by the Power Management Controller. + + Supported features: + - SLP_S0_RESIDENCY counter. + config IBM_RTL tristate "Device driver to enable PRTL support" depends on X86 && PCI diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile index 448443c3baba..9b11b4073e03 100644 --- a/drivers/platform/x86/Makefile +++ b/drivers/platform/x86/Makefile @@ -69,3 +69,4 @@ obj-$(CONFIG_INTEL_PUNIT_IPC) += intel_punit_ipc.o obj-$(CONFIG_INTEL_TELEMETRY) += intel_telemetry_core.o \ intel_telemetry_pltdrv.o \ intel_telemetry_debugfs.o +obj-$(CONFIG_INTEL_PMC_CORE) += intel_pmc_core.o diff --git a/drivers/platform/x86/intel_pmc_core.c b/drivers/platform/x86/intel_pmc_core.c new file mode 100644 index 000000000000..2776bec89c88 --- /dev/null +++ b/drivers/platform/x86/intel_pmc_core.c @@ -0,0 +1,200 @@ +/* + * Intel Core SoC Power Management Controller Driver + * + * Copyright (c) 2016, Intel Corporation. + * All Rights Reserved. + * + * Authors: Rajneesh Bhardwaj + * Vishwanath Somayaji + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "intel_pmc_core.h" + +static struct pmc_dev pmc; + +static const struct pci_device_id pmc_pci_ids[] = { + { PCI_VDEVICE(INTEL, SPT_PMC_PCI_DEVICE_ID), (kernel_ulong_t)NULL }, + { 0, }, +}; + +static inline u32 pmc_core_reg_read(struct pmc_dev *pmcdev, int reg_offset) +{ + return readl(pmcdev->regbase + reg_offset); +} + +static inline u32 pmc_core_adjust_slp_s0_step(u32 value) +{ + return value * SPT_PMC_SLP_S0_RES_COUNTER_STEP; +} + +/** + * intel_pmc_slp_s0_counter_read() - Read SLP_S0 residency. + * @data: Out param that contains current SLP_S0 count. + * + * This API currently supports Intel Skylake SoC and Sunrise + * Point Platform Controller Hub. Future platform support + * should be added for platforms that support low power modes + * beyond Package C10 state. + * + * SLP_S0_RESIDENCY counter counts in 100 us granularity per + * step hence function populates the multiplied value in out + * parameter @data. + * + * Return: an error code or 0 on success. + */ +int intel_pmc_slp_s0_counter_read(u32 *data) +{ + struct pmc_dev *pmcdev = &pmc; + u32 value; + + if (!pmcdev->has_slp_s0_res) + return -EACCES; + + value = pmc_core_reg_read(pmcdev, SPT_PMC_SLP_S0_RES_COUNTER_OFFSET); + *data = pmc_core_adjust_slp_s0_step(value); + + return 0; +} +EXPORT_SYMBOL_GPL(intel_pmc_slp_s0_counter_read); + +#if IS_ENABLED(CONFIG_DEBUG_FS) +static int pmc_core_dev_state_show(struct seq_file *s, void *unused) +{ + struct pmc_dev *pmcdev = s->private; + u32 counter_val; + + counter_val = pmc_core_reg_read(pmcdev, + SPT_PMC_SLP_S0_RES_COUNTER_OFFSET); + seq_printf(s, "%u\n", pmc_core_adjust_slp_s0_step(counter_val)); + + return 0; +} + +static int pmc_core_dev_state_open(struct inode *inode, struct file *file) +{ + return single_open(file, pmc_core_dev_state_show, inode->i_private); +} + +static const struct file_operations pmc_core_dev_state_ops = { + .open = pmc_core_dev_state_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void pmc_core_dbgfs_unregister(struct pmc_dev *pmcdev) +{ + debugfs_remove_recursive(pmcdev->dbgfs_dir); +} + +static int pmc_core_dbgfs_register(struct pmc_dev *pmcdev) +{ + struct dentry *dir, *file; + + dir = debugfs_create_dir("pmc_core", NULL); + if (!dir) + return -ENOMEM; + + pmcdev->dbgfs_dir = dir; + file = debugfs_create_file("slp_s0_residency_usec", S_IFREG | S_IRUGO, + dir, pmcdev, &pmc_core_dev_state_ops); + + if (!file) { + pmc_core_dbgfs_unregister(pmcdev); + return -ENODEV; + } + + return 0; +} +#else +static inline int pmc_core_dbgfs_register(struct pmc_dev *pmcdev) +{ + return 0; +} + +static inline void pmc_core_dbgfs_unregister(struct pmc_dev *pmcdev) +{ +} +#endif /* CONFIG_DEBUG_FS */ + +static const struct x86_cpu_id intel_pmc_core_ids[] = { + { X86_VENDOR_INTEL, 6, 0x4e, X86_FEATURE_MWAIT, + (kernel_ulong_t)NULL}, /* Skylake CPUID Signature */ + { X86_VENDOR_INTEL, 6, 0x5e, X86_FEATURE_MWAIT, + (kernel_ulong_t)NULL}, /* Skylake CPUID Signature */ + {} +}; + +static int pmc_core_probe(struct pci_dev *dev, const struct pci_device_id *id) +{ + struct device *ptr_dev = &dev->dev; + struct pmc_dev *pmcdev = &pmc; + const struct x86_cpu_id *cpu_id; + int err; + + cpu_id = x86_match_cpu(intel_pmc_core_ids); + if (!cpu_id) { + dev_dbg(&dev->dev, "PMC Core: cpuid mismatch.\n"); + return -EINVAL; + } + + err = pcim_enable_device(dev); + if (err < 0) { + dev_dbg(&dev->dev, "PMC Core: failed to enable Power Management Controller.\n"); + return err; + } + + err = pci_read_config_dword(dev, + SPT_PMC_BASE_ADDR_OFFSET, + &pmcdev->base_addr); + if (err < 0) { + dev_dbg(&dev->dev, "PMC Core: failed to read PCI config space.\n"); + return err; + } + dev_dbg(&dev->dev, "PMC Core: PWRMBASE is %#x\n", pmcdev->base_addr); + + pmcdev->regbase = devm_ioremap_nocache(ptr_dev, + pmcdev->base_addr, + SPT_PMC_MMIO_REG_LEN); + if (!pmcdev->regbase) { + dev_dbg(&dev->dev, "PMC Core: ioremap failed.\n"); + return -ENOMEM; + } + + err = pmc_core_dbgfs_register(pmcdev); + if (err < 0) { + dev_err(&dev->dev, "PMC Core: debugfs register failed.\n"); + return err; + } + + pmc.has_slp_s0_res = true; + return 0; +} + +static struct pci_driver intel_pmc_core_driver = { + .name = "intel_pmc_core", + .id_table = pmc_pci_ids, + .probe = pmc_core_probe, +}; + +builtin_pci_driver(intel_pmc_core_driver); diff --git a/drivers/platform/x86/intel_pmc_core.h b/drivers/platform/x86/intel_pmc_core.h new file mode 100644 index 000000000000..a9dadaf787c1 --- /dev/null +++ b/drivers/platform/x86/intel_pmc_core.h @@ -0,0 +1,51 @@ +/* + * Intel Core SoC Power Management Controller Header File + * + * Copyright (c) 2016, Intel Corporation. + * All Rights Reserved. + * + * Authors: Rajneesh Bhardwaj + * Vishwanath Somayaji + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + */ + +#ifndef PMC_CORE_H +#define PMC_CORE_H + +/* Sunrise Point Power Management Controller PCI Device ID */ +#define SPT_PMC_PCI_DEVICE_ID 0x9d21 +#define SPT_PMC_BASE_ADDR_OFFSET 0x48 +#define SPT_PMC_SLP_S0_RES_COUNTER_OFFSET 0x13c +#define SPT_PMC_MMIO_REG_LEN 0x100 +#define SPT_PMC_SLP_S0_RES_COUNTER_STEP 0x64 + +/** + * struct pmc_dev - pmc device structure + * @base_addr: comtains pmc base address + * @regbase: pointer to io-remapped memory location + * @dbgfs_dir: path to debug fs interface + * @feature_available: flag to indicate whether + * the feature is available + * on a particular platform or not. + * + * pmc_dev contains info about power management controller device. + */ +struct pmc_dev { + u32 base_addr; + void __iomem *regbase; +#if IS_ENABLED(CONFIG_DEBUG_FS) + struct dentry *dbgfs_dir; +#endif /* CONFIG_DEBUG_FS */ + bool has_slp_s0_res; +}; + +#endif /* PMC_CORE_H */ -- cgit v1.2.3 From 08dd8cd06ed95625b9e2fac43c78fcb45b7eaf94 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Fri, 3 Jun 2016 19:00:59 +0100 Subject: x86/msr: Use the proper trace point conditional for writes The msr tracing for writes is incorrectly conditional on the read trace. Fixes: 7f47d8cc039f "x86, tracing, perf: Add trace point for MSR accesses" Signed-off-by: Dr. David Alan Gilbert Cc: stable@vger.kernel.org Cc: ak@linux.intel.com Link: http://lkml.kernel.org/r/1464976859-21850-1-git-send-email-dgilbert@redhat.com Signed-off-by: Thomas Gleixner --- arch/x86/include/asm/msr.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 7dc1d8fef7fd..b5fee97813cd 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -122,7 +122,7 @@ notrace static inline void native_write_msr(unsigned int msr, "2:\n" _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_unsafe) : : "c" (msr), "a"(low), "d" (high) : "memory"); - if (msr_tracepoint_active(__tracepoint_read_msr)) + if (msr_tracepoint_active(__tracepoint_write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } @@ -141,7 +141,7 @@ notrace static inline int native_write_msr_safe(unsigned int msr, : "c" (msr), "0" (low), "d" (high), [fault] "i" (-EIO) : "memory"); - if (msr_tracepoint_active(__tracepoint_read_msr)) + if (msr_tracepoint_active(__tracepoint_write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), err); return err; } -- cgit v1.2.3 From 970442c599b22ccd644ebfe94d1d303bf6f87c05 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Jun 2016 17:19:27 -0700 Subject: x86/cpu/intel: Introduce macros for Intel family numbers Problem: We have a boatload of open-coded family-6 model numbers. Half of them have these model numbers in hex and the other half in decimal. This makes grepping for them tons of fun, if you were to try. Solution: Consolidate all the magic numbers. Put all the definitions in one header. The names here are closely derived from the comments describing the models from arch/x86/events/intel/core.c. We could easily make them shorter by doing things like s/SANDYBRIDGE/SNB/, but they seemed fine even with the longer versions to me. Do not take any of these names too literally, like "DESKTOP" or "MOBILE". These are all colloquial names and not precise descriptions of everywhere a given model will show up. Signed-off-by: Dave Hansen Cc: Adrian Hunter Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Darren Hart Cc: Dave Hansen Cc: Denys Vlasenko Cc: Doug Thompson Cc: Eduardo Valentin Cc: H. Peter Anvin Cc: Jacob Pan Cc: Kan Liang Cc: Len Brown Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rajneesh Bhardwaj Cc: Souvik Kumar Chakravarty Cc: Srinivas Pandruvada Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tony Luck Cc: Ulf Hansson Cc: Viresh Kumar Cc: Vishwanath Somayaji Cc: Zhang Rui Cc: jacob.jun.pan@intel.com Cc: linux-acpi@vger.kernel.org Cc: linux-edac@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: platform-driver-x86@vger.kernel.org Link: http://lkml.kernel.org/r/20160603001927.F2A7D828@viggo.jf.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/intel-family.h | 68 +++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 arch/x86/include/asm/intel-family.h (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h new file mode 100644 index 000000000000..6999f7d01a0d --- /dev/null +++ b/arch/x86/include/asm/intel-family.h @@ -0,0 +1,68 @@ +#ifndef _ASM_X86_INTEL_FAMILY_H +#define _ASM_X86_INTEL_FAMILY_H + +/* + * "Big Core" Processors (Branded as Core, Xeon, etc...) + * + * The "_X" parts are generally the EP and EX Xeons, or the + * "Extreme" ones, like Broadwell-E. + * + * Things ending in "2" are usually because we have no better + * name for them. There's no processor called "WESTMERE2". + */ + +#define INTEL_FAM6_CORE_YONAH 0x0E +#define INTEL_FAM6_CORE2_MEROM 0x0F +#define INTEL_FAM6_CORE2_MEROM_L 0x16 +#define INTEL_FAM6_CORE2_PENRYN 0x17 +#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D + +#define INTEL_FAM6_NEHALEM 0x1E +#define INTEL_FAM6_NEHALEM_EP 0x1A +#define INTEL_FAM6_NEHALEM_EX 0x2E +#define INTEL_FAM6_WESTMERE 0x25 +#define INTEL_FAM6_WESTMERE2 0x1F +#define INTEL_FAM6_WESTMERE_EP 0x2C +#define INTEL_FAM6_WESTMERE_EX 0x2F + +#define INTEL_FAM6_SANDYBRIDGE 0x2A +#define INTEL_FAM6_SANDYBRIDGE_X 0x2D +#define INTEL_FAM6_IVYBRIDGE 0x3A +#define INTEL_FAM6_IVYBRIDGE_X 0x3E + +#define INTEL_FAM6_HASWELL_CORE 0x3C +#define INTEL_FAM6_HASWELL_X 0x3F +#define INTEL_FAM6_HASWELL_ULT 0x45 +#define INTEL_FAM6_HASWELL_GT3E 0x46 + +#define INTEL_FAM6_BROADWELL_CORE 0x3D +#define INTEL_FAM6_BROADWELL_XEON_D 0x56 +#define INTEL_FAM6_BROADWELL_GT3E 0x47 +#define INTEL_FAM6_BROADWELL_X 0x4F + +#define INTEL_FAM6_SKYLAKE_MOBILE 0x4E +#define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E +#define INTEL_FAM6_SKYLAKE_X 0x55 +#define INTEL_FAM6_KABYLAKE_MOBILE 0x8E +#define INTEL_FAM6_KABYLAKE_DESKTOP 0x9E + +/* "Small Core" Processors (Atom) */ + +#define INTEL_FAM6_ATOM_PINEVIEW 0x1C +#define INTEL_FAM6_ATOM_LINCROFT 0x26 +#define INTEL_FAM6_ATOM_PENWELL 0x27 +#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35 +#define INTEL_FAM6_ATOM_CEDARVIEW 0x36 +#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ +#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ +#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ +#define INTEL_FAM6_ATOM_MERRIFIELD1 0x4A /* Tangier */ +#define INTEL_FAM6_ATOM_MERRIFIELD2 0x5A /* Annidale */ +#define INTEL_FAM6_ATOM_GOLDMONT 0x5C +#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ + +/* Xeon Phi */ + +#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ + +#endif /* _ASM_X86_INTEL_FAMILY_H */ -- cgit v1.2.3 From 7d669f50847481c52faf0656aea7b4be63113210 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Wed, 15 Jun 2016 17:23:45 -0500 Subject: kvm: svm: Fix implicit declaration for __default_cpu_present_to_apicid() The commit 8221c1370056 ("svm: Manage vcpu load/unload when enable AVIC") introduces a build error due to implicit function declaration when #ifdef CONFIG_X86_32 and #ifndef CONFIG_X86_LOCAL_APIC (as reported by Kbuild test robot i386-randconfig-x0-06121009). So, this patch introduces kvm_cpu_get_apicid() wrapper around __default_cpu_present_to_apicid() with additional handling if CONFIG_X86_LOCAL_APIC is not defined. Reported-by: kbuild test robot Fixes: commit 8221c1370056 ("svm: Manage vcpu load/unload when enable AVIC") Signed-off-by: Suravee Suthikulpanit Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 11 +++++++++++ arch/x86/kvm/svm.c | 6 +++--- 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index e0fbe7e70dc1..69e62862b622 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -1368,4 +1369,14 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) static inline void kvm_arch_vcpu_block_finish(struct kvm_vcpu *vcpu) {} +static inline int kvm_cpu_get_apicid(int mps_cpu) +{ +#ifdef CONFIG_X86_LOCAL_APIC + return __default_cpu_present_to_apicid(mps_cpu); +#else + WARN_ON_ONCE(1); + return BAD_APICID; +#endif +} + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 1163e8173e5a..8a3c571e4a86 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1324,7 +1324,7 @@ free_avic: static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) { u64 entry; - int h_physical_id = __default_cpu_present_to_apicid(vcpu->cpu); + int h_physical_id = kvm_cpu_get_apicid(vcpu->cpu); struct vcpu_svm *svm = to_svm(vcpu); if (!kvm_vcpu_apicv_active(vcpu)) @@ -1349,7 +1349,7 @@ static void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { u64 entry; /* ID = 0xff (broadcast), ID > 0xff (reserved) */ - int h_physical_id = __default_cpu_present_to_apicid(cpu); + int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); if (!kvm_vcpu_apicv_active(vcpu)) @@ -4236,7 +4236,7 @@ static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec) if (avic_vcpu_is_running(vcpu)) wrmsrl(SVM_AVIC_DOORBELL, - __default_cpu_present_to_apicid(vcpu->cpu)); + kvm_cpu_get_apicid(vcpu->cpu)); else kvm_vcpu_wake_up(vcpu); } -- cgit v1.2.3 From da01e18a37a57f360222d3a123b8f6994aa1ad14 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 23 Jun 2016 12:20:01 -0700 Subject: x86: avoid avoid passing around 'thread_info' in stack dumping code None of the code actually wants a thread_info, it all wants a task_struct, and it's just converting to a thread_info pointer much too early. No semantic change. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/stacktrace.h | 6 +++--- arch/x86/kernel/dumpstack.c | 22 ++++++++++------------ arch/x86/kernel/dumpstack_32.c | 4 +--- arch/x86/kernel/dumpstack_64.c | 8 +++----- 4 files changed, 17 insertions(+), 23 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 7c247e7404be..0944218af9e2 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -14,7 +14,7 @@ extern int kstack_depth_to_print; struct thread_info; struct stacktrace_ops; -typedef unsigned long (*walk_stack_t)(struct thread_info *tinfo, +typedef unsigned long (*walk_stack_t)(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, @@ -23,13 +23,13 @@ typedef unsigned long (*walk_stack_t)(struct thread_info *tinfo, int *graph); extern unsigned long -print_context_stack(struct thread_info *tinfo, +print_context_stack(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, unsigned long *end, int *graph); extern unsigned long -print_context_stack_bp(struct thread_info *tinfo, +print_context_stack_bp(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, unsigned long *end, int *graph); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 2bb25c3fe2e8..d6209f3a69cb 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -42,16 +42,14 @@ void printk_address(unsigned long address) static void print_ftrace_graph_addr(unsigned long addr, void *data, const struct stacktrace_ops *ops, - struct thread_info *tinfo, int *graph) + struct task_struct *task, int *graph) { - struct task_struct *task; unsigned long ret_addr; int index; if (addr != (unsigned long)return_to_handler) return; - task = tinfo->task; index = task->curr_ret_stack; if (!task->ret_stack || index < *graph) @@ -68,7 +66,7 @@ print_ftrace_graph_addr(unsigned long addr, void *data, static inline void print_ftrace_graph_addr(unsigned long addr, void *data, const struct stacktrace_ops *ops, - struct thread_info *tinfo, int *graph) + struct task_struct *task, int *graph) { } #endif @@ -79,10 +77,10 @@ print_ftrace_graph_addr(unsigned long addr, void *data, * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -static inline int valid_stack_ptr(struct thread_info *tinfo, +static inline int valid_stack_ptr(struct task_struct *task, void *p, unsigned int size, void *end) { - void *t = tinfo; + void *t = task_thread_info(task); if (end) { if (p < end && p >= (end-THREAD_SIZE)) return 1; @@ -93,14 +91,14 @@ static inline int valid_stack_ptr(struct thread_info *tinfo, } unsigned long -print_context_stack(struct thread_info *tinfo, +print_context_stack(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, unsigned long *end, int *graph) { struct stack_frame *frame = (struct stack_frame *)bp; - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + while (valid_stack_ptr(task, stack, sizeof(*stack), end)) { unsigned long addr; addr = *stack; @@ -112,7 +110,7 @@ print_context_stack(struct thread_info *tinfo, } else { ops->address(data, addr, 0); } - print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + print_ftrace_graph_addr(addr, data, ops, task, graph); } stack++; } @@ -121,7 +119,7 @@ print_context_stack(struct thread_info *tinfo, EXPORT_SYMBOL_GPL(print_context_stack); unsigned long -print_context_stack_bp(struct thread_info *tinfo, +print_context_stack_bp(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, unsigned long *end, int *graph) @@ -129,7 +127,7 @@ print_context_stack_bp(struct thread_info *tinfo, struct stack_frame *frame = (struct stack_frame *)bp; unsigned long *ret_addr = &frame->return_address; - while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) { + while (valid_stack_ptr(task, ret_addr, sizeof(*ret_addr), end)) { unsigned long addr = *ret_addr; if (!__kernel_text_address(addr)) @@ -139,7 +137,7 @@ print_context_stack_bp(struct thread_info *tinfo, break; frame = frame->next_frame; ret_addr = &frame->return_address; - print_ftrace_graph_addr(addr, data, ops, tinfo, graph); + print_ftrace_graph_addr(addr, data, ops, task, graph); } return (unsigned long)frame; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 464ffd69b92e..fef917e79b9d 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -61,15 +61,13 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, bp = stack_frame(task, regs); for (;;) { - struct thread_info *context; void *end_stack; end_stack = is_hardirq_stack(stack, cpu); if (!end_stack) end_stack = is_softirq_stack(stack, cpu); - context = task_thread_info(task); - bp = ops->walk_stack(context, stack, bp, ops, data, + bp = ops->walk_stack(task, stack, bp, ops, data, end_stack, &graph); /* Stop if not on irq stack */ diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 5f1c6266eb30..d558a8a49016 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -153,7 +153,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); - struct thread_info *tinfo; unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu); unsigned long dummy; unsigned used = 0; @@ -179,7 +178,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * current stack address. If the stacks consist of nested * exceptions */ - tinfo = task_thread_info(task); while (!done) { unsigned long *stack_end; enum stack_type stype; @@ -202,7 +200,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (ops->stack(data, id) < 0) break; - bp = ops->walk_stack(tinfo, stack, bp, ops, + bp = ops->walk_stack(task, stack, bp, ops, data, stack_end, &graph); ops->stack(data, ""); /* @@ -218,7 +216,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (ops->stack(data, "IRQ") < 0) break; - bp = ops->walk_stack(tinfo, stack, bp, + bp = ops->walk_stack(task, stack, bp, ops, data, stack_end, &graph); /* * We link to the next stack (which would be @@ -240,7 +238,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph); + bp = ops->walk_stack(task, stack, bp, ops, data, NULL, &graph); put_cpu(); } EXPORT_SYMBOL(dump_trace); -- cgit v1.2.3 From aca9c293d098292579e345b2b39b394778d41526 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 24 Jun 2016 16:55:53 -0700 Subject: x86: fix up a few misc stack pointer vs thread_info confusions As the actual pointer value is the same for the thread stack allocation and the thread_info, code that confused the two worked fine, but will break when the thread info is moved away from the stack allocation. It also looks very confusing. For example, the kprobe code wanted to know the current top of stack. To do that, it used this: (unsigned long)current_thread_info() + THREAD_SIZE which did indeed give the correct value. But it's not only a fairly nonsensical expression, it's also rather complex, especially since we actually have this: static inline unsigned long current_top_of_stack(void) which not only gives us the value we are interested in, but happens to be how "current_thread_info()" is currently defined as: (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); so using current_thread_info() to figure out the top of the stack really is a very round-about thing to do. The other cases are just simpler confusion about task_thread_info() vs task_stack_page(), which currently return the same pointer - but if you want the stack page, you really should be using the latter one. And there was one entirely unused assignment of the current stack to a thread_info pointer. All cleaned up to make more sense today, and make it easier to move the thread_info away from the stack in the future. No semantic changes. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/kprobes.h | 11 +++++------ arch/x86/kernel/dumpstack.c | 2 +- arch/x86/kernel/irq_32.c | 2 -- 3 files changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 4421b5da409d..d1d1e5094c28 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -38,12 +38,11 @@ typedef u8 kprobe_opcode_t; #define RELATIVECALL_OPCODE 0xe8 #define RELATIVE_ADDR_SIZE 4 #define MAX_STACK_SIZE 64 -#define MIN_STACK_SIZE(ADDR) \ - (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \ - THREAD_SIZE - (unsigned long)(ADDR))) \ - ? (MAX_STACK_SIZE) \ - : (((unsigned long)current_thread_info()) + \ - THREAD_SIZE - (unsigned long)(ADDR))) +#define CUR_STACK_SIZE(ADDR) \ + (current_top_of_stack() - (unsigned long)(ADDR)) +#define MIN_STACK_SIZE(ADDR) \ + (MAX_STACK_SIZE < CUR_STACK_SIZE(ADDR) ? \ + MAX_STACK_SIZE : CUR_STACK_SIZE(ADDR)) #define flush_insn_slot(p) do { } while (0) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index d6209f3a69cb..ef8017ca5ba9 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -80,7 +80,7 @@ print_ftrace_graph_addr(unsigned long addr, void *data, static inline int valid_stack_ptr(struct task_struct *task, void *p, unsigned int size, void *end) { - void *t = task_thread_info(task); + void *t = task_stack_page(task); if (end) { if (p < end && p >= (end-THREAD_SIZE)) return 1; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 38da8f29a9c8..c627bf8d98ad 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -130,11 +130,9 @@ void irq_ctx_init(int cpu) void do_softirq_own_stack(void) { - struct thread_info *curstk; struct irq_stack *irqstk; u32 *isp, *prev_esp; - curstk = current_stack(); irqstk = __this_cpu_read(softirq_stack); /* build the stack frame on the softirq stack */ -- cgit v1.2.3 From 32d6bd9059f265f617f6502c68dfbcae7e515add Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 24 Jun 2016 14:48:47 -0700 Subject: tree wide: get rid of __GFP_REPEAT for order-0 allocations part I This is the third version of the patchset previously sent [1]. I have basically only rebased it on top of 4.7-rc1 tree and dropped "dm: get rid of superfluous gfp flags" which went through dm tree. I am sending it now because it is tree wide and chances for conflicts are reduced considerably when we want to target rc2. I plan to send the next step and rename the flag and move to a better semantic later during this release cycle so we will have a new semantic ready for 4.8 merge window hopefully. Motivation: While working on something unrelated I've checked the current usage of __GFP_REPEAT in the tree. It seems that a majority of the usage is and always has been bogus because __GFP_REPEAT has always been about costly high order allocations while we are using it for order-0 or very small orders very often. It seems that a big pile of them is just a copy&paste when a code has been adopted from one arch to another. I think it makes some sense to get rid of them because they are just making the semantic more unclear. Please note that GFP_REPEAT is documented as * __GFP_REPEAT: Try hard to allocate the memory, but the allocation attempt * _might_ fail. This depends upon the particular VM implementation. while !costly requests have basically nofail semantic. So one could reasonably expect that order-0 request with __GFP_REPEAT will not loop for ever. This is not implemented right now though. I would like to move on with __GFP_REPEAT and define a better semantic for it. $ git grep __GFP_REPEAT origin/master | wc -l 111 $ git grep __GFP_REPEAT | wc -l 36 So we are down to the third after this patch series. The remaining places really seem to be relying on __GFP_REPEAT due to large allocation requests. This still needs some double checking which I will do later after all the simple ones are sorted out. I am touching a lot of arch specific code here and I hope I got it right but as a matter of fact I even didn't compile test for some archs as I do not have cross compiler for them. Patches should be quite trivial to review for stupid compile mistakes though. The tricky parts are usually hidden by macro definitions and thats where I would appreciate help from arch maintainers. [1] http://lkml.kernel.org/r/1461849846-27209-1-git-send-email-mhocko@kernel.org This patch (of 19): __GFP_REPEAT has a rather weak semantic but since it has been introduced around 2.6.12 it has been ignored for low order allocations. Yet we have the full kernel tree with its usage for apparently order-0 allocations. This is really confusing because __GFP_REPEAT is explicitly documented to allow allocation failures which is a weaker semantic than the current order-0 has (basically nofail). Let's simply drop __GFP_REPEAT from those places. This would allow to identify place which really need allocator to retry harder and formulate a more specific semantic for what the flag is supposed to do actually. Link: http://lkml.kernel.org/r/1464599699-30131-2-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Cc: "David S. Miller" Cc: "H. Peter Anvin" Cc: "James E.J. Bottomley" Cc: "Theodore Ts'o" Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Chen Liqin Cc: Chris Metcalf [for tile] Cc: Guan Xuetao Cc: Heiko Carstens Cc: Helge Deller Cc: Ingo Molnar Cc: Jan Kara Cc: John Crispin Cc: Lennox Wu Cc: Ley Foon Tan Cc: Martin Schwidefsky Cc: Matt Fleming Cc: Ralf Baechle Cc: Rich Felker Cc: Russell King Cc: Thomas Gleixner Cc: Vineet Gupta Cc: Will Deacon Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/pgalloc.h | 4 ++-- arch/arm/include/asm/pgalloc.h | 2 +- arch/avr32/include/asm/pgalloc.h | 6 +++--- arch/cris/include/asm/pgalloc.h | 4 ++-- arch/frv/mm/pgalloc.c | 6 +++--- arch/hexagon/include/asm/pgalloc.h | 4 ++-- arch/m68k/include/asm/mcf_pgalloc.h | 4 ++-- arch/m68k/include/asm/motorola_pgalloc.h | 4 ++-- arch/m68k/include/asm/sun3_pgalloc.h | 4 ++-- arch/metag/include/asm/pgalloc.h | 5 ++--- arch/microblaze/include/asm/pgalloc.h | 4 ++-- arch/microblaze/mm/pgtable.c | 3 +-- arch/mn10300/mm/pgtable.c | 6 +++--- arch/openrisc/include/asm/pgalloc.h | 2 +- arch/openrisc/mm/ioremap.c | 2 +- arch/parisc/include/asm/pgalloc.h | 4 ++-- arch/powerpc/include/asm/book3s/64/pgalloc.h | 2 +- arch/powerpc/include/asm/nohash/64/pgalloc.h | 2 +- arch/powerpc/mm/pgtable_32.c | 4 ++-- arch/powerpc/mm/pgtable_64.c | 3 +-- arch/sh/include/asm/pgalloc.h | 4 ++-- arch/sparc/mm/init_64.c | 6 ++---- arch/um/kernel/mem.c | 4 ++-- arch/x86/include/asm/pgalloc.h | 4 ++-- arch/x86/xen/p2m.c | 2 +- arch/xtensa/include/asm/pgalloc.h | 2 +- drivers/block/aoe/aoecmd.c | 2 +- 27 files changed, 47 insertions(+), 52 deletions(-) (limited to 'arch/x86/include/asm') diff --git a/arch/alpha/include/asm/pgalloc.h b/arch/alpha/include/asm/pgalloc.h index aab14a019c20..c2ebb6f36c9d 100644 --- a/arch/alpha/include/asm/pgalloc.h +++ b/arch/alpha/include/asm/pgalloc.h @@ -40,7 +40,7 @@ pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline pmd_t * pmd_alloc_one(struct mm_struct *mm, unsigned long address) { - pmd_t *ret = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + pmd_t *ret = (pmd_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); return ret; } @@ -53,7 +53,7 @@ pmd_free(struct mm_struct *mm, pmd_t *pmd) static inline pte_t * pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); return pte; } diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index 19cfab526d13..20febb368844 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -29,7 +29,7 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_REPEAT); + return (pmd_t *)get_zeroed_page(GFP_KERNEL); } static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) diff --git a/arch/avr32/include/asm/pgalloc.h b/arch/avr32/include/asm/pgalloc.h index 1aba19d68c5e..db039cb368be 100644 --- a/arch/avr32/include/asm/pgalloc.h +++ b/arch/avr32/include/asm/pgalloc.h @@ -43,7 +43,7 @@ static inline void pgd_ctor(void *x) */ static inline pgd_t *pgd_alloc(struct mm_struct *mm) { - return quicklist_alloc(QUICK_PGD, GFP_KERNEL | __GFP_REPEAT, pgd_ctor); + return quicklist_alloc(QUICK_PGD, GFP_KERNEL, pgd_ctor); } static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) @@ -54,7 +54,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return quicklist_alloc(QUICK_PT, GFP_KERNEL | __GFP_REPEAT, NULL); + return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm, @@ -63,7 +63,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, struct page *page; void *pg; - pg = quicklist_alloc(QUICK_PT, GFP_KERNEL | __GFP_REPEAT, NULL); + pg = quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); if (!pg) return NULL; diff --git a/arch/cris/include/asm/pgalloc.h b/arch/cris/include/asm/pgalloc.h index 235ece437ddd..42f1affb9c2d 100644 --- a/arch/cris/include/asm/pgalloc.h +++ b/arch/cris/include/asm/pgalloc.h @@ -24,14 +24,14 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); return pte; } static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *pte; - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + pte = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0); if (!pte) return NULL; if (!pgtable_page_ctor(pte)) { diff --git a/arch/frv/mm/pgalloc.c b/arch/frv/mm/pgalloc.c index 41907d25ed38..c9ed14f6c67d 100644 --- a/arch/frv/mm/pgalloc.c +++ b/arch/frv/mm/pgalloc.c @@ -22,7 +22,7 @@ pgd_t swapper_pg_dir[PTRS_PER_PGD] __attribute__((aligned(PAGE_SIZE))); pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL); if (pte) clear_page(pte); return pte; @@ -33,9 +33,9 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) struct page *page; #ifdef CONFIG_HIGHPTE - page = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0); + page = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM, 0); #else - page = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); + page = alloc_pages(GFP_KERNEL, 0); #endif if (!page) return NULL; diff --git a/arch/hexagon/include/asm/pgalloc.h b/arch/hexagon/include/asm/pgalloc.h index 77da3b0ae3c2..eeebf862c46c 100644 --- a/arch/hexagon/include/asm/pgalloc.h +++ b/arch/hexagon/include/asm/pgalloc.h @@ -64,7 +64,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, { struct page *pte; - pte = alloc_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); + pte = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!pte) return NULL; if (!pgtable_page_ctor(pte)) { @@ -78,7 +78,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO; + gfp_t flags = GFP_KERNEL | __GFP_ZERO; return (pte_t *) __get_free_page(flags); } diff --git a/arch/m68k/include/asm/mcf_pgalloc.h b/arch/m68k/include/asm/mcf_pgalloc.h index f9924fbcfe42..fb95aed5f428 100644 --- a/arch/m68k/include/asm/mcf_pgalloc.h +++ b/arch/m68k/include/asm/mcf_pgalloc.h @@ -14,7 +14,7 @@ extern const char bad_pmd_string[]; extern inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - unsigned long page = __get_free_page(GFP_DMA|__GFP_REPEAT); + unsigned long page = __get_free_page(GFP_DMA); if (!page) return NULL; @@ -51,7 +51,7 @@ static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page, static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_pages(GFP_DMA|__GFP_REPEAT, 0); + struct page *page = alloc_pages(GFP_DMA, 0); pte_t *pte; if (!page) diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index 24bcba496c75..c895b987202c 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -11,7 +11,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long ad { pte_t *pte; - pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); if (pte) { __flush_page_to_ram(pte); flush_tlb_kernel_page(pte); @@ -32,7 +32,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long addres struct page *page; pte_t *pte; - page = alloc_pages(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO, 0); + page = alloc_pages(GFP_KERNEL|__GFP_ZERO, 0); if(!page) return NULL; if (!pgtable_page_ctor(page)) { diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index 0931388de47f..1901f61f926f 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -37,7 +37,7 @@ do { \ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - unsigned long page = __get_free_page(GFP_KERNEL|__GFP_REPEAT); + unsigned long page = __get_free_page(GFP_KERNEL); if (!page) return NULL; @@ -49,7 +49,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); + struct page *page = alloc_pages(GFP_KERNEL, 0); if (page == NULL) return NULL; diff --git a/arch/metag/include/asm/pgalloc.h b/arch/metag/include/asm/pgalloc.h index 3104df0a4822..c2caa1ee4360 100644 --- a/arch/metag/include/asm/pgalloc.h +++ b/arch/metag/include/asm/pgalloc.h @@ -42,8 +42,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | - __GFP_ZERO); + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); return pte; } @@ -51,7 +50,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *pte; - pte = alloc_pages(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO, 0); + pte = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0); if (!pte) return NULL; if (!pgtable_page_ctor(pte)) { diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index 61436d69775c..7c89390c0c13 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -116,9 +116,9 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, struct page *ptepage; #ifdef CONFIG_HIGHPTE - int flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_REPEAT; + int flags = GFP_KERNEL | __GFP_HIGHMEM; #else - int flags = GFP_KERNEL | __GFP_REPEAT; + int flags = GFP_KERNEL; #endif ptepage = alloc_pages(flags, 0); diff --git a/arch/microblaze/mm/pgtable.c b/arch/microblaze/mm/pgtable.c index 4f4520e779a5..eb99fcc76088 100644 --- a/arch/microblaze/mm/pgtable.c +++ b/arch/microblaze/mm/pgtable.c @@ -239,8 +239,7 @@ __init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, { pte_t *pte; if (mem_init_done) { - pte = (pte_t *)__get_free_page(GFP_KERNEL | - __GFP_REPEAT | __GFP_ZERO); + pte = (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } else { pte = (pte_t *)early_get_page(); if (pte) diff --git a/arch/mn10300/mm/pgtable.c b/arch/mn10300/mm/pgtable.c index e77a7c728081..9577cf768875 100644 --- a/arch/mn10300/mm/pgtable.c +++ b/arch/mn10300/mm/pgtable.c @@ -63,7 +63,7 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags) pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL); if (pte) clear_page(pte); return pte; @@ -74,9 +74,9 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) struct page *pte; #ifdef CONFIG_HIGHPTE - pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM|__GFP_REPEAT, 0); + pte = alloc_pages(GFP_KERNEL|__GFP_HIGHMEM, 0); #else - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); + pte = alloc_pages(GFP_KERNEL, 0); #endif if (!pte) return NULL; diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 21484e5b9e9a..87eebd185089 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -77,7 +77,7 @@ static inline struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *pte; - pte = alloc_pages(GFP_KERNEL|__GFP_REPEAT, 0); + pte = alloc_pages(GFP_KERNEL, 0); if (!pte) return NULL; clear_page(page_address(pte)); diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index 62b08ef392be..5b2a95116e8f 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -122,7 +122,7 @@ pte_t __init_refok *pte_alloc_one_kernel(struct mm_struct *mm, pte_t *pte; if (likely(mem_init_done)) { - pte = (pte_t *) __get_free_page(GFP_KERNEL | __GFP_REPEAT); + pte = (pte_t *) __get_free_page(GFP_KERNEL); } else { pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); #if 0 diff --git a/arch/parisc/include/asm/pgalloc.h b/arch/parisc/include/asm/pgalloc.h index f2fd327dce2e..52c3defb40c9 100644 --- a/arch/parisc/include/asm/pgalloc.h +++ b/arch/parisc/include/asm/pgalloc.h @@ -124,7 +124,7 @@ pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) static inline pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL|__GFP_ZERO); if (!page) return NULL; if (!pgtable_page_ctor(page)) { @@ -137,7 +137,7 @@ pte_alloc_one(struct mm_struct *mm, unsigned long address) static inline pte_t * pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) { - pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); return pte; } diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h index 488279edb1f0..049b80359db6 100644 --- a/arch/powerpc/include/asm/book3s/64/pgalloc.h +++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h @@ -151,7 +151,7 @@ static inline pgtable_t pmd_pgtable(pmd_t pmd) static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm, diff --git a/arch/powerpc/include/asm/nohash/64/pgalloc.h b/arch/powerpc/include/asm/nohash/64/pgalloc.h index 069369f6414b..8a8a7d991249 100644 --- a/arch/powerpc/include/asm/nohash/64/pgalloc.h +++ b/arch/powerpc/include/asm/nohash/64/pgalloc.h @@ -88,7 +88,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO); + return (pte_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm, diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c index bf7bf32b54f8..7f922f557936 100644 --- a/arch/powerpc/mm/pgtable_32.c +++ b/arch/powerpc/mm/pgtable_32.c @@ -84,7 +84,7 @@ __init_refok pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long add pte_t *pte; if (slab_is_available()) { - pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); } else { pte = __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE)); if (pte) @@ -97,7 +97,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *ptepage; - gfp_t flags = GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO; + gfp_t flags = GFP_KERNEL | __GFP_ZERO; ptepage = alloc_pages(flags, 0); if (!ptepage) diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index e009e0604a8a..f5e8d4edb808 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -350,8 +350,7 @@ static pte_t *get_from_cache(struct mm_struct *mm) static pte_t *__alloc_for_cache(struct mm_struct *mm, int kernel) { void *ret = NULL; - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | - __GFP_REPEAT | __GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); if (!page) return NULL; if (!kernel && !pgtable_page_ctor(page)) { diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index a33673b3687d..f3f42c84c40f 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -34,7 +34,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - return quicklist_alloc(QUICK_PT, GFP_KERNEL | __GFP_REPEAT, NULL); + return quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); } static inline pgtable_t pte_alloc_one(struct mm_struct *mm, @@ -43,7 +43,7 @@ static inline pgtable_t pte_alloc_one(struct mm_struct *mm, struct page *page; void *pg; - pg = quicklist_alloc(QUICK_PT, GFP_KERNEL | __GFP_REPEAT, NULL); + pg = quicklist_alloc(QUICK_PT, GFP_KERNEL, NULL); if (!pg) return NULL; page = virt_to_page(pg); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 14bb0d5ed3c6..aec508e37490 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2704,8 +2704,7 @@ void __flush_tlb_all(void) pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | - __GFP_REPEAT | __GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); pte_t *pte = NULL; if (page) @@ -2717,8 +2716,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | - __GFP_REPEAT | __GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); if (!page) return NULL; if (!pgtable_page_ctor(page)) { diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index b2a2dff50b4e..e7437ec62710 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -204,7 +204,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { pte_t *pte; - pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_ZERO); return pte; } @@ -212,7 +212,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { struct page *pte; - pte = alloc_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); + pte = alloc_page(GFP_KERNEL|__GFP_ZERO); if (!pte) return NULL; if (!pgtable_page_ctor(pte)) { diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index bf7f8b55b0f9..574c23cf761a 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -81,7 +81,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) { struct page *page; - page = alloc_pages(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO, 0); + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0); if (!page) return NULL; if (!pgtable_pmd_page_ctor(page)) { @@ -125,7 +125,7 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) { - return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT); + return (pud_t *)get_zeroed_page(GFP_KERNEL); } static inline void pud_free(struct mm_struct *mm, pud_t *pud) diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index cab9f766bb06..dd2a49a8aacc 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -182,7 +182,7 @@ static void * __ref alloc_p2m_page(void) if (unlikely(!slab_is_available())) return alloc_bootmem_align(PAGE_SIZE, PAGE_SIZE); - return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT); + return (void *)__get_free_page(GFP_KERNEL); } static void __ref free_p2m_page(void *p) diff --git a/arch/xtensa/include/asm/pgalloc.h b/arch/xtensa/include/asm/pgalloc.h index d38eb9237e64..1065bc8bcae5 100644 --- a/arch/xtensa/include/asm/pgalloc.h +++ b/arch/xtensa/include/asm/pgalloc.h @@ -44,7 +44,7 @@ static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm, pte_t *ptep; int i; - ptep = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT); + ptep = (pte_t *)__get_free_page(GFP_KERNEL); if (!ptep) return NULL; for (i = 0; i < 1024; i++) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index d597e432e195..ab19adb07a12 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1750,7 +1750,7 @@ aoecmd_init(void) int ret; /* get_zeroed_page returns page with ref count 1 */ - p = (void *) get_zeroed_page(GFP_KERNEL | __GFP_REPEAT); + p = (void *) get_zeroed_page(GFP_KERNEL); if (!p) return -ENOMEM; empty_page = virt_to_page(p); -- cgit v1.2.3