From ee6352b2c47a24234398e06381edd93a8e965976 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 27 Dec 2019 17:36:11 +0100
Subject: x86/context-tracking: Remove exception_enter/exit() from
 do_page_fault()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

do_page_fault(), like other exceptions, is already covered by
user_enter() and user_exit() when the exception triggers in userspace.

As explained in:

  8c84014f3bbb11 ("x86/entry: Remove exception_enter() from most trap handlers")

exception_enter/exit() only remained to handle a possible page fault from
kernel mode while context tracking is in CONTEXT_USER mode, ie: on kernel
entry before we manage to call user_exit(). The only known offender was
do_fast_syscall_32() fetching the EBP register from where the vDSO stashed
it.

Meanwhile this got fixed in:

  9999c8c01f34c9 ("x86/entry: Call enter_from_user_mode() with IRQs off")

which moved enter_from_user_mode() before the call to get_user().

So we can safely remove it now.

Signed-off-by: Frederic Weisbecker
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Jim Mattson
Cc: Joerg Roedel
Cc: Linus Torvalds
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Radim Krčmář
Cc: Sean Christopherson
Cc: Thomas Gleixner
Cc: Vitaly Kuznetsov
Cc: Wanpeng Li
Link: https://lkml.kernel.org/r/20191227163612.10039-2-frederic@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/mm/fault.c | 39 ++++++++++++---------------------------
 1 file changed, 12 insertions(+), 27 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 304d31d8cbbc..2b4ab2862eda 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1486,27 +1486,6 @@ good_area:
 }
 NOKPROBE_SYMBOL(do_user_addr_fault);
 
-/*
- * Explicitly marked noinline such that the function tracer sees this as the
- * page_fault entry point.
- */
-static noinline void
-__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
-		unsigned long address)
-{
-	prefetchw(&current->mm->mmap_sem);
-
-	if (unlikely(kmmio_fault(regs, address)))
-		return;
-
-	/* Was the fault on kernel-controlled part of the address space? */
-	if (unlikely(fault_in_kernel_space(address)))
-		do_kern_addr_fault(regs, hw_error_code, address);
-	else
-		do_user_addr_fault(regs, hw_error_code, address);
-}
-NOKPROBE_SYMBOL(__do_page_fault);
-
 static __always_inline void
 trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
			 unsigned long address)
@@ -1521,13 +1500,19 @@ trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
 }
 
 dotraplinkage void
-do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
+do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
+		unsigned long address)
 {
-	enum ctx_state prev_state;
+	prefetchw(&current->mm->mmap_sem);
+	trace_page_fault_entries(regs, hw_error_code, address);
 
-	prev_state = exception_enter();
-	trace_page_fault_entries(regs, error_code, address);
-	__do_page_fault(regs, error_code, address);
-	exception_exit(prev_state);
+	if (unlikely(kmmio_fault(regs, address)))
+		return;
+
+	/* Was the fault on kernel-controlled part of the address space? */
+	if (unlikely(fault_in_kernel_space(address)))
+		do_kern_addr_fault(regs, hw_error_code, address);
+	else
+		do_user_addr_fault(regs, hw_error_code, address);
 }
 NOKPROBE_SYMBOL(do_page_fault);
--
cgit v1.2.3

From 50cc02e599ef1e049473358604dc07d32b751e2c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Fri, 27 Dec 2019 17:36:12 +0100
Subject: x86/context-tracking: Remove exception_enter/exit() from
 KVM_PV_REASON_PAGE_NOT_PRESENT async page fault
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is a leftover. Page faults, just like most other exceptions, are
protected inside user_exit() / user_enter() calls in x86 entry code when
we fault from userspace. So this pair of calls is now superfluous.

Signed-off-by: Frederic Weisbecker
Cc: Andy Lutomirski
Cc: Borislav Petkov
Cc: Jim Mattson
Cc: Joerg Roedel
Cc: Linus Torvalds
Cc: Paolo Bonzini
Cc: Peter Zijlstra
Cc: Radim Krčmář
Cc: Sean Christopherson
Cc: Thomas Gleixner
Cc: Vitaly Kuznetsov
Cc: Wanpeng Li
Link: https://lkml.kernel.org/r/20191227163612.10039-3-frederic@kernel.org
Signed-off-by: Ingo Molnar
---
 arch/x86/kernel/kvm.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 32ef1ee733b7..81045aabb6f4 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -245,17 +245,13 @@ NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);
 dotraplinkage void
 do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
 {
-	enum ctx_state prev_state;
-
 	switch (kvm_read_and_reset_pf_reason()) {
 	default:
 		do_page_fault(regs, error_code, address);
 		break;
 	case KVM_PV_REASON_PAGE_NOT_PRESENT:
 		/* page is swapped out by the host. */
-		prev_state = exception_enter();
 		kvm_async_pf_task_wait((u32)address, !user_mode(regs));
-		exception_exit(prev_state);
 		break;
 	case KVM_PV_REASON_PAGE_READY:
 		rcu_irq_enter();
--
cgit v1.2.3
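Both context-tracking patches above lean on the same invariant, so a
standalone C sketch of that state machine may help. The names mirror the
kernel API (user_enter(), user_exit(), exception_enter(), exception_exit(),
CONTEXT_USER), but this is a toy model for illustration, not the kernel's
per-CPU, RCU-aware implementation:

  /*
   * Toy model of the context-tracking state machine. A single global
   * here; the real tracker is per-CPU.
   */
  enum ctx_state { CONTEXT_KERNEL, CONTEXT_USER };

  static enum ctx_state state = CONTEXT_KERNEL;

  /* Entry code: called right before returning to userspace. */
  static void user_enter(void) { state = CONTEXT_USER; }

  /* Entry code: called on every kernel entry from userspace. */
  static void user_exit(void) { state = CONTEXT_KERNEL; }

  /*
   * What exception_enter() guarded against: an exception firing after
   * kernel entry but before user_exit(), i.e. while the tracker still
   * says CONTEXT_USER even though kernel code is running.
   */
  static enum ctx_state exception_enter(void)
  {
          enum ctx_state prev = state;

          state = CONTEXT_KERNEL;
          return prev;
  }

  static void exception_exit(enum ctx_state prev) { state = prev; }

Once the entry code guarantees that user_exit() runs before anything can
fault (which 9999c8c01f34c9 did for the do_fast_syscall_32() case), the
saved state is always CONTEXT_KERNEL and the save/restore pair is dead
weight, hence its removal in the two patches above.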
From f444a5ff95dce07cf4353cbb85fc3e785019d430 Mon Sep 17 00:00:00 2001
From: Tony Luck
Date: Mon, 16 Dec 2019 13:42:54 -0800
Subject: x86/cpufeatures: Add support for fast short REP; MOVSB

From the Intel Optimization Reference Manual:

  3.7.6.1 Fast Short REP MOVSB

  Beginning with processors based on Ice Lake Client microarchitecture,
  REP MOVSB performance of short operations is enhanced. The enhancement
  applies to string lengths between 1 and 128 bytes long. Support for
  fast-short REP MOVSB is enumerated by the CPUID feature flag: CPUID
  (EAX=7H, ECX=0H).EDX.FAST_SHORT_REP_MOVSB[bit 4] = 1. There is no
  change in the REP STOS performance.

Add an X86_FEATURE_FSRM flag for this.

memmove() avoids REP MOVSB for short (< 32 byte) copies. Check FSRM and
use REP MOVSB for short copies on systems that support it.

[ bp: Massage and add comment. ]

Signed-off-by: Tony Luck
Signed-off-by: Borislav Petkov
Link: https://lkml.kernel.org/r/20191216214254.26492-1-tony.luck@intel.com
---
 arch/x86/include/asm/cpufeatures.h | 1 +
 arch/x86/lib/memmove_64.S          | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index e9b62498fe75..98c60fa31ced 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -357,6 +357,7 @@
 /* Intel-defined CPU features, CPUID level 0x00000007:0 (EDX), word 18 */
 #define X86_FEATURE_AVX512_4VNNIW	(18*32+ 2) /* AVX-512 Neural Network Instructions */
 #define X86_FEATURE_AVX512_4FMAPS	(18*32+ 3) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_FSRM		(18*32+ 4) /* Fast Short Rep Mov */
 #define X86_FEATURE_AVX512_VP2INTERSECT (18*32+ 8) /* AVX-512 Intersect for D/Q */
 #define X86_FEATURE_MD_CLEAR		(18*32+10) /* VERW clears CPU buffers */
 #define X86_FEATURE_TSX_FORCE_ABORT	(18*32+13) /* "" TSX_FORCE_ABORT */

diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 337830d7a59c..7ff00ea64e4f 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -29,10 +29,7 @@
 SYM_FUNC_START_ALIAS(memmove)
 SYM_FUNC_START(__memmove)
 
-	/* Handle more 32 bytes in loop */
 	mov %rdi, %rax
-	cmp $0x20, %rdx
-	jb 1f
 
 	/* Decide forward/backward copy mode */
 	cmp %rdi, %rsi
@@ -42,7 +39,9 @@ SYM_FUNC_START(__memmove)
 	cmp %rdi, %r8
 	jg 2f
 
+	/* FSRM implies ERMS => no length checks, do the copy directly */
 .Lmemmove_begin_forward:
+	ALTERNATIVE "cmp $0x20, %rdx; jb 1f", "", X86_FEATURE_FSRM
 	ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
 
 	/*
@@ -114,6 +113,8 @@ SYM_FUNC_START(__memmove)
 	 */
 	.p2align 4
 2:
+	cmp $0x20, %rdx
+	jb 1f
 	cmp $680, %rdx
 	jb 6f
 	cmp %dil, %sil
--
cgit v1.2.3
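To see the bit that the new X86_FEATURE_FSRM flag mirrors on real
hardware, a small userspace sketch (illustrative only, not part of the
patch) can query CPUID (EAX=7H, ECX=0H).EDX[4] through GCC's cpuid.h
helper:

  /* Query the FSRM enumeration bit from userspace; x86 + GCC/clang only. */
  #include <stdio.h>
  #include <cpuid.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          /* CPUID leaf 7, subleaf 0; EDX bit 4 = FAST_SHORT_REP_MOVSB. */
          if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
                  puts("CPUID leaf 7 not supported");
                  return 1;
          }
          printf("FSRM: %s\n", (edx & (1u << 4)) ? "yes" : "no");
          return 0;
  }

On CPUs where the bit is set, the new ALTERNATIVE in __memmove patches out
the "cmp $0x20, %rdx; jb 1f" length check at boot, so even sub-32-byte
forward copies go straight to REP MOVSB.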
From 2b10906f2d25515bba58070b8183babc89063597 Mon Sep 17 00:00:00 2001
From: Brian Gerst
Date: Thu, 19 Dec 2019 06:58:12 -0500
Subject: x86: Remove force_iret()

force_iret() was originally intended to prevent the return to user mode
with the SYSRET or SYSEXIT instructions, in cases where the register
state could have been changed to be incompatible with those
instructions. The entry code has been significantly reworked since then,
and register state is validated before SYSRET or SYSEXIT are used.
force_iret() no longer serves its original purpose and can be
eliminated.

Signed-off-by: Brian Gerst
Signed-off-by: Borislav Petkov
Acked-by: Oleg Nesterov
Link: https://lkml.kernel.org/r/20191219115812.102620-1-brgerst@gmail.com
---
 arch/x86/ia32/ia32_signal.c        |  2 --
 arch/x86/include/asm/ptrace.h      | 16 ----------------
 arch/x86/include/asm/thread_info.h |  9 ---------
 arch/x86/kernel/process_32.c       |  1 -
 arch/x86/kernel/process_64.c       |  1 -
 arch/x86/kernel/signal.c           |  2 --
 arch/x86/kernel/vm86_32.c          |  1 -
 7 files changed, 32 deletions(-)

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 30416d7f19d4..a3aefe9b9401 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -114,8 +114,6 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
 
 	err |= fpu__restore_sig(buf, 1);
 
-	force_iret();
-
 	return err;
 }

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5057a8ed100b..78897a8da01f 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -339,22 +339,6 @@ static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs,
 
 #define ARCH_HAS_USER_SINGLE_STEP_REPORT
 
-/*
- * When hitting ptrace_stop(), we cannot return using SYSRET because
- * that does not restore the full CPU state, only a minimal set. The
- * ptracer can change arbitrary register values, which is usually okay
- * because the usual ptrace stops run off the signal delivery path which
- * forces IRET; however, ptrace_event() stops happen in arbitrary places
- * in the kernel and don't force IRET path.
- *
- * So force IRET path after a ptrace stop.
- */
-#define arch_ptrace_stop_needed(code, info)	\
-({						\
-	force_iret();				\
-	false;					\
-})
-
 struct user_desc;
 extern int do_get_thread_area(struct task_struct *p, int idx,
			      struct user_desc __user *info);

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d779366ce3f8..cf4327986e98 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -239,15 +239,6 @@ static inline int arch_within_stack_frames(const void * const stack,
 	current_thread_info()->status & TS_COMPAT)
 #endif
 
-/*
- * Force syscall return via IRET by making it look as if there was
- * some work pending. IRET is our most capable (but slowest) syscall
- * return path, which is able to restore modified SS, CS and certain
- * EFLAGS values that other (fast) syscall return instructions
- * are not able to restore properly.
- */
-#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
-
 extern void arch_task_cache_init(void);
 extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
 extern void arch_release_task_struct(struct task_struct *tsk);

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 323499f48858..5052ced43373 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -124,7 +124,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 	regs->ip = new_ip;
 	regs->sp = new_sp;
 	regs->flags = X86_EFLAGS_IF;
-	force_iret();
 }
 EXPORT_SYMBOL_GPL(start_thread);

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 506d66830d4d..ffd497804dbc 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -394,7 +394,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 	regs->cs = _cs;
 	regs->ss = _ss;
 	regs->flags = X86_EFLAGS_IF;
-	force_iret();
 }
 
 void

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 8eb7193e158d..8a29573851a3 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -151,8 +151,6 @@ static int restore_sigcontext(struct pt_regs *regs,
 
 	err |= fpu__restore_sig(buf, IS_ENABLED(CONFIG_X86_32));
 
-	force_iret();
-
 	return err;
 }

diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index a76c12b38e92..91d55454e702 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -381,7 +381,6 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 	mark_screen_rdonly(tsk->mm);
 
 	memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
-	force_iret();
 	return regs->ax;
 }
--
cgit v1.2.3
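For context on the removed macro's mechanism: force_iret() merely set
TIF_NOTIFY_RESUME, because the exit-to-user code falls back from
SYSRET/SYSEXIT to the fully-restoring IRET path whenever any work flag is
pending. A minimal model with made-up flag values, not actual kernel code:

  #include <stdbool.h>

  #define TIF_NOTIFY_RESUME (1u << 1)	/* illustrative value only */

  static unsigned int thread_flags;

  /* The removed macro, modeled: pretend some work is pending. */
  static void force_iret(void)
  {
          thread_flags |= TIF_NOTIFY_RESUME;
  }

  /*
   * Schematic exit-path decision: the fast SYSRET/SYSEXIT return is only
   * taken with no work pending, so any set TIF_ bit diverts to the IRET
   * path, which can restore modified SS, CS and certain EFLAGS values.
   */
  static bool can_use_fast_return(void)
  {
          return thread_flags == 0;
  }

The patch can drop the macro because the reworked entry code now validates
the register state itself before choosing SYSRET or SYSEXIT.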
From b2b1d94cdfd4e906d3936dab2850096a4a0c2017 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Mon, 16 Dec 2019 11:40:03 +0100
Subject: x86/entry/64: Add instruction suffix to SYSRET

ignore_sysret() contains an unsuffixed SYSRET instruction. gas correctly
interprets this as SYSRETL, but leaving it up to gas to guess when there
is no register operand that implies a size is bad practice, and upstream
gas is likely to warn about this in the future. Use SYSRETL explicitly.

This does not change the assembled output.

Signed-off-by: Jan Beulich
Signed-off-by: Borislav Petkov
Acked-by: Andy Lutomirski
Link: https://lkml.kernel.org/r/038a7c35-062b-a285-c6d2-653b56585844@suse.com
---
 arch/x86/entry/entry_64.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 76942cbd95a1..f2bb91e87877 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1728,7 +1728,7 @@ SYM_CODE_END(nmi)
 
 SYM_CODE_START(ignore_sysret)
 	UNWIND_HINT_EMPTY
 	mov $-ENOSYS, %eax
-	sysret
+	sysretl
 SYM_CODE_END(ignore_sysret)
 #endif
--
cgit v1.2.3
From 183ef7adf4ed638ac0fb0c3c9a71fc00e8512b61 Mon Sep 17 00:00:00 2001
From: Arvind Sankar
Date: Tue, 7 Jan 2020 14:44:34 -0500
Subject: x86/boot: Simplify calculation of output address

Condense the calculation of the decompressed kernel start address a
little.

Committer notes:

before:

	ebp = ebx - (init_size - _end)

after:

	eax = (ebx + _end) - init_size

where in both cases ebx contains the temporary address the kernel is
moved to for in-place decompression.

The before and after difference in register state is %eax and %ebp, but
that is immaterial because the compressed image is not built with
-mregparm, i.e., all arguments of the following extract_kernel() call
are passed on the stack.

Signed-off-by: Arvind Sankar
Signed-off-by: Borislav Petkov
Link: https://lkml.kernel.org/r/20200107194436.2166846-1-nivedita@alum.mit.edu
---
 arch/x86/boot/compressed/head_32.S | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index f2dfd6d083ef..1cc55c79d1d0 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -240,11 +240,9 @@ SYM_FUNC_START_LOCAL_NOALIGN(.Lrelocated)
 /* push arguments for extract_kernel: */
 	pushl $z_output_len	/* decompressed length, end of relocs */
 
-	movl BP_init_size(%esi), %eax
-	subl $_end, %eax
-	movl %ebx, %ebp
-	subl %eax, %ebp
-	pushl %ebp		/* output address */
+	leal _end(%ebx), %eax
+	subl BP_init_size(%esi), %eax
+	pushl %eax		/* output address */
 
 	pushl $z_input_len	/* input_len */
 	leal input_data(%ebx), %eax
--
cgit v1.2.3
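The committer notes in the last patch assert that the old and new
instruction sequences compute the same output address. A throwaway C check
with arbitrary example values (the real inputs come from boot_params and
the linker script) confirms the algebra, wraparound included:

  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint32_t ebx = 0x01000000;       /* example relocation base */
          uint32_t init_size = 0x00900000; /* example BP_init_size */
          uint32_t end_off = 0x00800000;   /* example $_end */

          uint32_t old_addr = ebx - (init_size - end_off); /* movl/subl pairs */
          uint32_t new_addr = (ebx + end_off) - init_size; /* leal + subl */

          assert(old_addr == new_addr);
          printf("output address: %#x\n", new_addr);
          return 0;
  }

Both forms reduce to ebx + _end - init_size modulo 2^32, which is why the
shorter leal/subl sequence is safe and frees up %ebp.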