diff options
Diffstat (limited to 'arch/x86/mm/fault.c')
| -rw-r--r-- | arch/x86/mm/fault.c | 480 | 
1 files changed, 265 insertions, 215 deletions
| diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index b9123c497e0a..b24eb4eb9984 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -16,6 +16,7 @@  #include <linux/prefetch.h>		/* prefetchw			*/  #include <linux/context_tracking.h>	/* exception_enter(), ...	*/  #include <linux/uaccess.h>		/* faulthandler_disabled()	*/ +#include <linux/efi.h>			/* efi_recover_from_page_fault()*/  #include <linux/mm_types.h>  #include <asm/cpufeature.h>		/* boot_cpu_has, ...		*/ @@ -25,6 +26,7 @@  #include <asm/vsyscall.h>		/* emulate_vsyscall		*/  #include <asm/vm86.h>			/* struct vm86			*/  #include <asm/mmu_context.h>		/* vma_pkey()			*/ +#include <asm/efi.h>			/* efi_recover_from_page_fault()*/  #define CREATE_TRACE_POINTS  #include <asm/trace/exceptions.h> @@ -44,17 +46,19 @@ kmmio_fault(struct pt_regs *regs, unsigned long addr)  static nokprobe_inline int kprobes_fault(struct pt_regs *regs)  { -	int ret = 0; - -	/* kprobe_running() needs smp_processor_id() */ -	if (kprobes_built_in() && !user_mode(regs)) { -		preempt_disable(); -		if (kprobe_running() && kprobe_fault_handler(regs, 14)) -			ret = 1; -		preempt_enable(); -	} - -	return ret; +	if (!kprobes_built_in()) +		return 0; +	if (user_mode(regs)) +		return 0; +	/* +	 * To be potentially processing a kprobe fault and to be allowed to call +	 * kprobe_running(), we have to be non-preemptible. +	 */ +	if (preemptible()) +		return 0; +	if (!kprobe_running()) +		return 0; +	return kprobe_fault_handler(regs, X86_TRAP_PF);  }  /* @@ -153,79 +157,6 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)  	return prefetch;  } -/* - * A protection key fault means that the PKRU value did not allow - * access to some PTE.  Userspace can figure out what PKRU was - * from the XSAVE state, and this function fills out a field in - * siginfo so userspace can discover which protection key was set - * on the PTE. - * - * If we get here, we know that the hardware signaled a X86_PF_PK - * fault and that there was a VMA once we got in the fault - * handler.  It does *not* guarantee that the VMA we find here - * was the one that we faulted on. - * - * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4); - * 2. T1   : set PKRU to deny access to pkey=4, touches page - * 3. T1   : faults... - * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5); - * 5. T1   : enters fault handler, takes mmap_sem, etc... - * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really - *	     faulted on a pte with its pkey=4. - */ -static void fill_sig_info_pkey(int si_signo, int si_code, siginfo_t *info, -		u32 *pkey) -{ -	/* This is effectively an #ifdef */ -	if (!boot_cpu_has(X86_FEATURE_OSPKE)) -		return; - -	/* Fault not from Protection Keys: nothing to do */ -	if ((si_code != SEGV_PKUERR) || (si_signo != SIGSEGV)) -		return; -	/* -	 * force_sig_info_fault() is called from a number of -	 * contexts, some of which have a VMA and some of which -	 * do not.  The X86_PF_PK handing happens after we have a -	 * valid VMA, so we should never reach this without a -	 * valid VMA. -	 */ -	if (!pkey) { -		WARN_ONCE(1, "PKU fault with no VMA passed in"); -		info->si_pkey = 0; -		return; -	} -	/* -	 * si_pkey should be thought of as a strong hint, but not -	 * absolutely guranteed to be 100% accurate because of -	 * the race explained above. -	 */ -	info->si_pkey = *pkey; -} - -static void -force_sig_info_fault(int si_signo, int si_code, unsigned long address, -		     struct task_struct *tsk, u32 *pkey, int fault) -{ -	unsigned lsb = 0; -	siginfo_t info; - -	clear_siginfo(&info); -	info.si_signo	= si_signo; -	info.si_errno	= 0; -	info.si_code	= si_code; -	info.si_addr	= (void __user *)address; -	if (fault & VM_FAULT_HWPOISON_LARGE) -		lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));  -	if (fault & VM_FAULT_HWPOISON) -		lsb = PAGE_SHIFT; -	info.si_addr_lsb = lsb; - -	fill_sig_info_pkey(si_signo, si_code, &info, pkey); - -	force_sig_info(si_signo, &info, tsk); -} -  DEFINE_SPINLOCK(pgd_lock);  LIST_HEAD(pgd_list); @@ -709,7 +640,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,  	int sig;  	/* Are we prepared to handle this kernel fault? */ -	if (fixup_exception(regs, X86_TRAP_PF)) { +	if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {  		/*  		 * Any interrupt that takes a fault gets the fixup. This makes  		 * the below recursive fault logic only apply to a faults from @@ -730,8 +661,8 @@ no_context(struct pt_regs *regs, unsigned long error_code,  			tsk->thread.cr2 = address;  			/* XXX: hwpoison faults will set the wrong code. */ -			force_sig_info_fault(signal, si_code, address, -					     tsk, NULL, 0); +			force_sig_fault(signal, si_code, (void __user *)address, +					tsk);  		}  		/* @@ -789,6 +720,13 @@ no_context(struct pt_regs *regs, unsigned long error_code,  		return;  	/* +	 * Buggy firmware could access regions which might page fault, try to +	 * recover from such faults. +	 */ +	if (IS_ENABLED(CONFIG_EFI)) +		efi_recover_from_page_fault(address); + +	/*  	 * Oops. The kernel tried to access some bad page. We'll have to  	 * terminate things with extreme prejudice:  	 */ @@ -837,12 +775,21 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,  	printk(KERN_CONT "\n"); -	show_opcodes((u8 *)regs->ip, loglvl); +	show_opcodes(regs, loglvl); +} + +/* + * The (legacy) vsyscall page is the long page in the kernel portion + * of the address space that has user-accessible permissions. + */ +static bool is_vsyscall_vaddr(unsigned long vaddr) +{ +	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);  }  static void  __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, -		       unsigned long address, u32 *pkey, int si_code) +		       unsigned long address, u32 pkey, int si_code)  {  	struct task_struct *tsk = current; @@ -863,18 +810,6 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,  		if (is_errata100(regs, address))  			return; -#ifdef CONFIG_X86_64 -		/* -		 * Instruction fetch faults in the vsyscall page might need -		 * emulation. -		 */ -		if (unlikely((error_code & X86_PF_INSTR) && -			     ((address & ~0xfff) == VSYSCALL_ADDR))) { -			if (emulate_vsyscall(regs, address)) -				return; -		} -#endif -  		/*  		 * To avoid leaking information about the kernel page table  		 * layout, pretend that user-mode accesses to kernel addresses @@ -890,7 +825,10 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,  		tsk->thread.error_code	= error_code;  		tsk->thread.trap_nr	= X86_TRAP_PF; -		force_sig_info_fault(SIGSEGV, si_code, address, tsk, pkey, 0); +		if (si_code == SEGV_PKUERR) +			force_sig_pkuerr((void __user *)address, pkey); + +		force_sig_fault(SIGSEGV, si_code, (void __user *)address, tsk);  		return;  	} @@ -903,35 +841,29 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,  static noinline void  bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, -		     unsigned long address, u32 *pkey) +		     unsigned long address)  { -	__bad_area_nosemaphore(regs, error_code, address, pkey, SEGV_MAPERR); +	__bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);  }  static void  __bad_area(struct pt_regs *regs, unsigned long error_code, -	   unsigned long address,  struct vm_area_struct *vma, int si_code) +	   unsigned long address, u32 pkey, int si_code)  {  	struct mm_struct *mm = current->mm; -	u32 pkey; - -	if (vma) -		pkey = vma_pkey(vma); -  	/*  	 * Something tried to access memory that isn't in our memory map..  	 * Fix it, but check if it's kernel or user first..  	 */  	up_read(&mm->mmap_sem); -	__bad_area_nosemaphore(regs, error_code, address, -			       (vma) ? &pkey : NULL, si_code); +	__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);  }  static noinline void  bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)  { -	__bad_area(regs, error_code, address, NULL, SEGV_MAPERR); +	__bad_area(regs, error_code, address, 0, SEGV_MAPERR);  }  static inline bool bad_area_access_from_pkeys(unsigned long error_code, @@ -960,18 +892,40 @@ bad_area_access_error(struct pt_regs *regs, unsigned long error_code,  	 * But, doing it this way allows compiler optimizations  	 * if pkeys are compiled out.  	 */ -	if (bad_area_access_from_pkeys(error_code, vma)) -		__bad_area(regs, error_code, address, vma, SEGV_PKUERR); -	else -		__bad_area(regs, error_code, address, vma, SEGV_ACCERR); +	if (bad_area_access_from_pkeys(error_code, vma)) { +		/* +		 * A protection key fault means that the PKRU value did not allow +		 * access to some PTE.  Userspace can figure out what PKRU was +		 * from the XSAVE state.  This function captures the pkey from +		 * the vma and passes it to userspace so userspace can discover +		 * which protection key was set on the PTE. +		 * +		 * If we get here, we know that the hardware signaled a X86_PF_PK +		 * fault and that there was a VMA once we got in the fault +		 * handler.  It does *not* guarantee that the VMA we find here +		 * was the one that we faulted on. +		 * +		 * 1. T1   : mprotect_key(foo, PAGE_SIZE, pkey=4); +		 * 2. T1   : set PKRU to deny access to pkey=4, touches page +		 * 3. T1   : faults... +		 * 4.    T2: mprotect_key(foo, PAGE_SIZE, pkey=5); +		 * 5. T1   : enters fault handler, takes mmap_sem, etc... +		 * 6. T1   : reaches here, sees vma_pkey(vma)=5, when we really +		 *	     faulted on a pte with its pkey=4. +		 */ +		u32 pkey = vma_pkey(vma); + +		__bad_area(regs, error_code, address, pkey, SEGV_PKUERR); +	} else { +		__bad_area(regs, error_code, address, 0, SEGV_ACCERR); +	}  }  static void  do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, -	  u32 *pkey, unsigned int fault) +	  unsigned int fault)  {  	struct task_struct *tsk = current; -	int code = BUS_ADRERR;  	/* Kernel mode? Handle exceptions or die: */  	if (!(error_code & X86_PF_USER)) { @@ -989,18 +943,25 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,  #ifdef CONFIG_MEMORY_FAILURE  	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { -		printk(KERN_ERR +		unsigned lsb = 0; + +		pr_err(  	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",  			tsk->comm, tsk->pid, address); -		code = BUS_MCEERR_AR; +		if (fault & VM_FAULT_HWPOISON_LARGE) +			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); +		if (fault & VM_FAULT_HWPOISON) +			lsb = PAGE_SHIFT; +		force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb, tsk); +		return;  	}  #endif -	force_sig_info_fault(SIGBUS, code, address, tsk, pkey, fault); +	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address, tsk);  }  static noinline void  mm_fault_error(struct pt_regs *regs, unsigned long error_code, -	       unsigned long address, u32 *pkey, vm_fault_t fault) +	       unsigned long address, vm_fault_t fault)  {  	if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {  		no_context(regs, error_code, address, 0, 0); @@ -1024,27 +985,21 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,  	} else {  		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|  			     VM_FAULT_HWPOISON_LARGE)) -			do_sigbus(regs, error_code, address, pkey, fault); +			do_sigbus(regs, error_code, address, fault);  		else if (fault & VM_FAULT_SIGSEGV) -			bad_area_nosemaphore(regs, error_code, address, pkey); +			bad_area_nosemaphore(regs, error_code, address);  		else  			BUG();  	}  } -static int spurious_fault_check(unsigned long error_code, pte_t *pte) +static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)  {  	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))  		return 0;  	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))  		return 0; -	/* -	 * Note: We do not do lazy flushing on protection key -	 * changes, so no spurious fault will ever set X86_PF_PK. -	 */ -	if ((error_code & X86_PF_PK)) -		return 1;  	return 1;  } @@ -1071,7 +1026,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)   * (Optional Invalidation).   */  static noinline int -spurious_fault(unsigned long error_code, unsigned long address) +spurious_kernel_fault(unsigned long error_code, unsigned long address)  {  	pgd_t *pgd;  	p4d_t *p4d; @@ -1102,27 +1057,27 @@ spurious_fault(unsigned long error_code, unsigned long address)  		return 0;  	if (p4d_large(*p4d)) -		return spurious_fault_check(error_code, (pte_t *) p4d); +		return spurious_kernel_fault_check(error_code, (pte_t *) p4d);  	pud = pud_offset(p4d, address);  	if (!pud_present(*pud))  		return 0;  	if (pud_large(*pud)) -		return spurious_fault_check(error_code, (pte_t *) pud); +		return spurious_kernel_fault_check(error_code, (pte_t *) pud);  	pmd = pmd_offset(pud, address);  	if (!pmd_present(*pmd))  		return 0;  	if (pmd_large(*pmd)) -		return spurious_fault_check(error_code, (pte_t *) pmd); +		return spurious_kernel_fault_check(error_code, (pte_t *) pmd);  	pte = pte_offset_kernel(pmd, address);  	if (!pte_present(*pte))  		return 0; -	ret = spurious_fault_check(error_code, pte); +	ret = spurious_kernel_fault_check(error_code, pte);  	if (!ret)  		return 0; @@ -1130,12 +1085,12 @@ spurious_fault(unsigned long error_code, unsigned long address)  	 * Make sure we have permissions in PMD.  	 * If not, then there's a bug in the page tables:  	 */ -	ret = spurious_fault_check(error_code, (pte_t *) pmd); +	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);  	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");  	return ret;  } -NOKPROBE_SYMBOL(spurious_fault); +NOKPROBE_SYMBOL(spurious_kernel_fault);  int show_unhandled_signals = 1; @@ -1182,6 +1137,14 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)  static int fault_in_kernel_space(unsigned long address)  { +	/* +	 * On 64-bit systems, the vsyscall page is at an address above +	 * TASK_SIZE_MAX, but is not considered part of the kernel +	 * address space. +	 */ +	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address)) +		return false; +  	return address >= TASK_SIZE_MAX;  } @@ -1203,31 +1166,23 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)  }  /* - * This routine handles page faults.  It determines the address, - * and the problem, and then passes it off to one of the appropriate - * routines. + * Called for all faults where 'address' is part of the kernel address + * space.  Might get called for faults that originate from *code* that + * ran in userspace or the kernel.   */ -static noinline void -__do_page_fault(struct pt_regs *regs, unsigned long error_code, -		unsigned long address) +static void +do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code, +		   unsigned long address)  { -	struct vm_area_struct *vma; -	struct task_struct *tsk; -	struct mm_struct *mm; -	vm_fault_t fault, major = 0; -	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; -	u32 pkey; - -	tsk = current; -	mm = tsk->mm; - -	prefetchw(&mm->mmap_sem); - -	if (unlikely(kmmio_fault(regs, address))) -		return; +	/* +	 * Protection keys exceptions only happen on user pages.  We +	 * have no user pages in the kernel portion of the address +	 * space, so do not expect them here. +	 */ +	WARN_ON_ONCE(hw_error_code & X86_PF_PK);  	/* -	 * We fault-in kernel-space virtual memory on-demand. The +	 * We can fault-in kernel-space virtual memory on-demand. The  	 * 'reference' page table is init_mm.pgd.  	 *  	 * NOTE! We MUST NOT take any locks for this case. We may @@ -1235,41 +1190,73 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,  	 * only copy the information from the master page table,  	 * nothing more.  	 * -	 * This verifies that the fault happens in kernel space -	 * (error_code & 4) == 0, and that the fault was not a -	 * protection error (error_code & 9) == 0. +	 * Before doing this on-demand faulting, ensure that the +	 * fault is not any of the following: +	 * 1. A fault on a PTE with a reserved bit set. +	 * 2. A fault caused by a user-mode access.  (Do not demand- +	 *    fault kernel memory due to user-mode accesses). +	 * 3. A fault caused by a page-level protection violation. +	 *    (A demand fault would be on a non-present page which +	 *     would have X86_PF_PROT==0).  	 */ -	if (unlikely(fault_in_kernel_space(address))) { -		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { -			if (vmalloc_fault(address) >= 0) -				return; -		} - -		/* Can handle a stale RO->RW TLB: */ -		if (spurious_fault(error_code, address)) +	if (!(hw_error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { +		if (vmalloc_fault(address) >= 0)  			return; +	} -		/* kprobes don't want to hook the spurious faults: */ -		if (kprobes_fault(regs)) -			return; -		/* -		 * Don't take the mm semaphore here. If we fixup a prefetch -		 * fault we could otherwise deadlock: -		 */ -		bad_area_nosemaphore(regs, error_code, address, NULL); +	/* Was the fault spurious, caused by lazy TLB invalidation? */ +	if (spurious_kernel_fault(hw_error_code, address)) +		return; +	/* kprobes don't want to hook the spurious faults: */ +	if (kprobes_fault(regs))  		return; -	} + +	/* +	 * Note, despite being a "bad area", there are quite a few +	 * acceptable reasons to get here, such as erratum fixups +	 * and handling kernel code that can fault, like get_user(). +	 * +	 * Don't take the mm semaphore here. If we fixup a prefetch +	 * fault we could otherwise deadlock: +	 */ +	bad_area_nosemaphore(regs, hw_error_code, address); +} +NOKPROBE_SYMBOL(do_kern_addr_fault); + +/* Handle faults in the user portion of the address space */ +static inline +void do_user_addr_fault(struct pt_regs *regs, +			unsigned long hw_error_code, +			unsigned long address) +{ +	unsigned long sw_error_code; +	struct vm_area_struct *vma; +	struct task_struct *tsk; +	struct mm_struct *mm; +	vm_fault_t fault, major = 0; +	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + +	tsk = current; +	mm = tsk->mm;  	/* kprobes don't want to hook the spurious faults: */  	if (unlikely(kprobes_fault(regs)))  		return; -	if (unlikely(error_code & X86_PF_RSVD)) -		pgtable_bad(regs, error_code, address); +	/* +	 * Reserved bits are never expected to be set on +	 * entries in the user portion of the page tables. +	 */ +	if (unlikely(hw_error_code & X86_PF_RSVD)) +		pgtable_bad(regs, hw_error_code, address); -	if (unlikely(smap_violation(error_code, regs))) { -		bad_area_nosemaphore(regs, error_code, address, NULL); +	/* +	 * Check for invalid kernel (supervisor) access to user +	 * pages in the user address space. +	 */ +	if (unlikely(smap_violation(hw_error_code, regs))) { +		bad_area_nosemaphore(regs, hw_error_code, address);  		return;  	} @@ -1278,11 +1265,18 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,  	 * in a region with pagefaults disabled then we must not take the fault  	 */  	if (unlikely(faulthandler_disabled() || !mm)) { -		bad_area_nosemaphore(regs, error_code, address, NULL); +		bad_area_nosemaphore(regs, hw_error_code, address);  		return;  	}  	/* +	 * hw_error_code is literally the "page fault error code" passed to +	 * the kernel directly from the hardware.  But, we will shortly be +	 * modifying it in software, so give it a new name. +	 */ +	sw_error_code = hw_error_code; + +	/*  	 * It's safe to allow irq's after cr2 has been saved and the  	 * vmalloc fault has been handled.  	 * @@ -1291,7 +1285,26 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,  	 */  	if (user_mode(regs)) {  		local_irq_enable(); -		error_code |= X86_PF_USER; +		/* +		 * Up to this point, X86_PF_USER set in hw_error_code +		 * indicated a user-mode access.  But, after this, +		 * X86_PF_USER in sw_error_code will indicate either +		 * that, *or* an implicit kernel(supervisor)-mode access +		 * which originated from user mode. +		 */ +		if (!(hw_error_code & X86_PF_USER)) { +			/* +			 * The CPU was in user mode, but the CPU says +			 * the fault was not a user-mode access. +			 * Must be an implicit kernel-mode access, +			 * which we do not expect to happen in the +			 * user address space. +			 */ +			pr_warn_once("kernel-mode error from user-mode: %lx\n", +					hw_error_code); + +			sw_error_code |= X86_PF_USER; +		}  		flags |= FAULT_FLAG_USER;  	} else {  		if (regs->flags & X86_EFLAGS_IF) @@ -1300,31 +1313,49 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,  	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address); -	if (error_code & X86_PF_WRITE) +	if (sw_error_code & X86_PF_WRITE)  		flags |= FAULT_FLAG_WRITE; -	if (error_code & X86_PF_INSTR) +	if (sw_error_code & X86_PF_INSTR)  		flags |= FAULT_FLAG_INSTRUCTION; +#ifdef CONFIG_X86_64 +	/* +	 * Instruction fetch faults in the vsyscall page might need +	 * emulation.  The vsyscall page is at a high address +	 * (>PAGE_OFFSET), but is considered to be part of the user +	 * address space. +	 * +	 * The vsyscall page does not have a "real" VMA, so do this +	 * emulation before we go searching for VMAs. +	 */ +	if ((sw_error_code & X86_PF_INSTR) && is_vsyscall_vaddr(address)) { +		if (emulate_vsyscall(regs, address)) +			return; +	} +#endif +  	/* -	 * When running in the kernel we expect faults to occur only to -	 * addresses in user space.  All other faults represent errors in -	 * the kernel and should generate an OOPS.  Unfortunately, in the -	 * case of an erroneous fault occurring in a code path which already -	 * holds mmap_sem we will deadlock attempting to validate the fault -	 * against the address space.  Luckily the kernel only validly -	 * references user space from well defined areas of code, which are -	 * listed in the exceptions table. +	 * Kernel-mode access to the user address space should only occur +	 * on well-defined single instructions listed in the exception +	 * tables.  But, an erroneous kernel fault occurring outside one of +	 * those areas which also holds mmap_sem might deadlock attempting +	 * to validate the fault against the address space.  	 * -	 * As the vast majority of faults will be valid we will only perform -	 * the source reference check when there is a possibility of a -	 * deadlock. Attempt to lock the address space, if we cannot we then -	 * validate the source. If this is invalid we can skip the address -	 * space check, thus avoiding the deadlock: +	 * Only do the expensive exception table search when we might be at +	 * risk of a deadlock.  This happens if we +	 * 1. Failed to acquire mmap_sem, and +	 * 2. The access did not originate in userspace.  Note: either the +	 *    hardware or earlier page fault code may set X86_PF_USER +	 *    in sw_error_code.  	 */  	if (unlikely(!down_read_trylock(&mm->mmap_sem))) { -		if (!(error_code & X86_PF_USER) && +		if (!(sw_error_code & X86_PF_USER) &&  		    !search_exception_tables(regs->ip)) { -			bad_area_nosemaphore(regs, error_code, address, NULL); +			/* +			 * Fault from code in kernel from +			 * which we do not expect faults. +			 */ +			bad_area_nosemaphore(regs, sw_error_code, address);  			return;  		}  retry: @@ -1340,16 +1371,16 @@ retry:  	vma = find_vma(mm, address);  	if (unlikely(!vma)) { -		bad_area(regs, error_code, address); +		bad_area(regs, sw_error_code, address);  		return;  	}  	if (likely(vma->vm_start <= address))  		goto good_area;  	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) { -		bad_area(regs, error_code, address); +		bad_area(regs, sw_error_code, address);  		return;  	} -	if (error_code & X86_PF_USER) { +	if (sw_error_code & X86_PF_USER) {  		/*  		 * Accessing the stack below %sp is always a bug.  		 * The large cushion allows instructions like enter @@ -1357,12 +1388,12 @@ retry:  		 * 32 pointers and then decrements %sp by 65535.)  		 */  		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) { -			bad_area(regs, error_code, address); +			bad_area(regs, sw_error_code, address);  			return;  		}  	}  	if (unlikely(expand_stack(vma, address))) { -		bad_area(regs, error_code, address); +		bad_area(regs, sw_error_code, address);  		return;  	} @@ -1371,8 +1402,8 @@ retry:  	 * we can handle it..  	 */  good_area: -	if (unlikely(access_error(error_code, vma))) { -		bad_area_access_error(regs, error_code, address, vma); +	if (unlikely(access_error(sw_error_code, vma))) { +		bad_area_access_error(regs, sw_error_code, address, vma);  		return;  	} @@ -1388,10 +1419,7 @@ good_area:  	 * (potentially after handling any pending signal during the return to  	 * userland). The return to userland is identified whenever  	 * FAULT_FLAG_USER|FAULT_FLAG_KILLABLE are both set in flags. -	 * Thus we have to be careful about not touching vma after handling the -	 * fault, so we read the pkey beforehand.  	 */ -	pkey = vma_pkey(vma);  	fault = handle_mm_fault(vma, address, flags);  	major |= fault & VM_FAULT_MAJOR; @@ -1414,13 +1442,13 @@ good_area:  			return;  		/* Not returning to user mode? Handle exceptions or die: */ -		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); +		no_context(regs, sw_error_code, address, SIGBUS, BUS_ADRERR);  		return;  	}  	up_read(&mm->mmap_sem);  	if (unlikely(fault & VM_FAULT_ERROR)) { -		mm_fault_error(regs, error_code, address, &pkey, fault); +		mm_fault_error(regs, sw_error_code, address, fault);  		return;  	} @@ -1438,6 +1466,28 @@ good_area:  	check_v8086_mode(regs, address, tsk);  } +NOKPROBE_SYMBOL(do_user_addr_fault); + +/* + * This routine handles page faults.  It determines the address, + * and the problem, and then passes it off to one of the appropriate + * routines. + */ +static noinline void +__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code, +		unsigned long address) +{ +	prefetchw(¤t->mm->mmap_sem); + +	if (unlikely(kmmio_fault(regs, address))) +		return; + +	/* Was the fault on kernel-controlled part of the address space? */ +	if (unlikely(fault_in_kernel_space(address))) +		do_kern_addr_fault(regs, hw_error_code, address); +	else +		do_user_addr_fault(regs, hw_error_code, address); +}  NOKPROBE_SYMBOL(__do_page_fault);  static nokprobe_inline void | 
