diff options
Diffstat (limited to 'arch/x86/kvm')
| -rw-r--r-- | arch/x86/kvm/cpuid.h | 2 | ||||
| -rw-r--r-- | arch/x86/kvm/emulate.c | 102 | ||||
| -rw-r--r-- | arch/x86/kvm/ioapic.c | 34 | ||||
| -rw-r--r-- | arch/x86/kvm/lapic.c | 12 | ||||
| -rw-r--r-- | arch/x86/kvm/mmu.c | 27 | ||||
| -rw-r--r-- | arch/x86/kvm/svm.c | 25 | ||||
| -rw-r--r-- | arch/x86/kvm/vmx.c | 110 | ||||
| -rw-r--r-- | arch/x86/kvm/x86.c | 139 | 
8 files changed, 297 insertions, 154 deletions
| diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index cdc70a3a6583..c2cea6651279 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -44,7 +44,7 @@ static const struct cpuid_reg reverse_cpuid[] = {  	[CPUID_8086_0001_EDX] = {0x80860001, 0, CPUID_EDX},  	[CPUID_1_ECX]         = {         1, 0, CPUID_ECX},  	[CPUID_C000_0001_EDX] = {0xc0000001, 0, CPUID_EDX}, -	[CPUID_8000_0001_ECX] = {0xc0000001, 0, CPUID_ECX}, +	[CPUID_8000_0001_ECX] = {0x80000001, 0, CPUID_ECX},  	[CPUID_7_0_EBX]       = {         7, 0, CPUID_EBX},  	[CPUID_D_1_EAX]       = {       0xd, 1, CPUID_EAX},  	[CPUID_F_0_EDX]       = {       0xf, 0, CPUID_EDX}, diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8079d141792a..290ecf711aec 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -25,6 +25,7 @@  #include <asm/kvm_emulate.h>  #include <linux/stringify.h>  #include <asm/debugreg.h> +#include <asm/nospec-branch.h>  #include "x86.h"  #include "tss.h" @@ -1021,8 +1022,8 @@ static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)  	void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);  	flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; -	asm("push %[flags]; popf; call *%[fastop]" -	    : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); +	asm("push %[flags]; popf; " CALL_NOSPEC +	    : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags));  	return rc;  } @@ -1046,7 +1047,6 @@ static void fetch_register_operand(struct operand *op)  static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)  { -	ctxt->ops->get_fpu(ctxt);  	switch (reg) {  	case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;  	case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break; @@ -1068,13 +1068,11 @@ static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)  #endif  	default: BUG();  	} -	ctxt->ops->put_fpu(ctxt);  }  static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,  			  int reg)  
{ -	ctxt->ops->get_fpu(ctxt);  	switch (reg) {  	case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;  	case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break; @@ -1096,12 +1094,10 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,  #endif  	default: BUG();  	} -	ctxt->ops->put_fpu(ctxt);  }  static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)  { -	ctxt->ops->get_fpu(ctxt);  	switch (reg) {  	case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;  	case 1: asm("movq %%mm1, %0" : "=m"(*data)); break; @@ -1113,12 +1109,10 @@ static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)  	case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;  	default: BUG();  	} -	ctxt->ops->put_fpu(ctxt);  }  static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)  { -	ctxt->ops->get_fpu(ctxt);  	switch (reg) {  	case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;  	case 1: asm("movq %0, %%mm1" : : "m"(*data)); break; @@ -1130,7 +1124,6 @@ static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)  	case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;  	default: BUG();  	} -	ctxt->ops->put_fpu(ctxt);  }  static int em_fninit(struct x86_emulate_ctxt *ctxt) @@ -1138,9 +1131,7 @@ static int em_fninit(struct x86_emulate_ctxt *ctxt)  	if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))  		return emulate_nm(ctxt); -	ctxt->ops->get_fpu(ctxt);  	asm volatile("fninit"); -	ctxt->ops->put_fpu(ctxt);  	return X86EMUL_CONTINUE;  } @@ -1151,9 +1142,7 @@ static int em_fnstcw(struct x86_emulate_ctxt *ctxt)  	if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))  		return emulate_nm(ctxt); -	ctxt->ops->get_fpu(ctxt);  	asm volatile("fnstcw %0": "+m"(fcw)); -	ctxt->ops->put_fpu(ctxt);  	ctxt->dst.val = fcw; @@ -1167,9 +1156,7 @@ static int em_fnstsw(struct x86_emulate_ctxt *ctxt)  	if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))  		return emulate_nm(ctxt); -	
ctxt->ops->get_fpu(ctxt);  	asm volatile("fnstsw %0": "+m"(fsw)); -	ctxt->ops->put_fpu(ctxt);  	ctxt->dst.val = fsw; @@ -2404,9 +2391,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n)  }  static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, -				     u64 cr0, u64 cr4) +				    u64 cr0, u64 cr3, u64 cr4)  {  	int bad; +	u64 pcid; + +	/* In order to later set CR4.PCIDE, CR3[11:0] must be zero.  */ +	pcid = 0; +	if (cr4 & X86_CR4_PCIDE) { +		pcid = cr3 & 0xfff; +		cr3 &= ~0xfff; +	} + +	bad = ctxt->ops->set_cr(ctxt, 3, cr3); +	if (bad) +		return X86EMUL_UNHANDLEABLE;  	/*  	 * First enable PAE, long mode needs it before CR0.PG = 1 is set. @@ -2425,6 +2424,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt,  		bad = ctxt->ops->set_cr(ctxt, 4, cr4);  		if (bad)  			return X86EMUL_UNHANDLEABLE; +		if (pcid) { +			bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); +			if (bad) +				return X86EMUL_UNHANDLEABLE; +		} +  	}  	return X86EMUL_CONTINUE; @@ -2435,11 +2440,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)  	struct desc_struct desc;  	struct desc_ptr dt;  	u16 selector; -	u32 val, cr0, cr4; +	u32 val, cr0, cr3, cr4;  	int i;  	cr0 =                      GET_SMSTATE(u32, smbase, 0x7ffc); -	ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); +	cr3 =                      GET_SMSTATE(u32, smbase, 0x7ff8);  	ctxt->eflags =             GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED;  	ctxt->_eip =               GET_SMSTATE(u32, smbase, 0x7ff0); @@ -2481,14 +2486,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase)  	ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); -	return rsm_enter_protected_mode(ctxt, cr0, cr4); +	return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);  }  static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)  {  	struct desc_struct desc;  	struct desc_ptr dt; -	u64 val, cr0, cr4; +	u64 val, 
cr0, cr3, cr4;  	u32 base3;  	u16 selector;  	int i, r; @@ -2505,7 +2510,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)  	ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1);  	cr0 =                       GET_SMSTATE(u64, smbase, 0x7f58); -	ctxt->ops->set_cr(ctxt, 3,  GET_SMSTATE(u64, smbase, 0x7f50)); +	cr3 =                       GET_SMSTATE(u64, smbase, 0x7f50);  	cr4 =                       GET_SMSTATE(u64, smbase, 0x7f48);  	ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00));  	val =                       GET_SMSTATE(u64, smbase, 0x7ed0); @@ -2533,7 +2538,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase)  	dt.address =                GET_SMSTATE(u64, smbase, 0x7e68);  	ctxt->ops->set_gdt(ctxt, &dt); -	r = rsm_enter_protected_mode(ctxt, cr0, cr4); +	r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4);  	if (r != X86EMUL_CONTINUE)  		return r; @@ -4001,12 +4006,8 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)  	if (rc != X86EMUL_CONTINUE)  		return rc; -	ctxt->ops->get_fpu(ctxt); -  	rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); -	ctxt->ops->put_fpu(ctxt); -  	if (rc != X86EMUL_CONTINUE)  		return rc; @@ -4014,6 +4015,26 @@ static int em_fxsave(struct x86_emulate_ctxt *ctxt)  		                   fxstate_size(ctxt));  } +/* + * FXRSTOR might restore XMM registers not provided by the guest. Fill + * in the host registers (via FXSAVE) instead, so they won't be modified. + * (preemption has to stay disabled until FXRSTOR). + * + * Use noinline to keep the stack for other functions called by callers small. 
+ */ +static noinline int fxregs_fixup(struct fxregs_state *fx_state, +				 const size_t used_size) +{ +	struct fxregs_state fx_tmp; +	int rc; + +	rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_tmp)); +	memcpy((void *)fx_state + used_size, (void *)&fx_tmp + used_size, +	       __fxstate_size(16) - used_size); + +	return rc; +} +  static int em_fxrstor(struct x86_emulate_ctxt *ctxt)  {  	struct fxregs_state fx_state; @@ -4024,19 +4045,17 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)  	if (rc != X86EMUL_CONTINUE)  		return rc; -	ctxt->ops->get_fpu(ctxt); -  	size = fxstate_size(ctxt); +	rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); +	if (rc != X86EMUL_CONTINUE) +		return rc; +  	if (size < __fxstate_size(16)) { -		rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); +		rc = fxregs_fixup(&fx_state, size);  		if (rc != X86EMUL_CONTINUE)  			goto out;  	} -	rc = segmented_read_std(ctxt, ctxt->memop.addr.mem, &fx_state, size); -	if (rc != X86EMUL_CONTINUE) -		goto out; -  	if (fx_state.mxcsr >> 16) {  		rc = emulate_gp(ctxt, 0);  		goto out; @@ -4046,8 +4065,6 @@ static int em_fxrstor(struct x86_emulate_ctxt *ctxt)  		rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));  out: -	ctxt->ops->put_fpu(ctxt); -  	return rc;  } @@ -5000,6 +5017,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)  	bool op_prefix = false;  	bool has_seg_override = false;  	struct opcode opcode; +	u16 dummy; +	struct desc_struct desc;  	ctxt->memop.type = OP_NONE;  	ctxt->memopp = NULL; @@ -5018,6 +5037,11 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)  	switch (mode) {  	case X86EMUL_MODE_REAL:  	case X86EMUL_MODE_VM86: +		def_op_bytes = def_ad_bytes = 2; +		ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS); +		if (desc.d) +			def_op_bytes = def_ad_bytes = 4; +		break;  	case X86EMUL_MODE_PROT16:  		def_op_bytes = def_ad_bytes = 2;  		break; @@ -5290,9 +5314,7 @@ static int 
flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt)  {  	int rc; -	ctxt->ops->get_fpu(ctxt);  	rc = asm_safe("fwait"); -	ctxt->ops->put_fpu(ctxt);  	if (unlikely(rc != X86EMUL_CONTINUE))  		return emulate_exception(ctxt, MF_VECTOR, 0, false); @@ -5314,9 +5336,9 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *))  	if (!(ctxt->d & ByteOp))  		fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; -	asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" +	asm("push %[flags]; popf; " CALL_NOSPEC " ; pushf; pop %[flags]\n"  	    : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), -	      [fastop]"+S"(fop), ASM_CALL_CONSTRAINT +	      [thunk_target]"+S"(fop), ASM_CALL_CONSTRAINT  	    : "c"(ctxt->src2.val));  	ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index bdff437acbcb..4e822ad363f3 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -209,12 +209,12 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,  	old_irr = ioapic->irr;  	ioapic->irr |= mask; -	if (edge) +	if (edge) {  		ioapic->irr_delivered &= ~mask; -	if ((edge && old_irr == ioapic->irr) || -	    (!edge && entry.fields.remote_irr)) { -		ret = 0; -		goto out; +		if (old_irr == ioapic->irr) { +			ret = 0; +			goto out; +		}  	}  	ret = ioapic_service(ioapic, irq, line_status); @@ -257,8 +257,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors)  		    index == RTC_GSI) {  			if (kvm_apic_match_dest(vcpu, NULL, 0,  			             e->fields.dest_id, e->fields.dest_mode) || -			    (e->fields.trig_mode == IOAPIC_EDGE_TRIG && -			     kvm_apic_pending_eoi(vcpu, e->fields.vector))) +			    kvm_apic_pending_eoi(vcpu, e->fields.vector))  				__set_bit(e->fields.vector,  					  ioapic_handled_vectors);  		} @@ -277,6 +276,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)  {  	unsigned index;  	
bool mask_before, mask_after; +	int old_remote_irr, old_delivery_status;  	union kvm_ioapic_redirect_entry *e;  	switch (ioapic->ioregsel) { @@ -299,14 +299,28 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)  			return;  		e = &ioapic->redirtbl[index];  		mask_before = e->fields.mask; +		/* Preserve read-only fields */ +		old_remote_irr = e->fields.remote_irr; +		old_delivery_status = e->fields.delivery_status;  		if (ioapic->ioregsel & 1) {  			e->bits &= 0xffffffff;  			e->bits |= (u64) val << 32;  		} else {  			e->bits &= ~0xffffffffULL;  			e->bits |= (u32) val; -			e->fields.remote_irr = 0;  		} +		e->fields.remote_irr = old_remote_irr; +		e->fields.delivery_status = old_delivery_status; + +		/* +		 * Some OSes (Linux, Xen) assume that Remote IRR bit will +		 * be cleared by IOAPIC hardware when the entry is configured +		 * as edge-triggered. This behavior is used to simulate an +		 * explicit EOI on IOAPICs that don't have the EOI register. +		 */ +		if (e->fields.trig_mode == IOAPIC_EDGE_TRIG) +			e->fields.remote_irr = 0; +  		mask_after = e->fields.mask;  		if (mask_before != mask_after)  			kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); @@ -324,7 +338,9 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)  	struct kvm_lapic_irq irqe;  	int ret; -	if (entry->fields.mask) +	if (entry->fields.mask || +	    (entry->fields.trig_mode == IOAPIC_LEVEL_TRIG && +	    entry->fields.remote_irr))  		return -1;  	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x " diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 943acbf00c69..e2c1fb8d35ce 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -266,9 +266,14 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)  	recalculate_apic_map(apic->vcpu->kvm);  } +static inline u32 kvm_apic_calc_x2apic_ldr(u32 id) +{ +	return ((id >> 4) << 16) | (1 << (id & 0xf)); +} +  static inline void 
kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)  { -	u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); +	u32 ldr = kvm_apic_calc_x2apic_ldr(id);  	WARN_ON_ONCE(id != apic->vcpu->vcpu_id); @@ -2245,6 +2250,7 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,  {  	if (apic_x2apic_mode(vcpu->arch.apic)) {  		u32 *id = (u32 *)(s->regs + APIC_ID); +		u32 *ldr = (u32 *)(s->regs + APIC_LDR);  		if (vcpu->kvm->arch.x2apic_format) {  			if (*id != vcpu->vcpu_id) @@ -2255,6 +2261,10 @@ static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,  			else  				*id <<= 24;  		} + +		/* In x2APIC mode, the LDR is fixed and based on the id */ +		if (set) +			*ldr = kvm_apic_calc_x2apic_ldr(*id);  	}  	return 0; diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e5e66e5c6640..2b8eb4da4d08 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3395,7 +3395,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)  		spin_lock(&vcpu->kvm->mmu_lock);  		if(make_mmu_pages_available(vcpu) < 0) {  			spin_unlock(&vcpu->kvm->mmu_lock); -			return 1; +			return -ENOSPC;  		}  		sp = kvm_mmu_get_page(vcpu, 0, 0,  				vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL); @@ -3410,7 +3410,7 @@ static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)  			spin_lock(&vcpu->kvm->mmu_lock);  			if (make_mmu_pages_available(vcpu) < 0) {  				spin_unlock(&vcpu->kvm->mmu_lock); -				return 1; +				return -ENOSPC;  			}  			sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),  					i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL); @@ -3450,7 +3450,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)  		spin_lock(&vcpu->kvm->mmu_lock);  		if (make_mmu_pages_available(vcpu) < 0) {  			spin_unlock(&vcpu->kvm->mmu_lock); -			return 1; +			return -ENOSPC;  		}  		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,  				vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL); @@ -3487,7 +3487,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)  		spin_lock(&vcpu->kvm->mmu_lock);  		if 
(make_mmu_pages_available(vcpu) < 0) {  			spin_unlock(&vcpu->kvm->mmu_lock); -			return 1; +			return -ENOSPC;  		}  		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,  				      0, ACC_ALL); @@ -3781,7 +3781,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)  bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)  {  	if (unlikely(!lapic_in_kernel(vcpu) || -		     kvm_event_needs_reinjection(vcpu))) +		     kvm_event_needs_reinjection(vcpu) || +		     vcpu->arch.exception.pending))  		return false;  	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu)) @@ -5465,30 +5466,34 @@ static void mmu_destroy_caches(void)  int kvm_mmu_module_init(void)  { +	int ret = -ENOMEM; +  	kvm_mmu_clear_all_pte_masks();  	pte_list_desc_cache = kmem_cache_create("pte_list_desc",  					    sizeof(struct pte_list_desc),  					    0, SLAB_ACCOUNT, NULL);  	if (!pte_list_desc_cache) -		goto nomem; +		goto out;  	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",  						  sizeof(struct kvm_mmu_page),  						  0, SLAB_ACCOUNT, NULL);  	if (!mmu_page_header_cache) -		goto nomem; +		goto out;  	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) -		goto nomem; +		goto out; -	register_shrinker(&mmu_shrinker); +	ret = register_shrinker(&mmu_shrinker); +	if (ret) +		goto out;  	return 0; -nomem: +out:  	mmu_destroy_caches(); -	return -ENOMEM; +	return ret;  }  /* diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 59e13a79c2e3..f40d0da1f1d3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -45,6 +45,7 @@  #include <asm/debugreg.h>  #include <asm/kvm_para.h>  #include <asm/irq_remapping.h> +#include <asm/nospec-branch.h>  #include <asm/virtext.h>  #include "trace.h" @@ -2197,6 +2198,8 @@ static int ud_interception(struct vcpu_svm *svm)  	int er;  	er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD); +	if (er == EMULATE_USER_EXIT) +		return 0;  	if (er != EMULATE_DONE)  		
kvm_queue_exception(&svm->vcpu, UD_VECTOR);  	return 1; @@ -4977,6 +4980,25 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)  		"mov %%r14, %c[r14](%[svm]) \n\t"  		"mov %%r15, %c[r15](%[svm]) \n\t"  #endif +		/* +		* Clear host registers marked as clobbered to prevent +		* speculative use. +		*/ +		"xor %%" _ASM_BX ", %%" _ASM_BX " \n\t" +		"xor %%" _ASM_CX ", %%" _ASM_CX " \n\t" +		"xor %%" _ASM_DX ", %%" _ASM_DX " \n\t" +		"xor %%" _ASM_SI ", %%" _ASM_SI " \n\t" +		"xor %%" _ASM_DI ", %%" _ASM_DI " \n\t" +#ifdef CONFIG_X86_64 +		"xor %%r8, %%r8 \n\t" +		"xor %%r9, %%r9 \n\t" +		"xor %%r10, %%r10 \n\t" +		"xor %%r11, %%r11 \n\t" +		"xor %%r12, %%r12 \n\t" +		"xor %%r13, %%r13 \n\t" +		"xor %%r14, %%r14 \n\t" +		"xor %%r15, %%r15 \n\t" +#endif  		"pop %%" _ASM_BP  		:  		: [svm]"a"(svm), @@ -5006,6 +5028,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)  #endif  		); +	/* Eliminate branch target predictions from guest mode */ +	vmexit_fill_RSB(); +  #ifdef CONFIG_X86_64  	wrmsrl(MSR_GS_BASE, svm->host.gs_base);  #else diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 714a0673ec3c..a8b96dc4cd83 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -50,6 +50,7 @@  #include <asm/apic.h>  #include <asm/irq_remapping.h>  #include <asm/mmu_context.h> +#include <asm/nospec-branch.h>  #include "trace.h"  #include "pmu.h" @@ -899,8 +900,16 @@ static inline short vmcs_field_to_offset(unsigned long field)  {  	BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); -	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || -	    vmcs_field_to_offset_table[field] == 0) +	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) +		return -ENOENT; + +	/* +	 * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a +	 * generic mechanism. 
+	 */ +	asm("lfence"); + +	if (vmcs_field_to_offset_table[field] == 0)  		return -ENOENT;  	return vmcs_field_to_offset_table[field]; @@ -2300,7 +2309,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)  		 * processors.  See 22.2.4.  		 */  		vmcs_writel(HOST_TR_BASE, -			    (unsigned long)this_cpu_ptr(&cpu_tss)); +			    (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);  		vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt);   /* 22.2.4 */  		/* @@ -5600,7 +5609,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)  		vmcs_write64(GUEST_IA32_DEBUGCTL, 0);  	} -	vmcs_writel(GUEST_RFLAGS, 0x02); +	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);  	kvm_rip_write(vcpu, 0xfff0);  	vmcs_writel(GUEST_GDTR_BASE, 0); @@ -5915,11 +5924,9 @@ static int handle_exception(struct kvm_vcpu *vcpu)  		return 1;  /* already handled by vmx_vcpu_run() */  	if (is_invalid_opcode(intr_info)) { -		if (is_guest_mode(vcpu)) { -			kvm_queue_exception(vcpu, UD_VECTOR); -			return 1; -		}  		er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD); +		if (er == EMULATE_USER_EXIT) +			return 0;  		if (er != EMULATE_DONE)  			kvm_queue_exception(vcpu, UD_VECTOR);  		return 1; @@ -6602,7 +6609,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)  		if (kvm_test_request(KVM_REQ_EVENT, vcpu))  			return 1; -		err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); +		err = emulate_instruction(vcpu, 0);  		if (err == EMULATE_USER_EXIT) {  			++vcpu->stat.mmio_exits; @@ -6750,16 +6757,10 @@ static __init int hardware_setup(void)  			goto out;  	} -	vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);  	memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE);  	memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); -	/* -	 * Allow direct access to the PC debug port (it is often used for I/O -	 * delays, but the vmexits simply slow things down). 
-	 */  	memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE); -	clear_bit(0x80, vmx_io_bitmap_a);  	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE); @@ -7414,10 +7415,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)   */  static void free_nested(struct vcpu_vmx *vmx)  { -	if (!vmx->nested.vmxon) +	if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)  		return;  	vmx->nested.vmxon = false; +	vmx->nested.smm.vmxon = false;  	free_vpid(vmx->nested.vpid02);  	vmx->nested.posted_intr_nv = -1;  	vmx->nested.current_vmptr = -1ull; @@ -9127,14 +9129,14 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)  #endif  			"pushf\n\t"  			__ASM_SIZE(push) " $%c[cs]\n\t" -			"call *%[entry]\n\t" +			CALL_NOSPEC  			:  #ifdef CONFIG_X86_64  			[sp]"=&r"(tmp),  #endif  			ASM_CALL_CONSTRAINT  			: -			[entry]"r"(entry), +			THUNK_TARGET(entry),  			[ss]"i"(__KERNEL_DS),  			[cs]"i"(__KERNEL_CS)  			); @@ -9419,6 +9421,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)  		/* Save guest registers, load host registers, keep flags */  		"mov %0, %c[wordsize](%%" _ASM_SP ") \n\t"  		"pop %0 \n\t" +		"setbe %c[fail](%0)\n\t"  		"mov %%" _ASM_AX ", %c[rax](%0) \n\t"  		"mov %%" _ASM_BX ", %c[rbx](%0) \n\t"  		__ASM_SIZE(pop) " %c[rcx](%0) \n\t" @@ -9435,12 +9438,23 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)  		"mov %%r13, %c[r13](%0) \n\t"  		"mov %%r14, %c[r14](%0) \n\t"  		"mov %%r15, %c[r15](%0) \n\t" +		"xor %%r8d,  %%r8d \n\t" +		"xor %%r9d,  %%r9d \n\t" +		"xor %%r10d, %%r10d \n\t" +		"xor %%r11d, %%r11d \n\t" +		"xor %%r12d, %%r12d \n\t" +		"xor %%r13d, %%r13d \n\t" +		"xor %%r14d, %%r14d \n\t" +		"xor %%r15d, %%r15d \n\t"  #endif  		"mov %%cr2, %%" _ASM_AX "   \n\t"  		"mov %%" _ASM_AX ", %c[cr2](%0) \n\t" +		"xor %%eax, %%eax \n\t" +		"xor %%ebx, %%ebx \n\t" +		"xor %%esi, %%esi \n\t" +		"xor %%edi, %%edi \n\t"  		"pop  %%" _ASM_BP "; pop  %%" _ASM_DX " \n\t" -		"setbe %c[fail](%0) \n\t"  		".pushsection .rodata \n\t"  		".global vmx_return 
\n\t"  		"vmx_return: " _ASM_PTR " 2b \n\t" @@ -9477,6 +9491,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)  #endif  	      ); +	/* Eliminate branch target predictions from guest mode */ +	vmexit_fill_RSB(); +  	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */  	if (debugctlmsr)  		update_debugctlmsr(debugctlmsr); @@ -9800,8 +9817,7 @@ static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)  	cr4_fixed1_update(X86_CR4_SMEP,       ebx, bit(X86_FEATURE_SMEP));  	cr4_fixed1_update(X86_CR4_SMAP,       ebx, bit(X86_FEATURE_SMAP));  	cr4_fixed1_update(X86_CR4_PKE,        ecx, bit(X86_FEATURE_PKU)); -	/* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */ -	cr4_fixed1_update(bit(11),            ecx, bit(2)); +	cr4_fixed1_update(X86_CR4_UMIP,       ecx, bit(X86_FEATURE_UMIP));  #undef cr4_fixed1_update  } @@ -10875,6 +10891,11 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,  			return 1;  	} +	if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS) && +		(is_noncanonical_address(vmcs12->guest_bndcfgs & PAGE_MASK, vcpu) || +		(vmcs12->guest_bndcfgs & MSR_IA32_BNDCFGS_RSVD))) +			return 1; +  	return 0;  } @@ -11099,13 +11120,12 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)  {  	struct vcpu_vmx *vmx = to_vmx(vcpu);  	unsigned long exit_qual; - -	if (kvm_event_needs_reinjection(vcpu)) -		return -EBUSY; +	bool block_nested_events = +	    vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);  	if (vcpu->arch.exception.pending &&  		nested_vmx_check_exception(vcpu, &exit_qual)) { -		if (vmx->nested.nested_run_pending) +		if (block_nested_events)  			return -EBUSY;  		nested_vmx_inject_exception_vmexit(vcpu, exit_qual);  		vcpu->arch.exception.pending = false; @@ -11114,14 +11134,14 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)  	if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&  	    
vmx->nested.preemption_timer_expired) { -		if (vmx->nested.nested_run_pending) +		if (block_nested_events)  			return -EBUSY;  		nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);  		return 0;  	}  	if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) { -		if (vmx->nested.nested_run_pending) +		if (block_nested_events)  			return -EBUSY;  		nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,  				  NMI_VECTOR | INTR_TYPE_NMI_INTR | @@ -11137,7 +11157,7 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)  	if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&  	    nested_exit_on_intr(vcpu)) { -		if (vmx->nested.nested_run_pending) +		if (block_nested_events)  			return -EBUSY;  		nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);  		return 0; @@ -11324,6 +11344,24 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,  	kvm_clear_interrupt_queue(vcpu);  } +static void load_vmcs12_mmu_host_state(struct kvm_vcpu *vcpu, +			struct vmcs12 *vmcs12) +{ +	u32 entry_failure_code; + +	nested_ept_uninit_mmu_context(vcpu); + +	/* +	 * Only PDPTE load can fail as the value of cr3 was checked on entry and +	 * couldn't have changed. 
+	 */ +	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) +		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); + +	if (!enable_ept) +		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; +} +  /*   * A part of what we need to when the nested L2 guest exits and we want to   * run its L1 parent, is to reset L1's guest state to the host state specified @@ -11337,7 +11375,6 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,  				   struct vmcs12 *vmcs12)  {  	struct kvm_segment seg; -	u32 entry_failure_code;  	if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)  		vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -11364,17 +11401,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,  	vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);  	vmx_set_cr4(vcpu, vmcs12->host_cr4); -	nested_ept_uninit_mmu_context(vcpu); - -	/* -	 * Only PDPTE load can fail as the value of cr3 was checked on entry and -	 * couldn't have changed. -	 */ -	if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) -		nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); - -	if (!enable_ept) -		vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; +	load_vmcs12_mmu_host_state(vcpu, vmcs12);  	if (enable_vpid) {  		/* @@ -11604,6 +11631,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,  	 * accordingly.  	 
*/  	nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); + +	load_vmcs12_mmu_host_state(vcpu, vmcs12); +  	/*  	 * The emulated instruction was already skipped in  	 * nested_vmx_run, but the updated RIP was never diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 34c85aa2e2d1..c53298dfbf50 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -107,6 +107,9 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);  static bool __read_mostly ignore_msrs = 0;  module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); +static bool __read_mostly report_ignored_msrs = true; +module_param(report_ignored_msrs, bool, S_IRUGO | S_IWUSR); +  unsigned int min_timer_period_us = 500;  module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); @@ -1795,10 +1798,13 @@ u64 get_kvmclock_ns(struct kvm *kvm)  	/* both __this_cpu_read() and rdtsc() should be on the same cpu */  	get_cpu(); -	kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, -			   &hv_clock.tsc_shift, -			   &hv_clock.tsc_to_system_mul); -	ret = __pvclock_read_cycles(&hv_clock, rdtsc()); +	if (__this_cpu_read(cpu_tsc_khz)) { +		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL, +				   &hv_clock.tsc_shift, +				   &hv_clock.tsc_to_system_mul); +		ret = __pvclock_read_cycles(&hv_clock, rdtsc()); +	} else +		ret = ktime_get_boot_ns() + ka->kvmclock_offset;  	put_cpu(); @@ -1830,6 +1836,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v)  	 */  	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); +	if (guest_hv_clock.version & 1) +		++guest_hv_clock.version;  /* first time write, random junk */ +  	vcpu->hv_clock.version = guest_hv_clock.version + 1;  	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,  				&vcpu->hv_clock, @@ -2322,7 +2331,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)  		/* Drop writes to this legacy MSR -- see rdmsr  		 * counterpart for further detail.  		 
*/ -		vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", msr, data); +		if (report_ignored_msrs) +			vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", +				msr, data);  		break;  	case MSR_AMD64_OSVW_ID_LENGTH:  		if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) @@ -2359,8 +2370,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)  				    msr, data);  			return 1;  		} else { -			vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n", -				    msr, data); +			if (report_ignored_msrs) +				vcpu_unimpl(vcpu, +					"ignored wrmsr: 0x%x data 0x%llx\n", +					msr, data);  			break;  		}  	} @@ -2578,7 +2591,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)  					       msr_info->index);  			return 1;  		} else { -			vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index); +			if (report_ignored_msrs) +				vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", +					msr_info->index);  			msr_info->data = 0;  		}  		break; @@ -2922,7 +2937,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)  	srcu_read_unlock(&vcpu->kvm->srcu, idx);  	pagefault_enable();  	kvm_x86_ops->vcpu_put(vcpu); -	kvm_put_guest_fpu(vcpu);  	vcpu->arch.last_host_tsc = rdtsc();  } @@ -4370,7 +4384,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)  					 addr, n, v))  		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))  			break; -		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); +		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);  		handled += n;  		addr += n;  		len -= n; @@ -4629,7 +4643,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes)  {  	if (vcpu->mmio_read_completed) {  		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, -			       vcpu->mmio_fragments[0].gpa, *(u64 *)val); +			       vcpu->mmio_fragments[0].gpa, val);  		vcpu->mmio_read_completed = 0;  		return 1;  	} @@ -4651,14 +4665,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,  static int write_mmio(struct 
kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val)  { -	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); +	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);  	return vcpu_mmio_write(vcpu, gpa, bytes, val);  }  static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,  			  void *val, int bytes)  { -	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); +	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);  	return X86EMUL_IO_NEEDED;  } @@ -5237,17 +5251,6 @@ static void emulator_halt(struct x86_emulate_ctxt *ctxt)  	emul_to_vcpu(ctxt)->arch.halt_request = 1;  } -static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) -{ -	preempt_disable(); -	kvm_load_guest_fpu(emul_to_vcpu(ctxt)); -} - -static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) -{ -	preempt_enable(); -} -  static int emulator_intercept(struct x86_emulate_ctxt *ctxt,  			      struct x86_instruction_info *info,  			      enum x86_intercept_stage stage) @@ -5325,8 +5328,6 @@ static const struct x86_emulate_ops emulate_ops = {  	.halt                = emulator_halt,  	.wbinvd              = emulator_wbinvd,  	.fix_hypercall       = emulator_fix_hypercall, -	.get_fpu             = emulator_get_fpu, -	.put_fpu             = emulator_put_fpu,  	.intercept           = emulator_intercept,  	.get_cpuid           = emulator_get_cpuid,  	.set_nmi_mask        = emulator_set_nmi_mask, @@ -5430,7 +5431,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu)  		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;  		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;  		vcpu->run->internal.ndata = 0; -		r = EMULATE_FAIL; +		r = EMULATE_USER_EXIT;  	}  	kvm_queue_exception(vcpu, UD_VECTOR); @@ -5722,6 +5723,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,  			if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,  						emulation_type))  				return EMULATE_DONE; +			if (ctxt->have_exception && inject_emulated_exception(vcpu)) +				return 
EMULATE_DONE;  			if (emulation_type & EMULTYPE_SKIP)  				return EMULATE_FAIL;  			return handle_emulation_failure(vcpu); @@ -6761,6 +6764,20 @@ static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)  	kvm_x86_ops->tlb_flush(vcpu);  } +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, +		unsigned long start, unsigned long end) +{ +	unsigned long apic_address; + +	/* +	 * The physical address of apic access page is stored in the VMCS. +	 * Update it when it becomes invalid. +	 */ +	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); +	if (start <= apic_address && apic_address < end) +		kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); +} +  void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)  {  	struct page *page = NULL; @@ -6935,7 +6952,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)  	preempt_disable();  	kvm_x86_ops->prepare_guest_switch(vcpu); -	kvm_load_guest_fpu(vcpu);  	/*  	 * Disable IRQs before setting IN_GUEST_MODE.  Posted interrupt @@ -7248,14 +7264,11 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)  int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)  { -	struct fpu *fpu = &current->thread.fpu;  	int r; -	sigset_t sigsaved; -	fpu__initialize(fpu); +	kvm_sigset_activate(vcpu); -	if (vcpu->sigset_active) -		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); +	kvm_load_guest_fpu(vcpu);  	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {  		if (kvm_run->immediate_exit) { @@ -7297,9 +7310,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)  		r = vcpu_run(vcpu);  out: +	kvm_put_guest_fpu(vcpu);  	post_kvm_run_save(vcpu); -	if (vcpu->sigset_active) -		sigprocmask(SIG_SETMASK, &sigsaved, NULL); +	kvm_sigset_deactivate(vcpu);  	return r;  } @@ -7367,7 +7380,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)  #endif  	kvm_rip_write(vcpu, regs->rip); -	kvm_set_rflags(vcpu, regs->rflags); +	
kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);  	vcpu->arch.exception.pending = false; @@ -7481,6 +7494,29 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,  }  EXPORT_SYMBOL_GPL(kvm_task_switch); +int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ +	if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { +		/* +		 * When EFER.LME and CR0.PG are set, the processor is in +		 * 64-bit mode (though maybe in a 32-bit code segment). +		 * CR4.PAE and EFER.LMA must be set. +		 */ +		if (!(sregs->cr4 & X86_CR4_PAE) +		    || !(sregs->efer & EFER_LMA)) +			return -EINVAL; +	} else { +		/* +		 * Not in 64-bit mode: EFER.LMA is clear and the code +		 * segment cannot be 64-bit. +		 */ +		if (sregs->efer & EFER_LMA || sregs->cs.l) +			return -EINVAL; +	} + +	return 0; +} +  int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,  				  struct kvm_sregs *sregs)  { @@ -7493,6 +7529,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,  			(sregs->cr4 & X86_CR4_OSXSAVE))  		return -EINVAL; +	if (kvm_valid_sregs(vcpu, sregs)) +		return -EINVAL; +  	apic_base_msr.data = sregs->apic_base;  	apic_base_msr.host_initiated = true;  	if (kvm_set_apic_base(vcpu, &apic_base_msr)) @@ -7690,32 +7729,25 @@ static void fx_init(struct kvm_vcpu *vcpu)  	vcpu->arch.cr0 |= X86_CR0_ET;  } +/* Swap (qemu) user FPU context for the guest FPU context. */  void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)  { -	if (vcpu->guest_fpu_loaded) -		return; - -	/* -	 * Restore all possible states in the guest, -	 * and assume host would use all available bits. -	 * Guest xcr0 would be loaded later. -	 */ -	vcpu->guest_fpu_loaded = 1; -	__kernel_fpu_begin(); +	preempt_disable(); +	copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);  	/* PKRU is separately restored in kvm_x86_ops->run.  
*/  	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,  				~XFEATURE_MASK_PKRU); +	preempt_enable();  	trace_kvm_fpu(1);  } +/* When vcpu_run ends, restore user space FPU context. */  void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)  { -	if (!vcpu->guest_fpu_loaded) -		return; - -	vcpu->guest_fpu_loaded = 0; +	preempt_disable();  	copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); -	__kernel_fpu_end(); +	copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state); +	preempt_enable();  	++vcpu->stat.fpu_reload;  	trace_kvm_fpu(0);  } @@ -7832,7 +7864,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)  		 * To avoid have the INIT path from kvm_apic_has_events() that be  		 * called with loaded FPU and does not let userspace fix the state.  		 */ -		kvm_put_guest_fpu(vcpu); +		if (init_event) +			kvm_put_guest_fpu(vcpu);  		mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu.state.xsave,  					XFEATURE_MASK_BNDREGS);  		if (mpx_state_buffer) @@ -7841,6 +7874,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)  					XFEATURE_MASK_BNDCSR);  		if (mpx_state_buffer)  			memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr)); +		if (init_event) +			kvm_load_guest_fpu(vcpu);  	}  	if (!init_event) { | 
