summaryrefslogtreecommitdiff
path: root/arch
diff options
context:
space:
mode:
Diffstat (limited to 'arch')
-rw-r--r--arch/arm64/kvm/hyp/nvhe/ffa.c9
-rw-r--r--arch/arm64/kvm/hyp/nvhe/mem_protect.c28
-rw-r--r--arch/arm64/kvm/sys_regs.c71
-rw-r--r--arch/arm64/kvm/vgic/vgic-debug.c16
-rw-r--r--arch/arm64/kvm/vgic/vgic-init.c16
-rw-r--r--arch/arm64/kvm/vgic/vgic-its.c18
-rw-r--r--arch/arm64/kvm/vgic/vgic-v3.c3
-rw-r--r--arch/arm64/kvm/vgic/vgic.c23
-rw-r--r--arch/riscv/kvm/aia_imsic.c16
-rw-r--r--arch/riscv/kvm/mmu.c25
-rw-r--r--arch/riscv/kvm/vcpu.c2
-rw-r--r--arch/x86/include/uapi/asm/vmx.h1
-rw-r--r--arch/x86/kvm/svm/avic.c24
-rw-r--r--arch/x86/kvm/svm/nested.c20
-rw-r--r--arch/x86/kvm/svm/svm.c86
-rw-r--r--arch/x86/kvm/svm/svm.h4
-rw-r--r--arch/x86/kvm/vmx/common.h2
-rw-r--r--arch/x86/kvm/vmx/nested.c8
-rw-r--r--arch/x86/kvm/vmx/vmx.c8
-rw-r--r--arch/x86/kvm/x86.c50
20 files changed, 250 insertions, 180 deletions
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index 4e16f9b96f63..58b7d0c477d7 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -479,7 +479,7 @@ static void __do_ffa_mem_xfer(const u64 func_id,
struct ffa_mem_region_attributes *ep_mem_access;
struct ffa_composite_mem_region *reg;
struct ffa_mem_region *buf;
- u32 offset, nr_ranges;
+ u32 offset, nr_ranges, checked_offset;
int ret = 0;
if (addr_mbz || npages_mbz || fraglen > len ||
@@ -516,7 +516,12 @@ static void __do_ffa_mem_xfer(const u64 func_id,
goto out_unlock;
}
- if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) {
+ if (check_add_overflow(offset, sizeof(struct ffa_composite_mem_region), &checked_offset)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ if (fraglen < checked_offset) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out_unlock;
}
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index ddc8beb55eee..49db32f3ddf7 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -367,6 +367,19 @@ static int host_stage2_unmap_dev_all(void)
return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
}
+/*
+ * Ensure the PFN range is contained within PA-range.
+ *
+ * This check is also robust to overflows and is therefore a requirement before
+ * using a pfn/nr_pages pair from an untrusted source.
+ */
+static bool pfn_range_is_valid(u64 pfn, u64 nr_pages)
+{
+ u64 limit = BIT(kvm_phys_shift(&host_mmu.arch.mmu) - PAGE_SHIFT);
+
+ return pfn < limit && ((limit - pfn) >= nr_pages);
+}
+
struct kvm_mem_range {
u64 start;
u64 end;
@@ -776,6 +789,9 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
void *virt = __hyp_va(phys);
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
hyp_lock_component();
@@ -804,6 +820,9 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages)
u64 virt = (u64)__hyp_va(phys);
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
hyp_lock_component();
@@ -887,6 +906,9 @@ int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages)
u64 size = PAGE_SIZE * nr_pages;
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
if (!ret)
@@ -902,6 +924,9 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages)
u64 size = PAGE_SIZE * nr_pages;
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
if (!ret)
@@ -945,6 +970,9 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu
if (prot & ~KVM_PGTABLE_PROT_RWX)
return -EINVAL;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
ret = __guest_check_transition_size(phys, ipa, nr_pages, &size);
if (ret)
return ret;
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index e67eb39ddc11..8ae2bca81614 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -2595,19 +2595,23 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu,
.val = 0, \
}
-/* sys_reg_desc initialiser for known cpufeature ID registers */
-#define AA32_ID_SANITISED(name) { \
- ID_DESC(name), \
- .visibility = aa32_id_visibility, \
- .val = 0, \
-}
-
/* sys_reg_desc initialiser for writable ID registers */
#define ID_WRITABLE(name, mask) { \
ID_DESC(name), \
.val = mask, \
}
+/*
+ * 32bit ID regs are fully writable when the guest is 32bit
+ * capable. Nothing in the KVM code should rely on 32bit features
+ * anyway, only 64bit, so let the VMM do its worse.
+ */
+#define AA32_ID_WRITABLE(name) { \
+ ID_DESC(name), \
+ .visibility = aa32_id_visibility, \
+ .val = GENMASK(31, 0), \
+}
+
/* sys_reg_desc initialiser for cpufeature ID registers that need filtering */
#define ID_FILTERED(sysreg, name, mask) { \
ID_DESC(sysreg), \
@@ -3128,40 +3132,39 @@ static const struct sys_reg_desc sys_reg_descs[] = {
/* AArch64 mappings of the AArch32 ID registers */
/* CRm=1 */
- AA32_ID_SANITISED(ID_PFR0_EL1),
- AA32_ID_SANITISED(ID_PFR1_EL1),
+ AA32_ID_WRITABLE(ID_PFR0_EL1),
+ AA32_ID_WRITABLE(ID_PFR1_EL1),
{ SYS_DESC(SYS_ID_DFR0_EL1),
.access = access_id_reg,
.get_user = get_id_reg,
.set_user = set_id_dfr0_el1,
.visibility = aa32_id_visibility,
.reset = read_sanitised_id_dfr0_el1,
- .val = ID_DFR0_EL1_PerfMon_MASK |
- ID_DFR0_EL1_CopDbg_MASK, },
+ .val = GENMASK(31, 0) },
ID_HIDDEN(ID_AFR0_EL1),
- AA32_ID_SANITISED(ID_MMFR0_EL1),
- AA32_ID_SANITISED(ID_MMFR1_EL1),
- AA32_ID_SANITISED(ID_MMFR2_EL1),
- AA32_ID_SANITISED(ID_MMFR3_EL1),
+ AA32_ID_WRITABLE(ID_MMFR0_EL1),
+ AA32_ID_WRITABLE(ID_MMFR1_EL1),
+ AA32_ID_WRITABLE(ID_MMFR2_EL1),
+ AA32_ID_WRITABLE(ID_MMFR3_EL1),
/* CRm=2 */
- AA32_ID_SANITISED(ID_ISAR0_EL1),
- AA32_ID_SANITISED(ID_ISAR1_EL1),
- AA32_ID_SANITISED(ID_ISAR2_EL1),
- AA32_ID_SANITISED(ID_ISAR3_EL1),
- AA32_ID_SANITISED(ID_ISAR4_EL1),
- AA32_ID_SANITISED(ID_ISAR5_EL1),
- AA32_ID_SANITISED(ID_MMFR4_EL1),
- AA32_ID_SANITISED(ID_ISAR6_EL1),
+ AA32_ID_WRITABLE(ID_ISAR0_EL1),
+ AA32_ID_WRITABLE(ID_ISAR1_EL1),
+ AA32_ID_WRITABLE(ID_ISAR2_EL1),
+ AA32_ID_WRITABLE(ID_ISAR3_EL1),
+ AA32_ID_WRITABLE(ID_ISAR4_EL1),
+ AA32_ID_WRITABLE(ID_ISAR5_EL1),
+ AA32_ID_WRITABLE(ID_MMFR4_EL1),
+ AA32_ID_WRITABLE(ID_ISAR6_EL1),
/* CRm=3 */
- AA32_ID_SANITISED(MVFR0_EL1),
- AA32_ID_SANITISED(MVFR1_EL1),
- AA32_ID_SANITISED(MVFR2_EL1),
+ AA32_ID_WRITABLE(MVFR0_EL1),
+ AA32_ID_WRITABLE(MVFR1_EL1),
+ AA32_ID_WRITABLE(MVFR2_EL1),
ID_UNALLOCATED(3,3),
- AA32_ID_SANITISED(ID_PFR2_EL1),
+ AA32_ID_WRITABLE(ID_PFR2_EL1),
ID_HIDDEN(ID_DFR1_EL1),
- AA32_ID_SANITISED(ID_MMFR5_EL1),
+ AA32_ID_WRITABLE(ID_MMFR5_EL1),
ID_UNALLOCATED(3,7),
/* AArch64 ID registers */
@@ -5606,11 +5609,13 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu)
guard(mutex)(&kvm->arch.config_lock);
- if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) &&
- irqchip_in_kernel(kvm) &&
- kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) {
- kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK;
- kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK;
+ if (!irqchip_in_kernel(kvm)) {
+ u64 val;
+
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val);
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC;
+ kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val);
}
if (vcpu_has_nv(vcpu)) {
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index 4c1209261b65..bb92853d1fd3 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -64,29 +64,37 @@ static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter)
static int iter_mark_lpis(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long intid, flags;
struct vgic_irq *irq;
- unsigned long intid;
int nr_lpis = 0;
+ xa_lock_irqsave(&dist->lpi_xa, flags);
+
xa_for_each(&dist->lpi_xa, intid, irq) {
if (!vgic_try_get_irq_ref(irq))
continue;
- xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+ __xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
nr_lpis++;
}
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
+
return nr_lpis;
}
static void iter_unmark_lpis(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long intid, flags;
struct vgic_irq *irq;
- unsigned long intid;
xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) {
- xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
+ __xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
+
+ /* vgic_put_irq() expects to be called outside of the xa_lock */
vgic_put_irq(kvm, irq);
}
}
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 1796b1a22a72..da62edbc1205 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -53,7 +53,7 @@ void kvm_vgic_early_init(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- xa_init(&dist->lpi_xa);
+ xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ);
}
/* CREATION */
@@ -71,6 +71,7 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type);
int kvm_vgic_create(struct kvm *kvm, u32 type)
{
struct kvm_vcpu *vcpu;
+ u64 aa64pfr0, pfr1;
unsigned long i;
int ret;
@@ -161,10 +162,19 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
- if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
+ aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
+ pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC;
+
+ if (type == KVM_DEV_TYPE_ARM_VGIC_V2) {
kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
- else
+ } else {
INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
+ aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);
+ pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3);
+ }
+
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0);
+ kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1);
if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
kvm->arch.vgic.nassgicap = system_supports_direct_sgis();
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index ce3e3ed3f29f..3f1c4b10fed9 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -78,6 +78,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq;
+ unsigned long flags;
int ret;
/* In this case there is no put, since we keep the reference. */
@@ -88,7 +89,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
if (!irq)
return ERR_PTR(-ENOMEM);
- ret = xa_reserve(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
+ ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
if (ret) {
kfree(irq);
return ERR_PTR(ret);
@@ -103,7 +104,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
irq->target_vcpu = vcpu;
irq->group = 1;
- xa_lock(&dist->lpi_xa);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
/*
* There could be a race with another vgic_add_lpi(), so we need to
@@ -114,21 +115,18 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
/* Someone was faster with adding this LPI, lets use that. */
kfree(irq);
irq = oldirq;
-
- goto out_unlock;
+ } else {
+ ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0));
}
- ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0));
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
+
if (ret) {
xa_release(&dist->lpi_xa, intid);
kfree(irq);
- }
-
-out_unlock:
- xa_unlock(&dist->lpi_xa);
- if (ret)
return ERR_PTR(ret);
+ }
/*
* We "cache" the configuration table entries in our struct vgic_irq's.
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 6fbb4b099855..2f75ef14d339 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -301,7 +301,8 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
return;
/* Hide GICv3 sysreg if necessary */
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2 ||
+ !irqchip_in_kernel(vcpu->kvm)) {
vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
ICH_HCR_EL2_TC);
return;
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 6dd5a10081e2..8d20c53faef0 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -28,7 +28,7 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
* kvm->arch.config_lock (mutex)
* its->cmd_lock (mutex)
* its->its_lock (mutex)
- * vgic_dist->lpi_xa.xa_lock
+ * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled
* vgic_cpu->ap_list_lock must be taken with IRQs disabled
* vgic_irq->irq_lock must be taken with IRQs disabled
*
@@ -141,32 +141,39 @@ static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq
void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long flags;
- if (irq->intid >= VGIC_MIN_LPI)
- might_lock(&dist->lpi_xa.xa_lock);
+ /*
+ * Normally the lock is only taken when the refcount drops to 0.
+ * Acquire/release it early on lockdep kernels to make locking issues
+ * in rare release paths a bit more obvious.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP) && irq->intid >= VGIC_MIN_LPI) {
+ guard(spinlock_irqsave)(&dist->lpi_xa.xa_lock);
+ }
if (!__vgic_put_irq(kvm, irq))
return;
- xa_lock(&dist->lpi_xa);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
vgic_release_lpi_locked(dist, irq);
- xa_unlock(&dist->lpi_xa);
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
}
static void vgic_release_deleted_lpis(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- unsigned long intid;
+ unsigned long flags, intid;
struct vgic_irq *irq;
- xa_lock(&dist->lpi_xa);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
xa_for_each(&dist->lpi_xa, intid, irq) {
if (irq->pending_release)
vgic_release_lpi_locked(dist, irq);
}
- xa_unlock(&dist->lpi_xa);
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
}
void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
diff --git a/arch/riscv/kvm/aia_imsic.c b/arch/riscv/kvm/aia_imsic.c
index fda0346f0ea1..11422cb95a64 100644
--- a/arch/riscv/kvm/aia_imsic.c
+++ b/arch/riscv/kvm/aia_imsic.c
@@ -689,8 +689,20 @@ bool kvm_riscv_vcpu_aia_imsic_has_interrupt(struct kvm_vcpu *vcpu)
*/
read_lock_irqsave(&imsic->vsfile_lock, flags);
- if (imsic->vsfile_cpu > -1)
- ret = !!(csr_read(CSR_HGEIP) & BIT(imsic->vsfile_hgei));
+ if (imsic->vsfile_cpu > -1) {
+ /*
+ * This function is typically called from kvm_vcpu_block() via
+ * kvm_arch_vcpu_runnable() upon WFI trap. The kvm_vcpu_block()
+ * can be preempted and the blocking VCPU might resume on a
+ * different CPU. This means it is possible that current CPU
+ * does not match the imsic->vsfile_cpu hence this function
+ * must check imsic->vsfile_cpu before accessing HGEIP CSR.
+ */
+ if (imsic->vsfile_cpu != vcpu->cpu)
+ ret = true;
+ else
+ ret = !!(csr_read(CSR_HGEIP) & BIT(imsic->vsfile_hgei));
+ }
read_unlock_irqrestore(&imsic->vsfile_lock, flags);
return ret;
diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
index 525fb5a330c0..58f5f3536ffd 100644
--- a/arch/riscv/kvm/mmu.c
+++ b/arch/riscv/kvm/mmu.c
@@ -171,7 +171,6 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
enum kvm_mr_change change)
{
hva_t hva, reg_end, size;
- gpa_t base_gpa;
bool writable;
int ret = 0;
@@ -190,15 +189,13 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
hva = new->userspace_addr;
size = new->npages << PAGE_SHIFT;
reg_end = hva + size;
- base_gpa = new->base_gfn << PAGE_SHIFT;
writable = !(new->flags & KVM_MEM_READONLY);
mmap_read_lock(current->mm);
/*
* A memory region could potentially cover multiple VMAs, and
- * any holes between them, so iterate over all of them to find
- * out if we can map any of them right now.
+ * any holes between them, so iterate over all of them.
*
* +--------------------------------------------+
* +---------------+----------------+ +----------------+
@@ -209,7 +206,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
*/
do {
struct vm_area_struct *vma;
- hva_t vm_start, vm_end;
+ hva_t vm_end;
vma = find_vma_intersection(current->mm, hva, reg_end);
if (!vma)
@@ -225,36 +222,18 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
}
/* Take the intersection of this VMA with the memory region */
- vm_start = max(hva, vma->vm_start);
vm_end = min(reg_end, vma->vm_end);
if (vma->vm_flags & VM_PFNMAP) {
- gpa_t gpa = base_gpa + (vm_start - hva);
- phys_addr_t pa;
-
- pa = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
- pa += vm_start - vma->vm_start;
-
/* IO region dirty page logging not allowed */
if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
ret = -EINVAL;
goto out;
}
-
- ret = kvm_riscv_mmu_ioremap(kvm, gpa, pa, vm_end - vm_start,
- writable, false);
- if (ret)
- break;
}
hva = vm_end;
} while (hva < reg_end);
- if (change == KVM_MR_FLAGS_ONLY)
- goto out;
-
- if (ret)
- kvm_riscv_mmu_iounmap(kvm, base_gpa, size);
-
out:
mmap_read_unlock(current->mm);
return ret;
diff --git a/arch/riscv/kvm/vcpu.c b/arch/riscv/kvm/vcpu.c
index bccb919ca615..5ce35aba6069 100644
--- a/arch/riscv/kvm/vcpu.c
+++ b/arch/riscv/kvm/vcpu.c
@@ -212,7 +212,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
- return (kvm_riscv_vcpu_has_interrupts(vcpu, -1UL) &&
+ return (kvm_riscv_vcpu_has_interrupts(vcpu, -1ULL) &&
!kvm_riscv_vcpu_stopped(vcpu) && !vcpu->arch.pause);
}
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index 9792e329343e..1baa86dfe029 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -93,6 +93,7 @@
#define EXIT_REASON_TPAUSE 68
#define EXIT_REASON_BUS_LOCK 74
#define EXIT_REASON_NOTIFY 75
+#define EXIT_REASON_SEAMCALL 76
#define EXIT_REASON_TDCALL 77
#define EXIT_REASON_MSR_READ_IMM 84
#define EXIT_REASON_MSR_WRITE_IMM 85
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index f286b5706d7c..fef00546c885 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -216,7 +216,7 @@ static void avic_deactivate_vmcb(struct vcpu_svm *svm)
* This function is called from IOMMU driver to notify
* SVM to schedule in a particular vCPU of a particular VM.
*/
-int avic_ga_log_notifier(u32 ga_tag)
+static int avic_ga_log_notifier(u32 ga_tag)
{
unsigned long flags;
struct kvm_svm *kvm_svm;
@@ -788,7 +788,7 @@ int avic_init_vcpu(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
INIT_LIST_HEAD(&svm->ir_list);
- spin_lock_init(&svm->ir_list_lock);
+ raw_spin_lock_init(&svm->ir_list_lock);
if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm))
return 0;
@@ -816,9 +816,9 @@ static void svm_ir_list_del(struct kvm_kernel_irqfd *irqfd)
if (!vcpu)
return;
- spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
+ raw_spin_lock_irqsave(&to_svm(vcpu)->ir_list_lock, flags);
list_del(&irqfd->vcpu_list);
- spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
+ raw_spin_unlock_irqrestore(&to_svm(vcpu)->ir_list_lock, flags);
}
int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
@@ -855,7 +855,7 @@ int avic_pi_update_irte(struct kvm_kernel_irqfd *irqfd, struct kvm *kvm,
* list of IRQs being posted to the vCPU, to ensure the IRTE
* isn't programmed with stale pCPU/IsRunning information.
*/
- guard(spinlock_irqsave)(&svm->ir_list_lock);
+ guard(raw_spinlock_irqsave)(&svm->ir_list_lock);
/*
* Update the target pCPU for IOMMU doorbells if the vCPU is
@@ -972,7 +972,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
* up-to-date entry information, or that this task will wait until
* svm_ir_list_add() completes to set the new target pCPU.
*/
- spin_lock_irqsave(&svm->ir_list_lock, flags);
+ raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
entry = svm->avic_physical_id_entry;
WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK);
@@ -997,7 +997,7 @@ static void __avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu,
avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, action);
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
@@ -1035,7 +1035,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
* or that this task will wait until svm_ir_list_add() completes to
* mark the vCPU as not running.
*/
- spin_lock_irqsave(&svm->ir_list_lock, flags);
+ raw_spin_lock_irqsave(&svm->ir_list_lock, flags);
avic_update_iommu_vcpu_affinity(vcpu, -1, action);
@@ -1059,7 +1059,7 @@ static void __avic_vcpu_put(struct kvm_vcpu *vcpu, enum avic_vcpu_action action)
svm->avic_physical_id_entry = entry;
- spin_unlock_irqrestore(&svm->ir_list_lock, flags);
+ raw_spin_unlock_irqrestore(&svm->ir_list_lock, flags);
}
void avic_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1243,3 +1243,9 @@ bool __init avic_hardware_setup(void)
return true;
}
+
+void avic_hardware_unsetup(void)
+{
+ if (avic)
+ amd_iommu_register_ga_log_notifier(NULL);
+}
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index a6443feab252..da6e80b3ac35 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -677,11 +677,10 @@ static void nested_vmcb02_prepare_save(struct vcpu_svm *svm, struct vmcb *vmcb12
*/
svm_copy_lbrs(vmcb02, vmcb12);
vmcb02->save.dbgctl &= ~DEBUGCTL_RESERVED_BITS;
- svm_update_lbrv(&svm->vcpu);
-
- } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ } else {
svm_copy_lbrs(vmcb02, vmcb01);
}
+ svm_update_lbrv(&svm->vcpu);
}
static inline bool is_evtinj_soft(u32 evtinj)
@@ -833,11 +832,7 @@ static void nested_vmcb02_prepare_control(struct vcpu_svm *svm,
svm->soft_int_next_rip = vmcb12_rip;
}
- vmcb02->control.virt_ext = vmcb01->control.virt_ext &
- LBR_CTL_ENABLE_MASK;
- if (guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV))
- vmcb02->control.virt_ext |=
- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK);
+ /* LBR_CTL_ENABLE_MASK is controlled by svm_update_lbrv() */
if (!nested_vmcb_needs_vls_intercept(svm))
vmcb02->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
@@ -1189,13 +1184,12 @@ int nested_svm_vmexit(struct vcpu_svm *svm)
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
if (unlikely(guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
- (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK))) {
+ (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK)))
svm_copy_lbrs(vmcb12, vmcb02);
- svm_update_lbrv(vcpu);
- } else if (unlikely(vmcb01->control.virt_ext & LBR_CTL_ENABLE_MASK)) {
+ else
svm_copy_lbrs(vmcb01, vmcb02);
- svm_update_lbrv(vcpu);
- }
+
+ svm_update_lbrv(vcpu);
if (vnmi) {
if (vmcb02->control.int_ctl & V_NMI_BLOCKING_MASK)
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 153c12dbf3eb..10c21e4c5406 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -806,60 +806,43 @@ void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
vmcb_mark_dirty(to_vmcb, VMCB_LBR);
}
-void svm_enable_lbrv(struct kvm_vcpu *vcpu)
+static void __svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
- svm_recalc_lbr_msr_intercepts(vcpu);
-
- /* Move the LBR msrs to the vmcb02 so that the guest can see them. */
- if (is_guest_mode(vcpu))
- svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
+ to_svm(vcpu)->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
}
-static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
+void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
- struct vcpu_svm *svm = to_svm(vcpu);
-
- KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
- svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
+ __svm_enable_lbrv(vcpu);
svm_recalc_lbr_msr_intercepts(vcpu);
-
- /*
- * Move the LBR msrs back to the vmcb01 to avoid copying them
- * on nested guest entries.
- */
- if (is_guest_mode(vcpu))
- svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
}
-static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
+static void __svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
- /*
- * If LBR virtualization is disabled, the LBR MSRs are always kept in
- * vmcb01. If LBR virtualization is enabled and L1 is running VMs of
- * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
- */
- return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
- svm->vmcb01.ptr;
+ KVM_BUG_ON(sev_es_guest(vcpu->kvm), vcpu->kvm);
+ to_svm(vcpu)->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
}
void svm_update_lbrv(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
- bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
+ bool enable_lbrv = (svm->vmcb->save.dbgctl & DEBUGCTLMSR_LBR) ||
(is_guest_mode(vcpu) && guest_cpu_cap_has(vcpu, X86_FEATURE_LBRV) &&
(svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));
- if (enable_lbrv == current_enable_lbrv)
- return;
+ if (enable_lbrv && !current_enable_lbrv)
+ __svm_enable_lbrv(vcpu);
+ else if (!enable_lbrv && current_enable_lbrv)
+ __svm_disable_lbrv(vcpu);
- if (enable_lbrv)
- svm_enable_lbrv(vcpu);
- else
- svm_disable_lbrv(vcpu);
+ /*
+ * During nested transitions, it is possible that the current VMCB has
+ * LBR_CTL set, but the previous LBR_CTL had it cleared (or vice versa).
+ * In this case, even though LBR_CTL does not need an update, intercepts
+ * do, so always recalculate the intercepts here.
+ */
+ svm_recalc_lbr_msr_intercepts(vcpu);
}
void disable_nmi_singlestep(struct vcpu_svm *svm)
@@ -921,6 +904,8 @@ static void svm_hardware_unsetup(void)
{
int cpu;
+ avic_hardware_unsetup();
+
sev_hardware_unsetup();
for_each_possible_cpu(cpu)
@@ -2722,19 +2707,19 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
msr_info->data = svm->tsc_aux;
break;
case MSR_IA32_DEBUGCTLMSR:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl;
+ msr_info->data = svm->vmcb->save.dbgctl;
break;
case MSR_IA32_LASTBRANCHFROMIP:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from;
+ msr_info->data = svm->vmcb->save.br_from;
break;
case MSR_IA32_LASTBRANCHTOIP:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to;
+ msr_info->data = svm->vmcb->save.br_to;
break;
case MSR_IA32_LASTINTFROMIP:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from;
+ msr_info->data = svm->vmcb->save.last_excp_from;
break;
case MSR_IA32_LASTINTTOIP:
- msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to;
+ msr_info->data = svm->vmcb->save.last_excp_to;
break;
case MSR_VM_HSAVE_PA:
msr_info->data = svm->nested.hsave_msr;
@@ -3002,7 +2987,11 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
if (data & DEBUGCTL_RESERVED_BITS)
return 1;
- svm_get_lbr_vmcb(svm)->save.dbgctl = data;
+ if (svm->vmcb->save.dbgctl == data)
+ break;
+
+ svm->vmcb->save.dbgctl = data;
+ vmcb_mark_dirty(svm->vmcb, VMCB_LBR);
svm_update_lbrv(vcpu);
break;
case MSR_VM_HSAVE_PA:
@@ -5386,12 +5375,6 @@ static __init int svm_hardware_setup(void)
svm_hv_hardware_setup();
- for_each_possible_cpu(cpu) {
- r = svm_cpu_init(cpu);
- if (r)
- goto err;
- }
-
enable_apicv = avic_hardware_setup();
if (!enable_apicv) {
enable_ipiv = false;
@@ -5435,6 +5418,13 @@ static __init int svm_hardware_setup(void)
svm_set_cpu_caps();
kvm_caps.inapplicable_quirks &= ~KVM_X86_QUIRK_CD_NW_CLEARED;
+
+ for_each_possible_cpu(cpu) {
+ r = svm_cpu_init(cpu);
+ if (r)
+ goto err;
+ }
+
return 0;
err:
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index e4b04f435b3d..c856d8e0f95e 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -329,7 +329,7 @@ struct vcpu_svm {
* back into remapped mode).
*/
struct list_head ir_list;
- spinlock_t ir_list_lock;
+ raw_spinlock_t ir_list_lock;
struct vcpu_sev_es_state sev_es;
@@ -805,7 +805,7 @@ extern struct kvm_x86_nested_ops svm_nested_ops;
)
bool __init avic_hardware_setup(void);
-int avic_ga_log_notifier(u32 ga_tag);
+void avic_hardware_unsetup(void);
void avic_vm_destroy(struct kvm *kvm);
int avic_vm_init(struct kvm *kvm);
void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb);
diff --git a/arch/x86/kvm/vmx/common.h b/arch/x86/kvm/vmx/common.h
index bc5ece76533a..412d0829d7a2 100644
--- a/arch/x86/kvm/vmx/common.h
+++ b/arch/x86/kvm/vmx/common.h
@@ -98,7 +98,7 @@ static inline int __vmx_handle_ept_violation(struct kvm_vcpu *vcpu, gpa_t gpa,
error_code |= (exit_qualification & EPT_VIOLATION_PROT_MASK)
? PFERR_PRESENT_MASK : 0;
- if (error_code & EPT_VIOLATION_GVA_IS_VALID)
+ if (exit_qualification & EPT_VIOLATION_GVA_IS_VALID)
error_code |= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) ?
PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 76271962cb70..bcea087b642f 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -6728,6 +6728,14 @@ static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
case EXIT_REASON_NOTIFY:
/* Notify VM exit is not exposed to L1 */
return false;
+ case EXIT_REASON_SEAMCALL:
+ case EXIT_REASON_TDCALL:
+ /*
+ * SEAMCALL and TDCALL unconditionally VM-Exit, but aren't
+ * virtualized by KVM for L1 hypervisors, i.e. L1 should
+ * never want or expect such an exit.
+ */
+ return false;
default:
return true;
}
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f87c216d976d..91b6f2f3edc2 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -6032,6 +6032,12 @@ static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_tdx_instruction(struct kvm_vcpu *vcpu)
+{
+ kvm_queue_exception(vcpu, UD_VECTOR);
+ return 1;
+}
+
#ifndef CONFIG_X86_SGX_KVM
static int handle_encls(struct kvm_vcpu *vcpu)
{
@@ -6157,6 +6163,8 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_ENCLS] = handle_encls,
[EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
[EXIT_REASON_NOTIFY] = handle_notify,
+ [EXIT_REASON_SEAMCALL] = handle_tdx_instruction,
+ [EXIT_REASON_TDCALL] = handle_tdx_instruction,
[EXIT_REASON_MSR_READ_IMM] = handle_rdmsr_imm,
[EXIT_REASON_MSR_WRITE_IMM] = handle_wrmsr_imm,
};
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b4b5d2d09634..c9c2aa6f4705 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3874,15 +3874,9 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
/*
* Returns true if the MSR in question is managed via XSTATE, i.e. is context
- * switched with the rest of guest FPU state. Note! S_CET is _not_ context
- * switched via XSTATE even though it _is_ saved/restored via XSAVES/XRSTORS.
- * Because S_CET is loaded on VM-Enter and VM-Exit via dedicated VMCS fields,
- * the value saved/restored via XSTATE is always the host's value. That detail
- * is _extremely_ important, as the guest's S_CET must _never_ be resident in
- * hardware while executing in the host. Loading guest values for U_CET and
- * PL[0-3]_SSP while executing in the kernel is safe, as U_CET is specific to
- * userspace, and PL[0-3]_SSP are only consumed when transitioning to lower
- * privilege levels, i.e. are effectively only consumed by userspace as well.
+ * switched with the rest of guest FPU state.
+ *
+ * Note, S_CET is _not_ saved/restored via XSAVES/XRSTORS.
*/
static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr)
{
@@ -3905,6 +3899,11 @@ static bool is_xstate_managed_msr(struct kvm_vcpu *vcpu, u32 msr)
* MSR that is managed via XSTATE. Note, the caller is responsible for doing
* the initial FPU load, this helper only ensures that guest state is resident
* in hardware (the kernel can load its FPU state in IRQ context).
+ *
+ * Note, loading guest values for U_CET and PL[0-3]_SSP while executing in the
+ * kernel is safe, as U_CET is specific to userspace, and PL[0-3]_SSP are only
+ * consumed when transitioning to lower privilege levels, i.e. are effectively
+ * only consumed by userspace as well.
*/
static __always_inline void kvm_access_xstate_msr(struct kvm_vcpu *vcpu,
struct msr_data *msr_info,
@@ -11807,6 +11806,9 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
+ if (KVM_BUG_ON(vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+ return;
+
/* Exclude PKRU, it's restored separately immediately after VM-Exit. */
fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
trace_kvm_fpu(1);
@@ -11815,6 +11817,9 @@ static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
/* When vcpu_run ends, restore user space FPU context. */
static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
+ if (KVM_BUG_ON(!vcpu->arch.guest_fpu.fpstate->in_use, vcpu->kvm))
+ return;
+
fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
++vcpu->stat.fpu_reload;
trace_kvm_fpu(0);
@@ -12137,9 +12142,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
int r;
vcpu_load(vcpu);
- if (kvm_mpx_supported())
- kvm_load_guest_fpu(vcpu);
-
kvm_vcpu_srcu_read_lock(vcpu);
r = kvm_apic_accept_events(vcpu);
@@ -12156,9 +12158,6 @@ int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
out:
kvm_vcpu_srcu_read_unlock(vcpu);
-
- if (kvm_mpx_supported())
- kvm_put_guest_fpu(vcpu);
vcpu_put(vcpu);
return r;
}
@@ -12788,6 +12787,7 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event)
{
struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate;
u64 xfeatures_mask;
+ bool fpu_in_use;
int i;
/*
@@ -12811,13 +12811,23 @@ static void kvm_xstate_reset(struct kvm_vcpu *vcpu, bool init_event)
BUILD_BUG_ON(sizeof(xfeatures_mask) * BITS_PER_BYTE <= XFEATURE_MAX);
/*
- * All paths that lead to INIT are required to load the guest's FPU
- * state (because most paths are buried in KVM_RUN).
- */
- kvm_put_guest_fpu(vcpu);
+ * Unload guest FPU state (if necessary) before zeroing XSTATE fields
+ * as the kernel can only modify the state when its resident in memory,
+ * i.e. when it's not loaded into hardware.
+ *
+ * WARN if the vCPU's desire to run, i.e. whether or not its in KVM_RUN,
+ * doesn't match the loaded/in-use state of the FPU, as KVM_RUN is the
+ * only path that can trigger INIT emulation _and_ loads FPU state, and
+ * KVM_RUN should _always_ load FPU state.
+ */
+ WARN_ON_ONCE(vcpu->wants_to_run != fpstate->in_use);
+ fpu_in_use = fpstate->in_use;
+ if (fpu_in_use)
+ kvm_put_guest_fpu(vcpu);
for_each_set_bit(i, (unsigned long *)&xfeatures_mask, XFEATURE_MAX)
fpstate_clear_xstate_component(fpstate, i);
- kvm_load_guest_fpu(vcpu);
+ if (fpu_in_use)
+ kvm_load_guest_fpu(vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)