Diffstat (limited to 'arch/arm64/kvm')
-rw-r--r--  arch/arm64/kvm/arch_timer.c | 2
-rw-r--r--  arch/arm64/kvm/arm.c | 24
-rw-r--r--  arch/arm64/kvm/at.c | 196
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/ffa.c | 11
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/hyp-main.c | 7
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/mem_protect.c | 28
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/pkvm.c | 3
-rw-r--r--  arch/arm64/kvm/hyp/nvhe/sys_regs.c | 5
-rw-r--r--  arch/arm64/kvm/hyp/pgtable.c | 122
-rw-r--r--  arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c | 4
-rw-r--r--  arch/arm64/kvm/hyp/vgic-v3-sr.c | 96
-rw-r--r--  arch/arm64/kvm/mmu.c | 134
-rw-r--r--  arch/arm64/kvm/nested.c | 125
-rw-r--r--  arch/arm64/kvm/pkvm.c | 11
-rw-r--r--  arch/arm64/kvm/ptdump.c | 35
-rw-r--r--  arch/arm64/kvm/sys_regs.c | 103
-rw-r--r--  arch/arm64/kvm/vgic/vgic-debug.c | 16
-rw-r--r--  arch/arm64/kvm/vgic/vgic-init.c | 25
-rw-r--r--  arch/arm64/kvm/vgic/vgic-its.c | 18
-rw-r--r--  arch/arm64/kvm/vgic/vgic-mmio-v2.c | 24
-rw-r--r--  arch/arm64/kvm/vgic/vgic-mmio.h | 1
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v2.c | 291
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v3-nested.c | 104
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v3.c | 427
-rw-r--r--  arch/arm64/kvm/vgic/vgic-v4.c | 5
-rw-r--r--  arch/arm64/kvm/vgic/vgic.c | 321
-rw-r--r--  arch/arm64/kvm/vgic/vgic.h | 43
27 files changed, 1634 insertions(+), 547 deletions(-)
diff --git a/arch/arm64/kvm/arch_timer.c b/arch/arm64/kvm/arch_timer.c
index 3f675875abea..99a07972068d 100644
--- a/arch/arm64/kvm/arch_timer.c
+++ b/arch/arm64/kvm/arch_timer.c
@@ -806,7 +806,7 @@ static void timer_set_traps(struct kvm_vcpu *vcpu, struct timer_map *map)
tpt = tpc = true;
/*
- * For the poor sods that could not correctly substract one value
+ * For the poor sods that could not correctly subtract one value
* from another, trap the full virtual timer and counter.
*/
if (has_broken_cntvoff() && timer_get_offset(map->direct_vtimer))
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 870953b4a8a7..4f80da0c0d1d 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -132,6 +132,10 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
}
mutex_unlock(&kvm->lock);
break;
+ case KVM_CAP_ARM_SEA_TO_USER:
+ r = 0;
+ set_bit(KVM_ARCH_FLAG_EXIT_SEA, &kvm->arch.flags);
+ break;
default:
break;
}
@@ -327,6 +331,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_IRQFD_RESAMPLE:
case KVM_CAP_COUNTER_OFFSET:
case KVM_CAP_ARM_WRITABLE_IMP_ID_REGS:
+ case KVM_CAP_ARM_SEA_TO_USER:
r = 1;
break;
case KVM_CAP_SET_GUEST_DEBUG2:
@@ -440,7 +445,7 @@ struct kvm *kvm_arch_alloc_vm(void)
if (!has_vhe())
return kzalloc(sz, GFP_KERNEL_ACCOUNT);
- return __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
+ return kvzalloc(sz, GFP_KERNEL_ACCOUNT);
}
int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
@@ -624,6 +629,7 @@ nommu:
kvm_timer_vcpu_load(vcpu);
kvm_vgic_load(vcpu);
kvm_vcpu_load_debug(vcpu);
+ kvm_vcpu_load_fgt(vcpu);
if (has_vhe())
kvm_vcpu_load_vhe(vcpu);
kvm_arch_vcpu_load_fp(vcpu);
@@ -642,7 +648,6 @@ nommu:
vcpu->arch.hcr_el2 |= HCR_TWI;
vcpu_set_pauth_traps(vcpu);
- kvm_vcpu_load_fgt(vcpu);
if (is_protected_kvm_enabled()) {
kvm_call_hyp_nvhe(__pkvm_vcpu_load,
@@ -659,8 +664,7 @@ nommu:
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
{
if (is_protected_kvm_enabled()) {
- kvm_call_hyp(__vgic_v3_save_vmcr_aprs,
- &vcpu->arch.vgic_cpu.vgic_v3);
+ kvm_call_hyp(__vgic_v3_save_aprs, &vcpu->arch.vgic_cpu.vgic_v3);
kvm_call_hyp_nvhe(__pkvm_vcpu_put);
}
@@ -1042,6 +1046,10 @@ static int check_vcpu_requests(struct kvm_vcpu *vcpu)
*/
kvm_check_request(KVM_REQ_IRQ_PENDING, vcpu);
+ /* Process interrupts deactivated through a trap */
+ if (kvm_check_request(KVM_REQ_VGIC_PROCESS_UPDATE, vcpu))
+ kvm_vgic_process_async_update(vcpu);
+
if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
kvm_update_stolen_time(vcpu);
@@ -1835,6 +1843,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
return r;
}
+long kvm_arch_vcpu_unlocked_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ return -ENOIOCTLCMD;
+}
+
void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
@@ -2448,7 +2462,7 @@ static void kvm_hyp_init_symbols(void)
kvm_nvhe_sym(__icache_flags) = __icache_flags;
kvm_nvhe_sym(kvm_arm_vmid_bits) = kvm_arm_vmid_bits;
- /* Propagate the FGT state to the the nVHE side */
+ /* Propagate the FGT state to the nVHE side */
kvm_nvhe_sym(hfgrtr_masks) = hfgrtr_masks;
kvm_nvhe_sym(hfgwtr_masks) = hfgwtr_masks;
kvm_nvhe_sym(hfgitr_masks) = hfgitr_masks;
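
The new KVM_CAP_ARM_SEA_TO_USER capability is VM-scoped and is enabled through the standard KVM_ENABLE_CAP ioctl. A minimal userspace sketch, assuming the capability constant comes from this series' uapi headers and omitting error handling:

/*
 * Opt a VM into KVM_EXIT_ARM_SEA delivery via the capability added above.
 */
#include <sys/ioctl.h>
#include <linux/kvm.h>

static int enable_sea_to_user(int vm_fd)
{
        struct kvm_enable_cap cap = {
                .cap = KVM_CAP_ARM_SEA_TO_USER, /* from this series' headers */
        };

        /* VM-scoped capability: issue the ioctl on the VM fd, not a vCPU fd. */
        return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}
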
diff --git a/arch/arm64/kvm/at.c b/arch/arm64/kvm/at.c
index be26d5aa668c..53bf70126f81 100644
--- a/arch/arm64/kvm/at.c
+++ b/arch/arm64/kvm/at.c
@@ -346,6 +346,11 @@ static int setup_s1_walk(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
wi->baddr &= GENMASK_ULL(wi->max_oa_bits - 1, x);
+ wi->ha = kvm_has_feat(vcpu->kvm, ID_AA64MMFR1_EL1, HAFDBS, AF);
+ wi->ha &= (wi->regime == TR_EL2 ?
+ FIELD_GET(TCR_EL2_HA, tcr) :
+ FIELD_GET(TCR_HA, tcr));
+
return 0;
addrsz:
@@ -362,10 +367,42 @@ transfault:
return -EFAULT;
}
+static int kvm_read_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 *desc,
+ struct s1_walk_info *wi)
+{
+ u64 val;
+ int r;
+
+ r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
+ if (r)
+ return r;
+
+ if (wi->be)
+ *desc = be64_to_cpu((__force __be64)val);
+ else
+ *desc = le64_to_cpu((__force __le64)val);
+
+ return 0;
+}
+
+static int kvm_swap_s1_desc(struct kvm_vcpu *vcpu, u64 pa, u64 old, u64 new,
+ struct s1_walk_info *wi)
+{
+ if (wi->be) {
+ old = (__force u64)cpu_to_be64(old);
+ new = (__force u64)cpu_to_be64(new);
+ } else {
+ old = (__force u64)cpu_to_le64(old);
+ new = (__force u64)cpu_to_le64(new);
+ }
+
+ return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
+}
+
static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
struct s1_walk_result *wr, u64 va)
{
- u64 va_top, va_bottom, baddr, desc;
+ u64 va_top, va_bottom, baddr, desc, new_desc, ipa;
int level, stride, ret;
level = wi->sl;
@@ -375,7 +412,7 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
va_top = get_ia_size(wi) - 1;
while (1) {
- u64 index, ipa;
+ u64 index;
va_bottom = (3 - level) * stride + wi->pgshift;
index = (va & GENMASK_ULL(va_top, va_bottom)) >> (va_bottom - 3);
@@ -414,16 +451,13 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
return ret;
}
- ret = kvm_read_guest(vcpu->kvm, ipa, &desc, sizeof(desc));
+ ret = kvm_read_s1_desc(vcpu, ipa, &desc, wi);
if (ret) {
fail_s1_walk(wr, ESR_ELx_FSC_SEA_TTW(level), false);
return ret;
}
- if (wi->be)
- desc = be64_to_cpu((__force __be64)desc);
- else
- desc = le64_to_cpu((__force __le64)desc);
+ new_desc = desc;
/* Invalid descriptor */
if (!(desc & BIT(0)))
@@ -477,6 +511,17 @@ static int walk_s1(struct kvm_vcpu *vcpu, struct s1_walk_info *wi,
if (check_output_size(baddr & GENMASK(52, va_bottom), wi))
goto addrsz;
+ if (wi->ha)
+ new_desc |= PTE_AF;
+
+ if (new_desc != desc) {
+ ret = kvm_swap_s1_desc(vcpu, ipa, desc, new_desc, wi);
+ if (ret)
+ return ret;
+
+ desc = new_desc;
+ }
+
if (!(desc & PTE_AF)) {
fail_s1_walk(wr, ESR_ELx_FSC_ACCESS_L(level), false);
return -EACCES;
@@ -1221,7 +1266,7 @@ static void compute_s1_permissions(struct kvm_vcpu *vcpu,
wr->pr &= !pan;
}
-static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+static int handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr, u64 *par)
{
struct s1_walk_result wr = {};
struct s1_walk_info wi = {};
@@ -1246,6 +1291,11 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ /*
+ * Race to update a descriptor -- restart the walk.
+ */
+ if (ret == -EAGAIN)
+ return ret;
if (ret)
goto compute_par;
@@ -1279,7 +1329,8 @@ static u64 handle_at_slow(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
fail_s1_walk(&wr, ESR_ELx_FSC_PERM_L(wr.level), false);
compute_par:
- return compute_par_s1(vcpu, &wi, &wr);
+ *par = compute_par_s1(vcpu, &wi, &wr);
+ return 0;
}
/*
@@ -1407,9 +1458,10 @@ static bool par_check_s1_access_fault(u64 par)
!(par & SYS_PAR_EL1_S));
}
-void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+int __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
u64 par = __kvm_at_s1e01_fast(vcpu, op, vaddr);
+ int ret;
/*
* If PAR_EL1 reports that AT failed on a S1 permission or access
@@ -1421,15 +1473,20 @@ void __kvm_at_s1e01(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
*/
if ((par & SYS_PAR_EL1_F) &&
!par_check_s1_perm_fault(par) &&
- !par_check_s1_access_fault(par))
- par = handle_at_slow(vcpu, op, vaddr);
+ !par_check_s1_access_fault(par)) {
+ ret = handle_at_slow(vcpu, op, vaddr, &par);
+ if (ret)
+ return ret;
+ }
vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+ return 0;
}
-void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+int __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
u64 par;
+ int ret;
/*
* We've trapped, so everything is live on the CPU. As we will be
@@ -1476,13 +1533,17 @@ void __kvm_at_s1e2(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
}
/* We failed the translation, let's replay it in slow motion */
- if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par))
- par = handle_at_slow(vcpu, op, vaddr);
+ if ((par & SYS_PAR_EL1_F) && !par_check_s1_perm_fault(par)) {
+ ret = handle_at_slow(vcpu, op, vaddr, &par);
+ if (ret)
+ return ret;
+ }
vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+ return 0;
}
-void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
+int __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
{
struct kvm_s2_trans out = {};
u64 ipa, par;
@@ -1509,13 +1570,13 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
break;
default:
WARN_ON_ONCE(1);
- return;
+ return 0;
}
__kvm_at_s1e01(vcpu, op, vaddr);
par = vcpu_read_sys_reg(vcpu, PAR_EL1);
if (par & SYS_PAR_EL1_F)
- return;
+ return 0;
/*
* If we only have a single stage of translation (EL2&0), exit
@@ -1523,14 +1584,14 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
*/
if (compute_translation_regime(vcpu, op) == TR_EL20 ||
!(vcpu_read_sys_reg(vcpu, HCR_EL2) & (HCR_VM | HCR_DC)))
- return;
+ return 0;
/* Do the stage-2 translation */
ipa = (par & GENMASK_ULL(47, 12)) | (vaddr & GENMASK_ULL(11, 0));
out.esr = 0;
ret = kvm_walk_nested_s2(vcpu, ipa, &out);
if (ret < 0)
- return;
+ return ret;
/* Check the access permission */
if (!out.esr &&
@@ -1539,6 +1600,7 @@ void __kvm_at_s12(struct kvm_vcpu *vcpu, u32 op, u64 vaddr)
par = compute_par_s12(vcpu, par, &out);
vcpu_write_sys_reg(vcpu, par, PAR_EL1);
+ return 0;
}
/*
@@ -1637,3 +1699,97 @@ int __kvm_find_s1_desc_level(struct kvm_vcpu *vcpu, u64 va, u64 ipa, int *level)
return ret;
}
}
+
+#ifdef CONFIG_ARM64_LSE_ATOMICS
+static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
+{
+ u64 tmp = old;
+ int ret = 0;
+
+ uaccess_enable_privileged();
+
+ asm volatile(__LSE_PREAMBLE
+ "1: cas %[old], %[new], %[addr]\n"
+ "2:\n"
+ _ASM_EXTABLE_UACCESS_ERR(1b, 2b, %w[ret])
+ : [old] "+r" (old), [addr] "+Q" (*ptep), [ret] "+r" (ret)
+ : [new] "r" (new)
+ : "memory");
+
+ uaccess_disable_privileged();
+
+ if (ret)
+ return ret;
+ if (tmp != old)
+ return -EAGAIN;
+
+ return ret;
+}
+#else
+static int __lse_swap_desc(u64 __user *ptep, u64 old, u64 new)
+{
+ return -EINVAL;
+}
+#endif
+
+static int __llsc_swap_desc(u64 __user *ptep, u64 old, u64 new)
+{
+ int ret = 1;
+ u64 tmp;
+
+ uaccess_enable_privileged();
+
+ asm volatile("prfm pstl1strm, %[addr]\n"
+ "1: ldxr %[tmp], %[addr]\n"
+ "sub %[tmp], %[tmp], %[old]\n"
+ "cbnz %[tmp], 3f\n"
+ "2: stlxr %w[ret], %[new], %[addr]\n"
+ "3:\n"
+ _ASM_EXTABLE_UACCESS_ERR(1b, 3b, %w[ret])
+ _ASM_EXTABLE_UACCESS_ERR(2b, 3b, %w[ret])
+ : [ret] "+r" (ret), [addr] "+Q" (*ptep), [tmp] "=&r" (tmp)
+ : [old] "r" (old), [new] "r" (new)
+ : "memory");
+
+ uaccess_disable_privileged();
+
+ /* STLXR didn't update the descriptor, or the compare failed */
+ if (ret == 1)
+ return -EAGAIN;
+
+ return ret;
+}
+
+int __kvm_at_swap_desc(struct kvm *kvm, gpa_t ipa, u64 old, u64 new)
+{
+ struct kvm_memory_slot *slot;
+ unsigned long hva;
+ u64 __user *ptep;
+ bool writable;
+ int offset;
+ gfn_t gfn;
+ int r;
+
+ lockdep_assert(srcu_read_lock_held(&kvm->srcu));
+
+ gfn = ipa >> PAGE_SHIFT;
+ offset = offset_in_page(ipa);
+ slot = gfn_to_memslot(kvm, gfn);
+ hva = gfn_to_hva_memslot_prot(slot, gfn, &writable);
+ if (kvm_is_error_hva(hva))
+ return -EINVAL;
+ if (!writable)
+ return -EPERM;
+
+ ptep = (u64 __user *)hva + offset;
+ if (cpus_have_final_cap(ARM64_HAS_LSE_ATOMICS))
+ r = __lse_swap_desc(ptep, old, new);
+ else
+ r = __llsc_swap_desc(ptep, old, new);
+
+ if (r < 0)
+ return r;
+
+ mark_page_dirty_in_slot(kvm, slot, gfn);
+ return 0;
+}
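
The walk now updates descriptors in place (setting the Access Flag when hardware AF management is enabled) and treats a lost compare-and-swap as -EAGAIN so the whole walk restarts. A minimal userspace analog of that update step, using C11 atomics in place of the LSE CAS / LL-SC helpers above; DESC_AF mirrors PTE_AF:

#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define DESC_AF (UINT64_C(1) << 10)     /* stage-1 Access Flag, as PTE_AF */

static int set_access_flag(_Atomic uint64_t *descp, bool ha)
{
        uint64_t old = atomic_load(descp);
        uint64_t new = old;

        if (ha)
                new |= DESC_AF;

        if (new == old)
                return 0;               /* nothing to update */

        /* Someone raced with us on this descriptor: have the caller restart. */
        if (!atomic_compare_exchange_strong(descp, &old, new))
                return -EAGAIN;

        return 0;
}
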
diff --git a/arch/arm64/kvm/hyp/nvhe/ffa.c b/arch/arm64/kvm/hyp/nvhe/ffa.c
index 4e16f9b96f63..f731cc4c3f28 100644
--- a/arch/arm64/kvm/hyp/nvhe/ffa.c
+++ b/arch/arm64/kvm/hyp/nvhe/ffa.c
@@ -115,7 +115,7 @@ static void ffa_set_retval(struct kvm_cpu_context *ctxt,
*
* FFA-1.3 introduces 64-bit variants of the CPU cycle management
* interfaces. Moreover, FF-A 1.3 clarifies that SMC32 direct requests
- * complete with SMC32 direct reponses which *should* allow us use the
+ * complete with SMC32 direct responses which *should* allow us to use the
* function ID sent by the caller to determine whether to return x8-x17.
*
* Note that we also cannot rely on function IDs in the response.
@@ -479,7 +479,7 @@ static void __do_ffa_mem_xfer(const u64 func_id,
struct ffa_mem_region_attributes *ep_mem_access;
struct ffa_composite_mem_region *reg;
struct ffa_mem_region *buf;
- u32 offset, nr_ranges;
+ u32 offset, nr_ranges, checked_offset;
int ret = 0;
if (addr_mbz || npages_mbz || fraglen > len ||
@@ -516,7 +516,12 @@ static void __do_ffa_mem_xfer(const u64 func_id,
goto out_unlock;
}
- if (fraglen < offset + sizeof(struct ffa_composite_mem_region)) {
+ if (check_add_overflow(offset, sizeof(struct ffa_composite_mem_region), &checked_offset)) {
+ ret = FFA_RET_INVALID_PARAMETERS;
+ goto out_unlock;
+ }
+
+ if (fraglen < checked_offset) {
ret = FFA_RET_INVALID_PARAMETERS;
goto out_unlock;
}
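
The fragment-length validation now goes through check_add_overflow() so a large offset cannot wrap the u32 arithmetic. A small sketch of the same pattern with the compiler builtin that the kernel helper wraps:

#include <stdbool.h>
#include <stdint.h>

static bool frag_contains_composite(uint32_t fraglen, uint32_t offset,
                                    uint32_t composite_sz)
{
        uint32_t end;

        /* Reject the descriptor if offset + composite_sz overflows u32. */
        if (__builtin_add_overflow(offset, composite_sz, &end))
                return false;

        return fraglen >= end;
}
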
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 29430c031095..a7c689152f68 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -157,6 +157,7 @@ static void sync_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
host_vcpu->arch.iflags = hyp_vcpu->vcpu.arch.iflags;
host_cpu_if->vgic_hcr = hyp_cpu_if->vgic_hcr;
+ host_cpu_if->vgic_vmcr = hyp_cpu_if->vgic_vmcr;
for (i = 0; i < hyp_cpu_if->used_lrs; ++i)
host_cpu_if->vgic_lr[i] = hyp_cpu_if->vgic_lr[i];
}
@@ -464,11 +465,11 @@ static void handle___vgic_v3_init_lrs(struct kvm_cpu_context *host_ctxt)
__vgic_v3_init_lrs();
}
-static void handle___vgic_v3_save_vmcr_aprs(struct kvm_cpu_context *host_ctxt)
+static void handle___vgic_v3_save_aprs(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(struct vgic_v3_cpu_if *, cpu_if, host_ctxt, 1);
- __vgic_v3_save_vmcr_aprs(kern_hyp_va(cpu_if));
+ __vgic_v3_save_aprs(kern_hyp_va(cpu_if));
}
static void handle___vgic_v3_restore_vmcr_aprs(struct kvm_cpu_context *host_ctxt)
@@ -616,7 +617,7 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__kvm_tlb_flush_vmid_range),
HANDLE_FUNC(__kvm_flush_cpu_context),
HANDLE_FUNC(__kvm_timer_set_cntvoff),
- HANDLE_FUNC(__vgic_v3_save_vmcr_aprs),
+ HANDLE_FUNC(__vgic_v3_save_aprs),
HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs),
HANDLE_FUNC(__pkvm_reserve_vm),
HANDLE_FUNC(__pkvm_unreserve_vm),
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index ddc8beb55eee..49db32f3ddf7 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -367,6 +367,19 @@ static int host_stage2_unmap_dev_all(void)
return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr);
}
+/*
+ * Ensure the PFN range is contained within PA-range.
+ *
+ * This check is also robust to overflows and is therefore a requirement before
+ * using a pfn/nr_pages pair from an untrusted source.
+ */
+static bool pfn_range_is_valid(u64 pfn, u64 nr_pages)
+{
+ u64 limit = BIT(kvm_phys_shift(&host_mmu.arch.mmu) - PAGE_SHIFT);
+
+ return pfn < limit && ((limit - pfn) >= nr_pages);
+}
+
struct kvm_mem_range {
u64 start;
u64 end;
@@ -776,6 +789,9 @@ int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages)
void *virt = __hyp_va(phys);
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
hyp_lock_component();
@@ -804,6 +820,9 @@ int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages)
u64 virt = (u64)__hyp_va(phys);
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
hyp_lock_component();
@@ -887,6 +906,9 @@ int __pkvm_host_share_ffa(u64 pfn, u64 nr_pages)
u64 size = PAGE_SIZE * nr_pages;
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_OWNED);
if (!ret)
@@ -902,6 +924,9 @@ int __pkvm_host_unshare_ffa(u64 pfn, u64 nr_pages)
u64 size = PAGE_SIZE * nr_pages;
int ret;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
host_lock_component();
ret = __host_check_page_state_range(phys, size, PKVM_PAGE_SHARED_OWNED);
if (!ret)
@@ -945,6 +970,9 @@ int __pkvm_host_share_guest(u64 pfn, u64 gfn, u64 nr_pages, struct pkvm_hyp_vcpu
if (prot & ~KVM_PGTABLE_PROT_RWX)
return -EINVAL;
+ if (!pfn_range_is_valid(pfn, nr_pages))
+ return -EINVAL;
+
ret = __guest_check_transition_size(phys, ipa, nr_pages, &size);
if (ret)
return ret;
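
pfn_range_is_valid() deliberately compares 'limit - pfn' against nr_pages rather than computing 'pfn + nr_pages', which could wrap for untrusted inputs. A standalone sketch of that check, with an illustrative 40-bit PA / 4K-page limit:

#include <stdbool.h>
#include <stdint.h>

#define EXAMPLE_PFN_LIMIT (UINT64_C(1) << 28)   /* example: 40-bit PA, 4K pages */

static bool pfn_range_ok(uint64_t pfn, uint64_t nr_pages)
{
        uint64_t limit = EXAMPLE_PFN_LIMIT;

        /* 'pfn + nr_pages' might overflow; 'limit - pfn' cannot once pfn < limit. */
        return pfn < limit && (limit - pfn) >= nr_pages;
}
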
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index 43bde061b65d..8911338961c5 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -337,6 +337,9 @@ static void pkvm_init_features_from_host(struct pkvm_hyp_vm *hyp_vm, const struc
/* CTR_EL0 is always under host control, even for protected VMs. */
hyp_vm->kvm.arch.ctr_el0 = host_kvm->arch.ctr_el0;
+ /* Preserve the vgic model so that GICv3 emulation works */
+ hyp_vm->kvm.arch.vgic.vgic_model = host_kvm->arch.vgic.vgic_model;
+
if (test_bit(KVM_ARCH_FLAG_MTE_ENABLED, &host_kvm->arch.flags))
set_bit(KVM_ARCH_FLAG_MTE_ENABLED, &kvm->arch.flags);
diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
index 82da9b03692d..3108b5185c20 100644
--- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c
+++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c
@@ -444,6 +444,8 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Scalable Vector Registers are restricted. */
+ HOST_HANDLED(SYS_ICC_PMR_EL1),
+
RAZ_WI(SYS_ERRIDR_EL1),
RAZ_WI(SYS_ERRSELR_EL1),
RAZ_WI(SYS_ERXFR_EL1),
@@ -457,9 +459,12 @@ static const struct sys_reg_desc pvm_sys_reg_descs[] = {
/* Limited Ordering Regions Registers are restricted. */
+ HOST_HANDLED(SYS_ICC_DIR_EL1),
+ HOST_HANDLED(SYS_ICC_RPR_EL1),
HOST_HANDLED(SYS_ICC_SGI1R_EL1),
HOST_HANDLED(SYS_ICC_ASGI1R_EL1),
HOST_HANDLED(SYS_ICC_SGI0R_EL1),
+ HOST_HANDLED(SYS_ICC_CTLR_EL1),
{ SYS_DESC(SYS_ICC_SRE_EL1), .access = pvm_gic_read_sre, },
HOST_HANDLED(SYS_CCSIDR_EL1),
diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c
index c351b4abd5db..947ac1a951a5 100644
--- a/arch/arm64/kvm/hyp/pgtable.c
+++ b/arch/arm64/kvm/hyp/pgtable.c
@@ -661,11 +661,37 @@ void kvm_tlb_flush_vmid_range(struct kvm_s2_mmu *mmu,
#define KVM_S2_MEMATTR(pgt, attr) PAGE_S2_MEMATTR(attr, stage2_has_fwb(pgt))
+static int stage2_set_xn_attr(enum kvm_pgtable_prot prot, kvm_pte_t *attr)
+{
+ bool px, ux;
+ u8 xn;
+
+ px = prot & KVM_PGTABLE_PROT_PX;
+ ux = prot & KVM_PGTABLE_PROT_UX;
+
+ if (!cpus_have_final_cap(ARM64_HAS_XNX) && px != ux)
+ return -EINVAL;
+
+ if (px && ux)
+ xn = 0b00;
+ else if (!px && ux)
+ xn = 0b01;
+ else if (!px && !ux)
+ xn = 0b10;
+ else
+ xn = 0b11;
+
+ *attr &= ~KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ *attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_HI_S2_XN, xn);
+ return 0;
+}
+
static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot prot,
kvm_pte_t *ptep)
{
kvm_pte_t attr;
u32 sh = KVM_PTE_LEAF_ATTR_LO_S2_SH_IS;
+ int r;
switch (prot & (KVM_PGTABLE_PROT_DEVICE |
KVM_PGTABLE_PROT_NORMAL_NC)) {
@@ -685,8 +711,9 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p
attr = KVM_S2_MEMATTR(pgt, NORMAL);
}
- if (!(prot & KVM_PGTABLE_PROT_X))
- attr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ r = stage2_set_xn_attr(prot, &attr);
+ if (r)
+ return r;
if (prot & KVM_PGTABLE_PROT_R)
attr |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
@@ -715,8 +742,20 @@ enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte)
prot |= KVM_PGTABLE_PROT_R;
if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W)
prot |= KVM_PGTABLE_PROT_W;
- if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN))
- prot |= KVM_PGTABLE_PROT_X;
+
+ switch (FIELD_GET(KVM_PTE_LEAF_ATTR_HI_S2_XN, pte)) {
+ case 0b00:
+ prot |= KVM_PGTABLE_PROT_PX | KVM_PGTABLE_PROT_UX;
+ break;
+ case 0b01:
+ prot |= KVM_PGTABLE_PROT_UX;
+ break;
+ case 0b11:
+ prot |= KVM_PGTABLE_PROT_PX;
+ break;
+ default:
+ break;
+ }
return prot;
}
@@ -1290,9 +1329,9 @@ bool kvm_pgtable_stage2_test_clear_young(struct kvm_pgtable *pgt, u64 addr,
int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
enum kvm_pgtable_prot prot, enum kvm_pgtable_walk_flags flags)
{
- int ret;
+ kvm_pte_t xn = 0, set = 0, clr = 0;
s8 level;
- kvm_pte_t set = 0, clr = 0;
+ int ret;
if (prot & KVM_PTE_LEAF_ATTR_HI_SW)
return -EINVAL;
@@ -1303,8 +1342,12 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr,
if (prot & KVM_PGTABLE_PROT_W)
set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
- if (prot & KVM_PGTABLE_PROT_X)
- clr |= KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ ret = stage2_set_xn_attr(prot, &xn);
+ if (ret)
+ return ret;
+
+ set |= xn & KVM_PTE_LEAF_ATTR_HI_S2_XN;
+ clr |= ~xn & KVM_PTE_LEAF_ATTR_HI_S2_XN;
ret = stage2_update_leaf_attrs(pgt, addr, 1, set, clr, NULL, &level, flags);
if (!ret || ret == -EAGAIN)
@@ -1535,37 +1578,80 @@ size_t kvm_pgtable_stage2_pgd_size(u64 vtcr)
return kvm_pgd_pages(ia_bits, start_level) * PAGE_SIZE;
}
-static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
- enum kvm_pgtable_walk_flags visit)
+static int stage2_free_leaf(const struct kvm_pgtable_visit_ctx *ctx)
{
struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
- if (!stage2_pte_is_counted(ctx->old))
+ mm_ops->put_page(ctx->ptep);
+ return 0;
+}
+
+static int stage2_free_table_post(const struct kvm_pgtable_visit_ctx *ctx)
+{
+ struct kvm_pgtable_mm_ops *mm_ops = ctx->mm_ops;
+ kvm_pte_t *childp = kvm_pte_follow(ctx->old, mm_ops);
+
+ if (mm_ops->page_count(childp) != 1)
return 0;
+ /*
+ * Drop references and clear the now stale PTE to avoid rewalking the
+ * freed page table.
+ */
mm_ops->put_page(ctx->ptep);
+ mm_ops->put_page(childp);
+ kvm_clear_pte(ctx->ptep);
+ return 0;
+}
- if (kvm_pte_table(ctx->old, ctx->level))
- mm_ops->put_page(kvm_pte_follow(ctx->old, mm_ops));
+static int stage2_free_walker(const struct kvm_pgtable_visit_ctx *ctx,
+ enum kvm_pgtable_walk_flags visit)
+{
+ if (!stage2_pte_is_counted(ctx->old))
+ return 0;
- return 0;
+ switch (visit) {
+ case KVM_PGTABLE_WALK_LEAF:
+ return stage2_free_leaf(ctx);
+ case KVM_PGTABLE_WALK_TABLE_POST:
+ return stage2_free_table_post(ctx);
+ default:
+ return -EINVAL;
+ }
}
-void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+void kvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+ u64 addr, u64 size)
{
- size_t pgd_sz;
struct kvm_pgtable_walker walker = {
.cb = stage2_free_walker,
.flags = KVM_PGTABLE_WALK_LEAF |
KVM_PGTABLE_WALK_TABLE_POST,
};
- WARN_ON(kvm_pgtable_walk(pgt, 0, BIT(pgt->ia_bits), &walker));
+ WARN_ON(kvm_pgtable_walk(pgt, addr, size, &walker));
+}
+
+void kvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
+{
+ size_t pgd_sz;
+
pgd_sz = kvm_pgd_pages(pgt->ia_bits, pgt->start_level) * PAGE_SIZE;
- pgt->mm_ops->free_pages_exact(kvm_dereference_pteref(&walker, pgt->pgd), pgd_sz);
+
+ /*
+ * Since the pgtable is unlinked at this point, and not shared with
+ * other walkers, safely dereference pgd with kvm_dereference_pteref_raw()
+ */
+ pgt->mm_ops->free_pages_exact(kvm_dereference_pteref_raw(pgt->pgd), pgd_sz);
pgt->pgd = NULL;
}
+void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+{
+ kvm_pgtable_stage2_destroy_range(pgt, 0, BIT(pgt->ia_bits));
+ kvm_pgtable_stage2_destroy_pgd(pgt);
+}
+
void kvm_pgtable_stage2_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
{
kvm_pteref_t ptep = (kvm_pteref_t)pgtable;
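
With FEAT_XNX the stage-2 XN field is two bits wide and encodes four execute-permission combinations; the hunks above map KVM_PGTABLE_PROT_PX/UX onto it in both directions. A compact sketch of that encode/decode, with the permission bits modelled as plain flags:

#include <stdbool.h>
#include <stdint.h>

enum { PROT_PX = 1u << 0, PROT_UX = 1u << 1 };

static int encode_s2_xn(unsigned int prot, uint8_t *xn, bool has_xnx)
{
        bool px = prot & PROT_PX, ux = prot & PROT_UX;

        /* Without FEAT_XNX only the symmetric encodings (0b00/0b10) exist. */
        if (!has_xnx && px != ux)
                return -1;

        if (px && ux)
                *xn = 0b00;     /* executable at EL1 and EL0 */
        else if (ux)
                *xn = 0b01;     /* EL0-only execution (PXN) */
        else if (px)
                *xn = 0b11;     /* EL1-only execution (UXN) */
        else
                *xn = 0b10;     /* no execution at either EL */
        return 0;
}

static unsigned int decode_s2_xn(uint8_t xn)
{
        switch (xn) {
        case 0b00: return PROT_PX | PROT_UX;
        case 0b01: return PROT_UX;
        case 0b11: return PROT_PX;
        default:   return 0;    /* 0b10: execute-never at both ELs */
        }
}
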
diff --git a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
index 78579b31a420..5fd99763b54d 100644
--- a/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
+++ b/arch/arm64/kvm/hyp/vgic-v2-cpuif-proxy.c
@@ -63,6 +63,10 @@ int __vgic_v2_perform_cpuif_access(struct kvm_vcpu *vcpu)
return -1;
}
+ /* Handle deactivation as a normal exit */
+ if ((fault_ipa - vgic->vgic_cpu_base) >= GIC_CPU_DEACTIVATE)
+ return 0;
+
rd = kvm_vcpu_dabt_get_rd(vcpu);
addr = kvm_vgic_global_state.vcpu_hyp_va;
addr += fault_ipa - vgic->vgic_cpu_base;
diff --git a/arch/arm64/kvm/hyp/vgic-v3-sr.c b/arch/arm64/kvm/hyp/vgic-v3-sr.c
index acd909b7f225..0b670a033fd8 100644
--- a/arch/arm64/kvm/hyp/vgic-v3-sr.c
+++ b/arch/arm64/kvm/hyp/vgic-v3-sr.c
@@ -14,6 +14,8 @@
#include <asm/kvm_hyp.h>
#include <asm/kvm_mmu.h>
+#include "../../vgic/vgic.h"
+
#define vtr_to_max_lr_idx(v) ((v) & 0xf)
#define vtr_to_nr_pre_bits(v) ((((u32)(v) >> 26) & 7) + 1)
#define vtr_to_nr_apr_regs(v) (1 << (vtr_to_nr_pre_bits(v) - 5))
@@ -58,7 +60,7 @@ u64 __gic_v3_get_lr(unsigned int lr)
unreachable();
}
-static void __gic_v3_set_lr(u64 val, int lr)
+void __gic_v3_set_lr(u64 val, int lr)
{
switch (lr & 0xf) {
case 0:
@@ -196,6 +198,11 @@ static u32 __vgic_v3_read_ap1rn(int n)
return val;
}
+static u64 compute_ich_hcr(struct vgic_v3_cpu_if *cpu_if)
+{
+ return cpu_if->vgic_hcr | vgic_ich_hcr_trap_bits();
+}
+
void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
{
u64 used_lrs = cpu_if->used_lrs;
@@ -212,14 +219,12 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
}
}
- if (used_lrs || cpu_if->its_vpe.its_vm) {
+ if (used_lrs) {
int i;
u32 elrsr;
elrsr = read_gicreg(ICH_ELRSR_EL2);
- write_gicreg(cpu_if->vgic_hcr & ~ICH_HCR_EL2_En, ICH_HCR_EL2);
-
for (i = 0; i < used_lrs; i++) {
if (elrsr & (1 << i))
cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
@@ -229,6 +234,23 @@ void __vgic_v3_save_state(struct vgic_v3_cpu_if *cpu_if)
__gic_v3_set_lr(0, i);
}
}
+
+ cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
+
+ if (cpu_if->vgic_hcr & ICH_HCR_EL2_LRENPIE) {
+ u64 val = read_gicreg(ICH_HCR_EL2);
+ cpu_if->vgic_hcr &= ~ICH_HCR_EL2_EOIcount;
+ cpu_if->vgic_hcr |= val & ICH_HCR_EL2_EOIcount;
+ }
+
+ write_gicreg(0, ICH_HCR_EL2);
+
+ /*
+ * Hack alert: On NV, this results in a trap so that the above write
+ * actually takes effect... No synchronisation is necessary, as we
+ * only care about the effects when this traps.
+ */
+ read_gicreg(ICH_MISR_EL2);
}
void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if)
@@ -236,12 +258,10 @@ void __vgic_v3_restore_state(struct vgic_v3_cpu_if *cpu_if)
u64 used_lrs = cpu_if->used_lrs;
int i;
- if (used_lrs || cpu_if->its_vpe.its_vm) {
- write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+ write_gicreg(compute_ich_hcr(cpu_if), ICH_HCR_EL2);
- for (i = 0; i < used_lrs; i++)
- __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
- }
+ for (i = 0; i < used_lrs; i++)
+ __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
/*
* Ensure that writes to the LRs, and on non-VHE systems ensure that
@@ -307,24 +327,20 @@ void __vgic_v3_activate_traps(struct vgic_v3_cpu_if *cpu_if)
}
/*
- * If we need to trap system registers, we must write
- * ICH_HCR_EL2 anyway, even if no interrupts are being
- * injected. Note that this also applies if we don't expect
- * any system register access (no vgic at all).
+ * If we need to trap system registers, we must write ICH_HCR_EL2
+ * anyway, even if no interrupts are being injected. Note that this
+ * also applies if we don't expect any system register access (no
+ * vgic at all). In any case, no need to provide MI configuration.
*/
if (static_branch_unlikely(&vgic_v3_cpuif_trap) ||
cpu_if->its_vpe.its_vm || !cpu_if->vgic_sre)
- write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+ write_gicreg(vgic_ich_hcr_trap_bits() | ICH_HCR_EL2_En, ICH_HCR_EL2);
}
void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
{
u64 val;
- if (!cpu_if->vgic_sre) {
- cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2);
- }
-
/* Only restore SRE if the host implements the GICv2 interface */
if (static_branch_unlikely(&vgic_v3_has_v2_compat)) {
val = read_gicreg(ICC_SRE_EL2);
@@ -346,7 +362,7 @@ void __vgic_v3_deactivate_traps(struct vgic_v3_cpu_if *cpu_if)
write_gicreg(0, ICH_HCR_EL2);
}
-static void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
+void __vgic_v3_save_aprs(struct vgic_v3_cpu_if *cpu_if)
{
u64 val;
u32 nr_pre_bits;
@@ -507,13 +523,6 @@ static void __vgic_v3_write_vmcr(u32 vmcr)
write_gicreg(vmcr, ICH_VMCR_EL2);
}
-void __vgic_v3_save_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
-{
- __vgic_v3_save_aprs(cpu_if);
- if (cpu_if->vgic_sre)
- cpu_if->vgic_vmcr = __vgic_v3_read_vmcr();
-}
-
void __vgic_v3_restore_vmcr_aprs(struct vgic_v3_cpu_if *cpu_if)
{
__vgic_v3_compat_mode_enable();
@@ -790,7 +799,7 @@ static void __vgic_v3_bump_eoicount(void)
write_gicreg(hcr, ICH_HCR_EL2);
}
-static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+static int ___vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
{
u32 vid = vcpu_get_reg(vcpu, rt);
u64 lr_val;
@@ -798,19 +807,25 @@ static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
/* EOImode == 0, nothing to be done here */
if (!(vmcr & ICH_VMCR_EOIM_MASK))
- return;
+ return 1;
/* No deactivate to be performed on an LPI */
if (vid >= VGIC_MIN_LPI)
- return;
+ return 1;
lr = __vgic_v3_find_active_lr(vcpu, vid, &lr_val);
- if (lr == -1) {
- __vgic_v3_bump_eoicount();
- return;
+ if (lr != -1) {
+ __vgic_v3_clear_active_lr(lr, lr_val);
+ return 1;
}
- __vgic_v3_clear_active_lr(lr, lr_val);
+ return 0;
+}
+
+static void __vgic_v3_write_dir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
+{
+ if (!___vgic_v3_write_dir(vcpu, vmcr, rt))
+ __vgic_v3_bump_eoicount();
}
static void __vgic_v3_write_eoir(struct kvm_vcpu *vcpu, u32 vmcr, int rt)
@@ -1245,6 +1260,21 @@ int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu)
case SYS_ICC_DIR_EL1:
if (unlikely(is_read))
return 0;
+ /*
+ * Full exit if required to handle overflow deactivation,
+ * unless we can emulate it in the LRs (likely the majority
+ * of the cases).
+ */
+ if (vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr & ICH_HCR_EL2_TDIR) {
+ int ret;
+
+ ret = ___vgic_v3_write_dir(vcpu, __vgic_v3_read_vmcr(),
+ kvm_vcpu_sys_get_rt(vcpu));
+ if (ret)
+ __kvm_skip_instr(vcpu);
+
+ return ret;
+ }
fn = __vgic_v3_write_dir;
break;
case SYS_ICC_RPR_EL1:
diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
index 7cc964af8d30..48d7c372a4cd 100644
--- a/arch/arm64/kvm/mmu.c
+++ b/arch/arm64/kvm/mmu.c
@@ -904,6 +904,38 @@ static int kvm_init_ipa_range(struct kvm_s2_mmu *mmu, unsigned long type)
return 0;
}
+/*
+ * Assume that @pgt is valid and unlinked from the KVM MMU to free the
+ * page-table without taking the kvm_mmu_lock and without performing any
+ * TLB invalidations.
+ *
+ * Also, the range of addresses can be large enough to cause need_resched
+ * warnings, for instance on CONFIG_PREEMPT_NONE kernels. Hence, invoke
+ * cond_resched() periodically to prevent hogging the CPU for a long time
+ * and schedule something else, if required.
+ */
+static void stage2_destroy_range(struct kvm_pgtable *pgt, phys_addr_t addr,
+ phys_addr_t end)
+{
+ u64 next;
+
+ do {
+ next = stage2_range_addr_end(addr, end);
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy_range)(pgt, addr,
+ next - addr);
+ if (next != end)
+ cond_resched();
+ } while (addr = next, addr != end);
+}
+
+static void kvm_stage2_destroy(struct kvm_pgtable *pgt)
+{
+ unsigned int ia_bits = VTCR_EL2_IPA(pgt->mmu->vtcr);
+
+ stage2_destroy_range(pgt, 0, BIT(ia_bits));
+ KVM_PGT_FN(kvm_pgtable_stage2_destroy_pgd)(pgt);
+}
+
/**
* kvm_init_stage2_mmu - Initialise a S2 MMU structure
* @kvm: The pointer to the KVM structure
@@ -980,7 +1012,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
return 0;
out_destroy_pgtable:
- KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
+ kvm_stage2_destroy(pgt);
out_free_pgtable:
kfree(pgt);
return err;
@@ -1081,7 +1113,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
write_unlock(&kvm->mmu_lock);
if (pgt) {
- KVM_PGT_FN(kvm_pgtable_stage2_destroy)(pgt);
+ kvm_stage2_destroy(pgt);
kfree(pgt);
}
}
@@ -1521,6 +1553,16 @@ static void adjust_nested_fault_perms(struct kvm_s2_trans *nested,
*prot |= kvm_encode_nested_level(nested);
}
+static void adjust_nested_exec_perms(struct kvm *kvm,
+ struct kvm_s2_trans *nested,
+ enum kvm_pgtable_prot *prot)
+{
+ if (!kvm_s2_trans_exec_el0(kvm, nested))
+ *prot &= ~KVM_PGTABLE_PROT_UX;
+ if (!kvm_s2_trans_exec_el1(kvm, nested))
+ *prot &= ~KVM_PGTABLE_PROT_PX;
+}
+
#define KVM_PGTABLE_WALK_MEMABORT_FLAGS (KVM_PGTABLE_WALK_HANDLE_FAULT | KVM_PGTABLE_WALK_SHARED)
static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
@@ -1572,11 +1614,12 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
if (writable)
prot |= KVM_PGTABLE_PROT_W;
- if (exec_fault ||
- (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
- (!nested || kvm_s2_trans_executable(nested))))
+ if (exec_fault || cpus_have_final_cap(ARM64_HAS_CACHE_DIC))
prot |= KVM_PGTABLE_PROT_X;
+ if (nested)
+ adjust_nested_exec_perms(kvm, nested, &prot);
+
kvm_fault_lock(kvm);
if (mmu_invalidate_retry(kvm, mmu_seq)) {
ret = -EAGAIN;
@@ -1755,7 +1798,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
/*
* Check if this is non-struct page memory PFN, and cannot support
- * CMOs. It could potentially be unsafe to access as cachable.
+ * CMOs. It could potentially be unsafe to access as cacheable.
*/
if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) {
if (is_vma_cacheable) {
@@ -1851,11 +1894,13 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
prot |= KVM_PGTABLE_PROT_NORMAL_NC;
else
prot |= KVM_PGTABLE_PROT_DEVICE;
- } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC) &&
- (!nested || kvm_s2_trans_executable(nested))) {
+ } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
prot |= KVM_PGTABLE_PROT_X;
}
+ if (nested)
+ adjust_nested_exec_perms(kvm, nested, &prot);
+
/*
* Under the premise of getting a FSC_PERM fault, we just need to relax
* permissions only if vma_pagesize equals fault_granule. Otherwise,
@@ -1899,8 +1944,48 @@ static void handle_access_fault(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa)
read_unlock(&vcpu->kvm->mmu_lock);
}
+/*
+ * Returns true if the SEA should be handled locally within KVM, i.e. when the
+ * abort is caused by a kernel memory allocation (e.g. stage-2 table memory).
+ */
+static bool host_owns_sea(struct kvm_vcpu *vcpu, u64 esr)
+{
+ /*
+ * Without FEAT_RAS HCR_EL2.TEA is RES0, meaning any external abort
+ * taken from a guest EL to EL2 is due to a host-imposed access (e.g.
+ * stage-2 PTW).
+ */
+ if (!cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
+ return true;
+
+ /* KVM owns the VNCR when the vCPU isn't in a nested context. */
+ if (is_hyp_ctxt(vcpu) && !kvm_vcpu_trap_is_iabt(vcpu) && (esr & ESR_ELx_VNCR))
+ return true;
+
+ /*
+ * Determining if an external abort during a table walk happened at
+ * stage-2 is only possible when S1PTW is set. Otherwise, since KVM
+ * sets HCR_EL2.TEA, SEAs due to a stage-1 walk (i.e. accessing the
+ * PA of the stage-1 descriptor) can reach here and are reported
+ * with a TTW ESR value.
+ */
+ return (esr_fsc_is_sea_ttw(esr) && (esr & ESR_ELx_S1PTW));
+}
+
int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_run *run = vcpu->run;
+ u64 esr = kvm_vcpu_get_esr(vcpu);
+ u64 esr_mask = ESR_ELx_EC_MASK |
+ ESR_ELx_IL |
+ ESR_ELx_FnV |
+ ESR_ELx_EA |
+ ESR_ELx_CM |
+ ESR_ELx_WNR |
+ ESR_ELx_FSC;
+ u64 ipa;
+
/*
* Give APEI the opportunity to claim the abort before handling it
* within KVM. apei_claim_sea() expects to be called with IRQs enabled.
@@ -1909,7 +1994,33 @@ int kvm_handle_guest_sea(struct kvm_vcpu *vcpu)
if (apei_claim_sea(NULL) == 0)
return 1;
- return kvm_inject_serror(vcpu);
+ if (host_owns_sea(vcpu, esr) ||
+ !test_bit(KVM_ARCH_FLAG_EXIT_SEA, &vcpu->kvm->arch.flags))
+ return kvm_inject_serror(vcpu);
+
+ /* ESR_ELx.SET is RES0 when FEAT_RAS isn't implemented. */
+ if (kvm_has_ras(kvm))
+ esr_mask |= ESR_ELx_SET_MASK;
+
+ /*
+ * Exit to userspace, and provide faulting guest virtual and physical
+ * addresses in case userspace wants to emulate SEA to guest by
+ * writing to FAR_ELx and HPFAR_ELx registers.
+ */
+ memset(&run->arm_sea, 0, sizeof(run->arm_sea));
+ run->exit_reason = KVM_EXIT_ARM_SEA;
+ run->arm_sea.esr = esr & esr_mask;
+
+ if (!(esr & ESR_ELx_FnV))
+ run->arm_sea.gva = kvm_vcpu_get_hfar(vcpu);
+
+ ipa = kvm_vcpu_get_fault_ipa(vcpu);
+ if (ipa != INVALID_GPA) {
+ run->arm_sea.flags |= KVM_EXIT_ARM_SEA_FLAG_GPA_VALID;
+ run->arm_sea.gpa = ipa;
+ }
+
+ return 0;
}
/**
@@ -1999,6 +2110,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
u32 esr;
ret = kvm_walk_nested_s2(vcpu, fault_ipa, &nested_trans);
+ if (ret == -EAGAIN) {
+ ret = 1;
+ goto out_unlock;
+ }
+
if (ret) {
esr = kvm_s2_trans_esr(&nested_trans);
kvm_inject_s2_fault(vcpu, esr);
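
stage2_destroy_range() above tears the table down in bounded chunks and calls cond_resched() between them so large address spaces don't trigger need_resched warnings. A userspace-shaped sketch of the loop, with sched_yield() standing in for cond_resched() and an illustrative chunk size:

#include <sched.h>
#include <stdint.h>

#define CHUNK_SIZE (UINT64_C(1) << 30)  /* illustrative: 1GiB per pass */

static void destroy_in_chunks(uint64_t addr, uint64_t end,
                              void (*destroy_cb)(uint64_t addr, uint64_t size))
{
        uint64_t next;

        do {
                /* Clamp to 'end' without risking overflow of addr + CHUNK_SIZE. */
                next = (end - addr > CHUNK_SIZE) ? addr + CHUNK_SIZE : end;
                destroy_cb(addr, next - addr);

                /* Let something else run before tackling the next chunk. */
                if (next != end)
                        sched_yield();
        } while (addr = next, addr != end);
}
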
diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
index f04cda40545b..cdeeb8f09e72 100644
--- a/arch/arm64/kvm/nested.c
+++ b/arch/arm64/kvm/nested.c
@@ -85,7 +85,7 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
/*
* Let's treat memory allocation failures as benign: If we fail to
* allocate anything, return an error and keep the allocated array
- * alive. Userspace may try to recover by intializing the vcpu
+ * alive. Userspace may try to recover by initializing the vcpu
* again, and there is no reason to affect the whole VM for this.
*/
num_mmus = atomic_read(&kvm->online_vcpus) * S2_MMU_PER_VCPU;
@@ -124,14 +124,13 @@ int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu)
}
struct s2_walk_info {
- int (*read_desc)(phys_addr_t pa, u64 *desc, void *data);
- void *data;
- u64 baddr;
- unsigned int max_oa_bits;
- unsigned int pgshift;
- unsigned int sl;
- unsigned int t0sz;
- bool be;
+ u64 baddr;
+ unsigned int max_oa_bits;
+ unsigned int pgshift;
+ unsigned int sl;
+ unsigned int t0sz;
+ bool be;
+ bool ha;
};
static u32 compute_fsc(int level, u32 fsc)
@@ -199,6 +198,42 @@ static int check_output_size(struct s2_walk_info *wi, phys_addr_t output)
return 0;
}
+static int read_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 *desc,
+ struct s2_walk_info *wi)
+{
+ u64 val;
+ int r;
+
+ r = kvm_read_guest(vcpu->kvm, pa, &val, sizeof(val));
+ if (r)
+ return r;
+
+ /*
+ * Handle reverse descriptors if endianness differs between the
+ * host and the guest hypervisor.
+ */
+ if (wi->be)
+ *desc = be64_to_cpu((__force __be64)val);
+ else
+ *desc = le64_to_cpu((__force __le64)val);
+
+ return 0;
+}
+
+static int swap_guest_s2_desc(struct kvm_vcpu *vcpu, phys_addr_t pa, u64 old, u64 new,
+ struct s2_walk_info *wi)
+{
+ if (wi->be) {
+ old = (__force u64)cpu_to_be64(old);
+ new = (__force u64)cpu_to_be64(new);
+ } else {
+ old = (__force u64)cpu_to_le64(old);
+ new = (__force u64)cpu_to_le64(new);
+ }
+
+ return __kvm_at_swap_desc(vcpu->kvm, pa, old, new);
+}
+
/*
* This is essentially a C-version of the pseudo code from the ARM ARM
* AArch64.TranslationTableWalk function. I strongly recommend looking at
@@ -206,13 +241,13 @@ static int check_output_size(struct s2_walk_info *wi, phys_addr_t output)
*
* Must be called with the kvm->srcu read lock held
*/
-static int walk_nested_s2_pgd(phys_addr_t ipa,
+static int walk_nested_s2_pgd(struct kvm_vcpu *vcpu, phys_addr_t ipa,
struct s2_walk_info *wi, struct kvm_s2_trans *out)
{
int first_block_level, level, stride, input_size, base_lower_bound;
phys_addr_t base_addr;
unsigned int addr_top, addr_bottom;
- u64 desc; /* page table entry */
+ u64 desc, new_desc; /* page table entry */
int ret;
phys_addr_t paddr;
@@ -257,28 +292,30 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
>> (addr_bottom - 3);
paddr = base_addr | index;
- ret = wi->read_desc(paddr, &desc, wi->data);
+ ret = read_guest_s2_desc(vcpu, paddr, &desc, wi);
if (ret < 0)
return ret;
- /*
- * Handle reversedescriptors if endianness differs between the
- * host and the guest hypervisor.
- */
- if (wi->be)
- desc = be64_to_cpu((__force __be64)desc);
- else
- desc = le64_to_cpu((__force __le64)desc);
+ new_desc = desc;
/* Check for valid descriptor at this point */
- if (!(desc & 1) || ((desc & 3) == 1 && level == 3)) {
+ if (!(desc & KVM_PTE_VALID)) {
out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
out->desc = desc;
return 1;
}
- /* We're at the final level or block translation level */
- if ((desc & 3) == 1 || level == 3)
+ if (FIELD_GET(KVM_PTE_TYPE, desc) == KVM_PTE_TYPE_BLOCK) {
+ if (level < 3)
+ break;
+
+ out->esr = compute_fsc(level, ESR_ELx_FSC_FAULT);
+ out->desc = desc;
+ return 1;
+ }
+
+ /* We're at the final level */
+ if (level == 3)
break;
if (check_output_size(wi, desc)) {
@@ -305,7 +342,18 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
return 1;
}
- if (!(desc & BIT(10))) {
+ if (wi->ha)
+ new_desc |= KVM_PTE_LEAF_ATTR_LO_S2_AF;
+
+ if (new_desc != desc) {
+ ret = swap_guest_s2_desc(vcpu, paddr, desc, new_desc, wi);
+ if (ret)
+ return ret;
+
+ desc = new_desc;
+ }
+
+ if (!(desc & KVM_PTE_LEAF_ATTR_LO_S2_AF)) {
out->esr = compute_fsc(level, ESR_ELx_FSC_ACCESS);
out->desc = desc;
return 1;
@@ -318,20 +366,13 @@ static int walk_nested_s2_pgd(phys_addr_t ipa,
(ipa & GENMASK_ULL(addr_bottom - 1, 0));
out->output = paddr;
out->block_size = 1UL << ((3 - level) * stride + wi->pgshift);
- out->readable = desc & (0b01 << 6);
- out->writable = desc & (0b10 << 6);
+ out->readable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R;
+ out->writable = desc & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W;
out->level = level;
out->desc = desc;
return 0;
}
-static int read_guest_s2_desc(phys_addr_t pa, u64 *desc, void *data)
-{
- struct kvm_vcpu *vcpu = data;
-
- return kvm_read_guest(vcpu->kvm, pa, desc, sizeof(*desc));
-}
-
static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi)
{
wi->t0sz = vtcr & TCR_EL2_T0SZ_MASK;
@@ -350,6 +391,8 @@ static void vtcr_to_walk_info(u64 vtcr, struct s2_walk_info *wi)
/* Global limit for now, should eventually be per-VM */
wi->max_oa_bits = min(get_kvm_ipa_limit(),
ps_to_output_size(FIELD_GET(VTCR_EL2_PS_MASK, vtcr), false));
+
+ wi->ha = vtcr & VTCR_EL2_HA;
}
int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
@@ -364,15 +407,13 @@ int kvm_walk_nested_s2(struct kvm_vcpu *vcpu, phys_addr_t gipa,
if (!vcpu_has_nv(vcpu))
return 0;
- wi.read_desc = read_guest_s2_desc;
- wi.data = vcpu;
wi.baddr = vcpu_read_sys_reg(vcpu, VTTBR_EL2);
vtcr_to_walk_info(vtcr, &wi);
wi.be = vcpu_read_sys_reg(vcpu, SCTLR_EL2) & SCTLR_ELx_EE;
- ret = walk_nested_s2_pgd(gipa, &wi, result);
+ ret = walk_nested_s2_pgd(vcpu, gipa, &wi, result);
if (ret)
result->esr |= (kvm_vcpu_get_esr(vcpu) & ~ESR_ELx_FSC);
@@ -788,7 +829,10 @@ int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu, struct kvm_s2_trans *trans)
return 0;
if (kvm_vcpu_trap_is_iabt(vcpu)) {
- forward_fault = !kvm_s2_trans_executable(trans);
+ if (vcpu_mode_priv(vcpu))
+ forward_fault = !kvm_s2_trans_exec_el1(vcpu->kvm, trans);
+ else
+ forward_fault = !kvm_s2_trans_exec_el0(vcpu->kvm, trans);
} else {
bool write_fault = kvm_is_write_fault(vcpu);
@@ -1555,12 +1599,13 @@ u64 limit_nv_id_reg(struct kvm *kvm, u32 reg, u64 val)
case SYS_ID_AA64MMFR1_EL1:
val &= ~(ID_AA64MMFR1_EL1_CMOW |
ID_AA64MMFR1_EL1_nTLBPA |
- ID_AA64MMFR1_EL1_ETS |
- ID_AA64MMFR1_EL1_XNX |
- ID_AA64MMFR1_EL1_HAFDBS);
+ ID_AA64MMFR1_EL1_ETS);
+
/* FEAT_E2H0 implies no VHE */
if (test_bit(KVM_ARM_VCPU_HAS_EL2_E2H0, kvm->arch.vcpu_features))
val &= ~ID_AA64MMFR1_EL1_VH;
+
+ val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64MMFR1_EL1, HAFDBS, AF);
break;
case SYS_ID_AA64MMFR2_EL1:
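
read_guest_s2_desc()/swap_guest_s2_desc() byte-swap descriptors when SCTLR_EL2.EE says the guest hypervisor's tables are big-endian. A small userspace sketch of the same conversion using the glibc endian helpers:

#include <endian.h>
#include <stdbool.h>
#include <stdint.h>

/* Convert a raw 8-byte descriptor read from guest memory to host order. */
static uint64_t desc_to_host(uint64_t raw, bool guest_big_endian)
{
        return guest_big_endian ? be64toh(raw) : le64toh(raw);
}

/* Convert a host-order descriptor back to the guest's in-memory layout. */
static uint64_t desc_to_guest(uint64_t val, bool guest_big_endian)
{
        return guest_big_endian ? htobe64(val) : htole64(val);
}
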
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 24f0f8a8c943..d7a0f69a9982 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -344,9 +344,16 @@ static int __pkvm_pgtable_stage2_unmap(struct kvm_pgtable *pgt, u64 start, u64 e
return 0;
}
-void pkvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt)
+void pkvm_pgtable_stage2_destroy_range(struct kvm_pgtable *pgt,
+ u64 addr, u64 size)
{
- __pkvm_pgtable_stage2_unmap(pgt, 0, ~(0ULL));
+ __pkvm_pgtable_stage2_unmap(pgt, addr, addr + size);
+}
+
+void pkvm_pgtable_stage2_destroy_pgd(struct kvm_pgtable *pgt)
+{
+ /* Expected to be called after all pKVM mappings have been released. */
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&pgt->pkvm_mappings.rb_root));
}
int pkvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
diff --git a/arch/arm64/kvm/ptdump.c b/arch/arm64/kvm/ptdump.c
index dc5acfb00af9..6cbe018fd6fd 100644
--- a/arch/arm64/kvm/ptdump.c
+++ b/arch/arm64/kvm/ptdump.c
@@ -31,27 +31,46 @@ static const struct ptdump_prot_bits stage2_pte_bits[] = {
.val = PTE_VALID,
.set = " ",
.clear = "F",
- }, {
+ },
+ {
.mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R,
.val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R,
.set = "R",
.clear = " ",
- }, {
+ },
+ {
.mask = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
.val = KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W,
.set = "W",
.clear = " ",
- }, {
+ },
+ {
.mask = KVM_PTE_LEAF_ATTR_HI_S2_XN,
- .val = KVM_PTE_LEAF_ATTR_HI_S2_XN,
- .set = "NX",
- .clear = "x ",
- }, {
+ .val = 0b00UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN),
+ .set = "px ux ",
+ },
+ {
+ .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN,
+ .val = 0b01UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN),
+ .set = "PXNux ",
+ },
+ {
+ .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN,
+ .val = 0b10UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN),
+ .set = "PXNUXN",
+ },
+ {
+ .mask = KVM_PTE_LEAF_ATTR_HI_S2_XN,
+ .val = 0b11UL << __bf_shf(KVM_PTE_LEAF_ATTR_HI_S2_XN),
+ .set = "px UXN",
+ },
+ {
.mask = KVM_PTE_LEAF_ATTR_LO_S2_AF,
.val = KVM_PTE_LEAF_ATTR_LO_S2_AF,
.set = "AF",
.clear = " ",
- }, {
+ },
+ {
.mask = PMD_TYPE_MASK,
.val = PMD_TYPE_SECT,
.set = "BLK",
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index e67eb39ddc11..c8fd7c6a12a1 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -666,6 +666,21 @@ static bool access_gic_sre(struct kvm_vcpu *vcpu,
return true;
}
+static bool access_gic_dir(struct kvm_vcpu *vcpu,
+ struct sys_reg_params *p,
+ const struct sys_reg_desc *r)
+{
+ if (!kvm_has_gicv3(vcpu->kvm))
+ return undef_access(vcpu, p, r);
+
+ if (!p->is_write)
+ return undef_access(vcpu, p, r);
+
+ vgic_v3_deactivate(vcpu, p->regval);
+
+ return true;
+}
+
static bool trap_raz_wi(struct kvm_vcpu *vcpu,
struct sys_reg_params *p,
const struct sys_reg_desc *r)
@@ -2595,19 +2610,23 @@ static bool bad_redir_trap(struct kvm_vcpu *vcpu,
.val = 0, \
}
-/* sys_reg_desc initialiser for known cpufeature ID registers */
-#define AA32_ID_SANITISED(name) { \
- ID_DESC(name), \
- .visibility = aa32_id_visibility, \
- .val = 0, \
-}
-
/* sys_reg_desc initialiser for writable ID registers */
#define ID_WRITABLE(name, mask) { \
ID_DESC(name), \
.val = mask, \
}
+/*
+ * 32bit ID regs are fully writable when the guest is 32bit
+ * capable. Nothing in the KVM code should rely on 32bit features
+ * anyway, only 64bit, so let the VMM do its worst.
+ */
+#define AA32_ID_WRITABLE(name) { \
+ ID_DESC(name), \
+ .visibility = aa32_id_visibility, \
+ .val = GENMASK(31, 0), \
+}
+
/* sys_reg_desc initialiser for cpufeature ID registers that need filtering */
#define ID_FILTERED(sysreg, name, mask) { \
ID_DESC(sysreg), \
@@ -3128,40 +3147,39 @@ static const struct sys_reg_desc sys_reg_descs[] = {
/* AArch64 mappings of the AArch32 ID registers */
/* CRm=1 */
- AA32_ID_SANITISED(ID_PFR0_EL1),
- AA32_ID_SANITISED(ID_PFR1_EL1),
+ AA32_ID_WRITABLE(ID_PFR0_EL1),
+ AA32_ID_WRITABLE(ID_PFR1_EL1),
{ SYS_DESC(SYS_ID_DFR0_EL1),
.access = access_id_reg,
.get_user = get_id_reg,
.set_user = set_id_dfr0_el1,
.visibility = aa32_id_visibility,
.reset = read_sanitised_id_dfr0_el1,
- .val = ID_DFR0_EL1_PerfMon_MASK |
- ID_DFR0_EL1_CopDbg_MASK, },
+ .val = GENMASK(31, 0) },
ID_HIDDEN(ID_AFR0_EL1),
- AA32_ID_SANITISED(ID_MMFR0_EL1),
- AA32_ID_SANITISED(ID_MMFR1_EL1),
- AA32_ID_SANITISED(ID_MMFR2_EL1),
- AA32_ID_SANITISED(ID_MMFR3_EL1),
+ AA32_ID_WRITABLE(ID_MMFR0_EL1),
+ AA32_ID_WRITABLE(ID_MMFR1_EL1),
+ AA32_ID_WRITABLE(ID_MMFR2_EL1),
+ AA32_ID_WRITABLE(ID_MMFR3_EL1),
/* CRm=2 */
- AA32_ID_SANITISED(ID_ISAR0_EL1),
- AA32_ID_SANITISED(ID_ISAR1_EL1),
- AA32_ID_SANITISED(ID_ISAR2_EL1),
- AA32_ID_SANITISED(ID_ISAR3_EL1),
- AA32_ID_SANITISED(ID_ISAR4_EL1),
- AA32_ID_SANITISED(ID_ISAR5_EL1),
- AA32_ID_SANITISED(ID_MMFR4_EL1),
- AA32_ID_SANITISED(ID_ISAR6_EL1),
+ AA32_ID_WRITABLE(ID_ISAR0_EL1),
+ AA32_ID_WRITABLE(ID_ISAR1_EL1),
+ AA32_ID_WRITABLE(ID_ISAR2_EL1),
+ AA32_ID_WRITABLE(ID_ISAR3_EL1),
+ AA32_ID_WRITABLE(ID_ISAR4_EL1),
+ AA32_ID_WRITABLE(ID_ISAR5_EL1),
+ AA32_ID_WRITABLE(ID_MMFR4_EL1),
+ AA32_ID_WRITABLE(ID_ISAR6_EL1),
/* CRm=3 */
- AA32_ID_SANITISED(MVFR0_EL1),
- AA32_ID_SANITISED(MVFR1_EL1),
- AA32_ID_SANITISED(MVFR2_EL1),
+ AA32_ID_WRITABLE(MVFR0_EL1),
+ AA32_ID_WRITABLE(MVFR1_EL1),
+ AA32_ID_WRITABLE(MVFR2_EL1),
ID_UNALLOCATED(3,3),
- AA32_ID_SANITISED(ID_PFR2_EL1),
+ AA32_ID_WRITABLE(ID_PFR2_EL1),
ID_HIDDEN(ID_DFR1_EL1),
- AA32_ID_SANITISED(ID_MMFR5_EL1),
+ AA32_ID_WRITABLE(ID_MMFR5_EL1),
ID_UNALLOCATED(3,7),
/* AArch64 ID registers */
@@ -3370,7 +3388,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
{ SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
{ SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
{ SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
- { SYS_DESC(SYS_ICC_DIR_EL1), undef_access },
+ { SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir },
{ SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
{ SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi },
{ SYS_DESC(SYS_ICC_ASGI1R_EL1), access_gic_sgi },
@@ -3767,7 +3785,8 @@ static bool handle_at_s1e01(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
- __kvm_at_s1e01(vcpu, op, p->regval);
+ if (__kvm_at_s1e01(vcpu, op, p->regval))
+ return false;
return true;
}
@@ -3784,7 +3803,8 @@ static bool handle_at_s1e2(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
return false;
}
- __kvm_at_s1e2(vcpu, op, p->regval);
+ if (__kvm_at_s1e2(vcpu, op, p->regval))
+ return false;
return true;
}
@@ -3794,7 +3814,8 @@ static bool handle_at_s12(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
{
u32 op = sys_insn(p->Op0, p->Op1, p->CRn, p->CRm, p->Op2);
- __kvm_at_s12(vcpu, op, p->regval);
+ if (__kvm_at_s12(vcpu, op, p->regval))
+ return false;
return true;
}
@@ -4495,7 +4516,7 @@ static const struct sys_reg_desc cp15_regs[] = {
{ CP15_SYS_DESC(SYS_ICC_AP1R1_EL1), undef_access },
{ CP15_SYS_DESC(SYS_ICC_AP1R2_EL1), undef_access },
{ CP15_SYS_DESC(SYS_ICC_AP1R3_EL1), undef_access },
- { CP15_SYS_DESC(SYS_ICC_DIR_EL1), undef_access },
+ { CP15_SYS_DESC(SYS_ICC_DIR_EL1), access_gic_dir },
{ CP15_SYS_DESC(SYS_ICC_RPR_EL1), undef_access },
{ CP15_SYS_DESC(SYS_ICC_IAR1_EL1), undef_access },
{ CP15_SYS_DESC(SYS_ICC_EOIR1_EL1), undef_access },
@@ -5606,11 +5627,17 @@ int kvm_finalize_sys_regs(struct kvm_vcpu *vcpu)
guard(mutex)(&kvm->arch.config_lock);
- if (!(static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif) &&
- irqchip_in_kernel(kvm) &&
- kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)) {
- kvm->arch.id_regs[IDREG_IDX(SYS_ID_AA64PFR0_EL1)] &= ~ID_AA64PFR0_EL1_GIC_MASK;
- kvm->arch.id_regs[IDREG_IDX(SYS_ID_PFR1_EL1)] &= ~ID_PFR1_EL1_GIC_MASK;
+ /*
+ * This hacks into the ID registers, so only perform it when the
+ * first vcpu runs, or the kvm_set_vm_id_reg() helper will scream.
+ */
+ if (!irqchip_in_kernel(kvm) && !kvm_vm_has_ran_once(kvm)) {
+ u64 val;
+
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val);
+ val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC;
+ kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val);
}
if (vcpu_has_nv(vcpu)) {
diff --git a/arch/arm64/kvm/vgic/vgic-debug.c b/arch/arm64/kvm/vgic/vgic-debug.c
index 4c1209261b65..bb92853d1fd3 100644
--- a/arch/arm64/kvm/vgic/vgic-debug.c
+++ b/arch/arm64/kvm/vgic/vgic-debug.c
@@ -64,29 +64,37 @@ static void iter_next(struct kvm *kvm, struct vgic_state_iter *iter)
static int iter_mark_lpis(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long intid, flags;
struct vgic_irq *irq;
- unsigned long intid;
int nr_lpis = 0;
+ xa_lock_irqsave(&dist->lpi_xa, flags);
+
xa_for_each(&dist->lpi_xa, intid, irq) {
if (!vgic_try_get_irq_ref(irq))
continue;
- xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+ __xa_set_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
nr_lpis++;
}
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
+
return nr_lpis;
}
static void iter_unmark_lpis(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long intid, flags;
struct vgic_irq *irq;
- unsigned long intid;
xa_for_each_marked(&dist->lpi_xa, intid, irq, LPI_XA_MARK_DEBUG_ITER) {
- xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
+ __xa_clear_mark(&dist->lpi_xa, intid, LPI_XA_MARK_DEBUG_ITER);
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
+
+ /* vgic_put_irq() expects to be called outside of the xa_lock */
vgic_put_irq(kvm, irq);
}
}
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 1796b1a22a72..dc9f9db31026 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -53,7 +53,7 @@ void kvm_vgic_early_init(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- xa_init(&dist->lpi_xa);
+ xa_init_flags(&dist->lpi_xa, XA_FLAGS_LOCK_IRQ);
}
/* CREATION */
@@ -71,6 +71,7 @@ static int vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type);
int kvm_vgic_create(struct kvm *kvm, u32 type)
{
struct kvm_vcpu *vcpu;
+ u64 aa64pfr0, pfr1;
unsigned long i;
int ret;
@@ -161,10 +162,19 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
- if (type == KVM_DEV_TYPE_ARM_VGIC_V2)
+ aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) & ~ID_AA64PFR0_EL1_GIC;
+ pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) & ~ID_PFR1_EL1_GIC;
+
+ if (type == KVM_DEV_TYPE_ARM_VGIC_V2) {
kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
- else
+ } else {
INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
+ aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1, GIC, IMP);
+ pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC, GICv3);
+ }
+
+ kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0);
+ kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1);
if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
kvm->arch.vgic.nassgicap = system_supports_direct_sgis();
@@ -188,6 +198,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
int i;
+ dist->active_spis = (atomic_t)ATOMIC_INIT(0);
dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL_ACCOUNT);
if (!dist->spis)
return -ENOMEM;
@@ -353,12 +364,12 @@ int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
return ret;
}
-static void kvm_vgic_vcpu_enable(struct kvm_vcpu *vcpu)
+static void kvm_vgic_vcpu_reset(struct kvm_vcpu *vcpu)
{
if (kvm_vgic_global_state.type == VGIC_V2)
- vgic_v2_enable(vcpu);
+ vgic_v2_reset(vcpu);
else
- vgic_v3_enable(vcpu);
+ vgic_v3_reset(vcpu);
}
/*
@@ -405,7 +416,7 @@ int vgic_init(struct kvm *kvm)
}
kvm_for_each_vcpu(idx, vcpu, kvm)
- kvm_vgic_vcpu_enable(vcpu);
+ kvm_vgic_vcpu_reset(vcpu);
ret = kvm_vgic_setup_default_irq_routing(kvm);
if (ret)
diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c
index ce3e3ed3f29f..3f1c4b10fed9 100644
--- a/arch/arm64/kvm/vgic/vgic-its.c
+++ b/arch/arm64/kvm/vgic/vgic-its.c
@@ -78,6 +78,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
{
struct vgic_dist *dist = &kvm->arch.vgic;
struct vgic_irq *irq = vgic_get_irq(kvm, intid), *oldirq;
+ unsigned long flags;
int ret;
/* In this case there is no put, since we keep the reference. */
@@ -88,7 +89,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
if (!irq)
return ERR_PTR(-ENOMEM);
- ret = xa_reserve(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
+ ret = xa_reserve_irq(&dist->lpi_xa, intid, GFP_KERNEL_ACCOUNT);
if (ret) {
kfree(irq);
return ERR_PTR(ret);
@@ -103,7 +104,7 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
irq->target_vcpu = vcpu;
irq->group = 1;
- xa_lock(&dist->lpi_xa);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
/*
* There could be a race with another vgic_add_lpi(), so we need to
@@ -114,21 +115,18 @@ static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid,
/* Someone was faster with adding this LPI, lets use that. */
kfree(irq);
irq = oldirq;
-
- goto out_unlock;
+ } else {
+ ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0));
}
- ret = xa_err(__xa_store(&dist->lpi_xa, intid, irq, 0));
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
+
if (ret) {
xa_release(&dist->lpi_xa, intid);
kfree(irq);
- }
-
-out_unlock:
- xa_unlock(&dist->lpi_xa);
- if (ret)
return ERR_PTR(ret);
+ }
/*
* We "cache" the configuration table entries in our struct vgic_irq's.
diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
index f25fccb1f8e6..406845b3117c 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c
@@ -359,6 +359,16 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu,
vgic_set_vmcr(vcpu, &vmcr);
}
+static void vgic_mmio_write_dir(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ if (kvm_vgic_global_state.type == VGIC_V2)
+ vgic_v2_deactivate(vcpu, val);
+ else
+ vgic_v3_deactivate(vcpu, val);
+}
+
static unsigned long vgic_mmio_read_apr(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len)
{
@@ -482,6 +492,10 @@ static const struct vgic_register_region vgic_v2_cpu_registers[] = {
REGISTER_DESC_WITH_LENGTH(GIC_CPU_IDENT,
vgic_mmio_read_vcpuif, vgic_mmio_write_vcpuif, 4,
VGIC_ACCESS_32bit),
+ REGISTER_DESC_WITH_LENGTH_UACCESS(GIC_CPU_DEACTIVATE,
+ vgic_mmio_read_raz, vgic_mmio_write_dir,
+ vgic_mmio_read_raz, vgic_mmio_uaccess_write_wi,
+ 4, VGIC_ACCESS_32bit),
};
unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
@@ -494,6 +508,16 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev)
return SZ_4K;
}
+unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev)
+{
+ dev->regions = vgic_v2_cpu_registers;
+ dev->nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers);
+
+ kvm_iodevice_init(&dev->dev, &kvm_io_gic_ops);
+
+ return KVM_VGIC_V2_CPU_SIZE;
+}
+
int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr)
{
const struct vgic_register_region *region;
diff --git a/arch/arm64/kvm/vgic/vgic-mmio.h b/arch/arm64/kvm/vgic/vgic-mmio.h
index 5b490a4dfa5e..50dc80220b0f 100644
--- a/arch/arm64/kvm/vgic/vgic-mmio.h
+++ b/arch/arm64/kvm/vgic/vgic-mmio.h
@@ -213,6 +213,7 @@ void vgic_write_irq_line_level_info(struct kvm_vcpu *vcpu, u32 intid,
const u32 val);
unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
+unsigned int vgic_v2_init_cpuif_iodev(struct vgic_io_device *dev);
unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c
index 381673f03c39..585491fbda80 100644
--- a/arch/arm64/kvm/vgic/vgic-v2.c
+++ b/arch/arm64/kvm/vgic/vgic-v2.c
@@ -9,6 +9,7 @@
#include <kvm/arm_vgic.h>
#include <asm/kvm_mmu.h>
+#include "vgic-mmio.h"
#include "vgic.h"
static inline void vgic_v2_write_lr(int lr, u32 val)
@@ -26,11 +27,24 @@ void vgic_v2_init_lrs(void)
vgic_v2_write_lr(i, 0);
}
-void vgic_v2_set_underflow(struct kvm_vcpu *vcpu)
+void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu,
+ struct ap_list_summary *als)
{
struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2;
- cpuif->vgic_hcr |= GICH_HCR_UIE;
+ cpuif->vgic_hcr = GICH_HCR_EN;
+
+ if (irqs_pending_outside_lrs(als))
+ cpuif->vgic_hcr |= GICH_HCR_NPIE;
+ if (irqs_active_outside_lrs(als))
+ cpuif->vgic_hcr |= GICH_HCR_LRENPIE;
+ if (irqs_outside_lrs(als))
+ cpuif->vgic_hcr |= GICH_HCR_UIE;
+
+ cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP0_MASK) ?
+ GICH_HCR_VGrp0DIE : GICH_HCR_VGrp0EIE;
+ cpuif->vgic_hcr |= (cpuif->vgic_vmcr & GICH_VMCR_ENABLE_GRP1_MASK) ?
+ GICH_HCR_VGrp1DIE : GICH_HCR_VGrp1EIE;
}
static bool lr_signals_eoi_mi(u32 lr_val)
@@ -39,43 +53,23 @@ static bool lr_signals_eoi_mi(u32 lr_val)
!(lr_val & GICH_LR_HW);
}
-/*
- * transfer the content of the LRs back into the corresponding ap_list:
- * - active bit is transferred as is
- * - pending bit is
- * - transferred as is in case of edge sensitive IRQs
- * - set to the line-level (resample time) for level sensitive IRQs
- */
-void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
+static void vgic_v2_fold_lr(struct kvm_vcpu *vcpu, u32 val)
{
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
- int lr;
-
- DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
-
- cpuif->vgic_hcr &= ~GICH_HCR_UIE;
-
- for (lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++) {
- u32 val = cpuif->vgic_lr[lr];
- u32 cpuid, intid = val & GICH_LR_VIRTUALID;
- struct vgic_irq *irq;
- bool deactivated;
-
- /* Extract the source vCPU id from the LR */
- cpuid = val & GICH_LR_PHYSID_CPUID;
- cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
- cpuid &= 7;
+ u32 cpuid, intid = val & GICH_LR_VIRTUALID;
+ struct vgic_irq *irq;
+ bool deactivated;
- /* Notify fds when the guest EOI'ed a level-triggered SPI */
- if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
- kvm_notify_acked_irq(vcpu->kvm, 0,
- intid - VGIC_NR_PRIVATE_IRQS);
+ /* Extract the source vCPU id from the LR */
+ cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val) & 7;
- irq = vgic_get_vcpu_irq(vcpu, intid);
+ /* Notify fds when the guest EOI'ed a level-triggered SPI */
+ if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
+ kvm_notify_acked_irq(vcpu->kvm, 0,
+ intid - VGIC_NR_PRIVATE_IRQS);
- raw_spin_lock(&irq->irq_lock);
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
/* Always preserve the active bit, note deactivation */
deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT);
irq->active = !!(val & GICH_LR_ACTIVE_BIT);
@@ -101,29 +95,139 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
/* Handle resampling for mapped interrupts if required */
vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT);
- raw_spin_unlock(&irq->irq_lock);
- vgic_put_irq(vcpu->kvm, irq);
+ irq->on_lr = false;
}
- cpuif->used_lrs = 0;
+ vgic_put_irq(vcpu->kvm, irq);
}
+static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq);
+
/*
- * Populates the particular LR with the state of a given IRQ:
- * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
- * - for a level sensitive IRQ the pending state value is unchanged;
- * it is dictated directly by the input level
- *
- * If @irq describes an SGI with multiple sources, we choose the
- * lowest-numbered source VCPU and clear that bit in the source bitmap.
- *
- * The irq_lock must be held by the caller.
+ * transfer the content of the LRs back into the corresponding ap_list:
+ * - active bit is transferred as is
+ * - pending bit is
+ * - transferred as is in case of edge sensitive IRQs
+ * - set to the line-level (resample time) for level sensitive IRQs
*/
-void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
+ u32 eoicount = FIELD_GET(GICH_HCR_EOICOUNT, cpuif->vgic_hcr);
+ struct vgic_irq *irq;
+
+ DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
+
+ for (int lr = 0; lr < vgic_cpu->vgic_v2.used_lrs; lr++)
+ vgic_v2_fold_lr(vcpu, cpuif->vgic_lr[lr]);
+
+ /* See the GICv3 equivalent for the EOIcount handling rationale */
+ list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+ u32 lr;
+
+ if (!eoicount) {
+ break;
+ } else {
+ guard(raw_spinlock)(&irq->irq_lock);
+
+ if (!(likely(vgic_target_oracle(irq) == vcpu) &&
+ irq->active))
+ continue;
+
+ lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT;
+ }
+
+ if (lr & GICH_LR_HW)
+ writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr),
+ kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE);
+ vgic_v2_fold_lr(vcpu, lr);
+ eoicount--;
+ }
+
+ cpuif->used_lrs = 0;
+}
+
+void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2;
+ struct kvm_vcpu *target_vcpu = NULL;
+ bool mmio = false;
+ struct vgic_irq *irq;
+ unsigned long flags;
+ u64 lr = 0;
+ u8 cpuid;
+
+ /* Snapshot CPUID, and remove it from the INTID */
+ cpuid = FIELD_GET(GENMASK_ULL(12, 10), val);
+ val &= ~GENMASK_ULL(12, 10);
+
+ /* We only deal with DIR when EOIMode==1 */
+ if (!(cpuif->vgic_vmcr & GICH_VMCR_EOI_MODE_MASK))
+ return;
+
+ /* Make sure we're in the same context as LR handling */
+ local_irq_save(flags);
+
+ irq = vgic_get_vcpu_irq(vcpu, val);
+ if (WARN_ON_ONCE(!irq))
+ goto out;
+
+ /* See the corresponding v3 code for the rationale */
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
+ target_vcpu = irq->vcpu;
+
+ /* Not on any ap_list? */
+ if (!target_vcpu)
+ goto put;
+
+ /*
+ * Urgh. We're deactivating something that we cannot
+ * observe yet... Big hammer time.
+ */
+ if (irq->on_lr) {
+ mmio = true;
+ goto put;
+ }
+
+ /* SGI: check that the cpuid matches */
+ if (val < VGIC_NR_SGIS && irq->active_source != cpuid) {
+ target_vcpu = NULL;
+ goto put;
+ }
+
+ /* (with a Dalek voice) DEACTIVATE!!!! */
+ lr = vgic_v2_compute_lr(vcpu, irq) & ~GICH_LR_ACTIVE_BIT;
+ }
+
+ if (lr & GICH_LR_HW)
+ writel_relaxed(FIELD_GET(GICH_LR_PHYSID_CPUID, lr),
+ kvm_vgic_global_state.gicc_base + GIC_CPU_DEACTIVATE);
+
+ vgic_v2_fold_lr(vcpu, lr);
+
+put:
+ vgic_put_irq(vcpu->kvm, irq);
+
+out:
+ local_irq_restore(flags);
+
+ if (mmio)
+ vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32));
+
+ /* Force the ap_list to be pruned */
+ if (target_vcpu)
+ kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu);
+}
+
+static u32 vgic_v2_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
{
u32 val = irq->intid;
bool allow_pending = true;
+ WARN_ON(irq->on_lr);
+
if (irq->active) {
val |= GICH_LR_ACTIVE_BIT;
if (vgic_irq_is_sgi(irq->intid))
@@ -163,22 +267,52 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
if (allow_pending && irq_is_pending(irq)) {
val |= GICH_LR_PENDING_BIT;
- if (irq->config == VGIC_CONFIG_EDGE)
- irq->pending_latch = false;
-
if (vgic_irq_is_sgi(irq->intid)) {
u32 src = ffs(irq->source);
if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
irq->intid))
- return;
+ return 0;
val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
- irq->source &= ~(1 << (src - 1));
- if (irq->source) {
- irq->pending_latch = true;
+ if (irq->source & ~BIT(src - 1))
val |= GICH_LR_EOI;
- }
+ }
+ }
+
+ /* The GICv2 LR only holds five bits of priority. */
+ val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
+
+ return val;
+}
+
+/*
+ * Populates the particular LR with the state of a given IRQ:
+ * - for an edge sensitive IRQ the pending state is cleared in struct vgic_irq
+ * - for a level sensitive IRQ the pending state value is unchanged;
+ * it is dictated directly by the input level
+ *
+ * If @irq describes an SGI with multiple sources, we choose the
+ * lowest-numbered source VCPU and clear that bit in the source bitmap.
+ *
+ * The irq_lock must be held by the caller.
+ */
+void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+ u32 val = vgic_v2_compute_lr(vcpu, irq);
+
+ vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
+
+ if (val & GICH_LR_PENDING_BIT) {
+ if (irq->config == VGIC_CONFIG_EDGE)
+ irq->pending_latch = false;
+
+ if (vgic_irq_is_sgi(irq->intid)) {
+ u32 src = ffs(irq->source);
+
+ irq->source &= ~BIT(src - 1);
+ if (irq->source)
+ irq->pending_latch = true;
}
}
@@ -194,7 +328,7 @@ void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
/* The GICv2 LR only holds five bits of priority. */
val |= (irq->priority >> 3) << GICH_LR_PRIORITY_SHIFT;
- vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = val;
+ irq->on_lr = true;
}
void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr)
@@ -257,7 +391,7 @@ void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
GICH_VMCR_PRIMASK_SHIFT) << GICV_PMR_PRIORITY_SHIFT;
}
-void vgic_v2_enable(struct kvm_vcpu *vcpu)
+void vgic_v2_reset(struct kvm_vcpu *vcpu)
{
/*
* By forcing VMCR to zero, the GIC will restore the binary
@@ -265,9 +399,6 @@ void vgic_v2_enable(struct kvm_vcpu *vcpu)
* anyway.
*/
vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
-
- /* Get the show on the road... */
- vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
}
/* check for overlapping regions and for regions crossing the end of memory */
@@ -289,6 +420,7 @@ static bool vgic_v2_check_base(gpa_t dist_base, gpa_t cpu_base)
int vgic_v2_map_resources(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned int len;
int ret = 0;
if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
@@ -312,10 +444,20 @@ int vgic_v2_map_resources(struct kvm *kvm)
return ret;
}
+ len = vgic_v2_init_cpuif_iodev(&dist->cpuif_iodev);
+ dist->cpuif_iodev.base_addr = dist->vgic_cpu_base;
+ dist->cpuif_iodev.iodev_type = IODEV_CPUIF;
+ dist->cpuif_iodev.redist_vcpu = NULL;
+
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, dist->vgic_cpu_base,
+ len, &dist->cpuif_iodev.dev);
+ if (ret)
+ return ret;
+
if (!static_branch_unlikely(&vgic_v2_cpuif_trap)) {
ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
kvm_vgic_global_state.vcpu_base,
- KVM_VGIC_V2_CPU_SIZE, true);
+ KVM_VGIC_V2_CPU_SIZE - SZ_4K, true);
if (ret) {
kvm_err("Unable to remap VGIC CPU to VCPU\n");
return ret;
@@ -385,6 +527,7 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
kvm_vgic_global_state.can_emulate_gicv2 = true;
kvm_vgic_global_state.vcpu_base = info->vcpu.start;
+ kvm_vgic_global_state.gicc_base = info->gicc_base;
kvm_vgic_global_state.type = VGIC_V2;
kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
@@ -423,16 +566,26 @@ static void save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
void vgic_v2_save_state(struct kvm_vcpu *vcpu)
{
+ struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
void __iomem *base = kvm_vgic_global_state.vctrl_base;
u64 used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs;
if (!base)
return;
- if (used_lrs) {
+ cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
+
+ if (used_lrs)
save_lrs(vcpu, base);
- writel_relaxed(0, base + GICH_HCR);
+
+ if (cpu_if->vgic_hcr & GICH_HCR_LRENPIE) {
+ u32 val = readl_relaxed(base + GICH_HCR);
+
+ cpu_if->vgic_hcr &= ~GICH_HCR_EOICOUNT;
+ cpu_if->vgic_hcr |= val & GICH_HCR_EOICOUNT;
}
+
+ writel_relaxed(0, base + GICH_HCR);
}
void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
@@ -445,13 +598,10 @@ void vgic_v2_restore_state(struct kvm_vcpu *vcpu)
if (!base)
return;
- if (used_lrs) {
- writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
- for (i = 0; i < used_lrs; i++) {
- writel_relaxed(cpu_if->vgic_lr[i],
- base + GICH_LR0 + (i * 4));
- }
- }
+ writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
+
+ for (i = 0; i < used_lrs; i++)
+ writel_relaxed(cpu_if->vgic_lr[i], base + GICH_LR0 + (i * 4));
}
void vgic_v2_load(struct kvm_vcpu *vcpu)
@@ -468,6 +618,5 @@ void vgic_v2_put(struct kvm_vcpu *vcpu)
{
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- cpu_if->vgic_vmcr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VMCR);
cpu_if->vgic_apr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_APR);
}
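
vgic_v2_compute_lr() above packs an interrupt into a GICH_LR value: INTID in bits [9:0], the SGI source CPU in [12:10], the EOI maintenance bit when further sources remain, and the priority squeezed into five bits at [27:23]. The standalone sketch below models that packing for a pending GICv2 SGI, assuming the GICH_LR layout just described; the LR_* macros and model_v2_lr() are invented names, not kernel ones.

#include <stdint.h>
#include <stdio.h>

#define LR_VIRTUALID_MASK	0x3ffu		/* bits [9:0]   */
#define LR_CPUID_SHIFT		10		/* bits [12:10] */
#define LR_EOI			(1u << 19)	/* maintenance interrupt on EOI */
#define LR_PRIORITY_SHIFT	23		/* bits [27:23] */
#define LR_PENDING		(1u << 28)

/* Invented model of packing a pending SGI into a GICv2 list register */
static uint32_t model_v2_lr(uint32_t intid, uint8_t prio, uint8_t source_mask)
{
	uint32_t val = intid & LR_VIRTUALID_MASK;
	int src = __builtin_ffs(source_mask);	/* lowest-numbered source vCPU */

	val |= LR_PENDING;
	if (src) {
		val |= (uint32_t)(src - 1) << LR_CPUID_SHIFT;
		if (source_mask & ~(1u << (src - 1)))
			val |= LR_EOI;		/* more sources pending: exit on EOI */
	}

	/* The GICv2 LR only holds five bits of priority */
	val |= (uint32_t)(prio >> 3) << LR_PRIORITY_SHIFT;

	return val;
}

int main(void)
{
	printf("SGI1 from vCPUs {0,2}, prio 0xa0: %#x\n", model_v2_lr(1, 0xa0, 0x05));
	return 0;
}
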
diff --git a/arch/arm64/kvm/vgic/vgic-v3-nested.c b/arch/arm64/kvm/vgic/vgic-v3-nested.c
index 7f1259b49c50..61b44f3f2bf1 100644
--- a/arch/arm64/kvm/vgic/vgic-v3-nested.c
+++ b/arch/arm64/kvm/vgic/vgic-v3-nested.c
@@ -70,13 +70,14 @@ static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
* - on L2 put: perform the inverse transformation, so that the result of L2
* running becomes visible to L1 in the VNCR-accessible registers.
*
- * - there is nothing to do on L2 entry, as everything will have happened
- * on load. However, this is the point where we detect that an interrupt
- * targeting L1 and prepare the grand switcheroo.
+ * - there is nothing to do on L2 entry apart from enabling the vgic, as
+ * everything will have happened on load. However, this is the point where
+ *   we detect an interrupt targeting L1 and prepare the grand
+ * switcheroo.
*
- * - on L2 exit: emulate the HW bit, and deactivate corresponding the L1
- * interrupt. The L0 active state will be cleared by the HW if the L1
- * interrupt was itself backed by a HW interrupt.
+ * - on L2 exit: resync the LRs and VMCR, emulate the HW bit, and deactivate
+ *   the corresponding L1 interrupt. The L0 active state will be cleared by
+ * the HW if the L1 interrupt was itself backed by a HW interrupt.
*
* Maintenance Interrupt (MI) management:
*
@@ -93,8 +94,10 @@ static int lr_map_idx_to_shadow_idx(struct shadow_if *shadow_if, int idx)
*
* - because most of the ICH_*_EL2 registers live in the VNCR page, the
* quality of emulation is poor: L1 can setup the vgic so that an MI would
- * immediately fire, and not observe anything until the next exit. Trying
- * to read ICH_MISR_EL2 would do the trick, for example.
+ * immediately fire, and not observe anything until the next exit.
+ * Similarly, a pending MI is not immediately disabled by clearing
+ * ICH_HCR_EL2.En. Trying to read ICH_MISR_EL2 would do the trick, for
+ * example.
*
* System register emulation:
*
@@ -265,16 +268,37 @@ static void vgic_v3_create_shadow_lr(struct kvm_vcpu *vcpu,
s_cpu_if->used_lrs = hweight16(shadow_if->lr_map);
}
+void vgic_v3_flush_nested(struct kvm_vcpu *vcpu)
+{
+ u64 val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
+
+ write_sysreg_s(val | vgic_ich_hcr_trap_bits(), SYS_ICH_HCR_EL2);
+}
+
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
{
struct shadow_if *shadow_if = get_shadow_if();
int i;
for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
- u64 lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
- struct vgic_irq *irq;
+ u64 val, host_lr, lr;
+
+ host_lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));
+
+ /* Propagate the new LR state */
+ lr = __vcpu_sys_reg(vcpu, ICH_LRN(i));
+ val = lr & ~ICH_LR_STATE;
+ val |= host_lr & ICH_LR_STATE;
+ __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);
- if (!(lr & ICH_LR_HW) || !(lr & ICH_LR_STATE))
+ /*
+ * Deactivation of a HW interrupt: the LR must have the HW
+ * bit set, have been in a non-invalid state before the run,
+ * and now be in an invalid state. If any of that doesn't
+ * hold, we're done with this LR.
+ */
+ if (!((lr & ICH_LR_HW) && (lr & ICH_LR_STATE) &&
+ !(host_lr & ICH_LR_STATE)))
continue;
/*
@@ -282,35 +306,27 @@ void vgic_v3_sync_nested(struct kvm_vcpu *vcpu)
* need to emulate the HW effect between the guest hypervisor
* and the nested guest.
*/
- irq = vgic_get_vcpu_irq(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
- if (WARN_ON(!irq)) /* Shouldn't happen as we check on load */
- continue;
+ vgic_v3_deactivate(vcpu, FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+ }
- lr = __gic_v3_get_lr(lr_map_idx_to_shadow_idx(shadow_if, i));
- if (!(lr & ICH_LR_STATE))
- irq->active = false;
+ /* We need these to be synchronised to generate the MI */
+ __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, read_sysreg_s(SYS_ICH_VMCR_EL2));
+ __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, &=, ~ICH_HCR_EL2_EOIcount);
+ __vcpu_rmw_sys_reg(vcpu, ICH_HCR_EL2, |=, read_sysreg_s(SYS_ICH_HCR_EL2) & ICH_HCR_EL2_EOIcount);
- vgic_put_irq(vcpu->kvm, irq);
- }
+ write_sysreg_s(0, SYS_ICH_HCR_EL2);
+ isb();
+
+ vgic_v3_nested_update_mi(vcpu);
}
static void vgic_v3_create_shadow_state(struct kvm_vcpu *vcpu,
struct vgic_v3_cpu_if *s_cpu_if)
{
struct vgic_v3_cpu_if *host_if = &vcpu->arch.vgic_cpu.vgic_v3;
- u64 val = 0;
int i;
- /*
- * If we're on a system with a broken vgic that requires
- * trapping, propagate the trapping requirements.
- *
- * Ah, the smell of rotten fruits...
- */
- if (static_branch_unlikely(&vgic_v3_cpuif_trap))
- val = host_if->vgic_hcr & (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
- ICH_HCR_EL2_TC | ICH_HCR_EL2_TDIR);
- s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2) | val;
+ s_cpu_if->vgic_hcr = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
s_cpu_if->vgic_vmcr = __vcpu_sys_reg(vcpu, ICH_VMCR_EL2);
s_cpu_if->vgic_sre = host_if->vgic_sre;
@@ -334,7 +350,8 @@ void vgic_v3_load_nested(struct kvm_vcpu *vcpu)
__vgic_v3_restore_vmcr_aprs(cpu_if);
__vgic_v3_activate_traps(cpu_if);
- __vgic_v3_restore_state(cpu_if);
+ for (int i = 0; i < cpu_if->used_lrs; i++)
+ __gic_v3_set_lr(cpu_if->vgic_lr[i], i);
/*
* Propagate the number of used LRs for the benefit of the HYP
@@ -347,36 +364,19 @@ void vgic_v3_put_nested(struct kvm_vcpu *vcpu)
{
struct shadow_if *shadow_if = get_shadow_if();
struct vgic_v3_cpu_if *s_cpu_if = &shadow_if->cpuif;
- u64 val;
int i;
- __vgic_v3_save_vmcr_aprs(s_cpu_if);
- __vgic_v3_deactivate_traps(s_cpu_if);
- __vgic_v3_save_state(s_cpu_if);
-
- /*
- * Translate the shadow state HW fields back to the virtual ones
- * before copying the shadow struct back to the nested one.
- */
- val = __vcpu_sys_reg(vcpu, ICH_HCR_EL2);
- val &= ~ICH_HCR_EL2_EOIcount_MASK;
- val |= (s_cpu_if->vgic_hcr & ICH_HCR_EL2_EOIcount_MASK);
- __vcpu_assign_sys_reg(vcpu, ICH_HCR_EL2, val);
- __vcpu_assign_sys_reg(vcpu, ICH_VMCR_EL2, s_cpu_if->vgic_vmcr);
+ __vgic_v3_save_aprs(s_cpu_if);
for (i = 0; i < 4; i++) {
__vcpu_assign_sys_reg(vcpu, ICH_AP0RN(i), s_cpu_if->vgic_ap0r[i]);
__vcpu_assign_sys_reg(vcpu, ICH_AP1RN(i), s_cpu_if->vgic_ap1r[i]);
}
- for_each_set_bit(i, &shadow_if->lr_map, kvm_vgic_global_state.nr_lr) {
- val = __vcpu_sys_reg(vcpu, ICH_LRN(i));
-
- val &= ~ICH_LR_STATE;
- val |= s_cpu_if->vgic_lr[lr_map_idx_to_shadow_idx(shadow_if, i)] & ICH_LR_STATE;
+ for (i = 0; i < s_cpu_if->used_lrs; i++)
+ __gic_v3_set_lr(0, i);
- __vcpu_assign_sys_reg(vcpu, ICH_LRN(i), val);
- }
+ __vgic_v3_deactivate_traps(s_cpu_if);
vcpu->arch.vgic_cpu.vgic_v3.used_lrs = 0;
}
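
The vgic_v3_sync_nested() hunk only forwards a deactivation to L1 when the L1-provided LR had the HW bit and a non-invalid state before the run, and the corresponding hardware LR is now invalid. A small standalone model of that predicate, assuming the architectural ICH_LR_EL2 layout (State in bits [63:62], HW in bit 61); the function name is invented.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define LR_STATE	(3ULL << 62)	/* ICH_LR_EL2.State */
#define LR_HW		(1ULL << 61)	/* ICH_LR_EL2.HW    */

/* Invented model of the "HW interrupt was deactivated by L2" test */
static bool hw_irq_was_deactivated(uint64_t l1_lr_before, uint64_t hw_lr_after)
{
	return (l1_lr_before & LR_HW) &&
	       (l1_lr_before & LR_STATE) &&
	       !(hw_lr_after & LR_STATE);
}

int main(void)
{
	uint64_t before = LR_HW | (1ULL << 62);	/* HW-backed, pending */

	printf("%d\n", hw_irq_was_deactivated(before, 0));		/* 1: propagate */
	printf("%d\n", hw_irq_was_deactivated(before, 2ULL << 62));	/* 0: still active */
	return 0;
}
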
diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c
index 6fbb4b099855..1d6dd1b545bd 100644
--- a/arch/arm64/kvm/vgic/vgic-v3.c
+++ b/arch/arm64/kvm/vgic/vgic-v3.c
@@ -12,6 +12,7 @@
#include <asm/kvm_mmu.h>
#include <asm/kvm_asm.h>
+#include "vgic-mmio.h"
#include "vgic.h"
static bool group0_trap;
@@ -20,11 +21,48 @@ static bool common_trap;
static bool dir_trap;
static bool gicv4_enable;
-void vgic_v3_set_underflow(struct kvm_vcpu *vcpu)
+void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu,
+ struct ap_list_summary *als)
{
struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3;
- cpuif->vgic_hcr |= ICH_HCR_EL2_UIE;
+ if (!irqchip_in_kernel(vcpu->kvm))
+ return;
+
+ cpuif->vgic_hcr = ICH_HCR_EL2_En;
+
+ if (irqs_pending_outside_lrs(als))
+ cpuif->vgic_hcr |= ICH_HCR_EL2_NPIE;
+ if (irqs_active_outside_lrs(als))
+ cpuif->vgic_hcr |= ICH_HCR_EL2_LRENPIE;
+ if (irqs_outside_lrs(als))
+ cpuif->vgic_hcr |= ICH_HCR_EL2_UIE;
+
+ if (!als->nr_sgi)
+ cpuif->vgic_hcr |= ICH_HCR_EL2_vSGIEOICount;
+
+ cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG0_MASK) ?
+ ICH_HCR_EL2_VGrp0DIE : ICH_HCR_EL2_VGrp0EIE;
+ cpuif->vgic_hcr |= (cpuif->vgic_vmcr & ICH_VMCR_ENG1_MASK) ?
+ ICH_HCR_EL2_VGrp1DIE : ICH_HCR_EL2_VGrp1EIE;
+
+ /*
+ * Dealing with EOImode=1 is a massive source of headache. Not
+ * only do we need to track that we have active interrupts
+ * outside of the LRs and force DIR to be trapped, we also
+ * need to deal with SPIs that can be deactivated on another
+ * CPU.
+ *
+ * On systems that do not implement TDIR, force the bit in the
+ * shadow state anyway to avoid IPI-ing on these poor sods.
+ *
+ * Note that we set the trap irrespective of EOIMode, as that
+ * can change behind our back without any warning...
+ */
+ if (!cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) ||
+ irqs_active_outside_lrs(als) ||
+ atomic_read(&vcpu->kvm->arch.vgic.active_spis))
+ cpuif->vgic_hcr |= ICH_HCR_EL2_TDIR;
}
static bool lr_signals_eoi_mi(u64 lr_val)
@@ -33,84 +71,238 @@ static bool lr_signals_eoi_mi(u64 lr_val)
!(lr_val & ICH_LR_HW);
}
+static void vgic_v3_fold_lr(struct kvm_vcpu *vcpu, u64 val)
+{
+ struct vgic_irq *irq;
+ bool is_v2_sgi = false;
+ bool deactivated;
+ u32 intid;
+
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
+ intid = val & ICH_LR_VIRTUAL_ID_MASK;
+ } else {
+ intid = val & GICH_LR_VIRTUALID;
+ is_v2_sgi = vgic_irq_is_sgi(intid);
+ }
+
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+ if (!irq) /* An LPI could have been unmapped. */
+ return;
+
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
+ /* Always preserve the active bit for !LPIs, note deactivation */
+ if (irq->intid >= VGIC_MIN_LPI)
+ val &= ~ICH_LR_ACTIVE_BIT;
+ deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
+ irq->active = !!(val & ICH_LR_ACTIVE_BIT);
+
+ /* Edge is the only case where we preserve the pending bit */
+ if (irq->config == VGIC_CONFIG_EDGE &&
+ (val & ICH_LR_PENDING_BIT))
+ irq->pending_latch = true;
+
+ /*
+ * Clear soft pending state when level irqs have been acked.
+ */
+ if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
+ irq->pending_latch = false;
+
+ if (is_v2_sgi) {
+ u8 cpuid = FIELD_GET(GICH_LR_PHYSID_CPUID, val);
+
+ if (irq->active)
+ irq->active_source = cpuid;
+
+ if (val & ICH_LR_PENDING_BIT)
+ irq->source |= BIT(cpuid);
+ }
+
+ /* Handle resampling for mapped interrupts if required */
+ vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
+
+ irq->on_lr = false;
+ }
+
+ /* Notify fds when the guest EOI'ed a level-triggered SPI, and drop the refcount */
+ if (deactivated && lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) {
+ kvm_notify_acked_irq(vcpu->kvm, 0,
+ intid - VGIC_NR_PRIVATE_IRQS);
+ atomic_dec_if_positive(&vcpu->kvm->arch.vgic.active_spis);
+ }
+
+ vgic_put_irq(vcpu->kvm, irq);
+}
+
+static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq);
+
+static void vgic_v3_deactivate_phys(u32 intid)
+{
+ if (cpus_have_final_cap(ARM64_HAS_GICV5_LEGACY))
+ gic_insn(intid | FIELD_PREP(GICV5_GIC_CDDI_TYPE_MASK, 1), CDDI);
+ else
+ gic_write_dir(intid);
+}
+
void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
- u32 model = vcpu->kvm->arch.vgic.vgic_model;
- int lr;
+ u32 eoicount = FIELD_GET(ICH_HCR_EL2_EOIcount, cpuif->vgic_hcr);
+ struct vgic_irq *irq;
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
- cpuif->vgic_hcr &= ~ICH_HCR_EL2_UIE;
-
- for (lr = 0; lr < cpuif->used_lrs; lr++) {
- u64 val = cpuif->vgic_lr[lr];
- u32 intid, cpuid;
- struct vgic_irq *irq;
- bool is_v2_sgi = false;
- bool deactivated;
+ for (int lr = 0; lr < cpuif->used_lrs; lr++)
+ vgic_v3_fold_lr(vcpu, cpuif->vgic_lr[lr]);
- cpuid = val & GICH_LR_PHYSID_CPUID;
- cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT;
+ /*
+ * EOIMode=0: use EOIcount to emulate deactivation. We are
+ * guaranteed to deactivate in reverse order of the activation, so
+ * just pick one active interrupt after the other in the ap_list,
+ * and replay the deactivation as if the CPU was doing it. We also
+ * rely on priority drop to have taken place, and the list to be
+ * sorted by priority.
+ */
+ list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
+ u64 lr;
- if (model == KVM_DEV_TYPE_ARM_VGIC_V3) {
- intid = val & ICH_LR_VIRTUAL_ID_MASK;
+ /*
+ * I would have loved to write this using a scoped_guard(),
+ * but using 'continue' here is a total train wreck.
+ */
+ if (!eoicount) {
+ break;
} else {
- intid = val & GICH_LR_VIRTUALID;
- is_v2_sgi = vgic_irq_is_sgi(intid);
+ guard(raw_spinlock)(&irq->irq_lock);
+
+ if (!(likely(vgic_target_oracle(irq) == vcpu) &&
+ irq->active))
+ continue;
+
+ lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT;
}
- /* Notify fds when the guest EOI'ed a level-triggered IRQ */
- if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid))
- kvm_notify_acked_irq(vcpu->kvm, 0,
- intid - VGIC_NR_PRIVATE_IRQS);
+ if (lr & ICH_LR_HW)
+ vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
- irq = vgic_get_vcpu_irq(vcpu, intid);
- if (!irq) /* An LPI could have been unmapped. */
- continue;
+ vgic_v3_fold_lr(vcpu, lr);
+ eoicount--;
+ }
- raw_spin_lock(&irq->irq_lock);
+ cpuif->used_lrs = 0;
+}
- /* Always preserve the active bit, note deactivation */
- deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT);
- irq->active = !!(val & ICH_LR_ACTIVE_BIT);
+void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3;
+ u32 model = vcpu->kvm->arch.vgic.vgic_model;
+ struct kvm_vcpu *target_vcpu = NULL;
+ bool mmio = false, is_v2_sgi;
+ struct vgic_irq *irq;
+ unsigned long flags;
+ u64 lr = 0;
+ u8 cpuid;
- if (irq->active && is_v2_sgi)
- irq->active_source = cpuid;
+ /* Snapshot CPUID, and remove it from the INTID */
+ cpuid = FIELD_GET(GENMASK_ULL(12, 10), val);
+ val &= ~GENMASK_ULL(12, 10);
- /* Edge is the only case where we preserve the pending bit */
- if (irq->config == VGIC_CONFIG_EDGE &&
- (val & ICH_LR_PENDING_BIT)) {
- irq->pending_latch = true;
+ is_v2_sgi = (model == KVM_DEV_TYPE_ARM_VGIC_V2 &&
+ val < VGIC_NR_SGIS);
- if (is_v2_sgi)
- irq->source |= (1 << cpuid);
- }
+ /*
+ * We only deal with DIR when EOIMode==1, and only for SGI,
+ * PPI or SPI.
+ */
+ if (!(cpuif->vgic_vmcr & ICH_VMCR_EOIM_MASK) ||
+ val >= vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS)
+ return;
+
+ /* Make sure we're in the same context as LR handling */
+ local_irq_save(flags);
+
+ irq = vgic_get_vcpu_irq(vcpu, val);
+ if (WARN_ON_ONCE(!irq))
+ goto out;
+
+ /*
+ * EOIMode=1: we must rely on traps to handle deactivation of

+ * overflowing interrupts, as there is no ordering guarantee and
+ * EOIcount isn't being incremented. Priority drop will have taken
+ * place, as ICV_EOIxR_EL1 only affects the APRs and not the LRs.
+ *
+ * Three possibilities:
+ *
+ * - The irq is not queued on any CPU, and there is nothing to
+ * do,
+ *
+ * - Or the irq is in an LR, meaning that its state is not
+ *   directly observable. Treat it bluntly by handling it as if
+ *   it were a write to GICD_ICACTIVER, which will force an
+ * exit on all vcpus. If it hurts, don't do that.
+ *
+ * - Or the irq is active, but not in an LR, and we can
+ *   directly deactivate it by building a pseudo-LR, folding it,
+ *   and queuing a request to prune the resulting ap_list.
+ *
+ * Special care must be taken to match the source CPUID when
+ * deactivating a GICv2 SGI.
+ */
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
+ target_vcpu = irq->vcpu;
+
+ /* Not on any ap_list? */
+ if (!target_vcpu)
+ goto put;
/*
- * Clear soft pending state when level irqs have been acked.
+ * Urgh. We're deactivating something that we cannot
+ * observe yet... Big hammer time.
*/
- if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE))
- irq->pending_latch = false;
+ if (irq->on_lr) {
+ mmio = true;
+ goto put;
+ }
- /* Handle resampling for mapped interrupts if required */
- vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT);
+ /* GICv2 SGI: check that the cpuid matches */
+ if (is_v2_sgi && irq->active_source != cpuid) {
+ target_vcpu = NULL;
+ goto put;
+ }
- raw_spin_unlock(&irq->irq_lock);
- vgic_put_irq(vcpu->kvm, irq);
+ /* (with a Dalek voice) DEACTIVATE!!!! */
+ lr = vgic_v3_compute_lr(vcpu, irq) & ~ICH_LR_ACTIVE_BIT;
}
- cpuif->used_lrs = 0;
+ if (lr & ICH_LR_HW)
+ vgic_v3_deactivate_phys(FIELD_GET(ICH_LR_PHYS_ID_MASK, lr));
+
+ vgic_v3_fold_lr(vcpu, lr);
+
+put:
+ vgic_put_irq(vcpu->kvm, irq);
+
+out:
+ local_irq_restore(flags);
+
+ if (mmio)
+ vgic_mmio_write_cactive(vcpu, (val / 32) * 4, 4, BIT(val % 32));
+
+ /* Force the ap_list to be pruned */
+ if (target_vcpu)
+ kvm_make_request(KVM_REQ_VGIC_PROCESS_UPDATE, target_vcpu);
}
/* Requires the irq to be locked already */
-void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+static u64 vgic_v3_compute_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq)
{
u32 model = vcpu->kvm->arch.vgic.vgic_model;
u64 val = irq->intid;
bool allow_pending = true, is_v2_sgi;
+ WARN_ON(irq->on_lr);
+
is_v2_sgi = (vgic_irq_is_sgi(irq->intid) &&
model == KVM_DEV_TYPE_ARM_VGIC_V2);
@@ -150,6 +342,35 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
if (allow_pending && irq_is_pending(irq)) {
val |= ICH_LR_PENDING_BIT;
+ if (is_v2_sgi) {
+ u32 src = ffs(irq->source);
+
+ if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
+ irq->intid))
+ return 0;
+
+ val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
+ if (irq->source & ~BIT(src - 1))
+ val |= ICH_LR_EOI;
+ }
+ }
+
+ if (irq->group)
+ val |= ICH_LR_GROUP;
+
+ val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
+
+ return val;
+}
+
+void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
+{
+ u32 model = vcpu->kvm->arch.vgic.vgic_model;
+ u64 val = vgic_v3_compute_lr(vcpu, irq);
+
+ vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
+
+ if (val & ICH_LR_PENDING_BIT) {
if (irq->config == VGIC_CONFIG_EDGE)
irq->pending_latch = false;
@@ -157,16 +378,9 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
model == KVM_DEV_TYPE_ARM_VGIC_V2) {
u32 src = ffs(irq->source);
- if (WARN_RATELIMIT(!src, "No SGI source for INTID %d\n",
- irq->intid))
- return;
-
- val |= (src - 1) << GICH_LR_PHYSID_CPUID_SHIFT;
- irq->source &= ~(1 << (src - 1));
- if (irq->source) {
+ irq->source &= ~BIT(src - 1);
+ if (irq->source)
irq->pending_latch = true;
- val |= ICH_LR_EOI;
- }
}
}
@@ -179,12 +393,7 @@ void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr)
if (vgic_irq_is_mapped_level(irq) && (val & ICH_LR_PENDING_BIT))
irq->line_level = false;
- if (irq->group)
- val |= ICH_LR_GROUP;
-
- val |= (u64)irq->priority << ICH_LR_PRIORITY_SHIFT;
-
- vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = val;
+ irq->on_lr = true;
}
void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr)
@@ -258,7 +467,7 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \
GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
-void vgic_v3_enable(struct kvm_vcpu *vcpu)
+void vgic_v3_reset(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
@@ -288,9 +497,6 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
kvm_vgic_global_state.ich_vtr_el2);
vcpu->arch.vgic_cpu.num_pri_bits = FIELD_GET(ICH_VTR_EL2_PRIbits,
kvm_vgic_global_state.ich_vtr_el2) + 1;
-
- /* Get the show on the road... */
- vgic_v3->vgic_hcr = ICH_HCR_EL2_En;
}
void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
@@ -301,20 +507,10 @@ void vcpu_set_ich_hcr(struct kvm_vcpu *vcpu)
return;
/* Hide GICv3 sysreg if necessary */
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2) {
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2 ||
+ !irqchip_in_kernel(vcpu->kvm))
vgic_v3->vgic_hcr |= (ICH_HCR_EL2_TALL0 | ICH_HCR_EL2_TALL1 |
ICH_HCR_EL2_TC);
- return;
- }
-
- if (group0_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL0;
- if (group1_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_EL2_TALL1;
- if (common_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_EL2_TC;
- if (dir_trap)
- vgic_v3->vgic_hcr |= ICH_HCR_EL2_TDIR;
}
int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq)
@@ -635,8 +831,53 @@ static const struct midr_range broken_seis[] = {
static bool vgic_v3_broken_seis(void)
{
- return ((kvm_vgic_global_state.ich_vtr_el2 & ICH_VTR_EL2_SEIS) &&
- is_midr_in_range_list(broken_seis));
+ return (is_kernel_in_hyp_mode() &&
+ is_midr_in_range_list(broken_seis) &&
+ (read_sysreg_s(SYS_ICH_VTR_EL2) & ICH_VTR_EL2_SEIS));
+}
+
+void noinstr kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr,
+ int nr_inst)
+{
+ u32 insn, oinsn, rd;
+ u64 hcr = 0;
+
+ if (cpus_have_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
+ group0_trap = true;
+ group1_trap = true;
+ }
+
+ if (vgic_v3_broken_seis()) {
+ /* We know that these machines have ICH_HCR_EL2.TDIR */
+ group0_trap = true;
+ group1_trap = true;
+ dir_trap = true;
+ }
+
+ if (!cpus_have_cap(ARM64_HAS_ICH_HCR_EL2_TDIR))
+ common_trap = true;
+
+ if (group0_trap)
+ hcr |= ICH_HCR_EL2_TALL0;
+ if (group1_trap)
+ hcr |= ICH_HCR_EL2_TALL1;
+ if (common_trap)
+ hcr |= ICH_HCR_EL2_TC;
+ if (dir_trap)
+ hcr |= ICH_HCR_EL2_TDIR;
+
+ /* Compute target register */
+ oinsn = le32_to_cpu(*origptr);
+ rd = aarch64_insn_decode_register(AARCH64_INSN_REGTYPE_RD, oinsn);
+
+ /* movz rd, #(val & 0xffff) */
+ insn = aarch64_insn_gen_movewide(rd,
+ (u16)hcr,
+ 0,
+ AARCH64_INSN_VARIANT_64BIT,
+ AARCH64_INSN_MOVEWIDE_ZERO);
+ *updptr = cpu_to_le32(insn);
}
/**
@@ -650,6 +891,7 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
{
u64 ich_vtr_el2 = kvm_call_hyp_ret(__vgic_v3_get_gic_config);
bool has_v2;
+ u64 traps;
int ret;
has_v2 = ich_vtr_el2 >> 63;
@@ -708,29 +950,18 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
if (has_v2)
static_branch_enable(&vgic_v3_has_v2_compat);
- if (cpus_have_final_cap(ARM64_WORKAROUND_CAVIUM_30115)) {
- group0_trap = true;
- group1_trap = true;
- }
-
if (vgic_v3_broken_seis()) {
kvm_info("GICv3 with broken locally generated SEI\n");
-
kvm_vgic_global_state.ich_vtr_el2 &= ~ICH_VTR_EL2_SEIS;
- group0_trap = true;
- group1_trap = true;
- if (ich_vtr_el2 & ICH_VTR_EL2_TDS)
- dir_trap = true;
- else
- common_trap = true;
}
- if (group0_trap || group1_trap || common_trap | dir_trap) {
+ traps = vgic_ich_hcr_trap_bits();
+ if (traps) {
kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n",
- group0_trap ? "G0" : "",
- group1_trap ? "G1" : "",
- common_trap ? "C" : "",
- dir_trap ? "D" : "");
+ (traps & ICH_HCR_EL2_TALL0) ? "G0" : "",
+ (traps & ICH_HCR_EL2_TALL1) ? "G1" : "",
+ (traps & ICH_HCR_EL2_TC) ? "C" : "",
+ (traps & ICH_HCR_EL2_TDIR) ? "D" : "");
static_branch_enable(&vgic_v3_cpuif_trap);
}
@@ -770,7 +1001,7 @@ void vgic_v3_put(struct kvm_vcpu *vcpu)
}
if (likely(!is_protected_kvm_enabled()))
- kvm_call_hyp(__vgic_v3_save_vmcr_aprs, cpu_if);
+ kvm_call_hyp(__vgic_v3_save_aprs, cpu_if);
WARN_ON(vgic_v4_put(vcpu));
if (has_vhe())
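
With EOImode=0, priority drop and deactivation are coupled and happen in reverse activation order, so vgic_v3_fold_lr_state() can replay a non-zero EOIcount by walking the priority-sorted ap_list and deactivating that many active interrupts. The standalone model below sketches the replay loop under that assumption (it ignores the targeting checks done by the real code); struct and function names are invented.

#include <stdbool.h>
#include <stdio.h>

struct model_irq {
	int	intid;
	int	priority;	/* lower value == higher priority */
	bool	active;
};

/*
 * Invented replay of EOIcount deactivations over an ap_list that is
 * already sorted with the highest-priority entries first.
 */
static void replay_eoicount(struct model_irq *list, int nr, unsigned int eoicount)
{
	for (int i = 0; i < nr && eoicount; i++) {
		if (!list[i].active)
			continue;
		list[i].active = false;	/* emulate the guest's deactivation */
		eoicount--;
	}
}

int main(void)
{
	struct model_irq ap_list[] = {
		{ 32, 0x20, true }, { 48, 0x40, true }, { 50, 0x80, true },
	};

	replay_eoicount(ap_list, 3, 2);		/* 32 and 48 lose their active state */
	for (int i = 0; i < 3; i++)
		printf("intid %d active=%d\n", ap_list[i].intid, ap_list[i].active);
	return 0;
}
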
diff --git a/arch/arm64/kvm/vgic/vgic-v4.c b/arch/arm64/kvm/vgic/vgic-v4.c
index 548aec9d5a72..09c3e9eb23f8 100644
--- a/arch/arm64/kvm/vgic/vgic-v4.c
+++ b/arch/arm64/kvm/vgic/vgic-v4.c
@@ -163,6 +163,7 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
struct vgic_irq *irq = vgic_get_vcpu_irq(vcpu, i);
struct irq_desc *desc;
unsigned long flags;
+ bool pending;
int ret;
raw_spin_lock_irqsave(&irq->irq_lock, flags);
@@ -173,9 +174,11 @@ static void vgic_v4_disable_vsgis(struct kvm_vcpu *vcpu)
irq->hw = false;
ret = irq_get_irqchip_state(irq->host_irq,
IRQCHIP_STATE_PENDING,
- &irq->pending_latch);
+ &pending);
WARN_ON(ret);
+ irq->pending_latch = pending;
+
desc = irq_to_desc(irq->host_irq);
irq_domain_deactivate_irq(irq_desc_get_irq_data(desc));
unlock:
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index 6dd5a10081e2..430aa98888fd 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -28,7 +28,7 @@ struct vgic_global kvm_vgic_global_state __ro_after_init = {
* kvm->arch.config_lock (mutex)
* its->cmd_lock (mutex)
* its->its_lock (mutex)
- * vgic_dist->lpi_xa.xa_lock
+ * vgic_dist->lpi_xa.xa_lock must be taken with IRQs disabled
* vgic_cpu->ap_list_lock must be taken with IRQs disabled
* vgic_irq->irq_lock must be taken with IRQs disabled
*
@@ -141,32 +141,39 @@ static __must_check bool vgic_put_irq_norelease(struct kvm *kvm, struct vgic_irq
void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
{
struct vgic_dist *dist = &kvm->arch.vgic;
+ unsigned long flags;
- if (irq->intid >= VGIC_MIN_LPI)
- might_lock(&dist->lpi_xa.xa_lock);
+ /*
+ * Normally the lock is only taken when the refcount drops to 0.
+ * Acquire/release it early on lockdep kernels to make locking issues
+ * in rare release paths a bit more obvious.
+ */
+ if (IS_ENABLED(CONFIG_LOCKDEP) && irq->intid >= VGIC_MIN_LPI) {
+ guard(spinlock_irqsave)(&dist->lpi_xa.xa_lock);
+ }
if (!__vgic_put_irq(kvm, irq))
return;
- xa_lock(&dist->lpi_xa);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
vgic_release_lpi_locked(dist, irq);
- xa_unlock(&dist->lpi_xa);
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
}
static void vgic_release_deleted_lpis(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- unsigned long intid;
+ unsigned long flags, intid;
struct vgic_irq *irq;
- xa_lock(&dist->lpi_xa);
+ xa_lock_irqsave(&dist->lpi_xa, flags);
xa_for_each(&dist->lpi_xa, intid, irq) {
if (irq->pending_release)
vgic_release_lpi_locked(dist, irq);
}
- xa_unlock(&dist->lpi_xa);
+ xa_unlock_irqrestore(&dist->lpi_xa, flags);
}
void vgic_flush_pending_lpis(struct kvm_vcpu *vcpu)
@@ -237,7 +244,7 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active)
*
* Requires the IRQ lock to be held.
*/
-static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
+struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
{
lockdep_assert_held(&irq->irq_lock);
@@ -265,17 +272,20 @@ static struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
return NULL;
}
+struct vgic_sort_info {
+ struct kvm_vcpu *vcpu;
+ struct vgic_vmcr vmcr;
+};
+
/*
* The order of items in the ap_lists defines how we'll pack things in LRs as
* well, the first items in the list being the first things populated in the
* LRs.
*
- * A hard rule is that active interrupts can never be pushed out of the LRs
- * (and therefore take priority) since we cannot reliably trap on deactivation
- * of IRQs and therefore they have to be present in the LRs.
- *
+ * Pending, non-active interrupts must be placed at the head of the list.
* Otherwise things should be sorted by the priority field and the GIC
* hardware support will take care of preemption of priority groups etc.
+ * Interrupts that are not deliverable should be at the end of the list.
*
* Return negative if "a" sorts before "b", 0 to preserve order, and positive
* to sort "b" before "a".
@@ -285,6 +295,8 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a,
{
struct vgic_irq *irqa = container_of(a, struct vgic_irq, ap_list);
struct vgic_irq *irqb = container_of(b, struct vgic_irq, ap_list);
+ struct vgic_sort_info *info = priv;
+ struct kvm_vcpu *vcpu = info->vcpu;
bool penda, pendb;
int ret;
@@ -298,21 +310,32 @@ static int vgic_irq_cmp(void *priv, const struct list_head *a,
raw_spin_lock(&irqa->irq_lock);
raw_spin_lock_nested(&irqb->irq_lock, SINGLE_DEPTH_NESTING);
- if (irqa->active || irqb->active) {
- ret = (int)irqb->active - (int)irqa->active;
+ /* Undeliverable interrupts should be last */
+ ret = (int)(vgic_target_oracle(irqb) == vcpu) - (int)(vgic_target_oracle(irqa) == vcpu);
+ if (ret)
+ goto out;
+
+ /* Same thing for interrupts targeting a disabled group */
+ ret = (int)(irqb->group ? info->vmcr.grpen1 : info->vmcr.grpen0);
+ ret -= (int)(irqa->group ? info->vmcr.grpen1 : info->vmcr.grpen0);
+ if (ret)
goto out;
- }
- penda = irqa->enabled && irq_is_pending(irqa);
- pendb = irqb->enabled && irq_is_pending(irqb);
+ penda = irqa->enabled && irq_is_pending(irqa) && !irqa->active;
+ pendb = irqb->enabled && irq_is_pending(irqb) && !irqb->active;
- if (!penda || !pendb) {
- ret = (int)pendb - (int)penda;
+ ret = (int)pendb - (int)penda;
+ if (ret)
goto out;
- }
- /* Both pending and enabled, sort by priority */
- ret = irqa->priority - irqb->priority;
+ /* Both pending and enabled, sort by priority (lower number first) */
+ ret = (int)irqa->priority - (int)irqb->priority;
+ if (ret)
+ goto out;
+
+ /* Finally, HW bit active interrupts have priority over non-HW ones */
+ ret = (int)irqb->hw - (int)irqa->hw;
+
out:
raw_spin_unlock(&irqb->irq_lock);
raw_spin_unlock(&irqa->irq_lock);
@@ -323,10 +346,12 @@ out:
static void vgic_sort_ap_list(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct vgic_sort_info info = { .vcpu = vcpu, };
lockdep_assert_held(&vgic_cpu->ap_list_lock);
- list_sort(NULL, &vgic_cpu->ap_list_head, vgic_irq_cmp);
+ vgic_get_vmcr(vcpu, &info.vmcr);
+ list_sort(&info, &vgic_cpu->ap_list_head, vgic_irq_cmp);
}
/*
@@ -349,6 +374,20 @@ static bool vgic_validate_injection(struct vgic_irq *irq, bool level, void *owne
return false;
}
+static bool vgic_model_needs_bcst_kick(struct kvm *kvm)
+{
+ /*
+ * A GICv3 (or GICv3-like) system exposing a GICv3 to the guest
+ * needs a broadcast kick to set TDIR globally.
+ *
+ * For systems that do not have TDIR (ARM's own v8.0 CPUs), the
+ * shadow TDIR bit is always set, and so is the register's TC bit,
+ * so no need to kick the CPUs.
+ */
+ return (cpus_have_final_cap(ARM64_HAS_ICH_HCR_EL2_TDIR) &&
+ kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3);
+}
+
/*
* Check whether an IRQ needs to (and can) be queued to a VCPU's ap list.
* Do the queuing if necessary, taking the right locks in the right order.
@@ -361,6 +400,7 @@ bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
unsigned long flags) __releases(&irq->irq_lock)
{
struct kvm_vcpu *vcpu;
+ bool bcast;
lockdep_assert_held(&irq->irq_lock);
@@ -435,11 +475,20 @@ retry:
list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
irq->vcpu = vcpu;
+ /* A new SPI may result in deactivation trapping on all vcpus */
+ bcast = (vgic_model_needs_bcst_kick(vcpu->kvm) &&
+ vgic_valid_spi(vcpu->kvm, irq->intid) &&
+ atomic_fetch_inc(&vcpu->kvm->arch.vgic.active_spis) == 0);
+
raw_spin_unlock(&irq->irq_lock);
raw_spin_unlock_irqrestore(&vcpu->arch.vgic_cpu.ap_list_lock, flags);
- kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
- kvm_vcpu_kick(vcpu);
+ if (!bcast) {
+ kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+ kvm_vcpu_kick(vcpu);
+ } else {
+ kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_IRQ_PENDING);
+ }
return true;
}
@@ -791,98 +840,148 @@ static inline void vgic_clear_lr(struct kvm_vcpu *vcpu, int lr)
vgic_v3_clear_lr(vcpu, lr);
}
-static inline void vgic_set_underflow(struct kvm_vcpu *vcpu)
-{
- if (kvm_vgic_global_state.type == VGIC_V2)
- vgic_v2_set_underflow(vcpu);
- else
- vgic_v3_set_underflow(vcpu);
-}
-
-/* Requires the ap_list_lock to be held. */
-static int compute_ap_list_depth(struct kvm_vcpu *vcpu,
- bool *multi_sgi)
+static void summarize_ap_list(struct kvm_vcpu *vcpu,
+ struct ap_list_summary *als)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
struct vgic_irq *irq;
- int count = 0;
-
- *multi_sgi = false;
lockdep_assert_held(&vgic_cpu->ap_list_lock);
+ *als = (typeof(*als)){};
+
list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
- int w;
+ guard(raw_spinlock)(&irq->irq_lock);
- raw_spin_lock(&irq->irq_lock);
- /* GICv2 SGIs can count for more than one... */
- w = vgic_irq_get_lr_count(irq);
- raw_spin_unlock(&irq->irq_lock);
+ if (unlikely(vgic_target_oracle(irq) != vcpu))
+ continue;
- count += w;
- *multi_sgi |= (w > 1);
+ if (!irq->active)
+ als->nr_pend++;
+ else
+ als->nr_act++;
+
+ if (irq->intid < VGIC_NR_SGIS)
+ als->nr_sgi++;
}
- return count;
}
-/* Requires the VCPU's ap_list_lock to be held. */
+/*
+ * Dealing with LR overflow is close to black magic -- dress accordingly.
+ *
+ * We have to present an almost infinite number of interrupts through a very
+ * limited number of registers. Therefore crucial decisions must be made to
+ * ensure we feed the most relevant interrupts into the LRs, and yet have
+ * some facilities to let the guest interact with those that are not there.
+ *
+ * All considerations below are in the context of interrupts targeting a
+ * single vcpu with non-idle state (either pending, active, or both),
+ * colloquially called the ap_list:
+ *
+ * - Pending interrupts must have priority over active interrupts. This also
+ * excludes pending+active interrupts. This ensures that a guest can
+ * perform priority drops on any number of interrupts, and yet be
+ * presented the next pending one.
+ *
+ * - Deactivation of interrupts outside of the LRs must be tracked by using
+ * either the EOIcount-driven maintenance interrupt, and sometimes by
+ * trapping the DIR register.
+ *
+ * - For EOImode=0, a non-zero EOIcount means walking the ap_list past the
+ * point that made it into the LRs, and deactivate interrupts that would
+ * have made it onto the LRs if we had the space.
+ *
+ * - The MI-generation bits must be used to try and force an exit when the
+ * guest has done enough changes to the LRs that we want to reevaluate the
+ * situation:
+ *
+ * - if the total number of pending interrupts exceeds the number of
+ *       LRs, NPIE must be set in order to exit once no pending interrupts
+ * are present in the LRs, allowing us to populate the next batch.
+ *
+ * - if there are active interrupts outside of the LRs, then LRENPIE
+ * must be set so that we exit on deactivation of one of these, and
+ * work out which one is to be deactivated. Note that this is not
+ * enough to deal with EOImode=1, see below.
+ *
+ * - if the overall number of interrupts exceeds the number of LRs,
+ * then UIE must be set to allow refilling of the LRs once the
+ * majority of them has been processed.
+ *
+ * - as usual, MI triggers are only an optimisation, since we cannot
+ *       rely on the MI being delivered in a timely manner...
+ *
+ * - EOImode=1 creates some additional problems:
+ *
+ * - deactivation can happen in any order, and we cannot rely on
+ * EOImode=0's coupling of priority-drop and deactivation which
+ * imposes strict reverse Ack order. This means that DIR must
+ * trap if we have active interrupts outside of the LRs.
+ *
+ * - deactivation of SPIs can occur on any CPU, while the SPI is only
+ * present in the ap_list of the CPU that actually ack-ed it. In that
+ * case, EOIcount doesn't provide enough information, and we must
+ * resort to trapping DIR even if we don't overflow the LRs. Bonus
+ * point for not trapping DIR when no SPIs are pending or active in
+ * the whole VM.
+ *
+ * - LPIs do not suffer the same problem as SPIs on deactivation, as we
+ * have to essentially discard the active state, see below.
+ *
+ * - Virtual LPIs have an active state (surprise!), which gets removed on
+ * priority drop (EOI). However, EOIcount doesn't get bumped when the LPI
+ * is not present in the LR (surprise again!). Special care must therefore
+ * be taken to remove the active state from any activated LPI when exiting
+ * from the guest. This is in a way no different from what happens on the
+ * physical side. We still rely on the running priority to have been
+ * removed from the APRs, irrespective of the LPI being present in the LRs
+ * or not.
+ *
+ * - Virtual SGIs directly injected via GICv4.1 must not affect EOIcount, as
+ * they are not managed in SW and don't have a true active state. So only
+ * set vSGIEOICount when no SGIs are in the ap_list.
+ *
+ * - GICv2 SGIs with multiple sources are injected one source at a time, as
+ * if they were made pending sequentially. This may mean that we don't
+ * always present the HPPI if other interrupts with lower priority are
+ * pending in the LRs. Big deal.
+ */
static void vgic_flush_lr_state(struct kvm_vcpu *vcpu)
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ struct ap_list_summary als;
struct vgic_irq *irq;
- int count;
- bool multi_sgi;
- u8 prio = 0xff;
- int i = 0;
+ int count = 0;
lockdep_assert_held(&vgic_cpu->ap_list_lock);
- count = compute_ap_list_depth(vcpu, &multi_sgi);
- if (count > kvm_vgic_global_state.nr_lr || multi_sgi)
- vgic_sort_ap_list(vcpu);
+ summarize_ap_list(vcpu, &als);
- count = 0;
+ if (irqs_outside_lrs(&als))
+ vgic_sort_ap_list(vcpu);
list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) {
- raw_spin_lock(&irq->irq_lock);
-
- /*
- * If we have multi-SGIs in the pipeline, we need to
- * guarantee that they are all seen before any IRQ of
- * lower priority. In that case, we need to filter out
- * these interrupts by exiting early. This is easy as
- * the AP list has been sorted already.
- */
- if (multi_sgi && irq->priority > prio) {
- raw_spin_unlock(&irq->irq_lock);
- break;
- }
-
- if (likely(vgic_target_oracle(irq) == vcpu)) {
- vgic_populate_lr(vcpu, irq, count++);
-
- if (irq->source)
- prio = irq->priority;
+ scoped_guard(raw_spinlock, &irq->irq_lock) {
+ if (likely(vgic_target_oracle(irq) == vcpu)) {
+ vgic_populate_lr(vcpu, irq, count++);
+ }
}
- raw_spin_unlock(&irq->irq_lock);
-
- if (count == kvm_vgic_global_state.nr_lr) {
- if (!list_is_last(&irq->ap_list,
- &vgic_cpu->ap_list_head))
- vgic_set_underflow(vcpu);
+ if (count == kvm_vgic_global_state.nr_lr)
break;
- }
}
/* Nuke remaining LRs */
- for (i = count ; i < kvm_vgic_global_state.nr_lr; i++)
+ for (int i = count ; i < kvm_vgic_global_state.nr_lr; i++)
vgic_clear_lr(vcpu, i);
- if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
+ if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif)) {
vcpu->arch.vgic_cpu.vgic_v2.used_lrs = count;
- else
+ vgic_v2_configure_hcr(vcpu, &als);
+ } else {
vcpu->arch.vgic_cpu.vgic_v3.used_lrs = count;
+ vgic_v3_configure_hcr(vcpu, &als);
+ }
}
static inline bool can_access_vgic_from_kernel(void)
@@ -906,8 +1005,6 @@ static inline void vgic_save_state(struct kvm_vcpu *vcpu)
/* Sync back the hardware VGIC state into our emulation after a guest's run. */
void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
{
- int used_lrs;
-
/* If nesting, emulate the HW effect from L0 to L1 */
if (vgic_state_is_nested(vcpu)) {
vgic_v3_sync_nested(vcpu);
@@ -917,21 +1014,22 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
if (vcpu_has_nv(vcpu))
vgic_v3_nested_update_mi(vcpu);
- /* An empty ap_list_head implies used_lrs == 0 */
- if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head))
- return;
-
if (can_access_vgic_from_kernel())
vgic_save_state(vcpu);
- if (!static_branch_unlikely(&kvm_vgic_global_state.gicv3_cpuif))
- used_lrs = vcpu->arch.vgic_cpu.vgic_v2.used_lrs;
- else
- used_lrs = vcpu->arch.vgic_cpu.vgic_v3.used_lrs;
+ vgic_fold_lr_state(vcpu);
+ vgic_prune_ap_list(vcpu);
+}
+
+/* Sync interrupts that were deactivated through a DIR trap */
+void kvm_vgic_process_async_update(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
- if (used_lrs)
- vgic_fold_lr_state(vcpu);
+ /* Make sure we're in the same context as LR handling */
+ local_irq_save(flags);
vgic_prune_ap_list(vcpu);
+ local_irq_restore(flags);
}
static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
@@ -958,8 +1056,9 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
* abort the entry procedure and inject the exception at the
* beginning of the run loop.
*
- * - Otherwise, do exactly *NOTHING*. The guest state is
- * already loaded, and we can carry on with running it.
+ * - Otherwise, do exactly *NOTHING* apart from enabling the virtual
+ * CPU interface. The guest state is already loaded, and we can
+ * carry on with running it.
*
* If we have NV, but are not in a nested state, compute the
* maintenance interrupt state, as it may fire.
@@ -968,35 +1067,17 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
if (kvm_vgic_vcpu_pending_irq(vcpu))
kvm_make_request(KVM_REQ_GUEST_HYP_IRQ_PENDING, vcpu);
+ vgic_v3_flush_nested(vcpu);
return;
}
if (vcpu_has_nv(vcpu))
vgic_v3_nested_update_mi(vcpu);
- /*
- * If there are no virtual interrupts active or pending for this
- * VCPU, then there is no work to do and we can bail out without
- * taking any lock. There is a potential race with someone injecting
- * interrupts to the VCPU, but it is a benign race as the VCPU will
- * either observe the new interrupt before or after doing this check,
- * and introducing additional synchronization mechanism doesn't change
- * this.
- *
- * Note that we still need to go through the whole thing if anything
- * can be directly injected (GICv4).
- */
- if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head) &&
- !vgic_supports_direct_irqs(vcpu->kvm))
- return;
-
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
- if (!list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) {
- raw_spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock);
+ scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock)
vgic_flush_lr_state(vcpu);
- raw_spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock);
- }
if (can_access_vgic_from_kernel())
vgic_restore_state(vcpu);
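
The reworked vgic_irq_cmp() orders the ap_list so that interrupts deliverable to this vCPU come first, then those whose group is enabled, then pending non-active ones, then ascending priority, with HW-backed entries ahead of purely virtual ones at equal priority. Below is a standalone comparator following the same rules, usable with qsort() as a stand-in for list_sort(); the struct and its fields are invented simplifications of struct vgic_irq.

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct model_irq {
	int	intid;
	int	priority;	/* lower value sorts first */
	bool	deliverable;	/* vgic_target_oracle(irq) == this vcpu */
	bool	group_enabled;	/* grpen0/grpen1 for this interrupt's group */
	bool	pending;	/* pending and enabled */
	bool	active;
	bool	hw;
};

static int model_irq_cmp(const void *pa, const void *pb)
{
	const struct model_irq *a = pa, *b = pb;
	int ret;

	/* Undeliverable interrupts go last */
	ret = (int)b->deliverable - (int)a->deliverable;
	if (ret)
		return ret;

	/* Same for interrupts targeting a disabled group */
	ret = (int)b->group_enabled - (int)a->group_enabled;
	if (ret)
		return ret;

	/* Pending, non-active interrupts come before anything else */
	ret = (int)(b->pending && !b->active) - (int)(a->pending && !a->active);
	if (ret)
		return ret;

	/* Then by priority, lower number first */
	ret = a->priority - b->priority;
	if (ret)
		return ret;

	/* Finally, HW-backed interrupts ahead of purely virtual ones */
	return (int)b->hw - (int)a->hw;
}

int main(void)
{
	struct model_irq list[] = {
		{ 40, 0x80, true, true, false, true,  false },
		{ 33, 0xa0, true, true, true,  false, false },
		{ 34, 0x20, true, true, true,  false, false },
	};

	qsort(list, 3, sizeof(list[0]), model_irq_cmp);
	for (int i = 0; i < 3; i++)	/* prints 34, 33, 40 */
		printf("intid %d\n", list[i].intid);
	return 0;
}
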
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index ac5f9c5d2b98..5f0fc96b4dc2 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -164,6 +164,22 @@ static inline int vgic_write_guest_lock(struct kvm *kvm, gpa_t gpa,
return ret;
}
+void kvm_compute_ich_hcr_trap_bits(struct alt_instr *alt,
+ __le32 *origptr, __le32 *updptr, int nr_inst);
+
+static inline u64 vgic_ich_hcr_trap_bits(void)
+{
+ u64 hcr;
+
+ /* All the traps are in the bottom 16bits */
+ asm volatile(ALTERNATIVE_CB("movz %0, #0\n",
+ ARM64_ALWAYS_SYSTEM,
+ kvm_compute_ich_hcr_trap_bits)
+ : "=r" (hcr));
+
+ return hcr;
+}
+
/*
* This struct provides an intermediate representation of the fields contained
* in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC
@@ -220,6 +236,21 @@ struct its_ite {
u32 event_id;
};
+struct ap_list_summary {
+ unsigned int nr_pend; /* purely pending, not active */
+ unsigned int nr_act; /* active, or active+pending */
+ unsigned int nr_sgi; /* any SGI */
+};
+
+#define irqs_outside_lrs(s) \
+ (((s)->nr_pend + (s)->nr_act) > kvm_vgic_global_state.nr_lr)
+
+#define irqs_pending_outside_lrs(s) \
+ ((s)->nr_pend > kvm_vgic_global_state.nr_lr)
+
+#define irqs_active_outside_lrs(s) \
+ ((s)->nr_act && irqs_outside_lrs(s))
+
int vgic_v3_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
struct vgic_reg_attr *reg_attr);
int vgic_v2_parse_attr(struct kvm_device *dev, struct kvm_device_attr *attr,
@@ -230,6 +261,7 @@ vgic_get_mmio_region(struct kvm_vcpu *vcpu, struct vgic_io_device *iodev,
struct vgic_irq *vgic_get_irq(struct kvm *kvm, u32 intid);
struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid);
void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
+struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq);
bool vgic_get_phys_line_level(struct vgic_irq *irq);
void vgic_irq_set_phys_pending(struct vgic_irq *irq, bool pending);
void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active);
@@ -245,8 +277,9 @@ int vgic_check_iorange(struct kvm *kvm, phys_addr_t ioaddr,
void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
+void vgic_v2_deactivate(struct kvm_vcpu *vcpu, u32 val);
void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr);
-void vgic_v2_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v2_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als);
int vgic_v2_has_attr_regs(struct kvm_device *dev, struct kvm_device_attr *attr);
int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
int offset, u32 *val);
@@ -254,7 +287,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
int offset, u32 *val);
void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v2_enable(struct kvm_vcpu *vcpu);
+void vgic_v2_reset(struct kvm_vcpu *vcpu);
int vgic_v2_probe(const struct gic_kvm_info *info);
int vgic_v2_map_resources(struct kvm *kvm);
int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
@@ -286,10 +319,11 @@ static inline void vgic_get_irq_ref(struct vgic_irq *irq)
void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr);
-void vgic_v3_set_underflow(struct kvm_vcpu *vcpu);
+void vgic_v3_deactivate(struct kvm_vcpu *vcpu, u64 val);
+void vgic_v3_configure_hcr(struct kvm_vcpu *vcpu, struct ap_list_summary *als);
void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_v3_enable(struct kvm_vcpu *vcpu);
+void vgic_v3_reset(struct kvm_vcpu *vcpu);
int vgic_v3_probe(const struct gic_kvm_info *info);
int vgic_v3_map_resources(struct kvm *kvm);
int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq);
@@ -412,6 +446,7 @@ static inline bool kvm_has_gicv3(struct kvm *kvm)
return kvm_has_feat(kvm, ID_AA64PFR0_EL1, GIC, IMP);
}
+void vgic_v3_flush_nested(struct kvm_vcpu *vcpu);
void vgic_v3_sync_nested(struct kvm_vcpu *vcpu);
void vgic_v3_load_nested(struct kvm_vcpu *vcpu);
void vgic_v3_put_nested(struct kvm_vcpu *vcpu);
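
The ap_list_summary helpers above feed vgic_v{2,3}_configure_hcr(), which picks the maintenance-interrupt enables for the next run. A small worked example reusing the same macro definitions against a hypothetical 4-LR GIC (standalone sketch; nr_lr stands in for kvm_vgic_global_state.nr_lr):

#include <stdio.h>

struct ap_list_summary {
	unsigned int nr_pend;	/* purely pending, not active */
	unsigned int nr_act;	/* active, or active+pending */
	unsigned int nr_sgi;	/* any SGI */
};

static const unsigned int nr_lr = 4;	/* hypothetical LR count */

#define irqs_outside_lrs(s)		(((s)->nr_pend + (s)->nr_act) > nr_lr)
#define irqs_pending_outside_lrs(s)	((s)->nr_pend > nr_lr)
#define irqs_active_outside_lrs(s)	((s)->nr_act && irqs_outside_lrs(s))

int main(void)
{
	struct ap_list_summary als = { .nr_pend = 3, .nr_act = 3, .nr_sgi = 0 };

	/* 6 interrupts for 4 LRs: UIE and LRENPIE would be set, but not NPIE */
	printf("outside=%d pending_outside=%d active_outside=%d\n",
	       irqs_outside_lrs(&als),
	       irqs_pending_outside_lrs(&als),
	       irqs_active_outside_lrs(&als));
	return 0;
}
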