Diffstat (limited to 'arch/x86/hyperv')
-rw-r--r--   arch/x86/hyperv/hv_init.c     69
-rw-r--r--   arch/x86/hyperv/irqdomain.c  111
-rw-r--r--   arch/x86/hyperv/ivm.c        226
3 files changed, 345 insertions(+), 61 deletions(-)
diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c
index afdbda2dd7b7..e890fd37e9c2 100644
--- a/arch/x86/hyperv/hv_init.c
+++ b/arch/x86/hyperv/hv_init.c
@@ -17,7 +17,6 @@
 #include <asm/desc.h>
 #include <asm/e820/api.h>
 #include <asm/sev.h>
-#include <asm/ibt.h>
 #include <asm/hypervisor.h>
 #include <hyperv/hvhdk.h>
 #include <asm/mshyperv.h>
@@ -37,7 +36,45 @@
 #include <linux/export.h>
 
 void *hv_hypercall_pg;
+
+#ifdef CONFIG_X86_64
+static u64 __hv_hyperfail(u64 control, u64 param1, u64 param2)
+{
+	return U64_MAX;
+}
+
+DEFINE_STATIC_CALL(__hv_hypercall, __hv_hyperfail);
+
+u64 hv_std_hypercall(u64 control, u64 param1, u64 param2)
+{
+	u64 hv_status;
+
+	register u64 __r8 asm("r8") = param2;
+	asm volatile ("call " STATIC_CALL_TRAMP_STR(__hv_hypercall)
+		      : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+			"+c" (control), "+d" (param1), "+r" (__r8)
+		      : : "cc", "memory", "r9", "r10", "r11");
+
+	return hv_status;
+}
+
+typedef u64 (*hv_hypercall_f)(u64 control, u64 param1, u64 param2);
+
+static inline void hv_set_hypercall_pg(void *ptr)
+{
+	hv_hypercall_pg = ptr;
+
+	if (!ptr)
+		ptr = &__hv_hyperfail;
+	static_call_update(__hv_hypercall, (hv_hypercall_f)ptr);
+}
+#else
+static inline void hv_set_hypercall_pg(void *ptr)
+{
+	hv_hypercall_pg = ptr;
+}
 EXPORT_SYMBOL_GPL(hv_hypercall_pg);
+#endif
 
 union hv_ghcb * __percpu *hv_ghcb_pg;
 
@@ -330,7 +367,7 @@ static int hv_suspend(void)
	 * pointer is restored on resume.
	 */
 	hv_hypercall_pg_saved = hv_hypercall_pg;
-	hv_hypercall_pg = NULL;
+	hv_set_hypercall_pg(NULL);
 
 	/* Disable the hypercall page in the hypervisor */
 	rdmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
@@ -356,7 +393,7 @@ static void hv_resume(void)
 		vmalloc_to_pfn(hv_hypercall_pg_saved);
 	wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
 
-	hv_hypercall_pg = hv_hypercall_pg_saved;
+	hv_set_hypercall_pg(hv_hypercall_pg_saved);
 	hv_hypercall_pg_saved = NULL;
 
 	/*
@@ -476,8 +513,8 @@ void __init hyperv_init(void)
 	if (hv_isolation_type_tdx() && !ms_hyperv.paravisor_present)
 		goto skip_hypercall_pg_init;
 
-	hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, VMALLOC_START,
-			VMALLOC_END, GFP_KERNEL, PAGE_KERNEL_ROX,
+	hv_hypercall_pg = __vmalloc_node_range(PAGE_SIZE, 1, MODULES_VADDR,
+			MODULES_END, GFP_KERNEL, PAGE_KERNEL_ROX,
 			VM_FLUSH_RESET_PERMS, NUMA_NO_NODE,
 			__builtin_return_address(0));
 	if (hv_hypercall_pg == NULL)
@@ -515,27 +552,9 @@ void __init hyperv_init(void)
 		wrmsrq(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64);
 	}
 
-skip_hypercall_pg_init:
-	/*
-	 * Some versions of Hyper-V that provide IBT in guest VMs have a bug
-	 * in that there's no ENDBR64 instruction at the entry to the
-	 * hypercall page. Because hypercalls are invoked via an indirect call
-	 * to the hypercall page, all hypercall attempts fail when IBT is
-	 * enabled, and Linux panics. For such buggy versions, disable IBT.
-	 *
-	 * Fixed versions of Hyper-V always provide ENDBR64 on the hypercall
-	 * page, so if future Linux kernel versions enable IBT for 32-bit
-	 * builds, additional hypercall page hackery will be required here
-	 * to provide an ENDBR32.
-	 */
-#ifdef CONFIG_X86_KERNEL_IBT
-	if (cpu_feature_enabled(X86_FEATURE_IBT) &&
-	    *(u32 *)hv_hypercall_pg != gen_endbr()) {
-		setup_clear_cpu_cap(X86_FEATURE_IBT);
-		pr_warn("Disabling IBT because of Hyper-V bug\n");
-	}
-#endif
+	hv_set_hypercall_pg(hv_hypercall_pg);
 
+skip_hypercall_pg_init:
 	/*
 	 * hyperv_init() is called before LAPIC is initialized: see
 	 * apic_intr_mode_init() -> x86_platform.apic_post_init() and
diff --git a/arch/x86/hyperv/irqdomain.c b/arch/x86/hyperv/irqdomain.c
index 090f5ac9f492..c3ba12b1bc07 100644
--- a/arch/x86/hyperv/irqdomain.c
+++ b/arch/x86/hyperv/irqdomain.c
@@ -11,6 +11,7 @@
 #include <linux/pci.h>
 #include <linux/irq.h>
 #include <linux/export.h>
+#include <linux/irqchip/irq-msi-lib.h>
 #include <asm/mshyperv.h>
 
 static int hv_map_interrupt(union hv_device_id device_id, bool level,
@@ -289,59 +290,99 @@ static void hv_teardown_msi_irq(struct pci_dev *dev, struct irq_data *irqd)
 	(void)hv_unmap_msi_interrupt(dev, &old_entry);
 }
 
-static void hv_msi_free_irq(struct irq_domain *domain,
-			    struct msi_domain_info *info, unsigned int virq)
-{
-	struct irq_data *irqd = irq_get_irq_data(virq);
-	struct msi_desc *desc;
-
-	if (!irqd)
-		return;
-
-	desc = irq_data_get_msi_desc(irqd);
-	if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev)))
-		return;
-
-	hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd);
-}
-
 /*
  * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
  * which implement the MSI or MSI-X Capability Structure.
  */
 static struct irq_chip hv_pci_msi_controller = {
 	.name			= "HV-PCI-MSI",
-	.irq_unmask		= pci_msi_unmask_irq,
-	.irq_mask		= pci_msi_mask_irq,
 	.irq_ack		= irq_chip_ack_parent,
-	.irq_retrigger		= irq_chip_retrigger_hierarchy,
 	.irq_compose_msi_msg	= hv_irq_compose_msi_msg,
-	.irq_set_affinity	= msi_domain_set_affinity,
-	.flags			= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED,
+	.irq_set_affinity	= irq_chip_set_affinity_parent,
 };
 
-static struct msi_domain_ops pci_msi_domain_ops = {
-	.msi_free	= hv_msi_free_irq,
-	.msi_prepare	= pci_msi_prepare,
+static bool hv_init_dev_msi_info(struct device *dev, struct irq_domain *domain,
+				 struct irq_domain *real_parent, struct msi_domain_info *info)
+{
+	struct irq_chip *chip = info->chip;
+
+	if (!msi_lib_init_dev_msi_info(dev, domain, real_parent, info))
+		return false;
+
+	chip->flags |= IRQCHIP_SKIP_SET_WAKE | IRQCHIP_MOVE_DEFERRED;
+
+	info->ops->msi_prepare = pci_msi_prepare;
+
+	return true;
+}
+
+#define HV_MSI_FLAGS_SUPPORTED	(MSI_GENERIC_FLAGS_MASK | MSI_FLAG_PCI_MSIX)
+#define HV_MSI_FLAGS_REQUIRED	(MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS)
+
+static struct msi_parent_ops hv_msi_parent_ops = {
+	.supported_flags	= HV_MSI_FLAGS_SUPPORTED,
+	.required_flags		= HV_MSI_FLAGS_REQUIRED,
+	.bus_select_token	= DOMAIN_BUS_NEXUS,
+	.bus_select_mask	= MATCH_PCI_MSI,
+	.chip_flags		= MSI_CHIP_FLAG_SET_ACK,
+	.prefix			= "HV-",
+	.init_dev_msi_info	= hv_init_dev_msi_info,
 };
 
-static struct msi_domain_info hv_pci_msi_domain_info = {
-	.flags		= MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS |
-			  MSI_FLAG_PCI_MSIX,
-	.ops		= &pci_msi_domain_ops,
-	.chip		= &hv_pci_msi_controller,
-	.handler	= handle_edge_irq,
-	.handler_name	= "edge",
+static int hv_msi_domain_alloc(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs,
+			       void *arg)
+{
+	/*
+	 * TODO: The allocation bits of hv_irq_compose_msi_msg(), i.e. everything except
+	 * entry_to_msi_msg() should be in here.
+	 */
+
+	int ret;
+
+	ret = irq_domain_alloc_irqs_parent(d, virq, nr_irqs, arg);
+	if (ret)
+		return ret;
+
+	for (int i = 0; i < nr_irqs; ++i) {
+		irq_domain_set_info(d, virq + i, 0, &hv_pci_msi_controller, NULL,
+				    handle_edge_irq, NULL, "edge");
+	}
+	return 0;
+}
+
+static void hv_msi_domain_free(struct irq_domain *d, unsigned int virq, unsigned int nr_irqs)
+{
+	for (int i = 0; i < nr_irqs; ++i) {
+		struct irq_data *irqd = irq_domain_get_irq_data(d, virq);
+		struct msi_desc *desc;
+
+		desc = irq_data_get_msi_desc(irqd);
+		if (!desc || !desc->irq || WARN_ON_ONCE(!dev_is_pci(desc->dev)))
+			continue;
+
+		hv_teardown_msi_irq(to_pci_dev(desc->dev), irqd);
+	}
+	irq_domain_free_irqs_top(d, virq, nr_irqs);
+}
+
+static const struct irq_domain_ops hv_msi_domain_ops = {
+	.select	= msi_lib_irq_domain_select,
+	.alloc	= hv_msi_domain_alloc,
+	.free	= hv_msi_domain_free,
 };
 
 struct irq_domain * __init hv_create_pci_msi_domain(void)
 {
 	struct irq_domain *d = NULL;
-	struct fwnode_handle *fn;
 
-	fn = irq_domain_alloc_named_fwnode("HV-PCI-MSI");
-	if (fn)
-		d = pci_msi_create_irq_domain(fn, &hv_pci_msi_domain_info, x86_vector_domain);
+	struct irq_domain_info info = {
+		.fwnode	= irq_domain_alloc_named_fwnode("HV-PCI-MSI"),
+		.ops	= &hv_msi_domain_ops,
+		.parent	= x86_vector_domain,
+	};
+
+	if (info.fwnode)
+		d = msi_create_parent_irq_domain(&info, &hv_msi_parent_ops);
 
 	/* No point in going further if we can't get an irq domain */
 	BUG_ON(!d);
diff --git a/arch/x86/hyperv/ivm.c b/arch/x86/hyperv/ivm.c
index ade6c665c97e..651771534cae 100644
--- a/arch/x86/hyperv/ivm.c
+++ b/arch/x86/hyperv/ivm.c
@@ -385,9 +385,23 @@ int hv_snp_boot_ap(u32 apic_id, unsigned long start_ip, unsigned int cpu)
 	return ret;
 }
 
+u64 hv_snp_hypercall(u64 control, u64 param1, u64 param2)
+{
+	u64 hv_status;
+
+	register u64 __r8 asm("r8") = param2;
+	asm volatile("vmmcall"
+		     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
+		       "+c" (control), "+d" (param1), "+r" (__r8)
+		     : : "cc", "memory", "r9", "r10", "r11");
+
+	return hv_status;
+}
+
 #else
 static inline void hv_ghcb_msr_write(u64 msr, u64 value) {}
 static inline void hv_ghcb_msr_read(u64 msr, u64 *value) {}
+u64 hv_snp_hypercall(u64 control, u64 param1, u64 param2) { return U64_MAX; }
 #endif /* CONFIG_AMD_MEM_ENCRYPT */
 
 #ifdef CONFIG_INTEL_TDX_GUEST
@@ -437,6 +451,7 @@ u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2)
 #else
 static inline void hv_tdx_msr_write(u64 msr, u64 value) {}
 static inline void hv_tdx_msr_read(u64 msr, u64 *value) {}
+u64 hv_tdx_hypercall(u64 control, u64 param1, u64 param2) { return U64_MAX; }
 #endif /* CONFIG_INTEL_TDX_GUEST */
 
 #if defined(CONFIG_AMD_MEM_ENCRYPT) || defined(CONFIG_INTEL_TDX_GUEST)
@@ -463,6 +478,195 @@ void hv_ivm_msr_read(u64 msr, u64 *value)
 }
 
 /*
+ * Keep track of the PFN regions which were shared with the host. The access
+ * must be revoked upon kexec/kdump (see hv_ivm_clear_host_access()).
+ */
+struct hv_enc_pfn_region {
+	struct list_head list;
+	u64 pfn;
+	int count;
+};
+
+static LIST_HEAD(hv_list_enc);
+static DEFINE_RAW_SPINLOCK(hv_list_enc_lock);
+
+static int hv_list_enc_add(const u64 *pfn_list, int count)
+{
+	struct hv_enc_pfn_region *ent;
+	unsigned long flags;
+	u64 pfn;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		pfn = pfn_list[i];
+
+		raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+		/* Check if the PFN already exists in some region first */
+		list_for_each_entry(ent, &hv_list_enc, list) {
+			if ((ent->pfn <= pfn) && (ent->pfn + ent->count - 1 >= pfn))
+				/* Nothing to do - pfn is already in the list */
+				goto unlock_done;
+		}
+
+		/*
+		 * Check if the PFN is adjacent to an existing region. Growing
+		 * a region can make it adjacent to another one but merging is
+		 * not (yet) implemented for simplicity. A PFN cannot be added
+		 * to two regions to keep the logic in hv_list_enc_remove()
+		 * correct.
+		 */
+		list_for_each_entry(ent, &hv_list_enc, list) {
+			if (ent->pfn + ent->count == pfn) {
+				/* Grow existing region up */
+				ent->count++;
+				goto unlock_done;
+			} else if (pfn + 1 == ent->pfn) {
+				/* Grow existing region down */
+				ent->pfn--;
+				ent->count++;
+				goto unlock_done;
+			}
+		}
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+		/* No adjacent region found -- create a new one */
+		ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+		if (!ent)
+			return -ENOMEM;
+
+		ent->pfn = pfn;
+		ent->count = 1;
+
+		raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+		list_add(&ent->list, &hv_list_enc);
+
+unlock_done:
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+	}
+
+	return 0;
+}
+
+static int hv_list_enc_remove(const u64 *pfn_list, int count)
+{
+	struct hv_enc_pfn_region *ent, *t;
+	struct hv_enc_pfn_region new_region;
+	unsigned long flags;
+	u64 pfn;
+	int i;
+
+	for (i = 0; i < count; i++) {
+		pfn = pfn_list[i];
+
+		raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+		list_for_each_entry_safe(ent, t, &hv_list_enc, list) {
+			if (pfn == ent->pfn + ent->count - 1) {
+				/* Removing tail pfn */
+				ent->count--;
+				if (!ent->count) {
+					list_del(&ent->list);
+					kfree(ent);
+				}
+				goto unlock_done;
+			} else if (pfn == ent->pfn) {
+				/* Removing head pfn */
+				ent->count--;
+				ent->pfn++;
+				if (!ent->count) {
+					list_del(&ent->list);
+					kfree(ent);
+				}
+				goto unlock_done;
+			} else if (pfn > ent->pfn && pfn < ent->pfn + ent->count - 1) {
+				/*
+				 * Removing a pfn in the middle. Cut off the tail
+				 * of the existing region and create a template for
+				 * the new one.
+				 */
+				new_region.pfn = pfn + 1;
+				new_region.count = ent->count - (pfn - ent->pfn + 1);
+				ent->count = pfn - ent->pfn;
+				goto unlock_split;
+			}

+		}
+unlock_done:
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+		continue;
+
+unlock_split:
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+
+		ent = kzalloc(sizeof(struct hv_enc_pfn_region), GFP_KERNEL);
+		if (!ent)
+			return -ENOMEM;
+
+		ent->pfn = new_region.pfn;
+		ent->count = new_region.count;
+
+		raw_spin_lock_irqsave(&hv_list_enc_lock, flags);
+		list_add(&ent->list, &hv_list_enc);
+		raw_spin_unlock_irqrestore(&hv_list_enc_lock, flags);
+	}
+
+	return 0;
+}
+
+/* Stop new private<->shared conversions */
+static void hv_vtom_kexec_begin(void)
+{
+	if (!IS_ENABLED(CONFIG_KEXEC_CORE))
+		return;
+
+	/*
+	 * Crash kernel reaches here with interrupts disabled: can't wait for
+	 * conversions to finish.
+	 *
+	 * If race happened, just report and proceed.
+	 */
+	if (!set_memory_enc_stop_conversion())
+		pr_warn("Failed to stop shared<->private conversions\n");
+}
+
+static void hv_vtom_kexec_finish(void)
+{
+	struct hv_gpa_range_for_visibility *input;
+	struct hv_enc_pfn_region *ent;
+	unsigned long flags;
+	u64 hv_status;
+	int cur, i;
+
+	local_irq_save(flags);
+	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
+
+	if (unlikely(!input))
+		goto out;
+
+	list_for_each_entry(ent, &hv_list_enc, list) {
+		for (i = 0, cur = 0; i < ent->count; i++) {
+			input->gpa_page_list[cur] = ent->pfn + i;
+			cur++;
+
+			if (cur == HV_MAX_MODIFY_GPA_REP_COUNT || i == ent->count - 1) {
+				input->partition_id = HV_PARTITION_ID_SELF;
+				input->host_visibility = VMBUS_PAGE_NOT_VISIBLE;
+				input->reserved0 = 0;
+				input->reserved1 = 0;
+				hv_status = hv_do_rep_hypercall(
+					HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY,
+					cur, 0, input, NULL);
+				WARN_ON_ONCE(!hv_result_success(hv_status));
+				cur = 0;
+			}
+		}

+	}
+
+out:
+	local_irq_restore(flags);
+}
+
+/*
  * hv_mark_gpa_visibility - Set pages visible to host via hvcall.
  *
  * In Isolation VM, all guest memory is encrypted from host and guest
@@ -475,6 +679,7 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
 	struct hv_gpa_range_for_visibility *input;
 	u64 hv_status;
 	unsigned long flags;
+	int ret;
 
 	/* no-op if partition isolation is not enabled */
 	if (!hv_is_isolation_supported())
@@ -486,6 +691,13 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
 		return -EINVAL;
 	}
 
+	if (visibility == VMBUS_PAGE_NOT_VISIBLE)
+		ret = hv_list_enc_remove(pfn, count);
+	else
+		ret = hv_list_enc_add(pfn, count);
+	if (ret)
+		return ret;
+
 	local_irq_save(flags);
 
 	input = *this_cpu_ptr(hyperv_pcpu_input_arg);
@@ -506,8 +718,18 @@ static int hv_mark_gpa_visibility(u16 count, const u64 pfn[],
 
 	if (hv_result_success(hv_status))
 		return 0;
+
+	if (visibility == VMBUS_PAGE_NOT_VISIBLE)
+		ret = hv_list_enc_add(pfn, count);
 	else
-		return -EFAULT;
+		ret = hv_list_enc_remove(pfn, count);
+	/*
+	 * There's no good way to recover from -ENOMEM here, the accounting is
+	 * wrong either way.
+	 */
+	WARN_ON_ONCE(ret);
+
+	return -EFAULT;
 }
 
 /*
@@ -669,6 +891,8 @@ void __init hv_vtom_init(void)
 	x86_platform.guest.enc_tlb_flush_required = hv_vtom_tlb_flush_required;
 	x86_platform.guest.enc_status_change_prepare = hv_vtom_clear_present;
 	x86_platform.guest.enc_status_change_finish = hv_vtom_set_host_visibility;
+	x86_platform.guest.enc_kexec_begin = hv_vtom_kexec_begin;
+	x86_platform.guest.enc_kexec_finish = hv_vtom_kexec_finish;
 
 	/* Set WB as the default cache mode. */
 	guest_force_mtrr_state(NULL, 0, MTRR_TYPE_WRBACK);
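
The hv_init.c change routes hypercalls through a static call whose default target, __hv_hyperfail, fails every call until hv_set_hypercall_pg() installs the real hypercall page (and fails again while the page is torn down across hibernation in hv_suspend()/hv_resume()). Below is a minimal user-space sketch of that dispatch pattern; it uses a plain function pointer where the kernel patches the call site with static_call_update(), and all hv_demo_* names are hypothetical:

/* Sketch of the fallback-dispatch pattern behind hv_set_hypercall_pg():
 * a global call target starts out pointing at a stub that fails every
 * call and is switched to the real implementation once it exists. The
 * kernel uses static_call() (self-patching call sites); a function
 * pointer models the same semantics. All hv_demo_* names are made up.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_U64_MAX ((uint64_t)-1)

typedef uint64_t (*hv_demo_hypercall_f)(uint64_t control, uint64_t param1,
					uint64_t param2);

static uint64_t hv_demo_hyperfail(uint64_t control, uint64_t param1,
				  uint64_t param2)
{
	return DEMO_U64_MAX;	/* fails until a real target is installed */
}

/* stands in for DEFINE_STATIC_CALL(__hv_hypercall, __hv_hyperfail) */
static hv_demo_hypercall_f hv_demo_hypercall = hv_demo_hyperfail;

static uint64_t hv_demo_real_hypercall(uint64_t control, uint64_t param1,
				       uint64_t param2)
{
	return 0;	/* HV_STATUS_SUCCESS */
}

/* stands in for hv_set_hypercall_pg()/static_call_update() */
static void hv_demo_set_target(hv_demo_hypercall_f target)
{
	hv_demo_hypercall = target ? target : hv_demo_hyperfail;
}

int main(void)
{
	printf("before: %#llx\n", (unsigned long long)hv_demo_hypercall(1, 0, 0));
	hv_demo_set_target(hv_demo_real_hypercall);
	printf("after:  %#llx\n", (unsigned long long)hv_demo_hypercall(1, 0, 0));
	hv_demo_set_target(NULL);	/* e.g. across hibernation, as in hv_suspend() */
	printf("suspended: %#llx\n", (unsigned long long)hv_demo_hypercall(1, 0, 0));
	return 0;
}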
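
hv_list_enc_add()/hv_list_enc_remove() in the ivm.c change track host-visible PFNs as disjoint extents [pfn, pfn + count): an added PFN is ignored if already covered, grows an adjacent region up or down, or opens a new region; a removed PFN trims a region's head or tail, or splits the region in two. A stand-alone sketch of the same bookkeeping follows, assuming a fixed array in place of the kernel's raw_spinlock-protected list; the demo_* names are hypothetical:

/* Sketch of the region bookkeeping in hv_list_enc_add()/_remove().
 * Regions are disjoint extents [pfn, pfn + count); a fixed array
 * replaces the kernel's locked linked list. demo_* names are made up.
 */
#include <stdio.h>
#include <stdint.h>

struct demo_region { uint64_t pfn; int count; };

static struct demo_region demo_regions[16];
static int demo_nr;

static void demo_add(uint64_t pfn)
{
	/* pass 1: already covered by some region? */
	for (int i = 0; i < demo_nr; i++) {
		struct demo_region *r = &demo_regions[i];

		if (pfn >= r->pfn && pfn < r->pfn + r->count)
			return;
	}
	/* pass 2: grow an adjacent region up or down */
	for (int i = 0; i < demo_nr; i++) {
		struct demo_region *r = &demo_regions[i];

		if (r->pfn + r->count == pfn) {
			r->count++;
			return;
		}
		if (pfn + 1 == r->pfn) {
			r->pfn--;
			r->count++;
			return;
		}
	}
	/* no adjacent region found: open a new one */
	demo_regions[demo_nr++] = (struct demo_region){ pfn, 1 };
}

static void demo_remove(uint64_t pfn)
{
	for (int i = 0; i < demo_nr; i++) {
		struct demo_region *r = &demo_regions[i];

		if (pfn < r->pfn || pfn >= r->pfn + r->count)
			continue;
		if (pfn == r->pfn + r->count - 1) {		/* trim tail */
			r->count--;
		} else if (pfn == r->pfn) {			/* trim head */
			r->pfn++;
			r->count--;
		} else {					/* split the middle */
			demo_regions[demo_nr++] = (struct demo_region){
				pfn + 1, (int)(r->pfn + r->count - pfn - 1) };
			r->count = (int)(pfn - r->pfn);
		}
		if (!r->count)	/* region emptied: replace with the last one */
			demo_regions[i] = demo_regions[--demo_nr];
		return;
	}
}

int main(void)
{
	for (uint64_t pfn = 100; pfn < 105; pfn++)
		demo_add(pfn);		/* one region: [100, 105) */
	demo_remove(102);		/* split: [100, 102) and [103, 105) */
	for (int i = 0; i < demo_nr; i++)
		printf("region: pfn %llu, count %d\n",
		       (unsigned long long)demo_regions[i].pfn,
		       demo_regions[i].count);
	return 0;
}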
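
Finally, hv_vtom_kexec_finish() revokes host visibility by replaying the tracked regions through HVCALL_MODIFY_SPARSE_GPA_PAGE_HOST_VISIBILITY, accumulating PFNs into the per-CPU input page and flushing whenever HV_MAX_MODIFY_GPA_REP_COUNT entries are queued or the current region ends. A sketch of just that batch-and-flush loop, with printf standing in for the hypercall and a deliberately tiny batch size of 4 (the real HV_MAX_MODIFY_GPA_REP_COUNT is much larger); the demo_* names are hypothetical:

/* Sketch of the batch-and-flush loop in hv_vtom_kexec_finish(): PFNs are
 * accumulated into a fixed-size buffer which is flushed when full or when
 * the current region is exhausted. demo_* names and the batch size of 4
 * are made up for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_MAX_REP_COUNT 4

static void demo_flush(const uint64_t *batch, int cur)
{
	/* stands in for hv_do_rep_hypercall(HVCALL_MODIFY_SPARSE_GPA_...) */
	printf("hypercall with %d pfn(s):", cur);
	for (int i = 0; i < cur; i++)
		printf(" %llu", (unsigned long long)batch[i]);
	printf("\n");
}

int main(void)
{
	struct { uint64_t pfn; int count; } regions[] = { { 100, 5 }, { 200, 3 } };
	uint64_t batch[DEMO_MAX_REP_COUNT];

	for (size_t r = 0; r < sizeof(regions) / sizeof(regions[0]); r++) {
		int cur = 0;

		for (int i = 0; i < regions[r].count; i++) {
			batch[cur++] = regions[r].pfn + i;

			/* flush on a full buffer or at the end of the region */
			if (cur == DEMO_MAX_REP_COUNT || i == regions[r].count - 1) {
				demo_flush(batch, cur);
				cur = 0;
			}
		}
	}
	return 0;
}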