From 08e61e861a0e47e5e1a3fb78406afd6b0cea6b6d Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Wed, 13 Apr 2022 07:36:21 -0600 Subject: PCI: hv: Fix multi-MSI to allow more than one MSI vector If the allocation of multiple MSI vectors for multi-MSI fails in the core PCI framework, the framework will retry the allocation as a single MSI vector, assuming that meets the min_vecs specified by the requesting driver. Hyper-V advertises that multi-MSI is supported, but reuses the VECTOR domain to implement that for x86. The VECTOR domain does not support multi-MSI, so the alloc will always fail and fallback to a single MSI allocation. In short, Hyper-V advertises a capability it does not implement. Hyper-V can support multi-MSI because it coordinates with the hypervisor to map the MSIs in the IOMMU's interrupt remapper, which is something the VECTOR domain does not have. Therefore the fix is simple - copy what the x86 IOMMU drivers (AMD/Intel-IR) do by removing X86_IRQ_ALLOC_CONTIGUOUS_VECTORS after calling the VECTOR domain's pci_msi_prepare(). Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs") Signed-off-by: Jeffrey Hugo Reviewed-by: Dexuan Cui Link: https://lore.kernel.org/r/1649856981-14649-1-git-send-email-quic_jhugo@quicinc.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers/pci/controller/pci-hyperv.c') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index d270a204324e..1cbe24b92a38 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -614,7 +614,16 @@ static void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry, static int hv_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *info) { - return pci_msi_prepare(domain, dev, nvec, info); + int ret = pci_msi_prepare(domain, dev, nvec, info); + + /* + * By using the interrupt remapper in the hypervisor IOMMU, contiguous + * CPU vectors is not needed for multi-MSI + */ + if (info->type == X86_IRQ_ALLOC_TYPE_PCI_MSI) + info->flags &= ~X86_IRQ_ALLOC_CONTIGUOUS_VECTORS; + + return ret; } /** -- cgit v1.2.3 From de5ddb7d44347ad8b00533c1850a4e2e636a1ce9 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Tue, 19 Apr 2022 14:23:21 +0200 Subject: PCI: hv: Use vmbus_requestor to generate transaction IDs for VMbus hardening Currently, pointers to guest memory are passed to Hyper-V as transaction IDs in hv_pci. In the face of errors or malicious behavior in Hyper-V, hv_pci should not expose or trust the transaction IDs returned by Hyper-V to be valid guest memory addresses. Instead, use small integers generated by vmbus_requestor as request (transaction) IDs. Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220419122325.10078-3-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 39 +++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 10 deletions(-) (limited to 'drivers/pci/controller/pci-hyperv.c') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 1cbe24b92a38..c18e9b608bd6 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -91,6 +91,13 @@ static enum pci_protocol_version_t pci_protocol_versions[] = { /* space for 32bit serial number as string */ #define SLOT_NAME_SIZE 11 +/* + * Size of requestor for VMbus; the value is based on the observation + * that having more than one request outstanding is 'rare', and so 64 + * should be generous in ensuring that we don't ever run out. + */ +#define HV_PCI_RQSTOR_SIZE 64 + /* * Message Types */ @@ -1529,7 +1536,7 @@ static void hv_int_desc_free(struct hv_pci_dev *hpdev, int_pkt->wslot.slot = hpdev->desc.win_slot.slot; int_pkt->int_desc = *int_desc; vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt), - (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0); + 0, VM_PKT_DATA_INBAND, 0); kfree(int_desc); } @@ -2661,7 +2668,7 @@ static void hv_eject_device_work(struct work_struct *work) ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE; ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot; vmbus_sendpacket(hbus->hdev->channel, ejct_pkt, - sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt, + sizeof(*ejct_pkt), 0, VM_PKT_DATA_INBAND, 0); /* For the get_pcichild() in hv_pci_eject_device() */ @@ -2708,8 +2715,9 @@ static void hv_pci_onchannelcallback(void *context) const int packet_size = 0x100; int ret; struct hv_pcibus_device *hbus = context; + struct vmbus_channel *chan = hbus->hdev->channel; u32 bytes_recvd; - u64 req_id; + u64 req_id, req_addr; struct vmpacket_descriptor *desc; unsigned char *buffer; int bufferlen = packet_size; @@ -2727,8 +2735,8 @@ static void hv_pci_onchannelcallback(void *context) return; while (1) { - ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer, - bufferlen, &bytes_recvd, &req_id); + ret = vmbus_recvpacket_raw(chan, buffer, bufferlen, + &bytes_recvd, &req_id); if (ret == -ENOBUFS) { kfree(buffer); @@ -2755,11 +2763,14 @@ static void hv_pci_onchannelcallback(void *context) switch (desc->type) { case VM_PKT_COMP: - /* - * The host is trusted, and thus it's safe to interpret - * this transaction ID as a pointer. - */ - comp_packet = (struct pci_packet *)req_id; + req_addr = chan->request_addr_callback(chan, req_id); + if (req_addr == VMBUS_RQST_ERROR) { + dev_err(&hbus->hdev->device, + "Invalid transaction ID %llx\n", + req_id); + break; + } + comp_packet = (struct pci_packet *)req_addr; response = (struct pci_response *)buffer; comp_packet->completion_func(comp_packet->compl_ctxt, response, @@ -3440,6 +3451,10 @@ static int hv_pci_probe(struct hv_device *hdev, goto free_dom; } + hdev->channel->next_request_id_callback = vmbus_next_request_id; + hdev->channel->request_addr_callback = vmbus_request_addr; + hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE; + ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, hv_pci_onchannelcallback, hbus); if (ret) @@ -3770,6 +3785,10 @@ static int hv_pci_resume(struct hv_device *hdev) hbus->state = hv_pcibus_init; + hdev->channel->next_request_id_callback = vmbus_next_request_id; + hdev->channel->request_addr_callback = vmbus_request_addr; + hdev->channel->rqstor_size = HV_PCI_RQSTOR_SIZE; + ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0, hv_pci_onchannelcallback, hbus); if (ret) -- cgit v1.2.3 From a765ed47e45166451680ee9af2b9e435c82ec3ba Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Tue, 19 Apr 2022 14:23:25 +0200 Subject: PCI: hv: Fix synchronization between channel callback and hv_compose_msi_msg() Dexuan wrote: "[...] when we disable AccelNet, the host PCI VSP driver sends a PCI_EJECT message first, and the channel callback may set hpdev->state to hv_pcichild_ejecting on a different CPU. This can cause hv_compose_msi_msg() to exit from the loop and 'return', and the on-stack variable 'ctxt' is invalid. Now, if the response message from the host arrives, the channel callback will try to access the invalid 'ctxt' variable, and this may cause a crash." Schematically: Hyper-V sends PCI_EJECT msg hv_pci_onchannelcallback() state = hv_pcichild_ejecting hv_compose_msi_msg() alloc and init comp_pkt state == hv_pcichild_ejecting Hyper-V sends VM_PKT_COMP msg hv_pci_onchannelcallback() retrieve address of comp_pkt 'free' comp_pkt and return comp_pkt->completion_func() Dexuan also showed how the crash can be triggered after introducing suitable delays in the driver code, thus validating the 'assumption' that the host can still normally respond to the guest's compose_msi request after the host has started to eject the PCI device. Fix the synchronization by leveraging the requestor lock as follows: - Before 'return'-ing in hv_compose_msi_msg(), remove the ID (while holding the requestor lock) associated to the completion packet. - Retrieve the address *and call ->completion_func() within a same (requestor) critical section in hv_pci_onchannelcallback(). Reported-by: Wei Hu Reported-by: Dexuan Cui Suggested-by: Michael Kelley Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20220419122325.10078-7-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) (limited to 'drivers/pci/controller/pci-hyperv.c') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index c18e9b608bd6..5800ecfcc517 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1707,7 +1707,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) struct pci_create_interrupt3 v3; } int_pkts; } __packed ctxt; - + u64 trans_id; u32 size; int ret; @@ -1769,10 +1769,10 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) goto free_int_desc; } - ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, &ctxt.int_pkts, - size, (unsigned long)&ctxt.pci_pkt, - VM_PKT_DATA_INBAND, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + ret = vmbus_sendpacket_getid(hpdev->hbus->hdev->channel, &ctxt.int_pkts, + size, (unsigned long)&ctxt.pci_pkt, + &trans_id, VM_PKT_DATA_INBAND, + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); if (ret) { dev_err(&hbus->hdev->device, "Sending request for interrupt failed: 0x%x", @@ -1851,6 +1851,15 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) enable_tasklet: tasklet_enable(&channel->callback_event); + /* + * The completion packet on the stack becomes invalid after 'return'; + * remove the ID from the VMbus requestor if the identifier is still + * mapped to/associated with the packet. (The identifier could have + * been 're-used', i.e., already removed and (re-)mapped.) + * + * Cf. hv_pci_onchannelcallback(). + */ + vmbus_request_addr_match(channel, trans_id, (unsigned long)&ctxt.pci_pkt); free_int_desc: kfree(int_desc); drop_reference: @@ -2729,6 +2738,7 @@ static void hv_pci_onchannelcallback(void *context) struct pci_dev_inval_block *inval; struct pci_dev_incoming *dev_message; struct hv_pci_dev *hpdev; + unsigned long flags; buffer = kmalloc(bufferlen, GFP_ATOMIC); if (!buffer) @@ -2763,8 +2773,11 @@ static void hv_pci_onchannelcallback(void *context) switch (desc->type) { case VM_PKT_COMP: - req_addr = chan->request_addr_callback(chan, req_id); + lock_requestor(chan, flags); + req_addr = __vmbus_request_addr_match(chan, req_id, + VMBUS_RQST_ADDR_ANY); if (req_addr == VMBUS_RQST_ERROR) { + unlock_requestor(chan, flags); dev_err(&hbus->hdev->device, "Invalid transaction ID %llx\n", req_id); @@ -2772,9 +2785,17 @@ static void hv_pci_onchannelcallback(void *context) } comp_packet = (struct pci_packet *)req_addr; response = (struct pci_response *)buffer; + /* + * Call ->completion_func() within the critical section to make + * sure that the packet pointer is still valid during the call: + * here 'valid' means that there's a task still waiting for the + * completion, and that the packet data is still on the waiting + * task's stack. Cf. hv_compose_msi_msg(). + */ comp_packet->completion_func(comp_packet->compl_ctxt, response, bytes_recvd); + unlock_requestor(chan, flags); break; case VM_PKT_DATA_INBAND: -- cgit v1.2.3 From 455880dfe292a2bdd3b4ad6a107299fce610e64b Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Wed, 27 Apr 2022 08:07:33 -0600 Subject: PCI: hv: Fix hv_arch_irq_unmask() for multi-MSI In the multi-MSI case, hv_arch_irq_unmask() will only operate on the first MSI of the N allocated. This is because only the first msi_desc is cached and it is shared by all the MSIs of the multi-MSI block. This means that hv_arch_irq_unmask() gets the correct address, but the wrong data (always 0). This can break MSIs. Lets assume MSI0 is vector 34 on CPU0, and MSI1 is vector 33 on CPU0. hv_arch_irq_unmask() is called on MSI0. It uses a hypercall to configure the MSI address and data (0) to vector 34 of CPU0. This is correct. Then hv_arch_irq_unmask is called on MSI1. It uses another hypercall to configure the MSI address and data (0) to vector 33 of CPU0. This is wrong, and results in both MSI0 and MSI1 being routed to vector 33. Linux will observe extra instances of MSI1 and no instances of MSI0 despite the endpoint device behaving correctly. For the multi-MSI case, we need unique address and data info for each MSI, but the cached msi_desc does not provide that. However, that information can be gotten from the int_desc cached in the chip_data by compose_msi_msg(). Fix the multi-MSI case to use that cached information instead. Since hv_set_msi_entry_from_desc() is no longer applicable, remove it. Signed-off-by: Jeffrey Hugo Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/1651068453-29588-1-git-send-email-quic_jhugo@quicinc.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'drivers/pci/controller/pci-hyperv.c') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 5800ecfcc517..7aea0b7a994b 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -611,13 +611,6 @@ static unsigned int hv_msi_get_int_vector(struct irq_data *data) return cfg->vector; } -static void hv_set_msi_entry_from_desc(union hv_msi_entry *msi_entry, - struct msi_desc *msi_desc) -{ - msi_entry->address.as_uint32 = msi_desc->msg.address_lo; - msi_entry->data.as_uint32 = msi_desc->msg.data; -} - static int hv_msi_prepare(struct irq_domain *domain, struct device *dev, int nvec, msi_alloc_info_t *info) { @@ -647,6 +640,7 @@ static void hv_arch_irq_unmask(struct irq_data *data) { struct msi_desc *msi_desc = irq_data_get_msi_desc(data); struct hv_retarget_device_interrupt *params; + struct tran_int_desc *int_desc; struct hv_pcibus_device *hbus; struct cpumask *dest; cpumask_var_t tmp; @@ -661,6 +655,7 @@ static void hv_arch_irq_unmask(struct irq_data *data) pdev = msi_desc_to_pci_dev(msi_desc); pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); + int_desc = data->chip_data; spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags); @@ -668,7 +663,8 @@ static void hv_arch_irq_unmask(struct irq_data *data) memset(params, 0, sizeof(*params)); params->partition_id = HV_PARTITION_ID_SELF; params->int_entry.source = HV_INTERRUPT_SOURCE_MSI; - hv_set_msi_entry_from_desc(¶ms->int_entry.msi_entry, msi_desc); + params->int_entry.msi_entry.address.as_uint32 = int_desc->address & 0xffffffff; + params->int_entry.msi_entry.data.as_uint32 = int_desc->data; params->device_id = (hbus->hdev->dev_instance.b[5] << 24) | (hbus->hdev->dev_instance.b[4] << 16) | (hbus->hdev->dev_instance.b[7] << 8) | -- cgit v1.2.3 From 23e118a48acf7be223e57d98e98da8ac5a4071ac Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Mon, 2 May 2022 00:42:55 -0700 Subject: PCI: hv: Do not set PCI_COMMAND_MEMORY to reduce VM boot time Currently when the pci-hyperv driver finishes probing and initializing the PCI device, it sets the PCI_COMMAND_MEMORY bit; later when the PCI device is registered to the core PCI subsystem, the core PCI driver's BAR detection and initialization code toggles the bit multiple times, and each toggling of the bit causes the hypervisor to unmap/map the virtual BARs from/to the physical BARs, which can be slow if the BAR sizes are huge, e.g., a Linux VM with 14 GPU devices has to spend more than 3 minutes on BAR detection and initialization, causing a long boot time. Reduce the boot time by not setting the PCI_COMMAND_MEMORY bit when we register the PCI device (there is no need to have it set in the first place). The bit stays off till the PCI device driver calls pci_enable_device(). With this change, the boot time of such a 14-GPU VM is reduced by almost 3 minutes. Link: https://lore.kernel.org/lkml/20220419220007.26550-1-decui@microsoft.com/ Tested-by: Boqun Feng (Microsoft) Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Acked-by: Lorenzo Pieralisi Cc: Jake Oshins Link: https://lore.kernel.org/r/20220502074255.16901-1-decui@microsoft.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'drivers/pci/controller/pci-hyperv.c') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 7aea0b7a994b..cf2fe5754fde 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -2103,12 +2103,17 @@ static void prepopulate_bars(struct hv_pcibus_device *hbus) } } if (high_size <= 1 && low_size <= 1) { - /* Set the memory enable bit. */ - _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2, - &command); - command |= PCI_COMMAND_MEMORY; - _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2, - command); + /* + * No need to set the PCI_COMMAND_MEMORY bit as + * the core PCI driver doesn't require the bit + * to be pre-set. Actually here we intentionally + * keep the bit off so that the PCI BAR probing + * in the core PCI driver doesn't cause Hyper-V + * to unnecessarily unmap/map the virtual BARs + * from/to the physical BARs multiple times. + * This reduces the VM boot time significantly + * if the BAR sizes are huge. + */ break; } } -- cgit v1.2.3 From b4b77778ecc5bfbd4e77de1b2fd5c1dd3c655f1f Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Wed, 11 May 2022 09:23:02 -0600 Subject: PCI: hv: Reuse existing IRTE allocation in compose_msi_msg() Currently if compose_msi_msg() is called multiple times, it will free any previous IRTE allocation, and generate a new allocation. While nothing prevents this from occurring, it is extraneous when Linux could just reuse the existing allocation and avoid a bunch of overhead. However, when future IRTE allocations operate on blocks of MSIs instead of a single line, freeing the allocation will impact all of the lines. This could cause an issue where an allocation of N MSIs occurs, then some of the lines are retargeted, and finally the allocation is freed/reallocated. The freeing of the allocation removes all of the configuration for the entire block, which requires all the lines to be retargeted, which might not happen since some lines might already be unmasked/active. Signed-off-by: Jeffrey Hugo Reviewed-by: Dexuan Cui Tested-by: Dexuan Cui Tested-by: Michael Kelley Link: https://lore.kernel.org/r/1652282582-21595-1-git-send-email-quic_jhugo@quicinc.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'drivers/pci/controller/pci-hyperv.c') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index cf2fe5754fde..5e2e637ad344 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1707,6 +1707,15 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) u32 size; int ret; + /* Reuse the previous allocation */ + if (data->chip_data) { + int_desc = data->chip_data; + msg->address_hi = int_desc->address >> 32; + msg->address_lo = int_desc->address & 0xffffffff; + msg->data = int_desc->data; + return; + } + pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); dest = irq_data_get_effective_affinity_mask(data); pbus = pdev->bus; @@ -1716,13 +1725,6 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) if (!hpdev) goto return_null_message; - /* Free any previous message that might have already been composed. */ - if (data->chip_data) { - int_desc = data->chip_data; - data->chip_data = NULL; - hv_int_desc_free(hpdev, int_desc); - } - int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC); if (!int_desc) goto drop_reference; -- cgit v1.2.3 From a2bad844a67b1c7740bda63e87453baf63c3a7f7 Mon Sep 17 00:00:00 2001 From: Jeffrey Hugo Date: Wed, 11 May 2022 09:23:19 -0600 Subject: PCI: hv: Fix interrupt mapping for multi-MSI According to Dexuan, the hypervisor folks beleive that multi-msi allocations are not correct. compose_msi_msg() will allocate multi-msi one by one. However, multi-msi is a block of related MSIs, with alignment requirements. In order for the hypervisor to allocate properly aligned and consecutive entries in the IOMMU Interrupt Remapping Table, there should be a single mapping request that requests all of the multi-msi vectors in one shot. Dexuan suggests detecting the multi-msi case and composing a single request related to the first MSI. Then for the other MSIs in the same block, use the cached information. This appears to be viable, so do it. Suggested-by: Dexuan Cui Signed-off-by: Jeffrey Hugo Reviewed-by: Dexuan Cui Tested-by: Michael Kelley Link: https://lore.kernel.org/r/1652282599-21643-1-git-send-email-quic_jhugo@quicinc.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 60 ++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 10 deletions(-) (limited to 'drivers/pci/controller/pci-hyperv.c') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index 5e2e637ad344..e439b810f974 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1525,6 +1525,10 @@ static void hv_int_desc_free(struct hv_pci_dev *hpdev, u8 buffer[sizeof(struct pci_delete_interrupt)]; } ctxt; + if (!int_desc->vector_count) { + kfree(int_desc); + return; + } memset(&ctxt, 0, sizeof(ctxt)); int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message; int_pkt->message_type.type = @@ -1609,12 +1613,12 @@ static void hv_pci_compose_compl(void *context, struct pci_response *resp, static u32 hv_compose_msi_req_v1( struct pci_create_interrupt *int_pkt, struct cpumask *affinity, - u32 slot, u8 vector) + u32 slot, u8 vector, u8 vector_count) { int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE; int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; - int_pkt->int_desc.vector_count = 1; + int_pkt->int_desc.vector_count = vector_count; int_pkt->int_desc.delivery_mode = DELIVERY_MODE; /* @@ -1637,14 +1641,14 @@ static int hv_compose_msi_req_get_cpu(struct cpumask *affinity) static u32 hv_compose_msi_req_v2( struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity, - u32 slot, u8 vector) + u32 slot, u8 vector, u8 vector_count) { int cpu; int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2; int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; - int_pkt->int_desc.vector_count = 1; + int_pkt->int_desc.vector_count = vector_count; int_pkt->int_desc.delivery_mode = DELIVERY_MODE; cpu = hv_compose_msi_req_get_cpu(affinity); int_pkt->int_desc.processor_array[0] = @@ -1656,7 +1660,7 @@ static u32 hv_compose_msi_req_v2( static u32 hv_compose_msi_req_v3( struct pci_create_interrupt3 *int_pkt, struct cpumask *affinity, - u32 slot, u32 vector) + u32 slot, u32 vector, u8 vector_count) { int cpu; @@ -1664,7 +1668,7 @@ static u32 hv_compose_msi_req_v3( int_pkt->wslot.slot = slot; int_pkt->int_desc.vector = vector; int_pkt->int_desc.reserved = 0; - int_pkt->int_desc.vector_count = 1; + int_pkt->int_desc.vector_count = vector_count; int_pkt->int_desc.delivery_mode = DELIVERY_MODE; cpu = hv_compose_msi_req_get_cpu(affinity); int_pkt->int_desc.processor_array[0] = @@ -1695,6 +1699,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) struct cpumask *dest; struct compose_comp_ctxt comp; struct tran_int_desc *int_desc; + struct msi_desc *msi_desc; + u8 vector, vector_count; struct { struct pci_packet pci_pkt; union { @@ -1716,7 +1722,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) return; } - pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data)); + msi_desc = irq_data_get_msi_desc(data); + pdev = msi_desc_to_pci_dev(msi_desc); dest = irq_data_get_effective_affinity_mask(data); pbus = pdev->bus; hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata); @@ -1729,6 +1736,36 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) if (!int_desc) goto drop_reference; + if (!msi_desc->pci.msi_attrib.is_msix && msi_desc->nvec_used > 1) { + /* + * If this is not the first MSI of Multi MSI, we already have + * a mapping. Can exit early. + */ + if (msi_desc->irq != data->irq) { + data->chip_data = int_desc; + int_desc->address = msi_desc->msg.address_lo | + (u64)msi_desc->msg.address_hi << 32; + int_desc->data = msi_desc->msg.data + + (data->irq - msi_desc->irq); + msg->address_hi = msi_desc->msg.address_hi; + msg->address_lo = msi_desc->msg.address_lo; + msg->data = int_desc->data; + put_pcichild(hpdev); + return; + } + /* + * The vector we select here is a dummy value. The correct + * value gets sent to the hypervisor in unmask(). This needs + * to be aligned with the count, and also not zero. Multi-msi + * is powers of 2 up to 32, so 32 will always work here. + */ + vector = 32; + vector_count = msi_desc->nvec_used; + } else { + vector = hv_msi_get_int_vector(data); + vector_count = 1; + } + memset(&ctxt, 0, sizeof(ctxt)); init_completion(&comp.comp_pkt.host_event); ctxt.pci_pkt.completion_func = hv_pci_compose_compl; @@ -1739,7 +1776,8 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1, dest, hpdev->desc.win_slot.slot, - hv_msi_get_int_vector(data)); + vector, + vector_count); break; case PCI_PROTOCOL_VERSION_1_2: @@ -1747,14 +1785,16 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2, dest, hpdev->desc.win_slot.slot, - hv_msi_get_int_vector(data)); + vector, + vector_count); break; case PCI_PROTOCOL_VERSION_1_4: size = hv_compose_msi_req_v3(&ctxt.int_pkts.v3, dest, hpdev->desc.win_slot.slot, - hv_msi_get_int_vector(data)); + vector, + vector_count); break; default: -- cgit v1.2.3 From 9937fa6d1eb6fac95586970e17617a718919c858 Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Thu, 12 May 2022 00:32:06 +0200 Subject: PCI: hv: Add validation for untrusted Hyper-V values For additional robustness in the face of Hyper-V errors or malicious behavior, validate all values that originate from packets that Hyper-V has sent to the guest in the host-to-guest ring buffer. Ensure that invalid values cannot cause data being copied out of the bounds of the source buffer in hv_pci_onchannelcallback(). While at it, remove a redundant validation in hv_pci_generic_compl(): hv_pci_onchannelcallback() already ensures that all processed incoming packets are "at least as large as [in fact larger than] a response". Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20220511223207.3386-2-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) (limited to 'drivers/pci/controller/pci-hyperv.c') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index e439b810f974..a06e2cf94658 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -981,11 +981,7 @@ static void hv_pci_generic_compl(void *context, struct pci_response *resp, { struct hv_pci_compl *comp_pkt = context; - if (resp_packet_size >= offsetofend(struct pci_response, status)) - comp_pkt->completion_status = resp->status; - else - comp_pkt->completion_status = -1; - + comp_pkt->completion_status = resp->status; complete(&comp_pkt->host_event); } @@ -1606,8 +1602,13 @@ static void hv_pci_compose_compl(void *context, struct pci_response *resp, struct pci_create_int_response *int_resp = (struct pci_create_int_response *)resp; + if (resp_packet_size < sizeof(*int_resp)) { + comp_pkt->comp_pkt.completion_status = -1; + goto out; + } comp_pkt->comp_pkt.completion_status = resp->status; comp_pkt->int_desc = int_resp->int_desc; +out: complete(&comp_pkt->comp_pkt.host_event); } @@ -2291,12 +2292,14 @@ static void q_resource_requirements(void *context, struct pci_response *resp, struct q_res_req_compl *completion = context; struct pci_q_res_req_response *q_res_req = (struct pci_q_res_req_response *)resp; + s32 status; int i; - if (resp->status < 0) { + status = (resp_packet_size < sizeof(*q_res_req)) ? -1 : resp->status; + if (status < 0) { dev_err(&completion->hpdev->hbus->hdev->device, "query resource requirements failed: %x\n", - resp->status); + status); } else { for (i = 0; i < PCI_STD_NUM_BARS; i++) { completion->hpdev->probed_bar[i] = @@ -2848,7 +2851,8 @@ static void hv_pci_onchannelcallback(void *context) case PCI_BUS_RELATIONS: bus_rel = (struct pci_bus_relations *)buffer; - if (bytes_recvd < + if (bytes_recvd < sizeof(*bus_rel) || + bytes_recvd < struct_size(bus_rel, func, bus_rel->device_count)) { dev_err(&hbus->hdev->device, @@ -2862,7 +2866,8 @@ static void hv_pci_onchannelcallback(void *context) case PCI_BUS_RELATIONS2: bus_rel2 = (struct pci_bus_relations2 *)buffer; - if (bytes_recvd < + if (bytes_recvd < sizeof(*bus_rel2) || + bytes_recvd < struct_size(bus_rel2, func, bus_rel2->device_count)) { dev_err(&hbus->hdev->device, @@ -2876,6 +2881,11 @@ static void hv_pci_onchannelcallback(void *context) case PCI_EJECT: dev_message = (struct pci_dev_incoming *)buffer; + if (bytes_recvd < sizeof(*dev_message)) { + dev_err(&hbus->hdev->device, + "eject message too small\n"); + break; + } hpdev = get_pcichild_wslot(hbus, dev_message->wslot.slot); if (hpdev) { @@ -2887,6 +2897,11 @@ static void hv_pci_onchannelcallback(void *context) case PCI_INVALIDATE_BLOCK: inval = (struct pci_dev_inval_block *)buffer; + if (bytes_recvd < sizeof(*inval)) { + dev_err(&hbus->hdev->device, + "invalidate message too small\n"); + break; + } hpdev = get_pcichild_wslot(hbus, inval->wslot.slot); if (hpdev) { -- cgit v1.2.3 From b4927bd272623694314f37823302f9d67aa5964c Mon Sep 17 00:00:00 2001 From: "Andrea Parri (Microsoft)" Date: Thu, 12 May 2022 00:32:07 +0200 Subject: PCI: hv: Fix synchronization between channel callback and hv_pci_bus_exit() [ Similarly to commit a765ed47e4516 ("PCI: hv: Fix synchronization between channel callback and hv_compose_msi_msg()"): ] The (on-stack) teardown packet becomes invalid once the completion timeout in hv_pci_bus_exit() has expired and hv_pci_bus_exit() has returned. Prevent the channel callback from accessing the invalid packet by removing the ID associated to such packet from the VMbus requestor in hv_pci_bus_exit(). Signed-off-by: Andrea Parri (Microsoft) Reviewed-by: Michael Kelley Acked-by: Lorenzo Pieralisi Link: https://lore.kernel.org/r/20220511223207.3386-3-parri.andrea@gmail.com Signed-off-by: Wei Liu --- drivers/pci/controller/pci-hyperv.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'drivers/pci/controller/pci-hyperv.c') diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index a06e2cf94658..db814f7b93ba 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -3664,6 +3664,7 @@ free_bus: static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) { struct hv_pcibus_device *hbus = hv_get_drvdata(hdev); + struct vmbus_channel *chan = hdev->channel; struct { struct pci_packet teardown_packet; u8 buffer[sizeof(struct pci_message)]; @@ -3671,13 +3672,14 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) struct hv_pci_compl comp_pkt; struct hv_pci_dev *hpdev, *tmp; unsigned long flags; + u64 trans_id; int ret; /* * After the host sends the RESCIND_CHANNEL message, it doesn't * access the per-channel ringbuffer any longer. */ - if (hdev->channel->rescind) + if (chan->rescind) return 0; if (!keep_devs) { @@ -3714,16 +3716,26 @@ static int hv_pci_bus_exit(struct hv_device *hdev, bool keep_devs) pkt.teardown_packet.compl_ctxt = &comp_pkt; pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT; - ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message, - sizeof(struct pci_message), - (unsigned long)&pkt.teardown_packet, - VM_PKT_DATA_INBAND, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + ret = vmbus_sendpacket_getid(chan, &pkt.teardown_packet.message, + sizeof(struct pci_message), + (unsigned long)&pkt.teardown_packet, + &trans_id, VM_PKT_DATA_INBAND, + VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); if (ret) return ret; - if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) + if (wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ) == 0) { + /* + * The completion packet on the stack becomes invalid after + * 'return'; remove the ID from the VMbus requestor if the + * identifier is still mapped to/associated with the packet. + * + * Cf. hv_pci_onchannelcallback(). + */ + vmbus_request_addr_match(chan, trans_id, + (unsigned long)&pkt.teardown_packet); return -ETIMEDOUT; + } return 0; } -- cgit v1.2.3