Diffstat (limited to 'drivers/hv/mshv_root_main.c')
-rw-r--r--  drivers/hv/mshv_root_main.c  745
1 file changed, 407 insertions, 338 deletions
diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 1d8d8d00e4e0..1134a82c7881 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -42,7 +42,7 @@ MODULE_DESCRIPTION("Microsoft Hyper-V root partition VMM interface /dev/mshv");
/* TODO move this to another file when debugfs code is added */
enum hv_stats_vp_counters { /* HV_THREAD_COUNTER */
#if defined(CONFIG_X86)
- VpRootDispatchThreadBlocked = 201,
+ VpRootDispatchThreadBlocked = 202,
#elif defined(CONFIG_ARM64)
VpRootDispatchThreadBlocked = 94,
#endif
@@ -123,6 +123,7 @@ static struct miscdevice mshv_dev = {
*/
static u16 mshv_passthru_hvcalls[] = {
HVCALL_GET_PARTITION_PROPERTY,
+ HVCALL_GET_PARTITION_PROPERTY_EX,
HVCALL_SET_PARTITION_PROPERTY,
HVCALL_INSTALL_INTERCEPT,
HVCALL_GET_VP_REGISTERS,
@@ -137,6 +138,16 @@ static u16 mshv_passthru_hvcalls[] = {
HVCALL_GET_VP_CPUID_VALUES,
};
+/*
+ * Only allow hypercalls that are safe to be called by the VMM with the host
+ * partition as target (i.e. HV_PARTITION_ID_SELF). Carefully audit that a
+ * hypercall cannot be misused by the VMM before adding it to this list.
+ */
+static u16 mshv_self_passthru_hvcalls[] = {
+ HVCALL_GET_PARTITION_PROPERTY,
+ HVCALL_GET_PARTITION_PROPERTY_EX,
+};
+
static bool mshv_hvcall_is_async(u16 code)
{
switch (code) {
@@ -148,18 +159,38 @@ static bool mshv_hvcall_is_async(u16 code)
return false;
}
+static bool mshv_passthru_hvcall_allowed(u16 code, u64 pt_id)
+{
+ int i;
+ int n = ARRAY_SIZE(mshv_passthru_hvcalls);
+ u16 *allowed_hvcalls = mshv_passthru_hvcalls;
+
+ if (pt_id == HV_PARTITION_ID_SELF) {
+ n = ARRAY_SIZE(mshv_self_passthru_hvcalls);
+ allowed_hvcalls = mshv_self_passthru_hvcalls;
+ }
+
+ for (i = 0; i < n; ++i)
+ if (allowed_hvcalls[i] == code)
+ return true;
+
+ return false;
+}
+
static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
bool partition_locked,
void __user *user_args)
{
u64 status;
- int ret = 0, i;
+ int ret = 0;
bool is_async;
struct mshv_root_hvcall args;
struct page *page;
unsigned int pages_order;
void *input_pg = NULL;
void *output_pg = NULL;
+ u16 reps_completed;
+ u64 pt_id = partition ? partition->pt_id : HV_PARTITION_ID_SELF;
if (copy_from_user(&args, user_args, sizeof(args)))
return -EFAULT;
@@ -171,17 +202,13 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
if (args.out_ptr && (!args.out_sz || args.out_sz > HV_HYP_PAGE_SIZE))
return -EINVAL;
- for (i = 0; i < ARRAY_SIZE(mshv_passthru_hvcalls); ++i)
- if (args.code == mshv_passthru_hvcalls[i])
- break;
-
- if (i >= ARRAY_SIZE(mshv_passthru_hvcalls))
+ if (!mshv_passthru_hvcall_allowed(args.code, pt_id))
return -EINVAL;
is_async = mshv_hvcall_is_async(args.code);
if (is_async) {
/* async hypercalls can only be called from partition fd */
- if (!partition_locked)
+ if (!partition || !partition_locked)
return -EINVAL;
ret = mshv_init_async_handler(partition);
if (ret)
@@ -209,43 +236,44 @@ static int mshv_ioctl_passthru_hvcall(struct mshv_partition *partition,
* NOTE: This only works because all the allowed hypercalls' input
* structs begin with a u64 partition_id field.
*/
- *(u64 *)input_pg = partition->pt_id;
+ *(u64 *)input_pg = pt_id;
- if (args.reps)
- status = hv_do_rep_hypercall(args.code, args.reps, 0,
- input_pg, output_pg);
- else
- status = hv_do_hypercall(args.code, input_pg, output_pg);
-
- if (hv_result(status) == HV_STATUS_CALL_PENDING) {
- if (is_async) {
- mshv_async_hvcall_handler(partition, &status);
- } else { /* Paranoia check. This shouldn't happen! */
- ret = -EBADFD;
- goto free_pages_out;
+ reps_completed = 0;
+ do {
+ if (args.reps) {
+ status = hv_do_rep_hypercall_ex(args.code, args.reps,
+ 0, reps_completed,
+ input_pg, output_pg);
+ reps_completed = hv_repcomp(status);
+ } else {
+ status = hv_do_hypercall(args.code, input_pg, output_pg);
}
- }
- if (hv_result(status) == HV_STATUS_INSUFFICIENT_MEMORY) {
- ret = hv_call_deposit_pages(NUMA_NO_NODE, partition->pt_id, 1);
- if (!ret)
- ret = -EAGAIN;
- } else if (!hv_result_success(status)) {
- ret = hv_result_to_errno(status);
- }
+ if (hv_result(status) == HV_STATUS_CALL_PENDING) {
+ if (is_async) {
+ mshv_async_hvcall_handler(partition, &status);
+ } else { /* Paranoia check. This shouldn't happen! */
+ ret = -EBADFD;
+ goto free_pages_out;
+ }
+ }
+
+ if (hv_result_success(status))
+ break;
+
+ if (hv_result(status) != HV_STATUS_INSUFFICIENT_MEMORY)
+ ret = hv_result_to_errno(status);
+ else
+ ret = hv_call_deposit_pages(NUMA_NO_NODE,
+ pt_id, 1);
+ } while (!ret);
- /*
- * Always return the status and output data regardless of result.
- * The VMM may need it to determine how to proceed. E.g. the status may
- * contain the number of reps completed if a rep hypercall partially
- * succeeded.
- */
args.status = hv_result(status);
- args.reps = args.reps ? hv_repcomp(status) : 0;
+ args.reps = reps_completed;
if (copy_to_user(user_args, &args, sizeof(args)))
ret = -EFAULT;
- if (output_pg &&
+ if (!ret && output_pg &&
copy_to_user((void __user *)args.out_ptr, output_pg, args.out_sz))
ret = -EFAULT;
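
For context, the NOTE above relies on a layout property of the allowed hypercalls' input structures. A minimal illustration, with the struct layout assumed for this sketch rather than taken from this patch (the real definitions live in the Hyper-V ABI headers):

    /* Illustrative only: assumed shape of an allowed passthru hypercall input */
    struct hv_input_get_partition_property {
            u64 partition_id;       /* always first, so the kernel can overwrite
                                     * it with pt_id before issuing the hypercall */
            u32 property_code;
            u32 padding;
    } __packed;
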
@@ -569,14 +597,98 @@ static long mshv_run_vp_with_root_scheduler(struct mshv_vp *vp)
static_assert(sizeof(struct hv_message) <= MSHV_RUN_VP_BUF_SZ,
"sizeof(struct hv_message) must not exceed MSHV_RUN_VP_BUF_SZ");
+static struct mshv_mem_region *
+mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
+{
+ struct mshv_mem_region *region;
+
+ hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
+ if (gfn >= region->start_gfn &&
+ gfn < region->start_gfn + region->nr_pages)
+ return region;
+ }
+
+ return NULL;
+}
+
+#ifdef CONFIG_X86_64
+static struct mshv_mem_region *
+mshv_partition_region_by_gfn_get(struct mshv_partition *p, u64 gfn)
+{
+ struct mshv_mem_region *region;
+
+ spin_lock(&p->pt_mem_regions_lock);
+ region = mshv_partition_region_by_gfn(p, gfn);
+ if (!region || !mshv_region_get(region)) {
+ spin_unlock(&p->pt_mem_regions_lock);
+ return NULL;
+ }
+ spin_unlock(&p->pt_mem_regions_lock);
+
+ return region;
+}
+
+/**
+ * mshv_handle_gpa_intercept - Handle GPA (Guest Physical Address) intercepts.
+ * @vp: Pointer to the virtual processor structure.
+ *
+ * This function handles GPA intercepts by locating the memory region that
+ * contains the intercepted guest physical address and resolving the fault
+ * for that page. Only movable memory regions are populated on demand this
+ * way; intercepts on any other region type are not handled here.
+ *
+ * Return: true if the intercept was handled successfully, false otherwise.
+ */
+static bool mshv_handle_gpa_intercept(struct mshv_vp *vp)
+{
+ struct mshv_partition *p = vp->vp_partition;
+ struct mshv_mem_region *region;
+ struct hv_x64_memory_intercept_message *msg;
+ bool ret;
+ u64 gfn;
+
+ msg = (struct hv_x64_memory_intercept_message *)
+ vp->vp_intercept_msg_page->u.payload;
+
+ gfn = HVPFN_DOWN(msg->guest_physical_address);
+
+ region = mshv_partition_region_by_gfn_get(p, gfn);
+ if (!region)
+ return false;
+
+ /* Only movable memory ranges are supported for GPA intercepts */
+ if (region->type == MSHV_REGION_TYPE_MEM_MOVABLE)
+ ret = mshv_region_handle_gfn_fault(region, gfn);
+ else
+ ret = false;
+
+ mshv_region_put(region);
+
+ return ret;
+}
+#else /* CONFIG_X86_64 */
+static bool mshv_handle_gpa_intercept(struct mshv_vp *vp) { return false; }
+#endif /* CONFIG_X86_64 */
+
+static bool mshv_vp_handle_intercept(struct mshv_vp *vp)
+{
+ switch (vp->vp_intercept_msg_page->header.message_type) {
+ case HVMSG_GPA_INTERCEPT:
+ return mshv_handle_gpa_intercept(vp);
+ }
+ return false;
+}
+
static long mshv_vp_ioctl_run_vp(struct mshv_vp *vp, void __user *ret_msg)
{
long rc;
- if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
- rc = mshv_run_vp_with_root_scheduler(vp);
- else
- rc = mshv_run_vp_with_hyp_scheduler(vp);
+ do {
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+ rc = mshv_run_vp_with_root_scheduler(vp);
+ else
+ rc = mshv_run_vp_with_hyp_scheduler(vp);
+ } while (rc == 0 && mshv_vp_handle_intercept(vp));
if (rc)
return rc;
@@ -844,7 +956,8 @@ mshv_vp_release(struct inode *inode, struct file *filp)
return 0;
}
-static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index)
+static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index,
+ void *stats_pages[])
{
union hv_stats_object_identity identity = {
.vp.partition_id = partition_id,
@@ -852,10 +965,10 @@ static void mshv_vp_stats_unmap(u64 partition_id, u32 vp_index)
};
identity.vp.stats_area_type = HV_STATS_AREA_SELF;
- hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+ hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
- hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+ hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
}
static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
@@ -868,14 +981,14 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
int err;
identity.vp.stats_area_type = HV_STATS_AREA_SELF;
- err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
- &stats_pages[HV_STATS_AREA_SELF]);
+ err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
+ &stats_pages[HV_STATS_AREA_SELF]);
if (err)
return err;
identity.vp.stats_area_type = HV_STATS_AREA_PARENT;
- err = hv_call_map_stat_page(HV_STATS_OBJECT_VP, &identity,
- &stats_pages[HV_STATS_AREA_PARENT]);
+ err = hv_map_stats_page(HV_STATS_OBJECT_VP, &identity,
+ &stats_pages[HV_STATS_AREA_PARENT]);
if (err)
goto unmap_self;
@@ -883,7 +996,7 @@ static int mshv_vp_stats_map(u64 partition_id, u32 vp_index,
unmap_self:
identity.vp.stats_area_type = HV_STATS_AREA_SELF;
- hv_call_unmap_stat_page(HV_STATS_OBJECT_VP, &identity);
+ hv_unmap_stats_page(HV_STATS_OBJECT_VP, NULL, &identity);
return err;
}
@@ -893,7 +1006,7 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
{
struct mshv_create_vp args;
struct mshv_vp *vp;
- struct page *intercept_message_page, *register_page, *ghcb_page;
+ struct page *intercept_msg_page, *register_page, *ghcb_page;
void *stats_pages[2];
long ret;
@@ -911,33 +1024,34 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
if (ret)
return ret;
- ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
- input_vtl_zero,
- &intercept_message_page);
+ ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ input_vtl_zero, &intercept_msg_page);
if (ret)
goto destroy_vp;
if (!mshv_partition_encrypted(partition)) {
- ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_REGISTERS,
- input_vtl_zero,
- &register_page);
+ ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ input_vtl_zero, &register_page);
if (ret)
goto unmap_intercept_message_page;
}
if (mshv_partition_encrypted(partition) &&
is_ghcb_mapping_available()) {
- ret = hv_call_map_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_GHCB,
- input_vtl_normal,
- &ghcb_page);
+ ret = hv_map_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_GHCB,
+ input_vtl_normal, &ghcb_page);
if (ret)
goto unmap_register_page;
}
- if (hv_parent_partition()) {
+ /*
+	 * This mapping of the stats page is used to detect whether the
+	 * dispatch thread is blocked - only relevant for the root scheduler
+ */
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT) {
ret = mshv_vp_stats_map(partition->pt_id, args.vp_index,
stats_pages);
if (ret)
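
The stats mapping added above exists so that, under the root scheduler, the driver can tell whether a VP's dispatch thread is blocked. A rough sketch of such a check, assuming the mapped stats page exposes a flat array of u64 counters indexed by the enum at the top of this file (the helper name and layout are assumptions, not part of this patch):

    static bool example_vp_dispatch_blocked(struct mshv_vp *vp)
    {
            /* parent-area counters are maintained by the hypervisor */
            u64 *cntrs = vp->vp_stats_pages[HV_STATS_AREA_PARENT];

            return cntrs[VpRootDispatchThreadBlocked] != 0;
    }
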
@@ -959,14 +1073,14 @@ mshv_partition_ioctl_create_vp(struct mshv_partition *partition,
atomic64_set(&vp->run.vp_signaled_count, 0);
vp->vp_index = args.vp_index;
- vp->vp_intercept_msg_page = page_to_virt(intercept_message_page);
+ vp->vp_intercept_msg_page = page_to_virt(intercept_msg_page);
if (!mshv_partition_encrypted(partition))
vp->vp_register_page = page_to_virt(register_page);
if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
vp->vp_ghcb_page = page_to_virt(ghcb_page);
- if (hv_parent_partition())
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
memcpy(vp->vp_stats_pages, stats_pages, sizeof(stats_pages));
/*
@@ -989,24 +1103,22 @@ put_partition:
free_vp:
kfree(vp);
unmap_stats_pages:
- if (hv_parent_partition())
- mshv_vp_stats_unmap(partition->pt_id, args.vp_index);
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+ mshv_vp_stats_unmap(partition->pt_id, args.vp_index, stats_pages);
unmap_ghcb_page:
- if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available()) {
- hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_GHCB,
- input_vtl_normal);
- }
+ if (mshv_partition_encrypted(partition) && is_ghcb_mapping_available())
+ hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_GHCB, ghcb_page,
+ input_vtl_normal);
unmap_register_page:
- if (!mshv_partition_encrypted(partition)) {
- hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_REGISTERS,
- input_vtl_zero);
- }
+ if (!mshv_partition_encrypted(partition))
+ hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ register_page, input_vtl_zero);
unmap_intercept_message_page:
- hv_call_unmap_vp_state_page(partition->pt_id, args.vp_index,
- HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
- input_vtl_zero);
+ hv_unmap_vp_state_page(partition->pt_id, args.vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ intercept_msg_page, input_vtl_zero);
destroy_vp:
hv_call_delete_vp(partition->pt_id, args.vp_index);
return ret;
@@ -1034,162 +1146,6 @@ static void mshv_async_hvcall_handler(void *data, u64 *status)
*status = partition->async_hypercall_status;
}
-static int
-mshv_partition_region_share(struct mshv_mem_region *region)
-{
- u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_SHARED;
-
- if (region->flags.large_pages)
- flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
-
- return hv_call_modify_spa_host_access(region->partition->pt_id,
- region->pages, region->nr_pages,
- HV_MAP_GPA_READABLE | HV_MAP_GPA_WRITABLE,
- flags, true);
-}
-
-static int
-mshv_partition_region_unshare(struct mshv_mem_region *region)
-{
- u32 flags = HV_MODIFY_SPA_PAGE_HOST_ACCESS_MAKE_EXCLUSIVE;
-
- if (region->flags.large_pages)
- flags |= HV_MODIFY_SPA_PAGE_HOST_ACCESS_LARGE_PAGE;
-
- return hv_call_modify_spa_host_access(region->partition->pt_id,
- region->pages, region->nr_pages,
- 0,
- flags, false);
-}
-
-static int
-mshv_region_remap_pages(struct mshv_mem_region *region, u32 map_flags,
- u64 page_offset, u64 page_count)
-{
- if (page_offset + page_count > region->nr_pages)
- return -EINVAL;
-
- if (region->flags.large_pages)
- map_flags |= HV_MAP_GPA_LARGE_PAGE;
-
- /* ask the hypervisor to map guest ram */
- return hv_call_map_gpa_pages(region->partition->pt_id,
- region->start_gfn + page_offset,
- page_count, map_flags,
- region->pages + page_offset);
-}
-
-static int
-mshv_region_map(struct mshv_mem_region *region)
-{
- u32 map_flags = region->hv_map_flags;
-
- return mshv_region_remap_pages(region, map_flags,
- 0, region->nr_pages);
-}
-
-static void
-mshv_region_evict_pages(struct mshv_mem_region *region,
- u64 page_offset, u64 page_count)
-{
- if (region->flags.range_pinned)
- unpin_user_pages(region->pages + page_offset, page_count);
-
- memset(region->pages + page_offset, 0,
- page_count * sizeof(struct page *));
-}
-
-static void
-mshv_region_evict(struct mshv_mem_region *region)
-{
- mshv_region_evict_pages(region, 0, region->nr_pages);
-}
-
-static int
-mshv_region_populate_pages(struct mshv_mem_region *region,
- u64 page_offset, u64 page_count)
-{
- u64 done_count, nr_pages;
- struct page **pages;
- __u64 userspace_addr;
- int ret;
-
- if (page_offset + page_count > region->nr_pages)
- return -EINVAL;
-
- for (done_count = 0; done_count < page_count; done_count += ret) {
- pages = region->pages + page_offset + done_count;
- userspace_addr = region->start_uaddr +
- (page_offset + done_count) *
- HV_HYP_PAGE_SIZE;
- nr_pages = min(page_count - done_count,
- MSHV_PIN_PAGES_BATCH_SIZE);
-
- /*
- * Pinning assuming 4k pages works for large pages too.
- * All page structs within the large page are returned.
- *
- * Pin requests are batched because pin_user_pages_fast
- * with the FOLL_LONGTERM flag does a large temporary
- * allocation of contiguous memory.
- */
- if (region->flags.range_pinned)
- ret = pin_user_pages_fast(userspace_addr,
- nr_pages,
- FOLL_WRITE | FOLL_LONGTERM,
- pages);
- else
- ret = -EOPNOTSUPP;
-
- if (ret < 0)
- goto release_pages;
- }
-
- if (PageHuge(region->pages[page_offset]))
- region->flags.large_pages = true;
-
- return 0;
-
-release_pages:
- mshv_region_evict_pages(region, page_offset, done_count);
- return ret;
-}
-
-static int
-mshv_region_populate(struct mshv_mem_region *region)
-{
- return mshv_region_populate_pages(region, 0, region->nr_pages);
-}
-
-static struct mshv_mem_region *
-mshv_partition_region_by_gfn(struct mshv_partition *partition, u64 gfn)
-{
- struct mshv_mem_region *region;
-
- hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
- if (gfn >= region->start_gfn &&
- gfn < region->start_gfn + region->nr_pages)
- return region;
- }
-
- return NULL;
-}
-
-static struct mshv_mem_region *
-mshv_partition_region_by_uaddr(struct mshv_partition *partition, u64 uaddr)
-{
- struct mshv_mem_region *region;
-
- hlist_for_each_entry(region, &partition->pt_mem_regions, hnode) {
- if (uaddr >= region->start_uaddr &&
- uaddr < region->start_uaddr +
- (region->nr_pages << HV_HYP_PAGE_SHIFT))
- return region;
- }
-
- return NULL;
-}
-
/*
* NB: caller checks and makes sure mem->size is page aligned
* Returns: 0 with regionpp updated on success, or -errno
@@ -1199,53 +1155,61 @@ static int mshv_partition_create_region(struct mshv_partition *partition,
struct mshv_mem_region **regionpp,
bool is_mmio)
{
- struct mshv_mem_region *region;
+ struct mshv_mem_region *rg;
u64 nr_pages = HVPFN_DOWN(mem->size);
/* Reject overlapping regions */
- if (mshv_partition_region_by_gfn(partition, mem->guest_pfn) ||
- mshv_partition_region_by_gfn(partition, mem->guest_pfn + nr_pages - 1) ||
- mshv_partition_region_by_uaddr(partition, mem->userspace_addr) ||
- mshv_partition_region_by_uaddr(partition, mem->userspace_addr + mem->size - 1))
+ spin_lock(&partition->pt_mem_regions_lock);
+ hlist_for_each_entry(rg, &partition->pt_mem_regions, hnode) {
+ if (mem->guest_pfn + nr_pages <= rg->start_gfn ||
+ rg->start_gfn + rg->nr_pages <= mem->guest_pfn)
+ continue;
+ spin_unlock(&partition->pt_mem_regions_lock);
return -EEXIST;
+ }
+ spin_unlock(&partition->pt_mem_regions_lock);
- region = vzalloc(sizeof(*region) + sizeof(struct page *) * nr_pages);
- if (!region)
- return -ENOMEM;
-
- region->nr_pages = nr_pages;
- region->start_gfn = mem->guest_pfn;
- region->start_uaddr = mem->userspace_addr;
- region->hv_map_flags = HV_MAP_GPA_READABLE | HV_MAP_GPA_ADJUSTABLE;
- if (mem->flags & BIT(MSHV_SET_MEM_BIT_WRITABLE))
- region->hv_map_flags |= HV_MAP_GPA_WRITABLE;
- if (mem->flags & BIT(MSHV_SET_MEM_BIT_EXECUTABLE))
- region->hv_map_flags |= HV_MAP_GPA_EXECUTABLE;
+ rg = mshv_region_create(mem->guest_pfn, nr_pages,
+ mem->userspace_addr, mem->flags);
+ if (IS_ERR(rg))
+ return PTR_ERR(rg);
- /* Note: large_pages flag populated when we pin the pages */
- if (!is_mmio)
- region->flags.range_pinned = true;
+ if (is_mmio)
+ rg->type = MSHV_REGION_TYPE_MMIO;
+ else if (mshv_partition_encrypted(partition) ||
+ !mshv_region_movable_init(rg))
+ rg->type = MSHV_REGION_TYPE_MEM_PINNED;
+ else
+ rg->type = MSHV_REGION_TYPE_MEM_MOVABLE;
- region->partition = partition;
+ rg->partition = partition;
- *regionpp = region;
+ *regionpp = rg;
return 0;
}
-/*
- * Map guest ram. if snp, make sure to release that from the host first
- * Side Effects: In case of failure, pages are unpinned when feasible.
+/**
+ * mshv_prepare_pinned_region - Pin and map a pinned memory region
+ * @region: Pointer to the memory region structure
+ *
+ * This function processes a memory region that is explicitly marked as
+ * pinned. Pinned regions are preallocated and mapped upfront, and do not
+ * rely on fault-based population. The function populates the region,
+ * handles encryption requirements for SNP partitions if applicable, maps
+ * the region, and shares or invalidates it again if mapping fails.
+ *
+ * Return: 0 on success, negative error code on failure.
*/
-static int
-mshv_partition_mem_region_map(struct mshv_mem_region *region)
+static int mshv_prepare_pinned_region(struct mshv_mem_region *region)
{
struct mshv_partition *partition = region->partition;
int ret;
- ret = mshv_region_populate(region);
+ ret = mshv_region_pin(region);
if (ret) {
- pt_err(partition, "Failed to populate memory region: %d\n",
+ pt_err(partition, "Failed to pin memory region: %d\n",
ret);
goto err_out;
}
@@ -1258,12 +1222,12 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region)
* access to guest memory regions.
*/
if (mshv_partition_encrypted(partition)) {
- ret = mshv_partition_region_unshare(region);
+ ret = mshv_region_unshare(region);
if (ret) {
pt_err(partition,
"Failed to unshare memory region (guest_pfn: %llu): %d\n",
region->start_gfn, ret);
- goto evict_region;
+ goto invalidate_region;
}
}
@@ -1271,9 +1235,9 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region)
if (ret && mshv_partition_encrypted(partition)) {
int shrc;
- shrc = mshv_partition_region_share(region);
+ shrc = mshv_region_share(region);
if (!shrc)
- goto evict_region;
+ goto invalidate_region;
pt_err(partition,
"Failed to share memory region (guest_pfn: %llu): %d\n",
@@ -1287,8 +1251,8 @@ mshv_partition_mem_region_map(struct mshv_mem_region *region)
return 0;
-evict_region:
- mshv_region_evict(region);
+invalidate_region:
+ mshv_region_invalidate(region);
err_out:
return ret;
}
@@ -1333,17 +1297,35 @@ mshv_map_user_memory(struct mshv_partition *partition,
if (ret)
return ret;
- if (is_mmio)
- ret = hv_call_map_mmio_pages(partition->pt_id, mem.guest_pfn,
- mmio_pfn, HVPFN_DOWN(mem.size));
- else
- ret = mshv_partition_mem_region_map(region);
+ switch (region->type) {
+ case MSHV_REGION_TYPE_MEM_PINNED:
+ ret = mshv_prepare_pinned_region(region);
+ break;
+ case MSHV_REGION_TYPE_MEM_MOVABLE:
+ /*
+ * For movable memory regions, remap with no access to let
+ * the hypervisor track dirty pages, enabling pre-copy live
+ * migration.
+ */
+ ret = hv_call_map_gpa_pages(partition->pt_id,
+ region->start_gfn,
+ region->nr_pages,
+ HV_MAP_GPA_NO_ACCESS, NULL);
+ break;
+ case MSHV_REGION_TYPE_MMIO:
+ ret = hv_call_map_mmio_pages(partition->pt_id,
+ region->start_gfn,
+ mmio_pfn,
+ region->nr_pages);
+ break;
+ }
if (ret)
goto errout;
- /* Install the new region */
+ spin_lock(&partition->pt_mem_regions_lock);
hlist_add_head(&region->hnode, &partition->pt_mem_regions);
+ spin_unlock(&partition->pt_mem_regions_lock);
return 0;
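
Tying the pieces together: movable regions are installed with HV_MAP_GPA_NO_ACCESS, so the first guest access raises the GPA intercept handled by mshv_handle_gpa_intercept() above, which in turn calls mshv_region_handle_gfn_fault() (defined outside this file). A conceptual sketch of what that fault path amounts to, with the function name, reference handling and locking simplified away (assumptions, not this patch's implementation):

    static bool example_handle_gfn_fault(struct mshv_mem_region *region, u64 gfn)
    {
            u64 offset = gfn - region->start_gfn;
            struct page *page;

            /* fault in the backing page; movable regions are not long-term pinned */
            if (get_user_pages_fast(region->start_uaddr +
                                    (offset << HV_HYP_PAGE_SHIFT),
                                    1, FOLL_WRITE, &page) != 1)
                    return false;
            region->pages[offset] = page;

            /* replace the NO_ACCESS mapping with the region's real access flags */
            return !hv_call_map_gpa_pages(region->partition->pt_id,
                                          region->start_gfn + offset, 1,
                                          region->hv_map_flags,
                                          region->pages + offset);
    }
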
@@ -1358,33 +1340,32 @@ mshv_unmap_user_memory(struct mshv_partition *partition,
struct mshv_user_mem_region mem)
{
struct mshv_mem_region *region;
- u32 unmap_flags = 0;
if (!(mem.flags & BIT(MSHV_SET_MEM_BIT_UNMAP)))
return -EINVAL;
+ spin_lock(&partition->pt_mem_regions_lock);
+
region = mshv_partition_region_by_gfn(partition, mem.guest_pfn);
- if (!region)
- return -EINVAL;
+ if (!region) {
+ spin_unlock(&partition->pt_mem_regions_lock);
+ return -ENOENT;
+ }
/* Paranoia check */
if (region->start_uaddr != mem.userspace_addr ||
region->start_gfn != mem.guest_pfn ||
- region->nr_pages != HVPFN_DOWN(mem.size))
+ region->nr_pages != HVPFN_DOWN(mem.size)) {
+ spin_unlock(&partition->pt_mem_regions_lock);
return -EINVAL;
+ }
hlist_del(&region->hnode);
- if (region->flags.large_pages)
- unmap_flags |= HV_UNMAP_GPA_LARGE_PAGE;
-
- /* ignore unmap failures and continue as process may be exiting */
- hv_call_unmap_gpa_pages(partition->pt_id, region->start_gfn,
- region->nr_pages, unmap_flags);
+ spin_unlock(&partition->pt_mem_regions_lock);
- mshv_region_evict(region);
+ mshv_region_put(region);
- vfree(region);
return 0;
}
@@ -1720,8 +1701,8 @@ static void destroy_partition(struct mshv_partition *partition)
{
struct mshv_vp *vp;
struct mshv_mem_region *region;
- int i, ret;
struct hlist_node *n;
+ int i;
if (refcount_read(&partition->pt_ref_count)) {
pt_err(partition,
@@ -1743,28 +1724,32 @@ static void destroy_partition(struct mshv_partition *partition)
if (!vp)
continue;
- if (hv_parent_partition())
- mshv_vp_stats_unmap(partition->pt_id, vp->vp_index);
+ if (hv_scheduler_type == HV_SCHEDULER_TYPE_ROOT)
+ mshv_vp_stats_unmap(partition->pt_id, vp->vp_index,
+ (void **)vp->vp_stats_pages);
if (vp->vp_register_page) {
- (void)hv_call_unmap_vp_state_page(partition->pt_id,
- vp->vp_index,
- HV_VP_STATE_PAGE_REGISTERS,
- input_vtl_zero);
+ (void)hv_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_REGISTERS,
+ virt_to_page(vp->vp_register_page),
+ input_vtl_zero);
vp->vp_register_page = NULL;
}
- (void)hv_call_unmap_vp_state_page(partition->pt_id,
- vp->vp_index,
- HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
- input_vtl_zero);
+ (void)hv_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_INTERCEPT_MESSAGE,
+ virt_to_page(vp->vp_intercept_msg_page),
+ input_vtl_zero);
vp->vp_intercept_msg_page = NULL;
if (vp->vp_ghcb_page) {
- (void)hv_call_unmap_vp_state_page(partition->pt_id,
- vp->vp_index,
- HV_VP_STATE_PAGE_GHCB,
- input_vtl_normal);
+ (void)hv_unmap_vp_state_page(partition->pt_id,
+ vp->vp_index,
+ HV_VP_STATE_PAGE_GHCB,
+ virt_to_page(vp->vp_ghcb_page),
+ input_vtl_normal);
vp->vp_ghcb_page = NULL;
}
@@ -1781,24 +1766,10 @@ static void destroy_partition(struct mshv_partition *partition)
remove_partition(partition);
- /* Remove regions, regain access to the memory and unpin the pages */
hlist_for_each_entry_safe(region, n, &partition->pt_mem_regions,
hnode) {
hlist_del(&region->hnode);
-
- if (mshv_partition_encrypted(partition)) {
- ret = mshv_partition_region_share(region);
- if (ret) {
- pt_err(partition,
- "Failed to regain access to memory, unpinning user pages will fail and crash the host error: %d\n",
- ret);
- return;
- }
- }
-
- mshv_region_evict(region);
-
- vfree(region);
+ mshv_region_put(region);
}
/* Withdraw and free all pages we deposited */
@@ -1865,41 +1836,117 @@ add_partition(struct mshv_partition *partition)
return 0;
}
-static long
-mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
+static_assert(MSHV_NUM_CPU_FEATURES_BANKS ==
+ HV_PARTITION_PROCESSOR_FEATURES_BANKS);
+
+static long mshv_ioctl_process_pt_flags(void __user *user_arg, u64 *pt_flags,
+ struct hv_partition_creation_properties *cr_props,
+ union hv_partition_isolation_properties *isol_props)
{
- struct mshv_create_partition args;
- u64 creation_flags;
- struct hv_partition_creation_properties creation_properties = {};
- union hv_partition_isolation_properties isolation_properties = {};
- struct mshv_partition *partition;
- long ret;
+ int i;
+ struct mshv_create_partition_v2 args;
+ union hv_partition_processor_features *disabled_procs;
+ union hv_partition_processor_xsave_features *disabled_xsave;
- if (copy_from_user(&args, user_arg, sizeof(args)))
+ /* First, copy v1 struct in case user is on previous versions */
+ if (copy_from_user(&args, user_arg,
+ sizeof(struct mshv_create_partition)))
return -EFAULT;
if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
return -EINVAL;
+ disabled_procs = &cr_props->disabled_processor_features;
+ disabled_xsave = &cr_props->disabled_processor_xsave_features;
+
+ /* Check if user provided newer struct with feature fields */
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES)) {
+ if (copy_from_user(&args, user_arg, sizeof(args)))
+ return -EFAULT;
+
+ /* Re-validate v1 fields after second copy_from_user() */
+ if ((args.pt_flags & ~MSHV_PT_FLAGS_MASK) ||
+ args.pt_isolation >= MSHV_PT_ISOLATION_COUNT)
+ return -EINVAL;
+
+ if (args.pt_num_cpu_fbanks != MSHV_NUM_CPU_FEATURES_BANKS ||
+ mshv_field_nonzero(args, pt_rsvd) ||
+ mshv_field_nonzero(args, pt_rsvd1))
+ return -EINVAL;
+
+ /*
+ * Note this assumes MSHV_NUM_CPU_FEATURES_BANKS will never
+ * change and equals HV_PARTITION_PROCESSOR_FEATURES_BANKS
+ * (i.e. 2).
+ *
+ * Further banks (index >= 2) will be modifiable as 'early'
+ * properties via the set partition property hypercall.
+ */
+ for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
+ disabled_procs->as_uint64[i] = args.pt_cpu_fbanks[i];
+
+#if IS_ENABLED(CONFIG_X86_64)
+ disabled_xsave->as_uint64 = args.pt_disabled_xsave;
+#else
+ /*
+ * In practice this field is ignored on arm64, but safer to
+ * zero it in case it is ever used.
+ */
+ disabled_xsave->as_uint64 = 0;
+
+ if (mshv_field_nonzero(args, pt_rsvd2))
+ return -EINVAL;
+#endif
+ } else {
+ /*
+ * v1 behavior: try to enable everything. The hypervisor will
+ * disable features that are not supported. The banks can be
+ * queried via the get partition property hypercall.
+ */
+ for (i = 0; i < HV_PARTITION_PROCESSOR_FEATURES_BANKS; i++)
+ disabled_procs->as_uint64[i] = 0;
+
+ disabled_xsave->as_uint64 = 0;
+ }
+
/* Only support EXO partitions */
- creation_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
- HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
+ *pt_flags = HV_PARTITION_CREATION_FLAG_EXO_PARTITION |
+ HV_PARTITION_CREATION_FLAG_INTERCEPT_MESSAGE_PAGE_ENABLED;
+
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_LAPIC))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_X2APIC))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
+ if (args.pt_flags & BIT_ULL(MSHV_PT_BIT_GPA_SUPER_PAGES))
+ *pt_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
- if (args.pt_flags & BIT(MSHV_PT_BIT_LAPIC))
- creation_flags |= HV_PARTITION_CREATION_FLAG_LAPIC_ENABLED;
- if (args.pt_flags & BIT(MSHV_PT_BIT_X2APIC))
- creation_flags |= HV_PARTITION_CREATION_FLAG_X2APIC_CAPABLE;
- if (args.pt_flags & BIT(MSHV_PT_BIT_GPA_SUPER_PAGES))
- creation_flags |= HV_PARTITION_CREATION_FLAG_GPA_SUPER_PAGES_ENABLED;
+ isol_props->as_uint64 = 0;
switch (args.pt_isolation) {
case MSHV_PT_ISOLATION_NONE:
- isolation_properties.isolation_type =
- HV_PARTITION_ISOLATION_TYPE_NONE;
+ isol_props->isolation_type = HV_PARTITION_ISOLATION_TYPE_NONE;
break;
}
+ return 0;
+}
+
+static long
+mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
+{
+ u64 creation_flags;
+ struct hv_partition_creation_properties creation_properties;
+ union hv_partition_isolation_properties isolation_properties;
+ struct mshv_partition *partition;
+ long ret;
+
+ ret = mshv_ioctl_process_pt_flags(user_arg, &creation_flags,
+ &creation_properties,
+ &isolation_properties);
+ if (ret)
+ return ret;
+
partition = kzalloc(sizeof(*partition), GFP_KERNEL);
if (!partition)
return -ENOMEM;
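
From the VMM side, the v2 path is opted into with the new flag bit; older callers keep passing the shorter v1 structure and get the previous enable-everything behaviour. A hedged userspace sketch, assuming mshv_fd is an open /dev/mshv descriptor and using the uapi field names that appear in this patch:

    struct mshv_create_partition_v2 args = {
            .pt_flags = (1ULL << MSHV_PT_BIT_LAPIC) |
                        (1ULL << MSHV_PT_BIT_X2APIC) |
                        (1ULL << MSHV_PT_BIT_CPU_AND_XSAVE_FEATURES),
            .pt_isolation = MSHV_PT_ISOLATION_NONE,
            .pt_num_cpu_fbanks = MSHV_NUM_CPU_FEATURES_BANKS,
            /* pt_cpu_fbanks[] and pt_disabled_xsave left zero: disable nothing */
    };
    int pt_fd = ioctl(mshv_fd, MSHV_CREATE_PARTITION, &args);
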
@@ -1919,6 +1966,7 @@ mshv_ioctl_create_partition(void __user *user_arg, struct device *module_dev)
INIT_HLIST_HEAD(&partition->pt_devices);
+ spin_lock_init(&partition->pt_mem_regions_lock);
INIT_HLIST_HEAD(&partition->pt_mem_regions);
mshv_eventfd_init(partition);
@@ -1966,6 +2014,9 @@ static long mshv_dev_ioctl(struct file *filp, unsigned int ioctl,
case MSHV_CREATE_PARTITION:
return mshv_ioctl_create_partition((void __user *)arg,
misc->this_device);
+ case MSHV_ROOT_HVCALL:
+ return mshv_ioctl_passthru_hvcall(NULL, false,
+ (void __user *)arg);
}
return -ENOTTY;
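
The new MSHV_ROOT_HVCALL case routes the ioctl through mshv_ioctl_passthru_hvcall() with a NULL partition, so only the audited self-targeted hypercalls are reachable from /dev/mshv. A hedged userspace sketch; the in_sz/in_ptr field names, the property code and the output layout are assumptions, as is reusing the input struct shape sketched earlier:

    struct hv_input_get_partition_property in = {
            /* .partition_id is overwritten by the kernel with HV_PARTITION_ID_SELF */
            .property_code = HV_PARTITION_PROPERTY_PROCESSOR_FEATURES, /* assumed */
    };
    __u64 out = 0;
    struct mshv_root_hvcall call = {
            .code    = HVCALL_GET_PARTITION_PROPERTY,
            .in_sz   = sizeof(in),
            .in_ptr  = (__u64)(uintptr_t)&in,
            .out_sz  = sizeof(out),
            .out_ptr = (__u64)(uintptr_t)&out,
    };

    if (!ioctl(mshv_fd, MSHV_ROOT_HVCALL, &call))
            /* call.status holds the hypervisor status; out holds the property value */;
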
@@ -2182,6 +2233,22 @@ root_sched_deinit:
return err;
}
+static void mshv_init_vmm_caps(struct device *dev)
+{
+ /*
+ * This can only fail here if HVCALL_GET_PARTITION_PROPERTY_EX or
+ * HV_PARTITION_PROPERTY_VMM_CAPABILITIES are not supported. In that
+ * case it's valid to proceed as if all vmm_caps are disabled (zero).
+ */
+ if (hv_call_get_partition_property_ex(HV_PARTITION_ID_SELF,
+ HV_PARTITION_PROPERTY_VMM_CAPABILITIES,
+ 0, &mshv_root.vmm_caps,
+ sizeof(mshv_root.vmm_caps)))
+ dev_warn(dev, "Unable to get VMM capabilities\n");
+
+ dev_dbg(dev, "vmm_caps = %#llx\n", mshv_root.vmm_caps.as_uint64[0]);
+}
+
static int __init mshv_parent_partition_init(void)
{
int ret;
@@ -2234,6 +2301,8 @@ static int __init mshv_parent_partition_init(void)
if (ret)
goto remove_cpu_state;
+ mshv_init_vmm_caps(dev);
+
ret = mshv_irqfd_wq_init();
if (ret)
goto exit_partition;