Diffstat (limited to 'virt/kvm/guest_memfd.c')
-rw-r--r--  virt/kvm/guest_memfd.c  410
1 file changed, 307 insertions(+), 103 deletions(-)
diff --git a/virt/kvm/guest_memfd.c b/virt/kvm/guest_memfd.c
index fbca8c0972da..fdaea3422c30 100644
--- a/virt/kvm/guest_memfd.c
+++ b/virt/kvm/guest_memfd.c
@@ -1,18 +1,47 @@
// SPDX-License-Identifier: GPL-2.0
+#include <linux/anon_inodes.h>
#include <linux/backing-dev.h>
#include <linux/falloc.h>
+#include <linux/fs.h>
#include <linux/kvm_host.h>
+#include <linux/mempolicy.h>
+#include <linux/pseudo_fs.h>
#include <linux/pagemap.h>
-#include <linux/anon_inodes.h>
#include "kvm_mm.h"
-struct kvm_gmem {
+static struct vfsmount *kvm_gmem_mnt;
+
+/*
+ * A guest_memfd instance can be associated with multiple VMs, each with its own
+ * "view" of the underlying physical memory.
+ *
+ * The gmem's inode is effectively the raw underlying physical storage, and is
+ * used to track properties of the physical memory, while each gmem file is
+ * effectively a single VM's view of that storage, and is used to track assets
+ * specific to its associated VM, e.g. memslots=>gmem bindings.
+ */
+struct gmem_file {
struct kvm *kvm;
struct xarray bindings;
struct list_head entry;
};
+struct gmem_inode {
+ struct shared_policy policy;
+ struct inode vfs_inode;
+
+ u64 flags;
+};
+
+static __always_inline struct gmem_inode *GMEM_I(struct inode *inode)
+{
+ return container_of(inode, struct gmem_inode, vfs_inode);
+}
+
+#define kvm_gmem_for_each_file(f, mapping) \
+ list_for_each_entry(f, &(mapping)->i_private_list, entry)
+
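
A minimal usage sketch for the new walker (the helper below is hypothetical,
not part of this patch; callers would need filemap_invalidate_lock() to
stabilize the list):

	/* Hypothetical: count the VM "views" sharing one guest_memfd inode. */
	static unsigned long kvm_gmem_nr_views(struct inode *inode)
	{
		struct gmem_file *f;
		unsigned long nr = 0;

		kvm_gmem_for_each_file(f, inode->i_mapping)
			nr++;
		return nr;
	}
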
/**
 * folio_file_pfn - like folio_file_page, but returns a pfn.
* @folio: The folio which contains this index.
@@ -25,6 +54,11 @@ static inline kvm_pfn_t folio_file_pfn(struct folio *folio, pgoff_t index)
return folio_pfn(folio) + (index & (folio_nr_pages(folio) - 1));
}
+static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ return gfn - slot->base_gfn + slot->gmem.pgoff;
+}
+
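
A worked example of the index calculation, with illustrative values: for a
memslot bound at file offset 0x10 pages (gmem.pgoff = 0x10) whose guest range
starts at base_gfn = 0x100, gfn 0x105 maps to file index
0x105 - 0x100 + 0x10 = 0x15.
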
static int __kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
pgoff_t index, struct folio *folio)
{
@@ -77,9 +111,9 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
* The order will be passed when creating the guest_memfd, and
* checked when creating memslots.
*/
- WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, 1 << folio_order(folio)));
- index = gfn - slot->base_gfn + slot->gmem.pgoff;
- index = ALIGN_DOWN(index, 1 << folio_order(folio));
+ WARN_ON(!IS_ALIGNED(slot->gmem.pgoff, folio_nr_pages(folio)));
+ index = kvm_gmem_get_index(slot, gfn);
+ index = ALIGN_DOWN(index, folio_nr_pages(folio));
r = __kvm_gmem_prepare_folio(kvm, slot, index, folio);
if (!r)
kvm_gmem_mark_prepared(folio);
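
Illustrating the ALIGN_DOWN above with made-up numbers: for a 2MiB folio
(folio_nr_pages() == 512), an index of 0x215 rounds down to 0x200, the file
index of the folio's first page.
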
@@ -99,27 +133,45 @@ static int kvm_gmem_prepare_folio(struct kvm *kvm, struct kvm_memory_slot *slot,
static struct folio *kvm_gmem_get_folio(struct inode *inode, pgoff_t index)
{
/* TODO: Support huge pages. */
- return filemap_grab_folio(inode->i_mapping, index);
+ struct mempolicy *policy;
+ struct folio *folio;
+
+ /*
+ * Fast path: see if the folio is already present in the mapping, to
+ * avoid a shared-policy lookup.
+ */
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED, 0);
+ if (!IS_ERR(folio))
+ return folio;
+
+ policy = mpol_shared_policy_lookup(&GMEM_I(inode)->policy, index);
+ folio = __filemap_get_folio_mpol(inode->i_mapping, index,
+ FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
+ mapping_gfp_mask(inode->i_mapping), policy);
+ mpol_cond_put(policy);
+
+ return folio;
}
static enum kvm_gfn_range_filter kvm_gmem_get_invalidate_filter(struct inode *inode)
{
- if ((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED)
+ if (GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED)
return KVM_FILTER_SHARED;
return KVM_FILTER_PRIVATE;
}
-static void __kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
+static void __kvm_gmem_invalidate_begin(struct gmem_file *f, pgoff_t start,
pgoff_t end,
enum kvm_gfn_range_filter attr_filter)
{
bool flush = false, found_memslot = false;
struct kvm_memory_slot *slot;
- struct kvm *kvm = gmem->kvm;
+ struct kvm *kvm = f->kvm;
unsigned long index;
- xa_for_each_range(&gmem->bindings, index, slot, start, end - 1) {
+ xa_for_each_range(&f->bindings, index, slot, start, end - 1) {
pgoff_t pgoff = slot->gmem.pgoff;
struct kvm_gfn_range gfn_range = {
@@ -150,22 +202,21 @@ static void __kvm_gmem_invalidate_begin(struct kvm_gmem *gmem, pgoff_t start,
static void kvm_gmem_invalidate_begin(struct inode *inode, pgoff_t start,
pgoff_t end)
{
- struct list_head *gmem_list = &inode->i_mapping->i_private_list;
enum kvm_gfn_range_filter attr_filter;
- struct kvm_gmem *gmem;
+ struct gmem_file *f;
attr_filter = kvm_gmem_get_invalidate_filter(inode);
- list_for_each_entry(gmem, gmem_list, entry)
- __kvm_gmem_invalidate_begin(gmem, start, end, attr_filter);
+ kvm_gmem_for_each_file(f, inode->i_mapping)
+ __kvm_gmem_invalidate_begin(f, start, end, attr_filter);
}
-static void __kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
+static void __kvm_gmem_invalidate_end(struct gmem_file *f, pgoff_t start,
pgoff_t end)
{
- struct kvm *kvm = gmem->kvm;
+ struct kvm *kvm = f->kvm;
- if (xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
+ if (xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
KVM_MMU_LOCK(kvm);
kvm_mmu_invalidate_end(kvm);
KVM_MMU_UNLOCK(kvm);
@@ -175,11 +226,10 @@ static void __kvm_gmem_invalidate_end(struct kvm_gmem *gmem, pgoff_t start,
static void kvm_gmem_invalidate_end(struct inode *inode, pgoff_t start,
pgoff_t end)
{
- struct list_head *gmem_list = &inode->i_mapping->i_private_list;
- struct kvm_gmem *gmem;
+ struct gmem_file *f;
- list_for_each_entry(gmem, gmem_list, entry)
- __kvm_gmem_invalidate_end(gmem, start, end);
+ kvm_gmem_for_each_file(f, inode->i_mapping)
+ __kvm_gmem_invalidate_end(f, start, end);
}
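
The begin/end helpers bracket page-cache truncation; the calling pattern
(e.g. in kvm_gmem_punch_hole(), which this hunk does not modify) is roughly:

	filemap_invalidate_lock(inode->i_mapping);
	kvm_gmem_invalidate_begin(inode, start, end);
	truncate_inode_pages_range(inode->i_mapping, offset, offset + len - 1);
	kvm_gmem_invalidate_end(inode, start, end);
	filemap_invalidate_unlock(inode->i_mapping);
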
static long kvm_gmem_punch_hole(struct inode *inode, loff_t offset, loff_t len)
@@ -277,9 +327,9 @@ static long kvm_gmem_fallocate(struct file *file, int mode, loff_t offset,
static int kvm_gmem_release(struct inode *inode, struct file *file)
{
- struct kvm_gmem *gmem = file->private_data;
+ struct gmem_file *f = file->private_data;
struct kvm_memory_slot *slot;
- struct kvm *kvm = gmem->kvm;
+ struct kvm *kvm = f->kvm;
unsigned long index;
/*
@@ -299,7 +349,7 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
filemap_invalidate_lock(inode->i_mapping);
- xa_for_each(&gmem->bindings, index, slot)
+ xa_for_each(&f->bindings, index, slot)
WRITE_ONCE(slot->gmem.file, NULL);
/*
@@ -307,18 +357,18 @@ static int kvm_gmem_release(struct inode *inode, struct file *file)
* Zap all SPTEs pointed at by this file. Do not free the backing
* memory, as its lifetime is associated with the inode, not the file.
*/
- __kvm_gmem_invalidate_begin(gmem, 0, -1ul,
+ __kvm_gmem_invalidate_begin(f, 0, -1ul,
kvm_gmem_get_invalidate_filter(inode));
- __kvm_gmem_invalidate_end(gmem, 0, -1ul);
+ __kvm_gmem_invalidate_end(f, 0, -1ul);
- list_del(&gmem->entry);
+ list_del(&f->entry);
filemap_invalidate_unlock(inode->i_mapping);
mutex_unlock(&kvm->slots_lock);
- xa_destroy(&gmem->bindings);
- kfree(gmem);
+ xa_destroy(&f->bindings);
+ kfree(f);
kvm_put_kvm(kvm);
@@ -335,16 +385,12 @@ static inline struct file *kvm_gmem_get_file(struct kvm_memory_slot *slot)
return get_file_active(&slot->gmem.file);
}
-static pgoff_t kvm_gmem_get_index(struct kvm_memory_slot *slot, gfn_t gfn)
-{
- return gfn - slot->base_gfn + slot->gmem.pgoff;
-}
+DEFINE_CLASS(gmem_get_file, struct file *, if (_T) fput(_T),
+ kvm_gmem_get_file(slot), struct kvm_memory_slot *slot);
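
The cleanup class is used by later call sites in this patch; the pattern it
enables looks roughly like this (a sketch, mirroring those call sites):

	{
		CLASS(gmem_get_file, file)(slot);	/* struct file *file */

		if (!file)
			return -EFAULT;

		/*
		 * ... use file; fput() runs automatically when 'file' goes
		 * out of scope, since kvm_gmem_get_file() took a reference.
		 */
	}
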
static bool kvm_gmem_supports_mmap(struct inode *inode)
{
- const u64 flags = (u64)inode->i_private;
-
- return flags & GUEST_MEMFD_FLAG_MMAP;
+ return GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_MMAP;
}
static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
@@ -356,17 +402,15 @@ static vm_fault_t kvm_gmem_fault_user_mapping(struct vm_fault *vmf)
if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
return VM_FAULT_SIGBUS;
- if (!((u64)inode->i_private & GUEST_MEMFD_FLAG_INIT_SHARED))
+ if (!(GMEM_I(inode)->flags & GUEST_MEMFD_FLAG_INIT_SHARED))
return VM_FAULT_SIGBUS;
folio = kvm_gmem_get_folio(inode, vmf->pgoff);
if (IS_ERR(folio)) {
- int err = PTR_ERR(folio);
-
- if (err == -EAGAIN)
+ if (PTR_ERR(folio) == -EAGAIN)
return VM_FAULT_RETRY;
- return vmf_error(err);
+ return vmf_error(PTR_ERR(folio));
}
if (WARN_ON_ONCE(folio_test_large(folio))) {
@@ -390,8 +434,40 @@ out_folio:
return ret;
}
+#ifdef CONFIG_NUMA
+static int kvm_gmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ return mpol_set_shared_policy(&GMEM_I(inode)->policy, vma, mpol);
+}
+
+static struct mempolicy *kvm_gmem_get_policy(struct vm_area_struct *vma,
+ unsigned long addr, pgoff_t *pgoff)
+{
+ struct inode *inode = file_inode(vma->vm_file);
+
+ *pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+
+ /*
+ * Return the memory policy for this index, or NULL if none is set.
+ *
+ * Returning NULL, e.g. instead of the current task's memory policy, is
+ * important for the .get_policy kernel ABI: it indicates that no
+ * explicit policy has been set via mbind() for this memory. The caller
+ * can then replace NULL with the default memory policy instead of the
+ * current task's memory policy.
+ */
+ return mpol_shared_policy_lookup(&GMEM_I(inode)->policy, *pgoff);
+}
+#endif /* CONFIG_NUMA */
+
static const struct vm_operations_struct kvm_gmem_vm_ops = {
- .fault = kvm_gmem_fault_user_mapping,
+ .fault = kvm_gmem_fault_user_mapping,
+#ifdef CONFIG_NUMA
+ .get_policy = kvm_gmem_get_policy,
+ .set_policy = kvm_gmem_set_policy,
+#endif
};
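
With .get_policy/.set_policy wired up, userspace can apply a NUMA policy to
guest memory via the standard mbind() flow on an mmap()ed guest_memfd. A
sketch under assumptions (fd created with GUEST_MEMFD_FLAG_MMAP; node and
length values are illustrative):

	#include <numaif.h>
	#include <sys/mman.h>

	/* Hypothetical: bind the first 2MiB of a guest_memfd to NUMA node 0. */
	static int bind_gmem_to_node0(int gmem_fd)
	{
		unsigned long nodemask = 1UL << 0;
		size_t len = 2UL << 20;
		void *mem;

		mem = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
			   gmem_fd, 0);
		if (mem == MAP_FAILED)
			return -1;

		/*
		 * The policy lands in the inode's shared policy tree, so it
		 * applies to the backing memory itself, not just this VMA.
		 */
		return mbind(mem, len, MPOL_BIND, &nodemask,
			     sizeof(nodemask) * 8, 0);
	}
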
static int kvm_gmem_mmap(struct file *file, struct vm_area_struct *vma)
@@ -416,11 +492,6 @@ static struct file_operations kvm_gmem_fops = {
.fallocate = kvm_gmem_fallocate,
};
-void kvm_gmem_init(struct module *module)
-{
- kvm_gmem_fops.owner = module;
-}
-
static int kvm_gmem_migrate_folio(struct address_space *mapping,
struct folio *dst, struct folio *src,
enum migrate_mode mode)
@@ -492,8 +563,8 @@ bool __weak kvm_arch_supports_gmem_init_shared(struct kvm *kvm)
static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
{
- const char *anon_name = "[kvm-gmem]";
- struct kvm_gmem *gmem;
+ static const char *name = "[kvm-gmem]";
+ struct gmem_file *f;
struct inode *inode;
struct file *file;
int fd, err;
@@ -502,25 +573,24 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
if (fd < 0)
return fd;
- gmem = kzalloc(sizeof(*gmem), GFP_KERNEL);
- if (!gmem) {
+ f = kzalloc(sizeof(*f), GFP_KERNEL);
+ if (!f) {
err = -ENOMEM;
goto err_fd;
}
- file = anon_inode_create_getfile(anon_name, &kvm_gmem_fops, gmem,
- O_RDWR, NULL);
- if (IS_ERR(file)) {
- err = PTR_ERR(file);
+ /* __fput() will take care of fops_put(). */
+ if (!fops_get(&kvm_gmem_fops)) {
+ err = -ENOENT;
goto err_gmem;
}
- file->f_flags |= O_LARGEFILE;
-
- inode = file->f_inode;
- WARN_ON(file->f_mapping != inode->i_mapping);
+ inode = anon_inode_make_secure_inode(kvm_gmem_mnt->mnt_sb, name, NULL);
+ if (IS_ERR(inode)) {
+ err = PTR_ERR(inode);
+ goto err_fops;
+ }
- inode->i_private = (void *)(unsigned long)flags;
inode->i_op = &kvm_gmem_iops;
inode->i_mapping->a_ops = &kvm_gmem_aops;
inode->i_mode |= S_IFREG;
@@ -530,16 +600,31 @@ static int __kvm_gmem_create(struct kvm *kvm, loff_t size, u64 flags)
/* Unmovable mappings are supposed to be marked unevictable as well. */
WARN_ON_ONCE(!mapping_unevictable(inode->i_mapping));
+ GMEM_I(inode)->flags = flags;
+
+ file = alloc_file_pseudo(inode, kvm_gmem_mnt, name, O_RDWR, &kvm_gmem_fops);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_inode;
+ }
+
+ file->f_flags |= O_LARGEFILE;
+ file->private_data = f;
+
kvm_get_kvm(kvm);
- gmem->kvm = kvm;
- xa_init(&gmem->bindings);
- list_add(&gmem->entry, &inode->i_mapping->i_private_list);
+ f->kvm = kvm;
+ xa_init(&f->bindings);
+ list_add(&f->entry, &inode->i_mapping->i_private_list);
fd_install(fd, file);
return fd;
+err_inode:
+ iput(inode);
+err_fops:
+ fops_put(&kvm_gmem_fops);
err_gmem:
- kfree(gmem);
+ kfree(f);
err_fd:
put_unused_fd(fd);
return err;
@@ -564,7 +649,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
{
loff_t size = slot->npages << PAGE_SHIFT;
unsigned long start, end;
- struct kvm_gmem *gmem;
+ struct gmem_file *f;
struct inode *inode;
struct file *file;
int r = -EINVAL;
@@ -578,8 +663,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
if (file->f_op != &kvm_gmem_fops)
goto err;
- gmem = file->private_data;
- if (gmem->kvm != kvm)
+ f = file->private_data;
+ if (f->kvm != kvm)
goto err;
inode = file_inode(file);
@@ -593,8 +678,8 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
start = offset >> PAGE_SHIFT;
end = start + slot->npages;
- if (!xa_empty(&gmem->bindings) &&
- xa_find(&gmem->bindings, &start, end - 1, XA_PRESENT)) {
+ if (!xa_empty(&f->bindings) &&
+ xa_find(&f->bindings, &start, end - 1, XA_PRESENT)) {
filemap_invalidate_unlock(inode->i_mapping);
goto err;
}
@@ -609,7 +694,7 @@ int kvm_gmem_bind(struct kvm *kvm, struct kvm_memory_slot *slot,
if (kvm_gmem_supports_mmap(inode))
slot->flags |= KVM_MEMSLOT_GMEM_ONLY;
- xa_store_range(&gmem->bindings, start, end - 1, slot, GFP_KERNEL);
+ xa_store_range(&f->bindings, start, end - 1, slot, GFP_KERNEL);
filemap_invalidate_unlock(inode->i_mapping);
/*
@@ -623,34 +708,49 @@ err:
return r;
}
-void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+static void __kvm_gmem_unbind(struct kvm_memory_slot *slot, struct gmem_file *f)
{
unsigned long start = slot->gmem.pgoff;
unsigned long end = start + slot->npages;
- struct kvm_gmem *gmem;
- struct file *file;
+
+ xa_store_range(&f->bindings, start, end - 1, NULL, GFP_KERNEL);
/*
- * Nothing to do if the underlying file was already closed (or is being
- * closed right now), kvm_gmem_release() invalidates all bindings.
+ * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
+ * cannot see this memslot.
*/
- file = kvm_gmem_get_file(slot);
- if (!file)
- return;
+ WRITE_ONCE(slot->gmem.file, NULL);
+}
- gmem = file->private_data;
+void kvm_gmem_unbind(struct kvm_memory_slot *slot)
+{
+ /*
+ * Nothing to do if the underlying file was _already_ closed, as
+ * kvm_gmem_release() invalidates and nullifies all bindings.
+ */
+ if (!slot->gmem.file)
+ return;
- filemap_invalidate_lock(file->f_mapping);
- xa_store_range(&gmem->bindings, start, end - 1, NULL, GFP_KERNEL);
+ CLASS(gmem_get_file, file)(slot);
/*
- * synchronize_srcu(&kvm->srcu) ensured that kvm_gmem_get_pfn()
- * cannot see this memslot.
+ * However, if the file is _being_ closed, then the bindings need to be
+ * removed as kvm_gmem_release() might not run until after the memslot
+ * is freed. Note, modifying the bindings is safe even though the file
+ * is dying as kvm_gmem_release() nullifies slot->gmem.file under
+ * slots_lock, and only puts its reference to KVM after destroying all
+ * bindings. I.e. reaching this point means kvm_gmem_release() hasn't
+ * yet destroyed the bindings or freed the gmem_file, and can't do so
+ * until the caller drops slots_lock.
*/
- WRITE_ONCE(slot->gmem.file, NULL);
- filemap_invalidate_unlock(file->f_mapping);
+ if (!file) {
+ __kvm_gmem_unbind(slot, slot->gmem.file->private_data);
+ return;
+ }
- fput(file);
+ filemap_invalidate_lock(file->f_mapping);
+ __kvm_gmem_unbind(slot, file->private_data);
+ filemap_invalidate_unlock(file->f_mapping);
}
/* Returns a locked folio on success. */
@@ -659,18 +759,17 @@ static struct folio *__kvm_gmem_get_pfn(struct file *file,
pgoff_t index, kvm_pfn_t *pfn,
bool *is_prepared, int *max_order)
{
- struct file *gmem_file = READ_ONCE(slot->gmem.file);
- struct kvm_gmem *gmem = file->private_data;
+ struct file *slot_file = READ_ONCE(slot->gmem.file);
+ struct gmem_file *f = file->private_data;
struct folio *folio;
- if (file != gmem_file) {
- WARN_ON_ONCE(gmem_file);
+ if (file != slot_file) {
+ WARN_ON_ONCE(slot_file);
return ERR_PTR(-EFAULT);
}
- gmem = file->private_data;
- if (xa_load(&gmem->bindings, index) != slot) {
- WARN_ON_ONCE(xa_load(&gmem->bindings, index));
+ if (xa_load(&f->bindings, index) != slot) {
+ WARN_ON_ONCE(xa_load(&f->bindings, index));
return ERR_PTR(-EIO);
}
@@ -697,19 +796,17 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
int *max_order)
{
pgoff_t index = kvm_gmem_get_index(slot, gfn);
- struct file *file = kvm_gmem_get_file(slot);
struct folio *folio;
bool is_prepared = false;
int r = 0;
+ CLASS(gmem_get_file, file)(slot);
if (!file)
return -EFAULT;
folio = __kvm_gmem_get_pfn(file, slot, index, pfn, &is_prepared, max_order);
- if (IS_ERR(folio)) {
- r = PTR_ERR(folio);
- goto out;
- }
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
if (!is_prepared)
r = kvm_gmem_prepare_folio(kvm, slot, gfn, folio);
@@ -721,8 +818,6 @@ int kvm_gmem_get_pfn(struct kvm *kvm, struct kvm_memory_slot *slot,
else
folio_put(folio);
-out:
- fput(file);
return r;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
@@ -731,7 +826,6 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_get_pfn);
long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long npages,
kvm_gmem_populate_cb post_populate, void *opaque)
{
- struct file *file;
struct kvm_memory_slot *slot;
void __user *p;
@@ -747,7 +841,7 @@ long kvm_gmem_populate(struct kvm *kvm, gfn_t start_gfn, void __user *src, long
if (!kvm_slot_has_gmem(slot))
return -EINVAL;
- file = kvm_gmem_get_file(slot);
+ CLASS(gmem_get_file, file)(slot);
if (!file)
return -EFAULT;
@@ -805,8 +899,118 @@ put_folio_and_exit:
filemap_invalidate_unlock(file->f_mapping);
- fput(file);
return ret && !i ? ret : i;
}
EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_gmem_populate);
#endif
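
For context, kvm_gmem_populate()'s callback contract, sketched with a
hypothetical arch hook (real callers are arch-specific, e.g. initial-image
population for protected VMs):

	/* Hypothetical post-populate hook: convert the page backing @gfn. */
	static int example_post_populate(struct kvm *kvm, gfn_t gfn,
					 kvm_pfn_t pfn, void __user *src,
					 int order, void *opaque)
	{
		/*
		 * Arch-specific: copy in from @src (which may be NULL) and
		 * transition the page to its private/measured state.
		 */
		return 0;
	}

	/* ... */
	ret = kvm_gmem_populate(kvm, start_gfn, src, npages,
				example_post_populate, NULL);
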
+
+static struct kmem_cache *kvm_gmem_inode_cachep;
+
+static void kvm_gmem_init_inode_once(void *__gi)
+{
+ struct gmem_inode *gi = __gi;
+
+ /*
+ * Note! Don't initialize the inode with anything specific to the
+ * guest_memfd instance, or that might be specific to how the inode is
+ * used (from the VFS-layer's perspective). This hook is called only
+ * during the initial slab allocation, i.e. only fields/state that are
+ * idempotent across _all_ use of the inode _object_ can be initialized
+ * at this time!
+ */
+ inode_init_once(&gi->vfs_inode);
+}
+
+static struct inode *kvm_gmem_alloc_inode(struct super_block *sb)
+{
+ struct gmem_inode *gi;
+
+ gi = alloc_inode_sb(sb, kvm_gmem_inode_cachep, GFP_KERNEL);
+ if (!gi)
+ return NULL;
+
+ mpol_shared_policy_init(&gi->policy, NULL);
+
+ gi->flags = 0;
+ return &gi->vfs_inode;
+}
+
+static void kvm_gmem_destroy_inode(struct inode *inode)
+{
+ mpol_free_shared_policy(&GMEM_I(inode)->policy);
+}
+
+static void kvm_gmem_free_inode(struct inode *inode)
+{
+ kmem_cache_free(kvm_gmem_inode_cachep, GMEM_I(inode));
+}
+
+static const struct super_operations kvm_gmem_super_operations = {
+ .statfs = simple_statfs,
+ .alloc_inode = kvm_gmem_alloc_inode,
+ .destroy_inode = kvm_gmem_destroy_inode,
+ .free_inode = kvm_gmem_free_inode,
+};
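
How the three hooks split the inode lifecycle (standard VFS behavior, noted
here because it motivates the rcu_barrier() in kvm_gmem_exit() below):

	/*
	 * alloc_inode()   kvm_gmem_alloc_inode()    slab alloc + policy init
	 * evict/iput      kvm_gmem_destroy_inode()  synchronous teardown
	 *   ... RCU grace period ...
	 * free_inode()    kvm_gmem_free_inode()     kmem_cache_free()
	 *
	 * Because the final free runs from an RCU callback, the inode cache
	 * must outlive all pending grace periods.
	 */
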
+
+static int kvm_gmem_init_fs_context(struct fs_context *fc)
+{
+ struct pseudo_fs_context *ctx;
+
+ if (!init_pseudo(fc, GUEST_MEMFD_MAGIC))
+ return -ENOMEM;
+
+ fc->s_iflags |= SB_I_NOEXEC;
+ fc->s_iflags |= SB_I_NODEV;
+ ctx = fc->fs_private;
+ ctx->ops = &kvm_gmem_super_operations;
+
+ return 0;
+}
+
+static struct file_system_type kvm_gmem_fs = {
+ .name = "guest_memfd",
+ .init_fs_context = kvm_gmem_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static int kvm_gmem_init_mount(void)
+{
+ kvm_gmem_mnt = kern_mount(&kvm_gmem_fs);
+
+ if (IS_ERR(kvm_gmem_mnt))
+ return PTR_ERR(kvm_gmem_mnt);
+
+ kvm_gmem_mnt->mnt_flags |= MNT_NOEXEC;
+ return 0;
+}
+
+int kvm_gmem_init(struct module *module)
+{
+ struct kmem_cache_args args = {
+ .align = 0,
+ .ctor = kvm_gmem_init_inode_once,
+ };
+ int ret;
+
+ kvm_gmem_fops.owner = module;
+ kvm_gmem_inode_cachep = kmem_cache_create("kvm_gmem_inode_cache",
+ sizeof(struct gmem_inode),
+ &args, SLAB_ACCOUNT);
+ if (!kvm_gmem_inode_cachep)
+ return -ENOMEM;
+
+ ret = kvm_gmem_init_mount();
+ if (ret) {
+ kmem_cache_destroy(kvm_gmem_inode_cachep);
+ return ret;
+ }
+ return 0;
+}
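
kvm_gmem_init() now returns an error, so its caller (kvm_init() in
kvm_main.c) is expected to check the result; a call-site sketch (the error
label is hypothetical):

	r = kvm_gmem_init(module);
	if (r)
		goto err_gmem;
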
+
+void kvm_gmem_exit(void)
+{
+ kern_unmount(kvm_gmem_mnt);
+ kvm_gmem_mnt = NULL;
+ rcu_barrier();
+ kmem_cache_destroy(kvm_gmem_inode_cachep);
+}