// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2025, Google LLC.
 * Pasha Tatashin
 *
 * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
 * Pratyush Yadav
 */

/**
 * DOC: Memfd Preservation via LUO
 *
 * Overview
 * ========
 *
 * Memory file descriptors (memfd) can be preserved over a kexec using the
 * Live Update Orchestrator (LUO) file preservation. This allows userspace to
 * transfer its memory contents to the next kernel after a kexec.
 *
 * The preservation is not intended to be transparent. Only select properties
 * of the file are preserved. All others are reset to default. The preserved
 * properties are described below.
 *
 * .. note::
 *    The LUO API is not stabilized yet, so the preserved properties of a
 *    memfd are also not stable and are subject to backwards incompatible
 *    changes.
 *
 * .. note::
 *    Currently a memfd backed by Hugetlb is not supported. Memfds created
 *    with ``MFD_HUGETLB`` will be rejected.
 *
 * Preserved Properties
 * ====================
 *
 * The following properties of the memfd are preserved across kexec:
 *
 * File Contents
 *   All data stored in the file is preserved.
 *
 * File Size
 *   The size of the file is preserved. Holes in the file are filled by
 *   allocating pages for them during preservation.
 *
 * File Position
 *   The current file position is preserved, allowing applications to continue
 *   reading/writing from their last position.
 *
 * File Status Flags
 *   memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This
 *   property is maintained.
 *
 * Non-Preserved Properties
 * ========================
 *
 * All properties which are not preserved must be assumed to be reset to
 * default. This section describes some of those properties which may be of
 * particular note; a short userspace sketch of re-applying them appears in a
 * comment at the end of this file.
 *
 * ``FD_CLOEXEC`` flag
 *   A memfd can be created with the ``MFD_CLOEXEC`` flag that sets
 *   ``FD_CLOEXEC`` on the file. This flag is not preserved and must be set
 *   again after restore via ``fcntl()``.
 *
 * Seals
 *   File seals are not preserved. The file is unsealed on restore and, if
 *   needed, must be sealed again via ``fcntl()``.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/file.h>
#include <linux/io.h>
#include <linux/kexec_handover.h>
#include <linux/liveupdate.h>
#include <linux/memcontrol.h>
#include <linux/memfd.h>
#include <linux/mm.h>
#include <linux/shmem_fs.h>
#include <linux/vmalloc.h>

#include "internal.h"

static int memfd_luo_preserve_folios(struct file *file,
				     struct kho_vmalloc *kho_vmalloc,
				     struct memfd_luo_folio_ser **out_folios_ser,
				     u64 *nr_foliosp)
{
	struct inode *inode = file_inode(file);
	struct memfd_luo_folio_ser *folios_ser;
	unsigned int max_folios;
	long i, size, nr_pinned;
	struct folio **folios;
	int err = -EINVAL;
	pgoff_t offset;
	u64 nr_folios;

	size = i_size_read(inode);

	/*
	 * If the file has zero size, then the folios and nr_folios properties
	 * are not set.
	 */
	if (!size) {
		*nr_foliosp = 0;
		*out_folios_ser = NULL;
		memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
		return 0;
	}

	/*
	 * Guess the number of folios based on inode size. Real number might
	 * end up being smaller if there are higher order folios.
	 */
	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
	if (!folios)
		return -ENOMEM;

	/*
	 * Pin the folios so they don't move around behind our back. This also
	 * ensures none of the folios are in CMA -- which ensures they don't
	 * fall in KHO scratch memory. It also moves swapped out folios back
	 * to memory.
	 *
	 * A side effect of doing this is that it allocates a folio for all
	 * indices in the file. This might waste memory on sparse memfds. If
	 * that is really a problem in the future, we can have a
	 * memfd_pin_folios() variant that does not allocate a page on empty
	 * slots.
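	 *
	 * The pin is held for the lifetime of the preservation: the success
	 * path below does not unpin, and the folios are released again either
	 * in the error paths here or in memfd_luo_unpreserve_folios() if the
	 * preservation is aborted.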
	 */
	nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
				     &offset);
	if (nr_pinned < 0) {
		err = nr_pinned;
		pr_err("failed to pin folios: %d\n", err);
		goto err_free_folios;
	}

	nr_folios = nr_pinned;
	folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
	if (!folios_ser) {
		err = -ENOMEM;
		goto err_unpin;
	}

	for (i = 0; i < nr_folios; i++) {
		struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio = folios[i];
		unsigned int flags = 0;

		err = kho_preserve_folio(folio);
		if (err)
			goto err_unpreserve;

		if (folio_test_dirty(folio))
			flags |= MEMFD_LUO_FOLIO_DIRTY;
		if (folio_test_uptodate(folio))
			flags |= MEMFD_LUO_FOLIO_UPTODATE;

		pfolio->pfn = folio_pfn(folio);
		pfolio->flags = flags;
		pfolio->index = folio->index;
	}

	err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
	if (err)
		goto err_unpreserve;

	kvfree(folios);
	*nr_foliosp = nr_folios;
	*out_folios_ser = folios_ser;

	/*
	 * Note: folios_ser is purposely not freed here. It is preserved
	 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
	 * that is passed via private_data.
	 */
	return 0;

err_unpreserve:
	for (i = i - 1; i >= 0; i--)
		kho_unpreserve_folio(folios[i]);
	vfree(folios_ser);
err_unpin:
	unpin_folios(folios, nr_folios);
err_free_folios:
	kvfree(folios);
	return err;
}

static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
					struct memfd_luo_folio_ser *folios_ser,
					u64 nr_folios)
{
	long i;

	if (!nr_folios)
		return;

	kho_unpreserve_vmalloc(kho_vmalloc);

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio;

		if (!pfolio->pfn)
			continue;

		folio = pfn_folio(pfolio->pfn);
		kho_unpreserve_folio(folio);
		unpin_folio(folio);
	}

	vfree(folios_ser);
}

static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
{
	struct inode *inode = file_inode(args->file);
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;
	u64 nr_folios;
	int err = 0;

	inode_lock(inode);
	shmem_freeze(inode, true);

	/* Allocate the main serialization structure in preserved memory */
	ser = kho_alloc_preserve(sizeof(*ser));
	if (IS_ERR(ser)) {
		err = PTR_ERR(ser);
		goto err_unlock;
	}

	ser->pos = args->file->f_pos;
	ser->size = i_size_read(inode);

	err = memfd_luo_preserve_folios(args->file, &ser->folios, &folios_ser,
					&nr_folios);
	if (err)
		goto err_free_ser;

	ser->nr_folios = nr_folios;
	inode_unlock(inode);

	args->private_data = folios_ser;
	args->serialized_data = virt_to_phys(ser);
	return 0;

err_free_ser:
	kho_unpreserve_free(ser);
err_unlock:
	shmem_freeze(inode, false);
	inode_unlock(inode);
	return err;
}

static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
{
	struct memfd_luo_ser *ser;

	if (WARN_ON_ONCE(!args->serialized_data))
		return -EINVAL;

	ser = phys_to_virt(args->serialized_data);

	/*
	 * The pos might have changed since prepare. Everything else stays the
	 * same.
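	 *
	 * memfd_luo_preserve() already serialized the size and the folio list
	 * with the inode frozen, so the file position is the only field that
	 * needs a refresh here.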
	 */
	ser->pos = args->file->f_pos;
	return 0;
}

static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
{
	struct inode *inode = file_inode(args->file);
	struct memfd_luo_ser *ser;

	if (WARN_ON_ONCE(!args->serialized_data))
		return;

	inode_lock(inode);
	shmem_freeze(inode, false);

	ser = phys_to_virt(args->serialized_data);
	memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
				    ser->nr_folios);
	kho_unpreserve_free(ser);

	inode_unlock(inode);
}

static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	u64 i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio;
		phys_addr_t phys;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
					    phys);
			continue;
		}

		folio_put(folio);
	}
}

static void memfd_luo_finish(struct liveupdate_file_op_args *args)
{
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;

	if (args->retrieved)
		return;

	ser = phys_to_virt(args->serialized_data);
	if (!ser)
		return;

	if (ser->nr_folios) {
		folios_ser = kho_restore_vmalloc(&ser->folios);
		if (!folios_ser)
			goto out;

		memfd_luo_discard_folios(folios_ser, ser->nr_folios);
		vfree(folios_ser);
	}

out:
	kho_restore_free(ser);
}

static int memfd_luo_retrieve_folios(struct file *file,
				     struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	int err = -EIO;
	long i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		phys_addr_t phys;
		u64 index;
		int flags;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_err("Unable to restore folio at physical address: %llx\n",
			       phys);
			goto put_folios;
		}

		index = pfolio->index;
		flags = pfolio->flags;

		/* Set up the folio for insertion. */
		__folio_set_locked(folio);
		__folio_set_swapbacked(folio);

		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to charge folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
					      mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
			folio_mark_uptodate(folio);
		if (flags & MEMFD_LUO_FOLIO_DIRTY)
			folio_mark_dirty(folio);

		err = shmem_inode_acct_blocks(inode, 1);
		if (err) {
			pr_err("shmem: failed to account folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		shmem_recalc_inode(inode, 1, 0);
		folio_add_lru(folio);
		folio_unlock(folio);
		folio_put(folio);
	}

	return 0;

unlock_folio:
	folio_unlock(folio);
	folio_put(folio);
put_folios:
	/*
	 * Note: don't free the folios already added to the file. They will be
	 * freed when the file is freed. Free the ones not added yet here.
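	 *
	 * The folio for index i either failed to restore or has already been
	 * unlocked and put via the labels above, so the cleanup starts at
	 * i + 1.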
	 */
	for (long j = i + 1; j < nr_folios; j++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];

		folio = kho_restore_folio(PFN_PHYS(pfolio->pfn));
		if (folio)
			folio_put(folio);
	}

	return err;
}

static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
{
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;
	struct file *file;
	int err;

	ser = phys_to_virt(args->serialized_data);
	if (!ser)
		return -EINVAL;

	file = shmem_file_setup("", 0, VM_NORESERVE);
	if (IS_ERR(file)) {
		pr_err("failed to setup file: %pe\n", file);
		return PTR_ERR(file);
	}

	vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
	file->f_inode->i_size = ser->size;

	if (ser->nr_folios) {
		folios_ser = kho_restore_vmalloc(&ser->folios);
		if (!folios_ser) {
			err = -EINVAL;
			goto put_file;
		}

		err = memfd_luo_retrieve_folios(file, folios_ser,
						ser->nr_folios);
		vfree(folios_ser);
		if (err)
			goto put_file;
	}

	args->file = file;
	kho_restore_free(ser);
	return 0;

put_file:
	fput(file);
	return err;
}

static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
				   struct file *file)
{
	struct inode *inode = file_inode(file);

	return shmem_file(file) && !inode->i_nlink;
}

static const struct liveupdate_file_ops memfd_luo_file_ops = {
	.freeze = memfd_luo_freeze,
	.finish = memfd_luo_finish,
	.retrieve = memfd_luo_retrieve,
	.preserve = memfd_luo_preserve,
	.unpreserve = memfd_luo_unpreserve,
	.can_preserve = memfd_luo_can_preserve,
	.owner = THIS_MODULE,
};

static struct liveupdate_file_handler memfd_luo_handler = {
	.ops = &memfd_luo_file_ops,
	.compatible = MEMFD_LUO_FH_COMPATIBLE,
};

static int __init memfd_luo_init(void)
{
	int err = liveupdate_register_file_handler(&memfd_luo_handler);

	if (err && err != -EOPNOTSUPP) {
		pr_err("Could not register luo filesystem handler: %pe\n",
		       ERR_PTR(err));
		return err;
	}

	return 0;
}
late_initcall(memfd_luo_init);
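
/*
 * Illustrative userspace sketch (not part of this kernel module, and not an
 * interface this file defines): the DOC section above notes that FD_CLOEXEC
 * and file seals are not preserved and must be re-applied after retrieval.
 * The helper name, fd handling and seal set below are examples only; the
 * fcntl() commands themselves are the standard ones.
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	static int reapply_memfd_props(int fd)
 *	{
 *		// Re-arm close-on-exec, which is reset across the live update.
 *		if (fcntl(fd, F_SETFD, FD_CLOEXEC) == -1)
 *			return -1;
 *
 *		// Re-apply whichever seals the application used before kexec.
 *		if (fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK) == -1)
 *			return -1;
 *
 *		return 0;
 *	}
 */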