From 2b2d7ca7ce25fbec8389e7d85e57742caa47c97d Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 10 Dec 2024 10:08:42 +0100 Subject: dma-buf: fix incorrect dma-fence documentation v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There isn't much worse than documentation giving an incorrect advise. Grabbing a spinlock while interrupts are disabled usually means that you must also disable interrupts for all other uses of this spinlock. Otherwise really hard to debug issues can occur. So fix that invalid documentation. v2: use Dmitry's suggestion on the documentation Signed-off-by: Christian König Reviewed-by: Simona Vetter (v1) Link: https://patchwork.freedesktop.org/patch/msgid/20250211163109.12200-2-christian.koenig@amd.com --- include/linux/dma-fence.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index e7ad819962e3..52587d390aca 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -169,8 +169,8 @@ struct dma_fence_ops { * implementation know that there is another driver waiting on the * signal (ie. hw->sw case). * - * This function can be called from atomic context, but not - * from irq context, so normal spinlocks can be used. + * This is called with irq's disabled, so only spinlocks which disable + * IRQ's can be used in the code outside of this callback. * * A return value of false indicates the fence already passed, * or some failure occurred that made it impossible to enable -- cgit v1.2.3 From 2ce07fea3cc8b866f7955a7ce1d62b0cc1f74819 Mon Sep 17 00:00:00 2001 From: Christian König Date: Wed, 18 Sep 2024 08:16:57 +0200 Subject: dma-buf/dma-fence: remove unnecessary callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fence_value_str and timeline_value_str callbacks were just an unnecessary abstraction in the SW sync implementation. The only caller of those callbacks already knew that the fence in questions is a timeline_fence. So print the values directly instead of using a redirection. Additional to that remove the implementations from virtgpu and vgem. As far as I can see those were never used in the first place. 
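For reference, an enable_signaling implementation that follows the locking rule clarified by the dma-fence documentation fix above could look like the sketch below; every name in it is hypothetical and not part of this series:

#include <linux/dma-fence.h>
#include <linux/list.h>
#include <linux/spinlock.h>

/* Hypothetical driver structures, only here to keep the sketch self-contained. */
struct my_fence_dev {
	spinlock_t irq_lock;			/* also taken from the IRQ handler */
	struct list_head pending_fences;
	u64 last_signaled_seqno;
};

struct my_fence {
	struct dma_fence base;
	struct my_fence_dev *dev;
	struct list_head irq_link;
};

static bool my_fence_enable_signaling(struct dma_fence *fence)
{
	struct my_fence *f = container_of(fence, struct my_fence, base);
	struct my_fence_dev *mdev = f->dev;
	unsigned long flags;
	bool pending;

	/*
	 * This callback runs with IRQs disabled, so irq_lock must disable
	 * IRQs at every other acquisition site as well; the irqsave variant
	 * keeps that consistent here and in the IRQ handler.
	 */
	spin_lock_irqsave(&mdev->irq_lock, flags);
	pending = fence->seqno > mdev->last_signaled_seqno;
	if (pending)
		list_add_tail(&f->irq_link, &mdev->pending_fences);
	spin_unlock_irqrestore(&mdev->irq_lock, flags);

	return pending;
}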
Signed-off-by: Christian König Reviewed-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20250211163109.12200-3-christian.koenig@amd.com --- drivers/dma-buf/sw_sync.c | 16 ---------------- drivers/dma-buf/sync_debug.c | 21 ++------------------- drivers/gpu/drm/vgem/vgem_fence.c | 15 --------------- drivers/gpu/drm/virtio/virtgpu_fence.c | 16 ---------------- include/linux/dma-fence.h | 21 --------------------- 5 files changed, 2 insertions(+), 87 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c index f5905d67dedb..849280ae79a9 100644 --- a/drivers/dma-buf/sw_sync.c +++ b/drivers/dma-buf/sw_sync.c @@ -173,20 +173,6 @@ static bool timeline_fence_signaled(struct dma_fence *fence) return !__dma_fence_is_later(fence->seqno, parent->value, fence->ops); } -static void timeline_fence_value_str(struct dma_fence *fence, - char *str, int size) -{ - snprintf(str, size, "%lld", fence->seqno); -} - -static void timeline_fence_timeline_value_str(struct dma_fence *fence, - char *str, int size) -{ - struct sync_timeline *parent = dma_fence_parent(fence); - - snprintf(str, size, "%d", parent->value); -} - static void timeline_fence_set_deadline(struct dma_fence *fence, ktime_t deadline) { struct sync_pt *pt = dma_fence_to_sync_pt(fence); @@ -208,8 +194,6 @@ static const struct dma_fence_ops timeline_fence_ops = { .get_timeline_name = timeline_fence_get_timeline_name, .signaled = timeline_fence_signaled, .release = timeline_fence_release, - .fence_value_str = timeline_fence_value_str, - .timeline_value_str = timeline_fence_timeline_value_str, .set_deadline = timeline_fence_set_deadline, }; diff --git a/drivers/dma-buf/sync_debug.c b/drivers/dma-buf/sync_debug.c index 237bce21d1e7..270daae7d89a 100644 --- a/drivers/dma-buf/sync_debug.c +++ b/drivers/dma-buf/sync_debug.c @@ -82,25 +82,8 @@ static void sync_print_fence(struct seq_file *s, seq_printf(s, "@%lld.%09ld", (s64)ts64.tv_sec, ts64.tv_nsec); } - if (fence->ops->timeline_value_str && - fence->ops->fence_value_str) { - char value[64]; - bool success; - - fence->ops->fence_value_str(fence, value, sizeof(value)); - success = strlen(value); - - if (success) { - seq_printf(s, ": %s", value); - - fence->ops->timeline_value_str(fence, value, - sizeof(value)); - - if (strlen(value)) - seq_printf(s, " / %s", value); - } - } - + seq_printf(s, ": %lld", fence->seqno); + seq_printf(s, " / %d", parent->value); seq_putc(s, '\n'); } diff --git a/drivers/gpu/drm/vgem/vgem_fence.c b/drivers/gpu/drm/vgem/vgem_fence.c index e15754178395..5298d995faa7 100644 --- a/drivers/gpu/drm/vgem/vgem_fence.c +++ b/drivers/gpu/drm/vgem/vgem_fence.c @@ -53,25 +53,10 @@ static void vgem_fence_release(struct dma_fence *base) dma_fence_free(&fence->base); } -static void vgem_fence_value_str(struct dma_fence *fence, char *str, int size) -{ - snprintf(str, size, "%llu", fence->seqno); -} - -static void vgem_fence_timeline_value_str(struct dma_fence *fence, char *str, - int size) -{ - snprintf(str, size, "%llu", - dma_fence_is_signaled(fence) ? 
fence->seqno : 0); -} - static const struct dma_fence_ops vgem_fence_ops = { .get_driver_name = vgem_fence_get_driver_name, .get_timeline_name = vgem_fence_get_timeline_name, .release = vgem_fence_release, - - .fence_value_str = vgem_fence_value_str, - .timeline_value_str = vgem_fence_timeline_value_str, }; static void vgem_fence_timeout(struct timer_list *t) diff --git a/drivers/gpu/drm/virtio/virtgpu_fence.c b/drivers/gpu/drm/virtio/virtgpu_fence.c index f28357dbde35..44c1d8ef3c4d 100644 --- a/drivers/gpu/drm/virtio/virtgpu_fence.c +++ b/drivers/gpu/drm/virtio/virtgpu_fence.c @@ -49,26 +49,10 @@ static bool virtio_gpu_fence_signaled(struct dma_fence *f) return false; } -static void virtio_gpu_fence_value_str(struct dma_fence *f, char *str, int size) -{ - snprintf(str, size, "[%llu, %llu]", f->context, f->seqno); -} - -static void virtio_gpu_timeline_value_str(struct dma_fence *f, char *str, - int size) -{ - struct virtio_gpu_fence *fence = to_virtio_gpu_fence(f); - - snprintf(str, size, "%llu", - (u64)atomic64_read(&fence->drv->last_fence_id)); -} - static const struct dma_fence_ops virtio_gpu_fence_ops = { .get_driver_name = virtio_gpu_get_driver_name, .get_timeline_name = virtio_gpu_get_timeline_name, .signaled = virtio_gpu_fence_signaled, - .fence_value_str = virtio_gpu_fence_value_str, - .timeline_value_str = virtio_gpu_timeline_value_str, }; struct virtio_gpu_fence *virtio_gpu_fence_alloc(struct virtio_gpu_device *vgdev, diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 52587d390aca..b12776883d14 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -238,27 +238,6 @@ struct dma_fence_ops { */ void (*release)(struct dma_fence *fence); - /** - * @fence_value_str: - * - * Callback to fill in free-form debug info specific to this fence, like - * the sequence number. - * - * This callback is optional. - */ - void (*fence_value_str)(struct dma_fence *fence, char *str, int size); - - /** - * @timeline_value_str: - * - * Fills in the current value of the timeline as a string, like the - * sequence number. Note that the specific fence passed to this function - * should not matter, drivers should only use it to look up the - * corresponding timeline structures. - */ - void (*timeline_value_str)(struct dma_fence *fence, - char *str, int size); - /** * @set_deadline: * -- cgit v1.2.3 From de68b17d5d0716c9a02b8a6ffa34f47c8f2f7690 Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 11 Feb 2025 15:26:16 +0100 Subject: dma-buf: dma-buf: stop mapping sg_tables on attach v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As a workaround to smoothly transit from static to dynamic DMA-buf handling we cached the sg_table on attach if dynamic handling mismatched between exporter and importer. Since Dmitry and Thomas cleaned that up and also documented the lock handling we can drop this workaround now. 
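Once this lands, a static importer (one attached with plain dma_buf_attach(), i.e. without importer_ops) simply maps on demand under the reservation lock, and the core takes care of pinning and of waiting for kernel fences. A rough sketch of that importer-side flow, with a hypothetical function name and the device programming elided:

#include <linux/dma-buf.h>
#include <linux/dma-resv.h>
#include <linux/err.h>
#include <linux/scatterlist.h>

static int my_import_and_use(struct dma_buf *dmabuf,
			     struct dma_buf_attachment *attach)
{
	struct sg_table *sgt;
	int ret = 0;

	dma_resv_lock(dmabuf->resv, NULL);

	/* Mapped on demand now; nothing is cached at attach time anymore. */
	sgt = dma_buf_map_attachment(attach, DMA_BIDIRECTIONAL);
	if (IS_ERR(sgt)) {
		ret = PTR_ERR(sgt);
		goto out_unlock;
	}

	/* ... program the device with the entries in sgt ... */

	dma_buf_unmap_attachment(attach, sgt, DMA_BIDIRECTIONAL);
out_unlock:
	dma_resv_unlock(dmabuf->resv);
	return ret;
}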
V2: implement Sima's comments Signed-off-by: Christian König Reviewed-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20250211163109.12200-4-christian.koenig@amd.com --- drivers/dma-buf/dma-buf.c | 151 ++++++++++++++++++---------------------------- include/linux/dma-buf.h | 14 ----- 2 files changed, 59 insertions(+), 106 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 5baa83b85515..1f7349b08df8 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -782,7 +782,7 @@ static void mangle_sg_table(struct sg_table *sg_table) /* To catch abuse of the underlying struct page by importers mix * up the bits, but take care to preserve the low SG_ bits to - * not corrupt the sgt. The mixing is undone in __unmap_dma_buf + * not corrupt the sgt. The mixing is undone on unmap * before passing the sgt back to the exporter. */ for_each_sgtable_sg(sg_table, sg, i) @@ -790,29 +790,19 @@ static void mangle_sg_table(struct sg_table *sg_table) #endif } -static struct sg_table *__map_dma_buf(struct dma_buf_attachment *attach, - enum dma_data_direction direction) -{ - struct sg_table *sg_table; - signed long ret; - sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction); - if (IS_ERR_OR_NULL(sg_table)) - return sg_table; - - if (!dma_buf_attachment_is_dynamic(attach)) { - ret = dma_resv_wait_timeout(attach->dmabuf->resv, - DMA_RESV_USAGE_KERNEL, true, - MAX_SCHEDULE_TIMEOUT); - if (ret < 0) { - attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, - direction); - return ERR_PTR(ret); - } - } +static inline bool +dma_buf_attachment_is_dynamic(struct dma_buf_attachment *attach) +{ + return !!attach->importer_ops; +} - mangle_sg_table(sg_table); - return sg_table; +static bool +dma_buf_pin_on_map(struct dma_buf_attachment *attach) +{ + return attach->dmabuf->ops->pin && + (!dma_buf_attachment_is_dynamic(attach) || + !IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY)); } /** @@ -935,48 +925,11 @@ dma_buf_dynamic_attach(struct dma_buf *dmabuf, struct device *dev, list_add(&attach->node, &dmabuf->attachments); dma_resv_unlock(dmabuf->resv); - /* When either the importer or the exporter can't handle dynamic - * mappings we cache the mapping here to avoid issues with the - * reservation object lock. 
- */ - if (dma_buf_attachment_is_dynamic(attach) != - dma_buf_is_dynamic(dmabuf)) { - struct sg_table *sgt; - - dma_resv_lock(attach->dmabuf->resv, NULL); - if (dma_buf_is_dynamic(attach->dmabuf)) { - ret = dmabuf->ops->pin(attach); - if (ret) - goto err_unlock; - } - - sgt = __map_dma_buf(attach, DMA_BIDIRECTIONAL); - if (!sgt) - sgt = ERR_PTR(-ENOMEM); - if (IS_ERR(sgt)) { - ret = PTR_ERR(sgt); - goto err_unpin; - } - dma_resv_unlock(attach->dmabuf->resv); - attach->sgt = sgt; - attach->dir = DMA_BIDIRECTIONAL; - } - return attach; err_attach: kfree(attach); return ERR_PTR(ret); - -err_unpin: - if (dma_buf_is_dynamic(attach->dmabuf)) - dmabuf->ops->unpin(attach); - -err_unlock: - dma_resv_unlock(attach->dmabuf->resv); - - dma_buf_detach(dmabuf, attach); - return ERR_PTR(ret); } EXPORT_SYMBOL_NS_GPL(dma_buf_dynamic_attach, "DMA_BUF"); @@ -995,16 +948,6 @@ struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, } EXPORT_SYMBOL_NS_GPL(dma_buf_attach, "DMA_BUF"); -static void __unmap_dma_buf(struct dma_buf_attachment *attach, - struct sg_table *sg_table, - enum dma_data_direction direction) -{ - /* uses XOR, hence this unmangles */ - mangle_sg_table(sg_table); - - attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction); -} - /** * dma_buf_detach - Remove the given attachment from dmabuf's attachments list * @dmabuf: [in] buffer to detach from. @@ -1022,11 +965,12 @@ void dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach) dma_resv_lock(dmabuf->resv, NULL); if (attach->sgt) { + mangle_sg_table(attach->sgt); + attach->dmabuf->ops->unmap_dma_buf(attach, attach->sgt, + attach->dir); - __unmap_dma_buf(attach, attach->sgt, attach->dir); - - if (dma_buf_is_dynamic(attach->dmabuf)) - dmabuf->ops->unpin(attach); + if (dma_buf_pin_on_map(attach)) + dma_buf_unpin(attach); } list_del(&attach->node); @@ -1058,7 +1002,7 @@ int dma_buf_pin(struct dma_buf_attachment *attach) struct dma_buf *dmabuf = attach->dmabuf; int ret = 0; - WARN_ON(!dma_buf_attachment_is_dynamic(attach)); + WARN_ON(!attach->importer_ops); dma_resv_assert_held(dmabuf->resv); @@ -1081,7 +1025,7 @@ void dma_buf_unpin(struct dma_buf_attachment *attach) { struct dma_buf *dmabuf = attach->dmabuf; - WARN_ON(!dma_buf_attachment_is_dynamic(attach)); + WARN_ON(!attach->importer_ops); dma_resv_assert_held(dmabuf->resv); @@ -1115,7 +1059,7 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach, enum dma_data_direction direction) { struct sg_table *sg_table; - int r; + signed long ret; might_sleep(); @@ -1136,29 +1080,42 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach, return attach->sgt; } - if (dma_buf_is_dynamic(attach->dmabuf)) { - if (!IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY)) { - r = attach->dmabuf->ops->pin(attach); - if (r) - return ERR_PTR(r); - } + if (dma_buf_pin_on_map(attach)) { + ret = attach->dmabuf->ops->pin(attach); + /* + * Catch exporters making buffers inaccessible even when + * attachments preventing that exist. + */ + WARN_ON_ONCE(ret == EBUSY); + if (ret) + return ERR_PTR(ret); } - sg_table = __map_dma_buf(attach, direction); + sg_table = attach->dmabuf->ops->map_dma_buf(attach, direction); if (!sg_table) sg_table = ERR_PTR(-ENOMEM); + if (IS_ERR(sg_table)) + goto error_unpin; - if (IS_ERR(sg_table) && dma_buf_is_dynamic(attach->dmabuf) && - !IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY)) - attach->dmabuf->ops->unpin(attach); + /* + * Importers with static attachments don't wait for fences. 
+ */ + if (!dma_buf_attachment_is_dynamic(attach)) { + ret = dma_resv_wait_timeout(attach->dmabuf->resv, + DMA_RESV_USAGE_KERNEL, true, + MAX_SCHEDULE_TIMEOUT); + if (ret < 0) + goto error_unmap; + } + mangle_sg_table(sg_table); - if (!IS_ERR(sg_table) && attach->dmabuf->ops->cache_sgt_mapping) { + if (attach->dmabuf->ops->cache_sgt_mapping) { attach->sgt = sg_table; attach->dir = direction; } #ifdef CONFIG_DMA_API_DEBUG - if (!IS_ERR(sg_table)) { + { struct scatterlist *sg; u64 addr; int len; @@ -1175,6 +1132,16 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach, } #endif /* CONFIG_DMA_API_DEBUG */ return sg_table; + +error_unmap: + attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction); + sg_table = ERR_PTR(ret); + +error_unpin: + if (dma_buf_pin_on_map(attach)) + attach->dmabuf->ops->unpin(attach); + + return sg_table; } EXPORT_SYMBOL_NS_GPL(dma_buf_map_attachment, "DMA_BUF"); @@ -1230,11 +1197,11 @@ void dma_buf_unmap_attachment(struct dma_buf_attachment *attach, if (attach->sgt == sg_table) return; - __unmap_dma_buf(attach, sg_table, direction); + mangle_sg_table(sg_table); + attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction); - if (dma_buf_is_dynamic(attach->dmabuf) && - !IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY)) - dma_buf_unpin(attach); + if (dma_buf_pin_on_map(attach)) + attach->dmabuf->ops->unpin(attach); } EXPORT_SYMBOL_NS_GPL(dma_buf_unmap_attachment, "DMA_BUF"); diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index 36216d28d8bd..c54ff2dda8cb 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -583,20 +583,6 @@ static inline bool dma_buf_is_dynamic(struct dma_buf *dmabuf) return !!dmabuf->ops->pin; } -/** - * dma_buf_attachment_is_dynamic - check if a DMA-buf attachment uses dynamic - * mappings - * @attach: the DMA-buf attachment to check - * - * Returns true if a DMA-buf importer wants to call the map/unmap functions with - * the dma_resv lock held. - */ -static inline bool -dma_buf_attachment_is_dynamic(struct dma_buf_attachment *attach) -{ - return !!attach->importer_ops; -} - struct dma_buf_attachment *dma_buf_attach(struct dma_buf *dmabuf, struct device *dev); struct dma_buf_attachment * -- cgit v1.2.3 From b72f66f22c0e39ae6684c43fead774c13db24e73 Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 11 Feb 2025 17:20:53 +0100 Subject: dma-buf: drop caching of sg_tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit That was purely for the transition from static to dynamic dma-buf handling and can be removed again now. 
Signed-off-by: Christian König Reviewed-by: Simona Vetter Reviewed-by: Dmitry Osipenko Link: https://patchwork.freedesktop.org/patch/msgid/20250211163109.12200-5-christian.koenig@amd.com --- drivers/dma-buf/dma-buf.c | 34 ---------------------------------- drivers/dma-buf/udmabuf.c | 1 - drivers/gpu/drm/drm_prime.c | 1 - drivers/gpu/drm/virtio/virtgpu_prime.c | 1 - include/linux/dma-buf.h | 13 ------------- 5 files changed, 50 deletions(-) (limited to 'include/linux') diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c index 1f7349b08df8..0c48d41dd5eb 100644 --- a/drivers/dma-buf/dma-buf.c +++ b/drivers/dma-buf/dma-buf.c @@ -636,10 +636,6 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info) || !exp_info->ops->release)) return ERR_PTR(-EINVAL); - if (WARN_ON(exp_info->ops->cache_sgt_mapping && - (exp_info->ops->pin || exp_info->ops->unpin))) - return ERR_PTR(-EINVAL); - if (WARN_ON(!exp_info->ops->pin != !exp_info->ops->unpin)) return ERR_PTR(-EINVAL); @@ -963,17 +959,7 @@ void dma_buf_detach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach) return; dma_resv_lock(dmabuf->resv, NULL); - - if (attach->sgt) { - mangle_sg_table(attach->sgt); - attach->dmabuf->ops->unmap_dma_buf(attach, attach->sgt, - attach->dir); - - if (dma_buf_pin_on_map(attach)) - dma_buf_unpin(attach); - } list_del(&attach->node); - dma_resv_unlock(dmabuf->resv); if (dmabuf->ops->detach) @@ -1068,18 +1054,6 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach, dma_resv_assert_held(attach->dmabuf->resv); - if (attach->sgt) { - /* - * Two mappings with different directions for the same - * attachment are not allowed. - */ - if (attach->dir != direction && - attach->dir != DMA_BIDIRECTIONAL) - return ERR_PTR(-EBUSY); - - return attach->sgt; - } - if (dma_buf_pin_on_map(attach)) { ret = attach->dmabuf->ops->pin(attach); /* @@ -1109,11 +1083,6 @@ struct sg_table *dma_buf_map_attachment(struct dma_buf_attachment *attach, } mangle_sg_table(sg_table); - if (attach->dmabuf->ops->cache_sgt_mapping) { - attach->sgt = sg_table; - attach->dir = direction; - } - #ifdef CONFIG_DMA_API_DEBUG { struct scatterlist *sg; @@ -1194,9 +1163,6 @@ void dma_buf_unmap_attachment(struct dma_buf_attachment *attach, dma_resv_assert_held(attach->dmabuf->resv); - if (attach->sgt == sg_table) - return; - mangle_sg_table(sg_table); attach->dmabuf->ops->unmap_dma_buf(attach, sg_table, direction); diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c index cc7398cc17d6..2fa2c9135eac 100644 --- a/drivers/dma-buf/udmabuf.c +++ b/drivers/dma-buf/udmabuf.c @@ -285,7 +285,6 @@ static int end_cpu_udmabuf(struct dma_buf *buf, } static const struct dma_buf_ops udmabuf_ops = { - .cache_sgt_mapping = true, .map_dma_buf = map_udmabuf, .unmap_dma_buf = unmap_udmabuf, .release = release_udmabuf, diff --git a/drivers/gpu/drm/drm_prime.c b/drivers/gpu/drm/drm_prime.c index bdb51c8f262e..a3d64f93a225 100644 --- a/drivers/gpu/drm/drm_prime.c +++ b/drivers/gpu/drm/drm_prime.c @@ -804,7 +804,6 @@ int drm_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct *vma) EXPORT_SYMBOL(drm_gem_dmabuf_mmap); static const struct dma_buf_ops drm_gem_prime_dmabuf_ops = { - .cache_sgt_mapping = true, .attach = drm_gem_map_attach, .detach = drm_gem_map_detach, .map_dma_buf = drm_gem_map_dma_buf, diff --git a/drivers/gpu/drm/virtio/virtgpu_prime.c b/drivers/gpu/drm/virtio/virtgpu_prime.c index fe6a0b018571..c6f3be3cb914 100644 --- a/drivers/gpu/drm/virtio/virtgpu_prime.c +++ 
b/drivers/gpu/drm/virtio/virtgpu_prime.c @@ -75,7 +75,6 @@ static void virtgpu_gem_unmap_dma_buf(struct dma_buf_attachment *attach, static const struct virtio_dma_buf_ops virtgpu_dmabuf_ops = { .ops = { - .cache_sgt_mapping = true, .attach = virtio_dma_buf_attach, .detach = drm_gem_map_detach, .map_dma_buf = virtgpu_gem_map_dma_buf, diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h index c54ff2dda8cb..544f8f8c3f44 100644 --- a/include/linux/dma-buf.h +++ b/include/linux/dma-buf.h @@ -34,15 +34,6 @@ struct dma_buf_attachment; * @vunmap: [optional] unmaps a vmap from the buffer */ struct dma_buf_ops { - /** - * @cache_sgt_mapping: - * - * If true the framework will cache the first mapping made for each - * attachment. This avoids creating mappings for attachments multiple - * times. - */ - bool cache_sgt_mapping; - /** * @attach: * @@ -493,8 +484,6 @@ struct dma_buf_attach_ops { * @dmabuf: buffer for this attachment. * @dev: device attached to the buffer. * @node: list of dma_buf_attachment, protected by dma_resv lock of the dmabuf. - * @sgt: cached mapping. - * @dir: direction of cached mapping. * @peer2peer: true if the importer can handle peer resources without pages. * @priv: exporter specific attachment data. * @importer_ops: importer operations for this attachment, if provided @@ -514,8 +503,6 @@ struct dma_buf_attachment { struct dma_buf *dmabuf; struct device *dev; struct list_head node; - struct sg_table *sgt; - enum dma_data_direction dir; bool peer2peer; const struct dma_buf_attach_ops *importer_ops; void *importer_priv; -- cgit v1.2.3 From 62b1fa69f31969e11d03529d3a80599ddda2d043 Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 15 Nov 2024 22:52:40 +1300 Subject: KVM: Export hardware virtualization enabling/disabling functions To support TDX, KVM will need to enable TDX during KVM module loading time. Enabling TDX requires enabling hardware virtualization first so that all online CPUs (and the new CPU going online) are in post-VMXON state. KVM by default enables hardware virtualization but that is done in kvm_init(), which must be the last step after all initialization is done thus is too late for enabling TDX. Export functions to enable/disable hardware virtualization so that TDX code can use them to handle hardware virtualization enabling before kvm_init(). 
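For illustration, the ordering these exports make possible looks roughly like the sketch below, loosely modelled on the vt_init()/tdx_bringup() flow added later in this series; init_my_feature() is a hypothetical stand-in for setup that has to issue SEAMCALLs:

#include <linux/kvm_host.h>
#include <linux/module.h>

static int __init my_vt_init(void)
{
	int r;

	/* SEAMCALLs need every online CPU in post-VMXON state first. */
	r = kvm_enable_virtualization();
	if (r)
		return r;

	r = init_my_feature();		/* hypothetical TDX-style setup */
	if (r)
		goto err_disable;

	/* kvm_init() still has to come last, after all other initialization. */
	r = kvm_init(sizeof(struct vcpu_vmx), __alignof__(struct vcpu_vmx),
		     THIS_MODULE);	/* struct vcpu_vmx as in vt_init() */
	if (r)
		goto err_disable;

	return 0;

err_disable:
	kvm_disable_virtualization();
	return r;
}
module_init(my_vt_init);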
Signed-off-by: Kai Huang Message-ID: Signed-off-by: Paolo Bonzini --- include/linux/kvm_host.h | 8 ++++++++ virt/kvm/kvm_main.c | 18 ++++-------------- 2 files changed, 12 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f34f4cfaa513..1e75fa114f34 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2571,4 +2571,12 @@ long kvm_arch_vcpu_pre_fault_memory(struct kvm_vcpu *vcpu, struct kvm_pre_fault_memory *range); #endif +#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +int kvm_enable_virtualization(void); +void kvm_disable_virtualization(void); +#else +static inline int kvm_enable_virtualization(void) { return 0; } +static inline void kvm_disable_virtualization(void) { } +#endif + #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ba0327e2d0d3..6e40383fbe47 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -143,8 +143,6 @@ static int kvm_no_compat_open(struct inode *inode, struct file *file) #define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \ .open = kvm_no_compat_open #endif -static int kvm_enable_virtualization(void); -static void kvm_disable_virtualization(void); static void kvm_io_bus_destroy(struct kvm_io_bus *bus); @@ -5576,7 +5574,7 @@ static struct syscore_ops kvm_syscore_ops = { .shutdown = kvm_shutdown, }; -static int kvm_enable_virtualization(void) +int kvm_enable_virtualization(void) { int r; @@ -5621,8 +5619,9 @@ err_cpuhp: --kvm_usage_count; return r; } +EXPORT_SYMBOL_GPL(kvm_enable_virtualization); -static void kvm_disable_virtualization(void) +void kvm_disable_virtualization(void) { guard(mutex)(&kvm_usage_lock); @@ -5633,6 +5632,7 @@ static void kvm_disable_virtualization(void) cpuhp_remove_state(CPUHP_AP_KVM_ONLINE); kvm_arch_disable_virtualization(); } +EXPORT_SYMBOL_GPL(kvm_disable_virtualization); static int kvm_init_virtualization(void) { @@ -5648,21 +5648,11 @@ static void kvm_uninit_virtualization(void) kvm_disable_virtualization(); } #else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */ -static int kvm_enable_virtualization(void) -{ - return 0; -} - static int kvm_init_virtualization(void) { return 0; } -static void kvm_disable_virtualization(void) -{ - -} - static void kvm_uninit_virtualization(void) { -- cgit v1.2.3 From fcdbdf63431c9faf639bffc957ea2ce9b545432e Mon Sep 17 00:00:00 2001 From: Kai Huang Date: Fri, 15 Nov 2024 22:52:41 +1300 Subject: KVM: VMX: Initialize TDX during KVM module load Before KVM can use TDX to create and run TDX guests, TDX needs to be initialized from two perspectives: 1) TDX module must be initialized properly to a working state; 2) A per-cpu TDX initialization, a.k.a the TDH.SYS.LP.INIT SEAMCALL must be done on any logical cpu before it can run any other TDX SEAMCALLs. The TDX host core-kernel provides two functions to do the above two respectively: tdx_enable() and tdx_cpu_enable(). There are two options in terms of when to initialize TDX: initialize TDX at KVM module loading time, or when creating the first TDX guest. Choose to initialize TDX during KVM module loading time: Initializing TDX module is both memory and CPU time consuming: 1) the kernel needs to allocate a non-trivial size(~1/256) of system memory as metadata used by TDX module to track each TDX-usable memory page's status; 2) the TDX module needs to initialize this metadata, one entry for each TDX-usable memory page. 
Also, the kernel uses alloc_contig_pages() to allocate those metadata chunks, because they are large and need to be physically contiguous. alloc_contig_pages() can fail. If initializing TDX when creating the first TDX guest, then there's chance that KVM won't be able to run any TDX guests albeit KVM _declares_ to be able to support TDX. This isn't good for the user. On the other hand, initializing TDX at KVM module loading time can make sure KVM is providing a consistent view of whether KVM can support TDX to the user. Always only try to initialize TDX after VMX has been initialized. TDX is based on VMX, and if VMX fails to initialize then TDX is likely to be broken anyway. Also, in practice, supporting TDX will require part of VMX and common x86 infrastructure in working order, so TDX cannot be enabled alone w/o VMX support. There are two cases that can result in failure to initialize TDX: 1) TDX cannot be supported (e.g., because of TDX is not supported or enabled by hardware, or module is not loaded, or missing some dependency in KVM's configuration); 2) Any unexpected error during TDX bring-up. For the first case only mark TDX is disabled but still allow KVM module to be loaded. For the second case just fail to load the KVM module so that the user can be aware. Because TDX costs additional memory, don't enable TDX by default. Add a new module parameter 'enable_tdx' to allow the user to opt-in. Note, the name tdx_init() has already been taken by the early boot code. Use tdx_bringup() for initializing TDX (and tdx_cleanup() since KVM doesn't actually teardown TDX). They don't match vt_init()/vt_exit(), vmx_init()/vmx_exit() etc but it's not end of the world. Also, once initialized, the TDX module cannot be disabled and enabled again w/o the TDX module runtime update, which isn't supported by the kernel. After TDX is enabled, nothing needs to be done when KVM disables hardware virtualization, e.g., when offlining CPU, or during suspend/resume. TDX host core-kernel code internally tracks TDX status and can handle "multiple enabling" scenario. Similar to KVM_AMD_SEV, add a new KVM_INTEL_TDX Kconfig to guide KVM TDX code. Make it depend on INTEL_TDX_HOST but not replace INTEL_TDX_HOST because in the longer term there's a use case that requires making SEAMCALLs w/o KVM as mentioned by Dan [1]. 
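Opting in would then look roughly like this; the parameter names come from module_param_named() and the enable_virt_at_load check in the diff below, everything else is hypothetical:

  # TDX stays disabled unless explicitly requested:
  modprobe kvm enable_virt_at_load=1	# this is already the default
  modprobe kvm_intel tdx=1
  # or equivalently on the kernel command line:
  #   kvm.enable_virt_at_load=1 kvm_intel.tdx=1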
Link: https://lore.kernel.org/6723fc2070a96_60c3294dc@dwillia2-mobl3.amr.corp.intel.com.notmuch/ [1] Signed-off-by: Kai Huang Message-ID: <162f9dee05c729203b9ad6688db1ca2960b4b502.1731664295.git.kai.huang@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 3 + arch/x86/include/asm/tdx_global_metadata.h | 44 ++++++++ arch/x86/kvm/Kconfig | 10 ++ arch/x86/kvm/Makefile | 1 + arch/x86/kvm/vmx/main.c | 9 ++ arch/x86/kvm/vmx/tdx.c | 162 ++++++++++++++++++++++++++++ arch/x86/kvm/vmx/tdx.h | 13 +++ arch/x86/virt/vmx/tdx/tdx.c | 14 +++ arch/x86/virt/vmx/tdx/tdx.h | 1 - arch/x86/virt/vmx/tdx/tdx_global_metadata.h | 44 -------- include/linux/kvm_host.h | 1 + virt/kvm/kvm_main.c | 3 +- 12 files changed, 259 insertions(+), 46 deletions(-) create mode 100644 arch/x86/include/asm/tdx_global_metadata.h create mode 100644 arch/x86/kvm/vmx/tdx.c create mode 100644 arch/x86/kvm/vmx/tdx.h delete mode 100644 arch/x86/virt/vmx/tdx/tdx_global_metadata.h (limited to 'include/linux') diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 75a91869453d..1a8c687603aa 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -34,6 +34,7 @@ #ifndef __ASSEMBLY__ #include +#include #include /* @@ -121,6 +122,7 @@ static inline u64 sc_retry(sc_func_t func, u64 fn, int tdx_cpu_enable(void); int tdx_enable(void); const char *tdx_dump_mce_info(struct mce *m); +const struct tdx_sys_info *tdx_get_sysinfo(void); int tdx_guest_keyid_alloc(void); void tdx_guest_keyid_free(unsigned int keyid); @@ -179,6 +181,7 @@ static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } static inline int tdx_enable(void) { return -ENODEV; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } +static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/tdx_global_metadata.h b/arch/x86/include/asm/tdx_global_metadata.h new file mode 100644 index 000000000000..060a2ad744bf --- /dev/null +++ b/arch/x86/include/asm/tdx_global_metadata.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Automatically generated TDX global metadata structures. */ +#ifndef _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H +#define _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H + +#include + +struct tdx_sys_info_features { + u64 tdx_features0; +}; + +struct tdx_sys_info_tdmr { + u16 max_tdmrs; + u16 max_reserved_per_tdmr; + u16 pamt_4k_entry_size; + u16 pamt_2m_entry_size; + u16 pamt_1g_entry_size; +}; + +struct tdx_sys_info_td_ctrl { + u16 tdr_base_size; + u16 tdcs_base_size; + u16 tdvps_base_size; +}; + +struct tdx_sys_info_td_conf { + u64 attributes_fixed0; + u64 attributes_fixed1; + u64 xfam_fixed0; + u64 xfam_fixed1; + u16 num_cpuid_config; + u16 max_vcpus_per_td; + u64 cpuid_config_leaves[128]; + u64 cpuid_config_values[128][2]; +}; + +struct tdx_sys_info { + struct tdx_sys_info_features features; + struct tdx_sys_info_tdmr tdmr; + struct tdx_sys_info_td_ctrl td_ctrl; + struct tdx_sys_info_td_conf td_conf; +}; + +#endif diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ea2c4f21c1ca..fe8cbee6f614 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -128,6 +128,16 @@ config X86_SGX_KVM If unsure, say N. 
+config KVM_INTEL_TDX + bool "Intel Trust Domain Extensions (TDX) support" + default y + depends on INTEL_TDX_HOST + help + Provides support for launching Intel Trust Domain Extensions (TDX) + confidential VMs on Intel processors. + + If unsure, say N. + config KVM_AMD tristate "KVM for AMD processors support" depends on KVM && (CPU_SUP_AMD || CPU_SUP_HYGON) diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index f9dddb8cb466..a5d362c7b504 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -20,6 +20,7 @@ kvm-intel-y += vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o \ kvm-intel-$(CONFIG_X86_SGX_KVM) += vmx/sgx.o kvm-intel-$(CONFIG_KVM_HYPERV) += vmx/hyperv.o vmx/hyperv_evmcs.o +kvm-intel-$(CONFIG_KVM_INTEL_TDX) += vmx/tdx.o kvm-amd-y += svm/svm.o svm/vmenter.o svm/pmu.o svm/nested.o svm/avic.o diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c index 54cf95cb8d42..97c453187cc1 100644 --- a/arch/x86/kvm/vmx/main.c +++ b/arch/x86/kvm/vmx/main.c @@ -6,6 +6,7 @@ #include "nested.h" #include "pmu.h" #include "posted_intr.h" +#include "tdx.h" #define VMX_REQUIRED_APICV_INHIBITS \ (BIT(APICV_INHIBIT_REASON_DISABLED) | \ @@ -172,6 +173,7 @@ struct kvm_x86_init_ops vt_init_ops __initdata = { static void __exit vt_exit(void) { kvm_exit(); + tdx_cleanup(); vmx_exit(); } module_exit(vt_exit); @@ -184,6 +186,11 @@ static int __init vt_init(void) if (r) return r; + /* tdx_init() has been taken */ + r = tdx_bringup(); + if (r) + goto err_tdx_bringup; + /* * Common KVM initialization _must_ come last, after this, /dev/kvm is * exposed to userspace! @@ -196,6 +203,8 @@ static int __init vt_init(void) return 0; err_kvm_init: + tdx_cleanup(); +err_tdx_bringup: vmx_exit(); return r; } diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c new file mode 100644 index 000000000000..3c089ed3b843 --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.c @@ -0,0 +1,162 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include "capabilities.h" +#include "tdx.h" + +#undef pr_fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +static bool enable_tdx __ro_after_init; +module_param_named(tdx, enable_tdx, bool, 0444); + +static enum cpuhp_state tdx_cpuhp_state; + +static int tdx_online_cpu(unsigned int cpu) +{ + unsigned long flags; + int r; + + /* Sanity check CPU is already in post-VMXON */ + WARN_ON_ONCE(!(cr4_read_shadow() & X86_CR4_VMXE)); + + local_irq_save(flags); + r = tdx_cpu_enable(); + local_irq_restore(flags); + + return r; +} + +static void __do_tdx_cleanup(void) +{ + /* + * Once TDX module is initialized, it cannot be disabled and + * re-initialized again w/o runtime update (which isn't + * supported by kernel). Only need to remove the cpuhp here. + * The TDX host core code tracks TDX status and can handle + * 'multiple enabling' scenario. + */ + WARN_ON_ONCE(!tdx_cpuhp_state); + cpuhp_remove_state_nocalls_cpuslocked(tdx_cpuhp_state); + tdx_cpuhp_state = 0; +} + +static void __tdx_cleanup(void) +{ + cpus_read_lock(); + __do_tdx_cleanup(); + cpus_read_unlock(); +} + +static int __init __do_tdx_bringup(void) +{ + int r; + + /* + * TDX-specific cpuhp callback to call tdx_cpu_enable() on all + * online CPUs before calling tdx_enable(), and on any new + * going-online CPU to make sure it is ready for TDX guest. 
+ */ + r = cpuhp_setup_state_cpuslocked(CPUHP_AP_ONLINE_DYN, + "kvm/cpu/tdx:online", + tdx_online_cpu, NULL); + if (r < 0) + return r; + + tdx_cpuhp_state = r; + + r = tdx_enable(); + if (r) + __do_tdx_cleanup(); + + return r; +} + +static int __init __tdx_bringup(void) +{ + int r; + + /* + * Enabling TDX requires enabling hardware virtualization first, + * as making SEAMCALLs requires CPU being in post-VMXON state. + */ + r = kvm_enable_virtualization(); + if (r) + return r; + + cpus_read_lock(); + r = __do_tdx_bringup(); + cpus_read_unlock(); + + if (r) + goto tdx_bringup_err; + + /* + * Leave hardware virtualization enabled after TDX is enabled + * successfully. TDX CPU hotplug depends on this. + */ + return 0; +tdx_bringup_err: + kvm_disable_virtualization(); + return r; +} + +void tdx_cleanup(void) +{ + if (enable_tdx) { + __tdx_cleanup(); + kvm_disable_virtualization(); + } +} + +int __init tdx_bringup(void) +{ + int r; + + if (!enable_tdx) + return 0; + + if (!cpu_feature_enabled(X86_FEATURE_TDX_HOST_PLATFORM)) { + pr_err("tdx: no TDX private KeyIDs available\n"); + goto success_disable_tdx; + } + + if (!enable_virt_at_load) { + pr_err("tdx: tdx requires kvm.enable_virt_at_load=1\n"); + goto success_disable_tdx; + } + + /* + * Ideally KVM should probe whether TDX module has been loaded + * first and then try to bring it up. But TDX needs to use SEAMCALL + * to probe whether the module is loaded (there is no CPUID or MSR + * for that), and making SEAMCALL requires enabling virtualization + * first, just like the rest steps of bringing up TDX module. + * + * So, for simplicity do everything in __tdx_bringup(); the first + * SEAMCALL will return -ENODEV when the module is not loaded. The + * only complication is having to make sure that initialization + * SEAMCALLs don't return TDX_SEAMCALL_VMFAILINVALID in other + * cases. + */ + r = __tdx_bringup(); + if (r) { + /* + * Disable TDX only but don't fail to load module if + * the TDX module could not be loaded. No need to print + * message saying "module is not loaded" because it was + * printed when the first SEAMCALL failed. 
+ */ + if (r == -ENODEV) + goto success_disable_tdx; + + enable_tdx = 0; + } + + return r; + +success_disable_tdx: + enable_tdx = 0; + return 0; +} diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h new file mode 100644 index 000000000000..9d4a0e8265bf --- /dev/null +++ b/arch/x86/kvm/vmx/tdx.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __KVM_X86_VMX_TDX_H +#define __KVM_X86_VMX_TDX_H + +#ifdef CONFIG_KVM_INTEL_TDX +int tdx_bringup(void); +void tdx_cleanup(void); +#else +static inline int tdx_bringup(void) { return 0; } +static inline void tdx_cleanup(void) {} +#endif + +#endif diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 0122051af6b3..9f0c482c1a03 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1462,6 +1462,20 @@ void __init tdx_init(void) check_tdx_erratum(); } +const struct tdx_sys_info *tdx_get_sysinfo(void) +{ + const struct tdx_sys_info *p = NULL; + + /* Make sure all fields in @tdx_sysinfo have been populated */ + mutex_lock(&tdx_module_lock); + if (tdx_module_status == TDX_MODULE_INITIALIZED) + p = (const struct tdx_sys_info *)&tdx_sysinfo; + mutex_unlock(&tdx_module_lock); + + return p; +} +EXPORT_SYMBOL_GPL(tdx_get_sysinfo); + int tdx_guest_keyid_alloc(void) { return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start, diff --git a/arch/x86/virt/vmx/tdx/tdx.h b/arch/x86/virt/vmx/tdx/tdx.h index 62cb7832c42d..da384387d4eb 100644 --- a/arch/x86/virt/vmx/tdx/tdx.h +++ b/arch/x86/virt/vmx/tdx/tdx.h @@ -3,7 +3,6 @@ #define _X86_VIRT_TDX_H #include -#include "tdx_global_metadata.h" /* * This file contains both macros and data structures defined by the TDX diff --git a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h b/arch/x86/virt/vmx/tdx/tdx_global_metadata.h deleted file mode 100644 index 060a2ad744bf..000000000000 --- a/arch/x86/virt/vmx/tdx/tdx_global_metadata.h +++ /dev/null @@ -1,44 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Automatically generated TDX global metadata structures. 
*/ -#ifndef _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H -#define _X86_VIRT_TDX_AUTO_GENERATED_TDX_GLOBAL_METADATA_H - -#include - -struct tdx_sys_info_features { - u64 tdx_features0; -}; - -struct tdx_sys_info_tdmr { - u16 max_tdmrs; - u16 max_reserved_per_tdmr; - u16 pamt_4k_entry_size; - u16 pamt_2m_entry_size; - u16 pamt_1g_entry_size; -}; - -struct tdx_sys_info_td_ctrl { - u16 tdr_base_size; - u16 tdcs_base_size; - u16 tdvps_base_size; -}; - -struct tdx_sys_info_td_conf { - u64 attributes_fixed0; - u64 attributes_fixed1; - u64 xfam_fixed0; - u64 xfam_fixed1; - u16 num_cpuid_config; - u16 max_vcpus_per_td; - u64 cpuid_config_leaves[128]; - u64 cpuid_config_values[128][2]; -}; - -struct tdx_sys_info { - struct tdx_sys_info_features features; - struct tdx_sys_info_tdmr tdmr; - struct tdx_sys_info_td_ctrl td_ctrl; - struct tdx_sys_info_td_conf td_conf; -}; - -#endif diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 1e75fa114f34..3bfe3140f444 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -2284,6 +2284,7 @@ static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) } #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING +extern bool enable_virt_at_load; extern bool kvm_rebooting; #endif diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 6e40383fbe47..622b5a99078a 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -5464,8 +5464,9 @@ static struct miscdevice kvm_dev = { }; #ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING -static bool enable_virt_at_load = true; +bool enable_virt_at_load = true; module_param(enable_virt_at_load, bool, 0444); +EXPORT_SYMBOL_GPL(enable_virt_at_load); __visible bool kvm_rebooting; EXPORT_SYMBOL_GPL(kvm_rebooting); -- cgit v1.2.3 From 7c035bea94074b19ed560a4f23a76c5a6c8e594f Mon Sep 17 00:00:00 2001 From: Zhiming Hu Date: Wed, 19 Feb 2025 09:02:51 -0500 Subject: KVM: TDX: Register TDX host key IDs to cgroup misc controller TDX host key IDs (HKID) are limit resources in a machine, and the misc cgroup lets the machine owner track their usage and limits the possibility of abusing them outside the owner's control. The cgroup v2 miscellaneous subsystem was introduced to control the resource of AMD SEV & SEV-ES ASIDs. Likewise introduce HKIDs as a misc resource. 
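For illustration, once the capacity is registered the new resource is managed through the usual cgroup v2 misc interface; the values below are hypothetical and only the "tdx" resource name comes from this patch:

  $ cat /sys/fs/cgroup/misc.capacity
  sev 509
  tdx 63
  $ echo "tdx 10" > /sys/fs/cgroup/tdx-guests/misc.max
  $ cat /sys/fs/cgroup/tdx-guests/misc.current
  tdx 2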
Signed-off-by: Zhiming Hu Signed-off-by: Isaku Yamahata Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/tdx.h | 2 ++ arch/x86/kvm/vmx/tdx.c | 14 ++++++++++++++ arch/x86/kvm/vmx/tdx.h | 1 + arch/x86/virt/vmx/tdx/tdx.c | 6 ++++++ include/linux/misc_cgroup.h | 4 ++++ kernel/cgroup/misc.c | 4 ++++ 6 files changed, 31 insertions(+) (limited to 'include/linux') diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h index 1a8c687603aa..2879fc518a32 100644 --- a/arch/x86/include/asm/tdx.h +++ b/arch/x86/include/asm/tdx.h @@ -125,6 +125,7 @@ const char *tdx_dump_mce_info(struct mce *m); const struct tdx_sys_info *tdx_get_sysinfo(void); int tdx_guest_keyid_alloc(void); +u32 tdx_get_nr_guest_keyids(void); void tdx_guest_keyid_free(unsigned int keyid); struct tdx_td { @@ -180,6 +181,7 @@ u64 tdh_phymem_page_wbinvd_tdr(struct tdx_td *td); static inline void tdx_init(void) { } static inline int tdx_cpu_enable(void) { return -ENODEV; } static inline int tdx_enable(void) { return -ENODEV; } +static inline u32 tdx_get_nr_guest_keyids(void) { return 0; } static inline const char *tdx_dump_mce_info(struct mce *m) { return NULL; } static inline const struct tdx_sys_info *tdx_get_sysinfo(void) { return NULL; } #endif /* CONFIG_INTEL_TDX_HOST */ diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index 2ac925ecccd5..01166cb8f2e6 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include "capabilities.h" #include "mmu.h" @@ -140,6 +141,9 @@ static inline void tdx_hkid_free(struct kvm_tdx *kvm_tdx) tdx_guest_keyid_free(kvm_tdx->hkid); kvm_tdx->hkid = -1; atomic_dec(&nr_configured_hkid); + misc_cg_uncharge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + put_misc_cg(kvm_tdx->misc_cg); + kvm_tdx->misc_cg = NULL; } static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx) @@ -675,6 +679,10 @@ static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params, if (ret < 0) return ret; kvm_tdx->hkid = ret; + kvm_tdx->misc_cg = get_current_misc_cg(); + ret = misc_cg_try_charge(MISC_CG_RES_TDX, kvm_tdx->misc_cg, 1); + if (ret) + goto free_hkid; ret = -ENOMEM; @@ -1459,6 +1467,11 @@ static int __init __tdx_bringup(void) goto get_sysinfo_err; } + if (misc_cg_set_capacity(MISC_CG_RES_TDX, tdx_get_nr_guest_keyids())) { + r = -EINVAL; + goto get_sysinfo_err; + } + /* * Leave hardware virtualization enabled after TDX is enabled * successfully. TDX CPU hotplug depends on this. 
@@ -1475,6 +1488,7 @@ tdx_bringup_err: void tdx_cleanup(void) { if (enable_tdx) { + misc_cg_set_capacity(MISC_CG_RES_TDX, 0); __tdx_cleanup(); kvm_disable_virtualization(); } diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h index 6ec7ac1d91e3..0559126c8f9d 100644 --- a/arch/x86/kvm/vmx/tdx.h +++ b/arch/x86/kvm/vmx/tdx.h @@ -21,6 +21,7 @@ enum kvm_tdx_state { struct kvm_tdx { struct kvm kvm; + struct misc_cg *misc_cg; int hkid; enum kvm_tdx_state state; diff --git a/arch/x86/virt/vmx/tdx/tdx.c b/arch/x86/virt/vmx/tdx/tdx.c index 9f0c482c1a03..3a272e9ff2ca 100644 --- a/arch/x86/virt/vmx/tdx/tdx.c +++ b/arch/x86/virt/vmx/tdx/tdx.c @@ -1476,6 +1476,12 @@ const struct tdx_sys_info *tdx_get_sysinfo(void) } EXPORT_SYMBOL_GPL(tdx_get_sysinfo); +u32 tdx_get_nr_guest_keyids(void) +{ + return tdx_nr_guest_keyids; +} +EXPORT_SYMBOL_GPL(tdx_get_nr_guest_keyids); + int tdx_guest_keyid_alloc(void) { return ida_alloc_range(&tdx_guest_keyid_pool, tdx_guest_keyid_start, diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h index 49eef10c8e59..8c0e4f4d71be 100644 --- a/include/linux/misc_cgroup.h +++ b/include/linux/misc_cgroup.h @@ -17,6 +17,10 @@ enum misc_res_type { MISC_CG_RES_SEV, /** @MISC_CG_RES_SEV_ES: AMD SEV-ES ASIDs resource */ MISC_CG_RES_SEV_ES, +#endif +#ifdef CONFIG_INTEL_TDX_HOST + /* Intel TDX HKIDs resource */ + MISC_CG_RES_TDX, #endif /** @MISC_CG_RES_TYPES: count of enum misc_res_type constants */ MISC_CG_RES_TYPES diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c index 0e26068995a6..264aad22c967 100644 --- a/kernel/cgroup/misc.c +++ b/kernel/cgroup/misc.c @@ -24,6 +24,10 @@ static const char *const misc_res_name[] = { /* AMD SEV-ES ASIDs resource */ "sev_es", #endif +#ifdef CONFIG_INTEL_TDX_HOST + /* Intel TDX HKIDs resource */ + "tdx", +#endif }; /* Root misc cgroup */ -- cgit v1.2.3 From c4a92f12cf35b83ce81757f6e5e8eb6223b87388 Mon Sep 17 00:00:00 2001 From: Yan Zhao Date: Mon, 13 Jan 2025 11:08:41 +0800 Subject: KVM: Add parameter "kvm" to kvm_cpu_dirty_log_size() and its callers Add a parameter "kvm" to kvm_cpu_dirty_log_size() and down to its callers: kvm_dirty_ring_get_rsvd_entries(), kvm_dirty_ring_alloc(). This is a preparation to make cpu_dirty_log_size a per-VM value rather than a system-wide value. No function changes expected. Signed-off-by: Yan Zhao Signed-off-by: Paolo Bonzini --- arch/x86/kvm/mmu/mmu.c | 2 +- include/linux/kvm_dirty_ring.h | 11 ++++++----- virt/kvm/dirty_ring.c | 11 ++++++----- virt/kvm/kvm_main.c | 4 ++-- 4 files changed, 15 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 3c208291154c..a0c736c11091 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -1311,7 +1311,7 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -int kvm_cpu_dirty_log_size(void) +int kvm_cpu_dirty_log_size(struct kvm *kvm) { return kvm_x86_ops.cpu_dirty_log_size; } diff --git a/include/linux/kvm_dirty_ring.h b/include/linux/kvm_dirty_ring.h index 4862c98d80d3..da4d9b5f58f1 100644 --- a/include/linux/kvm_dirty_ring.h +++ b/include/linux/kvm_dirty_ring.h @@ -32,7 +32,7 @@ struct kvm_dirty_ring { * If CONFIG_HAVE_HVM_DIRTY_RING not defined, kvm_dirty_ring.o should * not be included as well, so define these nop functions for the arch. 
*/ -static inline u32 kvm_dirty_ring_get_rsvd_entries(void) +static inline u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm) { return 0; } @@ -42,7 +42,7 @@ static inline bool kvm_use_dirty_bitmap(struct kvm *kvm) return true; } -static inline int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, +static inline int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, int index, u32 size) { return 0; @@ -71,11 +71,12 @@ static inline void kvm_dirty_ring_free(struct kvm_dirty_ring *ring) #else /* CONFIG_HAVE_KVM_DIRTY_RING */ -int kvm_cpu_dirty_log_size(void); +int kvm_cpu_dirty_log_size(struct kvm *kvm); bool kvm_use_dirty_bitmap(struct kvm *kvm); bool kvm_arch_allow_write_without_running_vcpu(struct kvm *kvm); -u32 kvm_dirty_ring_get_rsvd_entries(void); -int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size); +u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm); +int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, + int index, u32 size); /* * called with kvm->slots_lock held, returns the number of diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 7bc74969a819..d14ffc7513ee 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -11,14 +11,14 @@ #include #include "kvm_mm.h" -int __weak kvm_cpu_dirty_log_size(void) +int __weak kvm_cpu_dirty_log_size(struct kvm *kvm) { return 0; } -u32 kvm_dirty_ring_get_rsvd_entries(void) +u32 kvm_dirty_ring_get_rsvd_entries(struct kvm *kvm) { - return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(); + return KVM_DIRTY_RING_RSVD_ENTRIES + kvm_cpu_dirty_log_size(kvm); } bool kvm_use_dirty_bitmap(struct kvm *kvm) @@ -74,14 +74,15 @@ static void kvm_reset_dirty_gfn(struct kvm *kvm, u32 slot, u64 offset, u64 mask) KVM_MMU_UNLOCK(kvm); } -int kvm_dirty_ring_alloc(struct kvm_dirty_ring *ring, int index, u32 size) +int kvm_dirty_ring_alloc(struct kvm *kvm, struct kvm_dirty_ring *ring, + int index, u32 size) { ring->dirty_gfns = vzalloc(size); if (!ring->dirty_gfns) return -ENOMEM; ring->size = size / sizeof(struct kvm_dirty_gfn); - ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(); + ring->soft_limit = ring->size - kvm_dirty_ring_get_rsvd_entries(kvm); ring->dirty_index = 0; ring->reset_index = 0; ring->index = index; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 622b5a99078a..549537da3062 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4108,7 +4108,7 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, unsigned long id) goto vcpu_free_run_page; if (kvm->dirty_ring_size) { - r = kvm_dirty_ring_alloc(&vcpu->dirty_ring, + r = kvm_dirty_ring_alloc(kvm, &vcpu->dirty_ring, id, kvm->dirty_ring_size); if (r) goto arch_vcpu_destroy; @@ -4847,7 +4847,7 @@ static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size) return -EINVAL; /* Should be bigger to keep the reserved entries, or a page */ - if (size < kvm_dirty_ring_get_rsvd_entries() * + if (size < kvm_dirty_ring_get_rsvd_entries(kvm) * sizeof(struct kvm_dirty_gfn) || size < PAGE_SIZE) return -EINVAL; -- cgit v1.2.3 From 44428e4936022a7a31743017849b167e64f33a32 Mon Sep 17 00:00:00 2001 From: Binbin Wu Date: Sat, 22 Feb 2025 09:42:18 +0800 Subject: KVM: x86: Move pv_unhalted check out of kvm_vcpu_has_events() Move pv_unhalted check out of kvm_vcpu_has_events(), check pv_unhalted explicitly when handling PV unhalt and expose kvm_vcpu_has_events(). 
kvm_vcpu_has_events() returns true if pv_unhalted is set, and pv_unhalted is only cleared on transitions to KVM_MP_STATE_RUNNABLE. If the guest initiates a spurious wakeup, pv_unhalted could be left set in perpetuity. Currently, this is not problematic because kvm_vcpu_has_events() is only called when handling PV unhalt. However, if kvm_vcpu_has_events() is used for other purposes in the future, it could return the unexpected results. Export kvm_vcpu_has_events() for its usage in broader contexts. Suggested-by: Sean Christopherson Signed-off-by: Binbin Wu Message-ID: <20250222014225.897298-3-binbin.wu@linux.intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 11 +++++------ include/linux/kvm_host.h | 1 + 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a792207a0dd1..3cae210ffaa4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11143,7 +11143,7 @@ static bool kvm_vcpu_running(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted); } -static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) { if (!list_empty_careful(&vcpu->async_pf.done)) return true; @@ -11152,9 +11152,6 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) kvm_apic_init_sipi_allowed(vcpu)) return true; - if (vcpu->arch.pv.pv_unhalted) - return true; - if (kvm_is_exception_pending(vcpu)) return true; @@ -11192,10 +11189,12 @@ static bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) return false; } +EXPORT_SYMBOL_GPL(kvm_vcpu_has_events); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { - return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu); + return kvm_vcpu_running(vcpu) || vcpu->arch.pv.pv_unhalted || + kvm_vcpu_has_events(vcpu); } /* Called within kvm->srcu read side. */ @@ -11331,7 +11330,7 @@ static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) */ ++vcpu->stat.halt_exits; if (lapic_in_kernel(vcpu)) { - if (kvm_vcpu_has_events(vcpu)) + if (kvm_vcpu_has_events(vcpu) || vcpu->arch.pv.pv_unhalted) vcpu->arch.pv.pv_unhalted = false; else vcpu->arch.mp_state = state; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 3bfe3140f444..ed1968f6f841 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1609,6 +1609,7 @@ void kvm_arch_disable_virtualization(void); int kvm_arch_enable_virtualization_cpu(void); void kvm_arch_disable_virtualization_cpu(void); #endif +bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); -- cgit v1.2.3 From 0b28b7080ef5a323c3afa3860c3d45d627629830 Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Tue, 4 Feb 2025 14:14:12 +0100 Subject: platform: cznic: Add keyctl helpers for Turris platform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some Turris devices support signing messages with a per-device unique asymmetric key that was created on the device at manufacture time. Add helper module that helps to expose this ability via the keyctl() syscall. A device-specific driver can register a signing key by calling devm_turris_signing_key_create(). Both the `.turris-signing-keys` keyring and the signing key are created with only the VIEW, READ and SEARCH permissions for userspace - it is impossible to link / unlink / move them, set their attributes, or unlink the keyring from userspace. 
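A device-specific driver would then register its key roughly as sketched below, using only the interface from the header added in this patch; the callbacks, key sizes and description string are hypothetical placeholders:

#include <linux/device.h>
#include <linux/key.h>
#include <linux/turris-signing-key.h>

/* Hypothetical callbacks backed by the device's firmware/mailbox. */
static const void *my_get_public_key(const struct key *key)
{
	struct device *dev = turris_signing_key_get_dev(key);

	return my_cached_public_key(dev);		/* hypothetical helper */
}

static int my_sign(const struct key *key, const void *msg, void *signature)
{
	struct device *dev = turris_signing_key_get_dev(key);

	return my_firmware_sign(dev, msg, signature);	/* hypothetical helper */
}

/* Sizes are illustrative only; they must match what the hardware produces. */
static const struct turris_signing_key_subtype my_subtype = {
	.key_size		= 521,
	.data_size		= 64,
	.sig_size		= 136,
	.public_key_size	= 67,
	.hash_algo		= "sha512",
	.get_public_key		= my_get_public_key,
	.sign			= my_sign,
};

static int my_probe(struct device *dev)
{
	/* ...device setup... */
	return devm_turris_signing_key_create(dev, &my_subtype,
					      "My Turris signing key");
}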
Signed-off-by: Marek Behún Signed-off-by: Arnd Bergmann --- MAINTAINERS | 1 + drivers/platform/cznic/Kconfig | 5 + drivers/platform/cznic/Makefile | 2 + drivers/platform/cznic/turris-signing-key.c | 192 ++++++++++++++++++++++++++++ include/linux/turris-signing-key.h | 33 +++++ 5 files changed, 233 insertions(+) create mode 100644 drivers/platform/cznic/turris-signing-key.c create mode 100644 include/linux/turris-signing-key.h (limited to 'include/linux') diff --git a/MAINTAINERS b/MAINTAINERS index 6182a8a0e1c7..6a90c12c22ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2441,6 +2441,7 @@ F: include/dt-bindings/bus/moxtet.h F: include/linux/armada-37xx-rwtm-mailbox.h F: include/linux/moxtet.h F: include/linux/turris-omnia-mcu-interface.h +F: include/linux/turris-signing-key.h ARM/FARADAY FA526 PORT M: Hans Ulli Kroll diff --git a/drivers/platform/cznic/Kconfig b/drivers/platform/cznic/Kconfig index 49c383eb6785..2b4c91ede6d8 100644 --- a/drivers/platform/cznic/Kconfig +++ b/drivers/platform/cznic/Kconfig @@ -77,4 +77,9 @@ config TURRIS_OMNIA_MCU_TRNG endif # TURRIS_OMNIA_MCU +config TURRIS_SIGNING_KEY + tristate + depends on KEYS + depends on ASYMMETRIC_KEY_TYPE + endif # CZNIC_PLATFORMS diff --git a/drivers/platform/cznic/Makefile b/drivers/platform/cznic/Makefile index ce6d997f34d6..2b8606a9486b 100644 --- a/drivers/platform/cznic/Makefile +++ b/drivers/platform/cznic/Makefile @@ -6,3 +6,5 @@ turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_GPIO) += turris-omnia-mcu-gpio.o turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_SYSOFF_WAKEUP) += turris-omnia-mcu-sys-off-wakeup.o turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_TRNG) += turris-omnia-mcu-trng.o turris-omnia-mcu-$(CONFIG_TURRIS_OMNIA_MCU_WATCHDOG) += turris-omnia-mcu-watchdog.o + +obj-$(CONFIG_TURRIS_SIGNING_KEY) += turris-signing-key.o diff --git a/drivers/platform/cznic/turris-signing-key.c b/drivers/platform/cznic/turris-signing-key.c new file mode 100644 index 000000000000..3b12e5245fb7 --- /dev/null +++ b/drivers/platform/cznic/turris-signing-key.c @@ -0,0 +1,192 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Some of CZ.NIC's Turris devices support signing messages with a per-device unique asymmetric + * cryptographic key that was burned into the device at manufacture. + * + * This helper module exposes this message signing ability via the keyctl() syscall. Upon load, it + * creates the `.turris-signing-keys` keyring. A device-specific driver then has to create a signing + * key by calling devm_turris_signing_key_create(). 
+ * + * 2025 by Marek Behún + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static int turris_signing_key_instantiate(struct key *, struct key_preparsed_payload *) +{ + return 0; +} + +static void turris_signing_key_describe(const struct key *key, struct seq_file *m) +{ + const struct turris_signing_key_subtype *subtype = dereference_key_rcu(key); + + if (!subtype) + return; + + seq_printf(m, "%s: %*phN", key->description, subtype->public_key_size, + subtype->get_public_key(key)); +} + +static long turris_signing_key_read(const struct key *key, char *buffer, size_t buflen) +{ + const struct turris_signing_key_subtype *subtype = dereference_key_rcu(key); + + if (!subtype) + return -EIO; + + if (buffer) { + if (buflen > subtype->public_key_size) + buflen = subtype->public_key_size; + + memcpy(buffer, subtype->get_public_key(key), subtype->public_key_size); + } + + return subtype->public_key_size; +} + +static bool turris_signing_key_asym_valid_params(const struct turris_signing_key_subtype *subtype, + const struct kernel_pkey_params *params) +{ + if (params->encoding && strcmp(params->encoding, "raw")) + return false; + + if (params->hash_algo && strcmp(params->hash_algo, subtype->hash_algo)) + return false; + + return true; +} + +static int turris_signing_key_asym_query(const struct kernel_pkey_params *params, + struct kernel_pkey_query *info) +{ + const struct turris_signing_key_subtype *subtype = dereference_key_rcu(params->key); + + if (!subtype) + return -EIO; + + if (!turris_signing_key_asym_valid_params(subtype, params)) + return -EINVAL; + + info->supported_ops = KEYCTL_SUPPORTS_SIGN; + info->key_size = subtype->key_size; + info->max_data_size = subtype->data_size; + info->max_sig_size = subtype->sig_size; + info->max_enc_size = 0; + info->max_dec_size = 0; + + return 0; +} + +static int turris_signing_key_asym_eds_op(struct kernel_pkey_params *params, + const void *in, void *out) +{ + const struct turris_signing_key_subtype *subtype = dereference_key_rcu(params->key); + int err; + + if (!subtype) + return -EIO; + + if (!turris_signing_key_asym_valid_params(subtype, params)) + return -EINVAL; + + if (params->op != kernel_pkey_sign) + return -EOPNOTSUPP; + + if (params->in_len != subtype->data_size || params->out_len != subtype->sig_size) + return -EINVAL; + + err = subtype->sign(params->key, in, out); + if (err) + return err; + + return subtype->sig_size; +} + +static struct key_type turris_signing_key_type = { + .name = "turris-signing-key", + .instantiate = turris_signing_key_instantiate, + .describe = turris_signing_key_describe, + .read = turris_signing_key_read, + .asym_query = turris_signing_key_asym_query, + .asym_eds_op = turris_signing_key_asym_eds_op, +}; + +static struct key *turris_signing_keyring; + +static void turris_signing_key_release(void *key) +{ + key_unlink(turris_signing_keyring, key); + key_put(key); +} + +int +devm_turris_signing_key_create(struct device *dev, const struct turris_signing_key_subtype *subtype, + const char *desc) +{ + struct key *key; + key_ref_t kref; + + kref = key_create(make_key_ref(turris_signing_keyring, true), + turris_signing_key_type.name, desc, NULL, 0, + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | KEY_USR_READ | + KEY_USR_SEARCH, + KEY_ALLOC_BUILT_IN | KEY_ALLOC_SET_KEEP | KEY_ALLOC_NOT_IN_QUOTA); + if (IS_ERR(kref)) + return PTR_ERR(kref); + + key = key_ref_to_ptr(kref); + key->payload.data[1] = dev; + rcu_assign_keypointer(key, subtype); + + return 
devm_add_action_or_reset(dev, turris_signing_key_release, key); +} +EXPORT_SYMBOL_GPL(devm_turris_signing_key_create); + +static int turris_signing_key_init(void) +{ + int err; + + err = register_key_type(&turris_signing_key_type); + if (err) + return err; + + turris_signing_keyring = keyring_alloc(".turris-signing-keys", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | + KEY_USR_READ | KEY_USR_SEARCH, + KEY_ALLOC_BUILT_IN | KEY_ALLOC_SET_KEEP | + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(turris_signing_keyring)) { + pr_err("Cannot allocate Turris keyring\n"); + + unregister_key_type(&turris_signing_key_type); + + return PTR_ERR(turris_signing_keyring); + } + + return 0; +} +module_init(turris_signing_key_init); + +static void turris_signing_key_exit(void) +{ + key_put(turris_signing_keyring); + unregister_key_type(&turris_signing_key_type); +} +module_exit(turris_signing_key_exit); + +MODULE_AUTHOR("Marek Behun "); +MODULE_DESCRIPTION("CZ.NIC's Turris signing key helper"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/turris-signing-key.h b/include/linux/turris-signing-key.h new file mode 100644 index 000000000000..032ca8cbf636 --- /dev/null +++ b/include/linux/turris-signing-key.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * 2025 by Marek Behún + */ + +#ifndef __TURRIS_SIGNING_KEY_H +#define __TURRIS_SIGNING_KEY_H + +#include +#include + +struct device; + +struct turris_signing_key_subtype { + u16 key_size; + u8 data_size; + u8 sig_size; + u8 public_key_size; + const char *hash_algo; + const void *(*get_public_key)(const struct key *key); + int (*sign)(const struct key *key, const void *msg, void *signature); +}; + +static inline struct device *turris_signing_key_get_dev(const struct key *key) +{ + return key->payload.data[1]; +} + +int +devm_turris_signing_key_create(struct device *dev, const struct turris_signing_key_subtype *subtype, + const char *desc); + +#endif /* __TURRIS_SIGNING_KEY_H */ -- cgit v1.2.3 From 78a0323506f01e8017a5826cd7e91951c13184fa Mon Sep 17 00:00:00 2001 From: Sohil Mehta Date: Thu, 27 Mar 2025 23:46:22 +0000 Subject: x86/nmi: Consolidate NMI panic variables Commit: c305a4e98378 ("x86: Move sysctls into arch/x86") recently moved the sysctl handling of panic_on_unrecovered_nmi and panic_on_io_nmi to x86-specific code. These variables no longer need to be declared in the generic header file. Relocate the variable definitions and declarations closer to where they are used. This makes all the NMI panic options consistent and easier to track. [ mingo: Fixed up the SHA1 of the commit reference. 
] Signed-off-by: Sohil Mehta Signed-off-by: Ingo Molnar Reviewed-by: Kai Huang Acked-by: Peter Zijlstra (Intel) Reviewed-by: Nikolay Borisov Cc: Joel Granados Link: https://lore.kernel.org/r/20250327234629.3953536-3-sohil.mehta@intel.com --- arch/x86/include/asm/nmi.h | 2 ++ arch/x86/kernel/dumpstack.c | 2 -- arch/x86/kernel/nmi.c | 3 +++ include/linux/panic.h | 2 -- 4 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 9cf96cce02fc..f85aea7bf7f1 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -17,6 +17,8 @@ extern void release_evntsel_nmi(unsigned int); #endif /* CONFIG_X86_LOCAL_APIC */ extern int unknown_nmi_panic; +extern int panic_on_unrecovered_nmi; +extern int panic_on_io_nmi; #define NMI_FLAG_FIRST 1 diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c6fefd4585f8..71ee20102a8a 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -23,8 +23,6 @@ #include #include -int panic_on_unrecovered_nmi; -int panic_on_io_nmi; static int die_counter; static struct pt_regs exec_summary_regs; diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 9a95d00f1423..671d846ed620 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -91,6 +91,9 @@ static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); static int ignore_nmis __read_mostly; int unknown_nmi_panic; +int panic_on_unrecovered_nmi; +int panic_on_io_nmi; + /* * Prevent NMI reason port (0x61) being accessed simultaneously, can * only be used in NMI handler. diff --git a/include/linux/panic.h b/include/linux/panic.h index 2494d51707ef..4adc65766935 100644 --- a/include/linux/panic.h +++ b/include/linux/panic.h @@ -20,8 +20,6 @@ extern bool panic_triggering_all_cpu_backtrace; extern int panic_timeout; extern unsigned long panic_print; extern int panic_on_oops; -extern int panic_on_unrecovered_nmi; -extern int panic_on_io_nmi; extern int panic_on_warn; extern unsigned long panic_on_taint; -- cgit v1.2.3 From 642989287350fa4964c37df0f3769094072421c3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 24 Mar 2025 17:59:59 +0100 Subject: firmware: turris-mox-rwtm: fix building without CONFIG_KEYS "struct key" is defined conditionally, so the code referencing it must be made conditional as well: In file included from drivers/firmware/turris-mox-rwtm.c:29: include/linux/turris-signing-key.h: In function 'turris_signing_key_get_dev': include/linux/turris-signing-key.h:26:19: error: invalid use of undefined type 'const struct key' 26 | return key->payload.data[1]; | ^~ Signed-off-by: Arnd Bergmann --- include/linux/turris-signing-key.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/turris-signing-key.h b/include/linux/turris-signing-key.h index 032ca8cbf636..8a435b73c3a9 100644 --- a/include/linux/turris-signing-key.h +++ b/include/linux/turris-signing-key.h @@ -11,6 +11,7 @@ struct device; +#ifdef CONFIG_KEYS struct turris_signing_key_subtype { u16 key_size; u8 data_size; @@ -29,5 +30,6 @@ static inline struct device *turris_signing_key_get_dev(const struct key *key) int devm_turris_signing_key_create(struct device *dev, const struct turris_signing_key_subtype *subtype, const char *desc); +#endif #endif /* __TURRIS_SIGNING_KEY_H */ -- cgit v1.2.3 From f6e9a26e2d488c743757d66898ae91c53ffbe528 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Thu, 3 Apr 2025 18:10:46 -0700 Subject: cgroup: move rstat base stat 
objects into their own struct This non-functional change serves as preparation for moving to subsystem-based rstat trees. The base stats are not an actual subsystem, but in future commits they will have exclusive rstat trees just as other subsystems will. Moving the base stat objects into a new struct allows the cgroup_rstat_cpu struct to become more compact since it now only contains the minimum amount of pointers needed for rstat participation. Subsystems will (in future commits) make use of the compact cgroup_rstat_cpu struct while avoiding the memory overhead of the base stat objects which they will not use. An instance of the new struct cgroup_rstat_base_cpu was placed on the cgroup struct so it can retain ownership of these base stats common to all cgroups. A helper function was added for looking up the cpu-specific base stats of a given cgroup. Finally, initialization and variable names were adjusted where applicable. Signed-off-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 38 +++++++++++--------- kernel/cgroup/cgroup.c | 8 +++-- kernel/cgroup/rstat.c | 84 +++++++++++++++++++++++++++------------------ 3 files changed, 79 insertions(+), 51 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 5bc8f55c8cca..1bf2e8db6dac 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -344,10 +344,29 @@ struct cgroup_base_stat { * frequency decreases the cost of each read. * * This struct hosts both the fields which implement the above - - * updated_children and updated_next - and the fields which track basic - * resource statistics on top of it - bsync, bstat and last_bstat. + * updated_children and updated_next. */ struct cgroup_rstat_cpu { + /* + * Child cgroups with stat updates on this cpu since the last read + * are linked on the parent's ->updated_children through + * ->updated_next. + * + * In addition to being more compact, singly-linked list pointing + * to the cgroup makes it unnecessary for each per-cpu struct to + * point back to the associated cgroup. + * + * Protected by per-cpu cgroup_rstat_cpu_lock. + */ + struct cgroup *updated_children; /* terminated by self cgroup */ + struct cgroup *updated_next; /* NULL iff not on the list */ +}; + +/* + * This struct hosts the fields which track basic resource statistics on + * top of it - bsync, bstat and last_bstat. + */ +struct cgroup_rstat_base_cpu { /* * ->bsync protects ->bstat. These are the only fields which get * updated in the hot path. @@ -374,20 +393,6 @@ struct cgroup_rstat_cpu { * deltas to propagate to the per-cpu subtree_bstat. */ struct cgroup_base_stat last_subtree_bstat; - - /* - * Child cgroups with stat updates on this cpu since the last read - * are linked on the parent's ->updated_children through - * ->updated_next. - * - * In addition to being more compact, singly-linked list pointing - * to the cgroup makes it unnecessary for each per-cpu struct to - * point back to the associated cgroup. - * - * Protected by per-cpu cgroup_rstat_cpu_lock. 
- */ - struct cgroup *updated_children; /* terminated by self cgroup */ - struct cgroup *updated_next; /* NULL iff not on the list */ }; struct cgroup_freezer_state { @@ -518,6 +523,7 @@ struct cgroup { /* per-cpu recursive resource statistics */ struct cgroup_rstat_cpu __percpu *rstat_cpu; + struct cgroup_rstat_base_cpu __percpu *rstat_base_cpu; struct list_head rstat_css_list; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 49d622205997..e1f0284cbbb8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -161,10 +161,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS -static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); +static DEFINE_PER_CPU(struct cgroup_rstat_cpu, root_rstat_cpu); +static DEFINE_PER_CPU(struct cgroup_rstat_base_cpu, root_rstat_base_cpu); /* the default hierarchy */ -struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; +struct cgroup_root cgrp_dfl_root = { + .cgrp.rstat_cpu = &root_rstat_cpu, + .cgrp.rstat_base_cpu = &root_rstat_base_cpu, +}; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index b2239156b7de..918931f68de3 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -19,6 +19,12 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) return per_cpu_ptr(cgrp->rstat_cpu, cpu); } +static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu( + struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->rstat_base_cpu, cpu); +} + /* * Helper functions for rstat per CPU lock (cgroup_rstat_cpu_lock). * @@ -352,12 +358,22 @@ int cgroup_rstat_init(struct cgroup *cgrp) return -ENOMEM; } + if (!cgrp->rstat_base_cpu) { + cgrp->rstat_base_cpu = alloc_percpu(struct cgroup_rstat_base_cpu); + if (!cgrp->rstat_cpu) { + free_percpu(cgrp->rstat_cpu); + return -ENOMEM; + } + } + /* ->updated_children list is self terminated */ for_each_possible_cpu(cpu) { struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup_rstat_base_cpu *rstatbc = + cgroup_rstat_base_cpu(cgrp, cpu); rstatc->updated_children = cgrp; - u64_stats_init(&rstatc->bsync); + u64_stats_init(&rstatbc->bsync); } return 0; @@ -380,6 +396,8 @@ void cgroup_rstat_exit(struct cgroup *cgrp) free_percpu(cgrp->rstat_cpu); cgrp->rstat_cpu = NULL; + free_percpu(cgrp->rstat_base_cpu); + cgrp->rstat_base_cpu = NULL; } void __init cgroup_rstat_boot(void) @@ -420,9 +438,9 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat, static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup_rstat_base_cpu *rstatbc = cgroup_rstat_base_cpu(cgrp, cpu); struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_rstat_cpu *prstatc; + struct cgroup_rstat_base_cpu *prstatbc; struct cgroup_base_stat delta; unsigned seq; @@ -432,15 +450,15 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) /* fetch the current per-cpu values */ do { - seq = __u64_stats_fetch_begin(&rstatc->bsync); - delta = rstatc->bstat; - } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); + seq = __u64_stats_fetch_begin(&rstatbc->bsync); + delta = rstatbc->bstat; + } while (__u64_stats_fetch_retry(&rstatbc->bsync, seq)); /* propagate per-cpu delta to cgroup and per-cpu global statistics */ - cgroup_base_stat_sub(&delta, &rstatc->last_bstat); + cgroup_base_stat_sub(&delta, &rstatbc->last_bstat); cgroup_base_stat_add(&cgrp->bstat, 
&delta); - cgroup_base_stat_add(&rstatc->last_bstat, &delta); - cgroup_base_stat_add(&rstatc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatbc->last_bstat, &delta); + cgroup_base_stat_add(&rstatbc->subtree_bstat, &delta); /* propagate cgroup and per-cpu global delta to parent (unless that's root) */ if (cgroup_parent(parent)) { @@ -449,73 +467,73 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) cgroup_base_stat_add(&parent->bstat, &delta); cgroup_base_stat_add(&cgrp->last_bstat, &delta); - delta = rstatc->subtree_bstat; - prstatc = cgroup_rstat_cpu(parent, cpu); - cgroup_base_stat_sub(&delta, &rstatc->last_subtree_bstat); - cgroup_base_stat_add(&prstatc->subtree_bstat, &delta); - cgroup_base_stat_add(&rstatc->last_subtree_bstat, &delta); + delta = rstatbc->subtree_bstat; + prstatbc = cgroup_rstat_base_cpu(parent, cpu); + cgroup_base_stat_sub(&delta, &rstatbc->last_subtree_bstat); + cgroup_base_stat_add(&prstatbc->subtree_bstat, &delta); + cgroup_base_stat_add(&rstatbc->last_subtree_bstat, &delta); } } -static struct cgroup_rstat_cpu * +static struct cgroup_rstat_base_cpu * cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; - rstatc = get_cpu_ptr(cgrp->rstat_cpu); - *flags = u64_stats_update_begin_irqsave(&rstatc->bsync); - return rstatc; + rstatbc = get_cpu_ptr(cgrp->rstat_base_cpu); + *flags = u64_stats_update_begin_irqsave(&rstatbc->bsync); + return rstatbc; } static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, - struct cgroup_rstat_cpu *rstatc, + struct cgroup_rstat_base_cpu *rstatbc, unsigned long flags) { - u64_stats_update_end_irqrestore(&rstatc->bsync, flags); + u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); cgroup_rstat_updated(cgrp, smp_processor_id()); - put_cpu_ptr(rstatc); + put_cpu_ptr(rstatbc); } void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); - rstatc->bstat.cputime.sum_exec_runtime += delta_exec; - cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); + rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); + rstatbc->bstat.cputime.sum_exec_runtime += delta_exec; + cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec) { - struct cgroup_rstat_cpu *rstatc; + struct cgroup_rstat_base_cpu *rstatbc; unsigned long flags; - rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); + rstatbc = cgroup_base_stat_cputime_account_begin(cgrp, &flags); switch (index) { case CPUTIME_NICE: - rstatc->bstat.ntime += delta_exec; + rstatbc->bstat.ntime += delta_exec; fallthrough; case CPUTIME_USER: - rstatc->bstat.cputime.utime += delta_exec; + rstatbc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: case CPUTIME_IRQ: case CPUTIME_SOFTIRQ: - rstatc->bstat.cputime.stime += delta_exec; + rstatbc->bstat.cputime.stime += delta_exec; break; #ifdef CONFIG_SCHED_CORE case CPUTIME_FORCEIDLE: - rstatc->bstat.forceidle_sum += delta_exec; + rstatbc->bstat.forceidle_sum += delta_exec; break; #endif default: break; } - cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags); + cgroup_base_stat_cputime_account_end(cgrp, rstatbc, flags); } /* -- cgit v1.2.3 From 845a7245801142bfff411bc84afa8cdbc789562f Mon Sep 17 00:00:00 2001 
From: JP Kobryn Date: Thu, 3 Apr 2025 18:10:47 -0700 Subject: cgroup: add helper for checking when css is cgroup::self The cgroup struct has a css field called "self". The main difference between this css and the others found in the cgroup::subsys array is that cgroup::self has a NULL subsystem pointer. There are several places where checks are performed to determine whether the css in question is cgroup::self or not. Instead of accessing css->ss directly, introduce a helper function that shows the intent and use where applicable. Signed-off-by: JP Kobryn Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 5 +++++ kernel/cgroup/cgroup.c | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index e7da3c3b098b..65d95bb2199f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -347,6 +347,11 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css) return css->flags & CSS_DYING; } +static inline bool css_is_cgroup(struct cgroup_subsys_state *css) +{ + return css->ss == NULL; +} + static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e1f0284cbbb8..d98994b106ed 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1706,7 +1706,7 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= ~CSS_VISIBLE; - if (!css->ss) { + if (css_is_cgroup(css)) { if (cgroup_on_dfl(cgrp)) { cgroup_addrm_files(css, cgrp, cgroup_base_files, false); @@ -1738,7 +1738,7 @@ static int css_populate_dir(struct cgroup_subsys_state *css) if (css->flags & CSS_VISIBLE) return 0; - if (!css->ss) { + if (css_is_cgroup(css)) { if (cgroup_on_dfl(cgrp)) { ret = cgroup_addrm_files(css, cgrp, cgroup_base_files, true); -- cgit v1.2.3 From a97915559f5c5ff1972d678b94fd460c72a3b5f2 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Thu, 3 Apr 2025 18:10:48 -0700 Subject: cgroup: change rstat function signatures from cgroup-based to css-based This non-functional change serves as preparation for moving to subsystem-based rstat trees. To simplify future commits, change the signatures of existing cgroup-based rstat functions to become css-based and rename them to reflect that. Though the signatures have changed, the implementations have not. Within these functions use the css->cgroup pointer to obtain the associated cgroup and allow code to function the same just as it did before this patch. At applicable call sites, pass the subsystem-specific css pointer as an argument or pass a pointer to cgroup::self if not in subsystem context. Note that cgroup_rstat_updated_list() and cgroup_rstat_push_children() are not altered yet since there would be a larger amount of css to cgroup conversions which may overcomplicate the code at this intermediate phase. 
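For illustration only (this sketch is not part of the patch), the call-site pattern described above looks like this: subsystem code passes its own css, cgroup core passes &cgrp->self, and the renamed helpers recover the cgroup from the css internally.

	/* illustrative call sites, mirroring the hunks below */
	css_rstat_updated(&memcg->css, cpu);	/* subsystem context (memcg)        */
	css_rstat_flush(&cgrp->self);		/* cgroup core, no subsystem context */

	/* inside the css-based helpers the old cgroup-based logic is kept */
	struct cgroup *cgrp = css->cgroup;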
Signed-off-by: JP Kobryn Signed-off-by: Tejun Heo --- block/blk-cgroup.c | 6 +- include/linux/cgroup-defs.h | 2 +- include/linux/cgroup.h | 4 +- kernel/cgroup/cgroup-internal.h | 4 +- kernel/cgroup/cgroup.c | 30 ++++---- kernel/cgroup/rstat.c | 80 +++++++++++++--------- mm/memcontrol.c | 4 +- .../bpf/progs/cgroup_hierarchical_stats.c | 9 +-- 8 files changed, 78 insertions(+), 61 deletions(-) (limited to 'include/linux') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5905f277057b..0560ea402856 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1144,7 +1144,7 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) /* * We source root cgroup stats from the system-wide stats to avoid * tracking the same information twice and incurring overhead when no - * cgroups are defined. For that reason, cgroup_rstat_flush in + * cgroups are defined. For that reason, css_rstat_flush in * blkcg_print_stat does not actually fill out the iostat in the root * cgroup's blkcg_gq. * @@ -1253,7 +1253,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) if (!seq_css(sf)->parent) blkcg_fill_root_iostats(); else - cgroup_rstat_flush(blkcg->css.cgroup); + css_rstat_flush(&blkcg->css); rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { @@ -2243,7 +2243,7 @@ void blk_cgroup_bio_start(struct bio *bio) } u64_stats_update_end_irqrestore(&bis->sync, flags); - cgroup_rstat_updated(blkcg->css.cgroup, cpu); + css_rstat_updated(&blkcg->css, cpu); put_cpu(); } diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 1bf2e8db6dac..e58bfb880111 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -536,7 +536,7 @@ struct cgroup { /* * A singly-linked list of cgroup structures to be rstat flushed. * This is a scratch field to be used exclusively by - * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock. + * css_rstat_flush_locked() and protected by cgroup_rstat_lock. */ struct cgroup *rstat_flush_next; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 65d95bb2199f..1f5b0a4a3356 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -693,8 +693,8 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) /* * cgroup scalable recursive statistics. */ -void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); -void cgroup_rstat_flush(struct cgroup *cgrp); +void css_rstat_updated(struct cgroup_subsys_state *css, int cpu); +void css_rstat_flush(struct cgroup_subsys_state *css); /* * Basic resource stats. 
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 95ab39e1ec8f..c161d34be634 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -270,8 +270,8 @@ int cgroup_task_count(const struct cgroup *cgrp); /* * rstat.c */ -int cgroup_rstat_init(struct cgroup *cgrp); -void cgroup_rstat_exit(struct cgroup *cgrp); +int css_rstat_init(struct cgroup_subsys_state *css); +void css_rstat_exit(struct cgroup_subsys_state *css); void cgroup_rstat_boot(void); void cgroup_base_stat_cputime_show(struct seq_file *seq); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index d98994b106ed..c284df1efc9f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1362,7 +1362,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_unlock(); - cgroup_rstat_exit(cgrp); + css_rstat_exit(&cgrp->self); kernfs_destroy_root(root->kf_root); cgroup_free_root(root); } @@ -2136,7 +2136,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) if (ret) goto destroy_root; - ret = cgroup_rstat_init(root_cgrp); + ret = css_rstat_init(&root_cgrp->self); if (ret) goto destroy_root; @@ -2178,7 +2178,7 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) goto out; exit_stats: - cgroup_rstat_exit(root_cgrp); + css_rstat_exit(&root_cgrp->self); destroy_root: kernfs_destroy_root(root->kf_root); root->kf_root = NULL; @@ -5435,7 +5435,7 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); psi_cgroup_free(cgrp); - cgroup_rstat_exit(cgrp); + css_rstat_exit(css); kfree(cgrp); } else { /* @@ -5465,7 +5465,7 @@ static void css_release_work_fn(struct work_struct *work) /* css release path */ if (!list_empty(&css->rstat_css_node)) { - cgroup_rstat_flush(cgrp); + css_rstat_flush(css); list_del_rcu(&css->rstat_css_node); } @@ -5493,7 +5493,7 @@ static void css_release_work_fn(struct work_struct *work) /* cgroup release path */ TRACE_CGROUP_PATH(release, cgrp); - cgroup_rstat_flush(cgrp); + css_rstat_flush(css); spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; @@ -5686,17 +5686,13 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, if (ret) goto out_free_cgrp; - ret = cgroup_rstat_init(cgrp); - if (ret) - goto out_cancel_ref; - /* create the directory */ kn = kernfs_create_dir_ns(parent->kn, name, mode, current_fsuid(), current_fsgid(), cgrp, NULL); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_stat_exit; + goto out_cancel_ref; } cgrp->kn = kn; @@ -5706,6 +5702,14 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, cgrp->root = root; cgrp->level = level; + /* + * Now that init_cgroup_housekeeping() has been called and cgrp->self + * is setup, it is safe to perform rstat initialization on it. 
+ */ + ret = css_rstat_init(&cgrp->self); + if (ret) + goto out_stat_exit; + ret = psi_cgroup_alloc(cgrp); if (ret) goto out_kernfs_remove; @@ -5776,10 +5780,10 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name, out_psi_free: psi_cgroup_free(cgrp); +out_stat_exit: + css_rstat_exit(&cgrp->self); out_kernfs_remove: kernfs_remove(cgrp->kn); -out_stat_exit: - cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 918931f68de3..4a8834a70ca6 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -34,9 +34,10 @@ static struct cgroup_rstat_base_cpu *cgroup_rstat_base_cpu( * operations without handling high-frequency fast-path "update" events. */ static __always_inline -unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, - struct cgroup *cgrp, const bool fast_path) +unsigned long _css_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup_subsys_state *css, const bool fast_path) { + struct cgroup *cgrp = css->cgroup; unsigned long flags; bool contended; @@ -67,10 +68,12 @@ unsigned long _cgroup_rstat_cpu_lock(raw_spinlock_t *cpu_lock, int cpu, } static __always_inline -void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, - struct cgroup *cgrp, unsigned long flags, +void _css_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, + struct cgroup_subsys_state *css, unsigned long flags, const bool fast_path) { + struct cgroup *cgrp = css->cgroup; + if (fast_path) trace_cgroup_rstat_cpu_unlock_fastpath(cgrp, cpu, false); else @@ -80,16 +83,17 @@ void _cgroup_rstat_cpu_unlock(raw_spinlock_t *cpu_lock, int cpu, } /** - * cgroup_rstat_updated - keep track of updated rstat_cpu - * @cgrp: target cgroup + * css_rstat_updated - keep track of updated rstat_cpu + * @css: target cgroup subsystem state * @cpu: cpu on which rstat_cpu was updated * - * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching - * rstat_cpu->updated_children list. See the comment on top of + * @css->cgroup's rstat_cpu on @cpu was updated. Put it on the parent's + * matching rstat_cpu->updated_children list. See the comment on top of * cgroup_rstat_cpu definition for details. 
*/ -__bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) +__bpf_kfunc void css_rstat_updated(struct cgroup_subsys_state *css, int cpu) { + struct cgroup *cgrp = css->cgroup; raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); unsigned long flags; @@ -104,7 +108,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) return; - flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, cgrp, true); + flags = _css_rstat_cpu_lock(cpu_lock, cpu, css, true); /* put @cgrp and all ancestors on the corresponding updated lists */ while (true) { @@ -132,7 +136,7 @@ __bpf_kfunc void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) cgrp = parent; } - _cgroup_rstat_cpu_unlock(cpu_lock, cpu, cgrp, flags, true); + _css_rstat_cpu_unlock(cpu_lock, cpu, css, flags, true); } /** @@ -213,7 +217,7 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) struct cgroup *head = NULL, *parent, *child; unsigned long flags; - flags = _cgroup_rstat_cpu_lock(cpu_lock, cpu, root, false); + flags = _css_rstat_cpu_lock(cpu_lock, cpu, &root->self, false); /* Return NULL if this subtree is not on-list */ if (!rstatc->updated_next) @@ -250,14 +254,14 @@ static struct cgroup *cgroup_rstat_updated_list(struct cgroup *root, int cpu) if (child != root) head = cgroup_rstat_push_children(head, child, cpu); unlock_ret: - _cgroup_rstat_cpu_unlock(cpu_lock, cpu, root, flags, false); + _css_rstat_cpu_unlock(cpu_lock, cpu, &root->self, flags, false); return head; } /* * A hook for bpf stat collectors to attach to and flush their stats. - * Together with providing bpf kfuncs for cgroup_rstat_updated() and - * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that + * Together with providing bpf kfuncs for css_rstat_updated() and + * css_rstat_flush(), this enables a complete workflow where bpf progs that * collect cgroup stats can integrate with rstat for efficient flushing. * * A static noinline declaration here could cause the compiler to optimize away @@ -285,9 +289,11 @@ __bpf_hook_end(); * value -1 is used when obtaining the main lock else this is the CPU * number processed last. */ -static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop) +static inline void __css_rstat_lock(struct cgroup_subsys_state *css, + int cpu_in_loop) __acquires(&cgroup_rstat_lock) { + struct cgroup *cgrp = css->cgroup; bool contended; contended = !spin_trylock_irq(&cgroup_rstat_lock); @@ -298,28 +304,32 @@ static inline void __cgroup_rstat_lock(struct cgroup *cgrp, int cpu_in_loop) trace_cgroup_rstat_locked(cgrp, cpu_in_loop, contended); } -static inline void __cgroup_rstat_unlock(struct cgroup *cgrp, int cpu_in_loop) +static inline void __css_rstat_unlock(struct cgroup_subsys_state *css, + int cpu_in_loop) __releases(&cgroup_rstat_lock) { + struct cgroup *cgrp = css->cgroup; + trace_cgroup_rstat_unlock(cgrp, cpu_in_loop, false); spin_unlock_irq(&cgroup_rstat_lock); } /** - * cgroup_rstat_flush - flush stats in @cgrp's subtree - * @cgrp: target cgroup + * css_rstat_flush - flush stats in @css->cgroup's subtree + * @css: target cgroup subsystem state * - * Collect all per-cpu stats in @cgrp's subtree into the global counters + * Collect all per-cpu stats in @css->cgroup's subtree into the global counters * and propagate them upwards. After this function returns, all cgroups in * the subtree have up-to-date ->stat. 
* - * This also gets all cgroups in the subtree including @cgrp off the + * This also gets all cgroups in the subtree including @css->cgroup off the * ->updated_children lists. * * This function may block. */ -__bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) +__bpf_kfunc void css_rstat_flush(struct cgroup_subsys_state *css) { + struct cgroup *cgrp = css->cgroup; int cpu; might_sleep(); @@ -327,7 +337,7 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) struct cgroup *pos; /* Reacquire for each CPU to avoid disabling IRQs too long */ - __cgroup_rstat_lock(cgrp, cpu); + __css_rstat_lock(css, cpu); pos = cgroup_rstat_updated_list(cgrp, cpu); for (; pos; pos = pos->rstat_flush_next) { struct cgroup_subsys_state *css; @@ -341,14 +351,15 @@ __bpf_kfunc void cgroup_rstat_flush(struct cgroup *cgrp) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } - __cgroup_rstat_unlock(cgrp, cpu); + __css_rstat_unlock(css, cpu); if (!cond_resched()) cpu_relax(); } } -int cgroup_rstat_init(struct cgroup *cgrp) +int css_rstat_init(struct cgroup_subsys_state *css) { + struct cgroup *cgrp = css->cgroup; int cpu; /* the root cgrp has rstat_cpu preallocated */ @@ -379,11 +390,12 @@ int cgroup_rstat_init(struct cgroup *cgrp) return 0; } -void cgroup_rstat_exit(struct cgroup *cgrp) +void css_rstat_exit(struct cgroup_subsys_state *css) { + struct cgroup *cgrp = css->cgroup; int cpu; - cgroup_rstat_flush(cgrp); + css_rstat_flush(&cgrp->self); /* sanity check */ for_each_possible_cpu(cpu) { @@ -490,7 +502,7 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, unsigned long flags) { u64_stats_update_end_irqrestore(&rstatbc->bsync, flags); - cgroup_rstat_updated(cgrp, smp_processor_id()); + css_rstat_updated(&cgrp->self, smp_processor_id()); put_cpu_ptr(rstatbc); } @@ -592,12 +604,12 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) struct cgroup_base_stat bstat; if (cgroup_parent(cgrp)) { - cgroup_rstat_flush(cgrp); - __cgroup_rstat_lock(cgrp, -1); + css_rstat_flush(&cgrp->self); + __css_rstat_lock(&cgrp->self, -1); bstat = cgrp->bstat; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &bstat.cputime.utime, &bstat.cputime.stime); - __cgroup_rstat_unlock(cgrp, -1); + __css_rstat_unlock(&cgrp->self, -1); } else { root_cgroup_cputime(&bstat); } @@ -619,10 +631,10 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) cgroup_force_idle_show(seq, &bstat); } -/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */ +/* Add bpf kfuncs for css_rstat_updated() and css_rstat_flush() */ BTF_KFUNCS_START(bpf_rstat_kfunc_ids) -BTF_ID_FLAGS(func, cgroup_rstat_updated) -BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE) +BTF_ID_FLAGS(func, css_rstat_updated) +BTF_ID_FLAGS(func, css_rstat_flush, KF_SLEEPABLE) BTF_KFUNCS_END(bpf_rstat_kfunc_ids) static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 83c2df73e4b6..7152d9623cc8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -579,7 +579,7 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) if (!val) return; - cgroup_rstat_updated(memcg->css.cgroup, cpu); + css_rstat_updated(&memcg->css, cpu); statc = this_cpu_ptr(memcg->vmstats_percpu); for (; statc; statc = statc->parent) { stats_updates = READ_ONCE(statc->stats_updates) + abs(val); @@ -611,7 +611,7 @@ static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) if (mem_cgroup_is_root(memcg)) WRITE_ONCE(flush_last_time, jiffies_64); - 
cgroup_rstat_flush(memcg->css.cgroup); + css_rstat_flush(&memcg->css); } /* diff --git a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c index c74362854948..ff189a736ad8 100644 --- a/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c +++ b/tools/testing/selftests/bpf/progs/cgroup_hierarchical_stats.c @@ -37,8 +37,9 @@ struct { __type(value, struct attach_counter); } attach_counters SEC(".maps"); -extern void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) __ksym; -extern void cgroup_rstat_flush(struct cgroup *cgrp) __ksym; +extern void css_rstat_updated( + struct cgroup_subsys_state *css, int cpu) __ksym; +extern void css_rstat_flush(struct cgroup_subsys_state *css) __ksym; static uint64_t cgroup_id(struct cgroup *cgrp) { @@ -75,7 +76,7 @@ int BPF_PROG(counter, struct cgroup *dst_cgrp, struct task_struct *leader, else if (create_percpu_attach_counter(cg_id, 1)) return 0; - cgroup_rstat_updated(dst_cgrp, bpf_get_smp_processor_id()); + css_rstat_updated(&dst_cgrp->self, bpf_get_smp_processor_id()); return 0; } @@ -141,7 +142,7 @@ int BPF_PROG(dumper, struct bpf_iter_meta *meta, struct cgroup *cgrp) return 1; /* Flush the stats to make sure we get the most updated numbers */ - cgroup_rstat_flush(cgrp); + css_rstat_flush(&cgrp->self); total_counter = bpf_map_lookup_elem(&attach_counters, &cg_id); if (!total_counter) { -- cgit v1.2.3 From dd8a9807fa03666bff52cb28472fb227eaac36c9 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 31 Mar 2025 13:35:59 +0300 Subject: spi: Group CS related fields in struct spi_device The CS related fields are sparse in the struct spi_device. Group them. While at it, fix the comment style of cs_index_mask. Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20250331103609.4160281-1-andriy.shevchenko@linux.intel.com Signed-off-by: Mark Brown --- include/linux/spi/spi.h | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 0ba5e49bace4..e10f0ebb8250 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -136,13 +136,6 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * @max_speed_hz: Maximum clock rate to be used with this chip * (on this board); may be changed by the device's driver. * The spi_transfer.speed_hz can override this for each transfer. - * @chip_select: Array of physical chipselect, spi->chipselect[i] gives - * the corresponding physical CS for logical CS i. - * @mode: The spi mode defines how data is clocked out and in. - * This may be changed by the device's driver. - * The "active low" default for chipselect mode can be overridden - * (by specifying SPI_CS_HIGH) as can the "MSB first" default for - * each word in a transfer (by specifying SPI_LSB_FIRST). * @bits_per_word: Data transfers involve one or more words; word sizes * like eight or 12 bits are common. In-memory wordsizes are * powers of two bytes (e.g. 20 bit samples use 32 bits). @@ -150,6 +143,11 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * default (0) indicating protocol words are eight bit bytes. * The spi_transfer.bits_per_word can override this for each transfer. * @rt: Make the pump thread real time priority. + * @mode: The spi mode defines how data is clocked out and in. + * This may be changed by the device's driver. 
+ * The "active low" default for chipselect mode can be overridden + * (by specifying SPI_CS_HIGH) as can the "MSB first" default for + * each word in a transfer (by specifying SPI_LSB_FIRST). * @irq: Negative, or the number passed to request_irq() to receive * interrupts from this device. * @controller_state: Controller's runtime state @@ -162,8 +160,7 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * the device will bind to the named driver and only the named driver. * Do not set directly, because core frees it; use driver_set_override() to * set or clear it. - * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines - * (optional, NULL when not using a GPIO line) + * @pcpu_statistics: statistics for the spi_device * @word_delay: delay to be inserted between consecutive * words of a transfer * @cs_setup: delay to be introduced by the controller after CS is asserted @@ -171,8 +168,11 @@ extern void spi_transfer_cs_change_delay_exec(struct spi_message *msg, * @cs_inactive: delay to be introduced by the controller after CS is * deasserted. If @cs_change_delay is used from @spi_transfer, then the * two delays will be added up. - * @pcpu_statistics: statistics for the spi_device + * @chip_select: Array of physical chipselect, spi->chipselect[i] gives + * the corresponding physical CS for logical CS i. * @cs_index_mask: Bit mask of the active chipselect(s) in the chipselect array + * @cs_gpiod: Array of GPIO descriptors of the corresponding chipselect lines + * (optional, NULL when not using a GPIO line) * * A @spi_device is used to interchange data between an SPI target device * (usually a discrete chip) and CPU memory. @@ -187,7 +187,6 @@ struct spi_device { struct device dev; struct spi_controller *controller; u32 max_speed_hz; - u8 chip_select[SPI_CS_CNT_MAX]; u8 bits_per_word; bool rt; #define SPI_NO_TX BIT(31) /* No transmit wire */ @@ -218,23 +217,29 @@ struct spi_device { void *controller_data; char modalias[SPI_NAME_SIZE]; const char *driver_override; - struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ + + /* The statistics */ + struct spi_statistics __percpu *pcpu_statistics; + struct spi_delay word_delay; /* Inter-word delay */ + /* CS delays */ struct spi_delay cs_setup; struct spi_delay cs_hold; struct spi_delay cs_inactive; - /* The statistics */ - struct spi_statistics __percpu *pcpu_statistics; + u8 chip_select[SPI_CS_CNT_MAX]; - /* Bit mask of the chipselect(s) that the driver need to use from - * the chipselect array.When the controller is capable to handle + /* + * Bit mask of the chipselect(s) that the driver need to use from + * the chipselect array. When the controller is capable to handle * multiple chip selects & memories are connected in parallel * then more than one bit need to be set in cs_index_mask. */ u32 cs_index_mask : SPI_CS_CNT_MAX; + struct gpio_desc *cs_gpiod[SPI_CS_CNT_MAX]; /* Chip select gpio desc */ + /* * Likely need more hooks for more protocol options affecting how * the controller talks to each chip, like: -- cgit v1.2.3 From f7b86e0e75bc234751cb7a82d888083a57ef28b2 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Mon, 24 Mar 2025 21:15:17 +0000 Subject: crypto: ccp - Add new SEV/SNP platform shutdown API Add new API interface to do SEV/SNP platform shutdown when KVM module is unloaded. 
Reviewed-by: Dionna Glaze Reviewed-by: Tom Lendacky Signed-off-by: Ashish Kalra Signed-off-by: Herbert Xu --- drivers/crypto/ccp/sev-dev.c | 9 +++++++++ include/linux/psp-sev.h | 3 +++ 2 files changed, 12 insertions(+) (limited to 'include/linux') diff --git a/drivers/crypto/ccp/sev-dev.c b/drivers/crypto/ccp/sev-dev.c index 6fdbb3bf44b5..671347702ae7 100644 --- a/drivers/crypto/ccp/sev-dev.c +++ b/drivers/crypto/ccp/sev-dev.c @@ -2468,6 +2468,15 @@ static void sev_firmware_shutdown(struct sev_device *sev) mutex_unlock(&sev_cmd_mutex); } +void sev_platform_shutdown(void) +{ + if (!psp_master || !psp_master->sev_data) + return; + + sev_firmware_shutdown(psp_master->sev_data); +} +EXPORT_SYMBOL_GPL(sev_platform_shutdown); + void sev_dev_destroy(struct psp_device *psp) { struct sev_device *sev = psp->sev_data; diff --git a/include/linux/psp-sev.h b/include/linux/psp-sev.h index f3cad182d4ef..0b3a36bdaa90 100644 --- a/include/linux/psp-sev.h +++ b/include/linux/psp-sev.h @@ -954,6 +954,7 @@ int sev_do_cmd(int cmd, void *data, int *psp_ret); void *psp_copy_user_blob(u64 uaddr, u32 len); void *snp_alloc_firmware_page(gfp_t mask); void snp_free_firmware_page(void *addr); +void sev_platform_shutdown(void); #else /* !CONFIG_CRYPTO_DEV_SP_PSP */ @@ -988,6 +989,8 @@ static inline void *snp_alloc_firmware_page(gfp_t mask) static inline void snp_free_firmware_page(void *addr) { } +static inline void sev_platform_shutdown(void) { } + #endif /* CONFIG_CRYPTO_DEV_SP_PSP */ #endif /* __PSP_SEV_H__ */ -- cgit v1.2.3 From 1be1cd03a93339f14c8f4fe300bca321fddc6478 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 3 Apr 2025 18:59:14 +0300 Subject: gpiolib: acpi: Reduce memory footprint for struct acpi_gpio_params The line_index member in the struct acpi_gpio_params replicates what is covered in the ACPI GpioIo() or GpioInt() resource. The value there is limited to 16-bit one, so we don't really need to have a full 32-bit storage for it. Together with followed boolean the structure will be smaller. add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-3 (-3) Function old new delta acpi_gpio_property_lookup 417 414 -3 Total: Before=15361, After=15358, chg -0.02% `pahole` difference before and after: - /* size: 12, cachelines: 1, members: 3 */ - /* padding: 3 */ + /* size: 8, cachelines: 1, members: 3 */ + /* padding: 1 */ Acked-by: Mika Westerberg Link: https://lore.kernel.org/r/20250403160034.2680485-4-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko --- include/linux/gpio/consumer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 45b651c05b9c..899179972bec 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -587,7 +587,7 @@ struct gpio_desc *devm_fwnode_gpiod_get(struct device *dev, struct acpi_gpio_params { unsigned int crs_entry_index; - unsigned int line_index; + unsigned short line_index; bool active_low; }; -- cgit v1.2.3 From 5741909697a31cfb08e45d56b4211959fb791487 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 19 Mar 2025 14:01:32 +1100 Subject: VFS: improve interface for lookup_one functions The family of functions: lookup_one() lookup_one_unlocked() lookup_one_positive_unlocked() appear designed to be used by external clients of the filesystem rather than by filesystems acting on themselves as the lookup_one_len family are used. 
They are used by: btrfs/ioctl - which is a user-space interface rather than an internal activity exportfs - i.e. from nfsd or the open_by_handle_at interface overlayfs - at access the underlying filesystems smb/server - for file service They should be used by nfsd (more than just the exportfs path) and cachefs but aren't. It would help if the documentation didn't claim they should "not be called by generic code". Also the path component name is passed as "name" and "len" which are (confusingly?) separate by the "base". In some cases the len in simply "strlen" and so passing a qstr using QSTR() would make the calling clearer. Other callers do pass separate name and len which are stored in a struct. Sometimes these are already stored in a qstr, other times it easily could be. So this patch changes these three functions to receive a 'struct qstr *', and improves the documentation. QSTR_LEN() is added to make it easy to pass a QSTR containing a known len. [brauner@kernel.org: take a struct qstr pointer] Signed-off-by: NeilBrown Link: https://lore.kernel.org/r/20250319031545.2999807-2-neil@brown.name Signed-off-by: Christian Brauner --- Documentation/filesystems/porting.rst | 9 +++++++ fs/btrfs/ioctl.c | 9 +++---- fs/exportfs/expfs.c | 5 ++-- fs/namei.c | 51 +++++++++++++++-------------------- fs/overlayfs/namei.c | 12 ++++----- fs/overlayfs/overlayfs.h | 2 +- fs/overlayfs/readdir.c | 9 ++++--- fs/smb/server/smb2pdu.c | 7 ++--- include/linux/dcache.h | 3 ++- include/linux/namei.h | 9 +++---- 10 files changed, 58 insertions(+), 58 deletions(-) (limited to 'include/linux') diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 767b2927c762..57dcba6de743 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1203,3 +1203,12 @@ should use d_drop();d_splice_alias() and return the result of the latter. If a positive dentry cannot be returned for some reason, in-kernel clients such as cachefiles, nfsd, smb/server may not perform ideally but will fail-safe. + +--- + +** mandatory** + +lookup_one(), lookup_one_unlocked(), lookup_one_positive_unlocked() now +take a qstr instead of a name and len. These, not the "one_len" +versions, should be used whenever accessing a filesystem from outside +that filesysmtem, through a mount point - which will have a mnt_idmap. 
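As a sketch of the resulting caller conversion (illustrative only; the real call sites follow in the hunks below), a separate name/len pair is replaced by a single struct qstr built with QSTR() or QSTR_LEN():

	/* before: name and len passed separately */
	dentry = lookup_one(idmap, name, base, strlen(name));

	/* after: one qstr argument */
	dentry = lookup_one(idmap, &QSTR(name), base);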
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a13d81bb56a0..502303438dd7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -909,7 +909,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (error == -EINTR) return error; - dentry = lookup_one(idmap, name, parent->dentry, namelen); + dentry = lookup_one(idmap, &QSTR_LEN(name, namelen), parent->dentry); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; @@ -2288,7 +2288,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; - int subvol_namelen; int ret = 0; bool destroy_parent = false; @@ -2411,10 +2410,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; } - subvol_namelen = strlen(subvol_name); - if (strchr(subvol_name, '/') || - strncmp(subvol_name, "..", subvol_namelen) == 0) { + strcmp(subvol_name, "..") == 0) { ret = -EINVAL; goto free_subvol_name; } @@ -2427,7 +2424,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (ret == -EINTR) goto free_subvol_name; - dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); + dentry = lookup_one(idmap, &QSTR(subvol_name), parent); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out_unlock_dir; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 128dd092916b..f0ede3e81cf7 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -143,7 +143,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - tmp = lookup_one_unlocked(mnt_idmap(mnt), nbuf, parent, strlen(nbuf)); + tmp = lookup_one_unlocked(mnt_idmap(mnt), &QSTR(nbuf), parent); if (IS_ERR(tmp)) { dprintk("lookup failed: %ld\n", PTR_ERR(tmp)); err = PTR_ERR(tmp); @@ -549,8 +549,7 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, } inode_lock(target_dir->d_inode); - nresult = lookup_one(mnt_idmap(mnt), nbuf, - target_dir, strlen(nbuf)); + nresult = lookup_one(mnt_idmap(mnt), &QSTR(nbuf), target_dir); if (!IS_ERR(nresult)) { if (unlikely(nresult->d_inode != result->d_inode)) { dput(nresult); diff --git a/fs/namei.c b/fs/namei.c index 360a86ca1f02..e499fb609271 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2922,19 +2922,17 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) EXPORT_SYMBOL(lookup_one_len); /** - * lookup_one - filesystem helper to lookup single pathname component + * lookup_one - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from - * @name: pathname component to lookup + * @name: qstr holding pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * - * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * This can be used for in-kernel filesystem clients such as file servers. * * The caller must hold base->i_mutex. 
*/ -struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, - struct dentry *base, int len) +struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name, + struct dentry *base) { struct dentry *dentry; struct qstr this; @@ -2942,7 +2940,7 @@ struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(idmap, name, base, len, &this); + err = lookup_one_common(idmap, name->name, base, name->len, &this); if (err) return ERR_PTR(err); @@ -2952,27 +2950,24 @@ struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, EXPORT_SYMBOL(lookup_one); /** - * lookup_one_unlocked - filesystem helper to lookup single pathname component + * lookup_one_unlocked - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from - * @name: pathname component to lookup + * @name: qstr olding pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * - * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * This can be used for in-kernel filesystem clients such as file servers. * * Unlike lookup_one_len, it should be called without the parent - * i_mutex held, and will take the i_mutex itself if necessary. + * i_rwsem held, and will take the i_rwsem itself if necessary. */ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, - const char *name, struct dentry *base, - int len) + struct qstr *name, struct dentry *base) { struct qstr this; int err; struct dentry *ret; - err = lookup_one_common(idmap, name, base, len, &this); + err = lookup_one_common(idmap, name->name, base, name->len, &this); if (err) return ERR_PTR(err); @@ -2984,12 +2979,10 @@ struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, EXPORT_SYMBOL(lookup_one_unlocked); /** - * lookup_one_positive_unlocked - filesystem helper to lookup single - * pathname component + * lookup_one_positive_unlocked - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from - * @name: pathname component to lookup + * @name: qstr holding pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns * known positive or ERR_PTR(). This is what most of the users want. @@ -2998,16 +2991,15 @@ EXPORT_SYMBOL(lookup_one_unlocked); * time, so callers of lookup_one_unlocked() need to be very careful; pinned * positives have >d_inode stable, so this one avoids such problems. * - * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * This can be used for in-kernel filesystem clients such as file servers. * - * The helper should be called without i_mutex held. + * The helper should be called without i_rwsem held. 
*/ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, - const char *name, - struct dentry *base, int len) + struct qstr *name, + struct dentry *base) { - struct dentry *ret = lookup_one_unlocked(idmap, name, base, len); + struct dentry *ret = lookup_one_unlocked(idmap, name, base); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); @@ -3032,7 +3024,7 @@ EXPORT_SYMBOL(lookup_one_positive_unlocked); struct dentry *lookup_one_len_unlocked(const char *name, struct dentry *base, int len) { - return lookup_one_unlocked(&nop_mnt_idmap, name, base, len); + return lookup_one_unlocked(&nop_mnt_idmap, &QSTR_LEN(name, len), base); } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -3047,7 +3039,8 @@ EXPORT_SYMBOL(lookup_one_len_unlocked); struct dentry *lookup_positive_unlocked(const char *name, struct dentry *base, int len) { - return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len); + return lookup_one_positive_unlocked(&nop_mnt_idmap, + &QSTR_LEN(name, len), base); } EXPORT_SYMBOL(lookup_positive_unlocked); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index be5c65d6f848..ac6e893e846a 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -205,8 +205,8 @@ static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, struct dentry *base, int len, bool drop_negative) { - struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name, - base, len); + struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), + &QSTR_LEN(name, len), base); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { @@ -789,8 +789,8 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, if (err) return ERR_PTR(err); - index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), name.name, - ofs->workdir, name.len); + index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), &name, + ofs->workdir); if (IS_ERR(index)) { err = PTR_ERR(index); if (err == -ENOENT) { @@ -1371,7 +1371,7 @@ out: bool ovl_lower_positive(struct dentry *dentry) { struct ovl_entry *poe = OVL_E(dentry->d_parent); - const struct qstr *name = &dentry->d_name; + struct qstr *name = &dentry->d_name; const struct cred *old_cred; unsigned int i; bool positive = false; @@ -1396,7 +1396,7 @@ bool ovl_lower_positive(struct dentry *dentry) this = lookup_one_positive_unlocked( mnt_idmap(parentpath->layer->mnt), - name->name, parentpath->dentry, name->len); + name, parentpath->dentry); if (IS_ERR(this)) { switch (PTR_ERR(this)) { case -ENOENT: diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 6f2f8f4cfbbc..0db8d415c6d8 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -402,7 +402,7 @@ static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, const char *name, struct dentry *base, int len) { - return lookup_one(ovl_upper_mnt_idmap(ofs), name, base, len); + return lookup_one(ovl_upper_mnt_idmap(ofs), &QSTR_LEN(name, len), base); } static inline bool ovl_open_flags_need_copy_up(int flags) diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 881ec5592da5..2fa450a7854d 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -271,7 +271,6 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name, static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { int err; - struct ovl_cache_entry *p; struct dentry *dentry, *dir = path->dentry; const struct 
cred *old_cred; @@ -280,9 +279,11 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data err = down_write_killable(&dir->d_inode->i_rwsem); if (!err) { while (rdd->first_maybe_whiteout) { - p = rdd->first_maybe_whiteout; + struct ovl_cache_entry *p = + rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p->next_maybe_whiteout; - dentry = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); + dentry = lookup_one(mnt_idmap(path->mnt), + &QSTR_LEN(p->name, p->len), dir); if (!IS_ERR(dentry)) { p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); @@ -492,7 +493,7 @@ static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, } } /* This checks also for xwhiteouts */ - this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); + this = lookup_one(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ p->is_whiteout = true; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index d24d95d15d87..0894128b9266 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -4117,9 +4117,10 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv) return -EINVAL; lock_dir(priv->dir_fp); - dent = lookup_one(idmap, priv->d_info->name, - priv->dir_fp->filp->f_path.dentry, - priv->d_info->name_len); + dent = lookup_one(idmap, + &QSTR_LEN(priv->d_info->name, + priv->d_info->name_len), + priv->dir_fp->filp->f_path.dentry); unlock_dir(priv->dir_fp); if (IS_ERR(dent)) { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8d1395f945bf..e974e63bcdbc 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -57,7 +57,8 @@ struct qstr { }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } -#define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n)) +#define QSTR_LEN(n,l) (struct qstr)QSTR_INIT(n,l) +#define QSTR(n) QSTR_LEN(n, strlen(n)) extern const struct qstr empty_name; extern const struct qstr slash_name; diff --git a/include/linux/namei.h b/include/linux/namei.h index e3042176cdf4..ba02304a2a8a 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -73,13 +73,12 @@ extern struct dentry *try_lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len(const char *, struct dentry *, int); extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int); extern struct dentry *lookup_positive_unlocked(const char *, struct dentry *, int); -struct dentry *lookup_one(struct mnt_idmap *, const char *, struct dentry *, int); +struct dentry *lookup_one(struct mnt_idmap *, struct qstr *, struct dentry *); struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, - const char *name, struct dentry *base, - int len); + struct qstr *name, struct dentry *base); struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, - const char *name, - struct dentry *base, int len); + struct qstr *name, + struct dentry *base); extern int follow_down_one(struct path *); extern int follow_down(struct path *path, unsigned int flags); -- cgit v1.2.3 From 0a02e1f4a54ace747304687ced3b76d159e58914 Mon Sep 17 00:00:00 2001 From: Yixun Lan Date: Wed, 26 Mar 2025 06:06:19 +0800 Subject: irqdomain: Support three-cell scheme interrupts Add new function *_twothreecell() to extend support to parse three-cell interrupts which encoded as , the translate function will retrieve irq number and flag from last two cells. 
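As a hypothetical sketch (driver and callback names below are illustrative, not from this patch), a GPIO irqchip domain would simply point its ops at the new xlate helper, which accepts both the two-cell <hwirq flags> and the three-cell <instance hwirq flags> specifier forms:

	static const struct irq_domain_ops example_gpio_irq_domain_ops = {
		.map	= example_gpio_irq_map,		/* driver specific */
		.xlate	= irq_domain_xlate_twothreecell,
	};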
This API will be used in gpio irq driver which need to work with two or three cells cases. Signed-off-by: Yixun Lan Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250326-04-gpio-irq-threecell-v3-1-aab006ab0e00@gentoo.org --- include/linux/irqdomain.h | 20 ++++++++--------- kernel/irq/irqdomain.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index bb7111105296..df7e9278c8ac 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -571,16 +571,16 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, int irq_domain_xlate_onetwocell(struct irq_domain *d, struct device_node *ctrlr, const u32 *intspec, unsigned int intsize, irq_hw_number_t *out_hwirq, unsigned int *out_type); - -int irq_domain_translate_twocell(struct irq_domain *d, - struct irq_fwspec *fwspec, - unsigned long *out_hwirq, - unsigned int *out_type); - -int irq_domain_translate_onecell(struct irq_domain *d, - struct irq_fwspec *fwspec, - unsigned long *out_hwirq, - unsigned int *out_type); +int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type); + +int irq_domain_translate_onecell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type); +int irq_domain_translate_twocell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type); +int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type); /* IPI functions */ int irq_reserve_ipi(struct irq_domain *domain, const struct cpumask *dest); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9d5c8651492d..b294c3ff73b6 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1132,6 +1132,31 @@ int irq_domain_xlate_twocell(struct irq_domain *d, struct device_node *ctrlr, } EXPORT_SYMBOL_GPL(irq_domain_xlate_twocell); +/** + * irq_domain_xlate_twothreecell() - Generic xlate for direct two or three cell bindings + * @d: Interrupt domain involved in the translation + * @ctrlr: The device tree node for the device whose interrupt is translated + * @intspec: The interrupt specifier data from the device tree + * @intsize: The number of entries in @intspec + * @out_hwirq: Pointer to storage for the hardware interrupt number + * @out_type: Pointer to storage for the interrupt type + * + * Device Tree interrupt specifier translation function for two or three + * cell bindings, where the cell values map directly to the hardware + * interrupt number and the type specifier. 
+ */ +int irq_domain_xlate_twothreecell(struct irq_domain *d, struct device_node *ctrlr, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_type) +{ + struct irq_fwspec fwspec; + + of_phandle_args_to_fwspec(ctrlr, intspec, intsize, &fwspec); + + return irq_domain_translate_twothreecell(d, &fwspec, out_hwirq, out_type); +} +EXPORT_SYMBOL_GPL(irq_domain_xlate_twothreecell); + /** * irq_domain_xlate_onetwocell() - Generic xlate for one or two cell bindings * @d: Interrupt domain involved in the translation @@ -1216,6 +1241,37 @@ int irq_domain_translate_twocell(struct irq_domain *d, } EXPORT_SYMBOL_GPL(irq_domain_translate_twocell); +/** + * irq_domain_translate_twothreecell() - Generic translate for direct two or three cell + * bindings + * @d: Interrupt domain involved in the translation + * @fwspec: The firmware interrupt specifier to translate + * @out_hwirq: Pointer to storage for the hardware interrupt number + * @out_type: Pointer to storage for the interrupt type + * + * Firmware interrupt specifier translation function for two or three cell + * specifications, where the parameter values map directly to the hardware + * interrupt number and the type specifier. + */ +int irq_domain_translate_twothreecell(struct irq_domain *d, struct irq_fwspec *fwspec, + unsigned long *out_hwirq, unsigned int *out_type) +{ + if (fwspec->param_count == 2) { + *out_hwirq = fwspec->param[0]; + *out_type = fwspec->param[1] & IRQ_TYPE_SENSE_MASK; + return 0; + } + + if (fwspec->param_count == 3) { + *out_hwirq = fwspec->param[1]; + *out_type = fwspec->param[2] & IRQ_TYPE_SENSE_MASK; + return 0; + } + + return -EINVAL; +} +EXPORT_SYMBOL_GPL(irq_domain_translate_twothreecell); + int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, int node, const struct irq_affinity_desc *affinity) { -- cgit v1.2.3 From 7b73c12c6ebf006ad496f0e38a605d92dfe05157 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 15:59:59 +0100 Subject: shmem: Add shmem_writeout() This will be the replacement for shmem_writepage(). 
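As a hedged sketch only (the helper name and the writeback_control values are illustrative assumptions, not taken from this patch), a caller holding a locked, dirty shmem folio could push it to swap like this:

    /* illustrative only: write one locked shmem folio out to swap */
    static int flush_shmem_folio(struct folio *folio)
    {
            struct writeback_control wbc = {
                    .sync_mode   = WB_SYNC_NONE,
                    .nr_to_write = SWAP_CLUSTER_MAX,
                    .range_start = 0,
                    .range_end   = LLONG_MAX,
                    .for_reclaim = 1,
            };

            folio_mark_dirty(folio);
            /* like the old ->writepage path, the folio is unlocked on return */
            return shmem_writeout(folio, &wbc);
    }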
Signed-off-by: "Matthew Wilcox (Oracle)" Link: https://lore.kernel.org/r/20250402150005.2309458-6-willy@infradead.org Reviewed-by: Baolin Wang Signed-off-by: Christian Brauner --- include/linux/shmem_fs.h | 7 ++++--- mm/shmem.c | 20 ++++++++++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 0b273a7b9f01..5f03a39a26f7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -104,10 +104,11 @@ static inline bool shmem_mapping(struct address_space *mapping) return false; } #endif /* CONFIG_SHMEM */ -extern void shmem_unlock_mapping(struct address_space *mapping); -extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, +void shmem_unlock_mapping(struct address_space *mapping); +struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); -extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); +int shmem_writeout(struct folio *folio, struct writeback_control *wbc); +void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/shmem.c b/mm/shmem.c index 99327c30507c..7d377ceae035 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1536,12 +1536,20 @@ int shmem_unuse(unsigned int type) return error; } -/* - * Move the page from the page cache to the swap cache. - */ static int shmem_writepage(struct page *page, struct writeback_control *wbc) { - struct folio *folio = page_folio(page); + return shmem_writeout(page_folio(page), wbc); +} + +/** + * shmem_writeout - Write the folio to swap + * @folio: The folio to write + * @wbc: How writeback is to be done + * + * Move the folio from the page cache to the swap cache. + */ +int shmem_writeout(struct folio *folio, struct writeback_control *wbc) +{ struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); @@ -1586,9 +1594,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) try_split: /* Ensure the subpages are still dirty */ folio_test_set_dirty(folio); - if (split_huge_page_to_list_to_order(page, wbc->list, 0)) + if (split_folio_to_list(folio, wbc->list)) goto redirty; - folio = page_folio(page); folio_clear_dirty(folio); } @@ -1660,6 +1667,7 @@ redirty: folio_unlock(folio); return 0; } +EXPORT_SYMBOL_GPL(shmem_writeout); #if defined(CONFIG_NUMA) && defined(CONFIG_TMPFS) static void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol) -- cgit v1.2.3 From 6b0dfabb35550cb7b0808585dea6c24971d685d3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Apr 2025 16:00:03 +0100 Subject: fs: Remove aops->writepage All callers and implementations are now removed, so remove the operation and update the documentation to match. 
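For a hedged picture of what this means for filesystems (the "myfs" callbacks below are hypothetical), buffer-backed writeback now hangs entirely off ->writepages, typically by way of mpage_writepages():

    /* hypothetical filesystem: all writeback goes through ->writepages */
    static int myfs_writepages(struct address_space *mapping,
                               struct writeback_control *wbc)
    {
            return mpage_writepages(mapping, wbc, myfs_get_block);
    }

    static const struct address_space_operations myfs_aops = {
            .read_folio  = myfs_read_folio,
            .writepages  = myfs_writepages,
            .dirty_folio = block_dirty_folio,
    };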
Signed-off-by: "Matthew Wilcox (Oracle)" Link: https://lore.kernel.org/r/20250402150005.2309458-10-willy@infradead.org Signed-off-by: Christian Brauner --- Documentation/admin-guide/cgroup-v2.rst | 2 +- Documentation/filesystems/fscrypt.rst | 2 +- Documentation/filesystems/locking.rst | 54 ++------------------------------- Documentation/filesystems/vfs.rst | 39 ++++++------------------ fs/buffer.c | 4 +-- include/linux/fs.h | 1 - mm/vmscan.c | 1 - 7 files changed, 15 insertions(+), 88 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 1a16ce68a4d7..9e7de8e70048 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -3019,7 +3019,7 @@ Filesystem Support for Writeback -------------------------------- A filesystem can support cgroup writeback by updating -address_space_operations->writepage[s]() to annotate bio's using the +address_space_operations->writepages() to annotate bio's using the following two functions. wbc_init_bio(@wbc, @bio) diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst index e80329908549..3d22e2db732d 100644 --- a/Documentation/filesystems/fscrypt.rst +++ b/Documentation/filesystems/fscrypt.rst @@ -1409,7 +1409,7 @@ read the ciphertext into the page cache and decrypt it in-place. The folio lock must be held until decryption has finished, to prevent the folio from becoming visible to userspace prematurely. -For the write path (->writepage()) of regular files, filesystems +For the write path (->writepages()) of regular files, filesystems cannot encrypt data in-place in the page cache, since the cached plaintext must be preserved. Instead, filesystems must encrypt into a temporary buffer or "bounce page", then write out the temporary diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 0ec0bb6eb0fb..2e567e341c3b 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -249,7 +249,6 @@ address_space_operations ======================== prototypes:: - int (*writepage)(struct page *page, struct writeback_control *wbc); int (*read_folio)(struct file *, struct folio *); int (*writepages)(struct address_space *, struct writeback_control *); bool (*dirty_folio)(struct address_space *, struct folio *folio); @@ -280,7 +279,6 @@ locking rules: ====================== ======================== ========= =============== ops folio locked i_rwsem invalidate_lock ====================== ======================== ========= =============== -writepage: yes, unlocks (see below) read_folio: yes, unlocks shared writepages: dirty_folio: maybe @@ -309,54 +307,6 @@ completion. ->readahead() unlocks the folios that I/O is attempted on like ->read_folio(). -->writepage() is used for two purposes: for "memory cleansing" and for -"sync". These are quite different operations and the behaviour may differ -depending upon the mode. - -If writepage is called for sync (wbc->sync_mode != WBC_SYNC_NONE) then -it *must* start I/O against the page, even if that would involve -blocking on in-progress I/O. - -If writepage is called for memory cleansing (sync_mode == -WBC_SYNC_NONE) then its role is to get as much writeout underway as -possible. So writepage should try to avoid blocking against -currently-in-progress I/O. 
- -If the filesystem is not called for "sync" and it determines that it -would need to block against in-progress I/O to be able to start new I/O -against the page the filesystem should redirty the page with -redirty_page_for_writepage(), then unlock the page and return zero. -This may also be done to avoid internal deadlocks, but rarely. - -If the filesystem is called for sync then it must wait on any -in-progress I/O and then start new I/O. - -The filesystem should unlock the page synchronously, before returning to the -caller, unless ->writepage() returns special WRITEPAGE_ACTIVATE -value. WRITEPAGE_ACTIVATE means that page cannot really be written out -currently, and VM should stop calling ->writepage() on this page for some -time. VM does this by moving page to the head of the active list, hence the -name. - -Unless the filesystem is going to redirty_page_for_writepage(), unlock the page -and return zero, writepage *must* run set_page_writeback() against the page, -followed by unlocking it. Once set_page_writeback() has been run against the -page, write I/O can be submitted and the write I/O completion handler must run -end_page_writeback() once the I/O is complete. If no I/O is submitted, the -filesystem must run end_page_writeback() against the page before returning from -writepage. - -That is: after 2.5.12, pages which are under writeout are *not* locked. Note, -if the filesystem needs the page to be locked during writeout, that is ok, too, -the page is allowed to be unlocked at any point in time between the calls to -set_page_writeback() and end_page_writeback(). - -Note, failure to run either redirty_page_for_writepage() or the combination of -set_page_writeback()/end_page_writeback() on a page submitted to writepage -will leave the page itself marked clean but it will be tagged as dirty in the -radix tree. This incoherency can lead to all sorts of hard-to-debug problems -in the filesystem like having dirty inodes at umount and losing written data. - ->writepages() is used for periodic writeback and for syscall-initiated sync operations. The address_space should start I/O against at least ``*nr_to_write`` pages. ``*nr_to_write`` must be decremented for each page @@ -364,8 +314,8 @@ which is written. The address_space implementation may write more (or less) pages than ``*nr_to_write`` asks for, but it should try to be reasonably close. If nr_to_write is NULL, all dirty pages must be written. -writepages should _only_ write pages which are present on -mapping->io_pages. +writepages should _only_ write pages which are present in +mapping->i_pages. ->dirty_folio() is called from various places in the kernel when the target folio is marked as needing writeback. The folio cannot be diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index ae79c30b6c0c..bf051c7da6b8 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -716,9 +716,8 @@ page lookup by address, and keeping track of pages tagged as Dirty or Writeback. The first can be used independently to the others. The VM can try to -either write dirty pages in order to clean them, or release clean pages -in order to reuse them. To do this it can call the ->writepage method -on dirty pages, and ->release_folio on clean folios with the private +release clean pages in order to reuse them. To do this it can call +->release_folio on clean folios with the private flag set. 
Clean pages without PagePrivate and with no external references will be released without notice being given to the address_space. @@ -731,8 +730,8 @@ maintains information about the PG_Dirty and PG_Writeback status of each page, so that pages with either of these flags can be found quickly. The Dirty tag is primarily used by mpage_writepages - the default -->writepages method. It uses the tag to find dirty pages to call -->writepage on. If mpage_writepages is not used (i.e. the address +->writepages method. It uses the tag to find dirty pages to +write back. If mpage_writepages is not used (i.e. the address provides its own ->writepages) , the PAGECACHE_TAG_DIRTY tag is almost unused. write_inode_now and sync_inode do use it (through __sync_single_inode) to check if ->writepages has been successful in @@ -756,23 +755,23 @@ pages, however the address_space has finer control of write sizes. The read process essentially only requires 'read_folio'. The write process is more complicated and uses write_begin/write_end or -dirty_folio to write data into the address_space, and writepage and +dirty_folio to write data into the address_space, and writepages to writeback data to storage. Adding and removing pages to/from an address_space is protected by the inode's i_mutex. When data is written to a page, the PG_Dirty flag should be set. It -typically remains set until writepage asks for it to be written. This +typically remains set until writepages asks for it to be written. This should clear PG_Dirty and set PG_Writeback. It can be actually written at any point after PG_Dirty is clear. Once it is known to be safe, PG_Writeback is cleared. Writeback makes use of a writeback_control structure to direct the -operations. This gives the writepage and writepages operations some +operations. This gives the writepages operation some information about the nature of and reason for the writeback request, and the constraints under which it is being done. It is also used to -return information back to the caller about the result of a writepage or +return information back to the caller about the result of a writepages request. @@ -819,7 +818,6 @@ cache in your filesystem. The following members are defined: .. code-block:: c struct address_space_operations { - int (*writepage)(struct page *page, struct writeback_control *wbc); int (*read_folio)(struct file *, struct folio *); int (*writepages)(struct address_space *, struct writeback_control *); bool (*dirty_folio)(struct address_space *, struct folio *); @@ -848,25 +846,6 @@ cache in your filesystem. The following members are defined: int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter); }; -``writepage`` - called by the VM to write a dirty page to backing store. This - may happen for data integrity reasons (i.e. 'sync'), or to free - up memory (flush). The difference can be seen in - wbc->sync_mode. The PG_Dirty flag has been cleared and - PageLocked is true. writepage should start writeout, should set - PG_Writeback, and should make sure the page is unlocked, either - synchronously or asynchronously when the write operation - completes. - - If wbc->sync_mode is WB_SYNC_NONE, ->writepage doesn't have to - try too hard if there are problems, and may choose to write out - other pages from the mapping if that is easier (e.g. due to - internal dependencies). If it chooses not to start writeout, it - should return AOP_WRITEPAGE_ACTIVATE so that the VM will not - keep calling ->writepage on that page. - - See the file "Locking" for more details. 
- ``read_folio`` Called by the page cache to read a folio from the backing store. The 'file' argument supplies authentication information to network @@ -909,7 +888,7 @@ cache in your filesystem. The following members are defined: given and that many pages should be written if possible. If no ->writepages is given, then mpage_writepages is used instead. This will choose pages from the address space that are tagged as - DIRTY and will pass them to ->writepage. + DIRTY and will write them back. ``dirty_folio`` called by the VM to mark a folio as dirty. This is particularly diff --git a/fs/buffer.c b/fs/buffer.c index c7abb4a029dc..b99dc69dba37 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2695,7 +2695,7 @@ unlock: EXPORT_SYMBOL(block_truncate_page); /* - * The generic ->writepage function for buffer-backed address_spaces + * The generic write folio function for buffer-backed address_spaces */ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, void *get_block) @@ -2715,7 +2715,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, /* * The folio straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped + * writeback invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..f7beefb917a8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -433,7 +433,6 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb) } struct address_space_operations { - int (*writepage)(struct page *page, struct writeback_control *wbc); int (*read_folio)(struct file *, struct folio *); /* Write back some dirty pages from this mapping. */ diff --git a/mm/vmscan.c b/mm/vmscan.c index d172c998d592..6d56f4816171 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -648,7 +648,6 @@ typedef enum { /* * pageout is called by shrink_folio_list() for each dirty folio. - * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, struct swap_iocb **plug, struct list_head *folio_list) -- cgit v1.2.3 From 559b3bbfa978ce3b23dc9c52d09a0eddca52c439 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 27 Mar 2025 10:06:10 -0400 Subject: locking/percpu-rwsem: add freezable alternative to down_read Percpu-rwsems are used for superblock locking. However, we know the read percpu-rwsem we take for sb_start_write() on a frozen filesystem needs not to inhibit system from suspending or hibernating. That means it needs to wait with TASK_UNINTERRUPTIBLE | TASK_FREEZABLE. Introduce a new percpu_down_read_freezable() that allows us to control whether TASK_FREEZABLE is added to the wait flags. 
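A minimal usage sketch, assuming a made-up semaphore and caller (only percpu_down_read_freezable() itself comes from this patch):

    static DEFINE_STATIC_PERCPU_RWSEM(my_sem);

    static void my_read_section(bool may_sleep_while_frozen)
    {
            /* when true, the slow path waits in TASK_UNINTERRUPTIBLE | TASK_FREEZABLE */
            percpu_down_read_freezable(&my_sem, may_sleep_while_frozen);
            /* ... read-side critical section ... */
            percpu_up_read(&my_sem);
    }

The follow-up patch below wires exactly this shape into __sb_start_write() with the freeze argument set to true.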
Signed-off-by: James Bottomley Link: https://lore.kernel.org/r/20250327140613.25178-2-James.Bottomley@HansenPartnership.com Signed-off-by: Christian Brauner --- include/linux/percpu-rwsem.h | 20 ++++++++++++++++---- kernel/locking/percpu-rwsem.c | 13 ++++++++----- 2 files changed, 24 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index af7d75ede619..288f5235649a 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -43,9 +43,10 @@ is_static struct percpu_rw_semaphore name = { \ #define DEFINE_STATIC_PERCPU_RWSEM(name) \ __DEFINE_PERCPU_RWSEM(name, static) -extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool); +extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool, bool); -static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +static inline void percpu_down_read_internal(struct percpu_rw_semaphore *sem, + bool freezable) { might_sleep(); @@ -63,7 +64,7 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem) if (likely(rcu_sync_is_idle(&sem->rss))) this_cpu_inc(*sem->read_count); else - __percpu_down_read(sem, false); /* Unconditional memory barrier */ + __percpu_down_read(sem, false, freezable); /* Unconditional memory barrier */ /* * The preempt_enable() prevents the compiler from * bleeding the critical section out. @@ -71,6 +72,17 @@ static inline void percpu_down_read(struct percpu_rw_semaphore *sem) preempt_enable(); } +static inline void percpu_down_read(struct percpu_rw_semaphore *sem) +{ + percpu_down_read_internal(sem, false); +} + +static inline void percpu_down_read_freezable(struct percpu_rw_semaphore *sem, + bool freeze) +{ + percpu_down_read_internal(sem, freeze); +} + static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { bool ret = true; @@ -82,7 +94,7 @@ static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) if (likely(rcu_sync_is_idle(&sem->rss))) this_cpu_inc(*sem->read_count); else - ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ + ret = __percpu_down_read(sem, true, false); /* Unconditional memory barrier */ preempt_enable(); /* * The barrier() from preempt_enable() prevents the compiler from diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index d6964fc29f51..ef234469baac 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -138,7 +138,8 @@ static int percpu_rwsem_wake_function(struct wait_queue_entry *wq_entry, return !reader; /* wake (readers until) 1 writer */ } -static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) +static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader, + bool freeze) { DEFINE_WAIT_FUNC(wq_entry, percpu_rwsem_wake_function); bool wait; @@ -156,7 +157,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) spin_unlock_irq(&sem->waiters.lock); while (wait) { - set_current_state(TASK_UNINTERRUPTIBLE); + set_current_state(TASK_UNINTERRUPTIBLE | + (freeze ? 
TASK_FREEZABLE : 0)); if (!smp_load_acquire(&wq_entry.private)) break; schedule(); @@ -164,7 +166,8 @@ static void percpu_rwsem_wait(struct percpu_rw_semaphore *sem, bool reader) __set_current_state(TASK_RUNNING); } -bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) +bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try, + bool freeze) { if (__percpu_down_read_trylock(sem)) return true; @@ -174,7 +177,7 @@ bool __sched __percpu_down_read(struct percpu_rw_semaphore *sem, bool try) trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_READ); preempt_enable(); - percpu_rwsem_wait(sem, /* .reader = */ true); + percpu_rwsem_wait(sem, /* .reader = */ true, freeze); preempt_disable(); trace_contention_end(sem, 0); @@ -237,7 +240,7 @@ void __sched percpu_down_write(struct percpu_rw_semaphore *sem) */ if (!__percpu_down_write_trylock(sem)) { trace_contention_begin(sem, LCB_F_PERCPU | LCB_F_WRITE); - percpu_rwsem_wait(sem, /* .reader = */ false); + percpu_rwsem_wait(sem, /* .reader = */ false, false); contended = true; } -- cgit v1.2.3 From f73bae83675b860cae9f58dba233b6be92e750ca Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Thu, 27 Mar 2025 10:06:11 -0400 Subject: fs: allow all writers to be frozen During freeze/thaw we need to be able to freeze all writers during suspend/hibernate. Otherwise tasks such as systemd-journald that mmap a file and write to it will not be frozen after we've already frozen the filesystem. This has some risk of not being able to freeze processes in case a process has acquired SB_FREEZE_PAGEFAULT under mmap_sem or SB_FREEZE_INTERNAL under some other filesystem-specific lock. If the filesystem is frozen, a task can block on the frozen filesystem with e.g., mmap_sem held. If some other task then blocks on grabbing that mmap_sem, hibernation will fail because it is unable to hibernate a task holding mmap_sem. This could be fixed by making a range of filesystem-related locks use freezable sleeping. That's impractical and not warranted just for suspend/hibernate. Assume that this is an infrequent problem and we've given userspace a way to skip filesystem freezing through a sysfs file. Link: https://lore.kernel.org/r/20250402-work-freeze-v2-2-6719a97b52ac@kernel.org Link: https://lore.kernel.org/r/20250327140613.25178-3-James.Bottomley@HansenPartnership.com [brauner: make all freeze levels set TASK_FREEZABLE and rewrite commit message] Reviewed-by: Jan Kara Signed-off-by: James Bottomley Signed-off-by: Christian Brauner --- include/linux/fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 016b0fe1536e..35b81f497904 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1780,7 +1780,7 @@ static inline void __sb_end_write(struct super_block *sb, int level) static inline void __sb_start_write(struct super_block *sb, int level) { - percpu_down_read(sb->s_writers.rw_sem + level - 1); + percpu_down_read_freezable(sb->s_writers.rw_sem + level - 1, true); } static inline bool __sb_start_write_trylock(struct super_block *sb, int level) -- cgit v1.2.3 From 2992476528aeecbaee17ba0a6396a817481205a3 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 29 Mar 2025 09:42:17 +0100 Subject: super: use a common iterator (Part 1) Use a common iterator for all callbacks.
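As a hedged illustration (the callback below is hypothetical, not part of this patch), every callback now takes the (sb, arg) pair and runs with the superblock held by the iterator:

    /* hypothetical: count block-device-backed superblocks */
    static void count_bdev_sb(struct super_block *sb, void *arg)
    {
            unsigned int *count = arg;

            if (sb->s_bdev)
                    (*count)++;
    }

    static unsigned int count_bdev_superblocks(void)
    {
            unsigned int count = 0;

            iterate_supers(count_bdev_sb, &count);
            return count;
    }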
Link: https://lore.kernel.org/r/20250329-work-freeze-v2-4-a47af37ecc3d@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 67 +++++++++++------------------------------------------- include/linux/fs.h | 6 ++++- 2 files changed, 18 insertions(+), 55 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index c67ea3cdda41..0dd208804a74 100644 --- a/fs/super.c +++ b/fs/super.c @@ -887,37 +887,7 @@ void drop_super_exclusive(struct super_block *sb) } EXPORT_SYMBOL(drop_super_exclusive); -static void __iterate_supers(void (*f)(struct super_block *)) -{ - struct super_block *sb, *p = NULL; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (super_flags(sb, SB_DYING)) - continue; - sb->s_count++; - spin_unlock(&sb_lock); - - f(sb); - - spin_lock(&sb_lock); - if (p) - __put_super(p); - p = sb; - } - if (p) - __put_super(p); - spin_unlock(&sb_lock); -} -/** - * iterate_supers - call function for all active superblocks - * @f: function to call - * @arg: argument to pass to it - * - * Scans the superblock list and calls given function, passing it - * locked superblock and given argument. - */ -void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, bool excl) { struct super_block *sb, *p = NULL; @@ -927,14 +897,13 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) if (super_flags(sb, SB_DYING)) continue; - sb->s_count++; spin_unlock(&sb_lock); - locked = super_lock_shared(sb); + locked = super_lock(sb, excl); if (locked) { f(sb, arg); - super_unlock_shared(sb); + super_unlock(sb, excl); } spin_lock(&sb_lock); @@ -1111,11 +1080,9 @@ cancel_readonly: return retval; } -static void do_emergency_remount_callback(struct super_block *sb) +static void do_emergency_remount_callback(struct super_block *sb, void *unused) { - bool locked = super_lock_excl(sb); - - if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { + if (sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, @@ -1126,13 +1093,11 @@ static void do_emergency_remount_callback(struct super_block *sb) put_fs_context(fc); } } - if (locked) - super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) { - __iterate_supers(do_emergency_remount_callback); + __iterate_supers(do_emergency_remount_callback, NULL, true); kfree(work); printk("Emergency Remount complete\n"); } @@ -1148,24 +1113,18 @@ void emergency_remount(void) } } -static void do_thaw_all_callback(struct super_block *sb) +static void do_thaw_all_callback(struct super_block *sb, void *unused) { - bool locked = super_lock_excl(sb); - - if (locked && sb->s_root) { - if (IS_ENABLED(CONFIG_BLOCK)) - while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) - pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); - thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); - return; - } - if (locked) - super_unlock_excl(sb); + if (IS_ENABLED(CONFIG_BLOCK)) + while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) + pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); + thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); + return; } static void do_thaw_all(struct work_struct *work) { - __iterate_supers(do_thaw_all_callback); + __iterate_supers(do_thaw_all_callback, NULL, true); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 35b81f497904..5e007bffd00e 100644 --- a/include/linux/fs.h +++ 
b/include/linux/fs.h @@ -3515,7 +3515,11 @@ extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); -extern void iterate_supers(void (*)(struct super_block *, void *), void *); +void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, bool excl); +static inline void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +{ + __iterate_supers(f, arg, false); +} extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); -- cgit v1.2.3 From b47e42d10e8c20525febccbd6e0dc8528861aea4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 29 Mar 2025 09:42:18 +0100 Subject: super: use common iterator (Part 2) Use a common iterator for all callbacks. We could go for something even more elaborate (advance step-by-step similar to iov_iter) but I really don't think this is warranted. Link: https://lore.kernel.org/r/20250329-work-freeze-v2-5-a47af37ecc3d@kernel.org Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/super.c | 48 +++++++++++++++++++++++++++++++++++++++--------- include/linux/fs.h | 6 +----- 2 files changed, 40 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/fs/super.c b/fs/super.c index 0dd208804a74..796e7f402a41 100644 --- a/fs/super.c +++ b/fs/super.c @@ -887,21 +887,46 @@ void drop_super_exclusive(struct super_block *sb) } EXPORT_SYMBOL(drop_super_exclusive); -void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, bool excl) +enum super_iter_flags_t { + SUPER_ITER_EXCL = (1U << 0), + SUPER_ITER_UNLOCKED = (1U << 1), + SUPER_ITER_REVERSE = (1U << 2), +}; + +static inline struct super_block *first_super(enum super_iter_flags_t flags) +{ + if (flags & SUPER_ITER_REVERSE) + return list_last_entry(&super_blocks, struct super_block, s_list); + return list_first_entry(&super_blocks, struct super_block, s_list); +} + +static inline struct super_block *next_super(struct super_block *sb, + enum super_iter_flags_t flags) +{ + if (flags & SUPER_ITER_REVERSE) + return list_prev_entry(sb, s_list); + return list_next_entry(sb, s_list); +} + +static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, + enum super_iter_flags_t flags) { struct super_block *sb, *p = NULL; + bool excl = flags & SUPER_ITER_EXCL; - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - bool locked; + guard(spinlock)(&sb_lock); + for (sb = first_super(flags); + !list_entry_is_head(sb, &super_blocks, s_list); + sb = next_super(sb, flags)) { if (super_flags(sb, SB_DYING)) continue; sb->s_count++; spin_unlock(&sb_lock); - locked = super_lock(sb, excl); - if (locked) { + if (flags & SUPER_ITER_UNLOCKED) { + f(sb, arg); + } else if (super_lock(sb, excl)) { f(sb, arg); super_unlock(sb, excl); } @@ -913,7 +938,11 @@ void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, bool e } if (p) __put_super(p); - spin_unlock(&sb_lock); +} + +void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +{ + __iterate_supers(f, arg, 0); } /** @@ -1097,7 +1126,8 @@ static void do_emergency_remount_callback(struct super_block *sb, void *unused) static void do_emergency_remount(struct work_struct *work) { - __iterate_supers(do_emergency_remount_callback, NULL, true); + __iterate_supers(do_emergency_remount_callback, NULL, + SUPER_ITER_EXCL | SUPER_ITER_REVERSE); 
kfree(work); printk("Emergency Remount complete\n"); } @@ -1124,7 +1154,7 @@ static void do_thaw_all_callback(struct super_block *sb, void *unused) static void do_thaw_all(struct work_struct *work) { - __iterate_supers(do_thaw_all_callback, NULL, true); + __iterate_supers(do_thaw_all_callback, NULL, SUPER_ITER_EXCL); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 5e007bffd00e..96f8c8a794ea 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3515,11 +3515,7 @@ extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); extern void drop_super(struct super_block *sb); extern void drop_super_exclusive(struct super_block *sb); -void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, bool excl); -static inline void iterate_supers(void (*f)(struct super_block *, void *), void *arg) -{ - __iterate_supers(f, arg, false); -} +extern void iterate_supers(void (*f)(struct super_block *, void *), void *arg); extern void iterate_supers_type(struct file_system_type *, void (*)(struct super_block *, void *), void *); -- cgit v1.2.3 From 06f2f68a670aae28b825065439301831e74da880 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Mar 2025 15:31:15 +0100 Subject: genirq/generic-chip: Make locking unconditional The SMP conditional wrappers around raw_spin_[un]lock() have no real value. On !SMP kernels the lock operations are NOOPs except for a preempt_disable/enable() pair on PREEMPT enabled kernels, which are not really worth to optimize for. Aside of that this evades lockdep on !SMP kernels. Remove the !SMP stubs and make it unconditional. No functional change. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250313142524.011345765@linutronix.de --- include/linux/irq.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index dd5df1e2d032..500772943207 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1222,7 +1222,6 @@ static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) #define IRQ_MSK(n) (u32)((n) < 32 ? ((1 << (n)) - 1) : UINT_MAX) -#ifdef CONFIG_SMP static inline void irq_gc_lock(struct irq_chip_generic *gc) { raw_spin_lock(&gc->lock); @@ -1232,10 +1231,6 @@ static inline void irq_gc_unlock(struct irq_chip_generic *gc) { raw_spin_unlock(&gc->lock); } -#else -static inline void irq_gc_lock(struct irq_chip_generic *gc) { } -static inline void irq_gc_unlock(struct irq_chip_generic *gc) { } -#endif /* * The irqsave variants are for usage in non interrupt code. Do not use -- cgit v1.2.3 From 7ae844a6650c5c15ccfbf76ed767e7f2cc61ec1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 13 Mar 2025 15:31:28 +0100 Subject: genirq/generic-chip: Remove unused lock wrappers All users are converted to lock guards. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250313142524.388478168@linutronix.de --- include/linux/irq.h | 20 -------------------- 1 file changed, 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 500772943207..d896d3a471ec 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -1222,26 +1222,6 @@ static inline struct irq_chip_type *irq_data_get_chip_type(struct irq_data *d) #define IRQ_MSK(n) (u32)((n) < 32 ? 
((1 << (n)) - 1) : UINT_MAX) -static inline void irq_gc_lock(struct irq_chip_generic *gc) -{ - raw_spin_lock(&gc->lock); -} - -static inline void irq_gc_unlock(struct irq_chip_generic *gc) -{ - raw_spin_unlock(&gc->lock); -} - -/* - * The irqsave variants are for usage in non interrupt code. Do not use - * them in irq_chip callbacks. Use irq_gc_lock() instead. - */ -#define irq_gc_lock_irqsave(gc, flags) \ - raw_spin_lock_irqsave(&(gc)->lock, flags) - -#define irq_gc_unlock_irqrestore(gc, flags) \ - raw_spin_unlock_irqrestore(&(gc)->lock, flags) - static inline void irq_reg_writel(struct irq_chip_generic *gc, u32 val, int reg_offset) { -- cgit v1.2.3 From 1ce4c3aeef333be1e6290ec6d1f7891c2bfc7a1f Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 1 Apr 2025 11:37:16 +0200 Subject: firmware: sysfb: Move bpp-depth calculation into screen_info helper Move the calculation of the bits per pixels for screen_info into a helper function. This will make it available to other callers besides the firmware code. Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Link: https://lore.kernel.org/r/20250401094056.32904-14-tzimmermann@suse.de --- drivers/firmware/sysfb_simplefb.c | 31 +------------------------------ drivers/video/screen_info_generic.c | 36 ++++++++++++++++++++++++++++++++++++ include/linux/screen_info.h | 2 ++ 3 files changed, 39 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/drivers/firmware/sysfb_simplefb.c b/drivers/firmware/sysfb_simplefb.c index 75a186bf8f8e..592d8a644619 100644 --- a/drivers/firmware/sysfb_simplefb.c +++ b/drivers/firmware/sysfb_simplefb.c @@ -35,36 +35,7 @@ __init bool sysfb_parse_mode(const struct screen_info *si, if (type != VIDEO_TYPE_VLFB && type != VIDEO_TYPE_EFI) return false; - /* - * The meaning of depth and bpp for direct-color formats is - * inconsistent: - * - * - DRM format info specifies depth as the number of color - * bits; including alpha, but not including filler bits. - * - Linux' EFI platform code computes lfb_depth from the - * individual color channels, including the reserved bits. - * - VBE 1.1 defines lfb_depth for XRGB1555 as 16, but later - * versions use 15. - * - On the kernel command line, 'bpp' of 32 is usually - * XRGB8888 including the filler bits, but 15 is XRGB1555 - * not including the filler bit. - * - * It's not easily possible to fix this in struct screen_info, - * as this could break UAPI. The best solution is to compute - * bits_per_pixel from the color bits, reserved bits and - * reported lfb_depth, whichever is highest. In the loop below, - * ignore simplefb formats with alpha bits, as EFI and VESA - * don't specify alpha channels. 
- */ - if (si->lfb_depth > 8) { - bits_per_pixel = max(max3(si->red_size + si->red_pos, - si->green_size + si->green_pos, - si->blue_size + si->blue_pos), - si->rsvd_size + si->rsvd_pos); - bits_per_pixel = max_t(u32, bits_per_pixel, si->lfb_depth); - } else { - bits_per_pixel = si->lfb_depth; - } + bits_per_pixel = __screen_info_lfb_bits_per_pixel(si); for (i = 0; i < ARRAY_SIZE(formats); ++i) { const struct simplefb_format *f = &formats[i]; diff --git a/drivers/video/screen_info_generic.c b/drivers/video/screen_info_generic.c index 64117c6367ab..900e9386eceb 100644 --- a/drivers/video/screen_info_generic.c +++ b/drivers/video/screen_info_generic.c @@ -144,3 +144,39 @@ ssize_t screen_info_resources(const struct screen_info *si, struct resource *r, return pos - r; } EXPORT_SYMBOL(screen_info_resources); + +/* + * The meaning of depth and bpp for direct-color formats is + * inconsistent: + * + * - DRM format info specifies depth as the number of color + * bits; including alpha, but not including filler bits. + * - Linux' EFI platform code computes lfb_depth from the + * individual color channels, including the reserved bits. + * - VBE 1.1 defines lfb_depth for XRGB1555 as 16, but later + * versions use 15. + * - On the kernel command line, 'bpp' of 32 is usually + * XRGB8888 including the filler bits, but 15 is XRGB1555 + * not including the filler bit. + * + * It is not easily possible to fix this in struct screen_info, + * as this could break UAPI. The best solution is to compute + * bits_per_pixel from the color bits, reserved bits and + * reported lfb_depth, whichever is highest. + */ + +u32 __screen_info_lfb_bits_per_pixel(const struct screen_info *si) +{ + u32 bits_per_pixel = si->lfb_depth; + + if (bits_per_pixel > 8) { + bits_per_pixel = max(max3(si->red_size + si->red_pos, + si->green_size + si->green_pos, + si->blue_size + si->blue_pos), + si->rsvd_size + si->rsvd_pos); + bits_per_pixel = max_t(u32, bits_per_pixel, si->lfb_depth); + } + + return bits_per_pixel; +} +EXPORT_SYMBOL(__screen_info_lfb_bits_per_pixel); diff --git a/include/linux/screen_info.h b/include/linux/screen_info.h index 6a4a3cec4638..ab3cffbb58b7 100644 --- a/include/linux/screen_info.h +++ b/include/linux/screen_info.h @@ -128,6 +128,8 @@ static inline unsigned int screen_info_video_type(const struct screen_info *si) ssize_t screen_info_resources(const struct screen_info *si, struct resource *r, size_t num); +u32 __screen_info_lfb_bits_per_pixel(const struct screen_info *si); + #if defined(CONFIG_PCI) void screen_info_apply_fixups(void); struct pci_dev *screen_info_pci_dev(const struct screen_info *si); -- cgit v1.2.3 From 814d270b31d27b6ea4f722b8ae2db9802fe332ff Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Tue, 1 Apr 2025 11:37:21 +0200 Subject: drm/sysfb: vesadrm: Add gamma correction Add palette support and export GAMMA properties via sysfs. User-space compositors can use this interface for programming gamma ramps or night mode. Vesadrm supports palette updates via VGA DAC registers or VESA palette calls. Up to 256 palette entries are available. Userspace always supplies gamma ramps of 256 entries. If the native color format does not match this because pixel components have fewer than 8 bits, vesadrm interpolates among the palette entries. The code uses CamelCase style in a few places to match the VESA manuals.
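A hedged sketch of the interpolation idea described above (simplified; the function below is illustrative and differs from the driver code in this patch): a channel with N significant bits has only 1 << N hardware palette entries, each of which is sourced from the corresponding position of the 256-entry ramp supplied by userspace.

    /* illustrative only: pick the ramp entry for one hardware palette slot */
    static u8 ramp_entry_for(const u16 *lut256, unsigned int bits_per_channel,
                             unsigned int hw_index)
    {
            unsigned int hw_entries = 1u << bits_per_channel;  /* e.g. 32 for 5 bits */
            unsigned int src = hw_index * 255 / (hw_entries - 1);

            return lut256[src] >> 8;  /* 16-bit LUT entry to 8-bit DAC value */
    }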
v3: - fix coding style v2: - use CONFIG_X86_32 instead of __i386__ (checkpatch) - protect struct vesadrm.pmi with CONFIG_X86_32 Signed-off-by: Thomas Zimmermann Reviewed-by: Javier Martinez Canillas Link: https://lore.kernel.org/r/20250401094056.32904-19-tzimmermann@suse.de --- drivers/gpu/drm/sysfb/vesadrm.c | 206 ++++++++++++++++++++++++++++++++++++++++ include/linux/screen_info.h | 7 ++ 2 files changed, 213 insertions(+) (limited to 'include/linux') diff --git a/drivers/gpu/drm/sysfb/vesadrm.c b/drivers/gpu/drm/sysfb/vesadrm.c index 07f59880ce0f..9cc50e3072ea 100644 --- a/drivers/gpu/drm/sysfb/vesadrm.c +++ b/drivers/gpu/drm/sysfb/vesadrm.c @@ -25,6 +25,7 @@ #include