Diffstat (limited to 'io_uring/zcrx.c')
-rw-r--r--   io_uring/zcrx.c   | 421
1 file changed, 324 insertions(+), 97 deletions(-)
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index b1b723222cdb..b99cf2c6670a 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -8,6 +8,7 @@
 #include <linux/netdevice.h>
 #include <linux/rtnetlink.h>
 #include <linux/skbuff_ref.h>
+#include <linux/anon_inodes.h>
 
 #include <net/page_pool/helpers.h>
 #include <net/page_pool/memory_provider.h>
@@ -170,7 +171,7 @@ static unsigned long io_count_account_pages(struct page **pages, unsigned nr_pag
         if (folio == last_folio)
             continue;
         last_folio = folio;
-        res += 1UL << folio_order(folio);
+        res += folio_nr_pages(folio);
     }
     return res;
 }
@@ -200,7 +201,7 @@ static int io_import_umem(struct io_zcrx_ifq *ifq,
     }
 
     mem->account_pages = io_count_account_pages(pages, nr_pages);
-    ret = io_account_mem(ifq->ctx, mem->account_pages);
+    ret = io_account_mem(ifq->user, ifq->mm_account, mem->account_pages);
     if (ret < 0)
         mem->account_pages = 0;
 
@@ -344,7 +345,15 @@ static void io_zcrx_get_niov_uref(struct net_iov *niov)
     atomic_inc(io_get_user_counter(niov));
 }
 
-static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
+static void io_fill_zcrx_offsets(struct io_uring_zcrx_offsets *offsets)
+{
+    offsets->head = offsetof(struct io_uring, head);
+    offsets->tail = offsetof(struct io_uring, tail);
+    offsets->rqes = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
+}
+
+static int io_allocate_rbuf_ring(struct io_ring_ctx *ctx,
+                 struct io_zcrx_ifq *ifq,
                  struct io_uring_zcrx_ifq_reg *reg,
                  struct io_uring_region_desc *rd,
                  u32 id)
@@ -354,7 +363,8 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
     void *ptr;
     int ret;
 
-    off = ALIGN(sizeof(struct io_uring), L1_CACHE_BYTES);
+    io_fill_zcrx_offsets(&reg->offsets);
+    off = reg->offsets.rqes;
     size = off + sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
     if (size > rd->size)
         return -EINVAL;
@@ -362,7 +372,7 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
     mmap_offset = IORING_MAP_OFF_ZCRX_REGION;
     mmap_offset += id << IORING_OFF_PBUF_SHIFT;
 
-    ret = io_create_region(ifq->ctx, &ifq->region, rd, mmap_offset);
+    ret = io_create_region(ctx, &ifq->region, rd, mmap_offset);
     if (ret < 0)
         return ret;
 
@@ -370,26 +380,25 @@ static int io_allocate_rbuf_ring(struct io_zcrx_ifq *ifq,
     ifq->rq_ring = (struct io_uring *)ptr;
     ifq->rqes = (struct io_uring_zcrx_rqe *)(ptr + off);
 
-    reg->offsets.head = offsetof(struct io_uring, head);
-    reg->offsets.tail = offsetof(struct io_uring, tail);
-    reg->offsets.rqes = off;
     return 0;
 }
 
 static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
 {
-    io_free_region(ifq->ctx, &ifq->region);
+    io_free_region(ifq->user, &ifq->region);
     ifq->rq_ring = NULL;
     ifq->rqes = NULL;
 }
 
-static void io_zcrx_free_area(struct io_zcrx_area *area)
+static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
+                  struct io_zcrx_area *area)
 {
-    io_zcrx_unmap_area(area->ifq, area);
+    io_zcrx_unmap_area(ifq, area);
     io_release_area_mem(&area->mem);
 
     if (area->mem.account_pages)
-        io_unaccount_mem(area->ifq->ctx, area->mem.account_pages);
+        io_unaccount_mem(ifq->user, ifq->mm_account,
+                 area->mem.account_pages);
 
     kvfree(area->freelist);
     kvfree(area->nia.niovs);
@@ -463,7 +472,7 @@ static int io_zcrx_create_area(struct io_zcrx_ifq *ifq,
     return 0;
 err:
     if (area)
-        io_zcrx_free_area(area);
+        io_zcrx_free_area(ifq, area);
     return ret;
 }
 
@@ -476,9 +485,10 @@ static struct io_zcrx_ifq *io_zcrx_ifq_alloc(struct io_ring_ctx *ctx)
         return NULL;
 
     ifq->if_rxq = -1;
-    ifq->ctx = ctx;
     spin_lock_init(&ifq->rq_lock);
     mutex_init(&ifq->pp_lock);
+    refcount_set(&ifq->refs, 1);
+    refcount_set(&ifq->user_refs, 1);
     return ifq;
 }
 
@@ -522,7 +532,10 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
     io_close_queue(ifq);
 
     if (ifq->area)
-        io_zcrx_free_area(ifq->area);
+        io_zcrx_free_area(ifq, ifq->area);
+    free_uid(ifq->user);
+    if (ifq->mm_account)
+        mmdrop(ifq->mm_account);
 
     if (ifq->dev)
         put_device(ifq->dev);
@@ -531,6 +544,63 @@ static void io_zcrx_ifq_free(struct io_zcrx_ifq *ifq)
     kfree(ifq);
 }
 
+static void io_put_zcrx_ifq(struct io_zcrx_ifq *ifq)
+{
+    if (refcount_dec_and_test(&ifq->refs))
+        io_zcrx_ifq_free(ifq);
+}
+
+static void io_zcrx_return_niov_freelist(struct net_iov *niov)
+{
+    struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
+
+    spin_lock_bh(&area->freelist_lock);
+    area->freelist[area->free_count++] = net_iov_idx(niov);
+    spin_unlock_bh(&area->freelist_lock);
+}
+
+static void io_zcrx_return_niov(struct net_iov *niov)
+{
+    netmem_ref netmem = net_iov_to_netmem(niov);
+
+    if (!niov->desc.pp) {
+        /* copy fallback allocated niovs */
+        io_zcrx_return_niov_freelist(niov);
+        return;
+    }
+    page_pool_put_unrefed_netmem(niov->desc.pp, netmem, -1, false);
+}
+
+static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
+{
+    struct io_zcrx_area *area = ifq->area;
+    int i;
+
+    if (!area)
+        return;
+
+    /* Reclaim back all buffers given to the user space. */
+    for (i = 0; i < area->nia.num_niovs; i++) {
+        struct net_iov *niov = &area->nia.niovs[i];
+        int nr;
+
+        if (!atomic_read(io_get_user_counter(niov)))
+            continue;
+        nr = atomic_xchg(io_get_user_counter(niov), 0);
+        if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
+            io_zcrx_return_niov(niov);
+    }
+}
+
+static void zcrx_unregister(struct io_zcrx_ifq *ifq)
+{
+    if (refcount_dec_and_test(&ifq->user_refs)) {
+        io_close_queue(ifq);
+        io_zcrx_scrub(ifq);
+    }
+    io_put_zcrx_ifq(ifq);
+}
+
 struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
                         unsigned int id)
 {
@@ -541,6 +611,112 @@ struct io_mapped_region *io_zcrx_get_region(struct io_ring_ctx *ctx,
     return ifq ? &ifq->region : NULL;
 }
 
+static int zcrx_box_release(struct inode *inode, struct file *file)
+{
+    struct io_zcrx_ifq *ifq = file->private_data;
+
+    if (WARN_ON_ONCE(!ifq))
+        return -EFAULT;
+    zcrx_unregister(ifq);
+    return 0;
+}
+
+static const struct file_operations zcrx_box_fops = {
+    .owner = THIS_MODULE,
+    .release = zcrx_box_release,
+};
+
+static int zcrx_export(struct io_ring_ctx *ctx, struct io_zcrx_ifq *ifq,
+               struct zcrx_ctrl *ctrl, void __user *arg)
+{
+    struct zcrx_ctrl_export *ce = &ctrl->zc_export;
+    struct file *file;
+    int fd = -1;
+
+    if (!mem_is_zero(ce, sizeof(*ce)))
+        return -EINVAL;
+    fd = get_unused_fd_flags(O_CLOEXEC);
+    if (fd < 0)
+        return fd;
+
+    ce->zcrx_fd = fd;
+    if (copy_to_user(arg, ctrl, sizeof(*ctrl))) {
+        put_unused_fd(fd);
+        return -EFAULT;
+    }
+
+    refcount_inc(&ifq->refs);
+    refcount_inc(&ifq->user_refs);
+
+    file = anon_inode_create_getfile("[zcrx]", &zcrx_box_fops,
+                     ifq, O_CLOEXEC, NULL);
+    if (IS_ERR(file)) {
+        put_unused_fd(fd);
+        zcrx_unregister(ifq);
+        return PTR_ERR(file);
+    }
+
+    fd_install(fd, file);
+    return 0;
+}
+
+static int import_zcrx(struct io_ring_ctx *ctx,
+               struct io_uring_zcrx_ifq_reg __user *arg,
+               struct io_uring_zcrx_ifq_reg *reg)
+{
+    struct io_zcrx_ifq *ifq;
+    struct file *file;
+    int fd, ret;
+    u32 id;
+
+    if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+        return -EINVAL;
+    if (!(ctx->flags & (IORING_SETUP_CQE32|IORING_SETUP_CQE_MIXED)))
+        return -EINVAL;
+    if (reg->if_rxq || reg->rq_entries || reg->area_ptr || reg->region_ptr)
+        return -EINVAL;
+
+    fd = reg->if_idx;
+    CLASS(fd, f)(fd);
+    if (fd_empty(f))
+        return -EBADF;
+
+    file = fd_file(f);
+    if (file->f_op != &zcrx_box_fops || !file->private_data)
+        return -EBADF;
+
+    ifq = file->private_data;
+    refcount_inc(&ifq->refs);
+    refcount_inc(&ifq->user_refs);
+
+    scoped_guard(mutex, &ctx->mmap_lock) {
+        ret = xa_alloc(&ctx->zcrx_ctxs, &id, NULL, xa_limit_31b, GFP_KERNEL);
+        if (ret)
+            goto err;
+    }
+
+    reg->zcrx_id = id;
+    io_fill_zcrx_offsets(&reg->offsets);
+    if (copy_to_user(arg, reg, sizeof(*reg))) {
+        ret = -EFAULT;
+        goto err_xa_erase;
+    }
+
+    scoped_guard(mutex, &ctx->mmap_lock) {
+        ret = -ENOMEM;
+        if (xa_store(&ctx->zcrx_ctxs, id, ifq, GFP_KERNEL))
+            goto err_xa_erase;
+    }
+
+    return 0;
+err_xa_erase:
+    scoped_guard(mutex, &ctx->mmap_lock)
+        xa_erase(&ctx->zcrx_ctxs, id);
+err:
+    zcrx_unregister(ifq);
+    return ret;
+}
+
 int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
              struct io_uring_zcrx_ifq_reg __user *arg)
 {
@@ -566,11 +742,13 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
         return -EINVAL;
     if (copy_from_user(&reg, arg, sizeof(reg)))
         return -EFAULT;
-    if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
-        return -EFAULT;
     if (!mem_is_zero(&reg.__resv, sizeof(reg.__resv)) ||
         reg.__resv2 || reg.zcrx_id)
         return -EINVAL;
+    if (reg.flags & ZCRX_REG_IMPORT)
+        return import_zcrx(ctx, arg, &reg);
+    if (copy_from_user(&rd, u64_to_user_ptr(reg.region_ptr), sizeof(rd)))
+        return -EFAULT;
     if (reg.if_rxq == -1 || !reg.rq_entries || reg.flags)
         return -EINVAL;
     if (reg.rq_entries > IO_RQ_MAX_ENTRIES) {
@@ -586,6 +764,15 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
     ifq = io_zcrx_ifq_alloc(ctx);
     if (!ifq)
         return -ENOMEM;
+
+    if (ctx->user) {
+        get_uid(ctx->user);
+        ifq->user = ctx->user;
+    }
+    if (ctx->mm_account) {
+        mmgrab(ctx->mm_account);
+        ifq->mm_account = ctx->mm_account;
+    }
     ifq->rq_entries = reg.rq_entries;
 
     scoped_guard(mutex, &ctx->mmap_lock) {
@@ -595,33 +782,34 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
             goto ifq_free;
     }
 
-    ret = io_allocate_rbuf_ring(ifq, &reg, &rd, id);
+    ret = io_allocate_rbuf_ring(ctx, ifq, &reg, &rd, id);
     if (ret)
         goto err;
 
-    ifq->netdev = netdev_get_by_index(current->nsproxy->net_ns, reg.if_idx,
-                      &ifq->netdev_tracker, GFP_KERNEL);
+    ifq->netdev = netdev_get_by_index_lock(current->nsproxy->net_ns, reg.if_idx);
     if (!ifq->netdev) {
         ret = -ENODEV;
         goto err;
     }
+    netdev_hold(ifq->netdev, &ifq->netdev_tracker, GFP_KERNEL);
 
     ifq->dev = netdev_queue_get_dma_dev(ifq->netdev, reg.if_rxq);
     if (!ifq->dev) {
         ret = -EOPNOTSUPP;
-        goto err;
+        goto netdev_put_unlock;
     }
     get_device(ifq->dev);
 
     ret = io_zcrx_create_area(ifq, &area);
     if (ret)
-        goto err;
+        goto netdev_put_unlock;
 
     mp_param.mp_ops = &io_uring_pp_zc_ops;
     mp_param.mp_priv = ifq;
-    ret = net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param);
+    ret = __net_mp_open_rxq(ifq->netdev, reg.if_rxq, &mp_param, NULL);
     if (ret)
-        goto err;
+        goto netdev_put_unlock;
+    netdev_unlock(ifq->netdev);
     ifq->if_rxq = reg.if_rxq;
 
     reg.zcrx_id = id;
@@ -640,6 +828,9 @@ int io_register_zcrx_ifq(struct io_ring_ctx *ctx,
         goto err;
     }
     return 0;
+netdev_put_unlock:
+    netdev_put(ifq->netdev, &ifq->netdev_tracker);
+    netdev_unlock(ifq->netdev);
 err:
     scoped_guard(mutex, &ctx->mmap_lock)
         xa_erase(&ctx->zcrx_ctxs, id);
@@ -648,6 +839,16 @@ ifq_free:
     return ret;
 }
 
+static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
+{
+    unsigned niov_idx;
+
+    lockdep_assert_held(&area->freelist_lock);
+
+    niov_idx = area->freelist[--area->free_count];
+    return &area->nia.niovs[niov_idx];
+}
+
 void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
 {
     struct io_zcrx_ifq *ifq;
@@ -664,77 +865,12 @@ void io_unregister_zcrx_ifqs(struct io_ring_ctx *ctx)
         }
         if (!ifq)
             break;
-        io_zcrx_ifq_free(ifq);
+        zcrx_unregister(ifq);
     }
 
     xa_destroy(&ctx->zcrx_ctxs);
 }
 
-static struct net_iov *__io_zcrx_get_free_niov(struct io_zcrx_area *area)
-{
-    unsigned niov_idx;
-
-    lockdep_assert_held(&area->freelist_lock);
-
-    niov_idx = area->freelist[--area->free_count];
-    return &area->nia.niovs[niov_idx];
-}
-
-static void io_zcrx_return_niov_freelist(struct net_iov *niov)
-{
-    struct io_zcrx_area *area = io_zcrx_iov_to_area(niov);
-
-    spin_lock_bh(&area->freelist_lock);
-    area->freelist[area->free_count++] = net_iov_idx(niov);
-    spin_unlock_bh(&area->freelist_lock);
-}
-
-static void io_zcrx_return_niov(struct net_iov *niov)
-{
-    netmem_ref netmem = net_iov_to_netmem(niov);
-
-    if (!niov->pp) {
-        /* copy fallback allocated niovs */
-        io_zcrx_return_niov_freelist(niov);
-        return;
-    }
-    page_pool_put_unrefed_netmem(niov->pp, netmem, -1, false);
-}
-
-static void io_zcrx_scrub(struct io_zcrx_ifq *ifq)
-{
-    struct io_zcrx_area *area = ifq->area;
-    int i;
-
-    if (!area)
-        return;
-
-    /* Reclaim back all buffers given to the user space. */
-    for (i = 0; i < area->nia.num_niovs; i++) {
-        struct net_iov *niov = &area->nia.niovs[i];
-        int nr;
-
-        if (!atomic_read(io_get_user_counter(niov)))
-            continue;
-        nr = atomic_xchg(io_get_user_counter(niov), 0);
-        if (nr && !page_pool_unref_netmem(net_iov_to_netmem(niov), nr))
-            io_zcrx_return_niov(niov);
-    }
-}
-
-void io_shutdown_zcrx_ifqs(struct io_ring_ctx *ctx)
-{
-    struct io_zcrx_ifq *ifq;
-    unsigned long index;
-
-    lockdep_assert_held(&ctx->uring_lock);
-
-    xa_for_each(&ctx->zcrx_ctxs, index, ifq) {
-        io_zcrx_scrub(ifq);
-        io_close_queue(ifq);
-    }
-}
-
 static inline u32 io_zcrx_rqring_entries(struct io_zcrx_ifq *ifq)
 {
     u32 entries;
@@ -800,7 +936,7 @@ static void io_zcrx_ring_refill(struct page_pool *pp,
         if (!page_pool_unref_and_test(netmem))
             continue;
 
-        if (unlikely(niov->pp != pp)) {
+        if (unlikely(niov->desc.pp != pp)) {
             io_zcrx_return_niov(niov);
             continue;
         }
@@ -880,15 +1016,13 @@ static int io_pp_zc_init(struct page_pool *pp)
     if (ret)
         return ret;
 
-    percpu_ref_get(&ifq->ctx->refs);
+    refcount_inc(&ifq->refs);
     return 0;
 }
 
 static void io_pp_zc_destroy(struct page_pool *pp)
 {
-    struct io_zcrx_ifq *ifq = io_pp_to_ifq(pp);
-
-    percpu_ref_put(&ifq->ctx->refs);
+    io_put_zcrx_ifq(io_pp_to_ifq(pp));
 }
 
 static int io_pp_nl_fill(void *mp_priv, struct sk_buff *rsp,
@@ -928,6 +1062,97 @@ static const struct memory_provider_ops io_uring_pp_zc_ops = {
     .uninstall = io_pp_uninstall,
 };
 
+static unsigned zcrx_parse_rq(netmem_ref *netmem_array, unsigned nr,
+                  struct io_zcrx_ifq *zcrx)
+{
+    unsigned int mask = zcrx->rq_entries - 1;
+    unsigned int i;
+
+    guard(spinlock_bh)(&zcrx->rq_lock);
+
+    nr = min(nr, io_zcrx_rqring_entries(zcrx));
+    for (i = 0; i < nr; i++) {
+        struct io_uring_zcrx_rqe *rqe = io_zcrx_get_rqe(zcrx, mask);
+        struct net_iov *niov;
+
+        if (!io_parse_rqe(rqe, zcrx, &niov))
+            break;
+        netmem_array[i] = net_iov_to_netmem(niov);
+    }
+
+    smp_store_release(&zcrx->rq_ring->head, zcrx->cached_rq_head);
+    return i;
+}
+
+#define ZCRX_FLUSH_BATCH 32
+
+static void zcrx_return_buffers(netmem_ref *netmems, unsigned nr)
+{
+    unsigned i;
+
+    for (i = 0; i < nr; i++) {
+        netmem_ref netmem = netmems[i];
+        struct net_iov *niov = netmem_to_net_iov(netmem);
+
+        if (!io_zcrx_put_niov_uref(niov))
+            continue;
+        if (!page_pool_unref_and_test(netmem))
+            continue;
+        io_zcrx_return_niov(niov);
+    }
+}
+
+static int zcrx_flush_rq(struct io_ring_ctx *ctx, struct io_zcrx_ifq *zcrx,
+             struct zcrx_ctrl *ctrl)
+{
+    struct zcrx_ctrl_flush_rq *frq = &ctrl->zc_flush;
+    netmem_ref netmems[ZCRX_FLUSH_BATCH];
+    unsigned total = 0;
+    unsigned nr;
+
+    if (!mem_is_zero(&frq->__resv, sizeof(frq->__resv)))
+        return -EINVAL;
+
+    do {
+        nr = zcrx_parse_rq(netmems, ZCRX_FLUSH_BATCH, zcrx);
+
+        zcrx_return_buffers(netmems, nr);
+        total += nr;
+
+        if (fatal_signal_pending(current))
+            break;
+        cond_resched();
+    } while (nr == ZCRX_FLUSH_BATCH && total < zcrx->rq_entries);
+
+    return 0;
+}
+
+int io_zcrx_ctrl(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args)
+{
+    struct zcrx_ctrl ctrl;
+    struct io_zcrx_ifq *zcrx;
+
+    if (nr_args)
+        return -EINVAL;
+    if (copy_from_user(&ctrl, arg, sizeof(ctrl)))
+        return -EFAULT;
+    if (!mem_is_zero(&ctrl.__resv, sizeof(ctrl.__resv)))
+        return -EFAULT;
+
+    zcrx = xa_load(&ctx->zcrx_ctxs, ctrl.zcrx_id);
+    if (!zcrx)
+        return -ENXIO;
+
+    switch (ctrl.op) {
+    case ZCRX_CTRL_FLUSH_RQ:
+        return zcrx_flush_rq(ctx, zcrx, &ctrl);
+    case ZCRX_CTRL_EXPORT:
+        return zcrx_export(ctx, zcrx, &ctrl, arg);
+    }
+
+    return -EOPNOTSUPP;
+}
+
 static bool io_zcrx_queue_cqe(struct io_kiocb *req, struct net_iov *niov,
                   struct io_zcrx_ifq *ifq, int off, int len)
 {
@@ -1069,13 +1294,15 @@ static int io_zcrx_recv_frag(struct io_kiocb *req, struct io_zcrx_ifq *ifq,
                  const skb_frag_t *frag, int off, int len)
 {
     struct net_iov *niov;
+    struct page_pool *pp;
 
     if (unlikely(!skb_frag_is_net_iov(frag)))
        return io_zcrx_copy_frag(req, ifq, frag, off, len);
 
     niov = netmem_to_net_iov(frag->netmem);
-    if (!niov->pp || niov->pp->mp_ops != &io_uring_pp_zc_ops ||
-        io_pp_to_ifq(niov->pp) != ifq)
+    pp = niov->desc.pp;
+
+    if (!pp || pp->mp_ops != &io_uring_pp_zc_ops || io_pp_to_ifq(pp) != ifq)
         return -EFAULT;
 
     if (!io_zcrx_queue_cqe(req, niov, ifq, off + skb_frag_off(frag), len))
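For reference, the import half of the new flow can be driven from userspace with the existing registration ABI. The sketch below is a minimal, hypothetical example: ZCRX_REG_IMPORT and the "[zcrx]" fd come from this series (the flag's numeric value is assumed here), and the export step that produces the fd via the new ZCRX_CTRL_EXPORT op is not shown because struct zcrx_ctrl is defined outside this file. struct io_uring_zcrx_ifq_reg and IORING_REGISTER_ZCRX_IFQ are the existing interface. Per import_zcrx() above, the importing ring must use IORING_SETUP_DEFER_TASKRUN plus a 32-byte CQE mode, all other registration fields stay zero, and if_idx carries the zcrx fd instead of a netdev index.

/* Hypothetical userspace sketch; ZCRX_REG_IMPORT value is an assumption. */
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

#ifndef ZCRX_REG_IMPORT
#define ZCRX_REG_IMPORT (1U << 0)   /* assumed flag value, see uapi patch */
#endif

/*
 * ring_fd: ring created with IORING_SETUP_DEFER_TASKRUN and IORING_SETUP_CQE32
 *          (or CQE_MIXED), as import_zcrx() requires.
 * zcrx_fd: "[zcrx]" anon fd exported by another ring via ZCRX_CTRL_EXPORT.
 * Returns the zcrx_id allocated in the importing ring, or -errno.
 */
static int zcrx_import(int ring_fd, int zcrx_fd)
{
    struct io_uring_zcrx_ifq_reg reg;

    /* All other fields must be zero for an import, per import_zcrx(). */
    memset(&reg, 0, sizeof(reg));
    reg.flags = ZCRX_REG_IMPORT;
    reg.if_idx = zcrx_fd;           /* if_idx carries the fd, not an ifindex */

    if (syscall(__NR_io_uring_register, ring_fd,
                IORING_REGISTER_ZCRX_IFQ, &reg, 1) < 0)
        return -errno;

    /* On success the kernel filled in reg.zcrx_id and reg.offsets. */
    return reg.zcrx_id;
}

The same exported fd can be imported into several rings; each import gets its own zcrx_id in that ring's xarray while the underlying interface queue is shared, which is what the refs/user_refs pair in the kernel-side patch accounts for.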
