From 42b33468987bac0dd95c30f14820c7abac04a153 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 31 May 2018 10:59:47 +0200 Subject: xdp: add flags argument to ndo_xdp_xmit API This patch only changes the API and rejects any use of flags. This is an intermediate step that allows us to implement the flush flag operation later, for each individual driver in a separate patch. The plan is to implement the flush operation via the XDP_XMIT_FLUSH flag and then remove XDP_XMIT_FLAGS_NONE when done. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 7ad779237ae8..0c45f0f943ed 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -40,6 +40,11 @@ enum xdp_mem_type { MEM_TYPE_MAX, }; +/* XDP flags for ndo_xdp_xmit */ +#define XDP_XMIT_FLAGS_NONE 0U +#define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ +#define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH + struct xdp_mem_info { u32 type; /* enum xdp_mem_type, but known size type */ u32 id; -- cgit v1.2.3 From 73de5717eb30ceedef6abf9da4bc2194f25d92ca Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 31 May 2018 11:00:13 +0200 Subject: xdp: done implementing ndo_xdp_xmit flush flag for all drivers Removing XDP_XMIT_FLAGS_NONE as all drivers now implement a flush operation in their ndo_xdp_xmit call. The compiler will catch if any users of XDP_XMIT_FLAGS_NONE remain. Signed-off-by: Jesper Dangaard Brouer Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index 0c45f0f943ed..a3b71a4dd71d 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -41,7 +41,6 @@ enum xdp_mem_type { }; /* XDP flags for ndo_xdp_xmit */ -#define XDP_XMIT_FLAGS_NONE 0U #define XDP_XMIT_FLUSH (1U << 0) /* doorbell signal consumer */ #define XDP_XMIT_FLAGS_MASK XDP_XMIT_FLUSH -- cgit v1.2.3 From e61e62b9e2cc14b336f330f37f517f9d373ff31e Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 4 Jun 2018 14:05:51 +0200 Subject: xsk: moved struct xdp_umem definition MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Moved struct xdp_umem to xdp_sock.h, in order to prepare for zero-copy support.
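Background sketch for the ndo_xdp_xmit flags API introduced in the first two patches above (illustration only, not part of any of these patches): a driver is expected to reject unknown flags and to ring its TX doorbell only when XDP_XMIT_FLUSH is set. The mydrv_* helpers below are hypothetical and not taken from any in-tree driver.

    int mydrv_xdp_xmit(struct net_device *dev, int n,
                       struct xdp_frame **frames, u32 flags)
    {
            struct mydrv_tx_ring *ring = mydrv_select_tx_ring(dev);
            int i, sent = 0;

            if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
                    return -EINVAL;                 /* unknown flags are rejected */

            for (i = 0; i < n; i++)
                    if (mydrv_xmit_xdp_frame(ring, frames[i]) == 0)
                            sent++;

            if (flags & XDP_XMIT_FLUSH)
                    mydrv_tx_doorbell(ring);        /* tell HW about the new descriptors */

            return sent;
    }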
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 24 +++++++++++++++++++++++- net/xdp/xdp_umem.c | 1 + net/xdp/xdp_umem.h | 22 +--------------------- net/xdp/xsk_queue.h | 3 +-- 4 files changed, 26 insertions(+), 24 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 7a647c56ec15..3a6cd88f179d 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -6,12 +6,34 @@ #ifndef _LINUX_XDP_SOCK_H #define _LINUX_XDP_SOCK_H +#include +#include #include +#include #include struct net_device; struct xsk_queue; -struct xdp_umem; + +struct xdp_umem_props { + u64 chunk_mask; + u64 size; +}; + +struct xdp_umem { + struct xsk_queue *fq; + struct xsk_queue *cq; + struct page **pgs; + struct xdp_umem_props props; + u32 headroom; + u32 chunk_size_nohr; + struct user_struct *user; + struct pid *pid; + unsigned long address; + refcount_t users; + struct work_struct work; + u32 npgs; +}; struct xdp_sock { /* struct sock must be the first member of struct xdp_sock */ diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 9ad791ff4739..2793a503223e 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -13,6 +13,7 @@ #include #include "xdp_umem.h" +#include "xsk_queue.h" #define XDP_UMEM_MIN_CHUNK_SIZE 2048 diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index aeadd1bcb72d..9433e8af650a 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -6,27 +6,7 @@ #ifndef XDP_UMEM_H_ #define XDP_UMEM_H_ -#include -#include -#include - -#include "xsk_queue.h" -#include "xdp_umem_props.h" - -struct xdp_umem { - struct xsk_queue *fq; - struct xsk_queue *cq; - struct page **pgs; - struct xdp_umem_props props; - u32 headroom; - u32 chunk_size_nohr; - struct user_struct *user; - struct pid *pid; - unsigned long address; - refcount_t users; - struct work_struct work; - u32 npgs; -}; +#include static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 337e5ad3b10e..5246ed420a16 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -8,8 +8,7 @@ #include #include - -#include "xdp_umem_props.h" +#include #define RX_BATCH_SIZE 16 -- cgit v1.2.3 From 8aef7340ae9695912a411886452ae9773206e845 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 4 Jun 2018 14:05:52 +0200 Subject: xsk: introduce xdp_umem_page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xdp_umem_page holds the address for a page. Trade memory for faster lookup. Later, we'll add DMA address here as well. 
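Illustration (not part of this patch): after this change a umem address resolves to a kernel virtual address with one array index and one mask, instead of going through the pinned page array. The sketch below restates xdp_umem_get_data() from the diff that follows, with a worked example in the comment; the helper name umem_addr_to_va is made up.

    /* umem address -> kernel virtual address; e.g. with 4 KiB pages,
     * addr = 0x6010 resolves to pages[6].addr + 0x10.
     */
    static inline char *umem_addr_to_va(struct xdp_umem *umem, u64 addr)
    {
            return umem->pages[addr >> PAGE_SHIFT].addr +   /* per-page base */
                   (addr & (PAGE_SIZE - 1));                /* offset within page */
    }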
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 7 ++++++- net/xdp/xdp_umem.c | 15 ++++++++++++++- net/xdp/xdp_umem.h | 3 +-- 3 files changed, 21 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 3a6cd88f179d..caf343a7e224 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -20,10 +20,14 @@ struct xdp_umem_props { u64 size; }; +struct xdp_umem_page { + void *addr; +}; + struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; - struct page **pgs; + struct xdp_umem_page *pages; struct xdp_umem_props props; u32 headroom; u32 chunk_size_nohr; @@ -32,6 +36,7 @@ struct xdp_umem { unsigned long address; refcount_t users; struct work_struct work; + struct page **pgs; u32 npgs; }; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 2793a503223e..aca826011f6c 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -65,6 +65,9 @@ static void xdp_umem_release(struct xdp_umem *umem) goto out; mmput(mm); + kfree(umem->pages); + umem->pages = NULL; + xdp_umem_unaccount_pages(umem); out: kfree(umem); @@ -155,7 +158,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) u32 chunk_size = mr->chunk_size, headroom = mr->headroom; unsigned int chunks, chunks_per_page; u64 addr = mr->addr, size = mr->len; - int size_chk, err; + int size_chk, err, i; if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) { /* Strictly speaking we could support this, if: @@ -213,6 +216,16 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) err = xdp_umem_pin_pages(umem); if (err) goto out_account; + + umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL); + if (!umem->pages) { + err = -ENOMEM; + goto out_account; + } + + for (i = 0; i < umem->npgs; i++) + umem->pages[i].addr = page_address(umem->pgs[i]); + return 0; out_account: diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 9433e8af650a..40e8fa4a92af 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -10,8 +10,7 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { - return page_address(umem->pgs[addr >> PAGE_SHIFT]) + - (addr & (PAGE_SIZE - 1)); + return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); } bool xdp_umem_validate_queues(struct xdp_umem *umem); -- cgit v1.2.3 From 02b55e5657c3a569fc681ba851e464cfa6b90d4f Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 4 Jun 2018 14:05:54 +0200 Subject: xdp: add MEM_TYPE_ZERO_COPY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Here, a new type of allocator support is added to the XDP return API. A zero-copy allocated xdp_buff cannot be converted to an xdp_frame. Instead, the buff has to be copied. This is not supported at all in this commit. Also, an opaque "handle" is added to xdp_buff. This can be used as a context for the zero-copy allocator implementation.
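Illustration (not part of this patch): the handle/allocator pair is intended to be used roughly as below by a zero-copy capable driver. Only struct zero_copy_allocator, the ->free() signature and xdp_rxq_info_reg_mem_model() come from the actual API; the mydrv_* names are hypothetical.

    /* Driver-private allocator state, embedding the callback struct */
    struct mydrv_zca {
            struct zero_copy_allocator zca;
            struct mydrv_rx_ring *ring;
    };

    static void mydrv_zca_free(struct zero_copy_allocator *alloc, unsigned long handle)
    {
            struct mydrv_zca *z = container_of(alloc, struct mydrv_zca, zca);

            /* recycle the umem buffer identified by 'handle' back to the RX ring */
            mydrv_reuse_rx_buffer(z->ring, handle);
    }

    static int mydrv_setup_zca(struct mydrv_zca *z, struct mydrv_rx_ring *ring)
    {
            z->ring = ring;
            z->zca.free = mydrv_zca_free;
            return xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, MEM_TYPE_ZERO_COPY,
                                              &z->zca);
    }

Each received xdp_buff then carries the matching cookie in xdp->handle, so that xdp_return_buff() can route the buffer back through ->free().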
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp.h | 10 ++++++++++ net/core/xdp.c | 19 ++++++++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp.h b/include/net/xdp.h index a3b71a4dd71d..2deea7166a34 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -37,6 +37,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, + MEM_TYPE_ZERO_COPY, MEM_TYPE_MAX, }; @@ -51,6 +52,10 @@ struct xdp_mem_info { struct page_pool; +struct zero_copy_allocator { + void (*free)(struct zero_copy_allocator *zca, unsigned long handle); +}; + struct xdp_rxq_info { struct net_device *dev; u32 queue_index; @@ -63,6 +68,7 @@ struct xdp_buff { void *data_end; void *data_meta; void *data_hard_start; + unsigned long handle; struct xdp_rxq_info *rxq; }; @@ -86,6 +92,10 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; + /* TODO: implement clone, copy, use "native" MEM_TYPE */ + if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) + return NULL; + /* Assure headroom is available for storing info */ headroom = xdp->data - xdp->data_hard_start; metasize = xdp->data - xdp->data_meta; diff --git a/net/core/xdp.c b/net/core/xdp.c index cb8c4e061a5a..9d1f22072d5d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -31,6 +31,7 @@ struct xdp_mem_allocator { union { void *allocator; struct page_pool *page_pool; + struct zero_copy_allocator *zc_alloc; }; struct rhash_head node; struct rcu_head rcu; @@ -261,7 +262,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, xdp_rxq->mem.type = type; if (!allocator) { - if (type == MEM_TYPE_PAGE_POOL) + if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY) return -EINVAL; /* Setup time check page_pool req */ return 0; } @@ -314,7 +315,8 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * is used for those calls sites. Thus, allowing for faster recycling * of xdp_frames/pages in those cases. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, + unsigned long handle) { struct xdp_mem_allocator *xa; struct page *page; @@ -338,6 +340,13 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; + case MEM_TYPE_ZERO_COPY: + /* NB! Only valid from an xdp_buff! 
*/ + rcu_read_lock(); + /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ + xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); + xa->zc_alloc->free(xa->zc_alloc, handle); + rcu_read_unlock(); default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ break; @@ -346,18 +355,18 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false); + __xdp_return(xdpf->data, &xdpf->mem, false, 0); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true); + __xdp_return(xdpf->data, &xdpf->mem, true, 0); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true); + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); } EXPORT_SYMBOL_GPL(xdp_return_buff); -- cgit v1.2.3 From 173d3adb6f437037f216270955886ca9878187a5 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 4 Jun 2018 14:05:55 +0200 Subject: xsk: add zero-copy support for Rx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the xsk_rcv to support the new MEM_TYPE_ZERO_COPY memory, and wireup ndo_bpf call in bind. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 6 +++ include/uapi/linux/if_xdp.h | 4 +- net/xdp/xdp_umem.c | 77 ++++++++++++++++++++++++++++++++++++ net/xdp/xdp_umem.h | 3 ++ net/xdp/xsk.c | 96 +++++++++++++++++++++++++++++++++++---------- 5 files changed, 165 insertions(+), 21 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index caf343a7e224..d93d3aac3fc9 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -22,6 +22,7 @@ struct xdp_umem_props { struct xdp_umem_page { void *addr; + dma_addr_t dma; }; struct xdp_umem { @@ -38,6 +39,9 @@ struct xdp_umem { struct work_struct work; struct page **pgs; u32 npgs; + struct net_device *dev; + u16 queue_id; + bool zc; }; struct xdp_sock { @@ -60,6 +64,8 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); +void xsk_umem_discard_addr(struct xdp_umem *umem); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { diff --git a/include/uapi/linux/if_xdp.h b/include/uapi/linux/if_xdp.h index e411d6f9ac65..1fa0e977ea8d 100644 --- a/include/uapi/linux/if_xdp.h +++ b/include/uapi/linux/if_xdp.h @@ -13,7 +13,9 @@ #include /* Options for the sxdp_flags field */ -#define XDP_SHARED_UMEM 1 +#define XDP_SHARED_UMEM (1 << 0) +#define XDP_COPY (1 << 1) /* Force copy-mode */ +#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ struct sockaddr_xdp { __u16 sxdp_family; diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index aca826011f6c..f729d79b8d91 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -17,6 +17,81 @@ #define XDP_UMEM_MIN_CHUNK_SIZE 2048 +int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, + u32 queue_id, u16 flags) +{ + bool force_zc, force_copy; + struct netdev_bpf bpf; + int err; + + force_zc = flags & XDP_ZEROCOPY; + force_copy = flags & XDP_COPY; + + if (force_zc && force_copy) + return -EINVAL; + + if (force_copy) + return 0; + + 
dev_hold(dev); + + if (dev->netdev_ops->ndo_bpf) { + bpf.command = XDP_QUERY_XSK_UMEM; + + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); + + if (err) { + dev_put(dev); + return force_zc ? -ENOTSUPP : 0; + } + + bpf.command = XDP_SETUP_XSK_UMEM; + bpf.xsk.umem = umem; + bpf.xsk.queue_id = queue_id; + + rtnl_lock(); + err = dev->netdev_ops->ndo_bpf(dev, &bpf); + rtnl_unlock(); + + if (err) { + dev_put(dev); + return force_zc ? err : 0; /* fail or fallback */ + } + + umem->dev = dev; + umem->queue_id = queue_id; + umem->zc = true; + return 0; + } + + dev_put(dev); + return force_zc ? -ENOTSUPP : 0; /* fail or fallback */ +} + +void xdp_umem_clear_dev(struct xdp_umem *umem) +{ + struct netdev_bpf bpf; + int err; + + if (umem->dev) { + bpf.command = XDP_SETUP_XSK_UMEM; + bpf.xsk.umem = NULL; + bpf.xsk.queue_id = umem->queue_id; + + rtnl_lock(); + err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf); + rtnl_unlock(); + + if (err) + WARN(1, "failed to disable umem!\n"); + + dev_put(umem->dev); + umem->dev = NULL; + } +} + static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unsigned int i; @@ -43,6 +118,8 @@ static void xdp_umem_release(struct xdp_umem *umem) struct task_struct *task; struct mm_struct *mm; + xdp_umem_clear_dev(umem); + if (umem->fq) { xskq_destroy(umem->fq); umem->fq = NULL; diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 40e8fa4a92af..674508a32a4d 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -13,6 +13,9 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); } +int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, + u32 queue_id, u16 flags); +void xdp_umem_clear_dev(struct xdp_umem *umem); bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 4688c750df1d..ab64bd8260ea 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -36,19 +36,28 @@ static struct xdp_sock *xdp_sk(struct sock *sk) bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { - return !!xs->rx; + return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && + READ_ONCE(xs->umem->fq); } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) +{ + return xskq_peek_addr(umem->fq, addr); +} +EXPORT_SYMBOL(xsk_umem_peek_addr); + +void xsk_umem_discard_addr(struct xdp_umem *umem) +{ + xskq_discard_addr(umem->fq); +} +EXPORT_SYMBOL(xsk_umem_discard_addr); + +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - u32 len = xdp->data_end - xdp->data; void *buffer; u64 addr; int err; - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) - return -EINVAL; - if (!xskq_peek_addr(xs->umem->fq, &addr) || len > xs->umem->chunk_size_nohr) { xs->rx_dropped++; @@ -60,25 +69,41 @@ static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) buffer = xdp_umem_get_data(xs->umem, addr); memcpy(buffer, xdp->data, len); err = xskq_produce_batch_desc(xs->rx, addr, len); - if (!err) + if (!err) { xskq_discard_addr(xs->umem->fq); - else - xs->rx_dropped++; + xdp_return_buff(xdp); + return 0; + } + xs->rx_dropped++; return err; } -int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - int err; + int err = xskq_produce_batch_desc(xs->rx, (u64)xdp->handle, len); - err = 
__xsk_rcv(xs, xdp); - if (likely(!err)) + if (err) { xdp_return_buff(xdp); + xs->rx_dropped++; + } return err; } +int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +{ + u32 len; + + if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) + return -EINVAL; + + len = xdp->data_end - xdp->data; + + return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? + __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); +} + void xsk_flush(struct xdp_sock *xs) { xskq_produce_flush_desc(xs->rx); @@ -87,12 +112,29 @@ void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { + u32 len = xdp->data_end - xdp->data; + void *buffer; + u64 addr; int err; - err = __xsk_rcv(xs, xdp); - if (!err) + if (!xskq_peek_addr(xs->umem->fq, &addr) || + len > xs->umem->chunk_size_nohr) { + xs->rx_dropped++; + return -ENOSPC; + } + + addr += xs->umem->headroom; + + buffer = xdp_umem_get_data(xs->umem, addr); + memcpy(buffer, xdp->data, len); + err = xskq_produce_batch_desc(xs->rx, addr, len); + if (!err) { + xskq_discard_addr(xs->umem->fq); xsk_flush(xs); + return 0; + } + xs->rx_dropped++; return err; } @@ -291,6 +333,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); struct net_device *dev; + u32 flags, qid; int err = 0; if (addr_len < sizeof(struct sockaddr_xdp)) @@ -315,16 +358,26 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } - if ((xs->rx && sxdp->sxdp_queue_id >= dev->real_num_rx_queues) || - (xs->tx && sxdp->sxdp_queue_id >= dev->real_num_tx_queues)) { + qid = sxdp->sxdp_queue_id; + + if ((xs->rx && qid >= dev->real_num_rx_queues) || + (xs->tx && qid >= dev->real_num_tx_queues)) { err = -EINVAL; goto out_unlock; } - if (sxdp->sxdp_flags & XDP_SHARED_UMEM) { + flags = sxdp->sxdp_flags; + + if (flags & XDP_SHARED_UMEM) { struct xdp_sock *umem_xs; struct socket *sock; + if ((flags & XDP_COPY) || (flags & XDP_ZEROCOPY)) { + /* Cannot specify flags for shared sockets. */ + err = -EINVAL; + goto out_unlock; + } + if (xs->umem) { /* We have already our own. */ err = -EINVAL; @@ -343,8 +396,7 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) err = -EBADF; sockfd_put(sock); goto out_unlock; - } else if (umem_xs->dev != dev || - umem_xs->queue_id != sxdp->sxdp_queue_id) { + } else if (umem_xs->dev != dev || umem_xs->queue_id != qid) { err = -EINVAL; sockfd_put(sock); goto out_unlock; @@ -360,6 +412,10 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) /* This xsk has its own umem. */ xskq_set_umem(xs->umem->fq, &xs->umem->props); xskq_set_umem(xs->umem->cq, &xs->umem->props); + + err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); + if (err) + goto out_unlock; } xs->dev = dev; -- cgit v1.2.3 From ac98d8aab61baf785eb8f099b36daf34fc76a70e Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Mon, 4 Jun 2018 14:05:57 +0200 Subject: xsk: wire upp Tx zero-copy functions Here we add the functionality required to support zero-copy Tx, and also exposes various zero-copy related functions for the netdevs. 
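Illustration (not part of this patch set): the expected driver-side calling pattern for the new Tx helpers, based only on the signatures added below. The mydrv_* functions are hypothetical.

    /* Called from the driver's NAPI poll and from ndo_xsk_async_xmit */
    static void mydrv_xsk_tx(struct mydrv_tx_ring *ring, struct xdp_umem *umem)
    {
            u32 completed, budget = 64;
            dma_addr_t dma;
            u32 len;

            /* 1. reclaim descriptors the HW has finished with */
            completed = mydrv_clean_tx_ring(ring);
            if (completed)
                    xsk_umem_complete_tx(umem, completed);  /* post to the completion ring */

            /* 2. pull new descriptors from the sockets' Tx rings */
            while (budget-- && mydrv_tx_ring_has_room(ring) &&
                   xsk_umem_consume_tx(umem, &dma, &len))
                    mydrv_post_tx_desc(ring, dma, len);

            mydrv_tx_doorbell(ring);

            /* 3. wake sockets waiting for completions */
            xsk_umem_consume_tx_done(umem);
    }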
Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann --- include/net/xdp_sock.h | 9 +++++++ net/xdp/xdp_umem.c | 29 +++++++++++++++++++-- net/xdp/xdp_umem.h | 8 +++++- net/xdp/xsk.c | 70 +++++++++++++++++++++++++++++++++++++++++++++----- net/xdp/xsk_queue.h | 32 ++++++++++++++++++++++- 5 files changed, 137 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index d93d3aac3fc9..9fe472f2ac95 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,8 @@ struct xdp_umem { struct net_device *dev; u16 queue_id; bool zc; + spinlock_t xsk_list_lock; + struct list_head xsk_list; }; struct xdp_sock { @@ -53,6 +56,8 @@ struct xdp_sock { struct list_head flush_node; u16 queue_id; struct xsk_queue *tx ____cacheline_aligned_in_smp; + struct list_head list; + bool zc; /* Protects multiple processes in the control path */ struct mutex mutex; u64 rx_dropped; @@ -64,8 +69,12 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp); void xsk_flush(struct xdp_sock *xs); bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs); +/* Used from netdev driver */ u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); void xsk_umem_discard_addr(struct xdp_umem *umem); +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); +bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len); +void xsk_umem_consume_tx_done(struct xdp_umem *umem); #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index f729d79b8d91..7eb4948a38d2 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -17,6 +17,29 @@ #define XDP_UMEM_MIN_CHUNK_SIZE 2048 +void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) +{ + unsigned long flags; + + spin_lock_irqsave(&umem->xsk_list_lock, flags); + list_add_rcu(&xs->list, &umem->xsk_list); + spin_unlock_irqrestore(&umem->xsk_list_lock, flags); +} + +void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs) +{ + unsigned long flags; + + if (xs->dev) { + spin_lock_irqsave(&umem->xsk_list_lock, flags); + list_del_rcu(&xs->list); + spin_unlock_irqrestore(&umem->xsk_list_lock, flags); + + if (umem->zc) + synchronize_net(); + } +} + int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u32 queue_id, u16 flags) { @@ -35,7 +58,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, dev_hold(dev); - if (dev->netdev_ops->ndo_bpf) { + if (dev->netdev_ops->ndo_bpf && dev->netdev_ops->ndo_xsk_async_xmit) { bpf.command = XDP_QUERY_XSK_UMEM; rtnl_lock(); @@ -70,7 +93,7 @@ int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, return force_zc ? 
-ENOTSUPP : 0; /* fail or fallback */ } -void xdp_umem_clear_dev(struct xdp_umem *umem) +static void xdp_umem_clear_dev(struct xdp_umem *umem) { struct netdev_bpf bpf; int err; @@ -283,6 +306,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; + INIT_LIST_HEAD(&umem->xsk_list); + spin_lock_init(&umem->xsk_list_lock); refcount_set(&umem->users, 1); diff --git a/net/xdp/xdp_umem.h b/net/xdp/xdp_umem.h index 674508a32a4d..f11560334f88 100644 --- a/net/xdp/xdp_umem.h +++ b/net/xdp/xdp_umem.h @@ -13,12 +13,18 @@ static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) return umem->pages[addr >> PAGE_SHIFT].addr + (addr & (PAGE_SIZE - 1)); } +static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) +{ + return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1)); +} + int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev, u32 queue_id, u16 flags); -void xdp_umem_clear_dev(struct xdp_umem *umem); bool xdp_umem_validate_queues(struct xdp_umem *umem); void xdp_get_umem(struct xdp_umem *umem); void xdp_put_umem(struct xdp_umem *umem); +void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs); +void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs); struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr); #endif /* XDP_UMEM_H_ */ diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index ab64bd8260ea..ddca4bf1cfc8 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -138,6 +139,59 @@ int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) return err; } +void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) +{ + xskq_produce_flush_addr_n(umem->cq, nb_entries); +} +EXPORT_SYMBOL(xsk_umem_complete_tx); + +void xsk_umem_consume_tx_done(struct xdp_umem *umem) +{ + struct xdp_sock *xs; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + xs->sk.sk_write_space(&xs->sk); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(xsk_umem_consume_tx_done); + +bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len) +{ + struct xdp_desc desc; + struct xdp_sock *xs; + + rcu_read_lock(); + list_for_each_entry_rcu(xs, &umem->xsk_list, list) { + if (!xskq_peek_desc(xs->tx, &desc)) + continue; + + if (xskq_produce_addr_lazy(umem->cq, desc.addr)) + goto out; + + *dma = xdp_umem_get_dma(umem, desc.addr); + *len = desc.len; + + xskq_discard_desc(xs->tx); + rcu_read_unlock(); + return true; + } + +out: + rcu_read_unlock(); + return false; +} +EXPORT_SYMBOL(xsk_umem_consume_tx); + +static int xsk_zc_xmit(struct sock *sk) +{ + struct xdp_sock *xs = xdp_sk(sk); + struct net_device *dev = xs->dev; + + return dev->netdev_ops->ndo_xsk_async_xmit(dev, xs->queue_id); +} + static void xsk_destruct_skb(struct sk_buff *skb) { u64 addr = (u64)(long)skb_shinfo(skb)->destructor_arg; @@ -151,7 +205,6 @@ static void xsk_destruct_skb(struct sk_buff *skb) static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, size_t total_len) { - bool need_wait = !(m->msg_flags & MSG_DONTWAIT); u32 max_batch = TX_BATCH_SIZE; struct xdp_sock *xs = xdp_sk(sk); bool sent_frame = false; @@ -161,8 +214,6 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, if (unlikely(!xs->tx)) return -ENOBUFS; - if (need_wait) - return -EOPNOTSUPP; mutex_lock(&xs->mutex); @@ -192,7 +243,7 @@ static int xsk_generic_xmit(struct sock *sk, struct msghdr *m, goto out; } - skb = 
sock_alloc_send_skb(sk, len, !need_wait, &err); + skb = sock_alloc_send_skb(sk, len, 1, &err); if (unlikely(!skb)) { err = -EAGAIN; goto out; @@ -235,6 +286,7 @@ out: static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) { + bool need_wait = !(m->msg_flags & MSG_DONTWAIT); struct sock *sk = sock->sk; struct xdp_sock *xs = xdp_sk(sk); @@ -242,8 +294,10 @@ static int xsk_sendmsg(struct socket *sock, struct msghdr *m, size_t total_len) return -ENXIO; if (unlikely(!(xs->dev->flags & IFF_UP))) return -ENETDOWN; + if (need_wait) + return -EOPNOTSUPP; - return xsk_generic_xmit(sk, m, total_len); + return (xs->zc) ? xsk_zc_xmit(sk) : xsk_generic_xmit(sk, m, total_len); } static unsigned int xsk_poll(struct file *file, struct socket *sock, @@ -419,10 +473,11 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) } xs->dev = dev; - xs->queue_id = sxdp->sxdp_queue_id; - + xs->zc = xs->umem->zc; + xs->queue_id = qid; xskq_set_umem(xs->rx, &xs->umem->props); xskq_set_umem(xs->tx, &xs->umem->props); + xdp_add_sk_umem(xs->umem, xs); out_unlock: if (err) @@ -660,6 +715,7 @@ static void xsk_destruct(struct sock *sk) xskq_destroy(xs->rx); xskq_destroy(xs->tx); + xdp_del_sk_umem(xs->umem, xs); xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 5246ed420a16..ef6a6f0ec949 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -11,6 +11,7 @@ #include #define RX_BATCH_SIZE 16 +#define LAZY_UPDATE_THRESHOLD 128 struct xdp_ring { u32 producer ____cacheline_aligned_in_smp; @@ -61,9 +62,14 @@ static inline u32 xskq_nb_avail(struct xsk_queue *q, u32 dcnt) return (entries > dcnt) ? dcnt : entries; } +static inline u32 xskq_nb_free_lazy(struct xsk_queue *q, u32 producer) +{ + return q->nentries - (producer - q->cons_tail); +} + static inline u32 xskq_nb_free(struct xsk_queue *q, u32 producer, u32 dcnt) { - u32 free_entries = q->nentries - (producer - q->cons_tail); + u32 free_entries = xskq_nb_free_lazy(q, producer); if (free_entries >= dcnt) return free_entries; @@ -123,6 +129,9 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + if (xskq_nb_free(q, q->prod_tail, LAZY_UPDATE_THRESHOLD) == 0) + return -ENOSPC; + ring->desc[q->prod_tail++ & q->ring_mask] = addr; /* Order producer and data */ @@ -132,6 +141,27 @@ static inline int xskq_produce_addr(struct xsk_queue *q, u64 addr) return 0; } +static inline int xskq_produce_addr_lazy(struct xsk_queue *q, u64 addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + if (xskq_nb_free(q, q->prod_head, LAZY_UPDATE_THRESHOLD) == 0) + return -ENOSPC; + + ring->desc[q->prod_head++ & q->ring_mask] = addr; + return 0; +} + +static inline void xskq_produce_flush_addr_n(struct xsk_queue *q, + u32 nb_entries) +{ + /* Order producer and data */ + smp_wmb(); + + q->prod_tail += nb_entries; + WRITE_ONCE(q->ring->producer, q->prod_tail); +} + static inline int xskq_reserve_addr(struct xsk_queue *q) { if (xskq_nb_free(q, q->prod_head, 1) == 0) -- cgit v1.2.3
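Illustration (not part of the patches above): stitched together, the Rx side of a zero-copy driver is expected to look roughly like this, using the fill-queue helpers and the MEM_TYPE_ZERO_COPY return path introduced in this series. The mydrv_* names are hypothetical.

    /* Refill the HW RX ring from the umem fill queue */
    static void mydrv_refill_rx_ring(struct mydrv_rx_ring *ring, struct xdp_umem *umem)
    {
            u64 addr;

            while (mydrv_rx_ring_has_room(ring) && xsk_umem_peek_addr(umem, &addr)) {
                    mydrv_post_rx_desc(ring, addr);   /* driver does its own addr -> DMA mapping */
                    xsk_umem_discard_addr(umem);      /* consume the fill queue entry */
            }
    }

    /* Per received descriptor: build a zero-copy xdp_buff and run the XDP program */
    static void mydrv_handle_rx_desc(struct mydrv_rx_ring *ring, struct bpf_prog *prog,
                                     u64 addr, u32 len, u32 headroom)
    {
            struct xdp_buff xdp;
            u32 act;

            xdp.data_hard_start = mydrv_umem_va(ring->umem, addr);
            xdp.data = xdp.data_hard_start + headroom;
            xdp.data_end = xdp.data + len;
            xdp.data_meta = xdp.data;
            xdp.handle = addr;                /* cookie handed back via zca->free() */
            xdp.rxq = &ring->xdp_rxq;         /* registered with MEM_TYPE_ZERO_COPY */

            act = bpf_prog_run_xdp(prog, &xdp);
            if (act == XDP_REDIRECT)
                    xdp_do_redirect(ring->netdev, &xdp, prog);  /* AF_XDP sockets land in __xsk_rcv_zc() */
            else
                    xdp_return_buff(&xdp);    /* e.g. XDP_DROP: buffer goes back via zca->free() */
    }

With MEM_TYPE_ZERO_COPY, __xsk_rcv_zc() only posts (handle, len) to the socket's Rx ring; the memcpy() in __xsk_rcv() is what the copy fallback path still pays.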