From 28155524eaa2eabdc97e588db195d0a45d7e4d6f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 Mar 2020 15:16:06 -0500 Subject: SUNRPC: Clean up: Replace dprintk and BUG_ON call sites in svcauth_gss.c Signed-off-by: Chuck Lever --- include/trace/events/rpcgss.h | 59 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/trace/events/rpcgss.h b/include/trace/events/rpcgss.h index 9827f535f032..32d88c4fb063 100644 --- a/include/trace/events/rpcgss.h +++ b/include/trace/events/rpcgss.h @@ -126,7 +126,7 @@ DEFINE_GSSAPI_EVENT(verify_mic); DEFINE_GSSAPI_EVENT(wrap); DEFINE_GSSAPI_EVENT(unwrap); -TRACE_EVENT(rpcgss_accept_upcall, +TRACE_EVENT(rpcgss_svc_accept_upcall, TP_PROTO( __be32 xid, u32 major_status, @@ -154,6 +154,29 @@ TRACE_EVENT(rpcgss_accept_upcall, ) ); +TRACE_EVENT(rpcgss_svc_accept, + TP_PROTO( + __be32 xid, + size_t len + ), + + TP_ARGS(xid, len), + + TP_STRUCT__entry( + __field(u32, xid) + __field(size_t, len) + ), + + TP_fast_assign( + __entry->xid = be32_to_cpu(xid); + __entry->len = len; + ), + + TP_printk("xid=0x%08x len=%zu", + __entry->xid, __entry->len + ) +); + /** ** GSS auth unwrap failures @@ -268,6 +291,40 @@ TRACE_EVENT(rpcgss_need_reencode, __entry->ret ? "" : "un") ); +DECLARE_EVENT_CLASS(rpcgss_svc_seqno_class, + TP_PROTO( + __be32 xid, + u32 seqno + ), + + TP_ARGS(xid, seqno), + + TP_STRUCT__entry( + __field(u32, xid) + __field(u32, seqno) + ), + + TP_fast_assign( + __entry->xid = be32_to_cpu(xid); + __entry->seqno = seqno; + ), + + TP_printk("xid=0x%08x seqno=%u, request discarded", + __entry->xid, __entry->seqno) +); + +#define DEFINE_SVC_SEQNO_EVENT(name) \ + DEFINE_EVENT(rpcgss_svc_seqno_class, rpcgss_svc_##name, \ + TP_PROTO( \ + __be32 xid, \ + u32 seqno \ + ), \ + TP_ARGS(xid, seqno)) + +DEFINE_SVC_SEQNO_EVENT(large_seqno); +DEFINE_SVC_SEQNO_EVENT(old_seqno); + + /** ** gssd upcall related trace events **/ -- cgit v1.3 From b20dfc3fcd6ed1e16c828c81e1fc6f4aea2cfa77 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 Mar 2020 15:01:08 -0500 Subject: svcrdma: Create a generic tracing class for displaying xdr_buf layout This class can be used to create trace points in either the RPC client or RPC server paths. It simply displays the length of each part of an xdr_buf, which is useful to determine that the transport and XDR codecs are operating correctly. Signed-off-by: Chuck Lever --- include/trace/events/sunrpc.h | 43 +++++++++++++++++++++++++++++++++++++++++++ net/sunrpc/clnt.c | 1 + net/sunrpc/svc_xprt.c | 3 +++ net/sunrpc/xprt.c | 3 +-- 4 files changed, 48 insertions(+), 2 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index ee993575d2fa..1577223add43 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -14,6 +14,49 @@ #include #include +DECLARE_EVENT_CLASS(xdr_buf_class, + TP_PROTO( + const struct xdr_buf *xdr + ), + + TP_ARGS(xdr), + + TP_STRUCT__entry( + __field(const void *, head_base) + __field(size_t, head_len) + __field(const void *, tail_base) + __field(size_t, tail_len) + __field(unsigned int, page_len) + __field(unsigned int, msg_len) + ), + + TP_fast_assign( + __entry->head_base = xdr->head[0].iov_base; + __entry->head_len = xdr->head[0].iov_len; + __entry->tail_base = xdr->tail[0].iov_base; + __entry->tail_len = xdr->tail[0].iov_len; + __entry->page_len = xdr->page_len; + __entry->msg_len = xdr->len; + ), + + TP_printk("head=[%p,%zu] page=%u tail=[%p,%zu] len=%u", + __entry->head_base, __entry->head_len, __entry->page_len, + __entry->tail_base, __entry->tail_len, __entry->msg_len + ) +); + +#define DEFINE_XDRBUF_EVENT(name) \ + DEFINE_EVENT(xdr_buf_class, name, \ + TP_PROTO( \ + const struct xdr_buf *xdr \ + ), \ + TP_ARGS(xdr)) + +DEFINE_XDRBUF_EVENT(xprt_sendto); +DEFINE_XDRBUF_EVENT(xprt_recvfrom); +DEFINE_XDRBUF_EVENT(svc_recvfrom); +DEFINE_XDRBUF_EVENT(svc_sendto); + TRACE_DEFINE_ENUM(RPC_AUTH_OK); TRACE_DEFINE_ENUM(RPC_AUTH_BADCRED); TRACE_DEFINE_ENUM(RPC_AUTH_REJECTEDCRED); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7324b21f923e..07992d3e8703 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2509,6 +2509,7 @@ call_decode(struct rpc_task *task) goto out; req->rq_rcv_buf.len = req->rq_private_buf.len; + trace_xprt_recvfrom(&req->rq_rcv_buf); /* Check that the softirq receive buffer is valid */ WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf, diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index de3c077733a7..d53259346235 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -802,6 +802,8 @@ static int svc_handle_xprt(struct svc_rqst *rqstp, struct svc_xprt *xprt) len = svc_deferred_recv(rqstp); else len = xprt->xpt_ops->xpo_recvfrom(rqstp); + if (len > 0) + trace_svc_recvfrom(&rqstp->rq_arg); rqstp->rq_stime = ktime_get(); rqstp->rq_reserved = serv->sv_max_mesg; atomic_add(rqstp->rq_reserved, &xprt->xpt_reserved); @@ -905,6 +907,7 @@ int svc_send(struct svc_rqst *rqstp) xb->len = xb->head[0].iov_len + xb->page_len + xb->tail[0].iov_len; + trace_svc_sendto(xb); /* Grab mutex to serialize outgoing data. */ mutex_lock(&xprt->xpt_mutex); diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 1aafe8d3f3f4..493a30a296fc 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1117,8 +1117,6 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - dprintk("RPC: %5u xid %08x complete (%d bytes received)\n", - task->tk_pid, ntohl(req->rq_xid), copied); trace_xprt_complete_rqst(xprt, req->rq_xid, copied); xprt->stat.recvs++; @@ -1462,6 +1460,7 @@ xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) */ req->rq_ntrans++; + trace_xprt_sendto(&req->rq_snd_buf); connect_cookie = xprt->connect_cookie; status = xprt->ops->send_request(req); if (status != 0) { -- cgit v1.3 From 2426ddfdf169a4390e498b79b6d34d0d69c515bc Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 Mar 2020 15:01:08 -0500 Subject: svcrdma: Remove svcrdma_cm_event() trace point Clean up. This trace point is no longer needed because the RDMA/core CMA code has an equivalent trace point that was added by commit ed999f820a6c ("RDMA/cma: Add trace points in RDMA Connection Manager"). Signed-off-by: Chuck Lever --- include/trace/events/rpcrdma.h | 28 ---------------------------- net/sunrpc/xprtrdma/svc_rdma_transport.c | 7 ------- 2 files changed, 35 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index c0e4c93324f5..545fe936a0cc 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1813,34 +1813,6 @@ TRACE_EVENT(svcrdma_post_rw, DEFINE_SENDCOMP_EVENT(read); DEFINE_SENDCOMP_EVENT(write); -TRACE_EVENT(svcrdma_cm_event, - TP_PROTO( - const struct rdma_cm_event *event, - const struct sockaddr *sap - ), - - TP_ARGS(event, sap), - - TP_STRUCT__entry( - __field(unsigned int, event) - __field(int, status) - __array(__u8, addr, INET6_ADDRSTRLEN + 10) - ), - - TP_fast_assign( - __entry->event = event->event; - __entry->status = event->status; - snprintf(__entry->addr, sizeof(__entry->addr) - 1, - "%pISpc", sap); - ), - - TP_printk("addr=%s event=%s (%u/%d)", - __entry->addr, - rdma_show_cm_event(__entry->event), - __entry->event, __entry->status - ) -); - TRACE_EVENT(svcrdma_qp_error, TP_PROTO( const struct ib_event *event, diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index f6aad2798063..8bb99980ae85 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -241,10 +241,6 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, static int rdma_listen_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { - struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.src_addr; - - trace_svcrdma_cm_event(event, sap); - switch (event->event) { case RDMA_CM_EVENT_CONNECT_REQUEST: dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, " @@ -266,12 +262,9 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id, static int rdma_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { - struct sockaddr *sap = (struct sockaddr *)&cma_id->route.addr.dst_addr; struct svcxprt_rdma *rdma = cma_id->context; struct svc_xprt *xprt = &rdma->sc_xprt; - trace_svcrdma_cm_event(event, sap); - switch (event->event) { case RDMA_CM_EVENT_ESTABLISHED: /* Accept complete */ -- cgit v1.3 From e604aad2cac7357162f661e45f2f60e46faa7b17 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 Mar 2020 15:01:08 -0500 Subject: svcrdma: Use struct xdr_stream to decode ingress transport headers The logic that checks incoming network headers has to be scrupulous. De-duplicate: replace open-coded buffer overflow checks with the use of xdr_stream helpers that are used most everywhere else XDR decoding is done. One minor change to the sanity checks: instead of checking the length of individual segments, cap the length of the whole chunk to be sure it can fit in the set of pages available in rq_pages. This should be a better test of whether the server can handle the chunks in each request. Signed-off-by: Chuck Lever --- include/linux/sunrpc/rpc_rdma.h | 3 +- include/linux/sunrpc/svc_rdma.h | 1 + include/trace/events/rpcrdma.h | 7 +- net/sunrpc/xprtrdma/svc_rdma_recvfrom.c | 206 +++++++++++++++++++------------- 4 files changed, 130 insertions(+), 87 deletions(-) (limited to 'include/trace') diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index 92d182fd8e3b..320c672d84de 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -58,7 +58,8 @@ enum { enum { rpcrdma_fixed_maxsz = 4, rpcrdma_segment_maxsz = 4, - rpcrdma_readchunk_maxsz = 2 + rpcrdma_segment_maxsz, + rpcrdma_readseg_maxsz = 1 + rpcrdma_segment_maxsz, + rpcrdma_readchunk_maxsz = 1 + rpcrdma_readseg_maxsz, }; /* diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index 04e4a34d1c6a..c790dbb0dd90 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -132,6 +132,7 @@ struct svc_rdma_recv_ctxt { struct ib_sge rc_recv_sge; void *rc_recv_buf; struct xdr_buf rc_arg; + struct xdr_stream rc_stream; bool rc_temp; u32 rc_byte_len; unsigned int rc_page_count; diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 545fe936a0cc..814b73bd2cc7 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1469,7 +1469,7 @@ DECLARE_EVENT_CLASS(svcrdma_segment_event, ); #define DEFINE_SEGMENT_EVENT(name) \ - DEFINE_EVENT(svcrdma_segment_event, svcrdma_encode_##name,\ + DEFINE_EVENT(svcrdma_segment_event, svcrdma_##name,\ TP_PROTO( \ u32 handle, \ u32 length, \ @@ -1477,8 +1477,9 @@ DECLARE_EVENT_CLASS(svcrdma_segment_event, ), \ TP_ARGS(handle, length, offset)) -DEFINE_SEGMENT_EVENT(rseg); -DEFINE_SEGMENT_EVENT(wseg); +DEFINE_SEGMENT_EVENT(decode_wseg); +DEFINE_SEGMENT_EVENT(encode_rseg); +DEFINE_SEGMENT_EVENT(encode_wseg); DECLARE_EVENT_CLASS(svcrdma_chunk_event, TP_PROTO( diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index 71127d898562..bd92ed611b4c 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -358,15 +358,14 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, arg->len = ctxt->rc_byte_len; } -/* This accommodates the largest possible Write chunk, - * in one segment. +/* This accommodates the largest possible Write chunk. */ -#define MAX_BYTES_WRITE_SEG ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) +#define MAX_BYTES_WRITE_CHUNK ((u32)(RPCSVC_MAXPAGES << PAGE_SHIFT)) /* This accommodates the largest possible Position-Zero - * Read chunk or Reply chunk, in one segment. + * Read chunk or Reply chunk. */ -#define MAX_BYTES_SPECIAL_SEG ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) +#define MAX_BYTES_SPECIAL_CHUNK ((u32)((RPCSVC_MAXPAGES + 2) << PAGE_SHIFT)) /* Sanity check the Read list. * @@ -374,7 +373,7 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, * - This implementation supports only one Read chunk. * * Sanity checks: - * - Read list does not overflow buffer. + * - Read list does not overflow Receive buffer. * - Segment size limited by largest NFS data payload. * * The segment count is limited to how many segments can @@ -382,30 +381,44 @@ static void svc_rdma_build_arg_xdr(struct svc_rqst *rqstp, * buffer. That's about 40 Read segments for a 1KB inline * threshold. * - * Returns pointer to the following Write list. + * Return values: + * %true: Read list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Read list. + * %false: Read list is corrupt. @rctxt's xdr_stream is left + * in an unknown state. */ -static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end) +static bool xdr_check_read_list(struct svc_rdma_recv_ctxt *rctxt) { - u32 position; + u32 position, len; bool first; + __be32 *p; + + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + len = 0; first = true; - while (*p++ != xdr_zero) { + while (*p != xdr_zero) { + p = xdr_inline_decode(&rctxt->rc_stream, + rpcrdma_readseg_maxsz * sizeof(*p)); + if (!p) + return false; + if (first) { - position = be32_to_cpup(p++); + position = be32_to_cpup(p); first = false; - } else if (be32_to_cpup(p++) != position) { - return NULL; + } else if (be32_to_cpup(p) != position) { + return false; } - p++; /* handle */ - if (be32_to_cpup(p++) > MAX_BYTES_SPECIAL_SEG) - return NULL; - p += 2; /* offset */ + p += 2; + len += be32_to_cpup(p); - if (p > end) - return NULL; + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; } - return p; + return len <= MAX_BYTES_SPECIAL_CHUNK; } /* The segment count is limited to how many segments can @@ -413,67 +426,93 @@ static __be32 *xdr_check_read_list(__be32 *p, const __be32 *end) * buffer. That's about 60 Write segments for a 1KB inline * threshold. */ -static __be32 *xdr_check_write_chunk(__be32 *p, const __be32 *end, - u32 maxlen) +static bool xdr_check_write_chunk(struct svc_rdma_recv_ctxt *rctxt, u32 maxlen) { - u32 i, segcount; + u32 i, segcount, total; + __be32 *p; + + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + segcount = be32_to_cpup(p); - segcount = be32_to_cpup(p++); + total = 0; for (i = 0; i < segcount; i++) { - p++; /* handle */ - if (be32_to_cpup(p++) > maxlen) - return NULL; - p += 2; /* offset */ + u32 handle, length; + u64 offset; - if (p > end) - return NULL; - } + p = xdr_inline_decode(&rctxt->rc_stream, + rpcrdma_segment_maxsz * sizeof(*p)); + if (!p) + return false; + + handle = be32_to_cpup(p++); + length = be32_to_cpup(p++); + xdr_decode_hyper(p, &offset); + trace_svcrdma_decode_wseg(handle, length, offset); - return p; + total += length; + } + return total <= maxlen; } /* Sanity check the Write list. * * Implementation limits: - * - This implementation supports only one Write chunk. + * - This implementation currently supports only one Write chunk. * * Sanity checks: - * - Write list does not overflow buffer. - * - Segment size limited by largest NFS data payload. - * - * Returns pointer to the following Reply chunk. + * - Write list does not overflow Receive buffer. + * - Chunk size limited by largest NFS data payload. + * + * Return values: + * %true: Write list is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Write list. + * %false: Write list is corrupt. @rctxt's xdr_stream is left + * in an unknown state. */ -static __be32 *xdr_check_write_list(__be32 *p, const __be32 *end) +static bool xdr_check_write_list(struct svc_rdma_recv_ctxt *rctxt) { - u32 chcount; + u32 chcount = 0; + __be32 *p; - chcount = 0; - while (*p++ != xdr_zero) { - p = xdr_check_write_chunk(p, end, MAX_BYTES_WRITE_SEG); + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + while (*p != xdr_zero) { + if (!xdr_check_write_chunk(rctxt, MAX_BYTES_WRITE_CHUNK)) + return false; + ++chcount; + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); if (!p) - return NULL; - if (chcount++ > 1) - return NULL; + return false; } - return p; + return chcount < 2; } /* Sanity check the Reply chunk. * * Sanity checks: - * - Reply chunk does not overflow buffer. - * - Segment size limited by largest NFS data payload. - * - * Returns pointer to the following RPC header. + * - Reply chunk does not overflow Receive buffer. + * - Chunk size limited by largest NFS data payload. + * + * Return values: + * %true: Reply chunk is valid. @rctxt's xdr_stream is updated + * to point to the first byte past the Reply chunk. + * %false: Reply chunk is corrupt. @rctxt's xdr_stream is left + * in an unknown state. */ -static __be32 *xdr_check_reply_chunk(__be32 *p, const __be32 *end) +static bool xdr_check_reply_chunk(struct svc_rdma_recv_ctxt *rctxt) { - if (*p++ != xdr_zero) { - p = xdr_check_write_chunk(p, end, MAX_BYTES_SPECIAL_SEG); - if (!p) - return NULL; - } - return p; + __be32 *p; + + p = xdr_inline_decode(&rctxt->rc_stream, sizeof(*p)); + if (!p) + return false; + if (*p != xdr_zero) + if (!xdr_check_write_chunk(rctxt, MAX_BYTES_SPECIAL_CHUNK)) + return false; + return true; } /* RPC-over-RDMA Version One private extension: Remote Invalidation. @@ -538,60 +577,61 @@ static void svc_rdma_get_inv_rkey(struct svcxprt_rdma *rdma, ctxt->rc_inv_rkey = be32_to_cpu(inv_rkey); } -/* On entry, xdr->head[0].iov_base points to first byte in the - * RPC-over-RDMA header. +/** + * svc_rdma_xdr_decode_req - Decode the transport header + * @rq_arg: xdr_buf containing ingress RPC/RDMA message + * @rctxt: state of decoding + * + * On entry, xdr->head[0].iov_base points to first byte of the + * RPC-over-RDMA transport header. * * On successful exit, head[0] points to first byte past the * RPC-over-RDMA header. For RDMA_MSG, this is the RPC message. + * * The length of the RPC-over-RDMA header is returned. * * Assumptions: * - The transport header is entirely contained in the head iovec. */ -static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) +static int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg, + struct svc_rdma_recv_ctxt *rctxt) { - __be32 *p, *end, *rdma_argp; + __be32 *p, *rdma_argp; unsigned int hdr_len; - /* Verify that there's enough bytes for header + something */ - if (rq_arg->len <= RPCRDMA_HDRLEN_ERR) - goto out_short; - rdma_argp = rq_arg->head[0].iov_base; - if (*(rdma_argp + 1) != rpcrdma_version) - goto out_version; + xdr_init_decode(&rctxt->rc_stream, rq_arg, rdma_argp, NULL); - switch (*(rdma_argp + 3)) { + p = xdr_inline_decode(&rctxt->rc_stream, + rpcrdma_fixed_maxsz * sizeof(*p)); + if (unlikely(!p)) + goto out_short; + p++; + if (*p != rpcrdma_version) + goto out_version; + p += 2; + switch (*p) { case rdma_msg: break; case rdma_nomsg: break; - case rdma_done: goto out_drop; - case rdma_error: goto out_drop; - default: goto out_proc; } - end = (__be32 *)((unsigned long)rdma_argp + rq_arg->len); - p = xdr_check_read_list(rdma_argp + 4, end); - if (!p) + if (!xdr_check_read_list(rctxt)) goto out_inval; - p = xdr_check_write_list(p, end); - if (!p) - goto out_inval; - p = xdr_check_reply_chunk(p, end); - if (!p) + if (!xdr_check_write_list(rctxt)) goto out_inval; - if (p > end) + if (!xdr_check_reply_chunk(rctxt)) goto out_inval; - rq_arg->head[0].iov_base = p; - hdr_len = (unsigned long)p - (unsigned long)rdma_argp; + rq_arg->head[0].iov_base = rctxt->rc_stream.p; + hdr_len = xdr_stream_pos(&rctxt->rc_stream); rq_arg->head[0].iov_len -= hdr_len; rq_arg->len -= hdr_len; trace_svcrdma_decode_rqst(rdma_argp, hdr_len); @@ -786,7 +826,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) rqstp->rq_next_page = rqstp->rq_respages; p = (__be32 *)rqstp->rq_arg.head[0].iov_base; - ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg); + ret = svc_rdma_xdr_decode_req(&rqstp->rq_arg, ctxt); if (ret < 0) goto out_err; if (ret == 0) -- cgit v1.3 From a406c563e84212c6c63c5204817e30e4c5250dbb Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 2 Mar 2020 15:02:20 -0500 Subject: svcrdma: Rename svcrdma_encode trace points in send routines These trace points are misnamed: trace_svcrdma_encode_wseg trace_svcrdma_encode_write trace_svcrdma_encode_reply trace_svcrdma_encode_rseg trace_svcrdma_encode_read trace_svcrdma_encode_pzr Because they actually trace posting on the Send Queue. Let's rename them so that I can add trace points in the chunk list encoders that actually do trace chunk list encoding events. Signed-off-by: Chuck Lever --- include/trace/events/rpcrdma.h | 14 +++++++++----- net/sunrpc/xprtrdma/svc_rdma_rw.c | 13 +++++++------ 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 814b73bd2cc7..74b68547eefb 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1479,7 +1479,9 @@ DECLARE_EVENT_CLASS(svcrdma_segment_event, DEFINE_SEGMENT_EVENT(decode_wseg); DEFINE_SEGMENT_EVENT(encode_rseg); +DEFINE_SEGMENT_EVENT(send_rseg); DEFINE_SEGMENT_EVENT(encode_wseg); +DEFINE_SEGMENT_EVENT(send_wseg); DECLARE_EVENT_CLASS(svcrdma_chunk_event, TP_PROTO( @@ -1502,17 +1504,19 @@ DECLARE_EVENT_CLASS(svcrdma_chunk_event, ); #define DEFINE_CHUNK_EVENT(name) \ - DEFINE_EVENT(svcrdma_chunk_event, svcrdma_encode_##name,\ + DEFINE_EVENT(svcrdma_chunk_event, svcrdma_##name, \ TP_PROTO( \ u32 length \ ), \ TP_ARGS(length)) -DEFINE_CHUNK_EVENT(pzr); -DEFINE_CHUNK_EVENT(write); -DEFINE_CHUNK_EVENT(reply); +DEFINE_CHUNK_EVENT(send_pzr); +DEFINE_CHUNK_EVENT(encode_write_chunk); +DEFINE_CHUNK_EVENT(send_write_chunk); +DEFINE_CHUNK_EVENT(encode_read_chunk); +DEFINE_CHUNK_EVENT(send_reply_chunk); -TRACE_EVENT(svcrdma_encode_read, +TRACE_EVENT(svcrdma_send_read_chunk, TP_PROTO( u32 length, u32 position diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c index aee8ee2d01da..bd7c195d872e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_rw.c +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -439,7 +439,8 @@ svc_rdma_build_writes(struct svc_rdma_write_info *info, if (ret < 0) goto out_initerr; - trace_svcrdma_encode_wseg(seg_handle, write_len, seg_offset); + trace_svcrdma_send_wseg(seg_handle, write_len, seg_offset); + list_add(&ctxt->rw_list, &cc->cc_rwctxts); cc->cc_sqecount += ret; if (write_len == seg_length - info->wi_seg_off) { @@ -534,7 +535,7 @@ int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch, if (ret < 0) goto out_err; - trace_svcrdma_encode_write(xdr->page_len); + trace_svcrdma_send_write_chunk(xdr->page_len); return length; out_err: @@ -594,7 +595,7 @@ int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, if (ret < 0) goto out_err; - trace_svcrdma_encode_reply(consumed); + trace_svcrdma_send_reply_chunk(consumed); return consumed; out_err: @@ -697,7 +698,7 @@ static int svc_rdma_build_read_chunk(struct svc_rqst *rqstp, if (ret < 0) break; - trace_svcrdma_encode_rseg(rs_handle, rs_length, rs_offset); + trace_svcrdma_send_rseg(rs_handle, rs_length, rs_offset); info->ri_chunklen += rs_length; } @@ -728,7 +729,7 @@ static int svc_rdma_build_normal_read_chunk(struct svc_rqst *rqstp, if (ret < 0) goto out; - trace_svcrdma_encode_read(info->ri_chunklen, info->ri_position); + trace_svcrdma_send_read_chunk(info->ri_chunklen, info->ri_position); head->rc_hdr_count = 0; @@ -784,7 +785,7 @@ static int svc_rdma_build_pz_read_chunk(struct svc_rqst *rqstp, if (ret < 0) goto out; - trace_svcrdma_encode_pzr(info->ri_chunklen); + trace_svcrdma_send_pzr(info->ri_chunklen); head->rc_arg.len += info->ri_chunklen; head->rc_arg.buflen += info->ri_chunklen; -- cgit v1.3 From 0dabe948f28274e7956a625a24f205016b810693 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 3 Mar 2020 13:28:14 -0500 Subject: svcrdma: Avoid DMA mapping small RPC Replies On some platforms, DMA mapping part of a page is more costly than copying bytes. Indeed, not involving the I/O MMU can help the RPC/RDMA transport scale better for tiny I/Os across more RDMA devices. This is because interaction with the I/O MMU is eliminated for each of these small I/Os. Without the explicit unmapping, the NIC no longer needs to do a costly internal TLB shoot down for buffers that are just a handful of bytes. Since pull-up is now a more a frequent operation, I've introduced a trace point in the pull-up path. It can be used for debugging or user-space tools that count pull-up frequency. Signed-off-by: Chuck Lever --- include/linux/sunrpc/svc_rdma.h | 1 + include/trace/events/rpcrdma.h | 18 ++++++++++++++++++ net/sunrpc/xprtrdma/svc_rdma_sendto.c | 13 ++++++++++++- 3 files changed, 31 insertions(+), 1 deletion(-) (limited to 'include/trace') diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index a3fa5b4fa2e4..78fe2ac6dc6c 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -52,6 +52,7 @@ /* Default and maximum inline threshold sizes */ enum { + RPCRDMA_PULLUP_THRESH = RPCRDMA_V1_DEF_INLINE_SIZE >> 1, RPCRDMA_DEF_INLINE_THRESH = 4096, RPCRDMA_MAX_INLINE_THRESH = 65536 }; diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h index 74b68547eefb..9238d233f8cf 100644 --- a/include/trace/events/rpcrdma.h +++ b/include/trace/events/rpcrdma.h @@ -1639,6 +1639,24 @@ TRACE_EVENT(svcrdma_dma_map_rwctx, ) ); +TRACE_EVENT(svcrdma_send_pullup, + TP_PROTO( + unsigned int len + ), + + TP_ARGS(len), + + TP_STRUCT__entry( + __field(unsigned int, len) + ), + + TP_fast_assign( + __entry->len = len; + ), + + TP_printk("len=%u", __entry->len) +); + TRACE_EVENT(svcrdma_send_failed, TP_PROTO( const struct svc_rqst *rqst, diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 7b9853214769..90cba3058f04 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -541,6 +541,7 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, /** * svc_rdma_pull_up_needed - Determine whether to use pull-up * @rdma: controlling transport + * @sctxt: send_ctxt for the Send WR * @rctxt: Write and Reply chunks provided by client * @xdr: xdr_buf containing RPC message to transmit * @@ -549,11 +550,20 @@ static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, * %false otherwise */ static bool svc_rdma_pull_up_needed(struct svcxprt_rdma *rdma, + struct svc_rdma_send_ctxt *sctxt, const struct svc_rdma_recv_ctxt *rctxt, struct xdr_buf *xdr) { int elements; + /* For small messages, copying bytes is cheaper than DMA mapping. + */ + if (sctxt->sc_hdrbuf.len + xdr->len < RPCRDMA_PULLUP_THRESH) + return true; + + /* Check whether the xdr_buf has more elements than can + * fit in a single RDMA Send. + */ /* xdr->head */ elements = 1; @@ -636,6 +646,7 @@ static int svc_rdma_pull_up_reply_msg(struct svcxprt_rdma *rdma, memcpy(dst, tailbase, taillen); sctxt->sc_sges[0].length += xdr->len; + trace_svcrdma_send_pullup(sctxt->sc_sges[0].length); return 0; } @@ -675,7 +686,7 @@ int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, /* For pull-up, svc_rdma_send() will sync the transport header. * No additional DMA mapping is necessary. */ - if (svc_rdma_pull_up_needed(rdma, rctxt, xdr)) + if (svc_rdma_pull_up_needed(rdma, sctxt, rctxt, xdr)) return svc_rdma_pull_up_reply_msg(rdma, sctxt, rctxt, xdr); ++sctxt->sc_cur_sge_no; -- cgit v1.3 From 78a947f50aaabd8d49e634a84f451c0933af853a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 1 Mar 2020 18:21:44 -0500 Subject: sunrpc: Add tracing for cache events Add basic tracing for debugging the sunrpc cache events. Signed-off-by: Trond Myklebust Signed-off-by: Chuck Lever --- include/trace/events/sunrpc.h | 33 +++++++++++++++++++++++++++++++++ net/sunrpc/cache.c | 36 +++++++++++++++++++++++++++--------- 2 files changed, 60 insertions(+), 9 deletions(-) (limited to 'include/trace') diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 1577223add43..ffd2215950dc 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1335,6 +1335,39 @@ DECLARE_EVENT_CLASS(svc_deferred_event, DEFINE_SVC_DEFERRED_EVENT(drop); DEFINE_SVC_DEFERRED_EVENT(revisit); +DECLARE_EVENT_CLASS(cache_event, + TP_PROTO( + const struct cache_detail *cd, + const struct cache_head *h + ), + + TP_ARGS(cd, h), + + TP_STRUCT__entry( + __field(const struct cache_head *, h) + __string(name, cd->name) + ), + + TP_fast_assign( + __entry->h = h; + __assign_str(name, cd->name); + ), + + TP_printk("cache=%s entry=%p", __get_str(name), __entry->h) +); +#define DEFINE_CACHE_EVENT(name) \ + DEFINE_EVENT(cache_event, name, \ + TP_PROTO( \ + const struct cache_detail *cd, \ + const struct cache_head *h \ + ), \ + TP_ARGS(cd, h)) +DEFINE_CACHE_EVENT(cache_entry_expired); +DEFINE_CACHE_EVENT(cache_entry_upcall); +DEFINE_CACHE_EVENT(cache_entry_update); +DEFINE_CACHE_EVENT(cache_entry_make_negative); +DEFINE_CACHE_EVENT(cache_entry_no_listener); + #endif /* _TRACE_SUNRPC_H */ #include diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index cd76ef2d26b8..af0ddd28b081 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "netns.h" #define RPCDBG_FACILITY RPCDBG_CACHE @@ -120,6 +121,7 @@ static struct cache_head *sunrpc_cache_add_entry(struct cache_detail *detail, if (test_bit(CACHE_VALID, &tmp->flags) && cache_is_expired(detail, tmp)) { sunrpc_begin_cache_remove_entry(tmp, detail); + trace_cache_entry_expired(detail, tmp); freeme = tmp; break; } @@ -176,6 +178,25 @@ static void cache_fresh_unlocked(struct cache_head *head, } } +static void cache_make_negative(struct cache_detail *detail, + struct cache_head *h) +{ + set_bit(CACHE_NEGATIVE, &h->flags); + trace_cache_entry_make_negative(detail, h); +} + +static void cache_entry_update(struct cache_detail *detail, + struct cache_head *h, + struct cache_head *new) +{ + if (!test_bit(CACHE_NEGATIVE, &new->flags)) { + detail->update(h, new); + trace_cache_entry_update(detail, h); + } else { + cache_make_negative(detail, h); + } +} + struct cache_head *sunrpc_cache_update(struct cache_detail *detail, struct cache_head *new, struct cache_head *old, int hash) { @@ -188,10 +209,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, if (!test_bit(CACHE_VALID, &old->flags)) { spin_lock(&detail->hash_lock); if (!test_bit(CACHE_VALID, &old->flags)) { - if (test_bit(CACHE_NEGATIVE, &new->flags)) - set_bit(CACHE_NEGATIVE, &old->flags); - else - detail->update(old, new); + cache_entry_update(detail, old, new); cache_fresh_locked(old, new->expiry_time, detail); spin_unlock(&detail->hash_lock); cache_fresh_unlocked(old, detail); @@ -209,10 +227,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail, detail->init(tmp, old); spin_lock(&detail->hash_lock); - if (test_bit(CACHE_NEGATIVE, &new->flags)) - set_bit(CACHE_NEGATIVE, &tmp->flags); - else - detail->update(tmp, new); + cache_entry_update(detail, tmp, new); hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]); detail->entries++; cache_get(tmp); @@ -254,7 +269,7 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h spin_lock(&detail->hash_lock); rv = cache_is_valid(h); if (rv == -EAGAIN) { - set_bit(CACHE_NEGATIVE, &h->flags); + cache_make_negative(detail, h); cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY, detail); rv = -ENOENT; @@ -460,6 +475,7 @@ static int cache_clean(void) continue; sunrpc_begin_cache_remove_entry(ch, current_detail); + trace_cache_entry_expired(current_detail, ch); rv = 1; break; } @@ -1215,6 +1231,7 @@ static int cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h) if (test_bit(CACHE_PENDING, &h->flags)) { crq->item = cache_get(h); list_add_tail(&crq->q.list, &detail->queue); + trace_cache_entry_upcall(detail, h); } else /* Lost a race, no longer PENDING, so don't enqueue */ ret = -EAGAIN; @@ -1240,6 +1257,7 @@ int sunrpc_cache_pipe_upcall_timeout(struct cache_detail *detail, { if (!cache_listeners_exist(detail)) { warn_no_listener(detail); + trace_cache_entry_no_listener(detail, h); return -EINVAL; } return sunrpc_cache_pipe_upcall(detail, h); -- cgit v1.3