From fa3a03da549a889fc9dbc0d3c5908eb7882cac8f Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 7 Jul 2019 22:15:13 +0200 Subject: batman-adv: Fix netlink dumping of all mcast_flags buckets The bucket variable is only updated outside the loop over the mcast_flags buckets. It will only be updated during a dumping run when the dumping has to be interrupted and a new message has to be started. This could result in repeated or missing entries when the multicast flags are dumped to userspace. Fixes: d2d489b7d851 ("batman-adv: Add inconsistent multicast netlink dump detection") Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- net/batman-adv/multicast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 67d7f83009ae..a3488cfb3d1e 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -2303,7 +2303,7 @@ __batadv_mcast_flags_dump(struct sk_buff *msg, u32 portid, while (bucket_tmp < hash->size) { if (batadv_mcast_flags_dump_bucket(msg, portid, cb, hash, - *bucket, &idx_tmp)) + bucket_tmp, &idx_tmp)) break; bucket_tmp++; -- cgit v1.2.3 From f7af86ccf1882084293b11077deec049fd01da63 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sun, 7 Jul 2019 23:04:57 +0200 Subject: batman-adv: Fix deletion of RTR(4|6) mcast list entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The multicast code uses the lists bat_priv->mcast.want_all_rtr*_list to store all all originator nodes which don't have the flag no-RTR4 or no-RTR6 set. When an originator is purged, it has to be removed from these lists. Since all entries without the BATADV_MCAST_WANT_NO_RTR4/6 are stored in these lists, they have to be handled like entries which have these flags set to force the update routines to remove them from the lists when purging the originator. Not doing so will leave a pointer to a freed memory region inside the list. Trying to operate on these lists will then cause an use-after-free error: BUG: KASAN: use-after-free in batadv_mcast_want_rtr4_update+0x335/0x3a0 [batman_adv] Write of size 8 at addr ffff888007b41a38 by task swapper/0/0 Fixes: 61caf3d109f5 ("batman-adv: mcast: detect, distribute and maintain multicast router presence") Signed-off-by: Sven Eckelmann Acked-by: Linus Lüssing Signed-off-by: Simon Wunderlich --- net/batman-adv/multicast.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index a3488cfb3d1e..1d5bdf3a4b65 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -2420,8 +2420,10 @@ void batadv_mcast_purge_orig(struct batadv_orig_node *orig) batadv_mcast_want_unsnoop_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv4_update(bat_priv, orig, BATADV_NO_FLAGS); batadv_mcast_want_ipv6_update(bat_priv, orig, BATADV_NO_FLAGS); - batadv_mcast_want_rtr4_update(bat_priv, orig, BATADV_NO_FLAGS); - batadv_mcast_want_rtr6_update(bat_priv, orig, BATADV_NO_FLAGS); + batadv_mcast_want_rtr4_update(bat_priv, orig, + BATADV_MCAST_WANT_NO_RTR4); + batadv_mcast_want_rtr6_update(bat_priv, orig, + BATADV_MCAST_WANT_NO_RTR6); spin_unlock_bh(&orig->mcast_handler_lock); } -- cgit v1.2.3 From 589b474a4b7ce409d6821ef17234a995841bd131 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 30 Jul 2019 14:57:19 +0200 Subject: netfilter: nf_flow_table: fix offload for flows that are subject to xfrm This makes the previously added 'encap test' pass. Because its possible that the xfrm dst entry becomes stale while such a flow is offloaded, we need to call dst_check() -- the notifier that handles this for non-tunneled traffic isn't sufficient, because SA or or policies might have changed. If dst becomes stale the flow offload entry will be tagged for teardown and packets will be passed to 'classic' forwarding path. Removing the entry right away is problematic, as this would introduce a race condition with the gc worker. In case flow is long-lived, it could eventually be offloaded again once the gc worker removes the entry from the flow table. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_ip.c | 43 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index cdfc33517e85..d68c801dd614 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -214,6 +214,25 @@ static bool nf_flow_exceeds_mtu(const struct sk_buff *skb, unsigned int mtu) return true; } +static int nf_flow_offload_dst_check(struct dst_entry *dst) +{ + if (unlikely(dst_xfrm(dst))) + return dst_check(dst, 0) ? 0 : -1; + + return 0; +} + +static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, + const struct nf_hook_state *state, + struct dst_entry *dst) +{ + skb_orphan(skb); + skb_dst_set_noref(skb, dst); + skb->tstamp = 0; + dst_output(state->net, state->sk, skb); + return NF_STOLEN; +} + unsigned int nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -254,6 +273,11 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, if (nf_flow_state_check(flow, ip_hdr(skb)->protocol, skb, thoff)) return NF_ACCEPT; + if (nf_flow_offload_dst_check(&rt->dst)) { + flow_offload_teardown(flow); + return NF_ACCEPT; + } + if (nf_flow_nat_ip(flow, skb, thoff, dir) < 0) return NF_DROP; @@ -261,6 +285,13 @@ nf_flow_offload_ip_hook(void *priv, struct sk_buff *skb, iph = ip_hdr(skb); ip_decrease_ttl(iph); + if (unlikely(dst_xfrm(&rt->dst))) { + memset(skb->cb, 0, sizeof(struct inet_skb_parm)); + IPCB(skb)->iif = skb->dev->ifindex; + IPCB(skb)->flags = IPSKB_FORWARDED; + return nf_flow_xmit_xfrm(skb, state, &rt->dst); + } + skb->dev = outdev; nexthop = rt_nexthop(rt, flow->tuplehash[!dir].tuple.src_v4.s_addr); skb_dst_set_noref(skb, &rt->dst); @@ -467,6 +498,11 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, sizeof(*ip6h))) return NF_ACCEPT; + if (nf_flow_offload_dst_check(&rt->dst)) { + flow_offload_teardown(flow); + return NF_ACCEPT; + } + if (skb_try_make_writable(skb, sizeof(*ip6h))) return NF_DROP; @@ -477,6 +513,13 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, ip6h = ipv6_hdr(skb); ip6h->hop_limit--; + if (unlikely(dst_xfrm(&rt->dst))) { + memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + IP6CB(skb)->iif = skb->dev->ifindex; + IP6CB(skb)->flags = IP6SKB_FORWARDED; + return nf_flow_xmit_xfrm(skb, state, &rt->dst); + } + skb->dev = outdev; nexthop = rt6_nexthop(rt, &flow->tuplehash[!dir].tuple.src_v6); skb_dst_set_noref(skb, &rt->dst); -- cgit v1.2.3 From 891584f48a9084ba462f10da4c6bb28b6181b543 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 2 Aug 2019 17:15:03 +0200 Subject: inet: frags: re-introduce skb coalescing for local delivery Before commit d4289fcc9b16 ("net: IP6 defrag: use rbtrees for IPv6 defrag"), a netperf UDP_STREAM test[0] using big IPv6 datagrams (thus generating many fragments) and running over an IPsec tunnel, reported more than 6Gbps throughput. After that patch, the same test gets only 9Mbps when receiving on a be2net nic (driver can make a big difference here, for example, ixgbe doesn't seem to be affected). By reusing the IPv4 defragmentation code, IPv6 lost fragment coalescing (IPv4 fragment coalescing was dropped by commit 14fe22e33462 ("Revert "ipv4: use skb coalescing in defragmentation"")). Without fragment coalescing, be2net runs out of Rx ring entries and starts to drop frames (ethtool reports rx_drops_no_frags errors). Since the netperf traffic is only composed of UDP fragments, any lost packet prevents reassembly of the full datagram. Therefore, fragments which have no possibility to ever get reassembled pile up in the reassembly queue, until the memory accounting exeeds the threshold. At that point no fragment is accepted anymore, which effectively discards all netperf traffic. When reassembly timeout expires, some stale fragments are removed from the reassembly queue, so a few packets can be received, reassembled and delivered to the netperf receiver. But the nic still drops frames and soon the reassembly queue gets filled again with stale fragments. These long time frames where no datagram can be received explain why the performance drop is so significant. Re-introducing fragment coalescing is enough to get the initial performances again (6.6Gbps with be2net): driver doesn't drop frames anymore (no more rx_drops_no_frags errors) and the reassembly engine works at full speed. This patch is quite conservative and only coalesces skbs for local IPv4 and IPv6 delivery (in order to avoid changing skb geometry when forwarding). Coalescing could be extended in the future if need be, as more scenarios would probably benefit from it. [0]: Test configuration Sender: ip xfrm policy flush ip xfrm state flush ip xfrm state add src fc00:1::1 dst fc00:2::1 proto esp spi 0x1000 aead 'rfc4106(gcm(aes))' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b 96 mode transport sel src fc00:1::1 dst fc00:2::1 ip xfrm policy add src fc00:1::1 dst fc00:2::1 dir in tmpl src fc00:1::1 dst fc00:2::1 proto esp mode transport action allow ip xfrm state add src fc00:2::1 dst fc00:1::1 proto esp spi 0x1001 aead 'rfc4106(gcm(aes))' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b 96 mode transport sel src fc00:2::1 dst fc00:1::1 ip xfrm policy add src fc00:2::1 dst fc00:1::1 dir out tmpl src fc00:2::1 dst fc00:1::1 proto esp mode transport action allow netserver -D -L fc00:2::1 Receiver: ip xfrm policy flush ip xfrm state flush ip xfrm state add src fc00:2::1 dst fc00:1::1 proto esp spi 0x1001 aead 'rfc4106(gcm(aes))' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b 96 mode transport sel src fc00:2::1 dst fc00:1::1 ip xfrm policy add src fc00:2::1 dst fc00:1::1 dir in tmpl src fc00:2::1 dst fc00:1::1 proto esp mode transport action allow ip xfrm state add src fc00:1::1 dst fc00:2::1 proto esp spi 0x1000 aead 'rfc4106(gcm(aes))' 0x0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b0b 96 mode transport sel src fc00:1::1 dst fc00:2::1 ip xfrm policy add src fc00:1::1 dst fc00:2::1 dir out tmpl src fc00:1::1 dst fc00:2::1 proto esp mode transport action allow netperf -H fc00:2::1 -f k -P 0 -L fc00:1::1 -l 60 -t UDP_STREAM -I 99,5 -i 5,5 -T5,5 -6 Signed-off-by: Guillaume Nault Acked-by: Florian Westphal Signed-off-by: David S. Miller --- include/net/inet_frag.h | 2 +- net/ieee802154/6lowpan/reassembly.c | 2 +- net/ipv4/inet_fragment.c | 39 +++++++++++++++++++++++---------- net/ipv4/ip_fragment.c | 8 ++++++- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- net/ipv6/reassembly.c | 2 +- 6 files changed, 39 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 010f26b31c89..bac79e817776 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -171,7 +171,7 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, - void *reasm_data); + void *reasm_data, bool try_coalesce); struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q); #endif diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index e4aba5d485be..bbe9b3b2d395 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -170,7 +170,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *skb, reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); if (!reasm_data) goto out_oom; - inet_frag_reasm_finish(&fq->q, skb, reasm_data); + inet_frag_reasm_finish(&fq->q, skb, reasm_data, false); skb->dev = ldev; skb->tstamp = fq->q.stamp; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index a999451345f9..10d31733297d 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -475,11 +475,12 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, EXPORT_SYMBOL(inet_frag_reasm_prepare); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, - void *reasm_data) + void *reasm_data, bool try_coalesce) { struct sk_buff **nextp = (struct sk_buff **)reasm_data; struct rb_node *rbn; struct sk_buff *fp; + int sum_truesize; skb_push(head, head->data - skb_network_header(head)); @@ -487,25 +488,41 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, fp = FRAG_CB(head)->next_frag; rbn = rb_next(&head->rbnode); rb_erase(&head->rbnode, &q->rb_fragments); + + sum_truesize = head->truesize; while (rbn || fp) { /* fp points to the next sk_buff in the current run; * rbn points to the next run. */ /* Go through the current run. */ while (fp) { - *nextp = fp; - nextp = &fp->next; - fp->prev = NULL; - memset(&fp->rbnode, 0, sizeof(fp->rbnode)); - fp->sk = NULL; - head->data_len += fp->len; - head->len += fp->len; + struct sk_buff *next_frag = FRAG_CB(fp)->next_frag; + bool stolen; + int delta; + + sum_truesize += fp->truesize; if (head->ip_summed != fp->ip_summed) head->ip_summed = CHECKSUM_NONE; else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; - fp = FRAG_CB(fp)->next_frag; + + if (try_coalesce && skb_try_coalesce(head, fp, &stolen, + &delta)) { + kfree_skb_partial(fp, stolen); + } else { + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); + fp->sk = NULL; + + head->data_len += fp->len; + head->len += fp->len; + head->truesize += fp->truesize; + + *nextp = fp; + nextp = &fp->next; + } + + fp = next_frag; } /* Move to the next run. */ if (rbn) { @@ -516,7 +533,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, rbn = rbnext; } } - sub_frag_mem_limit(q->fqdir, head->truesize); + sub_frag_mem_limit(q->fqdir, sum_truesize); *nextp = NULL; skb_mark_not_on_list(head); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 4385eb9e781f..cfeb8890f94e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -393,6 +393,11 @@ err: return err; } +static bool ip_frag_coalesce_ok(const struct ipq *qp) +{ + return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER; +} + /* Build a new IP datagram from all its fragments. */ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, struct sk_buff *prev_tail, struct net_device *dev) @@ -421,7 +426,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, if (len > 65535) goto out_oversize; - inet_frag_reasm_finish(&qp->q, skb, reasm_data); + inet_frag_reasm_finish(&qp->q, skb, reasm_data, + ip_frag_coalesce_ok(qp)); skb->dev = dev; IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 0f82c150543b..fed9666a2f7d 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -348,7 +348,7 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, skb_reset_transport_header(skb); - inet_frag_reasm_finish(&fq->q, skb, reasm_data); + inet_frag_reasm_finish(&fq->q, skb, reasm_data, false); skb->ignore_df = 1; skb->dev = dev; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index ca05b16f1bb9..1f5d4d196dcc 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -282,7 +282,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, skb_reset_transport_header(skb); - inet_frag_reasm_finish(&fq->q, skb, reasm_data); + inet_frag_reasm_finish(&fq->q, skb, reasm_data, true); skb->dev = dev; ipv6_hdr(skb)->payload_len = htons(payload_len); -- cgit v1.2.3 From 51650d33b2771acd505068da669cf85cffac369a Mon Sep 17 00:00:00 2001 From: Ivan Khoronzhuk Date: Wed, 7 Aug 2019 01:45:40 +0300 Subject: net: sched: sch_taprio: fix memleak in error path for sched list parse In error case, all entries should be freed from the sched list before deleting it. For simplicity use rcu way. Fixes: 5a781ccbd19e46 ("tc: Add support for configuring the taprio scheduler") Acked-by: Vinicius Costa Gomes Signed-off-by: Ivan Khoronzhuk Signed-off-by: David S. Miller --- net/sched/sch_taprio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index c39db507ba3f..e25d414ae12f 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -1195,7 +1195,8 @@ unlock: spin_unlock_bh(qdisc_lock(sch)); free_sched: - kfree(new_admin); + if (new_admin) + call_rcu(&new_admin->rcu, taprio_free_sched_cb); return err; } -- cgit v1.2.3 From e1fea322fc6d4075254ca9c5f2afdace0281da2a Mon Sep 17 00:00:00 2001 From: Roman Mashak Date: Wed, 7 Aug 2019 15:57:28 -0400 Subject: net sched: update skbedit action for batched events operations Add get_fill_size() routine used to calculate the action size when building a batch of events. Fixes: ca9b0e27e ("pkt_action: add new action skbedit") Signed-off-by: Roman Mashak Signed-off-by: David S. Miller --- net/sched/act_skbedit.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'net') diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index b100870f02a6..37dced00b63d 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -307,6 +307,17 @@ static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index) return tcf_idr_search(tn, a, index); } +static size_t tcf_skbedit_get_fill_size(const struct tc_action *act) +{ + return nla_total_size(sizeof(struct tc_skbedit)) + + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_PRIORITY */ + + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_QUEUE_MAPPING */ + + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MARK */ + + nla_total_size(sizeof(u16)) /* TCA_SKBEDIT_PTYPE */ + + nla_total_size(sizeof(u32)) /* TCA_SKBEDIT_MASK */ + + nla_total_size_64bit(sizeof(u64)); /* TCA_SKBEDIT_FLAGS */ +} + static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", .id = TCA_ID_SKBEDIT, @@ -316,6 +327,7 @@ static struct tc_action_ops act_skbedit_ops = { .init = tcf_skbedit_init, .cleanup = tcf_skbedit_cleanup, .walk = tcf_skbedit_walker, + .get_fill_size = tcf_skbedit_get_fill_size, .lookup = tcf_skbedit_search, .size = sizeof(struct tcf_skbedit), }; -- cgit v1.2.3 From 414776621d1006e57e80e6db7fdc3837897aaa64 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 7 Aug 2019 17:03:59 -0700 Subject: net/tls: prevent skb_orphan() from leaking TLS plain text with offload sk_validate_xmit_skb() and drivers depend on the sk member of struct sk_buff to identify segments requiring encryption. Any operation which removes or does not preserve the original TLS socket such as skb_orphan() or skb_clone() will cause clear text leaks. Make the TCP socket underlying an offloaded TLS connection mark all skbs as decrypted, if TLS TX is in offload mode. Then in sk_validate_xmit_skb() catch skbs which have no socket (or a socket with no validation) and decrypted flag set. Note that CONFIG_SOCK_VALIDATE_XMIT, CONFIG_TLS_DEVICE and sk->sk_validate_xmit_skb are slightly interchangeable right now, they all imply TLS offload. The new checks are guarded by CONFIG_TLS_DEVICE because that's the option guarding the sk_buff->decrypted member. Second, smaller issue with orphaning is that it breaks the guarantee that packets will be delivered to device queues in-order. All TLS offload drivers depend on that scheduling property. This means skb_orphan_partial()'s trick of preserving partial socket references will cause issues in the drivers. We need a full orphan, and as a result netem delay/throttling will cause all TLS offload skbs to be dropped. Reusing the sk_buff->decrypted flag also protects from leaking clear text when incoming, decrypted skb is redirected (e.g. by TC). See commit 0608c69c9a80 ("bpf: sk_msg, sock{map|hash} redirect through ULP") for justification why the internal flag is safe. The only location which could leak the flag in is tcp_bpf_sendmsg(), which is taken care of by clearing the previously unused bit. v2: - remove superfluous decrypted mark copy (Willem); - remove the stale doc entry (Boris); - rely entirely on EOR marking to prevent coalescing (Boris); - use an internal sendpages flag instead of marking the socket (Boris). v3 (Willem): - reorganize the can_skb_orphan_partial() condition; - fix the flag leak-in through tcp_bpf_sendmsg. Signed-off-by: Jakub Kicinski Acked-by: Willem de Bruijn Reviewed-by: Boris Pismenny Signed-off-by: David S. Miller --- Documentation/networking/tls-offload.rst | 18 ------------------ include/linux/skbuff.h | 8 ++++++++ include/linux/socket.h | 3 +++ include/net/sock.h | 10 +++++++++- net/core/sock.c | 19 ++++++++++++++----- net/ipv4/tcp.c | 3 +++ net/ipv4/tcp_bpf.c | 6 +++++- net/ipv4/tcp_output.c | 3 +++ net/tls/tls_device.c | 9 +++++++-- 9 files changed, 52 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst index b70b70dc4524..0dd3f748239f 100644 --- a/Documentation/networking/tls-offload.rst +++ b/Documentation/networking/tls-offload.rst @@ -506,21 +506,3 @@ Drivers should ignore the changes to TLS the device feature flags. These flags will be acted upon accordingly by the core ``ktls`` code. TLS device feature flags only control adding of new TLS connection offloads, old connections will remain active after flags are cleared. - -Known bugs -========== - -skb_orphan() leaks clear text ------------------------------ - -Currently drivers depend on the :c:member:`sk` member of -:c:type:`struct sk_buff ` to identify segments requiring -encryption. Any operation which removes or does not preserve the socket -association such as :c:func:`skb_orphan` or :c:func:`skb_clone` -will cause the driver to miss the packets and lead to clear text leaks. - -Redirects leak clear text -------------------------- - -In the RX direction, if segment has already been decrypted by the device -and it gets redirected or mirrored - clear text will be transmitted out. diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d8af86d995d6..ba5583522d24 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1374,6 +1374,14 @@ static inline void skb_copy_hash(struct sk_buff *to, const struct sk_buff *from) to->l4_hash = from->l4_hash; }; +static inline void skb_copy_decrypted(struct sk_buff *to, + const struct sk_buff *from) +{ +#ifdef CONFIG_TLS_DEVICE + to->decrypted = from->decrypted; +#endif +} + #ifdef NET_SKBUFF_DATA_USES_OFFSET static inline unsigned char *skb_end_pointer(const struct sk_buff *skb) { diff --git a/include/linux/socket.h b/include/linux/socket.h index 97523818cb14..fc0bed59fc84 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -292,6 +292,9 @@ struct ucred { #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN #define MSG_NO_SHARED_FRAGS 0x80000 /* sendpage() internal : page frags are not shared */ +#define MSG_SENDPAGE_DECRYPTED 0x100000 /* sendpage() internal : page may carry + * plain text and require encryption + */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ diff --git a/include/net/sock.h b/include/net/sock.h index 228db3998e46..2c53f1a1d905 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2482,6 +2482,7 @@ static inline bool sk_fullsock(const struct sock *sk) /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. + * Check decrypted mark in case skb_orphan() cleared socket. */ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) @@ -2489,8 +2490,15 @@ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sock *sk = skb->sk; - if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) + if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); +#ifdef CONFIG_TLS_DEVICE + } else if (unlikely(skb->decrypted)) { + pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); + kfree_skb(skb); + skb = NULL; +#endif + } #endif return skb; diff --git a/net/core/sock.c b/net/core/sock.c index d57b0cc995a0..6d08553f885c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1992,6 +1992,19 @@ void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) } EXPORT_SYMBOL(skb_set_owner_w); +static bool can_skb_orphan_partial(const struct sk_buff *skb) +{ +#ifdef CONFIG_TLS_DEVICE + /* Drivers depend on in-order delivery for crypto offload, + * partial orphan breaks out-of-order-OK logic. + */ + if (skb->decrypted) + return false; +#endif + return (skb->destructor == sock_wfree || + (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); +} + /* This helper is used by netem, as it can hold packets in its * delay queue. We want to allow the owner socket to send more * packets, as if they were already TX completed by a typical driver. @@ -2003,11 +2016,7 @@ void skb_orphan_partial(struct sk_buff *skb) if (skb_is_tcp_pure_ack(skb)) return; - if (skb->destructor == sock_wfree -#ifdef CONFIG_INET - || skb->destructor == tcp_wfree -#endif - ) { + if (can_skb_orphan_partial(skb)) { struct sock *sk = skb->sk; if (refcount_inc_not_zero(&sk->sk_refcnt)) { diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 776905899ac0..77b485d60b9d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -984,6 +984,9 @@ new_segment: if (!skb) goto wait_for_memory; +#ifdef CONFIG_TLS_DEVICE + skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED); +#endif skb_entail(sk, skb); copy = size_goal; } diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 3d1e15401384..8a56e09cfb0e 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -398,10 +398,14 @@ more_data: static int tcp_bpf_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { struct sk_msg tmp, *msg_tx = NULL; - int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; int copied = 0, err = 0; struct sk_psock *psock; long timeo; + int flags; + + /* Don't let internal do_tcp_sendpages() flags through */ + flags = (msg->msg_flags & ~MSG_SENDPAGE_DECRYPTED); + flags |= MSG_NO_SHARED_FRAGS; psock = sk_psock_get(sk); if (unlikely(!psock)) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 6e4afc48d7bb..979520e46e33 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1320,6 +1320,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, buff = sk_stream_alloc_skb(sk, nsize, gfp, true); if (!buff) return -ENOMEM; /* We'll just try again later. */ + skb_copy_decrypted(buff, skb); sk->sk_wmem_queued += buff->truesize; sk_mem_charge(sk, buff->truesize); @@ -1874,6 +1875,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, buff = sk_stream_alloc_skb(sk, 0, gfp, true); if (unlikely(!buff)) return -ENOMEM; + skb_copy_decrypted(buff, skb); sk->sk_wmem_queued += buff->truesize; sk_mem_charge(sk, buff->truesize); @@ -2143,6 +2145,7 @@ static int tcp_mtu_probe(struct sock *sk) sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); + skb_copy_decrypted(nskb, skb); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size; diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 7c0b2b778703..43922d86e510 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -373,9 +373,9 @@ static int tls_push_data(struct sock *sk, struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); - int tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; int more = flags & (MSG_SENDPAGE_NOTLAST | MSG_MORE); struct tls_record_info *record = ctx->open_record; + int tls_push_record_flags; struct page_frag *pfrag; size_t orig_size = size; u32 max_open_record_len; @@ -390,6 +390,9 @@ static int tls_push_data(struct sock *sk, if (sk->sk_err) return -sk->sk_err; + flags |= MSG_SENDPAGE_DECRYPTED; + tls_push_record_flags = flags | MSG_SENDPAGE_NOTLAST; + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); if (tls_is_partially_sent_record(tls_ctx)) { rc = tls_push_partial_record(sk, tls_ctx, flags); @@ -576,7 +579,9 @@ void tls_device_write_space(struct sock *sk, struct tls_context *ctx) gfp_t sk_allocation = sk->sk_allocation; sk->sk_allocation = GFP_ATOMIC; - tls_push_partial_record(sk, ctx, MSG_DONTWAIT | MSG_NOSIGNAL); + tls_push_partial_record(sk, ctx, + MSG_DONTWAIT | MSG_NOSIGNAL | + MSG_SENDPAGE_DECRYPTED); sk->sk_allocation = sk_allocation; } } -- cgit v1.2.3 From 6a0a8d10a3661a036b55af695542a714c429ab7c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Aug 2019 11:01:27 +0200 Subject: netfilter: nf_tables: use-after-free in failing rule with bound set If a rule that has already a bound anonymous set fails to be added, the preparation phase releases the rule and the bound set. However, the transaction object from the abort path still has a reference to the set object that is stale, leading to a use-after-free when checking for the set->bound field. Add a new field to the transaction that specifies if the set is bound, so the abort path can skip releasing it since the rule command owns it and it takes care of releasing it. After this update, the set->bound field is removed. [ 24.649883] Unable to handle kernel paging request at virtual address 0000000000040434 [ 24.657858] Mem abort info: [ 24.660686] ESR = 0x96000004 [ 24.663769] Exception class = DABT (current EL), IL = 32 bits [ 24.669725] SET = 0, FnV = 0 [ 24.672804] EA = 0, S1PTW = 0 [ 24.675975] Data abort info: [ 24.678880] ISV = 0, ISS = 0x00000004 [ 24.682743] CM = 0, WnR = 0 [ 24.685723] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000428952000 [ 24.692207] [0000000000040434] pgd=0000000000000000 [ 24.697119] Internal error: Oops: 96000004 [#1] SMP [...] [ 24.889414] Call trace: [ 24.891870] __nf_tables_abort+0x3f0/0x7a0 [ 24.895984] nf_tables_abort+0x20/0x40 [ 24.899750] nfnetlink_rcv_batch+0x17c/0x588 [ 24.904037] nfnetlink_rcv+0x13c/0x190 [ 24.907803] netlink_unicast+0x18c/0x208 [ 24.911742] netlink_sendmsg+0x1b0/0x350 [ 24.915682] sock_sendmsg+0x4c/0x68 [ 24.919185] ___sys_sendmsg+0x288/0x2c8 [ 24.923037] __sys_sendmsg+0x7c/0xd0 [ 24.926628] __arm64_sys_sendmsg+0x2c/0x38 [ 24.930744] el0_svc_common.constprop.0+0x94/0x158 [ 24.935556] el0_svc_handler+0x34/0x90 [ 24.939322] el0_svc+0x8/0xc [ 24.942216] Code: 37280300 f9404023 91014262 aa1703e0 (f9401863) [ 24.948336] ---[ end trace cebbb9dcbed3b56f ]--- Fixes: f6ac85858976 ("netfilter: nf_tables: unbind set in rule from commit path") Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 9 +++++++-- net/netfilter/nf_tables_api.c | 15 ++++++++++----- 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 9b624566b82d..475d6f28ca67 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -421,8 +421,7 @@ struct nft_set { unsigned char *udata; /* runtime data below here */ const struct nft_set_ops *ops ____cacheline_aligned; - u16 flags:13, - bound:1, + u16 flags:14, genmask:2; u8 klen; u8 dlen; @@ -1348,12 +1347,15 @@ struct nft_trans_rule { struct nft_trans_set { struct nft_set *set; u32 set_id; + bool bound; }; #define nft_trans_set(trans) \ (((struct nft_trans_set *)trans->data)->set) #define nft_trans_set_id(trans) \ (((struct nft_trans_set *)trans->data)->set_id) +#define nft_trans_set_bound(trans) \ + (((struct nft_trans_set *)trans->data)->bound) struct nft_trans_chain { bool update; @@ -1384,12 +1386,15 @@ struct nft_trans_table { struct nft_trans_elem { struct nft_set *set; struct nft_set_elem elem; + bool bound; }; #define nft_trans_elem_set(trans) \ (((struct nft_trans_elem *)trans->data)->set) #define nft_trans_elem(trans) \ (((struct nft_trans_elem *)trans->data)->elem) +#define nft_trans_elem_set_bound(trans) \ + (((struct nft_trans_elem *)trans->data)->bound) struct nft_trans_obj { struct nft_object *obj; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 605a7cfe7ca7..88abbddf8967 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -138,9 +138,14 @@ static void nft_set_trans_bind(const struct nft_ctx *ctx, struct nft_set *set) return; list_for_each_entry_reverse(trans, &net->nft.commit_list, list) { - if (trans->msg_type == NFT_MSG_NEWSET && - nft_trans_set(trans) == set) { - set->bound = true; + switch (trans->msg_type) { + case NFT_MSG_NEWSET: + if (nft_trans_set(trans) == set) + nft_trans_set_bound(trans) = true; + break; + case NFT_MSG_NEWSETELEM: + if (nft_trans_elem_set(trans) == set) + nft_trans_elem_set_bound(trans) = true; break; } } @@ -6906,7 +6911,7 @@ static int __nf_tables_abort(struct net *net) break; case NFT_MSG_NEWSET: trans->ctx.table->use--; - if (nft_trans_set(trans)->bound) { + if (nft_trans_set_bound(trans)) { nft_trans_destroy(trans); break; } @@ -6918,7 +6923,7 @@ static int __nf_tables_abort(struct net *net) nft_trans_destroy(trans); break; case NFT_MSG_NEWSETELEM: - if (nft_trans_elem_set(trans)->bound) { + if (nft_trans_elem_set_bound(trans)) { nft_trans_destroy(trans); break; } -- cgit v1.2.3 From 3e68db2f6422d711550a32cbc87abd97bb6efab3 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Aug 2019 11:01:33 +0200 Subject: netfilter: nf_flow_table: conntrack picks up expired flows Update conntrack entry to pick up expired flows, otherwise the conntrack entry gets stuck with the internal offload timeout (one day). The TCP state also needs to be adjusted to ESTABLISHED state and tracking is set to liberal mode in order to give conntrack a chance to pick up the expired flow. Fixes: ac2a66665e23 ("netfilter: add generic flow table infrastructure") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_core.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index e3d797252a98..68a24471ffee 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -111,7 +111,7 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) #define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) #define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) -static void flow_offload_fixup_ct_state(struct nf_conn *ct) +static void flow_offload_fixup_ct(struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; unsigned int timeout; @@ -208,6 +208,11 @@ int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow) } EXPORT_SYMBOL_GPL(flow_offload_add); +static inline bool nf_flow_has_expired(const struct flow_offload *flow) +{ + return (__s32)(flow->timeout - (u32)jiffies) <= 0; +} + static void flow_offload_del(struct nf_flowtable *flow_table, struct flow_offload *flow) { @@ -223,6 +228,9 @@ static void flow_offload_del(struct nf_flowtable *flow_table, e = container_of(flow, struct flow_offload_entry, flow); clear_bit(IPS_OFFLOAD_BIT, &e->ct->status); + if (nf_flow_has_expired(flow)) + flow_offload_fixup_ct(e->ct); + flow_offload_free(flow); } @@ -233,7 +241,7 @@ void flow_offload_teardown(struct flow_offload *flow) flow->flags |= FLOW_OFFLOAD_TEARDOWN; e = container_of(flow, struct flow_offload_entry, flow); - flow_offload_fixup_ct_state(e->ct); + flow_offload_fixup_ct(e->ct); } EXPORT_SYMBOL_GPL(flow_offload_teardown); @@ -298,11 +306,6 @@ nf_flow_table_iterate(struct nf_flowtable *flow_table, return err; } -static inline bool nf_flow_has_expired(const struct flow_offload *flow) -{ - return (__s32)(flow->timeout - (u32)jiffies) <= 0; -} - static void nf_flow_offload_gc_step(struct flow_offload *flow, void *data) { struct nf_flowtable *flow_table = data; -- cgit v1.2.3 From 1e5b2471bcc4838df298080ae1ec042c2cbc9ce9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 9 Aug 2019 11:01:35 +0200 Subject: netfilter: nf_flow_table: teardown flow timeout race Flows that are in teardown state (due to RST / FIN TCP packet) still have their offload flag set on. Hence, the conntrack garbage collector may race to undo the timeout adjustment that the fixup routine performs, leaving the conntrack entry in place with the internal offload timeout (one day). Update teardown flow state to ESTABLISHED and set tracking to liberal, then once the offload bit is cleared, adjust timeout if it is more than the default fixup timeout (conntrack might already have set a lower timeout from the packet path). Fixes: da5984e51063 ("netfilter: nf_flow_table: add support for sending flows back to the slow path") Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_flow_table_core.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 68a24471ffee..80a8f9ae4c93 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -111,15 +111,16 @@ static void flow_offload_fixup_tcp(struct ip_ct_tcp *tcp) #define NF_FLOWTABLE_TCP_PICKUP_TIMEOUT (120 * HZ) #define NF_FLOWTABLE_UDP_PICKUP_TIMEOUT (30 * HZ) -static void flow_offload_fixup_ct(struct nf_conn *ct) +static inline __s32 nf_flow_timeout_delta(unsigned int timeout) +{ + return (__s32)(timeout - (u32)jiffies); +} + +static void flow_offload_fixup_ct_timeout(struct nf_conn *ct) { const struct nf_conntrack_l4proto *l4proto; + int l4num = nf_ct_protonum(ct); unsigned int timeout; - int l4num; - - l4num = nf_ct_protonum(ct); - if (l4num == IPPROTO_TCP) - flow_offload_fixup_tcp(&ct->proto.tcp); l4proto = nf_ct_l4proto_find(l4num); if (!l4proto) @@ -132,7 +133,20 @@ static void flow_offload_fixup_ct(struct nf_conn *ct) else return; - ct->timeout = nfct_time_stamp + timeout; + if (nf_flow_timeout_delta(ct->timeout) > (__s32)timeout) + ct->timeout = nfct_time_stamp + timeout; +} + +static void flow_offload_fixup_ct_state(struct nf_conn *ct) +{ + if (nf_ct_protonum(ct) == IPPROTO_TCP) + flow_offload_fixup_tcp(&ct->proto.tcp); +} + +static void flow_offload_fixup_ct(struct nf_conn *ct) +{ + flow_offload_fixup_ct_state(ct); + flow_offload_fixup_ct_timeout(ct); } void flow_offload_free(struct flow_offload *flow) @@ -210,7 +224,7 @@ EXPORT_SYMBOL_GPL(flow_offload_add); static inline bool nf_flow_has_expired(const struct flow_offload *flow) { - return (__s32)(flow->timeout - (u32)jiffies) <= 0; + return nf_flow_timeout_delta(flow->timeout) <= 0; } static void flow_offload_del(struct nf_flowtable *flow_table, @@ -230,6 +244,8 @@ static void flow_offload_del(struct nf_flowtable *flow_table, if (nf_flow_has_expired(flow)) flow_offload_fixup_ct(e->ct); + else if (flow->flags & FLOW_OFFLOAD_TEARDOWN) + flow_offload_fixup_ct_timeout(e->ct); flow_offload_free(flow); } @@ -241,7 +257,7 @@ void flow_offload_teardown(struct flow_offload *flow) flow->flags |= FLOW_OFFLOAD_TEARDOWN; e = container_of(flow, struct flow_offload_entry, flow); - flow_offload_fixup_ct(e->ct); + flow_offload_fixup_ct_state(e->ct); } EXPORT_SYMBOL_GPL(flow_offload_teardown); -- cgit v1.2.3 From 730c5fd42c1e3652a065448fd235cb9fafb2bd10 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 9 Aug 2019 15:20:41 +0100 Subject: rxrpc: Fix local endpoint refcounting The object lifetime management on the rxrpc_local struct is broken in that the rxrpc_local_processor() function is expected to clean up and remove an object - but it may get requeued by packets coming in on the backing UDP socket once it starts running. This may result in the assertion in rxrpc_local_rcu() firing because the memory has been scheduled for RCU destruction whilst still queued: rxrpc: Assertion failed ------------[ cut here ]------------ kernel BUG at net/rxrpc/local_object.c:468! Note that if the processor comes around before the RCU free function, it will just do nothing because ->dead is true. Fix this by adding a separate refcount to count active users of the endpoint that causes the endpoint to be destroyed when it reaches 0. The original refcount can then be used to refcount objects through the work processor and cause the memory to be rcu freed when that reaches 0. Fixes: 4f95dd78a77e ("rxrpc: Rework local endpoint management") Reported-by: syzbot+1e0edc4b8b7494c28450@syzkaller.appspotmail.com Signed-off-by: David Howells --- net/rxrpc/af_rxrpc.c | 4 +-- net/rxrpc/ar-internal.h | 5 ++- net/rxrpc/input.c | 16 ++++++--- net/rxrpc/local_object.c | 86 ++++++++++++++++++++++++++++++------------------ 4 files changed, 72 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index d09eaf153544..8c9bd3ae9edf 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -193,7 +193,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len) service_in_use: write_unlock(&local->services_lock); - rxrpc_put_local(local); + rxrpc_unuse_local(local); ret = -EADDRINUSE; error_unlock: release_sock(&rx->sk); @@ -901,7 +901,7 @@ static int rxrpc_release_sock(struct sock *sk) rxrpc_queue_work(&rxnet->service_conn_reaper); rxrpc_queue_work(&rxnet->client_conn_reaper); - rxrpc_put_local(rx->local); + rxrpc_unuse_local(rx->local); rx->local = NULL; key_put(rx->key); rx->key = NULL; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 822f45386e31..9796c45d2f6a 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -254,7 +254,8 @@ struct rxrpc_security { */ struct rxrpc_local { struct rcu_head rcu; - atomic_t usage; + atomic_t active_users; /* Number of users of the local endpoint */ + atomic_t usage; /* Number of references to the structure */ struct rxrpc_net *rxnet; /* The network ns in which this resides */ struct list_head link; struct socket *socket; /* my UDP socket */ @@ -1002,6 +1003,8 @@ struct rxrpc_local *rxrpc_lookup_local(struct net *, const struct sockaddr_rxrpc struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *); struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *); void rxrpc_put_local(struct rxrpc_local *); +struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *); +void rxrpc_unuse_local(struct rxrpc_local *); void rxrpc_queue_local(struct rxrpc_local *); void rxrpc_destroy_all_locals(struct rxrpc_net *); diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 5bd6f1546e5c..ee95d1cd1cdf 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -1108,8 +1108,12 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local, { _enter("%p,%p", local, skb); - skb_queue_tail(&local->event_queue, skb); - rxrpc_queue_local(local); + if (rxrpc_get_local_maybe(local)) { + skb_queue_tail(&local->event_queue, skb); + rxrpc_queue_local(local); + } else { + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + } } /* @@ -1119,8 +1123,12 @@ static void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb) { CHECK_SLAB_OKAY(&local->usage); - skb_queue_tail(&local->reject_queue, skb); - rxrpc_queue_local(local); + if (rxrpc_get_local_maybe(local)) { + skb_queue_tail(&local->reject_queue, skb); + rxrpc_queue_local(local); + } else { + rxrpc_free_skb(skb, rxrpc_skb_rx_freed); + } } /* diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index b1c71bad510b..9798159ee65f 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -79,6 +79,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, local = kzalloc(sizeof(struct rxrpc_local), GFP_KERNEL); if (local) { atomic_set(&local->usage, 1); + atomic_set(&local->active_users, 1); local->rxnet = rxnet; INIT_LIST_HEAD(&local->link); INIT_WORK(&local->processor, rxrpc_local_processor); @@ -266,11 +267,8 @@ struct rxrpc_local *rxrpc_lookup_local(struct net *net, * bind the transport socket may still fail if we're attempting * to use a local address that the dying object is still using. */ - if (!rxrpc_get_local_maybe(local)) { - cursor = cursor->next; - list_del_init(&local->link); + if (!rxrpc_use_local(local)) break; - } age = "old"; goto found; @@ -284,7 +282,10 @@ struct rxrpc_local *rxrpc_lookup_local(struct net *net, if (ret < 0) goto sock_error; - list_add_tail(&local->link, cursor); + if (cursor != &rxnet->local_endpoints) + list_replace(cursor, &local->link); + else + list_add_tail(&local->link, cursor); age = "new"; found: @@ -342,7 +343,8 @@ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) } /* - * Queue a local endpoint. + * Queue a local endpoint unless it has become unreferenced and pass the + * caller's reference to the work item. */ void rxrpc_queue_local(struct rxrpc_local *local) { @@ -351,15 +353,8 @@ void rxrpc_queue_local(struct rxrpc_local *local) if (rxrpc_queue_work(&local->processor)) trace_rxrpc_local(local, rxrpc_local_queued, atomic_read(&local->usage), here); -} - -/* - * A local endpoint reached its end of life. - */ -static void __rxrpc_put_local(struct rxrpc_local *local) -{ - _enter("%d", local->debug_id); - rxrpc_queue_work(&local->processor); + else + rxrpc_put_local(local); } /* @@ -375,10 +370,45 @@ void rxrpc_put_local(struct rxrpc_local *local) trace_rxrpc_local(local, rxrpc_local_put, n, here); if (n == 0) - __rxrpc_put_local(local); + call_rcu(&local->rcu, rxrpc_local_rcu); } } +/* + * Start using a local endpoint. + */ +struct rxrpc_local *rxrpc_use_local(struct rxrpc_local *local) +{ + unsigned int au; + + local = rxrpc_get_local_maybe(local); + if (!local) + return NULL; + + au = atomic_fetch_add_unless(&local->active_users, 1, 0); + if (au == 0) { + rxrpc_put_local(local); + return NULL; + } + + return local; +} + +/* + * Cease using a local endpoint. Once the number of active users reaches 0, we + * start the closure of the transport in the work processor. + */ +void rxrpc_unuse_local(struct rxrpc_local *local) +{ + unsigned int au; + + au = atomic_dec_return(&local->active_users); + if (au == 0) + rxrpc_queue_local(local); + else + rxrpc_put_local(local); +} + /* * Destroy a local endpoint's socket and then hand the record to RCU to dispose * of. @@ -393,16 +423,6 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local) _enter("%d", local->debug_id); - /* We can get a race between an incoming call packet queueing the - * processor again and the work processor starting the destruction - * process which will shut down the UDP socket. - */ - if (local->dead) { - _leave(" [already dead]"); - return; - } - local->dead = true; - mutex_lock(&rxnet->local_mutex); list_del_init(&local->link); mutex_unlock(&rxnet->local_mutex); @@ -422,13 +442,11 @@ static void rxrpc_local_destroyer(struct rxrpc_local *local) */ rxrpc_purge_queue(&local->reject_queue); rxrpc_purge_queue(&local->event_queue); - - _debug("rcu local %d", local->debug_id); - call_rcu(&local->rcu, rxrpc_local_rcu); } /* - * Process events on an endpoint + * Process events on an endpoint. The work item carries a ref which + * we must release. */ static void rxrpc_local_processor(struct work_struct *work) { @@ -441,8 +459,10 @@ static void rxrpc_local_processor(struct work_struct *work) do { again = false; - if (atomic_read(&local->usage) == 0) - return rxrpc_local_destroyer(local); + if (atomic_read(&local->active_users) == 0) { + rxrpc_local_destroyer(local); + break; + } if (!skb_queue_empty(&local->reject_queue)) { rxrpc_reject_packets(local); @@ -454,6 +474,8 @@ static void rxrpc_local_processor(struct work_struct *work) again = true; } } while (again); + + rxrpc_put_local(local); } /* -- cgit v1.2.3 From e8c3af6bb33a9e4b56920ee00aef92eb5e4cf485 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 9 Aug 2019 15:20:41 +0100 Subject: rxrpc: Don't bother generating maxSkew in the ACK packet Don't bother generating maxSkew in the ACK packet as it has been obsolete since AFS 3.1. Signed-off-by: David Howells Reviewed-by: Jeffrey Altman --- net/rxrpc/af_rxrpc.c | 2 +- net/rxrpc/ar-internal.h | 3 +-- net/rxrpc/call_event.c | 15 ++++++--------- net/rxrpc/input.c | 43 ++++++++++++++++--------------------------- net/rxrpc/output.c | 3 +-- net/rxrpc/recvmsg.c | 6 +++--- 6 files changed, 28 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 8c9bd3ae9edf..0dbbfd1b6487 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -402,7 +402,7 @@ EXPORT_SYMBOL(rxrpc_kernel_check_life); */ void rxrpc_kernel_probe_life(struct socket *sock, struct rxrpc_call *call) { - rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false, rxrpc_propose_ack_ping_for_check_life); rxrpc_send_ack_packet(call, true, NULL); } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 9796c45d2f6a..145335611af6 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -650,7 +650,6 @@ struct rxrpc_call { /* receive-phase ACK management */ u8 ackr_reason; /* reason to ACK */ - u16 ackr_skew; /* skew on packet being ACK'd */ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */ rxrpc_serial_t ackr_first_seq; /* first sequence number received */ rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */ @@ -744,7 +743,7 @@ int rxrpc_reject_call(struct rxrpc_sock *); /* * call_event.c */ -void rxrpc_propose_ACK(struct rxrpc_call *, u8, u16, u32, bool, bool, +void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool, bool, enum rxrpc_propose_ack_trace); void rxrpc_process_call(struct work_struct *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index bc2adeb3acb9..c767679bfa5d 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -43,8 +43,7 @@ static void rxrpc_propose_ping(struct rxrpc_call *call, * propose an ACK be sent */ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u16 skew, u32 serial, bool immediate, - bool background, + u32 serial, bool immediate, bool background, enum rxrpc_propose_ack_trace why) { enum rxrpc_propose_ack_outcome outcome = rxrpc_propose_ack_use; @@ -69,14 +68,12 @@ static void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, if (RXRPC_ACK_UPDATEABLE & (1 << ack_reason)) { outcome = rxrpc_propose_ack_update; call->ackr_serial = serial; - call->ackr_skew = skew; } if (!immediate) goto trace; } else if (prior > rxrpc_ack_priority[call->ackr_reason]) { call->ackr_reason = ack_reason; call->ackr_serial = serial; - call->ackr_skew = skew; } else { outcome = rxrpc_propose_ack_subsume; } @@ -137,11 +134,11 @@ trace: * propose an ACK be sent, locking the call structure */ void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason, - u16 skew, u32 serial, bool immediate, bool background, + u32 serial, bool immediate, bool background, enum rxrpc_propose_ack_trace why) { spin_lock_bh(&call->lock); - __rxrpc_propose_ACK(call, ack_reason, skew, serial, + __rxrpc_propose_ACK(call, ack_reason, serial, immediate, background, why); spin_unlock_bh(&call->lock); } @@ -239,7 +236,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) ack_ts = ktime_sub(now, call->acks_latest_ts); if (ktime_to_ns(ack_ts) < call->peer->rtt) goto out; - rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false, rxrpc_propose_ack_ping_for_lost_ack); rxrpc_send_ack_packet(call, true, NULL); goto out; @@ -372,7 +369,7 @@ recheck_state: if (time_after_eq(now, t)) { trace_rxrpc_timer(call, rxrpc_timer_exp_keepalive, now); cmpxchg(&call->keepalive_at, t, now + MAX_JIFFY_OFFSET); - rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, true, + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, true, rxrpc_propose_ack_ping_for_keepalive); set_bit(RXRPC_CALL_EV_PING, &call->events); } @@ -407,7 +404,7 @@ recheck_state: send_ack = NULL; if (test_and_clear_bit(RXRPC_CALL_EV_ACK_LOST, &call->events)) { call->acks_lost_top = call->tx_top; - rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, 0, true, false, + rxrpc_propose_ACK(call, RXRPC_ACK_PING, 0, true, false, rxrpc_propose_ack_ping_for_lost_ack); send_ack = &call->acks_lost_ping; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index ee95d1cd1cdf..dd47d465d1d3 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -196,15 +196,14 @@ send_extra_data: * Ping the other end to fill our RTT cache and to retrieve the rwind * and MTU parameters. */ -static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb, - int skew) +static void rxrpc_send_ping(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); ktime_t now = skb->tstamp; if (call->peer->rtt_usage < 3 || ktime_before(ktime_add_ms(call->peer->rtt_last_req, 1000), now)) - rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, + rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, true, true, rxrpc_propose_ack_ping_for_params); } @@ -419,8 +418,7 @@ static void rxrpc_input_dup_data(struct rxrpc_call *call, rxrpc_seq_t seq, /* * Process a DATA packet, adding the packet to the Rx ring. */ -static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb, - u16 skew) +static void rxrpc_input_data(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); enum rxrpc_call_state state; @@ -600,11 +598,11 @@ skip: ack: if (ack) - rxrpc_propose_ACK(call, ack, skew, ack_serial, + rxrpc_propose_ACK(call, ack, ack_serial, immediate_ack, true, rxrpc_propose_ack_input_data); else - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, skew, serial, + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, false, true, rxrpc_propose_ack_input_data); @@ -822,8 +820,7 @@ static void rxrpc_input_soft_acks(struct rxrpc_call *call, u8 *acks, * soft-ACK means that the packet may be discarded and retransmission * requested. A phase is complete when all packets are hard-ACK'd. */ -static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, - u16 skew) +static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb) { struct rxrpc_ack_summary summary = { 0 }; struct rxrpc_skb_priv *sp = rxrpc_skb(skb); @@ -867,11 +864,11 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, if (buf.ack.reason == RXRPC_ACK_PING) { _proto("Rx ACK %%%u PING Request", sp->hdr.serial); rxrpc_propose_ACK(call, RXRPC_ACK_PING_RESPONSE, - skew, sp->hdr.serial, true, true, + sp->hdr.serial, true, true, rxrpc_propose_ack_respond_to_ping); } else if (sp->hdr.flags & RXRPC_REQUEST_ACK) { rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, - skew, sp->hdr.serial, true, true, + sp->hdr.serial, true, true, rxrpc_propose_ack_respond_to_ack); } @@ -948,7 +945,7 @@ static void rxrpc_input_ack(struct rxrpc_call *call, struct sk_buff *skb, RXRPC_TX_ANNO_LAST && summary.nr_acks == call->tx_top - hard_ack && rxrpc_is_client_call(call)) - rxrpc_propose_ACK(call, RXRPC_ACK_PING, skew, sp->hdr.serial, + rxrpc_propose_ACK(call, RXRPC_ACK_PING, sp->hdr.serial, false, true, rxrpc_propose_ack_ping_for_lost_reply); @@ -1004,7 +1001,7 @@ static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) * Process an incoming call packet. */ static void rxrpc_input_call_packet(struct rxrpc_call *call, - struct sk_buff *skb, u16 skew) + struct sk_buff *skb) { struct rxrpc_skb_priv *sp = rxrpc_skb(skb); unsigned long timo; @@ -1023,11 +1020,11 @@ static void rxrpc_input_call_packet(struct rxrpc_call *call, switch (sp->hdr.type) { case RXRPC_PACKET_TYPE_DATA: - rxrpc_input_data(call, skb, skew); + rxrpc_input_data(call, skb); break; case RXRPC_PACKET_TYPE_ACK: - rxrpc_input_ack(call, skb, skew); + rxrpc_input_ack(call, skb); break; case RXRPC_PACKET_TYPE_BUSY: @@ -1181,7 +1178,6 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) struct rxrpc_peer *peer = NULL; struct rxrpc_sock *rx = NULL; unsigned int channel; - int skew = 0; _enter("%p", udp_sk); @@ -1309,15 +1305,8 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) goto out; } - /* Note the serial number skew here */ - skew = (int)sp->hdr.serial - (int)conn->hi_serial; - if (skew >= 0) { - if (skew > 0) - conn->hi_serial = sp->hdr.serial; - } else { - skew = -skew; - skew = min(skew, 65535); - } + if ((int)sp->hdr.serial - (int)conn->hi_serial > 0) + conn->hi_serial = sp->hdr.serial; /* Call-bound packets are routed by connection channel. */ channel = sp->hdr.cid & RXRPC_CHANNELMASK; @@ -1380,11 +1369,11 @@ int rxrpc_input_packet(struct sock *udp_sk, struct sk_buff *skb) call = rxrpc_new_incoming_call(local, rx, skb); if (!call) goto reject_packet; - rxrpc_send_ping(call, skb, skew); + rxrpc_send_ping(call, skb); mutex_unlock(&call->user_mutex); } - rxrpc_input_call_packet(call, skb, skew); + rxrpc_input_call_packet(call, skb); goto discard; discard: diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c index 948e3fe249ec..369e516c4bdf 100644 --- a/net/rxrpc/output.c +++ b/net/rxrpc/output.c @@ -87,7 +87,7 @@ static size_t rxrpc_fill_out_ack(struct rxrpc_connection *conn, *_top = top; pkt->ack.bufferSpace = htons(8); - pkt->ack.maxSkew = htons(call->ackr_skew); + pkt->ack.maxSkew = htons(0); pkt->ack.firstPacket = htonl(hard_ack + 1); pkt->ack.previousPacket = htonl(call->ackr_prev_seq); pkt->ack.serial = htonl(serial); @@ -228,7 +228,6 @@ int rxrpc_send_ack_packet(struct rxrpc_call *call, bool ping, if (ping) clear_bit(RXRPC_CALL_PINGING, &call->flags); rxrpc_propose_ACK(call, pkt->ack.reason, - ntohs(pkt->ack.maxSkew), ntohl(pkt->ack.serial), false, true, rxrpc_propose_ack_retry_tx); diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 5abf46cf9e6c..9a7e1bc9791d 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -141,7 +141,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) ASSERTCMP(call->rx_hard_ack, ==, call->rx_top); if (call->state == RXRPC_CALL_CLIENT_RECV_REPLY) { - rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, 0, serial, false, true, + rxrpc_propose_ACK(call, RXRPC_ACK_IDLE, serial, false, true, rxrpc_propose_ack_terminal_ack); //rxrpc_send_ack_packet(call, false, NULL); } @@ -159,7 +159,7 @@ static void rxrpc_end_rx_phase(struct rxrpc_call *call, rxrpc_serial_t serial) call->state = RXRPC_CALL_SERVER_ACK_REQUEST; call->expect_req_by = jiffies + MAX_JIFFY_OFFSET; write_unlock_bh(&call->state_lock); - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, false, true, + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, false, true, rxrpc_propose_ack_processing_op); break; default: @@ -212,7 +212,7 @@ static void rxrpc_rotate_rx_window(struct rxrpc_call *call) if (after_eq(hard_ack, call->ackr_consumed + 2) || after_eq(top, call->ackr_seen + 2) || (hard_ack == top && after(hard_ack, call->ackr_consumed))) - rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, 0, serial, + rxrpc_propose_ACK(call, RXRPC_ACK_DELAY, serial, true, true, rxrpc_propose_ack_rotate_rx); if (call->ackr_reason && call->ackr_reason != RXRPC_ACK_DELAY) -- cgit v1.2.3 From cd48bdda4fb82c2fe569d97af4217c530168c99c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 8 Aug 2019 13:57:25 +0200 Subject: sock: make cookie generation global instead of per netns Generating and retrieving socket cookies are a useful feature that is exposed to BPF for various program types through bpf_get_socket_cookie() helper. The fact that the cookie counter is per netns is quite a limitation for BPF in practice in particular for programs in host namespace that use socket cookies as part of a map lookup key since they will be causing socket cookie collisions e.g. when attached to BPF cgroup hooks or cls_bpf on tc egress in host namespace handling container traffic from veth or ipvlan devices with peer in different netns. Change the counter to be global instead. Socket cookie consumers must assume the value as opqaue in any case. Not every socket must have a cookie generated and knowledge of the counter value itself does not provide much value either way hence conversion to global is fine. Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Cc: Alexei Starovoitov Cc: Willem de Bruijn Cc: Martynas Pumputis Signed-off-by: David S. Miller --- include/net/net_namespace.h | 1 - include/uapi/linux/bpf.h | 4 ++-- net/core/sock_diag.c | 3 ++- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 4a9da951a794..cb668bc2692d 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -61,7 +61,6 @@ struct net { spinlock_t rules_mod_lock; u32 hash_mix; - atomic64_t cookie_gen; struct list_head list; /* list of network namespaces */ struct list_head exit_list; /* To linked to call pernet exit diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index fa1c753dcdbc..a5aa7d3ac6a1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1466,8 +1466,8 @@ union bpf_attr { * If no cookie has been set yet, generate a new cookie. Once * generated, the socket cookie remains stable for the life of the * socket. This helper can be useful for monitoring per socket - * networking traffic statistics as it provides a unique socket - * identifier per namespace. + * networking traffic statistics as it provides a global socket + * identifier that can be assumed unique. * Return * A 8-byte long non-decreasing number on success, or 0 if the * socket field is missing inside *skb*. diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 3312a5849a97..c13ffbd33d8d 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -19,6 +19,7 @@ static const struct sock_diag_handler *sock_diag_handlers[AF_MAX]; static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); static DEFINE_MUTEX(sock_diag_table_mutex); static struct workqueue_struct *broadcast_wq; +static atomic64_t cookie_gen; u64 sock_gen_cookie(struct sock *sk) { @@ -27,7 +28,7 @@ u64 sock_gen_cookie(struct sock *sk) if (res) return res; - res = atomic64_inc_return(&sock_net(sk)->cookie_gen); + res = atomic64_inc_return(&cookie_gen); atomic64_cmpxchg(&sk->sk_cookie, 0, res); } } -- cgit v1.2.3 From 57c722e932cfb82e9820bbaae1b1f7222ea97b52 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 9 Aug 2019 18:36:23 -0700 Subject: net/tls: swap sk_write_space on close Now that we swap the original proto and clear the ULP pointer on close we have to make sure no callback will try to access the freed state. sk_write_space is not part of sk_prot, remember to swap it. Reported-by: syzbot+dcdc9deefaec44785f32@syzkaller.appspotmail.com Fixes: 95fa145479fb ("bpf: sockmap/tls, close can race with map free") Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/tls/tls_main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 9cbbae606ced..ce6ef56a65ef 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -308,6 +308,7 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) if (free_ctx) icsk->icsk_ulp_data = NULL; sk->sk_prot = ctx->sk_proto; + sk->sk_write_space = ctx->sk_write_space; write_unlock_bh(&sk->sk_callback_lock); release_sock(sk); if (ctx->tx_conf == TLS_SW) -- cgit v1.2.3 From 68553f1a6f746bf860bce3eb42d78c26a717d9c0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 9 Aug 2019 22:47:47 +0100 Subject: rxrpc: Fix local refcounting Fix rxrpc_unuse_local() to handle a NULL local pointer as it can be called on an unbound socket on which rx->local is not yet set. The following reproduced (includes omitted): int main(void) { socket(AF_RXRPC, SOCK_DGRAM, AF_INET); return 0; } causes the following oops to occur: BUG: kernel NULL pointer dereference, address: 0000000000000010 ... RIP: 0010:rxrpc_unuse_local+0x8/0x1b ... Call Trace: rxrpc_release+0x2b5/0x338 __sock_release+0x37/0xa1 sock_close+0x14/0x17 __fput+0x115/0x1e9 task_work_run+0x72/0x98 do_exit+0x51b/0xa7a ? __context_tracking_exit+0x4e/0x10e do_group_exit+0xab/0xab __x64_sys_exit_group+0x14/0x17 do_syscall_64+0x89/0x1d4 entry_SYSCALL_64_after_hwframe+0x49/0xbe Reported-by: syzbot+20dee719a2e090427b5f@syzkaller.appspotmail.com Fixes: 730c5fd42c1e ("rxrpc: Fix local endpoint refcounting") Signed-off-by: David Howells cc: Jeffrey Altman Signed-off-by: David S. Miller --- net/rxrpc/local_object.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 9798159ee65f..c9db3e762d8d 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -402,11 +402,13 @@ void rxrpc_unuse_local(struct rxrpc_local *local) { unsigned int au; - au = atomic_dec_return(&local->active_users); - if (au == 0) - rxrpc_queue_local(local); - else - rxrpc_put_local(local); + if (local) { + au = atomic_dec_return(&local->active_users); + if (au == 0) + rxrpc_queue_local(local); + else + rxrpc_put_local(local); + } } /* -- cgit v1.2.3 From 58799865be84e2a895dab72de0e1b996ed943f22 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Sun, 11 Aug 2019 22:18:25 +0800 Subject: net: dsa: Check existence of .port_mdb_add callback before calling it The dsa framework has optional .port_mdb_{prepare,add,del} callback fields for drivers to handle multicast database entries. When adding an entry, the framework goes through a prepare phase, then a commit phase. Drivers not providing these callbacks should be detected in the prepare phase. DSA core may still bypass the bridge layer and call the dsa_port_mdb_add function directly with no prepare phase or no switchdev trans object, and the framework ends up calling an undefined .port_mdb_add callback. This results in a NULL pointer dereference, as shown in the log below. The other functions seem to be properly guarded. Do the same for .port_mdb_add in dsa_switch_mdb_add_bitmap() as well. 8<--- cut here --- Unable to handle kernel NULL pointer dereference at virtual address 00000000 pgd = (ptrval) [00000000] *pgd=00000000 Internal error: Oops: 80000005 [#1] SMP ARM Modules linked in: rtl8xxxu rtl8192cu rtl_usb rtl8192c_common rtlwifi mac80211 cfg80211 CPU: 1 PID: 134 Comm: kworker/1:2 Not tainted 5.3.0-rc1-00247-gd3519030752a #1 Hardware name: Allwinner sun7i (A20) Family Workqueue: events switchdev_deferred_process_work PC is at 0x0 LR is at dsa_switch_event+0x570/0x620 pc : [<00000000>] lr : [] psr: 80070013 sp : ee871db8 ip : 00000000 fp : ee98d0a4 r10: 0000000c r9 : 00000008 r8 : ee89f710 r7 : ee98d040 r6 : ee98d088 r5 : c0f04c48 r4 : ee98d04c r3 : 00000000 r2 : ee89f710 r1 : 00000008 r0 : ee98d040 Flags: Nzcv IRQs on FIQs on Mode SVC_32 ISA ARM Segment none Control: 10c5387d Table: 6deb406a DAC: 00000051 Process kworker/1:2 (pid: 134, stack limit = 0x(ptrval)) Stack: (0xee871db8 to 0xee872000) 1da0: ee871e14 103ace2d 1dc0: 00000000 ffffffff 00000000 ee871e14 00000005 00000000 c08524a0 00000000 1de0: ffffe000 c014bdfc c0f04c48 ee871e98 c0f04c48 ee9e5000 c0851120 c014bef0 1e00: 00000000 b643aea2 ee9b4068 c08509a8 ee2bf940 ee89f710 ee871ecb 00000000 1e20: 00000008 103ace2d 00000000 c087e248 ee29c868 103ace2d 00000001 ffffffff 1e40: 00000000 ee871e98 00000006 00000000 c0fb2a50 c087e2d0 ffffffff c08523c4 1e60: ffffffff c014bdfc 00000006 c0fad2d0 ee871e98 ee89f710 00000000 c014c500 1e80: 00000000 ee89f3c0 c0f04c48 00000000 ee9e5000 c087dfb4 ee9e5000 00000000 1ea0: ee89f710 ee871ecb 00000001 103ace2d 00000000 c0f04c48 00000000 c087e0a8 1ec0: 00000000 efd9a3e0 0089f3c0 103ace2d ee89f700 ee89f710 ee9e5000 00000122 1ee0: 00000100 c087e130 ee89f700 c0fad2c8 c1003ef0 c087de4c 2e928000 c0fad2ec 1f00: c0fad2ec ee839580 ef7a62c0 ef7a9400 00000000 c087def8 c0fad2ec c01447dc 1f20: ef315640 ef7a62c0 00000008 ee839580 ee839594 ef7a62c0 00000008 c0f03d00 1f40: ef7a62d8 ef7a62c0 ffffe000 c0145b84 ffffe000 c0fb2420 c0bfaa8c 00000000 1f60: ffffe000 ee84b600 ee84b5c0 00000000 ee870000 ee839580 c0145b40 ef0e5ea4 1f80: ee84b61c c014a6f8 00000001 ee84b5c0 c014a5b0 00000000 00000000 00000000 1fa0: 00000000 00000000 00000000 c01010e8 00000000 00000000 00000000 00000000 1fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 1fe0: 00000000 00000000 00000000 00000000 00000013 00000000 00000000 00000000 [] (dsa_switch_event) from [] (notifier_call_chain+0x48/0x84) [] (notifier_call_chain) from [] (raw_notifier_call_chain+0x18/0x20) [] (raw_notifier_call_chain) from [] (dsa_port_mdb_add+0x48/0x74) [] (dsa_port_mdb_add) from [] (__switchdev_handle_port_obj_add+0x54/0xd4) [] (__switchdev_handle_port_obj_add) from [] (switchdev_handle_port_obj_add+0x8/0x14) [] (switchdev_handle_port_obj_add) from [] (dsa_slave_switchdev_blocking_event+0x94/0xa4) [] (dsa_slave_switchdev_blocking_event) from [] (notifier_call_chain+0x48/0x84) [] (notifier_call_chain) from [] (blocking_notifier_call_chain+0x50/0x68) [] (blocking_notifier_call_chain) from [] (switchdev_port_obj_notify+0x44/0xa8) [] (switchdev_port_obj_notify) from [] (switchdev_port_obj_add_now+0x90/0x104) [] (switchdev_port_obj_add_now) from [] (switchdev_port_obj_add_deferred+0x14/0x5c) [] (switchdev_port_obj_add_deferred) from [] (switchdev_deferred_process+0x64/0x104) [] (switchdev_deferred_process) from [] (switchdev_deferred_process_work+0xc/0x14) [] (switchdev_deferred_process_work) from [] (process_one_work+0x218/0x50c) [] (process_one_work) from [] (worker_thread+0x44/0x5bc) [] (worker_thread) from [] (kthread+0x148/0x150) [] (kthread) from [] (ret_from_fork+0x14/0x2c) Exception stack(0xee871fb0 to 0xee871ff8) 1fa0: 00000000 00000000 00000000 00000000 1fc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 1fe0: 00000000 00000000 00000000 00000000 00000013 00000000 Code: bad PC value ---[ end trace 1292c61abd17b130 ]--- [] (dsa_switch_event) from [] (notifier_call_chain+0x48/0x84) corresponds to $ arm-linux-gnueabihf-addr2line -C -i -e vmlinux c08533ec linux/net/dsa/switch.c:156 linux/net/dsa/switch.c:178 linux/net/dsa/switch.c:328 Fixes: e6db98db8a95 ("net: dsa: add switch mdb bitmap functions") Signed-off-by: Chen-Yu Tsai Reviewed-by: Vivien Didelot Signed-off-by: David S. Miller --- net/dsa/switch.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 4ec5b7f85d51..09d9286b27cc 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -153,6 +153,9 @@ static void dsa_switch_mdb_add_bitmap(struct dsa_switch *ds, { int port; + if (!ds->ops->port_mdb_add) + return; + for_each_set_bit(port, bitmap, ds->num_ports) ds->ops->port_mdb_add(ds, port, mdb); } -- cgit v1.2.3 From 8874ecae2977e5a2d4f0ba301364435b81c05938 Mon Sep 17 00:00:00 2001 From: Chris Packham Date: Mon, 12 Aug 2019 08:18:25 +1200 Subject: tipc: initialise addr_trail_end when setting node addresses We set the field 'addr_trial_end' to 'jiffies', instead of the current value 0, at the moment the node address is initialized. This guarantees we don't inadvertently enter an address trial period when the node address is explicitly set by the user. Signed-off-by: Chris Packham Acked-by: Jon Maloy Signed-off-by: David S. Miller --- net/tipc/addr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/tipc/addr.c b/net/tipc/addr.c index b88d48d00913..0f1eaed1bd1b 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -75,6 +75,7 @@ void tipc_set_node_addr(struct net *net, u32 addr) tipc_set_node_id(net, node_id); } tn->trial_addr = addr; + tn->addr_trial_end = jiffies; pr_info("32-bit node address hash set to %x\n", addr); } -- cgit v1.2.3 From 48d9cc9d85dde37c87abb7ac9bbec6598ba44b56 Mon Sep 17 00:00:00 2001 From: Fabian Henneke Date: Mon, 15 Jul 2019 19:40:56 +0200 Subject: Bluetooth: hidp: Let hidp_send_message return number of queued bytes Let hidp_send_message return the number of successfully queued bytes instead of an unconditional 0. With the return value fixed to 0, other drivers relying on hidp, such as hidraw, can not return meaningful values from their respective implementations of write(). In particular, with the current behavior, a hidraw device's write() will have different return values depending on whether the device is connected via USB or Bluetooth, which makes it harder to abstract away the transport layer. Signed-off-by: Fabian Henneke Signed-off-by: Marcel Holtmann --- net/bluetooth/hidp/core.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 5abd423b55fa..8d889969ae7e 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -101,6 +101,7 @@ static int hidp_send_message(struct hidp_session *session, struct socket *sock, { struct sk_buff *skb; struct sock *sk = sock->sk; + int ret; BT_DBG("session %p data %p size %d", session, data, size); @@ -114,13 +115,17 @@ static int hidp_send_message(struct hidp_session *session, struct socket *sock, } skb_put_u8(skb, hdr); - if (data && size > 0) + if (data && size > 0) { skb_put_data(skb, data, size); + ret = size; + } else { + ret = 0; + } skb_queue_tail(transmit, skb); wake_up_interruptible(sk_sleep(sk)); - return 0; + return ret; } static int hidp_send_ctrl_message(struct hidp_session *session, -- cgit v1.2.3 From 656c8e9cc1badbc18eefe6ba01d33ebbcae61b9a Mon Sep 17 00:00:00 2001 From: Dirk Morris Date: Thu, 8 Aug 2019 13:57:51 -0700 Subject: netfilter: conntrack: Use consistent ct id hash calculation Change ct id hash calculation to only use invariants. Currently the ct id hash calculation is based on some fields that can change in the lifetime on a conntrack entry in some corner cases. The current hash uses the whole tuple which contains an hlist pointer which will change when the conntrack is placed on the dying list resulting in a ct id change. This patch also removes the reply-side tuple and extension pointer from the hash calculation so that the ct id will will not change from initialization until confirmation. Fixes: 3c79107631db1f7 ("netfilter: ctnetlink: don't use conntrack/expect object addresses as id") Signed-off-by: Dirk Morris Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_conntrack_core.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index a542761e90d1..81a8ef42b88d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -453,13 +453,12 @@ EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); * table location, we assume id gets exposed to userspace. * * Following nf_conn items do not change throughout lifetime - * of the nf_conn after it has been committed to main hash table: + * of the nf_conn: * * 1. nf_conn address - * 2. nf_conn->ext address - * 3. nf_conn->master address (normally NULL) - * 4. tuple - * 5. the associated net namespace + * 2. nf_conn->master address (normally NULL) + * 3. the associated net namespace + * 4. the original direction tuple */ u32 nf_ct_get_id(const struct nf_conn *ct) { @@ -469,9 +468,10 @@ u32 nf_ct_get_id(const struct nf_conn *ct) net_get_random_once(&ct_id_seed, sizeof(ct_id_seed)); a = (unsigned long)ct; - b = (unsigned long)ct->master ^ net_hash_mix(nf_ct_net(ct)); - c = (unsigned long)ct->ext; - d = (unsigned long)siphash(&ct->tuplehash, sizeof(ct->tuplehash), + b = (unsigned long)ct->master; + c = (unsigned long)nf_ct_net(ct); + d = (unsigned long)siphash(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + sizeof(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple), &ct_id_seed); #ifdef CONFIG_64BIT return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed); -- cgit v1.2.3 From a1794de8b92ea6bc2037f445b296814ac826693e Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 12 Aug 2019 20:49:12 +0800 Subject: sctp: fix the transport error_count check As the annotation says in sctp_do_8_2_transport_strike(): "If the transport error count is greater than the pf_retrans threshold, and less than pathmaxrtx ..." It should be transport->error_count checked with pathmaxrxt, instead of asoc->pf_retrans. Fixes: 5aa93bcf66f4 ("sctp: Implement quick failover draft from tsvwg") Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: Jakub Kicinski --- net/sctp/sm_sideeffect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index a554d6d15d1b..1cf5bb5b73c4 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -546,7 +546,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_cmd_seq *commands, */ if (net->sctp.pf_enable && (transport->state == SCTP_ACTIVE) && - (asoc->pf_retrans < transport->pathmaxrxt) && + (transport->error_count < transport->pathmaxrxt) && (transport->error_count > asoc->pf_retrans)) { sctp_assoc_control_transport(asoc, transport, -- cgit v1.2.3 From 6d5afe20397b478192ed8c38ec0ee10fa3aec649 Mon Sep 17 00:00:00 2001 From: zhengbin Date: Tue, 13 Aug 2019 22:05:50 +0800 Subject: sctp: fix memleak in sctp_send_reset_streams If the stream outq is not empty, need to kfree nstr_list. Fixes: d570a59c5b5f ("sctp: only allow the out stream reset when the stream outq is empty") Reported-by: Hulk Robot Signed-off-by: zhengbin Acked-by: Marcelo Ricardo Leitner Acked-by: Neil Horman Signed-off-by: Jakub Kicinski --- net/sctp/stream.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sctp/stream.c b/net/sctp/stream.c index 25946604af85..e83cdaa2ab76 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -316,6 +316,7 @@ int sctp_send_reset_streams(struct sctp_association *asoc, nstr_list[i] = htons(str_list[i]); if (out && !sctp_stream_outq_is_empty(stream, str_nums, nstr_list)) { + kfree(nstr_list); retval = -EAGAIN; goto out; } -- cgit v1.2.3 From dfe42be15fde16232340b8b2a57c359f51cc10d9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 13 Aug 2019 17:41:13 +0200 Subject: netfilter: nft_flow_offload: skip tcp rst and fin packets TCP rst and fin packets do not qualify to place a flow into the flowtable. Most likely there will be no more packets after connection closure. Without this patch, this flow entry expires and connection tracking picks up the entry in ESTABLISHED state using the fixup timeout, which makes this look inconsistent to the user for a connection that is actually already closed. Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nft_flow_offload.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index aa5f571d4361..060a4ed46d5e 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -72,11 +72,11 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, { struct nft_flow_offload *priv = nft_expr_priv(expr); struct nf_flowtable *flowtable = &priv->flowtable->data; + struct tcphdr _tcph, *tcph = NULL; enum ip_conntrack_info ctinfo; struct nf_flow_route route; struct flow_offload *flow; enum ip_conntrack_dir dir; - bool is_tcp = false; struct nf_conn *ct; int ret; @@ -89,7 +89,10 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, switch (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum) { case IPPROTO_TCP: - is_tcp = true; + tcph = skb_header_pointer(pkt->skb, pkt->xt.thoff, + sizeof(_tcph), &_tcph); + if (unlikely(!tcph || tcph->fin || tcph->rst)) + goto out; break; case IPPROTO_UDP: break; @@ -115,7 +118,7 @@ static void nft_flow_offload_eval(const struct nft_expr *expr, if (!flow) goto err_flow_alloc; - if (is_tcp) { + if (tcph) { ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL; } -- cgit v1.2.3 From b00df840fb4004b7087940ac5f68801562d0d2de Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 12 Aug 2019 23:30:06 +0100 Subject: rxrpc: Fix local endpoint replacement When a local endpoint (struct rxrpc_local) ceases to be in use by any AF_RXRPC sockets, it starts the process of being destroyed, but this doesn't cause it to be removed from the namespace endpoint list immediately as tearing it down isn't trivial and can't be done in softirq context, so it gets deferred. If a new socket comes along that wants to bind to the same endpoint, a new rxrpc_local object will be allocated and rxrpc_lookup_local() will use list_replace() to substitute the new one for the old. Then, when the dying object gets to rxrpc_local_destroyer(), it is removed unconditionally from whatever list it is on by calling list_del_init(). However, list_replace() doesn't reset the pointers in the replaced list_head and so the list_del_init() will likely corrupt the local endpoints list. Fix this by using list_replace_init() instead. Fixes: 730c5fd42c1e ("rxrpc: Fix local endpoint refcounting") Reported-by: syzbot+193e29e9387ea5837f1d@syzkaller.appspotmail.com Signed-off-by: David Howells --- net/rxrpc/local_object.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index c9db3e762d8d..c45765b7263e 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -283,7 +283,7 @@ struct rxrpc_local *rxrpc_lookup_local(struct net *net, goto sock_error; if (cursor != &rxnet->local_endpoints) - list_replace(cursor, &local->link); + list_replace_init(cursor, &local->link); else list_add_tail(&local->link, cursor); age = "new"; -- cgit v1.2.3 From 06d9532fa6b34f12a6d75711162d47c17c1add72 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 13 Aug 2019 22:26:36 +0100 Subject: rxrpc: Fix read-after-free in rxrpc_queue_local() rxrpc_queue_local() attempts to queue the local endpoint it is given and then, if successful, prints a trace line. The trace line includes the current usage count - but we're not allowed to look at the local endpoint at this point as we passed our ref on it to the workqueue. Fix this by reading the usage count before queuing the work item. Also fix the reading of local->debug_id for trace lines, which must be done with the same consideration as reading the usage count. Fixes: 09d2bf595db4 ("rxrpc: Add a tracepoint to track rxrpc_local refcounting") Reported-by: syzbot+78e71c5bab4f76a6a719@syzkaller.appspotmail.com Signed-off-by: David Howells --- include/trace/events/rxrpc.h | 6 +++--- net/rxrpc/local_object.c | 19 ++++++++++--------- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index cc1d060cbf13..fa06b528c73c 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -498,10 +498,10 @@ rxrpc_tx_points; #define E_(a, b) { a, b } TRACE_EVENT(rxrpc_local, - TP_PROTO(struct rxrpc_local *local, enum rxrpc_local_trace op, + TP_PROTO(unsigned int local_debug_id, enum rxrpc_local_trace op, int usage, const void *where), - TP_ARGS(local, op, usage, where), + TP_ARGS(local_debug_id, op, usage, where), TP_STRUCT__entry( __field(unsigned int, local ) @@ -511,7 +511,7 @@ TRACE_EVENT(rxrpc_local, ), TP_fast_assign( - __entry->local = local->debug_id; + __entry->local = local_debug_id; __entry->op = op; __entry->usage = usage; __entry->where = where; diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index c45765b7263e..72a6e12a9304 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -93,7 +93,7 @@ static struct rxrpc_local *rxrpc_alloc_local(struct rxrpc_net *rxnet, local->debug_id = atomic_inc_return(&rxrpc_debug_id); memcpy(&local->srx, srx, sizeof(*srx)); local->srx.srx_service = 0; - trace_rxrpc_local(local, rxrpc_local_new, 1, NULL); + trace_rxrpc_local(local->debug_id, rxrpc_local_new, 1, NULL); } _leave(" = %p", local); @@ -321,7 +321,7 @@ struct rxrpc_local *rxrpc_get_local(struct rxrpc_local *local) int n; n = atomic_inc_return(&local->usage); - trace_rxrpc_local(local, rxrpc_local_got, n, here); + trace_rxrpc_local(local->debug_id, rxrpc_local_got, n, here); return local; } @@ -335,7 +335,8 @@ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) if (local) { int n = atomic_fetch_add_unless(&local->usage, 1, 0); if (n > 0) - trace_rxrpc_local(local, rxrpc_local_got, n + 1, here); + trace_rxrpc_local(local->debug_id, rxrpc_local_got, + n + 1, here); else local = NULL; } @@ -343,16 +344,16 @@ struct rxrpc_local *rxrpc_get_local_maybe(struct rxrpc_local *local) } /* - * Queue a local endpoint unless it has become unreferenced and pass the - * caller's reference to the work item. + * Queue a local endpoint and pass the caller's reference to the work item. */ void rxrpc_queue_local(struct rxrpc_local *local) { const void *here = __builtin_return_address(0); + unsigned int debug_id = local->debug_id; + int n = atomic_read(&local->usage); if (rxrpc_queue_work(&local->processor)) - trace_rxrpc_local(local, rxrpc_local_queued, - atomic_read(&local->usage), here); + trace_rxrpc_local(debug_id, rxrpc_local_queued, n, here); else rxrpc_put_local(local); } @@ -367,7 +368,7 @@ void rxrpc_put_local(struct rxrpc_local *local) if (local) { n = atomic_dec_return(&local->usage); - trace_rxrpc_local(local, rxrpc_local_put, n, here); + trace_rxrpc_local(local->debug_id, rxrpc_local_put, n, here); if (n == 0) call_rcu(&local->rcu, rxrpc_local_rcu); @@ -456,7 +457,7 @@ static void rxrpc_local_processor(struct work_struct *work) container_of(work, struct rxrpc_local, processor); bool again; - trace_rxrpc_local(local, rxrpc_local_processing, + trace_rxrpc_local(local->debug_id, rxrpc_local_processing, atomic_read(&local->usage), NULL); do { -- cgit v1.2.3 From d85f01775850a35eae47a0090839baf510c1ef12 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 14 Aug 2019 05:31:54 +0000 Subject: net: tls, fix sk_write_space NULL write when tx disabled The ctx->sk_write_space pointer is only set when TLS tx mode is enabled. When running without TX mode its a null pointer but we still set the sk sk_write_space pointer on close(). Fix the close path to only overwrite sk->sk_write_space when the current pointer is to the tls_write_space function indicating the tls module should clean it up properly as well. Reported-by: Hillf Danton Cc: Ying Xue Cc: Andrey Konovalov Fixes: 57c722e932cfb ("net/tls: swap sk_write_space on close") Signed-off-by: John Fastabend Reviewed-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/tls/tls_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index ce6ef56a65ef..43252a801c3f 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -308,7 +308,8 @@ static void tls_sk_proto_close(struct sock *sk, long timeout) if (free_ctx) icsk->icsk_ulp_data = NULL; sk->sk_prot = ctx->sk_proto; - sk->sk_write_space = ctx->sk_write_space; + if (sk->sk_write_space == tls_write_space) + sk->sk_write_space = ctx->sk_write_space; write_unlock_bh(&sk->sk_callback_lock); release_sock(sk); if (ctx->tx_conf == TLS_SW) -- cgit v1.2.3 From 32d3182cd2cd29b2e7e04df7b0db350fbe11289f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 14 Aug 2019 02:11:57 -0700 Subject: net/packet: fix race in tpacket_snd() packet_sendmsg() checks tx_ring.pg_vec to decide if it must call tpacket_snd(). Problem is that the check is lockless, meaning another thread can issue a concurrent setsockopt(PACKET_TX_RING ) to flip tx_ring.pg_vec back to NULL. Given that tpacket_snd() grabs pg_vec_lock mutex, we can perform the check again to solve the race. syzbot reported : kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 11429 Comm: syz-executor394 Not tainted 5.3.0-rc4+ #101 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:packet_lookup_frame+0x8d/0x270 net/packet/af_packet.c:474 Code: c1 ee 03 f7 73 0c 80 3c 0e 00 0f 85 cb 01 00 00 48 8b 0b 89 c0 4c 8d 24 c1 48 b8 00 00 00 00 00 fc ff df 4c 89 e1 48 c1 e9 03 <80> 3c 01 00 0f 85 94 01 00 00 48 8d 7b 10 4d 8b 3c 24 48 b8 00 00 RSP: 0018:ffff88809f82f7b8 EFLAGS: 00010246 RAX: dffffc0000000000 RBX: ffff8880a45c7030 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 1ffff110148b8e06 RDI: ffff8880a45c703c RBP: ffff88809f82f7e8 R08: ffff888087aea200 R09: fffffbfff134ae50 R10: fffffbfff134ae4f R11: ffffffff89a5727f R12: 0000000000000000 R13: 0000000000000001 R14: ffff8880a45c6ac0 R15: 0000000000000000 FS: 00007fa04716f700(0000) GS:ffff8880ae900000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa04716edb8 CR3: 0000000091eb4000 CR4: 00000000001406e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: packet_current_frame net/packet/af_packet.c:487 [inline] tpacket_snd net/packet/af_packet.c:2667 [inline] packet_sendmsg+0x590/0x6250 net/packet/af_packet.c:2975 sock_sendmsg_nosec net/socket.c:637 [inline] sock_sendmsg+0xd7/0x130 net/socket.c:657 ___sys_sendmsg+0x3e2/0x920 net/socket.c:2311 __sys_sendmmsg+0x1bf/0x4d0 net/socket.c:2413 __do_sys_sendmmsg net/socket.c:2442 [inline] __se_sys_sendmmsg net/socket.c:2439 [inline] __x64_sys_sendmmsg+0x9d/0x100 net/socket.c:2439 do_syscall_64+0xfd/0x6a0 arch/x86/entry/common.c:296 entry_SYSCALL_64_after_hwframe+0x49/0xbe Fixes: 69e3c75f4d54 ("net: TX_RING and packet mmap") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/packet/af_packet.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8d54f3047768..e2742b006d25 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2618,6 +2618,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) mutex_lock(&po->pg_vec_lock); + /* packet_sendmsg() check on tx_ring.pg_vec was lockless, + * we need to confirm it under protection of pg_vec_lock. + */ + if (unlikely(!po->tx_ring.pg_vec)) { + err = -EBUSY; + goto out; + } if (likely(saddr == NULL)) { dev = packet_cached_dev_get(po); proto = po->num; -- cgit v1.2.3 From 712042313b23b5df7451faf4b279beb3025e990c Mon Sep 17 00:00:00 2001 From: Tuong Lien Date: Thu, 15 Aug 2019 10:24:08 +0700 Subject: tipc: fix false detection of retransmit failures This commit eliminates the use of the link 'stale_limit' & 'prev_from' (besides the already removed - 'stale_cnt') variables in the detection of repeated retransmit failures as there is no proper way to initialize them to avoid a false detection, i.e. it is not really a retransmission failure but due to a garbage values in the variables. Instead, a jiffies variable will be added to individual skbs (like the way we restrict the skb retransmissions) in order to mark the first skb retransmit time. Later on, at the next retransmissions, the timestamp will be checked to see if the skb in the link transmq is "too stale", that is, the link tolerance time has passed, so that a link reset will be ordered. Note, just checking on the first skb in the queue is fine enough since it must be the oldest one. A counter is also added to keep track the actual skb retransmissions' number for later checking when the failure happens. The downside of this approach is that the skb->cb[] buffer is about to be exhausted, however it is always able to allocate another memory area and keep a reference to it when needed. Fixes: 77cf8edbc0e7 ("tipc: simplify stale link failure criteria") Reported-by: Hoang Le Acked-by: Ying Xue Acked-by: Jon Maloy Signed-off-by: Tuong Lien Signed-off-by: David S. Miller --- net/tipc/link.c | 92 ++++++++++++++++++++++++++++++++------------------------- net/tipc/msg.h | 8 +++-- 2 files changed, 57 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index 66d3a07bc571..c2c5c53cad22 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -106,8 +106,6 @@ struct tipc_stats { * @transmitq: queue for sent, non-acked messages * @backlogq: queue for messages waiting to be sent * @snt_nxt: next sequence number to use for outbound messages - * @prev_from: sequence number of most previous retransmission request - * @stale_limit: time when repeated identical retransmits must force link reset * @ackers: # of peers that needs to ack each packet before it can be released * @acked: # last packet acked by a certain peer. Used for broadcast. * @rcv_nxt: next sequence number to expect for inbound messages @@ -164,9 +162,7 @@ struct tipc_link { u16 limit; } backlog[5]; u16 snd_nxt; - u16 prev_from; u16 window; - unsigned long stale_limit; /* Reception */ u16 rcv_nxt; @@ -1044,47 +1040,53 @@ static void tipc_link_advance_backlog(struct tipc_link *l, * link_retransmit_failure() - Detect repeated retransmit failures * @l: tipc link sender * @r: tipc link receiver (= l in case of unicast) - * @from: seqno of the 1st packet in retransmit request * @rc: returned code * * Return: true if the repeated retransmit failures happens, otherwise * false */ static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r, - u16 from, int *rc) + int *rc) { struct sk_buff *skb = skb_peek(&l->transmq); struct tipc_msg *hdr; if (!skb) return false; - hdr = buf_msg(skb); - /* Detect repeated retransmit failures on same packet */ - if (r->prev_from != from) { - r->prev_from = from; - r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance); - } else if (time_after(jiffies, r->stale_limit)) { - pr_warn("Retransmission failure on link <%s>\n", l->name); - link_print(l, "State of link "); - pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n", - msg_user(hdr), msg_type(hdr), msg_size(hdr), - msg_errcode(hdr)); - pr_info("sqno %u, prev: %x, src: %x\n", - msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr)); - - trace_tipc_list_dump(&l->transmq, true, "retrans failure!"); - trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!"); - trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!"); + if (!TIPC_SKB_CB(skb)->retr_cnt) + return false; - if (link_is_bc_sndlink(l)) - *rc = TIPC_LINK_DOWN_EVT; + if (!time_after(jiffies, TIPC_SKB_CB(skb)->retr_stamp + + msecs_to_jiffies(r->tolerance))) + return false; + + hdr = buf_msg(skb); + if (link_is_bc_sndlink(l) && !less(r->acked, msg_seqno(hdr))) + return false; + pr_warn("Retransmission failure on link <%s>\n", l->name); + link_print(l, "State of link "); + pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n", + msg_user(hdr), msg_type(hdr), msg_size(hdr), msg_errcode(hdr)); + pr_info("sqno %u, prev: %x, dest: %x\n", + msg_seqno(hdr), msg_prevnode(hdr), msg_destnode(hdr)); + pr_info("retr_stamp %d, retr_cnt %d\n", + jiffies_to_msecs(TIPC_SKB_CB(skb)->retr_stamp), + TIPC_SKB_CB(skb)->retr_cnt); + + trace_tipc_list_dump(&l->transmq, true, "retrans failure!"); + trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!"); + trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!"); + + if (link_is_bc_sndlink(l)) { + r->state = LINK_RESET; + *rc = TIPC_LINK_DOWN_EVT; + } else { *rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT); - return true; } - return false; + return true; } /* tipc_link_bc_retrans() - retransmit zero or more packets @@ -1110,7 +1112,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, trace_tipc_link_retrans(r, from, to, &l->transmq); - if (link_retransmit_failure(l, r, from, &rc)) + if (link_retransmit_failure(l, r, &rc)) return rc; skb_queue_walk(&l->transmq, skb) { @@ -1119,11 +1121,10 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, continue; if (more(msg_seqno(hdr), to)) break; - if (link_is_bc_sndlink(l)) { - if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) - continue; - TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; - } + + if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) + continue; + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM; _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC); if (!_skb) return 0; @@ -1133,6 +1134,10 @@ static int tipc_link_bc_retrans(struct tipc_link *l, struct tipc_link *r, _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; + + /* Increase actual retrans counter & mark first time */ + if (!TIPC_SKB_CB(skb)->retr_cnt++) + TIPC_SKB_CB(skb)->retr_stamp = jiffies; } return 0; } @@ -1357,12 +1362,10 @@ static int tipc_link_advance_transmq(struct tipc_link *l, u16 acked, u16 gap, struct tipc_msg *hdr; u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1; u16 ack = l->rcv_nxt - 1; + bool passed = false; u16 seqno, n = 0; int rc = 0; - if (gap && link_retransmit_failure(l, l, acked + 1, &rc)) - return rc; - skb_queue_walk_safe(&l->transmq, skb, tmp) { seqno = buf_seqno(skb); @@ -1372,12 +1375,17 @@ next_gap_ack: __skb_unlink(skb, &l->transmq); kfree_skb(skb); } else if (less_eq(seqno, acked + gap)) { - /* retransmit skb */ + /* First, check if repeated retrans failures occurs? */ + if (!passed && link_retransmit_failure(l, l, &rc)) + return rc; + passed = true; + + /* retransmit skb if unrestricted*/ if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr)) continue; TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME; - - _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC); + _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, + GFP_ATOMIC); if (!_skb) continue; hdr = buf_msg(_skb); @@ -1386,6 +1394,10 @@ next_gap_ack: _skb->priority = TC_PRIO_CONTROL; __skb_queue_tail(xmitq, _skb); l->stats.retransmitted++; + + /* Increase actual retrans counter & mark first time */ + if (!TIPC_SKB_CB(skb)->retr_cnt++) + TIPC_SKB_CB(skb)->retr_stamp = jiffies; } else { /* retry with Gap ACK blocks if any */ if (!ga || n >= ga->gack_cnt) @@ -2577,7 +2589,7 @@ int tipc_link_dump(struct tipc_link *l, u16 dqueues, char *buf) i += scnprintf(buf + i, sz - i, " %x", l->peer_caps); i += scnprintf(buf + i, sz - i, " %u", l->silent_intv_cnt); i += scnprintf(buf + i, sz - i, " %u", l->rst_cnt); - i += scnprintf(buf + i, sz - i, " %u", l->prev_from); + i += scnprintf(buf + i, sz - i, " %u", 0); i += scnprintf(buf + i, sz - i, " %u", 0); i += scnprintf(buf + i, sz - i, " %u", l->acked); diff --git a/net/tipc/msg.h b/net/tipc/msg.h index da509f0eb9ca..d7ebc9e955f6 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -102,13 +102,15 @@ struct plist; #define TIPC_MEDIA_INFO_OFFSET 5 struct tipc_skb_cb { - u32 bytes_read; - u32 orig_member; struct sk_buff *tail; unsigned long nxt_retr; - bool validated; + unsigned long retr_stamp; + u32 bytes_read; + u32 orig_member; u16 chain_imp; u16 ackers; + u16 retr_cnt; + bool validated; }; #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0])) -- cgit v1.2.3 From 58a96fc35375ab87db7c5b69336f5befde1b548f Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 16 Jul 2019 20:34:41 +0200 Subject: Bluetooth: Add debug setting for changing minimum encryption key size For testing and qualification purposes it is useful to allow changing the minimum encryption key size value that the host stack is going to enforce. This adds a new debugfs setting min_encrypt_key_size to achieve this functionality. Signed-off-by: Marcel Holtmann Signed-off-by: Johan Hedberg --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/hci_core.c | 1 + net/bluetooth/hci_debugfs.c | 31 +++++++++++++++++++++++++++++++ net/bluetooth/l2cap_core.c | 2 +- 4 files changed, 34 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index ded574b32c20..ffc95b382eb5 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -278,6 +278,7 @@ struct hci_dev { __u16 conn_info_min_age; __u16 conn_info_max_age; __u16 auth_payload_timeout; + __u8 min_enc_key_size; __u8 ssp_debug_mode; __u8 hw_error_code; __u32 clock; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b9585e7d9d2e..04bc79359a17 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3202,6 +3202,7 @@ struct hci_dev *hci_alloc_dev(void) hdev->conn_info_min_age = DEFAULT_CONN_INFO_MIN_AGE; hdev->conn_info_max_age = DEFAULT_CONN_INFO_MAX_AGE; hdev->auth_payload_timeout = DEFAULT_AUTH_PAYLOAD_TIMEOUT; + hdev->min_enc_key_size = HCI_MIN_ENC_KEY_SIZE; mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index bb67f4a5479a..402e2cc54044 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -433,6 +433,35 @@ static int auto_accept_delay_set(void *data, u64 val) return 0; } +static int min_encrypt_key_size_set(void *data, u64 val) +{ + struct hci_dev *hdev = data; + + if (val < 1 || val > 16) + return -EINVAL; + + hci_dev_lock(hdev); + hdev->min_enc_key_size = val; + hci_dev_unlock(hdev); + + return 0; +} + +static int min_encrypt_key_size_get(void *data, u64 *val) +{ + struct hci_dev *hdev = data; + + hci_dev_lock(hdev); + *val = hdev->min_enc_key_size; + hci_dev_unlock(hdev); + + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(min_encrypt_key_size_fops, + min_encrypt_key_size_get, + min_encrypt_key_size_set, "%llu\n"); + static int auto_accept_delay_get(void *data, u64 *val) { struct hci_dev *hdev = data; @@ -545,6 +574,8 @@ void hci_debugfs_create_bredr(struct hci_dev *hdev) if (lmp_ssp_capable(hdev)) { debugfs_create_file("ssp_debug_mode", 0444, hdev->debugfs, hdev, &ssp_debug_mode_fops); + debugfs_create_file("min_encrypt_key_size", 0644, hdev->debugfs, + hdev, &min_encrypt_key_size_fops); debugfs_create_file("auto_accept_delay", 0644, hdev->debugfs, hdev, &auto_accept_delay_fops); } diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index cc506fe99b4d..dfc1edb168b7 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1361,7 +1361,7 @@ static bool l2cap_check_enc_key_size(struct hci_conn *hcon) * actually encrypted before enforcing a key size. */ return (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags) || - hcon->enc_key_size >= HCI_MIN_ENC_KEY_SIZE); + hcon->enc_key_size >= hcon->hdev->min_enc_key_size); } static void l2cap_do_start(struct l2cap_chan *chan) -- cgit v1.2.3 From 3bc158f8d0330f0ac58597c023acca2234c14616 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 16 Aug 2019 03:24:10 +0200 Subject: netfilter: nf_tables: map basechain priority to hardware priority This patch adds initial support for offloading basechains using the priority range from 1 to 65535. This is restricting the netfilter priority range to 16-bit integer since this is what most drivers assume so far from tc. It should be possible to extend this range of supported priorities later on once drivers are updated to support for 32-bit integer priorities. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_tables_offload.h | 2 ++ net/netfilter/nf_tables_api.c | 4 ++++ net/netfilter/nf_tables_offload.c | 17 ++++++++++++++--- 3 files changed, 20 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/net/netfilter/nf_tables_offload.h b/include/net/netfilter/nf_tables_offload.h index 3196663a10e3..c8b9dec376f5 100644 --- a/include/net/netfilter/nf_tables_offload.h +++ b/include/net/netfilter/nf_tables_offload.h @@ -73,4 +73,6 @@ int nft_flow_rule_offload_commit(struct net *net); (__reg)->key = __key; \ memset(&(__reg)->mask, 0xff, (__reg)->len); +int nft_chain_offload_priority(struct nft_base_chain *basechain); + #endif diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 88abbddf8967..d47469f824a1 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1667,6 +1667,10 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, chain->flags |= NFT_BASE_CHAIN | flags; basechain->policy = NF_ACCEPT; + if (chain->flags & NFT_CHAIN_HW_OFFLOAD && + nft_chain_offload_priority(basechain) < 0) + return -EOPNOTSUPP; + flow_block_init(&basechain->flow_block); } else { chain = kzalloc(sizeof(*chain), GFP_KERNEL); diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 64f5fd5f240e..c0d18c1d77ac 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -103,10 +103,11 @@ void nft_offload_update_dependency(struct nft_offload_ctx *ctx, } static void nft_flow_offload_common_init(struct flow_cls_common_offload *common, - __be16 proto, - struct netlink_ext_ack *extack) + __be16 proto, int priority, + struct netlink_ext_ack *extack) { common->protocol = proto; + common->prio = priority; common->extack = extack; } @@ -124,6 +125,15 @@ static int nft_setup_cb_call(struct nft_base_chain *basechain, return 0; } +int nft_chain_offload_priority(struct nft_base_chain *basechain) +{ + if (basechain->ops.priority <= 0 || + basechain->ops.priority > USHRT_MAX) + return -1; + + return 0; +} + static int nft_flow_offload_rule(struct nft_trans *trans, enum flow_cls_command command) { @@ -142,7 +152,8 @@ static int nft_flow_offload_rule(struct nft_trans *trans, if (flow) proto = flow->proto; - nft_flow_offload_common_init(&cls_flow.common, proto, &extack); + nft_flow_offload_common_init(&cls_flow.common, proto, + basechain->ops.priority, &extack); cls_flow.command = command; cls_flow.cookie = (unsigned long) rule; if (flow) -- cgit v1.2.3