Diffstat (limited to 'net')
-rw-r--r-- net/802/Makefile | 1
-rw-r--r-- net/802/hippi.c | 193
-rw-r--r-- net/atm/signaling.c | 56
-rw-r--r-- net/ax25/Kconfig | 14
-rw-r--r-- net/ax25/ax25_dev.c | 2
-rw-r--r-- net/bluetooth/hci_conn.c | 162
-rw-r--r-- net/bluetooth/hci_core.c | 1
-rw-r--r-- net/bluetooth/hci_event.c | 79
-rw-r--r-- net/bluetooth/hci_sync.c | 125
-rw-r--r-- net/bluetooth/iso.c | 10
-rw-r--r-- net/bluetooth/l2cap_core.c | 46
-rw-r--r-- net/bluetooth/l2cap_sock.c | 20
-rw-r--r-- net/bluetooth/mgmt_config.c | 21
-rw-r--r-- net/bridge/br_multicast.c | 2
-rw-r--r-- net/bridge/br_netfilter_ipv6.c | 2
-rw-r--r-- net/bridge/br_stp_if.c | 8
-rw-r--r-- net/bridge/br_sysfs_br.c | 108
-rw-r--r-- net/bridge/br_sysfs_if.c | 32
-rw-r--r-- net/bridge/netfilter/nf_conntrack_bridge.c | 7
-rw-r--r-- net/can/Kconfig | 1
-rw-r--r-- net/can/af_can.c | 23
-rw-r--r-- net/can/bcm.c | 26
-rw-r--r-- net/can/gw.c | 42
-rw-r--r-- net/can/isotp.c | 46
-rw-r--r-- net/can/j1939/socket.c | 16
-rw-r--r-- net/can/j1939/transport.c | 39
-rw-r--r-- net/can/raw.c | 23
-rw-r--r-- net/core/Makefile | 3
-rw-r--r-- net/core/dev.c | 24
-rw-r--r-- net/core/dev.h | 5
-rw-r--r-- net/core/dev_ioctl.c | 60
-rw-r--r-- net/core/devmem.c | 27
-rw-r--r-- net/core/devmem.h | 17
-rw-r--r-- net/core/gro.c | 4
-rw-r--r-- net/core/neighbour.c | 150
-rw-r--r-- net/core/net_namespace.c | 34
-rw-r--r-- net/core/netdev_config.c | 78
-rw-r--r-- net/core/netdev_rx_queue.c | 53
-rw-r--r-- net/core/request_sock.c | 127
-rw-r--r-- net/core/skbuff.c | 166
-rw-r--r-- net/core/sock.c | 16
-rw-r--r-- net/core/sysctl_net_core.c | 11
-rw-r--r-- net/devlink/core.c | 6
-rw-r--r-- net/devlink/dev.c | 7
-rw-r--r-- net/devlink/devl_internal.h | 6
-rw-r--r-- net/devlink/rate.c | 13
-rw-r--r-- net/dsa/Kconfig | 7
-rw-r--r-- net/dsa/Makefile | 1
-rw-r--r-- net/dsa/tag_mxl862xx.c | 110
-rw-r--r-- net/dsa/tag_yt921x.c | 89
-rw-r--r-- net/dsa/user.c | 5
-rw-r--r-- net/ethtool/common.c | 301
-rw-r--r-- net/hsr/hsr_framereg.c | 362
-rw-r--r-- net/hsr/hsr_framereg.h | 39
-rw-r--r-- net/hsr/prp_dup_discard_test.c | 156
-rw-r--r-- net/ipv4/Makefile | 2
-rw-r--r-- net/ipv4/cipso_ipv4.c | 3
-rw-r--r-- net/ipv4/fib_lookup.h | 6
-rw-r--r-- net/ipv4/fib_trie.c | 4
-rw-r--r-- net/ipv4/icmp.c | 139
-rw-r--r-- net/ipv4/igmp.c | 4
-rw-r--r-- net/ipv4/inet_connection_sock.c | 24
-rw-r--r-- net/ipv4/ip_output.c | 17
-rw-r--r-- net/ipv4/ip_sockglue.c | 2
-rw-r--r-- net/ipv4/ipconfig.c | 89
-rw-r--r-- net/ipv4/ipmr.c | 2
-rw-r--r-- net/ipv4/metrics.c | 2
-rw-r--r-- net/ipv4/netfilter/nf_reject_ipv4.c | 2
-rw-r--r-- net/ipv4/ping.c | 7
-rw-r--r-- net/ipv4/raw.c | 7
-rw-r--r-- net/ipv4/route.c | 8
-rw-r--r-- net/ipv4/sysctl_net_ipv4.c | 4
-rw-r--r-- net/ipv4/tcp.c | 84
-rw-r--r-- net/ipv4/tcp_cong.c | 5
-rw-r--r-- net/ipv4/tcp_fastopen.c | 86
-rw-r--r-- net/ipv4/tcp_input.c | 305
-rw-r--r-- net/ipv4/tcp_ipv4.c | 37
-rw-r--r-- net/ipv4/tcp_minisocks.c | 43
-rw-r--r-- net/ipv4/tcp_offload.c | 3
-rw-r--r-- net/ipv4/tcp_output.c | 117
-rw-r--r-- net/ipv4/tcp_rate.c | 209
-rw-r--r-- net/ipv4/tcp_recovery.c | 75
-rw-r--r-- net/ipv4/tcp_timer.c | 3
-rw-r--r-- net/ipv4/udp.c | 32
-rw-r--r-- net/ipv4/udp_offload.c | 6
-rw-r--r-- net/ipv6/Makefile | 2
-rw-r--r-- net/ipv6/addrconf.c | 23
-rw-r--r-- net/ipv6/af_inet6.c | 61
-rw-r--r-- net/ipv6/datagram.c | 21
-rw-r--r-- net/ipv6/exthdrs.c | 79
-rw-r--r-- net/ipv6/icmp.c | 9
-rw-r--r-- net/ipv6/inet6_connection_sock.c | 65
-rw-r--r-- net/ipv6/ip6_fib.c | 12
-rw-r--r-- net/ipv6/ip6_gre.c | 2
-rw-r--r-- net/ipv6/ip6_input.c | 2
-rw-r--r-- net/ipv6/ip6_offload.c | 79
-rw-r--r-- net/ipv6/ip6_output.c | 122
-rw-r--r-- net/ipv6/ip6_tunnel.c | 33
-rw-r--r-- net/ipv6/ipv6_sockglue.c | 4
-rw-r--r-- net/ipv6/output_core.c | 7
-rw-r--r-- net/ipv6/raw.c | 34
-rw-r--r-- net/ipv6/route.c | 41
-rw-r--r-- net/ipv6/sit.c | 2
-rw-r--r-- net/ipv6/tcp_ipv6.c | 78
-rw-r--r-- net/ipv6/tcpv6_offload.c | 12
-rw-r--r-- net/ipv6/udp.c | 5
-rw-r--r-- net/ipv6/udp_offload.c | 3
-rw-r--r-- net/iucv/iucv.c | 227
-rw-r--r-- net/mac80211/Makefile | 2
-rw-r--r-- net/mac80211/cfg.c | 60
-rw-r--r-- net/mac80211/driver-ops.h | 21
-rw-r--r-- net/mac80211/drop.h | 46
-rw-r--r-- net/mac80211/eht.c | 175
-rw-r--r-- net/mac80211/ieee80211_i.h | 36
-rw-r--r-- net/mac80211/iface.c | 18
-rw-r--r-- net/mac80211/link.c | 4
-rw-r--r-- net/mac80211/main.c | 15
-rw-r--r-- net/mac80211/mlme.c | 164
-rw-r--r-- net/mac80211/parse.c | 25
-rw-r--r-- net/mac80211/rx.c | 148
-rw-r--r-- net/mac80211/sta_info.c | 35
-rw-r--r-- net/mac80211/sta_info.h | 84
-rw-r--r-- net/mac80211/trace.h | 32
-rw-r--r-- net/mac80211/tx.c | 4
-rw-r--r-- net/mac80211/uhr.c | 30
-rw-r--r-- net/mac80211/util.c | 126
-rw-r--r-- net/mac80211/wpa.c | 6
-rw-r--r-- net/mptcp/pm_kernel.c | 29
-rw-r--r-- net/mptcp/protocol.c | 275
-rw-r--r-- net/mptcp/protocol.h | 12
-rw-r--r-- net/mptcp/subflow.c | 2
-rw-r--r-- net/mptcp/token.c | 16
-rw-r--r-- net/netfilter/ipvs/ip_vs_xmit.c | 2
-rw-r--r-- net/netfilter/nf_conncount.c | 30
-rw-r--r-- net/netfilter/nf_conntrack_bpf.c | 1
-rw-r--r-- net/netfilter/nf_conntrack_h323_main.c | 1
-rw-r--r-- net/netfilter/nf_conntrack_netlink.c | 1
-rw-r--r-- net/netfilter/nf_conntrack_ovs.c | 2
-rw-r--r-- net/netfilter/nf_conntrack_proto_generic.c | 1
-rw-r--r-- net/netfilter/nf_conntrack_proto_gre.c | 2
-rw-r--r-- net/netfilter/nf_conntrack_proto_icmp.c | 1
-rw-r--r-- net/netfilter/nf_conntrack_proto_icmpv6.c | 1
-rw-r--r-- net/netfilter/nf_flow_table_core.c | 12
-rw-r--r-- net/netfilter/nf_flow_table_ip.c | 245
-rw-r--r-- net/netfilter/nf_flow_table_offload.c | 1
-rw-r--r-- net/netfilter/nf_flow_table_path.c | 1
-rw-r--r-- net/netfilter/nf_log_syslog.c | 2
-rw-r--r-- net/netfilter/nf_nat_ovs.c | 3
-rw-r--r-- net/netfilter/nf_nat_proto.c | 1
-rw-r--r-- net/netfilter/nf_synproxy_core.c | 1
-rw-r--r-- net/netfilter/nf_tables_api.c | 37
-rw-r--r-- net/netfilter/nfnetlink_queue.c | 342
-rw-r--r-- net/netfilter/nft_compat.c | 13
-rw-r--r-- net/netfilter/nft_counter.c | 4
-rw-r--r-- net/netfilter/nft_flow_offload.c | 1
-rw-r--r-- net/netfilter/nft_set_hash.c | 9
-rw-r--r-- net/netfilter/nft_set_pipapo.c | 2
-rw-r--r-- net/netfilter/nft_set_rbtree.c | 800
-rw-r--r-- net/netfilter/nft_synproxy.c | 1
-rw-r--r-- net/netfilter/xt_tcpmss.c | 2
-rw-r--r-- net/netfilter/xt_time.c | 8
-rw-r--r-- net/nfc/hci/llc_shdlc.c | 8
-rw-r--r-- net/packet/af_packet.c | 5
-rw-r--r-- net/rds/cong.c | 2
-rw-r--r-- net/rds/connection.c | 38
-rw-r--r-- net/rds/ib_recv.c | 2
-rw-r--r-- net/rds/ib_send.c | 44
-rw-r--r-- net/rds/message.c | 66
-rw-r--r-- net/rds/rds.h | 170
-rw-r--r-- net/rds/recv.c | 39
-rw-r--r-- net/rds/send.c | 139
-rw-r--r-- net/rds/stats.c | 1
-rw-r--r-- net/rds/tcp.c | 28
-rw-r--r-- net/rds/tcp.h | 27
-rw-r--r-- net/rds/tcp_connect.c | 79
-rw-r--r-- net/rds/tcp_listen.c | 212
-rw-r--r-- net/rds/tcp_recv.c | 6
-rw-r--r-- net/rds/tcp_send.c | 4
-rw-r--r-- net/rds/threads.c | 16
-rw-r--r-- net/sched/act_ct.c | 2
-rw-r--r-- net/sched/act_ctinfo.c | 1
-rw-r--r-- net/sched/sch_api.c | 2
-rw-r--r-- net/sched/sch_cake.c | 495
-rw-r--r-- net/sched/sch_fq.c | 28
-rw-r--r-- net/sched/sch_generic.c | 8
-rw-r--r-- net/sched/sch_mq.c | 71
-rw-r--r-- net/smc/af_smc.c | 91
-rw-r--r-- net/tipc/crypto.c | 2
-rw-r--r-- net/unix/af_unix.c | 11
-rw-r--r-- net/vmw_vsock/af_vsock.c | 335
-rw-r--r-- net/vmw_vsock/hyperv_transport.c | 7
-rw-r--r-- net/vmw_vsock/virtio_transport.c | 22
-rw-r--r-- net/vmw_vsock/virtio_transport_common.c | 62
-rw-r--r-- net/vmw_vsock/vmci_transport.c | 28
-rw-r--r-- net/vmw_vsock/vsock_loopback.c | 22
-rw-r--r-- net/wireless/core.c | 38
-rw-r--r-- net/wireless/core.h | 4
-rw-r--r-- net/wireless/nl80211.c | 172
-rw-r--r-- net/wireless/pmsr.c | 27
-rw-r--r-- net/wireless/reg.c | 17
-rw-r--r-- net/wireless/scan.c | 2
-rw-r--r-- net/wireless/sysfs.c | 2
-rw-r--r-- net/wireless/trace.h | 13
-rw-r--r-- net/wireless/util.c | 106
-rw-r--r-- net/xdp/xsk.c | 15
-rw-r--r-- net/xdp/xsk_buff_pool.c | 6
-rw-r--r-- net/xdp/xsk_queue.h | 5
207 files changed, 7130 insertions(+), 3474 deletions(-)
diff --git a/net/802/Makefile b/net/802/Makefile
index 99abc29d537c..9503ef6b2e06 100644
--- a/net/802/Makefile
+++ b/net/802/Makefile
@@ -6,7 +6,6 @@
obj-$(CONFIG_LLC) += psnap.o
obj-$(CONFIG_NET_FC) += fc.o
obj-$(CONFIG_FDDI) += fddi.o
-obj-$(CONFIG_HIPPI) += hippi.o
obj-$(CONFIG_ATALK) += psnap.o
obj-$(CONFIG_STP) += stp.o
obj-$(CONFIG_GARP) += garp.o
diff --git a/net/802/hippi.c b/net/802/hippi.c
deleted file mode 100644
index 1997b7dd265e..000000000000
--- a/net/802/hippi.c
+++ /dev/null
@@ -1,193 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * INET An implementation of the TCP/IP protocol suite for the LINUX
- * operating system. INET is implemented using the BSD Socket
- * interface as the means of communication with the user level.
- *
- * HIPPI-type device handling.
- *
- * Version: @(#)hippi.c 1.0.0 05/29/97
- *
- * Authors: Ross Biro
- * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
- * Mark Evans, <evansmp@uhura.aston.ac.uk>
- * Florian La Roche, <rzsfl@rz.uni-sb.de>
- * Alan Cox, <gw4pts@gw4pts.ampr.org>
- * Jes Sorensen, <Jes.Sorensen@cern.ch>
- */
-
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/socket.h>
-#include <linux/in.h>
-#include <linux/inet.h>
-#include <linux/netdevice.h>
-#include <linux/hippidevice.h>
-#include <linux/skbuff.h>
-#include <linux/errno.h>
-#include <net/arp.h>
-#include <net/sock.h>
-#include <linux/uaccess.h>
-
-/*
- * Create the HIPPI MAC header for an arbitrary protocol layer
- *
- * saddr=NULL means use device source address
- * daddr=NULL means leave destination address (eg unresolved arp)
- */
-
-static int hippi_header(struct sk_buff *skb, struct net_device *dev,
- unsigned short type,
- const void *daddr, const void *saddr, unsigned int len)
-{
- struct hippi_hdr *hip = skb_push(skb, HIPPI_HLEN);
- struct hippi_cb *hcb = (struct hippi_cb *) skb->cb;
-
- if (!len){
- len = skb->len - HIPPI_HLEN;
- printk("hippi_header(): length not supplied\n");
- }
-
- /*
- * Due to the stupidity of the little endian byte-order we
- * have to set the fp field this way.
- */
- hip->fp.fixed = htonl(0x04800018);
- hip->fp.d2_size = htonl(len + 8);
- hip->le.fc = 0;
- hip->le.double_wide = 0; /* only HIPPI 800 for the time being */
- hip->le.message_type = 0; /* Data PDU */
-
- hip->le.dest_addr_type = 2; /* 12 bit SC address */
- hip->le.src_addr_type = 2; /* 12 bit SC address */
-
- memcpy(hip->le.src_switch_addr, dev->dev_addr + 3, 3);
- memset_startat(&hip->le, 0, reserved);
-
- hip->snap.dsap = HIPPI_EXTENDED_SAP;
- hip->snap.ssap = HIPPI_EXTENDED_SAP;
- hip->snap.ctrl = HIPPI_UI_CMD;
- hip->snap.oui[0] = 0x00;
- hip->snap.oui[1] = 0x00;
- hip->snap.oui[2] = 0x00;
- hip->snap.ethertype = htons(type);
-
- if (daddr)
- {
- memcpy(hip->le.dest_switch_addr, daddr + 3, 3);
- memcpy(&hcb->ifield, daddr + 2, 4);
- return HIPPI_HLEN;
- }
- hcb->ifield = 0;
- return -((int)HIPPI_HLEN);
-}
-
-
-/*
- * Determine the packet's protocol ID.
- */
-
-__be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev)
-{
- struct hippi_hdr *hip;
-
- /*
- * This is actually wrong ... question is if we really should
- * set the raw address here.
- */
- skb->dev = dev;
- skb_reset_mac_header(skb);
- hip = (struct hippi_hdr *)skb_mac_header(skb);
- skb_pull(skb, HIPPI_HLEN);
-
- /*
- * No fancy promisc stuff here now.
- */
-
- return hip->snap.ethertype;
-}
-
-EXPORT_SYMBOL(hippi_type_trans);
-
-/*
- * For HIPPI we will actually use the lower 4 bytes of the hardware
- * address as the I-FIELD rather than the actual hardware address.
- */
-int hippi_mac_addr(struct net_device *dev, void *p)
-{
- struct sockaddr *addr = p;
- if (netif_running(dev))
- return -EBUSY;
- dev_addr_set(dev, addr->sa_data);
- return 0;
-}
-EXPORT_SYMBOL(hippi_mac_addr);
-
-int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p)
-{
- /* Never send broadcast/multicast ARP messages */
- NEIGH_VAR_INIT(p, MCAST_PROBES, 0);
-
- /* In IPv6 unicast probes are valid even on NBMA,
- * because they are encapsulated in normal IPv6 protocol.
- * Should be a generic flag.
- */
- if (p->tbl->family != AF_INET6)
- NEIGH_VAR_INIT(p, UCAST_PROBES, 0);
- return 0;
-}
-EXPORT_SYMBOL(hippi_neigh_setup_dev);
-
-static const struct header_ops hippi_header_ops = {
- .create = hippi_header,
-};
-
-
-static void hippi_setup(struct net_device *dev)
-{
- dev->header_ops = &hippi_header_ops;
-
- /*
- * We don't support HIPPI `ARP' for the time being, and probably
- * never will unless someone else implements it. However we
- * still need a fake ARPHRD to make ifconfig and friends play ball.
- */
- dev->type = ARPHRD_HIPPI;
- dev->hard_header_len = HIPPI_HLEN;
- dev->mtu = 65280;
- dev->min_mtu = 68;
- dev->max_mtu = 65280;
- dev->addr_len = HIPPI_ALEN;
- dev->tx_queue_len = 25 /* 5 */;
- memset(dev->broadcast, 0xFF, HIPPI_ALEN);
-
-
- /*
- * HIPPI doesn't support broadcast+multicast and we only use
- * static ARP tables. ARP is disabled by hippi_neigh_setup_dev.
- */
- dev->flags = 0;
-}
-
-/**
- * alloc_hippi_dev - Register HIPPI device
- * @sizeof_priv: Size of additional driver-private structure to be allocated
- * for this HIPPI device
- *
- * Fill in the fields of the device structure with HIPPI-generic values.
- *
- * Constructs a new net device, complete with a private data area of
- * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for
- * this private data area.
- */
-
-struct net_device *alloc_hippi_dev(int sizeof_priv)
-{
- return alloc_netdev(sizeof_priv, "hip%d", NET_NAME_UNKNOWN,
- hippi_setup);
-}
-
-EXPORT_SYMBOL(alloc_hippi_dev);
diff --git a/net/atm/signaling.c b/net/atm/signaling.c
index e70ae2c113f9..358fbe5e4d1d 100644
--- a/net/atm/signaling.c
+++ b/net/atm/signaling.c
@@ -22,6 +22,36 @@
struct atm_vcc *sigd = NULL;
+/*
+ * find_get_vcc - validate and get a reference to a vcc pointer
+ * @vcc: the vcc pointer to validate
+ *
+ * This function validates that @vcc points to a registered VCC in vcc_hash.
+ * If found, it increments the socket reference count and returns the vcc.
+ * The caller must call sock_put(sk_atm(vcc)) when done.
+ *
+ * Returns the vcc pointer if valid, NULL otherwise.
+ */
+static struct atm_vcc *find_get_vcc(struct atm_vcc *vcc)
+{
+ int i;
+
+ read_lock(&vcc_sklist_lock);
+ for (i = 0; i < VCC_HTABLE_SIZE; i++) {
+ struct sock *s;
+
+ sk_for_each(s, &vcc_hash[i]) {
+ if (atm_sk(s) == vcc) {
+ sock_hold(s);
+ read_unlock(&vcc_sklist_lock);
+ return vcc;
+ }
+ }
+ }
+ read_unlock(&vcc_sklist_lock);
+ return NULL;
+}
+
static void sigd_put_skb(struct sk_buff *skb)
{
if (!sigd) {
@@ -69,7 +99,14 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb)
msg = (struct atmsvc_msg *) skb->data;
WARN_ON(refcount_sub_and_test(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc));
- vcc = *(struct atm_vcc **) &msg->vcc;
+
+ vcc = find_get_vcc(*(struct atm_vcc **)&msg->vcc);
+ if (!vcc) {
+ pr_debug("invalid vcc pointer in msg\n");
+ dev_kfree_skb(skb);
+ return -EINVAL;
+ }
+
pr_debug("%d (0x%lx)\n", (int)msg->type, (unsigned long)vcc);
sk = sk_atm(vcc);
@@ -100,7 +137,16 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb)
clear_bit(ATM_VF_WAITING, &vcc->flags);
break;
case as_indicate:
- vcc = *(struct atm_vcc **)&msg->listen_vcc;
+ /* Release the reference from msg->vcc; we'll use msg->listen_vcc instead */
+ sock_put(sk);
+
+ vcc = find_get_vcc(*(struct atm_vcc **)&msg->listen_vcc);
+ if (!vcc) {
+ pr_debug("invalid listen_vcc pointer in msg\n");
+ dev_kfree_skb(skb);
+ return -EINVAL;
+ }
+
sk = sk_atm(vcc);
pr_debug("as_indicate!!!\n");
lock_sock(sk);
@@ -115,6 +161,8 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb)
sk->sk_state_change(sk);
as_indicate_complete:
release_sock(sk);
+ /* Paired with find_get_vcc(msg->listen_vcc) above */
+ sock_put(sk);
return 0;
case as_close:
set_bit(ATM_VF_RELEASED, &vcc->flags);
@@ -131,11 +179,15 @@ as_indicate_complete:
break;
default:
pr_alert("bad message type %d\n", (int)msg->type);
+ /* Paired with find_get_vcc(msg->vcc) above */
+ sock_put(sk);
return -EINVAL;
}
sk->sk_state_change(sk);
out:
dev_kfree_skb(skb);
+ /* Paired with find_get_vcc(msg->vcc) above */
+ sock_put(sk);
return 0;
}
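
The fix above stops trusting the struct atm_vcc pointer that arrives embedded in a userspace message: it is only dereferenced after being found in vcc_hash, and a socket reference is taken under vcc_sklist_lock so the object cannot be freed between lookup and use. The same validate-then-hold shape, reduced to a self-contained userspace sketch (hypothetical names; a mutex standing in for the kernel's rwlock, an atomic counter for sock_hold()):

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct obj {
	atomic_int refcnt;
	struct obj *next;
};

static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER;
static struct obj *registry;	/* singly linked list of live objects */

/* Return obj with an extra reference if it is registered, else NULL.
 * Mirrors find_get_vcc(): the refcount bump happens while the
 * registry lock is held, so the object stays valid for the caller
 * until it drops the reference. */
static struct obj *find_get_obj(struct obj *candidate)
{
	struct obj *o, *found = NULL;

	pthread_mutex_lock(&registry_lock);
	for (o = registry; o; o = o->next) {
		if (o == candidate) {
			atomic_fetch_add(&o->refcnt, 1);
			found = o;
			break;
		}
	}
	pthread_mutex_unlock(&registry_lock);
	return found;
}
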
diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig
index e23a3dc14b93..310169ce1488 100644
--- a/net/ax25/Kconfig
+++ b/net/ax25/Kconfig
@@ -63,20 +63,6 @@ config AX25_DAMA_SLAVE
be enabled at runtime. For more about DAMA see
<https://linux-ax25.in-berlin.de>. If unsure, say Y.
-# placeholder until implemented
-config AX25_DAMA_MASTER
- bool 'AX.25 DAMA Master support'
- depends on AX25_DAMA_SLAVE && BROKEN
- help
- DAMA is a mechanism to prevent collisions when doing AX.25
- networking. A DAMA server (called "master") accepts incoming traffic
- from clients (called "slaves") and redistributes it to other slaves.
- If you say Y here, your Linux box will act as a DAMA master; this is
- transparent in that you don't have to do any special DAMA
- configuration. Linux cannot yet act as a DAMA server. This option
- only compiles DAMA slave support into the kernel. It still needs to
- be explicitly enabled, so if unsure, say Y.
-
config NETROM
tristate "Amateur Radio NET/ROM protocol"
depends on AX25
diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c
index 3733c0254a50..c504ed9c3a88 100644
--- a/net/ax25/ax25_dev.c
+++ b/net/ax25/ax25_dev.c
@@ -82,9 +82,7 @@ void ax25_dev_device_up(struct net_device *dev)
#ifdef CONFIG_AX25_DAMA_SLAVE
ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT;
-#endif
-#if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER)
ax25_ds_setup_timer(ax25_dev);
#endif
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index c3f7828bf9d5..0795818963a5 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -1002,12 +1002,18 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type,
switch (type) {
case ACL_LINK:
conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK;
+ conn->link_policy = hdev->link_policy;
conn->mtu = hdev->acl_mtu;
break;
case LE_LINK:
/* conn->src should reflect the local identity address */
hci_copy_identity_address(hdev, &conn->src, &conn->src_type);
conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu;
+ /* Use the controller supported PHYS as default until the
+ * remote features are resolved.
+ */
+ conn->le_tx_def_phys = hdev->le_tx_def_phys;
+ conn->le_rx_def_phys = hdev->le_rx_def_phys;
break;
case CIS_LINK:
/* conn->src should reflect the local identity address */
@@ -1819,7 +1825,7 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos)
cp.bis.sdu = cpu_to_le16(qos->bcast.out.sdu);
cp.bis.latency = cpu_to_le16(qos->bcast.out.latency);
cp.bis.rtn = qos->bcast.out.rtn;
- cp.bis.phy = qos->bcast.out.phy;
+ cp.bis.phy = qos->bcast.out.phys;
cp.bis.packing = qos->bcast.packing;
cp.bis.framing = qos->bcast.framing;
cp.bis.encryption = qos->bcast.encryption;
@@ -1869,10 +1875,10 @@ static int set_cig_params_sync(struct hci_dev *hdev, void *data)
cis->cis_id = cis_id;
cis->c_sdu = cpu_to_le16(conn->iso_qos.ucast.out.sdu);
cis->p_sdu = cpu_to_le16(conn->iso_qos.ucast.in.sdu);
- cis->c_phy = qos->ucast.out.phy ? qos->ucast.out.phy :
- qos->ucast.in.phy;
- cis->p_phy = qos->ucast.in.phy ? qos->ucast.in.phy :
- qos->ucast.out.phy;
+ cis->c_phys = qos->ucast.out.phys ? qos->ucast.out.phys :
+ qos->ucast.in.phys;
+ cis->p_phys = qos->ucast.in.phys ? qos->ucast.in.phys :
+ qos->ucast.out.phys;
cis->c_rtn = qos->ucast.out.rtn;
cis->p_rtn = qos->ucast.in.rtn;
}
@@ -1974,8 +1980,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
return cis;
/* Update LINK PHYs according to QoS preference */
- cis->le_tx_phy = qos->ucast.out.phy;
- cis->le_rx_phy = qos->ucast.in.phy;
+ cis->le_tx_phy = qos->ucast.out.phys;
+ cis->le_rx_phy = qos->ucast.in.phys;
/* If output interval is not set use the input interval as it cannot be
* 0x000000.
@@ -2090,15 +2096,15 @@ int hci_le_create_cis_pending(struct hci_dev *hdev)
}
static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn,
- struct bt_iso_io_qos *qos, __u8 phy)
+ struct bt_iso_io_qos *qos, __u8 phys)
{
/* Only set MTU if PHY is enabled */
- if (!qos->sdu && qos->phy)
+ if (!qos->sdu && qos->phys)
qos->sdu = conn->mtu;
/* Use the same PHY as ACL if set to any */
- if (qos->phy == BT_ISO_PHY_ANY)
- qos->phy = phy;
+ if (qos->phys == BT_ISO_PHY_ANY)
+ qos->phys = phys;
/* Use LE ACL connection interval if not set */
if (!qos->interval)
@@ -2118,7 +2124,7 @@ static int create_big_sync(struct hci_dev *hdev, void *data)
u32 flags = 0;
int err;
- if (qos->bcast.out.phy == 0x02)
+ if (qos->bcast.out.phys == BIT(1))
flags |= MGMT_ADV_FLAG_SEC_2M;
/* Align intervals */
@@ -2227,8 +2233,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid,
return conn;
/* Update LINK PHYs according to QoS preference */
- conn->le_tx_phy = qos->bcast.out.phy;
- conn->le_tx_phy = qos->bcast.out.phy;
+ conn->le_tx_def_phys = qos->bcast.out.phys;
/* Add Basic Announcement into Periodic Adv Data if BASE is set */
if (base_len && base) {
@@ -2237,7 +2242,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid,
}
hci_iso_qos_setup(hdev, conn, &qos->bcast.out,
- conn->le_tx_phy ? conn->le_tx_phy :
+ conn->le_tx_def_phys ? conn->le_tx_def_phys :
hdev->le_tx_def_phys);
conn->iso_qos = *qos;
@@ -2357,9 +2362,11 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
return le;
hci_iso_qos_setup(hdev, le, &qos->ucast.out,
- le->le_tx_phy ? le->le_tx_phy : hdev->le_tx_def_phys);
+ le->le_tx_def_phys ? le->le_tx_def_phys :
+ hdev->le_tx_def_phys);
hci_iso_qos_setup(hdev, le, &qos->ucast.in,
- le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys);
+ le->le_rx_def_phys ? le->le_rx_def_phys :
+ hdev->le_rx_def_phys);
cis = hci_bind_cis(hdev, dst, dst_type, qos, timeout);
if (IS_ERR(cis)) {
@@ -2614,8 +2621,8 @@ void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active)
timer:
if (hdev->idle_timeout > 0)
- queue_delayed_work(hdev->workqueue, &conn->idle_work,
- msecs_to_jiffies(hdev->idle_timeout));
+ mod_delayed_work(hdev->workqueue, &conn->idle_work,
+ msecs_to_jiffies(hdev->idle_timeout));
}
/* Drop all connection on the device */
@@ -2928,22 +2935,22 @@ u32 hci_conn_get_phy(struct hci_conn *conn)
break;
case LE_LINK:
- if (conn->le_tx_phy & HCI_LE_SET_PHY_1M)
+ if (conn->le_tx_def_phys & HCI_LE_SET_PHY_1M)
phys |= BT_PHY_LE_1M_TX;
- if (conn->le_rx_phy & HCI_LE_SET_PHY_1M)
+ if (conn->le_rx_def_phys & HCI_LE_SET_PHY_1M)
phys |= BT_PHY_LE_1M_RX;
- if (conn->le_tx_phy & HCI_LE_SET_PHY_2M)
+ if (conn->le_tx_def_phys & HCI_LE_SET_PHY_2M)
phys |= BT_PHY_LE_2M_TX;
- if (conn->le_rx_phy & HCI_LE_SET_PHY_2M)
+ if (conn->le_rx_def_phys & HCI_LE_SET_PHY_2M)
phys |= BT_PHY_LE_2M_RX;
- if (conn->le_tx_phy & HCI_LE_SET_PHY_CODED)
+ if (conn->le_tx_def_phys & HCI_LE_SET_PHY_CODED)
phys |= BT_PHY_LE_CODED_TX;
- if (conn->le_rx_phy & HCI_LE_SET_PHY_CODED)
+ if (conn->le_rx_def_phys & HCI_LE_SET_PHY_CODED)
phys |= BT_PHY_LE_CODED_RX;
break;
@@ -2952,6 +2959,111 @@ u32 hci_conn_get_phy(struct hci_conn *conn)
return phys;
}
+static u16 bt_phy_pkt_type(struct hci_conn *conn, u32 phys)
+{
+ u16 pkt_type = conn->pkt_type;
+
+ if (phys & BT_PHY_BR_1M_3SLOT)
+ pkt_type |= HCI_DM3 | HCI_DH3;
+ else
+ pkt_type &= ~(HCI_DM3 | HCI_DH3);
+
+ if (phys & BT_PHY_BR_1M_5SLOT)
+ pkt_type |= HCI_DM5 | HCI_DH5;
+ else
+ pkt_type &= ~(HCI_DM5 | HCI_DH5);
+
+ if (phys & BT_PHY_EDR_2M_1SLOT)
+ pkt_type &= ~HCI_2DH1;
+ else
+ pkt_type |= HCI_2DH1;
+
+ if (phys & BT_PHY_EDR_2M_3SLOT)
+ pkt_type &= ~HCI_2DH3;
+ else
+ pkt_type |= HCI_2DH3;
+
+ if (phys & BT_PHY_EDR_2M_5SLOT)
+ pkt_type &= ~HCI_2DH5;
+ else
+ pkt_type |= HCI_2DH5;
+
+ if (phys & BT_PHY_EDR_3M_1SLOT)
+ pkt_type &= ~HCI_3DH1;
+ else
+ pkt_type |= HCI_3DH1;
+
+ if (phys & BT_PHY_EDR_3M_3SLOT)
+ pkt_type &= ~HCI_3DH3;
+ else
+ pkt_type |= HCI_3DH3;
+
+ if (phys & BT_PHY_EDR_3M_5SLOT)
+ pkt_type &= ~HCI_3DH5;
+ else
+ pkt_type |= HCI_3DH5;
+
+ return pkt_type;
+}
+
+static int bt_phy_le_phy(u32 phys, u8 *tx_phys, u8 *rx_phys)
+{
+ if (!tx_phys || !rx_phys)
+ return -EINVAL;
+
+ *tx_phys = 0;
+ *rx_phys = 0;
+
+ if (phys & BT_PHY_LE_1M_TX)
+ *tx_phys |= HCI_LE_SET_PHY_1M;
+
+ if (phys & BT_PHY_LE_1M_RX)
+ *rx_phys |= HCI_LE_SET_PHY_1M;
+
+ if (phys & BT_PHY_LE_2M_TX)
+ *tx_phys |= HCI_LE_SET_PHY_2M;
+
+ if (phys & BT_PHY_LE_2M_RX)
+ *rx_phys |= HCI_LE_SET_PHY_2M;
+
+ if (phys & BT_PHY_LE_CODED_TX)
+ *tx_phys |= HCI_LE_SET_PHY_CODED;
+
+ if (phys & BT_PHY_LE_CODED_RX)
+ *rx_phys |= HCI_LE_SET_PHY_CODED;
+
+ return 0;
+}
+
+int hci_conn_set_phy(struct hci_conn *conn, u32 phys)
+{
+ u8 tx_phys, rx_phys;
+
+ switch (conn->type) {
+ case SCO_LINK:
+ case ESCO_LINK:
+ return -EINVAL;
+ case ACL_LINK:
+ /* Only allow setting BR/EDR PHYs if link type is ACL */
+ if (phys & ~BT_PHY_BREDR_MASK)
+ return -EINVAL;
+
+ return hci_acl_change_pkt_type(conn,
+ bt_phy_pkt_type(conn, phys));
+ case LE_LINK:
+ /* Only allow setting LE PHYs if link type is LE */
+ if (phys & ~BT_PHY_LE_MASK)
+ return -EINVAL;
+
+ if (bt_phy_le_phy(phys, &tx_phys, &rx_phys))
+ return -EINVAL;
+
+ return hci_le_set_phy(conn, tx_phys, rx_phys);
+ default:
+ return -EINVAL;
+ }
+}
+
static int abort_conn_sync(struct hci_dev *hdev, void *data)
{
struct hci_conn *conn = data;
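
bt_phy_pkt_type() and bt_phy_le_phy() above are pure bit shuffling, so they are easy to sanity-check outside the kernel. A standalone mirror of the LE half with the constants open-coded (values copied from bluetooth.h/hci.h at the time of writing; verify against your tree before relying on them):

#include <assert.h>
#include <stdint.h>

#define BT_PHY_LE_1M_TX    0x00000200
#define BT_PHY_LE_1M_RX    0x00000400
#define BT_PHY_LE_2M_TX    0x00000800
#define BT_PHY_LE_2M_RX    0x00001000
#define BT_PHY_LE_CODED_TX 0x00002000
#define BT_PHY_LE_CODED_RX 0x00004000

#define HCI_LE_SET_PHY_1M    0x01
#define HCI_LE_SET_PHY_2M    0x02
#define HCI_LE_SET_PHY_CODED 0x04

/* Same logic as the patch's bt_phy_le_phy(); -1 stands in for -EINVAL. */
static int bt_phy_le_phy(uint32_t phys, uint8_t *tx, uint8_t *rx)
{
	if (!tx || !rx)
		return -1;
	*tx = *rx = 0;
	if (phys & BT_PHY_LE_1M_TX)    *tx |= HCI_LE_SET_PHY_1M;
	if (phys & BT_PHY_LE_1M_RX)    *rx |= HCI_LE_SET_PHY_1M;
	if (phys & BT_PHY_LE_2M_TX)    *tx |= HCI_LE_SET_PHY_2M;
	if (phys & BT_PHY_LE_2M_RX)    *rx |= HCI_LE_SET_PHY_2M;
	if (phys & BT_PHY_LE_CODED_TX) *tx |= HCI_LE_SET_PHY_CODED;
	if (phys & BT_PHY_LE_CODED_RX) *rx |= HCI_LE_SET_PHY_CODED;
	return 0;
}

int main(void)
{
	uint8_t tx, rx;

	/* 2M in both directions, coded RX only */
	assert(!bt_phy_le_phy(BT_PHY_LE_2M_TX | BT_PHY_LE_2M_RX |
			      BT_PHY_LE_CODED_RX, &tx, &rx));
	assert(tx == HCI_LE_SET_PHY_2M);
	assert(rx == (HCI_LE_SET_PHY_2M | HCI_LE_SET_PHY_CODED));
	return 0;
}
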
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 8ccec73dce45..b069607b145b 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -117,6 +117,7 @@ bool hci_discovery_active(struct hci_dev *hdev)
return false;
}
}
+EXPORT_SYMBOL(hci_discovery_active);
void hci_discovery_set_state(struct hci_dev *hdev, int state)
{
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index a9868f17ef40..286529d2e554 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -2869,6 +2869,31 @@ static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status)
hci_dev_unlock(hdev);
}
+static void hci_cs_le_set_phy(struct hci_dev *hdev, u8 status)
+{
+ struct hci_cp_le_set_phy *cp;
+ struct hci_conn *conn;
+
+ bt_dev_dbg(hdev, "status 0x%2.2x", status);
+
+ if (status)
+ return;
+
+ cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PHY);
+ if (!cp)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle));
+ if (conn) {
+ conn->le_tx_def_phys = cp->tx_phys;
+ conn->le_rx_def_phys = cp->rx_phys;
+ }
+
+ hci_dev_unlock(hdev);
+}
+
static void hci_cs_le_read_remote_features(struct hci_dev *hdev, u8 status)
{
struct hci_cp_le_read_remote_features *cp;
@@ -4359,6 +4384,7 @@ static const struct hci_cs {
HCI_CS(HCI_OP_LE_CREATE_CONN, hci_cs_le_create_conn),
HCI_CS(HCI_OP_LE_READ_REMOTE_FEATURES, hci_cs_le_read_remote_features),
HCI_CS(HCI_OP_LE_START_ENC, hci_cs_le_start_enc),
+ HCI_CS(HCI_OP_LE_SET_PHY, hci_cs_le_set_phy),
HCI_CS(HCI_OP_LE_EXT_CREATE_CONN, hci_cs_le_ext_create_conn),
HCI_CS(HCI_OP_LE_CREATE_CIS, hci_cs_le_create_cis),
HCI_CS(HCI_OP_LE_CREATE_BIG, hci_cs_le_create_big),
@@ -6607,8 +6633,20 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data,
conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle));
if (conn) {
- if (!ev->status)
- memcpy(conn->features[0], ev->features, 8);
+ if (!ev->status) {
+ memcpy(conn->le_features, ev->features, 8);
+
+ /* Update supported PHYs */
+ if (!(conn->le_features[1] & HCI_LE_PHY_2M)) {
+ conn->le_tx_def_phys &= ~HCI_LE_SET_PHY_2M;
+ conn->le_rx_def_phys &= ~HCI_LE_SET_PHY_2M;
+ }
+
+ if (!(conn->le_features[1] & HCI_LE_PHY_CODED)) {
+ conn->le_tx_def_phys &= ~HCI_LE_SET_PHY_CODED;
+ conn->le_rx_def_phys &= ~HCI_LE_SET_PHY_CODED;
+ }
+ }
if (conn->state == BT_CONFIG) {
__u8 status;
@@ -6829,6 +6867,21 @@ unlock:
hci_dev_unlock(hdev);
}
+/* Convert LE PHY to QoS PHYs */
+static u8 le_phy_qos(u8 phy)
+{
+ switch (phy) {
+ case 0x01:
+ return HCI_LE_SET_PHY_1M;
+ case 0x02:
+ return HCI_LE_SET_PHY_2M;
+ case 0x03:
+ return HCI_LE_SET_PHY_CODED;
+ }
+
+ return 0;
+}
+
static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data,
struct sk_buff *skb)
{
@@ -6890,8 +6943,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data,
1000);
qos->ucast.in.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0;
qos->ucast.out.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0;
- qos->ucast.in.phy = ev->c_phy;
- qos->ucast.out.phy = ev->p_phy;
+ qos->ucast.in.phys = le_phy_qos(ev->c_phy);
+ qos->ucast.out.phys = le_phy_qos(ev->p_phy);
break;
case HCI_ROLE_MASTER:
qos->ucast.in.interval = p_sdu_interval;
@@ -6905,8 +6958,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data,
1000);
qos->ucast.out.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0;
qos->ucast.in.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0;
- qos->ucast.out.phy = ev->c_phy;
- qos->ucast.in.phy = ev->p_phy;
+ qos->ucast.out.phys = le_phy_qos(ev->c_phy);
+ qos->ucast.in.phys = le_phy_qos(ev->p_phy);
break;
}
@@ -7221,9 +7274,21 @@ static void hci_le_read_all_remote_features_evt(struct hci_dev *hdev,
if (!conn)
goto unlock;
- if (!ev->status)
+ if (!ev->status) {
memcpy(conn->le_features, ev->features, 248);
+ /* Update supported PHYs */
+ if (!(conn->le_features[1] & HCI_LE_PHY_2M)) {
+ conn->le_tx_def_phys &= ~HCI_LE_SET_PHY_2M;
+ conn->le_rx_def_phys &= ~HCI_LE_SET_PHY_2M;
+ }
+
+ if (!(conn->le_features[1] & HCI_LE_PHY_CODED)) {
+ conn->le_tx_def_phys &= ~HCI_LE_SET_PHY_CODED;
+ conn->le_rx_def_phys &= ~HCI_LE_SET_PHY_CODED;
+ }
+ }
+
if (conn->state == BT_CONFIG) {
__u8 status;
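
le_phy_qos() converts the single PHY byte reported in the CIS Established event into the HCI_LE_SET_PHY_* bitmask used everywhere else, and the remote-features handlers above then strip PHYs the peer cannot do. Both steps in one standalone sketch; the feature-bit positions follow hci.h's HCI_LE_PHY_2M/HCI_LE_PHY_CODED, but treat the open-coded values as assumptions:

#include <stdint.h>

#define HCI_LE_SET_PHY_1M    0x01
#define HCI_LE_SET_PHY_2M    0x02
#define HCI_LE_SET_PHY_CODED 0x04

/* Event reports 0x01/0x02/0x03 for 1M/2M/Coded; anything else -> 0. */
static uint8_t le_phy_qos(uint8_t phy)
{
	switch (phy) {
	case 0x01: return HCI_LE_SET_PHY_1M;
	case 0x02: return HCI_LE_SET_PHY_2M;
	case 0x03: return HCI_LE_SET_PHY_CODED;
	}
	return 0;
}

/* Clear PHY bits the remote lacks; feat1 is byte 1 of the remote LE
 * feature mask (bit 0 = LE 2M PHY, bit 3 = LE Coded PHY). */
static uint8_t mask_by_remote(uint8_t phys, uint8_t feat1)
{
	if (!(feat1 & 0x01))	/* HCI_LE_PHY_2M */
		phys &= ~HCI_LE_SET_PHY_2M;
	if (!(feat1 & 0x08))	/* HCI_LE_PHY_CODED */
		phys &= ~HCI_LE_SET_PHY_CODED;
	return phys;
}

int main(void)
{
	uint8_t phys = le_phy_qos(0x02);	/* controller reported 2M */

	/* peer advertises neither 2M nor Coded -> nothing usable left */
	return mask_by_remote(phys, 0x00) == 0 ? 0 : 1;
}
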
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index cbc3a75d7326..f04a90bce4a9 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -2948,8 +2948,8 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type,
if (conn) {
struct bt_iso_qos *qos = &conn->iso_qos;
- if (qos->bcast.in.phy & BT_ISO_PHY_1M ||
- qos->bcast.in.phy & BT_ISO_PHY_2M) {
+ if (qos->bcast.in.phys & BT_ISO_PHY_1M ||
+ qos->bcast.in.phys & BT_ISO_PHY_2M) {
cp->scanning_phys |= LE_SCAN_PHY_1M;
hci_le_scan_phy_params(phy, type,
interval,
@@ -2958,7 +2958,7 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type,
phy++;
}
- if (qos->bcast.in.phy & BT_ISO_PHY_CODED) {
+ if (qos->bcast.in.phys & BT_ISO_PHY_CODED) {
cp->scanning_phys |= LE_SCAN_PHY_CODED;
hci_le_scan_phy_params(phy, type,
interval * 3,
@@ -4428,6 +4428,17 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev)
events[4] |= 0x02; /* LE BIG Info Advertising Report */
}
+ if (le_cs_capable(hdev)) {
+ /* Channel Sounding events */
+ events[5] |= 0x08; /* LE CS Read Remote Supported Cap Complete event */
+ events[5] |= 0x10; /* LE CS Read Remote FAE Table Complete event */
+ events[5] |= 0x20; /* LE CS Security Enable Complete event */
+ events[5] |= 0x40; /* LE CS Config Complete event */
+ events[5] |= 0x80; /* LE CS Procedure Enable Complete event */
+ events[6] |= 0x01; /* LE CS Subevent Result event */
+ events[6] |= 0x02; /* LE CS Subevent Result Continue event */
+ events[6] |= 0x04; /* LE CS Test End Complete event */
+ }
return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EVENT_MASK,
sizeof(events), events, HCI_CMD_TIMEOUT);
}
@@ -4560,23 +4571,43 @@ static int hci_set_le_support_sync(struct hci_dev *hdev)
}
/* LE Set Host Feature */
-static int hci_le_set_host_feature_sync(struct hci_dev *hdev)
+static int hci_le_set_host_feature_sync(struct hci_dev *hdev, u8 bit, u8 value)
{
struct hci_cp_le_set_host_feature cp;
- if (!iso_capable(hdev))
- return 0;
-
memset(&cp, 0, sizeof(cp));
/* Connected Isochronous Channels (Host Support) */
- cp.bit_number = 32;
- cp.bit_value = iso_enabled(hdev) ? 0x01 : 0x00;
+ cp.bit_number = bit;
+ cp.bit_value = value;
return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE,
sizeof(cp), &cp, HCI_CMD_TIMEOUT);
}
+/* Set Host Features; each feature needs to be sent separately since
+ * HCI_OP_LE_SET_HOST_FEATURE doesn't support setting all of them at once.
+ */
+static int hci_le_set_host_features_sync(struct hci_dev *hdev)
+{
+ int err = 0;
+
+ if (iso_capable(hdev)) {
+ /* Connected Isochronous Channels (Host Support) */
+ err = hci_le_set_host_feature_sync(hdev, 32,
+ (iso_enabled(hdev) ? 0x01 :
+ 0x00));
+ if (err)
+ return err;
+ }
+
+ if (le_cs_capable(hdev))
+ /* Channel Sounding (Host Support) */
+ err = hci_le_set_host_feature_sync(hdev, 47, 0x01);
+
+ return err;
+}
+
/* LE Controller init stage 3 command sequence */
static const struct hci_init_stage le_init3[] = {
/* HCI_OP_LE_SET_EVENT_MASK */
@@ -4604,7 +4635,7 @@ static const struct hci_init_stage le_init3[] = {
/* HCI_OP_WRITE_LE_HOST_SUPPORTED */
HCI_INIT(hci_set_le_support_sync),
/* HCI_OP_LE_SET_HOST_FEATURE */
- HCI_INIT(hci_le_set_host_feature_sync),
+ HCI_INIT(hci_le_set_host_features_sync),
{}
};
@@ -6897,8 +6928,6 @@ static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data)
conn->attempt++;
- conn->link_policy = hdev->link_policy;
-
memset(&cp, 0, sizeof(cp));
bacpy(&cp.bdaddr, &conn->dst);
cp.pscan_rep_mode = 0x02;
@@ -7419,3 +7448,75 @@ int hci_le_read_remote_features(struct hci_conn *conn)
return err;
}
+
+static void pkt_type_changed(struct hci_dev *hdev, void *data, int err)
+{
+ struct hci_cp_change_conn_ptype *cp = data;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ kfree(cp);
+}
+
+static int hci_change_conn_ptype_sync(struct hci_dev *hdev, void *data)
+{
+ struct hci_cp_change_conn_ptype *cp = data;
+
+ return __hci_cmd_sync_status_sk(hdev, HCI_OP_CHANGE_CONN_PTYPE,
+ sizeof(*cp), cp,
+ HCI_EV_PKT_TYPE_CHANGE,
+ HCI_CMD_TIMEOUT, NULL);
+}
+
+int hci_acl_change_pkt_type(struct hci_conn *conn, u16 pkt_type)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_change_conn_ptype *cp;
+
+ cp = kmalloc(sizeof(*cp), GFP_KERNEL);
+ if (!cp)
+ return -ENOMEM;
+
+ cp->handle = cpu_to_le16(conn->handle);
+ cp->pkt_type = cpu_to_le16(pkt_type);
+
+ return hci_cmd_sync_queue_once(hdev, hci_change_conn_ptype_sync, cp,
+ pkt_type_changed);
+}
+
+static void le_phy_update_complete(struct hci_dev *hdev, void *data, int err)
+{
+ struct hci_cp_le_set_phy *cp = data;
+
+ bt_dev_dbg(hdev, "err %d", err);
+
+ kfree(cp);
+}
+
+static int hci_le_set_phy_sync(struct hci_dev *hdev, void *data)
+{
+ struct hci_cp_le_set_phy *cp = data;
+
+ return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_SET_PHY,
+ sizeof(*cp), cp,
+ HCI_EV_LE_PHY_UPDATE_COMPLETE,
+ HCI_CMD_TIMEOUT, NULL);
+}
+
+int hci_le_set_phy(struct hci_conn *conn, u8 tx_phys, u8 rx_phys)
+{
+ struct hci_dev *hdev = conn->hdev;
+ struct hci_cp_le_set_phy *cp;
+
+ cp = kmalloc(sizeof(*cp), GFP_KERNEL);
+ if (!cp)
+ return -ENOMEM;
+
+ memset(cp, 0, sizeof(*cp));
+ cp->handle = cpu_to_le16(conn->handle);
+ cp->tx_phys = tx_phys;
+ cp->rx_phys = rx_phys;
+
+ return hci_cmd_sync_queue_once(hdev, hci_le_set_phy_sync, cp,
+ le_phy_update_complete);
+}
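
The HCI LE Set Host Feature command carries exactly one { bit_number, bit_value } pair per invocation, which is why hci_le_set_host_features_sync() issues one command per feature instead of batching. A table-driven sketch of the same shape, with send_cmd() standing in for __hci_cmd_sync_status() and the capability checks hard-coded (bit numbers 32 and 47 taken from the patch):

#include <stdint.h>
#include <stdio.h>

struct host_feature {
	uint8_t bit;
	uint8_t value;
	const char *name;
};

static int send_cmd(uint8_t bit, uint8_t value)
{
	/* pretend the controller accepted the command */
	printf("LE Set Host Feature: bit %u = %u\n",
	       (unsigned)bit, (unsigned)value);
	return 0;
}

int main(void)
{
	static const struct host_feature feats[] = {
		{ 32, 1, "Connected Isochronous Channels" },
		{ 47, 1, "Channel Sounding" },
	};
	int err = 0;	/* initialised so "no features" returns success */

	for (unsigned i = 0; i < sizeof(feats) / sizeof(feats[0]); i++) {
		err = send_cmd(feats[i].bit, feats[i].value);
		if (err)
			break;
	}
	return err;
}
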
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index e36d24a9098b..1459ab161fd2 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -361,7 +361,7 @@ static int iso_connect_bis(struct sock *sk)
}
/* Fail if out PHYs are marked as disabled */
- if (!iso_pi(sk)->qos.bcast.out.phy) {
+ if (!iso_pi(sk)->qos.bcast.out.phys) {
err = -EINVAL;
goto unlock;
}
@@ -458,7 +458,7 @@ static int iso_connect_cis(struct sock *sk)
}
/* Fail if both PHYs are marked as disabled */
- if (!iso_pi(sk)->qos.ucast.in.phy && !iso_pi(sk)->qos.ucast.out.phy) {
+ if (!iso_pi(sk)->qos.ucast.in.phys && !iso_pi(sk)->qos.ucast.out.phys) {
err = -EINVAL;
goto unlock;
}
@@ -894,7 +894,7 @@ static struct proto iso_proto = {
.interval = 10000u, \
.latency = 10u, \
.sdu = 40u, \
- .phy = BT_ISO_PHY_2M, \
+ .phys = BT_ISO_PHY_2M, \
.rtn = 2u, \
}
@@ -1661,7 +1661,7 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg,
static bool check_io_qos(struct bt_iso_io_qos *qos)
{
/* If no PHY is enabled, SDU must be 0 */
- if (!qos->phy && qos->sdu)
+ if (!qos->phys && qos->sdu)
return false;
if (qos->interval && (qos->interval < 0xff || qos->interval > 0xfffff))
@@ -1670,7 +1670,7 @@ static bool check_io_qos(struct bt_iso_io_qos *qos)
if (qos->latency && (qos->latency < 0x05 || qos->latency > 0xfa0))
return false;
- if (qos->phy > BT_ISO_PHY_ANY)
+ if (qos->phys > BT_ISO_PHY_ANY)
return false;
return true;
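
check_io_qos() encodes three range rules for one ISO QoS direction; restated as a standalone function with the bounds copied from the patch (BT_ISO_PHY_ANY assumed to be 0x07, the OR of the three PHY bits):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define BT_ISO_PHY_ANY 0x07

struct io_qos {
	uint32_t interval;	/* SDU interval, microseconds */
	uint16_t latency;	/* milliseconds */
	uint16_t sdu;
	uint8_t  phys;		/* PHY bitmask */
};

static bool check_io_qos(const struct io_qos *q)
{
	if (!q->phys && q->sdu)		/* no PHY enabled -> SDU must be 0 */
		return false;
	if (q->interval && (q->interval < 0xff || q->interval > 0xfffff))
		return false;		/* interval in 255..1048575 us */
	if (q->latency && (q->latency < 0x05 || q->latency > 0xfa0))
		return false;		/* latency in 5..4000 ms */
	return q->phys <= BT_ISO_PHY_ANY;
}

int main(void)
{
	struct io_qos ok  = { 10000, 10, 40, 0x02 };	/* the 2M default */
	struct io_qos bad = { 10000, 10, 40, 0x00 };	/* SDU but no PHY */

	assert(check_io_qos(&ok) && !check_io_qos(&bad));
	return 0;
}
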
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 07b493331fd7..b628b0fa39b2 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -924,26 +924,18 @@ int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator)
initiator);
}
-static u8 l2cap_get_ident(struct l2cap_conn *conn)
+static int l2cap_get_ident(struct l2cap_conn *conn)
{
- u8 id;
+ /* LE link does not support tools like l2ping so use the full range */
+ if (conn->hcon->type == LE_LINK)
+ return ida_alloc_range(&conn->tx_ida, 1, 255, GFP_ATOMIC);
/* Get next available identifier.
* 1 - 128 are used by kernel.
* 129 - 199 are reserved.
* 200 - 254 are used by utilities like l2ping, etc.
*/
-
- mutex_lock(&conn->ident_lock);
-
- if (++conn->tx_ident > 128)
- conn->tx_ident = 1;
-
- id = conn->tx_ident;
-
- mutex_unlock(&conn->ident_lock);
-
- return id;
+ return ida_alloc_range(&conn->tx_ida, 1, 128, GFP_ATOMIC);
}
static void l2cap_send_acl(struct l2cap_conn *conn, struct sk_buff *skb,
@@ -1773,6 +1765,8 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err)
if (work_pending(&conn->pending_rx_work))
cancel_work_sync(&conn->pending_rx_work);
+ ida_destroy(&conn->tx_ida);
+
cancel_delayed_work_sync(&conn->id_addr_timer);
l2cap_unregister_all_users(conn);
@@ -4782,12 +4776,34 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
return err;
}
+static void l2cap_put_ident(struct l2cap_conn *conn, u8 code, u8 id)
+{
+ switch (code) {
+ case L2CAP_COMMAND_REJ:
+ case L2CAP_CONN_RSP:
+ case L2CAP_CONF_RSP:
+ case L2CAP_DISCONN_RSP:
+ case L2CAP_ECHO_RSP:
+ case L2CAP_INFO_RSP:
+ case L2CAP_CONN_PARAM_UPDATE_RSP:
+ case L2CAP_ECRED_CONN_RSP:
+ case L2CAP_ECRED_RECONF_RSP:
+ /* First do a lookup since the remote may send bogus ids that
+ * would make ida_free() generate warnings.
+ */
+ if (ida_find_first_range(&conn->tx_ida, id, id) >= 0)
+ ida_free(&conn->tx_ida, id);
+ }
+}
+
static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn,
struct l2cap_cmd_hdr *cmd, u16 cmd_len,
u8 *data)
{
int err = 0;
+ l2cap_put_ident(conn, cmd->code, cmd->ident);
+
switch (cmd->code) {
case L2CAP_COMMAND_REJ:
l2cap_command_rej(conn, cmd, cmd_len, data);
@@ -5419,6 +5435,8 @@ static inline int l2cap_le_sig_cmd(struct l2cap_conn *conn,
{
int err = 0;
+ l2cap_put_ident(conn, cmd->code, cmd->ident);
+
switch (cmd->code) {
case L2CAP_COMMAND_REJ:
l2cap_le_command_rej(conn, cmd, cmd_len, data);
@@ -6907,13 +6925,13 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon)
hci_dev_test_flag(hcon->hdev, HCI_FORCE_BREDR_SMP)))
conn->local_fixed_chan |= L2CAP_FC_SMP_BREDR;
- mutex_init(&conn->ident_lock);
mutex_init(&conn->lock);
INIT_LIST_HEAD(&conn->chan_l);
INIT_LIST_HEAD(&conn->users);
INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout);
+ ida_init(&conn->tx_ida);
skb_queue_head_init(&conn->pending_rx);
INIT_WORK(&conn->pending_rx_work, process_pending_rx);
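
Replacing the wrapping tx_ident counter with an IDA means an identifier stays reserved until the matching response arrives and l2cap_put_ident() releases it, so long-lived requests can no longer collide. A bitmap-based userspace stand-in for ida_alloc_range()/ida_free() showing the 1..128 (BR/EDR) versus 1..255 (LE) ranges:

#include <assert.h>
#include <stdbool.h>

static bool used[256];

/* Lowest free id in [lo, hi], or -1 (the kernel returns -ENOSPC). */
static int ident_alloc(int lo, int hi)
{
	for (int id = lo; id <= hi; id++) {
		if (!used[id]) {
			used[id] = true;
			return id;
		}
	}
	return -1;
}

static void ident_free(int id)
{
	/* Mirror l2cap_put_ident(): only free ids we actually own,
	 * so a bogus id from the remote cannot corrupt the state. */
	if (id >= 1 && id <= 255 && used[id])
		used[id] = false;
}

int main(void)
{
	assert(ident_alloc(1, 128) == 1);	/* BR/EDR link */
	assert(ident_alloc(1, 128) == 2);
	ident_free(1);
	assert(ident_alloc(1, 128) == 1);	/* freed id is reusable */
	assert(ident_alloc(1, 255) == 3);	/* LE link, full range */
	return 0;
}
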
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 9ee189c815d4..3ba3ce7eaa98 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -885,7 +885,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
struct bt_power pwr;
struct l2cap_conn *conn;
int err = 0;
- u32 opt;
+ u32 opt, phys;
u16 mtu;
u8 mode;
@@ -1059,6 +1059,24 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname,
break;
+ case BT_PHY:
+ if (sk->sk_state != BT_CONNECTED) {
+ err = -ENOTCONN;
+ break;
+ }
+
+ err = copy_safe_from_sockptr(&phys, sizeof(phys), optval,
+ optlen);
+ if (err)
+ break;
+
+ if (!chan->conn)
+ break;
+
+ conn = chan->conn;
+ err = hci_conn_set_phy(conn->hcon, phys);
+ break;
+
case BT_MODE:
if (!enable_ecred) {
err = -ENOPROTOOPT;
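
From userspace, the new branch is driven like any other SOL_BLUETOOTH option. A hedged usage sketch, assuming an already-connected L2CAP socket and the option/constant values from the uapi bluetooth.h at the time of writing (not tested against a real controller):

#include <stdint.h>
#include <sys/socket.h>

#define SOL_BLUETOOTH	274
#define BT_PHY		14
#define BT_PHY_LE_2M_TX	0x00000800
#define BT_PHY_LE_2M_RX	0x00001000

/* Request the LE 2M PHY in both directions on a connected L2CAP
 * socket; the kernel rejects this with -ENOTCONN otherwise. */
static int request_2m_phy(int fd)
{
	uint32_t phys = BT_PHY_LE_2M_TX | BT_PHY_LE_2M_RX;

	return setsockopt(fd, SOL_BLUETOOTH, BT_PHY, &phys, sizeof(phys));
}
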
diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c
index c4063d200c0a..fdcc752c6f13 100644
--- a/net/bluetooth/mgmt_config.c
+++ b/net/bluetooth/mgmt_config.c
@@ -11,6 +11,12 @@
#include "mgmt_util.h"
#include "mgmt_config.h"
+#define HDEV_PARAM_U32(_param_name_) \
+ struct {\
+ struct mgmt_tlv_hdr entry; \
+ __le32 value; \
+ } __packed _param_name_
+
#define HDEV_PARAM_U16(_param_name_) \
struct {\
struct mgmt_tlv_hdr entry; \
@@ -29,6 +35,12 @@
cpu_to_le16(hdev->_param_name_) \
}
+#define TLV_SET_U32(_param_code_, _param_name_) \
+ { \
+ { cpu_to_le16(_param_code_), sizeof(__u32) }, \
+ cpu_to_le32(hdev->_param_name_) \
+ }
+
#define TLV_SET_U8(_param_code_, _param_name_) \
{ \
{ cpu_to_le16(_param_code_), sizeof(__u8) }, \
@@ -78,6 +90,7 @@ int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
HDEV_PARAM_U16(advmon_allowlist_duration);
HDEV_PARAM_U16(advmon_no_filter_duration);
HDEV_PARAM_U8(enable_advmon_interleave_scan);
+ HDEV_PARAM_U32(idle_timeout);
} __packed rp = {
TLV_SET_U16(0x0000, def_page_scan_type),
TLV_SET_U16(0x0001, def_page_scan_int),
@@ -111,6 +124,7 @@ int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
TLV_SET_U16(0x001d, advmon_allowlist_duration),
TLV_SET_U16(0x001e, advmon_no_filter_duration),
TLV_SET_U8(0x001f, enable_advmon_interleave_scan),
+ TLV_SET_U32(0x0020, idle_timeout),
};
bt_dev_dbg(hdev, "sock %p", sk);
@@ -122,6 +136,7 @@ int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
}
#define TO_TLV(x) ((struct mgmt_tlv *)(x))
+#define TLV_GET_LE32(tlv) le32_to_cpu(*((__le32 *)(TO_TLV(tlv)->value)))
#define TLV_GET_LE16(tlv) le16_to_cpu(*((__le16 *)(TO_TLV(tlv)->value)))
#define TLV_GET_U8(tlv) (*((__u8 *)(TO_TLV(tlv)->value)))
@@ -191,6 +206,9 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
case 0x001f:
exp_type_len = sizeof(u8);
break;
+ case 0x0020:
+ exp_type_len = sizeof(u32);
+ break;
default:
exp_type_len = 0;
bt_dev_warn(hdev, "unsupported parameter %u", type);
@@ -314,6 +332,9 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data,
case 0x0001f:
hdev->enable_advmon_interleave_scan = TLV_GET_U8(buffer);
break;
+ case 0x00020:
+ hdev->idle_timeout = TLV_GET_LE32(buffer);
+ break;
default:
bt_dev_warn(hdev, "unsupported parameter %u", type);
break;
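
The new 0x0020/idle_timeout parameter rides in the same TLV stream as the existing u16/u8 entries: a little-endian u16 type, a u8 length, then the little-endian value, which is exactly what HDEV_PARAM_U32/TLV_SET_U32 expand to. A standalone check of the 7-byte encoding (assumes a little-endian host, matching the __le32 here):

#include <assert.h>
#include <stdint.h>
#include <string.h>

#pragma pack(push, 1)
struct mgmt_tlv_u32 {
	uint16_t type;		/* little-endian on the wire */
	uint8_t  len;
	uint32_t value;		/* little-endian on the wire */
};
#pragma pack(pop)

int main(void)
{
	struct mgmt_tlv_u32 tlv = { 0x0020, sizeof(uint32_t), 5000 };
	uint8_t wire[7];

	assert(sizeof(tlv) == 7);
	memcpy(wire, &tlv, sizeof(tlv));
	assert(wire[0] == 0x20 && wire[1] == 0x00);	/* type 0x0020 */
	assert(wire[2] == 4);				/* length */
	assert(wire[3] == 0x88 && wire[4] == 0x13);	/* 5000 = 0x1388 */
	return 0;
}
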
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index d55a4ab87837..dccae08b4f4c 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -5201,7 +5201,7 @@ void br_multicast_get_stats(const struct net_bridge *br,
do {
start = u64_stats_fetch_begin(&cpu_stats->syncp);
- memcpy(&temp, &cpu_stats->mstats, sizeof(temp));
+ u64_stats_copy(&temp, &cpu_stats->mstats, sizeof(temp));
} while (u64_stats_fetch_retry(&cpu_stats->syncp, start));
mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries);
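
u64_stats_fetch_begin()/u64_stats_fetch_retry() form a seqcount read loop: the snapshot is retried if a writer ran concurrently, and u64_stats_copy() performs the copy in a form that is also safe on 32-bit architectures. A rough userspace analogue of just the retry shape, using C11 atomics (the real primitives additionally handle lockdep and 32-bit tearing):

#include <stdatomic.h>
#include <stdint.h>

struct stats { uint64_t packets, bytes; };

static _Atomic unsigned seq;	/* odd while a writer is active */
static struct stats live;

static void snapshot(struct stats *out)
{
	unsigned start;

	do {
		/* begin: wait for an even (stable) sequence number */
		while ((start = atomic_load(&seq)) & 1)
			;
		*out = live;	/* like u64_stats_copy() of mstats */
		/* retry if the writer bumped seq while we copied */
	} while (atomic_load(&seq) != start);
}

int main(void)
{
	struct stats snap;

	live.packets = 42;
	snapshot(&snap);
	return snap.packets == 42 ? 0 : 1;
}
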
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index e0421eaa3abc..76ce70b4e7f3 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -58,7 +58,7 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb)
if (hdr->version != 6)
goto inhdr_error;
- pkt_len = ntohs(hdr->payload_len);
+ pkt_len = ipv6_payload_len(skb, hdr);
if (hdr->nexthdr == NEXTHDR_HOP && nf_ip6_check_hbh_len(skb, &pkt_len))
goto drop;
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index c20a41bf253b..cc4b27ff1b08 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -344,8 +344,8 @@ int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost)
ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id)
{
- return sprintf(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n",
- id->prio[0], id->prio[1],
- id->addr[0], id->addr[1], id->addr[2],
- id->addr[3], id->addr[4], id->addr[5]);
+ return sysfs_emit(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n",
+ id->prio[0], id->prio[1],
+ id->addr[0], id->addr[1], id->addr[2],
+ id->addr[3], id->addr[4], id->addr[5]);
}
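
This and the sysfs conversions that follow swap sprintf() for sysfs_emit(), which bounds output to the PAGE_SIZE sysfs buffer and warns when the buffer does not look like the start of a sysfs page. A rough userspace analogue of that behavior (PAGE_SIZE assumed 4096; the kernel version WARN()s rather than silently returning 0):

#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096

static int sysfs_emit_like(char *buf, const char *fmt, ...)
{
	va_list ap;
	int len;

	/* sysfs show() buffers are page-aligned; reject anything else */
	if (!buf || ((uintptr_t)buf & (PAGE_SIZE - 1)))
		return 0;

	va_start(ap, fmt);
	len = vsnprintf(buf, PAGE_SIZE, fmt, ap);
	va_end(ap);
	return len < PAGE_SIZE ? len : PAGE_SIZE - 1;	/* clamp */
}
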
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index cb4855ed9500..8888300b65c1 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -67,7 +67,7 @@ static ssize_t forward_delay_show(struct device *d,
struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay));
+ return sysfs_emit(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay));
}
static int set_forward_delay(struct net_bridge *br, unsigned long val,
@@ -87,8 +87,8 @@ static DEVICE_ATTR_RW(forward_delay);
static ssize_t hello_time_show(struct device *d, struct device_attribute *attr,
char *buf)
{
- return sprintf(buf, "%lu\n",
- jiffies_to_clock_t(to_bridge(d)->hello_time));
+ return sysfs_emit(buf, "%lu\n",
+ jiffies_to_clock_t(to_bridge(d)->hello_time));
}
static int set_hello_time(struct net_bridge *br, unsigned long val,
@@ -108,8 +108,8 @@ static DEVICE_ATTR_RW(hello_time);
static ssize_t max_age_show(struct device *d, struct device_attribute *attr,
char *buf)
{
- return sprintf(buf, "%lu\n",
- jiffies_to_clock_t(to_bridge(d)->max_age));
+ return sysfs_emit(buf, "%lu\n",
+ jiffies_to_clock_t(to_bridge(d)->max_age));
}
static int set_max_age(struct net_bridge *br, unsigned long val,
@@ -129,7 +129,7 @@ static ssize_t ageing_time_show(struct device *d,
struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time));
+ return sysfs_emit(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time));
}
static int set_ageing_time(struct net_bridge *br, unsigned long val,
@@ -150,7 +150,7 @@ static ssize_t stp_state_show(struct device *d,
struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br->stp_enabled);
+ return sysfs_emit(buf, "%d\n", br->stp_enabled);
}
@@ -173,7 +173,7 @@ static ssize_t group_fwd_mask_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%#x\n", br->group_fwd_mask);
+ return sysfs_emit(buf, "%#x\n", br->group_fwd_mask);
}
static int set_group_fwd_mask(struct net_bridge *br, unsigned long val,
@@ -200,8 +200,8 @@ static ssize_t priority_show(struct device *d, struct device_attribute *attr,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n",
- (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]);
+ return sysfs_emit(buf, "%d\n",
+ (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]);
}
static int set_priority(struct net_bridge *br, unsigned long val,
@@ -235,21 +235,21 @@ static DEVICE_ATTR_RO(bridge_id);
static ssize_t root_port_show(struct device *d, struct device_attribute *attr,
char *buf)
{
- return sprintf(buf, "%d\n", to_bridge(d)->root_port);
+ return sysfs_emit(buf, "%d\n", to_bridge(d)->root_port);
}
static DEVICE_ATTR_RO(root_port);
static ssize_t root_path_cost_show(struct device *d,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost);
+ return sysfs_emit(buf, "%d\n", to_bridge(d)->root_path_cost);
}
static DEVICE_ATTR_RO(root_path_cost);
static ssize_t topology_change_show(struct device *d,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%d\n", to_bridge(d)->topology_change);
+ return sysfs_emit(buf, "%d\n", to_bridge(d)->topology_change);
}
static DEVICE_ATTR_RO(topology_change);
@@ -258,7 +258,7 @@ static ssize_t topology_change_detected_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br->topology_change_detected);
+ return sysfs_emit(buf, "%d\n", br->topology_change_detected);
}
static DEVICE_ATTR_RO(topology_change_detected);
@@ -266,7 +266,7 @@ static ssize_t hello_timer_show(struct device *d,
struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer));
+ return sysfs_emit(buf, "%ld\n", br_timer_value(&br->hello_timer));
}
static DEVICE_ATTR_RO(hello_timer);
@@ -274,7 +274,7 @@ static ssize_t tcn_timer_show(struct device *d, struct device_attribute *attr,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer));
+ return sysfs_emit(buf, "%ld\n", br_timer_value(&br->tcn_timer));
}
static DEVICE_ATTR_RO(tcn_timer);
@@ -283,7 +283,7 @@ static ssize_t topology_change_timer_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer));
+ return sysfs_emit(buf, "%ld\n", br_timer_value(&br->topology_change_timer));
}
static DEVICE_ATTR_RO(topology_change_timer);
@@ -291,7 +291,7 @@ static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer));
+ return sysfs_emit(buf, "%ld\n", br_timer_value(&br->gc_work.timer));
}
static DEVICE_ATTR_RO(gc_timer);
@@ -299,7 +299,7 @@ static ssize_t group_addr_show(struct device *d,
struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%pM\n", br->group_addr);
+ return sysfs_emit(buf, "%pM\n", br->group_addr);
}
static ssize_t group_addr_store(struct device *d,
@@ -365,7 +365,7 @@ static ssize_t no_linklocal_learn_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN));
+ return sysfs_emit(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN));
}
static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val,
@@ -387,7 +387,7 @@ static ssize_t multicast_router_show(struct device *d,
struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br->multicast_ctx.multicast_router);
+ return sysfs_emit(buf, "%d\n", br->multicast_ctx.multicast_router);
}
static int set_multicast_router(struct net_bridge *br, unsigned long val,
@@ -409,7 +409,7 @@ static ssize_t multicast_snooping_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED));
+ return sysfs_emit(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED));
}
static ssize_t multicast_snooping_store(struct device *d,
@@ -425,8 +425,8 @@ static ssize_t multicast_query_use_ifaddr_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n",
- br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR));
+ return sysfs_emit(buf, "%d\n",
+ br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR));
}
static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val,
@@ -450,7 +450,7 @@ static ssize_t multicast_querier_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier);
+ return sysfs_emit(buf, "%d\n", br->multicast_ctx.multicast_querier);
}
static int set_multicast_querier(struct net_bridge *br, unsigned long val,
@@ -470,7 +470,7 @@ static DEVICE_ATTR_RW(multicast_querier);
static ssize_t hash_elasticity_show(struct device *d,
struct device_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", RHT_ELASTICITY);
+ return sysfs_emit(buf, "%u\n", RHT_ELASTICITY);
}
static int set_elasticity(struct net_bridge *br, unsigned long val,
@@ -494,7 +494,7 @@ static ssize_t hash_max_show(struct device *d, struct device_attribute *attr,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->hash_max);
+ return sysfs_emit(buf, "%u\n", br->hash_max);
}
static int set_hash_max(struct net_bridge *br, unsigned long val,
@@ -517,7 +517,7 @@ static ssize_t multicast_igmp_version_show(struct device *d,
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->multicast_ctx.multicast_igmp_version);
+ return sysfs_emit(buf, "%u\n", br->multicast_ctx.multicast_igmp_version);
}
static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val,
@@ -539,7 +539,7 @@ static ssize_t multicast_last_member_count_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->multicast_ctx.multicast_last_member_count);
+ return sysfs_emit(buf, "%u\n", br->multicast_ctx.multicast_last_member_count);
}
static int set_last_member_count(struct net_bridge *br, unsigned long val,
@@ -561,7 +561,7 @@ static ssize_t multicast_startup_query_count_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count);
+ return sysfs_emit(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count);
}
static int set_startup_query_count(struct net_bridge *br, unsigned long val,
@@ -583,8 +583,8 @@ static ssize_t multicast_last_member_interval_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%lu\n",
- jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval));
+ return sysfs_emit(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval));
}
static int set_last_member_interval(struct net_bridge *br, unsigned long val,
@@ -606,8 +606,8 @@ static ssize_t multicast_membership_interval_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%lu\n",
- jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval));
+ return sysfs_emit(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval));
}
static int set_membership_interval(struct net_bridge *br, unsigned long val,
@@ -630,8 +630,8 @@ static ssize_t multicast_querier_interval_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%lu\n",
- jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval));
+ return sysfs_emit(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval));
}
static int set_querier_interval(struct net_bridge *br, unsigned long val,
@@ -654,8 +654,8 @@ static ssize_t multicast_query_interval_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%lu\n",
- jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval));
+ return sysfs_emit(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval));
}
static int set_query_interval(struct net_bridge *br, unsigned long val,
@@ -677,9 +677,8 @@ static ssize_t multicast_query_response_interval_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(
- buf, "%lu\n",
- jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval));
+ return sysfs_emit(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval));
}
static int set_query_response_interval(struct net_bridge *br, unsigned long val,
@@ -701,9 +700,8 @@ static ssize_t multicast_startup_query_interval_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(
- buf, "%lu\n",
- jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval));
+ return sysfs_emit(buf, "%lu\n",
+ jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval));
}
static int set_startup_query_interval(struct net_bridge *br, unsigned long val,
@@ -727,8 +725,8 @@ static ssize_t multicast_stats_enabled_show(struct device *d,
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n",
- br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED));
+ return sysfs_emit(buf, "%d\n",
+ br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED));
}
static int set_stats_enabled(struct net_bridge *br, unsigned long val,
@@ -754,7 +752,7 @@ static ssize_t multicast_mld_version_show(struct device *d,
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br->multicast_ctx.multicast_mld_version);
+ return sysfs_emit(buf, "%u\n", br->multicast_ctx.multicast_mld_version);
}
static int set_multicast_mld_version(struct net_bridge *br, unsigned long val,
@@ -777,7 +775,7 @@ static ssize_t nf_call_iptables_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES));
+ return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES));
}
static int set_nf_call_iptables(struct net_bridge *br, unsigned long val,
@@ -799,7 +797,7 @@ static ssize_t nf_call_ip6tables_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES));
+ return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES));
}
static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val,
@@ -821,7 +819,7 @@ static ssize_t nf_call_arptables_show(
struct device *d, struct device_attribute *attr, char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES));
+ return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES));
}
static int set_nf_call_arptables(struct net_bridge *br, unsigned long val,
@@ -845,7 +843,7 @@ static ssize_t vlan_filtering_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED));
+ return sysfs_emit(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED));
}
static ssize_t vlan_filtering_store(struct device *d,
@@ -861,7 +859,7 @@ static ssize_t vlan_protocol_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%#06x\n", ntohs(br->vlan_proto));
+ return sysfs_emit(buf, "%#06x\n", ntohs(br->vlan_proto));
}
static ssize_t vlan_protocol_store(struct device *d,
@@ -877,7 +875,7 @@ static ssize_t default_pvid_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%d\n", br->default_pvid);
+ return sysfs_emit(buf, "%d\n", br->default_pvid);
}
static ssize_t default_pvid_store(struct device *d,
@@ -893,7 +891,7 @@ static ssize_t vlan_stats_enabled_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED));
+ return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED));
}
static int set_vlan_stats_enabled(struct net_bridge *br, unsigned long val,
@@ -915,7 +913,7 @@ static ssize_t vlan_stats_per_port_show(struct device *d,
char *buf)
{
struct net_bridge *br = to_bridge(d);
- return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT));
+ return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT));
}
static int set_vlan_stats_per_port(struct net_bridge *br, unsigned long val,
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index 74fdd8105dca..1f57c36a7fc0 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -47,7 +47,7 @@ const struct brport_attribute brport_attr_##_name = { \
#define BRPORT_ATTR_FLAG(_name, _mask) \
static ssize_t show_##_name(struct net_bridge_port *p, char *buf) \
{ \
- return sprintf(buf, "%d\n", !!(p->flags & _mask)); \
+ return sysfs_emit(buf, "%d\n", !!(p->flags & _mask)); \
} \
static int store_##_name(struct net_bridge_port *p, unsigned long v) \
{ \
@@ -83,7 +83,7 @@ static int store_flag(struct net_bridge_port *p, unsigned long v,
static ssize_t show_path_cost(struct net_bridge_port *p, char *buf)
{
- return sprintf(buf, "%d\n", p->path_cost);
+ return sysfs_emit(buf, "%d\n", p->path_cost);
}
static BRPORT_ATTR(path_cost, 0644,
@@ -91,7 +91,7 @@ static BRPORT_ATTR(path_cost, 0644,
static ssize_t show_priority(struct net_bridge_port *p, char *buf)
{
- return sprintf(buf, "%d\n", p->priority);
+ return sysfs_emit(buf, "%d\n", p->priority);
}
static BRPORT_ATTR(priority, 0644,
@@ -111,65 +111,65 @@ static BRPORT_ATTR(designated_bridge, 0444, show_designated_bridge, NULL);
static ssize_t show_designated_port(struct net_bridge_port *p, char *buf)
{
- return sprintf(buf, "%d\n", p->designated_port);
+ return sysfs_emit(buf, "%d\n", p->designated_port);
}
static BRPORT_ATTR(designated_port, 0444, show_designated_port, NULL);
static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf)
{
- return sprintf(buf, "%d\n", p->designated_cost);
+ return sysfs_emit(buf, "%d\n", p->designated_cost);
}
static BRPORT_ATTR(designated_cost, 0444, show_designated_cost, NULL);
static ssize_t show_port_id(struct net_bridge_port *p, char *buf)
{
- return sprintf(buf, "0x%x\n", p->port_id);
+ return sysfs_emit(buf, "0x%x\n", p->port_id);
}
static BRPORT_ATTR(port_id, 0444, show_port_id, NULL);
static ssize_t show_port_no(struct net_bridge_port *p, char *buf)
{
- return sprintf(buf, "0x%x\n", p->port_no);
+ return sysfs_emit(buf, "0x%x\n", p->port_no);
}
static BRPORT_ATTR(port_no, 0444, show_port_no, NULL);
static ssize_t show_change_ack(struct net_bridge_port *p, char *buf)
{
- return sprintf(buf, "%d\n", p->topology_change_ack);
+ return sysfs_emit(buf, "%d\n", p->topology_change_ack);
}
static BRPORT_ATTR(change_ack, 0444, show_change_ack, NULL);
static ssize_t show_config_pending(struct net_bridge_port *p, char *buf)
{
- return sprintf(buf, "%d\n", p->config_pending);
+ return sysfs_emit(buf, "%d\n", p->config_pending);
}
static BRPORT_ATTR(config_pending, 0444, show_config_pending, NULL);
static ssize_t show_port_state(struct net_bridge_port *p, char *buf)
{
- return sprintf(buf, "%d\n", p->state);
+ return sysfs_emit(buf, "%d\n", p->state);
}
static BRPORT_ATTR(state, 0444, show_port_state, NULL);
static ssize_t show_message_age_timer(struct net_bridge_port *p,
char *buf)
{
- return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer));
+ return sysfs_emit(buf, "%ld\n", br_timer_value(&p->message_age_timer));
}
static BRPORT_ATTR(message_age_timer, 0444, show_message_age_timer, NULL);
static ssize_t show_forward_delay_timer(struct net_bridge_port *p,
char *buf)
{
- return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer));
+ return sysfs_emit(buf, "%ld\n", br_timer_value(&p->forward_delay_timer));
}
static BRPORT_ATTR(forward_delay_timer, 0444, show_forward_delay_timer, NULL);
static ssize_t show_hold_timer(struct net_bridge_port *p,
char *buf)
{
- return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer));
+ return sysfs_emit(buf, "%ld\n", br_timer_value(&p->hold_timer));
}
static BRPORT_ATTR(hold_timer, 0444, show_hold_timer, NULL);
@@ -182,7 +182,7 @@ static BRPORT_ATTR(flush, 0200, NULL, store_flush);
static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf)
{
- return sprintf(buf, "%#x\n", p->group_fwd_mask);
+ return sysfs_emit(buf, "%#x\n", p->group_fwd_mask);
}
static int store_group_fwd_mask(struct net_bridge_port *p,
@@ -205,7 +205,7 @@ static ssize_t show_backup_port(struct net_bridge_port *p, char *buf)
rcu_read_lock();
backup_p = rcu_dereference(p->backup_port);
if (backup_p)
- ret = sprintf(buf, "%s\n", backup_p->dev->name);
+ ret = sysfs_emit(buf, "%s\n", backup_p->dev->name);
rcu_read_unlock();
return ret;
@@ -244,7 +244,7 @@ BRPORT_ATTR_FLAG(isolated, BR_ISOLATED);
#ifdef CONFIG_BRIDGE_IGMP_SNOOPING
static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
{
- return sprintf(buf, "%d\n", p->multicast_ctx.multicast_router);
+ return sysfs_emit(buf, "%d\n", p->multicast_ctx.multicast_router);
}
static int store_multicast_router(struct net_bridge_port *p,
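The sprintf() to sysfs_emit() conversions above are mechanical but not merely cosmetic: sysfs_emit() checks that the destination really is the page-sized buffer handed in by sysfs and never writes past PAGE_SIZE, so a malformed show() callback produces a WARN and an empty read instead of a buffer overrun. A minimal sketch of the pattern, with a hypothetical attribute name:

    #include <linux/device.h>
    #include <linux/sysfs.h>

    /* "example" is a made-up attribute, for illustration only */
    static ssize_t example_show(struct device *d, struct device_attribute *attr,
                                char *buf)
    {
            /* returns the number of bytes emitted, bounded by PAGE_SIZE */
            return sysfs_emit(buf, "%d\n", 42);
    }
    static DEVICE_ATTR_RO(example);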
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
index 6482de4d8750..58a33d0380b0 100644
--- a/net/bridge/netfilter/nf_conntrack_bridge.c
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -16,8 +16,7 @@
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_bridge.h>
-#include <linux/netfilter/nf_tables.h>
-#include <net/netfilter/nf_tables.h>
+#include <linux/netfilter_ipv4.h>
#include "../br_private.h"
@@ -230,7 +229,7 @@ static int nf_ct_br_ipv6_check(const struct sk_buff *skb)
if (hdr->version != 6)
return -1;
- len = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr) + nhoff;
+ len = ipv6_payload_len(skb, hdr) + sizeof(struct ipv6hdr) + nhoff;
if (skb->len < len)
return -1;
@@ -270,7 +269,7 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
return NF_ACCEPT;
- len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len);
+ len = sizeof(struct ipv6hdr) + skb_ipv6_payload_len(skb);
if (pskb_trim_rcsum(skb, len))
return NF_ACCEPT;
diff --git a/net/can/Kconfig b/net/can/Kconfig
index af64a6f76458..abbb4be7ad21 100644
--- a/net/can/Kconfig
+++ b/net/can/Kconfig
@@ -5,6 +5,7 @@
menuconfig CAN
tristate "CAN bus subsystem support"
+ select SKB_EXTENSIONS
help
Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial
communications protocol. Development of the CAN bus started in
diff --git a/net/can/af_can.c b/net/can/af_can.c
index 770173d8db42..22c65a014861 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -641,6 +641,16 @@ static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buf
return matches;
}
+void can_set_skb_uid(struct sk_buff *skb)
+{
+ /* create non-zero unique skb identifier together with *skb */
+ while (!(skb->hash))
+ skb->hash = atomic_inc_return(&skbcounter);
+
+ skb->sw_hash = 1;
+}
+EXPORT_SYMBOL(can_set_skb_uid);
+
static void can_receive(struct sk_buff *skb, struct net_device *dev)
{
struct can_dev_rcv_lists *dev_rcv_lists;
@@ -652,9 +662,7 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
atomic_long_inc(&pkg_stats->rx_frames);
atomic_long_inc(&pkg_stats->rx_frames_delta);
- /* create non-zero unique skb identifier together with *skb */
- while (!(can_skb_prv(skb)->skbcnt))
- can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter);
+ can_set_skb_uid(skb);
rcu_read_lock();
@@ -679,7 +687,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev)
static int can_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_can_skb(skb))) {
+ if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) ||
+ !can_skb_ext_find(skb) || !can_is_can_skb(skb))) {
pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n",
dev->type, skb->len);
@@ -694,7 +703,8 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev,
static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canfd_skb(skb))) {
+ if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) ||
+ !can_skb_ext_find(skb) || !can_is_canfd_skb(skb))) {
pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n",
dev->type, skb->len);
@@ -709,7 +719,8 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev,
static int canxl_rcv(struct sk_buff *skb, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canxl_skb(skb))) {
+ if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) ||
+ !can_skb_ext_find(skb) || !can_is_canxl_skb(skb))) {
pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n",
dev->type, skb->len);
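can_set_skb_uid() moves the per-frame unique identifier out of can_skb_priv->skbcnt and into the generic skb->hash field, flagged as a software hash via sw_hash. Receivers can then suppress duplicate filter matches by comparing the (skb pointer, hash) pair, which is exactly what raw.c does further down. A sketch of that dedup idea, with names made up for illustration:

    struct uniq {
            const struct sk_buff *skb;
            u32 hash;
    };

    static bool frame_seen_before(struct uniq *u, const struct sk_buff *oskb)
    {
            if (u->skb == oskb && u->hash == oskb->hash)
                    return true;    /* same frame matched another filter */

            u->skb = oskb;          /* remember this frame instance */
            u->hash = oskb->hash;
            return false;
    }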
diff --git a/net/can/bcm.c b/net/can/bcm.c
index 7eba8ae01a5b..b7324e9c955b 100644
--- a/net/can/bcm.c
+++ b/net/can/bcm.c
@@ -59,6 +59,7 @@
#include <linux/can/bcm.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <net/can.h>
#include <net/sock.h>
#include <net/net_namespace.h>
@@ -291,6 +292,7 @@ static int bcm_proc_show(struct seq_file *m, void *v)
static void bcm_can_tx(struct bcm_op *op)
{
struct sk_buff *skb;
+ struct can_skb_ext *csx;
struct net_device *dev;
struct canfd_frame *cf;
int err;
@@ -310,13 +312,17 @@ static void bcm_can_tx(struct bcm_op *op)
return;
}
- skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any());
+ skb = alloc_skb(op->cfsiz, gfp_any());
if (!skb)
goto out;
- can_skb_reserve(skb);
- can_skb_prv(skb)->ifindex = dev->ifindex;
- can_skb_prv(skb)->skbcnt = 0;
+ csx = can_skb_ext_add(skb);
+ if (!csx) {
+ kfree_skb(skb);
+ goto out;
+ }
+
+ csx->can_iif = dev->ifindex;
skb_put_data(skb, cf, op->cfsiz);
@@ -1318,6 +1324,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk,
int cfsiz)
{
struct sk_buff *skb;
+ struct can_skb_ext *csx;
struct net_device *dev;
int err;
@@ -1325,11 +1332,15 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk,
if (!ifindex)
return -ENODEV;
- skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL);
+ skb = alloc_skb(cfsiz, GFP_KERNEL);
if (!skb)
return -ENOMEM;
- can_skb_reserve(skb);
+ csx = can_skb_ext_add(skb);
+ if (!csx) {
+ kfree_skb(skb);
+ return -ENOMEM;
+ }
err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz);
if (err < 0) {
@@ -1343,8 +1354,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk,
return -ENODEV;
}
- can_skb_prv(skb)->ifindex = dev->ifindex;
- can_skb_prv(skb)->skbcnt = 0;
+ csx->can_iif = dev->ifindex;
skb->dev = dev;
can_skb_set_owner(skb, sk);
err = can_send(skb, 1); /* send with loopback */
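The bcm.c hunks establish the allocation pattern repeated in gw.c, isotp.c, j1939 and raw.c below: skbs no longer reserve headroom for struct can_skb_priv, the metadata lives in an skb extension instead, and because extension allocation can fail there is one extra error path. A condensed sketch of the pattern (can_skb_ext_add() and struct can_skb_ext are introduced by this series; the helper wrapping them here is hypothetical):

    static struct sk_buff *can_alloc_tx_skb(struct net_device *dev, size_t len,
                                            gfp_t gfp)
    {
            struct can_skb_ext *csx;
            struct sk_buff *skb;

            skb = alloc_skb(len, gfp);      /* no can_skb_priv headroom */
            if (!skb)
                    return NULL;

            csx = can_skb_ext_add(skb);     /* may fail under memory pressure */
            if (!csx) {
                    kfree_skb(skb);
                    return NULL;
            }

            csx->can_iif = dev->ifindex;    /* used for loopback filtering */
            return skb;
    }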
diff --git a/net/can/gw.c b/net/can/gw.c
index 55eccb1c7620..61a1e6b1b83f 100644
--- a/net/can/gw.c
+++ b/net/can/gw.c
@@ -55,6 +55,7 @@
#include <linux/can/core.h>
#include <linux/can/skb.h>
#include <linux/can/gw.h>
+#include <net/can.h>
#include <net/rtnetlink.h>
#include <net/net_namespace.h>
#include <net/sock.h>
@@ -70,8 +71,8 @@ MODULE_ALIAS(CAN_GW_NAME);
#define CGW_MAX_HOPS 6
#define CGW_DEFAULT_HOPS 1
-static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS;
-module_param(max_hops, uint, 0444);
+static unsigned char max_hops __read_mostly = CGW_DEFAULT_HOPS;
+module_param(max_hops, byte, 0444);
MODULE_PARM_DESC(max_hops,
"maximum " CAN_GW_NAME " routing hops for CAN frames "
"(valid values: " __stringify(CGW_MIN_HOPS) "-"
@@ -459,6 +460,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
struct cgw_job *gwj = (struct cgw_job *)data;
struct canfd_frame *cf;
struct sk_buff *nskb;
+ struct can_skb_ext *csx, *ncsx;
struct cf_mod *mod;
int modidx = 0;
@@ -471,22 +473,15 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
return;
}
+ csx = can_skb_ext_find(skb);
+ if (!csx)
+ return;
+
/* Do not handle CAN frames routed more than 'max_hops' times.
* In general we should never catch this delimiter which is intended
* to cover a misconfiguration protection (e.g. circular CAN routes).
- *
- * The Controller Area Network controllers only accept CAN frames with
- * correct CRCs - which are not visible in the controller registers.
- * According to skbuff.h documentation the csum_start element for IP
- * checksums is undefined/unused when ip_summed == CHECKSUM_UNNECESSARY.
- * Only CAN skbs can be processed here which already have this property.
*/
-
-#define cgw_hops(skb) ((skb)->csum_start)
-
- BUG_ON(skb->ip_summed != CHECKSUM_UNNECESSARY);
-
- if (cgw_hops(skb) >= max_hops) {
+ if (csx->can_gw_hops >= max_hops) {
/* indicate deleted frames due to misconfiguration */
gwj->deleted_frames++;
return;
@@ -499,7 +494,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
/* is sending the skb back to the incoming interface not allowed? */
if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) &&
- can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex)
+ csx->can_iif == gwj->dst.dev->ifindex)
return;
/* clone the given skb, which has not been done in can_rcv()
@@ -518,12 +513,23 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data)
return;
}
+ /* The cloned/copied nskb shares the skb extension of the original
+ * skb with an elevated refcount. skb_ext_add() creates a private
+ * copy so that can_gw_hops can be modified independently.
+ */
+ ncsx = skb_ext_add(nskb, SKB_EXT_CAN);
+ if (!ncsx) {
+ kfree_skb(nskb);
+ gwj->dropped_frames++;
+ return;
+ }
+
/* put the incremented hop counter in the cloned skb */
- cgw_hops(nskb) = cgw_hops(skb) + 1;
+ ncsx->can_gw_hops = csx->can_gw_hops + 1;
/* first processing of this CAN frame -> adjust to private hop limit */
- if (gwj->limit_hops && cgw_hops(nskb) == 1)
- cgw_hops(nskb) = max_hops - gwj->limit_hops + 1;
+ if (gwj->limit_hops && ncsx->can_gw_hops == 1)
+ ncsx->can_gw_hops = max_hops - gwj->limit_hops + 1;
nskb->dev = gwj->dst.dev;
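The extra skb_ext_add() on the clone is needed because cloned skbs share a single refcounted extension area: bumping the hop counter through the shared pointer would also change the original frame. Calling skb_ext_add() on an skb whose extensions are shared allocates a private copy first. A sketch of that copy-on-write step (SKB_EXT_CAN comes from this series):

    static struct sk_buff *clone_with_private_ext(struct sk_buff *skb)
    {
            struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);

            if (!nskb)
                    return NULL;

            /* unshares the extension area when its refcount is > 1 */
            if (!skb_ext_add(nskb, SKB_EXT_CAN)) {
                    kfree_skb(nskb);
                    return NULL;
            }
            return nskb;
    }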
diff --git a/net/can/isotp.c b/net/can/isotp.c
index ce588b85665a..da3b72e7afcc 100644
--- a/net/can/isotp.c
+++ b/net/can/isotp.c
@@ -69,6 +69,7 @@
#include <linux/can/skb.h>
#include <linux/can/isotp.h>
#include <linux/slab.h>
+#include <net/can.h>
#include <net/sock.h>
#include <net/net_namespace.h>
@@ -214,24 +215,28 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus)
{
struct net_device *dev;
struct sk_buff *nskb;
+ struct can_skb_ext *csx;
struct canfd_frame *ncf;
struct isotp_sock *so = isotp_sk(sk);
int can_send_ret;
- nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any());
+ nskb = alloc_skb(so->ll.mtu, gfp_any());
if (!nskb)
return 1;
+ csx = can_skb_ext_add(nskb);
+ if (!csx) {
+ kfree_skb(nskb);
+ return 1;
+ }
+
dev = dev_get_by_index(sock_net(sk), so->ifindex);
if (!dev) {
kfree_skb(nskb);
return 1;
}
- can_skb_reserve(nskb);
- can_skb_prv(nskb)->ifindex = dev->ifindex;
- can_skb_prv(nskb)->skbcnt = 0;
-
+ csx->can_iif = dev->ifindex;
nskb->dev = dev;
can_skb_set_owner(nskb, sk);
ncf = (struct canfd_frame *)nskb->data;
@@ -763,6 +768,7 @@ static void isotp_send_cframe(struct isotp_sock *so)
{
struct sock *sk = &so->sk;
struct sk_buff *skb;
+ struct can_skb_ext *csx;
struct net_device *dev;
struct canfd_frame *cf;
int can_send_ret;
@@ -772,15 +778,20 @@ static void isotp_send_cframe(struct isotp_sock *so)
if (!dev)
return;
- skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC);
+ skb = alloc_skb(so->ll.mtu, GFP_ATOMIC);
if (!skb) {
dev_put(dev);
return;
}
- can_skb_reserve(skb);
- can_skb_prv(skb)->ifindex = dev->ifindex;
- can_skb_prv(skb)->skbcnt = 0;
+ csx = can_skb_ext_add(skb);
+ if (!csx) {
+ kfree_skb(skb);
+ dev_put(dev);
+ return;
+ }
+
+ csx->can_iif = dev->ifindex;
cf = (struct canfd_frame *)skb->data;
skb_put_zero(skb, so->ll.mtu);
@@ -940,6 +951,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
struct sock *sk = sock->sk;
struct isotp_sock *so = isotp_sk(sk);
struct sk_buff *skb;
+ struct can_skb_ext *csx;
struct net_device *dev;
struct canfd_frame *cf;
int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 1 : 0;
@@ -1000,16 +1012,22 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
goto err_out_drop;
}
- skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv),
- msg->msg_flags & MSG_DONTWAIT, &err);
+ skb = sock_alloc_send_skb(sk, so->ll.mtu, msg->msg_flags & MSG_DONTWAIT,
+ &err);
if (!skb) {
dev_put(dev);
goto err_out_drop;
}
- can_skb_reserve(skb);
- can_skb_prv(skb)->ifindex = dev->ifindex;
- can_skb_prv(skb)->skbcnt = 0;
+ csx = can_skb_ext_add(skb);
+ if (!csx) {
+ kfree_skb(skb);
+ dev_put(dev);
+ err = -ENOMEM;
+ goto err_out_drop;
+ }
+
+ csx->can_iif = dev->ifindex;
so->tx.len = size;
so->tx.idx = 0;
diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c
index ff9c4fd7b433..0502b030d238 100644
--- a/net/can/j1939/socket.c
+++ b/net/can/j1939/socket.c
@@ -17,6 +17,7 @@
#include <linux/can/skb.h>
#include <linux/errqueue.h>
#include <linux/if_arp.h>
+#include <net/can.h>
#include "j1939-priv.h"
@@ -884,20 +885,25 @@ static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev,
struct j1939_sock *jsk = j1939_sk(sk);
struct j1939_sk_buff_cb *skcb;
struct sk_buff *skb;
+ struct can_skb_ext *csx;
int ret;
skb = sock_alloc_send_skb(sk,
size +
sizeof(struct can_frame) -
- sizeof(((struct can_frame *)NULL)->data) +
- sizeof(struct can_skb_priv),
+ sizeof(((struct can_frame *)NULL)->data),
msg->msg_flags & MSG_DONTWAIT, &ret);
if (!skb)
goto failure;
- can_skb_reserve(skb);
- can_skb_prv(skb)->ifindex = ndev->ifindex;
- can_skb_prv(skb)->skbcnt = 0;
+ csx = can_skb_ext_add(skb);
+ if (!csx) {
+ kfree_skb(skb);
+ ret = -ENOMEM;
+ goto failure;
+ }
+
+ csx->can_iif = ndev->ifindex;
skb_reserve(skb, offsetof(struct can_frame, data));
ret = memcpy_from_msg(skb_put(skb, size), msg, size);
diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c
index 8656ab388c83..2cbe94fc487a 100644
--- a/net/can/j1939/transport.c
+++ b/net/can/j1939/transport.c
@@ -9,6 +9,7 @@
// Oleksij Rempel <kernel@pengutronix.de>
#include <linux/can/skb.h>
+#include <net/can.h>
#include "j1939-priv.h"
@@ -591,17 +592,21 @@ sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv,
bool swap_src_dst)
{
struct sk_buff *skb;
+ struct can_skb_ext *csx;
struct j1939_sk_buff_cb *skcb;
- skb = alloc_skb(sizeof(struct can_frame) + sizeof(struct can_skb_priv),
- GFP_ATOMIC);
+ skb = alloc_skb(sizeof(struct can_frame), GFP_ATOMIC);
if (unlikely(!skb))
return ERR_PTR(-ENOMEM);
+ csx = can_skb_ext_add(skb);
+ if (!csx) {
+ kfree_skb(skb);
+ return ERR_PTR(-ENOMEM);
+ }
+
skb->dev = priv->ndev;
- can_skb_reserve(skb);
- can_skb_prv(skb)->ifindex = priv->ndev->ifindex;
- can_skb_prv(skb)->skbcnt = 0;
+ csx->can_iif = priv->ndev->ifindex;
/* reserve CAN header */
skb_reserve(skb, offsetof(struct can_frame, data));
@@ -1052,6 +1057,17 @@ static int j1939_simple_txnext(struct j1939_session *session)
goto out_free;
}
+ /* The cloned skb shares the skb extension of the original se_skb
+ * with an elevated refcount. skb_ext_add() creates a private copy
+ * so that can_put_echo_skb() can modify can_framelen without
+ * affecting the original.
+ */
+ if (!skb_ext_add(skb, SKB_EXT_CAN)) {
+ kfree_skb(skb);
+ ret = -ENOMEM;
+ goto out_free;
+ }
+
can_skb_set_owner(skb, se_skb->sk);
j1939_tp_set_rxtimeout(session, J1939_SIMPLE_ECHO_TIMEOUT_MS);
@@ -1526,17 +1542,22 @@ j1939_session *j1939_session_fresh_new(struct j1939_priv *priv,
const struct j1939_sk_buff_cb *rel_skcb)
{
struct sk_buff *skb;
+ struct can_skb_ext *csx;
struct j1939_sk_buff_cb *skcb;
struct j1939_session *session;
- skb = alloc_skb(size + sizeof(struct can_skb_priv), GFP_ATOMIC);
+ skb = alloc_skb(size, GFP_ATOMIC);
if (unlikely(!skb))
return NULL;
+ csx = can_skb_ext_add(skb);
+ if (!csx) {
+ kfree_skb(skb);
+ return NULL;
+ }
+
skb->dev = priv->ndev;
- can_skb_reserve(skb);
- can_skb_prv(skb)->ifindex = priv->ndev->ifindex;
- can_skb_prv(skb)->skbcnt = 0;
+ csx->can_iif = priv->ndev->ifindex;
skcb = j1939_skb_to_cb(skb);
memcpy(skcb, rel_skcb, sizeof(*skcb));
diff --git a/net/can/raw.c b/net/can/raw.c
index 12293363413c..eee244ffc31e 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -53,6 +53,7 @@
#include <linux/can/core.h>
#include <linux/can/skb.h>
#include <linux/can/raw.h>
+#include <net/can.h>
#include <net/sock.h>
#include <net/net_namespace.h>
@@ -76,7 +77,7 @@ MODULE_ALIAS("can-proto-1");
struct uniqframe {
const struct sk_buff *skb;
- int skbcnt;
+ u32 hash;
unsigned int join_rx_count;
};
@@ -164,7 +165,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
/* eliminate multiple filter matches for the same skb */
if (this_cpu_ptr(ro->uniq)->skb == oskb &&
- this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) {
+ this_cpu_ptr(ro->uniq)->hash == oskb->hash) {
if (!ro->join_filters)
return;
@@ -174,7 +175,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data)
return;
} else {
this_cpu_ptr(ro->uniq)->skb = oskb;
- this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt;
+ this_cpu_ptr(ro->uniq)->hash = oskb->hash;
this_cpu_ptr(ro->uniq)->join_rx_count = 1;
/* drop first frame to check all enabled filters? */
if (ro->join_filters && ro->count > 1)
@@ -918,6 +919,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
struct raw_sock *ro = raw_sk(sk);
struct sockcm_cookie sockc;
struct sk_buff *skb;
+ struct can_skb_ext *csx;
struct net_device *dev;
unsigned int txmtu;
int ifindex;
@@ -951,14 +953,19 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
goto put_dev;
}
- skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv),
- msg->msg_flags & MSG_DONTWAIT, &err);
+ skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT,
+ &err);
if (!skb)
goto put_dev;
- can_skb_reserve(skb);
- can_skb_prv(skb)->ifindex = dev->ifindex;
- can_skb_prv(skb)->skbcnt = 0;
+ csx = can_skb_ext_add(skb);
+ if (!csx) {
+ kfree_skb(skb);
+ err = -ENOMEM;
+ goto put_dev;
+ }
+
+ csx->can_iif = dev->ifindex;
/* fill the skb before testing for valid CAN frames */
err = memcpy_from_msg(skb_put(skb, size), msg, size);
diff --git a/net/core/Makefile b/net/core/Makefile
index 9ef2099c5426..dc17c5a61e9a 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -3,7 +3,7 @@
# Makefile for the Linux networking core.
#
-obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
+obj-y := sock.o skbuff.o datagram.o stream.o scm.o \
gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \
flow_dissector.o
@@ -19,6 +19,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
obj-y += hotdata.o
+obj-y += netdev_config.o
obj-y += netdev_rx_queue.o
obj-y += netdev_queues.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
diff --git a/net/core/dev.c b/net/core/dev.c
index ccef685023c2..ac6bcb2a0784 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -246,12 +246,11 @@ static inline void backlog_lock_irq_disable(struct softnet_data *sd)
}
static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
- unsigned long *flags)
+ unsigned long flags)
{
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
- spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
- else
- local_irq_restore(*flags);
+ spin_unlock(&sd->input_pkt_queue.lock);
+ local_irq_restore(flags);
}
static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
@@ -3803,7 +3802,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
inner_ip_hdr(skb) : ip_hdr(skb);
if (!(iph->frag_off & htons(IP_DF)))
- features &= ~NETIF_F_TSO_MANGLEID;
+ features &= ~dev->mangleid_features;
}
/* NETIF_F_IPV6_CSUM does not support IPv6 extension headers,
@@ -3814,8 +3813,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
vlan_get_protocol(skb) == htons(ETH_P_IPV6))) &&
skb_transport_header_was_set(skb) &&
- skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
- !ipv6_has_hopopt_jumbo(skb))
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr))
features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4);
return features;
@@ -3918,8 +3916,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb,
if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
- skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
- !ipv6_has_hopopt_jumbo(skb))
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr))
goto sw_checksum;
switch (skb->csum_offset) {
@@ -5260,7 +5257,7 @@ void kick_defer_list_purge(unsigned int cpu)
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
__napi_schedule_irqoff(&sd->backlog);
- backlog_unlock_irq_restore(sd, &flags);
+ backlog_unlock_irq_restore(sd, flags);
} else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
smp_call_function_single_async(cpu, &sd->defer_csd);
@@ -5347,14 +5344,14 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
}
__skb_queue_tail(&sd->input_pkt_queue, skb);
tail = rps_input_queue_tail_incr(sd);
- backlog_unlock_irq_restore(sd, &flags);
+ backlog_unlock_irq_restore(sd, flags);
/* save the tail outside of the critical section */
rps_input_queue_tail_save(qtail, tail);
return NET_RX_SUCCESS;
}
- backlog_unlock_irq_restore(sd, &flags);
+ backlog_unlock_irq_restore(sd, flags);
cpu_backlog_drop:
reason = SKB_DROP_REASON_CPU_BACKLOG;
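Passing flags by value matches the usual local_irq_save()/local_irq_restore() idiom, and splitting spin_unlock_irqrestore() into spin_unlock() plus local_irq_restore() lets both branches share a single restore. The expected pairing in callers looks like this, assuming the save-side helper from the unchanged part of dev.c:

    unsigned long flags;

    backlog_lock_irq_save(sd, &flags);      /* IRQs off; lock only for RPS/threads */
    __skb_queue_tail(&sd->input_pkt_queue, skb);
    backlog_unlock_irq_restore(sd, flags);  /* unlock first, then restore IRQs */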
@@ -11386,6 +11383,9 @@ int register_netdevice(struct net_device *dev)
if (dev->hw_enc_features & NETIF_F_TSO)
dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
+ /* TSO_MANGLEID belongs in mangleid_features by definition */
+ dev->mangleid_features |= NETIF_F_TSO_MANGLEID;
+
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
*/
dev->vlan_features |= NETIF_F_HIGHDMA;
diff --git a/net/core/dev.h b/net/core/dev.h
index da18536cbd35..98793a738f43 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -10,6 +10,7 @@
struct net;
struct netlink_ext_ack;
+struct netdev_queue_config;
struct cpumask;
/* Random bits of netdevice that don't need to be exposed */
@@ -91,6 +92,10 @@ extern struct rw_semaphore dev_addr_sem;
extern struct list_head net_todo_list;
void netdev_run_todo(void);
+int netdev_queue_config_validate(struct net_device *dev, int rxq_idx,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack);
+
/* netdev management, shared between various uAPI entry points */
struct netdev_name_node {
struct hlist_node hlist;
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 53a53357cfef..7a8966544c9d 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -287,7 +287,7 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
int err;
if (!ops->ndo_hwtstamp_get)
- return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */
+ return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
@@ -414,7 +414,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
}
if (!ops->ndo_hwtstamp_set)
- return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */
+ return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
@@ -438,48 +438,23 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
return 0;
}
-static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd,
- struct kernel_hwtstamp_config *kernel_cfg)
-{
- struct ifreq ifrr;
- int err;
-
- if (!kernel_cfg->ifr)
- return -EINVAL;
-
- strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ);
- ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru;
-
- err = dev_eth_ioctl(dev, &ifrr, cmd);
- if (err)
- return err;
-
- kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru;
- kernel_cfg->copied_to_user = true;
-
- return 0;
-}
-
int generic_hwtstamp_get_lower(struct net_device *dev,
struct kernel_hwtstamp_config *kernel_cfg)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_get) {
- int err;
-
- netdev_lock_ops(dev);
- err = dev_get_hwtstamp_phylib(dev, kernel_cfg);
- netdev_unlock_ops(dev);
+ if (!ops->ndo_hwtstamp_get)
+ return -EOPNOTSUPP;
- return err;
- }
+ netdev_lock_ops(dev);
+ err = dev_get_hwtstamp_phylib(dev, kernel_cfg);
+ netdev_unlock_ops(dev);
- /* Legacy path: unconverted lower driver */
- return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg);
+ return err;
}
EXPORT_SYMBOL(generic_hwtstamp_get_lower);
@@ -488,22 +463,19 @@ int generic_hwtstamp_set_lower(struct net_device *dev,
struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_set) {
- int err;
-
- netdev_lock_ops(dev);
- err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
- netdev_unlock_ops(dev);
+ if (!ops->ndo_hwtstamp_set)
+ return -EOPNOTSUPP;
- return err;
- }
+ netdev_lock_ops(dev);
+ err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+ netdev_unlock_ops(dev);
- /* Legacy path: unconverted lower driver */
- return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg);
+ return err;
}
EXPORT_SYMBOL(generic_hwtstamp_set_lower);
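With the legacy SIOCGHWTSTAMP/SIOCSHWTSTAMP forwarding removed, an unconverted lower driver now sees -EOPNOTSUPP instead of having the raw ioctl passed down. Drivers are expected to implement the dedicated callbacks operating on struct kernel_hwtstamp_config. A minimal sketch of a converted driver (foo_priv and its fields are hypothetical):

    #include <linux/net_tstamp.h>
    #include <linux/netdevice.h>

    struct foo_priv {
            struct kernel_hwtstamp_config tstamp;
    };

    static int foo_hwtstamp_get(struct net_device *dev,
                                struct kernel_hwtstamp_config *cfg)
    {
            struct foo_priv *priv = netdev_priv(dev);

            *cfg = priv->tstamp;
            return 0;
    }

    static int foo_hwtstamp_set(struct net_device *dev,
                                struct kernel_hwtstamp_config *cfg,
                                struct netlink_ext_ack *extack)
    {
            struct foo_priv *priv = netdev_priv(dev);

            if (cfg->tx_type != HWTSTAMP_TX_OFF && cfg->tx_type != HWTSTAMP_TX_ON) {
                    NL_SET_ERR_MSG(extack, "unsupported tx_type");
                    return -ERANGE;
            }

            priv->tstamp = *cfg;
            return 0;
    }

    static const struct net_device_ops foo_netdev_ops = {
            .ndo_hwtstamp_get = foo_hwtstamp_get,
            .ndo_hwtstamp_set = foo_hwtstamp_set,
    };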
diff --git a/net/core/devmem.c b/net/core/devmem.c
index ec4217d6c0b4..63f093f7d2b2 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -30,11 +30,6 @@ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
static const struct memory_provider_ops dmabuf_devmem_ops;
-bool net_is_devmem_iov(struct net_iov *niov)
-{
- return niov->type == NET_IOV_DMABUF;
-}
-
static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
struct gen_pool_chunk *chunk,
void *not_used)
@@ -54,6 +49,15 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
}
+static void net_devmem_dmabuf_binding_release(struct percpu_ref *ref)
+{
+ struct net_devmem_dmabuf_binding *binding =
+ container_of(ref, struct net_devmem_dmabuf_binding, ref);
+
+ INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
+ schedule_work(&binding->unbind_w);
+}
+
void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
{
struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w);
@@ -75,6 +79,7 @@ void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
dma_buf_detach(binding->dmabuf, binding->attachment);
dma_buf_put(binding->dmabuf);
xa_destroy(&binding->bound_rxqs);
+ percpu_ref_exit(&binding->ref);
kvfree(binding->tx_vec);
kfree(binding);
}
@@ -143,7 +148,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
__net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
}
- net_devmem_dmabuf_binding_put(binding);
+ percpu_ref_kill(&binding->ref);
}
int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
@@ -209,7 +214,11 @@ net_devmem_bind_dmabuf(struct net_device *dev,
binding->dev = dev;
xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
- refcount_set(&binding->ref, 1);
+ err = percpu_ref_init(&binding->ref,
+ net_devmem_dmabuf_binding_release,
+ 0, GFP_KERNEL);
+ if (err < 0)
+ goto err_free_binding;
mutex_init(&binding->lock);
@@ -220,7 +229,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
if (IS_ERR(binding->attachment)) {
err = PTR_ERR(binding->attachment);
NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
- goto err_free_binding;
+ goto err_exit_ref;
}
binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
@@ -322,6 +331,8 @@ err_unmap:
direction);
err_detach:
dma_buf_detach(dmabuf, binding->attachment);
+err_exit_ref:
+ percpu_ref_exit(&binding->ref);
err_free_binding:
kfree(binding);
err_put_dmabuf:
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 0b43a648cd2e..1c5c18581fcb 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -41,7 +41,7 @@ struct net_devmem_dmabuf_binding {
* retransmits) hold a reference to the binding until the skb holding
* them is freed.
*/
- refcount_t ref;
+ struct percpu_ref ref;
/* The list of bindings currently active. Used for netlink to notify us
* of the user dropping the bind.
@@ -125,17 +125,13 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
static inline bool
net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
{
- return refcount_inc_not_zero(&binding->ref);
+ return percpu_ref_tryget(&binding->ref);
}
static inline void
net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
{
- if (!refcount_dec_and_test(&binding->ref))
- return;
-
- INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
- schedule_work(&binding->unbind_w);
+ percpu_ref_put(&binding->ref);
}
void net_devmem_get_net_iov(struct net_iov *niov);
@@ -145,7 +141,7 @@ struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
void net_devmem_free_dmabuf(struct net_iov *ppiov);
-bool net_is_devmem_iov(struct net_iov *niov);
+
struct net_devmem_dmabuf_binding *
net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id);
struct net_iov *
@@ -218,11 +214,6 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
return 0;
}
-static inline bool net_is_devmem_iov(struct net_iov *niov)
-{
- return false;
-}
-
static inline struct net_devmem_dmabuf_binding *
net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id)
{
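Replacing refcount_t with percpu_ref takes the shared atomic off the get/put fast paths, in exchange for an explicit lifecycle: percpu_ref_init() installs a release callback and starts at one reference, percpu_ref_kill() drops that initial reference and switches the counter to atomic mode, and percpu_ref_exit() frees the percpu counters after the release callback has run, which is why devmem.c calls it from __net_devmem_dmabuf_binding_free(). The generic shape, mirroring the devmem usage:

    #include <linux/percpu-refcount.h>
    #include <linux/slab.h>
    #include <linux/workqueue.h>

    struct obj {
            struct percpu_ref ref;
            struct work_struct free_work;
    };

    static void obj_free_work(struct work_struct *w)
    {
            struct obj *o = container_of(w, struct obj, free_work);

            percpu_ref_exit(&o->ref);       /* frees the percpu counters */
            kfree(o);
    }

    static void obj_release(struct percpu_ref *ref)
    {
            struct obj *o = container_of(ref, struct obj, ref);

            /* may run from atomic context: defer the actual free */
            INIT_WORK(&o->free_work, obj_free_work);
            schedule_work(&o->free_work);
    }

    static int obj_setup(struct obj *o)
    {
            /* starts with one reference, owned by the creator */
            return percpu_ref_init(&o->ref, obj_release, 0, GFP_KERNEL);
    }

    static void obj_teardown(struct obj *o)
    {
            percpu_ref_kill(&o->ref);       /* drop initial ref, go atomic */
    }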
diff --git a/net/core/gro.c b/net/core/gro.c
index 482fa7d7f598..31d21de5b15a 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -115,8 +115,6 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
- (p->protocol == htons(ETH_P_IPV6) &&
- skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
p->encapsulation)
return -E2BIG;
}
@@ -417,7 +415,7 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
{
struct skb_shared_info *pinfo = skb_shinfo(skb);
- BUG_ON(skb->end - skb->tail < grow);
+ DEBUG_NET_WARN_ON_ONCE(skb->end - skb->tail < grow);
memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 96a3b1a93252..e0897eb41c8d 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -51,9 +51,8 @@ do { \
#define PNEIGH_HASHMASK 0xF
static void neigh_timer_handler(struct timer_list *t);
-static void __neigh_notify(struct neighbour *n, int type, int flags,
- u32 pid);
-static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
+static void neigh_notify(struct neighbour *n, int type, int flags, u32 pid);
+static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid);
static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
bool skip_perm);
@@ -117,7 +116,7 @@ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
static void neigh_cleanup_and_release(struct neighbour *neigh)
{
trace_neigh_cleanup_and_release(neigh, 0);
- __neigh_notify(neigh, RTM_DELNEIGH, 0, 0);
+ neigh_notify(neigh, RTM_DELNEIGH, 0, 0);
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
neigh_release(neigh);
}
@@ -1105,6 +1104,7 @@ static void neigh_timer_handler(struct timer_list *t)
{
unsigned long now, next;
struct neighbour *neigh = timer_container_of(neigh, t, timer);
+ bool skip_probe = false;
unsigned int state;
int notify = 0;
@@ -1172,9 +1172,15 @@ static void neigh_timer_handler(struct timer_list *t)
neigh_invalidate(neigh);
}
notify = 1;
- goto out;
+ skip_probe = true;
}
+ if (notify)
+ __neigh_notify(neigh, RTM_NEWNEIGH, 0, 0);
+
+ if (skip_probe)
+ goto out;
+
if (neigh->nud_state & NUD_IN_TIMER) {
if (time_before(next, jiffies + HZ/100))
next = jiffies + HZ/100;
@@ -1189,7 +1195,7 @@ out:
}
if (notify)
- neigh_update_notify(neigh, 0);
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
trace_neigh_timer_handler(neigh, 0);
@@ -1303,6 +1309,47 @@ static void neigh_update_hhs(struct neighbour *neigh)
}
}
+static void neigh_update_process_arp_queue(struct neighbour *neigh)
+ __releases(neigh->lock)
+ __acquires(neigh->lock)
+{
+ struct sk_buff *skb;
+
+ /* Again: avoid an endless loop if something went wrong. */
+ while (neigh->nud_state & NUD_VALID &&
+ (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
+ struct dst_entry *dst = skb_dst(skb);
+ struct neighbour *n2, *n1 = neigh;
+
+ write_unlock_bh(&neigh->lock);
+
+ rcu_read_lock();
+
+ /* Why not just use 'neigh' as-is? The problem is that
+ * things such as shaper, eql, and sch_teql can end up
+ * using alternative, different, neigh objects to output
+ * the packet in the output path. So what we need to do
+ * here is re-lookup the top-level neigh in the path so
+ * we can reinject the packet there.
+ */
+ n2 = NULL;
+ if (dst &&
+ READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) {
+ n2 = dst_neigh_lookup_skb(dst, skb);
+ if (n2)
+ n1 = n2;
+ }
+ READ_ONCE(n1->output)(n1, skb);
+ if (n2)
+ neigh_release(n2);
+ rcu_read_unlock();
+
+ write_lock_bh(&neigh->lock);
+ }
+ __skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
+}
+
/* Generic update routine.
-- lladdr is new lladdr or NULL, if it is not supplied.
-- new is new state.
@@ -1329,6 +1376,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
struct netlink_ext_ack *extack)
{
bool gc_update = false, managed_update = false;
+ bool process_arp_queue = false;
int update_isrouter = 0;
struct net_device *dev;
int err, notify = 0;
@@ -1462,53 +1510,30 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
neigh_connect(neigh);
else
neigh_suspect(neigh);
- if (!(old & NUD_VALID)) {
- struct sk_buff *skb;
- /* Again: avoid dead loop if something went wrong */
+ if (!(old & NUD_VALID))
+ process_arp_queue = true;
- while (neigh->nud_state & NUD_VALID &&
- (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
- struct dst_entry *dst = skb_dst(skb);
- struct neighbour *n2, *n1 = neigh;
- write_unlock_bh(&neigh->lock);
-
- rcu_read_lock();
-
- /* Why not just use 'neigh' as-is? The problem is that
- * things such as shaper, eql, and sch_teql can end up
- * using alternative, different, neigh objects to output
- * the packet in the output path. So what we need to do
- * here is re-lookup the top-level neigh in the path so
- * we can reinject the packet there.
- */
- n2 = NULL;
- if (dst &&
- READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) {
- n2 = dst_neigh_lookup_skb(dst, skb);
- if (n2)
- n1 = n2;
- }
- READ_ONCE(n1->output)(n1, skb);
- if (n2)
- neigh_release(n2);
- rcu_read_unlock();
-
- write_lock_bh(&neigh->lock);
- }
- __skb_queue_purge(&neigh->arp_queue);
- neigh->arp_queue_len_bytes = 0;
- }
out:
if (update_isrouter)
neigh_update_is_router(neigh, flags, &notify);
+
+ if (notify)
+ __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid);
+
+ if (process_arp_queue)
+ neigh_update_process_arp_queue(neigh);
+
write_unlock_bh(&neigh->lock);
+
if (((new ^ old) & NUD_PERMANENT) || gc_update)
neigh_update_gc_list(neigh);
if (managed_update)
neigh_update_managed_list(neigh);
+
if (notify)
- neigh_update_notify(neigh, nlmsg_pid);
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
+
trace_neigh_update_done(neigh, err);
return err;
}
@@ -2622,8 +2647,8 @@ out:
return skb->len;
}
-static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
- u32 pid, u32 seq, int type, unsigned int flags)
+static int __neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
+ u32 pid, u32 seq, int type, unsigned int flags)
{
u32 neigh_flags, neigh_flags_ext;
unsigned long now = jiffies;
@@ -2649,23 +2674,19 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key))
goto nla_put_failure;
- read_lock_bh(&neigh->lock);
ndm->ndm_state = neigh->nud_state;
if (neigh->nud_state & NUD_VALID) {
char haddr[MAX_ADDR_LEN];
neigh_ha_snapshot(haddr, neigh, neigh->dev);
- if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
- read_unlock_bh(&neigh->lock);
+ if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0)
goto nla_put_failure;
- }
}
ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1;
- read_unlock_bh(&neigh->lock);
if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) ||
nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
@@ -2684,6 +2705,20 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
+ u32 pid, u32 seq, int type, unsigned int flags)
+{
+ int err;
+
+ read_lock_bh(&neigh->lock);
+ err = __neigh_fill_info(skb, neigh, pid, seq, type, flags);
+ read_unlock_bh(&neigh->lock);
+
+ return err;
+}
+
static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
u32 pid, u32 seq, int type, unsigned int flags,
struct neigh_table *tbl)
@@ -2727,12 +2762,6 @@ nla_put_failure:
return -EMSGSIZE;
}
-static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid)
-{
- call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
- __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid);
-}
-
static bool neigh_master_filtered(struct net_device *dev, int master_idx)
{
struct net_device *master;
@@ -3545,7 +3574,7 @@ static void __neigh_notify(struct neighbour *n, int type, int flags,
if (skb == NULL)
goto errout;
- err = neigh_fill_info(skb, n, pid, 0, type, flags);
+ err = __neigh_fill_info(skb, n, pid, 0, type, flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in neigh_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -3560,9 +3589,16 @@ out:
rcu_read_unlock();
}
+static void neigh_notify(struct neighbour *neigh, int type, int flags, u32 pid)
+{
+ read_lock_bh(&neigh->lock);
+ __neigh_notify(neigh, type, flags, pid);
+ read_unlock_bh(&neigh->lock);
+}
+
void neigh_app_ns(struct neighbour *n)
{
- __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0);
+ neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0);
}
EXPORT_SYMBOL(neigh_app_ns);
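One detail worth noting in the hunks above: __releases()/__acquires() describe unbalanced locking, as in neigh_update_process_arp_queue(), which is entered and left with neigh->lock held but drops it around the output calls. Wrappers such as neigh_fill_info() and neigh_notify(), which take and release the lock internally, need no annotation. A sketch of the semantics (struct foo is illustrative):

    struct foo {
            rwlock_t lock;
    };

    /* Called with f->lock held and returns with it held, but drops it
     * in the middle: both annotations apply.
     */
    static void drop_and_retake(struct foo *f)
            __releases(f->lock)
            __acquires(f->lock)
    {
            write_unlock_bh(&f->lock);
            /* work that must run without the lock held */
            write_lock_bh(&f->lock);
    }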
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a6e6a964a287..aef44e617361 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -624,9 +624,10 @@ void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid)
}
EXPORT_SYMBOL_GPL(net_ns_get_ownership);
-static void unhash_nsid(struct net *net, struct net *last)
+static void unhash_nsid(struct net *last)
{
- struct net *tmp;
+ struct net *tmp, *peer;
+
/* This function is only called from cleanup_net() work,
* and this work is the only process, that may delete
* a net from net_namespace_list. So, when the below
@@ -634,22 +635,26 @@ static void unhash_nsid(struct net *net, struct net *last)
* use for_each_net_rcu() or net_rwsem.
*/
for_each_net(tmp) {
- int id;
+ int id = 0;
spin_lock(&tmp->nsid_lock);
- id = __peernet2id(tmp, net);
- if (id >= 0)
- idr_remove(&tmp->netns_ids, id);
- spin_unlock(&tmp->nsid_lock);
- if (id >= 0)
- rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL,
+ while ((peer = idr_get_next(&tmp->netns_ids, &id))) {
+ int curr_id = id;
+
+ id++;
+ if (!peer->is_dying)
+ continue;
+
+ idr_remove(&tmp->netns_ids, curr_id);
+ spin_unlock(&tmp->nsid_lock);
+ rtnl_net_notifyid(tmp, RTM_DELNSID, curr_id, 0, NULL,
GFP_KERNEL);
+ spin_lock(&tmp->nsid_lock);
+ }
+ spin_unlock(&tmp->nsid_lock);
if (tmp == last)
break;
}
- spin_lock(&net->nsid_lock);
- idr_destroy(&net->netns_ids);
- spin_unlock(&net->nsid_lock);
}
static LLIST_HEAD(cleanup_list);
@@ -674,6 +679,7 @@ static void cleanup_net(struct work_struct *work)
llist_for_each_entry(net, net_kill_list, cleanup_list) {
ns_tree_remove(net);
list_del_rcu(&net->list);
+ net->is_dying = true;
}
/* Cache last net. After we unlock rtnl, no one new net
* added to net_namespace_list can assign nsid pointer
@@ -688,8 +694,10 @@ static void cleanup_net(struct work_struct *work)
last = list_last_entry(&net_namespace_list, struct net, list);
up_write(&net_rwsem);
+ unhash_nsid(last);
+
llist_for_each_entry(net, net_kill_list, cleanup_list) {
- unhash_nsid(net, last);
+ idr_destroy(&net->netns_ids);
list_add_tail(&net->exit_list, &net_exit_list);
}
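unhash_nsid() now walks each peer's netns_ids IDR once instead of doing a __peernet2id() lookup per dying net, and it has to drop the nsid spinlock around rtnl_net_notifyid(), which may sleep (GFP_KERNEL). idr_get_next() makes the drop-and-resume walk safe because iteration restarts from the saved id; the callee must guarantee the entry stays valid across the unlock, as cleanup_net() does by being the only writer. The generic form of the pattern:

    static void walk_idr_may_sleep(struct idr *idr, spinlock_t *lock,
                                   void (*process)(void *entry, int id))
    {
            void *entry;
            int id = 0;

            spin_lock(lock);
            while ((entry = idr_get_next(idr, &id))) {
                    int curr_id = id++;

                    spin_unlock(lock);
                    process(entry, curr_id);        /* may sleep */
                    spin_lock(lock);
            }
            spin_unlock(lock);
    }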
diff --git a/net/core/netdev_config.c b/net/core/netdev_config.c
new file mode 100644
index 000000000000..f14af365d5cd
--- /dev/null
+++ b/net/core/netdev_config.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/netdevice.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+
+#include "dev.h"
+
+static int netdev_nop_validate_qcfg(struct net_device *dev,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static int __netdev_queue_config(struct net_device *dev, int rxq_idx,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack,
+ bool validate)
+{
+ int (*validate_cb)(struct net_device *dev,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack);
+ struct pp_memory_provider_params *mpp;
+ int err;
+
+ validate_cb = netdev_nop_validate_qcfg;
+ if (validate && dev->queue_mgmt_ops->ndo_validate_qcfg)
+ validate_cb = dev->queue_mgmt_ops->ndo_validate_qcfg;
+
+ memset(qcfg, 0, sizeof(*qcfg));
+
+ /* Get defaults from the driver, in case user config is not set */
+ if (dev->queue_mgmt_ops->ndo_default_qcfg)
+ dev->queue_mgmt_ops->ndo_default_qcfg(dev, qcfg);
+ err = validate_cb(dev, qcfg, extack);
+ if (err)
+ return err;
+
+ /* Apply MP overrides */
+ mpp = &__netif_get_rx_queue(dev, rxq_idx)->mp_params;
+ if (mpp->rx_page_size)
+ qcfg->rx_page_size = mpp->rx_page_size;
+ err = validate_cb(dev, qcfg, extack);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+/**
+ * netdev_queue_config() - get configuration for a given queue
+ * @dev: net_device instance
+ * @rxq_idx: index of the queue of interest
+ * @qcfg: queue configuration struct (output)
+ *
+ * Render the configuration for a given queue. This helper should be used
+ * by drivers which support queue configuration to retrieve config for
+ * a particular queue.
+ *
+ * @qcfg is an output parameter and is always fully initialized by this
+ * function. Some values may not be set by the user; drivers may either
+ * handle the "unset" values in @qcfg, or provide the ndo_default_qcfg
+ * callback in the queue management ops to populate defaults.
+ */
+void netdev_queue_config(struct net_device *dev, int rxq_idx,
+ struct netdev_queue_config *qcfg)
+{
+ __netdev_queue_config(dev, rxq_idx, qcfg, NULL, false);
+}
+EXPORT_SYMBOL(netdev_queue_config);
+
+int netdev_queue_config_validate(struct net_device *dev, int rxq_idx,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack)
+{
+ return __netdev_queue_config(dev, rxq_idx, qcfg, extack, true);
+}
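A driver opting into queue configuration receives the fully rendered struct netdev_queue_config on the reconfig path (see netdev_rx_queue.c below, which now passes it to ndo_queue_mem_alloc() and ndo_queue_start()), and can call netdev_queue_config() itself elsewhere. A hypothetical driver-side sketch (foo_alloc_rx_ring() and the rx_page_size handling are assumptions based on this series):

    static int foo_queue_mem_alloc(struct net_device *dev,
                                   struct netdev_queue_config *qcfg,
                                   void *per_queue_mem, int idx)
    {
            /* 0 means "not configured anywhere": use the driver default */
            size_t rx_page_size = qcfg->rx_page_size ?: PAGE_SIZE;

            return foo_alloc_rx_ring(dev, idx, rx_page_size, per_queue_mem);
    }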
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index c7d9341b7630..668a90658f25 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -7,6 +7,7 @@
#include <net/netdev_rx_queue.h>
#include <net/page_pool/memory_provider.h>
+#include "dev.h"
#include "page_pool_priv.h"
/* See also page_pool_is_unreadable() */
@@ -18,7 +19,10 @@ bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx)
}
EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
-int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
+static int netdev_rx_queue_reconfig(struct net_device *dev,
+ unsigned int rxq_idx,
+ struct netdev_queue_config *qcfg_old,
+ struct netdev_queue_config *qcfg_new)
{
struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx);
const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
@@ -41,7 +45,7 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
goto err_free_new_mem;
}
- err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx);
+ err = qops->ndo_queue_mem_alloc(dev, qcfg_new, new_mem, rxq_idx);
if (err)
goto err_free_old_mem;
@@ -54,7 +58,7 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
if (err)
goto err_free_new_queue_mem;
- err = qops->ndo_queue_start(dev, new_mem, rxq_idx);
+ err = qops->ndo_queue_start(dev, qcfg_new, new_mem, rxq_idx);
if (err)
goto err_start_queue;
} else {
@@ -76,7 +80,7 @@ err_start_queue:
* WARN if we fail to recover the old rx queue, and at least free
* old_mem so we don't also leak that.
*/
- if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) {
+ if (qops->ndo_queue_start(dev, qcfg_old, old_mem, rxq_idx)) {
WARN(1,
"Failed to restart old queue in error path. RX queue %d may be unhealthy.",
rxq_idx);
@@ -94,12 +98,22 @@ err_free_new_mem:
return err;
}
+
+int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
+{
+ struct netdev_queue_config qcfg;
+
+ netdev_queue_config(dev, rxq_idx, &qcfg);
+ return netdev_rx_queue_reconfig(dev, rxq_idx, &qcfg, &qcfg);
+}
EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
const struct pp_memory_provider_params *p,
struct netlink_ext_ack *extack)
{
+ const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
+ struct netdev_queue_config qcfg[2];
struct netdev_rx_queue *rxq;
int ret;
@@ -124,6 +138,10 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached");
return -EEXIST;
}
+ if (p->rx_page_size && !(qops->supported_params & QCFG_RX_PAGE_SIZE)) {
+ NL_SET_ERR_MSG(extack, "device does not support: rx_page_size");
+ return -EOPNOTSUPP;
+ }
rxq = __netif_get_rx_queue(dev, rxq_idx);
if (rxq->mp_params.mp_ops) {
@@ -137,12 +155,20 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
}
#endif
+ netdev_queue_config(dev, rxq_idx, &qcfg[0]);
rxq->mp_params = *p;
- ret = netdev_rx_queue_restart(dev, rxq_idx);
- if (ret) {
- rxq->mp_params.mp_ops = NULL;
- rxq->mp_params.mp_priv = NULL;
- }
+ ret = netdev_queue_config_validate(dev, rxq_idx, &qcfg[1], extack);
+ if (ret)
+ goto err_clear_mp;
+
+ ret = netdev_rx_queue_reconfig(dev, rxq_idx, &qcfg[0], &qcfg[1]);
+ if (ret)
+ goto err_clear_mp;
+
+ return 0;
+
+err_clear_mp:
+ memset(&rxq->mp_params, 0, sizeof(rxq->mp_params));
return ret;
}
@@ -160,6 +186,7 @@ int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
const struct pp_memory_provider_params *old_p)
{
+ struct netdev_queue_config qcfg[2];
struct netdev_rx_queue *rxq;
int err;
@@ -179,9 +206,11 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
rxq->mp_params.mp_priv != old_p->mp_priv))
return;
- rxq->mp_params.mp_ops = NULL;
- rxq->mp_params.mp_priv = NULL;
- err = netdev_rx_queue_restart(dev, ifq_idx);
+ netdev_queue_config(dev, ifq_idx, &qcfg[0]);
+ memset(&rxq->mp_params, 0, sizeof(rxq->mp_params));
+ netdev_queue_config(dev, ifq_idx, &qcfg[1]);
+
+ err = netdev_rx_queue_reconfig(dev, ifq_idx, &qcfg[0], &qcfg[1]);
WARN_ON(err && err != -ENETDOWN);
}
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
deleted file mode 100644
index 897a8f01a67b..000000000000
--- a/net/core/request_sock.c
+++ /dev/null
@@ -1,127 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * NET Generic infrastructure for Network protocols.
- *
- * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * From code originally in include/net/tcp.h
- */
-
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/tcp.h>
-#include <linux/vmalloc.h>
-
-#include <net/request_sock.h>
-
-/*
- * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
- * One SYN_RECV socket costs about 80bytes on a 32bit machine.
- * It would be better to replace it with a global counter for all sockets
- * but then some measure against one socket starving all other sockets
- * would be needed.
- *
- * The minimum value of it is 128. Experiments with real servers show that
- * it is absolutely not enough even at 100conn/sec. 256 cures most
- * of problems.
- * This value is adjusted to 128 for low memory machines,
- * and it will increase in proportion to the memory of machine.
- * Note : Dont forget somaxconn that may limit backlog too.
- */
-
-void reqsk_queue_alloc(struct request_sock_queue *queue)
-{
- queue->fastopenq.rskq_rst_head = NULL;
- queue->fastopenq.rskq_rst_tail = NULL;
- queue->fastopenq.qlen = 0;
-
- queue->rskq_accept_head = NULL;
-}
-
-/*
- * This function is called to set a Fast Open socket's "fastopen_rsk" field
- * to NULL when a TFO socket no longer needs to access the request_sock.
- * This happens only after 3WHS has been either completed or aborted (e.g.,
- * RST is received).
- *
- * Before TFO, a child socket is created only after 3WHS is completed,
- * hence it never needs to access the request_sock. things get a lot more
- * complex with TFO. A child socket, accepted or not, has to access its
- * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
- * until 3WHS is either completed or aborted. Afterwards the req will stay
- * until either the child socket is accepted, or in the rare case when the
- * listener is closed before the child is accepted.
- *
- * In short, a request socket is only freed after BOTH 3WHS has completed
- * (or aborted) and the child socket has been accepted (or listener closed).
- * When a child socket is accepted, its corresponding req->sk is set to
- * NULL since it's no longer needed. More importantly, "req->sk == NULL"
- * will be used by the code below to determine if a child socket has been
- * accepted or not, and the check is protected by the fastopenq->lock
- * described below.
- *
- * Note that fastopen_rsk is only accessed from the child socket's context
- * with its socket lock held. But a request_sock (req) can be accessed by
- * both its child socket through fastopen_rsk, and a listener socket through
- * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
- * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
- * only in the rare case when both the listener and the child locks are held,
- * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
- * The lock also protects other fields such as fastopenq->qlen, which is
- * decremented by this function when fastopen_rsk is no longer needed.
- *
- * Note that another solution was to simply use the existing socket lock
- * from the listener. But first socket lock is difficult to use. It is not
- * a simple spin lock - one must consider sock_owned_by_user() and arrange
- * to use sk_add_backlog() stuff. But what really makes it infeasible is the
- * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
- * acquire a child's lock while holding listener's socket lock.
- *
- * This function also sets "treq->tfo_listener" to false.
- * treq->tfo_listener is used by the listener so it is protected by the
- * fastopenq->lock in this function.
- */
-void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
- bool reset)
-{
- struct sock *lsk = req->rsk_listener;
- struct fastopen_queue *fastopenq;
-
- fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
-
- RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL);
- spin_lock_bh(&fastopenq->lock);
- fastopenq->qlen--;
- tcp_rsk(req)->tfo_listener = false;
- if (req->sk) /* the child socket hasn't been accepted yet */
- goto out;
-
- if (!reset || lsk->sk_state != TCP_LISTEN) {
- /* If the listener has been closed don't bother with the
- * special RST handling below.
- */
- spin_unlock_bh(&fastopenq->lock);
- reqsk_put(req);
- return;
- }
- /* Wait for 60secs before removing a req that has triggered RST.
- * This is a simple defense against TFO spoofing attack - by
- * counting the req against fastopen.max_qlen, and disabling
- * TFO when the qlen exceeds max_qlen.
- *
- * For more details see CoNext'11 "TCP Fast Open" paper.
- */
- req->rsk_timer.expires = jiffies + 60*HZ;
- if (fastopenq->rskq_rst_head == NULL)
- fastopenq->rskq_rst_head = req;
- else
- fastopenq->rskq_rst_tail->dl_next = req;
-
- req->dl_next = NULL;
- fastopenq->rskq_rst_tail = req;
- fastopenq->qlen++;
-out:
- spin_unlock_bh(&fastopenq->lock);
-}
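
A minimal sketch of the accept-side counterpart the comment above relies on
(hedged; names follow the comment text, not necessarily the exact kernel call
site): when the child is accepted, req->sk is cleared under fastopenq->lock so
reqsk_fastopen_remove() can tell the two cases apart.

	spin_lock_bh(&queue->fastopenq.lock);
	if (tcp_rsk(req)->tfo_listener) {
		/* 3WHS still in progress: mark the child as accepted and
		 * let reqsk_fastopen_remove() drop the last reference.
		 */
		req->sk = NULL;
		req = NULL;
	}
	spin_unlock_bh(&queue->fastopenq.lock);
	if (req)	/* 3WHS already finished; drop our reference */
		reqsk_put(req);
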
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 61746c2b95f6..699c401a5eae 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -78,6 +78,7 @@
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
+#include <net/can.h>
#include <net/page_pool/helpers.h>
#include <net/psp/types.h>
#include <net/dropreason.h>
@@ -280,7 +281,7 @@ EXPORT_SYMBOL(__netdev_alloc_frag_align);
*/
static u32 skbuff_cache_size __read_mostly;
-static struct sk_buff *napi_skb_cache_get(bool alloc)
+static inline struct sk_buff *napi_skb_cache_get(bool alloc)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
@@ -307,6 +308,23 @@ static struct sk_buff *napi_skb_cache_get(bool alloc)
return skb;
}
+/*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise later. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+static inline void skbuff_clear(struct sk_buff *skb)
+{
+ /* Replace memset(skb, 0, offsetof(struct sk_buff, tail))
+ * with two smaller memset() calls, with a barrier() between them.
+ * This forces the compiler to inline both calls.
+ */
+ BUILD_BUG_ON(offsetof(struct sk_buff, tail) <= 128);
+ memset(skb, 0, 128);
+ barrier();
+ memset((void *)skb + 128, 0, offsetof(struct sk_buff, tail) - 128);
+}
+
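
A rough illustration of the trade-off skbuff_clear() encodes (the inlining
threshold is a compiler-dependent assumption, not a guarantee): one large
clear may be emitted as an out-of-line call, while two bounded constant-size
clears are open-coded as plain stores.

	/* may become "call memset" for a large struct */
	memset(skb, 0, offsetof(struct sk_buff, tail));

	/* each small constant-size memset() gets inlined */
	memset(skb, 0, 128);
	barrier();	/* keeps the compiler from re-merging the two */
	memset((void *)skb + 128, 0, offsetof(struct sk_buff, tail) - 128);
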
/**
* napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
* @skbs: pointer to an at least @n-sized array to fill with skb pointers
@@ -357,7 +375,7 @@ get:
skbs[i] = nc->skb_cache[base + i];
kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size);
- memset(skbs[i], 0, offsetof(struct sk_buff, tail));
+ skbuff_clear(skbs[i]);
}
nc->skb_count -= n;
@@ -424,7 +442,7 @@ struct sk_buff *slab_build_skb(void *data)
if (unlikely(!skb))
return NULL;
- memset(skb, 0, offsetof(struct sk_buff, tail));
+ skbuff_clear(skb);
data = __slab_build_skb(data, &size);
__finalize_skb_around(skb, data, size);
@@ -476,7 +494,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
if (unlikely(!skb))
return NULL;
- memset(skb, 0, offsetof(struct sk_buff, tail));
+ skbuff_clear(skb);
__build_skb_around(skb, data, frag_size);
return skb;
@@ -537,7 +555,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
if (unlikely(!skb))
return NULL;
- memset(skb, 0, offsetof(struct sk_buff, tail));
+ skbuff_clear(skb);
__build_skb_around(skb, data, frag_size);
return skb;
@@ -566,6 +584,16 @@ struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
}
EXPORT_SYMBOL(napi_build_skb);
+static void *kmalloc_pfmemalloc(size_t obj_size, gfp_t flags, int node)
+{
+ if (!gfp_pfmemalloc_allowed(flags))
+ return NULL;
+ if (!obj_size)
+ return kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
+ flags, node);
+ return kmalloc_node_track_caller(obj_size, flags, node);
+}
+
/*
* kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
* the caller if emergency pfmemalloc reserves are being used. If it is and
@@ -574,9 +602,8 @@ EXPORT_SYMBOL(napi_build_skb);
* memory is free
*/
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
- bool *pfmemalloc)
+ struct sk_buff *skb)
{
- bool ret_pfmemalloc = false;
size_t obj_size;
void *obj;
@@ -587,12 +614,12 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
node);
*size = SKB_SMALL_HEAD_CACHE_SIZE;
- if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ if (likely(obj))
goto out;
/* Try again but now we are using pfmemalloc reserves */
- ret_pfmemalloc = true;
- obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
- goto out;
+ if (skb)
+ skb->pfmemalloc = true;
+ return kmalloc_pfmemalloc(0, flags, node);
}
obj_size = kmalloc_size_roundup(obj_size);
@@ -608,17 +635,14 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
obj = kmalloc_node_track_caller(obj_size,
flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
node);
- if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ if (likely(obj))
goto out;
/* Try again but now we are using pfmemalloc reserves */
- ret_pfmemalloc = true;
- obj = kmalloc_node_track_caller(obj_size, flags, node);
-
+ if (skb)
+ skb->pfmemalloc = true;
+ obj = kmalloc_pfmemalloc(obj_size, flags, node);
out:
- if (pfmemalloc)
- *pfmemalloc = ret_pfmemalloc;
-
return obj;
}
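
A caller-side sketch under the new signature (assumed shape; the real call
site is __alloc_skb() below): the helper now sets skb->pfmemalloc itself, so
callers only need to test for allocation failure.

	unsigned int size = len;
	void *data;

	data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, skb);
	if (unlikely(!data))
		goto nodata;
	/* skb->pfmemalloc is already set if reserves were tapped */
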
@@ -650,7 +674,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
{
struct sk_buff *skb = NULL;
struct kmem_cache *cache;
- bool pfmemalloc;
u8 *data;
if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
@@ -680,37 +703,35 @@ fallback:
if (unlikely(!skb))
return NULL;
}
- prefetchw(skb);
+ skbuff_clear(skb);
/* We do our best to align skb_shared_info on a separate cache
* line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
* aligned memory blocks, unless SLUB/SLAB debug is enabled.
* Both skb->head and skb_shared_info are cache line aligned.
*/
- data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
+ data = kmalloc_reserve(&size, gfp_mask, node, skb);
if (unlikely(!data))
goto nodata;
/* kmalloc_size_roundup() might give us more room than requested.
* Put skb_shared_info exactly at the end of allocated zone,
* to allow max possible filling before reallocation.
*/
- prefetchw(data + SKB_WITH_OVERHEAD(size));
-
- /*
- * Only clear those fields we need to clear, not those that we will
- * actually initialise below. Hence, don't put any more fields after
- * the tail pointer in struct sk_buff!
- */
- memset(skb, 0, offsetof(struct sk_buff, tail));
- __build_skb_around(skb, data, size);
- skb->pfmemalloc = pfmemalloc;
+ __finalize_skb_around(skb, data, size);
if (flags & SKB_ALLOC_FCLONE) {
struct sk_buff_fclones *fclones;
fclones = container_of(skb, struct sk_buff_fclones, skb1);
- skb->fclone = SKB_FCLONE_ORIG;
+ /* skb->fclone is a 2-bit field.
+ * Replace expensive RMW (skb->fclone = SKB_FCLONE_ORIG)
+ * with a single OR.
+ */
+ BUILD_BUG_ON(SKB_FCLONE_UNAVAILABLE != 0);
+ DEBUG_NET_WARN_ON_ONCE(skb->fclone != SKB_FCLONE_UNAVAILABLE);
+ skb->fclone |= SKB_FCLONE_ORIG;
+
refcount_set(&fclones->fclone_ref, 1);
}
@@ -1488,9 +1509,20 @@ void napi_skb_free_stolen_head(struct sk_buff *skb)
napi_skb_cache_put(skb);
}
+/**
+ * napi_consume_skb() - consume skb in NAPI context, try to feed skb cache
+ * @skb: buffer to free
+ * @budget: NAPI budget
+ *
+ * A non-zero @budget must come from the @budget argument passed by the core
+ * to a NAPI poll function. Note that the core may pass a budget of 0 to a
+ * poll function, for example when polling for netpoll / netconsole.
+ *
+ * Passing a @budget of 0 is safe from any context; it turns this function
+ * into dev_consume_skb_any().
+ */
void napi_consume_skb(struct sk_buff *skb, int budget)
{
- /* Zero budget indicate non-NAPI context called us, like netpoll */
if (unlikely(!budget || !skb)) {
dev_consume_skb_any(skb);
return;
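
A hedged driver-side sketch (the mydrv_* names are illustrative): forwarding
the poll function's own budget means a zero budget from netpoll degrades
safely into dev_consume_skb_any().

	static int mydrv_poll(struct napi_struct *napi, int budget)
	{
		struct sk_buff *skb;

		while ((skb = mydrv_next_completed_tx(napi)) != NULL)
			napi_consume_skb(skb, budget);

		return 0;
	}
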
@@ -5108,6 +5140,9 @@ static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_INET_PSP)
[SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext),
#endif
+#if IS_ENABLED(CONFIG_CAN)
+ [SKB_EXT_CAN] = SKB_EXT_CHUNKSIZEOF(struct can_skb_ext),
+#endif
};
static __always_inline unsigned int skb_ext_total_length(void)
@@ -5123,7 +5158,7 @@ static __always_inline unsigned int skb_ext_total_length(void)
static void skb_extensions_init(void)
{
- BUILD_BUG_ON(SKB_EXT_NUM >= 8);
+ BUILD_BUG_ON(SKB_EXT_NUM > 8);
#if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL)
BUILD_BUG_ON(skb_ext_total_length() > 255);
#endif
@@ -7392,31 +7427,56 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);
-void get_netmem(netmem_ref netmem)
+void __get_netmem(netmem_ref netmem)
{
- struct net_iov *niov;
+ struct net_iov *niov = netmem_to_net_iov(netmem);
- if (netmem_is_net_iov(netmem)) {
- niov = netmem_to_net_iov(netmem);
- if (net_is_devmem_iov(niov))
- net_devmem_get_net_iov(netmem_to_net_iov(netmem));
- return;
- }
- get_page(netmem_to_page(netmem));
+ if (net_is_devmem_iov(niov))
+ net_devmem_get_net_iov(netmem_to_net_iov(netmem));
}
-EXPORT_SYMBOL(get_netmem);
+EXPORT_SYMBOL(__get_netmem);
-void put_netmem(netmem_ref netmem)
+void __put_netmem(netmem_ref netmem)
{
- struct net_iov *niov;
+ struct net_iov *niov = netmem_to_net_iov(netmem);
- if (netmem_is_net_iov(netmem)) {
- niov = netmem_to_net_iov(netmem);
- if (net_is_devmem_iov(niov))
- net_devmem_put_net_iov(netmem_to_net_iov(netmem));
- return;
+ if (net_is_devmem_iov(niov))
+ net_devmem_put_net_iov(netmem_to_net_iov(netmem));
+}
+EXPORT_SYMBOL(__put_netmem);
+
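
The double-underscore rename implies inline wrappers in a header so the common
page case avoids an out-of-line call; a sketch of the assumed wrapper (not
part of this diff):

	static inline void get_netmem(netmem_ref netmem)
	{
		if (netmem_is_net_iov(netmem))
			__get_netmem(netmem);	/* devmem refcounting */
		else
			get_page(netmem_to_page(netmem));
	}
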
+struct vlan_type_depth __vlan_get_protocol_offset(const struct sk_buff *skb,
+ __be16 type,
+ int mac_offset)
+{
+ unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH;
+
+ /* if type is 802.1Q/AD then the header should already be
+ * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
+ * ETH_HLEN otherwise
+ */
+ if (vlan_depth) {
+ if (WARN_ON_ONCE(vlan_depth < VLAN_HLEN))
+ return (struct vlan_type_depth) { 0 };
+ vlan_depth -= VLAN_HLEN;
+ } else {
+ vlan_depth = ETH_HLEN;
}
+ do {
+ struct vlan_hdr vhdr, *vh;
+
+ vh = skb_header_pointer(skb, mac_offset + vlan_depth,
+ sizeof(vhdr), &vhdr);
+ if (unlikely(!vh || !--parse_depth))
+ return (struct vlan_type_depth) { 0 };
- put_page(netmem_to_page(netmem));
+ type = vh->h_vlan_encapsulated_proto;
+ vlan_depth += VLAN_HLEN;
+ } while (eth_type_vlan(type));
+
+ return (struct vlan_type_depth) {
+ .type = type,
+ .depth = vlan_depth
+ };
}
-EXPORT_SYMBOL(put_netmem);
+EXPORT_SYMBOL(__vlan_get_protocol_offset);
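
An assumed usage sketch: walk nested 802.1Q/802.1ad headers starting at the
MAC header and recover both the encapsulated protocol and the total VLAN
depth; a zeroed result signals a truncated or too deeply nested packet.

	struct vlan_type_depth td;

	td = __vlan_get_protocol_offset(skb, skb->protocol,
					skb_mac_offset(skb));
	if (!td.type)
		return -EINVAL;
	/* td.depth bytes of VLAN headers precede protocol td.type */
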
diff --git a/net/core/sock.c b/net/core/sock.c
index a1c8b47b0d56..693e6d80f501 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -4193,13 +4193,17 @@ int proto_register(struct proto *prot, int alloc_slab)
return -EINVAL;
}
if (alloc_slab) {
- prot->slab = kmem_cache_create_usercopy(prot->name,
- prot->obj_size, 0,
- SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
- prot->slab_flags,
- prot->useroffset, prot->usersize,
- NULL);
+ struct kmem_cache_args args = {
+ .useroffset = prot->useroffset,
+ .usersize = prot->usersize,
+ .freeptr_offset = prot->freeptr_offset,
+ .use_freeptr_offset = !!prot->freeptr_offset,
+ };
+ prot->slab = kmem_cache_create(prot->name, prot->obj_size,
+ &args,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
+ prot->slab_flags);
if (prot->slab == NULL) {
pr_crit("%s: Can't create sock SLAB cache!\n",
prot->name);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 05dd55cf8b58..03aea10073f0 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -17,6 +17,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/sched/isolation.h>
+#include <linux/hex.h>
#include <net/ip.h>
#include <net/sock.h>
@@ -325,10 +326,16 @@ static int proc_do_dev_weight(const struct ctl_table *table, int write,
static int proc_do_rss_key(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- struct ctl_table fake_table;
char buf[NETDEV_RSS_KEY_LEN * 3];
+ struct ctl_table fake_table;
+ char *pos = buf;
+
+ for (int i = 0; i < NETDEV_RSS_KEY_LEN; i++) {
+ pos = hex_byte_pack(pos, netdev_rss_key[i]);
+ *pos++ = ':';
+ }
+ *(--pos) = 0;
- snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
fake_table.data = buf;
fake_table.maxlen = sizeof(buf);
return proc_dostring(&fake_table, write, buffer, lenp, ppos);
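
A worked example of the open-coded formatting (key bytes assumed): for a
4-byte key 12 34 ab cd, the loop emits "12:34:ab:cd" - hex_byte_pack() writes
two hex digits per byte, each followed by ':', and the final ':' is
overwritten with the terminating NUL. That is 3 characters per key byte,
matching sizeof(buf) = NETDEV_RSS_KEY_LEN * 3 exactly.
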
diff --git a/net/devlink/core.c b/net/devlink/core.c
index 58093f49c090..da56e2b8afc1 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -178,9 +178,7 @@ int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index,
* a notification of a change of this object should be sent
* over netlink. The parent devlink instance lock needs to be
* taken during the notification preparation.
- * However, since the devlink lock of nested instance is held here,
- * we would end with wrong devlink instance lock ordering and
- * deadlock. Therefore the work is utilized to avoid that.
+ * Since the parent instance may or may not be locked here, a work item is used.
*/
void devlink_rel_nested_in_notify(struct devlink *devlink)
{
@@ -477,7 +475,7 @@ void devlink_free(struct devlink *devlink)
WARN_ON(!list_empty(&devlink->resource_list));
WARN_ON(!list_empty(&devlink->dpipe_table_list));
WARN_ON(!list_empty(&devlink->sb_list));
- WARN_ON(!list_empty(&devlink->rate_list));
+ WARN_ON(devlink_rates_check(devlink, NULL, NULL));
WARN_ON(!list_empty(&devlink->linecard_list));
WARN_ON(!xa_empty(&devlink->ports));
diff --git a/net/devlink/dev.c b/net/devlink/dev.c
index 02602704bdea..e3a36de4f4ae 100644
--- a/net/devlink/dev.c
+++ b/net/devlink/dev.c
@@ -434,7 +434,7 @@ static void devlink_reload_reinit_sanity_check(struct devlink *devlink)
WARN_ON(!list_empty(&devlink->trap_list));
WARN_ON(!list_empty(&devlink->dpipe_table_list));
WARN_ON(!list_empty(&devlink->sb_list));
- WARN_ON(!list_empty(&devlink->rate_list));
+ WARN_ON(devlink_rates_check(devlink, NULL, NULL));
WARN_ON(!list_empty(&devlink->linecard_list));
WARN_ON(!xa_empty(&devlink->ports));
}
@@ -713,10 +713,11 @@ int devlink_nl_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info)
if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) {
if (!ops->eswitch_mode_set)
return -EOPNOTSUPP;
- mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
- err = devlink_rate_nodes_check(devlink, mode, info->extack);
+ err = devlink_rates_check(devlink, devlink_rate_is_node,
+ info->extack);
if (err)
return err;
+ mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]);
err = ops->eswitch_mode_set(devlink, mode, info->extack);
if (err)
return err;
diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h
index 14eaad9cfe35..1377864383bc 100644
--- a/net/devlink/devl_internal.h
+++ b/net/devlink/devl_internal.h
@@ -297,8 +297,10 @@ int devlink_resources_validate(struct devlink *devlink,
struct genl_info *info);
/* Rates */
-int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
- struct netlink_ext_ack *extack);
+bool devlink_rate_is_node(const struct devlink_rate *devlink_rate);
+int devlink_rates_check(struct devlink *devlink,
+ bool (*rate_filter)(const struct devlink_rate *),
+ struct netlink_ext_ack *extack);
/* Linecards */
unsigned int devlink_linecard_index(struct devlink_linecard *linecard);
diff --git a/net/devlink/rate.c b/net/devlink/rate.c
index d157a8419bca..0d68b5c477dc 100644
--- a/net/devlink/rate.c
+++ b/net/devlink/rate.c
@@ -12,8 +12,7 @@ devlink_rate_is_leaf(struct devlink_rate *devlink_rate)
return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF;
}
-static inline bool
-devlink_rate_is_node(struct devlink_rate *devlink_rate)
+bool devlink_rate_is_node(const struct devlink_rate *devlink_rate)
{
return devlink_rate->type == DEVLINK_RATE_TYPE_NODE;
}
@@ -688,14 +687,16 @@ int devlink_nl_rate_del_doit(struct sk_buff *skb, struct genl_info *info)
return err;
}
-int devlink_rate_nodes_check(struct devlink *devlink, u16 mode,
- struct netlink_ext_ack *extack)
+int devlink_rates_check(struct devlink *devlink,
+ bool (*rate_filter)(const struct devlink_rate *),
+ struct netlink_ext_ack *extack)
{
struct devlink_rate *devlink_rate;
list_for_each_entry(devlink_rate, &devlink->rate_list, list)
- if (devlink_rate_is_node(devlink_rate)) {
- NL_SET_ERR_MSG(extack, "Rate node(s) exists.");
+ if (!rate_filter || rate_filter(devlink_rate)) {
+ if (extack)
+ NL_SET_ERR_MSG(extack, "Rate node(s) exists.");
return -EBUSY;
}
return 0;
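
Usage sketch (both patterns appear elsewhere in this diff): a NULL filter
matches any rate object, turning the helper into an emptiness check, while
devlink_rate_is_node restores the old nodes-only semantics.

	/* any rate objects left? (extack may be NULL) */
	WARN_ON(devlink_rates_check(devlink, NULL, NULL));

	/* rate nodes block an eswitch mode change */
	err = devlink_rates_check(devlink, devlink_rate_is_node, extack);
	if (err)
		return err;
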
diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig
index f86b30742122..5ed8c704636d 100644
--- a/net/dsa/Kconfig
+++ b/net/dsa/Kconfig
@@ -104,6 +104,13 @@ config NET_DSA_TAG_MTK
Say Y or M if you want to enable support for tagging frames for
Mediatek switches.
+config NET_DSA_TAG_MXL_862XX
+ tristate "Tag driver for MaxLinear MxL862xx switches"
+ help
+ Say Y or M if you want to enable support for tagging frames for the
+ MaxLinear MxL86252 and MxL86282 switches using their native 8-byte
+ tagging protocol.
+
config NET_DSA_TAG_MXL_GSW1XX
tristate "Tag driver for MaxLinear GSW1xx switches"
help
diff --git a/net/dsa/Makefile b/net/dsa/Makefile
index 42d173f5a701..bf7247759a64 100644
--- a/net/dsa/Makefile
+++ b/net/dsa/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o
obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o
obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o
obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o
+obj-$(CONFIG_NET_DSA_TAG_MXL_862XX) += tag_mxl862xx.o
obj-$(CONFIG_NET_DSA_TAG_MXL_GSW1XX) += tag_mxl-gsw1xx.o
obj-$(CONFIG_NET_DSA_TAG_NONE) += tag_none.o
obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o
diff --git a/net/dsa/tag_mxl862xx.c b/net/dsa/tag_mxl862xx.c
new file mode 100644
index 000000000000..01f215868271
--- /dev/null
+++ b/net/dsa/tag_mxl862xx.c
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * DSA Special Tag for MaxLinear 862xx switch chips
+ *
+ * Copyright (C) 2025 Daniel Golle <daniel@makrotopia.org>
+ * Copyright (C) 2024 MaxLinear Inc.
+ */
+
+#include <linux/bitops.h>
+#include <linux/etherdevice.h>
+#include <linux/skbuff.h>
+#include <net/dsa.h>
+#include "tag.h"
+
+#define MXL862_NAME "mxl862xx"
+
+#define MXL862_HEADER_LEN 8
+
+/* Word 0 -> EtherType */
+
+/* Word 2 */
+#define MXL862_SUBIF_ID GENMASK(4, 0)
+
+/* Word 3 */
+#define MXL862_IGP_EGP GENMASK(3, 0)
+
+static struct sk_buff *mxl862_tag_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct dsa_port *dp = dsa_user_to_port(dev);
+ struct dsa_port *cpu_dp = dp->cpu_dp;
+ unsigned int cpu_port, sub_interface;
+ __be16 *mxl862_tag;
+
+ cpu_port = cpu_dp->index;
+
+ /* target port sub-interface ID relative to the CPU port */
+ sub_interface = dp->index + 16 - cpu_port;
+
+ /* provide MXL862_HEADER_LEN bytes of additional headroom */
+ skb_push(skb, MXL862_HEADER_LEN);
+
+ /* shift the MAC addresses to the beginning of the enlarged buffer,
+ * making room for the DSA tag (between the MAC addresses and the
+ * EtherType)
+ */
+ dsa_alloc_etype_header(skb, MXL862_HEADER_LEN);
+
+ /* special tag ingress (from the perspective of the switch) */
+ mxl862_tag = dsa_etype_header_pos_tx(skb);
+ mxl862_tag[0] = htons(ETH_P_MXLGSW);
+ mxl862_tag[1] = 0;
+ mxl862_tag[2] = htons(FIELD_PREP(MXL862_SUBIF_ID, sub_interface));
+ mxl862_tag[3] = htons(FIELD_PREP(MXL862_IGP_EGP, cpu_port));
+
+ return skb;
+}
+
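
A worked example of the tag math (port numbers assumed): with the CPU on
switch port 10 and a frame egressing user port 3, sub_interface = 3 + 16 - 10
= 9, so word 2 carries FIELD_PREP(MXL862_SUBIF_ID, 9) and word 3 carries the
CPU port, 10, in MXL862_IGP_EGP - both byte-swapped to big endian by htons().
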
+static struct sk_buff *mxl862_tag_rcv(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ __be16 *mxl862_tag;
+ int port;
+
+ if (unlikely(!pskb_may_pull(skb, MXL862_HEADER_LEN))) {
+ dev_warn_ratelimited(&dev->dev, "Cannot pull SKB, packet dropped\n");
+ return NULL;
+ }
+
+ mxl862_tag = dsa_etype_header_pos_rx(skb);
+
+ if (unlikely(mxl862_tag[0] != htons(ETH_P_MXLGSW))) {
+ dev_warn_ratelimited(&dev->dev,
+ "Invalid special tag marker, packet dropped, tag: %8ph\n",
+ mxl862_tag);
+ return NULL;
+ }
+
+ /* Get source port information */
+ port = FIELD_GET(MXL862_IGP_EGP, ntohs(mxl862_tag[3]));
+ skb->dev = dsa_conduit_find_user(dev, 0, port);
+ if (unlikely(!skb->dev)) {
+ dev_warn_ratelimited(&dev->dev,
+ "Invalid source port, packet dropped, tag: %8ph\n",
+ mxl862_tag);
+ return NULL;
+ }
+
+ /* remove the MxL862xx special tag between the MAC addresses and the
+ * current ethertype field.
+ */
+ skb_pull_rcsum(skb, MXL862_HEADER_LEN);
+ dsa_strip_etype_header(skb, MXL862_HEADER_LEN);
+
+ return skb;
+}
+
+static const struct dsa_device_ops mxl862_netdev_ops = {
+ .name = MXL862_NAME,
+ .proto = DSA_TAG_PROTO_MXL862,
+ .xmit = mxl862_tag_xmit,
+ .rcv = mxl862_tag_rcv,
+ .needed_headroom = MXL862_HEADER_LEN,
+};
+
+MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MXL862, MXL862_NAME);
+MODULE_DESCRIPTION("DSA tag driver for MaxLinear MxL862xx switches");
+MODULE_LICENSE("GPL");
+
+module_dsa_tag_driver(mxl862_netdev_ops);
diff --git a/net/dsa/tag_yt921x.c b/net/dsa/tag_yt921x.c
index 6bbfd42dc5df..aefef8c770e3 100644
--- a/net/dsa/tag_yt921x.c
+++ b/net/dsa/tag_yt921x.c
@@ -14,11 +14,14 @@
* are conflicts somewhere and/or you want to change it for some reason.
* Tag:
* 2: VLAN Tag
- * 2: Rx Port
+ * 2:
* 15b: Rx Port Valid
* 14b-11b: Rx Port
- * 10b-0b: Cmd?
- * 2: Tx Port(s)
+ * 10b-8b: Tx/Rx Priority
+ * 7b: Tx/Rx Code Valid
+ * 6b-1b: Tx/Rx Code
+ * 0b: ? (unset)
+ * 2:
* 15b: Tx Port(s) Valid
* 10b-0b: Tx Port(s) Mask
*/
@@ -33,18 +36,30 @@
#define YT921X_TAG_PORT_EN BIT(15)
#define YT921X_TAG_RX_PORT_M GENMASK(14, 11)
-#define YT921X_TAG_RX_CMD_M GENMASK(10, 0)
-#define YT921X_TAG_RX_CMD(x) FIELD_PREP(YT921X_TAG_RX_CMD_M, (x))
-#define YT921X_TAG_RX_CMD_FORWARDED 0x80
-#define YT921X_TAG_RX_CMD_UNK_UCAST 0xb2
-#define YT921X_TAG_RX_CMD_UNK_MCAST 0xb4
-#define YT921X_TAG_TX_PORTS GENMASK(10, 0)
+#define YT921X_TAG_PRIO_M GENMASK(10, 8)
+#define YT921X_TAG_PRIO(x) FIELD_PREP(YT921X_TAG_PRIO_M, (x))
+#define YT921X_TAG_CODE_EN BIT(7)
+#define YT921X_TAG_CODE_M GENMASK(6, 1)
+#define YT921X_TAG_CODE(x) FIELD_PREP(YT921X_TAG_CODE_M, (x))
+#define YT921X_TAG_TX_PORTS_M GENMASK(10, 0)
+#define YT921X_TAG_TX_PORTS(x) FIELD_PREP(YT921X_TAG_TX_PORTS_M, (x))
+
+/* Incomplete. Some are configurable via RMA_CTRL_CPU_CODE; the meaning of
+ * the others remains unknown.
+ */
+enum yt921x_tag_code {
+ YT921X_TAG_CODE_FORWARD = 0,
+ YT921X_TAG_CODE_UNK_UCAST = 0x19,
+ YT921X_TAG_CODE_UNK_MCAST = 0x1a,
+ YT921X_TAG_CODE_PORT_COPY = 0x1b,
+ YT921X_TAG_CODE_FDB_COPY = 0x1c,
+};
static struct sk_buff *
yt921x_tag_xmit(struct sk_buff *skb, struct net_device *netdev)
{
__be16 *tag;
- u16 tx;
+ u16 ctrl;
skb_push(skb, YT921X_TAG_LEN);
dsa_alloc_etype_header(skb, YT921X_TAG_LEN);
@@ -54,10 +69,12 @@ yt921x_tag_xmit(struct sk_buff *skb, struct net_device *netdev)
tag[0] = htons(ETH_P_YT921X);
/* VLAN tag unrelated when TX */
tag[1] = 0;
- tag[2] = 0;
- tx = FIELD_PREP(YT921X_TAG_TX_PORTS, dsa_xmit_port_mask(skb, netdev)) |
- YT921X_TAG_PORT_EN;
- tag[3] = htons(tx);
+ ctrl = YT921X_TAG_CODE(YT921X_TAG_CODE_FORWARD) | YT921X_TAG_CODE_EN |
+ YT921X_TAG_PRIO(skb->priority);
+ tag[2] = htons(ctrl);
+ ctrl = YT921X_TAG_TX_PORTS(dsa_xmit_port_mask(skb, netdev)) |
+ YT921X_TAG_PORT_EN;
+ tag[3] = htons(ctrl);
return skb;
}
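
A worked example of the two control words (values assumed): for skb->priority
5 and a destination mask of BIT(2), word 2 becomes YT921X_TAG_CODE_EN |
YT921X_TAG_PRIO(5) = 0x0580 and word 3 becomes YT921X_TAG_PORT_EN | BIT(2) =
0x8004, both stored big endian.
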
@@ -67,7 +84,6 @@ yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev)
{
unsigned int port;
__be16 *tag;
- u16 cmd;
u16 rx;
if (unlikely(!pskb_may_pull(skb, YT921X_TAG_LEN)))
@@ -98,23 +114,34 @@ yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev)
return NULL;
}
- cmd = FIELD_GET(YT921X_TAG_RX_CMD_M, rx);
- switch (cmd) {
- case YT921X_TAG_RX_CMD_FORWARDED:
- /* Already forwarded by hardware */
- dsa_default_offload_fwd_mark(skb);
- break;
- case YT921X_TAG_RX_CMD_UNK_UCAST:
- case YT921X_TAG_RX_CMD_UNK_MCAST:
- /* NOTE: hardware doesn't distinguish between TRAP (copy to CPU
- * only) and COPY (forward and copy to CPU). In order to perform
- * a soft switch, NEVER use COPY action in the switch driver.
- */
- break;
- default:
+ skb->priority = FIELD_GET(YT921X_TAG_PRIO_M, rx);
+
+ if (!(rx & YT921X_TAG_CODE_EN)) {
dev_warn_ratelimited(&netdev->dev,
- "Unexpected rx cmd 0x%02x\n", cmd);
- break;
+ "Tag code not enabled in rx packet\n");
+ } else {
+ u16 code = FIELD_GET(YT921X_TAG_CODE_M, rx);
+
+ switch (code) {
+ case YT921X_TAG_CODE_FORWARD:
+ case YT921X_TAG_CODE_PORT_COPY:
+ case YT921X_TAG_CODE_FDB_COPY:
+ /* Already forwarded by hardware */
+ dsa_default_offload_fwd_mark(skb);
+ break;
+ case YT921X_TAG_CODE_UNK_UCAST:
+ case YT921X_TAG_CODE_UNK_MCAST:
+ /* NOTE: hardware doesn't distinguish between TRAP (copy
+ * to CPU only) and COPY (forward and copy to CPU). In
+ * order to perform a soft switch, NEVER use COPY action
+ * in the switch driver.
+ */
+ break;
+ default:
+ dev_warn_ratelimited(&netdev->dev,
+ "Unknown code 0x%02x\n", code);
+ break;
+ }
}
/* Remove YT921x tag and update checksum */
diff --git a/net/dsa/user.c b/net/dsa/user.c
index f59d66f0975d..5697291d43cf 100644
--- a/net/dsa/user.c
+++ b/net/dsa/user.c
@@ -1459,8 +1459,8 @@ dsa_user_add_cls_matchall_police(struct net_device *dev,
struct netlink_ext_ack *extack = cls->common.extack;
struct dsa_port *dp = dsa_user_to_port(dev);
struct dsa_user_priv *p = netdev_priv(dev);
- struct dsa_mall_policer_tc_entry *policer;
struct dsa_mall_tc_entry *mall_tc_entry;
+ struct flow_action_police *policer;
struct dsa_switch *ds = dp->ds;
struct flow_action_entry *act;
int err;
@@ -1497,8 +1497,7 @@ dsa_user_add_cls_matchall_police(struct net_device *dev,
mall_tc_entry->cookie = cls->cookie;
mall_tc_entry->type = DSA_PORT_MALL_POLICER;
policer = &mall_tc_entry->policer;
- policer->rate_bytes_per_sec = act->police.rate_bytes_ps;
- policer->burst = act->police.burst;
+ *policer = act->police;
err = ds->ops->port_policer_add(ds, dp->index, policer);
if (err) {
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index d47a279eb8b9..5fae329795c8 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -285,12 +285,35 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
#define __LINK_MODE_LANES_DR8_2 8
#define __LINK_MODE_LANES_T1BRR 1
-#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \
+#define __DEFINE_LINK_MODE_PARAMS_PAIRS(_speed, _type, _min_pairs, _pairs, _duplex, _medium) \
[ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \
.speed = SPEED_ ## _speed, \
.lanes = __LINK_MODE_LANES_ ## _type, \
- .duplex = __DUPLEX_ ## _duplex \
+ .min_pairs = _min_pairs, \
+ .pairs = _pairs, \
+ .duplex = __DUPLEX_ ## _duplex, \
+ .mediums = BIT(ETHTOOL_LINK_MEDIUM_BASE ## _medium) \
}
+
+#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex, _medium) \
+ [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \
+ .speed = SPEED_ ## _speed, \
+ .lanes = __LINK_MODE_LANES_ ## _type, \
+ .min_pairs = 0, \
+ .pairs = 0, \
+ .duplex = __DUPLEX_ ## _duplex, \
+ .mediums = BIT(ETHTOOL_LINK_MEDIUM_BASE ## _medium) \
+ }
+#define __DEFINE_LINK_MODE_PARAMS_MEDIUMS(_speed, _type, _duplex, _mediums) \
+ [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \
+ .speed = SPEED_ ## _speed, \
+ .lanes = __LINK_MODE_LANES_ ## _type, \
+ .min_pairs = 0, \
+ .pairs = 0, \
+ .duplex = __DUPLEX_ ## _duplex, \
+ .mediums = (_mediums) \
+ }
+#define __MED(_medium) (BIT(ETHTOOL_LINK_MEDIUM_BASE ## _medium))
#define __DUPLEX_Half DUPLEX_HALF
#define __DUPLEX_Full DUPLEX_FULL
#define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \
@@ -298,142 +321,168 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS);
.speed = SPEED_UNKNOWN, \
.lanes = 0, \
.duplex = DUPLEX_UNKNOWN, \
+ .mediums = BIT(ETHTOOL_LINK_MEDIUM_NONE), \
}
const struct link_mode_info link_mode_params[] = {
- __DEFINE_LINK_MODE_PARAMS(10, T, Half),
- __DEFINE_LINK_MODE_PARAMS(10, T, Full),
- __DEFINE_LINK_MODE_PARAMS(100, T, Half),
- __DEFINE_LINK_MODE_PARAMS(100, T, Full),
- __DEFINE_LINK_MODE_PARAMS(1000, T, Half),
- __DEFINE_LINK_MODE_PARAMS(1000, T, Full),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T, 2, 4, Half, T),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T, 2, 4, Full, T),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(100, T, 2, 4, Half, T),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(100, T, 2, 4, Full, T),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(1000, T, 4, 4, Half, T),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(1000, T, 4, 4, Full, T),
__DEFINE_SPECIAL_MODE_PARAMS(Autoneg),
__DEFINE_SPECIAL_MODE_PARAMS(TP),
__DEFINE_SPECIAL_MODE_PARAMS(AUI),
__DEFINE_SPECIAL_MODE_PARAMS(MII),
__DEFINE_SPECIAL_MODE_PARAMS(FIBRE),
__DEFINE_SPECIAL_MODE_PARAMS(BNC),
- __DEFINE_LINK_MODE_PARAMS(10000, T, Full),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(10000, T, 4, 4, Full, T),
__DEFINE_SPECIAL_MODE_PARAMS(Pause),
__DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause),
- __DEFINE_LINK_MODE_PARAMS(2500, X, Full),
+ __DEFINE_LINK_MODE_PARAMS_MEDIUMS(2500, X, Full,
+ __MED(C) | __MED(S) | __MED(L)),
__DEFINE_SPECIAL_MODE_PARAMS(Backplane),
- __DEFINE_LINK_MODE_PARAMS(1000, KX, Full),
- __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full),
- __DEFINE_LINK_MODE_PARAMS(10000, KR, Full),
+ __DEFINE_LINK_MODE_PARAMS(1000, KX, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(10000, KR, Full, K),
[ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = {
.speed = SPEED_10000,
.lanes = 1,
.duplex = DUPLEX_FULL,
},
- __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full),
- __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full),
- __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full),
- __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full),
- __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full),
- __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full),
- __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full),
- __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full),
- __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full),
- __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full),
- __DEFINE_LINK_MODE_PARAMS(25000, CR, Full),
- __DEFINE_LINK_MODE_PARAMS(25000, KR, Full),
- __DEFINE_LINK_MODE_PARAMS(25000, SR, Full),
- __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full),
- __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full),
- __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full),
- __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full),
- __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full),
- __DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full),
- __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full),
- __DEFINE_LINK_MODE_PARAMS(1000, X, Full),
- __DEFINE_LINK_MODE_PARAMS(10000, CR, Full),
- __DEFINE_LINK_MODE_PARAMS(10000, SR, Full),
- __DEFINE_LINK_MODE_PARAMS(10000, LR, Full),
- __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full),
- __DEFINE_LINK_MODE_PARAMS(10000, ER, Full),
- __DEFINE_LINK_MODE_PARAMS(2500, T, Full),
- __DEFINE_LINK_MODE_PARAMS(5000, T, Full),
+ __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full, MLD),
+ __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full, C),
+ __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full, S),
+ __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full, L),
+ __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full, C),
+ __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full, S),
+ __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full, L),
+ __DEFINE_LINK_MODE_PARAMS(25000, CR, Full, C),
+ __DEFINE_LINK_MODE_PARAMS(25000, KR, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(25000, SR, Full, S),
+ __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full, C),
+ __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full, S),
+ __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full, C),
+ __DEFINE_LINK_MODE_PARAMS_MEDIUMS(100000, LR4_ER4, Full,
+ __MED(L) | __MED(E)),
+ __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full, S),
+ __DEFINE_LINK_MODE_PARAMS_MEDIUMS(1000, X, Full,
+ __MED(C) | __MED(S) | __MED(L)),
+ __DEFINE_LINK_MODE_PARAMS(10000, CR, Full, C),
+ __DEFINE_LINK_MODE_PARAMS(10000, SR, Full, S),
+ __DEFINE_LINK_MODE_PARAMS(10000, LR, Full, L),
+ __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full, L),
+ __DEFINE_LINK_MODE_PARAMS(10000, ER, Full, E),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(2500, T, 4, 4, Full, T),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(5000, T, 4, 4, Full, T),
__DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE),
__DEFINE_SPECIAL_MODE_PARAMS(FEC_RS),
__DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER),
- __DEFINE_LINK_MODE_PARAMS(50000, KR, Full),
- __DEFINE_LINK_MODE_PARAMS(50000, SR, Full),
- __DEFINE_LINK_MODE_PARAMS(50000, CR, Full),
- __DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full),
- __DEFINE_LINK_MODE_PARAMS(50000, DR, Full),
- __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full),
- __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full),
- __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full),
- __DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full),
- __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full),
- __DEFINE_LINK_MODE_PARAMS(100, T1, Full),
- __DEFINE_LINK_MODE_PARAMS(1000, T1, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full),
+ __DEFINE_LINK_MODE_PARAMS(50000, KR, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(50000, SR, Full, S),
+ __DEFINE_LINK_MODE_PARAMS(50000, CR, Full, C),
+ __DEFINE_LINK_MODE_PARAMS_MEDIUMS(50000, LR_ER_FR, Full,
+ __MED(L) | __MED(E) | __MED(F)),
+ __DEFINE_LINK_MODE_PARAMS(50000, DR, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full, S),
+ __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full, C),
+ __DEFINE_LINK_MODE_PARAMS_MEDIUMS(100000, LR2_ER2_FR2, Full,
+ __MED(L) | __MED(E) | __MED(F)),
+ __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full, S),
+ __DEFINE_LINK_MODE_PARAMS_MEDIUMS(200000, LR4_ER4_FR4, Full,
+ __MED(L) | __MED(E) | __MED(F)),
+ __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full, C),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(100, T1, 1, 1, Full, T),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(1000, T1, 1, 1, Full, T),
+ __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full, S),
+ __DEFINE_LINK_MODE_PARAMS_MEDIUMS(400000, LR8_ER8_FR8, Full,
+ __MED(L) | __MED(E) | __MED(F)),
+ __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full, C),
__DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS),
- __DEFINE_LINK_MODE_PARAMS(100000, KR, Full),
- __DEFINE_LINK_MODE_PARAMS(100000, SR, Full),
- __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full),
- __DEFINE_LINK_MODE_PARAMS(100000, DR, Full),
- __DEFINE_LINK_MODE_PARAMS(100000, CR, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full),
- __DEFINE_LINK_MODE_PARAMS(100, FX, Half),
- __DEFINE_LINK_MODE_PARAMS(100, FX, Full),
- __DEFINE_LINK_MODE_PARAMS(10, T1L, Full),
- __DEFINE_LINK_MODE_PARAMS(800000, CR8, Full),
- __DEFINE_LINK_MODE_PARAMS(800000, KR8, Full),
- __DEFINE_LINK_MODE_PARAMS(800000, DR8, Full),
- __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full),
- __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full),
- __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full),
- __DEFINE_LINK_MODE_PARAMS(10, T1S, Full),
- __DEFINE_LINK_MODE_PARAMS(10, T1S, Half),
- __DEFINE_LINK_MODE_PARAMS(10, T1S_P2MP, Half),
- __DEFINE_LINK_MODE_PARAMS(10, T1BRR, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, CR, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, KR, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, DR, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, DR_2, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, SR, Full),
- __DEFINE_LINK_MODE_PARAMS(200000, VR, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, CR2, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, KR2, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, DR2, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, DR2_2, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, SR2, Full),
- __DEFINE_LINK_MODE_PARAMS(400000, VR2, Full),
- __DEFINE_LINK_MODE_PARAMS(800000, CR4, Full),
- __DEFINE_LINK_MODE_PARAMS(800000, KR4, Full),
- __DEFINE_LINK_MODE_PARAMS(800000, DR4, Full),
- __DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full),
- __DEFINE_LINK_MODE_PARAMS(800000, SR4, Full),
- __DEFINE_LINK_MODE_PARAMS(800000, VR4, Full),
- __DEFINE_LINK_MODE_PARAMS(1600000, CR8, Full),
- __DEFINE_LINK_MODE_PARAMS(1600000, KR8, Full),
- __DEFINE_LINK_MODE_PARAMS(1600000, DR8, Full),
- __DEFINE_LINK_MODE_PARAMS(1600000, DR8_2, Full),
+ __DEFINE_LINK_MODE_PARAMS(100000, KR, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(100000, SR, Full, S),
+ __DEFINE_LINK_MODE_PARAMS_MEDIUMS(100000, LR_ER_FR, Full,
+ __MED(L) | __MED(E) | __MED(F)),
+ __DEFINE_LINK_MODE_PARAMS(100000, DR, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(100000, CR, Full, C),
+ __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full, S),
+ __DEFINE_LINK_MODE_PARAMS_MEDIUMS(200000, LR2_ER2_FR2, Full,
+ __MED(L) | __MED(E) | __MED(F)),
+ __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full, C),
+ __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full, S),
+ __DEFINE_LINK_MODE_PARAMS_MEDIUMS(400000, LR4_ER4_FR4, Full,
+ __MED(L) | __MED(E) | __MED(F)),
+ __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full, C),
+ __DEFINE_LINK_MODE_PARAMS(100, FX, Half, F),
+ __DEFINE_LINK_MODE_PARAMS(100, FX, Full, F),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1L, 1, 1, Full, T),
+ __DEFINE_LINK_MODE_PARAMS(800000, CR8, Full, C),
+ __DEFINE_LINK_MODE_PARAMS(800000, KR8, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR8, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full, S),
+ __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full, V),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1S, 1, 1, Full, T),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1S, 1, 1, Half, T),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1S_P2MP, 1, 1, Half, T),
+ __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1BRR, 1, 1, Full, T),
+ __DEFINE_LINK_MODE_PARAMS(200000, CR, Full, C),
+ __DEFINE_LINK_MODE_PARAMS(200000, KR, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(200000, DR, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(200000, DR_2, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(200000, SR, Full, S),
+ __DEFINE_LINK_MODE_PARAMS(200000, VR, Full, V),
+ __DEFINE_LINK_MODE_PARAMS(400000, CR2, Full, C),
+ __DEFINE_LINK_MODE_PARAMS(400000, KR2, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(400000, DR2, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(400000, DR2_2, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(400000, SR2, Full, S),
+ __DEFINE_LINK_MODE_PARAMS(400000, VR2, Full, V),
+ __DEFINE_LINK_MODE_PARAMS(800000, CR4, Full, C),
+ __DEFINE_LINK_MODE_PARAMS(800000, KR4, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR4, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(800000, SR4, Full, S),
+ __DEFINE_LINK_MODE_PARAMS(800000, VR4, Full, V),
+ __DEFINE_LINK_MODE_PARAMS(1600000, CR8, Full, C),
+ __DEFINE_LINK_MODE_PARAMS(1600000, KR8, Full, K),
+ __DEFINE_LINK_MODE_PARAMS(1600000, DR8, Full, D),
+ __DEFINE_LINK_MODE_PARAMS(1600000, DR8_2, Full, D),
};
static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS);
EXPORT_SYMBOL_GPL(link_mode_params);
+static const char ethtool_link_medium_names[][ETH_GSTRING_LEN] = {
+ [ETHTOOL_LINK_MEDIUM_BASET] = "BaseT",
+ [ETHTOOL_LINK_MEDIUM_BASEK] = "BaseK",
+ [ETHTOOL_LINK_MEDIUM_BASES] = "BaseS",
+ [ETHTOOL_LINK_MEDIUM_BASEC] = "BaseC",
+ [ETHTOOL_LINK_MEDIUM_BASEL] = "BaseL",
+ [ETHTOOL_LINK_MEDIUM_BASED] = "BaseD",
+ [ETHTOOL_LINK_MEDIUM_BASEE] = "BaseE",
+ [ETHTOOL_LINK_MEDIUM_BASEF] = "BaseF",
+ [ETHTOOL_LINK_MEDIUM_BASEV] = "BaseV",
+ [ETHTOOL_LINK_MEDIUM_BASEMLD] = "BaseMLD",
+ [ETHTOOL_LINK_MEDIUM_NONE] = "None",
+};
+static_assert(ARRAY_SIZE(ethtool_link_medium_names) == __ETHTOOL_LINK_MEDIUM_LAST);
+
const char netif_msg_class_names[][ETH_GSTRING_LEN] = {
[NETIF_MSG_DRV_BIT] = "drv",
[NETIF_MSG_PROBE_BIT] = "probe",
@@ -588,21 +637,11 @@ int __ethtool_get_link(struct net_device *dev)
int ethtool_get_rx_ring_count(struct net_device *dev)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
- struct ethtool_rxnfc rx_rings = {};
- int ret;
-
- if (ops->get_rx_ring_count)
- return ops->get_rx_ring_count(dev);
- if (!ops->get_rxnfc)
+ if (!ops->get_rx_ring_count)
return -EOPNOTSUPP;
- rx_rings.cmd = ETHTOOL_GRXRINGS;
- ret = ops->get_rxnfc(dev, &rx_rings, NULL);
- if (ret < 0)
- return ret;
-
- return rx_rings.data;
+ return ops->get_rx_ring_count(dev);
}
static int ethtool_get_rxnfc_rule_count(struct net_device *dev)
@@ -1164,3 +1203,15 @@ void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id)
ethtool_rss_notify(dev, ETHTOOL_MSG_RSS_DELETE_NTF, context_id);
}
EXPORT_SYMBOL(ethtool_rxfh_context_lost);
+
+enum ethtool_link_medium ethtool_str_to_medium(const char *str)
+{
+ int i;
+
+ for (i = 0; i < __ETHTOOL_LINK_MEDIUM_LAST; i++)
+ if (!strcmp(ethtool_link_medium_names[i], str))
+ return i;
+
+ return ETHTOOL_LINK_MEDIUM_NONE;
+}
+EXPORT_SYMBOL_GPL(ethtool_str_to_medium);
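
A minimal usage sketch (the caller context is assumed): names compare
case-sensitively against the table above, and anything unrecognized maps to
ETHTOOL_LINK_MEDIUM_NONE.

	enum ethtool_link_medium medium = ethtool_str_to_medium("BaseT");

	if (medium == ETHTOOL_LINK_MEDIUM_NONE)
		return -EINVAL;	/* unknown medium name */
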
diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c
index 3a2a2fa7a0a3..7d87f304ded4 100644
--- a/net/hsr/hsr_framereg.c
+++ b/net/hsr/hsr_framereg.c
@@ -11,6 +11,7 @@
* Same code handles filtering of duplicates for PRP as well.
*/
+#include <kunit/visibility.h>
#include <linux/if_ether.h>
#include <linux/etherdevice.h>
#include <linux/slab.h>
@@ -19,24 +20,6 @@
#include "hsr_framereg.h"
#include "hsr_netlink.h"
-/* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b,
- * false otherwise.
- */
-static bool seq_nr_after(u16 a, u16 b)
-{
- /* Remove inconsistency where
- * seq_nr_after(a, b) == seq_nr_before(a, b)
- */
- if ((int)b - a == 32768)
- return false;
-
- return (((s16)(b - a)) < 0);
-}
-
-#define seq_nr_before(a, b) seq_nr_after((b), (a))
-#define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b)))
-#define PRP_DROP_WINDOW_LEN 32768
-
bool hsr_addr_is_redbox(struct hsr_priv *hsr, unsigned char *addr)
{
if (!hsr->redbox || !is_valid_ether_addr(hsr->macaddress_redbox))
@@ -126,13 +109,29 @@ void hsr_del_self_node(struct hsr_priv *hsr)
kfree_rcu(old, rcu_head);
}
+static void hsr_free_node(struct hsr_node *node)
+{
+ xa_destroy(&node->seq_blocks);
+ kfree(node->block_buf);
+ kfree(node);
+}
+
+static void hsr_free_node_rcu(struct rcu_head *rn)
+{
+ struct hsr_node *node = container_of(rn, struct hsr_node, rcu_head);
+
+ hsr_free_node(node);
+}
+
void hsr_del_nodes(struct list_head *node_db)
{
struct hsr_node *node;
struct hsr_node *tmp;
- list_for_each_entry_safe(node, tmp, node_db, mac_list)
- kfree(node);
+ list_for_each_entry_safe(node, tmp, node_db, mac_list) {
+ list_del(&node->mac_list);
+ hsr_free_node(node);
+ }
}
void prp_handle_san_frame(bool san, enum hsr_port_type port,
@@ -148,18 +147,16 @@ void prp_handle_san_frame(bool san, enum hsr_port_type port,
node->san_b = true;
}
-/* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A;
- * seq_out is used to initialize filtering of outgoing duplicate frames
- * originating from the newly added node.
+/* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A.
*/
static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
struct list_head *node_db,
- unsigned char addr[],
- u16 seq_out, bool san,
+ unsigned char addr[], bool san,
enum hsr_port_type rx_port)
{
- struct hsr_node *new_node, *node;
+ struct hsr_node *new_node, *node = NULL;
unsigned long now;
+ size_t block_sz;
int i;
new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC);
@@ -169,18 +166,24 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
ether_addr_copy(new_node->macaddress_A, addr);
spin_lock_init(&new_node->seq_out_lock);
+ if (hsr->prot_version == PRP_V1)
+ new_node->seq_port_cnt = 1;
+ else
+ new_node->seq_port_cnt = HSR_PT_PORTS - 1;
+
+ block_sz = hsr_seq_block_size(new_node);
+ new_node->block_buf = kcalloc(HSR_MAX_SEQ_BLOCKS, block_sz, GFP_ATOMIC);
+ if (!new_node->block_buf)
+ goto free;
+
+ xa_init(&new_node->seq_blocks);
+
/* We are only interested in time diffs here, so use current jiffies
* as initialization. (0 could trigger a spurious ring error warning).
*/
now = jiffies;
for (i = 0; i < HSR_PT_PORTS; i++) {
new_node->time_in[i] = now;
- new_node->time_out[i] = now;
- }
- for (i = 0; i < HSR_PT_PORTS; i++) {
- new_node->seq_out[i] = seq_out;
- new_node->seq_expected[i] = seq_out + 1;
- new_node->seq_start[i] = seq_out + 1;
}
if (san && hsr->proto_ops->handle_san_frame)
@@ -199,6 +202,8 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr,
return new_node;
out:
spin_unlock_bh(&hsr->list_lock);
+ kfree(new_node->block_buf);
+free:
kfree(new_node);
return node;
}
@@ -223,7 +228,6 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db,
struct ethhdr *ethhdr;
struct prp_rct *rct;
bool san = false;
- u16 seq_out;
if (!skb_mac_header_was_set(skb))
return NULL;
@@ -260,25 +264,72 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db,
/* Check if skb contains hsr_ethhdr */
if (skb->mac_len < sizeof(struct hsr_ethhdr))
return NULL;
-
- /* Use the existing sequence_nr from the tag as starting point
- * for filtering duplicate frames.
- */
- seq_out = hsr_get_skb_sequence_nr(skb) - 1;
} else {
rct = skb_get_PRP_rct(skb);
- if (rct && prp_check_lsdu_size(skb, rct, is_sup)) {
- seq_out = prp_get_skb_sequence_nr(rct);
- } else {
- if (rx_port != HSR_PT_MASTER)
- san = true;
- seq_out = HSR_SEQNR_START;
+ if (!rct && rx_port != HSR_PT_MASTER)
+ san = true;
+ }
+
+ return hsr_add_node(hsr, node_db, ethhdr->h_source, san, rx_port);
+}
+
+static bool hsr_seq_block_is_old(struct hsr_seq_block *block)
+{
+ unsigned long expiry = msecs_to_jiffies(HSR_ENTRY_FORGET_TIME);
+
+ return time_is_before_jiffies(block->time + expiry);
+}
+
+static void hsr_forget_seq_block(struct hsr_node *node,
+ struct hsr_seq_block *block)
+{
+ if (block->time)
+ xa_erase(&node->seq_blocks, block->block_idx);
+ block->time = 0;
+}
+
+/* Get the currently active sequence number block. If there is no block yet, or
+ * the existing one has expired, a new block is created. The idea is to maintain
+ * a "sparse bitmap" where a bitmap for the whole sequence number space is
+ * split into blocks and not all blocks exist all the time. The blocks can
+ * expire after time (in low traffic situations) or when they are replaced in
+ * the backing fixed size buffer (in high traffic situations).
+ */
+VISIBLE_IF_KUNIT struct hsr_seq_block *hsr_get_seq_block(struct hsr_node *node,
+ u16 block_idx)
+{
+ struct hsr_seq_block *block, *res;
+ size_t block_sz;
+
+ block = xa_load(&node->seq_blocks, block_idx);
+
+ if (block && hsr_seq_block_is_old(block)) {
+ hsr_forget_seq_block(node, block);
+ block = NULL;
+ }
+
+ if (!block) {
+ block_sz = hsr_seq_block_size(node);
+ block = node->block_buf + node->next_block * block_sz;
+ hsr_forget_seq_block(node, block);
+
+ memset(block, 0, block_sz);
+ block->time = jiffies;
+ block->block_idx = block_idx;
+
+ res = xa_store(&node->seq_blocks, block_idx, block, GFP_ATOMIC);
+ if (xa_is_err(res)) {
+ block->time = 0;
+ return NULL;
}
+
+ node->next_block =
+ (node->next_block + 1) & (HSR_MAX_SEQ_BLOCKS - 1);
}
- return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out,
- san, rx_port);
+ return block;
}
+EXPORT_SYMBOL_IF_KUNIT(hsr_get_seq_block);
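
A worked example of the block arithmetic (HSR_SEQ_BLOCK_SHIFT = 7, see
hsr_framereg.h below): sequence number 1000 maps to block_idx = 1000 >> 7 = 7
and bit 1000 & 127 = 104, so the 64-entry backing buffer can track up to
64 * 128 = 8192 sequence numbers at once before the oldest block is recycled.
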
/* Use the Supervision frame's info about an eventual macaddress_B for merging
* nodes that has previously had their macaddress_B registered as a separate
@@ -288,16 +339,18 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame)
{
struct hsr_node *node_curr = frame->node_src;
struct hsr_port *port_rcv = frame->port_rcv;
+ struct hsr_seq_block *src_blk, *merge_blk;
struct hsr_priv *hsr = port_rcv->hsr;
- struct hsr_sup_payload *hsr_sp;
struct hsr_sup_tlv *hsr_sup_tlv;
+ struct hsr_sup_payload *hsr_sp;
struct hsr_node *node_real;
struct sk_buff *skb = NULL;
struct list_head *node_db;
struct ethhdr *ethhdr;
- int i;
- unsigned int pull_size = 0;
unsigned int total_pull_size = 0;
+ unsigned int pull_size = 0;
+ unsigned long idx;
+ int i;
/* Here either frame->skb_hsr or frame->skb_prp should be
* valid as supervision frame always will have protocol
@@ -340,8 +393,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame)
if (!node_real)
/* No frame received from AddrA of this node yet */
node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A,
- HSR_SEQNR_START - 1, true,
- port_rcv->type);
+ true, port_rcv->type);
if (!node_real)
goto done; /* No mem */
if (node_real == node_curr)
@@ -388,8 +440,20 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame)
node_real->time_in_stale[i] =
node_curr->time_in_stale[i];
}
- if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i]))
- node_real->seq_out[i] = node_curr->seq_out[i];
+ }
+
+ xa_for_each(&node_curr->seq_blocks, idx, src_blk) {
+ if (hsr_seq_block_is_old(src_blk))
+ continue;
+
+ merge_blk = hsr_get_seq_block(node_real, src_blk->block_idx);
+ if (!merge_blk)
+ continue;
+ merge_blk->time = min(merge_blk->time, src_blk->time);
+ for (i = 0; i < node_real->seq_port_cnt; i++) {
+ bitmap_or(merge_blk->seq_nrs[i], merge_blk->seq_nrs[i],
+ src_blk->seq_nrs[i], HSR_SEQ_BLOCK_SIZE);
+ }
}
spin_unlock_bh(&node_real->seq_out_lock);
node_real->addr_B_port = port_rcv->type;
@@ -398,7 +462,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame)
if (!node_curr->removed) {
list_del_rcu(&node_curr->mac_list);
node_curr->removed = true;
- kfree_rcu(node_curr, rcu_head);
+ call_rcu(&node_curr->rcu_head, hsr_free_node_rcu);
}
spin_unlock_bh(&hsr->list_lock);
@@ -466,56 +530,79 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb,
void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port,
u16 sequence_nr)
{
- /* Don't register incoming frames without a valid sequence number. This
- * ensures entries of restarted nodes gets pruned so that they can
- * re-register and resume communications.
- */
- if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) &&
- seq_nr_before(sequence_nr, node->seq_out[port->type]))
- return;
-
node->time_in[port->type] = jiffies;
node->time_in_stale[port->type] = false;
}
-/* 'skb' is a HSR Ethernet frame (with a HSR tag inserted), with a valid
- * ethhdr->h_source address and skb->mac_header set.
+/* Duplicate discard algorithm: we maintain a bitmap where we set a bit for
+ * every seen sequence number. The bitmap is split into blocks and the block
+ * management is detailed in hsr_get_seq_block(). In any case, we err on the
+ * side of accepting a packet, as the specification requires the algorithm to
+ * be "designed such that it never rejects a legitimate frame, while occasional
+ * acceptance of a duplicate can be tolerated." (IEC 62439-3:2021, 4.1.10.3).
+ * While this requirement is explicit for PRP, applying it to HSR does no harm
+ * either.
+ *
+ * 'frame' is the frame to be sent
+ * 'port_type' is the type of the outgoing interface
*
* Return:
* 1 if frame can be shown to have been sent recently on this interface,
- * 0 otherwise, or
- * negative error code on error
+ * 0 otherwise
*/
-int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame)
+static int hsr_check_duplicate(struct hsr_frame_info *frame,
+ unsigned int port_type)
{
- struct hsr_node *node = frame->node_src;
- u16 sequence_nr = frame->sequence_nr;
+ u16 sequence_nr, seq_bit, block_idx;
+ struct hsr_seq_block *block;
+ struct hsr_node *node;
+
+ node = frame->node_src;
+ sequence_nr = frame->sequence_nr;
+
+ if (WARN_ON_ONCE(port_type >= node->seq_port_cnt))
+ return 0;
spin_lock_bh(&node->seq_out_lock);
- if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) &&
- time_is_after_jiffies(node->time_out[port->type] +
- msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) {
- spin_unlock_bh(&node->seq_out_lock);
- return 1;
- }
- node->time_out[port->type] = jiffies;
- node->seq_out[port->type] = sequence_nr;
+ block_idx = hsr_seq_block_index(sequence_nr);
+ block = hsr_get_seq_block(node, block_idx);
+ if (!block)
+ goto out_new;
+
+ seq_bit = hsr_seq_block_bit(sequence_nr);
+ if (__test_and_set_bit(seq_bit, block->seq_nrs[port_type]))
+ goto out_seen;
+
+out_new:
spin_unlock_bh(&node->seq_out_lock);
return 0;
+
+out_seen:
+ spin_unlock_bh(&node->seq_out_lock);
+ return 1;
}
-/* Adaptation of the PRP duplicate discard algorithm described in wireshark
- * wiki (https://wiki.wireshark.org/PRP)
+/* HSR duplicate discard: we check if the same frame has already been sent on
+ * this outgoing interface. The check follows the general duplicate discard
+ * algorithm.
*
- * A drop window is maintained for both LANs with start sequence set to the
- * first sequence accepted on the LAN that has not been seen on the other LAN,
- * and expected sequence set to the latest received sequence number plus one.
+ * 'port' is the outgoing interface
+ * 'frame' is the frame to be sent
*
- * When a frame is received on either LAN it is compared against the received
- * frames on the other LAN. If it is outside the drop window of the other LAN
- * the frame is accepted and the drop window is updated.
- * The drop window for the other LAN is reset.
+ * Return:
+ * 1 if frame can be shown to have been sent recently on this interface,
+ * 0 otherwise
+ */
+int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame)
+{
+ return hsr_check_duplicate(frame, port->type - 1);
+}
+
+/* PRP duplicate discard: we only consider frames that are received on port A
+ * or port B and should go to the master port. For those, we check if they have
+ * already been received by the host, i.e., master port. The check uses the
+ * general duplicate discard algorithm, but without tracking multiple ports.
*
* 'port' is the outgoing interface
* 'frame' is the frame to be sent
@@ -526,18 +613,9 @@ int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame)
*/
int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame)
{
- enum hsr_port_type other_port;
- enum hsr_port_type rcv_port;
- struct hsr_node *node;
- u16 sequence_diff;
- u16 sequence_exp;
- u16 sequence_nr;
-
- /* out-going frames are always in order
- * and can be checked the same way as for HSR
- */
+ /* outgoing frames are always in order */
if (frame->port_rcv->type == HSR_PT_MASTER)
- return hsr_register_frame_out(port, frame);
+ return 0;
/* for PRP we should only forward frames from the slave ports
* to the master port
@@ -545,52 +623,9 @@ int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame)
if (port->type != HSR_PT_MASTER)
return 1;
- node = frame->node_src;
- sequence_nr = frame->sequence_nr;
- sequence_exp = sequence_nr + 1;
- rcv_port = frame->port_rcv->type;
- other_port = rcv_port == HSR_PT_SLAVE_A ? HSR_PT_SLAVE_B :
- HSR_PT_SLAVE_A;
-
- spin_lock_bh(&node->seq_out_lock);
- if (time_is_before_jiffies(node->time_out[port->type] +
- msecs_to_jiffies(HSR_ENTRY_FORGET_TIME)) ||
- (node->seq_start[rcv_port] == node->seq_expected[rcv_port] &&
- node->seq_start[other_port] == node->seq_expected[other_port])) {
- /* the node hasn't been sending for a while
- * or both drop windows are empty, forward the frame
- */
- node->seq_start[rcv_port] = sequence_nr;
- } else if (seq_nr_before(sequence_nr, node->seq_expected[other_port]) &&
- seq_nr_before_or_eq(node->seq_start[other_port], sequence_nr)) {
- /* drop the frame, update the drop window for the other port
- * and reset our drop window
- */
- node->seq_start[other_port] = sequence_exp;
- node->seq_expected[rcv_port] = sequence_exp;
- node->seq_start[rcv_port] = node->seq_expected[rcv_port];
- spin_unlock_bh(&node->seq_out_lock);
- return 1;
- }
-
- /* update the drop window for the port where this frame was received
- * and clear the drop window for the other port
- */
- node->seq_start[other_port] = node->seq_expected[other_port];
- node->seq_expected[rcv_port] = sequence_exp;
- sequence_diff = sequence_exp - node->seq_start[rcv_port];
- if (sequence_diff > PRP_DROP_WINDOW_LEN)
- node->seq_start[rcv_port] = sequence_exp - PRP_DROP_WINDOW_LEN;
-
- node->time_out[port->type] = jiffies;
- node->seq_out[port->type] = sequence_nr;
- spin_unlock_bh(&node->seq_out_lock);
- return 0;
+ return hsr_check_duplicate(frame, 0);
}
-
-#if IS_MODULE(CONFIG_PRP_DUP_DISCARD_KUNIT_TEST)
-EXPORT_SYMBOL(prp_register_frame_out);
-#endif
+EXPORT_SYMBOL_IF_KUNIT(prp_register_frame_out);
static struct hsr_port *get_late_port(struct hsr_priv *hsr,
struct hsr_node *node)
@@ -672,7 +707,7 @@ void hsr_prune_nodes(struct timer_list *t)
list_del_rcu(&node->mac_list);
node->removed = true;
/* Note that we need to free this entry later: */
- kfree_rcu(node, rcu_head);
+ call_rcu(&node->rcu_head, hsr_free_node_rcu);
}
}
}
@@ -706,7 +741,7 @@ void hsr_prune_proxy_nodes(struct timer_list *t)
list_del_rcu(&node->mac_list);
node->removed = true;
/* Note that we need to free this entry later: */
- kfree_rcu(node, rcu_head);
+ call_rcu(&node->rcu_head, hsr_free_node_rcu);
}
}
}
@@ -740,6 +775,39 @@ void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos,
return NULL;
}
+/* Fill the last sequence number that has been received from node on if1 by
+ * finding the last sequence number sent on port B; accordingly get the last
+ * received sequence number for if2 using sent sequence numbers on port A.
+ */
+static void fill_last_seq_nrs(struct hsr_node *node, u16 *if1_seq, u16 *if2_seq)
+{
+ struct hsr_seq_block *block;
+ unsigned int block_off;
+ size_t block_sz;
+ u16 seq_bit;
+
+ spin_lock_bh(&node->seq_out_lock);
+
+ /* Get last inserted block */
+ block_off = (node->next_block - 1) & (HSR_MAX_SEQ_BLOCKS - 1);
+ block_sz = hsr_seq_block_size(node);
+ block = node->block_buf + block_off * block_sz;
+
+ if (!bitmap_empty(block->seq_nrs[HSR_PT_SLAVE_B - 1],
+ HSR_SEQ_BLOCK_SIZE)) {
+ seq_bit = find_last_bit(block->seq_nrs[HSR_PT_SLAVE_B - 1],
+ HSR_SEQ_BLOCK_SIZE);
+ *if1_seq = (block->block_idx << HSR_SEQ_BLOCK_SHIFT) | seq_bit;
+ }
+ if (!bitmap_empty(block->seq_nrs[HSR_PT_SLAVE_A - 1],
+ HSR_SEQ_BLOCK_SIZE)) {
+ seq_bit = find_last_bit(block->seq_nrs[HSR_PT_SLAVE_A - 1],
+ HSR_SEQ_BLOCK_SIZE);
+ *if2_seq = (block->block_idx << HSR_SEQ_BLOCK_SHIFT) | seq_bit;
+ }
+ spin_unlock_bh(&node->seq_out_lock);
+}
+
int hsr_get_node_data(struct hsr_priv *hsr,
const unsigned char *addr,
unsigned char addr_b[ETH_ALEN],
@@ -780,8 +848,10 @@ int hsr_get_node_data(struct hsr_priv *hsr,
*if2_age = jiffies_to_msecs(tdiff);
/* Present sequence numbers as if they were incoming on interface */
- *if1_seq = node->seq_out[HSR_PT_SLAVE_B];
- *if2_seq = node->seq_out[HSR_PT_SLAVE_A];
+ *if1_seq = 0;
+ *if2_seq = 0;
+ if (hsr->prot_version != PRP_V1)
+ fill_last_seq_nrs(node, if1_seq, if2_seq);
if (node->addr_B_port != HSR_PT_NONE) {
port = hsr_port_get_hsr(hsr, node->addr_B_port);
diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h
index b04948659d84..c65ecb925734 100644
--- a/net/hsr/hsr_framereg.h
+++ b/net/hsr/hsr_framereg.h
@@ -74,9 +74,30 @@ bool hsr_is_node_in_db(struct list_head *node_db,
int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame);
+#if IS_ENABLED(CONFIG_KUNIT)
+struct hsr_seq_block *hsr_get_seq_block(struct hsr_node *node, u16 block_idx);
+#endif
+
+#define HSR_SEQ_BLOCK_SHIFT 7 /* 128 bits */
+#define HSR_SEQ_BLOCK_SIZE (1 << HSR_SEQ_BLOCK_SHIFT)
+#define HSR_SEQ_BLOCK_MASK (HSR_SEQ_BLOCK_SIZE - 1)
+#define HSR_MAX_SEQ_BLOCKS 64
+
+#define hsr_seq_block_index(sequence_nr) ((sequence_nr) >> HSR_SEQ_BLOCK_SHIFT)
+#define hsr_seq_block_bit(sequence_nr) ((sequence_nr) & HSR_SEQ_BLOCK_MASK)
+
+struct hsr_seq_block {
+ unsigned long time;
+ u16 block_idx;
+ /* Effectively a flexible array of bitmaps, one per tracked port;
+ * each row is what DECLARE_BITMAP(, HSR_SEQ_BLOCK_SIZE) would
+ * produce.
+ */
+ unsigned long seq_nrs[][BITS_TO_LONGS(HSR_SEQ_BLOCK_SIZE)];
+};
+
struct hsr_node {
struct list_head mac_list;
- /* Protect R/W access to seq_out */
+ /* Protect R/W access to seq_blocks */
spinlock_t seq_out_lock;
unsigned char macaddress_A[ETH_ALEN];
unsigned char macaddress_B[ETH_ALEN];
@@ -84,16 +105,22 @@ struct hsr_node {
enum hsr_port_type addr_B_port;
unsigned long time_in[HSR_PT_PORTS];
bool time_in_stale[HSR_PT_PORTS];
- unsigned long time_out[HSR_PT_PORTS];
/* if the node is a SAN */
bool san_a;
bool san_b;
- u16 seq_out[HSR_PT_PORTS];
bool removed;
- /* PRP specific duplicate handling */
- u16 seq_expected[HSR_PT_PORTS];
- u16 seq_start[HSR_PT_PORTS];
+ /* Duplicate detection */
+ struct xarray seq_blocks;
+ void *block_buf;
+ unsigned int next_block;
+ unsigned int seq_port_cnt;
struct rcu_head rcu_head;
};
+static inline size_t hsr_seq_block_size(struct hsr_node *node)
+{
+ WARN_ON_ONCE(node->seq_port_cnt == 0);
+ return struct_size_t(struct hsr_seq_block, seq_nrs, node->seq_port_cnt);
+}
+
#endif /* __HSR_FRAMEREG_H */
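As a concrete reading of the definitions above: one hsr_seq_block covers HSR_SEQ_BLOCK_SIZE = 128 consecutive sequence numbers, and up to HSR_MAX_SEQ_BLOCKS = 64 blocks are kept per node. A worked instance of the index arithmetic (values illustrative):

    u16 sequence_nr = 4660;                            /* 0x1234 */
    u16 block_idx = hsr_seq_block_index(sequence_nr);  /* 4660 >> 7 == 36 */
    u16 seq_bit   = hsr_seq_block_bit(sequence_nr);    /* 4660 & 127 == 52 */

    /* With one tracked port (seq_port_cnt == 1, the PRP case), each
     * block occupies sizeof(struct hsr_seq_block) plus
     * 1 * BITS_TO_LONGS(128) * sizeof(long) bytes, which is what
     * hsr_seq_block_size() computes via struct_size_t().
     */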
diff --git a/net/hsr/prp_dup_discard_test.c b/net/hsr/prp_dup_discard_test.c
index e86b7b633ae8..b028e71e6a0f 100644
--- a/net/hsr/prp_dup_discard_test.c
+++ b/net/hsr/prp_dup_discard_test.c
@@ -4,6 +4,8 @@
#include "hsr_main.h"
#include "hsr_framereg.h"
+MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING");
+
struct prp_test_data {
struct hsr_port port;
struct hsr_port port_rcv;
@@ -13,37 +15,55 @@ struct prp_test_data {
static struct prp_test_data *build_prp_test_data(struct kunit *test)
{
+ size_t block_sz;
+
struct prp_test_data *data = kunit_kzalloc(test,
sizeof(struct prp_test_data), GFP_USER);
KUNIT_EXPECT_NOT_ERR_OR_NULL(test, data);
+ data->node.seq_port_cnt = 1;
+ block_sz = hsr_seq_block_size(&data->node);
+ data->node.block_buf = kunit_kcalloc(test, HSR_MAX_SEQ_BLOCKS, block_sz,
+ GFP_ATOMIC);
+ KUNIT_EXPECT_NOT_ERR_OR_NULL(test, data->node.block_buf);
+
+ xa_init(&data->node.seq_blocks);
+ spin_lock_init(&data->node.seq_out_lock);
+
data->frame.node_src = &data->node;
data->frame.port_rcv = &data->port_rcv;
data->port_rcv.type = HSR_PT_SLAVE_A;
- data->node.seq_start[HSR_PT_SLAVE_A] = 1;
- data->node.seq_expected[HSR_PT_SLAVE_A] = 1;
- data->node.seq_start[HSR_PT_SLAVE_B] = 1;
- data->node.seq_expected[HSR_PT_SLAVE_B] = 1;
- data->node.seq_out[HSR_PT_MASTER] = 0;
- data->node.time_out[HSR_PT_MASTER] = jiffies;
data->port.type = HSR_PT_MASTER;
return data;
}
-static void check_prp_counters(struct kunit *test,
- struct prp_test_data *data,
- u16 seq_start_a, u16 seq_expected_a,
- u16 seq_start_b, u16 seq_expected_b)
+static void check_prp_frame_seen(struct kunit *test, struct prp_test_data *data,
+ u16 sequence_nr)
+{
+ u16 block_idx, seq_bit;
+ struct hsr_seq_block *block;
+
+ block_idx = sequence_nr >> HSR_SEQ_BLOCK_SHIFT;
+ block = xa_load(&data->node.seq_blocks, block_idx);
+ KUNIT_EXPECT_NOT_NULL(test, block);
+
+ seq_bit = sequence_nr & HSR_SEQ_BLOCK_MASK;
+ KUNIT_EXPECT_TRUE(test, test_bit(seq_bit, block->seq_nrs[0]));
+}
+
+static void check_prp_frame_unseen(struct kunit *test,
+ struct prp_test_data *data, u16 sequence_nr)
{
- KUNIT_EXPECT_EQ(test, data->node.seq_start[HSR_PT_SLAVE_A],
- seq_start_a);
- KUNIT_EXPECT_EQ(test, data->node.seq_start[HSR_PT_SLAVE_B],
- seq_start_b);
- KUNIT_EXPECT_EQ(test, data->node.seq_expected[HSR_PT_SLAVE_A],
- seq_expected_a);
- KUNIT_EXPECT_EQ(test, data->node.seq_expected[HSR_PT_SLAVE_B],
- seq_expected_b);
+ u16 block_idx, seq_bit;
+ struct hsr_seq_block *block;
+
+ block_idx = sequence_nr >> HSR_SEQ_BLOCK_SHIFT;
+ block = hsr_get_seq_block(&data->node, block_idx);
+ KUNIT_EXPECT_NOT_NULL(test, block);
+
+ seq_bit = sequence_nr & HSR_SEQ_BLOCK_MASK;
+ KUNIT_EXPECT_FALSE(test, test_bit(seq_bit, block->seq_nrs[0]));
}
static void prp_dup_discard_forward(struct kunit *test)
@@ -54,52 +74,48 @@ static void prp_dup_discard_forward(struct kunit *test)
data->frame.sequence_nr = 2;
KUNIT_EXPECT_EQ(test, 0,
prp_register_frame_out(&data->port, &data->frame));
- KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
- data->node.seq_out[HSR_PT_MASTER]);
- KUNIT_EXPECT_EQ(test, jiffies, data->node.time_out[HSR_PT_MASTER]);
- check_prp_counters(test, data, data->frame.sequence_nr,
- data->frame.sequence_nr + 1, 1, 1);
+ check_prp_frame_seen(test, data, data->frame.sequence_nr);
}
-static void prp_dup_discard_inside_dropwindow(struct kunit *test)
+static void prp_dup_discard_drop_duplicate(struct kunit *test)
{
- /* Normal situation, other LAN ahead by one. Frame is dropped */
struct prp_test_data *data = build_prp_test_data(test);
- unsigned long time = jiffies - 10;
- data->frame.sequence_nr = 1;
- data->node.seq_expected[HSR_PT_SLAVE_B] = 3;
- data->node.seq_out[HSR_PT_MASTER] = 2;
- data->node.time_out[HSR_PT_MASTER] = time;
+ data->frame.sequence_nr = 2;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ check_prp_frame_seen(test, data, data->frame.sequence_nr);
KUNIT_EXPECT_EQ(test, 1,
prp_register_frame_out(&data->port, &data->frame));
- KUNIT_EXPECT_EQ(test, 2, data->node.seq_out[HSR_PT_MASTER]);
- KUNIT_EXPECT_EQ(test, time, data->node.time_out[HSR_PT_MASTER]);
- check_prp_counters(test, data, 2, 2, 2, 3);
+ check_prp_frame_seen(test, data, data->frame.sequence_nr);
}
-static void prp_dup_discard_node_timeout(struct kunit *test)
+static void prp_dup_discard_entry_timeout(struct kunit *test)
{
/* Timeout situation, node hasn't sent anything for a while */
struct prp_test_data *data = build_prp_test_data(test);
+ struct hsr_seq_block *block;
+ u16 block_idx;
data->frame.sequence_nr = 7;
- data->node.seq_start[HSR_PT_SLAVE_A] = 1234;
- data->node.seq_expected[HSR_PT_SLAVE_A] = 1235;
- data->node.seq_start[HSR_PT_SLAVE_B] = 1234;
- data->node.seq_expected[HSR_PT_SLAVE_B] = 1234;
- data->node.seq_out[HSR_PT_MASTER] = 1234;
- data->node.time_out[HSR_PT_MASTER] =
- jiffies - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME) - 1;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ check_prp_frame_seen(test, data, data->frame.sequence_nr);
+
+ data->frame.sequence_nr = 11;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ check_prp_frame_seen(test, data, data->frame.sequence_nr);
+
+ block_idx = data->frame.sequence_nr >> HSR_SEQ_BLOCK_SHIFT;
+ block = hsr_get_seq_block(&data->node, block_idx);
+ block->time = jiffies - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME) - 1;
KUNIT_EXPECT_EQ(test, 0,
prp_register_frame_out(&data->port, &data->frame));
- KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
- data->node.seq_out[HSR_PT_MASTER]);
- KUNIT_EXPECT_EQ(test, jiffies, data->node.time_out[HSR_PT_MASTER]);
- check_prp_counters(test, data, data->frame.sequence_nr,
- data->frame.sequence_nr + 1, 1234, 1234);
+ check_prp_frame_seen(test, data, data->frame.sequence_nr);
+ check_prp_frame_unseen(test, data, 7);
}
static void prp_dup_discard_out_of_sequence(struct kunit *test)
@@ -107,50 +123,36 @@ static void prp_dup_discard_out_of_sequence(struct kunit *test)
/* One frame is received out of sequence on both LANs */
struct prp_test_data *data = build_prp_test_data(test);
- data->node.seq_start[HSR_PT_SLAVE_A] = 10;
- data->node.seq_expected[HSR_PT_SLAVE_A] = 10;
- data->node.seq_start[HSR_PT_SLAVE_B] = 10;
- data->node.seq_expected[HSR_PT_SLAVE_B] = 10;
- data->node.seq_out[HSR_PT_MASTER] = 9;
+ /* initial frame, should be accepted */
+ data->frame.sequence_nr = 9;
+ KUNIT_EXPECT_EQ(test, 0,
+ prp_register_frame_out(&data->port, &data->frame));
+ check_prp_frame_seen(test, data, data->frame.sequence_nr);
/* 1st old frame, should be accepted */
data->frame.sequence_nr = 8;
KUNIT_EXPECT_EQ(test, 0,
prp_register_frame_out(&data->port, &data->frame));
- KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
- data->node.seq_out[HSR_PT_MASTER]);
- check_prp_counters(test, data, data->frame.sequence_nr,
- data->frame.sequence_nr + 1, 10, 10);
+ check_prp_frame_seen(test, data, data->frame.sequence_nr);
/* 2nd frame should be dropped */
data->frame.sequence_nr = 8;
data->port_rcv.type = HSR_PT_SLAVE_B;
KUNIT_EXPECT_EQ(test, 1,
prp_register_frame_out(&data->port, &data->frame));
- check_prp_counters(test, data, data->frame.sequence_nr + 1,
- data->frame.sequence_nr + 1,
- data->frame.sequence_nr + 1,
- data->frame.sequence_nr + 1);
/* Next frame, this is forwarded */
data->frame.sequence_nr = 10;
data->port_rcv.type = HSR_PT_SLAVE_A;
KUNIT_EXPECT_EQ(test, 0,
prp_register_frame_out(&data->port, &data->frame));
- KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
- data->node.seq_out[HSR_PT_MASTER]);
- check_prp_counters(test, data, data->frame.sequence_nr,
- data->frame.sequence_nr + 1, 9, 9);
+ check_prp_frame_seen(test, data, data->frame.sequence_nr);
/* and next one is dropped */
data->frame.sequence_nr = 10;
data->port_rcv.type = HSR_PT_SLAVE_B;
KUNIT_EXPECT_EQ(test, 1,
prp_register_frame_out(&data->port, &data->frame));
- check_prp_counters(test, data, data->frame.sequence_nr + 1,
- data->frame.sequence_nr + 1,
- data->frame.sequence_nr + 1,
- data->frame.sequence_nr + 1);
}
static void prp_dup_discard_lan_b_late(struct kunit *test)
@@ -158,43 +160,31 @@ static void prp_dup_discard_lan_b_late(struct kunit *test)
/* LAN B is behind */
struct prp_test_data *data = build_prp_test_data(test);
- data->node.seq_start[HSR_PT_SLAVE_A] = 9;
- data->node.seq_expected[HSR_PT_SLAVE_A] = 9;
- data->node.seq_start[HSR_PT_SLAVE_B] = 9;
- data->node.seq_expected[HSR_PT_SLAVE_B] = 9;
- data->node.seq_out[HSR_PT_MASTER] = 8;
-
data->frame.sequence_nr = 9;
KUNIT_EXPECT_EQ(test, 0,
prp_register_frame_out(&data->port, &data->frame));
- KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
- data->node.seq_out[HSR_PT_MASTER]);
- check_prp_counters(test, data, 9, 10, 9, 9);
+ check_prp_frame_seen(test, data, data->frame.sequence_nr);
data->frame.sequence_nr = 10;
KUNIT_EXPECT_EQ(test, 0,
prp_register_frame_out(&data->port, &data->frame));
- KUNIT_EXPECT_EQ(test, data->frame.sequence_nr,
- data->node.seq_out[HSR_PT_MASTER]);
- check_prp_counters(test, data, 9, 11, 9, 9);
+ check_prp_frame_seen(test, data, data->frame.sequence_nr);
data->frame.sequence_nr = 9;
data->port_rcv.type = HSR_PT_SLAVE_B;
KUNIT_EXPECT_EQ(test, 1,
prp_register_frame_out(&data->port, &data->frame));
- check_prp_counters(test, data, 10, 11, 10, 10);
data->frame.sequence_nr = 10;
data->port_rcv.type = HSR_PT_SLAVE_B;
KUNIT_EXPECT_EQ(test, 1,
prp_register_frame_out(&data->port, &data->frame));
- check_prp_counters(test, data, 11, 11, 11, 11);
}
static struct kunit_case prp_dup_discard_test_cases[] = {
KUNIT_CASE(prp_dup_discard_forward),
- KUNIT_CASE(prp_dup_discard_inside_dropwindow),
- KUNIT_CASE(prp_dup_discard_node_timeout),
+ KUNIT_CASE(prp_dup_discard_drop_duplicate),
+ KUNIT_CASE(prp_dup_discard_entry_timeout),
KUNIT_CASE(prp_dup_discard_out_of_sequence),
KUNIT_CASE(prp_dup_discard_lan_b_late),
{}
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index ec36d2ec059e..18108a6f0499 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -9,7 +9,7 @@ obj-y := route.o inetpeer.o protocol.o \
inet_timewait_sock.o inet_connection_sock.o \
tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \
tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \
- tcp_rate.o tcp_recovery.o tcp_ulp.o \
+ tcp_recovery.o tcp_ulp.o \
tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 709021197e1c..32b951ebc0c2 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -2196,7 +2196,8 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb,
/* if we don't ensure enough headroom we could panic on the skb_push()
* call below so make sure we have enough, we are also "mangling" the
* packet so we should probably do a copy-on-write call anyway */
- ret_val = skb_cow(skb, skb_headroom(skb) + len_delta);
+ ret_val = skb_cow(skb,
+ skb_headroom(skb) + (len_delta > 0 ? len_delta : 0));
if (ret_val < 0)
return ret_val;
diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h
index f9b9e26c32c1..0b72796dd1ad 100644
--- a/net/ipv4/fib_lookup.h
+++ b/net/ipv4/fib_lookup.h
@@ -28,8 +28,10 @@ struct fib_alias {
/* Don't write on fa_state unless needed, to keep it shared on all cpus */
static inline void fib_alias_accessed(struct fib_alias *fa)
{
- if (!(fa->fa_state & FA_S_ACCESSED))
- fa->fa_state |= FA_S_ACCESSED;
+ u8 fa_state = READ_ONCE(fa->fa_state);
+
+ if (!(fa_state & FA_S_ACCESSED))
+ WRITE_ONCE(fa->fa_state, fa_state | FA_S_ACCESSED);
}
/* Exported by fib_semantics.c */
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 7e2c17fec3fc..1308213791f1 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1280,7 +1280,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb,
new_fa->fa_dscp = fa->fa_dscp;
new_fa->fa_info = fi;
new_fa->fa_type = cfg->fc_type;
- state = fa->fa_state;
+ state = READ_ONCE(fa->fa_state);
new_fa->fa_state = state & ~FA_S_ACCESSED;
new_fa->fa_slen = fa->fa_slen;
new_fa->tb_id = tb->tb_id;
@@ -1745,7 +1745,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb,
fib_remove_alias(t, tp, l, fa_to_delete);
- if (fa_to_delete->fa_state & FA_S_ACCESSED)
+ if (READ_ONCE(fa_to_delete->fa_state) & FA_S_ACCESSED)
rt_cache_flush(cfg->fc_nlinfo.nl_net);
fib_release_info(fa_to_delete->fa_info);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 4abbec2f47ef..e216b6df6331 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -112,7 +112,9 @@ struct icmp_bxm {
__be32 times[3];
} data;
int head_len;
- struct ip_options_data replyopts;
+
+ /* Must be last as it ends in a flexible-array member. */
+ struct ip_options_rcu replyopts;
};
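Since struct icmp_bxm now ends in the flexible __data[] array of struct ip_options_rcu, it can no longer be declared directly on the stack; the call sites below use DEFINE_RAW_FLEX() to reserve storage for the struct plus IP_OPTIONS_DATA_FIXED_SIZE trailing option bytes and hand back a typed pointer. Roughly, as a sketch of the expansion rather than the exact macro body:

    union {
        u8 bytes[struct_size_t(struct icmp_bxm, replyopts.opt.__data,
                               IP_OPTIONS_DATA_FIXED_SIZE)];
        struct icmp_bxm obj;
    } icmp_param_storage = {};
    struct icmp_bxm *icmp_param = &icmp_param_storage.obj;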
/* An array of errno for error messages from dest unreach. */
@@ -353,9 +355,12 @@ void icmp_out_count(struct net *net, unsigned char type)
static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd,
struct sk_buff *skb)
{
- struct icmp_bxm *icmp_param = from;
+ DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
__wsum csum;
+ icmp_param = from;
+
csum = skb_copy_and_csum_bits(icmp_param->skb,
icmp_param->offset + offset,
to, len);
@@ -413,7 +418,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
int type = icmp_param->data.icmph.type;
int code = icmp_param->data.icmph.code;
- if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb))
+ if (ip_options_echo(net, &icmp_param->replyopts.opt, skb))
return;
/* Needed by both icmpv4_global_allow and icmp_xmit_lock */
@@ -435,10 +440,10 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
daddr = ipc.addr = ip_hdr(skb)->saddr;
saddr = fib_compute_spec_dst(skb);
- if (icmp_param->replyopts.opt.opt.optlen) {
- ipc.opt = &icmp_param->replyopts.opt;
+ if (icmp_param->replyopts.opt.optlen) {
+ ipc.opt = &icmp_param->replyopts;
if (ipc.opt->opt.srr)
- daddr = icmp_param->replyopts.opt.opt.faddr;
+ daddr = icmp_param->replyopts.opt.faddr;
}
memset(&fl4, 0, sizeof(fl4));
fl4.daddr = daddr;
@@ -491,8 +496,8 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
int err;
memset(fl4, 0, sizeof(*fl4));
- fl4->daddr = (param->replyopts.opt.opt.srr ?
- param->replyopts.opt.opt.faddr : iph->saddr);
+ fl4->daddr = (param->replyopts.opt.srr ?
+ param->replyopts.opt.faddr : iph->saddr);
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
fl4->flowi4_uid = sock_net_uid(net, NULL);
@@ -554,6 +559,21 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
/* steal dst entry from skb_in, don't drop refcnt */
skb_dstref_steal(skb_in);
skb_dstref_restore(skb_in, orefdst);
+
+ /*
+ * At this point, fl4_dec.daddr should NOT be local (we
+ * checked fl4_dec.saddr above). However, a race condition
+ * may occur if the address is added to the interface
+ * concurrently. In that case, ip_route_input() returns a
+ * LOCAL route with dst.output=ip_rt_bug, which must not
+ * be used for output.
+ */
+ if (!err && rt2 && rt2->rt_type == RTN_LOCAL) {
+ net_warn_ratelimited("detected local route for %pI4 during ICMP sending, src %pI4\n",
+ &fl4_dec.daddr, &fl4_dec.saddr);
+ dst_release(&rt2->dst);
+ err = -EINVAL;
+ }
}
if (err)
@@ -775,9 +795,10 @@ free_skb:
void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
const struct inet_skb_parm *parm)
{
+ DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct iphdr *iph;
int room;
- struct icmp_bxm icmp_param;
struct rtable *rt = skb_rtable(skb_in);
bool apply_ratelimit = false;
struct sk_buff *ext_skb;
@@ -906,7 +927,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in,
+ if (__ip_options_echo(net, &icmp_param->replyopts.opt, skb_in,
&parm->opt))
goto out_unlock;
@@ -915,21 +936,21 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
* Prepare data for ICMP header.
*/
- icmp_param.data.icmph.type = type;
- icmp_param.data.icmph.code = code;
- icmp_param.data.icmph.un.gateway = info;
- icmp_param.data.icmph.checksum = 0;
- icmp_param.skb = skb_in;
- icmp_param.offset = skb_network_offset(skb_in);
+ icmp_param->data.icmph.type = type;
+ icmp_param->data.icmph.code = code;
+ icmp_param->data.icmph.un.gateway = info;
+ icmp_param->data.icmph.checksum = 0;
+ icmp_param->skb = skb_in;
+ icmp_param->offset = skb_network_offset(skb_in);
ipcm_init(&ipc);
ipc.tos = tos;
ipc.addr = iph->saddr;
- ipc.opt = &icmp_param.replyopts.opt;
+ ipc.opt = &icmp_param->replyopts;
ipc.sockc.mark = mark;
rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr,
inet_dsfield_to_dscp(tos), mark, type, code,
- &icmp_param);
+ icmp_param);
if (IS_ERR(rt))
goto out_unlock;
@@ -939,10 +960,10 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
/* RFC says return as much as we can without exceeding 576 bytes. */
- room = dst_mtu(&rt->dst);
+ room = dst4_mtu(&rt->dst);
if (room > 576)
room = 576;
- room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen;
+ room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.optlen;
room -= sizeof(struct icmphdr);
/* Guard against tiny mtu. We need to include at least one
* IP network header for this message to make any sense.
@@ -950,15 +971,15 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
if (room <= (int)sizeof(struct iphdr))
goto ende;
- ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room,
+ ext_skb = icmp_ext_append(net, skb_in, &icmp_param->data.icmph, room,
parm->iif);
if (ext_skb)
- icmp_param.skb = ext_skb;
+ icmp_param->skb = ext_skb;
- icmp_param.data_len = icmp_param.skb->len - icmp_param.offset;
- if (icmp_param.data_len > room)
- icmp_param.data_len = room;
- icmp_param.head_len = sizeof(struct icmphdr);
+ icmp_param->data_len = icmp_param->skb->len - icmp_param->offset;
+ if (icmp_param->data_len > room)
+ icmp_param->data_len = room;
+ icmp_param->head_len = sizeof(struct icmphdr);
/* if we don't have a source address at this point, fall back to the
* dummy address instead of sending out a packet with a source address
@@ -969,7 +990,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
trace_icmp_send(skb_in, type, code);
- icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt);
+ icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt);
if (ext_skb)
consume_skb(ext_skb);
@@ -1031,16 +1052,22 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info)
 /* Check the full IP header plus 8 bytes of protocol to
* avoid additional coding at protocol handlers.
*/
- if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) {
- __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
- return;
- }
+ if (!pskb_may_pull(skb, iph->ihl * 4 + 8))
+ goto out;
+
+ /* IPPROTO_RAW sockets are not supposed to receive anything. */
+ if (protocol == IPPROTO_RAW)
+ goto out;
raw_icmp_error(skb, protocol, info);
ipprot = rcu_dereference(inet_protos[protocol]);
if (ipprot && ipprot->err_handler)
ipprot->err_handler(skb, info);
+ return;
+
+out:
+ __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS);
}
static bool icmp_tag_validation(int proto)
@@ -1206,7 +1233,8 @@ static enum skb_drop_reason icmp_redirect(struct sk_buff *skb)
static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
{
- struct icmp_bxm icmp_param;
+ DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct net *net;
net = skb_dst_dev_net_rcu(skb);
@@ -1214,18 +1242,18 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb)
if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all))
return SKB_NOT_DROPPED_YET;
- icmp_param.data.icmph = *icmp_hdr(skb);
- icmp_param.skb = skb;
- icmp_param.offset = 0;
- icmp_param.data_len = skb->len;
- icmp_param.head_len = sizeof(struct icmphdr);
+ icmp_param->data.icmph = *icmp_hdr(skb);
+ icmp_param->skb = skb;
+ icmp_param->offset = 0;
+ icmp_param->data_len = skb->len;
+ icmp_param->head_len = sizeof(struct icmphdr);
- if (icmp_param.data.icmph.type == ICMP_ECHO)
- icmp_param.data.icmph.type = ICMP_ECHOREPLY;
- else if (!icmp_build_probe(skb, &icmp_param.data.icmph))
+ if (icmp_param->data.icmph.type == ICMP_ECHO)
+ icmp_param->data.icmph.type = ICMP_ECHOREPLY;
+ else if (!icmp_build_probe(skb, &icmp_param->data.icmph))
return SKB_NOT_DROPPED_YET;
- icmp_reply(&icmp_param, skb);
+ icmp_reply(icmp_param, skb);
return SKB_NOT_DROPPED_YET;
}
@@ -1353,7 +1381,8 @@ EXPORT_SYMBOL_GPL(icmp_build_probe);
*/
static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
{
- struct icmp_bxm icmp_param;
+ DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
/*
* Too short.
*/
@@ -1363,19 +1392,19 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb)
/*
* Fill in the current time as ms since midnight UT:
*/
- icmp_param.data.times[1] = inet_current_timestamp();
- icmp_param.data.times[2] = icmp_param.data.times[1];
-
- BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4));
-
- icmp_param.data.icmph = *icmp_hdr(skb);
- icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY;
- icmp_param.data.icmph.code = 0;
- icmp_param.skb = skb;
- icmp_param.offset = 0;
- icmp_param.data_len = 0;
- icmp_param.head_len = sizeof(struct icmphdr) + 12;
- icmp_reply(&icmp_param, skb);
+ icmp_param->data.times[1] = inet_current_timestamp();
+ icmp_param->data.times[2] = icmp_param->data.times[1];
+
+ BUG_ON(skb_copy_bits(skb, 0, &icmp_param->data.times[0], 4));
+
+ icmp_param->data.icmph = *icmp_hdr(skb);
+ icmp_param->data.icmph.type = ICMP_TIMESTAMPREPLY;
+ icmp_param->data.icmph.code = 0;
+ icmp_param->skb = skb;
+ icmp_param->offset = 0;
+ icmp_param->data_len = 0;
+ icmp_param->head_len = sizeof(struct icmphdr) + 12;
+ icmp_reply(icmp_param, skb);
return SKB_NOT_DROPPED_YET;
out_err:
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 7182f1419c2a..0adc993c211d 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -227,7 +227,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay)
static void igmp_gq_start_timer(struct in_device *in_dev)
{
- int tv = get_random_u32_below(in_dev->mr_maxdelay);
+ int tv = get_random_u32_below(READ_ONCE(in_dev->mr_maxdelay));
unsigned long exp = jiffies + tv + 2;
if (in_dev->mr_gq_running &&
@@ -1009,7 +1009,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE);
if (!max_delay)
max_delay = 1; /* can't mod w/ 0 */
- in_dev->mr_maxdelay = max_delay;
+ WRITE_ONCE(in_dev->mr_maxdelay, max_delay);
/* RFC3376, 4.1.6. QRV and 4.1.7. QQIC, when the most recently
* received value was zero, use the default or statically
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 97d57c52b9ad..5dfac6ce1110 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -20,6 +20,7 @@
#include <net/tcp_states.h>
#include <net/xfrm.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/sock_reuseport.h>
#include <net/addrconf.h>
@@ -918,6 +919,16 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,
}
EXPORT_SYMBOL(inet_reqsk_alloc);
+void __reqsk_free(struct request_sock *req)
+{
+ req->rsk_ops->destructor(req);
+ if (req->rsk_listener)
+ sock_put(req->rsk_listener);
+ kfree(req->saved_syn);
+ kmem_cache_free(req->rsk_ops->slab, req);
+}
+EXPORT_SYMBOL_GPL(__reqsk_free);
+
static struct request_sock *inet_reqsk_clone(struct request_sock *req,
struct sock *sk)
{
@@ -1103,6 +1114,8 @@ static void reqsk_timer_handler(struct timer_list *t)
(!resend ||
!tcp_rtx_synack(sk_listener, req) ||
inet_rsk(req)->acked)) {
+ if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok)
+ tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND;
if (req->num_timeout++ == 0)
atomic_dec(&queue->young);
mod_timer(&req->rsk_timer, jiffies + tcp_reqsk_timeout(req));
@@ -1196,7 +1209,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
{
struct sock *newsk = sk_clone_lock(sk, priority);
struct inet_connection_sock *newicsk;
- struct inet_request_sock *ireq;
+ const struct inet_request_sock *ireq;
struct inet_sock *newinet;
if (!newsk)
@@ -1311,6 +1324,15 @@ static int inet_ulp_can_listen(const struct sock *sk)
return 0;
}
+static void reqsk_queue_alloc(struct request_sock_queue *queue)
+{
+ queue->fastopenq.rskq_rst_head = NULL;
+ queue->fastopenq.rskq_rst_tail = NULL;
+ queue->fastopenq.qlen = 0;
+
+ queue->rskq_accept_head = NULL;
+}
+
int inet_csk_listen_start(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index ff11d3a85a36..e4790cc7b5c2 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -1300,7 +1300,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork,
return -EFAULT;
cork->fragsize = ip_sk_use_pmtu(sk) ?
- dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
+ dst4_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu);
if (!inetdev_valid_mtu(cork->fragsize))
return -ENETUNREACH;
@@ -1439,7 +1439,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk,
pmtudisc = READ_ONCE(inet->pmtudisc);
if (pmtudisc == IP_PMTUDISC_DO ||
pmtudisc == IP_PMTUDISC_PROBE ||
- (skb->len <= dst_mtu(&rt->dst) &&
+ (skb->len <= dst4_mtu(&rt->dst) &&
ip_dont_fragment(sk, &rt->dst)))
df = htons(IP_DF);
@@ -1606,7 +1606,8 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
const struct ip_reply_arg *arg,
unsigned int len, u64 transmit_time, u32 txhash)
{
- struct ip_options_data replyopts;
+ DEFINE_RAW_FLEX(struct ip_options_rcu, replyopts, opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct ipcm_cookie ipc;
struct flowi4 fl4;
struct rtable *rt = skb_rtable(skb);
@@ -1615,18 +1616,18 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
int err;
int oif;
- if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt))
+ if (__ip_options_echo(net, &replyopts->opt, skb, sopt))
return;
ipcm_init(&ipc);
ipc.addr = daddr;
ipc.sockc.transmit_time = transmit_time;
- if (replyopts.opt.opt.optlen) {
- ipc.opt = &replyopts.opt;
+ if (replyopts->opt.optlen) {
+ ipc.opt = replyopts;
- if (replyopts.opt.opt.srr)
- daddr = replyopts.opt.opt.faddr;
+ if (replyopts->opt.srr)
+ daddr = replyopts->opt.faddr;
}
oif = arg->bound_dev_if;
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index 6d9c5c20b1c4..c062d9519818 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1634,7 +1634,7 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname,
val = 0;
dst = sk_dst_get(sk);
if (dst) {
- val = dst_mtu(dst);
+ val = dst4_mtu(dst);
dst_release(dst);
}
if (!val)
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 019408d3ca2c..b1e1be00ff8b 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -679,8 +679,18 @@ static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 };
static void __init
ic_dhcp_init_options(u8 *options, struct ic_device *d)
{
- u8 mt = ((ic_servaddr == NONE)
- ? DHCPDISCOVER : DHCPREQUEST);
+ static const u8 ic_req_params[] = {
+ 1, /* Subnet mask */
+ 3, /* Default gateway */
+ 6, /* DNS server */
+ 12, /* Host name */
+ 15, /* Domain name */
+ 17, /* Boot path */
+ 26, /* MTU */
+ 40, /* NIS domain name */
+ 42, /* NTP servers */
+ };
+ u8 mt = (ic_servaddr == NONE) ? DHCPDISCOVER : DHCPREQUEST;
u8 *e = options;
int len;
@@ -705,51 +715,36 @@ ic_dhcp_init_options(u8 *options, struct ic_device *d)
e += 4;
}
- /* always? */
- {
- static const u8 ic_req_params[] = {
- 1, /* Subnet mask */
- 3, /* Default gateway */
- 6, /* DNS server */
- 12, /* Host name */
- 15, /* Domain name */
- 17, /* Boot path */
- 26, /* MTU */
- 40, /* NIS domain name */
- 42, /* NTP servers */
- };
-
- *e++ = 55; /* Parameter request list */
- *e++ = sizeof(ic_req_params);
- memcpy(e, ic_req_params, sizeof(ic_req_params));
- e += sizeof(ic_req_params);
-
- if (ic_host_name_set) {
- *e++ = 12; /* host-name */
- len = strlen(utsname()->nodename);
- *e++ = len;
- memcpy(e, utsname()->nodename, len);
- e += len;
- }
- if (*vendor_class_identifier) {
- pr_info("DHCP: sending class identifier \"%s\"\n",
- vendor_class_identifier);
- *e++ = 60; /* Class-identifier */
- len = strlen(vendor_class_identifier);
- *e++ = len;
- memcpy(e, vendor_class_identifier, len);
- e += len;
- }
- len = strlen(dhcp_client_identifier + 1);
- /* the minimum length of identifier is 2, include 1 byte type,
- * and can not be larger than the length of options
- */
- if (len >= 1 && len < 312 - (e - options) - 1) {
- *e++ = 61;
- *e++ = len + 1;
- memcpy(e, dhcp_client_identifier, len + 1);
- e += len + 1;
- }
+ *e++ = 55; /* Parameter request list */
+ *e++ = sizeof(ic_req_params);
+ memcpy(e, ic_req_params, sizeof(ic_req_params));
+ e += sizeof(ic_req_params);
+
+ if (ic_host_name_set) {
+ *e++ = 12; /* host-name */
+ len = strlen(utsname()->nodename);
+ *e++ = len;
+ memcpy(e, utsname()->nodename, len);
+ e += len;
+ }
+ if (*vendor_class_identifier) {
+ pr_info("DHCP: sending class identifier \"%s\"\n",
+ vendor_class_identifier);
+ *e++ = 60; /* Class-identifier */
+ len = strlen(vendor_class_identifier);
+ *e++ = len;
+ memcpy(e, vendor_class_identifier, len);
+ e += len;
+ }
+ len = strlen(dhcp_client_identifier + 1);
+ /* the minimum length of the identifier is 2 (including the 1-byte
+ * type) and it cannot be larger than the remaining options space
+ */
+ if (len >= 1 && len < 312 - (e - options) - 1) {
+ *e++ = 61;
+ *e++ = len + 1;
+ memcpy(e, dhcp_client_identifier, len + 1);
+ e += len + 1;
}
*e++ = 255; /* End of the list */
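The options assembled here follow the plain DHCP TLV layout of RFC 2132: one code byte, one length byte, then the payload. The parameter request list built above, for example, serializes to:

    /* 55  9  1 3 6 12 15 17 26 40 42
     *  ^  ^  \___ the 9 codes from ic_req_params ___/
     *  |  +-- length == sizeof(ic_req_params) == 9
     *  +----- option 55, "Parameter Request List"
     */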
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index ca9eaee4c2ef..131382c388e9 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -1895,7 +1895,7 @@ static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt,
return -1;
}
- if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
+ if (skb->len+encap > dst4_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
/* Do not fragment multicasts. Alas, IPv4 does not
* allow to send ICMP, so that packets will disappear
* to blackhole.
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c
index 8ddac1f595ed..82cf8a9e5ded 100644
--- a/net/ipv4/metrics.c
+++ b/net/ipv4/metrics.c
@@ -88,4 +88,4 @@ struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx,
return fib_metrics;
}
-EXPORT_SYMBOL_GPL(ip_fib_metrics_init);
+EXPORT_IPV6_MOD_GPL(ip_fib_metrics_init);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index fae4aa4a5f09..fecf6621f679 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -303,7 +303,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
goto free_nskb;
/* "Never happens" */
- if (nskb->len > dst_mtu(skb_dst(nskb)))
+ if (nskb->len > dst4_mtu(skb_dst(nskb)))
goto free_nskb;
nf_ct_attach(nskb, oldskb);
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index cfbd563498e8..ebfc5a3d3ad6 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -690,6 +690,8 @@ EXPORT_IPV6_MOD_GPL(ping_common_sendmsg);
static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
+ DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct net *net = sock_net(sk);
struct flowi4 fl4;
struct inet_sock *inet = inet_sk(sk);
@@ -697,7 +699,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
struct icmphdr user_icmph;
struct pingfakehdr pfh;
struct rtable *rt = NULL;
- struct ip_options_data opt_copy;
int free = 0;
__be32 saddr, daddr, faddr;
u8 scope;
@@ -746,9 +747,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt) {
- memcpy(&opt_copy, inet_opt,
+ memcpy(opt_copy, inet_opt,
sizeof(*inet_opt) + inet_opt->opt.optlen);
- ipc.opt = &opt_copy.opt;
+ ipc.opt = opt_copy;
}
rcu_read_unlock();
}
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 5998c4cc6f47..e20c41206e29 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -481,6 +481,8 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
+ DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct inet_sock *inet = inet_sk(sk);
struct net *net = sock_net(sk);
struct ipcm_cookie ipc;
@@ -491,7 +493,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
__be32 daddr;
__be32 saddr;
int uc_index, err;
- struct ip_options_data opt_copy;
struct raw_frag_vec rfv;
int hdrincl;
@@ -561,9 +562,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt) {
- memcpy(&opt_copy, inet_opt,
+ memcpy(opt_copy, inet_opt,
sizeof(*inet_opt) + inet_opt->opt.optlen);
- ipc.opt = &opt_copy.opt;
+ ipc.opt = opt_copy;
}
rcu_read_unlock();
}
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 11d990703d31..06aa39ae80d6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1795,8 +1795,8 @@ static void ip_handle_martian_source(struct net_device *dev,
* RFC1812 recommendation, if source is martian,
* the only hint is MAC header.
*/
- pr_warn("martian source %pI4 from %pI4, on dev %s\n",
- &daddr, &saddr, dev->name);
+ pr_warn("martian source (src=%pI4, dst=%pI4, dev=%s)\n",
+ &saddr, &daddr, dev->name);
if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
print_hex_dump(KERN_WARNING, "ll header: ",
DUMP_PREFIX_OFFSET, 16, 1,
@@ -2475,8 +2475,8 @@ martian_destination:
RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (IN_DEV_LOG_MARTIANS(in_dev))
- net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
- &daddr, &saddr, dev->name);
+ net_warn_ratelimited("martian destination (src=%pI4, dst=%pI4, dev=%s)\n",
+ &saddr, &daddr, dev->name);
#endif
goto out;
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index a1a50a5c80dc..643763bc2142 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -47,7 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
-static int tcp_ecn_mode_max = 2;
+static int tcp_ecn_mode_max = 5;
static u32 icmp_errors_extension_mask_all =
GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0);
@@ -749,7 +749,7 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
- .extra2 = SYSCTL_TWO,
+ .extra2 = SYSCTL_THREE,
},
{
.procname = "tcp_ecn_option_beacon",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index d5319ebe2452..6ce03a9adb4a 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -319,15 +319,6 @@ struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp;
EXPORT_IPV6_MOD(tcp_sockets_allocated);
/*
- * TCP splice context
- */
-struct tcp_splice_state {
- struct pipe_inode_info *pipe;
- size_t len;
- unsigned int flags;
-};
-
-/*
* Pressure flag: try to collapse.
* Technical note: it is used by multiple contexts non atomically.
* All the __sk_mem_schedule() is of this nature: accounting
@@ -501,6 +492,9 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
struct sk_buff *skb = tcp_write_queue_tail(sk);
u32 tsflags = sockc->tsflags;
+ if (unlikely(!skb))
+ skb = skb_rb_last(&sk->tcp_rtx_queue);
+
if (tsflags && skb) {
struct skb_shared_info *shinfo = skb_shinfo(skb);
struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
@@ -517,6 +511,19 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc)
bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB);
}
+/* @wake is one when sk_stream_write_space() calls us.
+ * This sends EPOLLOUT only if notsent_bytes is below half the limit.
+ * This mimics the strategy used in sock_def_write_space().
+ */
+bool tcp_stream_memory_free(const struct sock *sk, int wake)
+{
+ const struct tcp_sock *tp = tcp_sk(sk);
+ u32 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
+
+ return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
+}
+EXPORT_SYMBOL(tcp_stream_memory_free);
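The (notsent_bytes << wake) shift is what produces the half-limit hysteresis mentioned in the comment: when called from sk_stream_write_space() (wake == 1), the backlog is doubled before the comparison, so EPOLLOUT is signalled only once it drops below half of notsent_lowat. A worked case, assuming a lowat of 128 KB:

    /* tcp_notsent_lowat(tp) == 131072
     *
     * wake == 0 (plain memory-free check):
     *     writable iff notsent_bytes < 131072
     * wake == 1 (wakeup path):
     *     writable iff (notsent_bytes << 1) < 131072
     *              iff  notsent_bytes < 65536
     */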
+
static bool tcp_stream_is_readable(struct sock *sk, int target)
{
if (tcp_epollin_ready(sk, target))
@@ -775,8 +782,8 @@ void tcp_push(struct sock *sk, int flags, int mss_now,
__tcp_push_pending_frames(sk, mss_now, nonagle);
}
-static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
- unsigned int offset, size_t len)
+int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb,
+ unsigned int offset, size_t len)
{
struct tcp_splice_state *tss = rd_desc->arg.data;
int ret;
@@ -902,6 +909,33 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
}
EXPORT_IPV6_MOD(tcp_splice_read);
+/* We allow exceeding the memory limits for FIN packets to expedite
+ * connection tear down and (memory) recovery.
+ * Otherwise tcp_send_fin() could be tempted to either delay the FIN
+ * or even be forced to close the flow without any FIN.
+ * In general, we want to allow one skb per socket to avoid hangs
+ * with edge-triggered epoll().
+ */
+void sk_forced_mem_schedule(struct sock *sk, int size)
+{
+ int delta, amt;
+
+ delta = size - sk->sk_forward_alloc;
+ if (delta <= 0)
+ return;
+
+ amt = sk_mem_pages(delta);
+ sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
+
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
+
+ if (sk->sk_bypass_prot_mem)
+ return;
+
+ sk_memory_allocated_add(sk, amt);
+}
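The forced charge rounds the shortfall up to whole pages. A worked case, assuming a 4 KB PAGE_SIZE:

    /* sk->sk_forward_alloc == 1000, size == 5000:
     *     delta = 5000 - 1000 = 4000 bytes short
     *     amt   = sk_mem_pages(4000) = 1 page
     *     sk_forward_alloc_add(sk, 1 << PAGE_SHIFT)  ->  +4096 bytes
     * The memcg charge uses __GFP_NOFAIL, so the FIN skb is never
     * refused for accounting reasons.
     */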
+
struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp,
bool force_schedule)
{
@@ -1074,6 +1108,24 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
return err;
}
+/* If a gap is detected between sends, mark the socket application-limited. */
+void tcp_rate_check_app_limited(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (/* We have less than one packet to send. */
+ tp->write_seq - tp->snd_nxt < tp->mss_cache &&
+ /* Nothing in sending host's qdisc queues or NIC tx queue. */
+ sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
+ /* We are not limited by CWND. */
+ tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
+ /* All lost packets have been retransmitted. */
+ tp->lost_out <= tp->retrans_out)
+ tp->app_limited =
+ (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
+}
+EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
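The value stored in tp->app_limited marks the end of the application-limited "bubble": the delivered count at which everything in flight at the limited moment will have been acked. Rate samples spanning that range carry the app-limited flag, and tcp_rate_gen() (added to tcp_input.c below) clears the mark once tp->delivered moves past it. A worked case with illustrative numbers:

    /* At the app-limited moment:
     *     tp->delivered             == 100
     *     tcp_packets_in_flight(tp) == 10
     * so tp->app_limited = 110. The mark persists until
     * after(tp->delivered, 110), i.e. until packet 111 is delivered.
     */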
+
int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
{
struct net_devmem_dmabuf_binding *binding = NULL;
@@ -3418,6 +3470,7 @@ int tcp_disconnect(struct sock *sk, int flags)
tcp_accecn_init_counters(tp);
tp->prev_ecnfield = 0;
tp->accecn_opt_tstamp = 0;
+ tp->pkts_acked_ewma = 0;
if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
@@ -4320,6 +4373,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (tp->rto_stamp)
info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
+ if (tcp_ecn_disabled(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_DISABLED;
+ else if (tcp_ecn_mode_rfc3168(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_RFC3168;
+ else if (tcp_ecn_mode_accecn(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_ACCECN;
+ else if (tcp_ecn_mode_pending(tp))
+ info->tcpi_ecn_mode = TCPI_ECN_MODE_PENDING;
info->tcpi_accecn_fail_mode = tp->accecn_fail_mode;
info->tcpi_accecn_opt_seen = tp->saw_accecn_opt;
info->tcpi_received_ce = tp->received_ce;
@@ -5191,6 +5252,7 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, pkts_acked_ewma);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index df758adbb445..e9f6c77e0631 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -16,6 +16,7 @@
#include <linux/gfp.h>
#include <linux/jhash.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <trace/events/tcp.h>
static DEFINE_SPINLOCK(tcp_cong_list_lock);
@@ -227,7 +228,7 @@ void tcp_assign_congestion_control(struct sock *sk)
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
if (ca->flags & TCP_CONG_NEEDS_ECN)
- INET_ECN_xmit(sk);
+ INET_ECN_xmit_ect_1_negotiation(sk);
else
INET_ECN_dontxmit(sk);
}
@@ -257,7 +258,7 @@ static void tcp_reinit_congestion_control(struct sock *sk,
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
if (ca->flags & TCP_CONG_NEEDS_ECN)
- INET_ECN_xmit(sk);
+ INET_ECN_xmit_ect_1_negotiation(sk);
else
INET_ECN_dontxmit(sk);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 7d945a527daf..b30090cff3cf 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -5,6 +5,92 @@
#include <net/tcp.h>
#include <net/busy_poll.h>
+/*
+ * This function is called to set a Fast Open socket's "fastopen_rsk" field
+ * to NULL when a TFO socket no longer needs to access the request_sock.
+ * This happens only after 3WHS has been either completed or aborted (e.g.,
+ * RST is received).
+ *
+ * Before TFO, a child socket is created only after 3WHS is completed,
+ * hence it never needs to access the request_sock. things get a lot more
+ * complex with TFO. A child socket, accepted or not, has to access its
+ * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
+ * until 3WHS is either completed or aborted. Afterwards the req will stay
+ * until either the child socket is accepted, or in the rare case when the
+ * listener is closed before the child is accepted.
+ *
+ * In short, a request socket is only freed after BOTH 3WHS has completed
+ * (or aborted) and the child socket has been accepted (or listener closed).
+ * When a child socket is accepted, its corresponding req->sk is set to
+ * NULL since it's no longer needed. More importantly, "req->sk == NULL"
+ * will be used by the code below to determine if a child socket has been
+ * accepted or not, and the check is protected by the fastopenq->lock
+ * described below.
+ *
+ * Note that fastopen_rsk is only accessed from the child socket's context
+ * with its socket lock held. But a request_sock (req) can be accessed by
+ * both its child socket through fastopen_rsk, and a listener socket through
+ * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
+ * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
+ * only in the rare case when both the listener and the child locks are held,
+ * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
+ * The lock also protects other fields such as fastopenq->qlen, which is
+ * decremented by this function when fastopen_rsk is no longer needed.
+ *
+ * Note that another solution was to simply use the existing socket lock
+ * from the listener. But first socket lock is difficult to use. It is not
+ * a simple spin lock - one must consider sock_owned_by_user() and arrange
+ * to use sk_add_backlog() stuff. But what really makes it infeasible is the
+ * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
+ * acquire a child's lock while holding listener's socket lock.
+ *
+ * This function also sets "treq->tfo_listener" to false.
+ * treq->tfo_listener is used by the listener so it is protected by the
+ * fastopenq->lock in this function.
+ */
+void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
+ bool reset)
+{
+ struct sock *lsk = req->rsk_listener;
+ struct fastopen_queue *fastopenq;
+
+ fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
+
+ RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL);
+ spin_lock_bh(&fastopenq->lock);
+ fastopenq->qlen--;
+ tcp_rsk(req)->tfo_listener = false;
+ if (req->sk) /* the child socket hasn't been accepted yet */
+ goto out;
+
+ if (!reset || lsk->sk_state != TCP_LISTEN) {
+ /* If the listener has been closed don't bother with the
+ * special RST handling below.
+ */
+ spin_unlock_bh(&fastopenq->lock);
+ reqsk_put(req);
+ return;
+ }
+ /* Wait for 60 secs before removing a req that has triggered RST.
+ * This is a simple defense against TFO spoofing attack - by
+ * counting the req against fastopen.max_qlen, and disabling
+ * TFO when the qlen exceeds max_qlen.
+ *
+ * For more details see CoNext'11 "TCP Fast Open" paper.
+ */
+ req->rsk_timer.expires = jiffies + 60*HZ;
+ if (fastopenq->rskq_rst_head == NULL)
+ fastopenq->rskq_rst_head = req;
+ else
+ fastopenq->rskq_rst_tail->dl_next = req;
+
+ req->dl_next = NULL;
+ fastopenq->rskq_rst_tail = req;
+ fastopenq->qlen++;
+out:
+ spin_unlock_bh(&fastopenq->lock);
+}
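The tail of the function is the anti-spoofing backoff described in the comment block: a req that drew an RST is parked on the singly linked rskq_rst_head/rskq_rst_tail list and still counted in fastopenq->qlen, so a burst of spoofed TFO SYNs pushes qlen past fastopen.max_qlen and temporarily disables TFO. Schematically, after two RST-triggered removals:

    /* fastopenq->rskq_rst_head -> req1 -> req2 -> NULL
     *                                      ^
     *                 fastopenq->rskq_rst_tail
     *
     * rsk_timer.expires serves as a lazy timestamp here: entries older
     * than ~60 s are reaped on later TFO attempts, dropping qlen.
     */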
+
void tcp_fastopen_init_key_once(struct net *net)
{
u8 key[TCP_FASTOPEN_KEY_LENGTH];
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 198f8a0d37be..e7b41abb82aa 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -488,6 +488,10 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
tcp_count_delivered_ce(tp, delivered);
}
+#define PKTS_ACKED_WEIGHT 6
+#define PKTS_ACKED_PREC 6
+#define ACK_COMP_THRESH 4
+
/* Returns the ECN CE delta */
static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
u32 delivered_pkts, u32 delivered_bytes,
@@ -499,6 +503,7 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
u32 delta, safe_delta, d_ceb;
bool opt_deltas_valid;
u32 corrected_ace;
+ u32 ewma;
/* Reordered ACK or uncertain due to lack of data to send and ts */
if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS)))
@@ -507,6 +512,18 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
opt_deltas_valid = tcp_accecn_process_option(tp, skb,
delivered_bytes, flag);
+ if (delivered_pkts) {
+ if (!tp->pkts_acked_ewma) {
+ ewma = delivered_pkts << PKTS_ACKED_PREC;
+ } else {
+ ewma = tp->pkts_acked_ewma;
+ ewma = (((ewma << PKTS_ACKED_WEIGHT) - ewma) +
+ (delivered_pkts << PKTS_ACKED_PREC)) >>
+ PKTS_ACKED_WEIGHT;
+ }
+ tp->pkts_acked_ewma = min_t(u32, ewma, 0xFFFFU);
+ }
+
if (!(flag & FLAG_SLOWPATH)) {
/* AccECN counter might overflow on large ACKs */
if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
@@ -555,7 +572,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
if (d_ceb <
safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT)
return delta;
- }
+ } else if (tp->pkts_acked_ewma > (ACK_COMP_THRESH << PKTS_ACKED_PREC))
+ return delta;
return safe_delta;
}
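The averaging above is Q6 fixed point (PKTS_ACKED_PREC fractional bits) with a 1/64 gain (PKTS_ACKED_WEIGHT), i.e. ewma' = (63 * ewma + 64 * delivered_pkts) / 64. One worked step, assuming a current average of 2 packets per ACK and a 10-packet stretch ACK:

    /* ewma == 2 << 6 == 128 (2.0 in Q6), delivered_pkts == 10:
     *
     *     ewma = ((128 << 6) - 128 + (10 << 6)) >> 6
     *          = (8192 - 128 + 640) >> 6
     *          = 8704 >> 6 = 136        -> 136 / 64 == 2.125 pkts/ACK
     *
     * The comparison threshold ACK_COMP_THRESH << PKTS_ACKED_PREC is
     * 4 << 6 == 256, i.e. ACK compression is assumed once the average
     * exceeds 4 packets per ACK.
     */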
@@ -1558,6 +1576,38 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
return in_sack;
}
+/* Record the most recently (re)sent time among the (s)acked packets
+ * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
+ * draft-cheng-tcpm-rack-00.txt
+ */
+static void tcp_rack_advance(struct tcp_sock *tp, u8 sacked,
+ u32 end_seq, u64 xmit_time)
+{
+ u32 rtt_us;
+
+ rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
+ if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
+ /* If the sacked packet was retransmitted, it's ambiguous
+ * whether the retransmission or the original (or the prior
+ * retransmission) was sacked.
+ *
+ * If the original is lost, there is no ambiguity. Otherwise
+ * we assume the original can be delayed up to aRTT + min_rtt.
+ * The aRTT term is bounded by the fast recovery or timeout,
+ * so it's at least one RTT (i.e., retransmission is at least
+ * an RTT later).
+ */
+ return;
+ }
+ tp->rack.advanced = 1;
+ tp->rack.rtt_us = rtt_us;
+ if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp,
+ end_seq, tp->rack.end_seq)) {
+ tp->rack.mstamp = xmit_time;
+ tp->rack.end_seq = end_seq;
+ }
+}
+
/* Mark the given newly-SACKed range as such, adjusting counters and hints. */
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
@@ -1637,6 +1687,160 @@ static u8 tcp_sacktag_one(struct sock *sk,
return sacked;
}
+/* The bandwidth estimator estimates the rate at which the network
+ * can currently deliver outbound data packets for this flow. At a high
+ * level, it operates by taking a delivery rate sample for each ACK.
+ *
+ * A rate sample records the rate at which the network delivered packets
+ * for this flow, calculated over the time interval between the transmission
+ * of a data packet and the acknowledgment of that packet.
+ *
+ * Specifically, over the interval between each transmit and corresponding ACK,
+ * the estimator generates a delivery rate sample. Typically it uses the rate
+ * at which packets were acknowledged. However, the approach of using only the
+ * acknowledgment rate faces a challenge under the prevalent ACK decimation or
+ * compression: packets can temporarily appear to be delivered much quicker
+ * than the bottleneck rate. Since it is physically impossible to do that in a
+ * sustained fashion, when the estimator notices that the ACK rate is faster
+ * than the transmit rate, it uses the latter:
+ *
+ * send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
+ * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
+ * bw = min(send_rate, ack_rate)
+ *
+ * Notice the estimator essentially estimates the goodput, not always the
+ * network bottleneck link rate when the sending or receiving is limited by
+ * other factors like applications or receiver window limits. The estimator
+ * deliberately avoids using the inter-packet spacing approach because that
+ * approach requires a large number of samples and sophisticated filtering.
+ *
+ * TCP flows can often be application-limited in request/response workloads.
+ * The estimator marks a bandwidth sample as application-limited if there
+ * was some moment during the sampled window of packets when there was no data
+ * ready to send in the write queue.
+ */
+
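To make the min(send_rate, ack_rate) rule concrete, a worked sample with illustrative numbers: 10 packets delivered over a window whose send phase spanned 8 ms and whose ACK phase spanned 10 ms.

    /* snd_us = 8000, ack_us = 10000
     *     interval_us = max(8000, 10000) = 10000
     *     bw = 10 pkts / 10 ms = 1000 pkts/s
     *
     * Had the same ACKs arrived compressed within 2 ms, using the ACK
     * phase alone would claim 5000 pkts/s; taking the longer (send)
     * phase caps the sample at 10 pkts / 8 ms = 1250 pkts/s.
     */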
+/* Update the connection delivery information and generate a rate sample. */
+static void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+ bool is_sack_reneg, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 snd_us, ack_us;
+
+ /* Clear app limited if bubble is acked and gone. */
+ if (tp->app_limited && after(tp->delivered, tp->app_limited))
+ tp->app_limited = 0;
+
+ /* TODO: there are multiple places throughout tcp_ack() to get
+ * current time. Refactor the code using a new "tcp_acktag_state"
+ * to carry current time, flags, stats like "tcp_sacktag_state".
+ */
+ if (delivered)
+ tp->delivered_mstamp = tp->tcp_mstamp;
+
+ rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
+ rs->losses = lost; /* freshly marked lost */
+ /* Return an invalid sample if no timing information is available or
+ * in recovery from loss with SACK reneging. Rate samples taken during
+ * a SACK reneging event may overestimate bw by including packets that
+ * were SACKed before the reneg.
+ */
+ if (!rs->prior_mstamp || is_sack_reneg) {
+ rs->delivered = -1;
+ rs->interval_us = -1;
+ return;
+ }
+ rs->delivered = tp->delivered - rs->prior_delivered;
+
+ rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
+ /* delivered_ce occupies less than 32 bits in the skb control block */
+ rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;
+
+ /* Model sending data and receiving ACKs as separate pipeline phases
+ * for a window. Usually the ACK phase is longer, but with ACK
+ * compression the send phase can be longer. To be safe we use the
+ * longer phase.
+ */
+ snd_us = rs->interval_us; /* send phase */
+ ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
+ rs->prior_mstamp); /* ack phase */
+ rs->interval_us = max(snd_us, ack_us);
+
+ /* Record both segment send and ack receive intervals */
+ rs->snd_interval_us = snd_us;
+ rs->rcv_interval_us = ack_us;
+
+ /* Normally we expect interval_us >= min-rtt.
+ * Note that rate may still be over-estimated when a spuriously
+ * retransmitted skb was first (s)acked because "interval_us"
+ * is under-estimated (up to an RTT). However, continuously
+ * measuring the delivery rate during loss recovery is crucial
+ * for connections suffering heavy or prolonged losses.
+ */
+ if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
+ if (!rs->is_retrans)
+ pr_debug("tcp rate: %ld %d %u %u %u\n",
+ rs->interval_us, rs->delivered,
+ inet_csk(sk)->icsk_ca_state,
+ tp->rx_opt.sack_ok, tcp_min_rtt(tp));
+ rs->interval_us = -1;
+ return;
+ }
+
+ /* Record the last non-app-limited or the highest app-limited bw */
+ if (!rs->is_app_limited ||
+ ((u64)rs->delivered * tp->rate_interval_us >=
+ (u64)tp->rate_delivered * rs->interval_us)) {
+ tp->rate_delivered = rs->delivered;
+ tp->rate_interval_us = rs->interval_us;
+ tp->rate_app_limited = rs->is_app_limited;
+ }
+}
+
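
The final comparison in tcp_rate_gen() above keeps the higher of two rates
without dividing: delivered/interval >= rate_delivered/rate_interval is
checked by cross-multiplying in 64 bits, which avoids both a division and
overflow of the 32-bit operands. A standalone sketch of that idiom
(names illustrative):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    /* Is delivered1/interval_us1 >= delivered2/interval_us2?  Cross-multiply
     * in 64 bits instead of dividing: exact, cheap, and overflow-safe for
     * 32-bit counters. */
    static bool rate_ge(uint32_t delivered1, uint32_t interval_us1,
                        uint32_t delivered2, uint32_t interval_us2)
    {
        return (uint64_t)delivered1 * interval_us2 >=
               (uint64_t)delivered2 * interval_us1;
    }

    int main(void)
    {
        /* 100 pkts / 10 ms (10 pkts/ms) beats 90 pkts / 10 ms. */
        assert(rate_ge(100, 10000, 90, 10000));
        assert(!rate_ge(90, 10000, 100, 10000));
        return 0;
    }
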
+/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
+ * delivery information when the skb was last transmitted.
+ *
+ * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
+ * called multiple times. We favor the information from the most recently
+ * sent skb, i.e., the skb with the most recently sent time and the highest
+ * sequence.
+ */
+static void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+ struct rate_sample *rs)
+{
+ struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u64 tx_tstamp;
+
+ if (!scb->tx.delivered_mstamp)
+ return;
+
+ tx_tstamp = tcp_skb_timestamp_us(skb);
+ if (!rs->prior_delivered ||
+ tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
+ scb->end_seq, rs->last_end_seq)) {
+ rs->prior_delivered_ce = scb->tx.delivered_ce;
+ rs->prior_delivered = scb->tx.delivered;
+ rs->prior_mstamp = scb->tx.delivered_mstamp;
+ rs->is_app_limited = scb->tx.is_app_limited;
+ rs->is_retrans = scb->sacked & TCPCB_RETRANS;
+ rs->last_end_seq = scb->end_seq;
+
+ /* Record send time of most recently ACKed packet: */
+ tp->first_tx_mstamp = tx_tstamp;
+ /* Find the duration of the "send phase" of this window: */
+ rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
+ scb->tx.first_tx_mstamp);
+
+ }
+ /* Clear the skb's delivery timestamp once it is SACKed so that it
+ * is not used again when it is later cumulatively ACKed. For ACKed
+ * packets we don't need to reset it since the skb will be freed soon.
+ */
+ if (scb->sacked & TCPCB_SACKED_ACKED)
+ scb->tx.delivered_mstamp = 0;
+}
+
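
When one ACK covers several skbs, the block above keeps the sample from the
most recently sent one. Since timestamps have finite resolution, two skbs can
share a send time, so the sequence number breaks ties; a sketch of that
comparison (the helper below is an illustrative stand-in, not the kernel's
tcp_skb_sent_after() definition):

    #include <stdint.h>
    #include <stdbool.h>

    /* "Was (t1, seq1) sent after (t2, seq2)?"  Later timestamp wins;
     * on a timestamp tie, the higher (later) sequence number wins.
     * The signed subtraction handles 32-bit sequence wraparound. */
    static bool sent_after(uint64_t t1, uint64_t t2,
                           uint32_t seq1, uint32_t seq2)
    {
        return t1 > t2 || (t1 == t2 && (int32_t)(seq1 - seq2) > 0);
    }

    int main(void)
    {
        return sent_after(5, 5, 2000, 1000) ? 0 : 1;  /* tie: seq wins */
    }
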
/* Shift newly-SACKed bytes from this skb to the immediately previous
* already-SACKed sk_buff. Mark the newly-SACKed bytes as such.
*/
@@ -3995,6 +4199,49 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered,
return delivered;
}
+/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
+ *
+ * If a DSACK is received that seems like it may have been due to reordering
+ * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded
+ * by srtt), since there is a possibility that the spurious retransmission
+ * was due to a reordering delay longer than reo_wnd.
+ *
+ * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
+ * successful recoveries (this accounts for a full DSACK-based loss
+ * recovery undo). After that, reset it to the default (min_rtt/4).
+ *
+ * reo_wnd is incremented at most once per RTT, so that the new DSACK
+ * we are reacting to is (approximately) due to a spurious retransmission
+ * sent after the last reo_wnd update.
+ *
+ * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
+ * absolute value to account for change in rtt.
+ */
+static void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
+ TCP_RACK_STATIC_REO_WND) ||
+ !rs->prior_delivered)
+ return;
+
+ /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
+ if (before(rs->prior_delivered, tp->rack.last_delivered))
+ tp->rack.dsack_seen = 0;
+
+ /* Adjust the reo_wnd if update is pending */
+ if (tp->rack.dsack_seen) {
+ tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
+ tp->rack.reo_wnd_steps + 1);
+ tp->rack.dsack_seen = 0;
+ tp->rack.last_delivered = tp->delivered;
+ tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
+ } else if (!tp->rack.reo_wnd_persist) {
+ tp->rack.reo_wnd_steps = 1;
+ }
+}
+
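
The reordering window itself is derived from the step counter maintained
above: per the comment, each step is worth min_rtt/4 and the result is
capped by the smoothed RTT. A hedged sketch of that derivation (this mirrors
the intent of the kernel's helper, not its exact code):

    #include <stdint.h>

    /* Turn a step count into a reordering window in microseconds:
     * steps of min_rtt/4, upper-bounded by srtt so a large step count
     * cannot delay loss marking by more than one smoothed RTT. */
    static uint32_t reo_wnd_us(uint32_t min_rtt_us, uint32_t srtt_us,
                               uint8_t reo_wnd_steps)
    {
        uint32_t wnd = (min_rtt_us >> 2) * reo_wnd_steps;

        return wnd < srtt_us ? wnd : srtt_us;
    }

    int main(void)
    {
        /* min_rtt 40 ms, srtt 100 ms, 5 steps: 5 * 10 ms = 50 ms. */
        return reo_wnd_us(40000, 100000, 5) == 50000 ? 0 : 1;
    }
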
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
@@ -4129,7 +4376,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_in_ack_event(sk, flag);
- if (tp->tlp_high_seq)
+ if (unlikely(tp->tlp_high_seq))
tcp_process_tlp_ack(sk, ack, flag);
if (tcp_ack_is_dubious(sk, flag)) {
@@ -4179,7 +4426,7 @@ no_queue:
*/
tcp_ack_probe(sk);
- if (tp->tlp_high_seq)
+ if (unlikely(tp->tlp_high_seq))
tcp_process_tlp_ack(sk, ack, flag);
return 1;
@@ -4799,8 +5046,11 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq)
tcp_sack_extend(tp->duplicate_sack, seq, end_seq);
}
-static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
+static void tcp_rcv_spurious_retrans(struct sock *sk,
+ const struct sk_buff *skb)
{
+ struct tcp_sock *tp = tcp_sk(sk);
+
/* When the ACK path fails or drops most ACKs, the sender would
* timeout and spuriously retransmit the same segment repeatedly.
* If it seems our ACKs are not reaching the other side,
@@ -4820,6 +5070,14 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb)
/* Save last flowlabel after a spurious retrans. */
tcp_save_lrcv_flowlabel(sk, skb);
#endif
+ /* Check DSACK info to detect that the previous ACK carrying the
+ * AccECN option was lost after the second retransmission, and then
+ * stop sending the AccECN option in all subsequent ACKs.
+ */
+ if (tcp_ecn_mode_accecn(tp) &&
+ tp->accecn_opt_sent_w_dsack &&
+ TCP_SKB_CB(skb)->seq == tp->duplicate_sack[0].start_seq)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_SEND);
}
static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb)
@@ -5527,25 +5785,6 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb,
return next;
}
-/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
-void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
-{
- struct rb_node **p = &root->rb_node;
- struct rb_node *parent = NULL;
- struct sk_buff *skb1;
-
- while (*p) {
- parent = *p;
- skb1 = rb_to_skb(parent);
- if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
- p = &parent->rb_left;
- else
- p = &parent->rb_right;
- }
- rb_link_node(&skb->rbnode, parent, p);
- rb_insert_color(&skb->rbnode, root);
-}
-
/* Collapse contiguous sequence of skbs head..tail with
* sequence numbers start..end.
*
@@ -5879,16 +6118,11 @@ static void tcp_new_space(struct sock *sk)
* small enough that tcp_stream_memory_free() decides it
* is time to generate EPOLLOUT.
*/
-void tcp_check_space(struct sock *sk)
+void __tcp_check_space(struct sock *sk)
{
- /* pairs with tcp_poll() */
- smp_mb();
- if (sk->sk_socket &&
- test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
- tcp_new_space(sk);
- if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
- tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
- }
+ tcp_new_space(sk);
+ if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
+ tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
}
static inline void tcp_data_snd_check(struct sock *sk)
@@ -6222,6 +6456,8 @@ step1:
if (th->syn) {
if (tcp_ecn_mode_accecn(tp)) {
accecn_reflector = true;
+ tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
+ INET_ECN_MASK;
if (tp->rx_opt.accecn &&
tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
@@ -6843,7 +7079,7 @@ consume:
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->max_window = tp->snd_wnd;
- tcp_ecn_rcv_syn(tp, th, skb);
+ tcp_ecn_rcv_syn(sk, th, skb);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -7248,7 +7484,8 @@ static void tcp_ecn_create_request(struct request_sock *req,
u32 ecn_ok_dst;
if (tcp_accecn_syn_requested(th) &&
- READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) {
+ (READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3 ||
+ tcp_ca_needs_accecn(listen_sk))) {
inet_rsk(req)->ecn_ok = 1;
tcp_rsk(req)->accecn_ok = 1;
tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index f8a9596e8f4d..6264fc0b2be5 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -374,7 +374,7 @@ void tcp_v4_mtu_reduced(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
struct dst_entry *dst;
- u32 mtu;
+ u32 mtu, dmtu;
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
return;
@@ -386,15 +386,14 @@ void tcp_v4_mtu_reduced(struct sock *sk)
/* Something is about to be wrong... Remember soft error
* for the case, if this connection will not be able to recover.
*/
- if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
+ dmtu = dst4_mtu(dst);
+ if (mtu < dmtu && ip_dont_fragment(sk, dst))
WRITE_ONCE(sk->sk_err_soft, EMSGSIZE);
- mtu = dst_mtu(dst);
-
if (inet->pmtudisc != IP_PMTUDISC_DONT &&
ip_sk_accept_pmtu(sk) &&
- inet_csk(sk)->icsk_pmtu_cookie > mtu) {
- tcp_sync_mss(sk, mtu);
+ inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
+ tcp_sync_mss(sk, dmtu);
/* Resend the TCP packet because it's
* clear that the old packet has been
@@ -1760,7 +1759,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
tcp_ca_openreq_child(newsk, dst);
- tcp_sync_mss(newsk, dst_mtu(dst));
+ tcp_sync_mss(newsk, dst4_mtu(dst));
newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
tcp_initialize_rcv_mss(newsk);
@@ -2110,14 +2109,6 @@ no_coalesce:
}
EXPORT_IPV6_MOD(tcp_add_backlog);
-int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason)
-{
- struct tcphdr *th = (struct tcphdr *)skb->data;
-
- return sk_filter_trim_cap(sk, skb, th->doff * 4, reason);
-}
-EXPORT_IPV6_MOD(tcp_filter);
-
static void tcp_v4_restore_cb(struct sk_buff *skb)
{
memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4,
@@ -3418,20 +3409,6 @@ void tcp4_proc_exit(void)
}
#endif /* CONFIG_PROC_FS */
-/* @wake is one when sk_stream_write_space() calls us.
- * This sends EPOLLOUT only if notsent_bytes is half the limit.
- * This mimics the strategy used in sock_def_write_space().
- */
-bool tcp_stream_memory_free(const struct sock *sk, int wake)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
- u32 notsent_bytes = READ_ONCE(tp->write_seq) -
- READ_ONCE(tp->snd_nxt);
-
- return (notsent_bytes << wake) < tcp_notsent_lowat(tp);
-}
-EXPORT_SYMBOL(tcp_stream_memory_free);
-
struct proto tcp_prot = {
.name = "TCP",
.owner = THIS_MODULE,
@@ -3474,6 +3451,8 @@ struct proto tcp_prot = {
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp_sock),
+ .freeptr_offset = offsetof(struct tcp_sock,
+ inet_conn.icsk_inet.sk.sk_freeptr),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index bd5462154f97..ec128865f5c0 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -481,13 +481,18 @@ static void tcp_ecn_openreq_child(struct sock *sk,
tp->syn_ect_snt = treq->syn_ect_snt;
tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
tp->saw_accecn_opt = treq->saw_accecn_opt;
+ if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_SEND);
+ if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV);
tp->prev_ecnfield = treq->syn_ect_rcv;
tp->accecn_opt_demand = 1;
tcp_ecn_received_counters_payload(sk, skb);
} else {
- tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
- TCP_ECN_MODE_RFC3168 :
- TCP_ECN_DISABLED);
+ if (inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk))
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
+ else
+ tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
}
}
@@ -748,16 +753,28 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
*/
if (!tcp_oow_rate_limited(sock_net(sk), skb,
LINUX_MIB_TCPACKSKIPPEDSYNRECV,
- &tcp_rsk(req)->last_oow_ack_time) &&
-
- !tcp_rtx_synack(sk, req)) {
- unsigned long expires = jiffies;
-
- expires += tcp_reqsk_timeout(req);
- if (!fastopen)
- mod_timer_pending(&req->rsk_timer, expires);
- else
- req->rsk_timer.expires = expires;
+ &tcp_rsk(req)->last_oow_ack_time)) {
+ if (tcp_rsk(req)->accecn_ok) {
+ u8 ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
+ INET_ECN_MASK;
+
+ tcp_rsk(req)->syn_ect_rcv = ect_rcv;
+ if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0)
+ tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_RECV;
+ }
+ if (!tcp_rtx_synack(sk, req)) {
+ unsigned long expires = jiffies;
+
+ if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok)
+ tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND;
+
+ expires += tcp_reqsk_timeout(req);
+ if (!fastopen)
+ mod_timer_pending(&req->rsk_timer,
+ expires);
+ else
+ req->rsk_timer.expires = expires;
+ }
}
return NULL;
}
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 942a948f1a31..3b1fdcd3cb29 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -304,8 +304,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb,
goto out_check_final;
th2 = tcp_hdr(p);
- flush = (__force int)(flags & TCP_FLAG_CWR);
- flush |= (__force int)((flags ^ tcp_flag_word(th2)) &
+ flush = (__force int)((flags ^ tcp_flag_word(th2)) &
~(TCP_FLAG_FIN | TCP_FLAG_PSH));
flush |= (__force int)(th->ack_seq ^ th2->ack_seq);
for (i = sizeof(*th); i < thlen; i += 4)
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 479afb714bdf..326b58ff1118 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -66,6 +66,25 @@ void tcp_mstamp_refresh(struct tcp_sock *tp)
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
+/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */
+void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct sk_buff *skb1;
+
+ while (*p) {
+ parent = *p;
+ skb1 = rb_to_skb(parent);
+ if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq))
+ p = &parent->rb_left;
+ else
+ p = &parent->rb_right;
+ }
+ rb_link_node(&skb->rbnode, parent, p);
+ rb_insert_color(&skb->rbnode, root);
+}
+
/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
@@ -334,8 +353,11 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
return;
if (tcp_ecn_mode_accecn(tp)) {
- if (!tcp_accecn_ace_fail_recv(tp))
+ if (!tcp_accecn_ace_fail_recv(tp) &&
+ !tcp_accecn_ace_fail_send(tp))
INET_ECN_xmit(sk);
+ else
+ INET_ECN_dontxmit(sk);
tcp_accecn_set_ace(tp, skb, th);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN;
} else {
@@ -712,9 +734,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
if (tp) {
tp->accecn_minlen = 0;
tp->accecn_opt_tstamp = tp->tcp_mstamp;
+ tp->accecn_opt_sent_w_dsack = tp->rx_opt.dsack;
if (tp->accecn_opt_demand)
tp->accecn_opt_demand--;
}
+ } else if (tp) {
+ tp->accecn_opt_sent_w_dsack = 0;
}
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
@@ -1106,7 +1131,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
if (treq->accecn_ok &&
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
- req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) {
+ synack_type != TCP_SYNACK_RETRANS && remaining >= TCPOLEN_ACCECN_BASE) {
opts->use_synack_ecn_bytes = 1;
remaining -= tcp_options_fit_accecn(opts, 0, remaining);
}
@@ -1186,7 +1211,9 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
if (tcp_ecn_mode_accecn(tp)) {
int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option);
- if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) &&
+ if (ecn_opt && tp->saw_accecn_opt &&
+ (ecn_opt >= TCP_ACCECN_OPTION_PERSIST ||
+ !tcp_accecn_opt_fail_send(tp)) &&
(ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand ||
tcp_accecn_option_beacon_check(sk))) {
opts->use_synack_ecn_bytes = 0;
@@ -1432,6 +1459,41 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb,
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
}
+/* Snapshot the current delivery information in the skb, to generate
+ * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
+ */
+static void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ /* In general we need to start delivery rate samples from the
+ * time we received the most recent ACK, to ensure we include
+ * the full time the network needs to deliver all in-flight
+ * packets. If there are no packets in flight yet, then we
+ * know that any ACKs after now indicate that the network was
+ * able to deliver those packets completely in the sampling
+ * interval between now and the next ACK.
+ *
+ * Note that we use packets_out instead of tcp_packets_in_flight(tp)
+ * because the latter is a guess based on RTO and loss-marking
+ * heuristics. We don't want spurious RTOs or loss markings to cause
+ * a spuriously small time interval, causing a spuriously high
+ * bandwidth estimate.
+ */
+ if (!tp->packets_out) {
+ u64 tstamp_us = tcp_skb_timestamp_us(skb);
+
+ tp->first_tx_mstamp = tstamp_us;
+ tp->delivered_mstamp = tstamp_us;
+ }
+
+ TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
+ TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
+ TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
+ TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
+}
+
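
The snapshot pattern above generalizes beyond TCP: stamp every transmitted
packet with the sender's delivery state so the matching ACK can later bound a
rate sample over the whole in-flight window. A self-contained sketch with
illustrative field names (not the kernel's structures):

    #include <stdint.h>

    struct conn_rate_state {
        uint64_t first_tx_us;    /* start of this window's send phase */
        uint64_t delivered_us;   /* time of the most recent delivery  */
        uint32_t delivered;      /* packets delivered so far          */
        uint32_t packets_out;    /* packets currently in flight       */
        int      app_limited;    /* sender had no data queued?        */
    };

    struct pkt_snapshot {
        uint64_t first_tx_us;
        uint64_t delivered_us;
        uint32_t delivered;
        int      is_app_limited;
    };

    /* Called for each packet at transmit time. With nothing in flight,
     * restart the sampling clock at "now" so an idle gap before this
     * send cannot inflate the next sample's interval. */
    static void snapshot_on_send(struct conn_rate_state *c,
                                 struct pkt_snapshot *p, uint64_t now_us)
    {
        if (!c->packets_out) {
            c->first_tx_us = now_us;
            c->delivered_us = now_us;
        }
        p->first_tx_us = c->first_tx_us;
        p->delivered_us = c->delivered_us;
        p->delivered = c->delivered;
        p->is_app_limited = c->app_limited;
    }

    int main(void)
    {
        struct conn_rate_state c = { 0 };
        struct pkt_snapshot p;

        snapshot_on_send(&c, &p, 1000);  /* idle start: clock resets */
        return p.first_tx_us == 1000 ? 0 : 1;
    }
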
INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl));
INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb));
@@ -1530,7 +1592,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
*/
skb->pfmemalloc = 0;
- skb_push(skb, tcp_header_size);
+ __skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
skb_orphan(skb);
@@ -3571,12 +3633,15 @@ start:
tcp_retrans_try_collapse(sk, skb, avail_wnd);
}
- /* RFC3168, section 6.1.1.1. ECN fallback
- * As AccECN uses the same SYN flags (+ AE), this check covers both
- * cases.
- */
- if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
- tcp_ecn_clear_syn(sk, skb);
+ if (!tcp_ecn_mode_pending(tp) || icsk->icsk_retransmits > 1) {
+ /* RFC3168, section 6.1.1.1. ECN fallback
+ * As AccECN uses the same SYN flags (+ AE), this check
+ * covers both cases.
+ */
+ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) ==
+ TCPHDR_SYN_ECN)
+ tcp_ecn_clear_syn(sk, skb);
+ }
/* Update global and local TCP statistics. */
segs = tcp_skb_pcount(skb);
@@ -3732,33 +3797,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
inet_csk(sk)->icsk_rto, true);
}
-/* We allow to exceed memory limits for FIN packets to expedite
- * connection tear down and (memory) recovery.
- * Otherwise tcp_send_fin() could be tempted to either delay FIN
- * or even be forced to close flow without any FIN.
- * In general, we want to allow one skb per socket to avoid hangs
- * with edge trigger epoll()
- */
-void sk_forced_mem_schedule(struct sock *sk, int size)
-{
- int delta, amt;
-
- delta = size - sk->sk_forward_alloc;
- if (delta <= 0)
- return;
-
- amt = sk_mem_pages(delta);
- sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
-
- if (mem_cgroup_sk_enabled(sk))
- mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
-
- if (sk->sk_bypass_prot_mem)
- return;
-
- sk_memory_allocated_add(sk, amt);
-}
-
/* Send a FIN. The caller locks the socket for us.
* We should try to send a FIN packet really hard, but eventually give up.
*/
@@ -3918,6 +3956,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
switch (synack_type) {
case TCP_SYNACK_NORMAL:
+ case TCP_SYNACK_RETRANS:
skb_set_owner_edemux(skb, req_to_sk(req));
break;
case TCP_SYNACK_COOKIE:
@@ -4000,7 +4039,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
memset(th, 0, sizeof(struct tcphdr));
th->syn = 1;
th->ack = 1;
- tcp_ecn_make_synack(req, th);
+ tcp_ecn_make_synack(req, th, synack_type);
th->source = htons(ireq->ir_num);
th->dest = ireq->ir_rmt_port;
skb->mark = ireq->ir_mark;
@@ -4603,7 +4642,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
/* Paired with WRITE_ONCE() in sock_setsockopt() */
if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED)
WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash());
- res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL,
+ res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_RETRANS,
NULL);
if (!res) {
TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS);
diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c
deleted file mode 100644
index a8f6d9d06f2e..000000000000
--- a/net/ipv4/tcp_rate.c
+++ /dev/null
@@ -1,209 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-only
-#include <net/tcp.h>
-
-/* The bandwidth estimator estimates the rate at which the network
- * can currently deliver outbound data packets for this flow. At a high
- * level, it operates by taking a delivery rate sample for each ACK.
- *
- * A rate sample records the rate at which the network delivered packets
- * for this flow, calculated over the time interval between the transmission
- * of a data packet and the acknowledgment of that packet.
- *
- * Specifically, over the interval between each transmit and corresponding ACK,
- * the estimator generates a delivery rate sample. Typically it uses the rate
- * at which packets were acknowledged. However, the approach of using only the
- * acknowledgment rate faces a challenge under the prevalent ACK decimation or
- * compression: packets can temporarily appear to be delivered much quicker
- * than the bottleneck rate. Since it is physically impossible to do that in a
- * sustained fashion, when the estimator notices that the ACK rate is faster
- * than the transmit rate, it uses the latter:
- *
- * send_rate = #pkts_delivered/(last_snd_time - first_snd_time)
- * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time)
- * bw = min(send_rate, ack_rate)
- *
- * Notice the estimator essentially estimates the goodput, not always the
- * network bottleneck link rate when the sending or receiving is limited by
- * other factors like applications or receiver window limits. The estimator
- * deliberately avoids using the inter-packet spacing approach because that
- * approach requires a large number of samples and sophisticated filtering.
- *
- * TCP flows can often be application-limited in request/response workloads.
- * The estimator marks a bandwidth sample as application-limited if there
- * was some moment during the sampled window of packets when there was no data
- * ready to send in the write queue.
- */
-
-/* Snapshot the current delivery information in the skb, to generate
- * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered().
- */
-void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- /* In general we need to start delivery rate samples from the
- * time we received the most recent ACK, to ensure we include
- * the full time the network needs to deliver all in-flight
- * packets. If there are no packets in flight yet, then we
- * know that any ACKs after now indicate that the network was
- * able to deliver those packets completely in the sampling
- * interval between now and the next ACK.
- *
- * Note that we use packets_out instead of tcp_packets_in_flight(tp)
- * because the latter is a guess based on RTO and loss-marking
- * heuristics. We don't want spurious RTOs or loss markings to cause
- * a spuriously small time interval, causing a spuriously high
- * bandwidth estimate.
- */
- if (!tp->packets_out) {
- u64 tstamp_us = tcp_skb_timestamp_us(skb);
-
- tp->first_tx_mstamp = tstamp_us;
- tp->delivered_mstamp = tstamp_us;
- }
-
- TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp;
- TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp;
- TCP_SKB_CB(skb)->tx.delivered = tp->delivered;
- TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce;
- TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0;
-}
-
-/* When an skb is sacked or acked, we fill in the rate sample with the (prior)
- * delivery information when the skb was last transmitted.
- *
- * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is
- * called multiple times. We favor the information from the most recently
- * sent skb, i.e., the skb with the most recently sent time and the highest
- * sequence.
- */
-void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
- struct rate_sample *rs)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
- u64 tx_tstamp;
-
- if (!scb->tx.delivered_mstamp)
- return;
-
- tx_tstamp = tcp_skb_timestamp_us(skb);
- if (!rs->prior_delivered ||
- tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp,
- scb->end_seq, rs->last_end_seq)) {
- rs->prior_delivered_ce = scb->tx.delivered_ce;
- rs->prior_delivered = scb->tx.delivered;
- rs->prior_mstamp = scb->tx.delivered_mstamp;
- rs->is_app_limited = scb->tx.is_app_limited;
- rs->is_retrans = scb->sacked & TCPCB_RETRANS;
- rs->last_end_seq = scb->end_seq;
-
- /* Record send time of most recently ACKed packet: */
- tp->first_tx_mstamp = tx_tstamp;
- /* Find the duration of the "send phase" of this window: */
- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp,
- scb->tx.first_tx_mstamp);
-
- }
- /* Mark off the skb delivered once it's sacked to avoid being
- * used again when it's cumulatively acked. For acked packets
- * we don't need to reset since it'll be freed soon.
- */
- if (scb->sacked & TCPCB_SACKED_ACKED)
- scb->tx.delivered_mstamp = 0;
-}
-
-/* Update the connection delivery information and generate a rate sample. */
-void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
- bool is_sack_reneg, struct rate_sample *rs)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- u32 snd_us, ack_us;
-
- /* Clear app limited if bubble is acked and gone. */
- if (tp->app_limited && after(tp->delivered, tp->app_limited))
- tp->app_limited = 0;
-
- /* TODO: there are multiple places throughout tcp_ack() to get
- * current time. Refactor the code using a new "tcp_acktag_state"
- * to carry current time, flags, stats like "tcp_sacktag_state".
- */
- if (delivered)
- tp->delivered_mstamp = tp->tcp_mstamp;
-
- rs->acked_sacked = delivered; /* freshly ACKed or SACKed */
- rs->losses = lost; /* freshly marked lost */
- /* Return an invalid sample if no timing information is available or
- * in recovery from loss with SACK reneging. Rate samples taken during
- * a SACK reneging event may overestimate bw by including packets that
- * were SACKed before the reneg.
- */
- if (!rs->prior_mstamp || is_sack_reneg) {
- rs->delivered = -1;
- rs->interval_us = -1;
- return;
- }
- rs->delivered = tp->delivered - rs->prior_delivered;
-
- rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce;
- /* delivered_ce occupies less than 32 bits in the skb control block */
- rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK;
-
- /* Model sending data and receiving ACKs as separate pipeline phases
- * for a window. Usually the ACK phase is longer, but with ACK
- * compression the send phase can be longer. To be safe we use the
- * longer phase.
- */
- snd_us = rs->interval_us; /* send phase */
- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp,
- rs->prior_mstamp); /* ack phase */
- rs->interval_us = max(snd_us, ack_us);
-
- /* Record both segment send and ack receive intervals */
- rs->snd_interval_us = snd_us;
- rs->rcv_interval_us = ack_us;
-
- /* Normally we expect interval_us >= min-rtt.
- * Note that rate may still be over-estimated when a spuriously
- * retransmistted skb was first (s)acked because "interval_us"
- * is under-estimated (up to an RTT). However continuously
- * measuring the delivery rate during loss recovery is crucial
- * for connections suffer heavy or prolonged losses.
- */
- if (unlikely(rs->interval_us < tcp_min_rtt(tp))) {
- if (!rs->is_retrans)
- pr_debug("tcp rate: %ld %d %u %u %u\n",
- rs->interval_us, rs->delivered,
- inet_csk(sk)->icsk_ca_state,
- tp->rx_opt.sack_ok, tcp_min_rtt(tp));
- rs->interval_us = -1;
- return;
- }
-
- /* Record the last non-app-limited or the highest app-limited bw */
- if (!rs->is_app_limited ||
- ((u64)rs->delivered * tp->rate_interval_us >=
- (u64)tp->rate_delivered * rs->interval_us)) {
- tp->rate_delivered = rs->delivered;
- tp->rate_interval_us = rs->interval_us;
- tp->rate_app_limited = rs->is_app_limited;
- }
-}
-
-/* If a gap is detected between sends, mark the socket application-limited. */
-void tcp_rate_check_app_limited(struct sock *sk)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if (/* We have less than one packet to send. */
- tp->write_seq - tp->snd_nxt < tp->mss_cache &&
- /* Nothing in sending host's qdisc queues or NIC tx queue. */
- sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) &&
- /* We are not limited by CWND. */
- tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) &&
- /* All lost packets have been retransmitted. */
- tp->lost_out <= tp->retrans_out)
- tp->app_limited =
- (tp->delivered + tcp_packets_in_flight(tp)) ? : 1;
-}
-EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited);
diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c
index c52fd3254b6e..139646751073 100644
--- a/net/ipv4/tcp_recovery.c
+++ b/net/ipv4/tcp_recovery.c
@@ -111,38 +111,6 @@ bool tcp_rack_mark_lost(struct sock *sk)
return !!timeout;
}
-/* Record the most recently (re)sent time among the (s)acked packets
- * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from
- * draft-cheng-tcpm-rack-00.txt
- */
-void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq,
- u64 xmit_time)
-{
- u32 rtt_us;
-
- rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time);
- if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) {
- /* If the sacked packet was retransmitted, it's ambiguous
- * whether the retransmission or the original (or the prior
- * retransmission) was sacked.
- *
- * If the original is lost, there is no ambiguity. Otherwise
- * we assume the original can be delayed up to aRTT + min_rtt.
- * the aRTT term is bounded by the fast recovery or timeout,
- * so it's at least one RTT (i.e., retransmission is at least
- * an RTT later).
- */
- return;
- }
- tp->rack.advanced = 1;
- tp->rack.rtt_us = rtt_us;
- if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp,
- end_seq, tp->rack.end_seq)) {
- tp->rack.mstamp = xmit_time;
- tp->rack.end_seq = end_seq;
- }
-}
-
/* We have waited long enough to accommodate reordering. Mark the expired
* packets lost and retransmit them.
*/
@@ -166,49 +134,6 @@ void tcp_rack_reo_timeout(struct sock *sk)
tcp_rearm_rto(sk);
}
-/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries.
- *
- * If a DSACK is received that seems like it may have been due to reordering
- * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded
- * by srtt), since there is possibility that spurious retransmission was
- * due to reordering delay longer than reo_wnd.
- *
- * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16)
- * no. of successful recoveries (accounts for full DSACK-based loss
- * recovery undo). After that, reset it to default (min_rtt/4).
- *
- * At max, reo_wnd is incremented only once per rtt. So that the new
- * DSACK on which we are reacting, is due to the spurious retx (approx)
- * after the reo_wnd has been updated last time.
- *
- * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than
- * absolute value to account for change in rtt.
- */
-void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs)
-{
- struct tcp_sock *tp = tcp_sk(sk);
-
- if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) &
- TCP_RACK_STATIC_REO_WND) ||
- !rs->prior_delivered)
- return;
-
- /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */
- if (before(rs->prior_delivered, tp->rack.last_delivered))
- tp->rack.dsack_seen = 0;
-
- /* Adjust the reo_wnd if update is pending */
- if (tp->rack.dsack_seen) {
- tp->rack.reo_wnd_steps = min_t(u32, 0xFF,
- tp->rack.reo_wnd_steps + 1);
- tp->rack.dsack_seen = 0;
- tp->rack.last_delivered = tp->delivered;
- tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH;
- } else if (!tp->rack.reo_wnd_persist) {
- tp->rack.reo_wnd_steps = 1;
- }
-}
-
/* RFC6582 NewReno recovery for non-SACK connection. It simply retransmits
* the next unacked packet upon receiving
* a) three or more DUPACKs to start the fast recovery
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 160080c9021d..5a14a53a3c9e 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,6 +22,7 @@
#include <linux/module.h>
#include <linux/gfp.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/rstreason.h>
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
@@ -479,6 +480,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)
* it's not good to give up too easily.
*/
tcp_rtx_synack(sk, req);
+ if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok)
+ tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND;
req->num_timeout++;
tcp_update_rto_stats(sk);
if (!tp->retrans_stamp)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index ee63af0ef42c..b96e47f1c8a2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1193,7 +1193,7 @@ csum_partial:
send:
err = ip_send_skb(sock_net(sk), skb);
- if (err) {
+ if (unlikely(err)) {
if (err == -ENOBUFS &&
!inet_test_bit(RECVERR, sk)) {
UDP_INC_STATS(sock_net(sk),
@@ -1269,6 +1269,8 @@ EXPORT_IPV6_MOD_GPL(udp_cmsg_send);
int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
+ DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data,
+ IP_OPTIONS_DATA_FIXED_SIZE);
struct inet_sock *inet = inet_sk(sk);
struct udp_sock *up = udp_sk(sk);
DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name);
@@ -1286,7 +1288,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE;
int (*getfrag)(void *, char *, int, int, int, struct sk_buff *);
struct sk_buff *skb;
- struct ip_options_data opt_copy;
int uc_index;
if (len > 0xFFFF)
@@ -1368,9 +1369,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
rcu_read_lock();
inet_opt = rcu_dereference(inet->inet_opt);
if (inet_opt) {
- memcpy(&opt_copy, inet_opt,
+ memcpy(opt_copy, inet_opt,
sizeof(*inet_opt) + inet_opt->opt.optlen);
- ipc.opt = &opt_copy.opt;
+ ipc.opt = opt_copy;
}
rcu_read_unlock();
}
@@ -1793,14 +1794,32 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
}
if (unlikely(to_drop)) {
+ int err_ipv4 = 0;
+ int err_ipv6 = 0;
+
for (nb = 0; to_drop != NULL; nb++) {
skb = to_drop;
+ if (skb->protocol == htons(ETH_P_IP))
+ err_ipv4++;
+ else
+ err_ipv6++;
to_drop = skb->next;
skb_mark_not_on_list(skb);
- /* TODO: update SNMP values. */
sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM);
}
numa_drop_add(&udp_sk(sk)->drop_counters, nb);
+ if (err_ipv4 > 0) {
+ SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_MEMERRORS,
+ err_ipv4);
+ SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_INERRORS,
+ err_ipv4);
+ }
+ if (err_ipv6 > 0) {
+ SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_MEMERRORS,
+ err_ipv6);
+ SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_INERRORS,
+ err_ipv6);
+ }
}
atomic_sub(total_size, &udp_prod_queue->rmem_alloc);
@@ -2429,7 +2448,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
/*
* UDP-Lite specific tests, ignored on UDP sockets
*/
- if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+ if (unlikely(udp_test_bit(UDPLITE_RECV_CC, sk) &&
+ UDP_SKB_CB(skb)->partial_cov)) {
u16 pcrlen = READ_ONCE(up->pcrlen);
/*
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 589456bd8b5f..6b1654c1ad4a 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -483,11 +483,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
struct sock *sk = gso_skb->sk;
unsigned int sum_truesize = 0;
struct sk_buff *segs, *seg;
+ __be16 newlen, msslen;
struct udphdr *uh;
unsigned int mss;
bool copy_dtor;
__sum16 check;
- __be16 newlen;
int ret = 0;
mss = skb_shinfo(gso_skb)->gso_size;
@@ -556,6 +556,8 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
return segs;
}
+ msslen = htons(sizeof(*uh) + mss);
+
/* GSO partial and frag_list segmentation only requires splitting
* the frame into an MSS multiple and possibly a remainder, both
* cases return a GSO skb. So update the mss now.
@@ -585,7 +587,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb,
if (!seg->next)
break;
- uh->len = newlen;
+ uh->len = msslen;
uh->check = check;
if (seg->ip_summed == CHECKSUM_PARTIAL)
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index d283c59df4c1..0492f1a0b491 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -45,7 +45,7 @@ obj-$(CONFIG_IPV6_FOU) += fou6.o
obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o
obj-$(CONFIG_INET) += output_core.o protocol.o \
- ip6_offload.o tcpv6_offload.o exthdrs_offload.o
+ ip6_offload.o exthdrs_offload.o
obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 27ab9d7adc64..6db9cf9e2a50 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1013,7 +1013,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp)
list_for_each(p, &idev->addr_list) {
struct inet6_ifaddr *ifa
= list_entry(p, struct inet6_ifaddr, if_list);
- if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr))
+ if (ifp_scope > ipv6_addr_src_scope(&ifa->addr))
break;
}
@@ -3339,11 +3339,10 @@ static int ipv6_generate_stable_address(struct in6_addr *address,
const struct inet6_dev *idev)
{
static DEFINE_SPINLOCK(lock);
- static __u32 digest[SHA1_DIGEST_WORDS];
- static __u32 workspace[SHA1_WORKSPACE_WORDS];
+ static struct sha1_ctx sha_ctx;
static union {
- char __data[SHA1_BLOCK_SIZE];
+ u8 __data[SHA1_BLOCK_SIZE];
struct {
struct in6_addr secret;
__be32 prefix[2];
@@ -3368,20 +3367,26 @@ static int ipv6_generate_stable_address(struct in6_addr *address,
retry:
spin_lock_bh(&lock);
- sha1_init_raw(digest);
+ sha1_init(&sha_ctx);
+
memset(&data, 0, sizeof(data));
- memset(workspace, 0, sizeof(workspace));
memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len);
data.prefix[0] = address->s6_addr32[0];
data.prefix[1] = address->s6_addr32[1];
data.secret = secret;
data.dad_count = dad_count;
- sha1_transform(digest, data.__data, workspace);
+ sha1_update(&sha_ctx, data.__data, sizeof(data));
+ /*
+ * Note that the SHA-1 finalization is omitted here, and the digest is
+ * pulled directly from the internal SHA-1 state (making it incompatible
+ * with standard SHA-1). Unusual, but technically okay since the data
+ * length is fixed and is a multiple of the SHA-1 block size.
+ */
temp = *address;
- temp.s6_addr32[2] = (__force __be32)digest[0];
- temp.s6_addr32[3] = (__force __be32)digest[1];
+ temp.s6_addr32[2] = (__force __be32)sha_ctx.state.h[0];
+ temp.s6_addr32[3] = (__force __be32)sha_ctx.state.h[1];
spin_unlock_bh(&lock);
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index b705751eb73c..31ba677d0442 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -224,8 +224,8 @@ lookup_protocol:
inet6_set_bit(MC6_LOOP, sk);
inet6_set_bit(MC6_ALL, sk);
np->pmtudisc = IPV6_PMTUDISC_WANT;
- inet6_assign_bit(REPFLOW, sk, net->ipv6.sysctl.flowlabel_reflect &
- FLOWLABEL_REFLECT_ESTABLISHED);
+ inet6_assign_bit(REPFLOW, sk, READ_ONCE(net->ipv6.sysctl.flowlabel_reflect) &
+ FLOWLABEL_REFLECT_ESTABLISHED);
sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash);
@@ -824,45 +824,42 @@ EXPORT_SYMBOL(inet6_unregister_protosw);
int inet6_sk_rebuild_header(struct sock *sk)
{
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct inet_sock *inet = inet_sk(sk);
+ struct in6_addr *final_p;
struct dst_entry *dst;
+ struct flowi6 *fl6;
dst = __sk_dst_check(sk, np->dst_cookie);
+ if (dst)
+ return 0;
+
+ fl6 = &inet->cork.fl.u.ip6;
+ memset(fl6, 0, sizeof(*fl6));
+ fl6->flowi6_proto = sk->sk_protocol;
+ fl6->daddr = sk->sk_v6_daddr;
+ fl6->saddr = np->saddr;
+ fl6->flowlabel = np->flow_label;
+ fl6->flowi6_oif = sk->sk_bound_dev_if;
+ fl6->flowi6_mark = sk->sk_mark;
+ fl6->fl6_dport = inet->inet_dport;
+ fl6->fl6_sport = inet->inet_sport;
+ fl6->flowi6_uid = sk_uid(sk);
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
- if (!dst) {
- struct inet_sock *inet = inet_sk(sk);
- struct in6_addr *final_p, final;
- struct flowi6 fl6;
-
- memset(&fl6, 0, sizeof(fl6));
- fl6.flowi6_proto = sk->sk_protocol;
- fl6.daddr = sk->sk_v6_daddr;
- fl6.saddr = np->saddr;
- fl6.flowlabel = np->flow_label;
- fl6.flowi6_oif = sk->sk_bound_dev_if;
- fl6.flowi6_mark = sk->sk_mark;
- fl6.fl6_dport = inet->inet_dport;
- fl6.fl6_sport = inet->inet_sport;
- fl6.flowi6_uid = sk_uid(sk);
- security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
-
- rcu_read_lock();
- final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt),
- &final);
- rcu_read_unlock();
-
- dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
- if (IS_ERR(dst)) {
- sk->sk_route_caps = 0;
- WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst));
- return PTR_ERR(dst);
- }
+ rcu_read_lock();
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &np->final);
+ rcu_read_unlock();
- ip6_dst_store(sk, dst, false, false);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
+ if (IS_ERR(dst)) {
+ sk->sk_route_caps = 0;
+ WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst));
+ return PTR_ERR(dst);
}
+ ip6_dst_store(sk, dst, false, false);
return 0;
}
-EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header);
bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb,
const struct inet6_skb_parm *opt)
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 83e03176819c..c564b68a0562 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -72,12 +72,12 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6,
int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
{
struct ip6_flowlabel *flowlabel = NULL;
- struct in6_addr *final_p, final;
- struct ipv6_txoptions *opt;
- struct dst_entry *dst;
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
- struct flowi6 fl6;
+ struct ipv6_txoptions *opt;
+ struct in6_addr *final_p;
+ struct dst_entry *dst;
+ struct flowi6 *fl6;
int err = 0;
if (inet6_test_bit(SNDFLOW, sk) &&
@@ -86,14 +86,15 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
if (IS_ERR(flowlabel))
return -EINVAL;
}
- ip6_datagram_flow_key_init(&fl6, sk);
+ fl6 = &inet_sk(sk)->cork.fl.u.ip6;
+ ip6_datagram_flow_key_init(fl6, sk);
rcu_read_lock();
opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt);
- final_p = fl6_update_dst(&fl6, opt, &final);
+ final_p = fl6_update_dst(fl6, opt, &np->final);
rcu_read_unlock();
- dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto out;
@@ -101,17 +102,17 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr)
if (fix_sk_saddr) {
if (ipv6_addr_any(&np->saddr))
- np->saddr = fl6.saddr;
+ np->saddr = fl6->saddr;
if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) {
- sk->sk_v6_rcv_saddr = fl6.saddr;
+ sk->sk_v6_rcv_saddr = fl6->saddr;
inet->inet_rcv_saddr = LOOPBACK4_IPV6;
if (sk->sk_prot->rehash)
sk->sk_prot->rehash(sk);
}
}
- ip6_sk_dst_store_flow(sk, dst, &fl6);
+ ip6_sk_dst_store_flow(sk, dst, fl6);
out:
fl6_sock_release(flowlabel);
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index a23eb8734e15..209fdf1b1aa9 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -314,7 +314,7 @@ fail_and_free:
}
extlen = (skb_transport_header(skb)[1] + 1) << 3;
- if (extlen > net->ipv6.sysctl.max_dst_opts_len)
+ if (extlen > READ_ONCE(net->ipv6.sysctl.max_dst_opts_len))
goto fail_and_free;
opt->lastopt = opt->dst1 = skb_network_header_len(skb);
@@ -322,7 +322,8 @@ fail_and_free:
dstbuf = opt->dst1;
#endif
- if (ip6_parse_tlv(false, skb, net->ipv6.sysctl.max_dst_opts_cnt)) {
+ if (ip6_parse_tlv(false, skb,
+ READ_ONCE(net->ipv6.sysctl.max_dst_opts_cnt))) {
skb->transport_header += extlen;
opt = IP6CB(skb);
#if IS_ENABLED(CONFIG_IPV6_MIP6)
@@ -1049,11 +1050,12 @@ fail_and_free:
}
extlen = (skb_transport_header(skb)[1] + 1) << 3;
- if (extlen > net->ipv6.sysctl.max_hbh_opts_len)
+ if (extlen > READ_ONCE(net->ipv6.sysctl.max_hbh_opts_len))
goto fail_and_free;
opt->flags |= IP6SKB_HOPBYHOP;
- if (ip6_parse_tlv(true, skb, net->ipv6.sysctl.max_hbh_opts_cnt)) {
+ if (ip6_parse_tlv(true, skb,
+ READ_ONCE(net->ipv6.sysctl.max_hbh_opts_cnt))) {
skb->transport_header += extlen;
opt = IP6CB(skb);
opt->nhoff = sizeof(struct ipv6hdr);
@@ -1072,9 +1074,9 @@ fail_and_free:
* for headers.
*/
-static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto,
- struct ipv6_rt_hdr *opt,
- struct in6_addr **addr_p, struct in6_addr *saddr)
+static u8 ipv6_push_rthdr0(struct sk_buff *skb, u8 proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p, struct in6_addr *saddr)
{
struct rt0_hdr *phdr, *ihdr;
int hops;
@@ -1093,13 +1095,13 @@ static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto,
phdr->addr[hops - 1] = **addr_p;
*addr_p = ihdr->addr;
- phdr->rt_hdr.nexthdr = *proto;
- *proto = NEXTHDR_ROUTING;
+ phdr->rt_hdr.nexthdr = proto;
+ return NEXTHDR_ROUTING;
}
-static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto,
- struct ipv6_rt_hdr *opt,
- struct in6_addr **addr_p, struct in6_addr *saddr)
+static u8 ipv6_push_rthdr4(struct sk_buff *skb, u8 proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p, struct in6_addr *saddr)
{
struct ipv6_sr_hdr *sr_phdr, *sr_ihdr;
int plen, hops;
@@ -1142,58 +1144,61 @@ static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto,
}
#endif
- sr_phdr->nexthdr = *proto;
- *proto = NEXTHDR_ROUTING;
+ sr_phdr->nexthdr = proto;
+ return NEXTHDR_ROUTING;
}
-static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto,
- struct ipv6_rt_hdr *opt,
- struct in6_addr **addr_p, struct in6_addr *saddr)
+static u8 ipv6_push_rthdr(struct sk_buff *skb, u8 proto,
+ struct ipv6_rt_hdr *opt,
+ struct in6_addr **addr_p, struct in6_addr *saddr)
{
switch (opt->type) {
case IPV6_SRCRT_TYPE_0:
case IPV6_SRCRT_STRICT:
case IPV6_SRCRT_TYPE_2:
- ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr);
+ proto = ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr);
break;
case IPV6_SRCRT_TYPE_4:
- ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr);
+ proto = ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr);
break;
default:
break;
}
+ return proto;
}
-static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt)
+static u8 ipv6_push_exthdr(struct sk_buff *skb, u8 proto, u8 type, struct ipv6_opt_hdr *opt)
{
struct ipv6_opt_hdr *h = skb_push(skb, ipv6_optlen(opt));
memcpy(h, opt, ipv6_optlen(opt));
- h->nexthdr = *proto;
- *proto = type;
+ h->nexthdr = proto;
+ return type;
}
-void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
- u8 *proto,
- struct in6_addr **daddr, struct in6_addr *saddr)
+u8 ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt,
+ u8 proto,
+ struct in6_addr **daddr, struct in6_addr *saddr)
{
if (opt->srcrt) {
- ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr);
+ proto = ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr);
/*
* IPV6_RTHDRDSTOPTS is ignored
* unless IPV6_RTHDR is set (RFC3542).
*/
if (opt->dst0opt)
- ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt);
+ proto = ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt);
}
if (opt->hopopt)
- ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt);
+ proto = ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt);
+ return proto;
}
-void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto)
+u8 ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 proto)
{
if (opt->dst1opt)
- ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt);
+ proto = ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt);
+ return proto;
}
EXPORT_SYMBOL(ipv6_push_frag_opts);
@@ -1334,21 +1339,21 @@ struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space,
EXPORT_SYMBOL_GPL(__ipv6_fixup_options);
/**
- * fl6_update_dst - update flowi destination address with info given
+ * __fl6_update_dst - update flowi destination address with info given
* by srcrt option, if any.
*
* @fl6: flowi6 for which daddr is to be updated
* @opt: struct ipv6_txoptions in which to look for srcrt opt
* @orig: copy of original daddr address if modified
*
- * Returns NULL if no txoptions or no srcrt, otherwise returns orig
+ * Return: NULL if no srcrt or invalid srcrt type, otherwise returns orig
* and initial value of fl6->daddr set in orig
*/
-struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
- const struct ipv6_txoptions *opt,
- struct in6_addr *orig)
+struct in6_addr *__fl6_update_dst(struct flowi6 *fl6,
+ const struct ipv6_txoptions *opt,
+ struct in6_addr *orig)
{
- if (!opt || !opt->srcrt)
+ if (!opt->srcrt)
return NULL;
*orig = fl6->daddr;
@@ -1372,4 +1377,4 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
return orig;
}
-EXPORT_SYMBOL_GPL(fl6_update_dst);
+EXPORT_SYMBOL_GPL(__fl6_update_dst);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 9d37e7711bc2..375ecd779fda 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -958,7 +958,8 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb)
tmp_hdr.icmp6_type = type;
memset(&fl6, 0, sizeof(fl6));
- if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES)
+ if (READ_ONCE(net->ipv6.sysctl.flowlabel_reflect) &
+ FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES)
fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb));
fl6.flowi6_proto = IPPROTO_ICMPV6;
@@ -1066,6 +1067,12 @@ enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type,
if (reason != SKB_NOT_DROPPED_YET)
goto out;
+ if (nexthdr == IPPROTO_RAW) {
+ /* Add a more specific reason later ? */
+ reason = SKB_DROP_REASON_NOT_SPECIFIED;
+ goto out;
+ }
+
/* BUGGG_FUTURE: we should try to parse exthdrs in this packet.
Without this we will not be able, e.g., to make source-routed
pmtu discovery.
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index ea5cf3fdfdd6..11fc2f7de2fe 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -25,14 +25,14 @@
#include <net/sock_reuseport.h>
struct dst_entry *inet6_csk_route_req(const struct sock *sk,
+ struct dst_entry *dst,
struct flowi6 *fl6,
const struct request_sock *req,
u8 proto)
{
- struct inet_request_sock *ireq = inet_rsk(req);
+ const struct inet_request_sock *ireq = inet_rsk(req);
const struct ipv6_pinfo *np = inet6_sk(sk);
struct in6_addr *final_p, final;
- struct dst_entry *dst;
memset(fl6, 0, sizeof(*fl6));
fl6->flowi6_proto = proto;
@@ -48,25 +48,20 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
fl6->flowi6_uid = sk_uid(sk);
security_req_classify_flow(req, flowi6_to_flowi_common(fl6));
- dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
- if (IS_ERR(dst))
- return NULL;
-
+ if (!dst) {
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
+ if (IS_ERR(dst))
+ return NULL;
+ }
return dst;
}
-static inline
-struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
-{
- return __sk_dst_check(sk, cookie);
-}
-
static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
struct flowi6 *fl6)
{
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
- struct in6_addr *final_p, final;
+ struct in6_addr *final_p;
struct dst_entry *dst;
memset(fl6, 0, sizeof(*fl6));
@@ -83,41 +78,41 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
rcu_read_lock();
- final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &np->final);
rcu_read_unlock();
- dst = __inet6_csk_dst_check(sk, np->dst_cookie);
- if (!dst) {
- dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
+ dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
+
+ if (!IS_ERR(dst))
+ ip6_dst_store(sk, dst, false, false);
- if (!IS_ERR(dst))
- ip6_dst_store(sk, dst, false, false);
- }
return dst;
}
int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused)
{
+ struct flowi6 *fl6 = &inet_sk(sk)->cork.fl.u.ip6;
struct ipv6_pinfo *np = inet6_sk(sk);
- struct flowi6 fl6;
struct dst_entry *dst;
int res;
- dst = inet6_csk_route_socket(sk, &fl6);
- if (IS_ERR(dst)) {
- WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst));
- sk->sk_route_caps = 0;
- kfree_skb(skb);
- return PTR_ERR(dst);
+ dst = __sk_dst_check(sk, np->dst_cookie);
+ if (unlikely(!dst)) {
+ dst = inet6_csk_route_socket(sk, fl6);
+ if (IS_ERR(dst)) {
+ WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst));
+ sk->sk_route_caps = 0;
+ kfree_skb(skb);
+ return PTR_ERR(dst);
+ }
+ /* Restore final destination back after routing done */
+ fl6->daddr = sk->sk_v6_daddr;
}
rcu_read_lock();
skb_dst_set_noref(skb, dst);
- /* Restore final destination back after routing done */
- fl6.daddr = sk->sk_v6_daddr;
-
- res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
+ res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
np->tclass, READ_ONCE(sk->sk_priority));
rcu_read_unlock();
return res;
@@ -126,13 +121,15 @@ EXPORT_SYMBOL_GPL(inet6_csk_xmit);
struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu)
{
- struct flowi6 fl6;
- struct dst_entry *dst = inet6_csk_route_socket(sk, &fl6);
+ struct flowi6 *fl6 = &inet_sk(sk)->cork.fl.u.ip6;
+ struct dst_entry *dst;
+
+ dst = inet6_csk_route_socket(sk, fl6);
if (IS_ERR(dst))
return NULL;
dst->ops->update_pmtu(dst, sk, NULL, mtu, true);
- dst = inet6_csk_route_socket(sk, &fl6);
+ dst = inet6_csk_route_socket(sk, fl6);
return IS_ERR(dst) ? NULL : dst;
}
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index c6439e30e892..9880d608392b 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1375,14 +1375,14 @@ static void fib6_start_gc(struct net *net, struct fib6_info *rt)
if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
(rt->fib6_flags & RTF_EXPIRES))
mod_timer(&net->ipv6.ip6_fib_timer,
- jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
+ jiffies + READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval));
}
void fib6_force_start_gc(struct net *net)
{
if (!timer_pending(&net->ipv6.ip6_fib_timer))
mod_timer(&net->ipv6.ip6_fib_timer,
- jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
+ jiffies + READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval));
}
static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
@@ -2414,6 +2414,7 @@ static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args)
void fib6_run_gc(unsigned long expires, struct net *net, bool force)
{
struct fib6_gc_args gc_args;
+ int ip6_rt_gc_interval;
unsigned long now;
if (force) {
@@ -2422,8 +2423,8 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
return;
}
- gc_args.timeout = expires ? (int)expires :
- net->ipv6.sysctl.ip6_rt_gc_interval;
+ ip6_rt_gc_interval = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval);
+ gc_args.timeout = expires ? (int)expires : ip6_rt_gc_interval;
gc_args.more = 0;
fib6_gc_all(net, &gc_args);
@@ -2432,8 +2433,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
if (gc_args.more)
mod_timer(&net->ipv6.ip6_fib_timer,
- round_jiffies(now
- + net->ipv6.sysctl.ip6_rt_gc_interval));
+ round_jiffies(now + ip6_rt_gc_interval));
else
timer_delete(&net->ipv6.ip6_fib_timer);
spin_unlock_bh(&net->ipv6.fib6_gc_lock);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index d19d86ed4376..dafcc0dcd77a 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -1057,7 +1057,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb,
/* TooBig packet may have updated dst->dev's mtu */
if (!t->parms.collect_md && dst) {
mtu = READ_ONCE(dst_dev(dst)->mtu);
- if (dst_mtu(dst) > mtu)
+ if (dst6_mtu(dst) > mtu)
dst->ops->update_pmtu(dst, NULL, skb, mtu, false);
}
err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu,
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 168ec07e31cc..2bcb981c91aa 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -262,7 +262,7 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev,
skb->transport_header = skb->network_header + sizeof(*hdr);
IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
- pkt_len = ntohs(hdr->payload_len);
+ pkt_len = ipv6_payload_len(skb, hdr);
/* pkt_len may be zero if Jumbo payload option is present */
if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index fce91183797a..bd7f780e37a5 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -19,23 +19,7 @@
#include <net/gso.h>
#include "ip6_offload.h"
-
-/* All GRO functions are always builtin, except UDP over ipv6, which lays in
- * ipv6 module, as it depends on UDPv6 lookup function, so we need special care
- * when ipv6 is built as a module
- */
-#if IS_BUILTIN(CONFIG_IPV6)
-#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__)
-#else
-#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__)
-#endif
-
-#define indirect_call_gro_receive_l4(f2, f1, cb, head, skb) \
-({ \
- unlikely(gro_recursion_inc_test(skb)) ? \
- NAPI_GRO_CB(skb)->flush |= 1, NULL : \
- INDIRECT_CALL_L4(cb, f2, f1, head, skb); \
-})
+#include "tcpv6_offload.c"
static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto)
{
@@ -110,7 +94,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct ipv6hdr *ipv6h;
const struct net_offload *ops;
- int proto, err;
+ int proto;
struct frag_hdr *fptr;
unsigned int payload_len;
u8 *prevhdr;
@@ -120,9 +104,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
bool gso_partial;
skb_reset_network_header(skb);
- err = ipv6_hopopt_jumbo_remove(skb);
- if (err)
- return ERR_PTR(err);
nhoff = skb_network_header(skb) - skb_mac_header(skb);
if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h))))
goto out;
@@ -298,9 +279,19 @@ not_same_flow:
skb_gro_postpull_rcsum(skb, iph, nlen);
- pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive,
- ops->callbacks.gro_receive, head, skb);
+ if (unlikely(gro_recursion_inc_test(skb))) {
+ flush = 1;
+ goto out;
+ }
+ if (likely(proto == IPPROTO_TCP))
+ pp = tcp6_gro_receive(head, skb);
+#if IS_BUILTIN(CONFIG_IPV6)
+ else if (likely(proto == IPPROTO_UDP))
+ pp = udp6_gro_receive(head, skb);
+#endif
+ else
+ pp = ops->callbacks.gro_receive(head, skb);
out:
skb_gro_flush_final(skb, pp, flush);
@@ -342,48 +333,28 @@ INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
const struct net_offload *ops;
struct ipv6hdr *iph;
int err = -ENOSYS;
- u32 payload_len;
if (skb->encapsulation) {
skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6));
skb_set_inner_network_header(skb, nhoff);
}
- payload_len = skb->len - nhoff - sizeof(*iph);
- if (unlikely(payload_len > IPV6_MAXPLEN)) {
- struct hop_jumbo_hdr *hop_jumbo;
- int hoplen = sizeof(*hop_jumbo);
-
- /* Move network header left */
- memmove(skb_mac_header(skb) - hoplen, skb_mac_header(skb),
- skb->transport_header - skb->mac_header);
- skb->data -= hoplen;
- skb->len += hoplen;
- skb->mac_header -= hoplen;
- skb->network_header -= hoplen;
- iph = (struct ipv6hdr *)(skb->data + nhoff);
- hop_jumbo = (struct hop_jumbo_hdr *)(iph + 1);
-
- /* Build hop-by-hop options */
- hop_jumbo->nexthdr = iph->nexthdr;
- hop_jumbo->hdrlen = 0;
- hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
- hop_jumbo->tlv_len = 4;
- hop_jumbo->jumbo_payload_len = htonl(payload_len + hoplen);
-
- iph->nexthdr = NEXTHDR_HOP;
- iph->payload_len = 0;
- } else {
- iph = (struct ipv6hdr *)(skb->data + nhoff);
- iph->payload_len = htons(payload_len);
- }
+ iph = (struct ipv6hdr *)(skb->data + nhoff);
+ ipv6_set_payload_len(iph, skb->len - nhoff - sizeof(*iph));
nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops);
+
+ if (likely(ops == &net_hotdata.tcpv6_offload))
+ return tcp6_gro_complete(skb, nhoff);
+#if IS_BUILTIN(CONFIG_IPV6)
+ if (ops == &net_hotdata.udpv6_offload)
+ return udp6_gro_complete(skb, nhoff);
+#endif
+
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out;
- err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete,
- udp6_gro_complete, skb, nhoff);
+ err = ops->callbacks.gro_complete(skb, nhoff);
out:
return err;
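
The two hunks above replace the indirect_call_gro_receive_l4()/INDIRECT_CALL_L4() wrappers with explicit comparisons against the known hot offloads; together with the #include of tcpv6_offload.c this lets the compiler emit direct (and potentially inlined) calls for TCP and, when IPv6 is built in, UDP, keeping the indirect call only as the cold fallback. A rough userspace sketch of that devirtualisation pattern, using illustrative stand-in names:

#include <stdio.h>

struct offload_stub {
	int (*gro_complete)(int nhoff);
};

static int tcp6_complete(int nhoff)    { return nhoff + 1; }
static int generic_complete(int nhoff) { return nhoff; }

static const struct offload_stub tcpv6_offload = { .gro_complete = tcp6_complete };
static const struct offload_stub other_offload = { .gro_complete = generic_complete };

static int gro_complete(const struct offload_stub *ops, int nhoff)
{
	if (ops == &tcpv6_offload)		/* hot path: direct, inlinable */
		return tcp6_complete(nhoff);

	return ops->gro_complete(nhoff);	/* cold path: indirect call */
}

int main(void)
{
	printf("%d %d\n", gro_complete(&tcpv6_offload, 40),
	       gro_complete(&other_offload, 40));
	return 0;
}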
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index f904739e99b9..769c39fed1f3 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -80,7 +80,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
hdr = ipv6_hdr(skb);
daddr = &hdr->daddr;
- if (ipv6_addr_is_multicast(daddr)) {
+ if (unlikely(ipv6_addr_is_multicast(daddr))) {
if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
((mroute6_is_socket(net, skb) &&
!(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
@@ -179,8 +179,7 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
struct sk_buff *skb, unsigned int mtu)
{
- if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
- !skb_gso_validate_network_len(skb, mtu))
+ if (unlikely(!skb_gso_validate_network_len(skb, mtu)))
return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
return ip6_finish_output2(net, sk, skb);
@@ -202,8 +201,8 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff
if (skb_is_gso(skb))
return ip6_finish_output_gso(net, sk, skb, mtu);
- if (skb->len > mtu ||
- (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
+ if (unlikely(skb->len > mtu ||
+ (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)))
return ip6_fragment(net, sk, skb, ip6_finish_output2);
return ip6_finish_output2(net, sk, skb);
@@ -273,8 +272,6 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
struct in6_addr *first_hop = &fl6->daddr;
struct dst_entry *dst = skb_dst(skb);
struct inet6_dev *idev = ip6_dst_idev(dst);
- struct hop_jumbo_hdr *hop_jumbo;
- int hoplen = sizeof(*hop_jumbo);
struct net *net = sock_net(sk);
unsigned int head_room;
struct net_device *dev;
@@ -287,7 +284,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
rcu_read_lock();
dev = dst_dev_rcu(dst);
- head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
+ head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
if (opt)
head_room += opt->opt_nflen + opt->opt_flen;
@@ -301,32 +298,22 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
}
}
- if (opt) {
+ if (unlikely(opt)) {
seg_len += opt->opt_nflen + opt->opt_flen;
if (opt->opt_flen)
- ipv6_push_frag_opts(skb, opt, &proto);
+ proto = ipv6_push_frag_opts(skb, opt, proto);
if (opt->opt_nflen)
- ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
- &fl6->saddr);
+ proto = ipv6_push_nfrag_opts(skb, opt, proto,
+ &first_hop,
+ &fl6->saddr);
}
- if (unlikely(seg_len > IPV6_MAXPLEN)) {
- hop_jumbo = skb_push(skb, hoplen);
-
- hop_jumbo->nexthdr = proto;
- hop_jumbo->hdrlen = 0;
- hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
- hop_jumbo->tlv_len = 4;
- hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
-
- proto = IPPROTO_HOPOPTS;
+ if (unlikely(seg_len > IPV6_MAXPLEN))
seg_len = 0;
- IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
- }
- skb_push(skb, sizeof(struct ipv6hdr));
+ __skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
hdr = ipv6_hdr(skb);
@@ -352,8 +339,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
skb->priority = priority;
skb->mark = mark;
- mtu = dst_mtu(dst);
- if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
+ mtu = dst6_mtu(dst);
+ if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
/* if egress device is enslaved to an L3 master device pass the
@@ -382,7 +369,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
- kfree_skb(skb);
+ kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
unlock:
rcu_read_unlock();
return ret;
@@ -653,7 +640,7 @@ int ip6_forward(struct sk_buff *skb)
if (mtu < IPV6_MIN_MTU)
mtu = IPV6_MIN_MTU;
- if (ip6_pkt_too_big(skb, mtu)) {
+ if (unlikely(ip6_pkt_too_big(skb, mtu))) {
/* Again, force OUTPUT device used as source address */
skb->dev = dev;
icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
@@ -1352,12 +1339,13 @@ static void ip6_append_data_mtu(unsigned int *mtu,
}
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
- struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
+ struct ipcm6_cookie *ipc6,
struct rt6_info *rt)
{
+ struct ipv6_txoptions *nopt, *opt = ipc6->opt;
+ struct inet6_cork *v6_cork = &cork->base6;
struct ipv6_pinfo *np = inet6_sk(sk);
unsigned int mtu, frag_size;
- struct ipv6_txoptions *nopt, *opt = ipc6->opt;
/* callers pass dst together with a reference, set it first so
* ip6_cork_release() can put it down even in case of an error.
@@ -1367,7 +1355,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
/*
* setup for corking
*/
- if (opt) {
+ if (unlikely(opt)) {
if (WARN_ON(v6_cork->opt))
return -EINVAL;
@@ -1402,10 +1390,10 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
v6_cork->dontfrag = ipc6->dontfrag;
if (rt->dst.flags & DST_XFRM_TUNNEL)
mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
- READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
+ READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst);
else
mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
- READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
+ READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst));
frag_size = READ_ONCE(np->frag_size);
if (frag_size && frag_size < mtu)
@@ -1430,17 +1418,17 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
static int __ip6_append_data(struct sock *sk,
struct sk_buff_head *queue,
struct inet_cork_full *cork_full,
- struct inet6_cork *v6_cork,
struct page_frag *pfrag,
int getfrag(void *from, char *to, int offset,
int len, int odd, struct sk_buff *skb),
void *from, size_t length, int transhdrlen,
unsigned int flags)
{
- struct sk_buff *skb, *skb_prev = NULL;
+ unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
+ struct inet6_cork *v6_cork = &cork_full->base6;
struct inet_cork *cork = &cork_full->base;
struct flowi6 *fl6 = &cork_full->fl.u.ip6;
- unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
+ struct sk_buff *skb, *skb_prev = NULL;
struct ubuf_info *uarg = NULL;
int exthdrlen = 0;
int dst_exthdrlen = 0;
@@ -1843,7 +1831,6 @@ int ip6_append_data(struct sock *sk,
struct rt6_info *rt, unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
- struct ipv6_pinfo *np = inet6_sk(sk);
int exthdrlen;
int err;
@@ -1854,7 +1841,7 @@ int ip6_append_data(struct sock *sk,
* setup for corking
*/
dst_hold(&rt->dst);
- err = ip6_setup_cork(sk, &inet->cork, &np->cork,
+ err = ip6_setup_cork(sk, &inet->cork,
ipc6, rt);
if (err)
return err;
@@ -1868,7 +1855,7 @@ int ip6_append_data(struct sock *sk,
}
return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
- &np->cork, sk_page_frag(sk), getfrag,
+ sk_page_frag(sk), getfrag,
from, length, transhdrlen, flags);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
@@ -1881,10 +1868,11 @@ static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
skb_dst_set(skb, dst);
}
-static void ip6_cork_release(struct inet_cork_full *cork,
- struct inet6_cork *v6_cork)
+static void ip6_cork_release(struct inet_cork_full *cork)
{
- if (v6_cork->opt) {
+ struct inet6_cork *v6_cork = &cork->base6;
+
+ if (unlikely(v6_cork->opt)) {
struct ipv6_txoptions *opt = v6_cork->opt;
kfree(opt->dst0opt);
@@ -1903,15 +1891,14 @@ static void ip6_cork_release(struct inet_cork_full *cork,
struct sk_buff *__ip6_make_skb(struct sock *sk,
struct sk_buff_head *queue,
- struct inet_cork_full *cork,
- struct inet6_cork *v6_cork)
+ struct inet_cork_full *cork)
{
struct sk_buff *skb, *tmp_skb;
struct sk_buff **tail_skb;
struct in6_addr *final_dst;
struct net *net = sock_net(sk);
struct ipv6hdr *hdr;
- struct ipv6_txoptions *opt = v6_cork->opt;
+ struct ipv6_txoptions *opt;
struct rt6_info *rt = dst_rt6_info(cork->base.dst);
struct flowi6 *fl6 = &cork->fl.u.ip6;
unsigned char proto = fl6->flowi6_proto;
@@ -1940,19 +1927,22 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
__skb_pull(skb, skb_network_header_len(skb));
final_dst = &fl6->daddr;
- if (opt && opt->opt_flen)
- ipv6_push_frag_opts(skb, opt, &proto);
- if (opt && opt->opt_nflen)
- ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
-
+ opt = cork->base6.opt;
+ if (unlikely(opt)) {
+ if (opt->opt_flen)
+ proto = ipv6_push_frag_opts(skb, opt, proto);
+ if (opt->opt_nflen)
+ proto = ipv6_push_nfrag_opts(skb, opt, proto,
+ &final_dst, &fl6->saddr);
+ }
skb_push(skb, sizeof(struct ipv6hdr));
skb_reset_network_header(skb);
hdr = ipv6_hdr(skb);
- ip6_flow_hdr(hdr, v6_cork->tclass,
+ ip6_flow_hdr(hdr, cork->base6.tclass,
ip6_make_flowlabel(net, skb, fl6->flowlabel,
ip6_autoflowlabel(net, sk), fl6));
- hdr->hop_limit = v6_cork->hop_limit;
+ hdr->hop_limit = cork->base6.hop_limit;
hdr->nexthdr = proto;
hdr->saddr = fl6->saddr;
hdr->daddr = *final_dst;
@@ -1966,7 +1956,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
ip6_cork_steal_dst(skb, cork);
IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
- if (proto == IPPROTO_ICMPV6) {
+ if (unlikely(proto == IPPROTO_ICMPV6)) {
struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
u8 icmp6_type;
@@ -1979,7 +1969,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk,
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
}
- ip6_cork_release(cork, v6_cork);
+ ip6_cork_release(cork);
out:
return skb;
}
@@ -2018,8 +2008,7 @@ EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
static void __ip6_flush_pending_frames(struct sock *sk,
struct sk_buff_head *queue,
- struct inet_cork_full *cork,
- struct inet6_cork *v6_cork)
+ struct inet_cork_full *cork)
{
struct sk_buff *skb;
@@ -2030,13 +2019,13 @@ static void __ip6_flush_pending_frames(struct sock *sk,
kfree_skb(skb);
}
- ip6_cork_release(cork, v6_cork);
+ ip6_cork_release(cork);
}
void ip6_flush_pending_frames(struct sock *sk)
{
__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
- &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
+ &inet_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
@@ -2047,9 +2036,8 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
struct ipcm6_cookie *ipc6, struct rt6_info *rt,
unsigned int flags, struct inet_cork_full *cork)
{
- struct inet6_cork v6_cork;
- struct sk_buff_head queue;
int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
+ struct sk_buff_head queue;
int err;
if (flags & MSG_PROBE) {
@@ -2062,21 +2050,21 @@ struct sk_buff *ip6_make_skb(struct sock *sk,
cork->base.flags = 0;
cork->base.addr = 0;
cork->base.opt = NULL;
- v6_cork.opt = NULL;
- err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
+ cork->base6.opt = NULL;
+ err = ip6_setup_cork(sk, cork, ipc6, rt);
if (err) {
- ip6_cork_release(cork, &v6_cork);
+ ip6_cork_release(cork);
return ERR_PTR(err);
}
- err = __ip6_append_data(sk, &queue, cork, &v6_cork,
+ err = __ip6_append_data(sk, &queue, cork,
&current->task_frag, getfrag, from,
length + exthdrlen, transhdrlen + exthdrlen,
flags);
if (err) {
- __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
+ __ip6_flush_pending_frames(sk, &queue, cork);
return ERR_PTR(err);
}
- return __ip6_make_skb(sk, &queue, cork, &v6_cork);
+ return __ip6_make_skb(sk, &queue, cork);
}
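
The theme of this file's changes is that the separate inet6_cork argument is gone: the IPv6 cork state now appears to live as the base6 member of struct inet_cork_full, so every helper derives it from the one cork pointer it already receives. A compilable toy version of that consolidation, with the field layout simplified rather than the kernel's:

#include <stdio.h>

struct inet_cork_stub  { unsigned int length; };
struct inet6_cork_stub { void *opt; unsigned char hop_limit, tclass; };

struct inet_cork_full_stub {
	struct inet_cork_stub  base;
	struct inet6_cork_stub base6;	/* previously a separate argument */
};

static void cork_release(struct inet_cork_full_stub *cork)
{
	/* Derived from the one pointer instead of passed alongside it. */
	struct inet6_cork_stub *v6_cork = &cork->base6;

	v6_cork->opt = NULL;
}

int main(void)
{
	struct inet_cork_full_stub cork = { .base6 = { .hop_limit = 64 } };

	cork_release(&cork);
	printf("hop_limit=%u opt=%p\n", cork.base6.hop_limit, cork.base6.opt);
	return 0;
}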
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index c1f39735a236..4c29aa94e86e 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -638,7 +638,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
/* change mtu on this route */
if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) {
- if (rel_info > dst_mtu(skb_dst(skb2)))
+ if (rel_info > dst6_mtu(skb_dst(skb2)))
goto out;
skb_dst_update_pmtu_no_confirm(skb2, rel_info);
@@ -1187,7 +1187,7 @@ route_lookup:
t->parms.name);
goto tx_err_dst_release;
}
- mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen;
+ mtu = dst6_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen;
if (encap_limit >= 0) {
max_headroom += 8;
mtu -= 8;
@@ -1265,7 +1265,7 @@ route_lookup:
if (encap_limit >= 0) {
init_tel_txopt(&opt, encap_limit);
- ipv6_push_frag_opts(skb, &opt.ops, &proto);
+ proto = ipv6_push_frag_opts(skb, &opt.ops, proto);
}
skb_push(skb, sizeof(struct ipv6hdr));
@@ -1828,6 +1828,32 @@ int ip6_tnl_encap_setup(struct ip6_tnl *t,
}
EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup);
+static int ip6_tnl_fill_forward_path(struct net_device_path_ctx *ctx,
+ struct net_device_path *path)
+{
+ struct ip6_tnl *t = netdev_priv(ctx->dev);
+ struct flowi6 fl6 = {
+ .daddr = t->parms.raddr,
+ };
+ struct dst_entry *dst;
+ int err;
+
+ dst = ip6_route_output(dev_net(ctx->dev), NULL, &fl6);
+ if (!dst->error) {
+ path->type = DEV_PATH_TUN;
+ path->tun.src_v6 = t->parms.laddr;
+ path->tun.dst_v6 = t->parms.raddr;
+ path->tun.l3_proto = IPPROTO_IPV6;
+ path->dev = ctx->dev;
+ ctx->dev = dst->dev;
+ }
+
+ err = dst->error;
+ dst_release(dst);
+
+ return err;
+}
+
static const struct net_device_ops ip6_tnl_netdev_ops = {
.ndo_init = ip6_tnl_dev_init,
.ndo_uninit = ip6_tnl_dev_uninit,
@@ -1836,6 +1862,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = {
.ndo_change_mtu = ip6_tnl_change_mtu,
.ndo_get_stats64 = dev_get_tstats64,
.ndo_get_iflink = ip6_tnl_get_iflink,
+ .ndo_fill_forward_path = ip6_tnl_fill_forward_path,
};
#define IPXIPX_FEATURES (NETIF_F_SG | \
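
ip6_tnl_fill_forward_path() above relies on ip6_route_output() returning a dst in all cases, with any failure carried in dst->error, so the function checks the embedded error, fills the path only on success, and releases the reference on every exit. A minimal sketch of that lookup-and-release shape, using placeholder types:

#include <stdio.h>

struct dst_stub { int error; int refcnt; };

static struct dst_stub the_route = { .error = 0, .refcnt = 0 };

static struct dst_stub *route_output(void)
{
	the_route.refcnt++;		/* lookup always returns a reference */
	return &the_route;		/* never NULL; failure sits in ->error */
}

static void dst_release(struct dst_stub *dst)
{
	dst->refcnt--;
}

static int fill_forward_path(void)
{
	struct dst_stub *dst = route_output();
	int err;

	if (!dst->error) {
		/* success: populate the forward path fields here */
	}

	err = dst->error;
	dst_release(dst);		/* dropped on success and on failure */
	return err;
}

int main(void)
{
	printf("err=%d refcnt=%d\n", fill_forward_path(), the_route.refcnt);
	return 0;
}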
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index a61e742794f9..d784a8644ff2 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -1184,7 +1184,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
rcu_read_lock();
dst = __sk_dst_get(sk);
if (dst)
- val = dst_mtu(dst);
+ val = dst6_mtu(dst);
rcu_read_unlock();
if (!val)
return -ENOTCONN;
@@ -1283,7 +1283,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
rcu_read_lock();
dst = __sk_dst_get(sk);
if (dst)
- mtuinfo.ip6m_mtu = dst_mtu(dst);
+ mtuinfo.ip6m_mtu = dst6_mtu(dst);
rcu_read_unlock();
if (!mtuinfo.ip6m_mtu)
return -ENOTCONN;
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index 1c9b283a4132..cba1684a3f30 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -125,12 +125,7 @@ EXPORT_SYMBOL(ip6_dst_hoplimit);
int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- int len;
-
- len = skb->len - sizeof(struct ipv6hdr);
- if (len > IPV6_MAXPLEN)
- len = 0;
- ipv6_hdr(skb)->payload_len = htons(len);
+ ipv6_set_payload_len(ipv6_hdr(skb), skb->len - sizeof(struct ipv6hdr));
IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
/* if egress device is enslaved to an L3 master device pass the
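
Based on the open-coded logic it replaces here, ipv6_set_payload_len() plausibly encapsulates the clamp forced by the 16-bit payload_len field: lengths above IPV6_MAXPLEN cannot be represented, so 0 is stored, following the jumbogram convention. A hedged, self-contained sketch of that behaviour; the real helper body may differ:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define IPV6_MAXPLEN 65535

struct hdr_stub { uint16_t payload_len; };	/* network byte order */

static void set_payload_len(struct hdr_stub *hdr, unsigned int len)
{
	/* Too large for the 16-bit field: store 0, as a jumbogram would. */
	hdr->payload_len = htons(len > IPV6_MAXPLEN ? 0 : len);
}

int main(void)
{
	struct hdr_stub hdr;

	set_payload_len(&hdr, 1280);
	printf("%u\n", ntohs(hdr.payload_len));	/* 1280 */
	set_payload_len(&hdr, 100000);
	printf("%u\n", ntohs(hdr.payload_len));	/* 0 */
	return 0;
}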
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index b4cd05dba9b6..27a268059168 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -90,23 +90,24 @@ EXPORT_SYMBOL_GPL(raw_v6_match);
* 0 - deliver
* 1 - block
*/
-static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb)
+static int icmpv6_filter(const struct sock *sk, struct sk_buff *skb)
{
- struct icmp6hdr _hdr;
const struct icmp6hdr *hdr;
+ const __u32 *data;
+ unsigned int type;
/* We require only the four bytes of the ICMPv6 header, not any
* additional bytes of message body in "struct icmp6hdr".
*/
- hdr = skb_header_pointer(skb, skb_transport_offset(skb),
- ICMPV6_HDRLEN, &_hdr);
- if (hdr) {
- const __u32 *data = &raw6_sk(sk)->filter.data[0];
- unsigned int type = hdr->icmp6_type;
+ if (!pskb_may_pull(skb, ICMPV6_HDRLEN))
+ return 1;
- return (data[type >> 5] & (1U << (type & 31))) != 0;
- }
- return 1;
+ hdr = (struct icmp6hdr *)skb->data;
+ type = hdr->icmp6_type;
+
+ data = &raw6_sk(sk)->filter.data[0];
+
+ return (data[type >> 5] & (1U << (type & 31))) != 0;
}
#if IS_ENABLED(CONFIG_IPV6_MIP6)
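
The rewritten icmpv6_filter() keeps the same bitmap test: the per-socket filter is a 256-bit map held as eight 32-bit words, word type >> 5 and bit type & 31 select the entry for an ICMPv6 type, and a set bit means "block". A runnable worked example of the indexing, with an illustrative type value:

#include <stdio.h>
#include <stdint.h>

#define ICMPV6_ECHO_REQUEST 128

int main(void)
{
	uint32_t filter[8] = { 0 };		/* 256 bits, all "deliver" */
	unsigned int type = ICMPV6_ECHO_REQUEST;

	/* Block echo requests: bit 128 lives in word 4 (128 >> 5), bit 0. */
	filter[type >> 5] |= 1U << (type & 31);

	int blocked = (filter[type >> 5] & (1U << (type & 31))) != 0;

	printf("type %u: %s\n", type, blocked ? "block" : "deliver");
	return 0;
}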
@@ -141,15 +142,13 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister);
static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
{
struct net *net = dev_net(skb->dev);
- const struct in6_addr *saddr;
- const struct in6_addr *daddr;
+ const struct ipv6hdr *ip6h;
struct hlist_head *hlist;
- struct sock *sk;
bool delivered = false;
+ struct sock *sk;
__u8 hash;
- saddr = &ipv6_hdr(skb)->saddr;
- daddr = saddr + 1;
+ ip6h = ipv6_hdr(skb);
hash = raw_hashfunc(net, nexthdr);
hlist = &raw_v6_hashinfo.ht[hash];
@@ -157,7 +156,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
sk_for_each_rcu(sk, hlist) {
int filtered;
- if (!raw_v6_match(net, sk, nexthdr, daddr, saddr,
+ if (!raw_v6_match(net, sk, nexthdr, &ip6h->daddr, &ip6h->saddr,
inet6_iif(skb), inet6_sdif(skb)))
continue;
@@ -171,6 +170,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
switch (nexthdr) {
case IPPROTO_ICMPV6:
filtered = icmpv6_filter(sk, skb);
+ ip6h = ipv6_hdr(skb);
break;
#if IS_ENABLED(CONFIG_IPV6_MIP6)
@@ -529,7 +529,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6,
offset = rp->offset;
total_len = inet_sk(sk)->cork.base.length;
- opt = inet6_sk(sk)->cork.opt;
+ opt = inet_sk(sk)->cork.base6.opt;
total_len -= opt ? opt->opt_flen : 0;
if (offset >= total_len - 1) {
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e3a260a5564b..c0350d97307e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2049,6 +2049,8 @@ unlock:
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
struct rt6_info *rt, int mtu)
{
+ u32 dmtu = dst6_mtu(&rt->dst);
+
/* If the new MTU is lower than the route PMTU, this new MTU will be the
* lowest MTU in the path: always allow updating the route PMTU to
* reflect PMTU decreases.
@@ -2059,10 +2061,10 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
* handle this.
*/
- if (dst_mtu(&rt->dst) >= mtu)
+ if (dmtu >= mtu)
return true;
- if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
+ if (dmtu == idev->cnf.mtu6)
return true;
return false;
@@ -2895,7 +2897,7 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
dst_metric_set(&rt->dst, RTAX_MTU, mtu);
rt->rt6i_flags |= RTF_MODIFIED;
- rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
+ rt6_update_expires(rt, READ_ONCE(net->ipv6.sysctl.ip6_rt_mtu_expires));
}
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
@@ -2932,7 +2934,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (mtu < IPV6_MIN_MTU)
return;
- if (mtu >= dst_mtu(dst))
+ if (mtu >= dst6_mtu(dst))
return;
if (!rt6_cache_allowed_for_pmtu(rt6)) {
@@ -3248,7 +3250,7 @@ EXPORT_SYMBOL_GPL(ip6_sk_redirect);
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
- unsigned int mtu = dst_mtu(dst);
+ unsigned int mtu = dst6_mtu(dst);
struct net *net;
mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
@@ -3256,8 +3258,8 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
rcu_read_lock();
net = dst_dev_net_rcu(dst);
- if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
- mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
+ mtu = max_t(unsigned int, mtu,
+ READ_ONCE(net->ipv6.sysctl.ip6_rt_min_advmss));
rcu_read_unlock();
@@ -3359,10 +3361,10 @@ out:
static void ip6_dst_gc(struct dst_ops *ops)
{
struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
- int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
- int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
- int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
- unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
+ int rt_min_interval = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_min_interval);
+ int rt_elasticity = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_elasticity);
+ int rt_gc_timeout = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_timeout);
+ unsigned long rt_last_gc = READ_ONCE(net->ipv6.ip6_rt_last_gc);
unsigned int val;
int entries;
@@ -3419,11 +3421,8 @@ static int ip6_route_check_nh_onlink(struct net *net,
err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res);
if (!err && !(res.fib6_flags & RTF_REJECT) &&
- /* ignore match if it is the default route */
- !ipv6_addr_any(&res.f6i->fib6_dst.addr) &&
- (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) {
- NL_SET_ERR_MSG(extack,
- "Nexthop has invalid gateway or device mismatch");
+ res.fib6_type != RTN_UNICAST) {
+ NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway");
err = -EINVAL;
}
@@ -5008,7 +5007,7 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
};
struct net *net = dev_net(dev);
- if (net->ipv6.sysctl.skip_notify_on_dev_down)
+ if (READ_ONCE(net->ipv6.sysctl.skip_notify_on_dev_down))
fib6_clean_all_skip_notify(net, fib6_ifdown, &arg);
else
fib6_clean_all(net, fib6_ifdown, &arg);
@@ -6408,6 +6407,7 @@ errout:
void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
bool offload, bool trap, bool offload_failed)
{
+ u8 fib_notify_on_flag_change;
struct sk_buff *skb;
int err;
@@ -6419,8 +6419,9 @@ void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
WRITE_ONCE(f6i->offload, offload);
WRITE_ONCE(f6i->trap, trap);
+ fib_notify_on_flag_change = READ_ONCE(net->ipv6.sysctl.fib_notify_on_flag_change);
/* 2 means send notifications only if offload_failed was changed. */
- if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 &&
+ if (fib_notify_on_flag_change == 2 &&
READ_ONCE(f6i->offload_failed) == offload_failed)
return;
@@ -6432,7 +6433,7 @@ void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i,
*/
return;
- if (!net->ipv6.sysctl.fib_notify_on_flag_change)
+ if (!fib_notify_on_flag_change)
return;
skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL);
@@ -6529,7 +6530,7 @@ static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write,
return ret;
net = (struct net *)ctl->extra1;
- delay = net->ipv6.sysctl.flush_delay;
+ delay = READ_ONCE(net->ipv6.sysctl.flush_delay);
fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
return 0;
}
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index cf37ad9686e6..439c8a1c6625 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -962,7 +962,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
}
if (df) {
- mtu = dst_mtu(&rt->dst) - t_hlen;
+ mtu = dst4_mtu(&rt->dst) - t_hlen;
if (mtu < IPV4_MIN_MTU) {
DEV_STATS_INC(dev, collisions);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 280fe5978559..d10487b4e5bf 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -138,15 +138,15 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
{
struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr;
struct inet_connection_sock *icsk = inet_csk(sk);
- struct in6_addr *saddr = NULL, *final_p, final;
struct inet_timewait_death_row *tcp_death_row;
struct ipv6_pinfo *np = tcp_inet6_sk(sk);
+ struct in6_addr *saddr = NULL, *final_p;
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct ipv6_txoptions *opt;
struct dst_entry *dst;
- struct flowi6 fl6;
+ struct flowi6 *fl6;
int addr_type;
int err;
@@ -156,14 +156,15 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
if (usin->sin6_family != AF_INET6)
return -EAFNOSUPPORT;
- memset(&fl6, 0, sizeof(fl6));
+ fl6 = &inet_sk(sk)->cork.fl.u.ip6;
+ memset(fl6, 0, sizeof(*fl6));
if (inet6_test_bit(SNDFLOW, sk)) {
- fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK;
- IP6_ECN_flow_init(fl6.flowlabel);
- if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) {
+ fl6->flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK;
+ IP6_ECN_flow_init(fl6->flowlabel);
+ if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) {
struct ip6_flowlabel *flowlabel;
- flowlabel = fl6_sock_lookup(sk, fl6.flowlabel);
+ flowlabel = fl6_sock_lookup(sk, fl6->flowlabel);
if (IS_ERR(flowlabel))
return -EINVAL;
fl6_sock_release(flowlabel);
@@ -212,7 +213,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
}
sk->sk_v6_daddr = usin->sin6_addr;
- np->flow_label = fl6.flowlabel;
+ np->flow_label = fl6->flowlabel;
/*
* TCP over IPv4
@@ -260,24 +261,24 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr))
saddr = &sk->sk_v6_rcv_saddr;
- fl6.flowi6_proto = IPPROTO_TCP;
- fl6.daddr = sk->sk_v6_daddr;
- fl6.saddr = saddr ? *saddr : np->saddr;
- fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label);
- fl6.flowi6_oif = sk->sk_bound_dev_if;
- fl6.flowi6_mark = sk->sk_mark;
- fl6.fl6_dport = usin->sin6_port;
- fl6.fl6_sport = inet->inet_sport;
- if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport)
- fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT;
- fl6.flowi6_uid = sk_uid(sk);
+ fl6->flowi6_proto = IPPROTO_TCP;
+ fl6->daddr = sk->sk_v6_daddr;
+ fl6->saddr = saddr ? *saddr : np->saddr;
+ fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label);
+ fl6->flowi6_oif = sk->sk_bound_dev_if;
+ fl6->flowi6_mark = sk->sk_mark;
+ fl6->fl6_dport = usin->sin6_port;
+ fl6->fl6_sport = inet->inet_sport;
+ if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6->fl6_sport)
+ fl6->flowi6_flags = FLOWI_FLAG_ANY_SPORT;
+ fl6->flowi6_uid = sk_uid(sk);
opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk));
- final_p = fl6_update_dst(&fl6, opt, &final);
+ final_p = fl6_update_dst(fl6, opt, &np->final);
- security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6));
+ security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6));
- dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
+ dst = ip6_dst_lookup_flow(net, sk, fl6, final_p);
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto failure;
@@ -287,7 +288,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr,
tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row;
if (!saddr) {
- saddr = &fl6.saddr;
+ saddr = &fl6->saddr;
err = inet_bhash2_update_saddr(sk, saddr, AF_INET6);
if (err)
@@ -351,7 +352,7 @@ failure:
static void tcp_v6_mtu_reduced(struct sock *sk)
{
struct dst_entry *dst;
- u32 mtu;
+ u32 mtu, dmtu;
if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE))
return;
@@ -368,8 +369,9 @@ static void tcp_v6_mtu_reduced(struct sock *sk)
if (!dst)
return;
- if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) {
- tcp_sync_mss(sk, dst_mtu(dst));
+ dmtu = dst6_mtu(dst);
+ if (inet_csk(sk)->icsk_pmtu_cookie > dmtu) {
+ tcp_sync_mss(sk, dmtu);
tcp_simple_retransmit(sk);
}
}
@@ -537,7 +539,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
u8 tclass;
/* First, grab a route. */
- if (!dst && (dst = inet6_csk_route_req(sk, fl6, req,
+ if (!dst && (dst = inet6_csk_route_req(sk, NULL, fl6, req,
IPPROTO_TCP)) == NULL)
goto done;
@@ -787,7 +789,7 @@ static struct dst_entry *tcp_v6_route_req(const struct sock *sk,
if (security_inet_conn_request(sk, skb, req))
return NULL;
- return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP);
+ return inet6_csk_route_req(sk, NULL, &fl->u.ip6, req, IPPROTO_TCP);
}
struct request_sock_ops tcp6_request_sock_ops __read_mostly = {
@@ -1085,7 +1087,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
txhash = inet_twsk(sk)->tw_txhash;
}
} else {
- if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET)
+ if (READ_ONCE(net->ipv6.sysctl.flowlabel_reflect) &
+ FLOWLABEL_REFLECT_TCP_RESET)
label = ip6_flowlabel(ipv6h);
}
@@ -1315,12 +1318,12 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
struct request_sock *req_unhash,
bool *own_req)
{
- struct inet_request_sock *ireq;
- struct ipv6_pinfo *newnp;
const struct ipv6_pinfo *np = tcp_inet6_sk(sk);
+ struct inet_request_sock *ireq;
struct ipv6_txoptions *opt;
struct inet_sock *newinet;
bool found_dup_sk = false;
+ struct ipv6_pinfo *newnp;
struct tcp_sock *newtp;
struct sock *newsk;
#ifdef CONFIG_TCP_MD5SIG
@@ -1389,11 +1392,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
if (sk_acceptq_is_full(sk))
goto exit_overflow;
- if (!dst) {
- dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP);
- if (!dst)
- goto exit;
- }
+ dst = inet6_csk_route_req(sk, dst, &fl6, req, IPPROTO_TCP);
+ if (!dst)
+ goto exit;
newsk = tcp_create_openreq_child(sk, req, skb);
if (!newsk)
@@ -1409,6 +1410,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
inet6_sk_rx_dst_set(newsk, skb);
newinet = inet_sk(newsk);
+ newinet->cork.fl.u.ip6 = fl6;
newinet->pinet6 = tcp_inet6_sk(newsk);
newinet->ipv6_fl_list = NULL;
newinet->inet_opt = NULL;
@@ -1466,7 +1468,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
tcp_ca_openreq_child(newsk, dst);
- tcp_sync_mss(newsk, dst_mtu(dst));
+ tcp_sync_mss(newsk, dst6_mtu(dst));
newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst));
tcp_initialize_rcv_mss(newsk);
@@ -2331,6 +2333,8 @@ struct proto tcpv6_prot = {
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
.max_header = MAX_TCP_HEADER,
.obj_size = sizeof(struct tcp6_sock),
+ .freeptr_offset = offsetof(struct tcp6_sock,
+ tcp.inet_conn.icsk_inet.sk.sk_freeptr),
.ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6),
.slab_flags = SLAB_TYPESAFE_BY_RCU,
.twsk_prot = &tcp6_timewait_sock_ops,
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index 5670d32c27f8..f2a659cd6183 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -24,9 +24,6 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
struct net *net;
int iif, sdif;
- if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST)))
- return;
-
p = tcp_gro_lookup(head, th);
if (p) {
NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist;
@@ -45,8 +42,8 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
#endif /* IS_ENABLED(CONFIG_IPV6) */
}
-INDIRECT_CALLABLE_SCOPE
-struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
+static __always_inline struct sk_buff *tcp6_gro_receive(struct list_head *head,
+ struct sk_buff *skb)
{
struct tcphdr *th;
@@ -60,7 +57,8 @@ struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)
if (!th)
goto flush;
- tcp6_check_fraglist_gro(head, skb, th);
+ if (unlikely(skb->dev->features & NETIF_F_GRO_FRAGLIST))
+ tcp6_check_fraglist_gro(head, skb, th);
return tcp_gro_receive(head, skb, th);
@@ -69,7 +67,7 @@ flush:
return NULL;
}
-INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff)
+static __always_inline int tcp6_gro_complete(struct sk_buff *skb, int thoff)
{
const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 794c13674e8a..010b909275dd 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -875,7 +875,8 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb)
/*
* UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c).
*/
- if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) {
+ if (unlikely(udp_test_bit(UDPLITE_RECV_CC, sk) &&
+ UDP_SKB_CB(skb)->partial_cov)) {
u16 pcrlen = READ_ONCE(up->pcrlen);
if (pcrlen == 0) { /* full coverage was set */
@@ -1439,7 +1440,7 @@ csum_partial:
send:
err = ip6_send_skb(skb);
- if (err) {
+ if (unlikely(err)) {
if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) {
UDP6_INC_STATS(sock_net(sk),
UDP_MIB_SNDBUFERRORS, is_udplite);
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 046f13b1d77a..e003b8494dc0 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -132,7 +132,6 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport,
sdif, net->ipv4.udp_table, NULL);
}
-INDIRECT_CALLABLE_SCOPE
struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
{
struct udphdr *uh = udp_gro_udphdr(skb);
@@ -165,7 +164,7 @@ flush:
return NULL;
}
-INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff)
+int udp6_gro_complete(struct sk_buff *skb, int nhoff)
{
const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation];
const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + offset);
diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c
index da2af413c89d..b43b1059eea8 100644
--- a/net/iucv/iucv.c
+++ b/net/iucv/iucv.c
@@ -313,13 +313,12 @@ static union iucv_param *iucv_param[NR_CPUS];
static union iucv_param *iucv_param_irq[NR_CPUS];
/**
- * __iucv_call_b2f0
+ * __iucv_call_b2f0 - Calls CP to execute IUCV commands.
+ *
* @command: identifier of IUCV call to CP.
* @parm: pointer to a struct iucv_parm block
*
- * Calls CP to execute IUCV commands.
- *
- * Returns the result of the CP IUCV call.
+ * Returns: the result of the CP IUCV call.
*/
static inline int __iucv_call_b2f0(int command, union iucv_param *parm)
{
@@ -348,11 +347,10 @@ static inline int iucv_call_b2f0(int command, union iucv_param *parm)
}
/*
- * iucv_query_maxconn
+ * iucv_query_maxconn - Determine the maximum number of connections that
+ * may be established.
*
- * Determines the maximum number of connections that may be established.
- *
- * Returns the maximum number of connections or -EPERM is IUCV is not
+ * Returns: the maximum number of connections or -EPERM if IUCV is not
* available.
*/
static int __iucv_query_maxconn(void *param, unsigned long *max_pathid)
@@ -391,10 +389,9 @@ static int iucv_query_maxconn(void)
}
/**
- * iucv_allow_cpu
- * @data: unused
+ * iucv_allow_cpu - Allow iucv interrupts on this cpu.
*
- * Allow iucv interrupts on this cpu.
+ * @data: unused
*/
static void iucv_allow_cpu(void *data)
{
@@ -432,10 +429,9 @@ static void iucv_allow_cpu(void *data)
}
/**
- * iucv_block_cpu
- * @data: unused
+ * iucv_block_cpu - Block iucv interrupts on this cpu.
*
- * Block iucv interrupts on this cpu.
+ * @data: unused
*/
static void iucv_block_cpu(void *data)
{
@@ -452,10 +448,9 @@ static void iucv_block_cpu(void *data)
}
/**
- * iucv_declare_cpu
- * @data: unused
+ * iucv_declare_cpu - Declare an interrupt buffer on this cpu.
*
- * Declare a interrupt buffer on this cpu.
+ * @data: unused
*/
static void iucv_declare_cpu(void *data)
{
@@ -507,10 +502,9 @@ static void iucv_declare_cpu(void *data)
}
/**
- * iucv_retrieve_cpu
- * @data: unused
+ * iucv_retrieve_cpu - Retrieve the interrupt buffer on this cpu.
*
- * Retrieve interrupt buffer on this cpu.
+ * @data: unused
*/
static void iucv_retrieve_cpu(void *data)
{
@@ -532,9 +526,7 @@ static void iucv_retrieve_cpu(void *data)
}
/*
- * iucv_setmask_mp
- *
- * Allow iucv interrupts on all cpus.
+ * iucv_setmask_mp - Allow iucv interrupts on all cpus.
*/
static void iucv_setmask_mp(void)
{
@@ -551,9 +543,7 @@ static void iucv_setmask_mp(void)
}
/*
- * iucv_setmask_up
- *
- * Allow iucv interrupts on a single cpu.
+ * iucv_setmask_up - Allow iucv interrupts on a single cpu.
*/
static void iucv_setmask_up(void)
{
@@ -568,12 +558,11 @@ static void iucv_setmask_up(void)
}
/*
- * iucv_enable
+ * iucv_enable - Make iucv ready for use
*
- * This function makes iucv ready for use. It allocates the pathid
- * table, declares an iucv interrupt buffer and enables the iucv
- * interrupts. Called when the first user has registered an iucv
- * handler.
+ * It allocates the pathid table, declares an iucv interrupt buffer and
+ * enables the iucv interrupts. Called when the first user has registered
+ * an iucv handler.
*/
static int iucv_enable(void)
{
@@ -603,11 +592,10 @@ out:
}
/*
- * iucv_disable
+ * iucv_disable - Shuts down iucv.
*
- * This function shuts down iucv. It disables iucv interrupts, retrieves
- * the iucv interrupt buffer and frees the pathid table. Called after the
- * last user unregister its iucv handler.
+ * It disables iucv interrupts, retrieves the iucv interrupt buffer and frees
+ * the pathid table. Called after the last user unregisters its iucv handler.
*/
static void iucv_disable(void)
{
@@ -695,11 +683,10 @@ __free_cpumask:
}
/**
- * iucv_sever_pathid
+ * iucv_sever_pathid - Sever an iucv path to free up the pathid. Used internally.
+ *
* @pathid: path identification number.
* @userdata: 16-bytes of user data.
- *
- * Sever an iucv path to free up the pathid. Used internally.
*/
static int iucv_sever_pathid(u16 pathid, u8 *userdata)
{
@@ -714,22 +701,20 @@ static int iucv_sever_pathid(u16 pathid, u8 *userdata)
}
/**
- * __iucv_cleanup_queue
- * @dummy: unused dummy argument
+ * __iucv_cleanup_queue - Nop function called via smp_call_function to force
+ * work items from pending external iucv interrupts to the work queue.
*
- * Nop function called via smp_call_function to force work items from
- * pending external iucv interrupts to the work queue.
+ * @dummy: unused dummy argument
*/
static void __iucv_cleanup_queue(void *dummy)
{
}
/**
- * iucv_cleanup_queue
+ * iucv_cleanup_queue - Called after a path has been severed to find all
+ * remaining work items for the now stale pathid.
*
- * Function called after a path has been severed to find all remaining
- * work items for the now stale pathid. The caller needs to hold the
- * iucv_table_lock.
+ * The caller needs to hold the iucv_table_lock.
*/
static void iucv_cleanup_queue(void)
{
@@ -757,13 +742,12 @@ static void iucv_cleanup_queue(void)
}
/**
- * iucv_register:
+ * iucv_register - Registers a driver with IUCV.
+ *
* @handler: address of iucv handler structure
* @smp: != 0 indicates that the handler can deal with out of order messages
*
- * Registers a driver with IUCV.
- *
- * Returns 0 on success, -ENOMEM if the memory allocation for the pathid
+ * Returns: 0 on success, -ENOMEM if the memory allocation for the pathid
* table failed, or -EIO if IUCV_DECLARE_BUFFER failed on all cpus.
*/
int iucv_register(struct iucv_handler *handler, int smp)
@@ -794,11 +778,10 @@ out_mutex:
EXPORT_SYMBOL(iucv_register);
/**
- * iucv_unregister
+ * iucv_unregister - Unregister driver from IUCV.
+ *
* @handler: address of iucv handler structure
* @smp: != 0 indicates that the handler can deal with out of order messages
- *
- * Unregister driver from IUCV.
*/
void iucv_unregister(struct iucv_handler *handler, int smp)
{
@@ -852,7 +835,8 @@ static struct notifier_block iucv_reboot_notifier = {
};
/**
- * iucv_path_accept
+ * iucv_path_accept - Complete the IUCV communication path
+ *
* @path: address of iucv path structure
* @handler: address of iucv handler structure
* @userdata: 16 bytes of data reflected to the communication partner
@@ -861,7 +845,7 @@ static struct notifier_block iucv_reboot_notifier = {
* This function is issued after the user received a connection pending
* external interrupt and now wishes to complete the IUCV communication path.
*
- * Returns the result of the CP IUCV call.
+ * Returns: the result of the CP IUCV call.
*/
int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler,
u8 *userdata, void *private)
@@ -896,7 +880,8 @@ out:
EXPORT_SYMBOL(iucv_path_accept);
/**
- * iucv_path_connect
+ * iucv_path_connect - Establish an IUCV path
+ *
* @path: address of iucv path structure
* @handler: address of iucv handler structure
* @userid: 8-byte user identification
@@ -908,7 +893,7 @@ EXPORT_SYMBOL(iucv_path_accept);
* successfully, you are not able to use the path until you receive an IUCV
* Connection Complete external interrupt.
*
- * Returns the result of the CP IUCV call.
+ * Returns: the result of the CP IUCV call.
*/
int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler,
u8 *userid, u8 *system, u8 *userdata,
@@ -964,14 +949,14 @@ out:
EXPORT_SYMBOL(iucv_path_connect);
/**
- * iucv_path_quiesce:
+ * iucv_path_quiesce - Temporarily suspend incoming messages
* @path: address of iucv path structure
* @userdata: 16 bytes of data reflected to the communication partner
*
* This function temporarily suspends incoming messages on an IUCV path.
* You can later reactivate the path by invoking the iucv_resume function.
*
- * Returns the result from the CP IUCV call.
+ * Returns: the result from the CP IUCV call.
*/
int iucv_path_quiesce(struct iucv_path *path, u8 *userdata)
{
@@ -996,14 +981,15 @@ out:
EXPORT_SYMBOL(iucv_path_quiesce);
/**
- * iucv_path_resume:
+ * iucv_path_resume - Resume incoming messages on a suspended IUCV path
+ *
* @path: address of iucv path structure
* @userdata: 16 bytes of data reflected to the communication partner
*
* This function resumes incoming messages on an IUCV path that has
* been stopped with iucv_path_quiesce.
*
- * Returns the result from the CP IUCV call.
+ * Returns: the result from the CP IUCV call.
*/
int iucv_path_resume(struct iucv_path *path, u8 *userdata)
{
@@ -1027,13 +1013,12 @@ out:
}
/**
- * iucv_path_sever
+ * iucv_path_sever - Terminates an IUCV path.
+ *
* @path: address of iucv path structure
* @userdata: 16 bytes of data reflected to the communication partner
*
- * This function terminates an IUCV path.
- *
- * Returns the result from the CP IUCV call.
+ * Returns: the result from the CP IUCV call.
*/
int iucv_path_sever(struct iucv_path *path, u8 *userdata)
{
@@ -1058,14 +1043,13 @@ out:
EXPORT_SYMBOL(iucv_path_sever);
/**
- * iucv_message_purge
+ * iucv_message_purge - Cancels a message you have sent.
+ *
* @path: address of iucv path structure
* @msg: address of iucv msg structure
* @srccls: source class of message
*
- * Cancels a message you have sent.
- *
- * Returns the result from the CP IUCV call.
+ * Returns: the result from the CP IUCV call.
*/
int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg,
u32 srccls)
@@ -1096,13 +1080,15 @@ out:
EXPORT_SYMBOL(iucv_message_purge);
/**
- * iucv_message_receive_iprmdata
+ * iucv_message_receive_iprmdata - Internal function to receive RMDATA
+ * stored in &struct iucv_message
+ *
* @path: address of iucv path structure
* @msg: address of iucv msg structure
* @flags: how the message is received (IUCV_IPBUFLST)
* @buffer: address of data buffer or address of struct iucv_array
* @size: length of data buffer
- * @residual:
+ * @residual: number of bytes remaining in the data buffer
*
* Internal function used by iucv_message_receive and __iucv_message_receive
* to receive RMDATA data stored in struct iucv_message.
@@ -1140,10 +1126,11 @@ static int iucv_message_receive_iprmdata(struct iucv_path *path,
}
/**
- * __iucv_message_receive
+ * __iucv_message_receive - Receives messages on an established path (no locking)
+ *
* @path: address of iucv path structure
* @msg: address of iucv msg structure
- * @flags: how the message is received (IUCV_IPBUFLST)
+ * @flags: flags that affect how the message is received (IUCV_IPBUFLST)
* @buffer: address of data buffer or address of struct iucv_array
* @size: length of data buffer
* @residual:
@@ -1154,7 +1141,7 @@ static int iucv_message_receive_iprmdata(struct iucv_path *path,
*
* Locking: no locking
*
- * Returns the result from the CP IUCV call.
+ * Returns: the result from the CP IUCV call.
*/
int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
u8 flags, void *buffer, size_t size, size_t *residual)
@@ -1188,10 +1175,11 @@ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
EXPORT_SYMBOL(__iucv_message_receive);
/**
- * iucv_message_receive
+ * iucv_message_receive - Receives messages on an established path, with locking
+ *
* @path: address of iucv path structure
* @msg: address of iucv msg structure
- * @flags: how the message is received (IUCV_IPBUFLST)
+ * @flags: flags that affect how the message is received (IUCV_IPBUFLST)
* @buffer: address of data buffer or address of struct iucv_array
* @size: length of data buffer
* @residual:
@@ -1202,7 +1190,7 @@ EXPORT_SYMBOL(__iucv_message_receive);
*
* Locking: local_bh_enable/local_bh_disable
*
- * Returns the result from the CP IUCV call.
+ * Returns: the result from the CP IUCV call.
*/
int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
u8 flags, void *buffer, size_t size, size_t *residual)
@@ -1220,7 +1208,8 @@ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg,
EXPORT_SYMBOL(iucv_message_receive);
/**
- * iucv_message_reject
+ * iucv_message_reject - Refuses a specified message
+ *
* @path: address of iucv path structure
* @msg: address of iucv msg structure
*
@@ -1228,7 +1217,7 @@ EXPORT_SYMBOL(iucv_message_receive);
* are notified of a message and the time that you complete the message,
* the message may be rejected.
*
- * Returns the result from the CP IUCV call.
+ * Returns: the result from the CP IUCV call.
*/
int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg)
{
@@ -1254,7 +1243,8 @@ out:
EXPORT_SYMBOL(iucv_message_reject);
/**
- * iucv_message_reply
+ * iucv_message_reply - Replies to a specified message
+ *
* @path: address of iucv path structure
* @msg: address of iucv msg structure
* @flags: how the reply is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST)
@@ -1262,11 +1252,11 @@ EXPORT_SYMBOL(iucv_message_reject);
* @size: length of reply data buffer
*
* This function responds to the two-way messages that you receive. You
- * must identify completely the message to which you wish to reply. ie,
+ * must completely identify the message to which you wish to reply, i.e.,
* pathid, msgid, and trgcls. Prmmsg signifies the data is moved into
* the parameter list.
*
- * Returns the result from the CP IUCV call.
+ * Returns: the result from the CP IUCV call.
*/
int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg,
u8 flags, void *reply, size_t size)
@@ -1303,7 +1293,8 @@ out:
EXPORT_SYMBOL(iucv_message_reply);
/**
- * __iucv_message_send
+ * __iucv_message_send - Transmits a one-way message, no locking
+ *
* @path: address of iucv path structure
* @msg: address of iucv msg structure
* @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST)
@@ -1317,7 +1308,7 @@ EXPORT_SYMBOL(iucv_message_reply);
*
* Locking: no locking
*
- * Returns the result from the CP IUCV call.
+ * Returns: the result from the CP IUCV call.
*/
int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg,
u8 flags, u32 srccls, void *buffer, size_t size)
@@ -1357,7 +1348,8 @@ out:
EXPORT_SYMBOL(__iucv_message_send);
/**
- * iucv_message_send
+ * iucv_message_send - Transmits a one-way message, with locking
+ *
* @path: address of iucv path structure
* @msg: address of iucv msg structure
* @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST)
@@ -1371,7 +1363,7 @@ EXPORT_SYMBOL(__iucv_message_send);
*
* Locking: local_bh_enable/local_bh_disable
*
- * Returns the result from the CP IUCV call.
+ * Returns: the result from the CP IUCV call.
*/
int iucv_message_send(struct iucv_path *path, struct iucv_message *msg,
u8 flags, u32 srccls, void *buffer, size_t size)
@@ -1386,7 +1378,8 @@ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg,
EXPORT_SYMBOL(iucv_message_send);
/**
- * iucv_message_send2way
+ * iucv_message_send2way - Transmits a two-way message
+ *
* @path: address of iucv path structure
* @msg: address of iucv msg structure
* @flags: how the message is sent and the reply is received
@@ -1403,7 +1396,7 @@ EXPORT_SYMBOL(iucv_message_send);
* reply to the message and a buffer is provided into which IUCV moves
* the reply to this message.
*
- * Returns the result from the CP IUCV call.
+ * Returns: the result from the CP IUCV call.
*/
int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg,
u8 flags, u32 srccls, void *buffer, size_t size,
@@ -1462,11 +1455,11 @@ struct iucv_path_pending {
} __packed;
/**
- * iucv_path_pending
+ * iucv_path_pending - Process connection pending work item
+ *
* @data: Pointer to external interrupt buffer
*
- * Process connection pending work item. Called from tasklet while holding
- * iucv_table_lock.
+ * Context: Called from tasklet while holding iucv_table_lock.
*/
static void iucv_path_pending(struct iucv_irq_data *data)
{
@@ -1523,11 +1516,11 @@ struct iucv_path_complete {
} __packed;
/**
- * iucv_path_complete
+ * iucv_path_complete - Process connection complete work item
+ *
* @data: Pointer to external interrupt buffer
*
- * Process connection complete work item. Called from tasklet while holding
- * iucv_table_lock.
+ * Context: Called from tasklet while holding iucv_table_lock.
*/
static void iucv_path_complete(struct iucv_irq_data *data)
{
@@ -1553,11 +1546,11 @@ struct iucv_path_severed {
} __packed;
/**
- * iucv_path_severed
+ * iucv_path_severed - Process connection severed work item.
+ *
* @data: Pointer to external interrupt buffer
*
- * Process connection severed work item. Called from tasklet while holding
- * iucv_table_lock.
+ * Context: Called from tasklet while holding iucv_table_lock.
*/
static void iucv_path_severed(struct iucv_irq_data *data)
{
@@ -1589,11 +1582,11 @@ struct iucv_path_quiesced {
} __packed;
/**
- * iucv_path_quiesced
+ * iucv_path_quiesced - Process connection quiesced work item.
+ *
* @data: Pointer to external interrupt buffer
*
- * Process connection quiesced work item. Called from tasklet while holding
- * iucv_table_lock.
+ * Context: Called from tasklet while holding iucv_table_lock.
*/
static void iucv_path_quiesced(struct iucv_irq_data *data)
{
@@ -1617,11 +1610,11 @@ struct iucv_path_resumed {
} __packed;
/**
- * iucv_path_resumed
+ * iucv_path_resumed - Process connection resumed work item.
+ *
* @data: Pointer to external interrupt buffer
*
- * Process connection resumed work item. Called from tasklet while holding
- * iucv_table_lock.
+ * Context: Called from tasklet while holding iucv_table_lock.
*/
static void iucv_path_resumed(struct iucv_irq_data *data)
{
@@ -1648,11 +1641,11 @@ struct iucv_message_complete {
} __packed;
/**
- * iucv_message_complete
+ * iucv_message_complete - Process message complete work item.
+ *
* @data: Pointer to external interrupt buffer
*
- * Process message complete work item. Called from tasklet while holding
- * iucv_table_lock.
+ * Context: Called from tasklet while holding iucv_table_lock.
*/
static void iucv_message_complete(struct iucv_irq_data *data)
{
@@ -1695,11 +1688,11 @@ struct iucv_message_pending {
} __packed;
/**
- * iucv_message_pending
+ * iucv_message_pending - Process message pending work item.
+ *
* @data: Pointer to external interrupt buffer
*
- * Process message pending work item. Called from tasklet while holding
- * iucv_table_lock.
+ * Context: Called from tasklet while holding iucv_table_lock.
*/
static void iucv_message_pending(struct iucv_irq_data *data)
{
@@ -1722,7 +1715,7 @@ static void iucv_message_pending(struct iucv_irq_data *data)
}
/*
- * iucv_tasklet_fn:
+ * iucv_tasklet_fn - Process the queue of IRQ buffers
*
* This tasklet loops over the queue of irq buffers created by
* iucv_external_interrupt, calls the appropriate action handler
@@ -1766,7 +1759,7 @@ static void iucv_tasklet_fn(unsigned long ignored)
}
/*
- * iucv_work_fn:
+ * iucv_work_fn - Process the queue of path pending IRQ blocks
*
* This work function loops over the queue of path pending irq blocks
* created by iucv_external_interrupt, calls the appropriate action
@@ -1797,9 +1790,8 @@ static void iucv_work_fn(struct work_struct *work)
}
/*
- * iucv_external_interrupt
+ * iucv_external_interrupt - Handles external interrupts coming in from CP.
*
- * Handles external interrupts coming in from CP.
* Places the interrupt buffer on a queue and schedules iucv_tasklet_fn().
*/
static void iucv_external_interrupt(struct ext_code ext_code,
@@ -1857,10 +1849,9 @@ struct iucv_interface iucv_if = {
EXPORT_SYMBOL(iucv_if);
static enum cpuhp_state iucv_online;
+
/**
- * iucv_init
- *
- * Allocates and initializes various data structures.
+ * iucv_init - Allocates and initializes various data structures.
*/
static int __init iucv_init(void)
{
@@ -1924,9 +1915,7 @@ out:
}
/**
- * iucv_exit
- *
- * Frees everything allocated from iucv_init.
+ * iucv_exit - Frees everything allocated from iucv_init.
*/
static void __exit iucv_exit(void)
{
diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile
index a33884967f21..b0e392eb7753 100644
--- a/net/mac80211/Makefile
+++ b/net/mac80211/Makefile
@@ -36,7 +36,7 @@ mac80211-y := \
tdls.o \
ocb.o \
airtime.o \
- eht.o
+ eht.o uhr.o
mac80211-$(CONFIG_MAC80211_LEDS) += led.o
mac80211-$(CONFIG_MAC80211_DEBUGFS) += \
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index c81091a5cc3a..5d04d7d550b0 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -5,7 +5,7 @@
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2015 Intel Mobile Communications GmbH
* Copyright (C) 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2025 Intel Corporation
+ * Copyright (C) 2018-2026 Intel Corporation
*/
#include <linux/ieee80211.h>
@@ -680,10 +680,18 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev,
* association has completed, this rejects that attempt
* so it will set the key again after association.
*
+ * With (re)association frame encryption enabled, cfg80211
+ * may deliver keys to mac80211 before the station has
+ * associated. In that case, accept the key if the station
+ * is an Enhanced Privacy Protection (EPP) peer.
+ * If (re)association frame encryption support is not present,
+ * cfg80211 will not allow key installation in non-AP STA mode.
+ *
* TODO: accept the key if we have a station entry and
- * add it to the device after the station.
+ * add it to the device after the station associates.
*/
- if (!sta || !test_sta_flag(sta, WLAN_STA_ASSOC)) {
+ if (!sta || (!sta->sta.epp_peer &&
+ !test_sta_flag(sta, WLAN_STA_ASSOC))) {
ieee80211_key_free_unused(key);
return -ENOENT;
}
@@ -1600,6 +1608,13 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev,
link_conf->eht_mu_beamformer = false;
}
+ if (params->uhr_oper) {
+ if (!link_conf->eht_support)
+ return -EOPNOTSUPP;
+
+ link_conf->uhr_support = true;
+ }
+
if (sdata->vif.type == NL80211_IFTYPE_AP &&
params->mbssid_config.tx_wdev) {
err = ieee80211_set_ap_mbssid_options(sdata,
@@ -1908,7 +1923,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev,
if (sdata->wdev.links[link_id].cac_started) {
chandef = link_conf->chanreq.oper;
- wiphy_delayed_work_cancel(wiphy, &link->dfs_cac_timer_work);
+ wiphy_hrtimer_work_cancel(wiphy, &link->dfs_cac_timer_work);
cfg80211_cac_event(sdata->dev, &chandef,
NL80211_RADAR_CAC_ABORTED,
GFP_KERNEL, link_id);
@@ -2077,6 +2092,7 @@ static int sta_link_apply_parameters(struct ieee80211_local *local,
params->vht_capa ||
params->he_capa ||
params->eht_capa ||
+ params->uhr_capa ||
params->s1g_capa ||
params->opmode_notif_used;
@@ -2125,8 +2141,7 @@ static int sta_link_apply_parameters(struct ieee80211_local *local,
if (params->supported_rates &&
params->supported_rates_len &&
- !ieee80211_parse_bitrates(link->conf->chanreq.oper.width,
- sband, params->supported_rates,
+ !ieee80211_parse_bitrates(sband, params->supported_rates,
params->supported_rates_len,
&link_sta->pub->supp_rates[sband->band]))
return -EINVAL;
@@ -2156,6 +2171,12 @@ static int sta_link_apply_parameters(struct ieee80211_local *local,
params->eht_capa_len,
link_sta);
+ if (params->uhr_capa)
+ ieee80211_uhr_cap_ie_to_sta_uhr_cap(sdata, sband,
+ params->uhr_capa,
+ params->uhr_capa_len,
+ link_sta);
+
if (params->s1g_capa)
ieee80211_s1g_cap_to_sta_s1g_cap(sdata, params->s1g_capa,
link_sta);
@@ -2199,6 +2220,9 @@ static int sta_apply_parameters(struct ieee80211_local *local,
mask = params->sta_flags_mask;
set = params->sta_flags_set;
+ if (params->epp_peer)
+ sta->sta.epp_peer = true;
+
if (ieee80211_vif_is_mesh(&sdata->vif)) {
/*
* In mesh mode, ASSOCIATED isn't part of the nl80211
@@ -2987,8 +3011,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy,
return -EINVAL;
if (params->basic_rates) {
- if (!ieee80211_parse_bitrates(link->conf->chanreq.oper.width,
- wiphy->bands[sband->band],
+ if (!ieee80211_parse_bitrates(sband,
params->basic_rates,
params->basic_rates_len,
&link->conf->basic_rates))
@@ -3865,8 +3888,8 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy,
if (err)
return err;
- wiphy_delayed_work_queue(wiphy, &link_data->dfs_cac_timer_work,
- msecs_to_jiffies(cac_time_ms));
+ wiphy_hrtimer_work_queue(wiphy, &link_data->dfs_cac_timer_work,
+ ms_to_ktime(cac_time_ms));
return 0;
}
@@ -3885,7 +3908,7 @@ static void ieee80211_end_cac(struct wiphy *wiphy,
if (!link_data)
continue;
- wiphy_delayed_work_cancel(wiphy,
+ wiphy_hrtimer_work_cancel(wiphy,
&link_data->dfs_cac_timer_work);
if (sdata->wdev.links[link_id].cac_started) {
@@ -4151,12 +4174,21 @@ static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data)
static void ieee80211_csa_finalize(struct ieee80211_link_data *link_data)
{
struct ieee80211_sub_if_data *sdata = link_data->sdata;
+ int link_id = -1;
if (__ieee80211_csa_finalize(link_data)) {
sdata_info(sdata, "failed to finalize CSA on link %d, disconnecting\n",
link_data->link_id);
- cfg80211_stop_iface(sdata->local->hw.wiphy, &sdata->wdev,
- GFP_KERNEL);
+ if (sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_P2P_GO)
+ /*
+ * link_id is currently expected only for the
+ * AP/P2P_GO interface types
+ */
+ link_id = link_data->link_id;
+
+ cfg80211_stop_link(sdata->local->hw.wiphy, &sdata->wdev,
+ link_id, GFP_KERNEL);
}
}
@@ -4400,7 +4432,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
goto out;
/* if reservation is invalid then this will fail */
- err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0, -1);
+ err = ieee80211_check_combinations(sdata, NULL, 0, 0, -1);
if (err) {
ieee80211_link_unreserve_chanctx(link_data);
goto out;
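The CAC timer conversion repeated through this file swaps the jiffies-based wiphy_delayed_work for the ktime-based wiphy_hrtimer_work. A condensed sketch of the pattern, using only the call shapes visible in this patch:

struct wiphy_hrtimer_work cac_work;     /* was: struct wiphy_delayed_work */

wiphy_hrtimer_work_init(&cac_work, ieee80211_dfs_cac_timer_work);

/* was: wiphy_delayed_work_queue(wiphy, &cac_work,
 *                               msecs_to_jiffies(cac_time_ms)); */
wiphy_hrtimer_work_queue(wiphy, &cac_work, ms_to_ktime(cac_time_ms));

wiphy_hrtimer_work_cancel(wiphy, &cac_work);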
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 55105d238d6b..51bf3c7822a7 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -1772,4 +1772,25 @@ drv_prep_add_interface(struct ieee80211_local *local,
trace_drv_return_void(local);
}
+static inline int drv_set_eml_op_mode(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta,
+ struct ieee80211_eml_params *eml_params)
+{
+ struct ieee80211_local *local = sdata->local;
+ int ret = -EOPNOTSUPP;
+
+ might_sleep();
+ lockdep_assert_wiphy(local->hw.wiphy);
+
+ trace_drv_set_eml_op_mode(local, sdata, sta, eml_params->link_id,
+ eml_params->control,
+ eml_params->link_bitmap);
+ if (local->ops->set_eml_op_mode)
+ ret = local->ops->set_eml_op_mode(&local->hw, &sdata->vif,
+ sta, eml_params);
+ trace_drv_return_int(local, ret);
+
+ return ret;
+}
+
#endif /* __MAC80211_DRIVER_OPS */
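For drivers picking up the new callback, a minimal sketch of an implementation; the body and the mydrv_ prefix are hypothetical, only the signature follows the ops->set_eml_op_mode() call in drv_set_eml_op_mode() above:

static int mydrv_set_eml_op_mode(struct ieee80211_hw *hw,
                                 struct ieee80211_vif *vif,
                                 struct ieee80211_sta *sta,
                                 struct ieee80211_eml_params *eml_params)
{
        /* hypothetical validation: reject an empty link set */
        if (!eml_params->link_bitmap)
                return -EINVAL;

        /* program the requested EMLSR/EMLMR state into the device here */
        return 0;
}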
diff --git a/net/mac80211/drop.h b/net/mac80211/drop.h
index eb9ab310f91c..f06a8aa905c5 100644
--- a/net/mac80211/drop.h
+++ b/net/mac80211/drop.h
@@ -2,7 +2,7 @@
/*
* mac80211 drop reason list
*
- * Copyright (C) 2023-2024 Intel Corporation
+ * Copyright (C) 2023-2024, 2026 Intel Corporation
*/
#ifndef MAC80211_DROP_H
@@ -65,6 +65,49 @@ typedef unsigned int __bitwise ieee80211_rx_result;
/* 0x30 */ \
R(RX_DROP_U_BAD_MGMT_KEYIDX) \
R(RX_DROP_U_UNKNOWN_ACTION_REJECTED) \
+ R(RX_DROP_U_MESH_DS_BITS) \
+ R(RX_DROP_U_MESH_A3_MISMATCH) \
+ R(RX_DROP_U_MESH_NO_A4) \
+ R(RX_DROP_U_MESH_A4_MISMATCH) \
+ R(RX_DROP_U_MESH_UNEXP_DATA) \
+ R(RX_DROP_U_MESH_WRONG_ACTION) \
+ R(RX_DROP_U_MESH_UNEXP_MGMT) \
+ R(RX_DROP_U_SPURIOUS_NOTIF) \
+ R(RX_DROP_U_RUNT_DATA) \
+ R(RX_DROP_U_KEY_TAINTED) \
+ R(RX_DROP_U_UNPROTECTED) \
+ R(RX_DROP_U_MCAST_FRAGMENT) \
+ R(RX_DROP_U_DEFRAG_MISMATCH) \
+ R(RX_DROP_U_RUNT_MESH_DATA) \
+ /* 0x40 */ \
+ R(RX_DROP_U_MESH_NO_TTL) \
+ R(RX_DROP_U_MESH_RMC) \
+ R(RX_DROP_U_MESH_BAD_AE) \
+ R(RX_DROP_U_MESH_TTL_EXPIRED) \
+ R(RX_DROP_U_MESH_NOT_FORWARDING) \
+ R(RX_DROP_U_AMSDU_WITHOUT_DATA) \
+ R(RX_DROP_U_NULL_DATA) \
+ R(RX_DROP_U_UNEXPECTED_4ADDR) \
+ R(RX_DROP_U_PORT_CONTROL) \
+ R(RX_DROP_U_UNKNOWN_STA) \
+ R(RX_DROP_U_RUNT_BAR) \
+ R(RX_DROP_U_BAR_OUTSIDE_SESSION) \
+ R(RX_DROP_U_CTRL_FRAME) \
+ R(RX_DROP_U_RUNT_MGMT) \
+ R(RX_DROP_U_EXPECTED_MGMT) \
+ R(RX_DROP_U_NONBCAST_BEACON) \
+ /* 0x50 */ \
+ R(RX_DROP_U_MALFORMED_ACTION) \
+ R(RX_DROP_U_UNKNOWN_MCAST_ACTION) \
+ R(RX_DROP_U_UNEXPECTED_EXT_FRAME) \
+ R(RX_DROP_U_UNHANDLED_MGMT) \
+ R(RX_DROP_U_MCAST_DEAUTH) \
+ R(RX_DROP_U_UNHANDLED_DEAUTH) \
+ R(RX_DROP_U_MCAST_DISASSOC) \
+ R(RX_DROP_U_UNHANDLED_DISASSOC) \
+ R(RX_DROP_U_UNHANDLED_PREQ) \
+ R(RX_DROP_U_UNHANDLED_MGMT_STYPE) \
+ R(RX_DROP_U_NO_LINK) \
/* this line for the trailing \ - add before this */
/* having two enums allows for checking ieee80211_rx_result use with sparse */
@@ -85,7 +128,6 @@ enum ___mac80211_drop_reason {
enum mac80211_drop_reason {
RX_CONTINUE = (__force ieee80211_rx_result)___RX_CONTINUE,
RX_QUEUED = (__force ieee80211_rx_result)___RX_QUEUED,
- RX_DROP = (__force ieee80211_rx_result)___RX_DROP_UNUSABLE,
#define DEF(x) x = (__force ieee80211_rx_result)___ ## x,
MAC80211_DROP_REASONS_UNUSABLE(DEF)
#undef DEF
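For one entry of the list, the DEF() expansion above produces the following (an illustration of the existing macro, not new code); keeping the plain ___ values in a separate enum is what lets sparse type-check ieee80211_rx_result use:

RX_DROP_U_NO_LINK = (__force ieee80211_rx_result)___RX_DROP_U_NO_LINK,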
diff --git a/net/mac80211/eht.c b/net/mac80211/eht.c
index fd41046e3b68..75096b2195d2 100644
--- a/net/mac80211/eht.c
+++ b/net/mac80211/eht.c
@@ -5,6 +5,7 @@
* Copyright(c) 2021-2025 Intel Corporation
*/
+#include "driver-ops.h"
#include "ieee80211_i.h"
void
@@ -102,3 +103,177 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata,
ieee80211_sta_recalc_aggregates(&link_sta->sta->sta);
}
+
+static void
+ieee80211_send_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *req, int opt_len)
+{
+ int len = offsetofend(struct ieee80211_mgmt, u.action.u.eml_omn);
+ struct ieee80211_local *local = sdata->local;
+ struct ieee80211_mgmt *mgmt;
+ struct sk_buff *skb;
+
+ len += opt_len; /* optional len */
+ skb = dev_alloc_skb(local->tx_headroom + len);
+ if (!skb)
+ return;
+
+ skb_reserve(skb, local->tx_headroom);
+ mgmt = skb_put_zero(skb, len);
+ mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_ACTION);
+ memcpy(mgmt->da, req->sa, ETH_ALEN);
+ memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN);
+ memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN);
+
+ mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT;
+ mgmt->u.action.u.eml_omn.action_code =
+ WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF;
+ mgmt->u.action.u.eml_omn.dialog_token =
+ req->u.action.u.eml_omn.dialog_token;
+ mgmt->u.action.u.eml_omn.control = req->u.action.u.eml_omn.control &
+ ~(IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE |
+ IEEE80211_EML_CTRL_INDEV_COEX_ACT);
+ /* Copy optional fields from the received notification frame */
+ memcpy(mgmt->u.action.u.eml_omn.variable,
+ req->u.action.u.eml_omn.variable, opt_len);
+
+ ieee80211_tx_skb(sdata, skb);
+}
+
+void ieee80211_rx_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb)
+{
+ int len = offsetofend(struct ieee80211_mgmt, u.action.u.eml_omn);
+ enum nl80211_iftype type = ieee80211_vif_type_p2p(&sdata->vif);
+ struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb);
+ const struct wiphy_iftype_ext_capab *ift_ext_capa;
+ struct ieee80211_mgmt *mgmt = (void *)skb->data;
+ struct ieee80211_local *local = sdata->local;
+ u8 control = mgmt->u.action.u.eml_omn.control;
+ u8 *ptr = mgmt->u.action.u.eml_omn.variable;
+ struct ieee80211_eml_params eml_params = {
+ .link_id = status->link_id,
+ };
+ struct sta_info *sta;
+ int opt_len = 0;
+
+ if (!ieee80211_vif_is_mld(&sdata->vif))
+ return;
+
+ /* eMLSR and eMLMR can't be enabled at the same time */
+ if ((control & IEEE80211_EML_CTRL_EMLSR_MODE) &&
+ (control & IEEE80211_EML_CTRL_EMLMR_MODE))
+ return;
+
+ if ((control & IEEE80211_EML_CTRL_EMLMR_MODE) &&
+ (control & IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE))
+ return;
+
+ ift_ext_capa = cfg80211_get_iftype_ext_capa(local->hw.wiphy, type);
+ if (!ift_ext_capa)
+ return;
+
+ if (!status->link_valid)
+ return;
+
+ sta = sta_info_get_bss(sdata, mgmt->sa);
+ if (!sta)
+ return;
+
+ if (control & IEEE80211_EML_CTRL_EMLSR_MODE) {
+ u8 emlsr_param_update_len;
+
+ if (!(ift_ext_capa->eml_capabilities &
+ IEEE80211_EML_CAP_EMLSR_SUPP))
+ return;
+
+ opt_len += sizeof(__le16); /* eMLSR link_bitmap */
+ /* The eMLSR param update field is not part of the Notification frame
+ * sent by the AP to the client, so account for it separately.
+ */
+ emlsr_param_update_len =
+ !!(control & IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE);
+
+ if (skb->len < len + opt_len + emlsr_param_update_len)
+ return;
+
+ if (control & IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE) {
+ u8 pad_delay, trans_delay;
+
+ pad_delay = u8_get_bits(ptr[2],
+ IEEE80211_EML_EMLSR_PAD_DELAY);
+ if (pad_delay >
+ IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US)
+ return;
+
+ trans_delay = u8_get_bits(ptr[2],
+ IEEE80211_EML_EMLSR_TRANS_DELAY);
+ if (trans_delay >
+ IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US)
+ return;
+
+ /* Update sta padding and transition delay */
+ sta->sta.eml_cap =
+ u8_replace_bits(sta->sta.eml_cap,
+ pad_delay,
+ IEEE80211_EML_CAP_EMLSR_PADDING_DELAY);
+ sta->sta.eml_cap =
+ u8_replace_bits(sta->sta.eml_cap,
+ trans_delay,
+ IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY);
+ }
+ }
+
+ if (control & IEEE80211_EML_CTRL_EMLMR_MODE) {
+ u8 mcs_map_size;
+ int i;
+
+ if (!(ift_ext_capa->eml_capabilities &
+ IEEE80211_EML_CAP_EMLMR_SUPPORT))
+ return;
+
+ opt_len += sizeof(__le16); /* eMLMR link_bitmap */
+ opt_len++; /* eMLMR mcs_map_count */
+ if (skb->len < len + opt_len)
+ return;
+
+ eml_params.emlmr_mcs_map_count = ptr[2];
+ if (eml_params.emlmr_mcs_map_count > 2)
+ return;
+
+ mcs_map_size = 3 * (1 + eml_params.emlmr_mcs_map_count);
+ opt_len += mcs_map_size;
+ if (skb->len < len + opt_len)
+ return;
+
+ for (i = 0; i < mcs_map_size; i++) {
+ u8 rx_mcs, tx_mcs;
+
+ rx_mcs = u8_get_bits(ptr[3 + i],
+ IEEE80211_EML_EMLMR_RX_MCS_MAP);
+ if (rx_mcs > 8)
+ return;
+
+ tx_mcs = u8_get_bits(ptr[3 + i],
+ IEEE80211_EML_EMLMR_TX_MCS_MAP);
+ if (tx_mcs > 8)
+ return;
+ }
+
+ memcpy(eml_params.emlmr_mcs_map_bw, &ptr[3], mcs_map_size);
+ }
+
+ if ((control & IEEE80211_EML_CTRL_EMLSR_MODE) ||
+ (control & IEEE80211_EML_CTRL_EMLMR_MODE)) {
+ eml_params.link_bitmap = get_unaligned_le16(ptr);
+ if ((eml_params.link_bitmap & sdata->vif.active_links) !=
+ eml_params.link_bitmap)
+ return;
+ }
+
+ if (drv_set_eml_op_mode(sdata, &sta->sta, &eml_params))
+ return;
+
+ ieee80211_send_eml_op_mode_notif(sdata, mgmt, opt_len);
+}
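As parsed above, the optional fields after the fixed EML OMN header lay out as follows (a summary of the offsets the code reads, not a quote from the spec):

/*
 * ptr[0..1]  EMLSR/EMLMR link bitmap, little endian
 *            (read via get_unaligned_le16(ptr))
 * ptr[2]     EMLSR: padding/transition delay (param update only)
 *            EMLMR: MCS map count (0..2 accepted)
 * ptr[3..]   EMLMR: 3 bytes of RX/TX MCS map per map,
 *            3 * (1 + mcs_map_count) bytes total
 */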
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index bd573f8e61fb..e60b814dd89e 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -5,7 +5,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2015 Intel Mobile Communications GmbH
- * Copyright (C) 2018-2025 Intel Corporation
+ * Copyright (C) 2018-2026 Intel Corporation
*/
#ifndef IEEE80211_I_H
@@ -394,9 +394,10 @@ enum ieee80211_conn_mode {
IEEE80211_CONN_MODE_VHT,
IEEE80211_CONN_MODE_HE,
IEEE80211_CONN_MODE_EHT,
+ IEEE80211_CONN_MODE_UHR,
};
-#define IEEE80211_CONN_MODE_HIGHEST IEEE80211_CONN_MODE_EHT
+#define IEEE80211_CONN_MODE_HIGHEST IEEE80211_CONN_MODE_UHR
enum ieee80211_conn_bw_limit {
IEEE80211_CONN_BW_LIMIT_20,
@@ -430,7 +431,7 @@ struct ieee80211_mgd_auth_data {
u8 ap_addr[ETH_ALEN] __aligned(2);
- u16 sae_trans, sae_status;
+ u16 trans, status;
size_t data_len;
u8 data[];
};
@@ -1099,7 +1100,7 @@ struct ieee80211_link_data {
int ap_power_level; /* in dBm */
bool radar_required;
- struct wiphy_delayed_work dfs_cac_timer_work;
+ struct wiphy_hrtimer_work dfs_cac_timer_work;
union {
struct ieee80211_link_data_managed mgd;
@@ -1824,6 +1825,8 @@ struct ieee802_11_elems {
const struct ieee80211_multi_link_elem *ml_epcs;
const struct ieee80211_bandwidth_indication *bandwidth_indication;
const struct ieee80211_ttlm_elem *ttlm[IEEE80211_TTLM_MAX_CNT];
+ const struct ieee80211_uhr_cap *uhr_cap;
+ const struct ieee80211_uhr_operation *uhr_operation;
/* note: the order in the psd values is per element, not per chandef */
struct ieee80211_parsed_tpe tpe;
@@ -1848,6 +1851,8 @@ struct ieee802_11_elems {
u8 country_elem_len;
u8 bssid_index_len;
u8 eht_cap_len;
+ u8 uhr_cap_len;
+ u8 uhr_operation_len;
/* multi-link element can be de-fragmented and thus u8 is not sufficient */
size_t ml_basic_len;
@@ -2391,6 +2396,14 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb, int tid, int link_id,
enum nl80211_band band);
+static inline bool ieee80211_require_encrypted_assoc(__le16 fc,
+ struct sta_info *sta)
+{
+ return (sta && sta->sta.epp_peer &&
+ (ieee80211_is_assoc_req(fc) || ieee80211_is_reassoc_req(fc) ||
+ ieee80211_is_assoc_resp(fc) || ieee80211_is_reassoc_resp(fc)));
+}
+
/* sta_out needs to be checked for ERR_PTR() before using */
int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata,
struct sk_buff *skb,
@@ -2658,8 +2671,7 @@ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata);
u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef);
u8 *ieee80211_ie_build_eht_oper(u8 *pos, const struct cfg80211_chan_def *chandef,
const struct ieee80211_sta_eht_cap *eht_cap);
-int ieee80211_parse_bitrates(enum nl80211_chan_width width,
- const struct ieee80211_supported_band *sband,
+int ieee80211_parse_bitrates(const struct ieee80211_supported_band *sband,
const u8 *srates, int srates_len, u32 *rates);
u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo);
void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata,
@@ -2684,6 +2696,9 @@ int ieee80211_put_eht_cap(struct sk_buff *skb,
struct ieee80211_sub_if_data *sdata,
const struct ieee80211_supported_band *sband,
const struct ieee80211_conn_settings *conn);
+int ieee80211_put_uhr_cap(struct sk_buff *skb,
+ struct ieee80211_sub_if_data *sdata,
+ const struct ieee80211_supported_band *sband);
int ieee80211_put_reg_conn(struct sk_buff *skb,
enum ieee80211_channel_flags flags);
@@ -2828,6 +2843,8 @@ void ieee80211_destroy_frag_cache(struct ieee80211_fragment_cache *cache);
u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata);
+void ieee80211_rx_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata,
+ struct sk_buff *skb);
void
ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata,
struct ieee80211_supported_band *sband,
@@ -2859,6 +2876,13 @@ void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata,
struct ieee80211_mgmt *mgmt, size_t len);
void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata);
+void
+ieee80211_uhr_cap_ie_to_sta_uhr_cap(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ const struct ieee80211_uhr_cap *uhr_cap,
+ u8 uhr_cap_len,
+ struct link_sta_info *link_sta);
+
#if IS_ENABLED(CONFIG_MAC80211_KUNIT_TEST)
#define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym)
#define VISIBLE_IF_MAC80211_KUNIT
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 515384ca2f8f..676b2a43c9f2 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -8,7 +8,7 @@
* Copyright 2008, Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (c) 2016 Intel Deutschland GmbH
- * Copyright (C) 2018-2025 Intel Corporation
+ * Copyright (C) 2018-2026 Intel Corporation
*/
#include <linux/slab.h>
#include <linux/kernel.h>
@@ -565,7 +565,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.csa.finalize_work);
wiphy_work_cancel(local->hw.wiphy,
&sdata->deflink.color_change_finalize_work);
- wiphy_delayed_work_cancel(local->hw.wiphy,
+ wiphy_hrtimer_work_cancel(local->hw.wiphy,
&sdata->deflink.dfs_cac_timer_work);
if (sdata->wdev.links[0].cac_started) {
@@ -1668,7 +1668,15 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local,
}
} else if (ieee80211_is_action(mgmt->frame_control) &&
mgmt->u.action.category == WLAN_CATEGORY_PROTECTED_EHT) {
- if (sdata->vif.type == NL80211_IFTYPE_STATION) {
+ if (sdata->vif.type == NL80211_IFTYPE_AP) {
+ switch (mgmt->u.action.u.eml_omn.action_code) {
+ case WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF:
+ ieee80211_rx_eml_op_mode_notif(sdata, skb);
+ break;
+ default:
+ break;
+ }
+ } else if (sdata->vif.type == NL80211_IFTYPE_STATION) {
switch (mgmt->u.action.u.ttlm_req.action_code) {
case WLAN_PROTECTED_EHT_ACTION_TTLM_REQ:
ieee80211_process_neg_ttlm_req(sdata, mgmt,
@@ -1793,7 +1801,7 @@ static void ieee80211_iface_work(struct wiphy *wiphy, struct wiphy_work *work)
else
ieee80211_iface_process_skb(local, sdata, skb);
- kfree_skb(skb);
+ consume_skb(skb);
kcov_remote_stop();
}
@@ -1802,7 +1810,7 @@ static void ieee80211_iface_work(struct wiphy *wiphy, struct wiphy_work *work)
kcov_remote_start_common(skb_get_kcov_handle(skb));
ieee80211_iface_process_status(sdata, skb);
- kfree_skb(skb);
+ consume_skb(skb);
kcov_remote_stop();
}
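The kfree_skb() -> consume_skb() changes here are semantic, not cosmetic: consume_skb() marks the normal end of an skb's life, while kfree_skb() registers a drop with the skb drop tracepoint and drop monitor. Roughly:

if (handled)
        consume_skb(skb);       /* consumed normally, no drop reported */
else
        kfree_skb(skb);         /* counted as a drop by tracing tools */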
diff --git a/net/mac80211/link.c b/net/mac80211/link.c
index 1e05845872af..17bf55dabd31 100644
--- a/net/mac80211/link.c
+++ b/net/mac80211/link.c
@@ -116,7 +116,7 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata,
ieee80211_color_change_finalize_work);
wiphy_delayed_work_init(&link->color_collision_detect_work,
ieee80211_color_collision_detection_work);
- wiphy_delayed_work_init(&link->dfs_cac_timer_work,
+ wiphy_hrtimer_work_init(&link->dfs_cac_timer_work,
ieee80211_dfs_cac_timer_work);
if (!deflink) {
@@ -155,7 +155,7 @@ void ieee80211_link_stop(struct ieee80211_link_data *link)
&link->csa.finalize_work);
if (link->sdata->wdev.links[link->link_id].cac_started) {
- wiphy_delayed_work_cancel(link->sdata->local->hw.wiphy,
+ wiphy_hrtimer_work_cancel(link->sdata->local->hw.wiphy,
&link->dfs_cac_timer_work);
cfg80211_cac_event(link->sdata->dev,
&link->conf->chanreq.oper,
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index b05e313c7f17..bedc81956fbc 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -5,7 +5,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2025 Intel Corporation
+ * Copyright (C) 2018-2026 Intel Corporation
*/
#include <net/mac80211.h>
@@ -1123,7 +1123,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
int result, i;
enum nl80211_band band;
int channels, max_bitrates;
- bool supp_ht, supp_vht, supp_he, supp_eht, supp_s1g;
+ bool supp_ht, supp_vht, supp_he, supp_eht, supp_s1g, supp_uhr;
struct cfg80211_chan_def dflt_chandef = {};
if (ieee80211_hw_check(hw, QUEUE_CONTROL) &&
@@ -1237,6 +1237,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
supp_he = false;
supp_eht = false;
supp_s1g = false;
+ supp_uhr = false;
for (band = 0; band < NUM_NL80211_BANDS; band++) {
const struct ieee80211_sband_iftype_data *iftd;
struct ieee80211_supported_band *sband;
@@ -1293,6 +1294,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
supp_he = supp_he || iftd->he_cap.has_he;
supp_eht = supp_eht || iftd->eht_cap.has_eht;
+ supp_uhr = supp_uhr || iftd->uhr_cap.has_uhr;
if (band == NL80211_BAND_2GHZ)
he_40_mhz_cap =
@@ -1325,6 +1327,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
if (WARN_ON(supp_eht && !supp_he))
return -EINVAL;
+ /* UHR requires EHT support */
+ if (WARN_ON(supp_uhr && !supp_eht))
+ return -EINVAL;
+
if (!sband->ht_cap.ht_supported)
continue;
@@ -1437,6 +1443,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
IEEE80211_EHT_PPE_THRES_MAX_LEN;
}
+ if (supp_uhr)
+ local->scan_ies_len +=
+ 3 + sizeof(struct ieee80211_uhr_cap) +
+ sizeof(struct ieee80211_uhr_cap_phy);
+
if (!local->ops->hw_scan) {
/* For hw_scan, driver needs to set these up. */
local->hw.wiphy->max_scan_ssids = 4;
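The "3 +" in the scan IE budget above covers the header of an extended element, followed by the fixed capability structures. As accounted in the hunk:

/* UHR capability element budget:
 *   1 (Element ID) + 1 (Length) + 1 (Element ID Extension)
 *   + sizeof(struct ieee80211_uhr_cap)
 *   + sizeof(struct ieee80211_uhr_cap_phy)
 */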
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index 73f57b9e0ebf..e83582b2c377 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -162,6 +162,7 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata,
const struct ieee80211_vht_operation *vht_oper = elems->vht_operation;
const struct ieee80211_he_operation *he_oper = elems->he_operation;
const struct ieee80211_eht_operation *eht_oper = elems->eht_operation;
+ const struct ieee80211_uhr_operation *uhr_oper = elems->uhr_operation;
struct ieee80211_supported_band *sband =
sdata->local->hw.wiphy->bands[channel->band];
struct cfg80211_chan_def vht_chandef;
@@ -192,7 +193,7 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata,
/* get special 6 GHz case out of the way */
if (sband->band == NL80211_BAND_6GHZ) {
- enum ieee80211_conn_mode mode = IEEE80211_CONN_MODE_EHT;
+ enum ieee80211_conn_mode mode = IEEE80211_CONN_MODE_HIGHEST;
/* this is an error */
if (conn->mode < IEEE80211_CONN_MODE_HE)
@@ -215,7 +216,9 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata,
return IEEE80211_CONN_MODE_LEGACY;
}
- return mode;
+ if (mode <= IEEE80211_CONN_MODE_EHT)
+ return mode;
+ goto check_uhr;
}
/* now we have the progression HT, VHT, ... */
@@ -340,7 +343,63 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata,
*chandef = eht_chandef;
}
- return IEEE80211_CONN_MODE_EHT;
+check_uhr:
+ if (conn->mode < IEEE80211_CONN_MODE_UHR || !uhr_oper)
+ return IEEE80211_CONN_MODE_EHT;
+
+ /*
+ * In beacons we don't have all the data - but we know the size was OK,
+ * so if the size is also valid for the non-beacon case, the extra
+ * data is present and we can validate the NPCA parameters.
+ */
+ if (ieee80211_uhr_oper_size_ok((const void *)uhr_oper,
+ elems->uhr_operation_len,
+ false)) {
+ struct cfg80211_chan_def npca_chandef = *chandef;
+ const struct ieee80211_uhr_npca_info *npca;
+ const __le16 *dis_subch_bmap;
+ u16 punct = chandef->punctured, npca_punct;
+
+ npca = ieee80211_uhr_npca_info(uhr_oper);
+ if (npca) {
+ int width = cfg80211_chandef_get_width(chandef);
+ u8 offs = le32_get_bits(npca->params,
+ IEEE80211_UHR_NPCA_PARAMS_PRIMARY_CHAN_OFFS);
+ u32 cf1 = chandef->center_freq1;
+ bool pri_upper, npca_upper;
+
+ pri_upper = chandef->chan->center_freq > cf1;
+ npca_upper = 20 * offs >= width / 2;
+
+ if (20 * offs >= cfg80211_chandef_get_width(chandef) ||
+ pri_upper == npca_upper) {
+ sdata_info(sdata,
+ "AP UHR NPCA primary channel invalid, disabling UHR\n");
+ return IEEE80211_CONN_MODE_EHT;
+ }
+ }
+
+ dis_subch_bmap = ieee80211_uhr_npca_dis_subch_bitmap(uhr_oper);
+
+ if (dis_subch_bmap) {
+ npca_punct = get_unaligned_le16(dis_subch_bmap);
+ npca_chandef.punctured = npca_punct;
+ }
+
+ /*
+ * The bitmap must be a valid puncturing pattern for this channel,
+ * and must also puncture every subchannel that is already in the
+ * primary channel's disabled subchannel bitmap.
+ */
+ if (!cfg80211_chandef_valid(&npca_chandef) ||
+ ((punct & npca_punct) != punct)) {
+ sdata_info(sdata,
+ "AP UHR NPCA disabled subchannel bitmap invalid, disabling UHR\n");
+ return IEEE80211_CONN_MODE_EHT;
+ }
+ }
+
+ return IEEE80211_CONN_MODE_UHR;
}
static bool
@@ -1091,6 +1150,7 @@ again:
IEEE80211_CONN_BW_LIMIT_160);
break;
case IEEE80211_CONN_MODE_EHT:
+ case IEEE80211_CONN_MODE_UHR:
conn->bw_limit = min_t(enum ieee80211_conn_bw_limit,
conn->bw_limit,
IEEE80211_CONN_BW_LIMIT_320);
@@ -1108,6 +1168,8 @@ again:
set_bit(BSS_MEMBERSHIP_SELECTOR_HE_PHY, sta_selectors);
if (conn->mode >= IEEE80211_CONN_MODE_EHT)
set_bit(BSS_MEMBERSHIP_SELECTOR_EHT_PHY, sta_selectors);
+ if (conn->mode >= IEEE80211_CONN_MODE_UHR)
+ set_bit(BSS_MEMBERSHIP_SELECTOR_UHR_PHY, sta_selectors);
/*
* We do not support EPD or GLK so never add them.
@@ -1155,6 +1217,11 @@ again:
IEEE80211_CONN_BW_LIMIT_160);
}
+ if (conn->mode >= IEEE80211_CONN_MODE_UHR &&
+ !cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper,
+ IEEE80211_CHAN_NO_UHR))
+ conn->mode = IEEE80211_CONN_MODE_EHT;
+
if (chanreq->oper.width != ap_chandef->width || ap_mode != conn->mode)
link_id_info(sdata, link_id,
"regulatory prevented using AP config, downgraded\n");
@@ -1548,7 +1615,7 @@ static void ieee80211_assoc_add_rates(struct ieee80211_local *local,
* in the association request (e.g. D-Link DAP 1353 in
* b-only mode)...
*/
- ieee80211_parse_bitrates(width, sband,
+ ieee80211_parse_bitrates(sband,
assoc_data->supp_rates,
assoc_data->supp_rates_len,
&rates);
@@ -1884,11 +1951,13 @@ ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata,
/*
* careful - need to know about all the present elems before
- * calling ieee80211_assoc_add_ml_elem(), so add this one if
- * we're going to put it after the ML element
+ * calling ieee80211_assoc_add_ml_elem(), so add these if
+ * we're going to put them after the ML element
*/
if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_EHT)
ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_EHT_CAPABILITY);
+ if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_UHR)
+ ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_UHR_CAPA);
if (link_id == assoc_data->assoc_link_id)
ieee80211_assoc_add_ml_elem(sdata, skb, orig_capab, ext_capa,
@@ -1901,6 +1970,9 @@ ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata,
ieee80211_put_eht_cap(skb, sdata, sband,
&assoc_data->link[link_id].conn);
+ if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_UHR)
+ ieee80211_put_uhr_cap(skb, sdata, sband);
+
if (sband->band == NL80211_BAND_S1GHZ) {
ieee80211_add_aid_request_ie(sdata, skb);
ieee80211_add_s1g_capab_ie(sdata, &sband->s1g_cap, skb);
@@ -2135,6 +2207,9 @@ ieee80211_link_common_elems_size(struct ieee80211_sub_if_data *sdata,
sizeof(struct ieee80211_eht_mcs_nss_supp) +
IEEE80211_EHT_PPE_THRES_MAX_LEN;
+ size += 2 + 1 + sizeof(struct ieee80211_uhr_cap) +
+ sizeof(struct ieee80211_uhr_cap_phy);
+
return size;
}
@@ -2155,6 +2230,8 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
struct ieee80211_prep_tx_info info = {};
unsigned int link_id, n_links = 0;
u16 present_elems[PRESENT_ELEMS_MAX] = {};
+ struct sta_info *sta;
+ bool assoc_encrypt;
void *capab_pos;
size_t size;
int ret;
@@ -2335,7 +2412,15 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
info.link_id = assoc_data->assoc_link_id;
drv_mgd_prepare_tx(local, sdata, &info);
- IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+ sta = sta_info_get_bss(sdata, sdata->vif.cfg.ap_addr);
+
+ assoc_encrypt = sta && sta->sta.epp_peer &&
+ wiphy_dereference(sdata->local->hw.wiphy,
+ sta->ptk[sta->ptk_idx]);
+
+ if (!assoc_encrypt)
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT;
+
if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS |
IEEE80211_TX_INTFL_MLME_CONN_TX;
@@ -4911,6 +4996,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata,
case WLAN_AUTH_FILS_SK:
case WLAN_AUTH_FILS_SK_PFS:
case WLAN_AUTH_FILS_PK:
+ case WLAN_AUTH_EPPKE:
break;
case WLAN_AUTH_SHARED_KEY:
if (ifmgd->auth_data->expected_transaction != 4) {
@@ -5520,6 +5606,18 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link,
bss_conf->epcs_support = false;
}
+ if (elems->uhr_operation && elems->uhr_cap &&
+ link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_UHR) {
+ ieee80211_uhr_cap_ie_to_sta_uhr_cap(sdata, sband,
+ elems->uhr_cap,
+ elems->uhr_cap_len,
+ link_sta);
+
+ bss_conf->uhr_support = link_sta->pub->uhr_cap.has_uhr;
+ } else {
+ bss_conf->uhr_support = false;
+ }
+
if (elems->s1g_oper &&
link->u.mgd.conn.mode == IEEE80211_CONN_MODE_S1G &&
elems->s1g_capab)
@@ -5810,6 +5908,7 @@ ieee80211_determine_our_sta_mode(struct ieee80211_sub_if_data *sdata,
bool is_6ghz = sband->band == NL80211_BAND_6GHZ;
const struct ieee80211_sta_he_cap *he_cap;
const struct ieee80211_sta_eht_cap *eht_cap;
+ const struct ieee80211_sta_uhr_cap *uhr_cap;
struct ieee80211_sta_vht_cap vht_cap;
if (sband->band == NL80211_BAND_S1GHZ) {
@@ -5985,9 +6084,6 @@ ieee80211_determine_our_sta_mode(struct ieee80211_sub_if_data *sdata,
"no EHT support, limiting to HE\n");
goto out;
}
-
- /* we have EHT */
-
conn->mode = IEEE80211_CONN_MODE_EHT;
/* check bandwidth */
@@ -5998,6 +6094,20 @@ ieee80211_determine_our_sta_mode(struct ieee80211_sub_if_data *sdata,
mlme_link_id_dbg(sdata, link_id,
"no EHT 320 MHz cap in 6 GHz, limiting to 160 MHz\n");
+ if (req && req->flags & ASSOC_REQ_DISABLE_UHR) {
+ mlme_link_id_dbg(sdata, link_id,
+ "UHR disabled by flag, limiting to EHT\n");
+ goto out;
+ }
+
+ uhr_cap = ieee80211_get_uhr_iftype_cap_vif(sband, &sdata->vif);
+ if (!uhr_cap) {
+ mlme_link_id_dbg(sdata, link_id,
+ "no UHR support, limiting to EHT\n");
+ goto out;
+ }
+ conn->mode = IEEE80211_CONN_MODE_UHR;
+
out:
mlme_link_id_dbg(sdata, link_id,
"determined local STA to be %s, BW limited to %d MHz\n",
@@ -8307,6 +8417,12 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata)
if (WARN_ON_ONCE(!auth_data))
return -EINVAL;
+ if (auth_data->algorithm == WLAN_AUTH_EPPKE &&
+ ieee80211_vif_is_mld(&sdata->vif) &&
+ !cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK,
+ auth_data->data, auth_data->data_len))
+ return -EINVAL;
+
auth_data->tries++;
if (auth_data->tries > IEEE80211_AUTH_MAX_TRIES) {
@@ -8335,9 +8451,12 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata)
auth_data->expected_transaction = 2;
if (auth_data->algorithm == WLAN_AUTH_SAE) {
- trans = auth_data->sae_trans;
- status = auth_data->sae_status;
+ trans = auth_data->trans;
+ status = auth_data->status;
auth_data->expected_transaction = trans;
+ } else if (auth_data->algorithm == WLAN_AUTH_EPPKE) {
+ trans = auth_data->trans;
+ status = auth_data->status;
}
if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS))
@@ -8994,6 +9113,10 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata,
goto out_err;
}
+ if (ifmgd->auth_data &&
+ ifmgd->auth_data->algorithm == WLAN_AUTH_EPPKE)
+ new_sta->sta.epp_peer = true;
+
new_sta->sta.mlo = mlo;
}
@@ -9248,6 +9371,9 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
case NL80211_AUTHTYPE_FILS_PK:
auth_alg = WLAN_AUTH_FILS_PK;
break;
+ case NL80211_AUTHTYPE_EPPKE:
+ auth_alg = WLAN_AUTH_EPPKE;
+ break;
default:
return -EOPNOTSUPP;
}
@@ -9272,12 +9398,14 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
auth_data->link_id = req->link_id;
if (req->auth_data_len >= 4) {
- if (req->auth_type == NL80211_AUTHTYPE_SAE) {
+ if (req->auth_type == NL80211_AUTHTYPE_SAE ||
+ req->auth_type == NL80211_AUTHTYPE_EPPKE) {
__le16 *pos = (__le16 *) req->auth_data;
- auth_data->sae_trans = le16_to_cpu(pos[0]);
- auth_data->sae_status = le16_to_cpu(pos[1]);
+ auth_data->trans = le16_to_cpu(pos[0]);
+ auth_data->status = le16_to_cpu(pos[1]);
}
+
memcpy(auth_data->data, req->auth_data + 4,
req->auth_data_len - 4);
auth_data->data_len += req->auth_data_len - 4;
@@ -9328,7 +9456,11 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
* out SAE Confirm.
*/
if (cont_auth && req->auth_type == NL80211_AUTHTYPE_SAE &&
- auth_data->peer_confirmed && auth_data->sae_trans == 2)
+ auth_data->peer_confirmed && auth_data->trans == 2)
+ ieee80211_mark_sta_auth(sdata);
+
+ if (cont_auth && req->auth_type == NL80211_AUTHTYPE_EPPKE &&
+ auth_data->trans == 3)
ieee80211_mark_sta_auth(sdata);
if (ifmgd->associated) {
diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c
index bfc4ecb7a048..8260f6bdd5b2 100644
--- a/net/mac80211/parse.c
+++ b/net/mac80211/parse.c
@@ -6,7 +6,7 @@
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2025 Intel Corporation
+ * Copyright (C) 2018-2026 Intel Corporation
*
* element parsing for mac80211
*/
@@ -189,6 +189,26 @@ ieee80211_parse_extension_element(u32 *crc,
elems->ttlm_num++;
}
break;
+ case WLAN_EID_EXT_UHR_OPER:
+ if (params->mode < IEEE80211_CONN_MODE_UHR)
+ break;
+ calc_crc = true;
+ if (ieee80211_uhr_oper_size_ok(data, len,
+ params->type == (IEEE80211_FTYPE_MGMT |
+ IEEE80211_STYPE_BEACON))) {
+ elems->uhr_operation = data;
+ elems->uhr_operation_len = len;
+ }
+ break;
+ case WLAN_EID_EXT_UHR_CAPA:
+ if (params->mode < IEEE80211_CONN_MODE_UHR)
+ break;
+ calc_crc = true;
+ if (ieee80211_uhr_capa_size_ok(data, len, true)) {
+ elems->uhr_cap = data;
+ elems->uhr_cap_len = len;
+ }
+ break;
}
if (crc && calc_crc)
@@ -1115,8 +1135,7 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params)
}
EXPORT_SYMBOL_IF_KUNIT(ieee802_11_parse_elems_full);
-int ieee80211_parse_bitrates(enum nl80211_chan_width width,
- const struct ieee80211_supported_band *sband,
+int ieee80211_parse_bitrates(const struct ieee80211_supported_band *sband,
const u8 *srates, int srates_len, u32 *rates)
{
struct ieee80211_rate *br;
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index e0ccd9749853..11d6c56c9d7e 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -6,7 +6,7 @@
* Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright(c) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2025 Intel Corporation
+ * Copyright (C) 2018-2026 Intel Corporation
*/
#include <linux/jiffies.h>
@@ -1137,14 +1137,14 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx)
if (is_multicast_ether_addr(hdr->addr1)) {
if (ieee80211_has_tods(hdr->frame_control) ||
!ieee80211_has_fromds(hdr->frame_control))
- return RX_DROP;
+ return RX_DROP_U_MESH_DS_BITS;
if (ether_addr_equal(hdr->addr3, dev_addr))
- return RX_DROP;
+ return RX_DROP_U_MESH_A3_MISMATCH;
} else {
if (!ieee80211_has_a4(hdr->frame_control))
- return RX_DROP;
+ return RX_DROP_U_MESH_NO_A4;
if (ether_addr_equal(hdr->addr4, dev_addr))
- return RX_DROP;
+ return RX_DROP_U_MESH_A4_MISMATCH;
}
}
@@ -1156,20 +1156,20 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx)
struct ieee80211_mgmt *mgmt;
if (!ieee80211_is_mgmt(hdr->frame_control))
- return RX_DROP;
+ return RX_DROP_U_MESH_UNEXP_DATA;
if (ieee80211_is_action(hdr->frame_control)) {
u8 category;
/* make sure category field is present */
if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE)
- return RX_DROP;
+ return RX_DROP_U_RUNT_ACTION;
mgmt = (struct ieee80211_mgmt *)hdr;
category = mgmt->u.action.category;
if (category != WLAN_CATEGORY_MESH_ACTION &&
category != WLAN_CATEGORY_SELF_PROTECTED)
- return RX_DROP;
+ return RX_DROP_U_MESH_WRONG_ACTION;
return RX_CONTINUE;
}
@@ -1179,7 +1179,7 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx)
ieee80211_is_auth(hdr->frame_control))
return RX_CONTINUE;
- return RX_DROP;
+ return RX_DROP_U_MESH_UNEXP_MGMT;
}
return RX_CONTINUE;
@@ -1605,7 +1605,7 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
hdrlen = ieee80211_hdrlen(hdr->frame_control);
if (rx->skb->len < hdrlen + 8)
- return RX_DROP;
+ return RX_DROP_U_RUNT_DATA;
skb_copy_bits(rx->skb, hdrlen + 6, &ethertype, 2);
if (ethertype == rx->sdata->control_port_protocol)
@@ -1615,9 +1615,9 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx)
if (rx->sdata->vif.type == NL80211_IFTYPE_AP &&
cfg80211_rx_spurious_frame(rx->sdata->dev, hdr->addr2,
rx->link_id, GFP_ATOMIC))
- return RX_DROP_U_SPURIOUS;
+ return RX_DROP_U_SPURIOUS_NOTIF;
- return RX_DROP;
+ return RX_DROP_U_SPURIOUS;
}
return RX_CONTINUE;
@@ -1880,7 +1880,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx)
link_sta->rx_stats.fragments++;
u64_stats_update_begin(&link_sta->rx_stats.syncp);
- link_sta->rx_stats.bytes += rx->skb->len;
+ u64_stats_add(&link_sta->rx_stats.bytes, rx->skb->len);
u64_stats_update_end(&link_sta->rx_stats.syncp);
if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
@@ -2106,7 +2106,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
if (rx->link_sta) {
if (ieee80211_is_group_privacy_action(skb) &&
test_sta_flag(rx->sta, WLAN_STA_MFP))
- return RX_DROP;
+ return RX_DROP_U_UNPROTECTED;
rx->key = rcu_dereference(rx->link_sta->gtk[mmie_keyidx]);
}
@@ -2191,11 +2191,11 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx)
if (rx->key) {
if (unlikely(rx->key->flags & KEY_FLAG_TAINTED))
- return RX_DROP;
+ return RX_DROP_U_KEY_TAINTED;
/* TODO: add threshold stuff again */
} else {
- return RX_DROP;
+ return RX_DROP_U_UNPROTECTED;
}
switch (rx->key->conf.cipher) {
@@ -2371,7 +2371,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
goto out;
if (is_multicast_ether_addr(hdr->addr1))
- return RX_DROP;
+ return RX_DROP_U_MCAST_FRAGMENT;
I802_DEBUG_INC(rx->local->rx_handlers_fragments);
@@ -2426,7 +2426,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
rx->seqno_idx, hdr);
if (!entry) {
I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag);
- return RX_DROP;
+ return RX_DROP_U_DEFRAG_MISMATCH;
}
/* "The receiver shall discard MSDUs and MMPDUs whose constituent
@@ -2609,6 +2609,14 @@ ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx)
(!rx->sta || !test_sta_flag(rx->sta, WLAN_STA_ASSOC)))
return RX_DROP_U_UNPROT_ROBUST_ACTION;
+ /*
+ * Drop unprotected (Re)Association Request/Response frames
+ * received from an EPP peer.
+ */
+ if (!ieee80211_has_protected(fc) &&
+ ieee80211_require_encrypted_assoc(fc, rx->sta))
+ return RX_DROP_U_UNPROT_UCAST_MGMT;
+
return RX_CONTINUE;
}
EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_drop_unencrypted_mgmt);
@@ -2777,7 +2785,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx)
* frame, so count MSDUs.
*/
u64_stats_update_begin(&rx->link_sta->rx_stats.syncp);
- rx->link_sta->rx_stats.msdu[rx->seqno_idx]++;
+ u64_stats_inc(&rx->link_sta->rx_stats.msdu[rx->seqno_idx]);
u64_stats_update_end(&rx->link_sta->rx_stats.syncp);
}
@@ -2948,25 +2956,25 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta
return RX_CONTINUE;
if (!pskb_may_pull(skb, sizeof(*eth) + 6))
- return RX_DROP;
+ return RX_DROP_U_RUNT_MESH_DATA;
mesh_hdr = (struct ieee80211s_hdr *)(skb->data + sizeof(*eth));
mesh_hdrlen = ieee80211_get_mesh_hdrlen(mesh_hdr);
if (!pskb_may_pull(skb, sizeof(*eth) + mesh_hdrlen))
- return RX_DROP;
+ return RX_DROP_U_RUNT_MESH_DATA;
eth = (struct ethhdr *)skb->data;
multicast = is_multicast_ether_addr(eth->h_dest);
mesh_hdr = (struct ieee80211s_hdr *)(eth + 1);
if (!mesh_hdr->ttl)
- return RX_DROP;
+ return RX_DROP_U_MESH_NO_TTL;
/* frame is in RMC, don't forward */
if (is_multicast_ether_addr(eth->h_dest) &&
mesh_rmc_check(sdata, eth->h_source, mesh_hdr))
- return RX_DROP;
+ return RX_DROP_U_MESH_RMC;
/* forward packet */
if (sdata->crypto_tx_tailroom_needed_cnt)
@@ -2983,7 +2991,7 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta
/* has_a4 already checked in ieee80211_rx_mesh_check */
proxied_addr = mesh_hdr->eaddr2;
else
- return RX_DROP;
+ return RX_DROP_U_MESH_BAD_AE;
rcu_read_lock();
mppath = mpp_path_lookup(sdata, proxied_addr);
@@ -3015,14 +3023,14 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta
goto rx_accept;
IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_ttl);
- return RX_DROP;
+ return RX_DROP_U_MESH_TTL_EXPIRED;
}
if (!ifmsh->mshcfg.dot11MeshForwarding) {
if (is_multicast_ether_addr(eth->h_dest))
goto rx_accept;
- return RX_DROP;
+ return RX_DROP_U_MESH_NOT_FORWARDING;
}
skb_set_queue_mapping(skb, ieee802_1d_to_ac[skb->priority]);
@@ -3208,7 +3216,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
return RX_CONTINUE;
if (unlikely(!ieee80211_is_data_present(fc)))
- return RX_DROP;
+ return RX_DROP_U_AMSDU_WITHOUT_DATA;
if (unlikely(ieee80211_has_a4(hdr->frame_control))) {
switch (rx->sdata->vif.type) {
@@ -3265,7 +3273,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
return RX_CONTINUE;
if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
- return RX_DROP;
+ return RX_DROP_U_NULL_DATA;
/* Send unexpected-4addr-frame event to hostapd */
if (ieee80211_has_a4(hdr->frame_control) &&
@@ -3275,7 +3283,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
cfg80211_rx_unexpected_4addr_frame(
rx->sdata->dev, rx->sta->sta.addr, rx->link_id,
GFP_ATOMIC);
- return RX_DROP;
+ return RX_DROP_U_UNEXPECTED_4ADDR;
}
res = __ieee80211_data_to_8023(rx, &port_control);
@@ -3287,7 +3295,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx)
return res;
if (!ieee80211_frame_allowed(rx, fc))
- return RX_DROP;
+ return RX_DROP_U_PORT_CONTROL;
/* directly handle TDLS channel switch requests/responses */
if (unlikely(((struct ethhdr *)rx->skb->data)->h_proto ==
@@ -3352,11 +3360,11 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
};
if (!rx->sta)
- return RX_DROP;
+ return RX_DROP_U_UNKNOWN_STA;
if (skb_copy_bits(skb, offsetof(struct ieee80211_bar, control),
&bar_data, sizeof(bar_data)))
- return RX_DROP;
+ return RX_DROP_U_RUNT_BAR;
tid = le16_to_cpu(bar_data.control) >> 12;
@@ -3368,7 +3376,7 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]);
if (!tid_agg_rx)
- return RX_DROP;
+ return RX_DROP_U_BAR_OUTSIDE_SESSION;
start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4;
event.u.ba.tid = tid;
@@ -3392,7 +3400,7 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames)
return RX_QUEUED;
}
- return RX_DROP;
+ return RX_DROP_U_CTRL_FRAME;
}
static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata,
@@ -3501,10 +3509,10 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
* and unknown (reserved) frames are useless.
*/
if (rx->skb->len < 24)
- return RX_DROP;
+ return RX_DROP_U_RUNT_MGMT;
if (!ieee80211_is_mgmt(mgmt->frame_control))
- return RX_DROP;
+ return RX_DROP_U_EXPECTED_MGMT;
/* drop too small action frames */
if (ieee80211_is_action(mgmt->frame_control) &&
@@ -3514,7 +3522,7 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx)
/* Drop non-broadcast Beacon frames */
if (ieee80211_is_beacon(mgmt->frame_control) &&
!is_broadcast_ether_addr(mgmt->da))
- return RX_DROP;
+ return RX_DROP_U_NONBCAST_BEACON;
if (rx->sdata->vif.type == NL80211_IFTYPE_AP &&
ieee80211_is_beacon(mgmt->frame_control) &&
@@ -3920,6 +3928,14 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
u.action.u.epcs))
goto invalid;
goto queue;
+ case WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF:
+ if (sdata->vif.type != NL80211_IFTYPE_AP)
+ break;
+
+ if (len < offsetofend(typeof(*mgmt),
+ u.action.u.eml_omn))
+ goto invalid;
+ goto queue;
default:
break;
}
@@ -4046,10 +4062,10 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx)
if (!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) &&
(sdata->vif.type == NL80211_IFTYPE_AP ||
sdata->vif.type == NL80211_IFTYPE_AP_VLAN))
- return RX_DROP;
+ return RX_DROP_U_MALFORMED_ACTION;
if (is_multicast_ether_addr(mgmt->da))
- return RX_DROP;
+ return RX_DROP_U_UNKNOWN_MCAST_ACTION;
/* do not return rejected action frames */
if (mgmt->u.action.category & 0x80)
@@ -4094,7 +4110,7 @@ ieee80211_rx_h_ext(struct ieee80211_rx_data *rx)
return RX_CONTINUE;
if (sdata->vif.type != NL80211_IFTYPE_STATION)
- return RX_DROP;
+ return RX_DROP_U_UNEXPECTED_EXT_FRAME;
/* for now only beacons are ext, so queue them */
ieee80211_queue_skb_to_iface(sdata, rx->link_id, rx->sta, rx->skb);
@@ -4115,7 +4131,7 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
sdata->vif.type != NL80211_IFTYPE_ADHOC &&
sdata->vif.type != NL80211_IFTYPE_OCB &&
sdata->vif.type != NL80211_IFTYPE_STATION)
- return RX_DROP;
+ return RX_DROP_U_UNHANDLED_MGMT;
switch (stype) {
case cpu_to_le16(IEEE80211_STYPE_AUTH):
@@ -4126,32 +4142,32 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx)
case cpu_to_le16(IEEE80211_STYPE_DEAUTH):
if (is_multicast_ether_addr(mgmt->da) &&
!is_broadcast_ether_addr(mgmt->da))
- return RX_DROP;
+ return RX_DROP_U_MCAST_DEAUTH;
/* process only for station/IBSS */
if (sdata->vif.type != NL80211_IFTYPE_STATION &&
sdata->vif.type != NL80211_IFTYPE_ADHOC)
- return RX_DROP;
+ return RX_DROP_U_UNHANDLED_DEAUTH;
break;
case cpu_to_le16(IEEE80211_STYPE_ASSOC_RESP):
case cpu_to_le16(IEEE80211_STYPE_REASSOC_RESP):
case cpu_to_le16(IEEE80211_STYPE_DISASSOC):
if (is_multicast_ether_addr(mgmt->da) &&
!is_broadcast_ether_addr(mgmt->da))
- return RX_DROP;
+ return RX_DROP_U_MCAST_DISASSOC;
/* process only for station */
if (sdata->vif.type != NL80211_IFTYPE_STATION)
- return RX_DROP;
+ return RX_DROP_U_UNHANDLED_DISASSOC;
break;
case cpu_to_le16(IEEE80211_STYPE_PROBE_REQ):
/* process only for ibss and mesh */
if (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
sdata->vif.type != NL80211_IFTYPE_MESH_POINT)
- return RX_DROP;
+ return RX_DROP_U_UNHANDLED_PREQ;
break;
default:
- return RX_DROP;
+ return RX_DROP_U_UNHANDLED_MGMT_STYPE;
}
ieee80211_queue_skb_to_iface(sdata, rx->link_id, rx->sta, rx->skb);
@@ -4179,7 +4195,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx,
static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
struct sk_buff_head *frames)
{
- ieee80211_rx_result res = RX_DROP;
+ ieee80211_rx_result res;
struct sk_buff *skb;
#define CALL_RXH(rxh) \
@@ -4205,8 +4221,10 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
*/
rx->skb = skb;
- if (WARN_ON_ONCE(!rx->link))
+ if (WARN_ON_ONCE(!rx->link)) {
+ res = RX_DROP_U_NO_LINK;
goto rxh_next;
+ }
CALL_RXH(ieee80211_rx_h_check_more_data);
CALL_RXH(ieee80211_rx_h_uapsd_and_pspoll);
@@ -4243,7 +4261,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx,
static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx)
{
struct sk_buff_head reorder_release;
- ieee80211_rx_result res = RX_DROP;
+ ieee80211_rx_result res;
__skb_queue_head_init(&reorder_release);
@@ -4868,8 +4886,8 @@ static void ieee80211_rx_8023(struct ieee80211_rx_data *rx,
* frame, so count MSDUs.
*/
u64_stats_update_begin(&stats->syncp);
- stats->msdu[rx->seqno_idx]++;
- stats->bytes += orig_len;
+ u64_stats_inc(&stats->msdu[rx->seqno_idx]);
+ u64_stats_add(&stats->bytes, orig_len);
u64_stats_update_end(&stats->syncp);
if (fast_rx->internal_forward) {
@@ -5508,6 +5526,32 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta,
status->rate_idx, status->nss, status->eht.gi))
goto drop;
break;
+ case RX_ENC_UHR:
+ if (WARN_ONCE(!(status->rate_idx <= 15 ||
+ status->rate_idx == 17 ||
+ status->rate_idx == 19 ||
+ status->rate_idx == 20 ||
+ status->rate_idx == 23) ||
+ !status->nss ||
+ status->nss > 8 ||
+ status->uhr.gi > NL80211_RATE_INFO_EHT_GI_3_2,
+ "Rate marked as a UHR rate but data is invalid: MCS:%d, NSS:%d, GI:%d\n",
+ status->rate_idx, status->nss, status->uhr.gi))
+ goto drop;
+ if (WARN_ONCE(status->uhr.elr &&
+ (status->nss != 1 || status->rate_idx > 1 ||
+ status->uhr.gi != NL80211_RATE_INFO_EHT_GI_1_6 ||
+ status->bw != RATE_INFO_BW_20 || status->uhr.im),
+ "bad UHR ELR MCS MCS:%d, NSS:%d, GI:%d, BW:%d, IM:%d\n",
+ status->rate_idx, status->nss, status->uhr.gi,
+ status->bw, status->uhr.im))
+ goto drop;
+ if (WARN_ONCE(status->uhr.im &&
+ (status->nss != 1 || status->rate_idx == 15),
+ "bad UHR IM MCS MCS:%d, NSS:%d\n",
+ status->rate_idx, status->nss))
+ goto drop;
+ break;
default:
WARN_ON_ONCE(1);
fallthrough;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 1a995bc301b1..a79ebeb43585 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -4,7 +4,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2015 - 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2025 Intel Corporation
+ * Copyright (C) 2018-2026 Intel Corporation
*/
#include <linux/module.h>
@@ -360,7 +360,9 @@ static void sta_accumulate_removed_link_stats(struct sta_info *sta, int link_id)
struct link_sta_info *link_sta = wiphy_dereference(sta->local->hw.wiphy,
sta->link[link_id]);
struct ieee80211_link_data *link;
+ unsigned int start;
int ac, tid;
+ u64 value;
u32 thr;
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
@@ -369,8 +371,13 @@ static void sta_accumulate_removed_link_stats(struct sta_info *sta, int link_id)
sta->rem_link_stats.tx_bytes += link_sta->tx_stats.bytes[ac];
}
+ do {
+ start = u64_stats_fetch_begin(&link_sta->rx_stats.syncp);
+ value = u64_stats_read(&link_sta->rx_stats.bytes);
+ } while (u64_stats_fetch_retry(&link_sta->rx_stats.syncp, start));
+
sta->rem_link_stats.rx_packets += link_sta->rx_stats.packets;
- sta->rem_link_stats.rx_bytes += link_sta->rx_stats.bytes;
+ sta->rem_link_stats.rx_bytes += value;
sta->rem_link_stats.tx_retries += link_sta->status_stats.retry_count;
sta->rem_link_stats.tx_failed += link_sta->status_stats.retry_failed;
sta->rem_link_stats.rx_dropped_misc += link_sta->rx_stats.dropped;
@@ -380,8 +387,13 @@ static void sta_accumulate_removed_link_stats(struct sta_info *sta, int link_id)
sta->rem_link_stats.expected_throughput += thr;
for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) {
- sta->rem_link_stats.pertid_stats.rx_msdu +=
- link_sta->rx_stats.msdu[tid];
+ do {
+ start = u64_stats_fetch_begin(&link_sta->rx_stats.syncp);
+ value = u64_stats_read(&link_sta->rx_stats.msdu[tid]);
+ } while (u64_stats_fetch_retry(&link_sta->rx_stats.syncp,
+ start));
+
+ sta->rem_link_stats.pertid_stats.rx_msdu += value;
sta->rem_link_stats.pertid_stats.tx_msdu +=
link_sta->tx_stats.msdu[tid];
sta->rem_link_stats.pertid_stats.tx_msdu_retries +=
@@ -2555,6 +2567,17 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate,
rinfo->eht_gi = STA_STATS_GET(EHT_GI, rate);
rinfo->eht_ru_alloc = STA_STATS_GET(EHT_RU, rate);
break;
+ case STA_STATS_RATE_TYPE_UHR:
+ rinfo->flags = RATE_INFO_FLAGS_UHR_MCS;
+ rinfo->mcs = STA_STATS_GET(UHR_MCS, rate);
+ rinfo->nss = STA_STATS_GET(UHR_NSS, rate);
+ rinfo->eht_gi = STA_STATS_GET(UHR_GI, rate);
+ rinfo->eht_ru_alloc = STA_STATS_GET(UHR_RU, rate);
+ if (STA_STATS_GET(UHR_ELR, rate))
+ rinfo->flags |= RATE_INFO_FLAGS_UHR_ELR_MCS;
+ if (STA_STATS_GET(UHR_IM, rate))
+ rinfo->flags |= RATE_INFO_FLAGS_UHR_IM;
+ break;
}
}
@@ -2578,7 +2601,7 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats,
do {
start = u64_stats_fetch_begin(&rxstats->syncp);
- value = rxstats->msdu[tid];
+ value = u64_stats_read(&rxstats->msdu[tid]);
} while (u64_stats_fetch_retry(&rxstats->syncp, start));
return value;
@@ -2654,7 +2677,7 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats)
do {
start = u64_stats_fetch_begin(&rxstats->syncp);
- value = rxstats->bytes;
+ value = u64_stats_read(&rxstats->bytes);
} while (u64_stats_fetch_retry(&rxstats->syncp, start));
return value;
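The conversions in this file move the rx byte/MSDU counters to the u64_stats API, which keeps 64-bit counters tear-free on 32-bit machines (and compiles down to plain loads and stores on 64-bit). The two halves of the pattern, as used in this patch:

/* writer side (rx path) */
u64_stats_update_begin(&stats->syncp);
u64_stats_add(&stats->bytes, len);
u64_stats_inc(&stats->msdu[tid]);
u64_stats_update_end(&stats->syncp);

/* reader side: retry until a consistent snapshot is observed */
do {
        start = u64_stats_fetch_begin(&stats->syncp);
        value = u64_stats_read(&stats->bytes);
} while (u64_stats_fetch_retry(&stats->syncp, start));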
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 5288d5286651..2875ef7d7946 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -3,7 +3,7 @@
* Copyright 2002-2005, Devicescape Software, Inc.
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright(c) 2015-2017 Intel Deutschland GmbH
- * Copyright(c) 2020-2024 Intel Corporation
+ * Copyright(c) 2020-2026 Intel Corporation
*/
#ifndef STA_INFO_H
@@ -434,8 +434,8 @@ struct ieee80211_sta_rx_stats {
s8 chain_signal_last[IEEE80211_MAX_CHAINS];
u32 last_rate;
struct u64_stats_sync syncp;
- u64 bytes;
- u64 msdu[IEEE80211_NUM_TIDS + 1];
+ u64_stats_t bytes;
+ u64_stats_t msdu[IEEE80211_NUM_TIDS + 1];
};
/*
@@ -1009,25 +1009,49 @@ enum sta_stats_type {
STA_STATS_RATE_TYPE_HE,
STA_STATS_RATE_TYPE_S1G,
STA_STATS_RATE_TYPE_EHT,
+ STA_STATS_RATE_TYPE_UHR,
};
-#define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0)
-#define STA_STATS_FIELD_LEGACY_IDX GENMASK( 3, 0)
-#define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4)
-#define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0)
-#define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4)
-#define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0)
-#define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4)
-#define STA_STATS_FIELD_EHT_MCS GENMASK( 3, 0)
-#define STA_STATS_FIELD_EHT_NSS GENMASK( 7, 4)
-#define STA_STATS_FIELD_BW GENMASK(12, 8)
-#define STA_STATS_FIELD_SGI GENMASK(13, 13)
-#define STA_STATS_FIELD_TYPE GENMASK(16, 14)
-#define STA_STATS_FIELD_HE_RU GENMASK(19, 17)
-#define STA_STATS_FIELD_HE_GI GENMASK(21, 20)
-#define STA_STATS_FIELD_HE_DCM GENMASK(22, 22)
-#define STA_STATS_FIELD_EHT_RU GENMASK(20, 17)
-#define STA_STATS_FIELD_EHT_GI GENMASK(22, 21)
+/* common */
+#define STA_STATS_FIELD_TYPE 0x0000000F
+#define STA_STATS_FIELD_BW 0x000001F0
+#define STA_STATS_FIELD_RESERVED 0x00000E00
+
+/* STA_STATS_RATE_TYPE_LEGACY */
+#define STA_STATS_FIELD_LEGACY_IDX 0x0000F000
+#define STA_STATS_FIELD_LEGACY_BAND 0x000F0000
+
+/* STA_STATS_RATE_TYPE_HT */
+#define STA_STATS_FIELD_HT_MCS 0x000FF000
+
+/* STA_STATS_RATE_TYPE_VHT */
+#define STA_STATS_FIELD_VHT_MCS 0x0000F000
+#define STA_STATS_FIELD_VHT_NSS 0x000F0000
+
+/* HT & VHT */
+#define STA_STATS_FIELD_SGI 0x00100000
+
+/* STA_STATS_RATE_TYPE_HE */
+#define STA_STATS_FIELD_HE_MCS 0x0000F000
+#define STA_STATS_FIELD_HE_NSS 0x000F0000
+#define STA_STATS_FIELD_HE_RU 0x00700000
+#define STA_STATS_FIELD_HE_GI 0x01800000
+#define STA_STATS_FIELD_HE_DCM 0x02000000
+
+/* STA_STATS_RATE_TYPE_EHT */
+#define STA_STATS_FIELD_EHT_MCS 0x0000F000
+#define STA_STATS_FIELD_EHT_NSS 0x000F0000
+#define STA_STATS_FIELD_EHT_RU 0x00F00000
+#define STA_STATS_FIELD_EHT_GI 0x03000000
+
+/* STA_STATS_RATE_TYPE_UHR */
+#define STA_STATS_FIELD_UHR_MCS 0x0001F000
+#define STA_STATS_FIELD_UHR_NSS 0x001E0000
+#define STA_STATS_FIELD_UHR_RU 0x01E00000
+#define STA_STATS_FIELD_UHR_GI 0x06000000
+#define STA_STATS_FIELD_UHR_ELR 0x08000000
+#define STA_STATS_FIELD_UHR_IM 0x10000000
+
#define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v)
#define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v)
@@ -1040,8 +1064,15 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s)
r = STA_STATS_FIELD(BW, s->bw);
- if (s->enc_flags & RX_ENC_FLAG_SHORT_GI)
- r |= STA_STATS_FIELD(SGI, 1);
+ switch (s->encoding) {
+ case RX_ENC_HT:
+ case RX_ENC_VHT:
+ if (s->enc_flags & RX_ENC_FLAG_SHORT_GI)
+ r |= STA_STATS_FIELD(SGI, 1);
+ break;
+ default:
+ break;
+ }
switch (s->encoding) {
case RX_ENC_VHT:
@@ -1073,6 +1104,15 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s)
r |= STA_STATS_FIELD(EHT_GI, s->eht.gi);
r |= STA_STATS_FIELD(EHT_RU, s->eht.ru);
break;
+ case RX_ENC_UHR:
+ r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_UHR);
+ r |= STA_STATS_FIELD(UHR_NSS, s->nss);
+ r |= STA_STATS_FIELD(UHR_MCS, s->rate_idx);
+ r |= STA_STATS_FIELD(UHR_GI, s->uhr.gi);
+ r |= STA_STATS_FIELD(UHR_RU, s->uhr.ru);
+ r |= STA_STATS_FIELD(UHR_ELR, s->uhr.elr);
+ r |= STA_STATS_FIELD(UHR_IM, s->uhr.im);
+ break;
default:
WARN_ON(1);
return STA_STATS_RATE_INVALID;
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 0bfbce157486..c04d4547e8f4 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -3353,6 +3353,38 @@ TRACE_EVENT(drv_prep_add_interface,
)
);
+TRACE_EVENT(drv_set_eml_op_mode,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_sta *sta,
+ unsigned int link_id,
+ u8 control, u16 link_bitmap),
+
+ TP_ARGS(local, sdata, sta, link_id, control, link_bitmap),
+
+ TP_STRUCT__entry(LOCAL_ENTRY
+ VIF_ENTRY
+ STA_ENTRY
+ __field(u32, link_id)
+ __field(u8, control)
+ __field(u16, link_bitmap)),
+
+ TP_fast_assign(LOCAL_ASSIGN;
+ VIF_ASSIGN;
+ STA_NAMED_ASSIGN(sta);
+ __entry->link_id = link_id;
+ __entry->control = control;
+ __entry->link_bitmap = link_bitmap;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT
+ " (link:%d control:%02x link_bitmap:%04x)",
+ LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->link_id,
+ __entry->control, __entry->link_bitmap
+ )
+);
+
#endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 1b55e8340413..007f5a368d41 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -640,7 +640,9 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx)
if (!ieee80211_is_data_present(hdr->frame_control) &&
!ieee80211_use_mfp(hdr->frame_control, tx->sta,
tx->skb) &&
- !ieee80211_is_group_privacy_action(tx->skb))
+ !ieee80211_is_group_privacy_action(tx->skb) &&
+ !ieee80211_require_encrypted_assoc(hdr->frame_control,
+ tx->sta))
tx->key = NULL;
else
skip_hw = (tx->key->conf.flags &
diff --git a/net/mac80211/uhr.c b/net/mac80211/uhr.c
new file mode 100644
index 000000000000..2d8f5e5480ef
--- /dev/null
+++ b/net/mac80211/uhr.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * UHR handling
+ *
+ * Copyright(c) 2025-2026 Intel Corporation
+ */
+
+#include "ieee80211_i.h"
+
+void
+ieee80211_uhr_cap_ie_to_sta_uhr_cap(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_supported_band *sband,
+ const struct ieee80211_uhr_cap *uhr_cap,
+ u8 uhr_cap_len,
+ struct link_sta_info *link_sta)
+{
+ struct ieee80211_sta_uhr_cap *sta_uhr_cap = &link_sta->pub->uhr_cap;
+ bool from_ap;
+
+ memset(sta_uhr_cap, 0, sizeof(*sta_uhr_cap));
+
+ if (!ieee80211_get_uhr_iftype_cap_vif(sband, &sdata->vif))
+ return;
+
+ sta_uhr_cap->has_uhr = true;
+
+ sta_uhr_cap->mac = uhr_cap->mac;
+ from_ap = sdata->vif.type == NL80211_IFTYPE_STATION;
+ sta_uhr_cap->phy = *ieee80211_uhr_phy_cap(uhr_cap, from_ap);
+}
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 0c46009a3d63..a5e09c0fa6b3 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -6,7 +6,7 @@
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright (C) 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2025 Intel Corporation
+ * Copyright (C) 2018-2026 Intel Corporation
*
* utilities for mac80211
*/
@@ -101,7 +101,6 @@ u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len,
return NULL;
}
-EXPORT_SYMBOL(ieee80211_get_bssid);
void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx)
{
@@ -800,20 +799,56 @@ void ieee80211_iterate_active_interfaces_atomic(
}
EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic);
-void ieee80211_iterate_active_interfaces_mtx(
- struct ieee80211_hw *hw, u32 iter_flags,
- void (*iterator)(void *data, u8 *mac,
- struct ieee80211_vif *vif),
- void *data)
+struct ieee80211_vif *
+__ieee80211_iterate_interfaces(struct ieee80211_hw *hw,
+ struct ieee80211_vif *prev,
+ u32 iter_flags)
{
+ bool active_only = iter_flags & IEEE80211_IFACE_ITER_ACTIVE;
+ struct ieee80211_sub_if_data *sdata = NULL, *monitor;
struct ieee80211_local *local = hw_to_local(hw);
lockdep_assert_wiphy(hw->wiphy);
- __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE,
- iterator, data);
+ if (prev)
+ sdata = vif_to_sdata(prev);
+
+ monitor = rcu_dereference_check(local->monitor_sdata,
+ lockdep_is_held(&hw->wiphy->mtx));
+ if (monitor && monitor == sdata)
+ return NULL;
+
+ sdata = list_prepare_entry(sdata, &local->interfaces, list);
+ list_for_each_entry_continue(sdata, &local->interfaces, list) {
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_MONITOR:
+ if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) &&
+ !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR))
+ continue;
+ break;
+ case NL80211_IFTYPE_AP_VLAN:
+ continue;
+ default:
+ break;
+ }
+ if (!(iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL) &&
+ active_only && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER))
+ continue;
+ if ((iter_flags & IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER) &&
+ !(sdata->flags & IEEE80211_SDATA_IN_DRIVER))
+ continue;
+ if (ieee80211_sdata_running(sdata) || !active_only)
+ return &sdata->vif;
+ }
+
+ if (monitor && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) &&
+ (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only ||
+ monitor->flags & IEEE80211_SDATA_IN_DRIVER))
+ return &monitor->vif;
+
+ return NULL;
}
-EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_mtx);
+EXPORT_SYMBOL_GPL(__ieee80211_iterate_interfaces);
static void __iterate_stations(struct ieee80211_local *local,
void (*iterator)(void *data,
@@ -844,18 +879,29 @@ void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw,
}
EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_atomic);
-void ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw,
- void (*iterator)(void *data,
- struct ieee80211_sta *sta),
- void *data)
+struct ieee80211_sta *
+__ieee80211_iterate_stations(struct ieee80211_hw *hw,
+ struct ieee80211_sta *prev)
{
struct ieee80211_local *local = hw_to_local(hw);
+ struct sta_info *sta = NULL;
lockdep_assert_wiphy(local->hw.wiphy);
- __iterate_stations(local, iterator, data);
+ if (prev)
+ sta = container_of(prev, struct sta_info, sta);
+
+ sta = list_prepare_entry(sta, &local->sta_list, list);
+ list_for_each_entry_continue(sta, &local->sta_list, list) {
+ if (!sta->uploaded)
+ continue;
+
+ return &sta->sta;
+ }
+
+ return NULL;
}
-EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_mtx);
+EXPORT_SYMBOL_GPL(__ieee80211_iterate_stations);
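
Both hunks above replace callback-style iteration with a resumable cursor: each call takes the previously returned element and yields the next match, so drivers can drive the loop themselves under the wiphy lock. A small userspace sketch of the pattern, with list and field names invented:

#include <stdio.h>
#include <stddef.h>

struct node {
	struct node *next;
	int uploaded;	/* only entries known to the driver are returned */
	int id;
};

/* Resume from @prev (NULL starts at the head) and return the next
 * element that passes the filter, or NULL when the list is done. */
static struct node *iter_next(struct node *head, struct node *prev)
{
	struct node *n = prev ? prev->next : head;

	for (; n; n = n->next)
		if (n->uploaded)
			return n;
	return NULL;
}

int main(void)
{
	struct node c = { NULL, 1, 3 };
	struct node b = { &c, 0, 2 };
	struct node a = { &b, 1, 1 };
	struct node *it = NULL;

	while ((it = iter_next(&a, it)))
		printf("sta %d\n", it->id);	/* prints 1, then 3 */
	return 0;
}
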
struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev)
{
@@ -1096,14 +1142,17 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
.ml.control = cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_BASIC),
.basic.len = sizeof(mle.basic),
};
+ bool add_mle;
int err;
- memcpy(mle.basic.mld_mac_addr, sdata->vif.addr, ETH_ALEN);
+ add_mle = (multi_link &&
+ !cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK,
+ extra, extra_len));
/* 24 + 6 = header + auth_algo + auth_transaction + status_code */
skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN +
24 + 6 + extra_len + IEEE80211_WEP_ICV_LEN +
- multi_link * sizeof(mle));
+ add_mle * sizeof(mle));
if (!skb)
return;
@@ -1120,8 +1169,11 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata,
mgmt->u.auth.status_code = cpu_to_le16(status);
if (extra)
skb_put_data(skb, extra, extra_len);
- if (multi_link)
+
+ if (add_mle) {
+ memcpy(mle.basic.mld_mac_addr, sdata->vif.addr, ETH_ALEN);
skb_put_data(skb, &mle, sizeof(mle));
+ }
if (auth_alg == WLAN_AUTH_SHARED_KEY && transaction == 3) {
mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED);
@@ -1369,6 +1421,13 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb,
if (err)
return err;
+ if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band),
+ IEEE80211_CHAN_NO_UHR)) {
+ err = ieee80211_put_uhr_cap(skb, sdata, sband);
+ if (err)
+ return err;
+ }
+
/*
* If adding more here, adjust code in main.c
* that calculates local->scan_ies_len.
@@ -3545,7 +3604,7 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local,
if (ctx && &ctx->conf != chanctx_conf)
continue;
- wiphy_delayed_work_cancel(local->hw.wiphy,
+ wiphy_hrtimer_work_cancel(local->hw.wiphy,
&link->dfs_cac_timer_work);
if (!sdata->wdev.links[link_id].cac_started)
@@ -4475,6 +4534,32 @@ int ieee80211_put_eht_cap(struct sk_buff *skb,
return 0;
}
+int ieee80211_put_uhr_cap(struct sk_buff *skb,
+ struct ieee80211_sub_if_data *sdata,
+ const struct ieee80211_supported_band *sband)
+{
+ const struct ieee80211_sta_uhr_cap *uhr_cap =
+ ieee80211_get_uhr_iftype_cap_vif(sband, &sdata->vif);
+ int len;
+
+ if (!uhr_cap)
+ return 0;
+
+ len = 2 + 1 + sizeof(struct ieee80211_uhr_cap) +
+ sizeof(struct ieee80211_uhr_cap_phy);
+
+ if (skb_tailroom(skb) < len)
+ return -ENOBUFS;
+
+ skb_put_u8(skb, WLAN_EID_EXTENSION);
+ skb_put_u8(skb, len - 2);
+ skb_put_u8(skb, WLAN_EID_EXT_UHR_CAPA);
+ skb_put_data(skb, &uhr_cap->mac, sizeof(uhr_cap->mac));
+ skb_put_data(skb, &uhr_cap->phy, sizeof(uhr_cap->phy));
+
+ return 0;
+}
+
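ieee80211_put_uhr_cap() emits a standard extension element: a two-byte header (EID 255, i.e. Element ID Extension, plus length), one extension tag byte, then the MAC and PHY capability payloads, failing with -ENOBUFS when tailroom is short. A hedged userspace sketch of that layout follows; the tag value 0x6e and the payload bytes are placeholders, not the real WLAN_EID_EXT_UHR_CAPA encoding.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define WLAN_EID_EXTENSION 255

static size_t put_ext_elem(uint8_t *buf, size_t room, uint8_t ext_tag,
			   const void *payload, uint8_t plen)
{
	size_t len = 2 + 1 + plen;	/* header + ext tag + payload */

	if (room < len)
		return 0;	/* caller treats this as -ENOBUFS */
	buf[0] = WLAN_EID_EXTENSION;
	buf[1] = 1 + plen;	/* length covers ext tag + payload */
	buf[2] = ext_tag;
	memcpy(&buf[3], payload, plen);
	return len;
}

int main(void)
{
	uint8_t caps[6] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06 };
	uint8_t frame[32];
	size_t n = put_ext_elem(frame, sizeof(frame), 0x6e, caps, sizeof(caps));

	printf("wrote %zu bytes, len byte=%u\n", n, frame[1]);
	return 0;
}
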
const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode)
{
static const char * const modes[] = {
@@ -4484,6 +4569,7 @@ const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode)
[IEEE80211_CONN_MODE_VHT] = "VHT",
[IEEE80211_CONN_MODE_HE] = "HE",
[IEEE80211_CONN_MODE_EHT] = "EHT",
+ [IEEE80211_CONN_MODE_UHR] = "UHR",
};
if (WARN_ON(mode >= ARRAY_SIZE(modes)))
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index 4a858112e4ef..fdf98c21d32c 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -527,7 +527,8 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx,
hdrlen = ieee80211_hdrlen(hdr->frame_control);
if (!ieee80211_is_data(hdr->frame_control) &&
- !ieee80211_is_robust_mgmt_frame(skb))
+ !ieee80211_is_robust_mgmt_frame(skb) &&
+ !ieee80211_require_encrypted_assoc(hdr->frame_control, rx->sta))
return RX_CONTINUE;
if (status->flag & RX_FLAG_DECRYPTED) {
@@ -723,7 +724,8 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx)
hdrlen = ieee80211_hdrlen(hdr->frame_control);
if (!ieee80211_is_data(hdr->frame_control) &&
- !ieee80211_is_robust_mgmt_frame(skb))
+ !ieee80211_is_robust_mgmt_frame(skb) &&
+ !ieee80211_require_encrypted_assoc(hdr->frame_control, rx->sta))
return RX_CONTINUE;
if (status->flag & RX_FLAG_DECRYPTED) {
diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c
index b26675054b0d..b5316a6c7d1b 100644
--- a/net/mptcp/pm_kernel.c
+++ b/net/mptcp/pm_kernel.c
@@ -1044,26 +1044,23 @@ out_free:
return ret;
}
-static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
+static void mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr,
bool force)
{
struct mptcp_rm_list list = { .nr = 0 };
- bool ret;
+ bool announced;
list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr);
- ret = mptcp_remove_anno_list_by_saddr(msk, addr);
- if (ret || force) {
+ announced = mptcp_remove_anno_list_by_saddr(msk, addr);
+ if (announced || force) {
spin_lock_bh(&msk->pm.lock);
- if (ret) {
- __set_bit(addr->id, msk->pm.id_avail_bitmap);
+ if (announced)
msk->pm.add_addr_signaled--;
- }
mptcp_pm_remove_addr(msk, &list);
spin_unlock_bh(&msk->pm.lock);
}
- return ret;
}
static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id)
@@ -1097,17 +1094,15 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net,
!(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT));
list.ids[0] = mptcp_endp_get_local_id(msk, addr);
- if (remove_subflow) {
- spin_lock_bh(&msk->pm.lock);
- mptcp_pm_rm_subflow(msk, &list);
- spin_unlock_bh(&msk->pm.lock);
- }
- if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
- spin_lock_bh(&msk->pm.lock);
+ spin_lock_bh(&msk->pm.lock);
+ if (remove_subflow)
+ mptcp_pm_rm_subflow(msk, &list);
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)
__mark_subflow_endp_available(msk, list.ids[0]);
- spin_unlock_bh(&msk->pm.lock);
- }
+ else /* mark endp ID as available, e.g. Signal or MPC endp */
+ __set_bit(addr->id, msk->pm.id_avail_bitmap);
+ spin_unlock_bh(&msk->pm.lock);
if (msk->mpc_endpoint_id == entry->addr.id)
msk->mpc_endpoint_id = 0;
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 8d3233667418..cf1852b99963 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -28,6 +28,8 @@
#include "protocol.h"
#include "mib.h"
+static unsigned int mptcp_inq_hint(const struct sock *sk);
+
#define CREATE_TRACE_POINTS
#include <trace/events/mptcp.h>
@@ -224,9 +226,6 @@ static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval)
do_div(grow, oldval);
rcvwin += grow << 1;
- if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
- rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
-
cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
@@ -350,9 +349,6 @@ merge_right:
end:
skb_condense(skb);
skb_set_owner_r(skb, sk);
- /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
- if (sk->sk_socket)
- mptcp_rcvbuf_grow(sk, msk->rcvq_space.space);
}
static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset,
@@ -1164,8 +1160,9 @@ struct mptcp_sendmsg_info {
bool data_lock_held;
};
-static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk,
- u64 data_seq, int avail_size)
+static size_t mptcp_check_allowed_size(const struct mptcp_sock *msk,
+ struct sock *ssk, u64 data_seq,
+ size_t avail_size)
{
u64 window_end = mptcp_wnd_end(msk);
u64 mptcp_snd_wnd;
@@ -1174,7 +1171,7 @@ static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *s
return avail_size;
mptcp_snd_wnd = window_end - data_seq;
- avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size);
+ avail_size = min(mptcp_snd_wnd, avail_size);
if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) {
tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd);
@@ -1518,7 +1515,7 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
if (!ssk || !sk_stream_memory_free(ssk))
return NULL;
- burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
+ burst = min(MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
wmem = READ_ONCE(ssk->sk_wmem_queued);
if (!burst)
return ssk;
@@ -1995,6 +1992,17 @@ do_error:
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied);
+static void mptcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ /* avoid the indirect call, we know the destructor is sock_rfree */
+ skb->destructor = NULL;
+ skb->sk = NULL;
+ atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(sk, skb->truesize);
+ __skb_unlink(skb, &sk->sk_receive_queue);
+ skb_attempt_defer_free(skb);
+}
+
static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
size_t len, int flags, int copied_total,
struct scm_timestamping_internal *tss,
@@ -2049,13 +2057,7 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
break;
}
- /* avoid the indirect call, we know the destructor is sock_rfree */
- skb->destructor = NULL;
- skb->sk = NULL;
- atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
- sk_mem_uncharge(sk, skb->truesize);
- __skb_unlink(skb, &sk->sk_receive_queue);
- skb_attempt_defer_free(skb);
+ mptcp_eat_recv_skb(sk, skb);
}
if (copied >= len)
@@ -2066,6 +2068,21 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
return copied;
}
+static void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
+{
+ const struct tcp_sock *tp = tcp_sk(ssk);
+
+ msk->rcvspace_init = 1;
+ msk->rcvq_space.copied = 0;
+ msk->rcvq_space.rtt_us = 0;
+
+ /* initial rcv_space offering made to peer */
+ msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
+ TCP_INIT_CWND * tp->advmss);
+ if (msk->rcvq_space.space == 0)
+ msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
+}
+
/* receive buffer autotuning. See tcp_rcv_space_adjust for more information.
*
* Only difference: Use highest rtt estimate of the subflows in use.
@@ -2088,8 +2105,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
msk->rcvq_space.copied += copied;
- mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC);
- time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time);
+ mstamp = mptcp_stamp();
+ time = tcp_stamp_us_delta(mstamp, READ_ONCE(msk->rcvq_space.time));
rtt_us = msk->rcvq_space.rtt_us;
if (rtt_us && time < (rtt_us >> 3))
@@ -2119,6 +2136,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure;
+ trace_mptcp_rcvbuf_grow(sk, time);
if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
/* Make subflows follow along. If we do not do this, we
* get drops at subflow level if skbs can't be moved to
@@ -3040,6 +3058,7 @@ static int mptcp_init_sock(struct sock *sk)
sk_sockets_allocated_inc(sk);
sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]);
sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]);
+ sk->sk_write_space = sk_stream_write_space;
return 0;
}
@@ -3549,6 +3568,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
__mptcp_propagate_sndbuf(nsk, ssk);
mptcp_rcv_space_init(msk, ssk);
+ msk->rcvq_space.time = mptcp_stamp();
if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK)
__mptcp_subflow_fully_established(msk, subflow, mp_opt);
@@ -3558,23 +3578,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
return nsk;
}
-void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk)
-{
- const struct tcp_sock *tp = tcp_sk(ssk);
-
- msk->rcvspace_init = 1;
- msk->rcvq_space.copied = 0;
- msk->rcvq_space.rtt_us = 0;
-
- msk->rcvq_space.time = tp->tcp_mstamp;
-
- /* initial rcv_space offering made to peer */
- msk->rcvq_space.space = min_t(u32, tp->rcv_wnd,
- TCP_INIT_CWND * tp->advmss);
- if (msk->rcvq_space.space == 0)
- msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT;
-}
-
static void mptcp_destroy(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -3763,6 +3766,7 @@ void mptcp_finish_connect(struct sock *ssk)
* accessing the field below
*/
WRITE_ONCE(msk->local_key, subflow->local_key);
+ WRITE_ONCE(msk->rcvq_space.time, mptcp_stamp());
mptcp_pm_new_connection(msk, ssk, 0);
}
@@ -4312,6 +4316,201 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
return mask;
}
+static struct sk_buff *mptcp_recv_skb(struct sock *sk, u32 *off)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *skb;
+ u32 offset;
+
+ if (!list_empty(&msk->backlog_list))
+ mptcp_move_skbs(sk);
+
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ offset = MPTCP_SKB_CB(skb)->offset;
+ if (offset < skb->len) {
+ *off = offset;
+ return skb;
+ }
+ mptcp_eat_recv_skb(sk, skb);
+ }
+ return NULL;
+}
+
+/*
+ * Note: the caller must hold the socket lock.
+ */
+static int __mptcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor, bool noack)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *skb;
+ int copied = 0;
+ u32 offset;
+
+ msk_owned_by_me(msk);
+
+ if (sk->sk_state == TCP_LISTEN)
+ return -ENOTCONN;
+ while ((skb = mptcp_recv_skb(sk, &offset)) != NULL) {
+ u32 data_len = skb->len - offset;
+ int count;
+ u32 size;
+
+ size = min_t(size_t, data_len, INT_MAX);
+ count = recv_actor(desc, skb, offset, size);
+ if (count <= 0) {
+ if (!copied)
+ copied = count;
+ break;
+ }
+
+ copied += count;
+
+ msk->bytes_consumed += count;
+ if (count < data_len) {
+ MPTCP_SKB_CB(skb)->offset += count;
+ MPTCP_SKB_CB(skb)->map_seq += count;
+ break;
+ }
+
+ mptcp_eat_recv_skb(sk, skb);
+ }
+
+ if (noack)
+ goto out;
+
+ mptcp_rcv_space_adjust(msk, copied);
+
+ if (copied > 0) {
+ mptcp_recv_skb(sk, &offset);
+ mptcp_cleanup_rbuf(msk, copied);
+ }
+out:
+ return copied;
+}
+
+static int mptcp_read_sock(struct sock *sk, read_descriptor_t *desc,
+ sk_read_actor_t recv_actor)
+{
+ return __mptcp_read_sock(sk, desc, recv_actor, false);
+}
+
+static int __mptcp_splice_read(struct sock *sk, struct tcp_splice_state *tss)
+{
+ /* Store TCP splice context information in read_descriptor_t. */
+ read_descriptor_t rd_desc = {
+ .arg.data = tss,
+ .count = tss->len,
+ };
+
+ return mptcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
+}
+
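The helpers above follow the classic read_sock()/recv_actor contract: the core walks queued data and hands each region to an actor, which reports how many bytes it consumed; a short return leaves the remainder queued with the offset advanced for the next call. A self-contained sketch of that contract, with all names invented:

#include <stdio.h>
#include <string.h>

struct desc { char *dst; size_t room; };

/* The actor consumes up to its available room and reports the count. */
static int actor(struct desc *d, const char *buf, size_t len)
{
	size_t n = len < d->room ? len : d->room;

	memcpy(d->dst, buf, n);
	d->dst += n;
	d->room -= n;
	return (int)n;	/* may be < len: stop and keep leftover queued */
}

static int read_queued(const char *q, size_t qlen, size_t *off,
		       struct desc *d)
{
	int copied = 0;

	while (*off < qlen && d->room) {
		int n = actor(d, q + *off, qlen - *off);

		if (n <= 0)
			break;
		*off += n;	/* like MPTCP_SKB_CB(skb)->offset += count */
		copied += n;
	}
	return copied;
}

int main(void)
{
	char out[8];
	struct desc d = { out, sizeof(out) - 1 };
	size_t off = 0;
	int n = read_queued("hello world", 11, &off, &d);

	out[n] = '\0';
	printf("copied=%d off=%zu '%s'\n", n, off, out);	/* 7 bytes */
	return 0;
}
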
+/**
+ * mptcp_splice_read - splice data from MPTCP socket to a pipe
+ * @sock: socket to splice from
+ * @ppos: position (not valid)
+ * @pipe: pipe to splice to
+ * @len: number of bytes to splice
+ * @flags: splice modifier flags
+ *
+ * Description:
+ * Reads pages from the given socket and fills them into a pipe.
+ *
+ * Return:
+ * Number of bytes that have been spliced.
+ *
+ **/
+static ssize_t mptcp_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct tcp_splice_state tss = {
+ .pipe = pipe,
+ .len = len,
+ .flags = flags,
+ };
+ struct sock *sk = sock->sk;
+ ssize_t spliced = 0;
+ int ret = 0;
+ long timeo;
+
+ /*
+ * We can't seek on a socket input
+ */
+ if (unlikely(*ppos))
+ return -ESPIPE;
+
+ lock_sock(sk);
+
+ mptcp_rps_record_subflows(mptcp_sk(sk));
+
+ timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
+ while (tss.len) {
+ ret = __mptcp_splice_read(sk, &tss);
+ if (ret < 0) {
+ break;
+ } else if (!ret) {
+ if (spliced)
+ break;
+ if (sock_flag(sk, SOCK_DONE))
+ break;
+ if (sk->sk_err) {
+ ret = sock_error(sk);
+ break;
+ }
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
+ break;
+ if (sk->sk_state == TCP_CLOSE) {
+				/*
+				 * This occurs when the user tries to read
+				 * from a never-connected socket.
+				 */
+ ret = -ENOTCONN;
+ break;
+ }
+ if (!timeo) {
+ ret = -EAGAIN;
+ break;
+ }
+			/* if __mptcp_splice_read() got nothing while we have
+			 * an skb in the receive queue, we do not want to loop.
+			 * This might happen with URG data.
+			 */
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ break;
+ ret = sk_wait_data(sk, &timeo, NULL);
+ if (ret < 0)
+ break;
+ if (signal_pending(current)) {
+ ret = sock_intr_errno(timeo);
+ break;
+ }
+ continue;
+ }
+ tss.len -= ret;
+ spliced += ret;
+
+ if (!tss.len || !timeo)
+ break;
+ release_sock(sk);
+ lock_sock(sk);
+
+ if (sk->sk_err || sk->sk_state == TCP_CLOSE ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ signal_pending(current))
+ break;
+ }
+
+ release_sock(sk);
+
+ if (spliced)
+ return spliced;
+
+ return ret;
+}
+
static const struct proto_ops mptcp_stream_ops = {
.family = PF_INET,
.owner = THIS_MODULE,
@@ -4332,6 +4531,8 @@ static const struct proto_ops mptcp_stream_ops = {
.recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.set_rcvlowat = mptcp_set_rcvlowat,
+ .read_sock = mptcp_read_sock,
+ .splice_read = mptcp_splice_read,
};
static struct inet_protosw mptcp_protosw = {
@@ -4436,6 +4637,8 @@ static const struct proto_ops mptcp_v6_stream_ops = {
.compat_ioctl = inet6_compat_ioctl,
#endif
.set_rcvlowat = mptcp_set_rcvlowat,
+ .read_sock = mptcp_read_sock,
+ .splice_read = mptcp_splice_read,
};
static struct proto mptcp_v6_prot;
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 66e973500791..0bd1ee860316 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -246,14 +246,14 @@ struct mptcp_pm_data {
struct mptcp_pm_local {
struct mptcp_addr_info addr;
- u8 flags;
+ u32 flags;
int ifindex;
};
struct mptcp_pm_addr_entry {
struct list_head list;
struct mptcp_addr_info addr;
- u8 flags;
+ u32 flags;
int ifindex;
struct socket *lsk;
};
@@ -915,7 +915,11 @@ static inline bool mptcp_is_fully_established(struct sock *sk)
READ_ONCE(mptcp_sk(sk)->fully_established);
}
-void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk);
+static inline u64 mptcp_stamp(void)
+{
+ return div_u64(tcp_clock_ns(), NSEC_PER_USEC);
+}
+
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
bool mptcp_schedule_work(struct sock *sk);
@@ -971,7 +975,7 @@ static inline void mptcp_write_space(struct sock *sk)
/* pairs with memory barrier in mptcp_poll */
smp_mb();
if (mptcp_stream_memory_free(sk, 1))
- sk_stream_write_space(sk);
+ INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk);
}
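
INDIRECT_CALL_1() compares the function pointer against its most likely target and calls that target directly, so the common case (sk_write_space still pointing at sk_stream_write_space) avoids a retpoline-priced indirect branch. A userspace sketch of the idea, not the kernel macro:

#include <stdio.h>

static void write_space_a(int *sk) { (*sk)++; }
static void write_space_b(int *sk) { (*sk) += 2; }

/* If the pointer matches the expected target, emit a direct call. */
#define INDIRECT_CALL_1(fp, likely_fn, arg) \
	((fp) == (likely_fn) ? likely_fn(arg) : (fp)(arg))

int main(void)
{
	void (*cb)(int *) = write_space_a;
	int sk = 0;

	INDIRECT_CALL_1(cb, write_space_a, &sk);	/* direct call path */
	cb = write_space_b;
	INDIRECT_CALL_1(cb, write_space_a, &sk);	/* falls back to indirect */
	printf("%d\n", sk);	/* 3 */
	return 0;
}
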
static inline void __mptcp_sync_sndbuf(struct sock *sk)
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index 96d54cb2cd93..f66129f1e649 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -462,8 +462,6 @@ void __mptcp_sync_state(struct sock *sk, int state)
subflow = mptcp_subflow_ctx(ssk);
__mptcp_propagate_sndbuf(sk, ssk);
- if (!msk->rcvspace_init)
- mptcp_rcv_space_init(msk, ssk);
if (sk->sk_state == TCP_SYN_SENT) {
/* subflow->idsn is always available is TCP_SYN_SENT state,
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index 5bb924534387..f1a50f367add 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -103,7 +103,7 @@ static void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn)
* It creates a unique token to identify the new mptcp connection,
* a secret local key and the initial data sequence number (idsn).
*
- * Returns 0 on success.
+ * Return: 0 on success.
*/
int mptcp_token_new_request(struct request_sock *req)
{
@@ -146,7 +146,7 @@ int mptcp_token_new_request(struct request_sock *req)
* the computed token at a later time, this is needed to process
* join requests.
*
- * returns 0 on success.
+ * Return: 0 on success.
*/
int mptcp_token_new_connect(struct sock *ssk)
{
@@ -241,7 +241,7 @@ found:
* This function returns the mptcp connection structure with the given token.
* A reference count on the mptcp socket returned is taken.
*
- * returns NULL if no connection with the given token value exists.
+ * Return: NULL if no connection with the given token value exists.
*/
struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token)
{
@@ -288,11 +288,13 @@ EXPORT_SYMBOL_GPL(mptcp_token_get_sock);
* @s_slot: start slot number
* @s_num: start number inside the given lock
*
- * This function returns the first mptcp connection structure found inside the
- * token container starting from the specified position, or NULL.
+ * Description:
+ * On successful iteration, the iterator is moved to the next position and a
+ * reference to the returned socket is acquired.
*
- * On successful iteration, the iterator is moved to the next position and
- * a reference to the returned socket is acquired.
+ * Return:
+ * The first mptcp connection structure found inside the token container
+ * starting from the specified position, or NULL.
*/
struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot,
long *s_num)
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 64c697212578..f861d116cc33 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -949,7 +949,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af,
*next_protocol = IPPROTO_IPV6;
if (payload_len)
*payload_len =
- ntohs(old_ipv6h->payload_len) +
+ ipv6_payload_len(skb, old_ipv6h) +
sizeof(*old_ipv6h);
old_dsfield = ipv6_get_dsfield(old_ipv6h);
*ttl = old_ipv6h->hop_limit;
diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c
index 8487808c8761..14e62b3263cd 100644
--- a/net/netfilter/nf_conncount.c
+++ b/net/netfilter/nf_conncount.c
@@ -34,8 +34,9 @@
#define CONNCOUNT_SLOTS 256U
-#define CONNCOUNT_GC_MAX_NODES 8
-#define MAX_KEYLEN 5
+#define CONNCOUNT_GC_MAX_NODES 8
+#define CONNCOUNT_GC_MAX_COLLECT 64
+#define MAX_KEYLEN 5
/* we will save the tuples of all connections we care about */
struct nf_conncount_tuple {
@@ -178,16 +179,28 @@ static int __nf_conncount_add(struct net *net,
return -ENOENT;
if (ct && nf_ct_is_confirmed(ct)) {
- err = -EEXIST;
- goto out_put;
+		/* local connections are confirmed in postrouting, so the
+		 * confirmation might already have happened before hitting
+		 * connlimit
+		 */
+ if (skb->skb_iif != LOOPBACK_IFINDEX) {
+ err = -EEXIST;
+ goto out_put;
+ }
+
+		/* this is likely a local connection; skip the optimization
+		 * to avoid adding duplicates from a 'packet train'
+		 */
+ goto check_connections;
}
- if ((u32)jiffies == list->last_gc)
+ if ((u32)jiffies == list->last_gc &&
+ (list->count - list->last_gc_count) < CONNCOUNT_GC_MAX_COLLECT)
goto add_new_node;
+check_connections:
/* check the saved connections */
list_for_each_entry_safe(conn, conn_n, &list->head, node) {
- if (collect > CONNCOUNT_GC_MAX_NODES)
+ if (collect > CONNCOUNT_GC_MAX_COLLECT)
break;
found = find_or_evict(net, list, conn);
@@ -230,6 +243,7 @@ static int __nf_conncount_add(struct net *net,
nf_ct_put(found_ct);
}
list->last_gc = (u32)jiffies;
+ list->last_gc_count = list->count;
add_new_node:
if (WARN_ON_ONCE(list->count > INT_MAX)) {
@@ -277,6 +291,7 @@ void nf_conncount_list_init(struct nf_conncount_list *list)
spin_lock_init(&list->list_lock);
INIT_LIST_HEAD(&list->head);
list->count = 0;
+ list->last_gc_count = 0;
list->last_gc = (u32)jiffies;
}
EXPORT_SYMBOL_GPL(nf_conncount_list_init);
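
The new last_gc_count field throttles the list walk: within the same jiffy, the saved connections are only rescanned once the list has grown by at least CONNCOUNT_GC_MAX_COLLECT entries since the last collection. A hedged sketch of that condition, with jiffies simulated as a plain counter:

#include <stdbool.h>
#include <stdio.h>

#define GC_MAX_COLLECT 64	/* mirrors CONNCOUNT_GC_MAX_COLLECT */

struct list_state {
	unsigned int count;		/* current number of saved tuples */
	unsigned int last_gc_count;	/* count after the previous walk */
	unsigned int last_gc;		/* "jiffy" of the previous walk */
};

/* Walk the list when the jiffy changed, or when it grew enough. */
static bool gc_needed(const struct list_state *l, unsigned int jiffies)
{
	if (jiffies != l->last_gc)
		return true;
	return l->count - l->last_gc_count >= GC_MAX_COLLECT;
}

int main(void)
{
	struct list_state l = { .count = 80, .last_gc_count = 10, .last_gc = 5 };

	printf("%d\n", gc_needed(&l, 5));	/* 1: grew by 70 in this jiffy */
	l.count = 20;
	printf("%d\n", gc_needed(&l, 5));	/* 0: only grew by 10 */
	printf("%d\n", gc_needed(&l, 6));	/* 1: new jiffy */
	return 0;
}
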
@@ -316,13 +331,14 @@ static bool __nf_conncount_gc_list(struct net *net,
}
nf_ct_put(found_ct);
- if (collected > CONNCOUNT_GC_MAX_NODES)
+ if (collected > CONNCOUNT_GC_MAX_COLLECT)
break;
}
if (!list->count)
ret = true;
list->last_gc = (u32)jiffies;
+ list->last_gc_count = list->count;
return ret;
}
diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c
index be654363f53f..40c261cd0af3 100644
--- a/net/netfilter/nf_conntrack_bpf.c
+++ b/net/netfilter/nf_conntrack_bpf.c
@@ -14,6 +14,7 @@
#include <linux/types.h>
#include <linux/btf_ids.h>
#include <linux/net_namespace.h>
+#include <net/sock.h>
#include <net/xdp.h>
#include <net/netfilter/nf_conntrack_bpf.h>
#include <net/netfilter/nf_conntrack_core.h>
diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c
index 14f73872f647..17f1f453d481 100644
--- a/net/netfilter/nf_conntrack_h323_main.c
+++ b/net/netfilter/nf_conntrack_h323_main.c
@@ -23,6 +23,7 @@
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/ip6_route.h>
+#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <net/netfilter/nf_conntrack.h>
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 3a04665adf99..662f6bbfa805 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -32,6 +32,7 @@
#include <linux/siphash.h>
#include <linux/netfilter.h>
+#include <net/ipv6.h>
#include <net/netlink.h>
#include <net/sock.h>
#include <net/netfilter/nf_conntrack.h>
diff --git a/net/netfilter/nf_conntrack_ovs.c b/net/netfilter/nf_conntrack_ovs.c
index 068e9489e1c2..a6988eeb1579 100644
--- a/net/netfilter/nf_conntrack_ovs.c
+++ b/net/netfilter/nf_conntrack_ovs.c
@@ -121,7 +121,7 @@ int nf_ct_skb_network_trim(struct sk_buff *skb, int family)
len = skb_ip_totlen(skb);
break;
case NFPROTO_IPV6:
- len = ntohs(ipv6_hdr(skb)->payload_len);
+ len = skb_ipv6_payload_len(skb);
if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) {
int err = nf_ip6_check_hbh_len(skb, &len);
diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c
index e831637bc8ca..cb260eb3d012 100644
--- a/net/netfilter/nf_conntrack_proto_generic.c
+++ b/net/netfilter/nf_conntrack_proto_generic.c
@@ -67,6 +67,7 @@ void nf_conntrack_generic_init_net(struct net *net)
const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic =
{
.l4proto = 255,
+ .allow_clash = true,
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
.ctnl_timeout = {
.nlattr_to_obj = generic_timeout_nlattr_to_obj,
diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c
index af369e686fc5..b894bb7a97ad 100644
--- a/net/netfilter/nf_conntrack_proto_gre.c
+++ b/net/netfilter/nf_conntrack_proto_gre.c
@@ -33,12 +33,14 @@
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <net/dst.h>
+#include <net/gre.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_timeout.h>
+#include <net/pptp.h>
#include <linux/netfilter/nf_conntrack_proto_gre.h>
#include <linux/netfilter/nf_conntrack_pptp.h>
diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c
index b38b7164acd5..32148a3a8509 100644
--- a/net/netfilter/nf_conntrack_proto_icmp.c
+++ b/net/netfilter/nf_conntrack_proto_icmp.c
@@ -365,6 +365,7 @@ void nf_conntrack_icmp_init_net(struct net *net)
const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp =
{
.l4proto = IPPROTO_ICMP,
+ .allow_clash = true,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = icmp_tuple_to_nlattr,
.nlattr_tuple_size = icmp_nlattr_tuple_size,
diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c
index 327b8059025d..e508b3aa370a 100644
--- a/net/netfilter/nf_conntrack_proto_icmpv6.c
+++ b/net/netfilter/nf_conntrack_proto_icmpv6.c
@@ -343,6 +343,7 @@ void nf_conntrack_icmpv6_init_net(struct net *net)
const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 =
{
.l4proto = IPPROTO_ICMPV6,
+ .allow_clash = true,
#if IS_ENABLED(CONFIG_NF_CT_NETLINK)
.tuple_to_nlattr = icmpv6_tuple_to_nlattr,
.nlattr_tuple_size = icmpv6_nlattr_tuple_size,
diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c
index 06e8251a6644..2c4140e6f53c 100644
--- a/net/netfilter/nf_flow_table_core.c
+++ b/net/netfilter/nf_flow_table_core.c
@@ -16,6 +16,7 @@
static DEFINE_MUTEX(flowtable_lock);
static LIST_HEAD(flowtables);
+static __read_mostly struct kmem_cache *flow_offload_cachep;
static void
flow_offload_fill_dir(struct flow_offload *flow,
@@ -56,7 +57,7 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
if (unlikely(nf_ct_is_dying(ct)))
return NULL;
- flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
+ flow = kmem_cache_zalloc(flow_offload_cachep, GFP_ATOMIC);
if (!flow)
return NULL;
@@ -812,9 +813,13 @@ static int __init nf_flow_table_module_init(void)
{
int ret;
+ flow_offload_cachep = KMEM_CACHE(flow_offload, SLAB_HWCACHE_ALIGN);
+ if (!flow_offload_cachep)
+ return -ENOMEM;
+
ret = register_pernet_subsys(&nf_flow_table_net_ops);
if (ret < 0)
- return ret;
+ goto out_pernet;
ret = nf_flow_table_offload_init();
if (ret)
@@ -830,6 +835,8 @@ out_bpf:
nf_flow_table_offload_exit();
out_offload:
unregister_pernet_subsys(&nf_flow_table_net_ops);
+out_pernet:
+ kmem_cache_destroy(flow_offload_cachep);
return ret;
}
@@ -837,6 +844,7 @@ static void __exit nf_flow_table_module_exit(void)
{
nf_flow_table_offload_exit();
unregister_pernet_subsys(&nf_flow_table_net_ops);
+ kmem_cache_destroy(flow_offload_cachep);
}
module_init(nf_flow_table_module_init);
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 78883343e5d6..3fdb10d9bf7f 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -8,10 +8,13 @@
#include <linux/ipv6.h>
#include <linux/netdevice.h>
#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <net/gre.h>
#include <net/gso.h>
#include <net/ip.h>
#include <net/ipv6.h>
#include <net/ip6_route.h>
+#include <net/ip6_tunnel.h>
#include <net/neighbour.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack_acct.h>
@@ -142,12 +145,26 @@ static bool ip_has_options(unsigned int thoff)
return thoff != sizeof(struct iphdr);
}
-static void nf_flow_tuple_encap(struct sk_buff *skb,
+struct nf_flowtable_ctx {
+ const struct net_device *in;
+ u32 offset;
+ u32 hdrsize;
+ struct {
+ /* Tunnel IP header size */
+ u32 hdr_size;
+ /* IP tunnel protocol */
+ u8 proto;
+ } tun;
+};
+
+static void nf_flow_tuple_encap(struct nf_flowtable_ctx *ctx,
+ struct sk_buff *skb,
struct flow_offload_tuple *tuple)
{
__be16 inner_proto = skb->protocol;
struct vlan_ethhdr *veth;
struct pppoe_hdr *phdr;
+ struct ipv6hdr *ip6h;
struct iphdr *iph;
u16 offset = 0;
int i = 0;
@@ -174,22 +191,28 @@ static void nf_flow_tuple_encap(struct sk_buff *skb,
break;
}
- if (inner_proto == htons(ETH_P_IP)) {
+ switch (inner_proto) {
+ case htons(ETH_P_IP):
iph = (struct iphdr *)(skb_network_header(skb) + offset);
- if (iph->protocol == IPPROTO_IPIP) {
+ if (ctx->tun.proto == IPPROTO_IPIP) {
tuple->tun.dst_v4.s_addr = iph->daddr;
tuple->tun.src_v4.s_addr = iph->saddr;
tuple->tun.l3_proto = IPPROTO_IPIP;
}
+ break;
+ case htons(ETH_P_IPV6):
+ ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset);
+ if (ctx->tun.proto == IPPROTO_IPV6) {
+ tuple->tun.dst_v6 = ip6h->daddr;
+ tuple->tun.src_v6 = ip6h->saddr;
+ tuple->tun.l3_proto = IPPROTO_IPV6;
+ }
+ break;
+ default:
+ break;
}
}
-struct nf_flowtable_ctx {
- const struct net_device *in;
- u32 offset;
- u32 hdrsize;
-};
-
static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
struct flow_offload_tuple *tuple)
{
@@ -257,7 +280,7 @@ static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
tuple->l3proto = AF_INET;
tuple->l4proto = ipproto;
tuple->iifidx = ctx->in->ifindex;
- nf_flow_tuple_encap(skb, tuple);
+ nf_flow_tuple_encap(ctx, skb, tuple);
return 0;
}
@@ -293,15 +316,16 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb,
return NF_STOLEN;
}
-static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize)
+static bool nf_flow_ip4_tunnel_proto(struct nf_flowtable_ctx *ctx,
+ struct sk_buff *skb)
{
struct iphdr *iph;
u16 size;
- if (!pskb_may_pull(skb, sizeof(*iph) + *psize))
+ if (!pskb_may_pull(skb, sizeof(*iph) + ctx->offset))
return false;
- iph = (struct iphdr *)(skb_network_header(skb) + *psize);
+ iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset);
size = iph->ihl << 2;
if (ip_is_fragment(iph) || unlikely(ip_has_options(size)))
@@ -310,25 +334,62 @@ static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize)
if (iph->ttl <= 1)
return false;
- if (iph->protocol == IPPROTO_IPIP)
- *psize += size;
+ if (iph->protocol == IPPROTO_IPIP) {
+ ctx->tun.proto = IPPROTO_IPIP;
+ ctx->tun.hdr_size = size;
+ ctx->offset += size;
+ }
return true;
}
-static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb)
+static bool nf_flow_ip6_tunnel_proto(struct nf_flowtable_ctx *ctx,
+ struct sk_buff *skb)
{
- struct iphdr *iph = (struct iphdr *)skb_network_header(skb);
+#if IS_ENABLED(CONFIG_IPV6)
+ struct ipv6hdr *ip6h, _ip6h;
+ __be16 frag_off;
+ u8 nexthdr;
+ int hdrlen;
+
+ ip6h = skb_header_pointer(skb, ctx->offset, sizeof(*ip6h), &_ip6h);
+ if (!ip6h)
+ return false;
+
+ if (ip6h->hop_limit <= 1)
+ return false;
+
+ nexthdr = ip6h->nexthdr;
+ hdrlen = ipv6_skip_exthdr(skb, sizeof(*ip6h) + ctx->offset, &nexthdr,
+ &frag_off);
+ if (hdrlen < 0)
+ return false;
+
+ if (nexthdr == IPPROTO_IPV6) {
+ ctx->tun.hdr_size = hdrlen;
+ ctx->tun.proto = IPPROTO_IPV6;
+ }
+ ctx->offset += ctx->tun.hdr_size;
+
+ return true;
+#else
+ return false;
+#endif /* IS_ENABLED(CONFIG_IPV6) */
+}
- if (iph->protocol != IPPROTO_IPIP)
+static void nf_flow_ip_tunnel_pop(struct nf_flowtable_ctx *ctx,
+ struct sk_buff *skb)
+{
+ if (ctx->tun.proto != IPPROTO_IPIP &&
+ ctx->tun.proto != IPPROTO_IPV6)
return;
- skb_pull(skb, iph->ihl << 2);
+ skb_pull(skb, ctx->tun.hdr_size);
skb_reset_network_header(skb);
}
-static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
- u32 *offset)
+static bool nf_flow_skb_encap_protocol(struct nf_flowtable_ctx *ctx,
+ struct sk_buff *skb, __be16 proto)
{
__be16 inner_proto = skb->protocol;
struct vlan_ethhdr *veth;
@@ -341,7 +402,7 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
veth = (struct vlan_ethhdr *)skb_mac_header(skb);
if (veth->h_vlan_encapsulated_proto == proto) {
- *offset += VLAN_HLEN;
+ ctx->offset += VLAN_HLEN;
inner_proto = proto;
ret = true;
}
@@ -349,19 +410,28 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
case htons(ETH_P_PPP_SES):
if (nf_flow_pppoe_proto(skb, &inner_proto) &&
inner_proto == proto) {
- *offset += PPPOE_SES_HLEN;
+ ctx->offset += PPPOE_SES_HLEN;
ret = true;
}
break;
}
- if (inner_proto == htons(ETH_P_IP))
- ret = nf_flow_ip4_tunnel_proto(skb, offset);
+ switch (inner_proto) {
+ case htons(ETH_P_IP):
+ ret = nf_flow_ip4_tunnel_proto(ctx, skb);
+ break;
+ case htons(ETH_P_IPV6):
+ ret = nf_flow_ip6_tunnel_proto(ctx, skb);
+ break;
+ default:
+ break;
+ }
return ret;
}
-static void nf_flow_encap_pop(struct sk_buff *skb,
+static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx,
+ struct sk_buff *skb,
struct flow_offload_tuple_rhash *tuplehash)
{
struct vlan_hdr *vlan_hdr;
@@ -387,8 +457,9 @@ static void nf_flow_encap_pop(struct sk_buff *skb,
}
}
- if (skb->protocol == htons(ETH_P_IP))
- nf_flow_ip4_tunnel_pop(skb);
+ if (skb->protocol == htons(ETH_P_IP) ||
+ skb->protocol == htons(ETH_P_IPV6))
+ nf_flow_ip_tunnel_pop(ctx, skb);
}
struct nf_flow_xmit {
@@ -414,7 +485,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx,
{
struct flow_offload_tuple tuple = {};
- if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset))
+ if (!nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IP)))
return NULL;
if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0)
@@ -458,7 +529,7 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx,
flow_offload_refresh(flow_table, flow, false);
- nf_flow_encap_pop(skb, tuplehash);
+ nf_flow_encap_pop(ctx, skb, tuplehash);
thoff -= ctx->offset;
iph = ip_hdr(skb);
@@ -567,6 +638,97 @@ static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb,
return 0;
}
+struct ipv6_tel_txoption {
+ struct ipv6_txoptions ops;
+ __u8 dst_opt[8];
+};
+
+static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ struct in6_addr **ip6_daddr,
+ int encap_limit)
+{
+ struct ipv6hdr *ip6h = (struct ipv6hdr *)skb_network_header(skb);
+ u8 hop_limit = ip6h->hop_limit, proto = IPPROTO_IPV6;
+ struct rtable *rt = dst_rtable(tuple->dst_cache);
+ __u8 dsfield = ipv6_get_dsfield(ip6h);
+ struct flowi6 fl6 = {
+ .daddr = tuple->tun.src_v6,
+ .saddr = tuple->tun.dst_v6,
+ .flowi6_proto = proto,
+ };
+ int err, mtu;
+ u32 headroom;
+
+ err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6);
+ if (err)
+ return err;
+
+ skb_set_inner_ipproto(skb, proto);
+ headroom = sizeof(*ip6h) + LL_RESERVED_SPACE(rt->dst.dev) +
+ rt->dst.header_len;
+ if (encap_limit)
+ headroom += 8;
+ err = skb_cow_head(skb, headroom);
+ if (err)
+ return err;
+
+ skb_scrub_packet(skb, true);
+ mtu = dst_mtu(&rt->dst) - sizeof(*ip6h);
+ if (encap_limit)
+ mtu -= 8;
+ mtu = max(mtu, IPV6_MIN_MTU);
+ skb_dst_update_pmtu_no_confirm(skb, mtu);
+
+ if (encap_limit > 0) {
+ struct ipv6_tel_txoption opt = {
+ .dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT,
+ .dst_opt[3] = 1,
+ .dst_opt[4] = encap_limit,
+ .dst_opt[5] = IPV6_TLV_PADN,
+ .dst_opt[6] = 1,
+ };
+ struct ipv6_opt_hdr *hopt;
+
+ opt.ops.dst1opt = (struct ipv6_opt_hdr *)opt.dst_opt;
+ opt.ops.opt_nflen = 8;
+
+ hopt = skb_push(skb, ipv6_optlen(opt.ops.dst1opt));
+ memcpy(hopt, opt.ops.dst1opt, ipv6_optlen(opt.ops.dst1opt));
+ hopt->nexthdr = IPPROTO_IPV6;
+ proto = NEXTHDR_DEST;
+ }
+
+ skb_push(skb, sizeof(*ip6h));
+ skb_reset_network_header(skb);
+
+ ip6h = ipv6_hdr(skb);
+ ip6_flow_hdr(ip6h, dsfield,
+ ip6_make_flowlabel(net, skb, fl6.flowlabel, true, &fl6));
+ ip6h->hop_limit = hop_limit;
+ ip6h->nexthdr = proto;
+ ip6h->daddr = tuple->tun.src_v6;
+ ip6h->saddr = tuple->tun.dst_v6;
+ ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(*ip6h));
+ IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+
+ *ip6_daddr = &tuple->tun.src_v6;
+
+ return 0;
+}
+
+static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb,
+ struct flow_offload_tuple *tuple,
+ struct in6_addr **ip6_daddr,
+ int encap_limit)
+{
+ if (tuple->tun_num)
+ return nf_flow_tunnel_ip6ip6_push(net, skb, tuple, ip6_daddr,
+ encap_limit);
+
+ return 0;
+}
+
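When an encapsulation limit is requested, the push path above prepends an 8-byte IPv6 Destination Options header carrying a tunnel encapsulation limit TLV (type 4) followed by PadN, per RFC 2473. A standalone sketch of those eight bytes:

#include <stdint.h>
#include <stdio.h>

/* Build the 8-byte destination options block used above: header bytes,
 * the encapsulation limit TLV, and PadN to fill the 8-octet unit.
 * Constant values follow RFC 2473 / RFC 8200. */
static void build_tel_opt(uint8_t opt[8], uint8_t nexthdr, uint8_t limit)
{
	opt[0] = nexthdr;	/* next header, IPPROTO_IPV6 (41) here */
	opt[1] = 0;		/* Hdr Ext Len: (8 / 8) - 1 */
	opt[2] = 4;		/* IPV6_TLV_TNL_ENCAP_LIMIT */
	opt[3] = 1;		/* option data length */
	opt[4] = limit;		/* remaining nested encapsulations */
	opt[5] = 1;		/* IPV6_TLV_PADN */
	opt[6] = 1;		/* one byte of padding data follows */
	opt[7] = 0;
}

int main(void)
{
	uint8_t opt[8];

	build_tel_opt(opt, 41, 4 /* IPV6_DEFAULT_TNL_ENCAP_LIMIT */);
	for (int i = 0; i < 8; i++)
		printf("%02x ", opt[i]);
	printf("\n");
	return 0;
}
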
static int nf_flow_encap_push(struct sk_buff *skb,
struct flow_offload_tuple *tuple)
{
@@ -836,7 +998,7 @@ static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
tuple->l3proto = AF_INET6;
tuple->l4proto = nexthdr;
tuple->iifidx = ctx->in->ifindex;
- nf_flow_tuple_encap(skb, tuple);
+ nf_flow_tuple_encap(ctx, skb, tuple);
return 0;
}
@@ -844,7 +1006,7 @@ static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
struct nf_flowtable *flow_table,
struct flow_offload_tuple_rhash *tuplehash,
- struct sk_buff *skb)
+ struct sk_buff *skb, int encap_limit)
{
enum flow_offload_tuple_dir dir;
struct flow_offload *flow;
@@ -855,6 +1017,12 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset;
+ if (flow->tuplehash[!dir].tuple.tun_num) {
+ mtu -= sizeof(*ip6h);
+ if (encap_limit > 0)
+ mtu -= 8; /* encap limit option */
+ }
+
if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
return 0;
@@ -873,7 +1041,7 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
flow_offload_refresh(flow_table, flow, false);
- nf_flow_encap_pop(skb, tuplehash);
+ nf_flow_encap_pop(ctx, skb, tuplehash);
ip6h = ipv6_hdr(skb);
nf_flow_nat_ipv6(flow, skb, dir, ip6h);
@@ -894,8 +1062,7 @@ nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx,
{
struct flow_offload_tuple tuple = {};
- if (skb->protocol != htons(ETH_P_IPV6) &&
- !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &ctx->offset))
+ if (!nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IPV6)))
return NULL;
if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0)
@@ -908,6 +1075,7 @@ unsigned int
nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
const struct nf_hook_state *state)
{
+ int encap_limit = IPV6_DEFAULT_TNL_ENCAP_LIMIT;
struct flow_offload_tuple_rhash *tuplehash;
struct nf_flowtable *flow_table = priv;
struct flow_offload_tuple *other_tuple;
@@ -926,7 +1094,8 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
if (tuplehash == NULL)
return NF_ACCEPT;
- ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb);
+ ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb,
+ encap_limit);
if (ret < 0)
return NF_DROP;
else if (ret == 0)
@@ -945,6 +1114,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
other_tuple = &flow->tuplehash[!dir].tuple;
ip6_daddr = &other_tuple->src_v6;
+ if (nf_flow_tunnel_v6_push(state->net, skb, other_tuple,
+ &ip6_daddr, encap_limit) < 0)
+ return NF_DROP;
+
if (nf_flow_encap_push(skb, other_tuple) < 0)
return NF_DROP;
diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c
index d8f7bfd60ac6..b1966b68c48a 100644
--- a/net/netfilter/nf_flow_table_offload.c
+++ b/net/netfilter/nf_flow_table_offload.c
@@ -6,6 +6,7 @@
#include <linux/netdevice.h>
#include <linux/tc_act/tc_csum.h>
#include <net/flow_offload.h>
+#include <net/ip_tunnels.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_conntrack.h>
diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c
index eb24fe2715dc..6bb9579dcc2a 100644
--- a/net/netfilter/nf_flow_table_path.c
+++ b/net/netfilter/nf_flow_table_path.c
@@ -2,6 +2,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
+#include <linux/etherdevice.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/spinlock.h>
diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c
index 86d5fc5d28e3..41503847d9d7 100644
--- a/net/netfilter/nf_log_syslog.c
+++ b/net/netfilter/nf_log_syslog.c
@@ -561,7 +561,7 @@ dump_ipv6_packet(struct net *net, struct nf_log_buf *m,
/* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */
nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ",
- ntohs(ih->payload_len) + sizeof(struct ipv6hdr),
+ ipv6_payload_len(skb, ih) + sizeof(struct ipv6hdr),
(ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20,
ih->hop_limit,
(ntohl(*(__be32 *)ih) & 0x000fffff));
diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c
index 0f9a559f6207..31474e8c034a 100644
--- a/net/netfilter/nf_nat_ovs.c
+++ b/net/netfilter/nf_nat_ovs.c
@@ -2,6 +2,9 @@
/* Support nat functions for openvswitch and used by OVS and TC conntrack. */
#include <net/netfilter/nf_nat.h>
+#include <net/ipv6.h>
+#include <linux/ip.h>
+#include <linux/if_vlan.h>
/* Modelled after nf_nat_ipv[46]_fn().
* range is only used for new, uninitialized NAT state.
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c
index b14a434b9561..97c0f841fc96 100644
--- a/net/netfilter/nf_nat_proto.c
+++ b/net/netfilter/nf_nat_proto.c
@@ -25,6 +25,7 @@
#include <net/ip6_route.h>
#include <net/xfrm.h>
#include <net/ipv6.h>
+#include <net/pptp.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack.h>
diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c
index 3fa3f5dfb264..57f57e2fc80a 100644
--- a/net/netfilter/nf_synproxy_core.c
+++ b/net/netfilter/nf_synproxy_core.c
@@ -10,6 +10,7 @@
#include <net/netns/generic.h>
#include <linux/proc_fs.h>
+#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter/nf_synproxy.h>
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index be92750e2af3..1ed034a47bd0 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -14,6 +14,7 @@
#include <linux/rhashtable.h>
#include <linux/audit.h>
#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
@@ -7269,7 +7270,8 @@ static u32 nft_set_maxsize(const struct nft_set *set)
}
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
- const struct nlattr *attr, u32 nlmsg_flags)
+ const struct nlattr *attr, u32 nlmsg_flags,
+ bool last)
{
struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {};
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
@@ -7555,6 +7557,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
if (flags)
*nft_set_ext_flags(ext) = flags;
+ if (last)
+ elem.flags = NFT_SET_ELEM_INTERNAL_LAST;
+ else
+ elem.flags = 0;
+
if (obj)
*nft_set_ext_obj(ext) = obj;
@@ -7635,6 +7642,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
* and an existing one.
*/
err = -EEXIST;
+ } else if (err == -ECANCELED) {
+				/* ECANCELED reports an already existing null
+				 * element in interval sets.
+				 */
+ err = 0;
}
goto err_element_clash;
}
@@ -7713,7 +7725,8 @@ static int nf_tables_newsetelem(struct sk_buff *skb,
nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla);
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
- err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags);
+ err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags,
+ nla_is_last(attr, rem));
if (err < 0) {
NL_SET_BAD_ATTR(extack, attr);
return err;
@@ -7806,7 +7819,8 @@ static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx,
continue;
}
- if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv))
+ if (!te->set->ops->abort_skip_removal ||
+ nft_setelem_is_catchall(te->set, te->elems[i].priv))
nft_setelem_remove(ctx->net, te->set, te->elems[i].priv);
if (!nft_setelem_is_catchall(te->set, te->elems[i].priv))
@@ -7836,7 +7850,7 @@ static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx,
}
static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
- const struct nlattr *attr)
+ const struct nlattr *attr, bool last)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
struct nft_set_ext_tmpl tmpl;
@@ -7904,6 +7918,11 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (flags)
*nft_set_ext_flags(ext) = flags;
+ if (last)
+ elem.flags = NFT_SET_ELEM_INTERNAL_LAST;
+ else
+ elem.flags = 0;
+
trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
if (trans == NULL)
goto fail_trans;
@@ -8051,7 +8070,8 @@ static int nf_tables_delsetelem(struct sk_buff *skb,
return nft_set_flush(&ctx, set, genmask);
nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) {
- err = nft_del_setelem(&ctx, set, attr);
+ err = nft_del_setelem(&ctx, set, attr,
+ nla_is_last(attr, rem));
if (err == -ENOENT &&
NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM)
continue;
@@ -11536,6 +11556,13 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb,
ret = __nf_tables_abort(net, action);
nft_gc_seq_end(nft_net, gc_seq);
+ if (action == NFNL_ABORT_NONE) {
+ struct nft_table *table;
+
+ list_for_each_entry(table, &nft_net->tables, list)
+ table->validate_state = NFT_VALIDATE_SKIP;
+ }
+
WARN_ON_ONCE(!list_empty(&nft_net->commit_list));
/* module autoload needs to happen after GC sequence update because it
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 8b7b39d8a109..f1c8049861a6 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -30,6 +30,8 @@
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/list.h>
#include <linux/cgroup-defs.h>
+#include <linux/rhashtable.h>
+#include <linux/jhash.h>
#include <net/gso.h>
#include <net/sock.h>
#include <net/tcp_states.h>
@@ -47,6 +49,8 @@
#endif
#define NFQNL_QMAX_DEFAULT 1024
+#define NFQNL_HASH_MIN 1024
+#define NFQNL_HASH_MAX 1048576
/* We're using struct nlattr which has 16bit nla_len. Note that nla_len
* includes the header length. Thus, the maximum packet length that we
@@ -56,6 +60,26 @@
*/
#define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN)
+/* Composite key for packet lookup: (net, queue_num, packet_id) */
+struct nfqnl_packet_key {
+ possible_net_t net;
+ u32 packet_id;
+ u16 queue_num;
+} __aligned(sizeof(u32)); /* jhash2 requires 32-bit alignment */
+
+/* Global rhashtable: one for the entire system, shared across all netns */
+static struct rhashtable nfqnl_packet_map __read_mostly;
+
+/* Helper to initialize composite key */
+static inline void nfqnl_init_key(struct nfqnl_packet_key *key,
+ struct net *net, u32 packet_id, u16 queue_num)
+{
+ memset(key, 0, sizeof(*key));
+ write_pnet(&key->net, net);
+ key->packet_id = packet_id;
+ key->queue_num = queue_num;
+}
+
struct nfqnl_instance {
struct hlist_node hlist; /* global list of queues */
struct rcu_head rcu;
@@ -100,6 +124,39 @@ static inline u_int8_t instance_hashfn(u_int16_t queue_num)
return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS;
}
+/* Extract composite key from nf_queue_entry for hashing */
+static u32 nfqnl_packet_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+ const struct nf_queue_entry *entry = data;
+ struct nfqnl_packet_key key;
+
+ nfqnl_init_key(&key, entry->state.net, entry->id, entry->queue_num);
+
+ return jhash2((u32 *)&key, sizeof(key) / sizeof(u32), seed);
+}
+
+/* Compare stack-allocated key against entry */
+static int nfqnl_packet_obj_cmpfn(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct nfqnl_packet_key *key = arg->key;
+ const struct nf_queue_entry *entry = obj;
+
+ return !net_eq(entry->state.net, read_pnet(&key->net)) ||
+ entry->queue_num != key->queue_num ||
+ entry->id != key->packet_id;
+}
+
+static const struct rhashtable_params nfqnl_rhashtable_params = {
+ .head_offset = offsetof(struct nf_queue_entry, hash_node),
+ .key_len = sizeof(struct nfqnl_packet_key),
+ .obj_hashfn = nfqnl_packet_obj_hashfn,
+ .obj_cmpfn = nfqnl_packet_obj_cmpfn,
+ .automatic_shrinking = true,
+ .min_size = NFQNL_HASH_MIN,
+ .max_size = NFQNL_HASH_MAX,
+};
+
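Because the key struct is hashed as raw 32-bit words (jhash2 in the kernel), nfqnl_init_key() must memset() it first so compiler padding cannot make otherwise-equal keys hash differently. A userspace sketch of the same precaution; the word-mixing hash below is illustrative, not jhash2:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct key {
	uint64_t net;		/* stand-in for possible_net_t */
	uint32_t packet_id;
	uint16_t queue_num;	/* two padding bytes follow here */
};

/* Illustrative word-mixing hash over the raw key bytes. */
static uint32_t hash_words(const void *p, size_t len, uint32_t seed)
{
	uint32_t h = seed, w;

	for (size_t off = 0; off + 4 <= len; off += 4) {
		memcpy(&w, (const char *)p + off, 4);
		h = (h ^ w) * 0x9e3779b1u;
	}
	return h;
}

static uint32_t key_hash(uint64_t net, uint32_t id, uint16_t q)
{
	struct key k;

	memset(&k, 0, sizeof(k));	/* zero the padding, as nfqnl_init_key() does */
	k.net = net;
	k.packet_id = id;
	k.queue_num = q;
	return hash_words(&k, sizeof(k), 0);
}

int main(void)
{
	/* the same tuple always hashes the same, padding notwithstanding */
	printf("%u\n", key_hash(1, 42, 7) == key_hash(1, 42, 7));
	return 0;
}
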
static struct nfqnl_instance *
instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num)
{
@@ -121,17 +178,9 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
unsigned int h;
int err;
- spin_lock(&q->instances_lock);
- if (instance_lookup(q, queue_num)) {
- err = -EEXIST;
- goto out_unlock;
- }
-
- inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
- if (!inst) {
- err = -ENOMEM;
- goto out_unlock;
- }
+ inst = kzalloc(sizeof(*inst), GFP_KERNEL_ACCOUNT);
+ if (!inst)
+ return ERR_PTR(-ENOMEM);
inst->queue_num = queue_num;
inst->peer_portid = portid;
@@ -141,9 +190,15 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
spin_lock_init(&inst->lock);
INIT_LIST_HEAD(&inst->queue_list);
+ spin_lock(&q->instances_lock);
+ if (instance_lookup(q, queue_num)) {
+ err = -EEXIST;
+ goto out_unlock;
+ }
+
if (!try_module_get(THIS_MODULE)) {
err = -EAGAIN;
- goto out_free;
+ goto out_unlock;
}
h = instance_hashfn(queue_num);
@@ -153,10 +208,9 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid)
return inst;
-out_free:
- kfree(inst);
out_unlock:
spin_unlock(&q->instances_lock);
+ kfree(inst);
return ERR_PTR(err);
}
@@ -191,33 +245,45 @@ instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst)
spin_unlock(&q->instances_lock);
}
-static inline void
+static int
__enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
{
- list_add_tail(&entry->list, &queue->queue_list);
- queue->queue_total++;
+ int err;
+
+ entry->queue_num = queue->queue_num;
+
+ err = rhashtable_insert_fast(&nfqnl_packet_map, &entry->hash_node,
+ nfqnl_rhashtable_params);
+ if (unlikely(err))
+ return err;
+
+ list_add_tail(&entry->list, &queue->queue_list);
+ queue->queue_total++;
+
+ return 0;
}
static void
__dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
{
+ rhashtable_remove_fast(&nfqnl_packet_map, &entry->hash_node,
+ nfqnl_rhashtable_params);
list_del(&entry->list);
queue->queue_total--;
}
static struct nf_queue_entry *
-find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
+find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id,
+ struct net *net)
{
- struct nf_queue_entry *entry = NULL, *i;
+ struct nfqnl_packet_key key;
+ struct nf_queue_entry *entry;
- spin_lock_bh(&queue->lock);
+ nfqnl_init_key(&key, net, id, queue->queue_num);
- list_for_each_entry(i, &queue->queue_list, list) {
- if (i->id == id) {
- entry = i;
- break;
- }
- }
+ spin_lock_bh(&queue->lock);
+ entry = rhashtable_lookup_fast(&nfqnl_packet_map, &key,
+ nfqnl_rhashtable_params);
if (entry)
__dequeue_entry(queue, entry);
@@ -369,6 +435,34 @@ next_hook:
nf_queue_entry_free(entry);
}
+/* Return true if the entry has an unconfirmed conntrack attached that we
+ * don't own exclusively.
+ */
+static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry, bool *is_unconfirmed)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ struct nf_conn *ct = (void *)skb_nfct(entry->skb);
+
+ if (!ct || nf_ct_is_confirmed(ct))
+ return false;
+
+ if (is_unconfirmed)
+ *is_unconfirmed = true;
+
+ /* in some cases skb_clone() can occur after initial conntrack
+ * pickup, but conntrack assumes exclusive skb->_nfct ownership for
+ * unconfirmed entries.
+ *
+ * This happens for br_netfilter and with ip multicast routing.
+ * This can't be solved with serialization here because one clone
+ * could have been queued for local delivery or could be transmitted
+ * in parallel on another CPU.
+ */
+ return refcount_read(&ct->ct_general.use) > 1;
+#endif
+ return false;
+}
+
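The refcount_read() > 1 test above is the usual exclusive-ownership check: a count of exactly one means no other CPU can be holding the unconfirmed entry. A rough userspace analog with C11 atomics (a sketch, not the kernel's refcount_t; names illustrative):

	#include <stdatomic.h>
	#include <stdbool.h>

	struct obj {
		atomic_uint use;	/* stands in for refcount_t */
	};

	/* True if another holder exists: a count of exactly 1 means the
	 * caller is the sole owner and may safely mutate the object. */
	static bool shared_by_others(struct obj *o)
	{
		return atomic_load_explicit(&o->use, memory_order_relaxed) > 1;
	}

	int main(void)
	{
		struct obj o = { 1 };

		return shared_by_others(&o);	/* exits 0: exclusive owner */
	}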
static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
{
const struct nf_ct_hook *ct_hook;
@@ -396,6 +490,24 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
break;
}
}
+
+ if (verdict != NF_DROP && entry->nf_ct_is_unconfirmed) {
+ /* If first queued segment was already reinjected then
+ * there is a good chance the ct entry is now confirmed.
+ *
+ * Handle the rare cases:
+ * - out-of-order verdict
+ * - threaded userspace reinjecting in parallel
+ * - first segment was dropped
+ *
+ * In all of those cases we can't handle this packet
+ * because we can't be sure that another CPU won't modify
+ * nf_conn->ext in parallel which isn't allowed.
+ */
+ if (nf_ct_drop_unconfirmed(entry, NULL))
+ verdict = NF_DROP;
+ }
+
nf_reinject(entry, verdict);
}
@@ -407,8 +519,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
spin_lock_bh(&queue->lock);
list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
if (!cmpfn || cmpfn(entry, data)) {
- list_del(&entry->list);
- queue->queue_total--;
+ __dequeue_entry(queue, entry);
nfqnl_reinject(entry, NF_DROP);
}
}
@@ -826,49 +937,6 @@ nlmsg_failure:
return NULL;
}
-static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry)
-{
-#if IS_ENABLED(CONFIG_NF_CONNTRACK)
- static const unsigned long flags = IPS_CONFIRMED | IPS_DYING;
- struct nf_conn *ct = (void *)skb_nfct(entry->skb);
- unsigned long status;
- unsigned int use;
-
- if (!ct)
- return false;
-
- status = READ_ONCE(ct->status);
- if ((status & flags) == IPS_DYING)
- return true;
-
- if (status & IPS_CONFIRMED)
- return false;
-
- /* in some cases skb_clone() can occur after initial conntrack
- * pickup, but conntrack assumes exclusive skb->_nfct ownership for
- * unconfirmed entries.
- *
- * This happens for br_netfilter and with ip multicast routing.
- * We can't be solved with serialization here because one clone could
- * have been queued for local delivery.
- */
- use = refcount_read(&ct->ct_general.use);
- if (likely(use == 1))
- return false;
-
- /* Can't decrement further? Exclusive ownership. */
- if (!refcount_dec_not_one(&ct->ct_general.use))
- return false;
-
- skb_set_nfct(entry->skb, 0);
- /* No nf_ct_put(): we already decremented .use and it cannot
- * drop down to 0.
- */
- return true;
-#endif
- return false;
-}
-
static int
__nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
struct nf_queue_entry *entry)
@@ -885,26 +953,23 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
}
spin_lock_bh(&queue->lock);
- if (nf_ct_drop_unconfirmed(entry))
- goto err_out_free_nskb;
+ if (queue->queue_total >= queue->queue_maxlen)
+ goto err_out_queue_drop;
- if (queue->queue_total >= queue->queue_maxlen) {
- if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
- failopen = 1;
- err = 0;
- } else {
- queue->queue_dropped++;
- net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n",
- queue->queue_total);
- }
- goto err_out_free_nskb;
- }
entry->id = ++queue->id_sequence;
*packet_id_ptr = htonl(entry->id);
+ /* Insert into hash BEFORE unicast: if insertion fails, don't send it to userspace. */
+ err = __enqueue_entry(queue, entry);
+ if (unlikely(err))
+ goto err_out_queue_drop;
+
/* nfnetlink_unicast will either free the nskb or add it to a socket */
err = nfnetlink_unicast(nskb, net, queue->peer_portid);
if (err < 0) {
+ /* Unicast failed - remove entry we just inserted */
+ __dequeue_entry(queue, entry);
+
if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
failopen = 1;
err = 0;
@@ -914,12 +979,22 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
goto err_out_unlock;
}
- __enqueue_entry(queue, entry);
-
spin_unlock_bh(&queue->lock);
return 0;
-err_out_free_nskb:
+err_out_queue_drop:
+ if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
+ failopen = 1;
+ err = 0;
+ } else {
+ queue->queue_dropped++;
+
+ if (queue->queue_total >= queue->queue_maxlen)
+ net_warn_ratelimited("nf_queue: full at %d entries, dropping packet(s)\n",
+ queue->queue_total);
+ else
+ net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err);
+ }
kfree_skb(nskb);
err_out_unlock:
spin_unlock_bh(&queue->lock);
@@ -998,9 +1073,10 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue,
static int
nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
{
- unsigned int queued;
- struct nfqnl_instance *queue;
struct sk_buff *skb, *segs, *nskb;
+ bool ct_is_unconfirmed = false;
+ struct nfqnl_instance *queue;
+ unsigned int queued;
int err = -ENOBUFS;
struct net *net = entry->state.net;
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
@@ -1024,6 +1100,15 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
break;
}
+ /* Check if someone already holds another reference to
+ * unconfirmed ct. If so, we cannot queue the skb:
+ * concurrent modifications of nf_conn->ext are not
+ * allowed and we can't know if another CPU isn't
+ * processing the same nf_conn entry in parallel.
+ */
+ if (nf_ct_drop_unconfirmed(entry, &ct_is_unconfirmed))
+ return -EINVAL;
+
if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb)))
return __nfqnl_enqueue_packet(net, queue, entry);
@@ -1037,7 +1122,23 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
goto out_err;
queued = 0;
err = 0;
+
skb_list_walk_safe(segs, segs, nskb) {
+ if (ct_is_unconfirmed && queued > 0) {
+ /* skb_gso_segment() increments the ct refcount.
+ * This is a problem for unconfirmed (not in hash)
+ * entries, those can race when reinjections happen
+ * in parallel.
+ *
+ * Annotate this for all queued entries except the
+ * first one.
+ *
+ * As long as the first one is reinjected first it
+ * will do the confirmation for us.
+ */
+ entry->nf_ct_is_unconfirmed = ct_is_unconfirmed;
+ }
+
if (err == 0)
err = __nfqnl_enqueue_packet_gso(net, queue,
segs, entry);
@@ -1430,7 +1531,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
verdict = ntohl(vhdr->verdict);
- entry = find_dequeue_entry(queue, ntohl(vhdr->id));
+ entry = find_dequeue_entry(queue, ntohl(vhdr->id), info->net);
if (entry == NULL)
return -ENOENT;
@@ -1498,7 +1599,8 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
struct nfqnl_msg_config_cmd *cmd = NULL;
struct nfqnl_instance *queue;
__u32 flags = 0, mask = 0;
- int ret = 0;
+
+ WARN_ON_ONCE(!lockdep_nfnl_is_held(NFNL_SUBSYS_QUEUE));
if (nfqa[NFQA_CFG_CMD]) {
cmd = nla_data(nfqa[NFQA_CFG_CMD]);
@@ -1544,47 +1646,44 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
}
}
+ /* Look up the queue under RCU. After the peer_portid check (or, for a
+ * new queue, in the BIND case), the queue is owned by the socket sending
+ * this message. A socket cannot send a message and close at the same
+ * time, so while this CONFIG message is being processed,
+ * nfqnl_rcv_nl_event() (triggered by socket close) cannot destroy the
+ * queue; the pointer stays valid after rcu_read_unlock().
+ */
rcu_read_lock();
queue = instance_lookup(q, queue_num);
if (queue && queue->peer_portid != NETLINK_CB(skb).portid) {
- ret = -EPERM;
- goto err_out_unlock;
+ rcu_read_unlock();
+ return -EPERM;
}
+ rcu_read_unlock();
if (cmd != NULL) {
switch (cmd->command) {
case NFQNL_CFG_CMD_BIND:
- if (queue) {
- ret = -EBUSY;
- goto err_out_unlock;
- }
- queue = instance_create(q, queue_num,
- NETLINK_CB(skb).portid);
- if (IS_ERR(queue)) {
- ret = PTR_ERR(queue);
- goto err_out_unlock;
- }
+ if (queue)
+ return -EBUSY;
+ queue = instance_create(q, queue_num, NETLINK_CB(skb).portid);
+ if (IS_ERR(queue))
+ return PTR_ERR(queue);
break;
case NFQNL_CFG_CMD_UNBIND:
- if (!queue) {
- ret = -ENODEV;
- goto err_out_unlock;
- }
+ if (!queue)
+ return -ENODEV;
instance_destroy(q, queue);
- goto err_out_unlock;
+ return 0;
case NFQNL_CFG_CMD_PF_BIND:
case NFQNL_CFG_CMD_PF_UNBIND:
break;
default:
- ret = -ENOTSUPP;
- goto err_out_unlock;
+ return -EOPNOTSUPP;
}
}
- if (!queue) {
- ret = -ENODEV;
- goto err_out_unlock;
- }
+ if (!queue)
+ return -ENODEV;
if (nfqa[NFQA_CFG_PARAMS]) {
struct nfqnl_msg_config_params *params =
@@ -1609,9 +1708,7 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info,
spin_unlock_bh(&queue->lock);
}
-err_out_unlock:
- rcu_read_unlock();
- return ret;
+ return 0;
}
static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = {
@@ -1781,10 +1878,14 @@ static int __init nfnetlink_queue_init(void)
{
int status;
+ status = rhashtable_init(&nfqnl_packet_map, &nfqnl_rhashtable_params);
+ if (status < 0)
+ return status;
+
status = register_pernet_subsys(&nfnl_queue_net_ops);
if (status < 0) {
pr_err("failed to register pernet ops\n");
- goto out;
+ goto cleanup_rhashtable;
}
netlink_register_notifier(&nfqnl_rtnl_notifier);
@@ -1809,7 +1910,8 @@ cleanup_netlink_subsys:
cleanup_netlink_notifier:
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
unregister_pernet_subsys(&nfnl_queue_net_ops);
-out:
+cleanup_rhashtable:
+ rhashtable_destroy(&nfqnl_packet_map);
return status;
}
@@ -1821,6 +1923,8 @@ static void __exit nfnetlink_queue_fini(void)
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
unregister_pernet_subsys(&nfnl_queue_net_ops);
+ rhashtable_destroy(&nfqnl_packet_map);
+
rcu_barrier(); /* Wait for completion of call_rcu()'s */
}
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 72711d62fddf..08f620311b03 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -134,7 +134,8 @@ static void nft_target_eval_bridge(const struct nft_expr *expr,
}
static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = {
- [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING },
+ [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING,
+ .len = XT_EXTENSION_MAXNAMELEN, },
[NFTA_TARGET_REV] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_TARGET_INFO] = { .type = NLA_BINARY },
};
@@ -434,7 +435,8 @@ static void nft_match_eval(const struct nft_expr *expr,
}
static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = {
- [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING },
+ [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING,
+ .len = XT_EXTENSION_MAXNAMELEN },
[NFTA_MATCH_REV] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_MATCH_INFO] = { .type = NLA_BINARY },
};
@@ -693,7 +695,12 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb,
name = nla_data(tb[NFTA_COMPAT_NAME]);
rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV]));
- target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE]));
+ /* The x_tables API treats 'target == 1' as a target;
+ * everything else means 'match'.
+ * In the x_tables world this number is set by the kernel,
+ * not by userspace.
+ */
+ target = nla_get_be32(tb[NFTA_COMPAT_TYPE]) == htonl(1);
switch(family) {
case AF_INET:
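The comparison against htonl(1) sidesteps a byte swap entirely: equality of two big-endian values is endianness-independent, and the result is a clean 0/1 bool. A trivial standalone illustration of the idiom:

	#include <arpa/inet.h>
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t wire = htonl(1);		/* value as carried in the attribute */
		bool is_target = wire == htonl(1);	/* no ntohl() needed */

		printf("%d\n", is_target);
		return 0;
	}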
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index cc7325329496..0d70325280cc 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -117,8 +117,8 @@ static void nft_counter_reset(struct nft_counter_percpu_priv *priv,
nft_sync = this_cpu_ptr(&nft_counter_sync);
u64_stats_update_begin(nft_sync);
- u64_stats_add(&this_cpu->packets, -total->packets);
- u64_stats_add(&this_cpu->bytes, -total->bytes);
+ u64_stats_sub(&this_cpu->packets, total->packets);
+ u64_stats_sub(&this_cpu->bytes, total->bytes);
u64_stats_update_end(nft_sync);
local_bh_enable();
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index b8f76c9057fd..179d0e59e2b5 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/etherdevice.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index ba01ce75d6de..739b992bde59 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -619,15 +619,20 @@ static struct nft_elem_priv *
nft_hash_get(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem, unsigned int flags)
{
+ const u32 *key = (const u32 *)&elem->key.val;
struct nft_hash *priv = nft_set_priv(set);
u8 genmask = nft_genmask_cur(net);
struct nft_hash_elem *he;
u32 hash;
- hash = jhash(elem->key.val.data, set->klen, priv->seed);
+ if (set->klen == 4)
+ hash = jhash_1word(*key, priv->seed);
+ else
+ hash = jhash(key, set->klen, priv->seed);
+
hash = reciprocal_scale(hash, priv->buckets);
hlist_for_each_entry_rcu(he, &priv->table[hash], node) {
- if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) &&
+ if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) &&
nft_set_elem_active(&he->ext, genmask))
return &he->priv;
}
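The klen == 4 fast path matters because single-word keys (IPv4 addresses, marks, padded ports) dominate in practice, and jhash_1word() skips the generic byte loop. A hedged userspace sketch of the same dispatch, with toy hashes standing in for jhash_1word()/jhash():

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	static uint32_t hash_word(uint32_t w, uint32_t seed)	/* jhash_1word() stand-in */
	{
		return (w ^ seed) * 0x9e3779b1u;
	}

	static uint32_t hash_bytes(const void *p, size_t n, uint32_t seed) /* jhash() stand-in */
	{
		const unsigned char *b = p;
		uint32_t h = seed;

		while (n--)
			h = (h ^ *b++) * 16777619u;
		return h;
	}

	static uint32_t set_key_hash(const void *key, size_t klen, uint32_t seed)
	{
		if (klen == 4) {
			uint32_t w;

			memcpy(&w, key, sizeof(w));	/* the in-kernel key is 32-bit aligned */
			return hash_word(w, seed);
		}
		return hash_bytes(key, klen, seed);
	}

	int main(void)
	{
		uint32_t ip = 0x0a000001;	/* 10.0.0.1, host order for the demo */

		(void)set_key_hash(&ip, sizeof(ip), 0x12345678);
		return 0;
	}

The only correctness requirement is that every path (insert, lookup, walk) applies the same rule, since jhash_1word() and jhash() do not produce identical values for 4-byte input.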
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 6d77a5f0088a..18e1903b1d3d 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -2370,6 +2370,7 @@ const struct nft_set_type nft_set_pipapo_type = {
.gc_init = nft_pipapo_gc_init,
.commit = nft_pipapo_commit,
.abort = nft_pipapo_abort,
+ .abort_skip_removal = true,
.elemsize = offsetof(struct nft_pipapo_elem, ext),
},
};
@@ -2394,6 +2395,7 @@ const struct nft_set_type nft_set_pipapo_avx2_type = {
.gc_init = nft_pipapo_gc_init,
.commit = nft_pipapo_commit,
.abort = nft_pipapo_abort,
+ .abort_skip_removal = true,
.elemsize = offsetof(struct nft_pipapo_elem, ext),
},
};
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index ca594161b840..644d4b916705 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -10,21 +10,41 @@
#include <linux/module.h>
#include <linux/list.h>
#include <linux/rbtree.h>
+#include <linux/bsearch.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
+struct nft_array_interval {
+ struct nft_set_ext *from;
+ struct nft_set_ext *to;
+};
+
+struct nft_array {
+ u32 max_intervals;
+ u32 num_intervals;
+ struct nft_array_interval *intervals;
+ struct rcu_head rcu_head;
+};
+
struct nft_rbtree {
struct rb_root root;
rwlock_t lock;
- seqcount_rwlock_t count;
+ struct nft_array __rcu *array;
+ struct nft_array *array_next;
+ unsigned long start_rbe_cookie;
unsigned long last_gc;
+ struct list_head expired;
+ u64 last_tstamp;
};
struct nft_rbtree_elem {
struct nft_elem_priv priv;
- struct rb_node node;
+ union {
+ struct rb_node node;
+ struct list_head list;
+ };
struct nft_set_ext ext;
};
@@ -39,6 +59,13 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe)
return !nft_rbtree_interval_end(rbe);
}
+static bool nft_rbtree_interval_null(const struct nft_set *set,
+ const struct nft_rbtree_elem *rbe)
+{
+ return (!memchr_inv(nft_set_ext_key(&rbe->ext), 0, set->klen) &&
+ nft_rbtree_interval_end(rbe));
+}
+
static int nft_rbtree_cmp(const struct nft_set *set,
const struct nft_rbtree_elem *e1,
const struct nft_rbtree_elem *e2)
@@ -47,67 +74,33 @@ static int nft_rbtree_cmp(const struct nft_set *set,
set->klen);
}
-static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
-{
- return nft_set_elem_expired(&rbe->ext);
-}
+struct nft_array_lookup_ctx {
+ const u32 *key;
+ u32 klen;
+};
-static const struct nft_set_ext *
-__nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key, unsigned int seq)
+static int nft_array_lookup_cmp(const void *pkey, const void *entry)
{
- struct nft_rbtree *priv = nft_set_priv(set);
- const struct nft_rbtree_elem *rbe, *interval = NULL;
- u8 genmask = nft_genmask_cur(net);
- const struct rb_node *parent;
- int d;
+ const struct nft_array_interval *interval = entry;
+ const struct nft_array_lookup_ctx *ctx = pkey;
+ int a, b;
- parent = rcu_dereference_raw(priv->root.rb_node);
- while (parent != NULL) {
- if (read_seqcount_retry(&priv->count, seq))
- return NULL;
-
- rbe = rb_entry(parent, struct nft_rbtree_elem, node);
+ if (!interval->from)
+ return 1;
- d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
- if (d < 0) {
- parent = rcu_dereference_raw(parent->rb_left);
- if (interval &&
- !nft_rbtree_cmp(set, rbe, interval) &&
- nft_rbtree_interval_end(rbe) &&
- nft_rbtree_interval_start(interval))
- continue;
- if (nft_set_elem_active(&rbe->ext, genmask) &&
- !nft_rbtree_elem_expired(rbe))
- interval = rbe;
- } else if (d > 0)
- parent = rcu_dereference_raw(parent->rb_right);
- else {
- if (!nft_set_elem_active(&rbe->ext, genmask)) {
- parent = rcu_dereference_raw(parent->rb_left);
- continue;
- }
-
- if (nft_rbtree_elem_expired(rbe))
- return NULL;
-
- if (nft_rbtree_interval_end(rbe)) {
- if (nft_set_is_anonymous(set))
- return NULL;
- parent = rcu_dereference_raw(parent->rb_left);
- interval = NULL;
- continue;
- }
+ a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen);
+ if (!interval->to)
+ b = -1;
+ else
+ b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen);
- return &rbe->ext;
- }
- }
+ if (a >= 0 && b < 0)
+ return 0;
- if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
- nft_rbtree_interval_start(interval))
- return &interval->ext;
+ if (a < 0)
+ return -1;
- return NULL;
+ return 1;
}
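The three-way result above is exactly the contract bsearch() expects: negative when the key sorts below the interval, zero when it falls inside [from, to), positive otherwise. A self-contained model over u32 endpoints (the kernel compares raw keys with memcmp() and stores nft_set_ext pointers instead):

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct interval {
		uint32_t from;	/* inclusive */
		uint32_t to;	/* exclusive */
	};

	static int cmp(const void *pkey, const void *entry)
	{
		const struct interval *iv = entry;
		uint32_t key = *(const uint32_t *)pkey;

		if (key < iv->from)
			return -1;
		if (key >= iv->to)
			return 1;
		return 0;	/* from <= key < to */
	}

	int main(void)
	{
		/* sorted and non-overlapping, as the commit path guarantees */
		struct interval set[] = { { 10, 20 }, { 30, 40 }, { 50, 60 } };
		uint32_t key = 35;
		struct interval *iv;

		iv = bsearch(&key, set, 3, sizeof(*set), cmp);
		if (iv)
			printf("match [%u,%u)\n", iv->from, iv->to);
		else
			printf("no match\n");
		return 0;
	}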
INDIRECT_CALLABLE_SCOPE
@@ -116,83 +109,57 @@ nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
const u32 *key)
{
struct nft_rbtree *priv = nft_set_priv(set);
- unsigned int seq = read_seqcount_begin(&priv->count);
- const struct nft_set_ext *ext;
-
- ext = __nft_rbtree_lookup(net, set, key, seq);
- if (ext || !read_seqcount_retry(&priv->count, seq))
- return ext;
-
- read_lock_bh(&priv->lock);
- seq = read_seqcount_begin(&priv->count);
- ext = __nft_rbtree_lookup(net, set, key, seq);
- read_unlock_bh(&priv->lock);
-
- return ext;
+ struct nft_array *array = rcu_dereference(priv->array);
+ const struct nft_array_interval *interval;
+ struct nft_array_lookup_ctx ctx = {
+ .key = key,
+ .klen = set->klen,
+ };
+
+ if (!array)
+ return NULL;
+
+ interval = bsearch(&ctx, array->intervals, array->num_intervals,
+ sizeof(struct nft_array_interval),
+ nft_array_lookup_cmp);
+ if (!interval || nft_set_elem_expired(interval->from))
+ return NULL;
+
+ return interval->from;
}
-static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set,
- const u32 *key, struct nft_rbtree_elem **elem,
- unsigned int seq, unsigned int flags, u8 genmask)
-{
- struct nft_rbtree_elem *rbe, *interval = NULL;
- struct nft_rbtree *priv = nft_set_priv(set);
- const struct rb_node *parent;
- const void *this;
- int d;
-
- parent = rcu_dereference_raw(priv->root.rb_node);
- while (parent != NULL) {
- if (read_seqcount_retry(&priv->count, seq))
- return false;
-
- rbe = rb_entry(parent, struct nft_rbtree_elem, node);
-
- this = nft_set_ext_key(&rbe->ext);
- d = memcmp(this, key, set->klen);
- if (d < 0) {
- parent = rcu_dereference_raw(parent->rb_left);
- if (!(flags & NFT_SET_ELEM_INTERVAL_END))
- interval = rbe;
- } else if (d > 0) {
- parent = rcu_dereference_raw(parent->rb_right);
- if (flags & NFT_SET_ELEM_INTERVAL_END)
- interval = rbe;
- } else {
- if (!nft_set_elem_active(&rbe->ext, genmask)) {
- parent = rcu_dereference_raw(parent->rb_left);
- continue;
- }
+struct nft_array_get_ctx {
+ const u32 *key;
+ unsigned int flags;
+ u32 klen;
+};
- if (nft_set_elem_expired(&rbe->ext))
- return false;
+static int nft_array_get_cmp(const void *pkey, const void *entry)
+{
+ const struct nft_array_interval *interval = entry;
+ const struct nft_array_get_ctx *ctx = pkey;
+ int a, b;
- if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) ||
- (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) ==
- (flags & NFT_SET_ELEM_INTERVAL_END)) {
- *elem = rbe;
- return true;
- }
+ if (!interval->from)
+ return 1;
- if (nft_rbtree_interval_end(rbe))
- interval = NULL;
+ a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen);
+ if (!interval->to)
+ b = -1;
+ else
+ b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen);
- parent = rcu_dereference_raw(parent->rb_left);
- }
+ if (a >= 0) {
+ if (ctx->flags & NFT_SET_ELEM_INTERVAL_END && b <= 0)
+ return 0;
+ else if (b < 0)
+ return 0;
}
- if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
- nft_set_elem_active(&interval->ext, genmask) &&
- !nft_set_elem_expired(&interval->ext) &&
- ((!nft_rbtree_interval_end(interval) &&
- !(flags & NFT_SET_ELEM_INTERVAL_END)) ||
- (nft_rbtree_interval_end(interval) &&
- (flags & NFT_SET_ELEM_INTERVAL_END)))) {
- *elem = interval;
- return true;
- }
+ if (a < 0)
+ return -1;
- return false;
+ return 1;
}
static struct nft_elem_priv *
@@ -200,34 +167,41 @@ nft_rbtree_get(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem, unsigned int flags)
{
struct nft_rbtree *priv = nft_set_priv(set);
- unsigned int seq = read_seqcount_begin(&priv->count);
- struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT);
- const u32 *key = (const u32 *)&elem->key.val;
- u8 genmask = nft_genmask_cur(net);
- bool ret;
-
- ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
- if (ret || !read_seqcount_retry(&priv->count, seq))
- return &rbe->priv;
-
- read_lock_bh(&priv->lock);
- seq = read_seqcount_begin(&priv->count);
- ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask);
- read_unlock_bh(&priv->lock);
-
- if (!ret)
+ struct nft_array *array = rcu_dereference(priv->array);
+ const struct nft_array_interval *interval;
+ struct nft_array_get_ctx ctx = {
+ .key = (const u32 *)&elem->key.val,
+ .flags = flags,
+ .klen = set->klen,
+ };
+ struct nft_rbtree_elem *rbe;
+
+ if (!array)
return ERR_PTR(-ENOENT);
+ interval = bsearch(&ctx, array->intervals, array->num_intervals,
+ sizeof(struct nft_array_interval), nft_array_get_cmp);
+ if (!interval || nft_set_elem_expired(interval->from))
+ return ERR_PTR(-ENOENT);
+
+ if (flags & NFT_SET_ELEM_INTERVAL_END)
+ rbe = container_of(interval->to, struct nft_rbtree_elem, ext);
+ else
+ rbe = container_of(interval->from, struct nft_rbtree_elem, ext);
+
return &rbe->priv;
}
-static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
- struct nft_rbtree *priv,
- struct nft_rbtree_elem *rbe)
+static void nft_rbtree_gc_elem_move(struct net *net, struct nft_set *set,
+ struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
{
lockdep_assert_held_write(&priv->lock);
nft_setelem_data_deactivate(net, set, &rbe->priv);
rb_erase(&rbe->node, &priv->root);
+
+ /* collected later, in the commit callback */
+ list_add(&rbe->list, &priv->expired);
}
static const struct nft_rbtree_elem *
@@ -238,11 +212,6 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
struct rb_node *prev = rb_prev(&rbe->node);
struct net *net = read_pnet(&set->net);
struct nft_rbtree_elem *rbe_prev;
- struct nft_trans_gc *gc;
-
- gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
- if (!gc)
- return ERR_PTR(-ENOMEM);
/* search for end interval coming before this element.
* end intervals don't carry a timeout extension, they
@@ -260,28 +229,10 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv,
rbe_prev = NULL;
if (prev) {
rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
- nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev);
-
- /* There is always room in this trans gc for this element,
- * memory allocation never actually happens, hence, the warning
- * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT,
- * this is synchronous gc which never fails.
- */
- gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
- if (WARN_ON_ONCE(!gc))
- return ERR_PTR(-ENOMEM);
-
- nft_trans_gc_elem_add(gc, rbe_prev);
+ nft_rbtree_gc_elem_move(net, set, priv, rbe_prev);
}
- nft_rbtree_gc_elem_remove(net, set, priv, rbe);
- gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
- if (WARN_ON_ONCE(!gc))
- return ERR_PTR(-ENOMEM);
-
- nft_trans_gc_elem_add(gc, rbe);
-
- nft_trans_gc_queue_sync_done(gc);
+ nft_rbtree_gc_elem_move(net, set, priv, rbe);
return rbe_prev;
}
@@ -302,16 +253,107 @@ static bool nft_rbtree_update_first(const struct nft_set *set,
return false;
}
+/* Only for anonymous sets, which do not allow updates: all elements are active. */
+static struct nft_rbtree_elem *nft_rbtree_prev_active(struct nft_rbtree_elem *rbe)
+{
+ struct rb_node *node;
+
+ node = rb_prev(&rbe->node);
+ if (!node)
+ return NULL;
+
+ return rb_entry(node, struct nft_rbtree_elem, node);
+}
+
+static struct nft_rbtree_elem *
+__nft_rbtree_next_active(struct rb_node *node, u8 genmask)
+{
+ struct nft_rbtree_elem *next_rbe;
+
+ while (node) {
+ next_rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ if (!nft_set_elem_active(&next_rbe->ext, genmask)) {
+ node = rb_next(node);
+ continue;
+ }
+
+ return next_rbe;
+ }
+
+ return NULL;
+}
+
+static struct nft_rbtree_elem *
+nft_rbtree_next_active(struct nft_rbtree_elem *rbe, u8 genmask)
+{
+ return __nft_rbtree_next_active(rb_next(&rbe->node), genmask);
+}
+
+static void nft_rbtree_maybe_reset_start_cookie(struct nft_rbtree *priv,
+ u64 tstamp)
+{
+ if (priv->last_tstamp != tstamp) {
+ priv->start_rbe_cookie = 0;
+ priv->last_tstamp = tstamp;
+ }
+}
+
+static void nft_rbtree_set_start_cookie(struct nft_rbtree *priv,
+ const struct nft_rbtree_elem *rbe)
+{
+ priv->start_rbe_cookie = (unsigned long)rbe;
+}
+
+static void nft_rbtree_set_start_cookie_open(struct nft_rbtree *priv,
+ const struct nft_rbtree_elem *rbe,
+ unsigned long open_interval)
+{
+ priv->start_rbe_cookie = (unsigned long)rbe | open_interval;
+}
+
+#define NFT_RBTREE_OPEN_INTERVAL 1UL
+
+static bool nft_rbtree_cmp_start_cookie(struct nft_rbtree *priv,
+ const struct nft_rbtree_elem *rbe)
+{
+ return (priv->start_rbe_cookie & ~NFT_RBTREE_OPEN_INTERVAL) == (unsigned long)rbe;
+}
+
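The cookie stores a flag in bit 0 of the element pointer; that bit is guaranteed free because nft_rbtree_elem is at least word-aligned. A generic sketch of the tagging trick (illustrative names, not the kernel helpers):

	#include <assert.h>
	#include <stdint.h>

	#define OPEN_INTERVAL_BIT 1UL

	static unsigned long tag(const void *p, unsigned long flag)
	{
		assert(((uintptr_t)p & OPEN_INTERVAL_BIT) == 0);	/* pointer is aligned */
		return (unsigned long)(uintptr_t)p | flag;
	}

	static void *untag(unsigned long cookie)
	{
		return (void *)(uintptr_t)(cookie & ~OPEN_INTERVAL_BIT);
	}

	static int is_open(unsigned long cookie)
	{
		return cookie & OPEN_INTERVAL_BIT;
	}

	int main(void)
	{
		int x;
		unsigned long c = tag(&x, OPEN_INTERVAL_BIT);

		assert(untag(c) == &x && is_open(c));
		return 0;
	}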
+static bool nft_rbtree_insert_same_interval(const struct net *net,
+ struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
+{
+ u8 genmask = nft_genmask_next(net);
+ struct nft_rbtree_elem *next_rbe;
+
+ if (!priv->start_rbe_cookie)
+ return true;
+
+ next_rbe = nft_rbtree_next_active(rbe, genmask);
+ if (next_rbe) {
+ /* Closest start element differs from last element added. */
+ if (nft_rbtree_interval_start(next_rbe) &&
+ nft_rbtree_cmp_start_cookie(priv, next_rbe)) {
+ priv->start_rbe_cookie = 0;
+ return true;
+ }
+ }
+
+ priv->start_rbe_cookie = 0;
+
+ return false;
+}
+
static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
struct nft_rbtree_elem *new,
- struct nft_elem_priv **elem_priv)
+ struct nft_elem_priv **elem_priv, u64 tstamp, bool last)
{
- struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL;
+ struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL, *rbe_prev;
struct rb_node *node, *next, *parent, **p, *first = NULL;
struct nft_rbtree *priv = nft_set_priv(set);
u8 cur_genmask = nft_genmask_cur(net);
u8 genmask = nft_genmask_next(net);
- u64 tstamp = nft_net_tstamp(net);
+ unsigned long open_interval = 0;
int d;
/* Descend the tree to search for an existing element greater than the
@@ -417,12 +459,46 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
}
}
+ if (nft_rbtree_interval_null(set, new)) {
+ priv->start_rbe_cookie = 0;
+ } else if (nft_rbtree_interval_start(new) && priv->start_rbe_cookie) {
+ if (nft_set_is_anonymous(set)) {
+ priv->start_rbe_cookie = 0;
+ } else if (priv->start_rbe_cookie & NFT_RBTREE_OPEN_INTERVAL) {
+ /* Previous element is an open interval that partially
+ * overlaps with an existing non-open interval.
+ */
+ return -ENOTEMPTY;
+ }
+ }
+
/* - new start element matching existing start element: full overlap
* reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given.
*/
if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) &&
nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) {
*elem_priv = &rbe_ge->priv;
+
+ /* - Corner case: a new start element of an open interval (which
+ * comes as the last element in the batch) overlaps the start of
+ * an existing interval that has an end element: partial overlap.
+ */
+ node = rb_first(&priv->root);
+ rbe = __nft_rbtree_next_active(node, genmask);
+ if (rbe && nft_rbtree_interval_end(rbe)) {
+ rbe = nft_rbtree_next_active(rbe, genmask);
+ if (rbe &&
+ nft_rbtree_interval_start(rbe) &&
+ !nft_rbtree_cmp(set, new, rbe)) {
+ if (last)
+ return -ENOTEMPTY;
+
+ /* Maybe open interval? */
+ open_interval = NFT_RBTREE_OPEN_INTERVAL;
+ }
+ }
+ nft_rbtree_set_start_cookie_open(priv, rbe_ge, open_interval);
+
return -EEXIST;
}
@@ -431,18 +507,37 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
*/
if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) &&
nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) {
+ /* - ignore null interval, otherwise NLM_F_CREATE bogusly
+ * reports EEXIST.
+ */
+ if (nft_rbtree_interval_null(set, new))
+ return -ECANCELED;
+
*elem_priv = &rbe_le->priv;
+
+ /* - start and end element belong to the same interval. */
+ if (!nft_rbtree_insert_same_interval(net, priv, rbe_le))
+ return -ENOTEMPTY;
+
return -EEXIST;
}
/* - new start element with existing closest, less or equal key value
* being a start element: partial overlap, reported as -ENOTEMPTY.
* Anonymous sets allow for two consecutive start element since they
- * are constant, skip them to avoid bogus overlap reports.
+ * are constant, but validate that this new start element does not
+ * sit between an existing start and end element: partial overlap,
+ * reported as -ENOTEMPTY.
*/
- if (!nft_set_is_anonymous(set) && rbe_le &&
- nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new))
- return -ENOTEMPTY;
+ if (rbe_le &&
+ nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) {
+ if (!nft_set_is_anonymous(set))
+ return -ENOTEMPTY;
+
+ rbe_prev = nft_rbtree_prev_active(rbe_le);
+ if (rbe_prev && nft_rbtree_interval_end(rbe_prev))
+ return -ENOTEMPTY;
+ }
/* - new end element with existing closest, less or equal key value
* being a end element: partial overlap, reported as -ENOTEMPTY.
@@ -458,6 +553,12 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new))
return -ENOTEMPTY;
+ /* - start element overlaps an open interval but end element is new:
+ * partial overlap, reported as -ENOTEMPTY.
+ */
+ if (!rbe_ge && priv->start_rbe_cookie && nft_rbtree_interval_end(new))
+ return -ENOTEMPTY;
+
/* Accepted element: pick insertion point depending on key value */
parent = NULL;
p = &priv->root.rb_node;
@@ -481,14 +582,102 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set,
return 0;
}
+static int nft_array_intervals_alloc(struct nft_array *array, u32 max_intervals)
+{
+ struct nft_array_interval *intervals;
+
+ intervals = kvcalloc(max_intervals, sizeof(struct nft_array_interval),
+ GFP_KERNEL_ACCOUNT);
+ if (!intervals)
+ return -ENOMEM;
+
+ if (array->intervals)
+ kvfree(array->intervals);
+
+ array->intervals = intervals;
+ array->max_intervals = max_intervals;
+
+ return 0;
+}
+
+static struct nft_array *nft_array_alloc(u32 max_intervals)
+{
+ struct nft_array *array;
+
+ array = kzalloc(sizeof(*array), GFP_KERNEL_ACCOUNT);
+ if (!array)
+ return NULL;
+
+ if (nft_array_intervals_alloc(array, max_intervals) < 0) {
+ kfree(array);
+ return NULL;
+ }
+
+ return array;
+}
+
+#define NFT_ARRAY_EXTRA_SIZE 10240
+
+/* Similar to nft_rbtree_{u,k}size, hiding details from userspace, but also
+ * account for the packed representation userspace sends for anonymous sets.
+ */
+static u32 nft_array_elems(const struct nft_set *set)
+{
+ u32 nelems = atomic_read(&set->nelems);
+
+ /* Adjacent intervals are represented with a single start element in
+ * anonymous sets, use the current element counter as is.
+ */
+ if (nft_set_is_anonymous(set))
+ return nelems;
+
+ /* Add extra room for a never-matching interval at the beginning and an
+ * open interval at the end, each of which is represented by a single
+ * element. The conversion to an array compacts intervals, which reduces
+ * memory consumption.
+ */
+ return (nelems / 2) + 2;
+}
+
+static int nft_array_may_resize(const struct nft_set *set)
+{
+ u32 nelems = nft_array_elems(set), new_max_intervals;
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_array *array;
+
+ if (!priv->array_next) {
+ array = nft_array_alloc(nelems + NFT_ARRAY_EXTRA_SIZE);
+ if (!array)
+ return -ENOMEM;
+
+ priv->array_next = array;
+ }
+
+ if (nelems < priv->array_next->max_intervals)
+ return 0;
+
+ new_max_intervals = priv->array_next->max_intervals + NFT_ARRAY_EXTRA_SIZE;
+ if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0)
+ return -ENOMEM;
+
+ return 0;
+}
+
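Growing by a fixed chunk (NFT_ARRAY_EXTRA_SIZE slots) amortizes reallocation cost across transactions; note that the kernel version frees the old block rather than copying it, because commit rebuilds the whole array from the rbtree anyway. A rough userspace analog of the resize policy (calloc() standing in for kvcalloc(), names illustrative):

	#include <stddef.h>
	#include <stdlib.h>

	#define EXTRA 10240

	struct arr {
		size_t max;
		size_t num;
		void **slots;
	};

	/* Ensure room for `need` entries, growing by a fixed chunk so
	 * that repeated single insertions do not reallocate every time. */
	static int arr_reserve(struct arr *a, size_t need)
	{
		void **slots;
		size_t max;

		if (need < a->max)
			return 0;

		max = a->max + EXTRA;
		slots = calloc(max, sizeof(*slots));
		if (!slots)
			return -1;

		free(a->slots);		/* no copy: contents get rebuilt anyway */
		a->slots = slots;
		a->max = max;
		return 0;
	}

	int main(void)
	{
		struct arr a = { 0 };

		return arr_reserve(&a, 1);	/* first call sizes the array to EXTRA */
	}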
static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem,
struct nft_elem_priv **elem_priv)
{
struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv);
+ bool last = !!(elem->flags & NFT_SET_ELEM_INTERNAL_LAST);
struct nft_rbtree *priv = nft_set_priv(set);
+ u64 tstamp = nft_net_tstamp(net);
int err;
+ nft_rbtree_maybe_reset_start_cookie(priv, tstamp);
+
+ if (nft_array_may_resize(set) < 0)
+ return -ENOMEM;
+
do {
if (fatal_signal_pending(current))
return -EINTR;
@@ -496,10 +685,12 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
cond_resched();
write_lock_bh(&priv->lock);
- write_seqcount_begin(&priv->count);
- err = __nft_rbtree_insert(net, set, rbe, elem_priv);
- write_seqcount_end(&priv->count);
+ err = __nft_rbtree_insert(net, set, rbe, elem_priv, tstamp, last);
write_unlock_bh(&priv->lock);
+
+ if (nft_rbtree_interval_end(rbe))
+ priv->start_rbe_cookie = 0;
+
} while (err == -EAGAIN);
return err;
@@ -508,9 +699,7 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
static void nft_rbtree_erase(struct nft_rbtree *priv, struct nft_rbtree_elem *rbe)
{
write_lock_bh(&priv->lock);
- write_seqcount_begin(&priv->count);
rb_erase(&rbe->node, &priv->root);
- write_seqcount_end(&priv->count);
write_unlock_bh(&priv->lock);
}
@@ -533,6 +722,48 @@ static void nft_rbtree_activate(const struct net *net,
nft_clear(net, &rbe->ext);
}
+static struct nft_rbtree_elem *
+nft_rbtree_next_inactive(struct nft_rbtree_elem *rbe, u8 genmask)
+{
+ struct nft_rbtree_elem *next_rbe;
+ struct rb_node *node;
+
+ node = rb_next(&rbe->node);
+ if (node) {
+ next_rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ if (nft_rbtree_interval_start(next_rbe) &&
+ !nft_set_elem_active(&next_rbe->ext, genmask))
+ return next_rbe;
+ }
+
+ return NULL;
+}
+
+static bool nft_rbtree_deactivate_same_interval(const struct net *net,
+ struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
+{
+ u8 genmask = nft_genmask_next(net);
+ struct nft_rbtree_elem *next_rbe;
+
+ if (!priv->start_rbe_cookie)
+ return true;
+
+ next_rbe = nft_rbtree_next_inactive(rbe, genmask);
+ if (next_rbe) {
+ /* Closest start element differs from last element added. */
+ if (nft_rbtree_interval_start(next_rbe) &&
+ nft_rbtree_cmp_start_cookie(priv, next_rbe)) {
+ priv->start_rbe_cookie = 0;
+ return true;
+ }
+ }
+
+ priv->start_rbe_cookie = 0;
+
+ return false;
+}
+
static void nft_rbtree_flush(const struct net *net,
const struct nft_set *set,
struct nft_elem_priv *elem_priv)
@@ -547,12 +778,22 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem)
{
struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv);
- const struct nft_rbtree *priv = nft_set_priv(set);
+ bool last = !!(elem->flags & NFT_SET_ELEM_INTERNAL_LAST);
+ struct nft_rbtree *priv = nft_set_priv(set);
const struct rb_node *parent = priv->root.rb_node;
u8 genmask = nft_genmask_next(net);
u64 tstamp = nft_net_tstamp(net);
int d;
+ nft_rbtree_maybe_reset_start_cookie(priv, tstamp);
+
+ if (nft_rbtree_interval_start(this) ||
+ nft_rbtree_interval_null(set, this))
+ priv->start_rbe_cookie = 0;
+
+ if (nft_array_may_resize(set) < 0)
+ return NULL;
+
while (parent != NULL) {
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
@@ -577,6 +818,13 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
parent = parent->rb_left;
continue;
}
+
+ if (nft_rbtree_interval_start(rbe)) {
+ if (!last)
+ nft_rbtree_set_start_cookie(priv, rbe);
+ } else if (!nft_rbtree_deactivate_same_interval(net, priv, rbe))
+ return NULL;
+
nft_rbtree_flush(net, set, &rbe->priv);
return &rbe->priv;
}
@@ -615,6 +863,11 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
switch (iter->type) {
case NFT_ITER_UPDATE:
lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+
+ if (nft_array_may_resize(set) < 0) {
+ iter->err = -ENOMEM;
+ break;
+ }
nft_rbtree_do_walk(ctx, set, iter);
break;
case NFT_ITER_READ:
@@ -629,29 +882,13 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
}
}
-static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
- struct nft_rbtree *priv,
- struct nft_rbtree_elem *rbe)
-{
- nft_setelem_data_deactivate(net, set, &rbe->priv);
- nft_rbtree_erase(priv, rbe);
-}
-
-static void nft_rbtree_gc(struct nft_set *set)
+static void nft_rbtree_gc_scan(struct nft_set *set)
{
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe, *rbe_end = NULL;
struct net *net = read_pnet(&set->net);
u64 tstamp = nft_net_tstamp(net);
struct rb_node *node, *next;
- struct nft_trans_gc *gc;
-
- set = nft_set_container_of(priv);
- net = read_pnet(&set->net);
-
- gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
- if (!gc)
- return;
for (node = rb_first(&priv->root); node ; node = next) {
next = rb_next(node);
@@ -669,34 +906,46 @@ static void nft_rbtree_gc(struct nft_set *set)
if (!__nft_set_elem_expired(&rbe->ext, tstamp))
continue;
- gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
- if (!gc)
- goto try_later;
-
/* end element needs to be removed first, it has
* no timeout extension.
*/
+ write_lock_bh(&priv->lock);
if (rbe_end) {
- nft_rbtree_gc_remove(net, set, priv, rbe_end);
- nft_trans_gc_elem_add(gc, rbe_end);
+ nft_rbtree_gc_elem_move(net, set, priv, rbe_end);
rbe_end = NULL;
}
- gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
- if (!gc)
- goto try_later;
-
- nft_rbtree_gc_remove(net, set, priv, rbe);
- nft_trans_gc_elem_add(gc, rbe);
+ nft_rbtree_gc_elem_move(net, set, priv, rbe);
+ write_unlock_bh(&priv->lock);
}
-try_later:
+ priv->last_gc = jiffies;
+}
+
+static void nft_rbtree_gc_queue(struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe, *rbe_end;
+ struct nft_trans_gc *gc;
- if (gc) {
- gc = nft_trans_gc_catchall_sync(gc);
- nft_trans_gc_queue_sync_done(gc);
- priv->last_gc = jiffies;
+ if (list_empty(&priv->expired))
+ return;
+
+ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+ if (!gc)
+ return;
+
+ list_for_each_entry_safe(rbe, rbe_end, &priv->expired, list) {
+ list_del(&rbe->list);
+ nft_trans_gc_elem_add(gc, rbe);
+
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+ if (!gc)
+ return;
}
+
+ gc = nft_trans_gc_catchall_sync(gc);
+ nft_trans_gc_queue_sync_done(gc);
}
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
@@ -714,24 +963,45 @@ static int nft_rbtree_init(const struct nft_set *set,
BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0);
rwlock_init(&priv->lock);
- seqcount_rwlock_init(&priv->count, &priv->lock);
priv->root = RB_ROOT;
+ INIT_LIST_HEAD(&priv->expired);
+
+ priv->array = NULL;
+ priv->array_next = NULL;
return 0;
}
+static void __nft_array_free(struct nft_array *array)
+{
+ kvfree(array->intervals);
+ kfree(array);
+}
+
static void nft_rbtree_destroy(const struct nft_ctx *ctx,
const struct nft_set *set)
{
struct nft_rbtree *priv = nft_set_priv(set);
- struct nft_rbtree_elem *rbe;
+ struct nft_rbtree_elem *rbe, *next;
+ struct nft_array *array;
struct rb_node *node;
+ list_for_each_entry_safe(rbe, next, &priv->expired, list) {
+ list_del(&rbe->list);
+ nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
+ }
+
while ((node = priv->root.rb_node) != NULL) {
rb_erase(node, &priv->root);
rbe = rb_entry(node, struct nft_rbtree_elem, node);
nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
}
+
+ array = rcu_dereference_protected(priv->array, true);
+ if (array)
+ __nft_array_free(array);
+ if (priv->array_next)
+ __nft_array_free(priv->array_next);
}
static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
@@ -752,12 +1022,105 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features,
return true;
}
+static void nft_array_free_rcu(struct rcu_head *rcu_head)
+{
+ struct nft_array *array = container_of(rcu_head, struct nft_array, rcu_head);
+
+ __nft_array_free(array);
+}
+
static void nft_rbtree_commit(struct nft_set *set)
{
struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe, *prev_rbe;
+ struct nft_array *old;
+ u32 num_intervals = 0;
+ struct rb_node *node;
+ /* No changes, skip, eg. elements updates only. */
+ if (!priv->array_next)
+ return;
+
+ /* GC can be performed when the binary search blob is about
+ * to be rebuilt. It has to be done in two phases: first,
+ * scan the tree and move all expired elements to the expired
+ * list.
+ *
+ * Then, after blob has been re-built and published to other
+ * CPUs, queue collected entries for freeing.
+ */
if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
- nft_rbtree_gc(set);
+ nft_rbtree_gc_scan(set);
+
+ /* Reverse walk to create an array ordered from smallest to largest interval. */
+ node = rb_last(&priv->root);
+ if (node)
+ prev_rbe = rb_entry(node, struct nft_rbtree_elem, node);
+ else
+ prev_rbe = NULL;
+
+ while (prev_rbe) {
+ rbe = prev_rbe;
+
+ if (nft_rbtree_interval_start(rbe))
+ priv->array_next->intervals[num_intervals].from = &rbe->ext;
+ else if (nft_rbtree_interval_end(rbe))
+ priv->array_next->intervals[num_intervals++].to = &rbe->ext;
+
+ if (num_intervals >= priv->array_next->max_intervals) {
+ pr_warn_once("malformed interval set from userspace?\n");
+ goto err_out;
+ }
+
+ node = rb_prev(node);
+ if (!node)
+ break;
+
+ prev_rbe = rb_entry(node, struct nft_rbtree_elem, node);
+
+ /* For anonymous sets, when adjacent ranges are found,
+ * the end element is not added to the set to pack the set
+ * representation. Use next start element to complete this
+ * interval.
+ */
+ if (nft_rbtree_interval_start(rbe) &&
+ nft_rbtree_interval_start(prev_rbe) &&
+ priv->array_next->intervals[num_intervals].from)
+ priv->array_next->intervals[num_intervals++].to = &prev_rbe->ext;
+
+ if (num_intervals >= priv->array_next->max_intervals) {
+ pr_warn_once("malformed interval set from userspace?\n");
+ goto err_out;
+ }
+ }
+
+ if (priv->array_next->intervals[num_intervals].from)
+ num_intervals++;
+err_out:
+ priv->array_next->num_intervals = num_intervals;
+ old = rcu_replace_pointer(priv->array, priv->array_next,
+ lockdep_is_held(&nft_pernet(read_pnet(&set->net))->commit_mutex));
+ priv->array_next = NULL;
+ if (old)
+ call_rcu(&old->rcu_head, nft_array_free_rcu);
+
+ /* New blob is public, queue collected entries for freeing.
+ * call_rcu ensures elements stay around until readers are done.
+ */
+ nft_rbtree_gc_queue(set);
+}
+
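Condensed, the commit path above is the textbook RCU publish-and-defer-free sequence. A hedged restatement of its shape in kernel style (not the literal code):

	static void publish(struct nft_rbtree *priv, struct nft_array *next)
	{
		struct nft_array *old;

		/* swap in the new blob; readers may still walk the old one */
		old = rcu_replace_pointer(priv->array, next, 1);
		if (old)
			call_rcu(&old->rcu_head, nft_array_free_rcu);

		/* only after the swap is it safe to queue the GC'd elements
		 * for freeing: no new reader can reach them any more, and
		 * call_rcu() keeps them alive for existing readers */
	}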
+static void nft_rbtree_abort(const struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_array *array_next;
+
+ if (!priv->array_next)
+ return;
+
+ array_next = priv->array_next;
+ priv->array_next = NULL;
+ __nft_array_free(array_next);
}
static void nft_rbtree_gc_init(const struct nft_set *set)
@@ -821,6 +1184,7 @@ const struct nft_set_type nft_set_rbtree_type = {
.flush = nft_rbtree_flush,
.activate = nft_rbtree_activate,
.commit = nft_rbtree_commit,
+ .abort = nft_rbtree_abort,
.gc_init = nft_rbtree_gc_init,
.lookup = nft_rbtree_lookup,
.walk = nft_rbtree_walk,
diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
index 4d3e5a31b412..b71ef18b0e8c 100644
--- a/net/netfilter/nft_synproxy.c
+++ b/net/netfilter/nft_synproxy.c
@@ -7,6 +7,7 @@
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_synproxy.h>
#include <net/netfilter/nf_synproxy.h>
+#include <linux/netfilter_ipv4.h>
#include <linux/netfilter/nf_tables.h>
#include <linux/netfilter/nf_synproxy.h>
diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c
index 37704ab01799..0d32d4841cb3 100644
--- a/net/netfilter/xt_tcpmss.c
+++ b/net/netfilter/xt_tcpmss.c
@@ -61,7 +61,7 @@ tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par)
return (mssval >= info->mss_min &&
mssval <= info->mss_max) ^ info->invert;
}
- if (op[i] < 2)
+ if (op[i] < 2 || i == optlen - 1)
i++;
else
i += op[i+1] ? : 1;
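The extra `i == optlen - 1` test closes an out-of-bounds read: a trailing option byte with kind >= 2 used to make the walker fetch its length from op[i+1], one byte past the option area. A self-contained sketch of the corrected walk (the GNU `?:` shorthand is spelled out as a portable ternary here):

	#include <stdint.h>
	#include <stdio.h>

	/* Walk TCP options safely: kinds 0 (EOL) and 1 (NOP) are one byte,
	 * everything else carries a length byte at op[i+1]. */
	static int find_option(const uint8_t *op, int optlen, uint8_t kind)
	{
		int i;

		for (i = 0; i < optlen; ) {
			if (op[i] == kind)
				return i;
			if (op[i] < 2 || i == optlen - 1)	/* 1-byte kind, or no room for a length byte */
				i++;
			else
				i += op[i + 1] ? op[i + 1] : 1;	/* guard against zero length */
		}
		return -1;
	}

	int main(void)
	{
		/* trailing kind 2 (MSS) with its length byte truncated away */
		uint8_t opts[] = { 1, 1, 2 };

		printf("%d\n", find_option(opts, sizeof(opts), 8));
		return 0;
	}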
diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c
index 6aa12d0f54e2..00319d2a54da 100644
--- a/net/netfilter/xt_time.c
+++ b/net/netfilter/xt_time.c
@@ -14,6 +14,7 @@
#include <linux/ktime.h>
#include <linux/module.h>
+#include <linux/rtc.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/netfilter/x_tables.h>
@@ -64,11 +65,6 @@ static const u_int16_t days_since_epoch[] = {
3287, 2922, 2557, 2191, 1826, 1461, 1096, 730, 365, 0,
};
-static inline bool is_leap(unsigned int y)
-{
- return y % 4 == 0 && (y % 100 != 0 || y % 400 == 0);
-}
-
/*
* Each network packet has a (nano)seconds-since-the-epoch (SSTE) timestamp.
* Since we match against days and daytime, the SSTE value needs to be
@@ -138,7 +134,7 @@ static void localtime_3(struct xtm *r, time64_t time)
* (A different approach to use would be to subtract a monthlength
* from w repeatedly while counting.)
*/
- if (is_leap(year)) {
+ if (is_leap_year(year)) {
/* use days_since_leapyear[] in a leap year */
for (i = ARRAY_SIZE(days_since_leapyear) - 1;
i > 0 && days_since_leapyear[i] > w; --i)
diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c
index 4fc37894860c..08c8aa1530d8 100644
--- a/net/nfc/hci/llc_shdlc.c
+++ b/net/nfc/hci/llc_shdlc.c
@@ -762,6 +762,14 @@ static void llc_shdlc_deinit(struct nfc_llc *llc)
{
struct llc_shdlc *shdlc = nfc_llc_get_data(llc);
+ timer_shutdown_sync(&shdlc->connect_timer);
+ timer_shutdown_sync(&shdlc->t1_timer);
+ timer_shutdown_sync(&shdlc->t2_timer);
+ shdlc->t1_active = false;
+ shdlc->t2_active = false;
+
+ cancel_work_sync(&shdlc->sm_work);
+
skb_queue_purge(&shdlc->rcv_q);
skb_queue_purge(&shdlc->send_q);
skb_queue_purge(&shdlc->ack_pending_q);
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 494d628d10a5..a1005359085a 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -572,8 +572,9 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
__be16 proto = skb->protocol;
if (unlikely(eth_type_vlan(proto)))
- proto = __vlan_get_protocol_offset(skb, proto,
- skb_mac_offset(skb), NULL);
+ proto = vlan_get_protocol_offset_inline(skb, proto,
+ skb_mac_offset(skb),
+ NULL);
return proto;
}
diff --git a/net/rds/cong.c b/net/rds/cong.c
index 8b689ebbd5b5..ac1f120c10f9 100644
--- a/net/rds/cong.c
+++ b/net/rds/cong.c
@@ -242,7 +242,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map)
* therefore trigger warnings.
* Defer the xmit to rds_send_worker() instead.
*/
- queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0);
}
rcu_read_unlock();
}
diff --git a/net/rds/connection.c b/net/rds/connection.c
index 68bc88cce84e..185f73b01694 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -169,6 +169,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
struct rds_connection *conn, *parent = NULL;
struct hlist_head *head = rds_conn_bucket(laddr, faddr);
struct rds_transport *loop_trans;
+ struct rds_conn_path *free_cp = NULL;
unsigned long flags;
int ret, i;
int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1);
@@ -269,6 +270,11 @@ static struct rds_connection *__rds_conn_create(struct net *net,
__rds_conn_path_init(conn, &conn->c_path[i],
is_outgoing);
conn->c_path[i].cp_index = i;
+ conn->c_path[i].cp_wq =
+ alloc_ordered_workqueue("krds_cp_wq#%lu/%d", 0,
+ rds_conn_count, i);
+ if (!conn->c_path[i].cp_wq)
+ conn->c_path[i].cp_wq = rds_wq;
}
rcu_read_lock();
if (rds_destroy_pending(conn))
@@ -277,7 +283,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
ret = trans->conn_alloc(conn, GFP_ATOMIC);
if (ret) {
rcu_read_unlock();
- kfree(conn->c_path);
+ free_cp = conn->c_path;
kmem_cache_free(rds_conn_slab, conn);
conn = ERR_PTR(ret);
goto out;
@@ -300,7 +306,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
/* Creating passive conn */
if (parent->c_passive) {
trans->conn_free(conn->c_path[0].cp_transport_data);
- kfree(conn->c_path);
+ free_cp = conn->c_path;
kmem_cache_free(rds_conn_slab, conn);
conn = parent->c_passive;
} else {
@@ -327,7 +333,7 @@ static struct rds_connection *__rds_conn_create(struct net *net,
if (cp->cp_transport_data)
trans->conn_free(cp->cp_transport_data);
}
- kfree(conn->c_path);
+ free_cp = conn->c_path;
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
@@ -342,6 +348,13 @@ static struct rds_connection *__rds_conn_create(struct net *net,
rcu_read_unlock();
out:
+ if (free_cp) {
+ for (i = 0; i < npaths; i++)
+ if (free_cp[i].cp_wq != rds_wq)
+ destroy_workqueue(free_cp[i].cp_wq);
+ kfree(free_cp);
+ }
+
return conn;
}
@@ -382,6 +395,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp)
if (!rds_conn_path_transition(cp, RDS_CONN_UP,
RDS_CONN_DISCONNECTING) &&
!rds_conn_path_transition(cp, RDS_CONN_ERROR,
+ RDS_CONN_DISCONNECTING) &&
+ !rds_conn_path_transition(cp, RDS_CONN_RESETTING,
RDS_CONN_DISCONNECTING)) {
rds_conn_path_error(cp,
"shutdown called in state %d\n",
@@ -427,13 +442,21 @@ void rds_conn_shutdown(struct rds_conn_path *cp)
* to the conn hash, so we never trigger a reconnect on this
* conn - the reconnect is always triggered by the active peer. */
cancel_delayed_work_sync(&cp->cp_conn_w);
+
+ clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags);
rcu_read_lock();
if (!hlist_unhashed(&conn->c_hash_node)) {
rcu_read_unlock();
+ if (conn->c_trans->t_mp_capable &&
+ cp->cp_index == 0)
+ rds_send_ping(conn, 0);
rds_queue_reconnect(cp);
} else {
rcu_read_unlock();
}
+
+ if (conn->c_trans->conn_slots_available)
+ conn->c_trans->conn_slots_available(conn, false);
}
/* destroy a single rds_conn_path. rds_conn_destroy() iterates over
@@ -469,6 +492,11 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp)
WARN_ON(delayed_work_pending(&cp->cp_conn_w));
WARN_ON(work_pending(&cp->cp_down_w));
+ if (cp->cp_wq != rds_wq) {
+ destroy_workqueue(cp->cp_wq);
+ cp->cp_wq = NULL;
+ }
+
cp->cp_conn->c_trans->conn_free(cp->cp_transport_data);
}
@@ -884,7 +912,7 @@ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy)
rcu_read_unlock();
return;
}
- queue_work(rds_wq, &cp->cp_down_w);
+ queue_work(cp->cp_wq, &cp->cp_down_w);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rds_conn_path_drop);
@@ -909,7 +937,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp)
}
if (rds_conn_path_state(cp) == RDS_CONN_DOWN &&
!test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags))
- queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_conn_w, 0);
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down);
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 4248dfa816eb..357128d34a54 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -457,7 +457,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
(must_wake ||
(can_wait && rds_ib_ring_low(&ic->i_recv_ring)) ||
rds_ib_ring_empty(&ic->i_recv_ring))) {
- queue_delayed_work(rds_wq, &conn->c_recv_w, 1);
+ queue_delayed_work(conn->c_path->cp_wq, &conn->c_recv_w, 1);
}
if (can_wait)
cond_resched();
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 4190b90ff3b1..fcd04c29f543 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -297,7 +297,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
test_bit(0, &conn->c_map_queued))
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ queue_delayed_work(conn->c_path->cp_wq, &conn->c_send_w, 0);
/* We expect errors as the qp is drained during shutdown */
if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) {
@@ -419,7 +419,7 @@ void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits)
atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
+ queue_delayed_work(conn->c_path->cp_wq, &conn->c_send_w, 0);
WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
@@ -577,16 +577,42 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
/* If it has a RDMA op, tell the peer we did it. This is
* used by the peer to release use-once RDMA MRs. */
if (rm->rdma.op_active) {
- struct rds_ext_header_rdma ext_hdr;
+ struct rds_ext_header_rdma ext_hdr = {};
+ struct rds_ext_header_rdma_bytes
+ rdma_bytes_ext_hdr = {};
ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
- rds_message_add_extension(&rm->m_inc.i_hdr,
- RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
+ if (rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_RDMA,
+ &ext_hdr)) {
+ /* prepare the rdma bytes ext header */
+ rdma_bytes_ext_hdr.h_rflags =
+ rm->rdma.op_write ?
+ RDS_FLAG_RDMA_WR_BYTES :
+ RDS_FLAG_RDMA_RD_BYTES;
+ rdma_bytes_ext_hdr.h_rdma_bytes =
+ cpu_to_be32(rm->rdma.op_bytes);
+ } else {
+ rdsdebug("RDS_EXTHDR_RDMA dropped\n");
+ }
+
+ if (rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_RDMA_BYTES,
+ &rdma_bytes_ext_hdr)) {
+ /* rdma bytes ext header was added successfully,
+ * notify the remote side via flag in header
+ */
+ rm->m_inc.i_hdr.h_flags |=
+ RDS_FLAG_EXTHDR_EXTENSION;
+ } else {
+ rdsdebug("RDS_EXTHDR_RDMA_BYTES dropped\n");
+ }
}
- if (rm->m_rdma_cookie) {
- rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
- rds_rdma_cookie_key(rm->m_rdma_cookie),
- rds_rdma_cookie_offset(rm->m_rdma_cookie));
+ if (rm->m_rdma_cookie &&
+ !rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
+ rds_rdma_cookie_key(rm->m_rdma_cookie),
+ rds_rdma_cookie_offset(rm->m_rdma_cookie))) {
+ rdsdebug("RDS_EXTHDR_RDMA_DEST dropped\n");
}
/* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so
diff --git a/net/rds/message.c b/net/rds/message.c
index 199a899a43e9..54fd000806ea 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -44,8 +44,10 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
+[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes),
[RDS_EXTHDR_NPATHS] = sizeof(__be16),
[RDS_EXTHDR_GEN_NUM] = sizeof(__be32),
+[RDS_EXTHDR_SPORT_IDX] = 1,
};
void rds_message_addref(struct rds_message *rm)
@@ -191,31 +193,69 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
hdr->h_sport = sport;
hdr->h_dport = dport;
hdr->h_sequence = cpu_to_be64(seq);
- hdr->h_exthdr[0] = RDS_EXTHDR_NONE;
+ /* see rds_find_next_ext_space for the reason why we memset
+ * the whole ext header space
+ */
+ memset(hdr->h_exthdr, RDS_EXTHDR_NONE, RDS_HEADER_EXT_SPACE);
}
EXPORT_SYMBOL_GPL(rds_message_populate_header);
-int rds_message_add_extension(struct rds_header *hdr, unsigned int type,
- const void *data, unsigned int len)
+/*
+ * Find the next place we can add an RDS header extension with
+ * specific length. Extension headers are pushed one after the
+ * other. In the following, the number after the colon is the number
+ * of bytes:
+ *
+ * [ type1:1 dta1:len1 [ type2:1 dta2:len2 ] ... ] RDS_EXTHDR_NONE
+ *
+ * If the extension headers fill the complete extension header space
+ * (16 bytes), the trailing RDS_EXTHDR_NONE is omitted.
+ */
+static int rds_find_next_ext_space(struct rds_header *hdr, unsigned int len,
+ u8 **ext_start)
{
- unsigned int ext_len = sizeof(u8) + len;
- unsigned char *dst;
+ unsigned int ext_len;
+ unsigned int type;
+ int ind = 0;
+
+ while ((ind + 1 + len) <= RDS_HEADER_EXT_SPACE) {
+ if (hdr->h_exthdr[ind] == RDS_EXTHDR_NONE) {
+ *ext_start = hdr->h_exthdr + ind;
+ return 0;
+ }
- /* For now, refuse to add more than one extension header */
- if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE)
- return 0;
+ type = hdr->h_exthdr[ind];
+
+ ext_len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0;
+ WARN_ONCE(!ext_len, "Unknown ext hdr type %d\n", type);
+ if (!ext_len)
+ return -EINVAL;
+
+ /* ind points to a valid ext hdr with known length */
+ ind += 1 + ext_len;
+ }
+
+ /* no room for extension */
+ return -ENOSPC;
+}
+
+/* The ext hdr space is prefilled with zero from the kzalloc() */
+int rds_message_add_extension(struct rds_header *hdr,
+ unsigned int type, const void *data)
+{
+ unsigned char *dst;
+ unsigned int len;
- if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type])
+ len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0;
+ if (!len)
return 0;
- if (ext_len >= RDS_HEADER_EXT_SPACE)
+ if (rds_find_next_ext_space(hdr, len, &dst))
return 0;
- dst = hdr->h_exthdr;
*dst++ = type;
memcpy(dst, data, len);
- dst[len] = RDS_EXTHDR_NONE;
return 1;
}
EXPORT_SYMBOL_GPL(rds_message_add_extension);
@@ -272,7 +312,7 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o
ext_hdr.h_rdma_rkey = cpu_to_be32(r_key);
ext_hdr.h_rdma_offset = cpu_to_be32(offset);
- return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr));
+ return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr);
}
EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension);
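
The reworked rds_message_add_extension() packs extensions back-to-back and derives each length from rds_exthdr_size[], so the walk in rds_find_next_ext_space() is easy to model in isolation. A minimal user-space sketch of that packing, assuming RDS_HEADER_EXT_SPACE is 16 and type values mirroring rds.h (RDS_EXTHDR_RDMA = 2, RDS_EXTHDR_RDMA_BYTES = 4); everything else is illustrative:

#include <stdio.h>
#include <string.h>

#define EXT_SPACE 16            /* RDS_HEADER_EXT_SPACE */
#define EXTHDR_NONE 0
#define EXTHDR_RDMA 2           /* 4-byte rkey */
#define EXTHDR_RDMA_BYTES 4     /* 8-byte struct rds_ext_header_rdma_bytes */
#define EXTHDR_MAX 16

static const unsigned int exthdr_size[EXTHDR_MAX] = {
    [EXTHDR_RDMA] = 4,
    [EXTHDR_RDMA_BYTES] = 8,
};

/* step over known extensions until a free slot with room for len bytes */
static int find_next_ext_space(unsigned char *space, unsigned int len,
                               unsigned char **start)
{
    unsigned int ind = 0;

    while (ind + 1 + len <= EXT_SPACE) {
        unsigned int type = space[ind];

        if (type == EXTHDR_NONE) {
            *start = space + ind;
            return 0;
        }
        if (type >= EXTHDR_MAX || !exthdr_size[type])
            return -1;          /* unknown type: cannot walk past it */
        ind += 1 + exthdr_size[type];
    }
    return -1;                  /* no room left */
}

static int add_extension(unsigned char *space, unsigned int type,
                         const void *data)
{
    unsigned int len = (type < EXTHDR_MAX) ? exthdr_size[type] : 0;
    unsigned char *dst;

    if (!len || find_next_ext_space(space, len, &dst))
        return 0;
    *dst++ = type;
    memcpy(dst, data, len);
    return 1;
}

int main(void)
{
    unsigned char space[EXT_SPACE], rkey[4] = { 0 }, bytes[8] = { 0 };

    /* rds_message_populate_header() memsets the space to NONE first */
    memset(space, EXTHDR_NONE, sizeof(space));
    printf("RDMA added: %d\n", add_extension(space, EXTHDR_RDMA, rkey));
    printf("RDMA_BYTES added: %d\n",
           add_extension(space, EXTHDR_RDMA_BYTES, bytes));
    /* a second 8-byte extension no longer fits: 5 + 9 + 9 > 16 */
    printf("RDMA_BYTES again: %d\n",
           add_extension(space, EXTHDR_RDMA_BYTES, bytes));
    return 0;
}

With the two RDMA extensions in place (5 + 9 = 14 bytes used), the remaining 2 bytes still fit a 1-byte extension such as RDS_EXTHDR_SPORT_IDX: the multi-extension packing this patch needs, and which the old single-extension logic refused.
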
diff --git a/net/rds/rds.h b/net/rds/rds.h
index a029e5fcdea7..6e0790e4b570 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -118,6 +118,7 @@ struct rds_conn_path {
void *cp_transport_data;
+ struct workqueue_struct *cp_wq;
atomic_t cp_state;
unsigned long cp_send_gen;
unsigned long cp_flags;
@@ -146,6 +147,7 @@ struct rds_connection {
c_ping_triggered:1,
c_pad_to_32:29;
int c_npaths;
+ bool c_with_sport_idx;
struct rds_connection *c_passive;
struct rds_transport *c_trans;
@@ -168,6 +170,8 @@ struct rds_connection {
u32 c_my_gen_num;
u32 c_peer_gen_num;
+
+ u64 c_cp0_mprds_catchup_tx_seq;
};
static inline
@@ -182,10 +186,11 @@ void rds_conn_net_set(struct rds_connection *conn, struct net *net)
write_pnet(&conn->c_net, net);
}
-#define RDS_FLAG_CONG_BITMAP 0x01
-#define RDS_FLAG_ACK_REQUIRED 0x02
-#define RDS_FLAG_RETRANSMITTED 0x04
-#define RDS_MAX_ADV_CREDIT 255
+#define RDS_FLAG_CONG_BITMAP 0x01
+#define RDS_FLAG_ACK_REQUIRED 0x02
+#define RDS_FLAG_RETRANSMITTED 0x04
+#define RDS_FLAG_EXTHDR_EXTENSION 0x20
+#define RDS_MAX_ADV_CREDIT 255
/* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping
* probe to exchange control information before establishing a connection.
@@ -257,13 +262,29 @@ struct rds_ext_header_rdma_dest {
__be32 h_rdma_offset;
};
+/*
+ * This extension header tells the peer about delivered RDMA byte count.
+ */
+#define RDS_EXTHDR_RDMA_BYTES 4
+
+struct rds_ext_header_rdma_bytes {
+ __be32 h_rdma_bytes; /* byte count */
+ u8 h_rflags; /* direction of RDMA, write or read */
+ u8 h_pad[3];
+};
+
+#define RDS_FLAG_RDMA_WR_BYTES 0x01
+#define RDS_FLAG_RDMA_RD_BYTES 0x02
+
/* Extension header announcing number of paths.
* Implicit length = 2 bytes.
*/
#define RDS_EXTHDR_NPATHS 5
#define RDS_EXTHDR_GEN_NUM 6
+#define RDS_EXTHDR_SPORT_IDX 8
#define __RDS_EXTHDR_MAX 16 /* for now */
+
#define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1)
#define RDS_MSG_RX_HDR 0
#define RDS_MSG_RX_START 1
@@ -505,33 +526,6 @@ struct rds_notifier {
*/
#define RDS_TRANS_LOOP 3
-/**
- * struct rds_transport - transport specific behavioural hooks
- *
- * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
- * part of a message. The caller serializes on the send_sem so this
- * doesn't need to be reentrant for a given conn. The header must be
- * sent before the data payload. .xmit must be prepared to send a
- * message with no data payload. .xmit should return the number of
- * bytes that were sent down the connection, including header bytes.
- * Returning 0 tells the caller that it doesn't need to perform any
- * additional work now. This is usually the case when the transport has
- * filled the sending queue for its connection and will handle
- * triggering the rds thread to continue the send when space becomes
- * available. Returning -EAGAIN tells the caller to retry the send
- * immediately. Returning -ENOMEM tells the caller to retry the send at
- * some point in the future.
- *
- * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once
- * it returns the connection can not call rds_recv_incoming().
- * This will only be called once after conn_connect returns
- * non-zero success and will The caller serializes this with
- * the send and connecting paths (xmit_* and conn_*). The
- * transport is responsible for other serialization, including
- * rds_recv_incoming(). This is called in process context but
- * should try hard not to block.
- */
-
struct rds_transport {
char t_name[TRANSNAMSIZ];
struct list_head t_item;
@@ -544,10 +538,49 @@ struct rds_transport {
__u32 scope_id);
int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
void (*conn_free)(void *data);
+
+ /*
+ * conn_slots_available is invoked when a previously unavailable
+ * connection slot becomes available again. rds_tcp_accept_one may
+ * return -ENOBUFS if it cannot find an available slot, in which case
+ * it stashes the new socket in "rds_tcp_accepted_sock". This hook
+ * re-issues `rds_tcp_accept_one`, which picks up the stashed socket
+ * and continues where it left off after the earlier -ENOBUFS. This
+ * ensures messages received on the new socket are not discarded when
+ * no connection path was available at the time.
+ */
+ void (*conn_slots_available)(struct rds_connection *conn, bool fan_out);
int (*conn_path_connect)(struct rds_conn_path *cp);
+
+ /*
+ * conn_shutdown stops traffic on the given connection. Once
+ * it returns, the connection can not call rds_recv_incoming().
+ * This will only be called once after conn_connect returns
+ * non-zero success. The caller serializes this with
+ * the send and connecting paths (xmit_* and conn_*). The
+ * transport is responsible for other serialization, including
+ * rds_recv_incoming(). This is called in process context but
+ * should try hard not to block.
+ */
void (*conn_path_shutdown)(struct rds_conn_path *conn);
void (*xmit_path_prepare)(struct rds_conn_path *cp);
void (*xmit_path_complete)(struct rds_conn_path *cp);
+
+ /*
+ * .xmit is called by rds_send_xmit() to tell the transport to send
+ * part of a message. The caller serializes on the send_sem so this
+ * doesn't need to be reentrant for a given conn. The header must be
+ * sent before the data payload. .xmit must be prepared to send a
+ * message with no data payload. .xmit should return the number of
+ * bytes that were sent down the connection, including header bytes.
+ * Returning 0 tells the caller that it doesn't need to perform any
+ * additional work now. This is usually the case when the transport has
+ * filled the sending queue for its connection and will handle
+ * triggering the rds thread to continue the send when space becomes
+ * available. Returning -EAGAIN tells the caller to retry the send
+ * immediately. Returning -ENOMEM tells the caller to retry the send at
+ * some point in the future.
+ */
int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op);
@@ -682,42 +715,43 @@ static inline int rds_sk_rcvbuf(struct rds_sock *rs)
}
struct rds_statistics {
- uint64_t s_conn_reset;
- uint64_t s_recv_drop_bad_checksum;
- uint64_t s_recv_drop_old_seq;
- uint64_t s_recv_drop_no_sock;
- uint64_t s_recv_drop_dead_sock;
- uint64_t s_recv_deliver_raced;
- uint64_t s_recv_delivered;
- uint64_t s_recv_queued;
- uint64_t s_recv_immediate_retry;
- uint64_t s_recv_delayed_retry;
- uint64_t s_recv_ack_required;
- uint64_t s_recv_rdma_bytes;
- uint64_t s_recv_ping;
- uint64_t s_send_queue_empty;
- uint64_t s_send_queue_full;
- uint64_t s_send_lock_contention;
- uint64_t s_send_lock_queue_raced;
- uint64_t s_send_immediate_retry;
- uint64_t s_send_delayed_retry;
- uint64_t s_send_drop_acked;
- uint64_t s_send_ack_required;
- uint64_t s_send_queued;
- uint64_t s_send_rdma;
- uint64_t s_send_rdma_bytes;
- uint64_t s_send_pong;
- uint64_t s_page_remainder_hit;
- uint64_t s_page_remainder_miss;
- uint64_t s_copy_to_user;
- uint64_t s_copy_from_user;
- uint64_t s_cong_update_queued;
- uint64_t s_cong_update_received;
- uint64_t s_cong_send_error;
- uint64_t s_cong_send_blocked;
- uint64_t s_recv_bytes_added_to_socket;
- uint64_t s_recv_bytes_removed_from_socket;
- uint64_t s_send_stuck_rm;
+ u64 s_conn_reset;
+ u64 s_recv_drop_bad_checksum;
+ u64 s_recv_drop_old_seq;
+ u64 s_recv_drop_no_sock;
+ u64 s_recv_drop_dead_sock;
+ u64 s_recv_deliver_raced;
+ u64 s_recv_delivered;
+ u64 s_recv_queued;
+ u64 s_recv_immediate_retry;
+ u64 s_recv_delayed_retry;
+ u64 s_recv_ack_required;
+ u64 s_recv_rdma_bytes;
+ u64 s_recv_ping;
+ u64 s_send_queue_empty;
+ u64 s_send_queue_full;
+ u64 s_send_lock_contention;
+ u64 s_send_lock_queue_raced;
+ u64 s_send_immediate_retry;
+ u64 s_send_delayed_retry;
+ u64 s_send_drop_acked;
+ u64 s_send_ack_required;
+ u64 s_send_queued;
+ u64 s_send_rdma;
+ u64 s_send_rdma_bytes;
+ u64 s_send_pong;
+ u64 s_page_remainder_hit;
+ u64 s_page_remainder_miss;
+ u64 s_copy_to_user;
+ u64 s_copy_from_user;
+ u64 s_cong_update_queued;
+ u64 s_cong_update_received;
+ u64 s_cong_send_error;
+ u64 s_cong_send_blocked;
+ u64 s_recv_bytes_added_to_socket;
+ u64 s_recv_bytes_removed_from_socket;
+ u64 s_send_stuck_rm;
+ u64 s_mprds_catchup_tx0_retries;
};
/* af_rds.c */
@@ -858,7 +892,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in
void rds_message_populate_header(struct rds_header *hdr, __be16 sport,
__be16 dport, u64 seq);
int rds_message_add_extension(struct rds_header *hdr,
- unsigned int type, const void *data, unsigned int len);
+ unsigned int type, const void *data);
int rds_message_next_extension(struct rds_header *hdr,
unsigned int *pos, void *buf, unsigned int *buflen);
int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset);
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 66205d6924bf..4b3f9e4a8bfd 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -204,8 +204,14 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
struct rds_ext_header_version version;
__be16 rds_npaths;
__be32 rds_gen_num;
+ u8 dummy;
} buffer;
+ bool new_with_sport_idx = false;
u32 new_peer_gen_num = 0;
+ int new_npaths;
+ bool fan_out;
+
+ new_npaths = conn->c_npaths;
while (1) {
len = sizeof(buffer);
@@ -215,21 +221,48 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
/* Process extension header here */
switch (type) {
case RDS_EXTHDR_NPATHS:
- conn->c_npaths = min_t(int, RDS_MPATH_WORKERS,
- be16_to_cpu(buffer.rds_npaths));
+ new_npaths = min_t(int, RDS_MPATH_WORKERS,
+ be16_to_cpu(buffer.rds_npaths));
break;
case RDS_EXTHDR_GEN_NUM:
new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num);
break;
+ case RDS_EXTHDR_SPORT_IDX:
+ new_with_sport_idx = true;
+ break;
default:
pr_warn_ratelimited("ignoring unknown exthdr type "
"0x%x\n", type);
}
}
+
+ conn->c_with_sport_idx = new_with_sport_idx;
+
+ if (new_npaths > 1 && new_npaths != conn->c_npaths) {
+ /* We're about to fan-out.
+ * Make sure that messages from cp_index#0
+ * are sent prior to handling other lanes.
+ */
+ struct rds_conn_path *cp0 = conn->c_path;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cp0->cp_lock, flags);
+ conn->c_cp0_mprds_catchup_tx_seq = cp0->cp_next_tx_seq;
+ spin_unlock_irqrestore(&cp0->cp_lock, flags);
+ fan_out = true;
+ } else {
+ fan_out = false;
+ }
+
/* if RDS_EXTHDR_NPATHS was not found, default to a single-path */
- conn->c_npaths = max_t(int, conn->c_npaths, 1);
+ conn->c_npaths = max_t(int, new_npaths, 1);
+
conn->c_ping_triggered = 0;
rds_conn_peer_gen_update(conn, new_peer_gen_num);
+
+ if (conn->c_npaths > 1 &&
+ conn->c_trans->conn_slots_available)
+ conn->c_trans->conn_slots_available(conn, fan_out);
}
/* rds_start_mprds() will synchronously start multiple paths when appropriate.
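
rds_recv_hs_exthdrs() above is the consumer of the packed extension space. A stand-alone decode walk, again with type values and sizes mirroring rds.h (NPATHS = 5 as a __be16, GEN_NUM = 6 as a __be32, SPORT_IDX = 8 as one dummy byte whose mere presence is the signal); the buffer contents are made up:

#include <stdint.h>
#include <stdio.h>

#define EXT_SPACE 16
#define EXTHDR_NONE 0
#define EXTHDR_NPATHS 5
#define EXTHDR_GEN_NUM 6
#define EXTHDR_SPORT_IDX 8

int main(void)
{
    /* as emitted by rds_send_probe(): npaths = 8, gen = 7, sport-idx */
    uint8_t space[EXT_SPACE] = {
        EXTHDR_NPATHS, 0x00, 0x08,
        EXTHDR_GEN_NUM, 0x00, 0x00, 0x00, 0x07,
        EXTHDR_SPORT_IDX, 0x00,
        /* the rest stays EXTHDR_NONE */
    };
    int ind = 0, with_sport_idx = 0, npaths = 1;
    uint32_t gen = 0;

    while (ind < EXT_SPACE && space[ind] != EXTHDR_NONE) {
        switch (space[ind]) {
        case EXTHDR_NPATHS:     /* big-endian u16 */
            npaths = (space[ind + 1] << 8) | space[ind + 2];
            ind += 1 + 2;
            break;
        case EXTHDR_GEN_NUM:    /* big-endian u32 */
            gen = ((uint32_t)space[ind + 1] << 24) |
                  ((uint32_t)space[ind + 2] << 16) |
                  ((uint32_t)space[ind + 3] << 8) |
                  space[ind + 4];
            ind += 1 + 4;
            break;
        case EXTHDR_SPORT_IDX:  /* presence alone is the capability bit */
            with_sport_idx = 1;
            ind += 1 + 1;
            break;
        default:                /* unknown type: stop walking */
            ind = EXT_SPACE;
        }
    }
    printf("npaths=%d gen=%u sport_idx=%d\n", npaths, gen, with_sport_idx);
    return 0;
}
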
diff --git a/net/rds/send.c b/net/rds/send.c
index 0b3d0ef2f008..6e96f108473e 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -120,6 +120,57 @@ static void release_in_xmit(struct rds_conn_path *cp)
}
/*
+ * Helper function for multipath fanout to ensure lane 0 transmits queued
+ * messages before other lanes to prevent out-of-order delivery.
+ *
+ * Returns true if lane 0 still has pending messages, false otherwise.
+ */
+static bool rds_mprds_cp0_catchup(struct rds_connection *conn)
+{
+ struct rds_conn_path *cp0 = conn->c_path;
+ struct rds_message *rm0;
+ unsigned long flags;
+ bool ret = false;
+
+ spin_lock_irqsave(&cp0->cp_lock, flags);
+
+ /* the oldest / first message in the retransmit queue
+ * has to be at or beyond c_cp0_mprds_catchup_tx_seq
+ */
+ if (!list_empty(&cp0->cp_retrans)) {
+ rm0 = list_entry(cp0->cp_retrans.next, struct rds_message,
+ m_conn_item);
+ if (be64_to_cpu(rm0->m_inc.i_hdr.h_sequence) <
+ conn->c_cp0_mprds_catchup_tx_seq) {
+ /* the retransmit queue of cp_index#0 has not
+ * quite caught up yet
+ */
+ ret = true;
+ goto unlock;
+ }
+ }
+
+ /* the oldest / first message of the send queue
+ * has to be at or beyond c_cp0_mprds_catchup_tx_seq
+ */
+ rm0 = cp0->cp_xmit_rm;
+ if (!rm0 && !list_empty(&cp0->cp_send_queue))
+ rm0 = list_entry(cp0->cp_send_queue.next, struct rds_message,
+ m_conn_item);
+ if (rm0 && be64_to_cpu(rm0->m_inc.i_hdr.h_sequence) <
+ conn->c_cp0_mprds_catchup_tx_seq) {
+ /* the send queue of cp_index#0 has not quite
+ * caught up yet
+ */
+ ret = true;
+ }
+
+unlock:
+ spin_unlock_irqrestore(&cp0->cp_lock, flags);
+ return ret;
+}
+
+/*
* We're making the conscious trade-off here to only send one message
* down the connection at a time.
* Pro:
@@ -248,6 +299,14 @@ restart:
if (batch_count >= send_batch_count)
goto over_batch;
+ /* make sure cp_index#0 has caught up during fan-out in
+ * order to avoid lane races
+ */
+ if (cp->cp_index > 0 && rds_mprds_cp0_catchup(conn)) {
+ rds_stats_inc(s_mprds_catchup_tx0_retries);
+ goto over_batch;
+ }
+
spin_lock_irqsave(&cp->cp_lock, flags);
if (!list_empty(&cp->cp_send_queue)) {
@@ -458,7 +517,8 @@ over_batch:
if (rds_destroy_pending(cp->cp_conn))
ret = -ENETUNREACH;
else
- queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+ queue_delayed_work(cp->cp_wq,
+ &cp->cp_send_w, 1);
rcu_read_unlock();
} else if (raced) {
rds_stats_inc(s_send_lock_queue_raced);
@@ -1041,39 +1101,6 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm,
return ret;
}
-static int rds_send_mprds_hash(struct rds_sock *rs,
- struct rds_connection *conn, int nonblock)
-{
- int hash;
-
- if (conn->c_npaths == 0)
- hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS);
- else
- hash = RDS_MPATH_HASH(rs, conn->c_npaths);
- if (conn->c_npaths == 0 && hash != 0) {
- rds_send_ping(conn, 0);
-
- /* The underlying connection is not up yet. Need to wait
- * until it is up to be sure that the non-zero c_path can be
- * used. But if we are interrupted, we have to use the zero
- * c_path in case the connection ends up being non-MP capable.
- */
- if (conn->c_npaths == 0) {
- /* Cannot wait for the connection be made, so just use
- * the base c_path.
- */
- if (nonblock)
- return 0;
- if (wait_event_interruptible(conn->c_hs_waitq,
- conn->c_npaths != 0))
- hash = 0;
- }
- if (conn->c_npaths == 1)
- hash = 0;
- }
- return hash;
-}
-
static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes)
{
struct rds_rdma_args *args;
@@ -1303,10 +1330,32 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
rs->rs_conn = conn;
}
- if (conn->c_trans->t_mp_capable)
- cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)];
- else
+ if (conn->c_trans->t_mp_capable) {
+ /* Use c_path[0] until we learn that
+ * the peer supports more (c_npaths > 1)
+ */
+ cpath = &conn->c_path[RDS_MPATH_HASH(rs, conn->c_npaths ? : 1)];
+ } else {
cpath = &conn->c_path[0];
+ }
+
+ /* If we're multipath capable and path 0 is down, queue reconnect
+ * and send a ping. This initiates the multipath handshake through
+ * rds_send_probe(), which sends RDS_EXTHDR_NPATHS to the peer,
+ * starting multipath capability negotiation.
+ */
+ if (conn->c_trans->t_mp_capable &&
+ !rds_conn_path_up(&conn->c_path[0])) {
+ /* Ensures that only one request is queued. And
+ * rds_send_ping() ensures that only one ping is
+ * outstanding.
+ */
+ if (!test_and_set_bit(RDS_RECONNECT_PENDING,
+ &conn->c_path[0].cp_flags))
+ queue_delayed_work(conn->c_path[0].cp_wq,
+ &conn->c_path[0].cp_conn_w, 0);
+ rds_send_ping(conn, 0);
+ }
rm->m_conn_path = cpath;
@@ -1380,7 +1429,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
if (rds_destroy_pending(cpath->cp_conn))
ret = -ENETUNREACH;
else
- queue_delayed_work(rds_wq, &cpath->cp_send_w, 1);
+ queue_delayed_work(cpath->cp_wq, &cpath->cp_send_w, 1);
rcu_read_unlock();
}
if (ret)
@@ -1456,24 +1505,26 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
cp->cp_conn->c_trans->t_mp_capable) {
__be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
__be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
+ u8 dummy = 0;
rds_message_add_extension(&rm->m_inc.i_hdr,
- RDS_EXTHDR_NPATHS, &npaths,
- sizeof(npaths));
+ RDS_EXTHDR_NPATHS, &npaths);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_GEN_NUM,
- &my_gen_num,
- sizeof(u32));
+ &my_gen_num);
+ rds_message_add_extension(&rm->m_inc.i_hdr,
+ RDS_EXTHDR_SPORT_IDX,
+ &dummy);
}
spin_unlock_irqrestore(&cp->cp_lock, flags);
rds_stats_inc(s_send_queued);
rds_stats_inc(s_send_pong);
- /* schedule the send work on rds_wq */
+ /* schedule the send work on cp_wq */
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_send_w, 1);
+ queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 1);
rcu_read_unlock();
rds_message_put(rm);
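
The ordering rule built from c_cp0_mprds_catchup_tx_seq is the heart of the fan-out change: the snapshot taken in rds_recv_hs_exthdrs() becomes the bar that rds_mprds_cp0_catchup() holds the other lanes behind. A user-space model of that rule, with lane 0's retransmit and send queues collapsed into one "oldest pending sequence" (0 meaning empty) and all locking elided:

#include <stdbool.h>
#include <stdio.h>

struct conn {
    unsigned long long cp0_next_tx_seq;    /* lane 0 sequence counter */
    unsigned long long cp0_oldest_pending; /* 0 == both queues empty */
    unsigned long long catchup_bar;
    int npaths;
};

/* handshake: only the 1 -> N transition snapshots the bar */
static void fan_out(struct conn *c, int new_npaths)
{
    if (new_npaths > 1 && new_npaths != c->npaths)
        c->catchup_bar = c->cp0_next_tx_seq;
    c->npaths = new_npaths > 1 ? new_npaths : 1;
}

/* mirrors rds_mprds_cp0_catchup(): true while lane 0 trails the bar,
 * which makes lanes > 0 bail out and bump s_mprds_catchup_tx0_retries
 */
static bool cp0_behind(const struct conn *c)
{
    return c->cp0_oldest_pending &&
           c->cp0_oldest_pending < c->catchup_bar;
}

int main(void)
{
    struct conn c = { .cp0_next_tx_seq = 42, .cp0_oldest_pending = 40,
                      .npaths = 1 };

    fan_out(&c, 8);
    printf("lanes > 0 stall: %d\n", cp0_behind(&c)); /* 1: seq 40 < 42 */
    c.cp0_oldest_pending = 0;                        /* lane 0 drained */
    printf("lanes > 0 stall: %d\n", cp0_behind(&c)); /* 0: free to send */
    return 0;
}
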
diff --git a/net/rds/stats.c b/net/rds/stats.c
index cb2e3d2cdf73..24ee22d09e8c 100644
--- a/net/rds/stats.c
+++ b/net/rds/stats.c
@@ -79,6 +79,7 @@ static const char *const rds_stat_names[] = {
"recv_bytes_added_to_sock",
"recv_bytes_freed_fromsock",
"send_stuck_rm",
+ "mprds_catchup_tx0_retries",
};
void rds_stats_info_copy(struct rds_info_iterator *iter,
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 3cc2f303bf78..45484a93d75f 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -213,6 +213,8 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp)
sock->sk->sk_data_ready = sock->sk->sk_user_data;
tc->t_sock = sock;
+ if (!tc->t_rtn)
+ tc->t_rtn = net_generic(sock_net(sock->sk), rds_tcp_netid);
tc->t_cpath = cp;
tc->t_orig_data_ready = sock->sk->sk_data_ready;
tc->t_orig_write_space = sock->sk->sk_write_space;
@@ -378,9 +380,11 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp)
}
mutex_init(&tc->t_conn_path_lock);
tc->t_sock = NULL;
+ tc->t_rtn = NULL;
tc->t_tinc = NULL;
tc->t_tinc_hdr_rem = sizeof(struct rds_header);
tc->t_tinc_data_rem = 0;
+ init_waitqueue_head(&tc->t_recv_done_waitq);
conn->c_path[i].cp_transport_data = tc;
tc->t_cpath = &conn->c_path[i];
@@ -458,6 +462,7 @@ struct rds_transport rds_tcp_transport = {
.recv_path = rds_tcp_recv_path,
.conn_alloc = rds_tcp_conn_alloc,
.conn_free = rds_tcp_conn_free,
+ .conn_slots_available = rds_tcp_conn_slots_available,
.conn_path_connect = rds_tcp_conn_path_connect,
.conn_path_shutdown = rds_tcp_conn_path_shutdown,
.inc_copy_to_user = rds_tcp_inc_copy_to_user,
@@ -473,17 +478,7 @@ struct rds_transport rds_tcp_transport = {
.t_unloading = rds_tcp_is_unloading,
};
-static unsigned int rds_tcp_netid;
-
-/* per-network namespace private data for this module */
-struct rds_tcp_net {
- struct socket *rds_tcp_listen_sock;
- struct work_struct rds_tcp_accept_w;
- struct ctl_table_header *rds_tcp_sysctl;
- struct ctl_table *ctl_table;
- int sndbuf_size;
- int rcvbuf_size;
-};
+int rds_tcp_netid;
/* All module specific customizations to the RDS-TCP socket should be done in
* rds_tcp_tune() and applied after socket creation.
@@ -526,15 +521,12 @@ static void rds_tcp_accept_worker(struct work_struct *work)
struct rds_tcp_net,
rds_tcp_accept_w);
- while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0)
+ while (rds_tcp_accept_one(rtn) == 0)
cond_resched();
}
-void rds_tcp_accept_work(struct sock *sk)
+void rds_tcp_accept_work(struct rds_tcp_net *rtn)
{
- struct net *net = sock_net(sk);
- struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
-
queue_work(rds_wq, &rtn->rds_tcp_accept_w);
}
@@ -546,6 +538,8 @@ static __net_init int rds_tcp_init_net(struct net *net)
memset(rtn, 0, sizeof(*rtn));
+ mutex_init(&rtn->rds_tcp_accept_lock);
+
/* {snd, rcv}buf_size default to 0, which implies we let the
* stack pick the value, and permit auto-tuning of buffer size.
*/
@@ -609,6 +603,8 @@ static void rds_tcp_kill_sock(struct net *net)
rtn->rds_tcp_listen_sock = NULL;
rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w);
+ if (rtn->rds_tcp_accepted_sock)
+ sock_release(rtn->rds_tcp_accepted_sock);
spin_lock_irq(&rds_tcp_conn_lock);
list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net);
diff --git a/net/rds/tcp.h b/net/rds/tcp.h
index 053aa7da87ef..39c86347188c 100644
--- a/net/rds/tcp.h
+++ b/net/rds/tcp.h
@@ -4,6 +4,21 @@
#define RDS_TCP_PORT 16385
+/* per-network namespace private data for this module */
+struct rds_tcp_net {
+ /* serialize "rds_tcp_accept_one" with "rds_tcp_accept_lock"
+ * to protect "rds_tcp_accepted_sock"
+ */
+ struct mutex rds_tcp_accept_lock;
+ struct socket *rds_tcp_listen_sock;
+ struct socket *rds_tcp_accepted_sock;
+ struct work_struct rds_tcp_accept_w;
+ struct ctl_table_header *rds_tcp_sysctl;
+ const struct ctl_table *ctl_table;
+ int sndbuf_size;
+ int rcvbuf_size;
+};
+
struct rds_tcp_incoming {
struct rds_incoming ti_inc;
struct sk_buff_head ti_skb_list;
@@ -19,6 +34,8 @@ struct rds_tcp_connection {
*/
struct mutex t_conn_path_lock;
struct socket *t_sock;
+ u32 t_client_port_group;
+ struct rds_tcp_net *t_rtn;
void *t_orig_write_space;
void *t_orig_data_ready;
void *t_orig_state_change;
@@ -38,6 +55,9 @@ struct rds_tcp_connection {
u32 t_last_sent_nxt;
u32 t_last_expected_una;
u32 t_last_seen_una;
+
+ /* for rds_tcp_conn_path_shutdown */
+ wait_queue_head_t t_recv_done_waitq;
};
struct rds_tcp_statistics {
@@ -49,6 +69,7 @@ struct rds_tcp_statistics {
};
/* tcp.c */
+extern int rds_tcp_netid;
bool rds_tcp_tune(struct socket *sock);
void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp);
void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp);
@@ -57,7 +78,7 @@ void rds_tcp_restore_callbacks(struct socket *sock,
u32 rds_tcp_write_seq(struct rds_tcp_connection *tc);
u32 rds_tcp_snd_una(struct rds_tcp_connection *tc);
extern struct rds_transport rds_tcp_transport;
-void rds_tcp_accept_work(struct sock *sk);
+void rds_tcp_accept_work(struct rds_tcp_net *rtn);
int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr,
__u32 scope_id);
/* tcp_connect.c */
@@ -69,7 +90,8 @@ void rds_tcp_state_change(struct sock *sk);
struct socket *rds_tcp_listen_init(struct net *net, bool isv6);
void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor);
void rds_tcp_listen_data_ready(struct sock *sk);
-int rds_tcp_accept_one(struct socket *sock);
+void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out);
+int rds_tcp_accept_one(struct rds_tcp_net *rtn);
void rds_tcp_keepalive(struct socket *sock);
void *rds_tcp_listen_sock_def_readable(struct net *net);
@@ -86,6 +108,7 @@ void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp);
void rds_tcp_xmit_path_complete(struct rds_conn_path *cp);
int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm,
unsigned int hdr_off, unsigned int sg, unsigned int off);
+int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack);
void rds_tcp_write_space(struct sock *sk);
/* tcp_stats.c */
diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c
index 92891b0d224d..b77c88ffb199 100644
--- a/net/rds/tcp_connect.c
+++ b/net/rds/tcp_connect.c
@@ -75,8 +75,16 @@ void rds_tcp_state_change(struct sock *sk)
rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
}
break;
+ case TCP_CLOSING:
+ case TCP_TIME_WAIT:
+ if (wq_has_sleeper(&tc->t_recv_done_waitq))
+ wake_up(&tc->t_recv_done_waitq);
+ break;
case TCP_CLOSE_WAIT:
+ case TCP_LAST_ACK:
case TCP_CLOSE:
+ if (wq_has_sleeper(&tc->t_recv_done_waitq))
+ wake_up(&tc->t_recv_done_waitq);
rds_conn_path_drop(cp, false);
break;
default:
@@ -93,6 +101,8 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
struct sockaddr_in6 sin6;
struct sockaddr_in sin;
struct sockaddr *addr;
+ int port_low, port_high, port;
+ int port_groups, groups_left;
int addrlen;
bool isv6;
int ret;
@@ -145,7 +155,26 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp)
addrlen = sizeof(sin);
}
- ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, addrlen);
+ /* encode cp->cp_index in lowest bits of source-port */
+ inet_get_local_port_range(rds_conn_net(conn), &port_low, &port_high);
+ port_low = ALIGN(port_low, RDS_MPATH_WORKERS);
+ port_groups = (port_high - port_low + 1) / RDS_MPATH_WORKERS;
+ ret = -EADDRINUSE;
+ groups_left = port_groups;
+ while (groups_left-- > 0 && ret) {
+ if (++tc->t_client_port_group >= port_groups)
+ tc->t_client_port_group = 0;
+ port = port_low +
+ tc->t_client_port_group * RDS_MPATH_WORKERS +
+ cp->cp_index;
+
+ if (isv6)
+ sin6.sin6_port = htons(port);
+ else
+ sin.sin_port = htons(port);
+ ret = kernel_bind(sock, (struct sockaddr_unsized *)addr,
+ addrlen);
+ }
if (ret) {
rdsdebug("bind failed with %d at address %pI6c\n",
ret, &conn->c_laddr);
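
The bind loop's arithmetic deserves a worked example. A sketch assuming RDS_MPATH_WORKERS is 8 (its upstream value) and a typical 32768..60999 local port range; the group number, which t_client_port_group rotates through on -EADDRINUSE, is arbitrary here:

#include <stdio.h>

#define RDS_MPATH_WORKERS 8

int main(void)
{
    int port_low = 32768, port_high = 60999;  /* example range */
    int cp_index = 3;                         /* lane to encode */
    int group = 17, port;

    /* ALIGN(port_low, RDS_MPATH_WORKERS), worker count a power of two */
    port_low = (port_low + RDS_MPATH_WORKERS - 1) & ~(RDS_MPATH_WORKERS - 1);
    port = port_low + group * RDS_MPATH_WORKERS + cp_index;

    printf("groups: %d\n", (port_high - port_low + 1) / RDS_MPATH_WORKERS);
    printf("bind to %d; peer recovers lane %d\n",
           port, port % RDS_MPATH_WORKERS);
    return 0;
}

Every port in a group is congruent to its lane index modulo RDS_MPATH_WORKERS, so the accept side can recover cp_index from the peer's source port alone, which is the capability RDS_EXTHDR_SPORT_IDX advertises.
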
@@ -205,18 +234,58 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp)
{
struct rds_tcp_connection *tc = cp->cp_transport_data;
struct socket *sock = tc->t_sock;
+ struct sock *sk;
+ unsigned int rounds;
rdsdebug("shutting down conn %p tc %p sock %p\n",
cp->cp_conn, tc, sock);
if (sock) {
+ sk = sock->sk;
if (rds_destroy_pending(cp->cp_conn))
- sock_no_linger(sock->sk);
- sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN);
- lock_sock(sock->sk);
+ sock_no_linger(sk);
+
+ sock->ops->shutdown(sock, SHUT_WR);
+
+ /* After sending the FIN,
+ * wait until we have processed all incoming messages
+ * and we're sure that there won't be any more:
+ * i.e. state CLOSING, TIME_WAIT, CLOSE_WAIT,
+ * LAST_ACK, or CLOSE (RFC 793).
+ *
+ * Give up waiting after 5 seconds and allow messages
+ * to theoretically be dropped if the TCP transition
+ * didn't happen.
+ */
+ rounds = 0;
+ do {
+ /* we need to ensure messages are dequeued here
+ * since "rds_recv_worker" only dispatches messages
+ * while the connection is still in RDS_CONN_UP
+ * and there is no guarantee that "rds_tcp_data_ready"
+ * was called nor that "sk_data_ready" still points to
+ * it.
+ */
+ rds_tcp_recv_path(cp);
+ } while (!wait_event_timeout(tc->t_recv_done_waitq,
+ (sk->sk_state == TCP_CLOSING ||
+ sk->sk_state == TCP_TIME_WAIT ||
+ sk->sk_state == TCP_CLOSE_WAIT ||
+ sk->sk_state == TCP_LAST_ACK ||
+ sk->sk_state == TCP_CLOSE) &&
+ skb_queue_empty_lockless(&sk->sk_receive_queue),
+ msecs_to_jiffies(100)) &&
+ ++rounds < 50);
+ lock_sock(sk);
+
+ /* discard messages that the peer received already */
+ tc->t_last_seen_una = rds_tcp_snd_una(tc);
+ rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc),
+ rds_tcp_is_acked);
+
rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */
- release_sock(sock->sk);
+ release_sock(sk);
sock_release(sock);
}
diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 820d3e20de19..6fb5c928b8fd 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -35,6 +35,8 @@
#include <linux/in.h>
#include <net/tcp.h>
#include <trace/events/sock.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
#include "rds.h"
#include "tcp.h"
@@ -54,49 +56,120 @@ void rds_tcp_keepalive(struct socket *sock)
tcp_sock_set_keepintvl(sock->sk, keepidle);
}
+static int
+rds_tcp_get_peer_sport(struct socket *sock)
+{
+ union {
+ struct sockaddr_storage storage;
+ struct sockaddr addr;
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+ } saddr;
+ int sport;
+
+ if (kernel_getpeername(sock, &saddr.addr) >= 0) {
+ switch (saddr.addr.sa_family) {
+ case AF_INET:
+ sport = ntohs(saddr.sin.sin_port);
+ break;
+ case AF_INET6:
+ sport = ntohs(saddr.sin6.sin6_port);
+ break;
+ default:
+ sport = -1;
+ }
+ } else {
+ sport = -1;
+ }
+
+ return sport;
+}
+
/* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the
* client's ipaddr < server's ipaddr. Otherwise, close the accepted
socket and force a reconnect from smaller -> larger ip addr. The reason
* we special case cp_index 0 is to allow the rds probe ping itself to itself
* get through efficiently.
- * Since reconnects are only initiated from the node with the numerically
- * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side
- * by moving them to CONNECTING in this function.
*/
-static
-struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn)
+static struct rds_tcp_connection *
+rds_tcp_accept_one_path(struct rds_connection *conn, struct socket *sock)
{
- int i;
- int npaths = max_t(int, 1, conn->c_npaths);
+ int sport, npaths, i_min, i_max, i;
- /* for mprds, all paths MUST be initiated by the peer
- * with the smaller address.
- */
- if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) {
- /* Make sure we initiate at least one path if this
- * has not already been done; rds_start_mprds() will
- * take care of additional paths, if necessary.
- */
- if (npaths == 1)
- rds_conn_path_connect_if_down(&conn->c_path[0]);
- return NULL;
+ if (conn->c_with_sport_idx)
+ /* cp->cp_index is encoded in lowest bits of source-port */
+ sport = rds_tcp_get_peer_sport(sock);
+ else
+ sport = -1;
+
+ npaths = max_t(int, 1, conn->c_npaths);
+
+ if (sport >= 0) {
+ i_min = sport % npaths;
+ i_max = i_min;
+ } else {
+ i_min = 0;
+ i_max = npaths - 1;
}
- for (i = 0; i < npaths; i++) {
+ for (i = i_min; i <= i_max; i++) {
struct rds_conn_path *cp = &conn->c_path[i];
if (rds_conn_path_transition(cp, RDS_CONN_DOWN,
- RDS_CONN_CONNECTING) ||
- rds_conn_path_transition(cp, RDS_CONN_ERROR,
- RDS_CONN_CONNECTING)) {
+ RDS_CONN_CONNECTING))
return cp->cp_transport_data;
- }
}
+
return NULL;
}
-int rds_tcp_accept_one(struct socket *sock)
+void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out)
+{
+ struct rds_tcp_connection *tc;
+ struct rds_tcp_net *rtn;
+ struct socket *sock;
+ int sport, npaths;
+
+ if (rds_destroy_pending(conn))
+ return;
+
+ tc = conn->c_path->cp_transport_data;
+ rtn = tc->t_rtn;
+ if (!rtn)
+ return;
+
+ sock = tc->t_sock;
+
+ /* During fan-out, check that the connection we already
+ * accepted in slot#0 carried the proper source port modulo.
+ */
+ if (fan_out && conn->c_with_sport_idx && sock &&
+ rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) > 0) {
+ /* cp->cp_index is encoded in lowest bits of source-port */
+ sport = rds_tcp_get_peer_sport(sock);
+ npaths = max_t(int, 1, conn->c_npaths);
+ if (sport >= 0 && sport % npaths != 0)
+ /* peer initiated with a non-#0 lane first */
+ rds_conn_path_drop(conn->c_path, 0);
+ }
+
+ /* As soon as a connection goes down,
+ * it is safe to schedule a "rds_tcp_accept_one"
+ * attempt even if there are no connections pending:
+ * "rds_tcp_accept_one" won't block
+ * but simply returns -EAGAIN in that case.
+ *
+ * Doing so is necessary to address the case where an
+ * incoming connection on "rds_tcp_listen_sock" is ready
+ * to be accepted prior to a free slot being available:
+ * the -ENOBUFS case in "rds_tcp_accept_one".
+ */
+ rds_tcp_accept_work(rtn);
+}
+
+int rds_tcp_accept_one(struct rds_tcp_net *rtn)
{
+ struct socket *listen_sock = rtn->rds_tcp_listen_sock;
struct socket *new_sock = NULL;
struct rds_connection *conn;
int ret;
@@ -110,17 +183,23 @@ int rds_tcp_accept_one(struct socket *sock)
#endif
int dev_if = 0;
- if (!sock) /* module unload or netns delete in progress */
+ if (!listen_sock) /* module unload or netns delete in progress */
return -ENETUNREACH;
- ret = kernel_accept(sock, &new_sock, O_NONBLOCK);
- if (ret)
- return ret;
+ mutex_lock(&rtn->rds_tcp_accept_lock);
+ new_sock = rtn->rds_tcp_accepted_sock;
+ rtn->rds_tcp_accepted_sock = NULL;
- rds_tcp_keepalive(new_sock);
- if (!rds_tcp_tune(new_sock)) {
- ret = -EINVAL;
- goto out;
+ if (!new_sock) {
+ ret = kernel_accept(listen_sock, &new_sock, O_NONBLOCK);
+ if (ret)
+ goto out;
+
+ rds_tcp_keepalive(new_sock);
+ if (!rds_tcp_tune(new_sock)) {
+ ret = -EINVAL;
+ goto out;
+ }
}
inet = inet_sk(new_sock->sk);
@@ -135,7 +214,7 @@ int rds_tcp_accept_one(struct socket *sock)
peer_addr = &daddr;
#endif
rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n",
- sock->sk->sk_family,
+ listen_sock->sk->sk_family,
my_addr, ntohs(inet->inet_sport),
peer_addr, ntohs(inet->inet_dport));
@@ -155,13 +234,13 @@ int rds_tcp_accept_one(struct socket *sock)
}
#endif
- if (!rds_tcp_laddr_check(sock_net(sock->sk), peer_addr, dev_if)) {
+ if (!rds_tcp_laddr_check(sock_net(listen_sock->sk), peer_addr, dev_if)) {
/* local address connection is only allowed via loopback */
ret = -EOPNOTSUPP;
goto out;
}
- conn = rds_conn_create(sock_net(sock->sk),
+ conn = rds_conn_create(sock_net(listen_sock->sk),
my_addr, peer_addr,
&rds_tcp_transport, 0, GFP_KERNEL, dev_if);
@@ -174,15 +253,51 @@ int rds_tcp_accept_one(struct socket *sock)
* If the client reboots, this conn will need to be cleaned up.
* rds_tcp_state_change() will do that cleanup
*/
- rs_tcp = rds_tcp_accept_one_path(conn);
- if (!rs_tcp)
+ if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) < 0) {
+ /* Try to obtain a free connection slot.
+ * If unsuccessful, we need to preserve "new_sock"
+ * that we just accepted, since its "sk_receive_queue"
+ * may already contain messages that were acknowledged
+ * to the sender and discarded on its side.
+ * We must not throw those away!
+ */
+ rs_tcp = rds_tcp_accept_one_path(conn, new_sock);
+ if (!rs_tcp) {
+ /* It's okay to stash "new_sock", since
+ * "rds_tcp_conn_slots_available" triggers
+ * "rds_tcp_accept_one" again as soon as one of the
+ * connection slots becomes available again
+ */
+ rtn->rds_tcp_accepted_sock = new_sock;
+ new_sock = NULL;
+ ret = -ENOBUFS;
+ goto out;
+ }
+ } else {
+ /* This connection request came from a peer with
+ * a larger address.
+ * Function "rds_tcp_state_change" makes sure
+ * that the connection doesn't transition
+ * to state "RDS_CONN_UP", and therefore
+ * we should not have received any messages
+ * on this socket yet.
+ * This is the only case where it's okay to
+ * not dequeue messages from "sk_receive_queue".
+ */
+ if (conn->c_npaths <= 1)
+ rds_conn_path_connect_if_down(&conn->c_path[0]);
+ rs_tcp = NULL;
goto rst_nsk;
+ }
+
mutex_lock(&rs_tcp->t_conn_path_lock);
cp = rs_tcp->t_cpath;
conn_state = rds_conn_path_state(cp);
WARN_ON(conn_state == RDS_CONN_UP);
- if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR)
+ if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) {
+ rds_conn_path_drop(cp, 0);
goto rst_nsk;
+ }
if (rs_tcp->t_sock) {
/* Duelling SYN has been handled in rds_tcp_accept_one() */
rds_tcp_reset_callbacks(new_sock, cp);
@@ -192,6 +307,22 @@ int rds_tcp_accept_one(struct socket *sock)
rds_tcp_set_callbacks(new_sock, cp);
rds_connect_path_complete(cp, RDS_CONN_CONNECTING);
}
+
+ /* Since "rds_tcp_set_callbacks" happens this late
+ * the connection may already have been closed without
+ * "rds_tcp_state_change" doing its due diligence.
+ *
+ * If that's the case, we simply drop the path,
+ * knowing that "rds_tcp_conn_path_shutdown" will
+ * dequeue pending messages.
+ */
+ if (new_sock->sk->sk_state == TCP_CLOSE_WAIT ||
+ new_sock->sk->sk_state == TCP_LAST_ACK ||
+ new_sock->sk->sk_state == TCP_CLOSE)
+ rds_conn_path_drop(cp, 0);
+ else
+ queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0);
+
new_sock = NULL;
ret = 0;
if (conn->c_npaths == 0)
@@ -212,6 +343,9 @@ out:
mutex_unlock(&rs_tcp->t_conn_path_lock);
if (new_sock)
sock_release(new_sock);
+
+ mutex_unlock(&rtn->rds_tcp_accept_lock);
+
return ret;
}
@@ -239,7 +373,7 @@ void rds_tcp_listen_data_ready(struct sock *sk)
* the listen socket is being torn down.
*/
if (sk->sk_state == TCP_LISTEN)
- rds_tcp_accept_work(sk);
+ rds_tcp_accept_work(net_generic(sock_net(sk), rds_tcp_netid));
else
ready = rds_tcp_listen_sock_def_readable(sock_net(sk));
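
rds_tcp_accept_one_path() above is the decode side of the source-port encoding: with RDS_EXTHDR_SPORT_IDX negotiated it probes only the lane named by sport % npaths, otherwise every slot in order, and rds_tcp_accept_one() stashes the socket on -ENOBUFS instead of losing its queued data. A mock of the slot-selection rule (states and values are simplified stand-ins, not kernel code):

#include <stdio.h>

enum { CONN_DOWN, CONN_CONNECTING, CONN_UP };

static int pick_slot(int *state, int npaths, int sport /* -1 if unknown */)
{
    int i_min = 0, i_max = npaths - 1;

    if (sport >= 0)
        i_min = i_max = sport % npaths;  /* lane is in the low bits */

    for (int i = i_min; i <= i_max; i++) {
        if (state[i] == CONN_DOWN) {
            state[i] = CONN_CONNECTING;
            return i;
        }
    }
    return -1;  /* caller stashes the socket and returns -ENOBUFS */
}

int main(void)
{
    int state[8] = { CONN_UP };  /* slot 0 busy, the rest DOWN */

    printf("slot: %d\n", pick_slot(state, 8, 32907)); /* lane 3 */
    printf("slot: %d\n", pick_slot(state, 8, 32907)); /* -1: now busy */
    printf("slot: %d\n", pick_slot(state, 8, -1));    /* 1: first DOWN */
    return 0;
}
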
diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index 7997a19d1da3..49f96ee0c40f 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -278,6 +278,10 @@ static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp)
rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp,
desc.error);
+ if (skb_queue_empty_lockless(&sock->sk->sk_receive_queue) &&
+ wq_has_sleeper(&tc->t_recv_done_waitq))
+ wake_up(&tc->t_recv_done_waitq);
+
return desc.error;
}
@@ -327,7 +331,7 @@ void rds_tcp_data_ready(struct sock *sk)
if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) {
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0);
rcu_read_unlock();
}
out:
diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 7d284ac7e81a..7c52acc749cf 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -169,7 +169,7 @@ out:
* unacked byte of the TCP sequence space. We have to do very careful
* wrapping 32bit comparisons here.
*/
-static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
+int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack)
{
if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags))
return 0;
@@ -201,7 +201,7 @@ void rds_tcp_write_space(struct sock *sk)
rcu_read_lock();
if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf &&
!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0);
rcu_read_unlock();
out:
diff --git a/net/rds/threads.c b/net/rds/threads.c
index 1f424cbfcbb4..639302bab51e 100644
--- a/net/rds/threads.c
+++ b/net/rds/threads.c
@@ -89,8 +89,8 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr)
set_bit(0, &cp->cp_conn->c_map_queued);
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn)) {
- queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
- queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0);
}
rcu_read_unlock();
cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION;
@@ -140,7 +140,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies;
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_conn_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_conn_w, 0);
rcu_read_unlock();
return;
}
@@ -151,7 +151,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp)
conn, &conn->c_laddr, &conn->c_faddr);
rcu_read_lock();
if (!rds_destroy_pending(cp->cp_conn))
- queue_delayed_work(rds_wq, &cp->cp_conn_w,
+ queue_delayed_work(cp->cp_wq, &cp->cp_conn_w,
rand % cp->cp_reconnect_jiffies);
rcu_read_unlock();
@@ -203,11 +203,11 @@ void rds_send_worker(struct work_struct *work)
switch (ret) {
case -EAGAIN:
rds_stats_inc(s_send_immediate_retry);
- queue_delayed_work(rds_wq, &cp->cp_send_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0);
break;
case -ENOMEM:
rds_stats_inc(s_send_delayed_retry);
- queue_delayed_work(rds_wq, &cp->cp_send_w, 2);
+ queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 2);
break;
default:
break;
@@ -228,11 +228,11 @@ void rds_recv_worker(struct work_struct *work)
switch (ret) {
case -EAGAIN:
rds_stats_inc(s_recv_immediate_retry);
- queue_delayed_work(rds_wq, &cp->cp_recv_w, 0);
+ queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0);
break;
case -ENOMEM:
rds_stats_inc(s_recv_delayed_retry);
- queue_delayed_work(rds_wq, &cp->cp_recv_w, 2);
+ queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 2);
break;
default:
break;
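
Every RDS work item above now lands on a per-path cp_wq instead of the global rds_wq; the allocation of cp_wq itself is outside the hunks shown here. A sketch of how such a queue would presumably be set up with the standard workqueue API; the helper name and flags are assumptions, not this series' code:

#include <linux/errno.h>
#include <linux/workqueue.h>

#include "rds.h"

/* hypothetical helper: one ordered queue per conn path keeps a lane's
 * send/recv/reconnect works serialized without funnelling every
 * connection through the single global rds_wq
 */
static int example_conn_path_wq_init(struct rds_conn_path *cp, int index)
{
    cp->cp_wq = alloc_ordered_workqueue("krds_cp_wq/%d",
                                        WQ_MEM_RECLAIM, index);
    return cp->cp_wq ? 0 : -ENOMEM;
}
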
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 2b6ac7069dc1..81d488655793 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -13,9 +13,11 @@
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/pkt_cls.h>
+#include <linux/if_tunnel.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/rhashtable.h>
+#include <net/gre.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c
index 71efe04d00b5..d2c750bab1d3 100644
--- a/net/sched/act_ctinfo.c
+++ b/net/sched/act_ctinfo.c
@@ -16,6 +16,7 @@
#include <net/pkt_sched.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
+#include <net/inet_ecn.h>
#include <uapi/linux/tc_act/tc_ctinfo.h>
#include <net/tc_act/tc_ctinfo.h>
#include <net/tc_wrapper.h>
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f56b18c8aebf..443c116e8663 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1353,7 +1353,7 @@ err_out4:
ops->destroy(sch);
qdisc_put_stab(rtnl_dereference(sch->stab));
err_out3:
- lockdep_unregister_key(&sch->root_lock_key);
+ qdisc_lock_uninit(sch, ops);
netdev_put(dev, &sch->dev_tracker);
qdisc_free(sch);
err_out2:
diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 4a64d6397b6f..d2bbd5654d5b 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -67,6 +67,7 @@
#include <linux/if_vlan.h>
#include <net/gso.h>
#include <net/pkt_sched.h>
+#include <net/sch_priv.h>
#include <net/pkt_cls.h>
#include <net/tcp.h>
#include <net/flow_dissector.h>
@@ -197,40 +198,45 @@ struct cake_tin_data {
u32 way_collisions;
}; /* number of tins is small, so size of this struct doesn't matter much */
+struct cake_sched_config {
+ u64 rate_bps;
+ u64 interval;
+ u64 target;
+ u64 sync_time;
+ u32 buffer_config_limit;
+ u32 fwmark_mask;
+ u16 fwmark_shft;
+ s16 rate_overhead;
+ u16 rate_mpu;
+ u16 rate_flags;
+ u8 tin_mode;
+ u8 flow_mode;
+ u8 atm_mode;
+ u8 ack_filter;
+ u8 is_shared;
+};
+
struct cake_sched_data {
struct tcf_proto __rcu *filter_list; /* optional external classifier */
struct tcf_block *block;
struct cake_tin_data *tins;
+ struct cake_sched_config *config;
+ struct cake_sched_config initial_config;
struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
- u16 overflow_timeout;
-
- u16 tin_cnt;
- u8 tin_mode;
- u8 flow_mode;
- u8 ack_filter;
- u8 atm_mode;
-
- u32 fwmark_mask;
- u16 fwmark_shft;
/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
- u16 rate_shft;
ktime_t time_next_packet;
ktime_t failsafe_next_packet;
u64 rate_ns;
- u64 rate_bps;
- u16 rate_flags;
- s16 rate_overhead;
- u16 rate_mpu;
- u64 interval;
- u64 target;
+ u16 rate_shft;
+ u16 overflow_timeout;
+ u16 tin_cnt;
/* resource tracking */
u32 buffer_used;
u32 buffer_max_used;
u32 buffer_limit;
- u32 buffer_config_limit;
/* indices for dequeue */
u16 cur_tin;
@@ -254,6 +260,11 @@ struct cake_sched_data {
u16 max_adjlen;
u16 min_netlen;
u16 min_adjlen;
+
+ /* mq sync state */
+ u64 last_checked_active;
+ u64 last_active;
+ u32 active_queues;
};
enum {
@@ -380,6 +391,8 @@ static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
1239850263, 1191209601, 1147878294, 1108955788
};
+static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
+ u64 target_ns, u64 rtt_est_ns);
/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
* new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
*
@@ -1198,7 +1211,7 @@ static bool cake_tcph_may_drop(const struct tcphdr *tcph,
static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
struct cake_flow *flow)
{
- bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
+ bool aggressive = q->config->ack_filter == CAKE_ACK_AGGRESSIVE;
struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
struct sk_buff *skb_check, *skb_prev = NULL;
const struct ipv6hdr *ipv6h, *ipv6h_check;
@@ -1266,7 +1279,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
continue;
- seglen = ntohs(ipv6h_check->payload_len);
+ seglen = ipv6_payload_len(skb, ipv6h_check);
} else {
WARN_ON(1); /* shouldn't happen */
continue;
@@ -1358,15 +1371,17 @@ static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
return avg;
}
-static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
+static u32 cake_calc_overhead(struct cake_sched_data *qd, u32 len, u32 off)
{
+ struct cake_sched_config *q = qd->config;
+
if (q->rate_flags & CAKE_FLAG_OVERHEAD)
len -= off;
- if (q->max_netlen < len)
- q->max_netlen = len;
- if (q->min_netlen > len)
- q->min_netlen = len;
+ if (qd->max_netlen < len)
+ qd->max_netlen = len;
+ if (qd->min_netlen > len)
+ qd->min_netlen = len;
len += q->rate_overhead;
@@ -1385,10 +1400,10 @@ static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
len += (len + 63) / 64;
}
- if (q->max_adjlen < len)
- q->max_adjlen = len;
- if (q->min_adjlen > len)
- q->min_adjlen = len;
+ if (qd->max_adjlen < len)
+ qd->max_adjlen = len;
+ if (qd->min_adjlen > len)
+ qd->min_adjlen = len;
return len;
}
@@ -1586,7 +1601,7 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
flow->dropped++;
b->tin_dropped++;
- if (q->rate_flags & CAKE_FLAG_INGRESS)
+ if (q->config->rate_flags & CAKE_FLAG_INGRESS)
cake_advance_shaper(q, b, skb, now, true);
qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT);
@@ -1656,7 +1671,8 @@ static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
struct sk_buff *skb)
{
- struct cake_sched_data *q = qdisc_priv(sch);
+ struct cake_sched_data *qd = qdisc_priv(sch);
+ struct cake_sched_config *q = qd->config;
u32 tin, mark;
bool wash;
u8 dscp;
@@ -1673,24 +1689,24 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
tin = 0;
- else if (mark && mark <= q->tin_cnt)
- tin = q->tin_order[mark - 1];
+ else if (mark && mark <= qd->tin_cnt)
+ tin = qd->tin_order[mark - 1];
else if (TC_H_MAJ(skb->priority) == sch->handle &&
TC_H_MIN(skb->priority) > 0 &&
- TC_H_MIN(skb->priority) <= q->tin_cnt)
- tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
+ TC_H_MIN(skb->priority) <= qd->tin_cnt)
+ tin = qd->tin_order[TC_H_MIN(skb->priority) - 1];
else {
if (!wash)
dscp = cake_handle_diffserv(skb, wash);
- tin = q->tin_index[dscp];
+ tin = qd->tin_index[dscp];
- if (unlikely(tin >= q->tin_cnt))
+ if (unlikely(tin >= qd->tin_cnt))
tin = 0;
}
- return &q->tins[tin];
+ return &qd->tins[tin];
}
static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
@@ -1746,7 +1762,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
bool same_flow = false;
/* choose flow to insert into */
- idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
+ idx = cake_classify(sch, &b, skb, q->config->flow_mode, &ret);
if (idx == 0) {
if (ret & __NET_XMIT_BYPASS)
qdisc_qstats_drop(sch);
@@ -1781,7 +1797,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (unlikely(len > b->max_skblen))
b->max_skblen = len;
- if (qdisc_pkt_segs(skb) > 1 && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
+ if (qdisc_pkt_segs(skb) > 1 && q->config->rate_flags & CAKE_FLAG_SPLIT_GSO) {
struct sk_buff *segs, *nskb;
netdev_features_t features = netif_skb_features(skb);
unsigned int slen = 0, numsegs = 0;
@@ -1823,7 +1839,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
flow_queue_add(flow, skb);
- if (q->ack_filter)
+ if (q->config->ack_filter)
ack = cake_ack_filter(q, flow);
if (ack) {
@@ -1832,7 +1848,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
ack_pkt_len = qdisc_pkt_len(ack);
b->bytes += ack_pkt_len;
q->buffer_used += skb->truesize - ack->truesize;
- if (q->rate_flags & CAKE_FLAG_INGRESS)
+ if (q->config->rate_flags & CAKE_FLAG_INGRESS)
cake_advance_shaper(q, b, ack, now, true);
qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len);
@@ -1855,7 +1871,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
cake_heapify_up(q, b->overflow_idx[idx]);
/* incoming bandwidth capacity estimate */
- if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
+ if (q->config->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
u64 packet_interval = \
ktime_to_ns(ktime_sub(now, q->last_packet_time));
@@ -1887,7 +1903,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
if (ktime_after(now,
ktime_add_ms(q->last_reconfig_time,
250))) {
- q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
+ q->config->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
cake_reconfigure(sch);
}
}
@@ -1907,7 +1923,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
flow->set = CAKE_SET_SPARSE;
b->sparse_flow_count++;
- flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode);
+ flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode);
} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
/* this flow was empty, accounted as a sparse flow, but actually
* in the bulk rotation.
@@ -1916,8 +1932,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
b->sparse_flow_count--;
b->bulk_flow_count++;
- cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
- cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
+ cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
+ cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
}
if (q->buffer_used > q->buffer_max_used)
@@ -1997,6 +2013,40 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch)
u64 delay;
u32 len;
+ if (q->config->is_shared && now - q->last_checked_active >= q->config->sync_time) {
+ struct net_device *dev = qdisc_dev(sch);
+ struct cake_sched_data *other_priv;
+ u64 new_rate = q->config->rate_bps;
+ u64 other_qlen, other_last_active;
+ struct Qdisc *other_sch;
+ u32 num_active_qs = 1;
+ unsigned int ntx;
+
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ other_sch = rcu_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
+ other_priv = qdisc_priv(other_sch);
+
+ if (other_priv == q)
+ continue;
+
+ other_qlen = READ_ONCE(other_sch->q.qlen);
+ other_last_active = READ_ONCE(other_priv->last_active);
+
+ if (other_qlen || other_last_active > q->last_checked_active)
+ num_active_qs++;
+ }
+
+ if (num_active_qs > 1)
+ new_rate = div64_u64(q->config->rate_bps, num_active_qs);
+
+ /* mtu = 0 is used to only update the rate and not mess with cobalt params */
+ cake_set_rate(b, new_rate, 0, 0, 0);
+ q->last_checked_active = now;
+ q->active_queues = num_active_qs;
+ q->rate_ns = b->tin_rate_ns;
+ q->rate_shft = b->tin_rate_shft;
+ }
+
begin:
if (!sch->q.qlen)
return NULL;
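
The shared-rate pass above amounts to dividing the configured rate by the number of sibling queues active since the last sync. A worked user-space sketch of the count and the division (div64_u64 becomes plain 64-bit division; all numbers illustrative):

#include <stdint.h>
#include <stdio.h>

struct q { uint32_t qlen; uint64_t last_active; };

/* a sibling counts as active if it has backlog now or dequeued since
 * our last check (mirrors the loop over num_tx_queues above)
 */
static uint32_t count_active(const struct q *qs, int n, uint64_t last_checked)
{
    uint32_t active = 1;  /* ourselves */

    for (int i = 0; i < n; i++)
        if (qs[i].qlen || qs[i].last_active > last_checked)
            active++;
    return active;
}

int main(void)
{
    uint64_t rate_bps = 1000000000ULL;  /* 1 Gbit/s configured */
    struct q siblings[3] = {
        { .qlen = 5 },            /* backlogged */
        { .last_active = 2000 },  /* dequeued after our last check */
        { .last_active = 500 },   /* idle since then */
    };
    uint32_t n = count_active(siblings, 3, 1000 /* last check stamp */);

    printf("%u active -> %llu bit/s each\n", n,
           (unsigned long long)(rate_bps / n));
    return 0;
}
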
@@ -2104,8 +2154,8 @@ retry:
b->sparse_flow_count--;
b->bulk_flow_count++;
- cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode);
- cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode);
+ cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
+ cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
flow->set = CAKE_SET_BULK;
} else {
@@ -2117,7 +2167,7 @@ retry:
}
}
- flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode);
+ flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode);
list_move_tail(&flow->flowchain, &b->old_flows);
goto retry;
@@ -2141,8 +2191,8 @@ retry:
if (flow->set == CAKE_SET_BULK) {
b->bulk_flow_count--;
- cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
- cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
+ cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
+ cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
b->decaying_flow_count++;
} else if (flow->set == CAKE_SET_SPARSE ||
@@ -2160,8 +2210,8 @@ retry:
else if (flow->set == CAKE_SET_BULK) {
b->bulk_flow_count--;
- cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode);
- cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode);
+ cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode);
+ cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode);
} else
b->decaying_flow_count--;
@@ -2172,14 +2222,14 @@ retry:
reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
(b->bulk_flow_count *
- !!(q->rate_flags &
+ !!(q->config->rate_flags &
CAKE_FLAG_INGRESS)));
/* Last packet in queue may be marked, shouldn't be dropped */
if (reason == SKB_NOT_DROPPED_YET || !flow->head)
break;
/* drop this packet, get another one */
- if (q->rate_flags & CAKE_FLAG_INGRESS) {
+ if (q->config->rate_flags & CAKE_FLAG_INGRESS) {
len = cake_advance_shaper(q, b, skb,
now, true);
flow->deficit -= len;
@@ -2190,12 +2240,13 @@ retry:
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
qdisc_qstats_drop(sch);
qdisc_dequeue_drop(sch, skb, reason);
- if (q->rate_flags & CAKE_FLAG_INGRESS)
+ if (q->config->rate_flags & CAKE_FLAG_INGRESS)
goto retry;
}
b->tin_ecn_mark += !!flow->cvars.ecn_marked;
qdisc_bstats_update(sch, skb);
+ WRITE_ONCE(q->last_active, now);
/* collect delay stats */
delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
@@ -2296,6 +2347,9 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
b->tin_rate_ns = rate_ns;
b->tin_rate_shft = rate_shft;
+ if (mtu == 0)
+ return;
+
byte_target_ns = (byte_target * rate_ns) >> rate_shft;
b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
@@ -2312,7 +2366,7 @@ static int cake_config_besteffort(struct Qdisc *sch)
struct cake_sched_data *q = qdisc_priv(sch);
struct cake_tin_data *b = &q->tins[0];
u32 mtu = psched_mtu(qdisc_dev(sch));
- u64 rate = q->rate_bps;
+ u64 rate = q->config->rate_bps;
q->tin_cnt = 1;
@@ -2320,7 +2374,7 @@ static int cake_config_besteffort(struct Qdisc *sch)
q->tin_order = normal_order;
cake_set_rate(b, rate, mtu,
- us_to_ns(q->target), us_to_ns(q->interval));
+ us_to_ns(q->config->target), us_to_ns(q->config->interval));
b->tin_quantum = 65535;
return 0;
@@ -2331,7 +2385,7 @@ static int cake_config_precedence(struct Qdisc *sch)
/* convert high-level (user visible) parameters into internal format */
struct cake_sched_data *q = qdisc_priv(sch);
u32 mtu = psched_mtu(qdisc_dev(sch));
- u64 rate = q->rate_bps;
+ u64 rate = q->config->rate_bps;
u32 quantum = 256;
u32 i;
@@ -2342,8 +2396,8 @@ static int cake_config_precedence(struct Qdisc *sch)
for (i = 0; i < q->tin_cnt; i++) {
struct cake_tin_data *b = &q->tins[i];
- cake_set_rate(b, rate, mtu, us_to_ns(q->target),
- us_to_ns(q->interval));
+ cake_set_rate(b, rate, mtu, us_to_ns(q->config->target),
+ us_to_ns(q->config->interval));
b->tin_quantum = max_t(u16, 1U, quantum);
@@ -2420,7 +2474,7 @@ static int cake_config_diffserv8(struct Qdisc *sch)
struct cake_sched_data *q = qdisc_priv(sch);
u32 mtu = psched_mtu(qdisc_dev(sch));
- u64 rate = q->rate_bps;
+ u64 rate = q->config->rate_bps;
u32 quantum = 256;
u32 i;
@@ -2434,8 +2488,8 @@ static int cake_config_diffserv8(struct Qdisc *sch)
for (i = 0; i < q->tin_cnt; i++) {
struct cake_tin_data *b = &q->tins[i];
- cake_set_rate(b, rate, mtu, us_to_ns(q->target),
- us_to_ns(q->interval));
+ cake_set_rate(b, rate, mtu, us_to_ns(q->config->target),
+ us_to_ns(q->config->interval));
b->tin_quantum = max_t(u16, 1U, quantum);
@@ -2464,7 +2518,7 @@ static int cake_config_diffserv4(struct Qdisc *sch)
struct cake_sched_data *q = qdisc_priv(sch);
u32 mtu = psched_mtu(qdisc_dev(sch));
- u64 rate = q->rate_bps;
+ u64 rate = q->config->rate_bps;
u32 quantum = 1024;
q->tin_cnt = 4;
@@ -2475,13 +2529,13 @@ static int cake_config_diffserv4(struct Qdisc *sch)
/* class characteristics */
cake_set_rate(&q->tins[0], rate, mtu,
- us_to_ns(q->target), us_to_ns(q->interval));
+ us_to_ns(q->config->target), us_to_ns(q->config->interval));
cake_set_rate(&q->tins[1], rate >> 4, mtu,
- us_to_ns(q->target), us_to_ns(q->interval));
+ us_to_ns(q->config->target), us_to_ns(q->config->interval));
cake_set_rate(&q->tins[2], rate >> 1, mtu,
- us_to_ns(q->target), us_to_ns(q->interval));
+ us_to_ns(q->config->target), us_to_ns(q->config->interval));
cake_set_rate(&q->tins[3], rate >> 2, mtu,
- us_to_ns(q->target), us_to_ns(q->interval));
+ us_to_ns(q->config->target), us_to_ns(q->config->interval));
/* bandwidth-sharing weights */
q->tins[0].tin_quantum = quantum;
@@ -2501,7 +2555,7 @@ static int cake_config_diffserv3(struct Qdisc *sch)
*/
struct cake_sched_data *q = qdisc_priv(sch);
u32 mtu = psched_mtu(qdisc_dev(sch));
- u64 rate = q->rate_bps;
+ u64 rate = q->config->rate_bps;
u32 quantum = 1024;
q->tin_cnt = 3;
@@ -2512,11 +2566,11 @@ static int cake_config_diffserv3(struct Qdisc *sch)
/* class characteristics */
cake_set_rate(&q->tins[0], rate, mtu,
- us_to_ns(q->target), us_to_ns(q->interval));
+ us_to_ns(q->config->target), us_to_ns(q->config->interval));
cake_set_rate(&q->tins[1], rate >> 4, mtu,
- us_to_ns(q->target), us_to_ns(q->interval));
+ us_to_ns(q->config->target), us_to_ns(q->config->interval));
cake_set_rate(&q->tins[2], rate >> 2, mtu,
- us_to_ns(q->target), us_to_ns(q->interval));
+ us_to_ns(q->config->target), us_to_ns(q->config->interval));
/* bandwidth-sharing weights */
q->tins[0].tin_quantum = quantum;
@@ -2528,7 +2582,8 @@ static int cake_config_diffserv3(struct Qdisc *sch)
static void cake_reconfigure(struct Qdisc *sch)
{
- struct cake_sched_data *q = qdisc_priv(sch);
+ struct cake_sched_data *qd = qdisc_priv(sch);
+ struct cake_sched_config *q = qd->config;
int c, ft;
switch (q->tin_mode) {
@@ -2554,39 +2609,38 @@ static void cake_reconfigure(struct Qdisc *sch)
break;
}
- for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
+ for (c = qd->tin_cnt; c < CAKE_MAX_TINS; c++) {
cake_clear_tin(sch, c);
- q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
+ qd->tins[c].cparams.mtu_time = qd->tins[ft].cparams.mtu_time;
}
- q->rate_ns = q->tins[ft].tin_rate_ns;
- q->rate_shft = q->tins[ft].tin_rate_shft;
+ qd->rate_ns = qd->tins[ft].tin_rate_ns;
+ qd->rate_shft = qd->tins[ft].tin_rate_shft;
if (q->buffer_config_limit) {
- q->buffer_limit = q->buffer_config_limit;
+ qd->buffer_limit = q->buffer_config_limit;
} else if (q->rate_bps) {
u64 t = q->rate_bps * q->interval;
do_div(t, USEC_PER_SEC / 4);
- q->buffer_limit = max_t(u32, t, 4U << 20);
+ qd->buffer_limit = max_t(u32, t, 4U << 20);
} else {
- q->buffer_limit = ~0;
+ qd->buffer_limit = ~0;
}
sch->flags &= ~TCQ_F_CAN_BYPASS;
- q->buffer_limit = min(q->buffer_limit,
- max(sch->limit * psched_mtu(qdisc_dev(sch)),
- q->buffer_config_limit));
+ qd->buffer_limit = min(qd->buffer_limit,
+ max(sch->limit * psched_mtu(qdisc_dev(sch)),
+ q->buffer_config_limit));
}
-static int cake_change(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
+static int cake_config_change(struct cake_sched_config *q, struct nlattr *opt,
+ struct netlink_ext_ack *extack, bool *overhead_changed)
{
- struct cake_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_CAKE_MAX + 1];
- u16 rate_flags;
- u8 flow_mode;
+ u16 rate_flags = q->rate_flags;
+ u8 flow_mode = q->flow_mode;
int err;
err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
@@ -2594,7 +2648,6 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
if (err < 0)
return err;
- flow_mode = q->flow_mode;
if (tb[TCA_CAKE_NAT]) {
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
flow_mode &= ~CAKE_FLOW_NAT_FLAG;
@@ -2607,6 +2660,19 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
#endif
}
+ if (tb[TCA_CAKE_AUTORATE]) {
+ if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) {
+ if (q->is_shared) {
+ NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_AUTORATE],
+ "Can't use autorate-ingress with cake_mq");
+ return -EOPNOTSUPP;
+ }
+ rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
+ } else {
+ rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
+ }
+ }
+
if (tb[TCA_CAKE_BASE_RATE64])
WRITE_ONCE(q->rate_bps,
nla_get_u64(tb[TCA_CAKE_BASE_RATE64]));
@@ -2615,7 +2681,6 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
WRITE_ONCE(q->tin_mode,
nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]));
- rate_flags = q->rate_flags;
if (tb[TCA_CAKE_WASH]) {
if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
rate_flags |= CAKE_FLAG_WASH;
@@ -2636,20 +2701,12 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
WRITE_ONCE(q->rate_overhead,
nla_get_s32(tb[TCA_CAKE_OVERHEAD]));
rate_flags |= CAKE_FLAG_OVERHEAD;
-
- q->max_netlen = 0;
- q->max_adjlen = 0;
- q->min_netlen = ~0;
- q->min_adjlen = ~0;
+ *overhead_changed = true;
}
if (tb[TCA_CAKE_RAW]) {
rate_flags &= ~CAKE_FLAG_OVERHEAD;
-
- q->max_netlen = 0;
- q->max_adjlen = 0;
- q->min_netlen = ~0;
- q->min_adjlen = ~0;
+ *overhead_changed = true;
}
if (tb[TCA_CAKE_MPU])
@@ -2668,13 +2725,6 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
WRITE_ONCE(q->target, max(target, 1U));
}
- if (tb[TCA_CAKE_AUTORATE]) {
- if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
- rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
- else
- rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
- }
-
if (tb[TCA_CAKE_INGRESS]) {
if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
rate_flags |= CAKE_FLAG_INGRESS;
@@ -2705,7 +2755,35 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
WRITE_ONCE(q->rate_flags, rate_flags);
WRITE_ONCE(q->flow_mode, flow_mode);
- if (q->tins) {
+
+ return 0;
+}
+
+static int cake_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct cake_sched_data *qd = qdisc_priv(sch);
+ struct cake_sched_config *q = qd->config;
+ bool overhead_changed = false;
+ int ret;
+
+ if (q->is_shared) {
+ NL_SET_ERR_MSG(extack, "can't reconfigure cake_mq sub-qdiscs");
+ return -EOPNOTSUPP;
+ }
+
+ ret = cake_config_change(q, opt, extack, &overhead_changed);
+ if (ret)
+ return ret;
+
+ if (overhead_changed) {
+ qd->max_netlen = 0;
+ qd->max_adjlen = 0;
+ qd->min_netlen = ~0;
+ qd->min_adjlen = ~0;
+ }
+
+ if (qd->tins) {
sch_tree_lock(sch);
cake_reconfigure(sch);
sch_tree_unlock(sch);
@@ -2723,15 +2801,8 @@ static void cake_destroy(struct Qdisc *sch)
kvfree(q->tins);
}
-static int cake_init(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
+static void cake_config_init(struct cake_sched_config *q, bool is_shared)
{
- struct cake_sched_data *q = qdisc_priv(sch);
- int i, j, err;
-
- sch->limit = 10240;
- sch->flags |= TCQ_F_DEQUEUE_DROPS;
-
q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
q->flow_mode = CAKE_FLOW_TRIPLE;
@@ -2742,19 +2813,35 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt,
* for 5 to 10% of interval
*/
q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
- q->cur_tin = 0;
- q->cur_flow = 0;
+ q->is_shared = is_shared;
+ q->sync_time = 200 * NSEC_PER_USEC;
+}
+
+static int cake_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct cake_sched_data *qd = qdisc_priv(sch);
+ struct cake_sched_config *q = &qd->initial_config;
+ int i, j, err;
+
+ cake_config_init(q, false);
+
+ sch->limit = 10240;
+ sch->flags |= TCQ_F_DEQUEUE_DROPS;
+
+ qd->cur_tin = 0;
+ qd->cur_flow = 0;
+ qd->config = q;
- qdisc_watchdog_init(&q->watchdog, sch);
+ qdisc_watchdog_init(&qd->watchdog, sch);
if (opt) {
err = cake_change(sch, opt, extack);
-
if (err)
return err;
}
- err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
+ err = tcf_block_get(&qd->block, &qd->filter_list, sch, extack);
if (err)
return err;
@@ -2762,13 +2849,13 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt,
for (i = 1; i <= CAKE_QUEUES; i++)
quantum_div[i] = 65535 / i;
- q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
- GFP_KERNEL);
- if (!q->tins)
+ qd->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
+ GFP_KERNEL);
+ if (!qd->tins)
return -ENOMEM;
for (i = 0; i < CAKE_MAX_TINS; i++) {
- struct cake_tin_data *b = q->tins + i;
+ struct cake_tin_data *b = qd->tins + i;
INIT_LIST_HEAD(&b->new_flows);
INIT_LIST_HEAD(&b->old_flows);
@@ -2784,22 +2871,32 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt,
INIT_LIST_HEAD(&flow->flowchain);
cobalt_vars_init(&flow->cvars);
- q->overflow_heap[k].t = i;
- q->overflow_heap[k].b = j;
+ qd->overflow_heap[k].t = i;
+ qd->overflow_heap[k].b = j;
b->overflow_idx[j] = k;
}
}
cake_reconfigure(sch);
- q->avg_peak_bandwidth = q->rate_bps;
- q->min_netlen = ~0;
- q->min_adjlen = ~0;
+ qd->avg_peak_bandwidth = q->rate_bps;
+ qd->min_netlen = ~0;
+ qd->min_adjlen = ~0;
+ qd->active_queues = 0;
+ qd->last_checked_active = 0;
+
return 0;
}
-static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
+static void cake_config_replace(struct Qdisc *sch, struct cake_sched_config *cfg)
+{
+ struct cake_sched_data *qd = qdisc_priv(sch);
+
+ qd->config = cfg;
+ cake_reconfigure(sch);
+}
+
+static int cake_config_dump(struct cake_sched_config *q, struct sk_buff *skb)
{
- struct cake_sched_data *q = qdisc_priv(sch);
struct nlattr *opts;
u16 rate_flags;
u8 flow_mode;
@@ -2875,6 +2972,13 @@ nla_put_failure:
return -1;
}
+static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct cake_sched_data *qd = qdisc_priv(sch);
+
+ return cake_config_dump(qd->config, skb);
+}
+
static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
@@ -2903,6 +3007,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
+ PUT_STAT_U32(ACTIVE_QUEUES, q->active_queues);
#undef PUT_STAT_U32
#undef PUT_STAT_U64
@@ -3136,14 +3241,133 @@ static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
};
MODULE_ALIAS_NET_SCH("cake");
+struct cake_mq_sched {
+ struct mq_sched mq_priv; /* must be first */
+ struct cake_sched_config cake_config;
+};
+
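The "must be first" comment above is load-bearing: the shared mq helpers exported later in this patch obtain their state via qdisc_priv() and treat it as a struct mq_sched, so embedding the base struct as the first member keeps that cast well-defined (the first member shares the address of its container). A minimal sketch of the pattern, with invented names:

	struct base_priv {
		int refs;
	};

	struct extended_priv {
		struct base_priv base;	/* must be first */
		int extra;
	};

	/* Helpers written against base_priv keep working on an
	 * extended_priv, because &ext->base == (void *)ext.
	 */
	static int base_refs(void *priv)
	{
		return ((struct base_priv *)priv)->refs;
	}
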
+static void cake_mq_destroy(struct Qdisc *sch)
+{
+ mq_destroy_common(sch);
+}
+
+static int cake_mq_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct cake_mq_sched *priv = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ int ret, ntx;
+ bool _unused;
+
+ cake_config_init(&priv->cake_config, true);
+ if (opt) {
+ ret = cake_config_change(&priv->cake_config, opt, extack, &_unused);
+ if (ret)
+ return ret;
+ }
+
+ ret = mq_init_common(sch, opt, extack, &cake_qdisc_ops);
+ if (ret)
+ return ret;
+
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++)
+ cake_config_replace(priv->mq_priv.qdiscs[ntx], &priv->cake_config);
+
+ return 0;
+}
+
+static int cake_mq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ struct cake_mq_sched *priv = qdisc_priv(sch);
+
+ mq_dump_common(sch, skb);
+ return cake_config_dump(&priv->cake_config, skb);
+}
+
+static int cake_mq_change(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ struct cake_mq_sched *priv = qdisc_priv(sch);
+ struct net_device *dev = qdisc_dev(sch);
+ bool overhead_changed = false;
+ unsigned int ntx;
+ int ret;
+
+ ret = cake_config_change(&priv->cake_config, opt, extack, &overhead_changed);
+ if (ret)
+ return ret;
+
+ for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
+ struct Qdisc *chld = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping);
+ struct cake_sched_data *qd = qdisc_priv(chld);
+
+ if (overhead_changed) {
+ qd->max_netlen = 0;
+ qd->max_adjlen = 0;
+ qd->min_netlen = ~0;
+ qd->min_adjlen = ~0;
+ }
+
+ if (qd->tins) {
+ sch_tree_lock(chld);
+ cake_reconfigure(chld);
+ sch_tree_unlock(chld);
+ }
+ }
+
+ return 0;
+}
+
+static int cake_mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
+ struct Qdisc **old, struct netlink_ext_ack *extack)
+{
+ NL_SET_ERR_MSG(extack, "can't replace cake_mq sub-qdiscs");
+ return -EOPNOTSUPP;
+}
+
+static const struct Qdisc_class_ops cake_mq_class_ops = {
+ .select_queue = mq_select_queue,
+ .graft = cake_mq_graft,
+ .leaf = mq_leaf,
+ .find = mq_find,
+ .walk = mq_walk,
+ .dump = mq_dump_class,
+ .dump_stats = mq_dump_class_stats,
+};
+
+static struct Qdisc_ops cake_mq_qdisc_ops __read_mostly = {
+ .cl_ops = &cake_mq_class_ops,
+ .id = "cake_mq",
+ .priv_size = sizeof(struct cake_mq_sched),
+ .init = cake_mq_init,
+ .destroy = cake_mq_destroy,
+ .attach = mq_attach,
+ .change = cake_mq_change,
+ .change_real_num_tx = mq_change_real_num_tx,
+ .dump = cake_mq_dump,
+ .owner = THIS_MODULE,
+};
+MODULE_ALIAS_NET_SCH("cake_mq");
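With the alias registered above, the new root qdisc is requested by name like any other. Assuming a matching iproute2 update (a hypothetical invocation, not part of this patch), attaching it would look like "tc qdisc replace dev eth0 root cake_mq bandwidth 1gbit": one cake instance per TX queue, all sharing the single configuration. That shared state is also why autorate-ingress is rejected for cake_mq earlier in this patch.
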
+
static int __init cake_module_init(void)
{
- return register_qdisc(&cake_qdisc_ops);
+ int ret;
+
+ ret = register_qdisc(&cake_qdisc_ops);
+ if (ret)
+ return ret;
+
+ ret = register_qdisc(&cake_mq_qdisc_ops);
+ if (ret)
+ unregister_qdisc(&cake_qdisc_ops);
+
+ return ret;
}
static void __exit cake_module_exit(void)
{
unregister_qdisc(&cake_qdisc_ops);
+ unregister_qdisc(&cake_mq_qdisc_ops);
}
module_init(cake_module_init)
@@ -3151,3 +3375,4 @@ module_exit(cake_module_exit)
MODULE_AUTHOR("Jonathan Morton");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DESCRIPTION("The CAKE shaper.");
+MODULE_IMPORT_NS("NET_SCHED_INTERNAL");
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 6e5f2f4f2415..80235e85f844 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -245,8 +245,6 @@ static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
static struct kmem_cache *fq_flow_cachep __read_mostly;
-/* limit number of collected flows per round */
-#define FQ_GC_MAX 8
#define FQ_GC_AGE (3*HZ)
static bool fq_gc_candidate(const struct fq_flow *f)
@@ -259,10 +257,9 @@ static void fq_gc(struct fq_sched_data *q,
struct rb_root *root,
struct sock *sk)
{
+ struct fq_flow *f, *tofree = NULL;
struct rb_node **p, *parent;
- void *tofree[FQ_GC_MAX];
- struct fq_flow *f;
- int i, fcnt = 0;
+ int fcnt;
p = &root->rb_node;
parent = NULL;
@@ -274,9 +271,8 @@ static void fq_gc(struct fq_sched_data *q,
break;
if (fq_gc_candidate(f)) {
- tofree[fcnt++] = f;
- if (fcnt == FQ_GC_MAX)
- break;
+ f->next = tofree;
+ tofree = f;
}
if (f->sk > sk)
@@ -285,18 +281,20 @@ static void fq_gc(struct fq_sched_data *q,
p = &parent->rb_left;
}
- if (!fcnt)
+ if (!tofree)
return;
- for (i = fcnt; i > 0; ) {
- f = tofree[--i];
+ fcnt = 0;
+ while (tofree) {
+ f = tofree;
+ tofree = f->next;
rb_erase(&f->fq_node, root);
+ kmem_cache_free(fq_flow_cachep, f);
+ fcnt++;
}
q->flows -= fcnt;
q->inactive_flows -= fcnt;
q->stat_gc_flows += fcnt;
-
- kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
}
/* Fast path can be used if :
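Dropping FQ_GC_MAX means one garbage-collection walk can now reclaim every expired flow found on the search path rather than at most eight, at the cost of freeing objects one at a time instead of via kmem_cache_free_bulk(). The collection list is intrusive: it reuses the flows' own next pointers, which are dead once a flow has been selected for removal. A minimal userspace sketch of the same deferred-free pattern (illustrative types, not from the patch):

	#include <stdlib.h>

	struct node {
		struct node *next;	/* reused as the free-list link */
		int key;
	};

	/* collection pass: push a condemned node onto the intrusive list */
	static void collect(struct node **tofree, struct node *n)
	{
		n->next = *tofree;	/* the field is dead once n is condemned */
		*tofree = n;
	}

	/* second pass: unlink and free each node, counting as we go */
	static int drain_free_list(struct node **tofree)
	{
		int freed = 0;

		while (*tofree) {
			struct node *n = *tofree;

			*tofree = n->next;	/* unlink before freeing */
			free(n);
			freed++;
		}
		return freed;
	}
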
@@ -665,7 +663,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
return NULL;
skb = fq_peek(&q->internal);
- if (unlikely(skb)) {
+ if (skb) {
q->internal.qlen--;
fq_dequeue_skb(sch, &q->internal, skb);
goto out;
@@ -716,7 +714,7 @@ begin:
}
prefetch(&skb->end);
fq_dequeue_skb(sch, f, skb);
- if ((s64)(now - time_next_packet - q->ce_threshold) > 0) {
+ if (unlikely((s64)(now - time_next_packet - q->ce_threshold) > 0)) {
INET_ECN_set_ce(skb);
q->stat_ce_mark++;
}
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 852e603c1755..98ffe64de51f 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -955,9 +955,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
__skb_queue_head_init(&sch->gso_skb);
__skb_queue_head_init(&sch->skb_bad_txq);
gnet_stats_basic_sync_init(&sch->bstats);
- lockdep_register_key(&sch->root_lock_key);
- spin_lock_init(&sch->q.lock);
- lockdep_set_class(&sch->q.lock, &sch->root_lock_key);
+ qdisc_lock_init(sch, ops);
if (ops->static_flags & TCQ_F_CPUSTATS) {
sch->cpu_bstats =
@@ -987,7 +985,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
return sch;
errout1:
- lockdep_unregister_key(&sch->root_lock_key);
+ qdisc_lock_uninit(sch, ops);
kfree(sch);
errout:
return ERR_PTR(err);
@@ -1076,7 +1074,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc)
if (ops->destroy)
ops->destroy(qdisc);
- lockdep_unregister_key(&qdisc->root_lock_key);
+ qdisc_lock_uninit(qdisc, ops);
bpf_module_put(ops, ops->owner);
netdev_put(dev, &qdisc->dev_tracker);
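The removed lines pin down what the new helpers must do in the default case. A plausible reconstruction, inferred here rather than quoted (the real definitions live elsewhere in this series, and the ops argument presumably lets individual qdiscs customize the lock class, which this sketch does not attempt):

	static inline void qdisc_lock_init(struct Qdisc *sch,
					   const struct Qdisc_ops *ops)
	{
		lockdep_register_key(&sch->root_lock_key);
		spin_lock_init(&sch->q.lock);
		lockdep_set_class(&sch->q.lock, &sch->root_lock_key);
	}

	static inline void qdisc_lock_uninit(struct Qdisc *sch,
					     const struct Qdisc_ops *ops)
	{
		lockdep_unregister_key(&sch->root_lock_key);
	}
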
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index c860119a8f09..bb94cd577943 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -15,11 +15,7 @@
#include <net/netlink.h>
#include <net/pkt_cls.h>
#include <net/pkt_sched.h>
-#include <net/sch_generic.h>
-
-struct mq_sched {
- struct Qdisc **qdiscs;
-};
+#include <net/sch_priv.h>
static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd)
{
@@ -49,23 +45,29 @@ static int mq_offload_stats(struct Qdisc *sch)
return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_MQ, &opt);
}
-static void mq_destroy(struct Qdisc *sch)
+void mq_destroy_common(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct mq_sched *priv = qdisc_priv(sch);
unsigned int ntx;
- mq_offload(sch, TC_MQ_DESTROY);
-
if (!priv->qdiscs)
return;
for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++)
qdisc_put(priv->qdiscs[ntx]);
kfree(priv->qdiscs);
}
+EXPORT_SYMBOL_NS_GPL(mq_destroy_common, "NET_SCHED_INTERNAL");
-static int mq_init(struct Qdisc *sch, struct nlattr *opt,
- struct netlink_ext_ack *extack)
+static void mq_destroy(struct Qdisc *sch)
+{
+ mq_offload(sch, TC_MQ_DESTROY);
+ mq_destroy_common(sch);
+}
+
+int mq_init_common(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack,
+ const struct Qdisc_ops *qdisc_ops)
{
struct net_device *dev = qdisc_dev(sch);
struct mq_sched *priv = qdisc_priv(sch);
@@ -87,7 +89,8 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt,
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
dev_queue = netdev_get_tx_queue(dev, ntx);
- qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx),
+ qdisc = qdisc_create_dflt(dev_queue,
+ qdisc_ops ?: get_default_qdisc_ops(dev, ntx),
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(ntx + 1)),
extack);
@@ -98,12 +101,24 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt,
}
sch->flags |= TCQ_F_MQROOT;
+ return 0;
+}
+EXPORT_SYMBOL_NS_GPL(mq_init_common, "NET_SCHED_INTERNAL");
+
+static int mq_init(struct Qdisc *sch, struct nlattr *opt,
+ struct netlink_ext_ack *extack)
+{
+ int ret;
+
+ ret = mq_init_common(sch, opt, extack, NULL);
+ if (ret)
+ return ret;
mq_offload(sch, TC_MQ_CREATE);
return 0;
}
-static void mq_attach(struct Qdisc *sch)
+void mq_attach(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct mq_sched *priv = qdisc_priv(sch);
@@ -124,8 +139,9 @@ static void mq_attach(struct Qdisc *sch)
kfree(priv->qdiscs);
priv->qdiscs = NULL;
}
+EXPORT_SYMBOL_NS_GPL(mq_attach, "NET_SCHED_INTERNAL");
-static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
+void mq_dump_common(struct Qdisc *sch, struct sk_buff *skb)
{
struct net_device *dev = qdisc_dev(sch);
struct Qdisc *qdisc;
@@ -152,7 +168,12 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
spin_unlock_bh(qdisc_lock(qdisc));
}
+}
+EXPORT_SYMBOL_NS_GPL(mq_dump_common, "NET_SCHED_INTERNAL");
+static int mq_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+ mq_dump_common(sch, skb);
return mq_offload_stats(sch);
}
@@ -166,11 +187,12 @@ static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl)
return netdev_get_tx_queue(dev, ntx);
}
-static struct netdev_queue *mq_select_queue(struct Qdisc *sch,
- struct tcmsg *tcm)
+struct netdev_queue *mq_select_queue(struct Qdisc *sch,
+ struct tcmsg *tcm)
{
return mq_queue_get(sch, TC_H_MIN(tcm->tcm_parent));
}
+EXPORT_SYMBOL_NS_GPL(mq_select_queue, "NET_SCHED_INTERNAL");
static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
struct Qdisc **old, struct netlink_ext_ack *extack)
@@ -198,14 +220,15 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
return 0;
}
-static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl)
+struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
return rtnl_dereference(dev_queue->qdisc_sleeping);
}
+EXPORT_SYMBOL_NS_GPL(mq_leaf, "NET_SCHED_INTERNAL");
-static unsigned long mq_find(struct Qdisc *sch, u32 classid)
+unsigned long mq_find(struct Qdisc *sch, u32 classid)
{
unsigned int ntx = TC_H_MIN(classid);
@@ -213,9 +236,10 @@ static unsigned long mq_find(struct Qdisc *sch, u32 classid)
return 0;
return ntx;
}
+EXPORT_SYMBOL_NS_GPL(mq_find, "NET_SCHED_INTERNAL");
-static int mq_dump_class(struct Qdisc *sch, unsigned long cl,
- struct sk_buff *skb, struct tcmsg *tcm)
+int mq_dump_class(struct Qdisc *sch, unsigned long cl,
+ struct sk_buff *skb, struct tcmsg *tcm)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
@@ -224,9 +248,10 @@ static int mq_dump_class(struct Qdisc *sch, unsigned long cl,
tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle;
return 0;
}
+EXPORT_SYMBOL_NS_GPL(mq_dump_class, "NET_SCHED_INTERNAL");
-static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
- struct gnet_dump *d)
+int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
+ struct gnet_dump *d)
{
struct netdev_queue *dev_queue = mq_queue_get(sch, cl);
@@ -236,8 +261,9 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
return -1;
return 0;
}
+EXPORT_SYMBOL_NS_GPL(mq_dump_class_stats, "NET_SCHED_INTERNAL");
-static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
+void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
struct net_device *dev = qdisc_dev(sch);
unsigned int ntx;
@@ -251,6 +277,7 @@ static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
break;
}
}
+EXPORT_SYMBOL_NS_GPL(mq_walk, "NET_SCHED_INTERNAL");
static const struct Qdisc_class_ops mq_class_ops = {
.select_queue = mq_select_queue,
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index f97f77b041d9..d8201eb3ac5f 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -3357,11 +3357,10 @@ int smc_create_clcsk(struct net *net, struct sock *sk, int family)
return 0;
}
-static int __smc_create(struct net *net, struct socket *sock, int protocol,
- int kern, struct socket *clcsock)
+static int smc_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
- struct smc_sock *smc;
struct sock *sk;
int rc;
@@ -3380,15 +3379,7 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol,
if (!sk)
goto out;
- /* create internal TCP socket for CLC handshake and fallback */
- smc = smc_sk(sk);
-
- rc = 0;
- if (clcsock)
- smc->clcsock = clcsock;
- else
- rc = smc_create_clcsk(net, sk, family);
-
+ rc = smc_create_clcsk(net, sk, family);
if (rc) {
sk_common_release(sk);
sock->sk = NULL;
@@ -3397,76 +3388,12 @@ out:
return rc;
}
-static int smc_create(struct net *net, struct socket *sock, int protocol,
- int kern)
-{
- return __smc_create(net, sock, protocol, kern, NULL);
-}
-
static const struct net_proto_family smc_sock_family_ops = {
.family = PF_SMC,
.owner = THIS_MODULE,
.create = smc_create,
};
-static int smc_ulp_init(struct sock *sk)
-{
- struct socket *tcp = sk->sk_socket;
- struct net *net = sock_net(sk);
- struct socket *smcsock;
- int protocol, ret;
-
- /* only TCP can be replaced */
- if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP ||
- (sk->sk_family != AF_INET && sk->sk_family != AF_INET6))
- return -ESOCKTNOSUPPORT;
- /* don't handle wq now */
- if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list)
- return -ENOTCONN;
-
- if (sk->sk_family == AF_INET)
- protocol = SMCPROTO_SMC;
- else
- protocol = SMCPROTO_SMC6;
-
- smcsock = sock_alloc();
- if (!smcsock)
- return -ENFILE;
-
- smcsock->type = SOCK_STREAM;
- __module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */
- ret = __smc_create(net, smcsock, protocol, 1, tcp);
- if (ret) {
- sock_release(smcsock); /* module_put() which ops won't be NULL */
- return ret;
- }
-
- /* replace tcp socket to smc */
- smcsock->file = tcp->file;
- smcsock->file->private_data = smcsock;
- smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */
- smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */
- tcp->file = NULL;
-
- return ret;
-}
-
-static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk,
- const gfp_t priority)
-{
- struct inet_connection_sock *icsk = inet_csk(newsk);
-
- /* don't inherit ulp ops to child when listen */
- icsk->icsk_ulp_ops = NULL;
-}
-
-static struct tcp_ulp_ops smc_ulp_ops __read_mostly = {
- .name = "smc",
- .owner = THIS_MODULE,
- .init = smc_ulp_init,
- .clone = smc_ulp_clone,
-};
-
unsigned int smc_net_id;
static __net_init int smc_net_init(struct net *net)
@@ -3589,16 +3516,10 @@ static int __init smc_init(void)
pr_err("%s: ib_register fails with %d\n", __func__, rc);
goto out_sock;
}
-
- rc = tcp_register_ulp(&smc_ulp_ops);
- if (rc) {
- pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
- goto out_ib;
- }
rc = smc_inet_init();
if (rc) {
pr_err("%s: smc_inet_init fails with %d\n", __func__, rc);
- goto out_ulp;
+ goto out_ib;
}
rc = bpf_smc_hs_ctrl_init();
if (rc) {
@@ -3610,8 +3531,6 @@ static int __init smc_init(void)
return 0;
out_inet:
smc_inet_exit();
-out_ulp:
- tcp_unregister_ulp(&smc_ulp_ops);
out_ib:
smc_ib_unregister_client();
out_sock:
@@ -3647,7 +3566,6 @@ static void __exit smc_exit(void)
{
static_branch_disable(&tcp_have_smc);
smc_inet_exit();
- tcp_unregister_ulp(&smc_ulp_ops);
sock_unregister(PF_SMC);
smc_core_exit();
smc_ib_unregister_client();
@@ -3672,7 +3590,6 @@ MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);
-MODULE_ALIAS_TCP_ULP("smc");
/* 256 for IPPROTO_SMC and 1 for SOCK_STREAM */
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 256, 1);
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c
index 970db62bd029..a3f9ca28c3d5 100644
--- a/net/tipc/crypto.c
+++ b/net/tipc/crypto.c
@@ -460,7 +460,7 @@ static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim)
rcu_read_lock();
tmp = rcu_dereference(aead);
if (tmp)
- atomic_add_unless(&rcu_dereference(aead)->users, -1, lim);
+ atomic_add_unless(&tmp->users, -1, lim);
rcu_read_unlock();
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index d0511225799b..f6d56e70c7a2 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1650,10 +1650,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr_unsized *uad
timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
- /* First of all allocate resources.
- * If we will make it after state is locked,
- * we will have to recheck all again in any case.
- */
+ err = prepare_peercred(&peercred);
+ if (err)
+ goto out;
/* create new sock for complete connection */
newsk = unix_create1(net, NULL, 0, sock->type);
@@ -1662,10 +1661,6 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr_unsized *uad
goto out;
}
- err = prepare_peercred(&peercred);
- if (err)
- goto out;
-
/* Allocate skb for sending to listening sock */
skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
if (!skb) {
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index a3505a4dcee0..20ad2b2dc17b 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -83,6 +83,50 @@
* TCP_ESTABLISHED - connected
* TCP_CLOSING - disconnecting
* TCP_LISTEN - listening
+ *
+ * - Namespaces in vsock support two different modes: "local" and "global".
+ * Each mode defines how the namespace interacts with CIDs.
+ * Each namespace exposes two sysctl files:
+ *
+ * - /proc/sys/net/vsock/ns_mode (read-only) reports the current namespace's
+ * mode, which is set at namespace creation and immutable thereafter.
+ * - /proc/sys/net/vsock/child_ns_mode (writable) controls what mode future
+ * child namespaces will inherit when created. The default is "global".
+ *
+ * Changing child_ns_mode only affects newly created namespaces, not the
+ * current namespace or existing children. At namespace creation, ns_mode
+ * is inherited from the parent's child_ns_mode.
+ *
+ * The init_net mode is "global" and cannot be modified.
+ *
+ * The modes affect the allocation and accessibility of CIDs as follows:
+ *
+ * - global - access and allocation are all system-wide
+ * - all CID allocation from global namespaces draw from the same
+ * system-wide pool.
+ * - if one global namespace has already allocated some CID, another
+ * global namespace will not be able to allocate the same CID.
+ * - global mode AF_VSOCK sockets can reach any VM or socket in any global
+ * namespace; they are not confined to their own namespace.
+ * - AF_VSOCK sockets in a global mode namespace cannot reach VMs or
+ * sockets in any local mode namespace.
+ * - local - access and allocation are contained within the namespace
+ * - CID allocation draws only from a private pool local to the
+ * namespace, and does not affect the CIDs available for allocation in any
+ * other namespace (global or local).
+ * - CIDs used by VMs in a local namespace do not collide with CIDs in any
+ * other local or global namespace. For example, if a VM in a local mode
+ * namespace is given CID 10, then CID 10 is still available for
+ * allocation in any other namespace, but not in the same namespace.
+ * - AF_VSOCK sockets in a local mode namespace can connect only to VMs or
+ * other sockets within their own namespace.
+ * - sockets bound to VMADDR_CID_ANY in local namespaces will never resolve
+ * to any transport that is not compatible with local mode. No error
+ * propagates to the user (as one does for connection attempts),
+ * because a packet may still reach this socket from a different
+ * transport that *does* support local mode. For example, virtio-vsock
+ * may not support local mode, but the socket may still accept a
+ * connection from vhost-vsock, which does.
*/
#include <linux/compat.h>
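The rules above boil down to a symmetric compatibility check between the namespace a socket lives in and the namespace a packet arrives from. A hedged sketch of what vsock_net_check_mode() plausibly enforces (the real helper is introduced elsewhere in this series and may differ in detail; a NULL packet namespace models a transport without namespace context, which the comments say is treated as global):

	static bool example_vsock_mode_compatible(struct net *sk_net,
						  struct net *pkt_net)
	{
		if (!pkt_net)
			return vsock_net_mode(sk_net) == VSOCK_NET_MODE_GLOBAL;

		/* local mode on either side confines traffic to one netns */
		if (vsock_net_mode(sk_net) == VSOCK_NET_MODE_LOCAL ||
		    vsock_net_mode(pkt_net) == VSOCK_NET_MODE_LOCAL)
			return net_eq(sk_net, pkt_net);

		return true;	/* global to global is system-wide */
	}
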
@@ -100,20 +144,31 @@
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/net.h>
+#include <linux/proc_fs.h>
#include <linux/poll.h>
#include <linux/random.h>
#include <linux/skbuff.h>
#include <linux/smp.h>
#include <linux/socket.h>
#include <linux/stddef.h>
+#include <linux/sysctl.h>
#include <linux/unistd.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <net/sock.h>
#include <net/af_vsock.h>
+#include <net/netns/vsock.h>
#include <uapi/linux/vm_sockets.h>
#include <uapi/asm-generic/ioctls.h>
+#define VSOCK_NET_MODE_STR_GLOBAL "global"
+#define VSOCK_NET_MODE_STR_LOCAL "local"
+
+/* 6 chars for "global", 1 for null-terminator, and 1 more for '\n'.
+ * The newline is added by proc_dostring() for read operations.
+ */
+#define VSOCK_NET_MODE_STR_MAX 8
+
static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr);
static void vsock_sk_destruct(struct sock *sk);
static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
@@ -235,33 +290,42 @@ static void __vsock_remove_connected(struct vsock_sock *vsk)
sock_put(&vsk->sk);
}
-static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr)
+static struct sock *__vsock_find_bound_socket_net(struct sockaddr_vm *addr,
+ struct net *net)
{
struct vsock_sock *vsk;
list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) {
- if (vsock_addr_equals_addr(addr, &vsk->local_addr))
- return sk_vsock(vsk);
+ struct sock *sk = sk_vsock(vsk);
+
+ if (vsock_addr_equals_addr(addr, &vsk->local_addr) &&
+ vsock_net_check_mode(sock_net(sk), net))
+ return sk;
if (addr->svm_port == vsk->local_addr.svm_port &&
(vsk->local_addr.svm_cid == VMADDR_CID_ANY ||
- addr->svm_cid == VMADDR_CID_ANY))
- return sk_vsock(vsk);
+ addr->svm_cid == VMADDR_CID_ANY) &&
+ vsock_net_check_mode(sock_net(sk), net))
+ return sk;
}
return NULL;
}
-static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src,
- struct sockaddr_vm *dst)
+static struct sock *
+__vsock_find_connected_socket_net(struct sockaddr_vm *src,
+ struct sockaddr_vm *dst, struct net *net)
{
struct vsock_sock *vsk;
list_for_each_entry(vsk, vsock_connected_sockets(src, dst),
connected_table) {
+ struct sock *sk = sk_vsock(vsk);
+
if (vsock_addr_equals_addr(src, &vsk->remote_addr) &&
- dst->svm_port == vsk->local_addr.svm_port) {
- return sk_vsock(vsk);
+ dst->svm_port == vsk->local_addr.svm_port &&
+ vsock_net_check_mode(sock_net(sk), net)) {
+ return sk;
}
}
@@ -304,12 +368,18 @@ void vsock_remove_connected(struct vsock_sock *vsk)
}
EXPORT_SYMBOL_GPL(vsock_remove_connected);
-struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
+/* Find a bound socket, filtering by namespace and namespace mode.
+ *
+ * Use this in transports that are namespace-aware and can provide the
+ * network namespace context.
+ */
+struct sock *vsock_find_bound_socket_net(struct sockaddr_vm *addr,
+ struct net *net)
{
struct sock *sk;
spin_lock_bh(&vsock_table_lock);
- sk = __vsock_find_bound_socket(addr);
+ sk = __vsock_find_bound_socket_net(addr, net);
if (sk)
sock_hold(sk);
@@ -317,15 +387,32 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
return sk;
}
+EXPORT_SYMBOL_GPL(vsock_find_bound_socket_net);
+
+/* Find a bound socket without namespace filtering.
+ *
+ * Use this in transports that lack namespace context. All sockets are
+ * treated as if in global mode.
+ */
+struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr)
+{
+ return vsock_find_bound_socket_net(addr, NULL);
+}
EXPORT_SYMBOL_GPL(vsock_find_bound_socket);
-struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
- struct sockaddr_vm *dst)
+/* Find a connected socket, filtering by namespace and namespace mode.
+ *
+ * Use this in transports that are namespace-aware and can provide the
+ * network namespace context.
+ */
+struct sock *vsock_find_connected_socket_net(struct sockaddr_vm *src,
+ struct sockaddr_vm *dst,
+ struct net *net)
{
struct sock *sk;
spin_lock_bh(&vsock_table_lock);
- sk = __vsock_find_connected_socket(src, dst);
+ sk = __vsock_find_connected_socket_net(src, dst, net);
if (sk)
sock_hold(sk);
@@ -333,6 +420,18 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
return sk;
}
+EXPORT_SYMBOL_GPL(vsock_find_connected_socket_net);
+
+/* Find a connected socket without namespace filtering.
+ *
+ * Use this in transports that lack namespace context. All sockets are
+ * treated as if in global mode.
+ */
+struct sock *vsock_find_connected_socket(struct sockaddr_vm *src,
+ struct sockaddr_vm *dst)
+{
+ return vsock_find_connected_socket_net(src, dst, NULL);
+}
EXPORT_SYMBOL_GPL(vsock_find_connected_socket);
void vsock_remove_sock(struct vsock_sock *vsk)
@@ -528,7 +627,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk)
if (sk->sk_type == SOCK_SEQPACKET) {
if (!new_transport->seqpacket_allow ||
- !new_transport->seqpacket_allow(remote_cid)) {
+ !new_transport->seqpacket_allow(vsk, remote_cid)) {
module_put(new_transport->module);
return -ESOCKTNOSUPPORT;
}
@@ -676,11 +775,11 @@ out:
static int __vsock_bind_connectible(struct vsock_sock *vsk,
struct sockaddr_vm *addr)
{
- static u32 port;
+ struct net *net = sock_net(sk_vsock(vsk));
struct sockaddr_vm new_addr;
- if (!port)
- port = get_random_u32_above(LAST_RESERVED_PORT);
+ if (!net->vsock.port)
+ net->vsock.port = get_random_u32_above(LAST_RESERVED_PORT);
vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port);
@@ -689,13 +788,13 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
unsigned int i;
for (i = 0; i < MAX_PORT_RETRIES; i++) {
- if (port == VMADDR_PORT_ANY ||
- port <= LAST_RESERVED_PORT)
- port = LAST_RESERVED_PORT + 1;
+ if (net->vsock.port == VMADDR_PORT_ANY ||
+ net->vsock.port <= LAST_RESERVED_PORT)
+ net->vsock.port = LAST_RESERVED_PORT + 1;
- new_addr.svm_port = port++;
+ new_addr.svm_port = net->vsock.port++;
- if (!__vsock_find_bound_socket(&new_addr)) {
+ if (!__vsock_find_bound_socket_net(&new_addr, net)) {
found = true;
break;
}
@@ -712,7 +811,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk,
return -EACCES;
}
- if (__vsock_find_bound_socket(&new_addr))
+ if (__vsock_find_bound_socket_net(&new_addr, net))
return -EADDRINUSE;
}
@@ -1314,7 +1413,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
goto out;
}
- if (!transport->dgram_allow(remote_addr->svm_cid,
+ if (!transport->dgram_allow(vsk, remote_addr->svm_cid,
remote_addr->svm_port)) {
err = -EINVAL;
goto out;
@@ -1355,7 +1454,7 @@ static int vsock_dgram_connect(struct socket *sock,
if (err)
goto out;
- if (!vsk->transport->dgram_allow(remote_addr->svm_cid,
+ if (!vsk->transport->dgram_allow(vsk, remote_addr->svm_cid,
remote_addr->svm_port)) {
err = -EINVAL;
goto out;
@@ -1585,7 +1684,7 @@ static int vsock_connect(struct socket *sock, struct sockaddr_unsized *addr,
* endpoints.
*/
if (!transport ||
- !transport->stream_allow(remote_addr->svm_cid,
+ !transport->stream_allow(vsk, remote_addr->svm_cid,
remote_addr->svm_port)) {
err = -ENETUNREACH;
goto out;
@@ -2662,6 +2761,180 @@ static struct miscdevice vsock_device = {
.fops = &vsock_device_ops,
};
+static int __vsock_net_mode_string(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos,
+ enum vsock_net_mode mode,
+ enum vsock_net_mode *new_mode)
+{
+ char data[VSOCK_NET_MODE_STR_MAX] = {0};
+ struct ctl_table tmp;
+ int ret;
+
+ if (!table->data || !table->maxlen || !*lenp) {
+ *lenp = 0;
+ return 0;
+ }
+
+ tmp = *table;
+ tmp.data = data;
+
+ if (!write) {
+ const char *p;
+
+ switch (mode) {
+ case VSOCK_NET_MODE_GLOBAL:
+ p = VSOCK_NET_MODE_STR_GLOBAL;
+ break;
+ case VSOCK_NET_MODE_LOCAL:
+ p = VSOCK_NET_MODE_STR_LOCAL;
+ break;
+ default:
+ WARN_ONCE(true, "netns has invalid vsock mode");
+ *lenp = 0;
+ return 0;
+ }
+
+ strscpy(data, p, sizeof(data));
+ tmp.maxlen = strlen(p);
+ }
+
+ ret = proc_dostring(&tmp, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
+ if (*lenp >= sizeof(data))
+ return -EINVAL;
+
+ if (!strncmp(data, VSOCK_NET_MODE_STR_GLOBAL, sizeof(data)))
+ *new_mode = VSOCK_NET_MODE_GLOBAL;
+ else if (!strncmp(data, VSOCK_NET_MODE_STR_LOCAL, sizeof(data)))
+ *new_mode = VSOCK_NET_MODE_LOCAL;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+
+static int vsock_net_mode_string(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct net *net;
+
+ if (write)
+ return -EPERM;
+
+ net = current->nsproxy->net_ns;
+
+ return __vsock_net_mode_string(table, write, buffer, lenp, ppos,
+ vsock_net_mode(net), NULL);
+}
+
+static int vsock_net_child_mode_string(const struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ enum vsock_net_mode new_mode;
+ struct net *net;
+ int ret;
+
+ net = current->nsproxy->net_ns;
+
+ ret = __vsock_net_mode_string(table, write, buffer, lenp, ppos,
+ vsock_net_child_mode(net), &new_mode);
+ if (ret)
+ return ret;
+
+ if (write)
+ vsock_net_set_child_mode(net, new_mode);
+
+ return 0;
+}
+
+static struct ctl_table vsock_table[] = {
+ {
+ .procname = "ns_mode",
+ .data = &init_net.vsock.mode,
+ .maxlen = VSOCK_NET_MODE_STR_MAX,
+ .mode = 0444,
+ .proc_handler = vsock_net_mode_string
+ },
+ {
+ .procname = "child_ns_mode",
+ .data = &init_net.vsock.child_ns_mode,
+ .maxlen = VSOCK_NET_MODE_STR_MAX,
+ .mode = 0644,
+ .proc_handler = vsock_net_child_mode_string
+ },
+};
+
+static int __net_init vsock_sysctl_register(struct net *net)
+{
+ struct ctl_table *table;
+
+ if (net_eq(net, &init_net)) {
+ table = vsock_table;
+ } else {
+ table = kmemdup(vsock_table, sizeof(vsock_table), GFP_KERNEL);
+ if (!table)
+ goto err_alloc;
+
+ table[0].data = &net->vsock.mode;
+ table[1].data = &net->vsock.child_ns_mode;
+ }
+
+ net->vsock.sysctl_hdr = register_net_sysctl_sz(net, "net/vsock", table,
+ ARRAY_SIZE(vsock_table));
+ if (!net->vsock.sysctl_hdr)
+ goto err_reg;
+
+ return 0;
+
+err_reg:
+ if (!net_eq(net, &init_net))
+ kfree(table);
+err_alloc:
+ return -ENOMEM;
+}
+
+static void vsock_sysctl_unregister(struct net *net)
+{
+ const struct ctl_table *table;
+
+ table = net->vsock.sysctl_hdr->ctl_table_arg;
+ unregister_net_sysctl_table(net->vsock.sysctl_hdr);
+ if (!net_eq(net, &init_net))
+ kfree(table);
+}
+
+static void vsock_net_init(struct net *net)
+{
+ if (net_eq(net, &init_net))
+ net->vsock.mode = VSOCK_NET_MODE_GLOBAL;
+ else
+ net->vsock.mode = vsock_net_child_mode(current->nsproxy->net_ns);
+
+ net->vsock.child_ns_mode = VSOCK_NET_MODE_GLOBAL;
+}
+
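Note the asymmetry in vsock_net_init(): a new namespace inherits its mode from the parent's child_ns_mode, but its own child_ns_mode always resets to "global". Local mode therefore does not propagate transitively. For example, writing "local" to /proc/sys/net/vsock/child_ns_mode in init_net makes the next created namespace local, but namespaces created from there start global again unless that namespace also opts its children in.
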
+static __net_init int vsock_sysctl_init_net(struct net *net)
+{
+ vsock_net_init(net);
+
+ if (vsock_sysctl_register(net))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static __net_exit void vsock_sysctl_exit_net(struct net *net)
+{
+ vsock_sysctl_unregister(net);
+}
+
+static struct pernet_operations vsock_sysctl_ops = {
+ .init = vsock_sysctl_init_net,
+ .exit = vsock_sysctl_exit_net,
+};
+
static int __init vsock_init(void)
{
int err = 0;
@@ -2689,10 +2962,17 @@ static int __init vsock_init(void)
goto err_unregister_proto;
}
+ if (register_pernet_subsys(&vsock_sysctl_ops)) {
+ err = -ENOMEM;
+ goto err_unregister_sock;
+ }
+
vsock_bpf_build_proto();
return 0;
+err_unregister_sock:
+ sock_unregister(AF_VSOCK);
err_unregister_proto:
proto_unregister(&vsock_proto);
err_deregister_misc:
@@ -2706,6 +2986,7 @@ static void __exit vsock_exit(void)
misc_deregister(&vsock_device);
sock_unregister(AF_VSOCK);
proto_unregister(&vsock_proto);
+ unregister_pernet_subsys(&vsock_sysctl_ops);
}
const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk)
diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c
index 432fcbbd14d4..c3010c874308 100644
--- a/net/vmw_vsock/hyperv_transport.c
+++ b/net/vmw_vsock/hyperv_transport.c
@@ -570,7 +570,7 @@ static int hvs_dgram_enqueue(struct vsock_sock *vsk,
return -EOPNOTSUPP;
}
-static bool hvs_dgram_allow(u32 cid, u32 port)
+static bool hvs_dgram_allow(struct vsock_sock *vsk, u32 cid, u32 port)
{
return false;
}
@@ -745,8 +745,11 @@ static bool hvs_stream_is_active(struct vsock_sock *vsk)
return hvs->chan != NULL;
}
-static bool hvs_stream_allow(u32 cid, u32 port)
+static bool hvs_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port)
{
+ if (!vsock_net_mode_global(vsk))
+ return false;
+
if (cid == VMADDR_CID_HOST)
return true;
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 8c867023a2e5..3f7ea2db9bd7 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -231,7 +231,7 @@ static int virtio_transport_send_skb_fast_path(struct virtio_vsock *vsock, struc
}
static int
-virtio_transport_send_pkt(struct sk_buff *skb)
+virtio_transport_send_pkt(struct sk_buff *skb, struct net *net)
{
struct virtio_vsock_hdr *hdr;
struct virtio_vsock *vsock;
@@ -536,7 +536,13 @@ static bool virtio_transport_msgzerocopy_allow(void)
return true;
}
-static bool virtio_transport_seqpacket_allow(u32 remote_cid);
+bool virtio_transport_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port)
+{
+ return vsock_net_mode_global(vsk);
+}
+
+static bool virtio_transport_seqpacket_allow(struct vsock_sock *vsk,
+ u32 remote_cid);
static struct virtio_transport virtio_transport = {
.transport = {
@@ -593,11 +599,15 @@ static struct virtio_transport virtio_transport = {
.can_msgzerocopy = virtio_transport_can_msgzerocopy,
};
-static bool virtio_transport_seqpacket_allow(u32 remote_cid)
+static bool
+virtio_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
{
struct virtio_vsock *vsock;
bool seqpacket_allow;
+ if (!vsock_net_mode_global(vsk))
+ return false;
+
seqpacket_allow = false;
rcu_read_lock();
vsock = rcu_dereference(the_virtio_vsock);
@@ -660,7 +670,11 @@ static void virtio_transport_rx_work(struct work_struct *work)
virtio_vsock_skb_put(skb, payload_len);
virtio_transport_deliver_tap_pkt(skb);
- virtio_transport_recv_pkt(&virtio_transport, skb);
+
+ /* Force virtio-transport into global mode since it
+ * does not yet support local-mode namespacing.
+ */
+ virtio_transport_recv_pkt(&virtio_transport, skb, NULL);
}
} while (!virtqueue_enable_cb(vq));
diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index d3e26025ef58..d017ab318a7e 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -414,7 +414,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
virtio_transport_inc_tx_pkt(vvs, skb);
- ret = t_ops->send_pkt(skb);
+ ret = t_ops->send_pkt(skb, info->net);
if (ret < 0)
break;
@@ -526,6 +526,7 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk)
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_CREDIT_UPDATE,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1055,12 +1056,6 @@ bool virtio_transport_stream_is_active(struct vsock_sock *vsk)
}
EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active);
-bool virtio_transport_stream_allow(u32 cid, u32 port)
-{
- return true;
-}
-EXPORT_SYMBOL_GPL(virtio_transport_stream_allow);
-
int virtio_transport_dgram_bind(struct vsock_sock *vsk,
struct sockaddr_vm *addr)
{
@@ -1068,7 +1063,7 @@ int virtio_transport_dgram_bind(struct vsock_sock *vsk,
}
EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind);
-bool virtio_transport_dgram_allow(u32 cid, u32 port)
+bool virtio_transport_dgram_allow(struct vsock_sock *vsk, u32 cid, u32 port)
{
return false;
}
@@ -1079,6 +1074,7 @@ int virtio_transport_connect(struct vsock_sock *vsk)
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_REQUEST,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1094,6 +1090,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode)
(mode & SEND_SHUTDOWN ?
VIRTIO_VSOCK_SHUTDOWN_SEND : 0),
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1120,6 +1117,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
.msg = msg,
.pkt_len = len,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1157,6 +1155,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
.op = VIRTIO_VSOCK_OP_RST,
.reply = !!skb,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
};
/* Send RST only if the original pkt is not a RST pkt */
@@ -1168,15 +1167,31 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
/* Normally packets are associated with a socket. There may be no socket if an
* attempt was made to connect to a socket that does not exist.
+ *
+ * net refers to the namespace of whoever sent the invalid message. For
+ * loopback, this is the namespace of the socket. For vhost, this is the
+ * namespace of the VM (i.e., vhost_vsock).
*/
static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct net *net)
{
struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
struct virtio_vsock_pkt_info info = {
.op = VIRTIO_VSOCK_OP_RST,
.type = le16_to_cpu(hdr->type),
.reply = true,
+
+ /* Set sk owner to socket we are replying to (may be NULL for
+ * non-loopback). This keeps a reference to the sock and
+ * sock_net(sk) until the reply skb is freed.
+ */
+ .vsk = vsock_sk(skb->sk),
+
+ /* net is not defined here because we pass it directly to
+ * t->send_pkt(), instead of relying on
+ * virtio_transport_send_pkt_info() to pass it. It is not needed
+ * by virtio_transport_alloc_skb().
+ */
};
struct sk_buff *reply;
@@ -1195,7 +1210,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
if (!reply)
return -ENOMEM;
- return t->send_pkt(reply);
+ return t->send_pkt(reply, net);
}
/* This function should be called with sk_lock held and SOCK_DONE set */
@@ -1479,6 +1494,7 @@ virtio_transport_send_response(struct vsock_sock *vsk,
.remote_port = le32_to_cpu(hdr->src_port),
.reply = true,
.vsk = vsk,
+ .net = sock_net(sk_vsock(vsk)),
};
return virtio_transport_send_pkt_info(vsk, &info);
@@ -1521,12 +1537,12 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
int ret;
if (le16_to_cpu(hdr->op) != VIRTIO_VSOCK_OP_REQUEST) {
- virtio_transport_reset_no_sock(t, skb);
+ virtio_transport_reset_no_sock(t, skb, sock_net(sk));
return -EINVAL;
}
if (sk_acceptq_is_full(sk)) {
- virtio_transport_reset_no_sock(t, skb);
+ virtio_transport_reset_no_sock(t, skb, sock_net(sk));
return -ENOMEM;
}
@@ -1534,13 +1550,13 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
* Subsequent enqueues would lead to a memory leak.
*/
if (sk->sk_shutdown == SHUTDOWN_MASK) {
- virtio_transport_reset_no_sock(t, skb);
+ virtio_transport_reset_no_sock(t, skb, sock_net(sk));
return -ESHUTDOWN;
}
child = vsock_create_connected(sk);
if (!child) {
- virtio_transport_reset_no_sock(t, skb);
+ virtio_transport_reset_no_sock(t, skb, sock_net(sk));
return -ENOMEM;
}
@@ -1562,7 +1578,7 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
*/
if (ret || vchild->transport != &t->transport) {
release_sock(child);
- virtio_transport_reset_no_sock(t, skb);
+ virtio_transport_reset_no_sock(t, skb, sock_net(sk));
sock_put(child);
return ret;
}
@@ -1590,7 +1606,7 @@ static bool virtio_transport_valid_type(u16 type)
* lock.
*/
void virtio_transport_recv_pkt(struct virtio_transport *t,
- struct sk_buff *skb)
+ struct sk_buff *skb, struct net *net)
{
struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
struct sockaddr_vm src, dst;
@@ -1613,24 +1629,24 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
le32_to_cpu(hdr->fwd_cnt));
if (!virtio_transport_valid_type(le16_to_cpu(hdr->type))) {
- (void)virtio_transport_reset_no_sock(t, skb);
+ (void)virtio_transport_reset_no_sock(t, skb, net);
goto free_pkt;
}
/* The socket must be in connected or bound table
* otherwise send reset back
*/
- sk = vsock_find_connected_socket(&src, &dst);
+ sk = vsock_find_connected_socket_net(&src, &dst, net);
if (!sk) {
- sk = vsock_find_bound_socket(&dst);
+ sk = vsock_find_bound_socket_net(&dst, net);
if (!sk) {
- (void)virtio_transport_reset_no_sock(t, skb);
+ (void)virtio_transport_reset_no_sock(t, skb, net);
goto free_pkt;
}
}
if (virtio_transport_get_type(sk) != le16_to_cpu(hdr->type)) {
- (void)virtio_transport_reset_no_sock(t, skb);
+ (void)virtio_transport_reset_no_sock(t, skb, net);
sock_put(sk);
goto free_pkt;
}
@@ -1649,7 +1665,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
*/
if (sock_flag(sk, SOCK_DONE) ||
(sk->sk_state != TCP_LISTEN && vsk->transport != &t->transport)) {
- (void)virtio_transport_reset_no_sock(t, skb);
+ (void)virtio_transport_reset_no_sock(t, skb, net);
release_sock(sk);
sock_put(sk);
goto free_pkt;
@@ -1681,7 +1697,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
kfree_skb(skb);
break;
default:
- (void)virtio_transport_reset_no_sock(t, skb);
+ (void)virtio_transport_reset_no_sock(t, skb, net);
kfree_skb(skb);
break;
}
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 7eccd6708d66..a64522be1bad 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -161,7 +161,7 @@ vmci_transport_packet_init(struct vmci_transport_packet *pkt,
case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ:
case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE:
- memcpy(&pkt->u.wait, wait, sizeof(pkt->u.wait));
+ pkt->u.wait = *wait;
break;
case VMCI_TRANSPORT_PACKET_TYPE_REQUEST2:
@@ -646,13 +646,17 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg)
return VMCI_SUCCESS;
}
-static bool vmci_transport_stream_allow(u32 cid, u32 port)
+static bool vmci_transport_stream_allow(struct vsock_sock *vsk, u32 cid,
+ u32 port)
{
static const u32 non_socket_contexts[] = {
VMADDR_CID_LOCAL,
};
int i;
+ if (!vsock_net_mode_global(vsk))
+ return false;
+
BUILD_BUG_ON(sizeof(cid) != sizeof(*non_socket_contexts));
for (i = 0; i < ARRAY_SIZE(non_socket_contexts); i++) {
@@ -682,12 +686,10 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg)
err = VMCI_SUCCESS;
bh_process_pkt = false;
- /* Ignore incoming packets from contexts without sockets, or resources
- * that aren't vsock implementations.
+ /* Ignore incoming packets from resources that aren't vsock
+ * implementations.
*/
-
- if (!vmci_transport_stream_allow(dg->src.context, -1)
- || vmci_transport_peer_rid(dg->src.context) != dg->src.resource)
+ if (vmci_transport_peer_rid(dg->src.context) != dg->src.resource)
return VMCI_ERROR_NO_ACCESS;
if (VMCI_DG_SIZE(dg) < sizeof(*pkt))
@@ -749,6 +751,12 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg)
goto out;
}
+ /* Ignore incoming packets from contexts without sockets. */
+ if (!vmci_transport_stream_allow(vsk, dg->src.context, -1)) {
+ err = VMCI_ERROR_NO_ACCESS;
+ goto out;
+ }
+
/* We do most everything in a work queue, but let's fast path the
* notification of reads and writes to help data transfer performance.
* We can only do this if there is no process context code executing
@@ -1784,8 +1792,12 @@ out:
return err;
}
-static bool vmci_transport_dgram_allow(u32 cid, u32 port)
+static bool vmci_transport_dgram_allow(struct vsock_sock *vsk, u32 cid,
+ u32 port)
{
+ if (!vsock_net_mode_global(vsk))
+ return false;
+
if (cid == VMADDR_CID_HYPERVISOR) {
/* Registrations of PBRPC Servers do not modify VMX/Hypervisor
* state and are allowed.
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index bc2ff918b315..8068d1b6e851 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -26,7 +26,7 @@ static u32 vsock_loopback_get_local_cid(void)
return VMADDR_CID_LOCAL;
}
-static int vsock_loopback_send_pkt(struct sk_buff *skb)
+static int vsock_loopback_send_pkt(struct sk_buff *skb, struct net *net)
{
struct vsock_loopback *vsock = &the_vsock_loopback;
int len = skb->len;
@@ -46,7 +46,15 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
return 0;
}
-static bool vsock_loopback_seqpacket_allow(u32 remote_cid);
+static bool vsock_loopback_seqpacket_allow(struct vsock_sock *vsk,
+ u32 remote_cid);
+
+static bool vsock_loopback_stream_allow(struct vsock_sock *vsk, u32 cid,
+ u32 port)
+{
+ return true;
+}
+
static bool vsock_loopback_msgzerocopy_allow(void)
{
return true;
@@ -76,7 +84,7 @@ static struct virtio_transport loopback_transport = {
.stream_has_space = virtio_transport_stream_has_space,
.stream_rcvhiwat = virtio_transport_stream_rcvhiwat,
.stream_is_active = virtio_transport_stream_is_active,
- .stream_allow = virtio_transport_stream_allow,
+ .stream_allow = vsock_loopback_stream_allow,
.seqpacket_dequeue = virtio_transport_seqpacket_dequeue,
.seqpacket_enqueue = virtio_transport_seqpacket_enqueue,
@@ -106,9 +114,10 @@ static struct virtio_transport loopback_transport = {
.send_pkt = vsock_loopback_send_pkt,
};
-static bool vsock_loopback_seqpacket_allow(u32 remote_cid)
+static bool
+vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid)
{
- return true;
+ return vsock_net_mode_global(vsk);
}
static void vsock_loopback_work(struct work_struct *work)
@@ -130,7 +139,8 @@ static void vsock_loopback_work(struct work_struct *work)
*/
virtio_transport_consume_skb_sent(skb, false);
virtio_transport_deliver_tap_pkt(skb);
- virtio_transport_recv_pkt(&loopback_transport, skb);
+ virtio_transport_recv_pkt(&loopback_transport, skb,
+ sock_net(skb->sk));
}
}
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 9a420d627d3c..9af85d655027 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -265,6 +265,8 @@ void cfg80211_stop_nan(struct cfg80211_registered_device *rdev,
rdev_stop_nan(rdev, wdev);
wdev->is_running = false;
+ eth_zero_addr(wdev->u.nan.cluster_id);
+
rdev->opencount--;
}
@@ -347,7 +349,7 @@ void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev)
guard(wiphy)(&rdev->wiphy);
- cfg80211_leave(rdev, wdev);
+ cfg80211_leave(rdev, wdev, -1);
cfg80211_remove_virtual_intf(rdev, wdev);
}
}
@@ -661,12 +663,8 @@ int wiphy_verify_iface_combinations(struct wiphy *wiphy,
c->limits[j].max > 1))
return -EINVAL;
- /* Only a single NAN can be allowed, avoid this
- * check for multi-radio global combination, since it
- * hold the capabilities of all radio combinations.
- */
- if (!combined_radio &&
- WARN_ON(types & BIT(NL80211_IFTYPE_NAN) &&
+ /* Only a single NAN can be allowed */
+ if (WARN_ON(types & BIT(NL80211_IFTYPE_NAN) &&
c->limits[j].max > 1))
return -EINVAL;
@@ -1371,7 +1369,8 @@ void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev,
}
void cfg80211_leave(struct cfg80211_registered_device *rdev,
- struct wireless_dev *wdev)
+ struct wireless_dev *wdev,
+ int link_id)
{
struct net_device *dev = wdev->netdev;
struct cfg80211_sched_scan_request *pos, *tmp;
@@ -1409,14 +1408,16 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev,
break;
case NL80211_IFTYPE_AP:
case NL80211_IFTYPE_P2P_GO:
- cfg80211_stop_ap(rdev, dev, -1, true);
+ cfg80211_stop_ap(rdev, dev, link_id, true);
break;
case NL80211_IFTYPE_OCB:
cfg80211_leave_ocb(rdev, dev);
break;
case NL80211_IFTYPE_P2P_DEVICE:
+ cfg80211_stop_p2p_device(rdev, wdev);
+ break;
case NL80211_IFTYPE_NAN:
- /* cannot happen, has no netdev */
+ cfg80211_stop_nan(rdev, wdev);
break;
case NL80211_IFTYPE_AP_VLAN:
case NL80211_IFTYPE_MONITOR:
@@ -1430,27 +1431,34 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev,
}
}
-void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev,
- gfp_t gfp)
+void cfg80211_stop_link(struct wiphy *wiphy, struct wireless_dev *wdev,
+ int link_id, gfp_t gfp)
{
struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
struct cfg80211_event *ev;
unsigned long flags;
- trace_cfg80211_stop_iface(wiphy, wdev);
+ /* Only AP/GO interfaces may have a specific link_id */
+ if (WARN_ON_ONCE(link_id != -1 &&
+ wdev->iftype != NL80211_IFTYPE_AP &&
+ wdev->iftype != NL80211_IFTYPE_P2P_GO))
+ link_id = -1;
+
+ trace_cfg80211_stop_link(wiphy, wdev, link_id);
ev = kzalloc(sizeof(*ev), gfp);
if (!ev)
return;
ev->type = EVENT_STOPPED;
+ ev->link_id = link_id;
spin_lock_irqsave(&wdev->event_lock, flags);
list_add_tail(&ev->list, &wdev->event_list);
spin_unlock_irqrestore(&wdev->event_lock, flags);
queue_work(cfg80211_wq, &rdev->event_work);
}
-EXPORT_SYMBOL(cfg80211_stop_iface);
+EXPORT_SYMBOL(cfg80211_stop_link);
void cfg80211_init_wdev(struct wireless_dev *wdev)
{
@@ -1589,7 +1597,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
break;
case NETDEV_GOING_DOWN:
scoped_guard(wiphy, &rdev->wiphy) {
- cfg80211_leave(rdev, wdev);
+ cfg80211_leave(rdev, wdev, -1);
cfg80211_remove_links(wdev);
}
/* since we just did cfg80211_leave() nothing to do there */
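
cfg80211_stop_link() keeps the old deferral shape of cfg80211_stop_iface(): allocate an event, stash the new link_id in it, splice it onto wdev->event_list under the lock, and let the worker replay it into cfg80211_leave(). A simplified userspace analogue, all names hypothetical:

    #include <pthread.h>
    #include <stdlib.h>

    /* Miniature of "record the link now, act in process context later".
     * In the patch the link_id is stored in struct cfg80211_event and
     * consumed by cfg80211_process_wdev_events().
     */
    struct stop_event {
    	int link_id; /* -1 = tear down the whole interface */
    	struct stop_event *next;
    };

    struct wdev {
    	pthread_mutex_t event_lock;
    	struct stop_event *events;
    };

    static void stop_link(struct wdev *w, int link_id)
    {
    	struct stop_event *ev = calloc(1, sizeof(*ev));

    	if (!ev)
    		return; /* same silent-drop policy as the kzalloc failure above */

    	ev->link_id = link_id;
    	pthread_mutex_lock(&w->event_lock);
    	ev->next = w->events; /* LIFO; the kernel list keeps FIFO order */
    	w->events = ev;
    	pthread_mutex_unlock(&w->event_lock);
    	/* real code then does queue_work(cfg80211_wq, &rdev->event_work) */
    }
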
diff --git a/net/wireless/core.h b/net/wireless/core.h
index 63dcf315dba7..6ac57b7b2615 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -289,6 +289,7 @@ struct cfg80211_event {
u8 td_bitmap_len;
} pa;
};
+ int link_id;
};
struct cfg80211_cached_keys {
@@ -537,7 +538,8 @@ void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev,
enum nl80211_iftype iftype, int num);
void cfg80211_leave(struct cfg80211_registered_device *rdev,
- struct wireless_dev *wdev);
+ struct wireless_dev *wdev,
+ int link_id);
void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev,
struct wireless_dev *wdev);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 03efd45c007f..6e58b238a1f8 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -5,7 +5,7 @@
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2015-2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2025 Intel Corporation
+ * Copyright (C) 2018-2026 Intel Corporation
*/
#include <linux/if.h>
@@ -332,6 +332,15 @@ static int validate_nan_cluster_id(const struct nlattr *attr,
return 0;
}
+static int validate_uhr_capa(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u8 *data = nla_data(attr);
+ unsigned int len = nla_len(attr);
+
+ return ieee80211_uhr_capa_size_ok(data, len, false);
+}
+
/* policy for the attributes */
static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR];
@@ -361,6 +370,7 @@ nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = {
[NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG },
[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK] = { .type = NLA_FLAG },
[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR] = { .type = NLA_U8 },
+ [NL80211_PMSR_FTM_REQ_ATTR_RSTA] = { .type = NLA_FLAG },
};
static const struct nla_policy
@@ -932,6 +942,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
NLA_POLICY_NESTED(nl80211_s1g_short_beacon),
[NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG },
[NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG },
+ [NL80211_ATTR_EPP_PEER] = { .type = NLA_FLAG },
+ [NL80211_ATTR_UHR_CAPABILITY] =
+ NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_uhr_capa, 255),
+ [NL80211_ATTR_DISABLE_UHR] = { .type = NLA_FLAG },
};
/* policy for the key attributes */
@@ -1314,6 +1328,12 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
if ((chan->flags & IEEE80211_CHAN_NO_16MHZ) &&
nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_16MHZ))
goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_NO_UHR) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHR))
+ goto nla_put_failure;
}
if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
@@ -1675,7 +1695,9 @@ static int nl80211_key_allowed(struct wireless_dev *wdev)
return -ENOLINK;
case NL80211_IFTYPE_STATION:
case NL80211_IFTYPE_P2P_CLIENT:
- if (wdev->connected)
+ if (wdev->connected ||
+ (wiphy_ext_feature_isset(wdev->wiphy,
+ NL80211_EXT_FEATURE_ASSOC_FRAME_ENCRYPTION)))
return 0;
return -ENOLINK;
case NL80211_IFTYPE_NAN:
@@ -1947,6 +1969,7 @@ nl80211_send_iftype_data(struct sk_buff *msg,
{
const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap;
const struct ieee80211_sta_eht_cap *eht_cap = &iftdata->eht_cap;
+ const struct ieee80211_sta_uhr_cap *uhr_cap = &iftdata->uhr_cap;
if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES,
iftdata->types_mask))
@@ -1998,6 +2021,14 @@ nl80211_send_iftype_data(struct sk_buff *msg,
return -ENOBUFS;
}
+ if (uhr_cap->has_uhr) {
+ if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_UHR_CAP_MAC,
+ sizeof(uhr_cap->mac), &uhr_cap->mac) ||
+ nla_put(msg, NL80211_BAND_IFTYPE_ATTR_UHR_CAP_PHY,
+ sizeof(uhr_cap->phy), &uhr_cap->phy))
+ return -ENOBUFS;
+ }
+
if (sband->band == NL80211_BAND_6GHZ &&
nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA,
sizeof(iftdata->he_6ghz_capa),
@@ -2307,6 +2338,32 @@ nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap,
if (cap->ftm.non_trigger_based &&
nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED))
return -ENOBUFS;
+ if (cap->ftm.support_6ghz &&
+ nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_6GHZ_SUPPORT))
+ return -ENOBUFS;
+ if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_LTF_REP,
+ cap->ftm.max_tx_ltf_rep))
+ return -ENOBUFS;
+ if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_LTF_REP,
+ cap->ftm.max_rx_ltf_rep))
+ return -ENOBUFS;
+ if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_STS,
+ cap->ftm.max_tx_sts))
+ return -ENOBUFS;
+ if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_STS,
+ cap->ftm.max_rx_sts))
+ return -ENOBUFS;
+ if (cap->ftm.max_total_ltf_tx > 0 &&
+ nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_TX,
+ cap->ftm.max_total_ltf_tx))
+ return -ENOBUFS;
+ if (cap->ftm.max_total_ltf_rx > 0 &&
+ nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_RX,
+ cap->ftm.max_total_ltf_rx))
+ return -ENOBUFS;
+ if (cap->ftm.support_rsta &&
+ nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_RSTA_SUPPORT))
+ return -ENOBUFS;
nla_nest_end(msg, ftm);
return 0;
@@ -6429,6 +6486,17 @@ static int nl80211_calculate_ap_params(struct cfg80211_ap_settings *params)
cap->datalen - 1))
return -EINVAL;
}
+
+ cap = cfg80211_find_ext_elem(WLAN_EID_EXT_UHR_OPER, ies, ies_len);
+ if (cap) {
+ if (!cap->datalen)
+ return -EINVAL;
+ params->uhr_oper = (void *)(cap->data + 1);
+ if (!ieee80211_uhr_oper_size_ok((const u8 *)params->uhr_oper,
+ cap->datalen - 1, true))
+ return -EINVAL;
+ }
+
return 0;
}
@@ -6470,6 +6538,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
auth_type == NL80211_AUTHTYPE_FILS_PK))
return false;
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_EPPKE) &&
+ auth_type == NL80211_AUTHTYPE_EPPKE)
+ return false;
return true;
case NL80211_CMD_CONNECT:
if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) &&
@@ -6487,6 +6559,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev,
NL80211_EXT_FEATURE_FILS_SK_OFFLOAD) &&
auth_type == NL80211_AUTHTYPE_FILS_SK)
return false;
+ if (!wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_EPPKE) &&
+ auth_type == NL80211_AUTHTYPE_EPPKE)
+ return false;
return true;
case NL80211_CMD_START_AP:
if (!wiphy_ext_feature_isset(&rdev->wiphy,
@@ -6552,6 +6628,9 @@ static int nl80211_validate_ap_phy_operation(struct cfg80211_ap_settings *params
(channel->flags & IEEE80211_CHAN_NO_EHT))
return -EOPNOTSUPP;
+ if (params->uhr_oper && (channel->flags & IEEE80211_CHAN_NO_UHR))
+ return -EOPNOTSUPP;
+
return 0;
}
@@ -7134,7 +7213,8 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr)
break;
case RATE_INFO_BW_EHT_RU:
rate_flg = 0;
- WARN_ON(!(info->flags & RATE_INFO_FLAGS_EHT_MCS));
+ WARN_ON(!(info->flags & RATE_INFO_FLAGS_EHT_MCS) &&
+ !(info->flags & RATE_INFO_FLAGS_UHR_MCS));
break;
}
@@ -7187,6 +7267,23 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr)
nla_put_u8(msg, NL80211_RATE_INFO_EHT_RU_ALLOC,
info->eht_ru_alloc))
return false;
+ } else if (info->flags & RATE_INFO_FLAGS_UHR_MCS) {
+ if (nla_put_u8(msg, NL80211_RATE_INFO_UHR_MCS, info->mcs))
+ return false;
+ if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_NSS, info->nss))
+ return false;
+ if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_GI, info->eht_gi))
+ return false;
+ if (info->bw == RATE_INFO_BW_EHT_RU &&
+ nla_put_u8(msg, NL80211_RATE_INFO_EHT_RU_ALLOC,
+ info->eht_ru_alloc))
+ return false;
+ if (info->flags & RATE_INFO_FLAGS_UHR_ELR_MCS &&
+ nla_put_flag(msg, NL80211_RATE_INFO_UHR_ELR))
+ return false;
+ if (info->flags & RATE_INFO_FLAGS_UHR_IM &&
+ nla_put_flag(msg, NL80211_RATE_INFO_UHR_IM))
+ return false;
}
nla_nest_end(msg, rate);
@@ -8060,7 +8157,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy,
if (params->ext_capab || params->link_sta_params.ht_capa ||
params->link_sta_params.vht_capa ||
params->link_sta_params.he_capa ||
- params->link_sta_params.eht_capa)
+ params->link_sta_params.eht_capa ||
+ params->link_sta_params.uhr_capa)
return -EINVAL;
if (params->sta_flags_mask & BIT(NL80211_STA_FLAG_SPP_AMSDU))
return -EINVAL;
@@ -8280,6 +8378,16 @@ static int nl80211_set_station_tdls(struct genl_info *info,
}
}
+ if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) {
+ if (!params->link_sta_params.eht_capa)
+ return -EINVAL;
+
+ params->link_sta_params.uhr_capa =
+ nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]);
+ params->link_sta_params.uhr_capa_len =
+ nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]);
+ }
+
if (info->attrs[NL80211_ATTR_S1G_CAPABILITY])
params->link_sta_params.s1g_capa =
nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]);
@@ -8600,6 +8708,16 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
}
}
+ if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) {
+ if (!params.link_sta_params.eht_capa)
+ return -EINVAL;
+
+ params.link_sta_params.uhr_capa =
+ nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]);
+ params.link_sta_params.uhr_capa_len =
+ nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]);
+ }
+
if (info->attrs[NL80211_ATTR_EML_CAPABILITY]) {
params.eml_cap_present = true;
params.eml_cap =
@@ -8659,10 +8777,11 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
params.link_sta_params.ht_capa = NULL;
params.link_sta_params.vht_capa = NULL;
- /* HE and EHT require WME */
+ /* HE, EHT and UHR require WME */
if (params.link_sta_params.he_capa_len ||
params.link_sta_params.he_6ghz_capa ||
- params.link_sta_params.eht_capa_len)
+ params.link_sta_params.eht_capa_len ||
+ params.link_sta_params.uhr_capa_len)
return -EINVAL;
}
@@ -8779,6 +8898,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
goto out;
}
}
+
+ params.epp_peer =
+ nla_get_flag(info->attrs[NL80211_ATTR_EPP_PEER]);
+
err = rdev_add_station(rdev, dev, mac_addr, &params);
out:
dev_put(params.vlan);
@@ -11953,7 +12076,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
if ((auth_type == NL80211_AUTHTYPE_SAE ||
auth_type == NL80211_AUTHTYPE_FILS_SK ||
auth_type == NL80211_AUTHTYPE_FILS_SK_PFS ||
- auth_type == NL80211_AUTHTYPE_FILS_PK) &&
+ auth_type == NL80211_AUTHTYPE_FILS_PK ||
+ auth_type == NL80211_AUTHTYPE_EPPKE) &&
!info->attrs[NL80211_ATTR_AUTH_DATA])
return -EINVAL;
@@ -11961,7 +12085,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info)
if (auth_type != NL80211_AUTHTYPE_SAE &&
auth_type != NL80211_AUTHTYPE_FILS_SK &&
auth_type != NL80211_AUTHTYPE_FILS_SK_PFS &&
- auth_type != NL80211_AUTHTYPE_FILS_PK)
+ auth_type != NL80211_AUTHTYPE_FILS_PK &&
+ auth_type != NL80211_AUTHTYPE_EPPKE)
return -EINVAL;
req.auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]);
req.auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]);
@@ -12329,6 +12454,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT]))
req.flags |= ASSOC_REQ_DISABLE_EHT;
+ if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_UHR]))
+ req.flags |= ASSOC_REQ_DISABLE_UHR;
+
if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK])
memcpy(&req.vht_capa_mask,
nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]),
@@ -13201,6 +13329,9 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT]))
connect.flags |= ASSOC_REQ_DISABLE_EHT;
+ if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_UHR]))
+ connect.flags |= ASSOC_REQ_DISABLE_UHR;
+
if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK])
memcpy(&connect.vht_capa_mask,
nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]),
@@ -15553,7 +15684,8 @@ static int nl80211_parse_nan_band_config(struct wiphy *wiphy,
static int nl80211_parse_nan_conf(struct wiphy *wiphy,
struct genl_info *info,
struct cfg80211_nan_conf *conf,
- u32 *changed_flags)
+ u32 *changed_flags,
+ bool start)
{
struct nlattr *attrs[NL80211_NAN_CONF_ATTR_MAX + 1];
int err, rem;
@@ -15600,7 +15732,7 @@ static int nl80211_parse_nan_conf(struct wiphy *wiphy,
return err;
changed |= CFG80211_NAN_CONF_CHANGED_CONFIG;
- if (attrs[NL80211_NAN_CONF_CLUSTER_ID])
+ if (attrs[NL80211_NAN_CONF_CLUSTER_ID] && start)
conf->cluster_id =
nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]);
@@ -15711,7 +15843,7 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF])
return -EINVAL;
- err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL);
+ err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL, true);
if (err)
return err;
@@ -16077,7 +16209,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb,
if (!wdev_running(wdev))
return -ENOTCONN;
- err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed);
+ err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed, false);
if (err)
return err;
@@ -16096,6 +16228,9 @@ void cfg80211_nan_match(struct wireless_dev *wdev,
struct sk_buff *msg;
void *hdr;
+ if (WARN_ON(wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE))
+ return;
+
if (WARN_ON(!match->inst_id || !match->peer_inst_id || !match->addr))
return;
@@ -16178,6 +16313,9 @@ void cfg80211_nan_func_terminated(struct wireless_dev *wdev,
struct nlattr *func_attr;
void *hdr;
+ if (WARN_ON(wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE))
+ return;
+
if (WARN_ON(!inst_id))
return;
@@ -17626,6 +17764,16 @@ nl80211_add_mod_link_station(struct sk_buff *skb, struct genl_info *info,
}
}
+ if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) {
+ if (!params.eht_capa)
+ return -EINVAL;
+
+ params.uhr_capa =
+ nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]);
+ params.uhr_capa_len =
+ nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]);
+ }
+
if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY])
params.he_6ghz_capa =
nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]);
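
Several nl80211 hunks hang together: NL80211_ATTR_UHR_CAPABILITY is checked with NLA_POLICY_VALIDATE_FN because its valid length depends on the bytes themselves, and every consumer additionally refuses UHR capabilities unless EHT capabilities were supplied first. A hedged sketch of what such a size-aware validator can look like; the bit layout is invented, not the actual ieee80211_uhr_capa_size_ok() logic:

    #include <errno.h>
    #include <stddef.h>

    /* Hypothetical mirror of a variable-length capability check: the
     * fixed part advertises optional trailing fields, so a constant
     * NLA_BINARY maximum alone cannot prove the buffer is well formed.
     */
    static int uhr_capa_size_ok(const unsigned char *data, size_t len)
    {
    	size_t need = 2; /* assumed fixed MAC+PHY header */

    	if (len < need)
    		return -EINVAL;
    	if (data[0] & 0x01) /* assumed "optional block present" flag */
    		need += 4;

    	return len >= need ? 0 : -EINVAL;
    }
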
diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c
index a117f5093ca2..60e1e31c2185 100644
--- a/net/wireless/pmsr.c
+++ b/net/wireless/pmsr.c
@@ -85,11 +85,6 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev,
return -EINVAL;
}
- out->ftm.burst_duration = 15;
- if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION])
- out->ftm.burst_duration =
- nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]);
-
out->ftm.ftms_per_burst = 0;
if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST])
out->ftm.ftms_per_burst =
@@ -164,6 +159,12 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev,
return -EINVAL;
}
+ if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION])
+ out->ftm.burst_duration =
+ nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]);
+ else if (!out->ftm.non_trigger_based && !out->ftm.trigger_based)
+ out->ftm.burst_duration = 15;
+
out->ftm.lmr_feedback =
!!tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK];
if (!out->ftm.trigger_based && !out->ftm.non_trigger_based &&
@@ -186,6 +187,21 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev,
nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]);
}
+ out->ftm.rsta = !!tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA];
+ if (out->ftm.rsta && !capa->ftm.support_rsta) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA],
+ "FTM: RSTA not supported by device");
+ return -EOPNOTSUPP;
+ }
+
+ if (out->ftm.rsta && !out->ftm.lmr_feedback) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA],
+ "FTM: RSTA set without LMR feedback");
+ return -EINVAL;
+ }
+
return 0;
}
@@ -453,6 +469,7 @@ static int nl80211_pmsr_send_ftm_res(struct sk_buff *msg,
PUT(u8, NUM_BURSTS_EXP, num_bursts_exp);
PUT(u8, BURST_DURATION, burst_duration);
PUT(u8, FTMS_PER_BURST, ftms_per_burst);
+ PUT(u16, BURST_PERIOD, burst_period);
PUTOPT(s32, RSSI_AVG, rssi_avg);
PUTOPT(s32, RSSI_SPREAD, rssi_spread);
if (res->ftm.tx_rate_valid &&
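
The pmsr changes move the burst-duration default so it no longer clobbers trigger-based requests, and make RSTA conditional on both device support and LMR feedback. The rules condense to something like the following sketch (not kernel code):

    #include <errno.h>
    #include <stdbool.h>

    /* Condensed form of the dependency rules the pmsr hunks enforce:
     * the legacy burst-duration default only applies to EDCA-based
     * requests (neither trigger-based nor non-trigger-based), and
     * RSTA needs both device support and LMR feedback.
     */
    struct ftm_req {
    	bool trigger_based, non_trigger_based, lmr_feedback, rsta;
    	bool have_burst_duration;
    	unsigned int burst_duration;
    };

    static int ftm_validate(struct ftm_req *r, bool dev_supports_rsta)
    {
    	if (!r->have_burst_duration &&
    	    !r->trigger_based && !r->non_trigger_based)
    		r->burst_duration = 15; /* default, as in the patch */

    	if (r->rsta && !dev_supports_rsta)
    		return -EOPNOTSUPP;
    	if (r->rsta && !r->lmr_feedback)
    		return -EINVAL;

    	return 0;
    }
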
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 73cab51f6379..139cb27e5a81 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -5,7 +5,7 @@
* Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2017 Intel Deutschland GmbH
- * Copyright (C) 2018 - 2025 Intel Corporation
+ * Copyright (C) 2018 - 2026 Intel Corporation
*
* Permission to use, copy, modify, and/or distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
@@ -1605,6 +1605,8 @@ static u32 map_regdom_flags(u32 rd_flags)
channel_flags |= IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP;
if (rd_flags & NL80211_RRF_ALLOW_20MHZ_ACTIVITY)
channel_flags |= IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY;
+ if (rd_flags & NL80211_RRF_NO_UHR)
+ channel_flags |= IEEE80211_CHAN_NO_UHR;
return channel_flags;
}
@@ -2332,8 +2334,17 @@ static void reg_process_ht_flags(struct wiphy *wiphy)
if (!wiphy)
return;
- for (band = 0; band < NUM_NL80211_BANDS; band++)
+ for (band = 0; band < NUM_NL80211_BANDS; band++) {
+ /*
+ * Don't apply HT flags to channels within the S1G band.
+ * Each bonded channel will instead be validated individually
+ * within cfg80211_s1g_usable().
+ */
+ if (band == NL80211_BAND_S1GHZ)
+ continue;
+
reg_process_ht_flags_band(wiphy, wiphy->bands[band]);
+ }
}
static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev)
@@ -2442,7 +2453,7 @@ static void reg_leave_invalid_chans(struct wiphy *wiphy)
list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list)
if (!reg_wdev_chan_valid(wiphy, wdev))
- cfg80211_leave(rdev, wdev);
+ cfg80211_leave(rdev, wdev, -1);
}
static void reg_check_chans_work(struct work_struct *work)
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 7546647752fd..eb0e77813d46 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1959,7 +1959,7 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev,
ether_addr_copy(known->parent_bssid, new->parent_bssid);
known->pub.max_bssid_indicator = new->pub.max_bssid_indicator;
known->pub.bssid_index = new->pub.bssid_index;
- known->pub.use_for &= new->pub.use_for;
+ known->pub.use_for = new->pub.use_for;
known->pub.cannot_use_reasons = new->pub.cannot_use_reasons;
known->bss_source = new->bss_source;
diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c
index 8d142856e385..2e0ea69b9604 100644
--- a/net/wireless/sysfs.c
+++ b/net/wireless/sysfs.c
@@ -88,7 +88,7 @@ static void cfg80211_leave_all(struct cfg80211_registered_device *rdev)
struct wireless_dev *wdev;
list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list)
- cfg80211_leave(rdev, wdev);
+ cfg80211_leave(rdev, wdev, -1);
}
static int wiphy_suspend(struct device *dev)
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 2b71f1d867a0..643ccf4f0227 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3915,19 +3915,22 @@ TRACE_EVENT(cfg80211_ft_event,
WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->target_ap)
);
-TRACE_EVENT(cfg80211_stop_iface,
- TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
- TP_ARGS(wiphy, wdev),
+TRACE_EVENT(cfg80211_stop_link,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
+ int link_id),
+ TP_ARGS(wiphy, wdev, link_id),
TP_STRUCT__entry(
WIPHY_ENTRY
WDEV_ENTRY
+ __field(int, link_id)
),
TP_fast_assign(
WIPHY_ASSIGN;
WDEV_ASSIGN;
+ __entry->link_id = link_id;
),
- TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT,
- WIPHY_PR_ARG, WDEV_PR_ARG)
+ TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %d",
+ WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id)
);
TRACE_EVENT(cfg80211_pmsr_report,
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 4f581aed45b7..404fe604a8db 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -5,7 +5,7 @@
* Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
* Copyright 2017 Intel Deutschland GmbH
- * Copyright (C) 2018-2023, 2025 Intel Corporation
+ * Copyright (C) 2018-2023, 2025-2026 Intel Corporation
*/
#include <linux/export.h>
#include <linux/bitops.h>
@@ -1144,7 +1144,8 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev)
ev->ij.channel);
break;
case EVENT_STOPPED:
- cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev);
+ cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev,
+ ev->link_id);
break;
case EVENT_PORT_AUTHORIZED:
__cfg80211_port_authorized(wdev, ev->pa.peer_addr,
@@ -1203,7 +1204,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
dev->ieee80211_ptr->use_4addr = false;
rdev_set_qos_map(rdev, dev, NULL);
- cfg80211_leave(rdev, dev->ieee80211_ptr);
+ cfg80211_leave(rdev, dev->ieee80211_ptr, -1);
cfg80211_process_rdev_events(rdev);
cfg80211_mlme_purge_registrations(dev->ieee80211_ptr);
@@ -1573,26 +1574,30 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate)
return result / 10000;
}
-static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
+static u32 _cfg80211_calculate_bitrate_eht_uhr(struct rate_info *rate)
{
#define SCALE 6144
- static const u32 mcs_divisors[16] = {
- 102399, /* 16.666666... */
- 51201, /* 8.333333... */
- 34134, /* 5.555555... */
- 25599, /* 4.166666... */
- 17067, /* 2.777777... */
- 12801, /* 2.083333... */
- 11377, /* 1.851725... */
- 10239, /* 1.666666... */
- 8532, /* 1.388888... */
- 7680, /* 1.250000... */
- 6828, /* 1.111111... */
- 6144, /* 1.000000... */
- 5690, /* 0.926106... */
- 5120, /* 0.833333... */
- 409600, /* 66.666666... */
- 204800, /* 33.333333... */
+ static const u32 mcs_divisors[] = {
+ [ 0] = 102399, /* 16.666666... */
+ [ 1] = 51201, /* 8.333333... */
+ [ 2] = 34134, /* 5.555555... */
+ [ 3] = 25599, /* 4.166666... */
+ [ 4] = 17067, /* 2.777777... */
+ [ 5] = 12801, /* 2.083333... */
+ [ 6] = 11377, /* 1.851725... */
+ [ 7] = 10239, /* 1.666666... */
+ [ 8] = 8532, /* 1.388888... */
+ [ 9] = 7680, /* 1.250000... */
+ [10] = 6828, /* 1.111111... */
+ [11] = 6144, /* 1.000000... */
+ [12] = 5690, /* 0.926106... */
+ [13] = 5120, /* 0.833333... */
+ [14] = 409600, /* 66.666666... */
+ [15] = 204800, /* 33.333333... */
+ [17] = 38400, /* 6.250180... */
+ [19] = 19200, /* 3.125090... */
+ [20] = 15360, /* 2.500000... */
+ [23] = 9600, /* 1.562545... */
};
static const u32 rates_996[3] = { 480388888, 453700000, 408333333 };
static const u32 rates_484[3] = { 229411111, 216666666, 195000000 };
@@ -1603,8 +1608,6 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
u64 tmp;
u32 result;
- if (WARN_ON_ONCE(rate->mcs > 15))
- return 0;
if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2))
return 0;
if (WARN_ON_ONCE(rate->eht_ru_alloc >
@@ -1685,7 +1688,7 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26)
result = rates_26[rate->eht_gi];
else {
- WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n",
+ WARN(1, "invalid EHT or UHR MCS: bw:%d, ru:%d\n",
rate->bw, rate->eht_ru_alloc);
return 0;
}
@@ -1699,11 +1702,64 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
tmp *= rate->nss;
do_div(tmp, 8);
+ /* and handle interference mitigation - 0.9x */
+ if (rate->flags & RATE_INFO_FLAGS_UHR_IM) {
+ if (WARN(rate->nss != 1 || rate->mcs == 15,
+ "invalid NSS or MCS for UHR IM\n"))
+ return 0;
+ tmp *= 9000;
+ do_div(tmp, 10000);
+ }
+
result = tmp;
return result / 10000;
}
+static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
+{
+ if (WARN_ONCE(rate->mcs > 15, "bad EHT MCS %d\n", rate->mcs))
+ return 0;
+
+ if (WARN_ONCE(rate->flags & (RATE_INFO_FLAGS_UHR_ELR_MCS |
+ RATE_INFO_FLAGS_UHR_IM),
+ "bad EHT MCS flags 0x%x\n", rate->flags))
+ return 0;
+
+ return _cfg80211_calculate_bitrate_eht_uhr(rate);
+}
+
+static u32 cfg80211_calculate_bitrate_uhr(struct rate_info *rate)
+{
+ if (rate->flags & RATE_INFO_FLAGS_UHR_ELR_MCS) {
+ WARN_ONCE(rate->eht_gi != NL80211_RATE_INFO_EHT_GI_1_6,
+ "bad UHR ELR guard interval %d\n",
+ rate->eht_gi);
+ WARN_ONCE(rate->mcs > 1, "bad UHR ELR MCS %d\n", rate->mcs);
+ WARN_ONCE(rate->nss != 1, "bad UHR ELR NSS %d\n", rate->nss);
+ WARN_ONCE(rate->bw != RATE_INFO_BW_20,
+ "bad UHR ELR bandwidth %d\n",
+ rate->bw);
+ WARN_ONCE(rate->flags & RATE_INFO_FLAGS_UHR_IM,
+ "bad UHR MCS flags 0x%x\n", rate->flags);
+ if (rate->mcs == 0)
+ return 17;
+ return 33;
+ }
+
+ switch (rate->mcs) {
+ case 0 ... 15:
+ case 17:
+ case 19:
+ case 20:
+ case 23:
+ return _cfg80211_calculate_bitrate_eht_uhr(rate);
+ }
+
+ WARN_ONCE(1, "bad UHR MCS %d\n", rate->mcs);
+ return 0;
+}
+
static u32 cfg80211_calculate_bitrate_s1g(struct rate_info *rate)
{
/* For 1, 2, 4, 8 and 16 MHz channels */
@@ -1828,6 +1884,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate)
return cfg80211_calculate_bitrate_he(rate);
if (rate->flags & RATE_INFO_FLAGS_EHT_MCS)
return cfg80211_calculate_bitrate_eht(rate);
+ if (rate->flags & RATE_INFO_FLAGS_UHR_MCS)
+ return cfg80211_calculate_bitrate_uhr(rate);
if (rate->flags & RATE_INFO_FLAGS_S1G_MCS)
return cfg80211_calculate_bitrate_s1g(rate);
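
The bitrate helpers stay in fixed point: each MCS maps to a divisor such that rate = base * SCALE / divisor, and the UHR interference-mitigation penalty of 0.9x is applied as *9000/10000 so everything remains integer arithmetic; per the hunk above, IM is only valid for nss == 1 and mcs != 15. A runnable worked example using the 996-tone base rate and the MCS 11 divisor from the table (final units are 100 kbit/s, if the usual cfg80211 convention holds):

    #include <stdint.h>
    #include <stdio.h>

    #define SCALE 6144

    int main(void)
    {
    	uint64_t tmp = 480388888ULL; /* rates_996[0] from the patch */
    	uint32_t divisor = 6144;     /* mcs_divisors[11]: factor 1.0 */
    	uint32_t nss = 1;            /* IM requires a single stream */

    	tmp *= SCALE;
    	tmp /= divisor;
    	tmp *= nss;
    	tmp /= 8;

    	/* interference mitigation: 0.9x without floating point */
    	tmp = tmp * 9000 / 10000;

    	printf("%llu\n", (unsigned long long)(tmp / 10000)); /* ~5404 */
    	return 0;
    }
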
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f093c3453f64..3b46bc635c43 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -543,9 +543,9 @@ static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
{
int ret;
- spin_lock(&pool->cq_cached_prod_lock);
+ spin_lock(&pool->cq->cq_cached_prod_lock);
ret = xskq_prod_reserve(pool->cq);
- spin_unlock(&pool->cq_cached_prod_lock);
+ spin_unlock(&pool->cq->cq_cached_prod_lock);
return ret;
}
@@ -619,9 +619,9 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
{
- spin_lock(&pool->cq_cached_prod_lock);
+ spin_lock(&pool->cq->cq_cached_prod_lock);
xskq_prod_cancel_n(pool->cq, n);
- spin_unlock(&pool->cq_cached_prod_lock);
+ spin_unlock(&pool->cq->cq_cached_prod_lock);
}
INDIRECT_CALLABLE_SCOPE
@@ -1349,6 +1349,13 @@ static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr
}
if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
+ /* One fill and completion ring required for each queue id. */
+ if (!xsk_validate_queues(xs)) {
+ err = -EINVAL;
+ sockfd_put(sock);
+ goto out_unlock;
+ }
+
/* Share the umem with another socket on another qid
* and/or device.
*/
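
xsk_bind() now rejects a shared-umem bind to a different dev/queue pair before any state is shared, replacing the later check in xp_assign_dev_shared(). A sketch of the intended ordering; xsk_validate_queues() is the patch's helper, and its body here is an assumption inferred from the check it replaces:

    #include <errno.h>
    #include <stdbool.h>

    struct xdp_sock_sketch {
    	bool has_fq, has_cq; /* fill and completion rings supplied? */
    };

    static bool xsk_validate_queues_sketch(const struct xdp_sock_sketch *xs)
    {
    	return xs->has_fq && xs->has_cq;
    }

    static int bind_shared_umem(const struct xdp_sock_sketch *xs,
    			    bool same_dev_and_qid)
    {
    	if (!same_dev_and_qid && !xsk_validate_queues_sketch(xs))
    		return -EINVAL; /* reject early, before sharing the umem */
    	return 0;
    }
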
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 51526034c42a..cd5125b6af53 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -91,7 +91,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
INIT_LIST_HEAD(&pool->xsk_tx_list);
spin_lock_init(&pool->xsk_tx_list_lock);
spin_lock_init(&pool->cq_prod_lock);
- spin_lock_init(&pool->cq_cached_prod_lock);
+ spin_lock_init(&xs->cq_tmp->cq_cached_prod_lock);
refcount_set(&pool->users, 1);
pool->fq = xs->fq_tmp;
@@ -247,10 +247,6 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs,
u16 flags;
struct xdp_umem *umem = umem_xs->umem;
- /* One fill and completion ring required for each queue id. */
- if (!pool->fq || !pool->cq)
- return -EINVAL;
-
flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY;
if (umem_xs->pool->uses_need_wakeup)
flags |= XDP_USE_NEED_WAKEUP;
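
The xsk hunks above and the xsk_queue.h change below are two halves of one move: the cached-producer lock migrates from struct xsk_buff_pool into the completion queue itself (initialized via xs->cq_tmp, taken as pool->cq->cq_cached_prod_lock), so pools that share one cq serialize on a single lock. A simplified stand-in:

    #include <pthread.h>

    /* The lock lives in the possibly-shared queue, not in each pool,
     * so every pool pointing at the same cq contends on one lock.
     */
    struct xsk_queue {
    	pthread_mutex_t cq_cached_prod_lock;
    	unsigned int cached_prod;
    };

    struct xsk_buff_pool {
    	struct xsk_queue *cq; /* may be shared between pools */
    };

    static int cq_reserve(struct xsk_buff_pool *pool)
    {
    	pthread_mutex_lock(&pool->cq->cq_cached_prod_lock);
    	pool->cq->cached_prod++; /* stand-in for xskq_prod_reserve(pool->cq) */
    	pthread_mutex_unlock(&pool->cq->cq_cached_prod_lock);
    	return 0;
    }
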
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 1eb8d9f8b104..ec08d9c102b1 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -46,6 +46,11 @@ struct xsk_queue {
u64 invalid_descs;
u64 queue_empty_descs;
size_t ring_vmalloc_size;
+ /* Mutual exclusion for the completion ring in SKB mode.
+ * Protects the cached producer when sockets share a single cq
+ * for the same netdev and queue id.
+ */
+ spinlock_t cq_cached_prod_lock;
};
struct parsed_desc {