Diffstat (limited to 'net')
207 files changed, 7130 insertions, 3474 deletions
diff --git a/net/802/Makefile b/net/802/Makefile index 99abc29d537c..9503ef6b2e06 100644 --- a/net/802/Makefile +++ b/net/802/Makefile @@ -6,7 +6,6 @@ obj-$(CONFIG_LLC) += psnap.o obj-$(CONFIG_NET_FC) += fc.o obj-$(CONFIG_FDDI) += fddi.o -obj-$(CONFIG_HIPPI) += hippi.o obj-$(CONFIG_ATALK) += psnap.o obj-$(CONFIG_STP) += stp.o obj-$(CONFIG_GARP) += garp.o diff --git a/net/802/hippi.c b/net/802/hippi.c deleted file mode 100644 index 1997b7dd265e..000000000000 --- a/net/802/hippi.c +++ /dev/null @@ -1,193 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * INET An implementation of the TCP/IP protocol suite for the LINUX - * operating system. INET is implemented using the BSD Socket - * interface as the means of communication with the user level. - * - * HIPPI-type device handling. - * - * Version: @(#)hippi.c 1.0.0 05/29/97 - * - * Authors: Ross Biro - * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> - * Mark Evans, <evansmp@uhura.aston.ac.uk> - * Florian La Roche, <rzsfl@rz.uni-sb.de> - * Alan Cox, <gw4pts@gw4pts.ampr.org> - * Jes Sorensen, <Jes.Sorensen@cern.ch> - */ - -#include <linux/module.h> -#include <linux/types.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/mm.h> -#include <linux/socket.h> -#include <linux/in.h> -#include <linux/inet.h> -#include <linux/netdevice.h> -#include <linux/hippidevice.h> -#include <linux/skbuff.h> -#include <linux/errno.h> -#include <net/arp.h> -#include <net/sock.h> -#include <linux/uaccess.h> - -/* - * Create the HIPPI MAC header for an arbitrary protocol layer - * - * saddr=NULL means use device source address - * daddr=NULL means leave destination address (eg unresolved arp) - */ - -static int hippi_header(struct sk_buff *skb, struct net_device *dev, - unsigned short type, - const void *daddr, const void *saddr, unsigned int len) -{ - struct hippi_hdr *hip = skb_push(skb, HIPPI_HLEN); - struct hippi_cb *hcb = (struct hippi_cb *) skb->cb; - - if (!len){ - len = skb->len - HIPPI_HLEN; - printk("hippi_header(): length not supplied\n"); - } - - /* - * Due to the stupidity of the little endian byte-order we - * have to set the fp field this way. - */ - hip->fp.fixed = htonl(0x04800018); - hip->fp.d2_size = htonl(len + 8); - hip->le.fc = 0; - hip->le.double_wide = 0; /* only HIPPI 800 for the time being */ - hip->le.message_type = 0; /* Data PDU */ - - hip->le.dest_addr_type = 2; /* 12 bit SC address */ - hip->le.src_addr_type = 2; /* 12 bit SC address */ - - memcpy(hip->le.src_switch_addr, dev->dev_addr + 3, 3); - memset_startat(&hip->le, 0, reserved); - - hip->snap.dsap = HIPPI_EXTENDED_SAP; - hip->snap.ssap = HIPPI_EXTENDED_SAP; - hip->snap.ctrl = HIPPI_UI_CMD; - hip->snap.oui[0] = 0x00; - hip->snap.oui[1] = 0x00; - hip->snap.oui[2] = 0x00; - hip->snap.ethertype = htons(type); - - if (daddr) - { - memcpy(hip->le.dest_switch_addr, daddr + 3, 3); - memcpy(&hcb->ifield, daddr + 2, 4); - return HIPPI_HLEN; - } - hcb->ifield = 0; - return -((int)HIPPI_HLEN); -} - - -/* - * Determine the packet's protocol ID. - */ - -__be16 hippi_type_trans(struct sk_buff *skb, struct net_device *dev) -{ - struct hippi_hdr *hip; - - /* - * This is actually wrong ... question is if we really should - * set the raw address here. - */ - skb->dev = dev; - skb_reset_mac_header(skb); - hip = (struct hippi_hdr *)skb_mac_header(skb); - skb_pull(skb, HIPPI_HLEN); - - /* - * No fancy promisc stuff here now. 
- */ - - return hip->snap.ethertype; -} - -EXPORT_SYMBOL(hippi_type_trans); - -/* - * For HIPPI we will actually use the lower 4 bytes of the hardware - * address as the I-FIELD rather than the actual hardware address. - */ -int hippi_mac_addr(struct net_device *dev, void *p) -{ - struct sockaddr *addr = p; - if (netif_running(dev)) - return -EBUSY; - dev_addr_set(dev, addr->sa_data); - return 0; -} -EXPORT_SYMBOL(hippi_mac_addr); - -int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p) -{ - /* Never send broadcast/multicast ARP messages */ - NEIGH_VAR_INIT(p, MCAST_PROBES, 0); - - /* In IPv6 unicast probes are valid even on NBMA, - * because they are encapsulated in normal IPv6 protocol. - * Should be a generic flag. - */ - if (p->tbl->family != AF_INET6) - NEIGH_VAR_INIT(p, UCAST_PROBES, 0); - return 0; -} -EXPORT_SYMBOL(hippi_neigh_setup_dev); - -static const struct header_ops hippi_header_ops = { - .create = hippi_header, -}; - - -static void hippi_setup(struct net_device *dev) -{ - dev->header_ops = &hippi_header_ops; - - /* - * We don't support HIPPI `ARP' for the time being, and probably - * never will unless someone else implements it. However we - * still need a fake ARPHRD to make ifconfig and friends play ball. - */ - dev->type = ARPHRD_HIPPI; - dev->hard_header_len = HIPPI_HLEN; - dev->mtu = 65280; - dev->min_mtu = 68; - dev->max_mtu = 65280; - dev->addr_len = HIPPI_ALEN; - dev->tx_queue_len = 25 /* 5 */; - memset(dev->broadcast, 0xFF, HIPPI_ALEN); - - - /* - * HIPPI doesn't support broadcast+multicast and we only use - * static ARP tables. ARP is disabled by hippi_neigh_setup_dev. - */ - dev->flags = 0; -} - -/** - * alloc_hippi_dev - Register HIPPI device - * @sizeof_priv: Size of additional driver-private structure to be allocated - * for this HIPPI device - * - * Fill in the fields of the device structure with HIPPI-generic values. - * - * Constructs a new net device, complete with a private data area of - * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for - * this private data area. - */ - -struct net_device *alloc_hippi_dev(int sizeof_priv) -{ - return alloc_netdev(sizeof_priv, "hip%d", NET_NAME_UNKNOWN, - hippi_setup); -} - -EXPORT_SYMBOL(alloc_hippi_dev); diff --git a/net/atm/signaling.c b/net/atm/signaling.c index e70ae2c113f9..358fbe5e4d1d 100644 --- a/net/atm/signaling.c +++ b/net/atm/signaling.c @@ -22,6 +22,36 @@ struct atm_vcc *sigd = NULL; +/* + * find_get_vcc - validate and get a reference to a vcc pointer + * @vcc: the vcc pointer to validate + * + * This function validates that @vcc points to a registered VCC in vcc_hash. + * If found, it increments the socket reference count and returns the vcc. + * The caller must call sock_put(sk_atm(vcc)) when done. + * + * Returns the vcc pointer if valid, NULL otherwise. 
+ */ +static struct atm_vcc *find_get_vcc(struct atm_vcc *vcc) +{ + int i; + + read_lock(&vcc_sklist_lock); + for (i = 0; i < VCC_HTABLE_SIZE; i++) { + struct sock *s; + + sk_for_each(s, &vcc_hash[i]) { + if (atm_sk(s) == vcc) { + sock_hold(s); + read_unlock(&vcc_sklist_lock); + return vcc; + } + } + } + read_unlock(&vcc_sklist_lock); + return NULL; +} + static void sigd_put_skb(struct sk_buff *skb) { if (!sigd) { @@ -69,7 +99,14 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb) msg = (struct atmsvc_msg *) skb->data; WARN_ON(refcount_sub_and_test(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc)); - vcc = *(struct atm_vcc **) &msg->vcc; + + vcc = find_get_vcc(*(struct atm_vcc **)&msg->vcc); + if (!vcc) { + pr_debug("invalid vcc pointer in msg\n"); + dev_kfree_skb(skb); + return -EINVAL; + } + pr_debug("%d (0x%lx)\n", (int)msg->type, (unsigned long)vcc); sk = sk_atm(vcc); @@ -100,7 +137,16 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb) clear_bit(ATM_VF_WAITING, &vcc->flags); break; case as_indicate: - vcc = *(struct atm_vcc **)&msg->listen_vcc; + /* Release the reference from msg->vcc, we'll use msg->listen_vcc instead */ + sock_put(sk); + + vcc = find_get_vcc(*(struct atm_vcc **)&msg->listen_vcc); + if (!vcc) { + pr_debug("invalid listen_vcc pointer in msg\n"); + dev_kfree_skb(skb); + return -EINVAL; + } + sk = sk_atm(vcc); pr_debug("as_indicate!!!\n"); lock_sock(sk); @@ -115,6 +161,8 @@ static int sigd_send(struct atm_vcc *vcc, struct sk_buff *skb) sk->sk_state_change(sk); as_indicate_complete: release_sock(sk); + /* Paired with find_get_vcc(msg->listen_vcc) above */ + sock_put(sk); return 0; case as_close: set_bit(ATM_VF_RELEASED, &vcc->flags); @@ -131,11 +179,15 @@ as_indicate_complete: break; default: pr_alert("bad message type %d\n", (int)msg->type); + /* Paired with find_get_vcc(msg->vcc) above */ + sock_put(sk); return -EINVAL; } sk->sk_state_change(sk); out: dev_kfree_skb(skb); + /* Paired with find_get_vcc(msg->vcc) above */ + sock_put(sk); return 0; } diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig index e23a3dc14b93..310169ce1488 100644 --- a/net/ax25/Kconfig +++ b/net/ax25/Kconfig @@ -63,20 +63,6 @@ config AX25_DAMA_SLAVE be enabled at runtime. For more about DAMA see <https://linux-ax25.in-berlin.de>. If unsure, say Y. -# placeholder until implemented -config AX25_DAMA_MASTER - bool 'AX.25 DAMA Master support' - depends on AX25_DAMA_SLAVE && BROKEN - help - DAMA is a mechanism to prevent collisions when doing AX.25 - networking. A DAMA server (called "master") accepts incoming traffic - from clients (called "slaves") and redistributes it to other slaves. - If you say Y here, your Linux box will act as a DAMA master; this is - transparent in that you don't have to do any special DAMA - configuration. Linux cannot yet act as a DAMA server. This option - only compiles DAMA slave support into the kernel. It still needs to - be explicitly enabled, so if unsure, say Y. 
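The net/atm/signaling.c hunks above stop trusting the raw vcc pointer carried in userspace messages: find_get_vcc() validates it against vcc_hash and takes a socket reference that every exit path must drop. A minimal sketch of that take/put pairing, assuming a caller living in the same file (find_get_vcc() is static there); illustrative only, not the actual sigd_send() flow:

/* Illustrative only: pair every successful find_get_vcc() with sock_put(). */
#include <linux/atmdev.h>
#include <net/sock.h>

static int example_handle_msg_vcc(struct atm_vcc *unchecked)
{
        struct atm_vcc *vcc;
        struct sock *sk;

        vcc = find_get_vcc(unchecked);  /* validated against vcc_hash, sk held */
        if (!vcc)
                return -EINVAL;         /* stale or bogus pointer from userspace */

        sk = sk_atm(vcc);
        /* ... operate on vcc/sk while the reference is held ... */

        sock_put(sk);                   /* pairs with the hold in find_get_vcc() */
        return 0;
}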
- config NETROM tristate "Amateur Radio NET/ROM protocol" depends on AX25 diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c index 3733c0254a50..c504ed9c3a88 100644 --- a/net/ax25/ax25_dev.c +++ b/net/ax25/ax25_dev.c @@ -82,9 +82,7 @@ void ax25_dev_device_up(struct net_device *dev) #ifdef CONFIG_AX25_DAMA_SLAVE ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT; -#endif -#if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) ax25_ds_setup_timer(ax25_dev); #endif diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index c3f7828bf9d5..0795818963a5 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1002,12 +1002,18 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, switch (type) { case ACL_LINK: conn->pkt_type = hdev->pkt_type & ACL_PTYPE_MASK; + conn->link_policy = hdev->link_policy; conn->mtu = hdev->acl_mtu; break; case LE_LINK: /* conn->src should reflect the local identity address */ hci_copy_identity_address(hdev, &conn->src, &conn->src_type); conn->mtu = hdev->le_mtu ? hdev->le_mtu : hdev->acl_mtu; + /* Use the controller supported PHYS as default until the + * remote features are resolved. + */ + conn->le_tx_def_phys = hdev->le_tx_def_phys; + conn->le_rx_def_phys = hdev->le_tx_def_phys; break; case CIS_LINK: /* conn->src should reflect the local identity address */ @@ -1819,7 +1825,7 @@ static int hci_le_create_big(struct hci_conn *conn, struct bt_iso_qos *qos) cp.bis.sdu = cpu_to_le16(qos->bcast.out.sdu); cp.bis.latency = cpu_to_le16(qos->bcast.out.latency); cp.bis.rtn = qos->bcast.out.rtn; - cp.bis.phy = qos->bcast.out.phy; + cp.bis.phy = qos->bcast.out.phys; cp.bis.packing = qos->bcast.packing; cp.bis.framing = qos->bcast.framing; cp.bis.encryption = qos->bcast.encryption; @@ -1869,10 +1875,10 @@ static int set_cig_params_sync(struct hci_dev *hdev, void *data) cis->cis_id = cis_id; cis->c_sdu = cpu_to_le16(conn->iso_qos.ucast.out.sdu); cis->p_sdu = cpu_to_le16(conn->iso_qos.ucast.in.sdu); - cis->c_phy = qos->ucast.out.phy ? qos->ucast.out.phy : - qos->ucast.in.phy; - cis->p_phy = qos->ucast.in.phy ? qos->ucast.in.phy : - qos->ucast.out.phy; + cis->c_phys = qos->ucast.out.phys ? qos->ucast.out.phys : + qos->ucast.in.phys; + cis->p_phys = qos->ucast.in.phys ? qos->ucast.in.phys : + qos->ucast.out.phys; cis->c_rtn = qos->ucast.out.rtn; cis->p_rtn = qos->ucast.in.rtn; } @@ -1974,8 +1980,8 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, return cis; /* Update LINK PHYs according to QoS preference */ - cis->le_tx_phy = qos->ucast.out.phy; - cis->le_rx_phy = qos->ucast.in.phy; + cis->le_tx_phy = qos->ucast.out.phys; + cis->le_rx_phy = qos->ucast.in.phys; /* If output interval is not set use the input interval as it cannot be * 0x000000. 
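In the hci_conn.c hunks above, the QoS ->phy member becomes ->phys, i.e. a bitmask of BT_ISO_PHY_* preferences rather than a single PHY value. A short sketch of filling the renamed field, assuming the post-rename struct bt_iso_io_qos layout; the numbers mirror the DEFAULT_IO_QOS initialiser that this series also updates in net/bluetooth/iso.c:

/* Sketch: populate an ISO QoS descriptor with a PHY preference bitmask. */
#include <net/bluetooth/bluetooth.h>

static void example_fill_io_qos(struct bt_iso_io_qos *io)
{
        io->interval = 10000;                           /* SDU interval, us */
        io->latency  = 10;                              /* max latency, ms */
        io->sdu      = 40;                              /* max SDU size */
        io->rtn      = 2;                               /* retransmissions */
        /* allow 1M and 2M; BT_ISO_PHY_ANY defers to the ACL link's PHY */
        io->phys     = BT_ISO_PHY_1M | BT_ISO_PHY_2M;
}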
@@ -2090,15 +2096,15 @@ int hci_le_create_cis_pending(struct hci_dev *hdev) } static void hci_iso_qos_setup(struct hci_dev *hdev, struct hci_conn *conn, - struct bt_iso_io_qos *qos, __u8 phy) + struct bt_iso_io_qos *qos, __u8 phys) { /* Only set MTU if PHY is enabled */ - if (!qos->sdu && qos->phy) + if (!qos->sdu && qos->phys) qos->sdu = conn->mtu; /* Use the same PHY as ACL if set to any */ - if (qos->phy == BT_ISO_PHY_ANY) - qos->phy = phy; + if (qos->phys == BT_ISO_PHY_ANY) + qos->phys = phys; /* Use LE ACL connection interval if not set */ if (!qos->interval) @@ -2118,7 +2124,7 @@ static int create_big_sync(struct hci_dev *hdev, void *data) u32 flags = 0; int err; - if (qos->bcast.out.phy == 0x02) + if (qos->bcast.out.phys == BIT(1)) flags |= MGMT_ADV_FLAG_SEC_2M; /* Align intervals */ @@ -2227,8 +2233,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, return conn; /* Update LINK PHYs according to QoS preference */ - conn->le_tx_phy = qos->bcast.out.phy; - conn->le_tx_phy = qos->bcast.out.phy; + conn->le_tx_def_phys = qos->bcast.out.phys; /* Add Basic Announcement into Peridic Adv Data if BASE is set */ if (base_len && base) { @@ -2237,7 +2242,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, } hci_iso_qos_setup(hdev, conn, &qos->bcast.out, - conn->le_tx_phy ? conn->le_tx_phy : + conn->le_tx_def_phys ? conn->le_tx_def_phys : hdev->le_tx_def_phys); conn->iso_qos = *qos; @@ -2357,9 +2362,11 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, return le; hci_iso_qos_setup(hdev, le, &qos->ucast.out, - le->le_tx_phy ? le->le_tx_phy : hdev->le_tx_def_phys); + le->le_tx_def_phys ? le->le_tx_def_phys : + hdev->le_tx_def_phys); hci_iso_qos_setup(hdev, le, &qos->ucast.in, - le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys); + le->le_rx_def_phys ? 
le->le_rx_def_phys : + hdev->le_rx_def_phys); cis = hci_bind_cis(hdev, dst, dst_type, qos, timeout); if (IS_ERR(cis)) { @@ -2614,8 +2621,8 @@ void hci_conn_enter_active_mode(struct hci_conn *conn, __u8 force_active) timer: if (hdev->idle_timeout > 0) - queue_delayed_work(hdev->workqueue, &conn->idle_work, - msecs_to_jiffies(hdev->idle_timeout)); + mod_delayed_work(hdev->workqueue, &conn->idle_work, + msecs_to_jiffies(hdev->idle_timeout)); } /* Drop all connection on the device */ @@ -2928,22 +2935,22 @@ u32 hci_conn_get_phy(struct hci_conn *conn) break; case LE_LINK: - if (conn->le_tx_phy & HCI_LE_SET_PHY_1M) + if (conn->le_tx_def_phys & HCI_LE_SET_PHY_1M) phys |= BT_PHY_LE_1M_TX; - if (conn->le_rx_phy & HCI_LE_SET_PHY_1M) + if (conn->le_rx_def_phys & HCI_LE_SET_PHY_1M) phys |= BT_PHY_LE_1M_RX; - if (conn->le_tx_phy & HCI_LE_SET_PHY_2M) + if (conn->le_tx_def_phys & HCI_LE_SET_PHY_2M) phys |= BT_PHY_LE_2M_TX; - if (conn->le_rx_phy & HCI_LE_SET_PHY_2M) + if (conn->le_rx_def_phys & HCI_LE_SET_PHY_2M) phys |= BT_PHY_LE_2M_RX; - if (conn->le_tx_phy & HCI_LE_SET_PHY_CODED) + if (conn->le_tx_def_phys & HCI_LE_SET_PHY_CODED) phys |= BT_PHY_LE_CODED_TX; - if (conn->le_rx_phy & HCI_LE_SET_PHY_CODED) + if (conn->le_rx_def_phys & HCI_LE_SET_PHY_CODED) phys |= BT_PHY_LE_CODED_RX; break; @@ -2952,6 +2959,111 @@ u32 hci_conn_get_phy(struct hci_conn *conn) return phys; } +static u16 bt_phy_pkt_type(struct hci_conn *conn, u32 phys) +{ + u16 pkt_type = conn->pkt_type; + + if (phys & BT_PHY_BR_1M_3SLOT) + pkt_type |= HCI_DM3 | HCI_DH3; + else + pkt_type &= ~(HCI_DM3 | HCI_DH3); + + if (phys & BT_PHY_BR_1M_5SLOT) + pkt_type |= HCI_DM5 | HCI_DH5; + else + pkt_type &= ~(HCI_DM5 | HCI_DH5); + + if (phys & BT_PHY_EDR_2M_1SLOT) + pkt_type &= ~HCI_2DH1; + else + pkt_type |= HCI_2DH1; + + if (phys & BT_PHY_EDR_2M_3SLOT) + pkt_type &= ~HCI_2DH3; + else + pkt_type |= HCI_2DH3; + + if (phys & BT_PHY_EDR_2M_5SLOT) + pkt_type &= ~HCI_2DH5; + else + pkt_type |= HCI_2DH5; + + if (phys & BT_PHY_EDR_3M_1SLOT) + pkt_type &= ~HCI_3DH1; + else + pkt_type |= HCI_3DH1; + + if (phys & BT_PHY_EDR_3M_3SLOT) + pkt_type &= ~HCI_3DH3; + else + pkt_type |= HCI_3DH3; + + if (phys & BT_PHY_EDR_3M_5SLOT) + pkt_type &= ~HCI_3DH5; + else + pkt_type |= HCI_3DH5; + + return pkt_type; +} + +static int bt_phy_le_phy(u32 phys, u8 *tx_phys, u8 *rx_phys) +{ + if (!tx_phys || !rx_phys) + return -EINVAL; + + *tx_phys = 0; + *rx_phys = 0; + + if (phys & BT_PHY_LE_1M_TX) + *tx_phys |= HCI_LE_SET_PHY_1M; + + if (phys & BT_PHY_LE_1M_RX) + *rx_phys |= HCI_LE_SET_PHY_1M; + + if (phys & BT_PHY_LE_2M_TX) + *tx_phys |= HCI_LE_SET_PHY_2M; + + if (phys & BT_PHY_LE_2M_RX) + *rx_phys |= HCI_LE_SET_PHY_2M; + + if (phys & BT_PHY_LE_CODED_TX) + *tx_phys |= HCI_LE_SET_PHY_CODED; + + if (phys & BT_PHY_LE_CODED_RX) + *rx_phys |= HCI_LE_SET_PHY_CODED; + + return 0; +} + +int hci_conn_set_phy(struct hci_conn *conn, u32 phys) +{ + u8 tx_phys, rx_phys; + + switch (conn->type) { + case SCO_LINK: + case ESCO_LINK: + return -EINVAL; + case ACL_LINK: + /* Only allow setting BR/EDR PHYs if link type is ACL */ + if (phys & ~BT_PHY_BREDR_MASK) + return -EINVAL; + + return hci_acl_change_pkt_type(conn, + bt_phy_pkt_type(conn, phys)); + case LE_LINK: + /* Only allow setting LE PHYs if link type is LE */ + if (phys & ~BT_PHY_LE_MASK) + return -EINVAL; + + if (bt_phy_le_phy(phys, &tx_phys, &rx_phys)) + return -EINVAL; + + return hci_le_set_phy(conn, tx_phys, rx_phys); + default: + return -EINVAL; + } +} + static int abort_conn_sync(struct hci_dev *hdev, void *data) { struct hci_conn 
*conn = data; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 8ccec73dce45..b069607b145b 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -117,6 +117,7 @@ bool hci_discovery_active(struct hci_dev *hdev) return false; } } +EXPORT_SYMBOL(hci_discovery_active); void hci_discovery_set_state(struct hci_dev *hdev, int state) { diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index a9868f17ef40..286529d2e554 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2869,6 +2869,31 @@ static void hci_cs_le_ext_create_conn(struct hci_dev *hdev, u8 status) hci_dev_unlock(hdev); } +static void hci_cs_le_set_phy(struct hci_dev *hdev, u8 status) +{ + struct hci_cp_le_set_phy *cp; + struct hci_conn *conn; + + bt_dev_dbg(hdev, "status 0x%2.2x", status); + + if (status) + return; + + cp = hci_sent_cmd_data(hdev, HCI_OP_LE_SET_PHY); + if (!cp) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(cp->handle)); + if (conn) { + conn->le_tx_def_phys = cp->tx_phys; + conn->le_rx_def_phys = cp->rx_phys; + } + + hci_dev_unlock(hdev); +} + static void hci_cs_le_read_remote_features(struct hci_dev *hdev, u8 status) { struct hci_cp_le_read_remote_features *cp; @@ -4359,6 +4384,7 @@ static const struct hci_cs { HCI_CS(HCI_OP_LE_CREATE_CONN, hci_cs_le_create_conn), HCI_CS(HCI_OP_LE_READ_REMOTE_FEATURES, hci_cs_le_read_remote_features), HCI_CS(HCI_OP_LE_START_ENC, hci_cs_le_start_enc), + HCI_CS(HCI_OP_LE_SET_PHY, hci_cs_le_set_phy), HCI_CS(HCI_OP_LE_EXT_CREATE_CONN, hci_cs_le_ext_create_conn), HCI_CS(HCI_OP_LE_CREATE_CIS, hci_cs_le_create_cis), HCI_CS(HCI_OP_LE_CREATE_BIG, hci_cs_le_create_big), @@ -6607,8 +6633,20 @@ static void hci_le_remote_feat_complete_evt(struct hci_dev *hdev, void *data, conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(ev->handle)); if (conn) { - if (!ev->status) - memcpy(conn->features[0], ev->features, 8); + if (!ev->status) { + memcpy(conn->le_features, ev->features, 8); + + /* Update supported PHYs */ + if (!(conn->le_features[1] & HCI_LE_PHY_2M)) { + conn->le_tx_def_phys &= ~HCI_LE_SET_PHY_2M; + conn->le_rx_def_phys &= ~HCI_LE_SET_PHY_2M; + } + + if (!(conn->le_features[1] & HCI_LE_PHY_CODED)) { + conn->le_tx_def_phys &= ~HCI_LE_SET_PHY_CODED; + conn->le_rx_def_phys &= ~HCI_LE_SET_PHY_CODED; + } + } if (conn->state == BT_CONFIG) { __u8 status; @@ -6829,6 +6867,21 @@ unlock: hci_dev_unlock(hdev); } +/* Convert LE PHY to QoS PHYs */ +static u8 le_phy_qos(u8 phy) +{ + switch (phy) { + case 0x01: + return HCI_LE_SET_PHY_1M; + case 0x02: + return HCI_LE_SET_PHY_2M; + case 0x03: + return HCI_LE_SET_PHY_CODED; + } + + return 0; +} + static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -6890,8 +6943,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, 1000); qos->ucast.in.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0; qos->ucast.out.sdu = ev->p_bn ? le16_to_cpu(ev->p_mtu) : 0; - qos->ucast.in.phy = ev->c_phy; - qos->ucast.out.phy = ev->p_phy; + qos->ucast.in.phys = le_phy_qos(ev->c_phy); + qos->ucast.out.phys = le_phy_qos(ev->p_phy); break; case HCI_ROLE_MASTER: qos->ucast.in.interval = p_sdu_interval; @@ -6905,8 +6958,8 @@ static void hci_le_cis_established_evt(struct hci_dev *hdev, void *data, 1000); qos->ucast.out.sdu = ev->c_bn ? le16_to_cpu(ev->c_mtu) : 0; qos->ucast.in.sdu = ev->p_bn ? 
le16_to_cpu(ev->p_mtu) : 0; - qos->ucast.out.phy = ev->c_phy; - qos->ucast.in.phy = ev->p_phy; + qos->ucast.out.phys = le_phy_qos(ev->c_phy); + qos->ucast.in.phys = le_phy_qos(ev->p_phy); break; } @@ -7221,9 +7274,21 @@ static void hci_le_read_all_remote_features_evt(struct hci_dev *hdev, if (!conn) goto unlock; - if (!ev->status) + if (!ev->status) { memcpy(conn->le_features, ev->features, 248); + /* Update supported PHYs */ + if (!(conn->le_features[1] & HCI_LE_PHY_2M)) { + conn->le_tx_def_phys &= ~HCI_LE_SET_PHY_2M; + conn->le_rx_def_phys &= ~HCI_LE_SET_PHY_2M; + } + + if (!(conn->le_features[1] & HCI_LE_PHY_CODED)) { + conn->le_tx_def_phys &= ~HCI_LE_SET_PHY_CODED; + conn->le_rx_def_phys &= ~HCI_LE_SET_PHY_CODED; + } + } + if (conn->state == BT_CONFIG) { __u8 status; diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index cbc3a75d7326..f04a90bce4a9 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2948,8 +2948,8 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, if (conn) { struct bt_iso_qos *qos = &conn->iso_qos; - if (qos->bcast.in.phy & BT_ISO_PHY_1M || - qos->bcast.in.phy & BT_ISO_PHY_2M) { + if (qos->bcast.in.phys & BT_ISO_PHY_1M || + qos->bcast.in.phys & BT_ISO_PHY_2M) { cp->scanning_phys |= LE_SCAN_PHY_1M; hci_le_scan_phy_params(phy, type, interval, @@ -2958,7 +2958,7 @@ static int hci_le_set_ext_scan_param_sync(struct hci_dev *hdev, u8 type, phy++; } - if (qos->bcast.in.phy & BT_ISO_PHY_CODED) { + if (qos->bcast.in.phys & BT_ISO_PHY_CODED) { cp->scanning_phys |= LE_SCAN_PHY_CODED; hci_le_scan_phy_params(phy, type, interval * 3, @@ -4428,6 +4428,17 @@ static int hci_le_set_event_mask_sync(struct hci_dev *hdev) events[4] |= 0x02; /* LE BIG Info Advertising Report */ } + if (le_cs_capable(hdev)) { + /* Channel Sounding events */ + events[5] |= 0x08; /* LE CS Read Remote Supported Cap Complete event */ + events[5] |= 0x10; /* LE CS Read Remote FAE Table Complete event */ + events[5] |= 0x20; /* LE CS Security Enable Complete event */ + events[5] |= 0x40; /* LE CS Config Complete event */ + events[5] |= 0x80; /* LE CS Procedure Enable Complete event */ + events[6] |= 0x01; /* LE CS Subevent Result event */ + events[6] |= 0x02; /* LE CS Subevent Result Continue event */ + events[6] |= 0x04; /* LE CS Test End Complete event */ + } return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EVENT_MASK, sizeof(events), events, HCI_CMD_TIMEOUT); } @@ -4560,23 +4571,43 @@ static int hci_set_le_support_sync(struct hci_dev *hdev) } /* LE Set Host Feature */ -static int hci_le_set_host_feature_sync(struct hci_dev *hdev) +static int hci_le_set_host_feature_sync(struct hci_dev *hdev, u8 bit, u8 value) { struct hci_cp_le_set_host_feature cp; - if (!iso_capable(hdev)) - return 0; - memset(&cp, 0, sizeof(cp)); /* Connected Isochronous Channels (Host Support) */ - cp.bit_number = 32; - cp.bit_value = iso_enabled(hdev) ? 0x01 : 0x00; + cp.bit_number = bit; + cp.bit_value = value; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_HOST_FEATURE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } +/* Set Host Features, each feature needs to be sent separately since + * HCI_OP_LE_SET_HOST_FEATURE doesn't support setting all of them at once. + */ +static int hci_le_set_host_features_sync(struct hci_dev *hdev) +{ + int err; + + if (iso_capable(hdev)) { + /* Connected Isochronous Channels (Host Support) */ + err = hci_le_set_host_feature_sync(hdev, 32, + (iso_enabled(hdev) ? 
0x01 : + 0x00)); + if (err) + return err; + } + + if (le_cs_capable(hdev)) + /* Channel Sounding (Host Support) */ + err = hci_le_set_host_feature_sync(hdev, 47, 0x01); + + return err; +} + /* LE Controller init stage 3 command sequence */ static const struct hci_init_stage le_init3[] = { /* HCI_OP_LE_SET_EVENT_MASK */ @@ -4604,7 +4635,7 @@ static const struct hci_init_stage le_init3[] = { /* HCI_OP_WRITE_LE_HOST_SUPPORTED */ HCI_INIT(hci_set_le_support_sync), /* HCI_OP_LE_SET_HOST_FEATURE */ - HCI_INIT(hci_le_set_host_feature_sync), + HCI_INIT(hci_le_set_host_features_sync), {} }; @@ -6897,8 +6928,6 @@ static int hci_acl_create_conn_sync(struct hci_dev *hdev, void *data) conn->attempt++; - conn->link_policy = hdev->link_policy; - memset(&cp, 0, sizeof(cp)); bacpy(&cp.bdaddr, &conn->dst); cp.pscan_rep_mode = 0x02; @@ -7419,3 +7448,75 @@ int hci_le_read_remote_features(struct hci_conn *conn) return err; } + +static void pkt_type_changed(struct hci_dev *hdev, void *data, int err) +{ + struct hci_cp_change_conn_ptype *cp = data; + + bt_dev_dbg(hdev, "err %d", err); + + kfree(cp); +} + +static int hci_change_conn_ptype_sync(struct hci_dev *hdev, void *data) +{ + struct hci_cp_change_conn_ptype *cp = data; + + return __hci_cmd_sync_status_sk(hdev, HCI_OP_CHANGE_CONN_PTYPE, + sizeof(*cp), cp, + HCI_EV_PKT_TYPE_CHANGE, + HCI_CMD_TIMEOUT, NULL); +} + +int hci_acl_change_pkt_type(struct hci_conn *conn, u16 pkt_type) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_cp_change_conn_ptype *cp; + + cp = kmalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) + return -ENOMEM; + + cp->handle = cpu_to_le16(conn->handle); + cp->pkt_type = cpu_to_le16(pkt_type); + + return hci_cmd_sync_queue_once(hdev, hci_change_conn_ptype_sync, cp, + pkt_type_changed); +} + +static void le_phy_update_complete(struct hci_dev *hdev, void *data, int err) +{ + struct hci_cp_le_set_phy *cp = data; + + bt_dev_dbg(hdev, "err %d", err); + + kfree(cp); +} + +static int hci_le_set_phy_sync(struct hci_dev *hdev, void *data) +{ + struct hci_cp_le_set_phy *cp = data; + + return __hci_cmd_sync_status_sk(hdev, HCI_OP_LE_SET_PHY, + sizeof(*cp), cp, + HCI_EV_LE_PHY_UPDATE_COMPLETE, + HCI_CMD_TIMEOUT, NULL); +} + +int hci_le_set_phy(struct hci_conn *conn, u8 tx_phys, u8 rx_phys) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_cp_le_set_phy *cp; + + cp = kmalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) + return -ENOMEM; + + memset(cp, 0, sizeof(*cp)); + cp->handle = cpu_to_le16(conn->handle); + cp->tx_phys = tx_phys; + cp->rx_phys = rx_phys; + + return hci_cmd_sync_queue_once(hdev, hci_le_set_phy_sync, cp, + le_phy_update_complete); +} diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index e36d24a9098b..1459ab161fd2 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -361,7 +361,7 @@ static int iso_connect_bis(struct sock *sk) } /* Fail if out PHYs are marked as disabled */ - if (!iso_pi(sk)->qos.bcast.out.phy) { + if (!iso_pi(sk)->qos.bcast.out.phys) { err = -EINVAL; goto unlock; } @@ -458,7 +458,7 @@ static int iso_connect_cis(struct sock *sk) } /* Fail if either PHYs are marked as disabled */ - if (!iso_pi(sk)->qos.ucast.in.phy && !iso_pi(sk)->qos.ucast.out.phy) { + if (!iso_pi(sk)->qos.ucast.in.phys && !iso_pi(sk)->qos.ucast.out.phys) { err = -EINVAL; goto unlock; } @@ -894,7 +894,7 @@ static struct proto iso_proto = { .interval = 10000u, \ .latency = 10u, \ .sdu = 40u, \ - .phy = BT_ISO_PHY_2M, \ + .phys = BT_ISO_PHY_2M, \ .rtn = 2u, \ } @@ -1661,7 +1661,7 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr 
*msg, static bool check_io_qos(struct bt_iso_io_qos *qos) { /* If no PHY is enable SDU must be 0 */ - if (!qos->phy && qos->sdu) + if (!qos->phys && qos->sdu) return false; if (qos->interval && (qos->interval < 0xff || qos->interval > 0xfffff)) @@ -1670,7 +1670,7 @@ static bool check_io_qos(struct bt_iso_io_qos *qos) if (qos->latency && (qos->latency < 0x05 || qos->latency > 0xfa0)) return false; - if (qos->phy > BT_ISO_PHY_ANY) + if (qos->phys > BT_ISO_PHY_ANY) return false; return true; diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 07b493331fd7..b628b0fa39b2 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -924,26 +924,18 @@ int l2cap_chan_check_security(struct l2cap_chan *chan, bool initiator) initiator); } -static u8 l2cap_get_ident(struct l2cap_conn *conn) +static int l2cap_get_ident(struct l2cap_conn *conn) { - u8 id; + /* LE link does not support tools like l2ping so use the full range */ + if (conn->hcon->type == LE_LINK) + return ida_alloc_range(&conn->tx_ida, 1, 255, GFP_ATOMIC); /* Get next available identificator. * 1 - 128 are used by kernel. * 129 - 199 are reserved. * 200 - 254 are used by utilities like l2ping, etc. */ - - mutex_lock(&conn->ident_lock); - - if (++conn->tx_ident > 128) - conn->tx_ident = 1; - - id = conn->tx_ident; - - mutex_unlock(&conn->ident_lock); - - return id; + return ida_alloc_range(&conn->tx_ida, 1, 128, GFP_ATOMIC); } static void l2cap_send_acl(struct l2cap_conn *conn, struct sk_buff *skb, @@ -1773,6 +1765,8 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) if (work_pending(&conn->pending_rx_work)) cancel_work_sync(&conn->pending_rx_work); + ida_destroy(&conn->tx_ida); + cancel_delayed_work_sync(&conn->id_addr_timer); l2cap_unregister_all_users(conn); @@ -4782,12 +4776,34 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, return err; } +static void l2cap_put_ident(struct l2cap_conn *conn, u8 code, u8 id) +{ + switch (code) { + case L2CAP_COMMAND_REJ: + case L2CAP_CONN_RSP: + case L2CAP_CONF_RSP: + case L2CAP_DISCONN_RSP: + case L2CAP_ECHO_RSP: + case L2CAP_INFO_RSP: + case L2CAP_CONN_PARAM_UPDATE_RSP: + case L2CAP_ECRED_CONN_RSP: + case L2CAP_ECRED_RECONF_RSP: + /* First do a lookup since the remote may send bogus ids that + * would make ida_free to generate warnings. 
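The l2cap_core.c changes above replace the hand-rolled tx_ident counter (and its ident_lock mutex) with a per-connection IDA, so identifiers are only reused once the matching response has freed them. A minimal sketch of that allocator lifecycle, using a standalone IDA instead of conn->tx_ida; ida_find_first_range() guards ida_free() against bogus identifiers exactly as the patch does:

/* Sketch of the IDA-backed identifier lifecycle (not the real l2cap code). */
#include <linux/gfp.h>
#include <linux/idr.h>
#include <linux/types.h>

static DEFINE_IDA(example_tx_ida);

static int example_send_request(void)
{
        /* ident 0 is reserved, so allocate from 1..128 like l2cap_get_ident() */
        int ident = ida_alloc_range(&example_tx_ida, 1, 128, GFP_ATOMIC);

        if (ident < 0)
                return ident;   /* -ENOSPC when every identifier is in flight */
        /* ... build and queue the signalling request using 'ident' ... */
        return ident;
}

static void example_handle_response(u8 ident)
{
        /* only free identifiers we actually allocated, avoiding ida_free() warnings */
        if (ida_find_first_range(&example_tx_ida, ident, ident) >= 0)
                ida_free(&example_tx_ida, ident);
}

static void example_conn_teardown(void)
{
        ida_destroy(&example_tx_ida);   /* drops anything still outstanding */
}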
+ */ + if (ida_find_first_range(&conn->tx_ida, id, id) >= 0) + ida_free(&conn->tx_ida, id); + } +} + static inline int l2cap_bredr_sig_cmd(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u16 cmd_len, u8 *data) { int err = 0; + l2cap_put_ident(conn, cmd->code, cmd->ident); + switch (cmd->code) { case L2CAP_COMMAND_REJ: l2cap_command_rej(conn, cmd, cmd_len, data); @@ -5419,6 +5435,8 @@ static inline int l2cap_le_sig_cmd(struct l2cap_conn *conn, { int err = 0; + l2cap_put_ident(conn, cmd->code, cmd->ident); + switch (cmd->code) { case L2CAP_COMMAND_REJ: l2cap_le_command_rej(conn, cmd, cmd_len, data); @@ -6907,13 +6925,13 @@ static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon) hci_dev_test_flag(hcon->hdev, HCI_FORCE_BREDR_SMP))) conn->local_fixed_chan |= L2CAP_FC_SMP_BREDR; - mutex_init(&conn->ident_lock); mutex_init(&conn->lock); INIT_LIST_HEAD(&conn->chan_l); INIT_LIST_HEAD(&conn->users); INIT_DELAYED_WORK(&conn->info_timer, l2cap_info_timeout); + ida_init(&conn->tx_ida); skb_queue_head_init(&conn->pending_rx); INIT_WORK(&conn->pending_rx_work, process_pending_rx); diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index 9ee189c815d4..3ba3ce7eaa98 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -885,7 +885,7 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, struct bt_power pwr; struct l2cap_conn *conn; int err = 0; - u32 opt; + u32 opt, phys; u16 mtu; u8 mode; @@ -1059,6 +1059,24 @@ static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, break; + case BT_PHY: + if (sk->sk_state != BT_CONNECTED) { + err = -ENOTCONN; + break; + } + + err = copy_safe_from_sockptr(&phys, sizeof(phys), optval, + optlen); + if (err) + break; + + if (!chan->conn) + break; + + conn = chan->conn; + err = hci_conn_set_phy(conn->hcon, phys); + break; + case BT_MODE: if (!enable_ecred) { err = -ENOPROTOOPT; diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c index c4063d200c0a..fdcc752c6f13 100644 --- a/net/bluetooth/mgmt_config.c +++ b/net/bluetooth/mgmt_config.c @@ -11,6 +11,12 @@ #include "mgmt_util.h" #include "mgmt_config.h" +#define HDEV_PARAM_U32(_param_name_) \ + struct {\ + struct mgmt_tlv_hdr entry; \ + __le32 value; \ + } __packed _param_name_ + #define HDEV_PARAM_U16(_param_name_) \ struct {\ struct mgmt_tlv_hdr entry; \ @@ -29,6 +35,12 @@ cpu_to_le16(hdev->_param_name_) \ } +#define TLV_SET_U32(_param_code_, _param_name_) \ + { \ + { cpu_to_le16(_param_code_), sizeof(__u32) }, \ + cpu_to_le32(hdev->_param_name_) \ + } + #define TLV_SET_U8(_param_code_, _param_name_) \ { \ { cpu_to_le16(_param_code_), sizeof(__u8) }, \ @@ -78,6 +90,7 @@ int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, HDEV_PARAM_U16(advmon_allowlist_duration); HDEV_PARAM_U16(advmon_no_filter_duration); HDEV_PARAM_U8(enable_advmon_interleave_scan); + HDEV_PARAM_U32(idle_timeout); } __packed rp = { TLV_SET_U16(0x0000, def_page_scan_type), TLV_SET_U16(0x0001, def_page_scan_int), @@ -111,6 +124,7 @@ int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, TLV_SET_U16(0x001d, advmon_allowlist_duration), TLV_SET_U16(0x001e, advmon_no_filter_duration), TLV_SET_U8(0x001f, enable_advmon_interleave_scan), + TLV_SET_U32(0x0020, idle_timeout), }; bt_dev_dbg(hdev, "sock %p", sk); @@ -122,6 +136,7 @@ int read_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, } #define TO_TLV(x) ((struct mgmt_tlv *)(x)) +#define TLV_GET_LE32(tlv) le32_to_cpu(*((__le32 
*)(TO_TLV(tlv)->value))) #define TLV_GET_LE16(tlv) le16_to_cpu(*((__le16 *)(TO_TLV(tlv)->value))) #define TLV_GET_U8(tlv) (*((__u8 *)(TO_TLV(tlv)->value))) @@ -191,6 +206,9 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, case 0x001f: exp_type_len = sizeof(u8); break; + case 0x0020: + exp_type_len = sizeof(u32); + break; default: exp_type_len = 0; bt_dev_warn(hdev, "unsupported parameter %u", type); @@ -314,6 +332,9 @@ int set_def_system_config(struct sock *sk, struct hci_dev *hdev, void *data, case 0x0001f: hdev->enable_advmon_interleave_scan = TLV_GET_U8(buffer); break; + case 0x00020: + hdev->idle_timeout = TLV_GET_LE32(buffer); + break; default: bt_dev_warn(hdev, "unsupported parameter %u", type); break; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index d55a4ab87837..dccae08b4f4c 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -5201,7 +5201,7 @@ void br_multicast_get_stats(const struct net_bridge *br, do { start = u64_stats_fetch_begin(&cpu_stats->syncp); - memcpy(&temp, &cpu_stats->mstats, sizeof(temp)); + u64_stats_copy(&temp, &cpu_stats->mstats, sizeof(temp)); } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries); diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index e0421eaa3abc..76ce70b4e7f3 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -58,7 +58,7 @@ int br_validate_ipv6(struct net *net, struct sk_buff *skb) if (hdr->version != 6) goto inhdr_error; - pkt_len = ntohs(hdr->payload_len); + pkt_len = ipv6_payload_len(skb, hdr); if (hdr->nexthdr == NEXTHDR_HOP && nf_ip6_check_hbh_len(skb, &pkt_len)) goto drop; diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index c20a41bf253b..cc4b27ff1b08 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -344,8 +344,8 @@ int br_stp_set_path_cost(struct net_bridge_port *p, unsigned long path_cost) ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id) { - return sprintf(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n", - id->prio[0], id->prio[1], - id->addr[0], id->addr[1], id->addr[2], - id->addr[3], id->addr[4], id->addr[5]); + return sysfs_emit(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n", + id->prio[0], id->prio[1], + id->addr[0], id->addr[1], id->addr[2], + id->addr[3], id->addr[4], id->addr[5]); } diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index cb4855ed9500..8888300b65c1 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -67,7 +67,7 @@ static ssize_t forward_delay_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); + return sysfs_emit(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); } static int set_forward_delay(struct net_bridge *br, unsigned long val, @@ -87,8 +87,8 @@ static DEVICE_ATTR_RW(forward_delay); static ssize_t hello_time_show(struct device *d, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", - jiffies_to_clock_t(to_bridge(d)->hello_time)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(to_bridge(d)->hello_time)); } static int set_hello_time(struct net_bridge *br, unsigned long val, @@ -108,8 +108,8 @@ static DEVICE_ATTR_RW(hello_time); static ssize_t max_age_show(struct device *d, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%lu\n", - 
jiffies_to_clock_t(to_bridge(d)->max_age)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(to_bridge(d)->max_age)); } static int set_max_age(struct net_bridge *br, unsigned long val, @@ -129,7 +129,7 @@ static ssize_t ageing_time_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); + return sysfs_emit(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); } static int set_ageing_time(struct net_bridge *br, unsigned long val, @@ -150,7 +150,7 @@ static ssize_t stp_state_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->stp_enabled); + return sysfs_emit(buf, "%d\n", br->stp_enabled); } @@ -173,7 +173,7 @@ static ssize_t group_fwd_mask_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%#x\n", br->group_fwd_mask); + return sysfs_emit(buf, "%#x\n", br->group_fwd_mask); } static int set_group_fwd_mask(struct net_bridge *br, unsigned long val, @@ -200,8 +200,8 @@ static ssize_t priority_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", - (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); + return sysfs_emit(buf, "%d\n", + (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); } static int set_priority(struct net_bridge *br, unsigned long val, @@ -235,21 +235,21 @@ static DEVICE_ATTR_RO(bridge_id); static ssize_t root_port_show(struct device *d, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", to_bridge(d)->root_port); + return sysfs_emit(buf, "%d\n", to_bridge(d)->root_port); } static DEVICE_ATTR_RO(root_port); static ssize_t root_path_cost_show(struct device *d, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", to_bridge(d)->root_path_cost); + return sysfs_emit(buf, "%d\n", to_bridge(d)->root_path_cost); } static DEVICE_ATTR_RO(root_path_cost); static ssize_t topology_change_show(struct device *d, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", to_bridge(d)->topology_change); + return sysfs_emit(buf, "%d\n", to_bridge(d)->topology_change); } static DEVICE_ATTR_RO(topology_change); @@ -258,7 +258,7 @@ static ssize_t topology_change_detected_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->topology_change_detected); + return sysfs_emit(buf, "%d\n", br->topology_change_detected); } static DEVICE_ATTR_RO(topology_change_detected); @@ -266,7 +266,7 @@ static ssize_t hello_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer)); + return sysfs_emit(buf, "%ld\n", br_timer_value(&br->hello_timer)); } static DEVICE_ATTR_RO(hello_timer); @@ -274,7 +274,7 @@ static ssize_t tcn_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer)); + return sysfs_emit(buf, "%ld\n", br_timer_value(&br->tcn_timer)); } static DEVICE_ATTR_RO(tcn_timer); @@ -283,7 +283,7 @@ static ssize_t topology_change_timer_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer)); + return sysfs_emit(buf, "%ld\n", 
br_timer_value(&br->topology_change_timer)); } static DEVICE_ATTR_RO(topology_change_timer); @@ -291,7 +291,7 @@ static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer)); + return sysfs_emit(buf, "%ld\n", br_timer_value(&br->gc_work.timer)); } static DEVICE_ATTR_RO(gc_timer); @@ -299,7 +299,7 @@ static ssize_t group_addr_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%pM\n", br->group_addr); + return sysfs_emit(buf, "%pM\n", br->group_addr); } static ssize_t group_addr_store(struct device *d, @@ -365,7 +365,7 @@ static ssize_t no_linklocal_learn_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); + return sysfs_emit(buf, "%d\n", br_boolopt_get(br, BR_BOOLOPT_NO_LL_LEARN)); } static int set_no_linklocal_learn(struct net_bridge *br, unsigned long val, @@ -387,7 +387,7 @@ static ssize_t multicast_router_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->multicast_ctx.multicast_router); + return sysfs_emit(buf, "%d\n", br->multicast_ctx.multicast_router); } static int set_multicast_router(struct net_bridge *br, unsigned long val, @@ -409,7 +409,7 @@ static ssize_t multicast_snooping_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED)); + return sysfs_emit(buf, "%d\n", br_opt_get(br, BROPT_MULTICAST_ENABLED)); } static ssize_t multicast_snooping_store(struct device *d, @@ -425,8 +425,8 @@ static ssize_t multicast_query_use_ifaddr_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", - br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)); + return sysfs_emit(buf, "%d\n", + br_opt_get(br, BROPT_MULTICAST_QUERY_USE_IFADDR)); } static int set_query_use_ifaddr(struct net_bridge *br, unsigned long val, @@ -450,7 +450,7 @@ static ssize_t multicast_querier_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->multicast_ctx.multicast_querier); + return sysfs_emit(buf, "%d\n", br->multicast_ctx.multicast_querier); } static int set_multicast_querier(struct net_bridge *br, unsigned long val, @@ -470,7 +470,7 @@ static DEVICE_ATTR_RW(multicast_querier); static ssize_t hash_elasticity_show(struct device *d, struct device_attribute *attr, char *buf) { - return sprintf(buf, "%u\n", RHT_ELASTICITY); + return sysfs_emit(buf, "%u\n", RHT_ELASTICITY); } static int set_elasticity(struct net_bridge *br, unsigned long val, @@ -494,7 +494,7 @@ static ssize_t hash_max_show(struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->hash_max); + return sysfs_emit(buf, "%u\n", br->hash_max); } static int set_hash_max(struct net_bridge *br, unsigned long val, @@ -517,7 +517,7 @@ static ssize_t multicast_igmp_version_show(struct device *d, { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_ctx.multicast_igmp_version); + return sysfs_emit(buf, "%u\n", br->multicast_ctx.multicast_igmp_version); } static int set_multicast_igmp_version(struct net_bridge *br, unsigned long val, @@ -539,7 +539,7 @@ static ssize_t 
multicast_last_member_count_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_ctx.multicast_last_member_count); + return sysfs_emit(buf, "%u\n", br->multicast_ctx.multicast_last_member_count); } static int set_last_member_count(struct net_bridge *br, unsigned long val, @@ -561,7 +561,7 @@ static ssize_t multicast_startup_query_count_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count); + return sysfs_emit(buf, "%u\n", br->multicast_ctx.multicast_startup_query_count); } static int set_startup_query_count(struct net_bridge *br, unsigned long val, @@ -583,8 +583,8 @@ static ssize_t multicast_last_member_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_ctx.multicast_last_member_interval)); } static int set_last_member_interval(struct net_bridge *br, unsigned long val, @@ -606,8 +606,8 @@ static ssize_t multicast_membership_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_ctx.multicast_membership_interval)); } static int set_membership_interval(struct net_bridge *br, unsigned long val, @@ -630,8 +630,8 @@ static ssize_t multicast_querier_interval_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_ctx.multicast_querier_interval)); } static int set_querier_interval(struct net_bridge *br, unsigned long val, @@ -654,8 +654,8 @@ static ssize_t multicast_query_interval_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%lu\n", - jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_ctx.multicast_query_interval)); } static int set_query_interval(struct net_bridge *br, unsigned long val, @@ -677,9 +677,8 @@ static ssize_t multicast_query_response_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf( - buf, "%lu\n", - jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_ctx.multicast_query_response_interval)); } static int set_query_response_interval(struct net_bridge *br, unsigned long val, @@ -701,9 +700,8 @@ static ssize_t multicast_startup_query_interval_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf( - buf, "%lu\n", - jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval)); + return sysfs_emit(buf, "%lu\n", + jiffies_to_clock_t(br->multicast_ctx.multicast_startup_query_interval)); } static int set_startup_query_interval(struct net_bridge *br, unsigned long val, @@ -727,8 +725,8 @@ static ssize_t multicast_stats_enabled_show(struct device *d, { struct net_bridge *br = 
to_bridge(d); - return sprintf(buf, "%d\n", - br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)); + return sysfs_emit(buf, "%d\n", + br_opt_get(br, BROPT_MULTICAST_STATS_ENABLED)); } static int set_stats_enabled(struct net_bridge *br, unsigned long val, @@ -754,7 +752,7 @@ static ssize_t multicast_mld_version_show(struct device *d, { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br->multicast_ctx.multicast_mld_version); + return sysfs_emit(buf, "%u\n", br->multicast_ctx.multicast_mld_version); } static int set_multicast_mld_version(struct net_bridge *br, unsigned long val, @@ -777,7 +775,7 @@ static ssize_t nf_call_iptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES)); + return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IPTABLES)); } static int set_nf_call_iptables(struct net_bridge *br, unsigned long val, @@ -799,7 +797,7 @@ static ssize_t nf_call_ip6tables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES)); + return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_IP6TABLES)); } static int set_nf_call_ip6tables(struct net_bridge *br, unsigned long val, @@ -821,7 +819,7 @@ static ssize_t nf_call_arptables_show( struct device *d, struct device_attribute *attr, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES)); + return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_NF_CALL_ARPTABLES)); } static int set_nf_call_arptables(struct net_bridge *br, unsigned long val, @@ -845,7 +843,7 @@ static ssize_t vlan_filtering_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED)); + return sysfs_emit(buf, "%d\n", br_opt_get(br, BROPT_VLAN_ENABLED)); } static ssize_t vlan_filtering_store(struct device *d, @@ -861,7 +859,7 @@ static ssize_t vlan_protocol_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%#06x\n", ntohs(br->vlan_proto)); + return sysfs_emit(buf, "%#06x\n", ntohs(br->vlan_proto)); } static ssize_t vlan_protocol_store(struct device *d, @@ -877,7 +875,7 @@ static ssize_t default_pvid_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%d\n", br->default_pvid); + return sysfs_emit(buf, "%d\n", br->default_pvid); } static ssize_t default_pvid_store(struct device *d, @@ -893,7 +891,7 @@ static ssize_t vlan_stats_enabled_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED)); + return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_ENABLED)); } static int set_vlan_stats_enabled(struct net_bridge *br, unsigned long val, @@ -915,7 +913,7 @@ static ssize_t vlan_stats_per_port_show(struct device *d, char *buf) { struct net_bridge *br = to_bridge(d); - return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)); + return sysfs_emit(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)); } static int set_vlan_stats_per_port(struct net_bridge *br, unsigned long val, diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 74fdd8105dca..1f57c36a7fc0 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -47,7 +47,7 @@ const struct 
brport_attribute brport_attr_##_name = { \ #define BRPORT_ATTR_FLAG(_name, _mask) \ static ssize_t show_##_name(struct net_bridge_port *p, char *buf) \ { \ - return sprintf(buf, "%d\n", !!(p->flags & _mask)); \ + return sysfs_emit(buf, "%d\n", !!(p->flags & _mask)); \ } \ static int store_##_name(struct net_bridge_port *p, unsigned long v) \ { \ @@ -83,7 +83,7 @@ static int store_flag(struct net_bridge_port *p, unsigned long v, static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->path_cost); + return sysfs_emit(buf, "%d\n", p->path_cost); } static BRPORT_ATTR(path_cost, 0644, @@ -91,7 +91,7 @@ static BRPORT_ATTR(path_cost, 0644, static ssize_t show_priority(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->priority); + return sysfs_emit(buf, "%d\n", p->priority); } static BRPORT_ATTR(priority, 0644, @@ -111,65 +111,65 @@ static BRPORT_ATTR(designated_bridge, 0444, show_designated_bridge, NULL); static ssize_t show_designated_port(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->designated_port); + return sysfs_emit(buf, "%d\n", p->designated_port); } static BRPORT_ATTR(designated_port, 0444, show_designated_port, NULL); static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->designated_cost); + return sysfs_emit(buf, "%d\n", p->designated_cost); } static BRPORT_ATTR(designated_cost, 0444, show_designated_cost, NULL); static ssize_t show_port_id(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "0x%x\n", p->port_id); + return sysfs_emit(buf, "0x%x\n", p->port_id); } static BRPORT_ATTR(port_id, 0444, show_port_id, NULL); static ssize_t show_port_no(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "0x%x\n", p->port_no); + return sysfs_emit(buf, "0x%x\n", p->port_no); } static BRPORT_ATTR(port_no, 0444, show_port_no, NULL); static ssize_t show_change_ack(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->topology_change_ack); + return sysfs_emit(buf, "%d\n", p->topology_change_ack); } static BRPORT_ATTR(change_ack, 0444, show_change_ack, NULL); static ssize_t show_config_pending(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->config_pending); + return sysfs_emit(buf, "%d\n", p->config_pending); } static BRPORT_ATTR(config_pending, 0444, show_config_pending, NULL); static ssize_t show_port_state(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->state); + return sysfs_emit(buf, "%d\n", p->state); } static BRPORT_ATTR(state, 0444, show_port_state, NULL); static ssize_t show_message_age_timer(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer)); + return sysfs_emit(buf, "%ld\n", br_timer_value(&p->message_age_timer)); } static BRPORT_ATTR(message_age_timer, 0444, show_message_age_timer, NULL); static ssize_t show_forward_delay_timer(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer)); + return sysfs_emit(buf, "%ld\n", br_timer_value(&p->forward_delay_timer)); } static BRPORT_ATTR(forward_delay_timer, 0444, show_forward_delay_timer, NULL); static ssize_t show_hold_timer(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer)); + return sysfs_emit(buf, "%ld\n", br_timer_value(&p->hold_timer)); } static BRPORT_ATTR(hold_timer, 0444, show_hold_timer, NULL); @@ -182,7 +182,7 @@ static 
BRPORT_ATTR(flush, 0200, NULL, store_flush); static ssize_t show_group_fwd_mask(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%#x\n", p->group_fwd_mask); + return sysfs_emit(buf, "%#x\n", p->group_fwd_mask); } static int store_group_fwd_mask(struct net_bridge_port *p, @@ -205,7 +205,7 @@ static ssize_t show_backup_port(struct net_bridge_port *p, char *buf) rcu_read_lock(); backup_p = rcu_dereference(p->backup_port); if (backup_p) - ret = sprintf(buf, "%s\n", backup_p->dev->name); + ret = sysfs_emit(buf, "%s\n", backup_p->dev->name); rcu_read_unlock(); return ret; @@ -244,7 +244,7 @@ BRPORT_ATTR_FLAG(isolated, BR_ISOLATED); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) { - return sprintf(buf, "%d\n", p->multicast_ctx.multicast_router); + return sysfs_emit(buf, "%d\n", p->multicast_ctx.multicast_router); } static int store_multicast_router(struct net_bridge_port *p, diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 6482de4d8750..58a33d0380b0 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -16,8 +16,7 @@ #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_bridge.h> -#include <linux/netfilter/nf_tables.h> -#include <net/netfilter/nf_tables.h> +#include <linux/netfilter_ipv4.h> #include "../br_private.h" @@ -230,7 +229,7 @@ static int nf_ct_br_ipv6_check(const struct sk_buff *skb) if (hdr->version != 6) return -1; - len = ntohs(hdr->payload_len) + sizeof(struct ipv6hdr) + nhoff; + len = ipv6_payload_len(skb, hdr) + sizeof(struct ipv6hdr) + nhoff; if (skb->len < len) return -1; @@ -270,7 +269,7 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) return NF_ACCEPT; - len = sizeof(struct ipv6hdr) + ntohs(ipv6_hdr(skb)->payload_len); + len = sizeof(struct ipv6hdr) + skb_ipv6_payload_len(skb); if (pskb_trim_rcsum(skb, len)) return NF_ACCEPT; diff --git a/net/can/Kconfig b/net/can/Kconfig index af64a6f76458..abbb4be7ad21 100644 --- a/net/can/Kconfig +++ b/net/can/Kconfig @@ -5,6 +5,7 @@ menuconfig CAN tristate "CAN bus subsystem support" + select SKB_EXTENSIONS help Controller Area Network (CAN) is a slow (up to 1Mbit/s) serial communications protocol. 
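The bridge sysfs hunks above convert sprintf() to sysfs_emit() across br_sysfs_br.c and br_sysfs_if.c. A small sketch of the pattern with a made-up attribute; sysfs_emit() clamps output to PAGE_SIZE and sanity-checks that buf is a sysfs page, which is why it is preferred over sprintf() in show() callbacks:

/* Sketch: a sysfs show() callback using sysfs_emit() (example attribute only). */
#include <linux/device.h>
#include <linux/sysfs.h>

static ssize_t example_show(struct device *d, struct device_attribute *attr,
                            char *buf)
{
        unsigned int value = 42;        /* stand-in for a bridge parameter */

        return sysfs_emit(buf, "%u\n", value);  /* bounded to PAGE_SIZE */
}
static DEVICE_ATTR_RO(example);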
Development of the CAN bus started in diff --git a/net/can/af_can.c b/net/can/af_can.c index 770173d8db42..22c65a014861 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -641,6 +641,16 @@ static int can_rcv_filter(struct can_dev_rcv_lists *dev_rcv_lists, struct sk_buf return matches; } +void can_set_skb_uid(struct sk_buff *skb) +{ + /* create non-zero unique skb identifier together with *skb */ + while (!(skb->hash)) + skb->hash = atomic_inc_return(&skbcounter); + + skb->sw_hash = 1; +} +EXPORT_SYMBOL(can_set_skb_uid); + static void can_receive(struct sk_buff *skb, struct net_device *dev) { struct can_dev_rcv_lists *dev_rcv_lists; @@ -652,9 +662,7 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) atomic_long_inc(&pkg_stats->rx_frames); atomic_long_inc(&pkg_stats->rx_frames_delta); - /* create non-zero unique skb identifier together with *skb */ - while (!(can_skb_prv(skb)->skbcnt)) - can_skb_prv(skb)->skbcnt = atomic_inc_return(&skbcounter); + can_set_skb_uid(skb); rcu_read_lock(); @@ -679,7 +687,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) static int can_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_can_skb(skb))) { + if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || + !can_skb_ext_find(skb) || !can_is_can_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", dev->type, skb->len); @@ -694,7 +703,8 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canfd_skb(skb))) { + if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || + !can_skb_ext_find(skb) || !can_is_canfd_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", dev->type, skb->len); @@ -709,7 +719,8 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, static int canxl_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || !can_is_canxl_skb(skb))) { + if (unlikely(dev->type != ARPHRD_CAN || !can_get_ml_priv(dev) || + !can_skb_ext_find(skb) || !can_is_canxl_skb(skb))) { pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n", dev->type, skb->len); diff --git a/net/can/bcm.c b/net/can/bcm.c index 7eba8ae01a5b..b7324e9c955b 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -59,6 +59,7 @@ #include <linux/can/bcm.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <net/can.h> #include <net/sock.h> #include <net/net_namespace.h> @@ -291,6 +292,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) static void bcm_can_tx(struct bcm_op *op) { struct sk_buff *skb; + struct can_skb_ext *csx; struct net_device *dev; struct canfd_frame *cf; int err; @@ -310,13 +312,17 @@ static void bcm_can_tx(struct bcm_op *op) return; } - skb = alloc_skb(op->cfsiz + sizeof(struct can_skb_priv), gfp_any()); + skb = alloc_skb(op->cfsiz, gfp_any()); if (!skb) goto out; - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + goto out; + } + + csx->can_iif = dev->ifindex; skb_put_data(skb, cf, 
op->cfsiz); @@ -1318,6 +1324,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, int cfsiz) { struct sk_buff *skb; + struct can_skb_ext *csx; struct net_device *dev; int err; @@ -1325,11 +1332,15 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, if (!ifindex) return -ENODEV; - skb = alloc_skb(cfsiz + sizeof(struct can_skb_priv), GFP_KERNEL); + skb = alloc_skb(cfsiz, GFP_KERNEL); if (!skb) return -ENOMEM; - can_skb_reserve(skb); + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + return -ENOMEM; + } err = memcpy_from_msg(skb_put(skb, cfsiz), msg, cfsiz); if (err < 0) { @@ -1343,8 +1354,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, return -ENODEV; } - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx->can_iif = dev->ifindex; skb->dev = dev; can_skb_set_owner(skb, sk); err = can_send(skb, 1); /* send with loopback */ diff --git a/net/can/gw.c b/net/can/gw.c index 55eccb1c7620..61a1e6b1b83f 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -55,6 +55,7 @@ #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/gw.h> +#include <net/can.h> #include <net/rtnetlink.h> #include <net/net_namespace.h> #include <net/sock.h> @@ -70,8 +71,8 @@ MODULE_ALIAS(CAN_GW_NAME); #define CGW_MAX_HOPS 6 #define CGW_DEFAULT_HOPS 1 -static unsigned int max_hops __read_mostly = CGW_DEFAULT_HOPS; -module_param(max_hops, uint, 0444); +static unsigned char max_hops __read_mostly = CGW_DEFAULT_HOPS; +module_param(max_hops, byte, 0444); MODULE_PARM_DESC(max_hops, "maximum " CAN_GW_NAME " routing hops for CAN frames " "(valid values: " __stringify(CGW_MIN_HOPS) "-" @@ -459,6 +460,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) struct cgw_job *gwj = (struct cgw_job *)data; struct canfd_frame *cf; struct sk_buff *nskb; + struct can_skb_ext *csx, *ncsx; struct cf_mod *mod; int modidx = 0; @@ -471,22 +473,15 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) return; } + csx = can_skb_ext_find(skb); + if (!csx) + return; + /* Do not handle CAN frames routed more than 'max_hops' times. * In general we should never catch this delimiter which is intended * to cover a misconfiguration protection (e.g. circular CAN routes). - * - * The Controller Area Network controllers only accept CAN frames with - * correct CRCs - which are not visible in the controller registers. - * According to skbuff.h documentation the csum_start element for IP - * checksums is undefined/unused when ip_summed == CHECKSUM_UNNECESSARY. - * Only CAN skbs can be processed here which already have this property. */ - -#define cgw_hops(skb) ((skb)->csum_start) - - BUG_ON(skb->ip_summed != CHECKSUM_UNNECESSARY); - - if (cgw_hops(skb) >= max_hops) { + if (csx->can_gw_hops >= max_hops) { /* indicate deleted frames due to misconfiguration */ gwj->deleted_frames++; return; @@ -499,7 +494,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) /* is sending the skb back to the incoming interface not allowed? */ if (!(gwj->flags & CGW_FLAGS_CAN_IIF_TX_OK) && - can_skb_prv(skb)->ifindex == gwj->dst.dev->ifindex) + csx->can_iif == gwj->dst.dev->ifindex) return; /* clone the given skb, which has not been done in can_rcv() @@ -518,12 +513,23 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) return; } + /* the cloned/copied nskb points to the skb extension of the original + * skb with an increased refcount. 
skb_ext_add() creates a copy to + * separate the skb extension data to modify the can_gw_hops. + */ + ncsx = skb_ext_add(nskb, SKB_EXT_CAN); + if (!ncsx) { + kfree_skb(nskb); + gwj->dropped_frames++; + return; + } + /* put the incremented hop counter in the cloned skb */ - cgw_hops(nskb) = cgw_hops(skb) + 1; + ncsx->can_gw_hops = csx->can_gw_hops + 1; /* first processing of this CAN frame -> adjust to private hop limit */ - if (gwj->limit_hops && cgw_hops(nskb) == 1) - cgw_hops(nskb) = max_hops - gwj->limit_hops + 1; + if (gwj->limit_hops && ncsx->can_gw_hops == 1) + ncsx->can_gw_hops = max_hops - gwj->limit_hops + 1; nskb->dev = gwj->dst.dev; diff --git a/net/can/isotp.c b/net/can/isotp.c index ce588b85665a..da3b72e7afcc 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -69,6 +69,7 @@ #include <linux/can/skb.h> #include <linux/can/isotp.h> #include <linux/slab.h> +#include <net/can.h> #include <net/sock.h> #include <net/net_namespace.h> @@ -214,24 +215,28 @@ static int isotp_send_fc(struct sock *sk, int ae, u8 flowstatus) { struct net_device *dev; struct sk_buff *nskb; + struct can_skb_ext *csx; struct canfd_frame *ncf; struct isotp_sock *so = isotp_sk(sk); int can_send_ret; - nskb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), gfp_any()); + nskb = alloc_skb(so->ll.mtu, gfp_any()); if (!nskb) return 1; + csx = can_skb_ext_add(nskb); + if (!csx) { + kfree_skb(nskb); + return 1; + } + dev = dev_get_by_index(sock_net(sk), so->ifindex); if (!dev) { kfree_skb(nskb); return 1; } - can_skb_reserve(nskb); - can_skb_prv(nskb)->ifindex = dev->ifindex; - can_skb_prv(nskb)->skbcnt = 0; - + csx->can_iif = dev->ifindex; nskb->dev = dev; can_skb_set_owner(nskb, sk); ncf = (struct canfd_frame *)nskb->data; @@ -763,6 +768,7 @@ static void isotp_send_cframe(struct isotp_sock *so) { struct sock *sk = &so->sk; struct sk_buff *skb; + struct can_skb_ext *csx; struct net_device *dev; struct canfd_frame *cf; int can_send_ret; @@ -772,15 +778,20 @@ static void isotp_send_cframe(struct isotp_sock *so) if (!dev) return; - skb = alloc_skb(so->ll.mtu + sizeof(struct can_skb_priv), GFP_ATOMIC); + skb = alloc_skb(so->ll.mtu, GFP_ATOMIC); if (!skb) { dev_put(dev); return; } - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + netdev_put(dev, NULL); + return; + } + + csx->can_iif = dev->ifindex; cf = (struct canfd_frame *)skb->data; skb_put_zero(skb, so->ll.mtu); @@ -940,6 +951,7 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct sock *sk = sock->sk; struct isotp_sock *so = isotp_sk(sk); struct sk_buff *skb; + struct can_skb_ext *csx; struct net_device *dev; struct canfd_frame *cf; int ae = (so->opt.flags & CAN_ISOTP_EXTEND_ADDR) ? 
1 : 0; @@ -1000,16 +1012,22 @@ static int isotp_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) goto err_out_drop; } - skb = sock_alloc_send_skb(sk, so->ll.mtu + sizeof(struct can_skb_priv), - msg->msg_flags & MSG_DONTWAIT, &err); + skb = sock_alloc_send_skb(sk, so->ll.mtu, msg->msg_flags & MSG_DONTWAIT, + &err); if (!skb) { dev_put(dev); goto err_out_drop; } - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + netdev_put(dev, NULL); + err = -ENOMEM; + goto err_out_drop; + } + + csx->can_iif = dev->ifindex; so->tx.len = size; so->tx.idx = 0; diff --git a/net/can/j1939/socket.c b/net/can/j1939/socket.c index ff9c4fd7b433..0502b030d238 100644 --- a/net/can/j1939/socket.c +++ b/net/can/j1939/socket.c @@ -17,6 +17,7 @@ #include <linux/can/skb.h> #include <linux/errqueue.h> #include <linux/if_arp.h> +#include <net/can.h> #include "j1939-priv.h" @@ -884,20 +885,25 @@ static struct sk_buff *j1939_sk_alloc_skb(struct net_device *ndev, struct j1939_sock *jsk = j1939_sk(sk); struct j1939_sk_buff_cb *skcb; struct sk_buff *skb; + struct can_skb_ext *csx; int ret; skb = sock_alloc_send_skb(sk, size + sizeof(struct can_frame) - - sizeof(((struct can_frame *)NULL)->data) + - sizeof(struct can_skb_priv), + sizeof(((struct can_frame *)NULL)->data), msg->msg_flags & MSG_DONTWAIT, &ret); if (!skb) goto failure; - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = ndev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + ret = -ENOMEM; + goto failure; + } + + csx->can_iif = ndev->ifindex; skb_reserve(skb, offsetof(struct can_frame, data)); ret = memcpy_from_msg(skb_put(skb, size), msg, size); diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index 8656ab388c83..2cbe94fc487a 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -9,6 +9,7 @@ // Oleksij Rempel <kernel@pengutronix.de> #include <linux/can/skb.h> +#include <net/can.h> #include "j1939-priv.h" @@ -591,17 +592,21 @@ sk_buff *j1939_tp_tx_dat_new(struct j1939_priv *priv, bool swap_src_dst) { struct sk_buff *skb; + struct can_skb_ext *csx; struct j1939_sk_buff_cb *skcb; - skb = alloc_skb(sizeof(struct can_frame) + sizeof(struct can_skb_priv), - GFP_ATOMIC); + skb = alloc_skb(sizeof(struct can_frame), GFP_ATOMIC); if (unlikely(!skb)) return ERR_PTR(-ENOMEM); + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + return ERR_PTR(-ENOMEM); + } + skb->dev = priv->ndev; - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = priv->ndev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx->can_iif = priv->ndev->ifindex; /* reserve CAN header */ skb_reserve(skb, offsetof(struct can_frame, data)); @@ -1052,6 +1057,17 @@ static int j1939_simple_txnext(struct j1939_session *session) goto out_free; } + /* the cloned skb points to the skb extension of the original se_skb + * with an increased refcount. skb_ext_add() creates a copy to + * separate the skb extension data which is needed to modify the + * can_framelen in can_put_echo_skb(). 
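The comment above describes the copy-on-write rule that both can_can_gw_rcv() and j1939_simple_txnext() rely on: a cloned skb shares the original's extension area, so any path that wants to modify per-skb CAN state must detach its own copy via skb_ext_add() first. A minimal sketch of that pattern, assuming the SKB_EXT_CAN id and struct can_skb_ext added by this series (the helper name is invented):

#include <linux/skbuff.h>
#include <net/can.h>

static struct sk_buff *clone_with_private_can_ext(struct sk_buff *skb)
{
	struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
	struct can_skb_ext *csx;

	if (!nskb)
		return NULL;

	/* The clone shares the original's refcounted extension area;
	 * skb_ext_add() detaches a private copy, so fields such as
	 * can_gw_hops can be changed without touching the original skb.
	 */
	csx = skb_ext_add(nskb, SKB_EXT_CAN);
	if (!csx) {
		kfree_skb(nskb);
		return NULL;
	}
	csx->can_gw_hops++;
	return nskb;
}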
+ */ + if (!skb_ext_add(skb, SKB_EXT_CAN)) { + kfree_skb(skb); + ret = -ENOMEM; + goto out_free; + } + can_skb_set_owner(skb, se_skb->sk); j1939_tp_set_rxtimeout(session, J1939_SIMPLE_ECHO_TIMEOUT_MS); @@ -1526,17 +1542,22 @@ j1939_session *j1939_session_fresh_new(struct j1939_priv *priv, const struct j1939_sk_buff_cb *rel_skcb) { struct sk_buff *skb; + struct can_skb_ext *csx; struct j1939_sk_buff_cb *skcb; struct j1939_session *session; - skb = alloc_skb(size + sizeof(struct can_skb_priv), GFP_ATOMIC); + skb = alloc_skb(size, GFP_ATOMIC); if (unlikely(!skb)) return NULL; + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + return NULL; + } + skb->dev = priv->ndev; - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = priv->ndev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx->can_iif = priv->ndev->ifindex; skcb = j1939_skb_to_cb(skb); memcpy(skcb, rel_skcb, sizeof(*skcb)); diff --git a/net/can/raw.c b/net/can/raw.c index 12293363413c..eee244ffc31e 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -53,6 +53,7 @@ #include <linux/can/core.h> #include <linux/can/skb.h> #include <linux/can/raw.h> +#include <net/can.h> #include <net/sock.h> #include <net/net_namespace.h> @@ -76,7 +77,7 @@ MODULE_ALIAS("can-proto-1"); struct uniqframe { const struct sk_buff *skb; - int skbcnt; + u32 hash; unsigned int join_rx_count; }; @@ -164,7 +165,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data) /* eliminate multiple filter matches for the same skb */ if (this_cpu_ptr(ro->uniq)->skb == oskb && - this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) { + this_cpu_ptr(ro->uniq)->hash == oskb->hash) { if (!ro->join_filters) return; @@ -174,7 +175,7 @@ static void raw_rcv(struct sk_buff *oskb, void *data) return; } else { this_cpu_ptr(ro->uniq)->skb = oskb; - this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt; + this_cpu_ptr(ro->uniq)->hash = oskb->hash; this_cpu_ptr(ro->uniq)->join_rx_count = 1; /* drop first frame to check all enabled filters? */ if (ro->join_filters && ro->count > 1) @@ -918,6 +919,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct raw_sock *ro = raw_sk(sk); struct sockcm_cookie sockc; struct sk_buff *skb; + struct can_skb_ext *csx; struct net_device *dev; unsigned int txmtu; int ifindex; @@ -951,14 +953,19 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) goto put_dev; } - skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv), - msg->msg_flags & MSG_DONTWAIT, &err); + skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, + &err); if (!skb) goto put_dev; - can_skb_reserve(skb); - can_skb_prv(skb)->ifindex = dev->ifindex; - can_skb_prv(skb)->skbcnt = 0; + csx = can_skb_ext_add(skb); + if (!csx) { + kfree_skb(skb); + err = -ENOMEM; + goto put_dev; + } + + csx->can_iif = dev->ifindex; /* fill the skb before testing for valid CAN frames */ err = memcpy_from_msg(skb_put(skb, size), msg, size); diff --git a/net/core/Makefile b/net/core/Makefile index 9ef2099c5426..dc17c5a61e9a 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -3,7 +3,7 @@ # Makefile for the Linux networking core. 
# -obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ +obj-y := sock.o skbuff.o datagram.o stream.o scm.o \ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \ flow_dissector.o @@ -19,6 +19,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-y += hotdata.o +obj-y += netdev_config.o obj-y += netdev_rx_queue.o obj-y += netdev_queues.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o diff --git a/net/core/dev.c b/net/core/dev.c index ccef685023c2..ac6bcb2a0784 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -246,12 +246,11 @@ static inline void backlog_lock_irq_disable(struct softnet_data *sd) } static inline void backlog_unlock_irq_restore(struct softnet_data *sd, - unsigned long *flags) + unsigned long flags) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) - spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); - else - local_irq_restore(*flags); + spin_unlock(&sd->input_pkt_queue.lock); + local_irq_restore(flags); } static inline void backlog_unlock_irq_enable(struct softnet_data *sd) @@ -3803,7 +3802,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, inner_ip_hdr(skb) : ip_hdr(skb); if (!(iph->frag_off & htons(IP_DF))) - features &= ~NETIF_F_TSO_MANGLEID; + features &= ~dev->mangleid_features; } /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers, @@ -3814,8 +3813,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && vlan_get_protocol(skb) == htons(ETH_P_IPV6))) && skb_transport_header_was_set(skb) && - skb_network_header_len(skb) != sizeof(struct ipv6hdr) && - !ipv6_has_hopopt_jumbo(skb)) + skb_network_header_len(skb) != sizeof(struct ipv6hdr)) features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4); return features; @@ -3918,8 +3916,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && - skb_network_header_len(skb) != sizeof(struct ipv6hdr) && - !ipv6_has_hopopt_jumbo(skb)) + skb_network_header_len(skb) != sizeof(struct ipv6hdr)) goto sw_checksum; switch (skb->csum_offset) { @@ -5260,7 +5257,7 @@ void kick_defer_list_purge(unsigned int cpu) if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) __napi_schedule_irqoff(&sd->backlog); - backlog_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, flags); } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { smp_call_function_single_async(cpu, &sd->defer_csd); @@ -5347,14 +5344,14 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, } __skb_queue_tail(&sd->input_pkt_queue, skb); tail = rps_input_queue_tail_incr(sd); - backlog_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, flags); /* save the tail outside of the critical section */ rps_input_queue_tail_save(qtail, tail); return NET_RX_SUCCESS; } - backlog_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, flags); cpu_backlog_drop: reason = SKB_DROP_REASON_CPU_BACKLOG; @@ -11386,6 +11383,9 @@ int register_netdevice(struct net_device *dev) if (dev->hw_enc_features & NETIF_F_TSO) dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; + /* TSO_MANGLEID belongs in mangleid_features by definition */ + dev->mangleid_features |= NETIF_F_TSO_MANGLEID; + /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. 
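The new per-device mangleid_features mask above replaces the hard-coded NETIF_F_TSO_MANGLEID clearing in gso_features_check(): register_netdevice() always seeds it with NETIF_F_TSO_MANGLEID, and a driver can add further feature bits that depend on IP-ID mangling so they are dropped together when the DF bit is not set. A hypothetical driver fragment, not taken from this patch, showing one plausible use:

static void mynic_set_features(struct net_device *dev)
{
	dev->features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO_MANGLEID |
			 NETIF_F_GSO_UDP_TUNNEL;

	/* Assumption for illustration only: this device's UDP tunnel TSO
	 * also rewrites IP IDs, so have gso_features_check() clear it
	 * alongside NETIF_F_TSO_MANGLEID whenever DF is not set.
	 * NETIF_F_TSO_MANGLEID itself is added to mangleid_features by
	 * register_netdevice().
	 */
	dev->mangleid_features |= NETIF_F_GSO_UDP_TUNNEL;
}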
*/ dev->vlan_features |= NETIF_F_HIGHDMA; diff --git a/net/core/dev.h b/net/core/dev.h index da18536cbd35..98793a738f43 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -10,6 +10,7 @@ struct net; struct netlink_ext_ack; +struct netdev_queue_config; struct cpumask; /* Random bits of netdevice that don't need to be exposed */ @@ -91,6 +92,10 @@ extern struct rw_semaphore dev_addr_sem; extern struct list_head net_todo_list; void netdev_run_todo(void); +int netdev_queue_config_validate(struct net_device *dev, int rxq_idx, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack); + /* netdev management, shared between various uAPI entry points */ struct netdev_name_node { struct hlist_node hlist; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 53a53357cfef..7a8966544c9d 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -287,7 +287,7 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) int err; if (!ops->ndo_hwtstamp_get) - return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */ + return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; @@ -414,7 +414,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) } if (!ops->ndo_hwtstamp_set) - return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */ + return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; @@ -438,48 +438,23 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) return 0; } -static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, - struct kernel_hwtstamp_config *kernel_cfg) -{ - struct ifreq ifrr; - int err; - - if (!kernel_cfg->ifr) - return -EINVAL; - - strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); - ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; - - err = dev_eth_ioctl(dev, &ifrr, cmd); - if (err) - return err; - - kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru; - kernel_cfg->copied_to_user = true; - - return 0; -} - int generic_hwtstamp_get_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg) { const struct net_device_ops *ops = dev->netdev_ops; + int err; if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_get) { - int err; - - netdev_lock_ops(dev); - err = dev_get_hwtstamp_phylib(dev, kernel_cfg); - netdev_unlock_ops(dev); + if (!ops->ndo_hwtstamp_get) + return -EOPNOTSUPP; - return err; - } + netdev_lock_ops(dev); + err = dev_get_hwtstamp_phylib(dev, kernel_cfg); + netdev_unlock_ops(dev); - /* Legacy path: unconverted lower driver */ - return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); + return err; } EXPORT_SYMBOL(generic_hwtstamp_get_lower); @@ -488,22 +463,19 @@ int generic_hwtstamp_set_lower(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; + int err; if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_set) { - int err; - - netdev_lock_ops(dev); - err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); - netdev_unlock_ops(dev); + if (!ops->ndo_hwtstamp_set) + return -EOPNOTSUPP; - return err; - } + netdev_lock_ops(dev); + err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + netdev_unlock_ops(dev); - /* Legacy path: unconverted lower driver */ - return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); + return err; } EXPORT_SYMBOL(generic_hwtstamp_set_lower); diff --git a/net/core/devmem.c b/net/core/devmem.c index ec4217d6c0b4..63f093f7d2b2 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -30,11 +30,6 @@ static 
DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); static const struct memory_provider_ops dmabuf_devmem_ops; -bool net_is_devmem_iov(struct net_iov *niov) -{ - return niov->type == NET_IOV_DMABUF; -} - static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, struct gen_pool_chunk *chunk, void *not_used) @@ -54,6 +49,15 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } +static void net_devmem_dmabuf_binding_release(struct percpu_ref *ref) +{ + struct net_devmem_dmabuf_binding *binding = + container_of(ref, struct net_devmem_dmabuf_binding, ref); + + INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); + schedule_work(&binding->unbind_w); +} + void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w); @@ -75,6 +79,7 @@ void __net_devmem_dmabuf_binding_free(struct work_struct *wq) dma_buf_detach(binding->dmabuf, binding->attachment); dma_buf_put(binding->dmabuf); xa_destroy(&binding->bound_rxqs); + percpu_ref_exit(&binding->ref); kvfree(binding->tx_vec); kfree(binding); } @@ -143,7 +148,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); } - net_devmem_dmabuf_binding_put(binding); + percpu_ref_kill(&binding->ref); } int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, @@ -209,7 +214,11 @@ net_devmem_bind_dmabuf(struct net_device *dev, binding->dev = dev; xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); - refcount_set(&binding->ref, 1); + err = percpu_ref_init(&binding->ref, + net_devmem_dmabuf_binding_release, + 0, GFP_KERNEL); + if (err < 0) + goto err_free_binding; mutex_init(&binding->lock); @@ -220,7 +229,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, if (IS_ERR(binding->attachment)) { err = PTR_ERR(binding->attachment); NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); - goto err_free_binding; + goto err_exit_ref; } binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, @@ -322,6 +331,8 @@ err_unmap: direction); err_detach: dma_buf_detach(dmabuf, binding->attachment); +err_exit_ref: + percpu_ref_exit(&binding->ref); err_free_binding: kfree(binding); err_put_dmabuf: diff --git a/net/core/devmem.h b/net/core/devmem.h index 0b43a648cd2e..1c5c18581fcb 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -41,7 +41,7 @@ struct net_devmem_dmabuf_binding { * retransmits) hold a reference to the binding until the skb holding * them is freed. */ - refcount_t ref; + struct percpu_ref ref; /* The list of bindings currently active. Used for netlink to notify us * of the user dropping the bind. 
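The binding's reference count above moves from refcount_t to a percpu_ref: gets become percpu_ref_tryget(), puts become percpu_ref_put(), net_devmem_unbind_dmabuf() uses percpu_ref_kill(), and the release callback schedules __net_devmem_dmabuf_binding_free(). A generic sketch of that lifetime using only the stock percpu_ref API (the my_binding names are invented; the real code defers the free to a workqueue):

#include <linux/percpu-refcount.h>
#include <linux/slab.h>

struct my_binding {
	struct percpu_ref ref;
};

static void my_binding_release(struct percpu_ref *ref)
{
	struct my_binding *b = container_of(ref, struct my_binding, ref);

	/* runs exactly once, after percpu_ref_kill() and the final put */
	percpu_ref_exit(&b->ref);
	kfree(b);
}

static struct my_binding *my_binding_create(void)
{
	struct my_binding *b = kzalloc(sizeof(*b), GFP_KERNEL);

	if (!b)
		return NULL;
	if (percpu_ref_init(&b->ref, my_binding_release, 0, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}
	return b;
}

/* fast path: if (percpu_ref_tryget(&b->ref)) { ... percpu_ref_put(&b->ref); }
 * teardown:  percpu_ref_kill(&b->ref); the release runs after the last put.
 */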
@@ -125,17 +125,13 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) static inline bool net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) { - return refcount_inc_not_zero(&binding->ref); + return percpu_ref_tryget(&binding->ref); } static inline void net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) { - if (!refcount_dec_and_test(&binding->ref)) - return; - - INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); - schedule_work(&binding->unbind_w); + percpu_ref_put(&binding->ref); } void net_devmem_get_net_iov(struct net_iov *niov); @@ -145,7 +141,7 @@ struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); -bool net_is_devmem_iov(struct net_iov *niov); + struct net_devmem_dmabuf_binding * net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id); struct net_iov * @@ -218,11 +214,6 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) return 0; } -static inline bool net_is_devmem_iov(struct net_iov *niov) -{ - return false; -} - static inline struct net_devmem_dmabuf_binding * net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) { diff --git a/net/core/gro.c b/net/core/gro.c index 482fa7d7f598..31d21de5b15a 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -115,8 +115,6 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || - (p->protocol == htons(ETH_P_IPV6) && - skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || p->encapsulation) return -E2BIG; } @@ -417,7 +415,7 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) { struct skb_shared_info *pinfo = skb_shinfo(skb); - BUG_ON(skb->end - skb->tail < grow); + DEBUG_NET_WARN_ON_ONCE(skb->end - skb->tail < grow); memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 96a3b1a93252..e0897eb41c8d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -51,9 +51,8 @@ do { \ #define PNEIGH_HASHMASK 0xF static void neigh_timer_handler(struct timer_list *t); -static void __neigh_notify(struct neighbour *n, int type, int flags, - u32 pid); -static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); +static void neigh_notify(struct neighbour *n, int type, int flags, u32 pid); +static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm); @@ -117,7 +116,7 @@ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) static void neigh_cleanup_and_release(struct neighbour *neigh) { trace_neigh_cleanup_and_release(neigh, 0); - __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); + neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); } @@ -1105,6 +1104,7 @@ static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; struct neighbour *neigh = timer_container_of(neigh, t, timer); + bool skip_probe = false; unsigned int state; int notify = 0; @@ -1172,9 +1172,15 @@ static void neigh_timer_handler(struct timer_list *t) neigh_invalidate(neigh); } notify = 1; - goto out; + skip_probe = true; } + if (notify) + __neigh_notify(neigh, RTM_NEWNEIGH, 0, 0); + + if (skip_probe) + goto out; + if (neigh->nud_state & NUD_IN_TIMER) { if (time_before(next, jiffies + 
HZ/100)) next = jiffies + HZ/100; @@ -1189,7 +1195,7 @@ out: } if (notify) - neigh_update_notify(neigh, 0); + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); trace_neigh_timer_handler(neigh, 0); @@ -1303,6 +1309,47 @@ static void neigh_update_hhs(struct neighbour *neigh) } } +static void neigh_update_process_arp_queue(struct neighbour *neigh) + __releases(neigh->lock) + __acquires(neigh->lock) +{ + struct sk_buff *skb; + + /* Again: avoid deadlock if something went wrong. */ + while (neigh->nud_state & NUD_VALID && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + struct dst_entry *dst = skb_dst(skb); + struct neighbour *n2, *n1 = neigh; + + write_unlock_bh(&neigh->lock); + + rcu_read_lock(); + + /* Why not just use 'neigh' as-is? The problem is that + * things such as shaper, eql, and sch_teql can end up + * using alternative, different, neigh objects to output + * the packet in the output path. So what we need to do + * here is re-lookup the top-level neigh in the path so + * we can reinject the packet there. + */ + n2 = NULL; + if (dst && + READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { + n2 = dst_neigh_lookup_skb(dst, skb); + if (n2) + n1 = n2; + } + READ_ONCE(n1->output)(n1, skb); + if (n2) + neigh_release(n2); + rcu_read_unlock(); + + write_lock_bh(&neigh->lock); + } + __skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; +} + /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. @@ -1329,6 +1376,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, struct netlink_ext_ack *extack) { bool gc_update = false, managed_update = false; + bool process_arp_queue = false; int update_isrouter = 0; struct net_device *dev; int err, notify = 0; @@ -1462,53 +1510,30 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, neigh_connect(neigh); else neigh_suspect(neigh); - if (!(old & NUD_VALID)) { - struct sk_buff *skb; - /* Again: avoid dead loop if something went wrong */ + if (!(old & NUD_VALID)) + process_arp_queue = true; - while (neigh->nud_state & NUD_VALID && - (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { - struct dst_entry *dst = skb_dst(skb); - struct neighbour *n2, *n1 = neigh; - write_unlock_bh(&neigh->lock); - - rcu_read_lock(); - - /* Why not just use 'neigh' as-is? The problem is that - * things such as shaper, eql, and sch_teql can end up - * using alternative, different, neigh objects to output - * the packet in the output path. So what we need to do - * here is re-lookup the top-level neigh in the path so - * we can reinject the packet there. 
- */ - n2 = NULL; - if (dst && - READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { - n2 = dst_neigh_lookup_skb(dst, skb); - if (n2) - n1 = n2; - } - READ_ONCE(n1->output)(n1, skb); - if (n2) - neigh_release(n2); - rcu_read_unlock(); - - write_lock_bh(&neigh->lock); - } - __skb_queue_purge(&neigh->arp_queue); - neigh->arp_queue_len_bytes = 0; - } out: if (update_isrouter) neigh_update_is_router(neigh, flags, ¬ify); + + if (notify) + __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); + + if (process_arp_queue) + neigh_update_process_arp_queue(neigh); + write_unlock_bh(&neigh->lock); + if (((new ^ old) & NUD_PERMANENT) || gc_update) neigh_update_gc_list(neigh); if (managed_update) neigh_update_managed_list(neigh); + if (notify) - neigh_update_notify(neigh, nlmsg_pid); + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); + trace_neigh_update_done(neigh, err); return err; } @@ -2622,8 +2647,8 @@ out: return skb->len; } -static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, - u32 pid, u32 seq, int type, unsigned int flags) +static int __neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, + u32 pid, u32 seq, int type, unsigned int flags) { u32 neigh_flags, neigh_flags_ext; unsigned long now = jiffies; @@ -2649,23 +2674,19 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key)) goto nla_put_failure; - read_lock_bh(&neigh->lock); ndm->ndm_state = neigh->nud_state; if (neigh->nud_state & NUD_VALID) { char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, neigh, neigh->dev); - if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { - read_unlock_bh(&neigh->lock); + if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) goto nla_put_failure; - } } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1; - read_unlock_bh(&neigh->lock); if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) @@ -2684,6 +2705,20 @@ nla_put_failure: return -EMSGSIZE; } +static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, + u32 pid, u32 seq, int type, unsigned int flags) + __releases(neigh->lock) + __acquires(neigh->lock) +{ + int err; + + read_lock_bh(&neigh->lock); + err = __neigh_fill_info(skb, neigh, pid, seq, type, flags); + read_unlock_bh(&neigh->lock); + + return err; +} + static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) @@ -2727,12 +2762,6 @@ nla_put_failure: return -EMSGSIZE; } -static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid) -{ - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); - __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); -} - static bool neigh_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; @@ -3545,7 +3574,7 @@ static void __neigh_notify(struct neighbour *n, int type, int flags, if (skb == NULL) goto errout; - err = neigh_fill_info(skb, n, pid, 0, type, flags); + err = __neigh_fill_info(skb, n, pid, 0, type, flags); if (err < 0) { /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -3560,9 +3589,16 @@ out: rcu_read_unlock(); } +static void neigh_notify(struct neighbour *neigh, int type, int flags, u32 pid) +{ + read_lock_bh(&neigh->lock); + 
__neigh_notify(neigh, type, flags, pid); + read_unlock_bh(&neigh->lock); +} + void neigh_app_ns(struct neighbour *n) { - __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); + neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); } EXPORT_SYMBOL(neigh_app_ns); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a6e6a964a287..aef44e617361 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -624,9 +624,10 @@ void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) } EXPORT_SYMBOL_GPL(net_ns_get_ownership); -static void unhash_nsid(struct net *net, struct net *last) +static void unhash_nsid(struct net *last) { - struct net *tmp; + struct net *tmp, *peer; + /* This function is only called from cleanup_net() work, * and this work is the only process, that may delete * a net from net_namespace_list. So, when the below @@ -634,22 +635,26 @@ static void unhash_nsid(struct net *net, struct net *last) * use for_each_net_rcu() or net_rwsem. */ for_each_net(tmp) { - int id; + int id = 0; spin_lock(&tmp->nsid_lock); - id = __peernet2id(tmp, net); - if (id >= 0) - idr_remove(&tmp->netns_ids, id); - spin_unlock(&tmp->nsid_lock); - if (id >= 0) - rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, + while ((peer = idr_get_next(&tmp->netns_ids, &id))) { + int curr_id = id; + + id++; + if (!peer->is_dying) + continue; + + idr_remove(&tmp->netns_ids, curr_id); + spin_unlock(&tmp->nsid_lock); + rtnl_net_notifyid(tmp, RTM_DELNSID, curr_id, 0, NULL, GFP_KERNEL); + spin_lock(&tmp->nsid_lock); + } + spin_unlock(&tmp->nsid_lock); if (tmp == last) break; } - spin_lock(&net->nsid_lock); - idr_destroy(&net->netns_ids); - spin_unlock(&net->nsid_lock); } static LLIST_HEAD(cleanup_list); @@ -674,6 +679,7 @@ static void cleanup_net(struct work_struct *work) llist_for_each_entry(net, net_kill_list, cleanup_list) { ns_tree_remove(net); list_del_rcu(&net->list); + net->is_dying = true; } /* Cache last net. 
After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer @@ -688,8 +694,10 @@ static void cleanup_net(struct work_struct *work) last = list_last_entry(&net_namespace_list, struct net, list); up_write(&net_rwsem); + unhash_nsid(last); + llist_for_each_entry(net, net_kill_list, cleanup_list) { - unhash_nsid(net, last); + idr_destroy(&net->netns_ids); list_add_tail(&net->exit_list, &net_exit_list); } diff --git a/net/core/netdev_config.c b/net/core/netdev_config.c new file mode 100644 index 000000000000..f14af365d5cd --- /dev/null +++ b/net/core/netdev_config.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/netdevice.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> + +#include "dev.h" + +static int netdev_nop_validate_qcfg(struct net_device *dev, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static int __netdev_queue_config(struct net_device *dev, int rxq_idx, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack, + bool validate) +{ + int (*validate_cb)(struct net_device *dev, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack); + struct pp_memory_provider_params *mpp; + int err; + + validate_cb = netdev_nop_validate_qcfg; + if (validate && dev->queue_mgmt_ops->ndo_validate_qcfg) + validate_cb = dev->queue_mgmt_ops->ndo_validate_qcfg; + + memset(qcfg, 0, sizeof(*qcfg)); + + /* Get defaults from the driver, in case user config not set */ + if (dev->queue_mgmt_ops->ndo_default_qcfg) + dev->queue_mgmt_ops->ndo_default_qcfg(dev, qcfg); + err = validate_cb(dev, qcfg, extack); + if (err) + return err; + + /* Apply MP overrides */ + mpp = &__netif_get_rx_queue(dev, rxq_idx)->mp_params; + if (mpp->rx_page_size) + qcfg->rx_page_size = mpp->rx_page_size; + err = validate_cb(dev, qcfg, extack); + if (err) + return err; + + return 0; +} + +/** + * netdev_queue_config() - get configuration for a given queue + * @dev: net_device instance + * @rxq_idx: index of the queue of interest + * @qcfg: queue configuration struct (output) + * + * Render the configuration for a given queue. This helper should be used + * by drivers which support queue configuration to retrieve config for + * a particular queue. + * + * @qcfg is an output parameter and is always fully initialized by this + * function. Some values may not be set by the user, drivers may either + * deal with the "unset" values in @qcfg, or provide the callback + * to populate defaults in queue_management_ops. 
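For driver authors the contract above means: defaults come from ndo_default_qcfg(), memory-provider and user overrides are layered on top, and ndo_validate_qcfg() is consulted after each step. A rough sketch of how a driver might hook in, assuming the callback names and the QCFG_RX_PAGE_SIZE bit from this series (everything prefixed mynic_ is invented):

#include <linux/log2.h>
#include <net/netdev_queues.h>

static void mynic_default_qcfg(struct net_device *dev,
			       struct netdev_queue_config *qcfg)
{
	/* value used when neither the user nor a memory provider set one */
	qcfg->rx_page_size = PAGE_SIZE;
}

static int mynic_validate_qcfg(struct net_device *dev,
			       struct netdev_queue_config *qcfg,
			       struct netlink_ext_ack *extack)
{
	if (qcfg->rx_page_size && !is_power_of_2(qcfg->rx_page_size)) {
		NL_SET_ERR_MSG(extack, "rx_page_size must be a power of two");
		return -EINVAL;
	}
	return 0;
}

static const struct netdev_queue_mgmt_ops mynic_queue_mgmt_ops = {
	.supported_params	= QCFG_RX_PAGE_SIZE,
	.ndo_default_qcfg	= mynic_default_qcfg,
	.ndo_validate_qcfg	= mynic_validate_qcfg,
	/* plus the existing ndo_queue_mem_size/alloc/free and
	 * ndo_queue_start/stop callbacks, which now also receive the
	 * rendered struct netdev_queue_config
	 */
};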
+ */ +void netdev_queue_config(struct net_device *dev, int rxq_idx, + struct netdev_queue_config *qcfg) +{ + __netdev_queue_config(dev, rxq_idx, qcfg, NULL, false); +} +EXPORT_SYMBOL(netdev_queue_config); + +int netdev_queue_config_validate(struct net_device *dev, int rxq_idx, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack) +{ + return __netdev_queue_config(dev, rxq_idx, qcfg, extack, true); +} diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index c7d9341b7630..668a90658f25 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -7,6 +7,7 @@ #include <net/netdev_rx_queue.h> #include <net/page_pool/memory_provider.h> +#include "dev.h" #include "page_pool_priv.h" /* See also page_pool_is_unreadable() */ @@ -18,7 +19,10 @@ bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) } EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); -int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) +static int netdev_rx_queue_reconfig(struct net_device *dev, + unsigned int rxq_idx, + struct netdev_queue_config *qcfg_old, + struct netdev_queue_config *qcfg_new) { struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; @@ -41,7 +45,7 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) goto err_free_new_mem; } - err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); + err = qops->ndo_queue_mem_alloc(dev, qcfg_new, new_mem, rxq_idx); if (err) goto err_free_old_mem; @@ -54,7 +58,7 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) if (err) goto err_free_new_queue_mem; - err = qops->ndo_queue_start(dev, new_mem, rxq_idx); + err = qops->ndo_queue_start(dev, qcfg_new, new_mem, rxq_idx); if (err) goto err_start_queue; } else { @@ -76,7 +80,7 @@ err_start_queue: * WARN if we fail to recover the old rx queue, and at least free * old_mem so we don't also leak that. */ - if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) { + if (qops->ndo_queue_start(dev, qcfg_old, old_mem, rxq_idx)) { WARN(1, "Failed to restart old queue in error path. 
RX queue %d may be unhealthy.", rxq_idx); @@ -94,12 +98,22 @@ err_free_new_mem: return err; } + +int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) +{ + struct netdev_queue_config qcfg; + + netdev_queue_config(dev, rxq_idx, &qcfg); + return netdev_rx_queue_reconfig(dev, rxq_idx, &qcfg, &qcfg); +} EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack) { + const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; + struct netdev_queue_config qcfg[2]; struct netdev_rx_queue *rxq; int ret; @@ -124,6 +138,10 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached"); return -EEXIST; } + if (p->rx_page_size && !(qops->supported_params & QCFG_RX_PAGE_SIZE)) { + NL_SET_ERR_MSG(extack, "device does not support: rx_page_size"); + return -EOPNOTSUPP; + } rxq = __netif_get_rx_queue(dev, rxq_idx); if (rxq->mp_params.mp_ops) { @@ -137,12 +155,20 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, } #endif + netdev_queue_config(dev, rxq_idx, &qcfg[0]); rxq->mp_params = *p; - ret = netdev_rx_queue_restart(dev, rxq_idx); - if (ret) { - rxq->mp_params.mp_ops = NULL; - rxq->mp_params.mp_priv = NULL; - } + ret = netdev_queue_config_validate(dev, rxq_idx, &qcfg[1], extack); + if (ret) + goto err_clear_mp; + + ret = netdev_rx_queue_reconfig(dev, rxq_idx, &qcfg[0], &qcfg[1]); + if (ret) + goto err_clear_mp; + + return 0; + +err_clear_mp: + memset(&rxq->mp_params, 0, sizeof(rxq->mp_params)); return ret; } @@ -160,6 +186,7 @@ int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, const struct pp_memory_provider_params *old_p) { + struct netdev_queue_config qcfg[2]; struct netdev_rx_queue *rxq; int err; @@ -179,9 +206,11 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, rxq->mp_params.mp_priv != old_p->mp_priv)) return; - rxq->mp_params.mp_ops = NULL; - rxq->mp_params.mp_priv = NULL; - err = netdev_rx_queue_restart(dev, ifq_idx); + netdev_queue_config(dev, ifq_idx, &qcfg[0]); + memset(&rxq->mp_params, 0, sizeof(rxq->mp_params)); + netdev_queue_config(dev, ifq_idx, &qcfg[1]); + + err = netdev_rx_queue_reconfig(dev, ifq_idx, &qcfg[0], &qcfg[1]); WARN_ON(err && err != -ENETDOWN); } diff --git a/net/core/request_sock.c b/net/core/request_sock.c deleted file mode 100644 index 897a8f01a67b..000000000000 --- a/net/core/request_sock.c +++ /dev/null @@ -1,127 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * NET Generic infrastructure for Network protocols. - * - * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> - * - * From code originally in include/net/tcp.h - */ - -#include <linux/module.h> -#include <linux/random.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/tcp.h> -#include <linux/vmalloc.h> - -#include <net/request_sock.h> - -/* - * Maximum number of SYN_RECV sockets in queue per LISTEN socket. - * One SYN_RECV socket costs about 80bytes on a 32bit machine. - * It would be better to replace it with a global counter for all sockets - * but then some measure against one socket starving all other sockets - * would be needed. - * - * The minimum value of it is 128. Experiments with real servers show that - * it is absolutely not enough even at 100conn/sec. 
256 cures most - * of problems. - * This value is adjusted to 128 for low memory machines, - * and it will increase in proportion to the memory of machine. - * Note : Dont forget somaxconn that may limit backlog too. - */ - -void reqsk_queue_alloc(struct request_sock_queue *queue) -{ - queue->fastopenq.rskq_rst_head = NULL; - queue->fastopenq.rskq_rst_tail = NULL; - queue->fastopenq.qlen = 0; - - queue->rskq_accept_head = NULL; -} - -/* - * This function is called to set a Fast Open socket's "fastopen_rsk" field - * to NULL when a TFO socket no longer needs to access the request_sock. - * This happens only after 3WHS has been either completed or aborted (e.g., - * RST is received). - * - * Before TFO, a child socket is created only after 3WHS is completed, - * hence it never needs to access the request_sock. things get a lot more - * complex with TFO. A child socket, accepted or not, has to access its - * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, - * until 3WHS is either completed or aborted. Afterwards the req will stay - * until either the child socket is accepted, or in the rare case when the - * listener is closed before the child is accepted. - * - * In short, a request socket is only freed after BOTH 3WHS has completed - * (or aborted) and the child socket has been accepted (or listener closed). - * When a child socket is accepted, its corresponding req->sk is set to - * NULL since it's no longer needed. More importantly, "req->sk == NULL" - * will be used by the code below to determine if a child socket has been - * accepted or not, and the check is protected by the fastopenq->lock - * described below. - * - * Note that fastopen_rsk is only accessed from the child socket's context - * with its socket lock held. But a request_sock (req) can be accessed by - * both its child socket through fastopen_rsk, and a listener socket through - * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin - * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. - * only in the rare case when both the listener and the child locks are held, - * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. - * The lock also protects other fields such as fastopenq->qlen, which is - * decremented by this function when fastopen_rsk is no longer needed. - * - * Note that another solution was to simply use the existing socket lock - * from the listener. But first socket lock is difficult to use. It is not - * a simple spin lock - one must consider sock_owned_by_user() and arrange - * to use sk_add_backlog() stuff. But what really makes it infeasible is the - * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to - * acquire a child's lock while holding listener's socket lock. - * - * This function also sets "treq->tfo_listener" to false. - * treq->tfo_listener is used by the listener so it is protected by the - * fastopenq->lock in this function. 
- */ -void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, - bool reset) -{ - struct sock *lsk = req->rsk_listener; - struct fastopen_queue *fastopenq; - - fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; - - RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); - spin_lock_bh(&fastopenq->lock); - fastopenq->qlen--; - tcp_rsk(req)->tfo_listener = false; - if (req->sk) /* the child socket hasn't been accepted yet */ - goto out; - - if (!reset || lsk->sk_state != TCP_LISTEN) { - /* If the listener has been closed don't bother with the - * special RST handling below. - */ - spin_unlock_bh(&fastopenq->lock); - reqsk_put(req); - return; - } - /* Wait for 60secs before removing a req that has triggered RST. - * This is a simple defense against TFO spoofing attack - by - * counting the req against fastopen.max_qlen, and disabling - * TFO when the qlen exceeds max_qlen. - * - * For more details see CoNext'11 "TCP Fast Open" paper. - */ - req->rsk_timer.expires = jiffies + 60*HZ; - if (fastopenq->rskq_rst_head == NULL) - fastopenq->rskq_rst_head = req; - else - fastopenq->rskq_rst_tail->dl_next = req; - - req->dl_next = NULL; - fastopenq->rskq_rst_tail = req; - fastopenq->qlen++; -out: - spin_unlock_bh(&fastopenq->lock); -} diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 61746c2b95f6..699c401a5eae 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -78,6 +78,7 @@ #include <net/mpls.h> #include <net/mptcp.h> #include <net/mctp.h> +#include <net/can.h> #include <net/page_pool/helpers.h> #include <net/psp/types.h> #include <net/dropreason.h> @@ -280,7 +281,7 @@ EXPORT_SYMBOL(__netdev_alloc_frag_align); */ static u32 skbuff_cache_size __read_mostly; -static struct sk_buff *napi_skb_cache_get(bool alloc) +static inline struct sk_buff *napi_skb_cache_get(bool alloc) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; @@ -307,6 +308,23 @@ static struct sk_buff *napi_skb_cache_get(bool alloc) return skb; } +/* + * Only clear those fields we need to clear, not those that we will + * actually initialise later. Hence, don't put any more fields after + * the tail pointer in struct sk_buff! + */ +static inline void skbuff_clear(struct sk_buff *skb) +{ + /* Replace memset(skb, 0, offsetof(struct sk_buff, tail)) + * with two smaller memset(), with a barrier() between them. + * This forces the compiler to inline both calls. 
+ */ + BUILD_BUG_ON(offsetof(struct sk_buff, tail) <= 128); + memset(skb, 0, 128); + barrier(); + memset((void *)skb + 128, 0, offsetof(struct sk_buff, tail) - 128); +} + /** * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache * @skbs: pointer to an at least @n-sized array to fill with skb pointers @@ -357,7 +375,7 @@ get: skbs[i] = nc->skb_cache[base + i]; kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size); - memset(skbs[i], 0, offsetof(struct sk_buff, tail)); + skbuff_clear(skbs[i]); } nc->skb_count -= n; @@ -424,7 +442,7 @@ struct sk_buff *slab_build_skb(void *data) if (unlikely(!skb)) return NULL; - memset(skb, 0, offsetof(struct sk_buff, tail)); + skbuff_clear(skb); data = __slab_build_skb(data, &size); __finalize_skb_around(skb, data, size); @@ -476,7 +494,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) if (unlikely(!skb)) return NULL; - memset(skb, 0, offsetof(struct sk_buff, tail)); + skbuff_clear(skb); __build_skb_around(skb, data, frag_size); return skb; @@ -537,7 +555,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) if (unlikely(!skb)) return NULL; - memset(skb, 0, offsetof(struct sk_buff, tail)); + skbuff_clear(skb); __build_skb_around(skb, data, frag_size); return skb; @@ -566,6 +584,16 @@ struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(napi_build_skb); +static void *kmalloc_pfmemalloc(size_t obj_size, gfp_t flags, int node) +{ + if (!gfp_pfmemalloc_allowed(flags)) + return NULL; + if (!obj_size) + return kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, + flags, node); + return kmalloc_node_track_caller(obj_size, flags, node); +} + /* * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells * the caller if emergency pfmemalloc reserves are being used. 
If it is and @@ -574,9 +602,8 @@ EXPORT_SYMBOL(napi_build_skb); * memory is free */ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, - bool *pfmemalloc) + struct sk_buff *skb) { - bool ret_pfmemalloc = false; size_t obj_size; void *obj; @@ -587,12 +614,12 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); *size = SKB_SMALL_HEAD_CACHE_SIZE; - if (obj || !(gfp_pfmemalloc_allowed(flags))) + if (likely(obj)) goto out; /* Try again but now we are using pfmemalloc reserves */ - ret_pfmemalloc = true; - obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node); - goto out; + if (skb) + skb->pfmemalloc = true; + return kmalloc_pfmemalloc(0, flags, node); } obj_size = kmalloc_size_roundup(obj_size); @@ -608,17 +635,14 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, obj = kmalloc_node_track_caller(obj_size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); - if (obj || !(gfp_pfmemalloc_allowed(flags))) + if (likely(obj)) goto out; /* Try again but now we are using pfmemalloc reserves */ - ret_pfmemalloc = true; - obj = kmalloc_node_track_caller(obj_size, flags, node); - + if (skb) + skb->pfmemalloc = true; + obj = kmalloc_pfmemalloc(obj_size, flags, node); out: - if (pfmemalloc) - *pfmemalloc = ret_pfmemalloc; - return obj; } @@ -650,7 +674,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, { struct sk_buff *skb = NULL; struct kmem_cache *cache; - bool pfmemalloc; u8 *data; if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) @@ -680,37 +703,35 @@ fallback: if (unlikely(!skb)) return NULL; } - prefetchw(skb); + skbuff_clear(skb); /* We do our best to align skb_shared_info on a separate cache * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives * aligned memory blocks, unless SLUB/SLAB debug is enabled. * Both skb->head and skb_shared_info are cache line aligned. */ - data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc); + data = kmalloc_reserve(&size, gfp_mask, node, skb); if (unlikely(!data)) goto nodata; /* kmalloc_size_roundup() might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ - prefetchw(data + SKB_WITH_OVERHEAD(size)); - - /* - * Only clear those fields we need to clear, not those that we will - * actually initialise below. Hence, don't put any more fields after - * the tail pointer in struct sk_buff! - */ - memset(skb, 0, offsetof(struct sk_buff, tail)); - __build_skb_around(skb, data, size); - skb->pfmemalloc = pfmemalloc; + __finalize_skb_around(skb, data, size); if (flags & SKB_ALLOC_FCLONE) { struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); - skb->fclone = SKB_FCLONE_ORIG; + /* skb->fclone is a 2bits field. + * Replace expensive RMW (skb->fclone = SKB_FCLONE_ORIG) + * with a single OR. + */ + BUILD_BUG_ON(SKB_FCLONE_UNAVAILABLE != 0); + DEBUG_NET_WARN_ON_ONCE(skb->fclone != SKB_FCLONE_UNAVAILABLE); + skb->fclone |= SKB_FCLONE_ORIG; + refcount_set(&fclones->fclone_ref, 1); } @@ -1488,9 +1509,20 @@ void napi_skb_free_stolen_head(struct sk_buff *skb) napi_skb_cache_put(skb); } +/** + * napi_consume_skb() - consume skb in NAPI context, try to feed skb cache + * @skb: buffer to free + * @budget: NAPI budget + * + * Non-zero @budget must come from the @budget argument passed by the core + * to a NAPI poll function. 
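A hypothetical caller to make the budget rule concrete (the mynic_* names and types are invented): a driver's poll routine forwards its budget unchanged when freeing TX completions, and a zero budget degrades to dev_consume_skb_any() as noted below.

static int mynic_poll(struct napi_struct *napi, int budget)
{
	struct mynic_ring *ring = container_of(napi, struct mynic_ring, napi);
	struct sk_buff *skb;
	int work;

	/* TX completions: pass the NAPI budget straight through so that
	 * budget == 0 (e.g. netpoll) falls back to dev_consume_skb_any().
	 */
	while ((skb = mynic_reap_tx_completion(ring)) != NULL)
		napi_consume_skb(skb, budget);

	work = mynic_clean_rx(ring, budget);
	if (work < budget)
		napi_complete_done(napi, work);
	return work;
}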
Note that core may pass budget of 0 to NAPI poll + * for example when polling for netpoll / netconsole. + * + * Passing @budget of 0 is safe from any context, it turns this function + * into dev_consume_skb_any(). + */ void napi_consume_skb(struct sk_buff *skb, int budget) { - /* Zero budget indicate non-NAPI context called us, like netpoll */ if (unlikely(!budget || !skb)) { dev_consume_skb_any(skb); return; @@ -5108,6 +5140,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_INET_PSP) [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), #endif +#if IS_ENABLED(CONFIG_CAN) + [SKB_EXT_CAN] = SKB_EXT_CHUNKSIZEOF(struct can_skb_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -5123,7 +5158,7 @@ static __always_inline unsigned int skb_ext_total_length(void) static void skb_extensions_init(void) { - BUILD_BUG_ON(SKB_EXT_NUM >= 8); + BUILD_BUG_ON(SKB_EXT_NUM > 8); #if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL) BUILD_BUG_ON(skb_ext_total_length() > 255); #endif @@ -7392,31 +7427,56 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, } EXPORT_SYMBOL(csum_and_copy_from_iter_full); -void get_netmem(netmem_ref netmem) +void __get_netmem(netmem_ref netmem) { - struct net_iov *niov; + struct net_iov *niov = netmem_to_net_iov(netmem); - if (netmem_is_net_iov(netmem)) { - niov = netmem_to_net_iov(netmem); - if (net_is_devmem_iov(niov)) - net_devmem_get_net_iov(netmem_to_net_iov(netmem)); - return; - } - get_page(netmem_to_page(netmem)); + if (net_is_devmem_iov(niov)) + net_devmem_get_net_iov(netmem_to_net_iov(netmem)); } -EXPORT_SYMBOL(get_netmem); +EXPORT_SYMBOL(__get_netmem); -void put_netmem(netmem_ref netmem) +void __put_netmem(netmem_ref netmem) { - struct net_iov *niov; + struct net_iov *niov = netmem_to_net_iov(netmem); - if (netmem_is_net_iov(netmem)) { - niov = netmem_to_net_iov(netmem); - if (net_is_devmem_iov(niov)) - net_devmem_put_net_iov(netmem_to_net_iov(netmem)); - return; + if (net_is_devmem_iov(niov)) + net_devmem_put_net_iov(netmem_to_net_iov(netmem)); +} +EXPORT_SYMBOL(__put_netmem); + +struct vlan_type_depth __vlan_get_protocol_offset(const struct sk_buff *skb, + __be16 type, + int mac_offset) +{ + unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; + + /* if type is 802.1Q/AD then the header should already be + * present at mac_len - VLAN_HLEN (if mac_len > 0), or at + * ETH_HLEN otherwise + */ + if (vlan_depth) { + if (WARN_ON_ONCE(vlan_depth < VLAN_HLEN)) + return (struct vlan_type_depth) { 0 }; + vlan_depth -= VLAN_HLEN; + } else { + vlan_depth = ETH_HLEN; } + do { + struct vlan_hdr vhdr, *vh; + + vh = skb_header_pointer(skb, mac_offset + vlan_depth, + sizeof(vhdr), &vhdr); + if (unlikely(!vh || !--parse_depth)) + return (struct vlan_type_depth) { 0 }; - put_page(netmem_to_page(netmem)); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } while (eth_type_vlan(type)); + + return (struct vlan_type_depth) { + .type = type, + .depth = vlan_depth + }; } -EXPORT_SYMBOL(put_netmem); +EXPORT_SYMBOL(__vlan_get_protocol_offset); diff --git a/net/core/sock.c b/net/core/sock.c index a1c8b47b0d56..693e6d80f501 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4193,13 +4193,17 @@ int proto_register(struct proto *prot, int alloc_slab) return -EINVAL; } if (alloc_slab) { - prot->slab = kmem_cache_create_usercopy(prot->name, - prot->obj_size, 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | - prot->slab_flags, - prot->useroffset, prot->usersize, - NULL); + struct kmem_cache_args args = { + .useroffset = 
prot->useroffset, + .usersize = prot->usersize, + .freeptr_offset = prot->freeptr_offset, + .use_freeptr_offset = !!prot->freeptr_offset, + }; + prot->slab = kmem_cache_create(prot->name, prot->obj_size, + &args, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | + prot->slab_flags); if (prot->slab == NULL) { pr_crit("%s: Can't create sock SLAB cache!\n", prot->name); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 05dd55cf8b58..03aea10073f0 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -17,6 +17,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/sched/isolation.h> +#include <linux/hex.h> #include <net/ip.h> #include <net/sock.h> @@ -325,10 +326,16 @@ static int proc_do_dev_weight(const struct ctl_table *table, int write, static int proc_do_rss_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; + struct ctl_table fake_table; + char *pos = buf; + + for (int i = 0; i < NETDEV_RSS_KEY_LEN; i++) { + pos = hex_byte_pack(pos, netdev_rss_key[i]); + *pos++ = ':'; + } + *(--pos) = 0; - snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); fake_table.data = buf; fake_table.maxlen = sizeof(buf); return proc_dostring(&fake_table, write, buffer, lenp, ppos); diff --git a/net/devlink/core.c b/net/devlink/core.c index 58093f49c090..da56e2b8afc1 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -178,9 +178,7 @@ int devlink_rel_nested_in_add(u32 *rel_index, u32 devlink_index, * a notification of a change of this object should be sent * over netlink. The parent devlink instance lock needs to be * taken during the notification preparation. - * However, since the devlink lock of nested instance is held here, - * we would end with wrong devlink instance lock ordering and - * deadlock. Therefore the work is utilized to avoid that. + * Since the parent may or may not be locked, 'work' is utilized. 
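The __vlan_get_protocol_offset() helper added to net/core/skbuff.c above walks nested 802.1Q/802.1ad headers until it reaches a non-VLAN EtherType, giving up after VLAN_MAX_DEPTH tags and returning both the final type and the total header depth. A minimal standalone sketch of that walk over a raw Ethernet frame, illustrative only: the 0x8100/0x88a8 constants and the depth limit of 8 stand in for eth_type_vlan() and VLAN_MAX_DEPTH, and skb_header_pointer() bounds checking is reduced to a plain length test.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ETH_HLEN   14
#define VLAN_HLEN  4
#define MAX_DEPTH  8    /* assumed stand-in for VLAN_MAX_DEPTH */

/* Return the first non-VLAN EtherType in @frame and store the byte offset of
 * its payload (the total MAC header length) in *depth; return 0 if the frame
 * is truncated or nested too deeply.
 */
static uint16_t vlan_walk(const uint8_t *frame, size_t len, unsigned int *depth)
{
        unsigned int off = ETH_HLEN - 2;        /* outer EtherType */
        unsigned int budget = MAX_DEPTH;
        uint16_t type;

        do {
                if (off + 2 > len || !budget--)
                        return 0;
                memcpy(&type, frame + off, sizeof(type));
                type = ntohs(type);
                off += VLAN_HLEN;               /* step over TPID + TCI */
        } while (type == 0x8100 || type == 0x88a8);

        *depth = off - 2;
        return type;
}

int main(void)
{
        /* dst + src MACs, one 802.1Q tag (VID 5), then IPv4 */
        uint8_t frame[64] = { [12] = 0x81, 0x00, 0x00, 0x05, 0x08, 0x00 };
        unsigned int depth = 0;
        uint16_t type = vlan_walk(frame, sizeof(frame), &depth);

        printf("type=0x%04x depth=%u\n", type, depth);  /* 0x0800, 18 */
        return 0;
}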
*/ void devlink_rel_nested_in_notify(struct devlink *devlink) { @@ -477,7 +475,7 @@ void devlink_free(struct devlink *devlink) WARN_ON(!list_empty(&devlink->resource_list)); WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); - WARN_ON(!list_empty(&devlink->rate_list)); + WARN_ON(devlink_rates_check(devlink, NULL, NULL)); WARN_ON(!list_empty(&devlink->linecard_list)); WARN_ON(!xa_empty(&devlink->ports)); diff --git a/net/devlink/dev.c b/net/devlink/dev.c index 02602704bdea..e3a36de4f4ae 100644 --- a/net/devlink/dev.c +++ b/net/devlink/dev.c @@ -434,7 +434,7 @@ static void devlink_reload_reinit_sanity_check(struct devlink *devlink) WARN_ON(!list_empty(&devlink->trap_list)); WARN_ON(!list_empty(&devlink->dpipe_table_list)); WARN_ON(!list_empty(&devlink->sb_list)); - WARN_ON(!list_empty(&devlink->rate_list)); + WARN_ON(devlink_rates_check(devlink, NULL, NULL)); WARN_ON(!list_empty(&devlink->linecard_list)); WARN_ON(!xa_empty(&devlink->ports)); } @@ -713,10 +713,11 @@ int devlink_nl_eswitch_set_doit(struct sk_buff *skb, struct genl_info *info) if (info->attrs[DEVLINK_ATTR_ESWITCH_MODE]) { if (!ops->eswitch_mode_set) return -EOPNOTSUPP; - mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); - err = devlink_rate_nodes_check(devlink, mode, info->extack); + err = devlink_rates_check(devlink, devlink_rate_is_node, + info->extack); if (err) return err; + mode = nla_get_u16(info->attrs[DEVLINK_ATTR_ESWITCH_MODE]); err = ops->eswitch_mode_set(devlink, mode, info->extack); if (err) return err; diff --git a/net/devlink/devl_internal.h b/net/devlink/devl_internal.h index 14eaad9cfe35..1377864383bc 100644 --- a/net/devlink/devl_internal.h +++ b/net/devlink/devl_internal.h @@ -297,8 +297,10 @@ int devlink_resources_validate(struct devlink *devlink, struct genl_info *info); /* Rates */ -int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, - struct netlink_ext_ack *extack); +bool devlink_rate_is_node(const struct devlink_rate *devlink_rate); +int devlink_rates_check(struct devlink *devlink, + bool (*rate_filter)(const struct devlink_rate *), + struct netlink_ext_ack *extack); /* Linecards */ unsigned int devlink_linecard_index(struct devlink_linecard *linecard); diff --git a/net/devlink/rate.c b/net/devlink/rate.c index d157a8419bca..0d68b5c477dc 100644 --- a/net/devlink/rate.c +++ b/net/devlink/rate.c @@ -12,8 +12,7 @@ devlink_rate_is_leaf(struct devlink_rate *devlink_rate) return devlink_rate->type == DEVLINK_RATE_TYPE_LEAF; } -static inline bool -devlink_rate_is_node(struct devlink_rate *devlink_rate) +bool devlink_rate_is_node(const struct devlink_rate *devlink_rate) { return devlink_rate->type == DEVLINK_RATE_TYPE_NODE; } @@ -688,14 +687,16 @@ int devlink_nl_rate_del_doit(struct sk_buff *skb, struct genl_info *info) return err; } -int devlink_rate_nodes_check(struct devlink *devlink, u16 mode, - struct netlink_ext_ack *extack) +int devlink_rates_check(struct devlink *devlink, + bool (*rate_filter)(const struct devlink_rate *), + struct netlink_ext_ack *extack) { struct devlink_rate *devlink_rate; list_for_each_entry(devlink_rate, &devlink->rate_list, list) - if (devlink_rate_is_node(devlink_rate)) { - NL_SET_ERR_MSG(extack, "Rate node(s) exists."); + if (!rate_filter || rate_filter(devlink_rate)) { + if (extack) + NL_SET_ERR_MSG(extack, "Rate node(s) exists."); return -EBUSY; } return 0; diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index f86b30742122..5ed8c704636d 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -104,6 +104,13 @@ config 
NET_DSA_TAG_MTK Say Y or M if you want to enable support for tagging frames for Mediatek switches. +config NET_DSA_TAG_MXL_862XX + tristate "Tag driver for MaxLinear MxL862xx switches" + help + Say Y or M if you want to enable support for tagging frames for the + MaxLinear MxL86252 and MxL86282 switches using their native 8-byte + tagging protocol. + config NET_DSA_TAG_MXL_GSW1XX tristate "Tag driver for MaxLinear GSW1xx switches" help diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 42d173f5a701..bf7247759a64 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_NET_DSA_TAG_HELLCREEK) += tag_hellcreek.o obj-$(CONFIG_NET_DSA_TAG_KSZ) += tag_ksz.o obj-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o obj-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o +obj-$(CONFIG_NET_DSA_TAG_MXL_862XX) += tag_mxl862xx.o obj-$(CONFIG_NET_DSA_TAG_MXL_GSW1XX) += tag_mxl-gsw1xx.o obj-$(CONFIG_NET_DSA_TAG_NONE) += tag_none.o obj-$(CONFIG_NET_DSA_TAG_OCELOT) += tag_ocelot.o diff --git a/net/dsa/tag_mxl862xx.c b/net/dsa/tag_mxl862xx.c new file mode 100644 index 000000000000..01f215868271 --- /dev/null +++ b/net/dsa/tag_mxl862xx.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * DSA Special Tag for MaxLinear 862xx switch chips + * + * Copyright (C) 2025 Daniel Golle <daniel@makrotopia.org> + * Copyright (C) 2024 MaxLinear Inc. + */ + +#include <linux/bitops.h> +#include <linux/etherdevice.h> +#include <linux/skbuff.h> +#include <net/dsa.h> +#include "tag.h" + +#define MXL862_NAME "mxl862xx" + +#define MXL862_HEADER_LEN 8 + +/* Word 0 -> EtherType */ + +/* Word 2 */ +#define MXL862_SUBIF_ID GENMASK(4, 0) + +/* Word 3 */ +#define MXL862_IGP_EGP GENMASK(3, 0) + +static struct sk_buff *mxl862_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_port *dp = dsa_user_to_port(dev); + struct dsa_port *cpu_dp = dp->cpu_dp; + unsigned int cpu_port, sub_interface; + __be16 *mxl862_tag; + + cpu_port = cpu_dp->index; + + /* target port sub-interface ID relative to the CPU port */ + sub_interface = dp->index + 16 - cpu_port; + + /* provide additional space 'MXL862_HEADER_LEN' bytes */ + skb_push(skb, MXL862_HEADER_LEN); + + /* shift MAC address to the beginning of the enlarged buffer, + * releasing the space required for DSA tag (between MAC address and + * Ethertype) + */ + dsa_alloc_etype_header(skb, MXL862_HEADER_LEN); + + /* special tag ingress (from the perspective of the switch) */ + mxl862_tag = dsa_etype_header_pos_tx(skb); + mxl862_tag[0] = htons(ETH_P_MXLGSW); + mxl862_tag[1] = 0; + mxl862_tag[2] = htons(FIELD_PREP(MXL862_SUBIF_ID, sub_interface)); + mxl862_tag[3] = htons(FIELD_PREP(MXL862_IGP_EGP, cpu_port)); + + return skb; +} + +static struct sk_buff *mxl862_tag_rcv(struct sk_buff *skb, + struct net_device *dev) +{ + __be16 *mxl862_tag; + int port; + + if (unlikely(!pskb_may_pull(skb, MXL862_HEADER_LEN))) { + dev_warn_ratelimited(&dev->dev, "Cannot pull SKB, packet dropped\n"); + return NULL; + } + + mxl862_tag = dsa_etype_header_pos_rx(skb); + + if (unlikely(mxl862_tag[0] != htons(ETH_P_MXLGSW))) { + dev_warn_ratelimited(&dev->dev, + "Invalid special tag marker, packet dropped, tag: %8ph\n", + mxl862_tag); + return NULL; + } + + /* Get source port information */ + port = FIELD_GET(MXL862_IGP_EGP, ntohs(mxl862_tag[3])); + skb->dev = dsa_conduit_find_user(dev, 0, port); + if (unlikely(!skb->dev)) { + dev_warn_ratelimited(&dev->dev, + "Invalid source port, packet dropped, tag: %8ph\n", + mxl862_tag); + return NULL; + } + + /* remove the MxL862xx 
special tag between the MAC addresses and the + * current ethertype field. + */ + skb_pull_rcsum(skb, MXL862_HEADER_LEN); + dsa_strip_etype_header(skb, MXL862_HEADER_LEN); + + return skb; +} + +static const struct dsa_device_ops mxl862_netdev_ops = { + .name = MXL862_NAME, + .proto = DSA_TAG_PROTO_MXL862, + .xmit = mxl862_tag_xmit, + .rcv = mxl862_tag_rcv, + .needed_headroom = MXL862_HEADER_LEN, +}; + +MODULE_ALIAS_DSA_TAG_DRIVER(DSA_TAG_PROTO_MXL862, MXL862_NAME); +MODULE_DESCRIPTION("DSA tag driver for MaxLinear MxL862xx switches"); +MODULE_LICENSE("GPL"); + +module_dsa_tag_driver(mxl862_netdev_ops); diff --git a/net/dsa/tag_yt921x.c b/net/dsa/tag_yt921x.c index 6bbfd42dc5df..aefef8c770e3 100644 --- a/net/dsa/tag_yt921x.c +++ b/net/dsa/tag_yt921x.c @@ -14,11 +14,14 @@ * are conflicts somewhere and/or you want to change it for some reason. * Tag: * 2: VLAN Tag - * 2: Rx Port + * 2: * 15b: Rx Port Valid * 14b-11b: Rx Port - * 10b-0b: Cmd? - * 2: Tx Port(s) + * 10b-8b: Tx/Rx Priority + * 7b: Tx/Rx Code Valid + * 6b-1b: Tx/Rx Code + * 0b: ? (unset) + * 2: * 15b: Tx Port(s) Valid * 10b-0b: Tx Port(s) Mask */ @@ -33,18 +36,30 @@ #define YT921X_TAG_PORT_EN BIT(15) #define YT921X_TAG_RX_PORT_M GENMASK(14, 11) -#define YT921X_TAG_RX_CMD_M GENMASK(10, 0) -#define YT921X_TAG_RX_CMD(x) FIELD_PREP(YT921X_TAG_RX_CMD_M, (x)) -#define YT921X_TAG_RX_CMD_FORWARDED 0x80 -#define YT921X_TAG_RX_CMD_UNK_UCAST 0xb2 -#define YT921X_TAG_RX_CMD_UNK_MCAST 0xb4 -#define YT921X_TAG_TX_PORTS GENMASK(10, 0) +#define YT921X_TAG_PRIO_M GENMASK(10, 8) +#define YT921X_TAG_PRIO(x) FIELD_PREP(YT921X_TAG_PRIO_M, (x)) +#define YT921X_TAG_CODE_EN BIT(7) +#define YT921X_TAG_CODE_M GENMASK(6, 1) +#define YT921X_TAG_CODE(x) FIELD_PREP(YT921X_TAG_CODE_M, (x)) +#define YT921X_TAG_TX_PORTS_M GENMASK(10, 0) +#define YT921X_TAG_TX_PORTS(x) FIELD_PREP(YT921X_TAG_TX_PORTS_M, (x)) + +/* Incomplete. Some are configurable via RMA_CTRL_CPU_CODE, the meaning of + * others remains unknown. 
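The tag_mxl862xx driver above inserts an 8-byte tag made of four big-endian 16-bit words between the source MAC and the original EtherType: the switch EtherType, a zeroed word, the target sub-interface ID (relative to the CPU port, bits 4:0 of word 2) and the ingress/egress port (bits 3:0 of word 3). A small standalone sketch of that packing follows; the tag EtherType is a placeholder, since the numeric value behind ETH_P_MXLGSW is not shown here, and the mask macros are plain shift/mask equivalents of the GENMASK()/FIELD_PREP() pairs used by the driver.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define SUBIF_ID_MASK   0x1f    /* word 2, GENMASK(4, 0) */
#define IGP_EGP_MASK    0x0f    /* word 3, GENMASK(3, 0) */
#define TAG_ETHERTYPE   0xffff  /* placeholder for ETH_P_MXLGSW */

/* Build the four big-endian words that sit between the MAC addresses and the
 * frame's original EtherType.
 */
static void mxl862_build_tag(uint16_t tag[4], unsigned int user_port,
                             unsigned int cpu_port)
{
        /* target port sub-interface ID, relative to the CPU port */
        unsigned int sub_if = user_port + 16 - cpu_port;

        tag[0] = htons(TAG_ETHERTYPE);
        tag[1] = 0;
        tag[2] = htons(sub_if & SUBIF_ID_MASK);
        tag[3] = htons(cpu_port & IGP_EGP_MASK);
}

int main(void)
{
        uint16_t tag[4];

        mxl862_build_tag(tag, 2, 10);   /* user port 2 behind CPU port 10 */
        printf("%04x %04x %04x %04x\n",
               ntohs(tag[0]), ntohs(tag[1]), ntohs(tag[2]), ntohs(tag[3]));
        return 0;
}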
+ */ +enum yt921x_tag_code { + YT921X_TAG_CODE_FORWARD = 0, + YT921X_TAG_CODE_UNK_UCAST = 0x19, + YT921X_TAG_CODE_UNK_MCAST = 0x1a, + YT921X_TAG_CODE_PORT_COPY = 0x1b, + YT921X_TAG_CODE_FDB_COPY = 0x1c, +}; static struct sk_buff * yt921x_tag_xmit(struct sk_buff *skb, struct net_device *netdev) { __be16 *tag; - u16 tx; + u16 ctrl; skb_push(skb, YT921X_TAG_LEN); dsa_alloc_etype_header(skb, YT921X_TAG_LEN); @@ -54,10 +69,12 @@ yt921x_tag_xmit(struct sk_buff *skb, struct net_device *netdev) tag[0] = htons(ETH_P_YT921X); /* VLAN tag unrelated when TX */ tag[1] = 0; - tag[2] = 0; - tx = FIELD_PREP(YT921X_TAG_TX_PORTS, dsa_xmit_port_mask(skb, netdev)) | - YT921X_TAG_PORT_EN; - tag[3] = htons(tx); + ctrl = YT921X_TAG_CODE(YT921X_TAG_CODE_FORWARD) | YT921X_TAG_CODE_EN | + YT921X_TAG_PRIO(skb->priority); + tag[2] = htons(ctrl); + ctrl = YT921X_TAG_TX_PORTS(dsa_xmit_port_mask(skb, netdev)) | + YT921X_TAG_PORT_EN; + tag[3] = htons(ctrl); return skb; } @@ -67,7 +84,6 @@ yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev) { unsigned int port; __be16 *tag; - u16 cmd; u16 rx; if (unlikely(!pskb_may_pull(skb, YT921X_TAG_LEN))) @@ -98,23 +114,34 @@ yt921x_tag_rcv(struct sk_buff *skb, struct net_device *netdev) return NULL; } - cmd = FIELD_GET(YT921X_TAG_RX_CMD_M, rx); - switch (cmd) { - case YT921X_TAG_RX_CMD_FORWARDED: - /* Already forwarded by hardware */ - dsa_default_offload_fwd_mark(skb); - break; - case YT921X_TAG_RX_CMD_UNK_UCAST: - case YT921X_TAG_RX_CMD_UNK_MCAST: - /* NOTE: hardware doesn't distinguish between TRAP (copy to CPU - * only) and COPY (forward and copy to CPU). In order to perform - * a soft switch, NEVER use COPY action in the switch driver. - */ - break; - default: + skb->priority = FIELD_GET(YT921X_TAG_PRIO_M, rx); + + if (!(rx & YT921X_TAG_CODE_EN)) { dev_warn_ratelimited(&netdev->dev, - "Unexpected rx cmd 0x%02x\n", cmd); - break; + "Tag code not enabled in rx packet\n"); + } else { + u16 code = FIELD_GET(YT921X_TAG_CODE_M, rx); + + switch (code) { + case YT921X_TAG_CODE_FORWARD: + case YT921X_TAG_CODE_PORT_COPY: + case YT921X_TAG_CODE_FDB_COPY: + /* Already forwarded by hardware */ + dsa_default_offload_fwd_mark(skb); + break; + case YT921X_TAG_CODE_UNK_UCAST: + case YT921X_TAG_CODE_UNK_MCAST: + /* NOTE: hardware doesn't distinguish between TRAP (copy + * to CPU only) and COPY (forward and copy to CPU). In + * order to perform a soft switch, NEVER use COPY action + * in the switch driver. 
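On the receive side, the reworked yt921x tagger reads a single 16-bit control word whose layout is documented in the header comment above: port-valid in bit 15, source port in bits 14:11, priority in bits 10:8, code-valid in bit 7 and the CPU code in bits 6:1. A standalone sketch of that classification, with the codes taken from the enum above; the macros are shift/mask stand-ins for FIELD_GET(), and unknown codes fall back to "not forwarded in hardware", matching the driver's default branch.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define TAG_PORT_EN     (1u << 15)
#define TAG_RX_PORT(x)  (((x) >> 11) & 0xf)     /* GENMASK(14, 11) */
#define TAG_PRIO(x)     (((x) >> 8) & 0x7)      /* GENMASK(10, 8)  */
#define TAG_CODE_EN     (1u << 7)
#define TAG_CODE(x)     (((x) >> 1) & 0x3f)     /* GENMASK(6, 1)   */

enum {  /* subset of the CPU codes from the enum above */
        CODE_FORWARD    = 0x00,
        CODE_UNK_UCAST  = 0x19,
        CODE_UNK_MCAST  = 0x1a,
        CODE_PORT_COPY  = 0x1b,
        CODE_FDB_COPY   = 0x1c,
};

/* Returns true when the switch already forwarded the frame in hardware, so
 * software forwarding must be skipped; false for trapped/unknown frames.
 */
static bool rx_word_hw_forwarded(uint16_t rx, unsigned int *port,
                                 unsigned int *prio)
{
        *port = (rx & TAG_PORT_EN) ? TAG_RX_PORT(rx) : 0;
        *prio = TAG_PRIO(rx);

        if (!(rx & TAG_CODE_EN))
                return false;

        switch (TAG_CODE(rx)) {
        case CODE_FORWARD:
        case CODE_PORT_COPY:
        case CODE_FDB_COPY:
                return true;
        case CODE_UNK_UCAST:
        case CODE_UNK_MCAST:
        default:
                return false;
        }
}

int main(void)
{
        unsigned int port, prio;
        uint16_t rx = TAG_PORT_EN | (3u << 11) | (5u << 8) | TAG_CODE_EN;

        printf("forwarded=%d port=%u prio=%u\n",
               rx_word_hw_forwarded(rx, &port, &prio), port, prio);
        return 0;
}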
+ */ + break; + default: + dev_warn_ratelimited(&netdev->dev, + "Unknown code 0x%02x\n", code); + break; + } } /* Remove YT921x tag and update checksum */ diff --git a/net/dsa/user.c b/net/dsa/user.c index f59d66f0975d..5697291d43cf 100644 --- a/net/dsa/user.c +++ b/net/dsa/user.c @@ -1459,8 +1459,8 @@ dsa_user_add_cls_matchall_police(struct net_device *dev, struct netlink_ext_ack *extack = cls->common.extack; struct dsa_port *dp = dsa_user_to_port(dev); struct dsa_user_priv *p = netdev_priv(dev); - struct dsa_mall_policer_tc_entry *policer; struct dsa_mall_tc_entry *mall_tc_entry; + struct flow_action_police *policer; struct dsa_switch *ds = dp->ds; struct flow_action_entry *act; int err; @@ -1497,8 +1497,7 @@ dsa_user_add_cls_matchall_police(struct net_device *dev, mall_tc_entry->cookie = cls->cookie; mall_tc_entry->type = DSA_PORT_MALL_POLICER; policer = &mall_tc_entry->policer; - policer->rate_bytes_per_sec = act->police.rate_bytes_ps; - policer->burst = act->police.burst; + *policer = act->police; err = ds->ops->port_policer_add(ds, dp->index, policer); if (err) { diff --git a/net/ethtool/common.c b/net/ethtool/common.c index d47a279eb8b9..5fae329795c8 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -285,12 +285,35 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_DR8_2 8 #define __LINK_MODE_LANES_T1BRR 1 -#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ +#define __DEFINE_LINK_MODE_PARAMS_PAIRS(_speed, _type, _min_pairs, _pairs, _duplex, _medium) \ [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ .speed = SPEED_ ## _speed, \ .lanes = __LINK_MODE_LANES_ ## _type, \ - .duplex = __DUPLEX_ ## _duplex \ + .min_pairs = _min_pairs, \ + .pairs = _pairs, \ + .duplex = __DUPLEX_ ## _duplex, \ + .mediums = BIT(ETHTOOL_LINK_MEDIUM_BASE ## _medium) \ } + +#define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex, _medium) \ + [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ + .speed = SPEED_ ## _speed, \ + .lanes = __LINK_MODE_LANES_ ## _type, \ + .min_pairs = 0, \ + .pairs = 0, \ + .duplex = __DUPLEX_ ## _duplex, \ + .mediums = BIT(ETHTOOL_LINK_MEDIUM_BASE ## _medium) \ + } +#define __DEFINE_LINK_MODE_PARAMS_MEDIUMS(_speed, _type, _duplex, _mediums) \ + [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ + .speed = SPEED_ ## _speed, \ + .lanes = __LINK_MODE_LANES_ ## _type, \ + .min_pairs = 0, \ + .pairs = 0, \ + .duplex = __DUPLEX_ ## _duplex, \ + .mediums = (_mediums) \ + } +#define __MED(_medium) (BIT(ETHTOOL_LINK_MEDIUM_BASE ## _medium)) #define __DUPLEX_Half DUPLEX_HALF #define __DUPLEX_Full DUPLEX_FULL #define __DEFINE_SPECIAL_MODE_PARAMS(_mode) \ @@ -298,142 +321,168 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); .speed = SPEED_UNKNOWN, \ .lanes = 0, \ .duplex = DUPLEX_UNKNOWN, \ + .mediums = BIT(ETHTOOL_LINK_MEDIUM_NONE), \ } const struct link_mode_info link_mode_params[] = { - __DEFINE_LINK_MODE_PARAMS(10, T, Half), - __DEFINE_LINK_MODE_PARAMS(10, T, Full), - __DEFINE_LINK_MODE_PARAMS(100, T, Half), - __DEFINE_LINK_MODE_PARAMS(100, T, Full), - __DEFINE_LINK_MODE_PARAMS(1000, T, Half), - __DEFINE_LINK_MODE_PARAMS(1000, T, Full), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T, 2, 4, Half, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T, 2, 4, Full, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(100, T, 2, 4, Half, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(100, T, 2, 4, Full, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(1000, T, 4, 4, Half, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(1000, T, 4, 4, Full, T), 
__DEFINE_SPECIAL_MODE_PARAMS(Autoneg), __DEFINE_SPECIAL_MODE_PARAMS(TP), __DEFINE_SPECIAL_MODE_PARAMS(AUI), __DEFINE_SPECIAL_MODE_PARAMS(MII), __DEFINE_SPECIAL_MODE_PARAMS(FIBRE), __DEFINE_SPECIAL_MODE_PARAMS(BNC), - __DEFINE_LINK_MODE_PARAMS(10000, T, Full), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10000, T, 4, 4, Full, T), __DEFINE_SPECIAL_MODE_PARAMS(Pause), __DEFINE_SPECIAL_MODE_PARAMS(Asym_Pause), - __DEFINE_LINK_MODE_PARAMS(2500, X, Full), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(2500, X, Full, + __MED(C) | __MED(S) | __MED(L)), __DEFINE_SPECIAL_MODE_PARAMS(Backplane), - __DEFINE_LINK_MODE_PARAMS(1000, KX, Full), - __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full), - __DEFINE_LINK_MODE_PARAMS(10000, KR, Full), + __DEFINE_LINK_MODE_PARAMS(1000, KX, Full, K), + __DEFINE_LINK_MODE_PARAMS(10000, KX4, Full, K), + __DEFINE_LINK_MODE_PARAMS(10000, KR, Full, K), [ETHTOOL_LINK_MODE_10000baseR_FEC_BIT] = { .speed = SPEED_10000, .lanes = 1, .duplex = DUPLEX_FULL, }, - __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full), - __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full), - __DEFINE_LINK_MODE_PARAMS(25000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(25000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(25000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(100000, LR4_ER4, Full), - __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(1000, X, Full), - __DEFINE_LINK_MODE_PARAMS(10000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(10000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(10000, LR, Full), - __DEFINE_LINK_MODE_PARAMS(10000, LRM, Full), - __DEFINE_LINK_MODE_PARAMS(10000, ER, Full), - __DEFINE_LINK_MODE_PARAMS(2500, T, Full), - __DEFINE_LINK_MODE_PARAMS(5000, T, Full), + __DEFINE_LINK_MODE_PARAMS(20000, MLD2, Full, MLD), + __DEFINE_LINK_MODE_PARAMS(20000, KR2, Full, K), + __DEFINE_LINK_MODE_PARAMS(40000, KR4, Full, K), + __DEFINE_LINK_MODE_PARAMS(40000, CR4, Full, C), + __DEFINE_LINK_MODE_PARAMS(40000, SR4, Full, S), + __DEFINE_LINK_MODE_PARAMS(40000, LR4, Full, L), + __DEFINE_LINK_MODE_PARAMS(56000, KR4, Full, K), + __DEFINE_LINK_MODE_PARAMS(56000, CR4, Full, C), + __DEFINE_LINK_MODE_PARAMS(56000, SR4, Full, S), + __DEFINE_LINK_MODE_PARAMS(56000, LR4, Full, L), + __DEFINE_LINK_MODE_PARAMS(25000, CR, Full, C), + __DEFINE_LINK_MODE_PARAMS(25000, KR, Full, K), + __DEFINE_LINK_MODE_PARAMS(25000, SR, Full, S), + __DEFINE_LINK_MODE_PARAMS(50000, CR2, Full, C), + __DEFINE_LINK_MODE_PARAMS(50000, KR2, Full, K), + __DEFINE_LINK_MODE_PARAMS(100000, KR4, Full, K), + __DEFINE_LINK_MODE_PARAMS(100000, SR4, Full, S), + __DEFINE_LINK_MODE_PARAMS(100000, CR4, Full, C), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(100000, LR4_ER4, Full, + __MED(L) | __MED(E)), + __DEFINE_LINK_MODE_PARAMS(50000, SR2, Full, S), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(1000, X, Full, + __MED(C) | __MED(S) | __MED(L)), + __DEFINE_LINK_MODE_PARAMS(10000, CR, Full, C), + __DEFINE_LINK_MODE_PARAMS(10000, SR, Full, S), + __DEFINE_LINK_MODE_PARAMS(10000, LR, Full, L), + 
__DEFINE_LINK_MODE_PARAMS(10000, LRM, Full, L), + __DEFINE_LINK_MODE_PARAMS(10000, ER, Full, E), + __DEFINE_LINK_MODE_PARAMS_PAIRS(2500, T, 4, 4, Full, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(5000, T, 4, 4, Full, T), __DEFINE_SPECIAL_MODE_PARAMS(FEC_NONE), __DEFINE_SPECIAL_MODE_PARAMS(FEC_RS), __DEFINE_SPECIAL_MODE_PARAMS(FEC_BASER), - __DEFINE_LINK_MODE_PARAMS(50000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, LR_ER_FR, Full), - __DEFINE_LINK_MODE_PARAMS(50000, DR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, LR2_ER2_FR2, Full), - __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, LR4_ER4_FR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full), - __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(100, T1, Full), - __DEFINE_LINK_MODE_PARAMS(1000, T1, Full), - __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, LR8_ER8_FR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full), - __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full), + __DEFINE_LINK_MODE_PARAMS(50000, KR, Full, K), + __DEFINE_LINK_MODE_PARAMS(50000, SR, Full, S), + __DEFINE_LINK_MODE_PARAMS(50000, CR, Full, C), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(50000, LR_ER_FR, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(50000, DR, Full, D), + __DEFINE_LINK_MODE_PARAMS(100000, KR2, Full, K), + __DEFINE_LINK_MODE_PARAMS(100000, SR2, Full, S), + __DEFINE_LINK_MODE_PARAMS(100000, CR2, Full, C), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(100000, LR2_ER2_FR2, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(100000, DR2, Full, D), + __DEFINE_LINK_MODE_PARAMS(200000, KR4, Full, K), + __DEFINE_LINK_MODE_PARAMS(200000, SR4, Full, S), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(200000, LR4_ER4_FR4, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(200000, DR4, Full, D), + __DEFINE_LINK_MODE_PARAMS(200000, CR4, Full, C), + __DEFINE_LINK_MODE_PARAMS_PAIRS(100, T1, 1, 1, Full, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(1000, T1, 1, 1, Full, T), + __DEFINE_LINK_MODE_PARAMS(400000, KR8, Full, K), + __DEFINE_LINK_MODE_PARAMS(400000, SR8, Full, S), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(400000, LR8_ER8_FR8, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(400000, DR8, Full, D), + __DEFINE_LINK_MODE_PARAMS(400000, CR8, Full, C), __DEFINE_SPECIAL_MODE_PARAMS(FEC_LLRS), - __DEFINE_LINK_MODE_PARAMS(100000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, LR_ER_FR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, DR, Full), - __DEFINE_LINK_MODE_PARAMS(100000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, LR2_ER2_FR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, LR4_ER4_FR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full), - __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(100, FX, Half), - 
__DEFINE_LINK_MODE_PARAMS(100, FX, Full), - __DEFINE_LINK_MODE_PARAMS(10, T1L, Full), - __DEFINE_LINK_MODE_PARAMS(800000, CR8, Full), - __DEFINE_LINK_MODE_PARAMS(800000, KR8, Full), - __DEFINE_LINK_MODE_PARAMS(800000, DR8, Full), - __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full), - __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full), - __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full), - __DEFINE_LINK_MODE_PARAMS(10, T1S, Full), - __DEFINE_LINK_MODE_PARAMS(10, T1S, Half), - __DEFINE_LINK_MODE_PARAMS(10, T1S_P2MP, Half), - __DEFINE_LINK_MODE_PARAMS(10, T1BRR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, CR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, KR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, DR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, DR_2, Full), - __DEFINE_LINK_MODE_PARAMS(200000, SR, Full), - __DEFINE_LINK_MODE_PARAMS(200000, VR, Full), - __DEFINE_LINK_MODE_PARAMS(400000, CR2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, KR2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, DR2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, DR2_2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, SR2, Full), - __DEFINE_LINK_MODE_PARAMS(400000, VR2, Full), - __DEFINE_LINK_MODE_PARAMS(800000, CR4, Full), - __DEFINE_LINK_MODE_PARAMS(800000, KR4, Full), - __DEFINE_LINK_MODE_PARAMS(800000, DR4, Full), - __DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full), - __DEFINE_LINK_MODE_PARAMS(800000, SR4, Full), - __DEFINE_LINK_MODE_PARAMS(800000, VR4, Full), - __DEFINE_LINK_MODE_PARAMS(1600000, CR8, Full), - __DEFINE_LINK_MODE_PARAMS(1600000, KR8, Full), - __DEFINE_LINK_MODE_PARAMS(1600000, DR8, Full), - __DEFINE_LINK_MODE_PARAMS(1600000, DR8_2, Full), + __DEFINE_LINK_MODE_PARAMS(100000, KR, Full, K), + __DEFINE_LINK_MODE_PARAMS(100000, SR, Full, S), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(100000, LR_ER_FR, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(100000, DR, Full, D), + __DEFINE_LINK_MODE_PARAMS(100000, CR, Full, C), + __DEFINE_LINK_MODE_PARAMS(200000, KR2, Full, K), + __DEFINE_LINK_MODE_PARAMS(200000, SR2, Full, S), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(200000, LR2_ER2_FR2, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(200000, DR2, Full, D), + __DEFINE_LINK_MODE_PARAMS(200000, CR2, Full, C), + __DEFINE_LINK_MODE_PARAMS(400000, KR4, Full, K), + __DEFINE_LINK_MODE_PARAMS(400000, SR4, Full, S), + __DEFINE_LINK_MODE_PARAMS_MEDIUMS(400000, LR4_ER4_FR4, Full, + __MED(L) | __MED(E) | __MED(F)), + __DEFINE_LINK_MODE_PARAMS(400000, DR4, Full, D), + __DEFINE_LINK_MODE_PARAMS(400000, CR4, Full, C), + __DEFINE_LINK_MODE_PARAMS(100, FX, Half, F), + __DEFINE_LINK_MODE_PARAMS(100, FX, Full, F), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1L, 1, 1, Full, T), + __DEFINE_LINK_MODE_PARAMS(800000, CR8, Full, C), + __DEFINE_LINK_MODE_PARAMS(800000, KR8, Full, K), + __DEFINE_LINK_MODE_PARAMS(800000, DR8, Full, D), + __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full, D), + __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full, S), + __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full, V), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1S, 1, 1, Full, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1S, 1, 1, Half, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1S_P2MP, 1, 1, Half, T), + __DEFINE_LINK_MODE_PARAMS_PAIRS(10, T1BRR, 1, 1, Full, T), + __DEFINE_LINK_MODE_PARAMS(200000, CR, Full, C), + __DEFINE_LINK_MODE_PARAMS(200000, KR, Full, K), + __DEFINE_LINK_MODE_PARAMS(200000, DR, Full, D), + __DEFINE_LINK_MODE_PARAMS(200000, DR_2, Full, D), + __DEFINE_LINK_MODE_PARAMS(200000, SR, Full, S), + __DEFINE_LINK_MODE_PARAMS(200000, VR, Full, V), + 
__DEFINE_LINK_MODE_PARAMS(400000, CR2, Full, C), + __DEFINE_LINK_MODE_PARAMS(400000, KR2, Full, K), + __DEFINE_LINK_MODE_PARAMS(400000, DR2, Full, D), + __DEFINE_LINK_MODE_PARAMS(400000, DR2_2, Full, D), + __DEFINE_LINK_MODE_PARAMS(400000, SR2, Full, S), + __DEFINE_LINK_MODE_PARAMS(400000, VR2, Full, V), + __DEFINE_LINK_MODE_PARAMS(800000, CR4, Full, C), + __DEFINE_LINK_MODE_PARAMS(800000, KR4, Full, K), + __DEFINE_LINK_MODE_PARAMS(800000, DR4, Full, D), + __DEFINE_LINK_MODE_PARAMS(800000, DR4_2, Full, D), + __DEFINE_LINK_MODE_PARAMS(800000, SR4, Full, S), + __DEFINE_LINK_MODE_PARAMS(800000, VR4, Full, V), + __DEFINE_LINK_MODE_PARAMS(1600000, CR8, Full, C), + __DEFINE_LINK_MODE_PARAMS(1600000, KR8, Full, K), + __DEFINE_LINK_MODE_PARAMS(1600000, DR8, Full, D), + __DEFINE_LINK_MODE_PARAMS(1600000, DR8_2, Full, D), }; static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); EXPORT_SYMBOL_GPL(link_mode_params); +static const char ethtool_link_medium_names[][ETH_GSTRING_LEN] = { + [ETHTOOL_LINK_MEDIUM_BASET] = "BaseT", + [ETHTOOL_LINK_MEDIUM_BASEK] = "BaseK", + [ETHTOOL_LINK_MEDIUM_BASES] = "BaseS", + [ETHTOOL_LINK_MEDIUM_BASEC] = "BaseC", + [ETHTOOL_LINK_MEDIUM_BASEL] = "BaseL", + [ETHTOOL_LINK_MEDIUM_BASED] = "BaseD", + [ETHTOOL_LINK_MEDIUM_BASEE] = "BaseE", + [ETHTOOL_LINK_MEDIUM_BASEF] = "BaseF", + [ETHTOOL_LINK_MEDIUM_BASEV] = "BaseV", + [ETHTOOL_LINK_MEDIUM_BASEMLD] = "BaseMLD", + [ETHTOOL_LINK_MEDIUM_NONE] = "None", +}; +static_assert(ARRAY_SIZE(ethtool_link_medium_names) == __ETHTOOL_LINK_MEDIUM_LAST); + const char netif_msg_class_names[][ETH_GSTRING_LEN] = { [NETIF_MSG_DRV_BIT] = "drv", [NETIF_MSG_PROBE_BIT] = "probe", @@ -588,21 +637,11 @@ int __ethtool_get_link(struct net_device *dev) int ethtool_get_rx_ring_count(struct net_device *dev) { const struct ethtool_ops *ops = dev->ethtool_ops; - struct ethtool_rxnfc rx_rings = {}; - int ret; - - if (ops->get_rx_ring_count) - return ops->get_rx_ring_count(dev); - if (!ops->get_rxnfc) + if (!ops->get_rx_ring_count) return -EOPNOTSUPP; - rx_rings.cmd = ETHTOOL_GRXRINGS; - ret = ops->get_rxnfc(dev, &rx_rings, NULL); - if (ret < 0) - return ret; - - return rx_rings.data; + return ops->get_rx_ring_count(dev); } static int ethtool_get_rxnfc_rule_count(struct net_device *dev) @@ -1164,3 +1203,15 @@ void ethtool_rxfh_context_lost(struct net_device *dev, u32 context_id) ethtool_rss_notify(dev, ETHTOOL_MSG_RSS_DELETE_NTF, context_id); } EXPORT_SYMBOL(ethtool_rxfh_context_lost); + +enum ethtool_link_medium ethtool_str_to_medium(const char *str) +{ + int i; + + for (i = 0; i < __ETHTOOL_LINK_MEDIUM_LAST; i++) + if (!strcmp(ethtool_link_medium_names[i], str)) + return i; + + return ETHTOOL_LINK_MEDIUM_NONE; +} +EXPORT_SYMBOL_GPL(ethtool_str_to_medium); diff --git a/net/hsr/hsr_framereg.c b/net/hsr/hsr_framereg.c index 3a2a2fa7a0a3..7d87f304ded4 100644 --- a/net/hsr/hsr_framereg.c +++ b/net/hsr/hsr_framereg.c @@ -11,6 +11,7 @@ * Same code handles filtering of duplicates for PRP as well. */ +#include <kunit/visibility.h> #include <linux/if_ether.h> #include <linux/etherdevice.h> #include <linux/slab.h> @@ -19,24 +20,6 @@ #include "hsr_framereg.h" #include "hsr_netlink.h" -/* seq_nr_after(a, b) - return true if a is after (higher in sequence than) b, - * false otherwise. 
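The seq_nr_after() helper being removed here compared 16-bit HSR/PRP sequence numbers with wraparound, in the spirit of serial-number arithmetic: a value counts as "after" another when the signed 16-bit difference is negative, with an explicit check that breaks the tie when the two values are exactly half the number space apart, so that a and b cannot each be "after" the other. A standalone illustration of the removed logic follows; it mirrors the helper's behaviour but is not the kernel code.

#include <stdint.h>
#include <stdio.h>

/* True when @a follows @b in 16-bit sequence space; the exact half-space
 * case is broken asymmetrically, matching the removed helper.
 */
static int seq_nr_after(uint16_t a, uint16_t b)
{
        if ((int)b - (int)a == 32768)
                return 0;
        return (int16_t)(uint16_t)(b - a) < 0;
}

int main(void)
{
        printf("%d\n", seq_nr_after(0, 65535));         /* 1: 0 follows 65535 */
        printf("%d\n", seq_nr_after(65535, 0));         /* 0 */
        printf("%d\n", seq_nr_after(200, 100));         /* 1 */
        printf("%d\n", seq_nr_after(100, 200));         /* 0 */
        return 0;
}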
- */ -static bool seq_nr_after(u16 a, u16 b) -{ - /* Remove inconsistency where - * seq_nr_after(a, b) == seq_nr_before(a, b) - */ - if ((int)b - a == 32768) - return false; - - return (((s16)(b - a)) < 0); -} - -#define seq_nr_before(a, b) seq_nr_after((b), (a)) -#define seq_nr_before_or_eq(a, b) (!seq_nr_after((a), (b))) -#define PRP_DROP_WINDOW_LEN 32768 - bool hsr_addr_is_redbox(struct hsr_priv *hsr, unsigned char *addr) { if (!hsr->redbox || !is_valid_ether_addr(hsr->macaddress_redbox)) @@ -126,13 +109,29 @@ void hsr_del_self_node(struct hsr_priv *hsr) kfree_rcu(old, rcu_head); } +static void hsr_free_node(struct hsr_node *node) +{ + xa_destroy(&node->seq_blocks); + kfree(node->block_buf); + kfree(node); +} + +static void hsr_free_node_rcu(struct rcu_head *rn) +{ + struct hsr_node *node = container_of(rn, struct hsr_node, rcu_head); + + hsr_free_node(node); +} + void hsr_del_nodes(struct list_head *node_db) { struct hsr_node *node; struct hsr_node *tmp; - list_for_each_entry_safe(node, tmp, node_db, mac_list) - kfree(node); + list_for_each_entry_safe(node, tmp, node_db, mac_list) { + list_del(&node->mac_list); + hsr_free_node(node); + } } void prp_handle_san_frame(bool san, enum hsr_port_type port, @@ -148,18 +147,16 @@ void prp_handle_san_frame(bool san, enum hsr_port_type port, node->san_b = true; } -/* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A; - * seq_out is used to initialize filtering of outgoing duplicate frames - * originating from the newly added node. +/* Allocate an hsr_node and add it to node_db. 'addr' is the node's address_A. */ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, struct list_head *node_db, - unsigned char addr[], - u16 seq_out, bool san, + unsigned char addr[], bool san, enum hsr_port_type rx_port) { - struct hsr_node *new_node, *node; + struct hsr_node *new_node, *node = NULL; unsigned long now; + size_t block_sz; int i; new_node = kzalloc(sizeof(*new_node), GFP_ATOMIC); @@ -169,18 +166,24 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, ether_addr_copy(new_node->macaddress_A, addr); spin_lock_init(&new_node->seq_out_lock); + if (hsr->prot_version == PRP_V1) + new_node->seq_port_cnt = 1; + else + new_node->seq_port_cnt = HSR_PT_PORTS - 1; + + block_sz = hsr_seq_block_size(new_node); + new_node->block_buf = kcalloc(HSR_MAX_SEQ_BLOCKS, block_sz, GFP_ATOMIC); + if (!new_node->block_buf) + goto free; + + xa_init(&new_node->seq_blocks); + /* We are only interested in time diffs here, so use current jiffies * as initialization. (0 could trigger an spurious ring error warning). 
*/ now = jiffies; for (i = 0; i < HSR_PT_PORTS; i++) { new_node->time_in[i] = now; - new_node->time_out[i] = now; - } - for (i = 0; i < HSR_PT_PORTS; i++) { - new_node->seq_out[i] = seq_out; - new_node->seq_expected[i] = seq_out + 1; - new_node->seq_start[i] = seq_out + 1; } if (san && hsr->proto_ops->handle_san_frame) @@ -199,6 +202,8 @@ static struct hsr_node *hsr_add_node(struct hsr_priv *hsr, return new_node; out: spin_unlock_bh(&hsr->list_lock); + kfree(new_node->block_buf); +free: kfree(new_node); return node; } @@ -223,7 +228,6 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, struct ethhdr *ethhdr; struct prp_rct *rct; bool san = false; - u16 seq_out; if (!skb_mac_header_was_set(skb)) return NULL; @@ -260,25 +264,72 @@ struct hsr_node *hsr_get_node(struct hsr_port *port, struct list_head *node_db, /* Check if skb contains hsr_ethhdr */ if (skb->mac_len < sizeof(struct hsr_ethhdr)) return NULL; - - /* Use the existing sequence_nr from the tag as starting point - * for filtering duplicate frames. - */ - seq_out = hsr_get_skb_sequence_nr(skb) - 1; } else { rct = skb_get_PRP_rct(skb); - if (rct && prp_check_lsdu_size(skb, rct, is_sup)) { - seq_out = prp_get_skb_sequence_nr(rct); - } else { - if (rx_port != HSR_PT_MASTER) - san = true; - seq_out = HSR_SEQNR_START; + if (!rct && rx_port != HSR_PT_MASTER) + san = true; + } + + return hsr_add_node(hsr, node_db, ethhdr->h_source, san, rx_port); +} + +static bool hsr_seq_block_is_old(struct hsr_seq_block *block) +{ + unsigned long expiry = msecs_to_jiffies(HSR_ENTRY_FORGET_TIME); + + return time_is_before_jiffies(block->time + expiry); +} + +static void hsr_forget_seq_block(struct hsr_node *node, + struct hsr_seq_block *block) +{ + if (block->time) + xa_erase(&node->seq_blocks, block->block_idx); + block->time = 0; +} + +/* Get the currently active sequence number block. If there is no block yet, or + * the existing one is expired, a new block is created. The idea is to maintain + * a "sparse bitmap" where a bitmap for the whole sequence number space is + * split into blocks and not all blocks exist all the time. The blocks can + * expire after time (in low traffic situations) or when they are replaced in + * the backing fixed size buffer (in high traffic situations). 
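The comment above describes the new bookkeeping: the 16-bit sequence space is cut into 128-entry blocks, and only a small fixed pool of blocks is kept per node, indexed by block number and recycled round-robin when the pool wraps or a block goes stale. A standalone sketch of that lookup/recycle step follows; a plain array stands in for the kernel xarray, and the time-based expiry, locking and per-port bitmaps are left out or simplified.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SEQ_BLOCK_SHIFT 7                       /* 128 sequence numbers per block */
#define SEQ_BLOCK_SIZE  (1u << SEQ_BLOCK_SHIFT)
#define MAX_SEQ_BLOCKS  64                      /* fixed backing pool per node */
#define NUM_BLOCK_IDX   (65536u >> SEQ_BLOCK_SHIFT)

struct seq_block {
        bool in_use;
        uint16_t block_idx;
        uint64_t seen[SEQ_BLOCK_SIZE / 64];     /* one bit per sequence number */
};

static struct seq_block pool[MAX_SEQ_BLOCKS];
static struct seq_block *index_map[NUM_BLOCK_IDX];      /* stands in for the xarray */
static unsigned int next_slot;

static struct seq_block *get_seq_block(uint16_t block_idx)
{
        struct seq_block *blk = index_map[block_idx];

        if (blk)
                return blk;

        /* miss: recycle the next pool slot, evicting whatever block was there */
        blk = &pool[next_slot];
        next_slot = (next_slot + 1) & (MAX_SEQ_BLOCKS - 1);
        if (blk->in_use)
                index_map[blk->block_idx] = NULL;

        memset(blk, 0, sizeof(*blk));
        blk->in_use = true;
        blk->block_idx = block_idx;
        index_map[block_idx] = blk;
        return blk;
}

int main(void)
{
        /* touching 65 distinct blocks forces the very first one to be evicted */
        for (unsigned int i = 0; i <= MAX_SEQ_BLOCKS; i++)
                get_seq_block((uint16_t)i);

        printf("block 0 still mapped: %s\n", index_map[0] ? "yes" : "no");
        return 0;
}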
+ */ +VISIBLE_IF_KUNIT struct hsr_seq_block *hsr_get_seq_block(struct hsr_node *node, + u16 block_idx) +{ + struct hsr_seq_block *block, *res; + size_t block_sz; + + block = xa_load(&node->seq_blocks, block_idx); + + if (block && hsr_seq_block_is_old(block)) { + hsr_forget_seq_block(node, block); + block = NULL; + } + + if (!block) { + block_sz = hsr_seq_block_size(node); + block = node->block_buf + node->next_block * block_sz; + hsr_forget_seq_block(node, block); + + memset(block, 0, block_sz); + block->time = jiffies; + block->block_idx = block_idx; + + res = xa_store(&node->seq_blocks, block_idx, block, GFP_ATOMIC); + if (xa_is_err(res)) { + block->time = 0; + return NULL; } + + node->next_block = + (node->next_block + 1) & (HSR_MAX_SEQ_BLOCKS - 1); } - return hsr_add_node(hsr, node_db, ethhdr->h_source, seq_out, - san, rx_port); + return block; } +EXPORT_SYMBOL_IF_KUNIT(hsr_get_seq_block); /* Use the Supervision frame's info about an eventual macaddress_B for merging * nodes that has previously had their macaddress_B registered as a separate @@ -288,16 +339,18 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) { struct hsr_node *node_curr = frame->node_src; struct hsr_port *port_rcv = frame->port_rcv; + struct hsr_seq_block *src_blk, *merge_blk; struct hsr_priv *hsr = port_rcv->hsr; - struct hsr_sup_payload *hsr_sp; struct hsr_sup_tlv *hsr_sup_tlv; + struct hsr_sup_payload *hsr_sp; struct hsr_node *node_real; struct sk_buff *skb = NULL; struct list_head *node_db; struct ethhdr *ethhdr; - int i; - unsigned int pull_size = 0; unsigned int total_pull_size = 0; + unsigned int pull_size = 0; + unsigned long idx; + int i; /* Here either frame->skb_hsr or frame->skb_prp should be * valid as supervision frame always will have protocol @@ -340,8 +393,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) if (!node_real) /* No frame received from AddrA of this node yet */ node_real = hsr_add_node(hsr, node_db, hsr_sp->macaddress_A, - HSR_SEQNR_START - 1, true, - port_rcv->type); + true, port_rcv->type); if (!node_real) goto done; /* No mem */ if (node_real == node_curr) @@ -388,8 +440,20 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) node_real->time_in_stale[i] = node_curr->time_in_stale[i]; } - if (seq_nr_after(node_curr->seq_out[i], node_real->seq_out[i])) - node_real->seq_out[i] = node_curr->seq_out[i]; + } + + xa_for_each(&node_curr->seq_blocks, idx, src_blk) { + if (hsr_seq_block_is_old(src_blk)) + continue; + + merge_blk = hsr_get_seq_block(node_real, src_blk->block_idx); + if (!merge_blk) + continue; + merge_blk->time = min(merge_blk->time, src_blk->time); + for (i = 0; i < node_real->seq_port_cnt; i++) { + bitmap_or(merge_blk->seq_nrs[i], merge_blk->seq_nrs[i], + src_blk->seq_nrs[i], HSR_SEQ_BLOCK_SIZE); + } } spin_unlock_bh(&node_real->seq_out_lock); node_real->addr_B_port = port_rcv->type; @@ -398,7 +462,7 @@ void hsr_handle_sup_frame(struct hsr_frame_info *frame) if (!node_curr->removed) { list_del_rcu(&node_curr->mac_list); node_curr->removed = true; - kfree_rcu(node_curr, rcu_head); + call_rcu(&node_curr->rcu_head, hsr_free_node_rcu); } spin_unlock_bh(&hsr->list_lock); @@ -466,56 +530,79 @@ void hsr_addr_subst_dest(struct hsr_node *node_src, struct sk_buff *skb, void hsr_register_frame_in(struct hsr_node *node, struct hsr_port *port, u16 sequence_nr) { - /* Don't register incoming frames without a valid sequence number. This - * ensures entries of restarted nodes gets pruned so that they can - * re-register and resume communications. 
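When a supervision frame reveals that two node entries belong to the same device, hsr_handle_sup_frame() above folds the duplicate-detection state of the stale entry into the real one: for every live block of the stale node it fetches (or creates) the block with the same index on the real node, keeps the older timestamp and ORs the per-port bitmaps together. A compact sketch of that merge for a single pair of blocks, with fixed two-port blocks and the xarray iteration omitted; the types and names here are invented for the illustration.

#include <stdint.h>
#include <stdio.h>

#define ROWS      2     /* e.g. slave A and slave B */
#define ROW_WORDS 2     /* 128 bits per row */

struct blk {
        unsigned long time;
        uint64_t seq_nrs[ROWS][ROW_WORDS];
};

/* Fold @src into @dst: keep the older timestamp, union the seen-bits. */
static void merge_blocks(struct blk *dst, const struct blk *src)
{
        if (src->time < dst->time)
                dst->time = src->time;

        for (int row = 0; row < ROWS; row++)
                for (int w = 0; w < ROW_WORDS; w++)
                        dst->seq_nrs[row][w] |= src->seq_nrs[row][w];
}

int main(void)
{
        struct blk real  = { .time = 2000, .seq_nrs = { { 0x1 } } };
        struct blk stale = { .time = 1000, .seq_nrs = { { 0x6 } } };

        merge_blocks(&real, &stale);
        printf("time=%lu bits=%#llx\n", real.time,
               (unsigned long long)real.seq_nrs[0][0]);  /* time=1000 bits=0x7 */
        return 0;
}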
- */ - if (!(port->dev->features & NETIF_F_HW_HSR_TAG_RM) && - seq_nr_before(sequence_nr, node->seq_out[port->type])) - return; - node->time_in[port->type] = jiffies; node->time_in_stale[port->type] = false; } -/* 'skb' is a HSR Ethernet frame (with a HSR tag inserted), with a valid - * ethhdr->h_source address and skb->mac_header set. +/* Duplicate discard algorithm: we maintain a bitmap where we set a bit for + * every seen sequence number. The bitmap is split into blocks and the block + * management is detailed in hsr_get_seq_block(). In any case, we err on the + * side of accepting a packet, as the specification requires the algorithm to + * be "designed such that it never rejects a legitimate frame, while occasional + * acceptance of a duplicate can be tolerated." (IEC 62439-3:2021, 4.1.10.3). + * While this requirement is explicit for PRP, applying it to HSR does no harm + * either. + * + * 'frame' is the frame to be sent + * 'port_type' is the type of the outgoing interface * * Return: * 1 if frame can be shown to have been sent recently on this interface, - * 0 otherwise, or - * negative error code on error + * 0 otherwise */ -int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) +static int hsr_check_duplicate(struct hsr_frame_info *frame, + unsigned int port_type) { - struct hsr_node *node = frame->node_src; - u16 sequence_nr = frame->sequence_nr; + u16 sequence_nr, seq_bit, block_idx; + struct hsr_seq_block *block; + struct hsr_node *node; + + node = frame->node_src; + sequence_nr = frame->sequence_nr; + + if (WARN_ON_ONCE(port_type >= node->seq_port_cnt)) + return 0; spin_lock_bh(&node->seq_out_lock); - if (seq_nr_before_or_eq(sequence_nr, node->seq_out[port->type]) && - time_is_after_jiffies(node->time_out[port->type] + - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME))) { - spin_unlock_bh(&node->seq_out_lock); - return 1; - } - node->time_out[port->type] = jiffies; - node->seq_out[port->type] = sequence_nr; + block_idx = hsr_seq_block_index(sequence_nr); + block = hsr_get_seq_block(node, block_idx); + if (!block) + goto out_new; + + seq_bit = hsr_seq_block_bit(sequence_nr); + if (__test_and_set_bit(seq_bit, block->seq_nrs[port_type])) + goto out_seen; + +out_new: spin_unlock_bh(&node->seq_out_lock); return 0; + +out_seen: + spin_unlock_bh(&node->seq_out_lock); + return 1; } -/* Adaptation of the PRP duplicate discard algorithm described in wireshark - * wiki (https://wiki.wireshark.org/PRP) +/* HSR duplicate discard: we check if the same frame has already been sent on + * this outgoing interface. The check follows the general duplicate discard + * algorithm. * - * A drop window is maintained for both LANs with start sequence set to the - * first sequence accepted on the LAN that has not been seen on the other LAN, - * and expected sequence set to the latest received sequence number plus one. + * 'port' is the outgoing interface + * 'frame' is the frame to be sent * - * When a frame is received on either LAN it is compared against the received - * frames on the other LAN. If it is outside the drop window of the other LAN - * the frame is accepted and the drop window is updated. - * The drop window for the other LAN is reset. 
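hsr_check_duplicate() above boils the decision down to a test-and-set on one bitmap row of the active block: HSR keys the row by the outgoing port (port->type - 1, so each slave tracks what it has already transmitted), while PRP uses a single row and only filters slave-to-master traffic. A standalone sketch of that per-row check plus the two thin wrappers; two ports and a single 128-entry block are assumed, and the error path falls back to "not a duplicate", matching the accept-on-doubt policy described above.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define SEQ_BITS  128
#define PORT_ROWS 2

/* one block's bitmaps; rows are per outgoing port */
static uint64_t seen[PORT_ROWS][SEQ_BITS / 64];

/* Returns 1 when (@row, @seq) was already recorded, 0 otherwise. */
static int check_duplicate(unsigned int row, uint16_t seq)
{
        unsigned int bit = seq % SEQ_BITS;
        uint64_t mask = 1ull << (bit & 63);

        if (row >= PORT_ROWS)
                return 0;               /* err on the side of accepting */
        if (seen[row][bit >> 6] & mask)
                return 1;
        seen[row][bit >> 6] |= mask;
        return 0;
}

/* HSR: one row per egress slave port */
static int hsr_frame_out(unsigned int egress_port, uint16_t seq)
{
        return check_duplicate(egress_port, seq);
}

/* PRP: outgoing frames are always in order; only slave-to-master frames are
 * deduplicated, and slave-to-slave forwarding is suppressed entirely.
 */
static int prp_frame_out(bool rcv_on_master, bool xmit_on_master, uint16_t seq)
{
        if (rcv_on_master)
                return 0;
        if (!xmit_on_master)
                return 1;
        return check_duplicate(0, seq);
}

int main(void)
{
        printf("%d", hsr_frame_out(0, 42));     /* 0: first time on port A */
        printf(" %d", hsr_frame_out(1, 42));    /* 0: port B tracks separately */
        printf(" %d\n", hsr_frame_out(0, 42));  /* 1: duplicate on port A */
        printf("%d\n", prp_frame_out(false, true, 7));  /* 0 */
        return 0;
}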
+ * Return: + * 1 if frame can be shown to have been sent recently on this interface, + * 0 otherwise + */ +int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) +{ + return hsr_check_duplicate(frame, port->type - 1); +} + +/* PRP duplicate discard: we only consider frames that are received on port A + * or port B and should go to the master port. For those, we check if they have + * already been received by the host, i.e., master port. The check uses the + * general duplicate discard algorithm, but without tracking multiple ports. * * 'port' is the outgoing interface * 'frame' is the frame to be sent @@ -526,18 +613,9 @@ int hsr_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) */ int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) { - enum hsr_port_type other_port; - enum hsr_port_type rcv_port; - struct hsr_node *node; - u16 sequence_diff; - u16 sequence_exp; - u16 sequence_nr; - - /* out-going frames are always in order - * and can be checked the same way as for HSR - */ + /* out-going frames are always in order */ if (frame->port_rcv->type == HSR_PT_MASTER) - return hsr_register_frame_out(port, frame); + return 0; /* for PRP we should only forward frames from the slave ports * to the master port @@ -545,52 +623,9 @@ int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame) if (port->type != HSR_PT_MASTER) return 1; - node = frame->node_src; - sequence_nr = frame->sequence_nr; - sequence_exp = sequence_nr + 1; - rcv_port = frame->port_rcv->type; - other_port = rcv_port == HSR_PT_SLAVE_A ? HSR_PT_SLAVE_B : - HSR_PT_SLAVE_A; - - spin_lock_bh(&node->seq_out_lock); - if (time_is_before_jiffies(node->time_out[port->type] + - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME)) || - (node->seq_start[rcv_port] == node->seq_expected[rcv_port] && - node->seq_start[other_port] == node->seq_expected[other_port])) { - /* the node hasn't been sending for a while - * or both drop windows are empty, forward the frame - */ - node->seq_start[rcv_port] = sequence_nr; - } else if (seq_nr_before(sequence_nr, node->seq_expected[other_port]) && - seq_nr_before_or_eq(node->seq_start[other_port], sequence_nr)) { - /* drop the frame, update the drop window for the other port - * and reset our drop window - */ - node->seq_start[other_port] = sequence_exp; - node->seq_expected[rcv_port] = sequence_exp; - node->seq_start[rcv_port] = node->seq_expected[rcv_port]; - spin_unlock_bh(&node->seq_out_lock); - return 1; - } - - /* update the drop window for the port where this frame was received - * and clear the drop window for the other port - */ - node->seq_start[other_port] = node->seq_expected[other_port]; - node->seq_expected[rcv_port] = sequence_exp; - sequence_diff = sequence_exp - node->seq_start[rcv_port]; - if (sequence_diff > PRP_DROP_WINDOW_LEN) - node->seq_start[rcv_port] = sequence_exp - PRP_DROP_WINDOW_LEN; - - node->time_out[port->type] = jiffies; - node->seq_out[port->type] = sequence_nr; - spin_unlock_bh(&node->seq_out_lock); - return 0; + return hsr_check_duplicate(frame, 0); } - -#if IS_MODULE(CONFIG_PRP_DUP_DISCARD_KUNIT_TEST) -EXPORT_SYMBOL(prp_register_frame_out); -#endif +EXPORT_SYMBOL_IF_KUNIT(prp_register_frame_out); static struct hsr_port *get_late_port(struct hsr_priv *hsr, struct hsr_node *node) @@ -672,7 +707,7 @@ void hsr_prune_nodes(struct timer_list *t) list_del_rcu(&node->mac_list); node->removed = true; /* Note that we need to free this entry later: */ - kfree_rcu(node, rcu_head); + 
call_rcu(&node->rcu_head, hsr_free_node_rcu); } } } @@ -706,7 +741,7 @@ void hsr_prune_proxy_nodes(struct timer_list *t) list_del_rcu(&node->mac_list); node->removed = true; /* Note that we need to free this entry later: */ - kfree_rcu(node, rcu_head); + call_rcu(&node->rcu_head, hsr_free_node_rcu); } } } @@ -740,6 +775,39 @@ void *hsr_get_next_node(struct hsr_priv *hsr, void *_pos, return NULL; } +/* Fill the last sequence number that has been received from node on if1 by + * finding the last sequence number sent on port B; accordingly get the last + * received sequence number for if2 using sent sequence numbers on port A. + */ +static void fill_last_seq_nrs(struct hsr_node *node, u16 *if1_seq, u16 *if2_seq) +{ + struct hsr_seq_block *block; + unsigned int block_off; + size_t block_sz; + u16 seq_bit; + + spin_lock_bh(&node->seq_out_lock); + + /* Get last inserted block */ + block_off = (node->next_block - 1) & (HSR_MAX_SEQ_BLOCKS - 1); + block_sz = hsr_seq_block_size(node); + block = node->block_buf + block_off * block_sz; + + if (!bitmap_empty(block->seq_nrs[HSR_PT_SLAVE_B - 1], + HSR_SEQ_BLOCK_SIZE)) { + seq_bit = find_last_bit(block->seq_nrs[HSR_PT_SLAVE_B - 1], + HSR_SEQ_BLOCK_SIZE); + *if1_seq = (block->block_idx << HSR_SEQ_BLOCK_SHIFT) | seq_bit; + } + if (!bitmap_empty(block->seq_nrs[HSR_PT_SLAVE_A - 1], + HSR_SEQ_BLOCK_SIZE)) { + seq_bit = find_last_bit(block->seq_nrs[HSR_PT_SLAVE_A - 1], + HSR_SEQ_BLOCK_SIZE); + *if2_seq = (block->block_idx << HSR_SEQ_BLOCK_SHIFT) | seq_bit; + } + spin_unlock_bh(&node->seq_out_lock); +} + int hsr_get_node_data(struct hsr_priv *hsr, const unsigned char *addr, unsigned char addr_b[ETH_ALEN], @@ -780,8 +848,10 @@ int hsr_get_node_data(struct hsr_priv *hsr, *if2_age = jiffies_to_msecs(tdiff); /* Present sequence numbers as if they were incoming on interface */ - *if1_seq = node->seq_out[HSR_PT_SLAVE_B]; - *if2_seq = node->seq_out[HSR_PT_SLAVE_A]; + *if1_seq = 0; + *if2_seq = 0; + if (hsr->prot_version != PRP_V1) + fill_last_seq_nrs(node, if1_seq, if2_seq); if (node->addr_B_port != HSR_PT_NONE) { port = hsr_port_get_hsr(hsr, node->addr_B_port); diff --git a/net/hsr/hsr_framereg.h b/net/hsr/hsr_framereg.h index b04948659d84..c65ecb925734 100644 --- a/net/hsr/hsr_framereg.h +++ b/net/hsr/hsr_framereg.h @@ -74,9 +74,30 @@ bool hsr_is_node_in_db(struct list_head *node_db, int prp_register_frame_out(struct hsr_port *port, struct hsr_frame_info *frame); +#if IS_ENABLED(CONFIG_KUNIT) +struct hsr_seq_block *hsr_get_seq_block(struct hsr_node *node, u16 block_idx); +#endif + +#define HSR_SEQ_BLOCK_SHIFT 7 /* 128 bits */ +#define HSR_SEQ_BLOCK_SIZE (1 << HSR_SEQ_BLOCK_SHIFT) +#define HSR_SEQ_BLOCK_MASK (HSR_SEQ_BLOCK_SIZE - 1) +#define HSR_MAX_SEQ_BLOCKS 64 + +#define hsr_seq_block_index(sequence_nr) ((sequence_nr) >> HSR_SEQ_BLOCK_SHIFT) +#define hsr_seq_block_bit(sequence_nr) ((sequence_nr) & HSR_SEQ_BLOCK_MASK) + +struct hsr_seq_block { + unsigned long time; + u16 block_idx; + /* Should be a flexible array member of what DECLARE_BITMAP() would + * produce. 
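struct hsr_seq_block ends in a flexible two-dimensional bitmap, so the size of one block depends on how many ports a node tracks (one row for PRP, several for HSR); hsr_seq_block_size() computes that size, and the node's block_buf is then carved into slots by multiplying it out, as fill_last_seq_nrs() above does. A standalone sketch of the sizing and slot arithmetic; the sizeof-based computation is roughly what struct_size_t() does in the kernel, and the names here are invented for the illustration.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define SEQ_BLOCK_BITS  128
#define ROW_LONGS       (SEQ_BLOCK_BITS / (8 * sizeof(unsigned long)))
#define MAX_SEQ_BLOCKS  64

struct seq_block {
        unsigned long time;
        uint16_t block_idx;
        unsigned long seq_nrs[][ROW_LONGS];     /* one bitmap row per port */
};

/* Size of one block for a node tracking @port_cnt ports. */
static size_t seq_block_size(unsigned int port_cnt)
{
        return sizeof(struct seq_block) +
               port_cnt * sizeof(unsigned long [ROW_LONGS]);
}

int main(void)
{
        unsigned int port_cnt = 4;                      /* an HSR-style node */
        size_t blk_sz = seq_block_size(port_cnt);
        unsigned char *buf = calloc(MAX_SEQ_BLOCKS, blk_sz);
        struct seq_block *slot;

        if (!buf)
                return 1;
        /* slot i starts at a byte offset, like node->block_buf in the patch */
        slot = (struct seq_block *)(buf + 3 * blk_sz);
        slot->block_idx = 42;
        printf("one block: %zu bytes, slot 3 idx=%u\n",
               blk_sz, (unsigned int)slot->block_idx);
        free(buf);
        return 0;
}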
+ */ + unsigned long seq_nrs[][BITS_TO_LONGS(HSR_SEQ_BLOCK_SIZE)]; +}; + struct hsr_node { struct list_head mac_list; - /* Protect R/W access to seq_out */ + /* Protect R/W access seq_blocks */ spinlock_t seq_out_lock; unsigned char macaddress_A[ETH_ALEN]; unsigned char macaddress_B[ETH_ALEN]; @@ -84,16 +105,22 @@ struct hsr_node { enum hsr_port_type addr_B_port; unsigned long time_in[HSR_PT_PORTS]; bool time_in_stale[HSR_PT_PORTS]; - unsigned long time_out[HSR_PT_PORTS]; /* if the node is a SAN */ bool san_a; bool san_b; - u16 seq_out[HSR_PT_PORTS]; bool removed; - /* PRP specific duplicate handling */ - u16 seq_expected[HSR_PT_PORTS]; - u16 seq_start[HSR_PT_PORTS]; + /* Duplicate detection */ + struct xarray seq_blocks; + void *block_buf; + unsigned int next_block; + unsigned int seq_port_cnt; struct rcu_head rcu_head; }; +static inline size_t hsr_seq_block_size(struct hsr_node *node) +{ + WARN_ON_ONCE(node->seq_port_cnt == 0); + return struct_size_t(struct hsr_seq_block, seq_nrs, node->seq_port_cnt); +} + #endif /* __HSR_FRAMEREG_H */ diff --git a/net/hsr/prp_dup_discard_test.c b/net/hsr/prp_dup_discard_test.c index e86b7b633ae8..b028e71e6a0f 100644 --- a/net/hsr/prp_dup_discard_test.c +++ b/net/hsr/prp_dup_discard_test.c @@ -4,6 +4,8 @@ #include "hsr_main.h" #include "hsr_framereg.h" +MODULE_IMPORT_NS("EXPORTED_FOR_KUNIT_TESTING"); + struct prp_test_data { struct hsr_port port; struct hsr_port port_rcv; @@ -13,37 +15,55 @@ struct prp_test_data { static struct prp_test_data *build_prp_test_data(struct kunit *test) { + size_t block_sz; + struct prp_test_data *data = kunit_kzalloc(test, sizeof(struct prp_test_data), GFP_USER); KUNIT_EXPECT_NOT_ERR_OR_NULL(test, data); + data->node.seq_port_cnt = 1; + block_sz = hsr_seq_block_size(&data->node); + data->node.block_buf = kunit_kcalloc(test, HSR_MAX_SEQ_BLOCKS, block_sz, + GFP_ATOMIC); + KUNIT_EXPECT_NOT_ERR_OR_NULL(test, data->node.block_buf); + + xa_init(&data->node.seq_blocks); + spin_lock_init(&data->node.seq_out_lock); + data->frame.node_src = &data->node; data->frame.port_rcv = &data->port_rcv; data->port_rcv.type = HSR_PT_SLAVE_A; - data->node.seq_start[HSR_PT_SLAVE_A] = 1; - data->node.seq_expected[HSR_PT_SLAVE_A] = 1; - data->node.seq_start[HSR_PT_SLAVE_B] = 1; - data->node.seq_expected[HSR_PT_SLAVE_B] = 1; - data->node.seq_out[HSR_PT_MASTER] = 0; - data->node.time_out[HSR_PT_MASTER] = jiffies; data->port.type = HSR_PT_MASTER; return data; } -static void check_prp_counters(struct kunit *test, - struct prp_test_data *data, - u16 seq_start_a, u16 seq_expected_a, - u16 seq_start_b, u16 seq_expected_b) +static void check_prp_frame_seen(struct kunit *test, struct prp_test_data *data, + u16 sequence_nr) +{ + u16 block_idx, seq_bit; + struct hsr_seq_block *block; + + block_idx = sequence_nr >> HSR_SEQ_BLOCK_SHIFT; + block = xa_load(&data->node.seq_blocks, block_idx); + KUNIT_EXPECT_NOT_NULL(test, block); + + seq_bit = sequence_nr & HSR_SEQ_BLOCK_MASK; + KUNIT_EXPECT_TRUE(test, test_bit(seq_bit, block->seq_nrs[0])); +} + +static void check_prp_frame_unseen(struct kunit *test, + struct prp_test_data *data, u16 sequence_nr) { - KUNIT_EXPECT_EQ(test, data->node.seq_start[HSR_PT_SLAVE_A], - seq_start_a); - KUNIT_EXPECT_EQ(test, data->node.seq_start[HSR_PT_SLAVE_B], - seq_start_b); - KUNIT_EXPECT_EQ(test, data->node.seq_expected[HSR_PT_SLAVE_A], - seq_expected_a); - KUNIT_EXPECT_EQ(test, data->node.seq_expected[HSR_PT_SLAVE_B], - seq_expected_b); + u16 block_idx, seq_bit; + struct hsr_seq_block *block; + + block_idx = sequence_nr >> 
HSR_SEQ_BLOCK_SHIFT; + block = hsr_get_seq_block(&data->node, block_idx); + KUNIT_EXPECT_NOT_NULL(test, block); + + seq_bit = sequence_nr & HSR_SEQ_BLOCK_MASK; + KUNIT_EXPECT_FALSE(test, test_bit(seq_bit, block->seq_nrs[0])); } static void prp_dup_discard_forward(struct kunit *test) @@ -54,52 +74,48 @@ static void prp_dup_discard_forward(struct kunit *test) data->frame.sequence_nr = 2; KUNIT_EXPECT_EQ(test, 0, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, - data->node.seq_out[HSR_PT_MASTER]); - KUNIT_EXPECT_EQ(test, jiffies, data->node.time_out[HSR_PT_MASTER]); - check_prp_counters(test, data, data->frame.sequence_nr, - data->frame.sequence_nr + 1, 1, 1); + check_prp_frame_seen(test, data, data->frame.sequence_nr); } -static void prp_dup_discard_inside_dropwindow(struct kunit *test) +static void prp_dup_discard_drop_duplicate(struct kunit *test) { - /* Normal situation, other LAN ahead by one. Frame is dropped */ struct prp_test_data *data = build_prp_test_data(test); - unsigned long time = jiffies - 10; - data->frame.sequence_nr = 1; - data->node.seq_expected[HSR_PT_SLAVE_B] = 3; - data->node.seq_out[HSR_PT_MASTER] = 2; - data->node.time_out[HSR_PT_MASTER] = time; + data->frame.sequence_nr = 2; + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + check_prp_frame_seen(test, data, data->frame.sequence_nr); KUNIT_EXPECT_EQ(test, 1, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, 2, data->node.seq_out[HSR_PT_MASTER]); - KUNIT_EXPECT_EQ(test, time, data->node.time_out[HSR_PT_MASTER]); - check_prp_counters(test, data, 2, 2, 2, 3); + check_prp_frame_seen(test, data, data->frame.sequence_nr); } -static void prp_dup_discard_node_timeout(struct kunit *test) +static void prp_dup_discard_entry_timeout(struct kunit *test) { /* Timeout situation, node hasn't sent anything for a while */ struct prp_test_data *data = build_prp_test_data(test); + struct hsr_seq_block *block; + u16 block_idx; data->frame.sequence_nr = 7; - data->node.seq_start[HSR_PT_SLAVE_A] = 1234; - data->node.seq_expected[HSR_PT_SLAVE_A] = 1235; - data->node.seq_start[HSR_PT_SLAVE_B] = 1234; - data->node.seq_expected[HSR_PT_SLAVE_B] = 1234; - data->node.seq_out[HSR_PT_MASTER] = 1234; - data->node.time_out[HSR_PT_MASTER] = - jiffies - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME) - 1; + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + check_prp_frame_seen(test, data, data->frame.sequence_nr); + + data->frame.sequence_nr = 11; + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + check_prp_frame_seen(test, data, data->frame.sequence_nr); + + block_idx = data->frame.sequence_nr >> HSR_SEQ_BLOCK_SHIFT; + block = hsr_get_seq_block(&data->node, block_idx); + block->time = jiffies - msecs_to_jiffies(HSR_ENTRY_FORGET_TIME) - 1; KUNIT_EXPECT_EQ(test, 0, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, - data->node.seq_out[HSR_PT_MASTER]); - KUNIT_EXPECT_EQ(test, jiffies, data->node.time_out[HSR_PT_MASTER]); - check_prp_counters(test, data, data->frame.sequence_nr, - data->frame.sequence_nr + 1, 1234, 1234); + check_prp_frame_seen(test, data, data->frame.sequence_nr); + check_prp_frame_unseen(test, data, 7); } static void prp_dup_discard_out_of_sequence(struct kunit *test) @@ -107,50 +123,36 @@ static void prp_dup_discard_out_of_sequence(struct kunit *test) /* One frame is received out of sequence on both LANs */ struct 
prp_test_data *data = build_prp_test_data(test); - data->node.seq_start[HSR_PT_SLAVE_A] = 10; - data->node.seq_expected[HSR_PT_SLAVE_A] = 10; - data->node.seq_start[HSR_PT_SLAVE_B] = 10; - data->node.seq_expected[HSR_PT_SLAVE_B] = 10; - data->node.seq_out[HSR_PT_MASTER] = 9; + /* initial frame, should be accepted */ + data->frame.sequence_nr = 9; + KUNIT_EXPECT_EQ(test, 0, + prp_register_frame_out(&data->port, &data->frame)); + check_prp_frame_seen(test, data, data->frame.sequence_nr); /* 1st old frame, should be accepted */ data->frame.sequence_nr = 8; KUNIT_EXPECT_EQ(test, 0, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, - data->node.seq_out[HSR_PT_MASTER]); - check_prp_counters(test, data, data->frame.sequence_nr, - data->frame.sequence_nr + 1, 10, 10); + check_prp_frame_seen(test, data, data->frame.sequence_nr); /* 2nd frame should be dropped */ data->frame.sequence_nr = 8; data->port_rcv.type = HSR_PT_SLAVE_B; KUNIT_EXPECT_EQ(test, 1, prp_register_frame_out(&data->port, &data->frame)); - check_prp_counters(test, data, data->frame.sequence_nr + 1, - data->frame.sequence_nr + 1, - data->frame.sequence_nr + 1, - data->frame.sequence_nr + 1); /* Next frame, this is forwarded */ data->frame.sequence_nr = 10; data->port_rcv.type = HSR_PT_SLAVE_A; KUNIT_EXPECT_EQ(test, 0, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, - data->node.seq_out[HSR_PT_MASTER]); - check_prp_counters(test, data, data->frame.sequence_nr, - data->frame.sequence_nr + 1, 9, 9); + check_prp_frame_seen(test, data, data->frame.sequence_nr); /* and next one is dropped */ data->frame.sequence_nr = 10; data->port_rcv.type = HSR_PT_SLAVE_B; KUNIT_EXPECT_EQ(test, 1, prp_register_frame_out(&data->port, &data->frame)); - check_prp_counters(test, data, data->frame.sequence_nr + 1, - data->frame.sequence_nr + 1, - data->frame.sequence_nr + 1, - data->frame.sequence_nr + 1); } static void prp_dup_discard_lan_b_late(struct kunit *test) @@ -158,43 +160,31 @@ static void prp_dup_discard_lan_b_late(struct kunit *test) /* LAN B is behind */ struct prp_test_data *data = build_prp_test_data(test); - data->node.seq_start[HSR_PT_SLAVE_A] = 9; - data->node.seq_expected[HSR_PT_SLAVE_A] = 9; - data->node.seq_start[HSR_PT_SLAVE_B] = 9; - data->node.seq_expected[HSR_PT_SLAVE_B] = 9; - data->node.seq_out[HSR_PT_MASTER] = 8; - data->frame.sequence_nr = 9; KUNIT_EXPECT_EQ(test, 0, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, - data->node.seq_out[HSR_PT_MASTER]); - check_prp_counters(test, data, 9, 10, 9, 9); + check_prp_frame_seen(test, data, data->frame.sequence_nr); data->frame.sequence_nr = 10; KUNIT_EXPECT_EQ(test, 0, prp_register_frame_out(&data->port, &data->frame)); - KUNIT_EXPECT_EQ(test, data->frame.sequence_nr, - data->node.seq_out[HSR_PT_MASTER]); - check_prp_counters(test, data, 9, 11, 9, 9); + check_prp_frame_seen(test, data, data->frame.sequence_nr); data->frame.sequence_nr = 9; data->port_rcv.type = HSR_PT_SLAVE_B; KUNIT_EXPECT_EQ(test, 1, prp_register_frame_out(&data->port, &data->frame)); - check_prp_counters(test, data, 10, 11, 10, 10); data->frame.sequence_nr = 10; data->port_rcv.type = HSR_PT_SLAVE_B; KUNIT_EXPECT_EQ(test, 1, prp_register_frame_out(&data->port, &data->frame)); - check_prp_counters(test, data, 11, 11, 11, 11); } static struct kunit_case prp_dup_discard_test_cases[] = { KUNIT_CASE(prp_dup_discard_forward), - 
KUNIT_CASE(prp_dup_discard_inside_dropwindow), - KUNIT_CASE(prp_dup_discard_node_timeout), + KUNIT_CASE(prp_dup_discard_drop_duplicate), + KUNIT_CASE(prp_dup_discard_entry_timeout), KUNIT_CASE(prp_dup_discard_out_of_sequence), KUNIT_CASE(prp_dup_discard_lan_b_late), {} diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ec36d2ec059e..18108a6f0499 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -9,7 +9,7 @@ obj-y := route.o inetpeer.o protocol.o \ inet_timewait_sock.o inet_connection_sock.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ - tcp_rate.o tcp_recovery.o tcp_ulp.o \ + tcp_recovery.o tcp_ulp.o \ tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 709021197e1c..32b951ebc0c2 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -2196,7 +2196,8 @@ int cipso_v4_skbuff_setattr(struct sk_buff *skb, /* if we don't ensure enough headroom we could panic on the skb_push() * call below so make sure we have enough, we are also "mangling" the * packet so we should probably do a copy-on-write call anyway */ - ret_val = skb_cow(skb, skb_headroom(skb) + len_delta); + ret_val = skb_cow(skb, + skb_headroom(skb) + (len_delta > 0 ? len_delta : 0)); if (ret_val < 0) return ret_val; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index f9b9e26c32c1..0b72796dd1ad 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -28,8 +28,10 @@ struct fib_alias { /* Don't write on fa_state unless needed, to keep it shared on all cpus */ static inline void fib_alias_accessed(struct fib_alias *fa) { - if (!(fa->fa_state & FA_S_ACCESSED)) - fa->fa_state |= FA_S_ACCESSED; + u8 fa_state = READ_ONCE(fa->fa_state); + + if (!(fa_state & FA_S_ACCESSED)) + WRITE_ONCE(fa->fa_state, fa_state | FA_S_ACCESSED); } /* Exported by fib_semantics.c */ diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 7e2c17fec3fc..1308213791f1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1280,7 +1280,7 @@ int fib_table_insert(struct net *net, struct fib_table *tb, new_fa->fa_dscp = fa->fa_dscp; new_fa->fa_info = fi; new_fa->fa_type = cfg->fc_type; - state = fa->fa_state; + state = READ_ONCE(fa->fa_state); new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_slen = fa->fa_slen; new_fa->tb_id = tb->tb_id; @@ -1745,7 +1745,7 @@ int fib_table_delete(struct net *net, struct fib_table *tb, fib_remove_alias(t, tp, l, fa_to_delete); - if (fa_to_delete->fa_state & FA_S_ACCESSED) + if (READ_ONCE(fa_to_delete->fa_state) & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net); fib_release_info(fa_to_delete->fa_info); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 4abbec2f47ef..e216b6df6331 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -112,7 +112,9 @@ struct icmp_bxm { __be32 times[3]; } data; int head_len; - struct ip_options_data replyopts; + + /* Must be last as it ends in a flexible-array member. */ + struct ip_options_rcu replyopts; }; /* An array of errno for error messages from dest unreach. 
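The fib_alias_accessed() change above switches a plain test-and-set of fa_state to READ_ONCE()/WRITE_ONCE(): the flag is also read locklessly in the fib_table_insert()/fib_table_delete() hunks converted above, and the store is skipped when the bit is already set so the shared cacheline stays clean. A minimal sketch of the same pattern on a hypothetical flag byte (names are illustrative, not from the patch):

#include <linux/compiler.h>	/* READ_ONCE()/WRITE_ONCE() */

#define F_ACCESSED	0x04

/* Mark a flag that other CPUs read without a lock.  The marked load
 * and store keep the accesses tear-free and document the intentional
 * data race; the non-atomic read-modify-write is only safe here
 * because every writer sets this same single bit and nothing else.
 */
static inline void mark_accessed(u8 *state)
{
	u8 old = READ_ONCE(*state);

	if (!(old & F_ACCESSED))
		WRITE_ONCE(*state, old | F_ACCESSED);
}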
*/ @@ -353,9 +355,12 @@ void icmp_out_count(struct net *net, unsigned char type) static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) { - struct icmp_bxm *icmp_param = from; + DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); __wsum csum; + icmp_param = from; + csum = skb_copy_and_csum_bits(icmp_param->skb, icmp_param->offset + offset, to, len); @@ -413,7 +418,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) int type = icmp_param->data.icmph.type; int code = icmp_param->data.icmph.code; - if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb)) + if (ip_options_echo(net, &icmp_param->replyopts.opt, skb)) return; /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ @@ -435,10 +440,10 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) daddr = ipc.addr = ip_hdr(skb)->saddr; saddr = fib_compute_spec_dst(skb); - if (icmp_param->replyopts.opt.opt.optlen) { - ipc.opt = &icmp_param->replyopts.opt; + if (icmp_param->replyopts.opt.optlen) { + ipc.opt = &icmp_param->replyopts; if (ipc.opt->opt.srr) - daddr = icmp_param->replyopts.opt.opt.faddr; + daddr = icmp_param->replyopts.opt.faddr; } memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; @@ -491,8 +496,8 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, int err; memset(fl4, 0, sizeof(*fl4)); - fl4->daddr = (param->replyopts.opt.opt.srr ? - param->replyopts.opt.opt.faddr : iph->saddr); + fl4->daddr = (param->replyopts.opt.srr ? + param->replyopts.opt.faddr : iph->saddr); fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); @@ -554,6 +559,21 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, /* steal dst entry from skb_in, don't drop refcnt */ skb_dstref_steal(skb_in); skb_dstref_restore(skb_in, orefdst); + + /* + * At this point, fl4_dec.daddr should NOT be local (we + * checked fl4_dec.saddr above). However, a race condition + * may occur if the address is added to the interface + * concurrently. In that case, ip_route_input() returns a + * LOCAL route with dst.output=ip_rt_bug, which must not + * be used for output. + */ + if (!err && rt2 && rt2->rt_type == RTN_LOCAL) { + net_warn_ratelimited("detected local route for %pI4 during ICMP sending, src %pI4\n", + &fl4_dec.daddr, &fl4_dec.saddr); + dst_release(&rt2->dst); + err = -EINVAL; + } } if (err) @@ -775,9 +795,10 @@ free_skb: void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, const struct inet_skb_parm *parm) { + DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct iphdr *iph; int room; - struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); bool apply_ratelimit = false; struct sk_buff *ext_skb; @@ -906,7 +927,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, + if (__ip_options_echo(net, &icmp_param->replyopts.opt, skb_in, &parm->opt)) goto out_unlock; @@ -915,21 +936,21 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, * Prepare data for ICMP header. 
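struct icmp_bxm now embeds struct ip_options_rcu directly, and since that type ends in the __data[] flexible array it must be the last member; a plain on-stack instance would reserve no space for the option bytes, which is why the users above switch to DEFINE_RAW_FLEX(). A simplified sketch of what the macro provides (an approximation of include/linux/overflow.h, not a copy; IP_OPTIONS_DATA_FIXED_SIZE is the element count used by this series):

/* Conceptually,
 *	DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param,
 *			replyopts.opt.__data, IP_OPTIONS_DATA_FIXED_SIZE);
 * reserves storage for the struct plus the trailing array and hands
 * back a typed pointer, roughly:
 */
u8 __icmp_param_buf[sizeof(struct icmp_bxm) +
		    IP_OPTIONS_DATA_FIXED_SIZE *
		    sizeof(((struct icmp_bxm *)0)->replyopts.opt.__data[0])] = {};
struct icmp_bxm *icmp_param = (struct icmp_bxm *)__icmp_param_buf;

Because the callers now hold a pointer instead of a plain local, the accesses in the hunks above change from icmp_param.field to icmp_param->field.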
*/ - icmp_param.data.icmph.type = type; - icmp_param.data.icmph.code = code; - icmp_param.data.icmph.un.gateway = info; - icmp_param.data.icmph.checksum = 0; - icmp_param.skb = skb_in; - icmp_param.offset = skb_network_offset(skb_in); + icmp_param->data.icmph.type = type; + icmp_param->data.icmph.code = code; + icmp_param->data.icmph.un.gateway = info; + icmp_param->data.icmph.checksum = 0; + icmp_param->skb = skb_in; + icmp_param->offset = skb_network_offset(skb_in); ipcm_init(&ipc); ipc.tos = tos; ipc.addr = iph->saddr; - ipc.opt = &icmp_param.replyopts.opt; + ipc.opt = &icmp_param->replyopts; ipc.sockc.mark = mark; rt = icmp_route_lookup(net, &fl4, skb_in, iph, saddr, inet_dsfield_to_dscp(tos), mark, type, code, - &icmp_param); + icmp_param); if (IS_ERR(rt)) goto out_unlock; @@ -939,10 +960,10 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, /* RFC says return as much as we can without exceeding 576 bytes. */ - room = dst_mtu(&rt->dst); + room = dst4_mtu(&rt->dst); if (room > 576) room = 576; - room -= sizeof(struct iphdr) + icmp_param.replyopts.opt.opt.optlen; + room -= sizeof(struct iphdr) + icmp_param->replyopts.opt.optlen; room -= sizeof(struct icmphdr); /* Guard against tiny mtu. We need to include at least one * IP network header for this message to make any sense. @@ -950,15 +971,15 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, if (room <= (int)sizeof(struct iphdr)) goto ende; - ext_skb = icmp_ext_append(net, skb_in, &icmp_param.data.icmph, room, + ext_skb = icmp_ext_append(net, skb_in, &icmp_param->data.icmph, room, parm->iif); if (ext_skb) - icmp_param.skb = ext_skb; + icmp_param->skb = ext_skb; - icmp_param.data_len = icmp_param.skb->len - icmp_param.offset; - if (icmp_param.data_len > room) - icmp_param.data_len = room; - icmp_param.head_len = sizeof(struct icmphdr); + icmp_param->data_len = icmp_param->skb->len - icmp_param->offset; + if (icmp_param->data_len > room) + icmp_param->data_len = room; + icmp_param->head_len = sizeof(struct icmphdr); /* if we don't have a source address at this point, fall back to the * dummy address instead of sending out a packet with a source address @@ -969,7 +990,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, trace_icmp_send(skb_in, type, code); - icmp_push_reply(sk, &icmp_param, &fl4, &ipc, &rt); + icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt); if (ext_skb) consume_skb(ext_skb); @@ -1031,16 +1052,22 @@ static void icmp_socket_deliver(struct sk_buff *skb, u32 info) /* Checkin full IP header plus 8 bytes of protocol to * avoid additional coding at protocol handlers. */ - if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) { - __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); - return; - } + if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) + goto out; + + /* IPPROTO_RAW sockets are not supposed to receive anything. 
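The room calculation in the __icmp_send() hunk above enforces the classic rule that an ICMP error, including the IP header and options, must not exceed 576 bytes. A worked example with assumed values (path MTU 1500, no IP options):

/*	room = min(dst4_mtu(&rt->dst), 576)	= 576
 *	room -= sizeof(struct iphdr)		= 576 - 20 = 556
 *	room -= replyopts.opt.optlen		= 556 -  0 = 556
 *	room -= sizeof(struct icmphdr)		= 556 -  8 = 548
 *
 * so data_len, the amount of the offending datagram (or of the
 * extension skb built by icmp_ext_append()) quoted back to the
 * sender, is clamped to 548 bytes in this case.
 */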
*/ + if (protocol == IPPROTO_RAW) + goto out; raw_icmp_error(skb, protocol, info); ipprot = rcu_dereference(inet_protos[protocol]); if (ipprot && ipprot->err_handler) ipprot->err_handler(skb, info); + return; + +out: + __ICMP_INC_STATS(dev_net_rcu(skb->dev), ICMP_MIB_INERRORS); } static bool icmp_tag_validation(int proto) @@ -1206,7 +1233,8 @@ static enum skb_drop_reason icmp_redirect(struct sk_buff *skb) static enum skb_drop_reason icmp_echo(struct sk_buff *skb) { - struct icmp_bxm icmp_param; + DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct net *net; net = skb_dst_dev_net_rcu(skb); @@ -1214,18 +1242,18 @@ static enum skb_drop_reason icmp_echo(struct sk_buff *skb) if (READ_ONCE(net->ipv4.sysctl_icmp_echo_ignore_all)) return SKB_NOT_DROPPED_YET; - icmp_param.data.icmph = *icmp_hdr(skb); - icmp_param.skb = skb; - icmp_param.offset = 0; - icmp_param.data_len = skb->len; - icmp_param.head_len = sizeof(struct icmphdr); + icmp_param->data.icmph = *icmp_hdr(skb); + icmp_param->skb = skb; + icmp_param->offset = 0; + icmp_param->data_len = skb->len; + icmp_param->head_len = sizeof(struct icmphdr); - if (icmp_param.data.icmph.type == ICMP_ECHO) - icmp_param.data.icmph.type = ICMP_ECHOREPLY; - else if (!icmp_build_probe(skb, &icmp_param.data.icmph)) + if (icmp_param->data.icmph.type == ICMP_ECHO) + icmp_param->data.icmph.type = ICMP_ECHOREPLY; + else if (!icmp_build_probe(skb, &icmp_param->data.icmph)) return SKB_NOT_DROPPED_YET; - icmp_reply(&icmp_param, skb); + icmp_reply(icmp_param, skb); return SKB_NOT_DROPPED_YET; } @@ -1353,7 +1381,8 @@ EXPORT_SYMBOL_GPL(icmp_build_probe); */ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) { - struct icmp_bxm icmp_param; + DEFINE_RAW_FLEX(struct icmp_bxm, icmp_param, replyopts.opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); /* * Too short. 
*/ @@ -1363,19 +1392,19 @@ static enum skb_drop_reason icmp_timestamp(struct sk_buff *skb) /* * Fill in the current time as ms since midnight UT: */ - icmp_param.data.times[1] = inet_current_timestamp(); - icmp_param.data.times[2] = icmp_param.data.times[1]; - - BUG_ON(skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)); - - icmp_param.data.icmph = *icmp_hdr(skb); - icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; - icmp_param.data.icmph.code = 0; - icmp_param.skb = skb; - icmp_param.offset = 0; - icmp_param.data_len = 0; - icmp_param.head_len = sizeof(struct icmphdr) + 12; - icmp_reply(&icmp_param, skb); + icmp_param->data.times[1] = inet_current_timestamp(); + icmp_param->data.times[2] = icmp_param->data.times[1]; + + BUG_ON(skb_copy_bits(skb, 0, &icmp_param->data.times[0], 4)); + + icmp_param->data.icmph = *icmp_hdr(skb); + icmp_param->data.icmph.type = ICMP_TIMESTAMPREPLY; + icmp_param->data.icmph.code = 0; + icmp_param->skb = skb; + icmp_param->offset = 0; + icmp_param->data_len = 0; + icmp_param->head_len = sizeof(struct icmphdr) + 12; + icmp_reply(icmp_param, skb); return SKB_NOT_DROPPED_YET; out_err: diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 7182f1419c2a..0adc993c211d 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -227,7 +227,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) static void igmp_gq_start_timer(struct in_device *in_dev) { - int tv = get_random_u32_below(in_dev->mr_maxdelay); + int tv = get_random_u32_below(READ_ONCE(in_dev->mr_maxdelay)); unsigned long exp = jiffies + tv + 2; if (in_dev->mr_gq_running && @@ -1009,7 +1009,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); if (!max_delay) max_delay = 1; /* can't mod w/ 0 */ - in_dev->mr_maxdelay = max_delay; + WRITE_ONCE(in_dev->mr_maxdelay, max_delay); /* RFC3376, 4.1.6. QRV and 4.1.7. 
QQIC, when the most recently * received value was zero, use the default or statically diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 97d57c52b9ad..5dfac6ce1110 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -20,6 +20,7 @@ #include <net/tcp_states.h> #include <net/xfrm.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/sock_reuseport.h> #include <net/addrconf.h> @@ -918,6 +919,16 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, } EXPORT_SYMBOL(inet_reqsk_alloc); +void __reqsk_free(struct request_sock *req) +{ + req->rsk_ops->destructor(req); + if (req->rsk_listener) + sock_put(req->rsk_listener); + kfree(req->saved_syn); + kmem_cache_free(req->rsk_ops->slab, req); +} +EXPORT_SYMBOL_GPL(__reqsk_free); + static struct request_sock *inet_reqsk_clone(struct request_sock *req, struct sock *sk) { @@ -1103,6 +1114,8 @@ static void reqsk_timer_handler(struct timer_list *t) (!resend || !tcp_rtx_synack(sk_listener, req) || inet_rsk(req)->acked)) { + if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok) + tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND; if (req->num_timeout++ == 0) atomic_dec(&queue->young); mod_timer(&req->rsk_timer, jiffies + tcp_reqsk_timeout(req)); @@ -1196,7 +1209,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, { struct sock *newsk = sk_clone_lock(sk, priority); struct inet_connection_sock *newicsk; - struct inet_request_sock *ireq; + const struct inet_request_sock *ireq; struct inet_sock *newinet; if (!newsk) @@ -1311,6 +1324,15 @@ static int inet_ulp_can_listen(const struct sock *sk) return 0; } +static void reqsk_queue_alloc(struct request_sock_queue *queue) +{ + queue->fastopenq.rskq_rst_head = NULL; + queue->fastopenq.rskq_rst_tail = NULL; + queue->fastopenq.qlen = 0; + + queue->rskq_accept_head = NULL; +} + int inet_csk_listen_start(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index ff11d3a85a36..e4790cc7b5c2 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1300,7 +1300,7 @@ static int ip_setup_cork(struct sock *sk, struct inet_cork *cork, return -EFAULT; cork->fragsize = ip_sk_use_pmtu(sk) ? 
- dst_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); + dst4_mtu(&rt->dst) : READ_ONCE(rt->dst.dev->mtu); if (!inetdev_valid_mtu(cork->fragsize)) return -ENETUNREACH; @@ -1439,7 +1439,7 @@ struct sk_buff *__ip_make_skb(struct sock *sk, pmtudisc = READ_ONCE(inet->pmtudisc); if (pmtudisc == IP_PMTUDISC_DO || pmtudisc == IP_PMTUDISC_PROBE || - (skb->len <= dst_mtu(&rt->dst) && + (skb->len <= dst4_mtu(&rt->dst) && ip_dont_fragment(sk, &rt->dst))) df = htons(IP_DF); @@ -1606,7 +1606,8 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, const struct ip_reply_arg *arg, unsigned int len, u64 transmit_time, u32 txhash) { - struct ip_options_data replyopts; + DEFINE_RAW_FLEX(struct ip_options_rcu, replyopts, opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct ipcm_cookie ipc; struct flowi4 fl4; struct rtable *rt = skb_rtable(skb); @@ -1615,18 +1616,18 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, int err; int oif; - if (__ip_options_echo(net, &replyopts.opt.opt, skb, sopt)) + if (__ip_options_echo(net, &replyopts->opt, skb, sopt)) return; ipcm_init(&ipc); ipc.addr = daddr; ipc.sockc.transmit_time = transmit_time; - if (replyopts.opt.opt.optlen) { - ipc.opt = &replyopts.opt; + if (replyopts->opt.optlen) { + ipc.opt = replyopts; - if (replyopts.opt.opt.srr) - daddr = replyopts.opt.opt.faddr; + if (replyopts->opt.srr) + daddr = replyopts->opt.faddr; } oif = arg->bound_dev_if; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 6d9c5c20b1c4..c062d9519818 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1634,7 +1634,7 @@ int do_ip_getsockopt(struct sock *sk, int level, int optname, val = 0; dst = sk_dst_get(sk); if (dst) { - val = dst_mtu(dst); + val = dst4_mtu(dst); dst_release(dst); } if (!val) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 019408d3ca2c..b1e1be00ff8b 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -679,8 +679,18 @@ static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 }; static void __init ic_dhcp_init_options(u8 *options, struct ic_device *d) { - u8 mt = ((ic_servaddr == NONE) - ? DHCPDISCOVER : DHCPREQUEST); + static const u8 ic_req_params[] = { + 1, /* Subnet mask */ + 3, /* Default gateway */ + 6, /* DNS server */ + 12, /* Host name */ + 15, /* Domain name */ + 17, /* Boot path */ + 26, /* MTU */ + 40, /* NIS domain name */ + 42, /* NTP servers */ + }; + u8 mt = (ic_servaddr == NONE) ? DHCPDISCOVER : DHCPREQUEST; u8 *e = options; int len; @@ -705,51 +715,36 @@ ic_dhcp_init_options(u8 *options, struct ic_device *d) e += 4; } - /* always? 
*/ - { - static const u8 ic_req_params[] = { - 1, /* Subnet mask */ - 3, /* Default gateway */ - 6, /* DNS server */ - 12, /* Host name */ - 15, /* Domain name */ - 17, /* Boot path */ - 26, /* MTU */ - 40, /* NIS domain name */ - 42, /* NTP servers */ - }; - - *e++ = 55; /* Parameter request list */ - *e++ = sizeof(ic_req_params); - memcpy(e, ic_req_params, sizeof(ic_req_params)); - e += sizeof(ic_req_params); - - if (ic_host_name_set) { - *e++ = 12; /* host-name */ - len = strlen(utsname()->nodename); - *e++ = len; - memcpy(e, utsname()->nodename, len); - e += len; - } - if (*vendor_class_identifier) { - pr_info("DHCP: sending class identifier \"%s\"\n", - vendor_class_identifier); - *e++ = 60; /* Class-identifier */ - len = strlen(vendor_class_identifier); - *e++ = len; - memcpy(e, vendor_class_identifier, len); - e += len; - } - len = strlen(dhcp_client_identifier + 1); - /* the minimum length of identifier is 2, include 1 byte type, - * and can not be larger than the length of options - */ - if (len >= 1 && len < 312 - (e - options) - 1) { - *e++ = 61; - *e++ = len + 1; - memcpy(e, dhcp_client_identifier, len + 1); - e += len + 1; - } + *e++ = 55; /* Parameter request list */ + *e++ = sizeof(ic_req_params); + memcpy(e, ic_req_params, sizeof(ic_req_params)); + e += sizeof(ic_req_params); + + if (ic_host_name_set) { + *e++ = 12; /* host-name */ + len = strlen(utsname()->nodename); + *e++ = len; + memcpy(e, utsname()->nodename, len); + e += len; + } + if (*vendor_class_identifier) { + pr_info("DHCP: sending class identifier \"%s\"\n", + vendor_class_identifier); + *e++ = 60; /* Class-identifier */ + len = strlen(vendor_class_identifier); + *e++ = len; + memcpy(e, vendor_class_identifier, len); + e += len; + } + len = strlen(dhcp_client_identifier + 1); + /* the minimum length of identifier is 2, include 1 byte type, + * and can not be larger than the length of options + */ + if (len >= 1 && len < 312 - (e - options) - 1) { + *e++ = 61; + *e++ = len + 1; + memcpy(e, dhcp_client_identifier, len + 1); + e += len + 1; } *e++ = 255; /* End of the list */ diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index ca9eaee4c2ef..131382c388e9 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1895,7 +1895,7 @@ static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt, return -1; } - if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { + if (skb->len+encap > dst4_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. Alas, IPv4 does not * allow to send ICMP, so that packets will disappear * to blackhole. 
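Back in the ipconfig.c hunk above, the flattened ic_dhcp_init_options() still emits DHCP options in the RFC 2132 TLV layout: a tag byte, a length byte, then the value, with tag 255 terminating the list inside the 312-byte option area. A small helper sketch of that append step (dhcp_put_option() is a hypothetical name, not something the patch adds):

/* Append one DHCP TLV option and return the new write position.
 * The caller remains responsible for not overrunning the option
 * area, as the client-identifier length check above does.
 */
static u8 *dhcp_put_option(u8 *e, u8 tag, const void *val, u8 len)
{
	*e++ = tag;
	*e++ = len;
	memcpy(e, val, len);
	return e + len;
}

/* e.g. the parameter request list above is equivalent to:
 *	e = dhcp_put_option(e, 55, ic_req_params, sizeof(ic_req_params));
 */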
diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 8ddac1f595ed..82cf8a9e5ded 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -88,4 +88,4 @@ struct dst_metrics *ip_fib_metrics_init(struct nlattr *fc_mx, return fib_metrics; } -EXPORT_SYMBOL_GPL(ip_fib_metrics_init); +EXPORT_IPV6_MOD_GPL(ip_fib_metrics_init); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index fae4aa4a5f09..fecf6621f679 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -303,7 +303,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb, goto free_nskb; /* "Never happens" */ - if (nskb->len > dst_mtu(skb_dst(nskb))) + if (nskb->len > dst4_mtu(skb_dst(nskb))) goto free_nskb; nf_ct_attach(nskb, oldskb); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index cfbd563498e8..ebfc5a3d3ad6 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -690,6 +690,8 @@ EXPORT_IPV6_MOD_GPL(ping_common_sendmsg); static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct net *net = sock_net(sk); struct flowi4 fl4; struct inet_sock *inet = inet_sk(sk); @@ -697,7 +699,6 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) struct icmphdr user_icmph; struct pingfakehdr pfh; struct rtable *rt = NULL; - struct ip_options_data opt_copy; int free = 0; __be32 saddr, daddr, faddr; u8 scope; @@ -746,9 +747,9 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - memcpy(&opt_copy, inet_opt, + memcpy(opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); - ipc.opt = &opt_copy.opt; + ipc.opt = opt_copy; } rcu_read_unlock(); } diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 5998c4cc6f47..e20c41206e29 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -481,6 +481,8 @@ static int raw_getfrag(void *from, char *to, int offset, int len, int odd, static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); struct ipcm_cookie ipc; @@ -491,7 +493,6 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) __be32 daddr; __be32 saddr; int uc_index, err; - struct ip_options_data opt_copy; struct raw_frag_vec rfv; int hdrincl; @@ -561,9 +562,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - memcpy(&opt_copy, inet_opt, + memcpy(opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); - ipc.opt = &opt_copy.opt; + ipc.opt = opt_copy; } rcu_read_unlock(); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 11d990703d31..06aa39ae80d6 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1795,8 +1795,8 @@ static void ip_handle_martian_source(struct net_device *dev, * RFC1812 recommendation, if source is martian, * the only hint is MAC header. 
*/ - pr_warn("martian source %pI4 from %pI4, on dev %s\n", - &daddr, &saddr, dev->name); + pr_warn("martian source (src=%pI4, dst=%pI4, dev=%s)\n", + &saddr, &daddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { print_hex_dump(KERN_WARNING, "ll header: ", DUMP_PREFIX_OFFSET, 16, 1, @@ -2475,8 +2475,8 @@ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev)) - net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", - &daddr, &saddr, dev->name); + net_warn_ratelimited("martian destination (src=%pI4, dst=%pI4, dev=%s)\n", + &saddr, &daddr, dev->name); #endif goto out; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a1a50a5c80dc..643763bc2142 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -47,7 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; -static int tcp_ecn_mode_max = 2; +static int tcp_ecn_mode_max = 5; static u32 icmp_errors_extension_mask_all = GENMASK_U8(ICMP_ERR_EXT_COUNT - 1, 0); @@ -749,7 +749,7 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_TWO, + .extra2 = SYSCTL_THREE, }, { .procname = "tcp_ecn_option_beacon", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d5319ebe2452..6ce03a9adb4a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -319,15 +319,6 @@ struct percpu_counter tcp_sockets_allocated ____cacheline_aligned_in_smp; EXPORT_IPV6_MOD(tcp_sockets_allocated); /* - * TCP splice context - */ -struct tcp_splice_state { - struct pipe_inode_info *pipe; - size_t len; - unsigned int flags; -}; - -/* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * All the __sk_mem_schedule() is of this nature: accounting @@ -501,6 +492,9 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) struct sk_buff *skb = tcp_write_queue_tail(sk); u32 tsflags = sockc->tsflags; + if (unlikely(!skb)) + skb = skb_rb_last(&sk->tcp_rtx_queue); + if (tsflags && skb) { struct skb_shared_info *shinfo = skb_shinfo(skb); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); @@ -517,6 +511,19 @@ static void tcp_tx_timestamp(struct sock *sk, struct sockcm_cookie *sockc) bpf_skops_tx_timestamping(sk, skb, BPF_SOCK_OPS_TSTAMP_SENDMSG_CB); } +/* @wake is one when sk_stream_write_space() calls us. + * This sends EPOLLOUT only if notsent_bytes is half the limit. + * This mimics the strategy used in sock_def_write_space(). 
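tcp_stream_memory_free(), whose body follows immediately below (it moves here from tcp_ipv4.c, removed later in this diff), tests (notsent_bytes << wake) < tcp_notsent_lowat(tp), so the effective threshold is halved when sk_stream_write_space() passes wake == 1. A worked example with assumed numbers:

/* Assume tcp_notsent_lowat(tp) == 128 KB and 96 KB are queued but
 * not yet sent (notsent_bytes = write_seq - snd_nxt = 96 KB):
 *
 *	wake == 0 (poll path):		 96 KB < 128 KB  -> writable
 *	wake == 1 (write_space path):	192 KB < 128 KB  -> false
 *
 * so EPOLLOUT is only generated once notsent_bytes drops below
 * 64 KB, mirroring the half-limit strategy of sock_def_write_space().
 */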
+ */ +bool tcp_stream_memory_free(const struct sock *sk, int wake) +{ + const struct tcp_sock *tp = tcp_sk(sk); + u32 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); + + return (notsent_bytes << wake) < tcp_notsent_lowat(tp); +} +EXPORT_SYMBOL(tcp_stream_memory_free); + static bool tcp_stream_is_readable(struct sock *sk, int target) { if (tcp_epollin_ready(sk, target)) @@ -775,8 +782,8 @@ void tcp_push(struct sock *sk, int flags, int mss_now, __tcp_push_pending_frames(sk, mss_now, nonagle); } -static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, - unsigned int offset, size_t len) +int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len) { struct tcp_splice_state *tss = rd_desc->arg.data; int ret; @@ -902,6 +909,33 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, } EXPORT_IPV6_MOD(tcp_splice_read); +/* We allow to exceed memory limits for FIN packets to expedite + * connection tear down and (memory) recovery. + * Otherwise tcp_send_fin() could be tempted to either delay FIN + * or even be forced to close flow without any FIN. + * In general, we want to allow one skb per socket to avoid hangs + * with edge trigger epoll() + */ +void sk_forced_mem_schedule(struct sock *sk, int size) +{ + int delta, amt; + + delta = size - sk->sk_forward_alloc; + if (delta <= 0) + return; + + amt = sk_mem_pages(delta); + sk_forward_alloc_add(sk, amt << PAGE_SHIFT); + + if (mem_cgroup_sk_enabled(sk)) + mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); + + if (sk->sk_bypass_prot_mem) + return; + + sk_memory_allocated_add(sk, amt); +} + struct sk_buff *tcp_stream_alloc_skb(struct sock *sk, gfp_t gfp, bool force_schedule) { @@ -1074,6 +1108,24 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, return err; } +/* If a gap is detected between sends, mark the socket application-limited. */ +void tcp_rate_check_app_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (/* We have less than one packet to send. */ + tp->write_seq - tp->snd_nxt < tp->mss_cache && + /* Nothing in sending host's qdisc queues or NIC tx queue. */ + sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && + /* We are not limited by CWND. */ + tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) && + /* All lost packets have been retransmitted. */ + tp->lost_out <= tp->retrans_out) + tp->app_limited = + (tp->delivered + tcp_packets_in_flight(tp)) ? 
: 1; +} +EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited); + int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) { struct net_devmem_dmabuf_binding *binding = NULL; @@ -3418,6 +3470,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_accecn_init_counters(tp); tp->prev_ecnfield = 0; tp->accecn_opt_tstamp = 0; + tp->pkts_acked_ewma = 0; if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -4320,6 +4373,14 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + if (tcp_ecn_disabled(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_DISABLED; + else if (tcp_ecn_mode_rfc3168(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_RFC3168; + else if (tcp_ecn_mode_accecn(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_ACCECN; + else if (tcp_ecn_mode_pending(tp)) + info->tcpi_ecn_mode = TCPI_ECN_MODE_PENDING; info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; info->tcpi_received_ce = tp->received_ce; @@ -5191,6 +5252,7 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, pkts_acked_ewma); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index df758adbb445..e9f6c77e0631 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -16,6 +16,7 @@ #include <linux/gfp.h> #include <linux/jhash.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <trace/events/tcp.h> static DEFINE_SPINLOCK(tcp_cong_list_lock); @@ -227,7 +228,7 @@ void tcp_assign_congestion_control(struct sock *sk) memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) - INET_ECN_xmit(sk); + INET_ECN_xmit_ect_1_negotiation(sk); else INET_ECN_dontxmit(sk); } @@ -257,7 +258,7 @@ static void tcp_reinit_congestion_control(struct sock *sk, memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) - INET_ECN_xmit(sk); + INET_ECN_xmit_ect_1_negotiation(sk); else INET_ECN_dontxmit(sk); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 7d945a527daf..b30090cff3cf 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -5,6 +5,92 @@ #include <net/tcp.h> #include <net/busy_poll.h> +/* + * This function is called to set a Fast Open socket's "fastopen_rsk" field + * to NULL when a TFO socket no longer needs to access the request_sock. + * This happens only after 3WHS has been either completed or aborted (e.g., + * RST is received). + * + * Before TFO, a child socket is created only after 3WHS is completed, + * hence it never needs to access the request_sock. things get a lot more + * complex with TFO. A child socket, accepted or not, has to access its + * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, + * until 3WHS is either completed or aborted. 
Afterwards the req will stay + * until either the child socket is accepted, or in the rare case when the + * listener is closed before the child is accepted. + * + * In short, a request socket is only freed after BOTH 3WHS has completed + * (or aborted) and the child socket has been accepted (or listener closed). + * When a child socket is accepted, its corresponding req->sk is set to + * NULL since it's no longer needed. More importantly, "req->sk == NULL" + * will be used by the code below to determine if a child socket has been + * accepted or not, and the check is protected by the fastopenq->lock + * described below. + * + * Note that fastopen_rsk is only accessed from the child socket's context + * with its socket lock held. But a request_sock (req) can be accessed by + * both its child socket through fastopen_rsk, and a listener socket through + * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin + * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. + * only in the rare case when both the listener and the child locks are held, + * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. + * The lock also protects other fields such as fastopenq->qlen, which is + * decremented by this function when fastopen_rsk is no longer needed. + * + * Note that another solution was to simply use the existing socket lock + * from the listener. But first socket lock is difficult to use. It is not + * a simple spin lock - one must consider sock_owned_by_user() and arrange + * to use sk_add_backlog() stuff. But what really makes it infeasible is the + * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to + * acquire a child's lock while holding listener's socket lock. + * + * This function also sets "treq->tfo_listener" to false. + * treq->tfo_listener is used by the listener so it is protected by the + * fastopenq->lock in this function. + */ +void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, + bool reset) +{ + struct sock *lsk = req->rsk_listener; + struct fastopen_queue *fastopenq; + + fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; + + RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); + spin_lock_bh(&fastopenq->lock); + fastopenq->qlen--; + tcp_rsk(req)->tfo_listener = false; + if (req->sk) /* the child socket hasn't been accepted yet */ + goto out; + + if (!reset || lsk->sk_state != TCP_LISTEN) { + /* If the listener has been closed don't bother with the + * special RST handling below. + */ + spin_unlock_bh(&fastopenq->lock); + reqsk_put(req); + return; + } + /* Wait for 60secs before removing a req that has triggered RST. + * This is a simple defense against TFO spoofing attack - by + * counting the req against fastopen.max_qlen, and disabling + * TFO when the qlen exceeds max_qlen. + * + * For more details see CoNext'11 "TCP Fast Open" paper. 
+ */ + req->rsk_timer.expires = jiffies + 60*HZ; + if (fastopenq->rskq_rst_head == NULL) + fastopenq->rskq_rst_head = req; + else + fastopenq->rskq_rst_tail->dl_next = req; + + req->dl_next = NULL; + fastopenq->rskq_rst_tail = req; + fastopenq->qlen++; +out: + spin_unlock_bh(&fastopenq->lock); +} + void tcp_fastopen_init_key_once(struct net *net) { u8 key[TCP_FASTOPEN_KEY_LENGTH]; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 198f8a0d37be..e7b41abb82aa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -488,6 +488,10 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, tcp_count_delivered_ce(tp, delivered); } +#define PKTS_ACKED_WEIGHT 6 +#define PKTS_ACKED_PREC 6 +#define ACK_COMP_THRESH 4 + /* Returns the ECN CE delta */ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, u32 delivered_pkts, u32 delivered_bytes, @@ -499,6 +503,7 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, u32 delta, safe_delta, d_ceb; bool opt_deltas_valid; u32 corrected_ace; + u32 ewma; /* Reordered ACK or uncertain due to lack of data to send and ts */ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) @@ -507,6 +512,18 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, opt_deltas_valid = tcp_accecn_process_option(tp, skb, delivered_bytes, flag); + if (delivered_pkts) { + if (!tp->pkts_acked_ewma) { + ewma = delivered_pkts << PKTS_ACKED_PREC; + } else { + ewma = tp->pkts_acked_ewma; + ewma = (((ewma << PKTS_ACKED_WEIGHT) - ewma) + + (delivered_pkts << PKTS_ACKED_PREC)) >> + PKTS_ACKED_WEIGHT; + } + tp->pkts_acked_ewma = min_t(u32, ewma, 0xFFFFU); + } + if (!(flag & FLAG_SLOWPATH)) { /* AccECN counter might overflow on large ACKs */ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) @@ -555,7 +572,8 @@ static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, if (d_ceb < safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT) return delta; - } + } else if (tp->pkts_acked_ewma > (ACK_COMP_THRESH << PKTS_ACKED_PREC)) + return delta; return safe_delta; } @@ -1558,6 +1576,38 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } +/* Record the most recently (re)sent time among the (s)acked packets + * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from + * draft-cheng-tcpm-rack-00.txt + */ +static void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, + u32 end_seq, u64 xmit_time) +{ + u32 rtt_us; + + rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); + if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { + /* If the sacked packet was retransmitted, it's ambiguous + * whether the retransmission or the original (or the prior + * retransmission) was sacked. + * + * If the original is lost, there is no ambiguity. Otherwise + * we assume the original can be delayed up to aRTT + min_rtt. + * the aRTT term is bounded by the fast recovery or timeout, + * so it's at least one RTT (i.e., retransmission is at least + * an RTT later). + */ + return; + } + tp->rack.advanced = 1; + tp->rack.rtt_us = rtt_us; + if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp, + end_seq, tp->rack.end_seq)) { + tp->rack.mstamp = xmit_time; + tp->rack.end_seq = end_seq; + } +} + /* Mark the given newly-SACKed range as such, adjusting counters and hints. 
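The pkts_acked_ewma update in the __tcp_accecn_process() hunk above is a Q6 fixed-point EWMA (PKTS_ACKED_PREC = 6) with a sample weight of 1/2^PKTS_ACKED_WEIGHT = 1/64. A worked example with assumed per-ACK delivery counts:

/* Q6 fixed point: 1.0 is stored as 1 << 6 == 64.
 *
 * First ACK, 10 packets delivered, ewma still 0:
 *	ewma = 10 << 6 = 640				(10.000)
 *
 * Next ACK, 2 packets delivered:
 *	ewma = ((640 << 6) - 640 + (2 << 6)) >> 6
 *	     = (40960 - 640 + 128) >> 6
 *	     = 40448 >> 6 = 632				(~9.875)
 *
 * i.e. new = (63 * old + 64 * sample) / 64.  The result is capped at
 * 0xFFFF and later compared against ACK_COMP_THRESH << PKTS_ACKED_PREC
 * (4 << 6 == 256), i.e. an average of more than four packets per ACK,
 * as a heuristic for heavily aggregated ACKs.
 */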
*/ static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, @@ -1637,6 +1687,160 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; } +/* The bandwidth estimator estimates the rate at which the network + * can currently deliver outbound data packets for this flow. At a high + * level, it operates by taking a delivery rate sample for each ACK. + * + * A rate sample records the rate at which the network delivered packets + * for this flow, calculated over the time interval between the transmission + * of a data packet and the acknowledgment of that packet. + * + * Specifically, over the interval between each transmit and corresponding ACK, + * the estimator generates a delivery rate sample. Typically it uses the rate + * at which packets were acknowledged. However, the approach of using only the + * acknowledgment rate faces a challenge under the prevalent ACK decimation or + * compression: packets can temporarily appear to be delivered much quicker + * than the bottleneck rate. Since it is physically impossible to do that in a + * sustained fashion, when the estimator notices that the ACK rate is faster + * than the transmit rate, it uses the latter: + * + * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) + * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) + * bw = min(send_rate, ack_rate) + * + * Notice the estimator essentially estimates the goodput, not always the + * network bottleneck link rate when the sending or receiving is limited by + * other factors like applications or receiver window limits. The estimator + * deliberately avoids using the inter-packet spacing approach because that + * approach requires a large number of samples and sophisticated filtering. + * + * TCP flows can often be application-limited in request/response workloads. + * The estimator marks a bandwidth sample as application-limited if there + * was some moment during the sampled window of packets when there was no data + * ready to send in the write queue. + */ + +/* Update the connection delivery information and generate a rate sample. */ +static void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + bool is_sack_reneg, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 snd_us, ack_us; + + /* Clear app limited if bubble is acked and gone. */ + if (tp->app_limited && after(tp->delivered, tp->app_limited)) + tp->app_limited = 0; + + /* TODO: there are multiple places throughout tcp_ack() to get + * current time. Refactor the code using a new "tcp_acktag_state" + * to carry current time, flags, stats like "tcp_sacktag_state". + */ + if (delivered) + tp->delivered_mstamp = tp->tcp_mstamp; + + rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ + rs->losses = lost; /* freshly marked lost */ + /* Return an invalid sample if no timing information is available or + * in recovery from loss with SACK reneging. Rate samples taken during + * a SACK reneging event may overestimate bw by including packets that + * were SACKed before the reneg. + */ + if (!rs->prior_mstamp || is_sack_reneg) { + rs->delivered = -1; + rs->interval_us = -1; + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ + rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; + + /* Model sending data and receiving ACKs as separate pipeline phases + * for a window. 
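The bw = min(send_rate, ack_rate) rule described in the estimator comment above is implemented with a single max(): for the same delivered count, taking the longer of the send and ACK phase intervals as interval_us (the max(snd_us, ack_us) step just below) yields the smaller of the two rates. A worked example with assumed timings:

/* 10 packets delivered over the sampled window:
 *
 *	snd_us = last_snd_time - first_snd_time = 10000 us
 *	ack_us = now - prior_mstamp             =  2000 us  (compressed ACKs)
 *
 *	interval_us = max(10000, 2000) = 10000 us
 *
 *	send_rate = 10 / 10000 us = 1 pkt/ms
 *	ack_rate  = 10 /  2000 us = 5 pkt/ms  (transient, not sustainable)
 *	bw        = 10 / interval_us = 1 pkt/ms = min(send_rate, ack_rate)
 *
 * so a burst of compressed ACKs cannot inflate the resulting
 * delivery-rate sample.
 */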
Usually the ACK phase is longer, but with ACK + * compression the send phase can be longer. To be safe we use the + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ + ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + + /* Record both segment send and ack receive intervals */ + rs->snd_interval_us = snd_us; + rs->rcv_interval_us = ack_us; + + /* Normally we expect interval_us >= min-rtt. + * Note that rate may still be over-estimated when a spuriously + * retransmistted skb was first (s)acked because "interval_us" + * is under-estimated (up to an RTT). However continuously + * measuring the delivery rate during loss recovery is crucial + * for connections suffer heavy or prolonged losses. + */ + if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { + if (!rs->is_retrans) + pr_debug("tcp rate: %ld %d %u %u %u\n", + rs->interval_us, rs->delivered, + inet_csk(sk)->icsk_ca_state, + tp->rx_opt.sack_ok, tcp_min_rtt(tp)); + rs->interval_us = -1; + return; + } + + /* Record the last non-app-limited or the highest app-limited bw */ + if (!rs->is_app_limited || + ((u64)rs->delivered * tp->rate_interval_us >= + (u64)tp->rate_delivered * rs->interval_us)) { + tp->rate_delivered = rs->delivered; + tp->rate_interval_us = rs->interval_us; + tp->rate_app_limited = rs->is_app_limited; + } +} + +/* When an skb is sacked or acked, we fill in the rate sample with the (prior) + * delivery information when the skb was last transmitted. + * + * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is + * called multiple times. We favor the information from the most recently + * sent skb, i.e., the skb with the most recently sent time and the highest + * sequence. + */ +static void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs) +{ + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + struct tcp_sock *tp = tcp_sk(sk); + u64 tx_tstamp; + + if (!scb->tx.delivered_mstamp) + return; + + tx_tstamp = tcp_skb_timestamp_us(skb); + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ + rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, + scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being + * used again when it's cumulatively acked. For acked packets + * we don't need to reset since it'll be freed soon. + */ + if (scb->sacked & TCPCB_SACKED_ACKED) + scb->tx.delivered_mstamp = 0; +} + /* Shift newly-SACKed bytes from this skb to the immediately previous * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. */ @@ -3995,6 +4199,49 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, return delivered; } +/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. 
+ * + * If a DSACK is received that seems like it may have been due to reordering + * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded + * by srtt), since there is possibility that spurious retransmission was + * due to reordering delay longer than reo_wnd. + * + * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) + * no. of successful recoveries (accounts for full DSACK-based loss + * recovery undo). After that, reset it to default (min_rtt/4). + * + * At max, reo_wnd is incremented only once per rtt. So that the new + * DSACK on which we are reacting, is due to the spurious retx (approx) + * after the reo_wnd has been updated last time. + * + * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than + * absolute value to account for change in rtt. + */ +static void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & + TCP_RACK_STATIC_REO_WND) || + !rs->prior_delivered) + return; + + /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ + if (before(rs->prior_delivered, tp->rack.last_delivered)) + tp->rack.dsack_seen = 0; + + /* Adjust the reo_wnd if update is pending */ + if (tp->rack.dsack_seen) { + tp->rack.reo_wnd_steps = min_t(u32, 0xFF, + tp->rack.reo_wnd_steps + 1); + tp->rack.dsack_seen = 0; + tp->rack.last_delivered = tp->delivered; + tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; + } else if (!tp->rack.reo_wnd_persist) { + tp->rack.reo_wnd_steps = 1; + } +} + /* This routine deals with incoming acks, but not outgoing ones. */ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) { @@ -4129,7 +4376,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_in_ack_event(sk, flag); - if (tp->tlp_high_seq) + if (unlikely(tp->tlp_high_seq)) tcp_process_tlp_ack(sk, ack, flag); if (tcp_ack_is_dubious(sk, flag)) { @@ -4179,7 +4426,7 @@ no_queue: */ tcp_ack_probe(sk); - if (tp->tlp_high_seq) + if (unlikely(tp->tlp_high_seq)) tcp_process_tlp_ack(sk, ack, flag); return 1; @@ -4799,8 +5046,11 @@ static void tcp_dsack_extend(struct sock *sk, u32 seq, u32 end_seq) tcp_sack_extend(tp->duplicate_sack, seq, end_seq); } -static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) +static void tcp_rcv_spurious_retrans(struct sock *sk, + const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); + /* When the ACK path fails or drops most ACKs, the sender would * timeout and spuriously retransmit the same segment repeatedly. * If it seems our ACKs are not reaching the other side, @@ -4820,6 +5070,14 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) /* Save last flowlabel after a spurious retrans. */ tcp_save_lrcv_flowlabel(sk, skb); #endif + /* Check DSACK info to detect that the previous ACK carrying the + * AccECN option was lost after the second retransmision, and then + * stop sending AccECN option in all subsequent ACKs. 
+ */ + if (tcp_ecn_mode_accecn(tp) && + tp->accecn_opt_sent_w_dsack && + TCP_SKB_CB(skb)->seq == tp->duplicate_sack[0].start_seq) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_SEND); } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) @@ -5527,25 +5785,6 @@ static struct sk_buff *tcp_collapse_one(struct sock *sk, struct sk_buff *skb, return next; } -/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ -void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct sk_buff *skb1; - - while (*p) { - parent = *p; - skb1 = rb_to_skb(parent); - if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) - p = &parent->rb_left; - else - p = &parent->rb_right; - } - rb_link_node(&skb->rbnode, parent, p); - rb_insert_color(&skb->rbnode, root); -} - /* Collapse contiguous sequence of skbs head..tail with * sequence numbers start..end. * @@ -5879,16 +6118,11 @@ static void tcp_new_space(struct sock *sk) * small enough that tcp_stream_memory_free() decides it * is time to generate EPOLLOUT. */ -void tcp_check_space(struct sock *sk) +void __tcp_check_space(struct sock *sk) { - /* pairs with tcp_poll() */ - smp_mb(); - if (sk->sk_socket && - test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { - tcp_new_space(sk); - if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) - tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); - } + tcp_new_space(sk); + if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED); } static inline void tcp_data_snd_check(struct sock *sk) @@ -6222,6 +6456,8 @@ step1: if (th->syn) { if (tcp_ecn_mode_accecn(tp)) { accecn_reflector = true; + tp->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; if (tp->rx_opt.accecn && tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); @@ -6843,7 +7079,7 @@ consume: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - tcp_ecn_rcv_syn(tp, th, skb); + tcp_ecn_rcv_syn(sk, th, skb); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -7248,7 +7484,8 @@ static void tcp_ecn_create_request(struct request_sock *req, u32 ecn_ok_dst; if (tcp_accecn_syn_requested(th) && - READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) { + (READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3 || + tcp_ca_needs_accecn(listen_sk))) { inet_rsk(req)->ecn_ok = 1; tcp_rsk(req)->accecn_ok = 1; tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f8a9596e8f4d..6264fc0b2be5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -374,7 +374,7 @@ void tcp_v4_mtu_reduced(struct sock *sk) { struct inet_sock *inet = inet_sk(sk); struct dst_entry *dst; - u32 mtu; + u32 mtu, dmtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; @@ -386,15 +386,14 @@ void tcp_v4_mtu_reduced(struct sock *sk) /* Something is about to be wrong... Remember soft error * for the case, if this connection will not able to recover. 
*/ - if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) + dmtu = dst4_mtu(dst); + if (mtu < dmtu && ip_dont_fragment(sk, dst)) WRITE_ONCE(sk->sk_err_soft, EMSGSIZE); - mtu = dst_mtu(dst); - if (inet->pmtudisc != IP_PMTUDISC_DONT && ip_sk_accept_pmtu(sk) && - inet_csk(sk)->icsk_pmtu_cookie > mtu) { - tcp_sync_mss(sk, mtu); + inet_csk(sk)->icsk_pmtu_cookie > dmtu) { + tcp_sync_mss(sk, dmtu); /* Resend the TCP packet because it's * clear that the old packet has been @@ -1760,7 +1759,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, tcp_ca_openreq_child(newsk, dst); - tcp_sync_mss(newsk, dst_mtu(dst)); + tcp_sync_mss(newsk, dst4_mtu(dst)); newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); @@ -2110,14 +2109,6 @@ no_coalesce: } EXPORT_IPV6_MOD(tcp_add_backlog); -int tcp_filter(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason *reason) -{ - struct tcphdr *th = (struct tcphdr *)skb->data; - - return sk_filter_trim_cap(sk, skb, th->doff * 4, reason); -} -EXPORT_IPV6_MOD(tcp_filter); - static void tcp_v4_restore_cb(struct sk_buff *skb) { memmove(IPCB(skb), &TCP_SKB_CB(skb)->header.h4, @@ -3418,20 +3409,6 @@ void tcp4_proc_exit(void) } #endif /* CONFIG_PROC_FS */ -/* @wake is one when sk_stream_write_space() calls us. - * This sends EPOLLOUT only if notsent_bytes is half the limit. - * This mimics the strategy used in sock_def_write_space(). - */ -bool tcp_stream_memory_free(const struct sock *sk, int wake) -{ - const struct tcp_sock *tp = tcp_sk(sk); - u32 notsent_bytes = READ_ONCE(tp->write_seq) - - READ_ONCE(tp->snd_nxt); - - return (notsent_bytes << wake) < tcp_notsent_lowat(tp); -} -EXPORT_SYMBOL(tcp_stream_memory_free); - struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, @@ -3474,6 +3451,8 @@ struct proto tcp_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), + .freeptr_offset = offsetof(struct tcp_sock, + inet_conn.icsk_inet.sk.sk_freeptr), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index bd5462154f97..ec128865f5c0 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -481,13 +481,18 @@ static void tcp_ecn_openreq_child(struct sock *sk, tp->syn_ect_snt = treq->syn_ect_snt; tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); tp->saw_accecn_opt = treq->saw_accecn_opt; + if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_SEND) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_SEND); + if (treq->accecn_fail_mode & TCP_ACCECN_ACE_FAIL_RECV) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_ACE_FAIL_RECV); tp->prev_ecnfield = treq->syn_ect_rcv; tp->accecn_opt_demand = 1; tcp_ecn_received_counters_payload(sk, skb); } else { - tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? 
- TCP_ECN_MODE_RFC3168 : - TCP_ECN_DISABLED); + if (inet_rsk(req)->ecn_ok && !tcp_ca_no_fallback_rfc3168(sk)) + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + else + tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); } } @@ -748,16 +753,28 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, */ if (!tcp_oow_rate_limited(sock_net(sk), skb, LINUX_MIB_TCPACKSKIPPEDSYNRECV, - &tcp_rsk(req)->last_oow_ack_time) && - - !tcp_rtx_synack(sk, req)) { - unsigned long expires = jiffies; - - expires += tcp_reqsk_timeout(req); - if (!fastopen) - mod_timer_pending(&req->rsk_timer, expires); - else - req->rsk_timer.expires = expires; + &tcp_rsk(req)->last_oow_ack_time)) { + if (tcp_rsk(req)->accecn_ok) { + u8 ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + + tcp_rsk(req)->syn_ect_rcv = ect_rcv; + if (tcp_accecn_ace(tcp_hdr(skb)) == 0x0) + tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_RECV; + } + if (!tcp_rtx_synack(sk, req)) { + unsigned long expires = jiffies; + + if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok) + tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND; + + expires += tcp_reqsk_timeout(req); + if (!fastopen) + mod_timer_pending(&req->rsk_timer, + expires); + else + req->rsk_timer.expires = expires; + } } return NULL; } diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index 942a948f1a31..3b1fdcd3cb29 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -304,8 +304,7 @@ struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb, goto out_check_final; th2 = tcp_hdr(p); - flush = (__force int)(flags & TCP_FLAG_CWR); - flush |= (__force int)((flags ^ tcp_flag_word(th2)) & + flush = (__force int)((flags ^ tcp_flag_word(th2)) & ~(TCP_FLAG_FIN | TCP_FLAG_PSH)); flush |= (__force int)(th->ack_seq ^ th2->ack_seq); for (i = sizeof(*th); i < thlen; i += 4) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 479afb714bdf..326b58ff1118 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -66,6 +66,25 @@ void tcp_mstamp_refresh(struct tcp_sock *tp) static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, int push_one, gfp_t gfp); +/* Insert skb into rb tree, ordered by TCP_SKB_CB(skb)->seq */ +void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct sk_buff *skb1; + + while (*p) { + parent = *p; + skb1 = rb_to_skb(parent); + if (before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb1)->seq)) + p = &parent->rb_left; + else + p = &parent->rb_right; + } + rb_link_node(&skb->rbnode, parent, p); + rb_insert_color(&skb->rbnode, root); +} + /* Account for new data that has been sent to the network. 
*/ static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { @@ -334,8 +353,11 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, return; if (tcp_ecn_mode_accecn(tp)) { - if (!tcp_accecn_ace_fail_recv(tp)) + if (!tcp_accecn_ace_fail_recv(tp) && + !tcp_accecn_ace_fail_send(tp)) INET_ECN_xmit(sk); + else + INET_ECN_dontxmit(sk); tcp_accecn_set_ace(tp, skb, th); skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; } else { @@ -712,9 +734,12 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, if (tp) { tp->accecn_minlen = 0; tp->accecn_opt_tstamp = tp->tcp_mstamp; + tp->accecn_opt_sent_w_dsack = tp->rx_opt.dsack; if (tp->accecn_opt_demand) tp->accecn_opt_demand--; } + } else if (tp) { + tp->accecn_opt_sent_w_dsack = 0; } if (unlikely(OPTION_SACK_ADVERTISE & options)) { @@ -1106,7 +1131,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, if (treq->accecn_ok && READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && - req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { + synack_type != TCP_SYNACK_RETRANS && remaining >= TCPOLEN_ACCECN_BASE) { opts->use_synack_ecn_bytes = 1; remaining -= tcp_options_fit_accecn(opts, 0, remaining); } @@ -1186,7 +1211,9 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb if (tcp_ecn_mode_accecn(tp)) { int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); - if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) && + if (ecn_opt && tp->saw_accecn_opt && + (ecn_opt >= TCP_ACCECN_OPTION_PERSIST || + !tcp_accecn_opt_fail_send(tp)) && (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || tcp_accecn_option_beacon_check(sk))) { opts->use_synack_ecn_bytes = 0; @@ -1432,6 +1459,41 @@ static void tcp_update_skb_after_send(struct sock *sk, struct sk_buff *skb, list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); } +/* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +static void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* In general we need to start delivery rate samples from the + * time we received the most recent ACK, to ensure we include + * the full time the network needs to deliver all in-flight + * packets. If there are no packets in flight yet, then we + * know that any ACKs after now indicate that the network was + * able to deliver those packets completely in the sampling + * interval between now and the next ACK. + * + * Note that we use packets_out instead of tcp_packets_in_flight(tp) + * because the latter is a guess based on RTO and loss-marking + * heuristics. We don't want spurious RTOs or loss markings to cause + * a spuriously small time interval, causing a spuriously high + * bandwidth estimate. + */ + if (!tp->packets_out) { + u64 tstamp_us = tcp_skb_timestamp_us(skb); + + tp->first_tx_mstamp = tstamp_us; + tp->delivered_mstamp = tstamp_us; + } + + TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; +} + INDIRECT_CALLABLE_DECLARE(int ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl)); INDIRECT_CALLABLE_DECLARE(void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)); @@ -1530,7 +1592,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, */ skb->pfmemalloc = 0; - skb_push(skb, tcp_header_size); + __skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_orphan(skb); @@ -3571,12 +3633,15 @@ start: tcp_retrans_try_collapse(sk, skb, avail_wnd); } - /* RFC3168, section 6.1.1.1. ECN fallback - * As AccECN uses the same SYN flags (+ AE), this check covers both - * cases. - */ - if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) - tcp_ecn_clear_syn(sk, skb); + if (!tcp_ecn_mode_pending(tp) || icsk->icsk_retransmits > 1) { + /* RFC3168, section 6.1.1.1. ECN fallback + * As AccECN uses the same SYN flags (+ AE), this check + * covers both cases. + */ + if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == + TCPHDR_SYN_ECN) + tcp_ecn_clear_syn(sk, skb); + } /* Update global and local TCP statistics. */ segs = tcp_skb_pcount(skb); @@ -3732,33 +3797,6 @@ void tcp_xmit_retransmit_queue(struct sock *sk) inet_csk(sk)->icsk_rto, true); } -/* We allow to exceed memory limits for FIN packets to expedite - * connection tear down and (memory) recovery. - * Otherwise tcp_send_fin() could be tempted to either delay FIN - * or even be forced to close flow without any FIN. - * In general, we want to allow one skb per socket to avoid hangs - * with edge trigger epoll() - */ -void sk_forced_mem_schedule(struct sock *sk, int size) -{ - int delta, amt; - - delta = size - sk->sk_forward_alloc; - if (delta <= 0) - return; - - amt = sk_mem_pages(delta); - sk_forward_alloc_add(sk, amt << PAGE_SHIFT); - - if (mem_cgroup_sk_enabled(sk)) - mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); - - if (sk->sk_bypass_prot_mem) - return; - - sk_memory_allocated_add(sk, amt); -} - /* Send a FIN. The caller locks the socket for us. * We should try to send a FIN packet really hard, but eventually give up. 
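[Editor's aside, not part of the patch] The retransmit hunk above changes when the ECN/AccECN bits are stripped from a retransmitted SYN: an AccECN SYN (ECN mode still pending) survives its first retransmission, while a plain RFC 3168 ECN SYN falls back immediately, as before. Spelled out as a standalone predicate (hypothetical helper, the real code additionally checks that the skb actually carries TCPHDR_SYN_ECN):

static bool example_clear_ecn_on_syn_rtx(bool accecn_pending, u8 retransmits)
{
        if (accecn_pending)
                return retransmits > 1;         /* give AccECN one more try */
        return true;                            /* RFC 3168: clear on first rtx */
}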
*/ @@ -3918,6 +3956,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, switch (synack_type) { case TCP_SYNACK_NORMAL: + case TCP_SYNACK_RETRANS: skb_set_owner_edemux(skb, req_to_sk(req)); break; case TCP_SYNACK_COOKIE: @@ -4000,7 +4039,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, memset(th, 0, sizeof(struct tcphdr)); th->syn = 1; th->ack = 1; - tcp_ecn_make_synack(req, th); + tcp_ecn_make_synack(req, th, synack_type); th->source = htons(ireq->ir_num); th->dest = ireq->ir_rmt_port; skb->mark = ireq->ir_mark; @@ -4603,7 +4642,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) /* Paired with WRITE_ONCE() in sock_setsockopt() */ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash()); - res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, + res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_RETRANS, NULL); if (!res) { TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c deleted file mode 100644 index a8f6d9d06f2e..000000000000 --- a/net/ipv4/tcp_rate.c +++ /dev/null @@ -1,209 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -#include <net/tcp.h> - -/* The bandwidth estimator estimates the rate at which the network - * can currently deliver outbound data packets for this flow. At a high - * level, it operates by taking a delivery rate sample for each ACK. - * - * A rate sample records the rate at which the network delivered packets - * for this flow, calculated over the time interval between the transmission - * of a data packet and the acknowledgment of that packet. - * - * Specifically, over the interval between each transmit and corresponding ACK, - * the estimator generates a delivery rate sample. Typically it uses the rate - * at which packets were acknowledged. However, the approach of using only the - * acknowledgment rate faces a challenge under the prevalent ACK decimation or - * compression: packets can temporarily appear to be delivered much quicker - * than the bottleneck rate. Since it is physically impossible to do that in a - * sustained fashion, when the estimator notices that the ACK rate is faster - * than the transmit rate, it uses the latter: - * - * send_rate = #pkts_delivered/(last_snd_time - first_snd_time) - * ack_rate = #pkts_delivered/(last_ack_time - first_ack_time) - * bw = min(send_rate, ack_rate) - * - * Notice the estimator essentially estimates the goodput, not always the - * network bottleneck link rate when the sending or receiving is limited by - * other factors like applications or receiver window limits. The estimator - * deliberately avoids using the inter-packet spacing approach because that - * approach requires a large number of samples and sophisticated filtering. - * - * TCP flows can often be application-limited in request/response workloads. - * The estimator marks a bandwidth sample as application-limited if there - * was some moment during the sampled window of packets when there was no data - * ready to send in the write queue. - */ - -/* Snapshot the current delivery information in the skb, to generate - * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). 
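[Editor's illustration, not part of the patch] The comment above describes the bw = min(send_rate, ack_rate) rule. Since the same number of packets was delivered in both phases, taking the longer of the send and ACK intervals is equivalent; a standalone sketch (hypothetical helper name):

static u64 example_delivery_rate_pps(u32 delivered, u64 snd_us, u64 ack_us)
{
        u64 interval_us = snd_us > ack_us ? snd_us : ack_us;

        if (!interval_us)
                return 0;
        return (u64)delivered * USEC_PER_SEC / interval_us;
}

For example, 10 segments delivered over an 8 ms send phase but a 12 ms ACK phase yields 10 / 0.012 s, roughly 833 segments/s, about 1.2 MB/s at a 1448-byte MSS.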
- */ -void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - - /* In general we need to start delivery rate samples from the - * time we received the most recent ACK, to ensure we include - * the full time the network needs to deliver all in-flight - * packets. If there are no packets in flight yet, then we - * know that any ACKs after now indicate that the network was - * able to deliver those packets completely in the sampling - * interval between now and the next ACK. - * - * Note that we use packets_out instead of tcp_packets_in_flight(tp) - * because the latter is a guess based on RTO and loss-marking - * heuristics. We don't want spurious RTOs or loss markings to cause - * a spuriously small time interval, causing a spuriously high - * bandwidth estimate. - */ - if (!tp->packets_out) { - u64 tstamp_us = tcp_skb_timestamp_us(skb); - - tp->first_tx_mstamp = tstamp_us; - tp->delivered_mstamp = tstamp_us; - } - - TCP_SKB_CB(skb)->tx.first_tx_mstamp = tp->first_tx_mstamp; - TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; - TCP_SKB_CB(skb)->tx.delivered = tp->delivered; - TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; - TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 1 : 0; -} - -/* When an skb is sacked or acked, we fill in the rate sample with the (prior) - * delivery information when the skb was last transmitted. - * - * If an ACK (s)acks multiple skbs (e.g., stretched-acks), this function is - * called multiple times. We favor the information from the most recently - * sent skb, i.e., the skb with the most recently sent time and the highest - * sequence. - */ -void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, - struct rate_sample *rs) -{ - struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - u64 tx_tstamp; - - if (!scb->tx.delivered_mstamp) - return; - - tx_tstamp = tcp_skb_timestamp_us(skb); - if (!rs->prior_delivered || - tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, - scb->end_seq, rs->last_end_seq)) { - rs->prior_delivered_ce = scb->tx.delivered_ce; - rs->prior_delivered = scb->tx.delivered; - rs->prior_mstamp = scb->tx.delivered_mstamp; - rs->is_app_limited = scb->tx.is_app_limited; - rs->is_retrans = scb->sacked & TCPCB_RETRANS; - rs->last_end_seq = scb->end_seq; - - /* Record send time of most recently ACKed packet: */ - tp->first_tx_mstamp = tx_tstamp; - /* Find the duration of the "send phase" of this window: */ - rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, - scb->tx.first_tx_mstamp); - - } - /* Mark off the skb delivered once it's sacked to avoid being - * used again when it's cumulatively acked. For acked packets - * we don't need to reset since it'll be freed soon. - */ - if (scb->sacked & TCPCB_SACKED_ACKED) - scb->tx.delivered_mstamp = 0; -} - -/* Update the connection delivery information and generate a rate sample. */ -void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - bool is_sack_reneg, struct rate_sample *rs) -{ - struct tcp_sock *tp = tcp_sk(sk); - u32 snd_us, ack_us; - - /* Clear app limited if bubble is acked and gone. */ - if (tp->app_limited && after(tp->delivered, tp->app_limited)) - tp->app_limited = 0; - - /* TODO: there are multiple places throughout tcp_ack() to get - * current time. Refactor the code using a new "tcp_acktag_state" - * to carry current time, flags, stats like "tcp_sacktag_state". 
- */ - if (delivered) - tp->delivered_mstamp = tp->tcp_mstamp; - - rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ - rs->losses = lost; /* freshly marked lost */ - /* Return an invalid sample if no timing information is available or - * in recovery from loss with SACK reneging. Rate samples taken during - * a SACK reneging event may overestimate bw by including packets that - * were SACKed before the reneg. - */ - if (!rs->prior_mstamp || is_sack_reneg) { - rs->delivered = -1; - rs->interval_us = -1; - return; - } - rs->delivered = tp->delivered - rs->prior_delivered; - - rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; - /* delivered_ce occupies less than 32 bits in the skb control block */ - rs->delivered_ce &= TCPCB_DELIVERED_CE_MASK; - - /* Model sending data and receiving ACKs as separate pipeline phases - * for a window. Usually the ACK phase is longer, but with ACK - * compression the send phase can be longer. To be safe we use the - * longer phase. - */ - snd_us = rs->interval_us; /* send phase */ - ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, - rs->prior_mstamp); /* ack phase */ - rs->interval_us = max(snd_us, ack_us); - - /* Record both segment send and ack receive intervals */ - rs->snd_interval_us = snd_us; - rs->rcv_interval_us = ack_us; - - /* Normally we expect interval_us >= min-rtt. - * Note that rate may still be over-estimated when a spuriously - * retransmistted skb was first (s)acked because "interval_us" - * is under-estimated (up to an RTT). However continuously - * measuring the delivery rate during loss recovery is crucial - * for connections suffer heavy or prolonged losses. - */ - if (unlikely(rs->interval_us < tcp_min_rtt(tp))) { - if (!rs->is_retrans) - pr_debug("tcp rate: %ld %d %u %u %u\n", - rs->interval_us, rs->delivered, - inet_csk(sk)->icsk_ca_state, - tp->rx_opt.sack_ok, tcp_min_rtt(tp)); - rs->interval_us = -1; - return; - } - - /* Record the last non-app-limited or the highest app-limited bw */ - if (!rs->is_app_limited || - ((u64)rs->delivered * tp->rate_interval_us >= - (u64)tp->rate_delivered * rs->interval_us)) { - tp->rate_delivered = rs->delivered; - tp->rate_interval_us = rs->interval_us; - tp->rate_app_limited = rs->is_app_limited; - } -} - -/* If a gap is detected between sends, mark the socket application-limited. */ -void tcp_rate_check_app_limited(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (/* We have less than one packet to send. */ - tp->write_seq - tp->snd_nxt < tp->mss_cache && - /* Nothing in sending host's qdisc queues or NIC tx queue. */ - sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1) && - /* We are not limited by CWND. */ - tcp_packets_in_flight(tp) < tcp_snd_cwnd(tp) && - /* All lost packets have been retransmitted. */ - tp->lost_out <= tp->retrans_out) - tp->app_limited = - (tp->delivered + tcp_packets_in_flight(tp)) ? 
: 1; -} -EXPORT_SYMBOL_GPL(tcp_rate_check_app_limited); diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index c52fd3254b6e..139646751073 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -111,38 +111,6 @@ bool tcp_rack_mark_lost(struct sock *sk) return !!timeout; } -/* Record the most recently (re)sent time among the (s)acked packets - * This is "Step 3: Advance RACK.xmit_time and update RACK.RTT" from - * draft-cheng-tcpm-rack-00.txt - */ -void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - u64 xmit_time) -{ - u32 rtt_us; - - rtt_us = tcp_stamp_us_delta(tp->tcp_mstamp, xmit_time); - if (rtt_us < tcp_min_rtt(tp) && (sacked & TCPCB_RETRANS)) { - /* If the sacked packet was retransmitted, it's ambiguous - * whether the retransmission or the original (or the prior - * retransmission) was sacked. - * - * If the original is lost, there is no ambiguity. Otherwise - * we assume the original can be delayed up to aRTT + min_rtt. - * the aRTT term is bounded by the fast recovery or timeout, - * so it's at least one RTT (i.e., retransmission is at least - * an RTT later). - */ - return; - } - tp->rack.advanced = 1; - tp->rack.rtt_us = rtt_us; - if (tcp_skb_sent_after(xmit_time, tp->rack.mstamp, - end_seq, tp->rack.end_seq)) { - tp->rack.mstamp = xmit_time; - tp->rack.end_seq = end_seq; - } -} - /* We have waited long enough to accommodate reordering. Mark the expired * packets lost and retransmit them. */ @@ -166,49 +134,6 @@ void tcp_rack_reo_timeout(struct sock *sk) tcp_rearm_rto(sk); } -/* Updates the RACK's reo_wnd based on DSACK and no. of recoveries. - * - * If a DSACK is received that seems like it may have been due to reordering - * triggering fast recovery, increment reo_wnd by min_rtt/4 (upper bounded - * by srtt), since there is possibility that spurious retransmission was - * due to reordering delay longer than reo_wnd. - * - * Persist the current reo_wnd value for TCP_RACK_RECOVERY_THRESH (16) - * no. of successful recoveries (accounts for full DSACK-based loss - * recovery undo). After that, reset it to default (min_rtt/4). - * - * At max, reo_wnd is incremented only once per rtt. So that the new - * DSACK on which we are reacting, is due to the spurious retx (approx) - * after the reo_wnd has been updated last time. - * - * reo_wnd is tracked in terms of steps (of min_rtt/4), rather than - * absolute value to account for change in rtt. - */ -void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if ((READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_recovery) & - TCP_RACK_STATIC_REO_WND) || - !rs->prior_delivered) - return; - - /* Disregard DSACK if a rtt has not passed since we adjusted reo_wnd */ - if (before(rs->prior_delivered, tp->rack.last_delivered)) - tp->rack.dsack_seen = 0; - - /* Adjust the reo_wnd if update is pending */ - if (tp->rack.dsack_seen) { - tp->rack.reo_wnd_steps = min_t(u32, 0xFF, - tp->rack.reo_wnd_steps + 1); - tp->rack.dsack_seen = 0; - tp->rack.last_delivered = tp->delivered; - tp->rack.reo_wnd_persist = TCP_RACK_RECOVERY_THRESH; - } else if (!tp->rack.reo_wnd_persist) { - tp->rack.reo_wnd_steps = 1; - } -} - /* RFC6582 NewReno recovery for non-SACK connection. 
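[Editor's sketch, not part of the patch] The removed tcp_rack_update_reo_wnd() comment above tracks the reordering window in steps of min_rtt/4. The actual time window is derived roughly as below (a simplified take on what tcp_rack_reo_wnd() does in tcp_recovery.c; the real helper also special-cases flows that have never seen reordering):

static u32 example_rack_reo_wnd_us(const struct tcp_sock *tp)
{
        /* each DSACK-driven step adds min_rtt/4, capped at srtt
         * (srtt_us stores srtt << 3)
         */
        return min_t(u32, (tcp_min_rtt(tp) >> 2) * tp->rack.reo_wnd_steps,
                     tp->srtt_us >> 3);
}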
It simply retransmits * the next unacked packet upon receiving * a) three or more DUPACKs to start the fast recovery diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 160080c9021d..5a14a53a3c9e 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -22,6 +22,7 @@ #include <linux/module.h> #include <linux/gfp.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/rstreason.h> static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk) @@ -479,6 +480,8 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req) * it's not good to give up too easily. */ tcp_rtx_synack(sk, req); + if (req->num_retrans > 1 && tcp_rsk(req)->accecn_ok) + tcp_rsk(req)->accecn_fail_mode |= TCP_ACCECN_ACE_FAIL_SEND; req->num_timeout++; tcp_update_rto_stats(sk); if (!tp->retrans_stamp) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ee63af0ef42c..b96e47f1c8a2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1193,7 +1193,7 @@ csum_partial: send: err = ip_send_skb(sock_net(sk), skb); - if (err) { + if (unlikely(err)) { if (err == -ENOBUFS && !inet_test_bit(RECVERR, sk)) { UDP_INC_STATS(sock_net(sk), @@ -1269,6 +1269,8 @@ EXPORT_IPV6_MOD_GPL(udp_cmsg_send); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { + DEFINE_RAW_FLEX(struct ip_options_rcu, opt_copy, opt.__data, + IP_OPTIONS_DATA_FIXED_SIZE); struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); @@ -1286,7 +1288,6 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) int corkreq = udp_test_bit(CORK, sk) || msg->msg_flags & MSG_MORE; int (*getfrag)(void *, char *, int, int, int, struct sk_buff *); struct sk_buff *skb; - struct ip_options_data opt_copy; int uc_index; if (len > 0xFFFF) @@ -1368,9 +1369,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_lock(); inet_opt = rcu_dereference(inet->inet_opt); if (inet_opt) { - memcpy(&opt_copy, inet_opt, + memcpy(opt_copy, inet_opt, sizeof(*inet_opt) + inet_opt->opt.optlen); - ipc.opt = &opt_copy.opt; + ipc.opt = opt_copy; } rcu_read_unlock(); } @@ -1793,14 +1794,32 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) } if (unlikely(to_drop)) { + int err_ipv4 = 0; + int err_ipv6 = 0; + for (nb = 0; to_drop != NULL; nb++) { skb = to_drop; + if (skb->protocol == htons(ETH_P_IP)) + err_ipv4++; + else + err_ipv6++; to_drop = skb->next; skb_mark_not_on_list(skb); - /* TODO: update SNMP values. 
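[Editor's note, not part of the patch] The udp_sendmsg() hunk above replaces the struct ip_options_data wrapper with DEFINE_RAW_FLEX(), which declares on-stack storage for a structure ending in a flexible-array member and gives you a typed pointer into it. Minimal usage sketch with a made-up struct (struct example_opts and the sizes are purely illustrative):

#include <linux/overflow.h>
#include <linux/string.h>

struct example_opts {
        int     len;
        u8      data[];                 /* flexible array member */
};

static void example_raw_flex_usage(void)
{
        /* storage for struct example_opts plus 40 bytes of data[];
         * `opts` is declared as a struct example_opts * into it
         */
        DEFINE_RAW_FLEX(struct example_opts, opts, data, 40);

        opts->len = 0;
        memset(opts->data, 0, 40);
}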
*/ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM); } numa_drop_add(&udp_sk(sk)->drop_counters, nb); + if (err_ipv4 > 0) { + SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_MEMERRORS, + err_ipv4); + SNMP_ADD_STATS(__UDPX_MIB(sk, true), UDP_MIB_INERRORS, + err_ipv4); + } + if (err_ipv6 > 0) { + SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_MEMERRORS, + err_ipv6); + SNMP_ADD_STATS(__UDPX_MIB(sk, false), UDP_MIB_INERRORS, + err_ipv6); + } } atomic_sub(total_size, &udp_prod_queue->rmem_alloc); @@ -2429,7 +2448,8 @@ static int udp_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) /* * UDP-Lite specific tests, ignored on UDP sockets */ - if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) { + if (unlikely(udp_test_bit(UDPLITE_RECV_CC, sk) && + UDP_SKB_CB(skb)->partial_cov)) { u16 pcrlen = READ_ONCE(up->pcrlen); /* diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 589456bd8b5f..6b1654c1ad4a 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -483,11 +483,11 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, struct sock *sk = gso_skb->sk; unsigned int sum_truesize = 0; struct sk_buff *segs, *seg; + __be16 newlen, msslen; struct udphdr *uh; unsigned int mss; bool copy_dtor; __sum16 check; - __be16 newlen; int ret = 0; mss = skb_shinfo(gso_skb)->gso_size; @@ -556,6 +556,8 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, return segs; } + msslen = htons(sizeof(*uh) + mss); + /* GSO partial and frag_list segmentation only requires splitting * the frame into an MSS multiple and possibly a remainder, both * cases return a GSO skb. So update the mss now. @@ -585,7 +587,7 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, if (!seg->next) break; - uh->len = newlen; + uh->len = msslen; uh->check = check; if (seg->ip_summed == CHECKSUM_PARTIAL) diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index d283c59df4c1..0492f1a0b491 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -45,7 +45,7 @@ obj-$(CONFIG_IPV6_FOU) += fou6.o obj-y += addrconf_core.o exthdrs_core.o ip6_checksum.o ip6_icmp.o obj-$(CONFIG_INET) += output_core.o protocol.o \ - ip6_offload.o tcpv6_offload.o exthdrs_offload.o + ip6_offload.o exthdrs_offload.o obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 27ab9d7adc64..6db9cf9e2a50 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1013,7 +1013,7 @@ ipv6_link_dev_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) list_for_each(p, &idev->addr_list) { struct inet6_ifaddr *ifa = list_entry(p, struct inet6_ifaddr, if_list); - if (ifp_scope >= ipv6_addr_src_scope(&ifa->addr)) + if (ifp_scope > ipv6_addr_src_scope(&ifa->addr)) break; } @@ -3339,11 +3339,10 @@ static int ipv6_generate_stable_address(struct in6_addr *address, const struct inet6_dev *idev) { static DEFINE_SPINLOCK(lock); - static __u32 digest[SHA1_DIGEST_WORDS]; - static __u32 workspace[SHA1_WORKSPACE_WORDS]; + static struct sha1_ctx sha_ctx; static union { - char __data[SHA1_BLOCK_SIZE]; + u8 __data[SHA1_BLOCK_SIZE]; struct { struct in6_addr secret; __be32 prefix[2]; @@ -3368,20 +3367,26 @@ static int ipv6_generate_stable_address(struct in6_addr *address, retry: spin_lock_bh(&lock); - sha1_init_raw(digest); + sha1_init(&sha_ctx); + memset(&data, 0, sizeof(data)); - memset(workspace, 0, sizeof(workspace)); memcpy(data.hwaddr, idev->dev->perm_addr, idev->dev->addr_len); data.prefix[0] = address->s6_addr32[0]; data.prefix[1] = address->s6_addr32[1]; 
data.secret = secret; data.dad_count = dad_count; - sha1_transform(digest, data.__data, workspace); + sha1_update(&sha_ctx, data.__data, sizeof(data)); + /* + * Note that the SHA-1 finalization is omitted here, and the digest is + * pulled directly from the internal SHA-1 state (making it incompatible + * with standard SHA-1). Unusual, but technically okay since the data + * length is fixed and is a multiple of the SHA-1 block size. + */ temp = *address; - temp.s6_addr32[2] = (__force __be32)digest[0]; - temp.s6_addr32[3] = (__force __be32)digest[1]; + temp.s6_addr32[2] = (__force __be32)sha_ctx.state.h[0]; + temp.s6_addr32[3] = (__force __be32)sha_ctx.state.h[1]; spin_unlock_bh(&lock); diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b705751eb73c..31ba677d0442 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -224,8 +224,8 @@ lookup_protocol: inet6_set_bit(MC6_LOOP, sk); inet6_set_bit(MC6_ALL, sk); np->pmtudisc = IPV6_PMTUDISC_WANT; - inet6_assign_bit(REPFLOW, sk, net->ipv6.sysctl.flowlabel_reflect & - FLOWLABEL_REFLECT_ESTABLISHED); + inet6_assign_bit(REPFLOW, sk, READ_ONCE(net->ipv6.sysctl.flowlabel_reflect) & + FLOWLABEL_REFLECT_ESTABLISHED); sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; sk->sk_txrehash = READ_ONCE(net->core.sysctl_txrehash); @@ -824,45 +824,42 @@ EXPORT_SYMBOL(inet6_unregister_protosw); int inet6_sk_rebuild_header(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *final_p; struct dst_entry *dst; + struct flowi6 *fl6; dst = __sk_dst_check(sk, np->dst_cookie); + if (dst) + return 0; + + fl6 = &inet->cork.fl.u.ip6; + memset(fl6, 0, sizeof(*fl6)); + fl6->flowi6_proto = sk->sk_protocol; + fl6->daddr = sk->sk_v6_daddr; + fl6->saddr = np->saddr; + fl6->flowlabel = np->flow_label; + fl6->flowi6_oif = sk->sk_bound_dev_if; + fl6->flowi6_mark = sk->sk_mark; + fl6->fl6_dport = inet->inet_dport; + fl6->fl6_sport = inet->inet_sport; + fl6->flowi6_uid = sk_uid(sk); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); - if (!dst) { - struct inet_sock *inet = inet_sk(sk); - struct in6_addr *final_p, final; - struct flowi6 fl6; - - memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_proto = sk->sk_protocol; - fl6.daddr = sk->sk_v6_daddr; - fl6.saddr = np->saddr; - fl6.flowlabel = np->flow_label; - fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.flowi6_mark = sk->sk_mark; - fl6.fl6_dport = inet->inet_dport; - fl6.fl6_sport = inet->inet_sport; - fl6.flowi6_uid = sk_uid(sk); - security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - - rcu_read_lock(); - final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), - &final); - rcu_read_unlock(); - - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); - if (IS_ERR(dst)) { - sk->sk_route_caps = 0; - WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst)); - return PTR_ERR(dst); - } + rcu_read_lock(); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &np->final); + rcu_read_unlock(); - ip6_dst_store(sk, dst, false, false); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); + if (IS_ERR(dst)) { + sk->sk_route_caps = 0; + WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst)); + return PTR_ERR(dst); } + ip6_dst_store(sk, dst, false, false); return 0; } -EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt) diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 83e03176819c..c564b68a0562 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -72,12 
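[Editor's illustration, not part of the patch] The stable-address hunk above hashes (secret, prefix, hwaddr, dad_count) and takes 64 bits of SHA-1 state as the interface identifier, in the spirit of RFC 7217. A loosely equivalent userspace sketch follows (OpenSSL, finalized SHA-1, and a field layout chosen for the example rather than matching the kernel's union):

#include <openssl/sha.h>
#include <stdint.h>
#include <string.h>

static void example_stable_iid(uint8_t addr[16],   /* in: /64 prefix, out: address */
                               const uint8_t *hwaddr, size_t hwlen,
                               const uint8_t secret[16], uint8_t dad_count)
{
        uint8_t buf[64] = { 0 }, digest[SHA_DIGEST_LENGTH];

        memcpy(buf, secret, 16);
        memcpy(buf + 16, addr, 8);                  /* the /64 prefix        */
        buf[24] = dad_count;                        /* bumped on each retry  */
        memcpy(buf + 25, hwaddr, hwlen > 32 ? 32 : hwlen);

        SHA1(buf, sizeof(buf), digest);
        memcpy(addr + 8, digest, 8);                /* low 64 bits become IID */
}

A duplicate address detected by DAD bumps dad_count and retries, which is why it is part of the hashed input.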
+72,12 @@ static void ip6_datagram_flow_key_init(struct flowi6 *fl6, int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) { struct ip6_flowlabel *flowlabel = NULL; - struct in6_addr *final_p, final; - struct ipv6_txoptions *opt; - struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi6 fl6; + struct ipv6_txoptions *opt; + struct in6_addr *final_p; + struct dst_entry *dst; + struct flowi6 *fl6; int err = 0; if (inet6_test_bit(SNDFLOW, sk) && @@ -86,14 +86,15 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) if (IS_ERR(flowlabel)) return -EINVAL; } - ip6_datagram_flow_key_init(&fl6, sk); + fl6 = &inet_sk(sk)->cork.fl.u.ip6; + ip6_datagram_flow_key_init(fl6, sk); rcu_read_lock(); opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt); - final_p = fl6_update_dst(&fl6, opt, &final); + final_p = fl6_update_dst(fl6, opt, &np->final); rcu_read_unlock(); - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto out; @@ -101,17 +102,17 @@ int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr) if (fix_sk_saddr) { if (ipv6_addr_any(&np->saddr)) - np->saddr = fl6.saddr; + np->saddr = fl6->saddr; if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) { - sk->sk_v6_rcv_saddr = fl6.saddr; + sk->sk_v6_rcv_saddr = fl6->saddr; inet->inet_rcv_saddr = LOOPBACK4_IPV6; if (sk->sk_prot->rehash) sk->sk_prot->rehash(sk); } } - ip6_sk_dst_store_flow(sk, dst, &fl6); + ip6_sk_dst_store_flow(sk, dst, fl6); out: fl6_sock_release(flowlabel); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index a23eb8734e15..209fdf1b1aa9 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -314,7 +314,7 @@ fail_and_free: } extlen = (skb_transport_header(skb)[1] + 1) << 3; - if (extlen > net->ipv6.sysctl.max_dst_opts_len) + if (extlen > READ_ONCE(net->ipv6.sysctl.max_dst_opts_len)) goto fail_and_free; opt->lastopt = opt->dst1 = skb_network_header_len(skb); @@ -322,7 +322,8 @@ fail_and_free: dstbuf = opt->dst1; #endif - if (ip6_parse_tlv(false, skb, net->ipv6.sysctl.max_dst_opts_cnt)) { + if (ip6_parse_tlv(false, skb, + READ_ONCE(net->ipv6.sysctl.max_dst_opts_cnt))) { skb->transport_header += extlen; opt = IP6CB(skb); #if IS_ENABLED(CONFIG_IPV6_MIP6) @@ -1049,11 +1050,12 @@ fail_and_free: } extlen = (skb_transport_header(skb)[1] + 1) << 3; - if (extlen > net->ipv6.sysctl.max_hbh_opts_len) + if (extlen > READ_ONCE(net->ipv6.sysctl.max_hbh_opts_len)) goto fail_and_free; opt->flags |= IP6SKB_HOPBYHOP; - if (ip6_parse_tlv(true, skb, net->ipv6.sysctl.max_hbh_opts_cnt)) { + if (ip6_parse_tlv(true, skb, + READ_ONCE(net->ipv6.sysctl.max_hbh_opts_cnt))) { skb->transport_header += extlen; opt = IP6CB(skb); opt->nhoff = sizeof(struct ipv6hdr); @@ -1072,9 +1074,9 @@ fail_and_free: * for headers. 
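[Editor's note, not part of the patch] The exthdrs.c hunks above wrap unlocked sysctl reads in READ_ONCE(): the values can change from /proc at any time, so the reader takes one marked snapshot and uses it consistently instead of letting the compiler reload the field. Minimal shape of the pattern (hypothetical helper name):

static bool example_dst_opts_len_ok(const struct net *net, unsigned int extlen)
{
        unsigned int max = READ_ONCE(net->ipv6.sysctl.max_dst_opts_len);

        return extlen <= max;
}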
*/ -static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto, - struct ipv6_rt_hdr *opt, - struct in6_addr **addr_p, struct in6_addr *saddr) +static u8 ipv6_push_rthdr0(struct sk_buff *skb, u8 proto, + struct ipv6_rt_hdr *opt, + struct in6_addr **addr_p, struct in6_addr *saddr) { struct rt0_hdr *phdr, *ihdr; int hops; @@ -1093,13 +1095,13 @@ static void ipv6_push_rthdr0(struct sk_buff *skb, u8 *proto, phdr->addr[hops - 1] = **addr_p; *addr_p = ihdr->addr; - phdr->rt_hdr.nexthdr = *proto; - *proto = NEXTHDR_ROUTING; + phdr->rt_hdr.nexthdr = proto; + return NEXTHDR_ROUTING; } -static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, - struct ipv6_rt_hdr *opt, - struct in6_addr **addr_p, struct in6_addr *saddr) +static u8 ipv6_push_rthdr4(struct sk_buff *skb, u8 proto, + struct ipv6_rt_hdr *opt, + struct in6_addr **addr_p, struct in6_addr *saddr) { struct ipv6_sr_hdr *sr_phdr, *sr_ihdr; int plen, hops; @@ -1142,58 +1144,61 @@ static void ipv6_push_rthdr4(struct sk_buff *skb, u8 *proto, } #endif - sr_phdr->nexthdr = *proto; - *proto = NEXTHDR_ROUTING; + sr_phdr->nexthdr = proto; + return NEXTHDR_ROUTING; } -static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, - struct ipv6_rt_hdr *opt, - struct in6_addr **addr_p, struct in6_addr *saddr) +static u8 ipv6_push_rthdr(struct sk_buff *skb, u8 proto, + struct ipv6_rt_hdr *opt, + struct in6_addr **addr_p, struct in6_addr *saddr) { switch (opt->type) { case IPV6_SRCRT_TYPE_0: case IPV6_SRCRT_STRICT: case IPV6_SRCRT_TYPE_2: - ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr); + proto = ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr); break; case IPV6_SRCRT_TYPE_4: - ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr); + proto = ipv6_push_rthdr4(skb, proto, opt, addr_p, saddr); break; default: break; } + return proto; } -static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt) +static u8 ipv6_push_exthdr(struct sk_buff *skb, u8 proto, u8 type, struct ipv6_opt_hdr *opt) { struct ipv6_opt_hdr *h = skb_push(skb, ipv6_optlen(opt)); memcpy(h, opt, ipv6_optlen(opt)); - h->nexthdr = *proto; - *proto = type; + h->nexthdr = proto; + return type; } -void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, - u8 *proto, - struct in6_addr **daddr, struct in6_addr *saddr) +u8 ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, + u8 proto, + struct in6_addr **daddr, struct in6_addr *saddr) { if (opt->srcrt) { - ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr); + proto = ipv6_push_rthdr(skb, proto, opt->srcrt, daddr, saddr); /* * IPV6_RTHDRDSTOPTS is ignored * unless IPV6_RTHDR is set (RFC3542). 
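[Editor's sketch, not part of the patch] The conversions above thread the next-header value through return values instead of a u8 *proto out-parameter. A caller under the new convention looks like the following (it mirrors the ip6_xmit() hunk further down; the helper name is hypothetical and opt is assumed non-NULL):

static u8 example_push_ext_headers(struct sk_buff *skb,
                                   struct ipv6_txoptions *opt, u8 proto,
                                   struct in6_addr **daddr,
                                   struct in6_addr *saddr)
{
        /* headers are pushed in reverse order; each call returns the type
         * that must become the nexthdr of whatever is pushed after it,
         * and the final value lands in ipv6_hdr(skb)->nexthdr
         */
        if (opt->opt_flen)
                proto = ipv6_push_frag_opts(skb, opt, proto);
        if (opt->opt_nflen)
                proto = ipv6_push_nfrag_opts(skb, opt, proto, daddr, saddr);
        return proto;
}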
*/ if (opt->dst0opt) - ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); + proto = ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); } if (opt->hopopt) - ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); + proto = ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); + return proto; } -void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) +u8 ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 proto) { if (opt->dst1opt) - ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); + proto = ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); + return proto; } EXPORT_SYMBOL(ipv6_push_frag_opts); @@ -1334,21 +1339,21 @@ struct ipv6_txoptions *__ipv6_fixup_options(struct ipv6_txoptions *opt_space, EXPORT_SYMBOL_GPL(__ipv6_fixup_options); /** - * fl6_update_dst - update flowi destination address with info given + * __fl6_update_dst - update flowi destination address with info given * by srcrt option, if any. * * @fl6: flowi6 for which daddr is to be updated * @opt: struct ipv6_txoptions in which to look for srcrt opt * @orig: copy of original daddr address if modified * - * Returns NULL if no txoptions or no srcrt, otherwise returns orig + * Return: NULL if no srcrt or invalid srcrt type, otherwise returns orig * and initial value of fl6->daddr set in orig */ -struct in6_addr *fl6_update_dst(struct flowi6 *fl6, - const struct ipv6_txoptions *opt, - struct in6_addr *orig) +struct in6_addr *__fl6_update_dst(struct flowi6 *fl6, + const struct ipv6_txoptions *opt, + struct in6_addr *orig) { - if (!opt || !opt->srcrt) + if (!opt->srcrt) return NULL; *orig = fl6->daddr; @@ -1372,4 +1377,4 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, return orig; } -EXPORT_SYMBOL_GPL(fl6_update_dst); +EXPORT_SYMBOL_GPL(__fl6_update_dst); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 9d37e7711bc2..375ecd779fda 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -958,7 +958,8 @@ static enum skb_drop_reason icmpv6_echo_reply(struct sk_buff *skb) tmp_hdr.icmp6_type = type; memset(&fl6, 0, sizeof(fl6)); - if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES) + if (READ_ONCE(net->ipv6.sysctl.flowlabel_reflect) & + FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES) fl6.flowlabel = ip6_flowlabel(ipv6_hdr(skb)); fl6.flowi6_proto = IPPROTO_ICMPV6; @@ -1066,6 +1067,12 @@ enum skb_drop_reason icmpv6_notify(struct sk_buff *skb, u8 type, if (reason != SKB_NOT_DROPPED_YET) goto out; + if (nexthdr == IPPROTO_RAW) { + /* Add a more specific reason later ? */ + reason = SKB_DROP_REASON_NOT_SPECIFIED; + goto out; + } + /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. Without this we will not able f.e. to make source routed pmtu discovery. 
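[Editor's sketch, not part of the patch] With the NULL-options check dropped from the renamed __fl6_update_dst() above, callers presumably go through an inline wrapper that preserves the old semantics. The wrapper itself is not in this diff, so the following is an assumed shape only:

static inline struct in6_addr *fl6_update_dst(struct flowi6 *fl6,
                                              const struct ipv6_txoptions *opt,
                                              struct in6_addr *orig)
{
        if (!opt)
                return NULL;            /* common case, decided inline */
        return __fl6_update_dst(fl6, opt, orig);
}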
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index ea5cf3fdfdd6..11fc2f7de2fe 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -25,14 +25,14 @@ #include <net/sock_reuseport.h> struct dst_entry *inet6_csk_route_req(const struct sock *sk, + struct dst_entry *dst, struct flowi6 *fl6, const struct request_sock *req, u8 proto) { - struct inet_request_sock *ireq = inet_rsk(req); + const struct inet_request_sock *ireq = inet_rsk(req); const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *final_p, final; - struct dst_entry *dst; memset(fl6, 0, sizeof(*fl6)); fl6->flowi6_proto = proto; @@ -48,25 +48,20 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, fl6->flowi6_uid = sk_uid(sk); security_req_classify_flow(req, flowi6_to_flowi_common(fl6)); - dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); - if (IS_ERR(dst)) - return NULL; - + if (!dst) { + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); + if (IS_ERR(dst)) + return NULL; + } return dst; } -static inline -struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie) -{ - return __sk_dst_check(sk, cookie); -} - static struct dst_entry *inet6_csk_route_socket(struct sock *sk, struct flowi6 *fl6) { struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); - struct in6_addr *final_p, final; + struct in6_addr *final_p; struct dst_entry *dst; memset(fl6, 0, sizeof(*fl6)); @@ -83,41 +78,41 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); rcu_read_lock(); - final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final); + final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &np->final); rcu_read_unlock(); - dst = __inet6_csk_dst_check(sk, np->dst_cookie); - if (!dst) { - dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); + dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); + + if (!IS_ERR(dst)) + ip6_dst_store(sk, dst, false, false); - if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, false, false); - } return dst; } int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused) { + struct flowi6 *fl6 = &inet_sk(sk)->cork.fl.u.ip6; struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi6 fl6; struct dst_entry *dst; int res; - dst = inet6_csk_route_socket(sk, &fl6); - if (IS_ERR(dst)) { - WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst)); - sk->sk_route_caps = 0; - kfree_skb(skb); - return PTR_ERR(dst); + dst = __sk_dst_check(sk, np->dst_cookie); + if (unlikely(!dst)) { + dst = inet6_csk_route_socket(sk, fl6); + if (IS_ERR(dst)) { + WRITE_ONCE(sk->sk_err_soft, -PTR_ERR(dst)); + sk->sk_route_caps = 0; + kfree_skb(skb); + return PTR_ERR(dst); + } + /* Restore final destination back after routing done */ + fl6->daddr = sk->sk_v6_daddr; } rcu_read_lock(); skb_dst_set_noref(skb, dst); - /* Restore final destination back after routing done */ - fl6.daddr = sk->sk_v6_daddr; - - res = ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt), + res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt), np->tclass, READ_ONCE(sk->sk_priority)); rcu_read_unlock(); return res; @@ -126,13 +121,15 @@ EXPORT_SYMBOL_GPL(inet6_csk_xmit); struct dst_entry *inet6_csk_update_pmtu(struct sock *sk, u32 mtu) { - struct flowi6 fl6; - struct dst_entry *dst = inet6_csk_route_socket(sk, &fl6); + struct flowi6 *fl6 = &inet_sk(sk)->cork.fl.u.ip6; + struct dst_entry *dst; + + dst = inet6_csk_route_socket(sk, fl6); if (IS_ERR(dst)) 
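[Editor's sketch, not part of the patch] The inet6_csk_xmit() rewrite above moves the cached-route check out of the lookup helper. The general pattern it now follows is: validate the cookie-protected cache first, refill only on a miss, then store the result for the next packet. Illustrative shape (hypothetical helper, NULL final destination for brevity):

static struct dst_entry *example_sk_dst(struct sock *sk, struct flowi6 *fl6)
{
        struct dst_entry *dst = __sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

        if (likely(dst))
                return dst;                             /* fast path: still valid */

        dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, NULL);
        if (IS_ERR(dst))
                return dst;
        ip6_dst_store(sk, dst, false, false);           /* cache for next time */
        return dst;
}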
return NULL; dst->ops->update_pmtu(dst, sk, NULL, mtu, true); - dst = inet6_csk_route_socket(sk, &fl6); + dst = inet6_csk_route_socket(sk, fl6); return IS_ERR(dst) ? NULL : dst; } diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c6439e30e892..9880d608392b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1375,14 +1375,14 @@ static void fib6_start_gc(struct net *net, struct fib6_info *rt) if (!timer_pending(&net->ipv6.ip6_fib_timer) && (rt->fib6_flags & RTF_EXPIRES)) mod_timer(&net->ipv6.ip6_fib_timer, - jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); + jiffies + READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval)); } void fib6_force_start_gc(struct net *net) { if (!timer_pending(&net->ipv6.ip6_fib_timer)) mod_timer(&net->ipv6.ip6_fib_timer, - jiffies + net->ipv6.sysctl.ip6_rt_gc_interval); + jiffies + READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval)); } static void __fib6_update_sernum_upto_root(struct fib6_info *rt, @@ -2414,6 +2414,7 @@ static void fib6_gc_all(struct net *net, struct fib6_gc_args *gc_args) void fib6_run_gc(unsigned long expires, struct net *net, bool force) { struct fib6_gc_args gc_args; + int ip6_rt_gc_interval; unsigned long now; if (force) { @@ -2422,8 +2423,8 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ); return; } - gc_args.timeout = expires ? (int)expires : - net->ipv6.sysctl.ip6_rt_gc_interval; + ip6_rt_gc_interval = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_interval); + gc_args.timeout = expires ? (int)expires : ip6_rt_gc_interval; gc_args.more = 0; fib6_gc_all(net, &gc_args); @@ -2432,8 +2433,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force) if (gc_args.more) mod_timer(&net->ipv6.ip6_fib_timer, - round_jiffies(now - + net->ipv6.sysctl.ip6_rt_gc_interval)); + round_jiffies(now + ip6_rt_gc_interval)); else timer_delete(&net->ipv6.ip6_fib_timer); spin_unlock_bh(&net->ipv6.fib6_gc_lock); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index d19d86ed4376..dafcc0dcd77a 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -1057,7 +1057,7 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, /* TooBig packet may have updated dst->dev's mtu */ if (!t->parms.collect_md && dst) { mtu = READ_ONCE(dst_dev(dst)->mtu); - if (dst_mtu(dst) > mtu) + if (dst6_mtu(dst) > mtu) dst->ops->update_pmtu(dst, NULL, skb, mtu, false); } err = ip6_tnl_xmit(skb, dev, dsfield, &fl6, encap_limit, &mtu, diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 168ec07e31cc..2bcb981c91aa 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -262,7 +262,7 @@ static struct sk_buff *ip6_rcv_core(struct sk_buff *skb, struct net_device *dev, skb->transport_header = skb->network_header + sizeof(*hdr); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); - pkt_len = ntohs(hdr->payload_len); + pkt_len = ipv6_payload_len(skb, hdr); /* pkt_len may be zero if Jumbo payload option is present */ if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index fce91183797a..bd7f780e37a5 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -19,23 +19,7 @@ #include <net/gso.h> #include "ip6_offload.h" - -/* All GRO functions are always builtin, except UDP over ipv6, which lays in - * ipv6 module, as it depends on UDPv6 lookup function, so we need special care - * when ipv6 is built as a module - */ -#if IS_BUILTIN(CONFIG_IPV6) -#define INDIRECT_CALL_L4(f, f2, f1, ...) 
INDIRECT_CALL_2(f, f2, f1, __VA_ARGS__) -#else -#define INDIRECT_CALL_L4(f, f2, f1, ...) INDIRECT_CALL_1(f, f2, __VA_ARGS__) -#endif - -#define indirect_call_gro_receive_l4(f2, f1, cb, head, skb) \ -({ \ - unlikely(gro_recursion_inc_test(skb)) ? \ - NAPI_GRO_CB(skb)->flush |= 1, NULL : \ - INDIRECT_CALL_L4(cb, f2, f1, head, skb); \ -}) +#include "tcpv6_offload.c" static int ipv6_gro_pull_exthdrs(struct sk_buff *skb, int off, int proto) { @@ -110,7 +94,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, struct sk_buff *segs = ERR_PTR(-EINVAL); struct ipv6hdr *ipv6h; const struct net_offload *ops; - int proto, err; + int proto; struct frag_hdr *fptr; unsigned int payload_len; u8 *prevhdr; @@ -120,9 +104,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, bool gso_partial; skb_reset_network_header(skb); - err = ipv6_hopopt_jumbo_remove(skb); - if (err) - return ERR_PTR(err); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; @@ -298,9 +279,19 @@ not_same_flow: skb_gro_postpull_rcsum(skb, iph, nlen); - pp = indirect_call_gro_receive_l4(tcp6_gro_receive, udp6_gro_receive, - ops->callbacks.gro_receive, head, skb); + if (unlikely(gro_recursion_inc_test(skb))) { + flush = 1; + goto out; + } + if (likely(proto == IPPROTO_TCP)) + pp = tcp6_gro_receive(head, skb); +#if IS_BUILTIN(CONFIG_IPV6) + else if (likely(proto == IPPROTO_UDP)) + pp = udp6_gro_receive(head, skb); +#endif + else + pp = ops->callbacks.gro_receive(head, skb); out: skb_gro_flush_final(skb, pp, flush); @@ -342,48 +333,28 @@ INDIRECT_CALLABLE_SCOPE int ipv6_gro_complete(struct sk_buff *skb, int nhoff) const struct net_offload *ops; struct ipv6hdr *iph; int err = -ENOSYS; - u32 payload_len; if (skb->encapsulation) { skb_set_inner_protocol(skb, cpu_to_be16(ETH_P_IPV6)); skb_set_inner_network_header(skb, nhoff); } - payload_len = skb->len - nhoff - sizeof(*iph); - if (unlikely(payload_len > IPV6_MAXPLEN)) { - struct hop_jumbo_hdr *hop_jumbo; - int hoplen = sizeof(*hop_jumbo); - - /* Move network header left */ - memmove(skb_mac_header(skb) - hoplen, skb_mac_header(skb), - skb->transport_header - skb->mac_header); - skb->data -= hoplen; - skb->len += hoplen; - skb->mac_header -= hoplen; - skb->network_header -= hoplen; - iph = (struct ipv6hdr *)(skb->data + nhoff); - hop_jumbo = (struct hop_jumbo_hdr *)(iph + 1); - - /* Build hop-by-hop options */ - hop_jumbo->nexthdr = iph->nexthdr; - hop_jumbo->hdrlen = 0; - hop_jumbo->tlv_type = IPV6_TLV_JUMBO; - hop_jumbo->tlv_len = 4; - hop_jumbo->jumbo_payload_len = htonl(payload_len + hoplen); - - iph->nexthdr = NEXTHDR_HOP; - iph->payload_len = 0; - } else { - iph = (struct ipv6hdr *)(skb->data + nhoff); - iph->payload_len = htons(payload_len); - } + iph = (struct ipv6hdr *)(skb->data + nhoff); + ipv6_set_payload_len(iph, skb->len - nhoff - sizeof(*iph)); nhoff += sizeof(*iph) + ipv6_exthdrs_len(iph, &ops); + + if (likely(ops == &net_hotdata.tcpv6_offload)) + return tcp6_gro_complete(skb, nhoff); +#if IS_BUILTIN(CONFIG_IPV6) + if (ops == &net_hotdata.udpv6_offload) + return udp6_gro_complete(skb, nhoff); +#endif + if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; - err = INDIRECT_CALL_L4(ops->callbacks.gro_complete, tcp6_gro_complete, - udp6_gro_complete, skb, nhoff); + err = ops->callbacks.gro_complete(skb, nhoff); out: return err; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f904739e99b9..769c39fed1f3 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -80,7 
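[Editor's note, not part of the patch] The hunk above replaces the indirect_call_gro_receive_l4() wrapper with explicit protocol checks. The removed macro expanded to roughly the shape below, turning the common TCP/UDP cases into direct calls so only the fallback pays an indirect (retpoline) call; the helper name is hypothetical:

static struct sk_buff *example_gro_dispatch(
                struct sk_buff *(*cb)(struct list_head *, struct sk_buff *),
                struct list_head *head, struct sk_buff *skb)
{
        if (cb == tcp6_gro_receive)
                return tcp6_gro_receive(head, skb);     /* direct call */
        if (cb == udp6_gro_receive)
                return udp6_gro_receive(head, skb);     /* direct call */
        return cb(head, skb);                           /* indirect fallback */
}

Branching on proto instead of the callback pointer, as the new code does, additionally avoids loading ops->callbacks.gro_receive on the hot path.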
+80,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * hdr = ipv6_hdr(skb); daddr = &hdr->daddr; - if (ipv6_addr_is_multicast(daddr)) { + if (unlikely(ipv6_addr_is_multicast(daddr))) { if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && ((mroute6_is_socket(net, skb) && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || @@ -179,8 +179,7 @@ ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, static int ip6_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb, unsigned int mtu) { - if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && - !skb_gso_validate_network_len(skb, mtu)) + if (unlikely(!skb_gso_validate_network_len(skb, mtu))) return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); return ip6_finish_output2(net, sk, skb); @@ -202,8 +201,8 @@ static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff if (skb_is_gso(skb)) return ip6_finish_output_gso(net, sk, skb, mtu); - if (skb->len > mtu || - (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) + if (unlikely(skb->len > mtu || + (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))) return ip6_fragment(net, sk, skb, ip6_finish_output2); return ip6_finish_output2(net, sk, skb); @@ -273,8 +272,6 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); struct inet6_dev *idev = ip6_dst_idev(dst); - struct hop_jumbo_hdr *hop_jumbo; - int hoplen = sizeof(*hop_jumbo); struct net *net = sock_net(sk); unsigned int head_room; struct net_device *dev; @@ -287,7 +284,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, rcu_read_lock(); dev = dst_dev_rcu(dst); - head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); + head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; @@ -301,32 +298,22 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, } } - if (opt) { + if (unlikely(opt)) { seg_len += opt->opt_nflen + opt->opt_flen; if (opt->opt_flen) - ipv6_push_frag_opts(skb, opt, &proto); + proto = ipv6_push_frag_opts(skb, opt, proto); if (opt->opt_nflen) - ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop, - &fl6->saddr); + proto = ipv6_push_nfrag_opts(skb, opt, proto, + &first_hop, + &fl6->saddr); } - if (unlikely(seg_len > IPV6_MAXPLEN)) { - hop_jumbo = skb_push(skb, hoplen); - - hop_jumbo->nexthdr = proto; - hop_jumbo->hdrlen = 0; - hop_jumbo->tlv_type = IPV6_TLV_JUMBO; - hop_jumbo->tlv_len = 4; - hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen); - - proto = IPPROTO_HOPOPTS; + if (unlikely(seg_len > IPV6_MAXPLEN)) seg_len = 0; - IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO; - } - skb_push(skb, sizeof(struct ipv6hdr)); + __skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); @@ -352,8 +339,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, skb->priority = priority; skb->mark = mark; - mtu = dst_mtu(dst); - if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { + mtu = dst6_mtu(dst); + if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); /* if egress device is enslaved to an L3 master device pass the @@ -382,7 +369,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); 
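[Editor's sketch, not part of the patch] The seg_len handling above relies on the convention that an IPv6 payload too large for the 16-bit field is sent with payload_len == 0. The new ipv6_set_payload_len() helper used throughout this series is not defined in this diff, but its write-side semantics follow from the open-coded logic removed from __ip6_local_out() further down; an assumed sketch:

static inline void example_set_payload_len(struct ipv6hdr *hdr, unsigned int len)
{
        /* oversized (BIG TCP style) payloads are signalled by a zero length */
        hdr->payload_len = len > IPV6_MAXPLEN ? 0 : htons(len);
}

ipv6_payload_len() is presumably the read-side counterpart, with callers still handling the pkt_len == 0 jumbogram case explicitly, as the ip6_rcv_core() hunk earlier in this series shows.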
IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); unlock: rcu_read_unlock(); return ret; @@ -653,7 +640,7 @@ int ip6_forward(struct sk_buff *skb) if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (ip6_pkt_too_big(skb, mtu)) { + if (unlikely(ip6_pkt_too_big(skb, mtu))) { /* Again, force OUTPUT device used as source address */ skb->dev = dev; icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); @@ -1352,12 +1339,13 @@ static void ip6_append_data_mtu(unsigned int *mtu, } static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, - struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, + struct ipcm6_cookie *ipc6, struct rt6_info *rt) { + struct ipv6_txoptions *nopt, *opt = ipc6->opt; + struct inet6_cork *v6_cork = &cork->base6; struct ipv6_pinfo *np = inet6_sk(sk); unsigned int mtu, frag_size; - struct ipv6_txoptions *nopt, *opt = ipc6->opt; /* callers pass dst together with a reference, set it first so * ip6_cork_release() can put it down even in case of an error. @@ -1367,7 +1355,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, /* * setup for corking */ - if (opt) { + if (unlikely(opt)) { if (WARN_ON(v6_cork->opt)) return -EINVAL; @@ -1402,10 +1390,10 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, v6_cork->dontfrag = ipc6->dontfrag; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? - READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); + READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst); else mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? - READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); + READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst)); frag_size = READ_ONCE(np->frag_size); if (frag_size && frag_size < mtu) @@ -1430,17 +1418,17 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, static int __ip6_append_data(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork_full, - struct inet6_cork *v6_cork, struct page_frag *pfrag, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, unsigned int flags) { - struct sk_buff *skb, *skb_prev = NULL; + unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; + struct inet6_cork *v6_cork = &cork_full->base6; struct inet_cork *cork = &cork_full->base; struct flowi6 *fl6 = &cork_full->fl.u.ip6; - unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; + struct sk_buff *skb, *skb_prev = NULL; struct ubuf_info *uarg = NULL; int exthdrlen = 0; int dst_exthdrlen = 0; @@ -1843,7 +1831,6 @@ int ip6_append_data(struct sock *sk, struct rt6_info *rt, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); int exthdrlen; int err; @@ -1854,7 +1841,7 @@ int ip6_append_data(struct sock *sk, * setup for corking */ dst_hold(&rt->dst); - err = ip6_setup_cork(sk, &inet->cork, &np->cork, + err = ip6_setup_cork(sk, &inet->cork, ipc6, rt); if (err) return err; @@ -1868,7 +1855,7 @@ int ip6_append_data(struct sock *sk, } return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, - &np->cork, sk_page_frag(sk), getfrag, + sk_page_frag(sk), getfrag, from, length, transhdrlen, flags); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -1881,10 +1868,11 @@ static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork) skb_dst_set(skb, dst); } -static void ip6_cork_release(struct inet_cork_full *cork, 
- struct inet6_cork *v6_cork) +static void ip6_cork_release(struct inet_cork_full *cork) { - if (v6_cork->opt) { + struct inet6_cork *v6_cork = &cork->base6; + + if (unlikely(v6_cork->opt)) { struct ipv6_txoptions *opt = v6_cork->opt; kfree(opt->dst0opt); @@ -1903,15 +1891,14 @@ static void ip6_cork_release(struct inet_cork_full *cork, struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, - struct inet_cork_full *cork, - struct inet6_cork *v6_cork) + struct inet_cork_full *cork) { struct sk_buff *skb, *tmp_skb; struct sk_buff **tail_skb; struct in6_addr *final_dst; struct net *net = sock_net(sk); struct ipv6hdr *hdr; - struct ipv6_txoptions *opt = v6_cork->opt; + struct ipv6_txoptions *opt; struct rt6_info *rt = dst_rt6_info(cork->base.dst); struct flowi6 *fl6 = &cork->fl.u.ip6; unsigned char proto = fl6->flowi6_proto; @@ -1940,19 +1927,22 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, __skb_pull(skb, skb_network_header_len(skb)); final_dst = &fl6->daddr; - if (opt && opt->opt_flen) - ipv6_push_frag_opts(skb, opt, &proto); - if (opt && opt->opt_nflen) - ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr); - + opt = cork->base6.opt; + if (unlikely(opt)) { + if (opt->opt_flen) + proto = ipv6_push_frag_opts(skb, opt, proto); + if (opt->opt_nflen) + proto = ipv6_push_nfrag_opts(skb, opt, proto, + &final_dst, &fl6->saddr); + } skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); hdr = ipv6_hdr(skb); - ip6_flow_hdr(hdr, v6_cork->tclass, + ip6_flow_hdr(hdr, cork->base6.tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, ip6_autoflowlabel(net, sk), fl6)); - hdr->hop_limit = v6_cork->hop_limit; + hdr->hop_limit = cork->base6.hop_limit; hdr->nexthdr = proto; hdr->saddr = fl6->saddr; hdr->daddr = *final_dst; @@ -1966,7 +1956,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ip6_cork_steal_dst(skb, cork); IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); - if (proto == IPPROTO_ICMPV6) { + if (unlikely(proto == IPPROTO_ICMPV6)) { struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); u8 icmp6_type; @@ -1979,7 +1969,7 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); } - ip6_cork_release(cork, v6_cork); + ip6_cork_release(cork); out: return skb; } @@ -2018,8 +2008,7 @@ EXPORT_SYMBOL_GPL(ip6_push_pending_frames); static void __ip6_flush_pending_frames(struct sock *sk, struct sk_buff_head *queue, - struct inet_cork_full *cork, - struct inet6_cork *v6_cork) + struct inet_cork_full *cork) { struct sk_buff *skb; @@ -2030,13 +2019,13 @@ static void __ip6_flush_pending_frames(struct sock *sk, kfree_skb(skb); } - ip6_cork_release(cork, v6_cork); + ip6_cork_release(cork); } void ip6_flush_pending_frames(struct sock *sk) { __ip6_flush_pending_frames(sk, &sk->sk_write_queue, - &inet_sk(sk)->cork, &inet6_sk(sk)->cork); + &inet_sk(sk)->cork); } EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); @@ -2047,9 +2036,8 @@ struct sk_buff *ip6_make_skb(struct sock *sk, struct ipcm6_cookie *ipc6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork) { - struct inet6_cork v6_cork; - struct sk_buff_head queue; int exthdrlen = (ipc6->opt ? 
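[Editor's note, not part of the patch] The cork rework in the hunks above drops the separate inet6_cork argument in favour of cork->base6. The modified inet_cork_full layout is not part of this diff; its assumed shape (member order and the base6 name are assumptions):

struct example_inet_cork_full {
        struct inet_cork        base;   /* generic + IPv4 cork state        */
        struct inet6_cork       base6;  /* IPv6 options, tclass, hop limit  */
        struct flowi            fl;     /* flow cached for the corked data  */
};

Keeping both families' cork state in one structure is what lets raw.c read inet_sk(sk)->cork.base6.opt later in this diff instead of going through inet6_sk(sk).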
ipc6->opt->opt_flen : 0); + struct sk_buff_head queue; int err; if (flags & MSG_PROBE) { @@ -2062,21 +2050,21 @@ struct sk_buff *ip6_make_skb(struct sock *sk, cork->base.flags = 0; cork->base.addr = 0; cork->base.opt = NULL; - v6_cork.opt = NULL; - err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt); + cork->base6.opt = NULL; + err = ip6_setup_cork(sk, cork, ipc6, rt); if (err) { - ip6_cork_release(cork, &v6_cork); + ip6_cork_release(cork); return ERR_PTR(err); } - err = __ip6_append_data(sk, &queue, cork, &v6_cork, + err = __ip6_append_data(sk, &queue, cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, flags); if (err) { - __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); + __ip6_flush_pending_frames(sk, &queue, cork); return ERR_PTR(err); } - return __ip6_make_skb(sk, &queue, cork, &v6_cork); + return __ip6_make_skb(sk, &queue, cork); } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index c1f39735a236..4c29aa94e86e 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -638,7 +638,7 @@ ip4ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* change mtu on this route */ if (rel_type == ICMP_DEST_UNREACH && rel_code == ICMP_FRAG_NEEDED) { - if (rel_info > dst_mtu(skb_dst(skb2))) + if (rel_info > dst6_mtu(skb_dst(skb2))) goto out; skb_dst_update_pmtu_no_confirm(skb2, rel_info); @@ -1187,7 +1187,7 @@ route_lookup: t->parms.name); goto tx_err_dst_release; } - mtu = dst_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; + mtu = dst6_mtu(dst) - eth_hlen - psh_hlen - t->tun_hlen; if (encap_limit >= 0) { max_headroom += 8; mtu -= 8; @@ -1265,7 +1265,7 @@ route_lookup: if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); - ipv6_push_frag_opts(skb, &opt.ops, &proto); + proto = ipv6_push_frag_opts(skb, &opt.ops, proto); } skb_push(skb, sizeof(struct ipv6hdr)); @@ -1828,6 +1828,32 @@ int ip6_tnl_encap_setup(struct ip6_tnl *t, } EXPORT_SYMBOL_GPL(ip6_tnl_encap_setup); +static int ip6_tnl_fill_forward_path(struct net_device_path_ctx *ctx, + struct net_device_path *path) +{ + struct ip6_tnl *t = netdev_priv(ctx->dev); + struct flowi6 fl6 = { + .daddr = t->parms.raddr, + }; + struct dst_entry *dst; + int err; + + dst = ip6_route_output(dev_net(ctx->dev), NULL, &fl6); + if (!dst->error) { + path->type = DEV_PATH_TUN; + path->tun.src_v6 = t->parms.laddr; + path->tun.dst_v6 = t->parms.raddr; + path->tun.l3_proto = IPPROTO_IPV6; + path->dev = ctx->dev; + ctx->dev = dst->dev; + } + + err = dst->error; + dst_release(dst); + + return err; +} + static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_init = ip6_tnl_dev_init, .ndo_uninit = ip6_tnl_dev_uninit, @@ -1836,6 +1862,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { .ndo_change_mtu = ip6_tnl_change_mtu, .ndo_get_stats64 = dev_get_tstats64, .ndo_get_iflink = ip6_tnl_get_iflink, + .ndo_fill_forward_path = ip6_tnl_fill_forward_path, }; #define IPXIPX_FEATURES (NETIF_F_SG | \ diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a61e742794f9..d784a8644ff2 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -1184,7 +1184,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) - val = dst_mtu(dst); + val = dst6_mtu(dst); rcu_read_unlock(); if (!val) return -ENOTCONN; @@ -1283,7 +1283,7 @@ int do_ipv6_getsockopt(struct sock *sk, int level, int optname, rcu_read_lock(); dst = __sk_dst_get(sk); if (dst) - mtuinfo.ip6m_mtu = dst_mtu(dst); + mtuinfo.ip6m_mtu = dst6_mtu(dst); 
rcu_read_unlock(); if (!mtuinfo.ip6m_mtu) return -ENOTCONN; diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 1c9b283a4132..cba1684a3f30 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -125,12 +125,7 @@ EXPORT_SYMBOL(ip6_dst_hoplimit); int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb) { - int len; - - len = skb->len - sizeof(struct ipv6hdr); - if (len > IPV6_MAXPLEN) - len = 0; - ipv6_hdr(skb)->payload_len = htons(len); + ipv6_set_payload_len(ipv6_hdr(skb), skb->len - sizeof(struct ipv6hdr)); IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); /* if egress device is enslaved to an L3 master device pass the diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index b4cd05dba9b6..27a268059168 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -90,23 +90,24 @@ EXPORT_SYMBOL_GPL(raw_v6_match); * 0 - deliver * 1 - block */ -static int icmpv6_filter(const struct sock *sk, const struct sk_buff *skb) +static int icmpv6_filter(const struct sock *sk, struct sk_buff *skb) { - struct icmp6hdr _hdr; const struct icmp6hdr *hdr; + const __u32 *data; + unsigned int type; /* We require only the four bytes of the ICMPv6 header, not any * additional bytes of message body in "struct icmp6hdr". */ - hdr = skb_header_pointer(skb, skb_transport_offset(skb), - ICMPV6_HDRLEN, &_hdr); - if (hdr) { - const __u32 *data = &raw6_sk(sk)->filter.data[0]; - unsigned int type = hdr->icmp6_type; + if (!pskb_may_pull(skb, ICMPV6_HDRLEN)) + return 1; - return (data[type >> 5] & (1U << (type & 31))) != 0; - } - return 1; + hdr = (struct icmp6hdr *)skb->data; + type = hdr->icmp6_type; + + data = &raw6_sk(sk)->filter.data[0]; + + return (data[type >> 5] & (1U << (type & 31))) != 0; } #if IS_ENABLED(CONFIG_IPV6_MIP6) @@ -141,15 +142,13 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister); static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct net *net = dev_net(skb->dev); - const struct in6_addr *saddr; - const struct in6_addr *daddr; + const struct ipv6hdr *ip6h; struct hlist_head *hlist; - struct sock *sk; bool delivered = false; + struct sock *sk; __u8 hash; - saddr = &ipv6_hdr(skb)->saddr; - daddr = saddr + 1; + ip6h = ipv6_hdr(skb); hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; @@ -157,7 +156,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) sk_for_each_rcu(sk, hlist) { int filtered; - if (!raw_v6_match(net, sk, nexthdr, daddr, saddr, + if (!raw_v6_match(net, sk, nexthdr, &ip6h->daddr, &ip6h->saddr, inet6_iif(skb), inet6_sdif(skb))) continue; @@ -171,6 +170,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) switch (nexthdr) { case IPPROTO_ICMPV6: filtered = icmpv6_filter(sk, skb); + ip6h = ipv6_hdr(skb); break; #if IS_ENABLED(CONFIG_IPV6_MIP6) @@ -529,7 +529,7 @@ static int rawv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, offset = rp->offset; total_len = inet_sk(sk)->cork.base.length; - opt = inet6_sk(sk)->cork.opt; + opt = inet_sk(sk)->cork.base6.opt; total_len -= opt ? opt->opt_flen : 0; if (offset >= total_len - 1) { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e3a260a5564b..c0350d97307e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2049,6 +2049,8 @@ unlock: static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, struct rt6_info *rt, int mtu) { + u32 dmtu = dst6_mtu(&rt->dst); + /* If the new MTU is lower than the route PMTU, this new MTU will be the * lowest MTU in the path: always allow updating the route PMTU to * reflect PMTU decreases. 
@@ -2059,10 +2061,10 @@ static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev, * handle this. */ - if (dst_mtu(&rt->dst) >= mtu) + if (dmtu >= mtu) return true; - if (dst_mtu(&rt->dst) == idev->cnf.mtu6) + if (dmtu == idev->cnf.mtu6) return true; return false; @@ -2895,7 +2897,7 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) dst_metric_set(&rt->dst, RTAX_MTU, mtu); rt->rt6i_flags |= RTF_MODIFIED; - rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires); + rt6_update_expires(rt, READ_ONCE(net->ipv6.sysctl.ip6_rt_mtu_expires)); } static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) @@ -2932,7 +2934,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (mtu < IPV6_MIN_MTU) return; - if (mtu >= dst_mtu(dst)) + if (mtu >= dst6_mtu(dst)) return; if (!rt6_cache_allowed_for_pmtu(rt6)) { @@ -3248,7 +3250,7 @@ EXPORT_SYMBOL_GPL(ip6_sk_redirect); static unsigned int ip6_default_advmss(const struct dst_entry *dst) { - unsigned int mtu = dst_mtu(dst); + unsigned int mtu = dst6_mtu(dst); struct net *net; mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); @@ -3256,8 +3258,8 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) rcu_read_lock(); net = dst_dev_net_rcu(dst); - if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) - mtu = net->ipv6.sysctl.ip6_rt_min_advmss; + mtu = max_t(unsigned int, mtu, + READ_ONCE(net->ipv6.sysctl.ip6_rt_min_advmss)); rcu_read_unlock(); @@ -3359,10 +3361,10 @@ out: static void ip6_dst_gc(struct dst_ops *ops) { struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops); - int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval; - int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; - int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; - unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + int rt_min_interval = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_min_interval); + int rt_elasticity = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_elasticity); + int rt_gc_timeout = READ_ONCE(net->ipv6.sysctl.ip6_rt_gc_timeout); + unsigned long rt_last_gc = READ_ONCE(net->ipv6.ip6_rt_last_gc); unsigned int val; int entries; @@ -3419,11 +3421,8 @@ static int ip6_route_check_nh_onlink(struct net *net, err = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0, &res); if (!err && !(res.fib6_flags & RTF_REJECT) && - /* ignore match if it is the default route */ - !ipv6_addr_any(&res.f6i->fib6_dst.addr) && - (res.fib6_type != RTN_UNICAST || dev != res.nh->fib_nh_dev)) { - NL_SET_ERR_MSG(extack, - "Nexthop has invalid gateway or device mismatch"); + res.fib6_type != RTN_UNICAST) { + NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway"); err = -EINVAL; } @@ -5008,7 +5007,7 @@ void rt6_sync_down_dev(struct net_device *dev, unsigned long event) }; struct net *net = dev_net(dev); - if (net->ipv6.sysctl.skip_notify_on_dev_down) + if (READ_ONCE(net->ipv6.sysctl.skip_notify_on_dev_down)) fib6_clean_all_skip_notify(net, fib6_ifdown, &arg); else fib6_clean_all(net, fib6_ifdown, &arg); @@ -6408,6 +6407,7 @@ errout: void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, bool offload, bool trap, bool offload_failed) { + u8 fib_notify_on_flag_change; struct sk_buff *skb; int err; @@ -6419,8 +6419,9 @@ void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, WRITE_ONCE(f6i->offload, offload); WRITE_ONCE(f6i->trap, trap); + fib_notify_on_flag_change = READ_ONCE(net->ipv6.sysctl.fib_notify_on_flag_change); /* 2 means send notifications only if offload_failed was changed. 
*/ - if (net->ipv6.sysctl.fib_notify_on_flag_change == 2 && + if (fib_notify_on_flag_change == 2 && READ_ONCE(f6i->offload_failed) == offload_failed) return; @@ -6432,7 +6433,7 @@ void fib6_info_hw_flags_set(struct net *net, struct fib6_info *f6i, */ return; - if (!net->ipv6.sysctl.fib_notify_on_flag_change) + if (!fib_notify_on_flag_change) return; skb = nlmsg_new(rt6_nlmsg_size(f6i), GFP_KERNEL); @@ -6529,7 +6530,7 @@ static int ipv6_sysctl_rtcache_flush(const struct ctl_table *ctl, int write, return ret; net = (struct net *)ctl->extra1; - delay = net->ipv6.sysctl.flush_delay; + delay = READ_ONCE(net->ipv6.sysctl.flush_delay); fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0); return 0; } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index cf37ad9686e6..439c8a1c6625 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -962,7 +962,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, } if (df) { - mtu = dst_mtu(&rt->dst) - t_hlen; + mtu = dst4_mtu(&rt->dst) - t_hlen; if (mtu < IPV4_MIN_MTU) { DEV_STATS_INC(dev, collisions); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 280fe5978559..d10487b4e5bf 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -138,15 +138,15 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; struct inet_connection_sock *icsk = inet_csk(sk); - struct in6_addr *saddr = NULL, *final_p, final; struct inet_timewait_death_row *tcp_death_row; struct ipv6_pinfo *np = tcp_inet6_sk(sk); + struct in6_addr *saddr = NULL, *final_p; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct ipv6_txoptions *opt; struct dst_entry *dst; - struct flowi6 fl6; + struct flowi6 *fl6; int addr_type; int err; @@ -156,14 +156,15 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, if (usin->sin6_family != AF_INET6) return -EAFNOSUPPORT; - memset(&fl6, 0, sizeof(fl6)); + fl6 = &inet_sk(sk)->cork.fl.u.ip6; + memset(fl6, 0, sizeof(*fl6)); if (inet6_test_bit(SNDFLOW, sk)) { - fl6.flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; - IP6_ECN_flow_init(fl6.flowlabel); - if (fl6.flowlabel&IPV6_FLOWLABEL_MASK) { + fl6->flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; + IP6_ECN_flow_init(fl6->flowlabel); + if (fl6->flowlabel & IPV6_FLOWLABEL_MASK) { struct ip6_flowlabel *flowlabel; - flowlabel = fl6_sock_lookup(sk, fl6.flowlabel); + flowlabel = fl6_sock_lookup(sk, fl6->flowlabel); if (IS_ERR(flowlabel)) return -EINVAL; fl6_sock_release(flowlabel); @@ -212,7 +213,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, } sk->sk_v6_daddr = usin->sin6_addr; - np->flow_label = fl6.flowlabel; + np->flow_label = fl6->flowlabel; /* * TCP over IPv4 @@ -260,24 +261,24 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, if (!ipv6_addr_any(&sk->sk_v6_rcv_saddr)) saddr = &sk->sk_v6_rcv_saddr; - fl6.flowi6_proto = IPPROTO_TCP; - fl6.daddr = sk->sk_v6_daddr; - fl6.saddr = saddr ? *saddr : np->saddr; - fl6.flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); - fl6.flowi6_oif = sk->sk_bound_dev_if; - fl6.flowi6_mark = sk->sk_mark; - fl6.fl6_dport = usin->sin6_port; - fl6.fl6_sport = inet->inet_sport; - if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6.fl6_sport) - fl6.flowi6_flags = FLOWI_FLAG_ANY_SPORT; - fl6.flowi6_uid = sk_uid(sk); + fl6->flowi6_proto = IPPROTO_TCP; + fl6->daddr = sk->sk_v6_daddr; + fl6->saddr = saddr ? 
*saddr : np->saddr; + fl6->flowlabel = ip6_make_flowinfo(np->tclass, np->flow_label); + fl6->flowi6_oif = sk->sk_bound_dev_if; + fl6->flowi6_mark = sk->sk_mark; + fl6->fl6_dport = usin->sin6_port; + fl6->fl6_sport = inet->inet_sport; + if (IS_ENABLED(CONFIG_IP_ROUTE_MULTIPATH) && !fl6->fl6_sport) + fl6->flowi6_flags = FLOWI_FLAG_ANY_SPORT; + fl6->flowi6_uid = sk_uid(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); - final_p = fl6_update_dst(&fl6, opt, &final); + final_p = fl6_update_dst(fl6, opt, &np->final); - security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); + security_sk_classify_flow(sk, flowi6_to_flowi_common(fl6)); - dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(net, sk, fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; @@ -287,7 +288,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr_unsized *uaddr, tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { - saddr = &fl6.saddr; + saddr = &fl6->saddr; err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); if (err) @@ -351,7 +352,7 @@ failure: static void tcp_v6_mtu_reduced(struct sock *sk) { struct dst_entry *dst; - u32 mtu; + u32 mtu, dmtu; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) return; @@ -368,8 +369,9 @@ static void tcp_v6_mtu_reduced(struct sock *sk) if (!dst) return; - if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { - tcp_sync_mss(sk, dst_mtu(dst)); + dmtu = dst6_mtu(dst); + if (inet_csk(sk)->icsk_pmtu_cookie > dmtu) { + tcp_sync_mss(sk, dmtu); tcp_simple_retransmit(sk); } } @@ -537,7 +539,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, u8 tclass; /* First, grab a route. */ - if (!dst && (dst = inet6_csk_route_req(sk, fl6, req, + if (!dst && (dst = inet6_csk_route_req(sk, NULL, fl6, req, IPPROTO_TCP)) == NULL) goto done; @@ -787,7 +789,7 @@ static struct dst_entry *tcp_v6_route_req(const struct sock *sk, if (security_inet_conn_request(sk, skb, req)) return NULL; - return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); + return inet6_csk_route_req(sk, NULL, &fl->u.ip6, req, IPPROTO_TCP); } struct request_sock_ops tcp6_request_sock_ops __read_mostly = { @@ -1085,7 +1087,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, txhash = inet_twsk(sk)->tw_txhash; } } else { - if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) + if (READ_ONCE(net->ipv6.sysctl.flowlabel_reflect) & + FLOWLABEL_REFLECT_TCP_RESET) label = ip6_flowlabel(ipv6h); } @@ -1315,12 +1318,12 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * struct request_sock *req_unhash, bool *own_req) { - struct inet_request_sock *ireq; - struct ipv6_pinfo *newnp; const struct ipv6_pinfo *np = tcp_inet6_sk(sk); + struct inet_request_sock *ireq; struct ipv6_txoptions *opt; struct inet_sock *newinet; bool found_dup_sk = false; + struct ipv6_pinfo *newnp; struct tcp_sock *newtp; struct sock *newsk; #ifdef CONFIG_TCP_MD5SIG @@ -1389,11 +1392,9 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * if (sk_acceptq_is_full(sk)) goto exit_overflow; - if (!dst) { - dst = inet6_csk_route_req(sk, &fl6, req, IPPROTO_TCP); - if (!dst) - goto exit; - } + dst = inet6_csk_route_req(sk, dst, &fl6, req, IPPROTO_TCP); + if (!dst) + goto exit; newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) @@ -1409,6 +1410,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * inet6_sk_rx_dst_set(newsk, 
skb); newinet = inet_sk(newsk); + newinet->cork.fl.u.ip6 = fl6; newinet->pinet6 = tcp_inet6_sk(newsk); newinet->ipv6_fl_list = NULL; newinet->inet_opt = NULL; @@ -1466,7 +1468,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * tcp_ca_openreq_child(newsk, dst); - tcp_sync_mss(newsk, dst_mtu(dst)); + tcp_sync_mss(newsk, dst6_mtu(dst)); newtp->advmss = tcp_mss_clamp(tcp_sk(sk), dst_metric_advmss(dst)); tcp_initialize_rcv_mss(newsk); @@ -2331,6 +2333,8 @@ struct proto tcpv6_prot = { .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), + .freeptr_offset = offsetof(struct tcp6_sock, + tcp.inet_conn.icsk_inet.sk.sk_freeptr), .ipv6_pinfo_offset = offsetof(struct tcp6_sock, inet6), .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index 5670d32c27f8..f2a659cd6183 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -24,9 +24,6 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, struct net *net; int iif, sdif; - if (likely(!(skb->dev->features & NETIF_F_GRO_FRAGLIST))) - return; - p = tcp_gro_lookup(head, th); if (p) { NAPI_GRO_CB(skb)->is_flist = NAPI_GRO_CB(p)->is_flist; @@ -45,8 +42,8 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, #endif /* IS_ENABLED(CONFIG_IPV6) */ } -INDIRECT_CALLABLE_SCOPE -struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) +static __always_inline struct sk_buff *tcp6_gro_receive(struct list_head *head, + struct sk_buff *skb) { struct tcphdr *th; @@ -60,7 +57,8 @@ struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb) if (!th) goto flush; - tcp6_check_fraglist_gro(head, skb, th); + if (unlikely(skb->dev->features & NETIF_F_GRO_FRAGLIST)) + tcp6_check_fraglist_gro(head, skb, th); return tcp_gro_receive(head, skb, th); @@ -69,7 +67,7 @@ flush: return NULL; } -INDIRECT_CALLABLE_SCOPE int tcp6_gro_complete(struct sk_buff *skb, int thoff) +static __always_inline int tcp6_gro_complete(struct sk_buff *skb, int thoff) { const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct ipv6hdr *iph = (struct ipv6hdr *)(skb->data + offset); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 794c13674e8a..010b909275dd 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -875,7 +875,8 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) /* * UDP-Lite specific tests, ignored on UDP sockets (see net/ipv4/udp.c). 
*/ - if (udp_test_bit(UDPLITE_RECV_CC, sk) && UDP_SKB_CB(skb)->partial_cov) { + if (unlikely(udp_test_bit(UDPLITE_RECV_CC, sk) && + UDP_SKB_CB(skb)->partial_cov)) { u16 pcrlen = READ_ONCE(up->pcrlen); if (pcrlen == 0) { /* full coverage was set */ @@ -1439,7 +1440,7 @@ csum_partial: send: err = ip6_send_skb(skb); - if (err) { + if (unlikely(err)) { if (err == -ENOBUFS && !inet6_test_bit(RECVERR6, sk)) { UDP6_INC_STATS(sock_net(sk), UDP_MIB_SNDBUFERRORS, is_udplite); diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 046f13b1d77a..e003b8494dc0 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -132,7 +132,6 @@ static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, sdif, net->ipv4.udp_table, NULL); } -INDIRECT_CALLABLE_SCOPE struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); @@ -165,7 +164,7 @@ flush: return NULL; } -INDIRECT_CALLABLE_SCOPE int udp6_gro_complete(struct sk_buff *skb, int nhoff) +int udp6_gro_complete(struct sk_buff *skb, int nhoff) { const u16 offset = NAPI_GRO_CB(skb)->network_offsets[skb->encapsulation]; const struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + offset); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index da2af413c89d..b43b1059eea8 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -313,13 +313,12 @@ static union iucv_param *iucv_param[NR_CPUS]; static union iucv_param *iucv_param_irq[NR_CPUS]; /** - * __iucv_call_b2f0 + * __iucv_call_b2f0 - Calls CP to execute IUCV commands. + * * @command: identifier of IUCV call to CP. * @parm: pointer to a struct iucv_parm block * - * Calls CP to execute IUCV commands. - * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ static inline int __iucv_call_b2f0(int command, union iucv_param *parm) { @@ -348,11 +347,10 @@ static inline int iucv_call_b2f0(int command, union iucv_param *parm) } /* - * iucv_query_maxconn + * iucv_query_maxconn - Determine the maximum number of connections that + * may be established. * - * Determines the maximum number of connections that may be established. - * - * Returns the maximum number of connections or -EPERM is IUCV is not + * Returns: the maximum number of connections or -EPERM is IUCV is not * available. */ static int __iucv_query_maxconn(void *param, unsigned long *max_pathid) @@ -391,10 +389,9 @@ static int iucv_query_maxconn(void) } /** - * iucv_allow_cpu - * @data: unused + * iucv_allow_cpu - Allow iucv interrupts on this cpu. * - * Allow iucv interrupts on this cpu. + * @data: unused */ static void iucv_allow_cpu(void *data) { @@ -432,10 +429,9 @@ static void iucv_allow_cpu(void *data) } /** - * iucv_block_cpu - * @data: unused + * iucv_block_cpu - Block iucv interrupts on this cpu. * - * Block iucv interrupts on this cpu. + * @data: unused */ static void iucv_block_cpu(void *data) { @@ -452,10 +448,9 @@ static void iucv_block_cpu(void *data) } /** - * iucv_declare_cpu - * @data: unused + * iucv_declare_cpu - Declare a interrupt buffer on this cpu. * - * Declare a interrupt buffer on this cpu. + * @data: unused */ static void iucv_declare_cpu(void *data) { @@ -507,10 +502,9 @@ static void iucv_declare_cpu(void *data) } /** - * iucv_retrieve_cpu - * @data: unused + * iucv_retrieve_cpu - Retrieve interrupt buffer on this cpu. * - * Retrieve interrupt buffer on this cpu. 
+ * @data: unused */ static void iucv_retrieve_cpu(void *data) { @@ -532,9 +526,7 @@ static void iucv_retrieve_cpu(void *data) } /* - * iucv_setmask_mp - * - * Allow iucv interrupts on all cpus. + * iucv_setmask_mp - Allow iucv interrupts on all cpus. */ static void iucv_setmask_mp(void) { @@ -551,9 +543,7 @@ static void iucv_setmask_mp(void) } /* - * iucv_setmask_up - * - * Allow iucv interrupts on a single cpu. + * iucv_setmask_up - Allow iucv interrupts on a single cpu. */ static void iucv_setmask_up(void) { @@ -568,12 +558,11 @@ static void iucv_setmask_up(void) } /* - * iucv_enable + * iucv_enable - Make the iucv ready for use * - * This function makes iucv ready for use. It allocates the pathid - * table, declares an iucv interrupt buffer and enables the iucv - * interrupts. Called when the first user has registered an iucv - * handler. + * It allocates the pathid table, declares an iucv interrupt buffer and + * enables the iucv interrupts. Called when the first user has registered + * an iucv handler. */ static int iucv_enable(void) { @@ -603,11 +592,10 @@ out: } /* - * iucv_disable + * iucv_disable - Shuts down iucv. * - * This function shuts down iucv. It disables iucv interrupts, retrieves - * the iucv interrupt buffer and frees the pathid table. Called after the - * last user unregister its iucv handler. + * It disables iucv interrupts, retrieves the iucv interrupt buffer and frees + * the pathid table. Called after the last user unregister its iucv handler. */ static void iucv_disable(void) { @@ -695,11 +683,10 @@ __free_cpumask: } /** - * iucv_sever_pathid + * iucv_sever_pathid - Sever an iucv path to free up the pathid. Used internally. + * * @pathid: path identification number. * @userdata: 16-bytes of user data. - * - * Sever an iucv path to free up the pathid. Used internally. */ static int iucv_sever_pathid(u16 pathid, u8 *userdata) { @@ -714,22 +701,20 @@ static int iucv_sever_pathid(u16 pathid, u8 *userdata) } /** - * __iucv_cleanup_queue - * @dummy: unused dummy argument + * __iucv_cleanup_queue - Nop function called via smp_call_function to force + * work items from pending external iucv interrupts to the work queue. * - * Nop function called via smp_call_function to force work items from - * pending external iucv interrupts to the work queue. + * @dummy: unused dummy argument */ static void __iucv_cleanup_queue(void *dummy) { } /** - * iucv_cleanup_queue + * iucv_cleanup_queue - Called after a path has been severed to find all + * remaining work items for the now stale pathid. * - * Function called after a path has been severed to find all remaining - * work items for the now stale pathid. The caller needs to hold the - * iucv_table_lock. + * The caller needs to hold the iucv_table_lock. */ static void iucv_cleanup_queue(void) { @@ -757,13 +742,12 @@ static void iucv_cleanup_queue(void) } /** - * iucv_register: + * iucv_register - Registers a driver with IUCV. + * * @handler: address of iucv handler structure * @smp: != 0 indicates that the handler can deal with out of order messages * - * Registers a driver with IUCV. - * - * Returns 0 on success, -ENOMEM if the memory allocation for the pathid + * Returns: 0 on success, -ENOMEM if the memory allocation for the pathid * table failed, or -EIO if IUCV_DECLARE_BUFFER failed on all cpus. */ int iucv_register(struct iucv_handler *handler, int smp) @@ -794,11 +778,10 @@ out_mutex: EXPORT_SYMBOL(iucv_register); /** - * iucv_unregister + * iucv_unregister - Unregister driver from IUCV. 
+ * * @handler: address of iucv handler structure * @smp: != 0 indicates that the handler can deal with out of order messages - * - * Unregister driver from IUCV. */ void iucv_unregister(struct iucv_handler *handler, int smp) { @@ -852,7 +835,8 @@ static struct notifier_block iucv_reboot_notifier = { }; /** - * iucv_path_accept + * iucv_path_accept - Complete the IUCV communication path + * * @path: address of iucv path structure * @handler: address of iucv handler structure * @userdata: 16 bytes of data reflected to the communication partner @@ -861,7 +845,7 @@ static struct notifier_block iucv_reboot_notifier = { * This function is issued after the user received a connection pending * external interrupt and now wishes to complete the IUCV communication path. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_accept(struct iucv_path *path, struct iucv_handler *handler, u8 *userdata, void *private) @@ -896,7 +880,8 @@ out: EXPORT_SYMBOL(iucv_path_accept); /** - * iucv_path_connect + * iucv_path_connect - Establish an IUCV path + * * @path: address of iucv path structure * @handler: address of iucv handler structure * @userid: 8-byte user identification @@ -908,7 +893,7 @@ EXPORT_SYMBOL(iucv_path_accept); * successfully, you are not able to use the path until you receive an IUCV * Connection Complete external interrupt. * - * Returns the result of the CP IUCV call. + * Returns: the result of the CP IUCV call. */ int iucv_path_connect(struct iucv_path *path, struct iucv_handler *handler, u8 *userid, u8 *system, u8 *userdata, @@ -964,14 +949,14 @@ out: EXPORT_SYMBOL(iucv_path_connect); /** - * iucv_path_quiesce: + * iucv_path_quiesce - Temporarily suspend incoming messages * @path: address of iucv path structure * @userdata: 16 bytes of data reflected to the communication partner * * This function temporarily suspends incoming messages on an IUCV path. * You can later reactivate the path by invoking the iucv_resume function. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_quiesce(struct iucv_path *path, u8 *userdata) { @@ -996,14 +981,15 @@ out: EXPORT_SYMBOL(iucv_path_quiesce); /** - * iucv_path_resume: + * iucv_path_resume - Resume incoming messages on a suspended IUCV path + * * @path: address of iucv path structure * @userdata: 16 bytes of data reflected to the communication partner * * This function resumes incoming messages on an IUCV path that has * been stopped with iucv_path_quiesce. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_resume(struct iucv_path *path, u8 *userdata) { @@ -1027,13 +1013,12 @@ out: } /** - * iucv_path_sever + * iucv_path_sever - Terminates an IUCV path. + * * @path: address of iucv path structure * @userdata: 16 bytes of data reflected to the communication partner * - * This function terminates an IUCV path. - * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_path_sever(struct iucv_path *path, u8 *userdata) { @@ -1058,14 +1043,13 @@ out: EXPORT_SYMBOL(iucv_path_sever); /** - * iucv_message_purge + * iucv_message_purge - Cancels a message you have sent. + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @srccls: source class of message * - * Cancels a message you have sent. - * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. 
*/ int iucv_message_purge(struct iucv_path *path, struct iucv_message *msg, u32 srccls) @@ -1096,13 +1080,15 @@ out: EXPORT_SYMBOL(iucv_message_purge); /** - * iucv_message_receive_iprmdata + * iucv_message_receive_iprmdata - Internal function to receive RMDATA + * stored in &struct iucv_message + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @flags: how the message is received (IUCV_IPBUFLST) * @buffer: address of data buffer or address of struct iucv_array * @size: length of data buffer - * @residual: + * @residual: number of bytes remaining in the data buffer * * Internal function used by iucv_message_receive and __iucv_message_receive * to receive RMDATA data stored in struct iucv_message. @@ -1140,10 +1126,11 @@ static int iucv_message_receive_iprmdata(struct iucv_path *path, } /** - * __iucv_message_receive + * __iucv_message_receive - Receives messages on an established path (no locking) + * * @path: address of iucv path structure * @msg: address of iucv msg structure - * @flags: how the message is received (IUCV_IPBUFLST) + * @flags: flags that affect how the message is received (IUCV_IPBUFLST) * @buffer: address of data buffer or address of struct iucv_array * @size: length of data buffer * @residual: @@ -1154,7 +1141,7 @@ static int iucv_message_receive_iprmdata(struct iucv_path *path, * * Locking: no locking * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, size_t *residual) @@ -1188,10 +1175,11 @@ int __iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, EXPORT_SYMBOL(__iucv_message_receive); /** - * iucv_message_receive + * iucv_message_receive - Receives messages on an established path, with locking + * * @path: address of iucv path structure * @msg: address of iucv msg structure - * @flags: how the message is received (IUCV_IPBUFLST) + * @flags: flags that affect how the message is received (IUCV_IPBUFLST) * @buffer: address of data buffer or address of struct iucv_array * @size: length of data buffer * @residual: @@ -1202,7 +1190,7 @@ EXPORT_SYMBOL(__iucv_message_receive); * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *buffer, size_t size, size_t *residual) @@ -1220,7 +1208,8 @@ int iucv_message_receive(struct iucv_path *path, struct iucv_message *msg, EXPORT_SYMBOL(iucv_message_receive); /** - * iucv_message_reject + * iucv_message_reject - Refuses a specified message + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @@ -1228,7 +1217,7 @@ EXPORT_SYMBOL(iucv_message_receive); * are notified of a message and the time that you complete the message, * the message may be rejected. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. 
*/ int iucv_message_reject(struct iucv_path *path, struct iucv_message *msg) { @@ -1254,7 +1243,8 @@ out: EXPORT_SYMBOL(iucv_message_reject); /** - * iucv_message_reply + * iucv_message_reply - Replies to a specified message + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @flags: how the reply is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST) @@ -1262,11 +1252,11 @@ EXPORT_SYMBOL(iucv_message_reject); * @size: length of reply data buffer * * This function responds to the two-way messages that you receive. You - * must identify completely the message to which you wish to reply. ie, + * must identify completely the message to which you wish to reply. I.e., * pathid, msgid, and trgcls. Prmmsg signifies the data is moved into * the parameter list. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_reply(struct iucv_path *path, struct iucv_message *msg, u8 flags, void *reply, size_t size) @@ -1303,7 +1293,8 @@ out: EXPORT_SYMBOL(iucv_message_reply); /** - * __iucv_message_send + * __iucv_message_send - Transmits a one-way message, no locking + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST) @@ -1317,7 +1308,7 @@ EXPORT_SYMBOL(iucv_message_reply); * * Locking: no locking * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int __iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size) @@ -1357,7 +1348,8 @@ out: EXPORT_SYMBOL(__iucv_message_send); /** - * iucv_message_send + * iucv_message_send - Transmits a one-way message, with locking + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @flags: how the message is sent (IUCV_IPRMDATA, IUCV_IPPRTY, IUCV_IPBUFLST) @@ -1371,7 +1363,7 @@ EXPORT_SYMBOL(__iucv_message_send); * * Locking: local_bh_enable/local_bh_disable * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size) @@ -1386,7 +1378,8 @@ int iucv_message_send(struct iucv_path *path, struct iucv_message *msg, EXPORT_SYMBOL(iucv_message_send); /** - * iucv_message_send2way + * iucv_message_send2way - Transmits a two-way message + * * @path: address of iucv path structure * @msg: address of iucv msg structure * @flags: how the message is sent and the reply is received @@ -1403,7 +1396,7 @@ EXPORT_SYMBOL(iucv_message_send); * reply to the message and a buffer is provided into which IUCV moves * the reply to this message. * - * Returns the result from the CP IUCV call. + * Returns: the result from the CP IUCV call. */ int iucv_message_send2way(struct iucv_path *path, struct iucv_message *msg, u8 flags, u32 srccls, void *buffer, size_t size, @@ -1462,11 +1455,11 @@ struct iucv_path_pending { } __packed; /** - * iucv_path_pending + * iucv_path_pending - Process connection pending work item + * * @data: Pointer to external interrupt buffer * - * Process connection pending work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. 
*/ static void iucv_path_pending(struct iucv_irq_data *data) { @@ -1523,11 +1516,11 @@ struct iucv_path_complete { } __packed; /** - * iucv_path_complete + * iucv_path_complete - Process connection complete work item + * * @data: Pointer to external interrupt buffer * - * Process connection complete work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. */ static void iucv_path_complete(struct iucv_irq_data *data) { @@ -1553,11 +1546,11 @@ struct iucv_path_severed { } __packed; /** - * iucv_path_severed + * iucv_path_severed - Process connection severed work item. + * * @data: Pointer to external interrupt buffer * - * Process connection severed work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. */ static void iucv_path_severed(struct iucv_irq_data *data) { @@ -1589,11 +1582,11 @@ struct iucv_path_quiesced { } __packed; /** - * iucv_path_quiesced + * iucv_path_quiesced - Process connection quiesced work item. + * * @data: Pointer to external interrupt buffer * - * Process connection quiesced work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. */ static void iucv_path_quiesced(struct iucv_irq_data *data) { @@ -1617,11 +1610,11 @@ struct iucv_path_resumed { } __packed; /** - * iucv_path_resumed + * iucv_path_resumed - Process connection resumed work item. + * * @data: Pointer to external interrupt buffer * - * Process connection resumed work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. */ static void iucv_path_resumed(struct iucv_irq_data *data) { @@ -1648,11 +1641,11 @@ struct iucv_message_complete { } __packed; /** - * iucv_message_complete + * iucv_message_complete - Process message complete work item. + * * @data: Pointer to external interrupt buffer * - * Process message complete work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. */ static void iucv_message_complete(struct iucv_irq_data *data) { @@ -1695,11 +1688,11 @@ struct iucv_message_pending { } __packed; /** - * iucv_message_pending + * iucv_message_pending - Process message pending work item. + * * @data: Pointer to external interrupt buffer * - * Process message pending work item. Called from tasklet while holding - * iucv_table_lock. + * Context: Called from tasklet while holding iucv_table_lock. */ static void iucv_message_pending(struct iucv_irq_data *data) { @@ -1722,7 +1715,7 @@ static void iucv_message_pending(struct iucv_irq_data *data) } /* - * iucv_tasklet_fn: + * iucv_tasklet_fn - Process the queue of IRQ buffers * * This tasklet loops over the queue of irq buffers created by * iucv_external_interrupt, calls the appropriate action handler @@ -1766,7 +1759,7 @@ static void iucv_tasklet_fn(unsigned long ignored) } /* - * iucv_work_fn: + * iucv_work_fn - Process the queue of path pending IRQ blocks * * This work function loops over the queue of path pending irq blocks * created by iucv_external_interrupt, calls the appropriate action @@ -1797,9 +1790,8 @@ static void iucv_work_fn(struct work_struct *work) } /* - * iucv_external_interrupt + * iucv_external_interrupt - Handles external interrupts coming in from CP. * - * Handles external interrupts coming in from CP. * Places the interrupt buffer on a queue and schedules iucv_tasklet_fn(). 
*/ static void iucv_external_interrupt(struct ext_code ext_code, @@ -1857,10 +1849,9 @@ struct iucv_interface iucv_if = { EXPORT_SYMBOL(iucv_if); static enum cpuhp_state iucv_online; + /** - * iucv_init - * - * Allocates and initializes various data structures. + * iucv_init - Allocates and initializes various data structures. */ static int __init iucv_init(void) { @@ -1924,9 +1915,7 @@ out: } /** - * iucv_exit - * - * Frees everything allocated from iucv_init. + * iucv_exit - Frees everything allocated from iucv_init. */ static void __exit iucv_exit(void) { diff --git a/net/mac80211/Makefile b/net/mac80211/Makefile index a33884967f21..b0e392eb7753 100644 --- a/net/mac80211/Makefile +++ b/net/mac80211/Makefile @@ -36,7 +36,7 @@ mac80211-y := \ tdls.o \ ocb.o \ airtime.o \ - eht.o + eht.o uhr.o mac80211-$(CONFIG_MAC80211_LEDS) += led.o mac80211-$(CONFIG_MAC80211_DEBUGFS) += \ diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c81091a5cc3a..5d04d7d550b0 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include <linux/ieee80211.h> @@ -680,10 +680,18 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, * association has completed, this rejects that attempt * so it will set the key again after association. * + * With (re)association frame encryption enabled, cfg80211 + * may deliver keys to mac80211 before the station has + * associated. In that case, accept the key if the station + * is an Enhanced Privacy Protection (EPP) peer. + * If (re)association frame encryption support is not present, + * cfg80211 will not allow key installation in non‑AP STA mode. + * * TODO: accept the key if we have a station entry and - * add it to the device after the station. + * add it to the device after the station associates. 
*/ - if (!sta || !test_sta_flag(sta, WLAN_STA_ASSOC)) { + if (!sta || (!sta->sta.epp_peer && + !test_sta_flag(sta, WLAN_STA_ASSOC))) { ieee80211_key_free_unused(key); return -ENOENT; } @@ -1600,6 +1608,13 @@ static int ieee80211_start_ap(struct wiphy *wiphy, struct net_device *dev, link_conf->eht_mu_beamformer = false; } + if (params->uhr_oper) { + if (!link_conf->eht_support) + return -EOPNOTSUPP; + + link_conf->uhr_support = true; + } + if (sdata->vif.type == NL80211_IFTYPE_AP && params->mbssid_config.tx_wdev) { err = ieee80211_set_ap_mbssid_options(sdata, @@ -1908,7 +1923,7 @@ static int ieee80211_stop_ap(struct wiphy *wiphy, struct net_device *dev, if (sdata->wdev.links[link_id].cac_started) { chandef = link_conf->chanreq.oper; - wiphy_delayed_work_cancel(wiphy, &link->dfs_cac_timer_work); + wiphy_hrtimer_work_cancel(wiphy, &link->dfs_cac_timer_work); cfg80211_cac_event(sdata->dev, &chandef, NL80211_RADAR_CAC_ABORTED, GFP_KERNEL, link_id); @@ -2077,6 +2092,7 @@ static int sta_link_apply_parameters(struct ieee80211_local *local, params->vht_capa || params->he_capa || params->eht_capa || + params->uhr_capa || params->s1g_capa || params->opmode_notif_used; @@ -2125,8 +2141,7 @@ static int sta_link_apply_parameters(struct ieee80211_local *local, if (params->supported_rates && params->supported_rates_len && - !ieee80211_parse_bitrates(link->conf->chanreq.oper.width, - sband, params->supported_rates, + !ieee80211_parse_bitrates(sband, params->supported_rates, params->supported_rates_len, &link_sta->pub->supp_rates[sband->band])) return -EINVAL; @@ -2156,6 +2171,12 @@ static int sta_link_apply_parameters(struct ieee80211_local *local, params->eht_capa_len, link_sta); + if (params->uhr_capa) + ieee80211_uhr_cap_ie_to_sta_uhr_cap(sdata, sband, + params->uhr_capa, + params->uhr_capa_len, + link_sta); + if (params->s1g_capa) ieee80211_s1g_cap_to_sta_s1g_cap(sdata, params->s1g_capa, link_sta); @@ -2199,6 +2220,9 @@ static int sta_apply_parameters(struct ieee80211_local *local, mask = params->sta_flags_mask; set = params->sta_flags_set; + if (params->epp_peer) + sta->sta.epp_peer = true; + if (ieee80211_vif_is_mesh(&sdata->vif)) { /* * In mesh mode, ASSOCIATED isn't part of the nl80211 @@ -2987,8 +3011,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy, return -EINVAL; if (params->basic_rates) { - if (!ieee80211_parse_bitrates(link->conf->chanreq.oper.width, - wiphy->bands[sband->band], + if (!ieee80211_parse_bitrates(sband, params->basic_rates, params->basic_rates_len, &link->conf->basic_rates)) @@ -3865,8 +3888,8 @@ static int ieee80211_start_radar_detection(struct wiphy *wiphy, if (err) return err; - wiphy_delayed_work_queue(wiphy, &link_data->dfs_cac_timer_work, - msecs_to_jiffies(cac_time_ms)); + wiphy_hrtimer_work_queue(wiphy, &link_data->dfs_cac_timer_work, + ms_to_ktime(cac_time_ms)); return 0; } @@ -3885,7 +3908,7 @@ static void ieee80211_end_cac(struct wiphy *wiphy, if (!link_data) continue; - wiphy_delayed_work_cancel(wiphy, + wiphy_hrtimer_work_cancel(wiphy, &link_data->dfs_cac_timer_work); if (sdata->wdev.links[link_id].cac_started) { @@ -4151,12 +4174,21 @@ static int __ieee80211_csa_finalize(struct ieee80211_link_data *link_data) static void ieee80211_csa_finalize(struct ieee80211_link_data *link_data) { struct ieee80211_sub_if_data *sdata = link_data->sdata; + int link_id = -1; if (__ieee80211_csa_finalize(link_data)) { sdata_info(sdata, "failed to finalize CSA on link %d, disconnecting\n", link_data->link_id); - cfg80211_stop_iface(sdata->local->hw.wiphy, &sdata->wdev, - 
GFP_KERNEL); + if (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_P2P_GO) + /* + * link_id is expected only for AP/P2P_GO type + * currently + */ + link_id = link_data->link_id; + + cfg80211_stop_link(sdata->local->hw.wiphy, &sdata->wdev, + link_id, GFP_KERNEL); } } @@ -4400,7 +4432,7 @@ __ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev, goto out; /* if reservation is invalid then this will fail */ - err = ieee80211_check_combinations(sdata, NULL, chanctx->mode, 0, -1); + err = ieee80211_check_combinations(sdata, NULL, 0, 0, -1); if (err) { ieee80211_link_unreserve_chanctx(link_data); goto out; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 55105d238d6b..51bf3c7822a7 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1772,4 +1772,25 @@ drv_prep_add_interface(struct ieee80211_local *local, trace_drv_return_void(local); } +static inline int drv_set_eml_op_mode(struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, + struct ieee80211_eml_params *eml_params) +{ + struct ieee80211_local *local = sdata->local; + int ret = -EOPNOTSUPP; + + might_sleep(); + lockdep_assert_wiphy(local->hw.wiphy); + + trace_drv_set_eml_op_mode(local, sdata, sta, eml_params->link_id, + eml_params->control, + eml_params->link_bitmap); + if (local->ops->set_eml_op_mode) + ret = local->ops->set_eml_op_mode(&local->hw, &sdata->vif, + sta, eml_params); + trace_drv_return_int(local, ret); + + return ret; +} + #endif /* __MAC80211_DRIVER_OPS */ diff --git a/net/mac80211/drop.h b/net/mac80211/drop.h index eb9ab310f91c..f06a8aa905c5 100644 --- a/net/mac80211/drop.h +++ b/net/mac80211/drop.h @@ -2,7 +2,7 @@ /* * mac80211 drop reason list * - * Copyright (C) 2023-2024 Intel Corporation + * Copyright (C) 2023-2024, 2026 Intel Corporation */ #ifndef MAC80211_DROP_H @@ -65,6 +65,49 @@ typedef unsigned int __bitwise ieee80211_rx_result; /* 0x30 */ \ R(RX_DROP_U_BAD_MGMT_KEYIDX) \ R(RX_DROP_U_UNKNOWN_ACTION_REJECTED) \ + R(RX_DROP_U_MESH_DS_BITS) \ + R(RX_DROP_U_MESH_A3_MISMATCH) \ + R(RX_DROP_U_MESH_NO_A4) \ + R(RX_DROP_U_MESH_A4_MISMATCH) \ + R(RX_DROP_U_MESH_UNEXP_DATA) \ + R(RX_DROP_U_MESH_WRONG_ACTION) \ + R(RX_DROP_U_MESH_UNEXP_MGMT) \ + R(RX_DROP_U_SPURIOUS_NOTIF) \ + R(RX_DROP_U_RUNT_DATA) \ + R(RX_DROP_U_KEY_TAINTED) \ + R(RX_DROP_U_UNPROTECTED) \ + R(RX_DROP_U_MCAST_FRAGMENT) \ + R(RX_DROP_U_DEFRAG_MISMATCH) \ + R(RX_DROP_U_RUNT_MESH_DATA) \ + /* 0x40 */ \ + R(RX_DROP_U_MESH_NO_TTL) \ + R(RX_DROP_U_MESH_RMC) \ + R(RX_DROP_U_MESH_BAD_AE) \ + R(RX_DROP_U_MESH_TTL_EXPIRED) \ + R(RX_DROP_U_MESH_NOT_FORWARDING) \ + R(RX_DROP_U_AMSDU_WITHOUT_DATA) \ + R(RX_DROP_U_NULL_DATA) \ + R(RX_DROP_U_UNEXPECTED_4ADDR) \ + R(RX_DROP_U_PORT_CONTROL) \ + R(RX_DROP_U_UNKNOWN_STA) \ + R(RX_DROP_U_RUNT_BAR) \ + R(RX_DROP_U_BAR_OUTSIDE_SESSION) \ + R(RX_DROP_U_CTRL_FRAME) \ + R(RX_DROP_U_RUNT_MGMT) \ + R(RX_DROP_U_EXPECTED_MGMT) \ + R(RX_DROP_U_NONBCAST_BEACON) \ + /* 0x50 */ \ + R(RX_DROP_U_MALFORMED_ACTION) \ + R(RX_DROP_U_UNKNOWN_MCAST_ACTION) \ + R(RX_DROP_U_UNEXPECTED_EXT_FRAME) \ + R(RX_DROP_U_UNHANDLED_MGMT) \ + R(RX_DROP_U_MCAST_DEAUTH) \ + R(RX_DROP_U_UNHANDLED_DEAUTH) \ + R(RX_DROP_U_MCAST_DISASSOC) \ + R(RX_DROP_U_UNHANDLED_DISASSOC) \ + R(RX_DROP_U_UNHANDLED_PREQ) \ + R(RX_DROP_U_UNHANDLED_MGMT_STYPE) \ + R(RX_DROP_U_NO_LINK) \ /* this line for the trailing \ - add before this */ /* having two enums allows for checking ieee80211_rx_result use with sparse */ @@ -85,7 +128,6 @@ enum ___mac80211_drop_reason { enum 
mac80211_drop_reason { RX_CONTINUE = (__force ieee80211_rx_result)___RX_CONTINUE, RX_QUEUED = (__force ieee80211_rx_result)___RX_QUEUED, - RX_DROP = (__force ieee80211_rx_result)___RX_DROP_UNUSABLE, #define DEF(x) x = (__force ieee80211_rx_result)___ ## x, MAC80211_DROP_REASONS_UNUSABLE(DEF) #undef DEF diff --git a/net/mac80211/eht.c b/net/mac80211/eht.c index fd41046e3b68..75096b2195d2 100644 --- a/net/mac80211/eht.c +++ b/net/mac80211/eht.c @@ -5,6 +5,7 @@ * Copyright(c) 2021-2025 Intel Corporation */ +#include "driver-ops.h" #include "ieee80211_i.h" void @@ -102,3 +103,177 @@ ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata, ieee80211_sta_recalc_aggregates(&link_sta->sta->sta); } + +static void +ieee80211_send_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, + struct ieee80211_mgmt *req, int opt_len) +{ + int len = offsetofend(struct ieee80211_mgmt, u.action.u.eml_omn); + struct ieee80211_local *local = sdata->local; + struct ieee80211_mgmt *mgmt; + struct sk_buff *skb; + + len += opt_len; /* optional len */ + skb = dev_alloc_skb(local->tx_headroom + len); + if (!skb) + return; + + skb_reserve(skb, local->tx_headroom); + mgmt = skb_put_zero(skb, len); + mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_ACTION); + memcpy(mgmt->da, req->sa, ETH_ALEN); + memcpy(mgmt->sa, sdata->vif.addr, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); + + mgmt->u.action.category = WLAN_CATEGORY_PROTECTED_EHT; + mgmt->u.action.u.eml_omn.action_code = + WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF; + mgmt->u.action.u.eml_omn.dialog_token = + req->u.action.u.eml_omn.dialog_token; + mgmt->u.action.u.eml_omn.control = req->u.action.u.eml_omn.control & + ~(IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE | + IEEE80211_EML_CTRL_INDEV_COEX_ACT); + /* Copy optional fields from the received notification frame */ + memcpy(mgmt->u.action.u.eml_omn.variable, + req->u.action.u.eml_omn.variable, opt_len); + + ieee80211_tx_skb(sdata, skb); +} + +void ieee80211_rx_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb) +{ + int len = offsetofend(struct ieee80211_mgmt, u.action.u.eml_omn); + enum nl80211_iftype type = ieee80211_vif_type_p2p(&sdata->vif); + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + const struct wiphy_iftype_ext_capab *ift_ext_capa; + struct ieee80211_mgmt *mgmt = (void *)skb->data; + struct ieee80211_local *local = sdata->local; + u8 control = mgmt->u.action.u.eml_omn.control; + u8 *ptr = mgmt->u.action.u.eml_omn.variable; + struct ieee80211_eml_params eml_params = { + .link_id = status->link_id, + }; + struct sta_info *sta; + int opt_len = 0; + + if (!ieee80211_vif_is_mld(&sdata->vif)) + return; + + /* eMLSR and eMLMR can't be enabled at the same time */ + if ((control & IEEE80211_EML_CTRL_EMLSR_MODE) && + (control & IEEE80211_EML_CTRL_EMLMR_MODE)) + return; + + if ((control & IEEE80211_EML_CTRL_EMLMR_MODE) && + (control & IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE)) + return; + + ift_ext_capa = cfg80211_get_iftype_ext_capa(local->hw.wiphy, type); + if (!ift_ext_capa) + return; + + if (!status->link_valid) + return; + + sta = sta_info_get_bss(sdata, mgmt->sa); + if (!sta) + return; + + if (control & IEEE80211_EML_CTRL_EMLSR_MODE) { + u8 emlsr_param_update_len; + + if (!(ift_ext_capa->eml_capabilities & + IEEE80211_EML_CAP_EMLSR_SUPP)) + return; + + opt_len += sizeof(__le16); /* eMLSR link_bitmap */ + /* eMLSR param update field is not part of Notfication frame + * sent by the AP to client so account it separately. 
+ */ + emlsr_param_update_len = + !!(control & IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE); + + if (skb->len < len + opt_len + emlsr_param_update_len) + return; + + if (control & IEEE80211_EML_CTRL_EMLSR_PARAM_UPDATE) { + u8 pad_delay, trans_delay; + + pad_delay = u8_get_bits(ptr[2], + IEEE80211_EML_EMLSR_PAD_DELAY); + if (pad_delay > + IEEE80211_EML_CAP_EMLSR_PADDING_DELAY_256US) + return; + + trans_delay = u8_get_bits(ptr[2], + IEEE80211_EML_EMLSR_TRANS_DELAY); + if (trans_delay > + IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY_256US) + return; + + /* Update sta padding and transition delay */ + sta->sta.eml_cap = + u8_replace_bits(sta->sta.eml_cap, + pad_delay, + IEEE80211_EML_CAP_EMLSR_PADDING_DELAY); + sta->sta.eml_cap = + u8_replace_bits(sta->sta.eml_cap, + trans_delay, + IEEE80211_EML_CAP_EMLSR_TRANSITION_DELAY); + } + } + + if (control & IEEE80211_EML_CTRL_EMLMR_MODE) { + u8 mcs_map_size; + int i; + + if (!(ift_ext_capa->eml_capabilities & + IEEE80211_EML_CAP_EMLMR_SUPPORT)) + return; + + opt_len += sizeof(__le16); /* eMLMR link_bitmap */ + opt_len++; /* eMLMR mcs_map_count */ + if (skb->len < len + opt_len) + return; + + eml_params.emlmr_mcs_map_count = ptr[2]; + if (eml_params.emlmr_mcs_map_count > 2) + return; + + mcs_map_size = 3 * (1 + eml_params.emlmr_mcs_map_count); + opt_len += mcs_map_size; + if (skb->len < len + opt_len) + return; + + for (i = 0; i < mcs_map_size; i++) { + u8 rx_mcs, tx_mcs; + + rx_mcs = u8_get_bits(ptr[3 + i], + IEEE80211_EML_EMLMR_RX_MCS_MAP); + if (rx_mcs > 8) + return; + + tx_mcs = u8_get_bits(ptr[3 + i], + IEEE80211_EML_EMLMR_TX_MCS_MAP); + if (tx_mcs > 8) + return; + } + + memcpy(eml_params.emlmr_mcs_map_bw, &ptr[3], mcs_map_size); + } + + if ((control & IEEE80211_EML_CTRL_EMLSR_MODE) || + (control & IEEE80211_EML_CTRL_EMLMR_MODE)) { + eml_params.link_bitmap = get_unaligned_le16(ptr); + if ((eml_params.link_bitmap & sdata->vif.active_links) != + eml_params.link_bitmap) + return; + } + + if (drv_set_eml_op_mode(sdata, &sta->sta, &eml_params)) + return; + + ieee80211_send_eml_op_mode_notif(sdata, mgmt, opt_len); +} diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index bd573f8e61fb..e60b814dd89e 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2015 Intel Mobile Communications GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #ifndef IEEE80211_I_H @@ -394,9 +394,10 @@ enum ieee80211_conn_mode { IEEE80211_CONN_MODE_VHT, IEEE80211_CONN_MODE_HE, IEEE80211_CONN_MODE_EHT, + IEEE80211_CONN_MODE_UHR, }; -#define IEEE80211_CONN_MODE_HIGHEST IEEE80211_CONN_MODE_EHT +#define IEEE80211_CONN_MODE_HIGHEST IEEE80211_CONN_MODE_UHR enum ieee80211_conn_bw_limit { IEEE80211_CONN_BW_LIMIT_20, @@ -430,7 +431,7 @@ struct ieee80211_mgd_auth_data { u8 ap_addr[ETH_ALEN] __aligned(2); - u16 sae_trans, sae_status; + u16 trans, status; size_t data_len; u8 data[]; }; @@ -1099,7 +1100,7 @@ struct ieee80211_link_data { int ap_power_level; /* in dBm */ bool radar_required; - struct wiphy_delayed_work dfs_cac_timer_work; + struct wiphy_hrtimer_work dfs_cac_timer_work; union { struct ieee80211_link_data_managed mgd; @@ -1824,6 +1825,8 @@ struct ieee802_11_elems { const struct ieee80211_multi_link_elem *ml_epcs; const struct ieee80211_bandwidth_indication *bandwidth_indication; const struct ieee80211_ttlm_elem *ttlm[IEEE80211_TTLM_MAX_CNT]; + const struct 
ieee80211_uhr_cap *uhr_cap; + const struct ieee80211_uhr_operation *uhr_operation; /* not the order in the psd values is per element, not per chandef */ struct ieee80211_parsed_tpe tpe; @@ -1848,6 +1851,8 @@ struct ieee802_11_elems { u8 country_elem_len; u8 bssid_index_len; u8 eht_cap_len; + u8 uhr_cap_len; + u8 uhr_operation_len; /* mult-link element can be de-fragmented and thus u8 is not sufficient */ size_t ml_basic_len; @@ -2391,6 +2396,14 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, int tid, int link_id, enum nl80211_band band); +static inline bool ieee80211_require_encrypted_assoc(__le16 fc, + struct sta_info *sta) +{ + return (sta && sta->sta.epp_peer && + (ieee80211_is_assoc_req(fc) || ieee80211_is_reassoc_req(fc) || + ieee80211_is_assoc_resp(fc) || ieee80211_is_reassoc_resp(fc))); +} + /* sta_out needs to be checked for ERR_PTR() before using */ int ieee80211_lookup_ra_sta(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, @@ -2658,8 +2671,7 @@ u8 ieee80211_ie_len_he_cap(struct ieee80211_sub_if_data *sdata); u8 *ieee80211_ie_build_he_oper(u8 *pos, const struct cfg80211_chan_def *chandef); u8 *ieee80211_ie_build_eht_oper(u8 *pos, const struct cfg80211_chan_def *chandef, const struct ieee80211_sta_eht_cap *eht_cap); -int ieee80211_parse_bitrates(enum nl80211_chan_width width, - const struct ieee80211_supported_band *sband, +int ieee80211_parse_bitrates(const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates); u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo); void ieee80211_add_s1g_capab_ie(struct ieee80211_sub_if_data *sdata, @@ -2684,6 +2696,9 @@ int ieee80211_put_eht_cap(struct sk_buff *skb, struct ieee80211_sub_if_data *sdata, const struct ieee80211_supported_band *sband, const struct ieee80211_conn_settings *conn); +int ieee80211_put_uhr_cap(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata, + const struct ieee80211_supported_band *sband); int ieee80211_put_reg_conn(struct sk_buff *skb, enum ieee80211_channel_flags flags); @@ -2828,6 +2843,8 @@ void ieee80211_destroy_frag_cache(struct ieee80211_fragment_cache *cache); u8 ieee80211_ie_len_eht_cap(struct ieee80211_sub_if_data *sdata); +void ieee80211_rx_eml_op_mode_notif(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb); void ieee80211_eht_cap_ie_to_sta_eht_cap(struct ieee80211_sub_if_data *sdata, struct ieee80211_supported_band *sband, @@ -2859,6 +2876,13 @@ void ieee80211_process_ml_reconf_resp(struct ieee80211_sub_if_data *sdata, struct ieee80211_mgmt *mgmt, size_t len); void ieee80211_stop_mbssid(struct ieee80211_sub_if_data *sdata); +void +ieee80211_uhr_cap_ie_to_sta_uhr_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const struct ieee80211_uhr_cap *uhr_cap, + u8 uhr_cap_len, + struct link_sta_info *link_sta); + #if IS_ENABLED(CONFIG_MAC80211_KUNIT_TEST) #define EXPORT_SYMBOL_IF_MAC80211_KUNIT(sym) EXPORT_SYMBOL_IF_KUNIT(sym) #define VISIBLE_IF_MAC80211_KUNIT diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 515384ca2f8f..676b2a43c9f2 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -8,7 +8,7 @@ * Copyright 2008, Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (c) 2016 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include <linux/slab.h> #include <linux/kernel.h> @@ -565,7 +565,7 @@ static void ieee80211_do_stop(struct 
ieee80211_sub_if_data *sdata, bool going_do wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.csa.finalize_work); wiphy_work_cancel(local->hw.wiphy, &sdata->deflink.color_change_finalize_work); - wiphy_delayed_work_cancel(local->hw.wiphy, + wiphy_hrtimer_work_cancel(local->hw.wiphy, &sdata->deflink.dfs_cac_timer_work); if (sdata->wdev.links[0].cac_started) { @@ -1668,7 +1668,15 @@ static void ieee80211_iface_process_skb(struct ieee80211_local *local, } } else if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_PROTECTED_EHT) { - if (sdata->vif.type == NL80211_IFTYPE_STATION) { + if (sdata->vif.type == NL80211_IFTYPE_AP) { + switch (mgmt->u.action.u.eml_omn.action_code) { + case WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF: + ieee80211_rx_eml_op_mode_notif(sdata, skb); + break; + default: + break; + } + } else if (sdata->vif.type == NL80211_IFTYPE_STATION) { switch (mgmt->u.action.u.ttlm_req.action_code) { case WLAN_PROTECTED_EHT_ACTION_TTLM_REQ: ieee80211_process_neg_ttlm_req(sdata, mgmt, @@ -1793,7 +1801,7 @@ static void ieee80211_iface_work(struct wiphy *wiphy, struct wiphy_work *work) else ieee80211_iface_process_skb(local, sdata, skb); - kfree_skb(skb); + consume_skb(skb); kcov_remote_stop(); } @@ -1802,7 +1810,7 @@ static void ieee80211_iface_work(struct wiphy *wiphy, struct wiphy_work *work) kcov_remote_start_common(skb_get_kcov_handle(skb)); ieee80211_iface_process_status(sdata, skb); - kfree_skb(skb); + consume_skb(skb); kcov_remote_stop(); } diff --git a/net/mac80211/link.c b/net/mac80211/link.c index 1e05845872af..17bf55dabd31 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -116,7 +116,7 @@ void ieee80211_link_init(struct ieee80211_sub_if_data *sdata, ieee80211_color_change_finalize_work); wiphy_delayed_work_init(&link->color_collision_detect_work, ieee80211_color_collision_detection_work); - wiphy_delayed_work_init(&link->dfs_cac_timer_work, + wiphy_hrtimer_work_init(&link->dfs_cac_timer_work, ieee80211_dfs_cac_timer_work); if (!deflink) { @@ -155,7 +155,7 @@ void ieee80211_link_stop(struct ieee80211_link_data *link) &link->csa.finalize_work); if (link->sdata->wdev.links[link->link_id].cac_started) { - wiphy_delayed_work_cancel(link->sdata->local->hw.wiphy, + wiphy_hrtimer_work_cancel(link->sdata->local->hw.wiphy, &link->dfs_cac_timer_work); cfg80211_cac_event(link->sdata->dev, &link->conf->chanreq.oper, diff --git a/net/mac80211/main.c b/net/mac80211/main.c index b05e313c7f17..bedc81956fbc 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include <net/mac80211.h> @@ -1123,7 +1123,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) int result, i; enum nl80211_band band; int channels, max_bitrates; - bool supp_ht, supp_vht, supp_he, supp_eht, supp_s1g; + bool supp_ht, supp_vht, supp_he, supp_eht, supp_s1g, supp_uhr; struct cfg80211_chan_def dflt_chandef = {}; if (ieee80211_hw_check(hw, QUEUE_CONTROL) && @@ -1237,6 +1237,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_he = false; supp_eht = false; supp_s1g = false; + supp_uhr = false; for (band = 0; band < NUM_NL80211_BANDS; band++) { const struct ieee80211_sband_iftype_data *iftd; struct ieee80211_supported_band *sband; @@ -1293,6 +1294,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) supp_he 
= supp_he || iftd->he_cap.has_he; supp_eht = supp_eht || iftd->eht_cap.has_eht; + supp_uhr = supp_uhr || iftd->uhr_cap.has_uhr; if (band == NL80211_BAND_2GHZ) he_40_mhz_cap = @@ -1325,6 +1327,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (WARN_ON(supp_eht && !supp_he)) return -EINVAL; + /* UHR requires EHT support */ + if (WARN_ON(supp_uhr && !supp_eht)) + return -EINVAL; + if (!sband->ht_cap.ht_supported) continue; @@ -1437,6 +1443,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) IEEE80211_EHT_PPE_THRES_MAX_LEN; } + if (supp_uhr) + local->scan_ies_len += + 3 + sizeof(struct ieee80211_uhr_cap) + + sizeof(struct ieee80211_uhr_cap_phy); + if (!local->ops->hw_scan) { /* For hw_scan, driver needs to set these up. */ local->hw.wiphy->max_scan_ssids = 4; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 73f57b9e0ebf..e83582b2c377 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -162,6 +162,7 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, const struct ieee80211_vht_operation *vht_oper = elems->vht_operation; const struct ieee80211_he_operation *he_oper = elems->he_operation; const struct ieee80211_eht_operation *eht_oper = elems->eht_operation; + const struct ieee80211_uhr_operation *uhr_oper = elems->uhr_operation; struct ieee80211_supported_band *sband = sdata->local->hw.wiphy->bands[channel->band]; struct cfg80211_chan_def vht_chandef; @@ -192,7 +193,7 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, /* get special 6 GHz case out of the way */ if (sband->band == NL80211_BAND_6GHZ) { - enum ieee80211_conn_mode mode = IEEE80211_CONN_MODE_EHT; + enum ieee80211_conn_mode mode = IEEE80211_CONN_MODE_HIGHEST; /* this is an error */ if (conn->mode < IEEE80211_CONN_MODE_HE) @@ -215,7 +216,9 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, return IEEE80211_CONN_MODE_LEGACY; } - return mode; + if (mode <= IEEE80211_CONN_MODE_EHT) + return mode; + goto check_uhr; } /* now we have the progression HT, VHT, ... */ @@ -340,7 +343,63 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, *chandef = eht_chandef; } - return IEEE80211_CONN_MODE_EHT; +check_uhr: + if (conn->mode < IEEE80211_CONN_MODE_UHR || !uhr_oper) + return IEEE80211_CONN_MODE_EHT; + + /* + * In beacons we don't have all the data - but we know the size was OK, + * so if the size is valid as a non-beacon case, we have more data and + * can validate the NPCA parameters. 
+ */ + if (ieee80211_uhr_oper_size_ok((const void *)uhr_oper, + elems->uhr_operation_len, + false)) { + struct cfg80211_chan_def npca_chandef = *chandef; + const struct ieee80211_uhr_npca_info *npca; + const __le16 *dis_subch_bmap; + u16 punct = chandef->punctured, npca_punct; + + npca = ieee80211_uhr_npca_info(uhr_oper); + if (npca) { + int width = cfg80211_chandef_get_width(chandef); + u8 offs = le32_get_bits(npca->params, + IEEE80211_UHR_NPCA_PARAMS_PRIMARY_CHAN_OFFS); + u32 cf1 = chandef->center_freq1; + bool pri_upper, npca_upper; + + pri_upper = chandef->chan->center_freq > cf1; + npca_upper = 20 * offs >= width / 2; + + if (20 * offs >= cfg80211_chandef_get_width(chandef) || + pri_upper == npca_upper) { + sdata_info(sdata, + "AP UHR NPCA primary channel invalid, disabling UHR\n"); + return IEEE80211_CONN_MODE_EHT; + } + } + + dis_subch_bmap = ieee80211_uhr_npca_dis_subch_bitmap(uhr_oper); + + if (dis_subch_bmap) { + npca_punct = get_unaligned_le16(dis_subch_bmap); + npca_chandef.punctured = npca_punct; + } + + /* + * must be a valid puncturing pattern for this channel as + * well as puncturing all subchannels that are already in + * the disabled subchannel bitmap on the primary channel + */ + if (!cfg80211_chandef_valid(&npca_chandef) || + ((punct & npca_punct) != punct)) { + sdata_info(sdata, + "AP UHR NPCA disabled subchannel bitmap invalid, disabling UHR\n"); + return IEEE80211_CONN_MODE_EHT; + } + } + + return IEEE80211_CONN_MODE_UHR; } static bool @@ -1091,6 +1150,7 @@ again: IEEE80211_CONN_BW_LIMIT_160); break; case IEEE80211_CONN_MODE_EHT: + case IEEE80211_CONN_MODE_UHR: conn->bw_limit = min_t(enum ieee80211_conn_bw_limit, conn->bw_limit, IEEE80211_CONN_BW_LIMIT_320); @@ -1108,6 +1168,8 @@ again: set_bit(BSS_MEMBERSHIP_SELECTOR_HE_PHY, sta_selectors); if (conn->mode >= IEEE80211_CONN_MODE_EHT) set_bit(BSS_MEMBERSHIP_SELECTOR_EHT_PHY, sta_selectors); + if (conn->mode >= IEEE80211_CONN_MODE_UHR) + set_bit(BSS_MEMBERSHIP_SELECTOR_UHR_PHY, sta_selectors); /* * We do not support EPD or GLK so never add them. @@ -1155,6 +1217,11 @@ again: IEEE80211_CONN_BW_LIMIT_160); } + if (conn->mode >= IEEE80211_CONN_MODE_UHR && + !cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, + IEEE80211_CHAN_NO_UHR)) + conn->mode = IEEE80211_CONN_MODE_EHT; + if (chanreq->oper.width != ap_chandef->width || ap_mode != conn->mode) link_id_info(sdata, link_id, "regulatory prevented using AP config, downgraded\n"); @@ -1548,7 +1615,7 @@ static void ieee80211_assoc_add_rates(struct ieee80211_local *local, * in the association request (e.g. D-Link DAP 1353 in * b-only mode)... 
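For the rates handling referenced here, each Supported Rates octet carries the rate in 0.5 Mb/s units in its low seven bits, with the top bit flagging a basic (required) rate, and certain values act as BSS membership selectors rather than real rates, which is how an AP can end up advertising selectors among its required rates. A simplified parser sketch follows; the selector threshold used is an assumption for illustration, not the mac80211 logic:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Placeholder: treat the conventional selector range as "not a rate". */
static bool is_membership_selector(uint8_t val)
{
	return (val & 0x7f) >= 0x79;	/* assumption for illustration only */
}

static void parse_supp_rates(const uint8_t *ie, uint8_t len)
{
	for (uint8_t i = 0; i < len; i++) {
		bool basic = ie[i] & 0x80;
		uint8_t units = ie[i] & 0x7f;	/* 0.5 Mb/s units */

		if (is_membership_selector(ie[i]))
			continue;	/* e.g. PHY or SAE-H2E selectors */

		printf("%u.%u Mb/s%s\n", (unsigned)(units / 2),
		       (unsigned)((units & 1) * 5), basic ? " (basic)" : "");
	}
}

int main(void)
{
	const uint8_t rates[] = { 0x82, 0x84, 0x8b, 0x96, 0x0c, 0x12, 0x18, 0x24 };

	parse_supp_rates(rates, sizeof(rates));
	return 0;
}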
*/ - ieee80211_parse_bitrates(width, sband, + ieee80211_parse_bitrates(sband, assoc_data->supp_rates, assoc_data->supp_rates_len, &rates); @@ -1884,11 +1951,13 @@ ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata, /* * careful - need to know about all the present elems before - * calling ieee80211_assoc_add_ml_elem(), so add this one if - * we're going to put it after the ML element + * calling ieee80211_assoc_add_ml_elem(), so add these if + * we're going to put them after the ML element */ if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_EHT) ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_EHT_CAPABILITY); + if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_UHR) + ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_UHR_CAPA); if (link_id == assoc_data->assoc_link_id) ieee80211_assoc_add_ml_elem(sdata, skb, orig_capab, ext_capa, @@ -1901,6 +1970,9 @@ ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata, ieee80211_put_eht_cap(skb, sdata, sband, &assoc_data->link[link_id].conn); + if (assoc_data->link[link_id].conn.mode >= IEEE80211_CONN_MODE_UHR) + ieee80211_put_uhr_cap(skb, sdata, sband); + if (sband->band == NL80211_BAND_S1GHZ) { ieee80211_add_aid_request_ie(sdata, skb); ieee80211_add_s1g_capab_ie(sdata, &sband->s1g_cap, skb); @@ -2135,6 +2207,9 @@ ieee80211_link_common_elems_size(struct ieee80211_sub_if_data *sdata, sizeof(struct ieee80211_eht_mcs_nss_supp) + IEEE80211_EHT_PPE_THRES_MAX_LEN; + size += 2 + 1 + sizeof(struct ieee80211_uhr_cap) + + sizeof(struct ieee80211_uhr_cap_phy); + return size; } @@ -2155,6 +2230,8 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) struct ieee80211_prep_tx_info info = {}; unsigned int link_id, n_links = 0; u16 present_elems[PRESENT_ELEMS_MAX] = {}; + struct sta_info *sta; + bool assoc_encrypt; void *capab_pos; size_t size; int ret; @@ -2335,7 +2412,15 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) info.link_id = assoc_data->assoc_link_id; drv_mgd_prepare_tx(local, sdata, &info); - IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + sta = sta_info_get_bss(sdata, sdata->vif.cfg.ap_addr); + + assoc_encrypt = sta && sta->sta.epp_peer && + wiphy_dereference(sdata->local->hw.wiphy, + sta->ptk[sta->ptk_idx]); + + if (!assoc_encrypt) + IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_INTFL_MLME_CONN_TX; @@ -4911,6 +4996,7 @@ static void ieee80211_rx_mgmt_auth(struct ieee80211_sub_if_data *sdata, case WLAN_AUTH_FILS_SK: case WLAN_AUTH_FILS_SK_PFS: case WLAN_AUTH_FILS_PK: + case WLAN_AUTH_EPPKE: break; case WLAN_AUTH_SHARED_KEY: if (ifmgd->auth_data->expected_transaction != 4) { @@ -5520,6 +5606,18 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, bss_conf->epcs_support = false; } + if (elems->uhr_operation && elems->uhr_cap && + link->u.mgd.conn.mode >= IEEE80211_CONN_MODE_UHR) { + ieee80211_uhr_cap_ie_to_sta_uhr_cap(sdata, sband, + elems->uhr_cap, + elems->uhr_cap_len, + link_sta); + + bss_conf->uhr_support = link_sta->pub->uhr_cap.has_uhr; + } else { + bss_conf->uhr_support = false; + } + if (elems->s1g_oper && link->u.mgd.conn.mode == IEEE80211_CONN_MODE_S1G && elems->s1g_capab) @@ -5810,6 +5908,7 @@ ieee80211_determine_our_sta_mode(struct ieee80211_sub_if_data *sdata, bool is_6ghz = sband->band == NL80211_BAND_6GHZ; const struct ieee80211_sta_he_cap *he_cap; const struct ieee80211_sta_eht_cap *eht_cap; 
+ const struct ieee80211_sta_uhr_cap *uhr_cap; struct ieee80211_sta_vht_cap vht_cap; if (sband->band == NL80211_BAND_S1GHZ) { @@ -5985,9 +6084,6 @@ ieee80211_determine_our_sta_mode(struct ieee80211_sub_if_data *sdata, "no EHT support, limiting to HE\n"); goto out; } - - /* we have EHT */ - conn->mode = IEEE80211_CONN_MODE_EHT; /* check bandwidth */ @@ -5998,6 +6094,20 @@ ieee80211_determine_our_sta_mode(struct ieee80211_sub_if_data *sdata, mlme_link_id_dbg(sdata, link_id, "no EHT 320 MHz cap in 6 GHz, limiting to 160 MHz\n"); + if (req && req->flags & ASSOC_REQ_DISABLE_UHR) { + mlme_link_id_dbg(sdata, link_id, + "UHR disabled by flag, limiting to EHT\n"); + goto out; + } + + uhr_cap = ieee80211_get_uhr_iftype_cap_vif(sband, &sdata->vif); + if (!uhr_cap) { + mlme_link_id_dbg(sdata, link_id, + "no UHR support, limiting to EHT\n"); + goto out; + } + conn->mode = IEEE80211_CONN_MODE_UHR; + out: mlme_link_id_dbg(sdata, link_id, "determined local STA to be %s, BW limited to %d MHz\n", @@ -8307,6 +8417,12 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) if (WARN_ON_ONCE(!auth_data)) return -EINVAL; + if (auth_data->algorithm == WLAN_AUTH_EPPKE && + ieee80211_vif_is_mld(&sdata->vif) && + !cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK, + auth_data->data, auth_data->data_len)) + return -EINVAL; + auth_data->tries++; if (auth_data->tries > IEEE80211_AUTH_MAX_TRIES) { @@ -8335,9 +8451,12 @@ static int ieee80211_auth(struct ieee80211_sub_if_data *sdata) auth_data->expected_transaction = 2; if (auth_data->algorithm == WLAN_AUTH_SAE) { - trans = auth_data->sae_trans; - status = auth_data->sae_status; + trans = auth_data->trans; + status = auth_data->status; auth_data->expected_transaction = trans; + } else if (auth_data->algorithm == WLAN_AUTH_EPPKE) { + trans = auth_data->trans; + status = auth_data->status; } if (ieee80211_hw_check(&local->hw, REPORTS_TX_ACK_STATUS)) @@ -8994,6 +9113,10 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, goto out_err; } + if (ifmgd->auth_data && + ifmgd->auth_data->algorithm == WLAN_AUTH_EPPKE) + new_sta->sta.epp_peer = true; + new_sta->sta.mlo = mlo; } @@ -9248,6 +9371,9 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, case NL80211_AUTHTYPE_FILS_PK: auth_alg = WLAN_AUTH_FILS_PK; break; + case NL80211_AUTHTYPE_EPPKE: + auth_alg = WLAN_AUTH_EPPKE; + break; default: return -EOPNOTSUPP; } @@ -9272,12 +9398,14 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, auth_data->link_id = req->link_id; if (req->auth_data_len >= 4) { - if (req->auth_type == NL80211_AUTHTYPE_SAE) { + if (req->auth_type == NL80211_AUTHTYPE_SAE || + req->auth_type == NL80211_AUTHTYPE_EPPKE) { __le16 *pos = (__le16 *) req->auth_data; - auth_data->sae_trans = le16_to_cpu(pos[0]); - auth_data->sae_status = le16_to_cpu(pos[1]); + auth_data->trans = le16_to_cpu(pos[0]); + auth_data->status = le16_to_cpu(pos[1]); } + memcpy(auth_data->data, req->auth_data + 4, req->auth_data_len - 4); auth_data->data_len += req->auth_data_len - 4; @@ -9328,7 +9456,11 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, * out SAE Confirm. 
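The station-mode determination in this hunk treats the connection modes as a strictly ordered ladder (LEGACY < HT < VHT < HE < EHT < UHR) that is only ever walked downwards. A tiny stand-alone sketch of that clamp, using a local stand-in enum rather than the mac80211 one:

#include <assert.h>

enum conn_mode {			/* stand-in, mirrors only the ordering */
	MODE_LEGACY,
	MODE_HT,
	MODE_VHT,
	MODE_HE,
	MODE_EHT,
	MODE_UHR,
};

/* Downgrade only: the result never exceeds what either side supports. */
static enum conn_mode clamp_mode(enum conn_mode wanted, enum conn_mode cap)
{
	return wanted < cap ? wanted : cap;
}

int main(void)
{
	/* AP offers UHR but the local interface tops out at EHT. */
	assert(clamp_mode(MODE_UHR, MODE_EHT) == MODE_EHT);
	/* Flag-disabled UHR is just another clamp to EHT. */
	assert(clamp_mode(MODE_EHT, MODE_UHR) == MODE_EHT);
	return 0;
}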
*/ if (cont_auth && req->auth_type == NL80211_AUTHTYPE_SAE && - auth_data->peer_confirmed && auth_data->sae_trans == 2) + auth_data->peer_confirmed && auth_data->trans == 2) + ieee80211_mark_sta_auth(sdata); + + if (cont_auth && req->auth_type == NL80211_AUTHTYPE_EPPKE && + auth_data->trans == 3) ieee80211_mark_sta_auth(sdata); if (ifmgd->associated) { diff --git a/net/mac80211/parse.c b/net/mac80211/parse.c index bfc4ecb7a048..8260f6bdd5b2 100644 --- a/net/mac80211/parse.c +++ b/net/mac80211/parse.c @@ -6,7 +6,7 @@ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation * * element parsing for mac80211 */ @@ -189,6 +189,26 @@ ieee80211_parse_extension_element(u32 *crc, elems->ttlm_num++; } break; + case WLAN_EID_EXT_UHR_OPER: + if (params->mode < IEEE80211_CONN_MODE_UHR) + break; + calc_crc = true; + if (ieee80211_uhr_oper_size_ok(data, len, + params->type == (IEEE80211_FTYPE_MGMT | + IEEE80211_STYPE_BEACON))) { + elems->uhr_operation = data; + elems->uhr_operation_len = len; + } + break; + case WLAN_EID_EXT_UHR_CAPA: + if (params->mode < IEEE80211_CONN_MODE_UHR) + break; + calc_crc = true; + if (ieee80211_uhr_capa_size_ok(data, len, true)) { + elems->uhr_cap = data; + elems->uhr_cap_len = len; + } + break; } if (crc && calc_crc) @@ -1115,8 +1135,7 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) } EXPORT_SYMBOL_IF_KUNIT(ieee802_11_parse_elems_full); -int ieee80211_parse_bitrates(enum nl80211_chan_width width, - const struct ieee80211_supported_band *sband, +int ieee80211_parse_bitrates(const struct ieee80211_supported_band *sband, const u8 *srates, int srates_len, u32 *rates) { struct ieee80211_rate *br; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index e0ccd9749853..11d6c56c9d7e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -6,7 +6,7 @@ * Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include <linux/jiffies.h> @@ -1137,14 +1137,14 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) if (is_multicast_ether_addr(hdr->addr1)) { if (ieee80211_has_tods(hdr->frame_control) || !ieee80211_has_fromds(hdr->frame_control)) - return RX_DROP; + return RX_DROP_U_MESH_DS_BITS; if (ether_addr_equal(hdr->addr3, dev_addr)) - return RX_DROP; + return RX_DROP_U_MESH_A3_MISMATCH; } else { if (!ieee80211_has_a4(hdr->frame_control)) - return RX_DROP; + return RX_DROP_U_MESH_NO_A4; if (ether_addr_equal(hdr->addr4, dev_addr)) - return RX_DROP; + return RX_DROP_U_MESH_A4_MISMATCH; } } @@ -1156,20 +1156,20 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) struct ieee80211_mgmt *mgmt; if (!ieee80211_is_mgmt(hdr->frame_control)) - return RX_DROP; + return RX_DROP_U_MESH_UNEXP_DATA; if (ieee80211_is_action(hdr->frame_control)) { u8 category; /* make sure category field is present */ if (rx->skb->len < IEEE80211_MIN_ACTION_SIZE) - return RX_DROP; + return RX_DROP_U_RUNT_ACTION; mgmt = (struct ieee80211_mgmt *)hdr; category = mgmt->u.action.category; if (category != WLAN_CATEGORY_MESH_ACTION && category != WLAN_CATEGORY_SELF_PROTECTED) - return RX_DROP; + return RX_DROP_U_MESH_WRONG_ACTION; return 
RX_CONTINUE; }
@@ -1179,7 +1179,7 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx) ieee80211_is_auth(hdr->frame_control)) return RX_CONTINUE; - return RX_DROP; + return RX_DROP_U_MESH_UNEXP_MGMT; } return RX_CONTINUE;
@@ -1605,7 +1605,7 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) hdrlen = ieee80211_hdrlen(hdr->frame_control); if (rx->skb->len < hdrlen + 8) - return RX_DROP; + return RX_DROP_U_RUNT_DATA; skb_copy_bits(rx->skb, hdrlen + 6, &ethertype, 2); if (ethertype == rx->sdata->control_port_protocol)
@@ -1615,9 +1615,9 @@ if (rx->sdata->vif.type == NL80211_IFTYPE_AP && cfg80211_rx_spurious_frame(rx->sdata->dev, hdr->addr2, rx->link_id, GFP_ATOMIC)) - return RX_DROP_U_SPURIOUS; + return RX_DROP_U_SPURIOUS_NOTIF; - return RX_DROP; + return RX_DROP_U_SPURIOUS; } return RX_CONTINUE;
@@ -1880,7 +1880,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) link_sta->rx_stats.fragments++; u64_stats_update_begin(&link_sta->rx_stats.syncp); - link_sta->rx_stats.bytes += rx->skb->len; + u64_stats_add(&link_sta->rx_stats.bytes, rx->skb->len); u64_stats_update_end(&link_sta->rx_stats.syncp); if (!(status->flag & RX_FLAG_NO_SIGNAL_VAL)) {
@@ -2106,7 +2106,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (rx->link_sta) { if (ieee80211_is_group_privacy_action(skb) && test_sta_flag(rx->sta, WLAN_STA_MFP)) - return RX_DROP; + return RX_DROP_U_UNPROTECTED; rx->key = rcu_dereference(rx->link_sta->gtk[mmie_keyidx]); }
@@ -2191,11 +2191,11 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (rx->key) { if (unlikely(rx->key->flags & KEY_FLAG_TAINTED)) - return RX_DROP; + return RX_DROP_U_KEY_TAINTED; /* TODO: add threshold stuff again */ } else { - return RX_DROP; + return RX_DROP_U_UNPROTECTED; } switch (rx->key->conf.cipher) {
@@ -2371,7 +2371,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) goto out; if (is_multicast_ether_addr(hdr->addr1)) - return RX_DROP; + return RX_DROP_U_MCAST_FRAGMENT; I802_DEBUG_INC(rx->local->rx_handlers_fragments);
@@ -2426,7 +2426,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) rx->seqno_idx, hdr); if (!entry) { I802_DEBUG_INC(rx->local->rx_handlers_drop_defrag); - return RX_DROP; + return RX_DROP_U_DEFRAG_MISMATCH; } /* "The receiver shall discard MSDUs and MMPDUs whose constituent
@@ -2609,6 +2609,14 @@ ieee80211_drop_unencrypted_mgmt(struct ieee80211_rx_data *rx) (!rx->sta || !test_sta_flag(rx->sta, WLAN_STA_ASSOC))) return RX_DROP_U_UNPROT_ROBUST_ACTION; + /* + * Drop unprotected (Re)Association Request/Response frame received from + * an EPP Peer. + */ + if (!ieee80211_has_protected(fc) && + ieee80211_require_encrypted_assoc(fc, rx->sta)) + return RX_DROP_U_UNPROT_UCAST_MGMT; + return RX_CONTINUE; } EXPORT_SYMBOL_IF_MAC80211_KUNIT(ieee80211_drop_unencrypted_mgmt);
@@ -2777,7 +2785,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) * frame, so count MSDUs.
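The specific RX_DROP_U_* codes introduced in these hunks follow the X-macro pattern visible at the top of this section, where a single list macro (MAC80211_DROP_REASONS_UNUSABLE) is expanded with a per-use DEF(). A self-contained sketch of that technique with invented reason names:

#include <stdio.h>

/* One central list; each user supplies its own DEF(). Names are examples. */
#define DROP_REASONS(DEF)	\
	DEF(RUNT_FRAME)		\
	DEF(BAD_MIC)		\
	DEF(UNEXPECTED_4ADDR)

#define DEF_ENUM(x)	DROP_##x,
enum drop_reason {
	DROP_OK = 0,
	DROP_REASONS(DEF_ENUM)
	NUM_DROP_REASONS,
};
#undef DEF_ENUM

#define DEF_NAME(x)	[DROP_##x] = #x,
static const char * const drop_names[NUM_DROP_REASONS] = {
	[DROP_OK] = "OK",
	DROP_REASONS(DEF_NAME)
};
#undef DEF_NAME

int main(void)
{
	for (int i = 0; i < NUM_DROP_REASONS; i++)
		printf("%d: %s\n", i, drop_names[i]);
	return 0;
}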
*/ u64_stats_update_begin(&rx->link_sta->rx_stats.syncp); - rx->link_sta->rx_stats.msdu[rx->seqno_idx]++; + u64_stats_inc(&rx->link_sta->rx_stats.msdu[rx->seqno_idx]); u64_stats_update_end(&rx->link_sta->rx_stats.syncp); } @@ -2948,25 +2956,25 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta return RX_CONTINUE; if (!pskb_may_pull(skb, sizeof(*eth) + 6)) - return RX_DROP; + return RX_DROP_U_RUNT_MESH_DATA; mesh_hdr = (struct ieee80211s_hdr *)(skb->data + sizeof(*eth)); mesh_hdrlen = ieee80211_get_mesh_hdrlen(mesh_hdr); if (!pskb_may_pull(skb, sizeof(*eth) + mesh_hdrlen)) - return RX_DROP; + return RX_DROP_U_RUNT_MESH_DATA; eth = (struct ethhdr *)skb->data; multicast = is_multicast_ether_addr(eth->h_dest); mesh_hdr = (struct ieee80211s_hdr *)(eth + 1); if (!mesh_hdr->ttl) - return RX_DROP; + return RX_DROP_U_MESH_NO_TTL; /* frame is in RMC, don't forward */ if (is_multicast_ether_addr(eth->h_dest) && mesh_rmc_check(sdata, eth->h_source, mesh_hdr)) - return RX_DROP; + return RX_DROP_U_MESH_RMC; /* forward packet */ if (sdata->crypto_tx_tailroom_needed_cnt) @@ -2983,7 +2991,7 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta /* has_a4 already checked in ieee80211_rx_mesh_check */ proxied_addr = mesh_hdr->eaddr2; else - return RX_DROP; + return RX_DROP_U_MESH_BAD_AE; rcu_read_lock(); mppath = mpp_path_lookup(sdata, proxied_addr); @@ -3015,14 +3023,14 @@ ieee80211_rx_mesh_data(struct ieee80211_sub_if_data *sdata, struct sta_info *sta goto rx_accept; IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_ttl); - return RX_DROP; + return RX_DROP_U_MESH_TTL_EXPIRED; } if (!ifmsh->mshcfg.dot11MeshForwarding) { if (is_multicast_ether_addr(eth->h_dest)) goto rx_accept; - return RX_DROP; + return RX_DROP_U_MESH_NOT_FORWARDING; } skb_set_queue_mapping(skb, ieee802_1d_to_ac[skb->priority]); @@ -3208,7 +3216,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (unlikely(!ieee80211_is_data_present(fc))) - return RX_DROP; + return RX_DROP_U_AMSDU_WITHOUT_DATA; if (unlikely(ieee80211_has_a4(hdr->frame_control))) { switch (rx->sdata->vif.type) { @@ -3265,7 +3273,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (unlikely(!ieee80211_is_data_present(hdr->frame_control))) - return RX_DROP; + return RX_DROP_U_NULL_DATA; /* Send unexpected-4addr-frame event to hostapd */ if (ieee80211_has_a4(hdr->frame_control) && @@ -3275,7 +3283,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) cfg80211_rx_unexpected_4addr_frame( rx->sdata->dev, rx->sta->sta.addr, rx->link_id, GFP_ATOMIC); - return RX_DROP; + return RX_DROP_U_UNEXPECTED_4ADDR; } res = __ieee80211_data_to_8023(rx, &port_control); @@ -3287,7 +3295,7 @@ ieee80211_rx_h_data(struct ieee80211_rx_data *rx) return res; if (!ieee80211_frame_allowed(rx, fc)) - return RX_DROP; + return RX_DROP_U_PORT_CONTROL; /* directly handle TDLS channel switch requests/responses */ if (unlikely(((struct ethhdr *)rx->skb->data)->h_proto == @@ -3352,11 +3360,11 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) }; if (!rx->sta) - return RX_DROP; + return RX_DROP_U_UNKNOWN_STA; if (skb_copy_bits(skb, offsetof(struct ieee80211_bar, control), &bar_data, sizeof(bar_data))) - return RX_DROP; + return RX_DROP_U_RUNT_BAR; tid = le16_to_cpu(bar_data.control) >> 12; @@ -3368,7 +3376,7 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) tid_agg_rx = rcu_dereference(rx->sta->ampdu_mlme.tid_rx[tid]); if (!tid_agg_rx) 
- return RX_DROP; + return RX_DROP_U_BAR_OUTSIDE_SESSION; start_seq_num = le16_to_cpu(bar_data.start_seq_num) >> 4; event.u.ba.tid = tid; @@ -3392,7 +3400,7 @@ ieee80211_rx_h_ctrl(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) return RX_QUEUED; } - return RX_DROP; + return RX_DROP_U_CTRL_FRAME; } static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, @@ -3501,10 +3509,10 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) * and unknown (reserved) frames are useless. */ if (rx->skb->len < 24) - return RX_DROP; + return RX_DROP_U_RUNT_MGMT; if (!ieee80211_is_mgmt(mgmt->frame_control)) - return RX_DROP; + return RX_DROP_U_EXPECTED_MGMT; /* drop too small action frames */ if (ieee80211_is_action(mgmt->frame_control) && @@ -3514,7 +3522,7 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) /* Drop non-broadcast Beacon frames */ if (ieee80211_is_beacon(mgmt->frame_control) && !is_broadcast_ether_addr(mgmt->da)) - return RX_DROP; + return RX_DROP_U_NONBCAST_BEACON; if (rx->sdata->vif.type == NL80211_IFTYPE_AP && ieee80211_is_beacon(mgmt->frame_control) && @@ -3920,6 +3928,14 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) u.action.u.epcs)) goto invalid; goto queue; + case WLAN_PROTECTED_EHT_ACTION_EML_OP_MODE_NOTIF: + if (sdata->vif.type != NL80211_IFTYPE_AP) + break; + + if (len < offsetofend(typeof(*mgmt), + u.action.u.eml_omn)) + goto invalid; + goto queue; default: break; } @@ -4046,10 +4062,10 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) if (!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) && (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) - return RX_DROP; + return RX_DROP_U_MALFORMED_ACTION; if (is_multicast_ether_addr(mgmt->da)) - return RX_DROP; + return RX_DROP_U_UNKNOWN_MCAST_ACTION; /* do not return rejected action frames */ if (mgmt->u.action.category & 0x80) @@ -4094,7 +4110,7 @@ ieee80211_rx_h_ext(struct ieee80211_rx_data *rx) return RX_CONTINUE; if (sdata->vif.type != NL80211_IFTYPE_STATION) - return RX_DROP; + return RX_DROP_U_UNEXPECTED_EXT_FRAME; /* for now only beacons are ext, so queue them */ ieee80211_queue_skb_to_iface(sdata, rx->link_id, rx->sta, rx->skb); @@ -4115,7 +4131,7 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != NL80211_IFTYPE_OCB && sdata->vif.type != NL80211_IFTYPE_STATION) - return RX_DROP; + return RX_DROP_U_UNHANDLED_MGMT; switch (stype) { case cpu_to_le16(IEEE80211_STYPE_AUTH): @@ -4126,32 +4142,32 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) case cpu_to_le16(IEEE80211_STYPE_DEAUTH): if (is_multicast_ether_addr(mgmt->da) && !is_broadcast_ether_addr(mgmt->da)) - return RX_DROP; + return RX_DROP_U_MCAST_DEAUTH; /* process only for station/IBSS */ if (sdata->vif.type != NL80211_IFTYPE_STATION && sdata->vif.type != NL80211_IFTYPE_ADHOC) - return RX_DROP; + return RX_DROP_U_UNHANDLED_DEAUTH; break; case cpu_to_le16(IEEE80211_STYPE_ASSOC_RESP): case cpu_to_le16(IEEE80211_STYPE_REASSOC_RESP): case cpu_to_le16(IEEE80211_STYPE_DISASSOC): if (is_multicast_ether_addr(mgmt->da) && !is_broadcast_ether_addr(mgmt->da)) - return RX_DROP; + return RX_DROP_U_MCAST_DISASSOC; /* process only for station */ if (sdata->vif.type != NL80211_IFTYPE_STATION) - return RX_DROP; + return RX_DROP_U_UNHANDLED_DISASSOC; break; case cpu_to_le16(IEEE80211_STYPE_PROBE_REQ): /* process only for ibss and mesh */ if (sdata->vif.type != NL80211_IFTYPE_ADHOC && sdata->vif.type != 
NL80211_IFTYPE_MESH_POINT) - return RX_DROP; + return RX_DROP_U_UNHANDLED_PREQ; break; default: - return RX_DROP; + return RX_DROP_U_UNHANDLED_MGMT_STYPE; } ieee80211_queue_skb_to_iface(sdata, rx->link_id, rx->sta, rx->skb); @@ -4179,7 +4195,7 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, struct sk_buff_head *frames) { - ieee80211_rx_result res = RX_DROP; + ieee80211_rx_result res; struct sk_buff *skb; #define CALL_RXH(rxh) \ @@ -4205,8 +4221,10 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, */ rx->skb = skb; - if (WARN_ON_ONCE(!rx->link)) + if (WARN_ON_ONCE(!rx->link)) { + res = RX_DROP_U_NO_LINK; goto rxh_next; + } CALL_RXH(ieee80211_rx_h_check_more_data); CALL_RXH(ieee80211_rx_h_uapsd_and_pspoll); @@ -4243,7 +4261,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx) { struct sk_buff_head reorder_release; - ieee80211_rx_result res = RX_DROP; + ieee80211_rx_result res; __skb_queue_head_init(&reorder_release); @@ -4868,8 +4886,8 @@ static void ieee80211_rx_8023(struct ieee80211_rx_data *rx, * frame, so count MSDUs. */ u64_stats_update_begin(&stats->syncp); - stats->msdu[rx->seqno_idx]++; - stats->bytes += orig_len; + u64_stats_inc(&stats->msdu[rx->seqno_idx]); + u64_stats_add(&stats->bytes, orig_len); u64_stats_update_end(&stats->syncp); if (fast_rx->internal_forward) { @@ -5508,6 +5526,32 @@ void ieee80211_rx_list(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, status->rate_idx, status->nss, status->eht.gi)) goto drop; break; + case RX_ENC_UHR: + if (WARN_ONCE(!(status->rate_idx <= 15 || + status->rate_idx == 17 || + status->rate_idx == 19 || + status->rate_idx == 20 || + status->rate_idx == 23) || + !status->nss || + status->nss > 8 || + status->uhr.gi > NL80211_RATE_INFO_EHT_GI_3_2, + "Rate marked as a UHR rate but data is invalid: MCS:%d, NSS:%d, GI:%d\n", + status->rate_idx, status->nss, status->uhr.gi)) + goto drop; + if (WARN_ONCE(status->uhr.elr && + (status->nss != 1 || status->rate_idx > 1 || + status->uhr.gi != NL80211_RATE_INFO_EHT_GI_1_6 || + status->bw != RATE_INFO_BW_20 || status->uhr.im), + "bad UHR ELR MCS MCS:%d, NSS:%d, GI:%d, BW:%d, IM:%d\n", + status->rate_idx, status->nss, status->uhr.gi, + status->bw, status->uhr.im)) + goto drop; + if (WARN_ONCE(status->uhr.im && + (status->nss != 1 || status->rate_idx == 15), + "bad UHR IM MCS MCS:%d, NSS:%d\n", + status->rate_idx, status->nss)) + goto drop; + break; default: WARN_ON_ONCE(1); fallthrough; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 1a995bc301b1..a79ebeb43585 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015 - 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include <linux/module.h> @@ -360,7 +360,9 @@ static void sta_accumulate_removed_link_stats(struct sta_info *sta, int link_id) struct link_sta_info *link_sta = wiphy_dereference(sta->local->hw.wiphy, sta->link[link_id]); struct ieee80211_link_data *link; + unsigned int start; int ac, tid; + u64 value; u32 thr; for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { @@ -369,8 +371,13 @@ static void sta_accumulate_removed_link_stats(struct sta_info *sta, int link_id) sta->rem_link_stats.tx_bytes += 
link_sta->tx_stats.bytes[ac]; } + do { + start = u64_stats_fetch_begin(&link_sta->rx_stats.syncp); + value = u64_stats_read(&link_sta->rx_stats.bytes); + } while (u64_stats_fetch_retry(&link_sta->rx_stats.syncp, start)); + sta->rem_link_stats.rx_packets += link_sta->rx_stats.packets; - sta->rem_link_stats.rx_bytes += link_sta->rx_stats.bytes; + sta->rem_link_stats.rx_bytes += value; sta->rem_link_stats.tx_retries += link_sta->status_stats.retry_count; sta->rem_link_stats.tx_failed += link_sta->status_stats.retry_failed; sta->rem_link_stats.rx_dropped_misc += link_sta->rx_stats.dropped; @@ -380,8 +387,13 @@ static void sta_accumulate_removed_link_stats(struct sta_info *sta, int link_id) sta->rem_link_stats.expected_throughput += thr; for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { - sta->rem_link_stats.pertid_stats.rx_msdu += - link_sta->rx_stats.msdu[tid]; + do { + start = u64_stats_fetch_begin(&link_sta->rx_stats.syncp); + value = u64_stats_read(&link_sta->rx_stats.msdu[tid]); + } while (u64_stats_fetch_retry(&link_sta->rx_stats.syncp, + start)); + + sta->rem_link_stats.pertid_stats.rx_msdu += value; sta->rem_link_stats.pertid_stats.tx_msdu += link_sta->tx_stats.msdu[tid]; sta->rem_link_stats.pertid_stats.tx_msdu_retries += @@ -2555,6 +2567,17 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u32 rate, rinfo->eht_gi = STA_STATS_GET(EHT_GI, rate); rinfo->eht_ru_alloc = STA_STATS_GET(EHT_RU, rate); break; + case STA_STATS_RATE_TYPE_UHR: + rinfo->flags = RATE_INFO_FLAGS_UHR_MCS; + rinfo->mcs = STA_STATS_GET(UHR_MCS, rate); + rinfo->nss = STA_STATS_GET(UHR_NSS, rate); + rinfo->eht_gi = STA_STATS_GET(UHR_GI, rate); + rinfo->eht_ru_alloc = STA_STATS_GET(UHR_RU, rate); + if (STA_STATS_GET(UHR_ELR, rate)) + rinfo->flags |= RATE_INFO_FLAGS_UHR_ELR_MCS; + if (STA_STATS_GET(UHR_IM, rate)) + rinfo->flags |= RATE_INFO_FLAGS_UHR_IM; + break; } } @@ -2578,7 +2601,7 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, do { start = u64_stats_fetch_begin(&rxstats->syncp); - value = rxstats->msdu[tid]; + value = u64_stats_read(&rxstats->msdu[tid]); } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; @@ -2654,7 +2677,7 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) do { start = u64_stats_fetch_begin(&rxstats->syncp); - value = rxstats->bytes; + value = u64_stats_read(&rxstats->bytes); } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 5288d5286651..2875ef7d7946 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -3,7 +3,7 @@ * Copyright 2002-2005, Devicescape Software, Inc. 
* Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2015-2017 Intel Deutschland GmbH - * Copyright(c) 2020-2024 Intel Corporation + * Copyright(c) 2020-2026 Intel Corporation */ #ifndef STA_INFO_H @@ -434,8 +434,8 @@ struct ieee80211_sta_rx_stats { s8 chain_signal_last[IEEE80211_MAX_CHAINS]; u32 last_rate; struct u64_stats_sync syncp; - u64 bytes; - u64 msdu[IEEE80211_NUM_TIDS + 1]; + u64_stats_t bytes; + u64_stats_t msdu[IEEE80211_NUM_TIDS + 1]; }; /* @@ -1009,25 +1009,49 @@ enum sta_stats_type { STA_STATS_RATE_TYPE_HE, STA_STATS_RATE_TYPE_S1G, STA_STATS_RATE_TYPE_EHT, + STA_STATS_RATE_TYPE_UHR, }; -#define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0) -#define STA_STATS_FIELD_LEGACY_IDX GENMASK( 3, 0) -#define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4) -#define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0) -#define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4) -#define STA_STATS_FIELD_HE_MCS GENMASK( 3, 0) -#define STA_STATS_FIELD_HE_NSS GENMASK( 7, 4) -#define STA_STATS_FIELD_EHT_MCS GENMASK( 3, 0) -#define STA_STATS_FIELD_EHT_NSS GENMASK( 7, 4) -#define STA_STATS_FIELD_BW GENMASK(12, 8) -#define STA_STATS_FIELD_SGI GENMASK(13, 13) -#define STA_STATS_FIELD_TYPE GENMASK(16, 14) -#define STA_STATS_FIELD_HE_RU GENMASK(19, 17) -#define STA_STATS_FIELD_HE_GI GENMASK(21, 20) -#define STA_STATS_FIELD_HE_DCM GENMASK(22, 22) -#define STA_STATS_FIELD_EHT_RU GENMASK(20, 17) -#define STA_STATS_FIELD_EHT_GI GENMASK(22, 21) +/* common */ +#define STA_STATS_FIELD_TYPE 0x0000000F +#define STA_STATS_FIELD_BW 0x000001F0 +#define STA_STATS_FIELD_RESERVED 0x00000E00 + +/* STA_STATS_RATE_TYPE_LEGACY */ +#define STA_STATS_FIELD_LEGACY_IDX 0x0000F000 +#define STA_STATS_FIELD_LEGACY_BAND 0x000F0000 + +/* STA_STATS_RATE_TYPE_HT */ +#define STA_STATS_FIELD_HT_MCS 0x000FF000 + +/* STA_STATS_RATE_TYPE_VHT */ +#define STA_STATS_FIELD_VHT_MCS 0x0000F000 +#define STA_STATS_FIELD_VHT_NSS 0x000F0000 + +/* HT & VHT */ +#define STA_STATS_FIELD_SGI 0x00100000 + +/* STA_STATS_RATE_TYPE_HE */ +#define STA_STATS_FIELD_HE_MCS 0x0000F000 +#define STA_STATS_FIELD_HE_NSS 0x000F0000 +#define STA_STATS_FIELD_HE_RU 0x00700000 +#define STA_STATS_FIELD_HE_GI 0x01800000 +#define STA_STATS_FIELD_HE_DCM 0x02000000 + +/* STA_STATS_RATE_TYPE_EHT */ +#define STA_STATS_FIELD_EHT_MCS 0x0000F000 +#define STA_STATS_FIELD_EHT_NSS 0x000F0000 +#define STA_STATS_FIELD_EHT_RU 0x00F00000 +#define STA_STATS_FIELD_EHT_GI 0x03000000 + +/* STA_STATS_RATE_TYPE_UHR */ +#define STA_STATS_FIELD_UHR_MCS 0x0001F000 +#define STA_STATS_FIELD_UHR_NSS 0x001E0000 +#define STA_STATS_FIELD_UHR_RU 0x01E00000 +#define STA_STATS_FIELD_UHR_GI 0x06000000 +#define STA_STATS_FIELD_UHR_ELR 0x08000000 +#define STA_STATS_FIELD_UHR_IM 0x10000000 + #define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v) #define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v) @@ -1040,8 +1064,15 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) r = STA_STATS_FIELD(BW, s->bw); - if (s->enc_flags & RX_ENC_FLAG_SHORT_GI) - r |= STA_STATS_FIELD(SGI, 1); + switch (s->encoding) { + case RX_ENC_HT: + case RX_ENC_VHT: + if (s->enc_flags & RX_ENC_FLAG_SHORT_GI) + r |= STA_STATS_FIELD(SGI, 1); + break; + default: + break; + } switch (s->encoding) { case RX_ENC_VHT: @@ -1073,6 +1104,15 @@ static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) r |= STA_STATS_FIELD(EHT_GI, s->eht.gi); r |= STA_STATS_FIELD(EHT_RU, s->eht.ru); break; + case RX_ENC_UHR: + r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_UHR); + r |= STA_STATS_FIELD(UHR_NSS, s->nss); + 
r |= STA_STATS_FIELD(UHR_MCS, s->rate_idx); + r |= STA_STATS_FIELD(UHR_GI, s->uhr.gi); + r |= STA_STATS_FIELD(UHR_RU, s->uhr.ru); + r |= STA_STATS_FIELD(UHR_ELR, s->uhr.elr); + r |= STA_STATS_FIELD(UHR_IM, s->uhr.im); + break; default: WARN_ON(1); return STA_STATS_RATE_INVALID; diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h index 0bfbce157486..c04d4547e8f4 100644 --- a/net/mac80211/trace.h +++ b/net/mac80211/trace.h @@ -3353,6 +3353,38 @@ TRACE_EVENT(drv_prep_add_interface, ) ); +TRACE_EVENT(drv_set_eml_op_mode, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_sta *sta, + unsigned int link_id, + u8 control, u16 link_bitmap), + + TP_ARGS(local, sdata, sta, link_id, control, link_bitmap), + + TP_STRUCT__entry(LOCAL_ENTRY + VIF_ENTRY + STA_ENTRY + __field(u32, link_id) + __field(u8, control) + __field(u16, link_bitmap)), + + TP_fast_assign(LOCAL_ASSIGN; + VIF_ASSIGN; + STA_NAMED_ASSIGN(sta); + __entry->link_id = link_id; + __entry->control = control; + __entry->link_bitmap = link_bitmap; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT + " (link:%d control:%02x link_bitmap:%04x)", + LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->link_id, + __entry->control, __entry->link_bitmap + ) +); + #endif /* !__MAC80211_DRIVER_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 1b55e8340413..007f5a368d41 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -640,7 +640,9 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (!ieee80211_is_data_present(hdr->frame_control) && !ieee80211_use_mfp(hdr->frame_control, tx->sta, tx->skb) && - !ieee80211_is_group_privacy_action(tx->skb)) + !ieee80211_is_group_privacy_action(tx->skb) && + !ieee80211_require_encrypted_assoc(hdr->frame_control, + tx->sta)) tx->key = NULL; else skip_hw = (tx->key->conf.flags & diff --git a/net/mac80211/uhr.c b/net/mac80211/uhr.c new file mode 100644 index 000000000000..2d8f5e5480ef --- /dev/null +++ b/net/mac80211/uhr.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * UHR handling + * + * Copyright(c) 2025-2026 Intel Corporation + */ + +#include "ieee80211_i.h" + +void +ieee80211_uhr_cap_ie_to_sta_uhr_cap(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const struct ieee80211_uhr_cap *uhr_cap, + u8 uhr_cap_len, + struct link_sta_info *link_sta) +{ + struct ieee80211_sta_uhr_cap *sta_uhr_cap = &link_sta->pub->uhr_cap; + bool from_ap; + + memset(sta_uhr_cap, 0, sizeof(*sta_uhr_cap)); + + if (!ieee80211_get_uhr_iftype_cap_vif(sband, &sdata->vif)) + return; + + sta_uhr_cap->has_uhr = true; + + sta_uhr_cap->mac = uhr_cap->mac; + from_ap = sdata->vif.type == NL80211_IFTYPE_STATION; + sta_uhr_cap->phy = *ieee80211_uhr_phy_cap(uhr_cap, from_ap); +} diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 0c46009a3d63..a5e09c0fa6b3 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -6,7 +6,7 @@ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright (C) 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation * * utilities for mac80211 */ @@ -101,7 +101,6 @@ u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, return NULL; } -EXPORT_SYMBOL(ieee80211_get_bssid); void ieee80211_tx_set_protected(struct ieee80211_tx_data *tx) { @@ -800,20 +799,56 @@ void 
ieee80211_iterate_active_interfaces_atomic( } EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_atomic); -void ieee80211_iterate_active_interfaces_mtx( - struct ieee80211_hw *hw, u32 iter_flags, - void (*iterator)(void *data, u8 *mac, - struct ieee80211_vif *vif), - void *data) +struct ieee80211_vif * +__ieee80211_iterate_interfaces(struct ieee80211_hw *hw, + struct ieee80211_vif *prev, + u32 iter_flags) { + bool active_only = iter_flags & IEEE80211_IFACE_ITER_ACTIVE; + struct ieee80211_sub_if_data *sdata = NULL, *monitor; struct ieee80211_local *local = hw_to_local(hw); lockdep_assert_wiphy(hw->wiphy); - __iterate_interfaces(local, iter_flags | IEEE80211_IFACE_ITER_ACTIVE, - iterator, data); + if (prev) + sdata = vif_to_sdata(prev); + + monitor = rcu_dereference_check(local->monitor_sdata, + lockdep_is_held(&hw->wiphy->mtx)); + if (monitor && monitor == sdata) + return NULL; + + sdata = list_prepare_entry(sdata, &local->interfaces, list); + list_for_each_entry_continue(sdata, &local->interfaces, list) { + switch (sdata->vif.type) { + case NL80211_IFTYPE_MONITOR: + if (!(sdata->u.mntr.flags & MONITOR_FLAG_ACTIVE) && + !ieee80211_hw_check(&local->hw, NO_VIRTUAL_MONITOR)) + continue; + break; + case NL80211_IFTYPE_AP_VLAN: + continue; + default: + break; + } + if (!(iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL) && + active_only && !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) + continue; + if ((iter_flags & IEEE80211_IFACE_SKIP_SDATA_NOT_IN_DRIVER) && + !(sdata->flags & IEEE80211_SDATA_IN_DRIVER)) + continue; + if (ieee80211_sdata_running(sdata) || !active_only) + return &sdata->vif; + } + + if (monitor && ieee80211_hw_check(&local->hw, WANT_MONITOR_VIF) && + (iter_flags & IEEE80211_IFACE_ITER_RESUME_ALL || !active_only || + monitor->flags & IEEE80211_SDATA_IN_DRIVER)) + return &monitor->vif; + + return NULL; } -EXPORT_SYMBOL_GPL(ieee80211_iterate_active_interfaces_mtx); +EXPORT_SYMBOL_GPL(__ieee80211_iterate_interfaces); static void __iterate_stations(struct ieee80211_local *local, void (*iterator)(void *data, @@ -844,18 +879,29 @@ void ieee80211_iterate_stations_atomic(struct ieee80211_hw *hw, } EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_atomic); -void ieee80211_iterate_stations_mtx(struct ieee80211_hw *hw, - void (*iterator)(void *data, - struct ieee80211_sta *sta), - void *data) +struct ieee80211_sta * +__ieee80211_iterate_stations(struct ieee80211_hw *hw, + struct ieee80211_sta *prev) { struct ieee80211_local *local = hw_to_local(hw); + struct sta_info *sta = NULL; lockdep_assert_wiphy(local->hw.wiphy); - __iterate_stations(local, iterator, data); + if (prev) + sta = container_of(prev, struct sta_info, sta); + + sta = list_prepare_entry(sta, &local->sta_list, list); + list_for_each_entry_continue(sta, &local->sta_list, list) { + if (!sta->uploaded) + continue; + + return &sta->sta; + } + + return NULL; } -EXPORT_SYMBOL_GPL(ieee80211_iterate_stations_mtx); +EXPORT_SYMBOL_GPL(__ieee80211_iterate_stations); struct ieee80211_vif *wdev_to_ieee80211_vif(struct wireless_dev *wdev) { @@ -1096,14 +1142,17 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, .ml.control = cpu_to_le16(IEEE80211_ML_CONTROL_TYPE_BASIC), .basic.len = sizeof(mle.basic), }; + bool add_mle; int err; - memcpy(mle.basic.mld_mac_addr, sdata->vif.addr, ETH_ALEN); + add_mle = (multi_link && + !cfg80211_find_ext_elem(WLAN_EID_EXT_EHT_MULTI_LINK, + extra, extra_len)); /* 24 + 6 = header + auth_algo + auth_transaction + status_code */ skb = dev_alloc_skb(local->hw.extra_tx_headroom + IEEE80211_WEP_IV_LEN + 24 + 
6 + extra_len + IEEE80211_WEP_ICV_LEN + - multi_link * sizeof(mle)); + add_mle * sizeof(mle)); if (!skb) return; @@ -1120,8 +1169,11 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, mgmt->u.auth.status_code = cpu_to_le16(status); if (extra) skb_put_data(skb, extra, extra_len); - if (multi_link) + + if (add_mle) { + memcpy(mle.basic.mld_mac_addr, sdata->vif.addr, ETH_ALEN); skb_put_data(skb, &mle, sizeof(mle)); + } if (auth_alg == WLAN_AUTH_SHARED_KEY && transaction == 3) { mgmt->frame_control |= cpu_to_le16(IEEE80211_FCTL_PROTECTED); @@ -1369,6 +1421,13 @@ static int ieee80211_put_preq_ies_band(struct sk_buff *skb, if (err) return err; + if (cfg80211_any_usable_channels(local->hw.wiphy, BIT(sband->band), + IEEE80211_CHAN_NO_UHR)) { + err = ieee80211_put_uhr_cap(skb, sdata, sband); + if (err) + return err; + } + /* * If adding more here, adjust code in main.c * that calculates local->scan_ies_len. @@ -3545,7 +3604,7 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local, if (ctx && &ctx->conf != chanctx_conf) continue; - wiphy_delayed_work_cancel(local->hw.wiphy, + wiphy_hrtimer_work_cancel(local->hw.wiphy, &link->dfs_cac_timer_work); if (!sdata->wdev.links[link_id].cac_started) @@ -4475,6 +4534,32 @@ int ieee80211_put_eht_cap(struct sk_buff *skb, return 0; } +int ieee80211_put_uhr_cap(struct sk_buff *skb, + struct ieee80211_sub_if_data *sdata, + const struct ieee80211_supported_band *sband) +{ + const struct ieee80211_sta_uhr_cap *uhr_cap = + ieee80211_get_uhr_iftype_cap_vif(sband, &sdata->vif); + int len; + + if (!uhr_cap) + return 0; + + len = 2 + 1 + sizeof(struct ieee80211_uhr_cap) + + sizeof(struct ieee80211_uhr_cap_phy); + + if (skb_tailroom(skb) < len) + return -ENOBUFS; + + skb_put_u8(skb, WLAN_EID_EXTENSION); + skb_put_u8(skb, len - 2); + skb_put_u8(skb, WLAN_EID_EXT_UHR_CAPA); + skb_put_data(skb, &uhr_cap->mac, sizeof(uhr_cap->mac)); + skb_put_data(skb, &uhr_cap->phy, sizeof(uhr_cap->phy)); + + return 0; +} + const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode) { static const char * const modes[] = { @@ -4484,6 +4569,7 @@ const char *ieee80211_conn_mode_str(enum ieee80211_conn_mode mode) [IEEE80211_CONN_MODE_VHT] = "VHT", [IEEE80211_CONN_MODE_HE] = "HE", [IEEE80211_CONN_MODE_EHT] = "EHT", + [IEEE80211_CONN_MODE_UHR] = "UHR", }; if (WARN_ON(mode >= ARRAY_SIZE(modes))) diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 4a858112e4ef..fdf98c21d32c 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -527,7 +527,8 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx, hdrlen = ieee80211_hdrlen(hdr->frame_control); if (!ieee80211_is_data(hdr->frame_control) && - !ieee80211_is_robust_mgmt_frame(skb)) + !ieee80211_is_robust_mgmt_frame(skb) && + !ieee80211_require_encrypted_assoc(hdr->frame_control, rx->sta)) return RX_CONTINUE; if (status->flag & RX_FLAG_DECRYPTED) { @@ -723,7 +724,8 @@ ieee80211_crypto_gcmp_decrypt(struct ieee80211_rx_data *rx) hdrlen = ieee80211_hdrlen(hdr->frame_control); if (!ieee80211_is_data(hdr->frame_control) && - !ieee80211_is_robust_mgmt_frame(skb)) + !ieee80211_is_robust_mgmt_frame(skb) && + !ieee80211_require_encrypted_assoc(hdr->frame_control, rx->sta)) return RX_CONTINUE; if (status->flag & RX_FLAG_DECRYPTED) { diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index b26675054b0d..b5316a6c7d1b 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -1044,26 +1044,23 @@ out_free: return ret; } -static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, +static void 
mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool force) { struct mptcp_rm_list list = { .nr = 0 }; - bool ret; + bool announced; list.ids[list.nr++] = mptcp_endp_get_local_id(msk, addr); - ret = mptcp_remove_anno_list_by_saddr(msk, addr); - if (ret || force) { + announced = mptcp_remove_anno_list_by_saddr(msk, addr); + if (announced || force) { spin_lock_bh(&msk->pm.lock); - if (ret) { - __set_bit(addr->id, msk->pm.id_avail_bitmap); + if (announced) msk->pm.add_addr_signaled--; - } mptcp_pm_remove_addr(msk, &list); spin_unlock_bh(&msk->pm.lock); } - return ret; } static void __mark_subflow_endp_available(struct mptcp_sock *msk, u8 id) @@ -1097,17 +1094,15 @@ static int mptcp_nl_remove_subflow_and_signal_addr(struct net *net, !(entry->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)); list.ids[0] = mptcp_endp_get_local_id(msk, addr); - if (remove_subflow) { - spin_lock_bh(&msk->pm.lock); - mptcp_pm_rm_subflow(msk, &list); - spin_unlock_bh(&msk->pm.lock); - } - if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - spin_lock_bh(&msk->pm.lock); + spin_lock_bh(&msk->pm.lock); + if (remove_subflow) + mptcp_pm_rm_subflow(msk, &list); + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) __mark_subflow_endp_available(msk, list.ids[0]); - spin_unlock_bh(&msk->pm.lock); - } + else /* mark endp ID as available, e.g. Signal or MPC endp */ + __set_bit(addr->id, msk->pm.id_avail_bitmap); + spin_unlock_bh(&msk->pm.lock); if (msk->mpc_endpoint_id == entry->addr.id) msk->mpc_endpoint_id = 0; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 8d3233667418..cf1852b99963 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -28,6 +28,8 @@ #include "protocol.h" #include "mib.h" +static unsigned int mptcp_inq_hint(const struct sock *sk); + #define CREATE_TRACE_POINTS #include <trace/events/mptcp.h> @@ -224,9 +226,6 @@ static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval) do_div(grow, oldval); rcvwin += grow << 1; - if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) - rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq; - cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap); @@ -350,9 +349,6 @@ merge_right: end: skb_condense(skb); skb_set_owner_r(skb, sk); - /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. 
*/ - if (sk->sk_socket) - mptcp_rcvbuf_grow(sk, msk->rcvq_space.space); } static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset, @@ -1164,8 +1160,9 @@ struct mptcp_sendmsg_info { bool data_lock_held; }; -static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *ssk, - u64 data_seq, int avail_size) +static size_t mptcp_check_allowed_size(const struct mptcp_sock *msk, + struct sock *ssk, u64 data_seq, + size_t avail_size) { u64 window_end = mptcp_wnd_end(msk); u64 mptcp_snd_wnd; @@ -1174,7 +1171,7 @@ static int mptcp_check_allowed_size(const struct mptcp_sock *msk, struct sock *s return avail_size; mptcp_snd_wnd = window_end - data_seq; - avail_size = min_t(unsigned int, mptcp_snd_wnd, avail_size); + avail_size = min(mptcp_snd_wnd, avail_size); if (unlikely(tcp_sk(ssk)->snd_wnd < mptcp_snd_wnd)) { tcp_sk(ssk)->snd_wnd = min_t(u64, U32_MAX, mptcp_snd_wnd); @@ -1518,7 +1515,7 @@ struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk) if (!ssk || !sk_stream_memory_free(ssk)) return NULL; - burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); + burst = min(MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt); wmem = READ_ONCE(ssk->sk_wmem_queued); if (!burst) return ssk; @@ -1995,6 +1992,17 @@ do_error: static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); +static void mptcp_eat_recv_skb(struct sock *sk, struct sk_buff *skb) +{ + /* avoid the indirect call, we know the destructor is sock_rfree */ + skb->destructor = NULL; + skb->sk = NULL; + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_uncharge(sk, skb->truesize); + __skb_unlink(skb, &sk->sk_receive_queue); + skb_attempt_defer_free(skb); +} + static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, size_t len, int flags, int copied_total, struct scm_timestamping_internal *tss, @@ -2049,13 +2057,7 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, break; } - /* avoid the indirect call, we know the destructor is sock_rfree */ - skb->destructor = NULL; - skb->sk = NULL; - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); - sk_mem_uncharge(sk, skb->truesize); - __skb_unlink(skb, &sk->sk_receive_queue); - skb_attempt_defer_free(skb); + mptcp_eat_recv_skb(sk, skb); } if (copied >= len) @@ -2066,6 +2068,21 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg, return copied; } +static void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) +{ + const struct tcp_sock *tp = tcp_sk(ssk); + + msk->rcvspace_init = 1; + msk->rcvq_space.copied = 0; + msk->rcvq_space.rtt_us = 0; + + /* initial rcv_space offering made to peer */ + msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, + TCP_INIT_CWND * tp->advmss); + if (msk->rcvq_space.space == 0) + msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; +} + /* receive buffer autotuning. See tcp_rcv_space_adjust for more information. * * Only difference: Use highest rtt estimate of the subflows in use. 
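The autotuning around this hunk follows the usual dynamic right-sizing idea: once per RTT, compare the bytes the application actually consumed against the space previously advertised, and grow the receive window proportionally when consumption outpaces it. A back-of-the-envelope sketch of that growth step, in plain integers and without the kernel's clamping against tcp_rmem:

#include <stdint.h>
#include <stdio.h>

/*
 * Proportional growth, doubled, as in the classic DRS heuristic:
 * new_win = old_win + 2 * old_win * (copied - old_space) / old_space.
 */
static uint64_t grow_rcvwin(uint64_t rcvwin, uint64_t copied, uint64_t space)
{
	uint64_t grow;

	if (copied <= space || space == 0)
		return rcvwin;

	grow = rcvwin * (copied - space);
	grow /= space;
	return rcvwin + 2 * grow;
}

int main(void)
{
	/* The app drained 96 KiB in one RTT against a 64 KiB advertisement. */
	printf("%llu\n", (unsigned long long)grow_rcvwin(65536, 98304, 65536));
	return 0;
}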
@@ -2088,8 +2105,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) msk->rcvq_space.copied += copied; - mstamp = div_u64(tcp_clock_ns(), NSEC_PER_USEC); - time = tcp_stamp_us_delta(mstamp, msk->rcvq_space.time); + mstamp = mptcp_stamp(); + time = tcp_stamp_us_delta(mstamp, READ_ONCE(msk->rcvq_space.time)); rtt_us = msk->rcvq_space.rtt_us; if (rtt_us && time < (rtt_us >> 3)) @@ -2119,6 +2136,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; + trace_mptcp_rcvbuf_grow(sk, time); if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) { /* Make subflows follow along. If we do not do this, we * get drops at subflow level if skbs can't be moved to @@ -3040,6 +3058,7 @@ static int mptcp_init_sock(struct sock *sk) sk_sockets_allocated_inc(sk); sk->sk_rcvbuf = READ_ONCE(net->ipv4.sysctl_tcp_rmem[1]); sk->sk_sndbuf = READ_ONCE(net->ipv4.sysctl_tcp_wmem[1]); + sk->sk_write_space = sk_stream_write_space; return 0; } @@ -3549,6 +3568,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, __mptcp_propagate_sndbuf(nsk, ssk); mptcp_rcv_space_init(msk, ssk); + msk->rcvq_space.time = mptcp_stamp(); if (mp_opt->suboptions & OPTION_MPTCP_MPC_ACK) __mptcp_subflow_fully_established(msk, subflow, mp_opt); @@ -3558,23 +3578,6 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk, return nsk; } -void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk) -{ - const struct tcp_sock *tp = tcp_sk(ssk); - - msk->rcvspace_init = 1; - msk->rcvq_space.copied = 0; - msk->rcvq_space.rtt_us = 0; - - msk->rcvq_space.time = tp->tcp_mstamp; - - /* initial rcv_space offering made to peer */ - msk->rcvq_space.space = min_t(u32, tp->rcv_wnd, - TCP_INIT_CWND * tp->advmss); - if (msk->rcvq_space.space == 0) - msk->rcvq_space.space = TCP_INIT_CWND * TCP_MSS_DEFAULT; -} - static void mptcp_destroy(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -3763,6 +3766,7 @@ void mptcp_finish_connect(struct sock *ssk) * accessing the field below */ WRITE_ONCE(msk->local_key, subflow->local_key); + WRITE_ONCE(msk->rcvq_space.time, mptcp_stamp()); mptcp_pm_new_connection(msk, ssk, 0); } @@ -4312,6 +4316,201 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, return mask; } +static struct sk_buff *mptcp_recv_skb(struct sock *sk, u32 *off) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + u32 offset; + + if (!list_empty(&msk->backlog_list)) + mptcp_move_skbs(sk); + + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { + offset = MPTCP_SKB_CB(skb)->offset; + if (offset < skb->len) { + *off = offset; + return skb; + } + mptcp_eat_recv_skb(sk, skb); + } + return NULL; +} + +/* + * Note: + * - It is assumed that the socket was locked by the caller. 
+ */ +static int __mptcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor, bool noack) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *skb; + int copied = 0; + u32 offset; + + msk_owned_by_me(msk); + + if (sk->sk_state == TCP_LISTEN) + return -ENOTCONN; + while ((skb = mptcp_recv_skb(sk, &offset)) != NULL) { + u32 data_len = skb->len - offset; + int count; + u32 size; + + size = min_t(size_t, data_len, INT_MAX); + count = recv_actor(desc, skb, offset, size); + if (count <= 0) { + if (!copied) + copied = count; + break; + } + + copied += count; + + msk->bytes_consumed += count; + if (count < data_len) { + MPTCP_SKB_CB(skb)->offset += count; + MPTCP_SKB_CB(skb)->map_seq += count; + break; + } + + mptcp_eat_recv_skb(sk, skb); + } + + if (noack) + goto out; + + mptcp_rcv_space_adjust(msk, copied); + + if (copied > 0) { + mptcp_recv_skb(sk, &offset); + mptcp_cleanup_rbuf(msk, copied); + } +out: + return copied; +} + +static int mptcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + return __mptcp_read_sock(sk, desc, recv_actor, false); +} + +static int __mptcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) +{ + /* Store TCP splice context information in read_descriptor_t. */ + read_descriptor_t rd_desc = { + .arg.data = tss, + .count = tss->len, + }; + + return mptcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); +} + +/** + * mptcp_splice_read - splice data from MPTCP socket to a pipe + * @sock: socket to splice from + * @ppos: position (not valid) + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will read pages from given socket and fill them into a pipe. + * + * Return: + * Amount of bytes that have been spliced. + * + **/ +static ssize_t mptcp_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct tcp_splice_state tss = { + .pipe = pipe, + .len = len, + .flags = flags, + }; + struct sock *sk = sock->sk; + ssize_t spliced = 0; + int ret = 0; + long timeo; + + /* + * We can't seek on a socket input + */ + if (unlikely(*ppos)) + return -ESPIPE; + + lock_sock(sk); + + mptcp_rps_record_subflows(mptcp_sk(sk)); + + timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK); + while (tss.len) { + ret = __mptcp_splice_read(sk, &tss); + if (ret < 0) { + break; + } else if (!ret) { + if (spliced) + break; + if (sock_flag(sk, SOCK_DONE)) + break; + if (sk->sk_err) { + ret = sock_error(sk); + break; + } + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + if (sk->sk_state == TCP_CLOSE) { + /* + * This occurs when user tries to read + * from never connected socket. + */ + ret = -ENOTCONN; + break; + } + if (!timeo) { + ret = -EAGAIN; + break; + } + /* if __mptcp_splice_read() got nothing while we have + * an skb in receive queue, we do not want to loop. + * This might happen with URG data. 
+ */ + if (!skb_queue_empty(&sk->sk_receive_queue)) + break; + ret = sk_wait_data(sk, &timeo, NULL); + if (ret < 0) + break; + if (signal_pending(current)) { + ret = sock_intr_errno(timeo); + break; + } + continue; + } + tss.len -= ret; + spliced += ret; + + if (!tss.len || !timeo) + break; + release_sock(sk); + lock_sock(sk); + + if (sk->sk_err || sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + signal_pending(current)) + break; + } + + release_sock(sk); + + if (spliced) + return spliced; + + return ret; +} + static const struct proto_ops mptcp_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, @@ -4332,6 +4531,8 @@ static const struct proto_ops mptcp_stream_ops = { .recvmsg = inet_recvmsg, .mmap = sock_no_mmap, .set_rcvlowat = mptcp_set_rcvlowat, + .read_sock = mptcp_read_sock, + .splice_read = mptcp_splice_read, }; static struct inet_protosw mptcp_protosw = { @@ -4436,6 +4637,8 @@ static const struct proto_ops mptcp_v6_stream_ops = { .compat_ioctl = inet6_compat_ioctl, #endif .set_rcvlowat = mptcp_set_rcvlowat, + .read_sock = mptcp_read_sock, + .splice_read = mptcp_splice_read, }; static struct proto mptcp_v6_prot; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 66e973500791..0bd1ee860316 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -246,14 +246,14 @@ struct mptcp_pm_data { struct mptcp_pm_local { struct mptcp_addr_info addr; - u8 flags; + u32 flags; int ifindex; }; struct mptcp_pm_addr_entry { struct list_head list; struct mptcp_addr_info addr; - u8 flags; + u32 flags; int ifindex; struct socket *lsk; }; @@ -915,7 +915,11 @@ static inline bool mptcp_is_fully_established(struct sock *sk) READ_ONCE(mptcp_sk(sk)->fully_established); } -void mptcp_rcv_space_init(struct mptcp_sock *msk, const struct sock *ssk); +static inline u64 mptcp_stamp(void) +{ + return div_u64(tcp_clock_ns(), NSEC_PER_USEC); +} + void mptcp_data_ready(struct sock *sk, struct sock *ssk); bool mptcp_finish_join(struct sock *sk); bool mptcp_schedule_work(struct sock *sk); @@ -971,7 +975,7 @@ static inline void mptcp_write_space(struct sock *sk) /* pairs with memory barrier in mptcp_poll */ smp_mb(); if (mptcp_stream_memory_free(sk, 1)) - sk_stream_write_space(sk); + INDIRECT_CALL_1(sk->sk_write_space, sk_stream_write_space, sk); } static inline void __mptcp_sync_sndbuf(struct sock *sk) diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 96d54cb2cd93..f66129f1e649 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -462,8 +462,6 @@ void __mptcp_sync_state(struct sock *sk, int state) subflow = mptcp_subflow_ctx(ssk); __mptcp_propagate_sndbuf(sk, ssk); - if (!msk->rcvspace_init) - mptcp_rcv_space_init(msk, ssk); if (sk->sk_state == TCP_SYN_SENT) { /* subflow->idsn is always available is TCP_SYN_SENT state, diff --git a/net/mptcp/token.c b/net/mptcp/token.c index 5bb924534387..f1a50f367add 100644 --- a/net/mptcp/token.c +++ b/net/mptcp/token.c @@ -103,7 +103,7 @@ static void mptcp_crypto_key_gen_sha(u64 *key, u32 *token, u64 *idsn) * It creates a unique token to identify the new mptcp connection, * a secret local key and the initial data sequence number (idsn). * - * Returns 0 on success. + * Return: 0 on success. */ int mptcp_token_new_request(struct request_sock *req) { @@ -146,7 +146,7 @@ int mptcp_token_new_request(struct request_sock *req) * the computed token at a later time, this is needed to process * join requests. * - * returns 0 on success. + * Return: 0 on success. 
*/ int mptcp_token_new_connect(struct sock *ssk) { @@ -241,7 +241,7 @@ found: * This function returns the mptcp connection structure with the given token. * A reference count on the mptcp socket returned is taken. * - * returns NULL if no connection with the given token value exists. + * Return: NULL if no connection with the given token value exists. */ struct mptcp_sock *mptcp_token_get_sock(struct net *net, u32 token) { @@ -288,11 +288,13 @@ EXPORT_SYMBOL_GPL(mptcp_token_get_sock); * @s_slot: start slot number * @s_num: start number inside the given lock * - * This function returns the first mptcp connection structure found inside the - * token container starting from the specified position, or NULL. + * Description: + * On successful iteration, the iterator is moved to the next position and a + * reference to the returned socket is acquired. * - * On successful iteration, the iterator is moved to the next position and - * a reference to the returned socket is acquired. + * Return: + * The first mptcp connection structure found inside the token container + * starting from the specified position, or NULL. */ struct mptcp_sock *mptcp_token_iter_next(const struct net *net, long *s_slot, long *s_num) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 64c697212578..f861d116cc33 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -949,7 +949,7 @@ ip_vs_prepare_tunneled_skb(struct sk_buff *skb, int skb_af, *next_protocol = IPPROTO_IPV6; if (payload_len) *payload_len = - ntohs(old_ipv6h->payload_len) + + ipv6_payload_len(skb, old_ipv6h) + sizeof(*old_ipv6h); old_dsfield = ipv6_get_dsfield(old_ipv6h); *ttl = old_ipv6h->hop_limit; diff --git a/net/netfilter/nf_conncount.c b/net/netfilter/nf_conncount.c index 8487808c8761..14e62b3263cd 100644 --- a/net/netfilter/nf_conncount.c +++ b/net/netfilter/nf_conncount.c @@ -34,8 +34,9 @@ #define CONNCOUNT_SLOTS 256U -#define CONNCOUNT_GC_MAX_NODES 8 -#define MAX_KEYLEN 5 +#define CONNCOUNT_GC_MAX_NODES 8 +#define CONNCOUNT_GC_MAX_COLLECT 64 +#define MAX_KEYLEN 5 /* we will save the tuples of all connections we care about */ struct nf_conncount_tuple { @@ -178,16 +179,28 @@ static int __nf_conncount_add(struct net *net, return -ENOENT; if (ct && nf_ct_is_confirmed(ct)) { - err = -EEXIST; - goto out_put; + /* local connections are confirmed in postrouting so confirmation + * might have happened before hitting connlimit + */ + if (skb->skb_iif != LOOPBACK_IFINDEX) { + err = -EEXIST; + goto out_put; + } + + /* this is likely a local connection, skip optimization to avoid + * adding duplicates from a 'packet train' + */ + goto check_connections; } - if ((u32)jiffies == list->last_gc) + if ((u32)jiffies == list->last_gc && + (list->count - list->last_gc_count) < CONNCOUNT_GC_MAX_COLLECT) goto add_new_node; +check_connections: /* check the saved connections */ list_for_each_entry_safe(conn, conn_n, &list->head, node) { - if (collect > CONNCOUNT_GC_MAX_NODES) + if (collect > CONNCOUNT_GC_MAX_COLLECT) break; found = find_or_evict(net, list, conn); @@ -230,6 +243,7 @@ static int __nf_conncount_add(struct net *net, nf_ct_put(found_ct); } list->last_gc = (u32)jiffies; + list->last_gc_count = list->count; add_new_node: if (WARN_ON_ONCE(list->count > INT_MAX)) { @@ -277,6 +291,7 @@ void nf_conncount_list_init(struct nf_conncount_list *list) spin_lock_init(&list->list_lock); INIT_LIST_HEAD(&list->head); list->count = 0; + list->last_gc_count = 0; list->last_gc = (u32)jiffies; } 
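Aside on the nf_conncount hunks above: they raise the per-call garbage-collection budget from CONNCOUNT_GC_MAX_NODES to CONNCOUNT_GC_MAX_COLLECT and, more importantly, record how long the list was right after the last scan (last_gc_count). The add path may then skip the potentially long list walk only when it already scanned the list during the current jiffy and fewer than CONNCOUNT_GC_MAX_COLLECT entries have been added since. A small stand-alone model of that decision follows; the struct is a stand-in for the kernel's nf_conncount_list, not its real layout.

#include <stdbool.h>
#include <stdint.h>

#define CONNCOUNT_GC_MAX_COLLECT 64

struct conncount_list_model {
        uint32_t last_gc;       /* jiffies value recorded after the last scan */
        uint32_t last_gc_count; /* list length recorded after the last scan */
        uint32_t count;         /* current list length */
};

/* Mirror of the "goto add_new_node" condition added above: skip the
 * list walk only if we already scanned during this jiffy AND the list
 * grew by less than the collection budget since then.
 */
static bool may_skip_list_walk(const struct conncount_list_model *list,
                               uint32_t now_jiffies)
{
        return now_jiffies == list->last_gc &&
               (list->count - list->last_gc_count) < CONNCOUNT_GC_MAX_COLLECT;
}

The other change in __nf_conncount_add() above concerns locally generated traffic: an already-confirmed conntrack entry arriving via loopback no longer takes the -EEXIST shortcut and instead falls through to the check_connections walk, matching the in-diff comment about avoiding duplicates from a packet train.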
EXPORT_SYMBOL_GPL(nf_conncount_list_init); @@ -316,13 +331,14 @@ static bool __nf_conncount_gc_list(struct net *net, } nf_ct_put(found_ct); - if (collected > CONNCOUNT_GC_MAX_NODES) + if (collected > CONNCOUNT_GC_MAX_COLLECT) break; } if (!list->count) ret = true; list->last_gc = (u32)jiffies; + list->last_gc_count = list->count; return ret; } diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index be654363f53f..40c261cd0af3 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -14,6 +14,7 @@ #include <linux/types.h> #include <linux/btf_ids.h> #include <linux/net_namespace.h> +#include <net/sock.h> #include <net/xdp.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index 14f73872f647..17f1f453d481 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -23,6 +23,7 @@ #include <linux/skbuff.h> #include <net/route.h> #include <net/ip6_route.h> +#include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack.h> diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 3a04665adf99..662f6bbfa805 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -32,6 +32,7 @@ #include <linux/siphash.h> #include <linux/netfilter.h> +#include <net/ipv6.h> #include <net/netlink.h> #include <net/sock.h> #include <net/netfilter/nf_conntrack.h> diff --git a/net/netfilter/nf_conntrack_ovs.c b/net/netfilter/nf_conntrack_ovs.c index 068e9489e1c2..a6988eeb1579 100644 --- a/net/netfilter/nf_conntrack_ovs.c +++ b/net/netfilter/nf_conntrack_ovs.c @@ -121,7 +121,7 @@ int nf_ct_skb_network_trim(struct sk_buff *skb, int family) len = skb_ip_totlen(skb); break; case NFPROTO_IPV6: - len = ntohs(ipv6_hdr(skb)->payload_len); + len = skb_ipv6_payload_len(skb); if (ipv6_hdr(skb)->nexthdr == NEXTHDR_HOP) { int err = nf_ip6_check_hbh_len(skb, &len); diff --git a/net/netfilter/nf_conntrack_proto_generic.c b/net/netfilter/nf_conntrack_proto_generic.c index e831637bc8ca..cb260eb3d012 100644 --- a/net/netfilter/nf_conntrack_proto_generic.c +++ b/net/netfilter/nf_conntrack_proto_generic.c @@ -67,6 +67,7 @@ void nf_conntrack_generic_init_net(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_generic = { .l4proto = 255, + .allow_clash = true, #ifdef CONFIG_NF_CONNTRACK_TIMEOUT .ctnl_timeout = { .nlattr_to_obj = generic_timeout_nlattr_to_obj, diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index af369e686fc5..b894bb7a97ad 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -33,12 +33,14 @@ #include <linux/skbuff.h> #include <linux/slab.h> #include <net/dst.h> +#include <net/gre.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_timeout.h> +#include <net/pptp.h> #include <linux/netfilter/nf_conntrack_proto_gre.h> #include <linux/netfilter/nf_conntrack_pptp.h> diff --git a/net/netfilter/nf_conntrack_proto_icmp.c b/net/netfilter/nf_conntrack_proto_icmp.c index b38b7164acd5..32148a3a8509 100644 --- a/net/netfilter/nf_conntrack_proto_icmp.c +++ b/net/netfilter/nf_conntrack_proto_icmp.c 
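Aside on the .allow_clash = true additions (here for the generic L4 tracker, and in the following hunks for ICMP and ICMPv6): they opt these trackers into conntrack's clash resolution, where two packets of the same not-yet-confirmed flow race to confirmation and the loser is normally dropped unless the tracker allows adopting the entry that won. The sketch below is a simplified model of that gate, not the kernel's actual clash-resolution code; the struct and field names are stand-ins.

#include <stdbool.h>
#include <stdint.h>

/* Stand-ins for the relevant kernel state; names are illustrative. */
struct l4proto_model {
        uint8_t l4proto;
        bool allow_clash;       /* corresponds to .allow_clash above */
};

struct conntrack_model {
        bool dying;
};

/* A racing insert may reuse the already-confirmed entry only if the
 * L4 tracker explicitly allows clashes and the winner is still alive.
 */
static bool clash_may_reuse_winner(const struct l4proto_model *l4,
                                   const struct conntrack_model *winner)
{
        return l4->allow_clash && !winner->dying;
}

Presumably the motivation mirrors UDP, which has long set allow_clash for the same reason: for connectionless trackers, losing the confirmation race carries no per-connection state worth preserving.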
@@ -365,6 +365,7 @@ void nf_conntrack_icmp_init_net(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp = { .l4proto = IPPROTO_ICMP, + .allow_clash = true, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmp_tuple_to_nlattr, .nlattr_tuple_size = icmp_nlattr_tuple_size, diff --git a/net/netfilter/nf_conntrack_proto_icmpv6.c b/net/netfilter/nf_conntrack_proto_icmpv6.c index 327b8059025d..e508b3aa370a 100644 --- a/net/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/netfilter/nf_conntrack_proto_icmpv6.c @@ -343,6 +343,7 @@ void nf_conntrack_icmpv6_init_net(struct net *net) const struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6 = { .l4proto = IPPROTO_ICMPV6, + .allow_clash = true, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .tuple_to_nlattr = icmpv6_tuple_to_nlattr, .nlattr_tuple_size = icmpv6_nlattr_tuple_size, diff --git a/net/netfilter/nf_flow_table_core.c b/net/netfilter/nf_flow_table_core.c index 06e8251a6644..2c4140e6f53c 100644 --- a/net/netfilter/nf_flow_table_core.c +++ b/net/netfilter/nf_flow_table_core.c @@ -16,6 +16,7 @@ static DEFINE_MUTEX(flowtable_lock); static LIST_HEAD(flowtables); +static __read_mostly struct kmem_cache *flow_offload_cachep; static void flow_offload_fill_dir(struct flow_offload *flow, @@ -56,7 +57,7 @@ struct flow_offload *flow_offload_alloc(struct nf_conn *ct) if (unlikely(nf_ct_is_dying(ct))) return NULL; - flow = kzalloc(sizeof(*flow), GFP_ATOMIC); + flow = kmem_cache_zalloc(flow_offload_cachep, GFP_ATOMIC); if (!flow) return NULL; @@ -812,9 +813,13 @@ static int __init nf_flow_table_module_init(void) { int ret; + flow_offload_cachep = KMEM_CACHE(flow_offload, SLAB_HWCACHE_ALIGN); + if (!flow_offload_cachep) + return -ENOMEM; + ret = register_pernet_subsys(&nf_flow_table_net_ops); if (ret < 0) - return ret; + goto out_pernet; ret = nf_flow_table_offload_init(); if (ret) @@ -830,6 +835,8 @@ out_bpf: nf_flow_table_offload_exit(); out_offload: unregister_pernet_subsys(&nf_flow_table_net_ops); +out_pernet: + kmem_cache_destroy(flow_offload_cachep); return ret; } @@ -837,6 +844,7 @@ static void __exit nf_flow_table_module_exit(void) { nf_flow_table_offload_exit(); unregister_pernet_subsys(&nf_flow_table_net_ops); + kmem_cache_destroy(flow_offload_cachep); } module_init(nf_flow_table_module_init); diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c index 78883343e5d6..3fdb10d9bf7f 100644 --- a/net/netfilter/nf_flow_table_ip.c +++ b/net/netfilter/nf_flow_table_ip.c @@ -8,10 +8,13 @@ #include <linux/ipv6.h> #include <linux/netdevice.h> #include <linux/if_ether.h> +#include <linux/if_vlan.h> +#include <net/gre.h> #include <net/gso.h> #include <net/ip.h> #include <net/ipv6.h> #include <net/ip6_route.h> +#include <net/ip6_tunnel.h> #include <net/neighbour.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_conntrack_acct.h> @@ -142,12 +145,26 @@ static bool ip_has_options(unsigned int thoff) return thoff != sizeof(struct iphdr); } -static void nf_flow_tuple_encap(struct sk_buff *skb, +struct nf_flowtable_ctx { + const struct net_device *in; + u32 offset; + u32 hdrsize; + struct { + /* Tunnel IP header size */ + u32 hdr_size; + /* IP tunnel protocol */ + u8 proto; + } tun; +}; + +static void nf_flow_tuple_encap(struct nf_flowtable_ctx *ctx, + struct sk_buff *skb, struct flow_offload_tuple *tuple) { __be16 inner_proto = skb->protocol; struct vlan_ethhdr *veth; struct pppoe_hdr *phdr; + struct ipv6hdr *ip6h; struct iphdr *iph; u16 offset = 0; int i = 0; @@ -174,22 +191,28 @@ static void 
nf_flow_tuple_encap(struct sk_buff *skb, break; } - if (inner_proto == htons(ETH_P_IP)) { + switch (inner_proto) { + case htons(ETH_P_IP): iph = (struct iphdr *)(skb_network_header(skb) + offset); - if (iph->protocol == IPPROTO_IPIP) { + if (ctx->tun.proto == IPPROTO_IPIP) { tuple->tun.dst_v4.s_addr = iph->daddr; tuple->tun.src_v4.s_addr = iph->saddr; tuple->tun.l3_proto = IPPROTO_IPIP; } + break; + case htons(ETH_P_IPV6): + ip6h = (struct ipv6hdr *)(skb_network_header(skb) + offset); + if (ctx->tun.proto == IPPROTO_IPV6) { + tuple->tun.dst_v6 = ip6h->daddr; + tuple->tun.src_v6 = ip6h->saddr; + tuple->tun.l3_proto = IPPROTO_IPV6; + } + break; + default: + break; } } -struct nf_flowtable_ctx { - const struct net_device *in; - u32 offset; - u32 hdrsize; -}; - static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, struct flow_offload_tuple *tuple) { @@ -257,7 +280,7 @@ static int nf_flow_tuple_ip(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, tuple->l3proto = AF_INET; tuple->l4proto = ipproto; tuple->iifidx = ctx->in->ifindex; - nf_flow_tuple_encap(skb, tuple); + nf_flow_tuple_encap(ctx, skb, tuple); return 0; } @@ -293,15 +316,16 @@ static unsigned int nf_flow_xmit_xfrm(struct sk_buff *skb, return NF_STOLEN; } -static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize) +static bool nf_flow_ip4_tunnel_proto(struct nf_flowtable_ctx *ctx, + struct sk_buff *skb) { struct iphdr *iph; u16 size; - if (!pskb_may_pull(skb, sizeof(*iph) + *psize)) + if (!pskb_may_pull(skb, sizeof(*iph) + ctx->offset)) return false; - iph = (struct iphdr *)(skb_network_header(skb) + *psize); + iph = (struct iphdr *)(skb_network_header(skb) + ctx->offset); size = iph->ihl << 2; if (ip_is_fragment(iph) || unlikely(ip_has_options(size))) @@ -310,25 +334,62 @@ static bool nf_flow_ip4_tunnel_proto(struct sk_buff *skb, u32 *psize) if (iph->ttl <= 1) return false; - if (iph->protocol == IPPROTO_IPIP) - *psize += size; + if (iph->protocol == IPPROTO_IPIP) { + ctx->tun.proto = IPPROTO_IPIP; + ctx->tun.hdr_size = size; + ctx->offset += size; + } return true; } -static void nf_flow_ip4_tunnel_pop(struct sk_buff *skb) +static bool nf_flow_ip6_tunnel_proto(struct nf_flowtable_ctx *ctx, + struct sk_buff *skb) { - struct iphdr *iph = (struct iphdr *)skb_network_header(skb); +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6hdr *ip6h, _ip6h; + __be16 frag_off; + u8 nexthdr; + int hdrlen; + + ip6h = skb_header_pointer(skb, ctx->offset, sizeof(*ip6h), &_ip6h); + if (!ip6h) + return false; + + if (ip6h->hop_limit <= 1) + return false; + + nexthdr = ip6h->nexthdr; + hdrlen = ipv6_skip_exthdr(skb, sizeof(*ip6h) + ctx->offset, &nexthdr, + &frag_off); + if (hdrlen < 0) + return false; + + if (nexthdr == IPPROTO_IPV6) { + ctx->tun.hdr_size = hdrlen; + ctx->tun.proto = IPPROTO_IPV6; + } + ctx->offset += ctx->tun.hdr_size; + + return true; +#else + return false; +#endif /* IS_ENABLED(CONFIG_IPV6) */ +} - if (iph->protocol != IPPROTO_IPIP) +static void nf_flow_ip_tunnel_pop(struct nf_flowtable_ctx *ctx, + struct sk_buff *skb) +{ + if (ctx->tun.proto != IPPROTO_IPIP && + ctx->tun.proto != IPPROTO_IPV6) return; - skb_pull(skb, iph->ihl << 2); + skb_pull(skb, ctx->tun.hdr_size); skb_reset_network_header(skb); } -static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, - u32 *offset) +static bool nf_flow_skb_encap_protocol(struct nf_flowtable_ctx *ctx, + struct sk_buff *skb, __be16 proto) { __be16 inner_proto = skb->protocol; struct vlan_ethhdr *veth; @@ -341,7 +402,7 @@ static bool 
nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, veth = (struct vlan_ethhdr *)skb_mac_header(skb); if (veth->h_vlan_encapsulated_proto == proto) { - *offset += VLAN_HLEN; + ctx->offset += VLAN_HLEN; inner_proto = proto; ret = true; } @@ -349,19 +410,28 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto, case htons(ETH_P_PPP_SES): if (nf_flow_pppoe_proto(skb, &inner_proto) && inner_proto == proto) { - *offset += PPPOE_SES_HLEN; + ctx->offset += PPPOE_SES_HLEN; ret = true; } break; } - if (inner_proto == htons(ETH_P_IP)) - ret = nf_flow_ip4_tunnel_proto(skb, offset); + switch (inner_proto) { + case htons(ETH_P_IP): + ret = nf_flow_ip4_tunnel_proto(ctx, skb); + break; + case htons(ETH_P_IPV6): + ret = nf_flow_ip6_tunnel_proto(ctx, skb); + break; + default: + break; + } return ret; } -static void nf_flow_encap_pop(struct sk_buff *skb, +static void nf_flow_encap_pop(struct nf_flowtable_ctx *ctx, + struct sk_buff *skb, struct flow_offload_tuple_rhash *tuplehash) { struct vlan_hdr *vlan_hdr; @@ -387,8 +457,9 @@ static void nf_flow_encap_pop(struct sk_buff *skb, } } - if (skb->protocol == htons(ETH_P_IP)) - nf_flow_ip4_tunnel_pop(skb); + if (skb->protocol == htons(ETH_P_IP) || + skb->protocol == htons(ETH_P_IPV6)) + nf_flow_ip_tunnel_pop(ctx, skb); } struct nf_flow_xmit { @@ -414,7 +485,7 @@ nf_flow_offload_lookup(struct nf_flowtable_ctx *ctx, { struct flow_offload_tuple tuple = {}; - if (!nf_flow_skb_encap_protocol(skb, htons(ETH_P_IP), &ctx->offset)) + if (!nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IP))) return NULL; if (nf_flow_tuple_ip(ctx, skb, &tuple) < 0) @@ -458,7 +529,7 @@ static int nf_flow_offload_forward(struct nf_flowtable_ctx *ctx, flow_offload_refresh(flow_table, flow, false); - nf_flow_encap_pop(skb, tuplehash); + nf_flow_encap_pop(ctx, skb, tuplehash); thoff -= ctx->offset; iph = ip_hdr(skb); @@ -567,6 +638,97 @@ static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb, return 0; } +struct ipv6_tel_txoption { + struct ipv6_txoptions ops; + __u8 dst_opt[8]; +}; + +static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + struct in6_addr **ip6_daddr, + int encap_limit) +{ + struct ipv6hdr *ip6h = (struct ipv6hdr *)skb_network_header(skb); + u8 hop_limit = ip6h->hop_limit, proto = IPPROTO_IPV6; + struct rtable *rt = dst_rtable(tuple->dst_cache); + __u8 dsfield = ipv6_get_dsfield(ip6h); + struct flowi6 fl6 = { + .daddr = tuple->tun.src_v6, + .saddr = tuple->tun.dst_v6, + .flowi6_proto = proto, + }; + int err, mtu; + u32 headroom; + + err = iptunnel_handle_offloads(skb, SKB_GSO_IPXIP6); + if (err) + return err; + + skb_set_inner_ipproto(skb, proto); + headroom = sizeof(*ip6h) + LL_RESERVED_SPACE(rt->dst.dev) + + rt->dst.header_len; + if (encap_limit) + headroom += 8; + err = skb_cow_head(skb, headroom); + if (err) + return err; + + skb_scrub_packet(skb, true); + mtu = dst_mtu(&rt->dst) - sizeof(*ip6h); + if (encap_limit) + mtu -= 8; + mtu = max(mtu, IPV6_MIN_MTU); + skb_dst_update_pmtu_no_confirm(skb, mtu); + + if (encap_limit > 0) { + struct ipv6_tel_txoption opt = { + .dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT, + .dst_opt[3] = 1, + .dst_opt[4] = encap_limit, + .dst_opt[5] = IPV6_TLV_PADN, + .dst_opt[6] = 1, + }; + struct ipv6_opt_hdr *hopt; + + opt.ops.dst1opt = (struct ipv6_opt_hdr *)opt.dst_opt; + opt.ops.opt_nflen = 8; + + hopt = skb_push(skb, ipv6_optlen(opt.ops.dst1opt)); + memcpy(hopt, opt.ops.dst1opt, ipv6_optlen(opt.ops.dst1opt)); + hopt->nexthdr = IPPROTO_IPV6; + 
proto = NEXTHDR_DEST; + } + + skb_push(skb, sizeof(*ip6h)); + skb_reset_network_header(skb); + + ip6h = ipv6_hdr(skb); + ip6_flow_hdr(ip6h, dsfield, + ip6_make_flowlabel(net, skb, fl6.flowlabel, true, &fl6)); + ip6h->hop_limit = hop_limit; + ip6h->nexthdr = proto; + ip6h->daddr = tuple->tun.src_v6; + ip6h->saddr = tuple->tun.dst_v6; + ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(*ip6h)); + IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr); + + *ip6_daddr = &tuple->tun.src_v6; + + return 0; +} + +static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb, + struct flow_offload_tuple *tuple, + struct in6_addr **ip6_daddr, + int encap_limit) +{ + if (tuple->tun_num) + return nf_flow_tunnel_ip6ip6_push(net, skb, tuple, ip6_daddr, + encap_limit); + + return 0; +} + static int nf_flow_encap_push(struct sk_buff *skb, struct flow_offload_tuple *tuple) { @@ -836,7 +998,7 @@ static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, tuple->l3proto = AF_INET6; tuple->l4proto = nexthdr; tuple->iifidx = ctx->in->ifindex; - nf_flow_tuple_encap(skb, tuple); + nf_flow_tuple_encap(ctx, skb, tuple); return 0; } @@ -844,7 +1006,7 @@ static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb, static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, struct nf_flowtable *flow_table, struct flow_offload_tuple_rhash *tuplehash, - struct sk_buff *skb) + struct sk_buff *skb, int encap_limit) { enum flow_offload_tuple_dir dir; struct flow_offload *flow; @@ -855,6 +1017,12 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]); mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset; + if (flow->tuplehash[!dir].tuple.tun_num) { + mtu -= sizeof(*ip6h); + if (encap_limit > 0) + mtu -= 8; /* encap limit option */ + } + if (unlikely(nf_flow_exceeds_mtu(skb, mtu))) return 0; @@ -873,7 +1041,7 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx, flow_offload_refresh(flow_table, flow, false); - nf_flow_encap_pop(skb, tuplehash); + nf_flow_encap_pop(ctx, skb, tuplehash); ip6h = ipv6_hdr(skb); nf_flow_nat_ipv6(flow, skb, dir, ip6h); @@ -894,8 +1062,7 @@ nf_flow_offload_ipv6_lookup(struct nf_flowtable_ctx *ctx, { struct flow_offload_tuple tuple = {}; - if (skb->protocol != htons(ETH_P_IPV6) && - !nf_flow_skb_encap_protocol(skb, htons(ETH_P_IPV6), &ctx->offset)) + if (!nf_flow_skb_encap_protocol(ctx, skb, htons(ETH_P_IPV6))) return NULL; if (nf_flow_tuple_ipv6(ctx, skb, &tuple) < 0) @@ -908,6 +1075,7 @@ unsigned int nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { + int encap_limit = IPV6_DEFAULT_TNL_ENCAP_LIMIT; struct flow_offload_tuple_rhash *tuplehash; struct nf_flowtable *flow_table = priv; struct flow_offload_tuple *other_tuple; @@ -926,7 +1094,8 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, if (tuplehash == NULL) return NF_ACCEPT; - ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb); + ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb, + encap_limit); if (ret < 0) return NF_DROP; else if (ret == 0) @@ -945,6 +1114,10 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb, other_tuple = &flow->tuplehash[!dir].tuple; ip6_daddr = &other_tuple->src_v6; + if (nf_flow_tunnel_v6_push(state->net, skb, other_tuple, + &ip6_daddr, encap_limit) < 0) + return NF_DROP; + if (nf_flow_encap_push(skb, other_tuple) < 0) return NF_DROP; diff --git 
a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index d8f7bfd60ac6..b1966b68c48a 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -6,6 +6,7 @@ #include <linux/netdevice.h> #include <linux/tc_act/tc_csum.h> #include <net/flow_offload.h> +#include <net/ip_tunnels.h> #include <net/netfilter/nf_flow_table.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_conntrack.h> diff --git a/net/netfilter/nf_flow_table_path.c b/net/netfilter/nf_flow_table_path.c index eb24fe2715dc..6bb9579dcc2a 100644 --- a/net/netfilter/nf_flow_table_path.c +++ b/net/netfilter/nf_flow_table_path.c @@ -2,6 +2,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> +#include <linux/etherdevice.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include <linux/spinlock.h> diff --git a/net/netfilter/nf_log_syslog.c b/net/netfilter/nf_log_syslog.c index 86d5fc5d28e3..41503847d9d7 100644 --- a/net/netfilter/nf_log_syslog.c +++ b/net/netfilter/nf_log_syslog.c @@ -561,7 +561,7 @@ dump_ipv6_packet(struct net *net, struct nf_log_buf *m, /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ nf_log_buf_add(m, "LEN=%zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", - ntohs(ih->payload_len) + sizeof(struct ipv6hdr), + ipv6_payload_len(skb, ih) + sizeof(struct ipv6hdr), (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, ih->hop_limit, (ntohl(*(__be32 *)ih) & 0x000fffff)); diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c index 0f9a559f6207..31474e8c034a 100644 --- a/net/netfilter/nf_nat_ovs.c +++ b/net/netfilter/nf_nat_ovs.c @@ -2,6 +2,9 @@ /* Support nat functions for openvswitch and used by OVS and TC conntrack. */ #include <net/netfilter/nf_nat.h> +#include <net/ipv6.h> +#include <linux/ip.h> +#include <linux/if_vlan.h> /* Modelled after nf_nat_ipv[46]_fn(). * range is only used for new, uninitialized NAT state. 
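Several hunks in this region (ip_vs_prepare_tunneled_skb(), nf_ct_skb_network_trim(), dump_ipv6_packet()) stop reading ntohs(payload_len) directly and go through ipv6_payload_len()/skb_ipv6_payload_len() instead. Those helpers are introduced elsewhere in this series and their bodies are not shown here; the sketch below is a hypothetical stand-in under the assumption that, as with the existing skb_ip_totlen() for IPv4, a zero payload_len marks an oversized (BIG TCP style) packet whose real length has to be recovered from the skb.

#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <net/ipv6.h>

/* Hypothetical model, NOT the helper added by this series: return the
 * effective IPv6 payload length, falling back to the skb-derived size
 * when the header carries payload_len == 0.
 */
static unsigned int ipv6_payload_len_model(const struct sk_buff *skb,
                                           const struct ipv6hdr *ip6h)
{
        unsigned int len = ntohs(ip6h->payload_len);

        if (!len)       /* assumed jumbogram / BIG TCP representation */
                len = skb->len - skb_network_offset(skb) - sizeof(*ip6h);

        return len;
}

Even if that assumption is off in detail, the call sites above read the same way: each one only wants the effective payload length, independent of how oversized packets encode it in the header.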
diff --git a/net/netfilter/nf_nat_proto.c b/net/netfilter/nf_nat_proto.c index b14a434b9561..97c0f841fc96 100644 --- a/net/netfilter/nf_nat_proto.c +++ b/net/netfilter/nf_nat_proto.c @@ -25,6 +25,7 @@ #include <net/ip6_route.h> #include <net/xfrm.h> #include <net/ipv6.h> +#include <net/pptp.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack.h> diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 3fa3f5dfb264..57f57e2fc80a 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -10,6 +10,7 @@ #include <net/netns/generic.h> #include <linux/proc_fs.h> +#include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <linux/netfilter/nf_synproxy.h> diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index be92750e2af3..1ed034a47bd0 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -14,6 +14,7 @@ #include <linux/rhashtable.h> #include <linux/audit.h> #include <linux/netfilter.h> +#include <linux/netfilter_ipv4.h> #include <linux/netfilter/nfnetlink.h> #include <linux/netfilter/nf_tables.h> #include <net/netfilter/nf_flow_table.h> @@ -7269,7 +7270,8 @@ static u32 nft_set_maxsize(const struct nft_set *set) } static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, - const struct nlattr *attr, u32 nlmsg_flags) + const struct nlattr *attr, u32 nlmsg_flags, + bool last) { struct nft_expr *expr_array[NFT_SET_EXPR_MAX] = {}; struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; @@ -7555,6 +7557,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, if (flags) *nft_set_ext_flags(ext) = flags; + if (last) + elem.flags = NFT_SET_ELEM_INTERNAL_LAST; + else + elem.flags = 0; + if (obj) *nft_set_ext_obj(ext) = obj; @@ -7635,6 +7642,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, * and an existing one. */ err = -EEXIST; + } else if (err == -ECANCELED) { + /* ECANCELED reports an existing nul-element in + * interval sets. 
+ */ + err = 0; } goto err_element_clash; } @@ -7713,7 +7725,8 @@ static int nf_tables_newsetelem(struct sk_buff *skb, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, NULL, nla); nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags); + err = nft_add_set_elem(&ctx, set, attr, info->nlh->nlmsg_flags, + nla_is_last(attr, rem)); if (err < 0) { NL_SET_BAD_ATTR(extack, attr); return err; @@ -7806,7 +7819,8 @@ static bool nft_trans_elems_new_abort(const struct nft_ctx *ctx, continue; } - if (!te->set->ops->abort || nft_setelem_is_catchall(te->set, te->elems[i].priv)) + if (!te->set->ops->abort_skip_removal || + nft_setelem_is_catchall(te->set, te->elems[i].priv)) nft_setelem_remove(ctx->net, te->set, te->elems[i].priv); if (!nft_setelem_is_catchall(te->set, te->elems[i].priv)) @@ -7836,7 +7850,7 @@ static void nft_trans_elems_destroy_abort(const struct nft_ctx *ctx, } static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, - const struct nlattr *attr) + const struct nlattr *attr, bool last) { struct nlattr *nla[NFTA_SET_ELEM_MAX + 1]; struct nft_set_ext_tmpl tmpl; @@ -7904,6 +7918,11 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, if (flags) *nft_set_ext_flags(ext) = flags; + if (last) + elem.flags = NFT_SET_ELEM_INTERNAL_LAST; + else + elem.flags = 0; + trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set); if (trans == NULL) goto fail_trans; @@ -8051,7 +8070,8 @@ static int nf_tables_delsetelem(struct sk_buff *skb, return nft_set_flush(&ctx, set, genmask); nla_for_each_nested(attr, nla[NFTA_SET_ELEM_LIST_ELEMENTS], rem) { - err = nft_del_setelem(&ctx, set, attr); + err = nft_del_setelem(&ctx, set, attr, + nla_is_last(attr, rem)); if (err == -ENOENT && NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYSETELEM) continue; @@ -11536,6 +11556,13 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, ret = __nf_tables_abort(net, action); nft_gc_seq_end(nft_net, gc_seq); + if (action == NFNL_ABORT_NONE) { + struct nft_table *table; + + list_for_each_entry(table, &nft_net->tables, list) + table->validate_state = NFT_VALIDATE_SKIP; + } + WARN_ON_ONCE(!list_empty(&nft_net->commit_list)); /* module autoload needs to happen after GC sequence update because it diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 8b7b39d8a109..f1c8049861a6 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -30,6 +30,8 @@ #include <linux/netfilter/nf_conntrack_common.h> #include <linux/list.h> #include <linux/cgroup-defs.h> +#include <linux/rhashtable.h> +#include <linux/jhash.h> #include <net/gso.h> #include <net/sock.h> #include <net/tcp_states.h> @@ -47,6 +49,8 @@ #endif #define NFQNL_QMAX_DEFAULT 1024 +#define NFQNL_HASH_MIN 1024 +#define NFQNL_HASH_MAX 1048576 /* We're using struct nlattr which has 16bit nla_len. Note that nla_len * includes the header length. 
Thus, the maximum packet length that we @@ -56,6 +60,26 @@ */ #define NFQNL_MAX_COPY_RANGE (0xffff - NLA_HDRLEN) +/* Composite key for packet lookup: (net, queue_num, packet_id) */ +struct nfqnl_packet_key { + possible_net_t net; + u32 packet_id; + u16 queue_num; +} __aligned(sizeof(u32)); /* jhash2 requires 32-bit alignment */ + +/* Global rhashtable - one for entire system, all netns */ +static struct rhashtable nfqnl_packet_map __read_mostly; + +/* Helper to initialize composite key */ +static inline void nfqnl_init_key(struct nfqnl_packet_key *key, + struct net *net, u32 packet_id, u16 queue_num) +{ + memset(key, 0, sizeof(*key)); + write_pnet(&key->net, net); + key->packet_id = packet_id; + key->queue_num = queue_num; +} + struct nfqnl_instance { struct hlist_node hlist; /* global list of queues */ struct rcu_head rcu; @@ -100,6 +124,39 @@ static inline u_int8_t instance_hashfn(u_int16_t queue_num) return ((queue_num >> 8) ^ queue_num) % INSTANCE_BUCKETS; } +/* Extract composite key from nf_queue_entry for hashing */ +static u32 nfqnl_packet_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct nf_queue_entry *entry = data; + struct nfqnl_packet_key key; + + nfqnl_init_key(&key, entry->state.net, entry->id, entry->queue_num); + + return jhash2((u32 *)&key, sizeof(key) / sizeof(u32), seed); +} + +/* Compare stack-allocated key against entry */ +static int nfqnl_packet_obj_cmpfn(struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct nfqnl_packet_key *key = arg->key; + const struct nf_queue_entry *entry = obj; + + return !net_eq(entry->state.net, read_pnet(&key->net)) || + entry->queue_num != key->queue_num || + entry->id != key->packet_id; +} + +static const struct rhashtable_params nfqnl_rhashtable_params = { + .head_offset = offsetof(struct nf_queue_entry, hash_node), + .key_len = sizeof(struct nfqnl_packet_key), + .obj_hashfn = nfqnl_packet_obj_hashfn, + .obj_cmpfn = nfqnl_packet_obj_cmpfn, + .automatic_shrinking = true, + .min_size = NFQNL_HASH_MIN, + .max_size = NFQNL_HASH_MAX, +}; + static struct nfqnl_instance * instance_lookup(struct nfnl_queue_net *q, u_int16_t queue_num) { @@ -121,17 +178,9 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) unsigned int h; int err; - spin_lock(&q->instances_lock); - if (instance_lookup(q, queue_num)) { - err = -EEXIST; - goto out_unlock; - } - - inst = kzalloc(sizeof(*inst), GFP_ATOMIC); - if (!inst) { - err = -ENOMEM; - goto out_unlock; - } + inst = kzalloc(sizeof(*inst), GFP_KERNEL_ACCOUNT); + if (!inst) + return ERR_PTR(-ENOMEM); inst->queue_num = queue_num; inst->peer_portid = portid; @@ -141,9 +190,15 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); + spin_lock(&q->instances_lock); + if (instance_lookup(q, queue_num)) { + err = -EEXIST; + goto out_unlock; + } + if (!try_module_get(THIS_MODULE)) { err = -EAGAIN; - goto out_free; + goto out_unlock; } h = instance_hashfn(queue_num); @@ -153,10 +208,9 @@ instance_create(struct nfnl_queue_net *q, u_int16_t queue_num, u32 portid) return inst; -out_free: - kfree(inst); out_unlock: spin_unlock(&q->instances_lock); + kfree(inst); return ERR_PTR(err); } @@ -191,33 +245,45 @@ instance_destroy(struct nfnl_queue_net *q, struct nfqnl_instance *inst) spin_unlock(&q->instances_lock); } -static inline void +static int __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { - list_add_tail(&entry->list, &queue->queue_list); - 
queue->queue_total++; + int err; + + entry->queue_num = queue->queue_num; + + err = rhashtable_insert_fast(&nfqnl_packet_map, &entry->hash_node, + nfqnl_rhashtable_params); + if (unlikely(err)) + return err; + + list_add_tail(&entry->list, &queue->queue_list); + queue->queue_total++; + + return 0; } static void __dequeue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) { + rhashtable_remove_fast(&nfqnl_packet_map, &entry->hash_node, + nfqnl_rhashtable_params); list_del(&entry->list); queue->queue_total--; } static struct nf_queue_entry * -find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) +find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id, + struct net *net) { - struct nf_queue_entry *entry = NULL, *i; + struct nfqnl_packet_key key; + struct nf_queue_entry *entry; - spin_lock_bh(&queue->lock); + nfqnl_init_key(&key, net, id, queue->queue_num); - list_for_each_entry(i, &queue->queue_list, list) { - if (i->id == id) { - entry = i; - break; - } - } + spin_lock_bh(&queue->lock); + entry = rhashtable_lookup_fast(&nfqnl_packet_map, &key, + nfqnl_rhashtable_params); if (entry) __dequeue_entry(queue, entry); @@ -369,6 +435,34 @@ next_hook: nf_queue_entry_free(entry); } +/* return true if the entry has an unconfirmed conntrack attached that isn't owned by us + * exclusively. + */ +static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry, bool *is_unconfirmed) +{ +#if IS_ENABLED(CONFIG_NF_CONNTRACK) + struct nf_conn *ct = (void *)skb_nfct(entry->skb); + + if (!ct || nf_ct_is_confirmed(ct)) + return false; + + if (is_unconfirmed) + *is_unconfirmed = true; + + /* in some cases skb_clone() can occur after initial conntrack + * pickup, but conntrack assumes exclusive skb->_nfct ownership for + * unconfirmed entries. + * + * This happens for br_netfilter and with ip multicast routing. + * This can't be solved with serialization here because one clone + * could have been queued for local delivery or could be transmitted + * in parallel on another CPU. + */ + return refcount_read(&ct->ct_general.use) > 1; +#endif + return false; +} + static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) { const struct nf_ct_hook *ct_hook; @@ -396,6 +490,24 @@ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict) break; } } + + if (verdict != NF_DROP && entry->nf_ct_is_unconfirmed) { + /* If first queued segment was already reinjected then + * there is a good chance the ct entry is now confirmed. + * + * Handle the rare cases: + * - out-of-order verdict + * - threaded userspace reinjecting in parallel + * - first segment was dropped + * + * In all of those cases we can't handle this packet + * because we can't be sure that another CPU won't modify + * nf_conn->ext in parallel which isn't allowed. 
+ */ + if (nf_ct_drop_unconfirmed(entry, NULL)) + verdict = NF_DROP; + } + nf_reinject(entry, verdict); } @@ -407,8 +519,7 @@ nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) spin_lock_bh(&queue->lock); list_for_each_entry_safe(entry, next, &queue->queue_list, list) { if (!cmpfn || cmpfn(entry, data)) { - list_del(&entry->list); - queue->queue_total--; + __dequeue_entry(queue, entry); nfqnl_reinject(entry, NF_DROP); } } @@ -826,49 +937,6 @@ nlmsg_failure: return NULL; } -static bool nf_ct_drop_unconfirmed(const struct nf_queue_entry *entry) -{ -#if IS_ENABLED(CONFIG_NF_CONNTRACK) - static const unsigned long flags = IPS_CONFIRMED | IPS_DYING; - struct nf_conn *ct = (void *)skb_nfct(entry->skb); - unsigned long status; - unsigned int use; - - if (!ct) - return false; - - status = READ_ONCE(ct->status); - if ((status & flags) == IPS_DYING) - return true; - - if (status & IPS_CONFIRMED) - return false; - - /* in some cases skb_clone() can occur after initial conntrack - * pickup, but conntrack assumes exclusive skb->_nfct ownership for - * unconfirmed entries. - * - * This happens for br_netfilter and with ip multicast routing. - * We can't be solved with serialization here because one clone could - * have been queued for local delivery. - */ - use = refcount_read(&ct->ct_general.use); - if (likely(use == 1)) - return false; - - /* Can't decrement further? Exclusive ownership. */ - if (!refcount_dec_not_one(&ct->ct_general.use)) - return false; - - skb_set_nfct(entry->skb, 0); - /* No nf_ct_put(): we already decremented .use and it cannot - * drop down to 0. - */ - return true; -#endif - return false; -} - static int __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, struct nf_queue_entry *entry) @@ -885,26 +953,23 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, } spin_lock_bh(&queue->lock); - if (nf_ct_drop_unconfirmed(entry)) - goto err_out_free_nskb; + if (queue->queue_total >= queue->queue_maxlen) + goto err_out_queue_drop; - if (queue->queue_total >= queue->queue_maxlen) { - if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { - failopen = 1; - err = 0; - } else { - queue->queue_dropped++; - net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", - queue->queue_total); - } - goto err_out_free_nskb; - } entry->id = ++queue->id_sequence; *packet_id_ptr = htonl(entry->id); + /* Insert into hash BEFORE unicast. If failure don't send to userspace. 
*/ + err = __enqueue_entry(queue, entry); + if (unlikely(err)) + goto err_out_queue_drop; + /* nfnetlink_unicast will either free the nskb or add it to a socket */ err = nfnetlink_unicast(nskb, net, queue->peer_portid); if (err < 0) { + /* Unicast failed - remove entry we just inserted */ + __dequeue_entry(queue, entry); + if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { failopen = 1; err = 0; @@ -914,12 +979,22 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue, goto err_out_unlock; } - __enqueue_entry(queue, entry); - spin_unlock_bh(&queue->lock); return 0; -err_out_free_nskb: +err_out_queue_drop: + if (queue->flags & NFQA_CFG_F_FAIL_OPEN) { + failopen = 1; + err = 0; + } else { + queue->queue_dropped++; + + if (queue->queue_total >= queue->queue_maxlen) + net_warn_ratelimited("nf_queue: full at %d entries, dropping packets(s)\n", + queue->queue_total); + else + net_warn_ratelimited("nf_queue: hash insert failed: %d\n", err); + } kfree_skb(nskb); err_out_unlock: spin_unlock_bh(&queue->lock); @@ -998,9 +1073,10 @@ __nfqnl_enqueue_packet_gso(struct net *net, struct nfqnl_instance *queue, static int nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) { - unsigned int queued; - struct nfqnl_instance *queue; struct sk_buff *skb, *segs, *nskb; + bool ct_is_unconfirmed = false; + struct nfqnl_instance *queue; + unsigned int queued; int err = -ENOBUFS; struct net *net = entry->state.net; struct nfnl_queue_net *q = nfnl_queue_pernet(net); @@ -1024,6 +1100,15 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) break; } + /* Check if someone already holds another reference to + * unconfirmed ct. If so, we cannot queue the skb: + * concurrent modifications of nf_conn->ext are not + * allowed and we can't know if another CPU isn't + * processing the same nf_conn entry in parallel. + */ + if (nf_ct_drop_unconfirmed(entry, &ct_is_unconfirmed)) + return -EINVAL; + if (!skb_is_gso(skb) || ((queue->flags & NFQA_CFG_F_GSO) && !skb_is_gso_sctp(skb))) return __nfqnl_enqueue_packet(net, queue, entry); @@ -1037,7 +1122,23 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) goto out_err; queued = 0; err = 0; + skb_list_walk_safe(segs, segs, nskb) { + if (ct_is_unconfirmed && queued > 0) { + /* skb_gso_segment() increments the ct refcount. + * This is a problem for unconfirmed (not in hash) + * entries, those can race when reinjections happen + * in parallel. + * + * Annotate this for all queued entries except the + * first one. + * + * As long as the first one is reinjected first it + * will do the confirmation for us. + */ + entry->nf_ct_is_unconfirmed = ct_is_unconfirmed; + } + if (err == 0) err = __nfqnl_enqueue_packet_gso(net, queue, segs, entry); @@ -1430,7 +1531,7 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info, verdict = ntohl(vhdr->verdict); - entry = find_dequeue_entry(queue, ntohl(vhdr->id)); + entry = find_dequeue_entry(queue, ntohl(vhdr->id), info->net); if (entry == NULL) return -ENOENT; @@ -1498,7 +1599,8 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, struct nfqnl_msg_config_cmd *cmd = NULL; struct nfqnl_instance *queue; __u32 flags = 0, mask = 0; - int ret = 0; + + WARN_ON_ONCE(!lockdep_nfnl_is_held(NFNL_SUBSYS_QUEUE)); if (nfqa[NFQA_CFG_CMD]) { cmd = nla_data(nfqa[NFQA_CFG_CMD]); @@ -1544,47 +1646,44 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, } } + /* Lookup queue under RCU. 
After peer_portid check (or for new queue + * in BIND case), the queue is owned by the socket sending this message. + * A socket cannot simultaneously send a message and close, so while + * processing this CONFIG message, nfqnl_rcv_nl_event() (triggered by + * socket close) cannot destroy this queue. Safe to use without RCU. + */ rcu_read_lock(); queue = instance_lookup(q, queue_num); if (queue && queue->peer_portid != NETLINK_CB(skb).portid) { - ret = -EPERM; - goto err_out_unlock; + rcu_read_unlock(); + return -EPERM; } + rcu_read_unlock(); if (cmd != NULL) { switch (cmd->command) { case NFQNL_CFG_CMD_BIND: - if (queue) { - ret = -EBUSY; - goto err_out_unlock; - } - queue = instance_create(q, queue_num, - NETLINK_CB(skb).portid); - if (IS_ERR(queue)) { - ret = PTR_ERR(queue); - goto err_out_unlock; - } + if (queue) + return -EBUSY; + queue = instance_create(q, queue_num, NETLINK_CB(skb).portid); + if (IS_ERR(queue)) + return PTR_ERR(queue); break; case NFQNL_CFG_CMD_UNBIND: - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } + if (!queue) + return -ENODEV; instance_destroy(q, queue); - goto err_out_unlock; + return 0; case NFQNL_CFG_CMD_PF_BIND: case NFQNL_CFG_CMD_PF_UNBIND: break; default: - ret = -ENOTSUPP; - goto err_out_unlock; + return -EOPNOTSUPP; } } - if (!queue) { - ret = -ENODEV; - goto err_out_unlock; - } + if (!queue) + return -ENODEV; if (nfqa[NFQA_CFG_PARAMS]) { struct nfqnl_msg_config_params *params = @@ -1609,9 +1708,7 @@ static int nfqnl_recv_config(struct sk_buff *skb, const struct nfnl_info *info, spin_unlock_bh(&queue->lock); } -err_out_unlock: - rcu_read_unlock(); - return ret; + return 0; } static const struct nfnl_callback nfqnl_cb[NFQNL_MSG_MAX] = { @@ -1781,10 +1878,14 @@ static int __init nfnetlink_queue_init(void) { int status; + status = rhashtable_init(&nfqnl_packet_map, &nfqnl_rhashtable_params); + if (status < 0) + return status; + status = register_pernet_subsys(&nfnl_queue_net_ops); if (status < 0) { pr_err("failed to register pernet ops\n"); - goto out; + goto cleanup_rhashtable; } netlink_register_notifier(&nfqnl_rtnl_notifier); @@ -1809,7 +1910,8 @@ cleanup_netlink_subsys: cleanup_netlink_notifier: netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); -out: +cleanup_rhashtable: + rhashtable_destroy(&nfqnl_packet_map); return status; } @@ -1821,6 +1923,8 @@ static void __exit nfnetlink_queue_fini(void) netlink_unregister_notifier(&nfqnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_queue_net_ops); + rhashtable_destroy(&nfqnl_packet_map); + rcu_barrier(); /* Wait for completion of call_rcu()'s */ } diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index 72711d62fddf..08f620311b03 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -134,7 +134,8 @@ static void nft_target_eval_bridge(const struct nft_expr *expr, } static const struct nla_policy nft_target_policy[NFTA_TARGET_MAX + 1] = { - [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING }, + [NFTA_TARGET_NAME] = { .type = NLA_NUL_STRING, + .len = XT_EXTENSION_MAXNAMELEN, }, [NFTA_TARGET_REV] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_TARGET_INFO] = { .type = NLA_BINARY }, }; @@ -434,7 +435,8 @@ static void nft_match_eval(const struct nft_expr *expr, } static const struct nla_policy nft_match_policy[NFTA_MATCH_MAX + 1] = { - [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING }, + [NFTA_MATCH_NAME] = { .type = NLA_NUL_STRING, + .len = XT_EXTENSION_MAXNAMELEN }, [NFTA_MATCH_REV] = NLA_POLICY_MAX(NLA_BE32, 255), 
[NFTA_MATCH_INFO] = { .type = NLA_BINARY }, }; @@ -693,7 +695,12 @@ static int nfnl_compat_get_rcu(struct sk_buff *skb, name = nla_data(tb[NFTA_COMPAT_NAME]); rev = ntohl(nla_get_be32(tb[NFTA_COMPAT_REV])); - target = ntohl(nla_get_be32(tb[NFTA_COMPAT_TYPE])); + /* x_tables api checks for 'target == 1' to mean target, + * everything else means 'match'. + * In x_tables world, the number is set by kernel, not + * userspace. + */ + target = nla_get_be32(tb[NFTA_COMPAT_TYPE]) == htonl(1); switch(family) { case AF_INET: diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index cc7325329496..0d70325280cc 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -117,8 +117,8 @@ static void nft_counter_reset(struct nft_counter_percpu_priv *priv, nft_sync = this_cpu_ptr(&nft_counter_sync); u64_stats_update_begin(nft_sync); - u64_stats_add(&this_cpu->packets, -total->packets); - u64_stats_add(&this_cpu->bytes, -total->bytes); + u64_stats_sub(&this_cpu->packets, total->packets); + u64_stats_sub(&this_cpu->bytes, total->bytes); u64_stats_update_end(nft_sync); local_bh_enable(); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index b8f76c9057fd..179d0e59e2b5 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/etherdevice.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index ba01ce75d6de..739b992bde59 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -619,15 +619,20 @@ static struct nft_elem_priv * nft_hash_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { + const u32 *key = (const u32 *)&elem->key.val; struct nft_hash *priv = nft_set_priv(set); u8 genmask = nft_genmask_cur(net); struct nft_hash_elem *he; u32 hash; - hash = jhash(elem->key.val.data, set->klen, priv->seed); + if (set->klen == 4) + hash = jhash_1word(*key, priv->seed); + else + hash = jhash(key, set->klen, priv->seed); + hash = reciprocal_scale(hash, priv->buckets); hlist_for_each_entry_rcu(he, &priv->table[hash], node) { - if (!memcmp(nft_set_ext_key(&he->ext), elem->key.val.data, set->klen) && + if (!memcmp(nft_set_ext_key(&he->ext), key, set->klen) && nft_set_elem_active(&he->ext, genmask)) return &he->priv; } diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 6d77a5f0088a..18e1903b1d3d 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -2370,6 +2370,7 @@ const struct nft_set_type nft_set_pipapo_type = { .gc_init = nft_pipapo_gc_init, .commit = nft_pipapo_commit, .abort = nft_pipapo_abort, + .abort_skip_removal = true, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; @@ -2394,6 +2395,7 @@ const struct nft_set_type nft_set_pipapo_avx2_type = { .gc_init = nft_pipapo_gc_init, .commit = nft_pipapo_commit, .abort = nft_pipapo_abort, + .abort_skip_removal = true, .elemsize = offsetof(struct nft_pipapo_elem, ext), }, }; diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index ca594161b840..644d4b916705 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -10,21 +10,41 @@ #include <linux/module.h> #include <linux/list.h> #include <linux/rbtree.h> +#include <linux/bsearch.h> #include <linux/netlink.h> #include <linux/netfilter.h> #include 
<linux/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> +struct nft_array_interval { + struct nft_set_ext *from; + struct nft_set_ext *to; +}; + +struct nft_array { + u32 max_intervals; + u32 num_intervals; + struct nft_array_interval *intervals; + struct rcu_head rcu_head; +}; + struct nft_rbtree { struct rb_root root; rwlock_t lock; - seqcount_rwlock_t count; + struct nft_array __rcu *array; + struct nft_array *array_next; + unsigned long start_rbe_cookie; unsigned long last_gc; + struct list_head expired; + u64 last_tstamp; }; struct nft_rbtree_elem { struct nft_elem_priv priv; - struct rb_node node; + union { + struct rb_node node; + struct list_head list; + }; struct nft_set_ext ext; }; @@ -39,6 +59,13 @@ static bool nft_rbtree_interval_start(const struct nft_rbtree_elem *rbe) return !nft_rbtree_interval_end(rbe); } +static bool nft_rbtree_interval_null(const struct nft_set *set, + const struct nft_rbtree_elem *rbe) +{ + return (!memchr_inv(nft_set_ext_key(&rbe->ext), 0, set->klen) && + nft_rbtree_interval_end(rbe)); +} + static int nft_rbtree_cmp(const struct nft_set *set, const struct nft_rbtree_elem *e1, const struct nft_rbtree_elem *e2) @@ -47,67 +74,33 @@ static int nft_rbtree_cmp(const struct nft_set *set, set->klen); } -static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) -{ - return nft_set_elem_expired(&rbe->ext); -} +struct nft_array_lookup_ctx { + const u32 *key; + u32 klen; +}; -static const struct nft_set_ext * -__nft_rbtree_lookup(const struct net *net, const struct nft_set *set, - const u32 *key, unsigned int seq) +static int nft_array_lookup_cmp(const void *pkey, const void *entry) { - struct nft_rbtree *priv = nft_set_priv(set); - const struct nft_rbtree_elem *rbe, *interval = NULL; - u8 genmask = nft_genmask_cur(net); - const struct rb_node *parent; - int d; + const struct nft_array_interval *interval = entry; + const struct nft_array_lookup_ctx *ctx = pkey; + int a, b; - parent = rcu_dereference_raw(priv->root.rb_node); - while (parent != NULL) { - if (read_seqcount_retry(&priv->count, seq)) - return NULL; - - rbe = rb_entry(parent, struct nft_rbtree_elem, node); + if (!interval->from) + return 1; - d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen); - if (d < 0) { - parent = rcu_dereference_raw(parent->rb_left); - if (interval && - !nft_rbtree_cmp(set, rbe, interval) && - nft_rbtree_interval_end(rbe) && - nft_rbtree_interval_start(interval)) - continue; - if (nft_set_elem_active(&rbe->ext, genmask) && - !nft_rbtree_elem_expired(rbe)) - interval = rbe; - } else if (d > 0) - parent = rcu_dereference_raw(parent->rb_right); - else { - if (!nft_set_elem_active(&rbe->ext, genmask)) { - parent = rcu_dereference_raw(parent->rb_left); - continue; - } - - if (nft_rbtree_elem_expired(rbe)) - return NULL; - - if (nft_rbtree_interval_end(rbe)) { - if (nft_set_is_anonymous(set)) - return NULL; - parent = rcu_dereference_raw(parent->rb_left); - interval = NULL; - continue; - } + a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen); + if (!interval->to) + b = -1; + else + b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen); - return &rbe->ext; - } - } + if (a >= 0 && b < 0) + return 0; - if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_rbtree_interval_start(interval)) - return &interval->ext; + if (a < 0) + return -1; - return NULL; + return 1; } INDIRECT_CALLABLE_SCOPE @@ -116,83 +109,57 @@ nft_rbtree_lookup(const struct net *net, const struct nft_set *set, const u32 *key) { struct nft_rbtree *priv = 
nft_set_priv(set); - unsigned int seq = read_seqcount_begin(&priv->count); - const struct nft_set_ext *ext; - - ext = __nft_rbtree_lookup(net, set, key, seq); - if (ext || !read_seqcount_retry(&priv->count, seq)) - return ext; - - read_lock_bh(&priv->lock); - seq = read_seqcount_begin(&priv->count); - ext = __nft_rbtree_lookup(net, set, key, seq); - read_unlock_bh(&priv->lock); - - return ext; + struct nft_array *array = rcu_dereference(priv->array); + const struct nft_array_interval *interval; + struct nft_array_lookup_ctx ctx = { + .key = key, + .klen = set->klen, + }; + + if (!array) + return NULL; + + interval = bsearch(&ctx, array->intervals, array->num_intervals, + sizeof(struct nft_array_interval), + nft_array_lookup_cmp); + if (!interval || nft_set_elem_expired(interval->from)) + return NULL; + + return interval->from; } -static bool __nft_rbtree_get(const struct net *net, const struct nft_set *set, - const u32 *key, struct nft_rbtree_elem **elem, - unsigned int seq, unsigned int flags, u8 genmask) -{ - struct nft_rbtree_elem *rbe, *interval = NULL; - struct nft_rbtree *priv = nft_set_priv(set); - const struct rb_node *parent; - const void *this; - int d; - - parent = rcu_dereference_raw(priv->root.rb_node); - while (parent != NULL) { - if (read_seqcount_retry(&priv->count, seq)) - return false; - - rbe = rb_entry(parent, struct nft_rbtree_elem, node); - - this = nft_set_ext_key(&rbe->ext); - d = memcmp(this, key, set->klen); - if (d < 0) { - parent = rcu_dereference_raw(parent->rb_left); - if (!(flags & NFT_SET_ELEM_INTERVAL_END)) - interval = rbe; - } else if (d > 0) { - parent = rcu_dereference_raw(parent->rb_right); - if (flags & NFT_SET_ELEM_INTERVAL_END) - interval = rbe; - } else { - if (!nft_set_elem_active(&rbe->ext, genmask)) { - parent = rcu_dereference_raw(parent->rb_left); - continue; - } +struct nft_array_get_ctx { + const u32 *key; + unsigned int flags; + u32 klen; +}; - if (nft_set_elem_expired(&rbe->ext)) - return false; +static int nft_array_get_cmp(const void *pkey, const void *entry) +{ + const struct nft_array_interval *interval = entry; + const struct nft_array_get_ctx *ctx = pkey; + int a, b; - if (!nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) || - (*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END) == - (flags & NFT_SET_ELEM_INTERVAL_END)) { - *elem = rbe; - return true; - } + if (!interval->from) + return 1; - if (nft_rbtree_interval_end(rbe)) - interval = NULL; + a = memcmp(ctx->key, nft_set_ext_key(interval->from), ctx->klen); + if (!interval->to) + b = -1; + else + b = memcmp(ctx->key, nft_set_ext_key(interval->to), ctx->klen); - parent = rcu_dereference_raw(parent->rb_left); - } + if (a >= 0) { + if (ctx->flags & NFT_SET_ELEM_INTERVAL_END && b <= 0) + return 0; + else if (b < 0) + return 0; } - if (set->flags & NFT_SET_INTERVAL && interval != NULL && - nft_set_elem_active(&interval->ext, genmask) && - !nft_set_elem_expired(&interval->ext) && - ((!nft_rbtree_interval_end(interval) && - !(flags & NFT_SET_ELEM_INTERVAL_END)) || - (nft_rbtree_interval_end(interval) && - (flags & NFT_SET_ELEM_INTERVAL_END)))) { - *elem = interval; - return true; - } + if (a < 0) + return -1; - return false; + return 1; } static struct nft_elem_priv * @@ -200,34 +167,41 @@ nft_rbtree_get(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, unsigned int flags) { struct nft_rbtree *priv = nft_set_priv(set); - unsigned int seq = read_seqcount_begin(&priv->count); - struct nft_rbtree_elem *rbe = ERR_PTR(-ENOENT); - const u32 *key = (const 
u32 *)&elem->key.val; - u8 genmask = nft_genmask_cur(net); - bool ret; - - ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); - if (ret || !read_seqcount_retry(&priv->count, seq)) - return &rbe->priv; - - read_lock_bh(&priv->lock); - seq = read_seqcount_begin(&priv->count); - ret = __nft_rbtree_get(net, set, key, &rbe, seq, flags, genmask); - read_unlock_bh(&priv->lock); - - if (!ret) + struct nft_array *array = rcu_dereference(priv->array); + const struct nft_array_interval *interval; + struct nft_array_get_ctx ctx = { + .key = (const u32 *)&elem->key.val, + .flags = flags, + .klen = set->klen, + }; + struct nft_rbtree_elem *rbe; + + if (!array) return ERR_PTR(-ENOENT); + interval = bsearch(&ctx, array->intervals, array->num_intervals, + sizeof(struct nft_array_interval), nft_array_get_cmp); + if (!interval || nft_set_elem_expired(interval->from)) + return ERR_PTR(-ENOENT); + + if (flags & NFT_SET_ELEM_INTERVAL_END) + rbe = container_of(interval->to, struct nft_rbtree_elem, ext); + else + rbe = container_of(interval->from, struct nft_rbtree_elem, ext); + return &rbe->priv; } -static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set, - struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe) +static void nft_rbtree_gc_elem_move(struct net *net, struct nft_set *set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) { lockdep_assert_held_write(&priv->lock); nft_setelem_data_deactivate(net, set, &rbe->priv); rb_erase(&rbe->node, &priv->root); + + /* collected later on in commit callback */ + list_add(&rbe->list, &priv->expired); } static const struct nft_rbtree_elem * @@ -238,11 +212,6 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, struct rb_node *prev = rb_prev(&rbe->node); struct net *net = read_pnet(&set->net); struct nft_rbtree_elem *rbe_prev; - struct nft_trans_gc *gc; - - gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); - if (!gc) - return ERR_PTR(-ENOMEM); /* search for end interval coming before this element. * end intervals don't carry a timeout extension, they @@ -260,28 +229,10 @@ nft_rbtree_gc_elem(const struct nft_set *__set, struct nft_rbtree *priv, rbe_prev = NULL; if (prev) { rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); - nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev); - - /* There is always room in this trans gc for this element, - * memory allocation never actually happens, hence, the warning - * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, - * this is synchronous gc which never fails. - */ - gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); - if (WARN_ON_ONCE(!gc)) - return ERR_PTR(-ENOMEM); - - nft_trans_gc_elem_add(gc, rbe_prev); + nft_rbtree_gc_elem_move(net, set, priv, rbe_prev); } - nft_rbtree_gc_elem_remove(net, set, priv, rbe); - gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); - if (WARN_ON_ONCE(!gc)) - return ERR_PTR(-ENOMEM); - - nft_trans_gc_elem_add(gc, rbe); - - nft_trans_gc_queue_sync_done(gc); + nft_rbtree_gc_elem_move(net, set, priv, rbe); return rbe_prev; } @@ -302,16 +253,107 @@ static bool nft_rbtree_update_first(const struct nft_set *set, return false; } +/* Only for anonymous sets which do not allow updates, all element are active. 
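/* The lookup above replaces the rbtree descent with a bsearch() over a
 * sorted array of non-overlapping [from, to) intervals.  The comparator
 * below sketches that idea with plain 32-bit keys instead of the kernel's
 * memcmp()-ed set keys; struct ival and the ival_* names are illustrative.
 */
#include <stdint.h>
#include <stdlib.h>

struct ival {
	uint32_t from;		/* inclusive lower bound */
	uint32_t to;		/* exclusive upper bound, 0 == open-ended */
};

/* Return 0 when from <= key < to, <0 when the key sorts before the interval,
 * >0 when it sorts after it, so bsearch() converges on a single entry.
 */
static int ival_cmp(const void *pkey, const void *pent)
{
	uint32_t key = *(const uint32_t *)pkey;
	const struct ival *iv = pent;

	if (key < iv->from)
		return -1;
	if (iv->to && key >= iv->to)
		return 1;
	return 0;
}

static const struct ival *ival_lookup(const struct ival *tbl, size_t n,
				      uint32_t key)
{
	return bsearch(&key, tbl, n, sizeof(*tbl), ival_cmp);
}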
*/ +static struct nft_rbtree_elem *nft_rbtree_prev_active(struct nft_rbtree_elem *rbe) +{ + struct rb_node *node; + + node = rb_prev(&rbe->node); + if (!node) + return NULL; + + return rb_entry(node, struct nft_rbtree_elem, node); +} + +static struct nft_rbtree_elem * +__nft_rbtree_next_active(struct rb_node *node, u8 genmask) +{ + struct nft_rbtree_elem *next_rbe; + + while (node) { + next_rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (!nft_set_elem_active(&next_rbe->ext, genmask)) { + node = rb_next(node); + continue; + } + + return next_rbe; + } + + return NULL; +} + +static struct nft_rbtree_elem * +nft_rbtree_next_active(struct nft_rbtree_elem *rbe, u8 genmask) +{ + return __nft_rbtree_next_active(rb_next(&rbe->node), genmask); +} + +static void nft_rbtree_maybe_reset_start_cookie(struct nft_rbtree *priv, + u64 tstamp) +{ + if (priv->last_tstamp != tstamp) { + priv->start_rbe_cookie = 0; + priv->last_tstamp = tstamp; + } +} + +static void nft_rbtree_set_start_cookie(struct nft_rbtree *priv, + const struct nft_rbtree_elem *rbe) +{ + priv->start_rbe_cookie = (unsigned long)rbe; +} + +static void nft_rbtree_set_start_cookie_open(struct nft_rbtree *priv, + const struct nft_rbtree_elem *rbe, + unsigned long open_interval) +{ + priv->start_rbe_cookie = (unsigned long)rbe | open_interval; +} + +#define NFT_RBTREE_OPEN_INTERVAL 1UL + +static bool nft_rbtree_cmp_start_cookie(struct nft_rbtree *priv, + const struct nft_rbtree_elem *rbe) +{ + return (priv->start_rbe_cookie & ~NFT_RBTREE_OPEN_INTERVAL) == (unsigned long)rbe; +} + +static bool nft_rbtree_insert_same_interval(const struct net *net, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + u8 genmask = nft_genmask_next(net); + struct nft_rbtree_elem *next_rbe; + + if (!priv->start_rbe_cookie) + return true; + + next_rbe = nft_rbtree_next_active(rbe, genmask); + if (next_rbe) { + /* Closest start element differs from last element added. */ + if (nft_rbtree_interval_start(next_rbe) && + nft_rbtree_cmp_start_cookie(priv, next_rbe)) { + priv->start_rbe_cookie = 0; + return true; + } + } + + priv->start_rbe_cookie = 0; + + return false; +} + static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, struct nft_rbtree_elem *new, - struct nft_elem_priv **elem_priv) + struct nft_elem_priv **elem_priv, u64 tstamp, bool last) { - struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; + struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL, *rbe_prev; struct rb_node *node, *next, *parent, **p, *first = NULL; struct nft_rbtree *priv = nft_set_priv(set); u8 cur_genmask = nft_genmask_cur(net); u8 genmask = nft_genmask_next(net); - u64 tstamp = nft_net_tstamp(net); + unsigned long open_interval = 0; int d; /* Descend the tree to search for an existing element greater than the @@ -417,12 +459,46 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, } } + if (nft_rbtree_interval_null(set, new)) { + priv->start_rbe_cookie = 0; + } else if (nft_rbtree_interval_start(new) && priv->start_rbe_cookie) { + if (nft_set_is_anonymous(set)) { + priv->start_rbe_cookie = 0; + } else if (priv->start_rbe_cookie & NFT_RBTREE_OPEN_INTERVAL) { + /* Previous element is an open interval that partially + * overlaps with an existing non-open interval. + */ + return -ENOTEMPTY; + } + } + /* - new start element matching existing start element: full overlap * reported as -EEXIST, cleared by caller if NLM_F_EXCL is not given. 
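/* The start cookie used above stores the address of the last start element
 * and reuses its always-zero low bit as an "open interval" marker.  A
 * standalone sketch of the same encode/compare trick; the names below are
 * placeholders, only the bit layout matches the cookie handling above.
 */
#include <linux/types.h>

#define SKETCH_OPEN_INTERVAL	1UL

/* Elements come from kmalloc() and are at least word aligned, so bit 0 of
 * their address is free to carry a flag alongside the pointer value.
 */
static inline unsigned long cookie_encode(const void *elem, unsigned long open)
{
	return (unsigned long)elem | open;
}

static inline bool cookie_matches(unsigned long cookie, const void *elem)
{
	return (cookie & ~SKETCH_OPEN_INTERVAL) == (unsigned long)elem;
}

static inline bool cookie_is_open(unsigned long cookie)
{
	return cookie & SKETCH_OPEN_INTERVAL;
}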
*/ if (rbe_ge && !nft_rbtree_cmp(set, new, rbe_ge) && nft_rbtree_interval_start(rbe_ge) == nft_rbtree_interval_start(new)) { *elem_priv = &rbe_ge->priv; + + /* - Corner case: new start element of open interval (which + * comes as last element in the batch) overlaps the start of + * an existing interval with an end element: partial overlap. + */ + node = rb_first(&priv->root); + rbe = __nft_rbtree_next_active(node, genmask); + if (rbe && nft_rbtree_interval_end(rbe)) { + rbe = nft_rbtree_next_active(rbe, genmask); + if (rbe && + nft_rbtree_interval_start(rbe) && + !nft_rbtree_cmp(set, new, rbe)) { + if (last) + return -ENOTEMPTY; + + /* Maybe open interval? */ + open_interval = NFT_RBTREE_OPEN_INTERVAL; + } + } + nft_rbtree_set_start_cookie_open(priv, rbe_ge, open_interval); + return -EEXIST; } @@ -431,18 +507,37 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, */ if (rbe_le && !nft_rbtree_cmp(set, new, rbe_le) && nft_rbtree_interval_end(rbe_le) == nft_rbtree_interval_end(new)) { + /* - ignore null interval, otherwise NLM_F_CREATE bogusly + * reports EEXIST. + */ + if (nft_rbtree_interval_null(set, new)) + return -ECANCELED; + *elem_priv = &rbe_le->priv; + + /* - start and end element belong to the same interval. */ + if (!nft_rbtree_insert_same_interval(net, priv, rbe_le)) + return -ENOTEMPTY; + return -EEXIST; } /* - new start element with existing closest, less or equal key value * being a start element: partial overlap, reported as -ENOTEMPTY. * Anonymous sets allow for two consecutive start element since they - * are constant, skip them to avoid bogus overlap reports. + * are constant, but validate that this new start element does not + * sit in between an existing start and end elements: partial overlap, + * reported as -ENOTEMPTY. */ - if (!nft_set_is_anonymous(set) && rbe_le && - nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) - return -ENOTEMPTY; + if (rbe_le && + nft_rbtree_interval_start(rbe_le) && nft_rbtree_interval_start(new)) { + if (!nft_set_is_anonymous(set)) + return -ENOTEMPTY; + + rbe_prev = nft_rbtree_prev_active(rbe_le); + if (rbe_prev && nft_rbtree_interval_end(rbe_prev)) + return -ENOTEMPTY; + } /* - new end element with existing closest, less or equal key value * being a end element: partial overlap, reported as -ENOTEMPTY. @@ -458,6 +553,12 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, nft_rbtree_interval_end(rbe_ge) && nft_rbtree_interval_end(new)) return -ENOTEMPTY; + /* - start element overlaps an open interval but end element is new: + * partial overlap, reported as -ENOEMPTY. 
+ */ + if (!rbe_ge && priv->start_rbe_cookie && nft_rbtree_interval_end(new)) + return -ENOTEMPTY; + /* Accepted element: pick insertion point depending on key value */ parent = NULL; p = &priv->root.rb_node; @@ -481,14 +582,102 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, return 0; } +static int nft_array_intervals_alloc(struct nft_array *array, u32 max_intervals) +{ + struct nft_array_interval *intervals; + + intervals = kvcalloc(max_intervals, sizeof(struct nft_array_interval), + GFP_KERNEL_ACCOUNT); + if (!intervals) + return -ENOMEM; + + if (array->intervals) + kvfree(array->intervals); + + array->intervals = intervals; + array->max_intervals = max_intervals; + + return 0; +} + +static struct nft_array *nft_array_alloc(u32 max_intervals) +{ + struct nft_array *array; + + array = kzalloc(sizeof(*array), GFP_KERNEL_ACCOUNT); + if (!array) + return NULL; + + if (nft_array_intervals_alloc(array, max_intervals) < 0) { + kfree(array); + return NULL; + } + + return array; +} + +#define NFT_ARRAY_EXTRA_SIZE 10240 + +/* Similar to nft_rbtree_{u,k}size to hide details to userspace, but consider + * packed representation coming from userspace for anonymous sets too. + */ +static u32 nft_array_elems(const struct nft_set *set) +{ + u32 nelems = atomic_read(&set->nelems); + + /* Adjacent intervals are represented with a single start element in + * anonymous sets, use the current element counter as is. + */ + if (nft_set_is_anonymous(set)) + return nelems; + + /* Add extra room for never matching interval at the beginning and open + * interval at the end which only use a single element to represent it. + * The conversion to array will compact intervals, this allows reduce + * memory consumption. + */ + return (nelems / 2) + 2; +} + +static int nft_array_may_resize(const struct nft_set *set) +{ + u32 nelems = nft_array_elems(set), new_max_intervals; + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_array *array; + + if (!priv->array_next) { + array = nft_array_alloc(nelems + NFT_ARRAY_EXTRA_SIZE); + if (!array) + return -ENOMEM; + + priv->array_next = array; + } + + if (nelems < priv->array_next->max_intervals) + return 0; + + new_max_intervals = priv->array_next->max_intervals + NFT_ARRAY_EXTRA_SIZE; + if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0) + return -ENOMEM; + + return 0; +} + static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_elem_priv **elem_priv) { struct nft_rbtree_elem *rbe = nft_elem_priv_cast(elem->priv); + bool last = !!(elem->flags & NFT_SET_ELEM_INTERNAL_LAST); struct nft_rbtree *priv = nft_set_priv(set); + u64 tstamp = nft_net_tstamp(net); int err; + nft_rbtree_maybe_reset_start_cookie(priv, tstamp); + + if (nft_array_may_resize(set) < 0) + return -ENOMEM; + do { if (fatal_signal_pending(current)) return -EINTR; @@ -496,10 +685,12 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, cond_resched(); write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); - err = __nft_rbtree_insert(net, set, rbe, elem_priv); - write_seqcount_end(&priv->count); + err = __nft_rbtree_insert(net, set, rbe, elem_priv, tstamp, last); write_unlock_bh(&priv->lock); + + if (nft_rbtree_interval_end(rbe)) + priv->start_rbe_cookie = 0; + } while (err == -EAGAIN); return err; @@ -508,9 +699,7 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, static void nft_rbtree_erase(struct nft_rbtree *priv, 
struct nft_rbtree_elem *rbe) { write_lock_bh(&priv->lock); - write_seqcount_begin(&priv->count); rb_erase(&rbe->node, &priv->root); - write_seqcount_end(&priv->count); write_unlock_bh(&priv->lock); } @@ -533,6 +722,48 @@ static void nft_rbtree_activate(const struct net *net, nft_clear(net, &rbe->ext); } +static struct nft_rbtree_elem * +nft_rbtree_next_inactive(struct nft_rbtree_elem *rbe, u8 genmask) +{ + struct nft_rbtree_elem *next_rbe; + struct rb_node *node; + + node = rb_next(&rbe->node); + if (node) { + next_rbe = rb_entry(node, struct nft_rbtree_elem, node); + if (nft_rbtree_interval_start(next_rbe) && + !nft_set_elem_active(&next_rbe->ext, genmask)) + return next_rbe; + } + + return NULL; +} + +static bool nft_rbtree_deactivate_same_interval(const struct net *net, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe) +{ + u8 genmask = nft_genmask_next(net); + struct nft_rbtree_elem *next_rbe; + + if (!priv->start_rbe_cookie) + return true; + + next_rbe = nft_rbtree_next_inactive(rbe, genmask); + if (next_rbe) { + /* Closest start element differs from last element added. */ + if (nft_rbtree_interval_start(next_rbe) && + nft_rbtree_cmp_start_cookie(priv, next_rbe)) { + priv->start_rbe_cookie = 0; + return true; + } + } + + priv->start_rbe_cookie = 0; + + return false; +} + static void nft_rbtree_flush(const struct net *net, const struct nft_set *set, struct nft_elem_priv *elem_priv) @@ -547,12 +778,22 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem) { struct nft_rbtree_elem *rbe, *this = nft_elem_priv_cast(elem->priv); - const struct nft_rbtree *priv = nft_set_priv(set); + bool last = !!(elem->flags & NFT_SET_ELEM_INTERNAL_LAST); + struct nft_rbtree *priv = nft_set_priv(set); const struct rb_node *parent = priv->root.rb_node; u8 genmask = nft_genmask_next(net); u64 tstamp = nft_net_tstamp(net); int d; + nft_rbtree_maybe_reset_start_cookie(priv, tstamp); + + if (nft_rbtree_interval_start(this) || + nft_rbtree_interval_null(set, this)) + priv->start_rbe_cookie = 0; + + if (nft_array_may_resize(set) < 0) + return NULL; + while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); @@ -577,6 +818,13 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, parent = parent->rb_left; continue; } + + if (nft_rbtree_interval_start(rbe)) { + if (!last) + nft_rbtree_set_start_cookie(priv, rbe); + } else if (!nft_rbtree_deactivate_same_interval(net, priv, rbe)) + return NULL; + nft_rbtree_flush(net, set, &rbe->priv); return &rbe->priv; } @@ -615,6 +863,11 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, switch (iter->type) { case NFT_ITER_UPDATE: lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + + if (nft_array_may_resize(set) < 0) { + iter->err = -ENOMEM; + break; + } nft_rbtree_do_walk(ctx, set, iter); break; case NFT_ITER_READ: @@ -629,29 +882,13 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, } } -static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, - struct nft_rbtree *priv, - struct nft_rbtree_elem *rbe) -{ - nft_setelem_data_deactivate(net, set, &rbe->priv); - nft_rbtree_erase(priv, rbe); -} - -static void nft_rbtree_gc(struct nft_set *set) +static void nft_rbtree_gc_scan(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe, *rbe_end = NULL; struct net *net = read_pnet(&set->net); u64 tstamp = nft_net_tstamp(net); struct rb_node *node, *next; - struct nft_trans_gc *gc; - - set = 
nft_set_container_of(priv); - net = read_pnet(&set->net); - - gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); - if (!gc) - return; for (node = rb_first(&priv->root); node ; node = next) { next = rb_next(node); @@ -669,34 +906,46 @@ static void nft_rbtree_gc(struct nft_set *set) if (!__nft_set_elem_expired(&rbe->ext, tstamp)) continue; - gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); - if (!gc) - goto try_later; - /* end element needs to be removed first, it has * no timeout extension. */ + write_lock_bh(&priv->lock); if (rbe_end) { - nft_rbtree_gc_remove(net, set, priv, rbe_end); - nft_trans_gc_elem_add(gc, rbe_end); + nft_rbtree_gc_elem_move(net, set, priv, rbe_end); rbe_end = NULL; } - gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); - if (!gc) - goto try_later; - - nft_rbtree_gc_remove(net, set, priv, rbe); - nft_trans_gc_elem_add(gc, rbe); + nft_rbtree_gc_elem_move(net, set, priv, rbe); + write_unlock_bh(&priv->lock); } -try_later: + priv->last_gc = jiffies; +} + +static void nft_rbtree_gc_queue(struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe, *rbe_end; + struct nft_trans_gc *gc; - if (gc) { - gc = nft_trans_gc_catchall_sync(gc); - nft_trans_gc_queue_sync_done(gc); - priv->last_gc = jiffies; + if (list_empty(&priv->expired)) + return; + + gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); + if (!gc) + return; + + list_for_each_entry_safe(rbe, rbe_end, &priv->expired, list) { + list_del(&rbe->list); + nft_trans_gc_elem_add(gc, rbe); + + gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL); + if (!gc) + return; } + + gc = nft_trans_gc_catchall_sync(gc); + nft_trans_gc_queue_sync_done(gc); } static u64 nft_rbtree_privsize(const struct nlattr * const nla[], @@ -714,24 +963,45 @@ static int nft_rbtree_init(const struct nft_set *set, BUILD_BUG_ON(offsetof(struct nft_rbtree_elem, priv) != 0); rwlock_init(&priv->lock); - seqcount_rwlock_init(&priv->count, &priv->lock); priv->root = RB_ROOT; + INIT_LIST_HEAD(&priv->expired); + + priv->array = NULL; + priv->array_next = NULL; return 0; } +static void __nft_array_free(struct nft_array *array) +{ + kvfree(array->intervals); + kfree(array); +} + static void nft_rbtree_destroy(const struct nft_ctx *ctx, const struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); - struct nft_rbtree_elem *rbe; + struct nft_rbtree_elem *rbe, *next; + struct nft_array *array; struct rb_node *node; + list_for_each_entry_safe(rbe, next, &priv->expired, list) { + list_del(&rbe->list); + nf_tables_set_elem_destroy(ctx, set, &rbe->priv); + } + while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); nf_tables_set_elem_destroy(ctx, set, &rbe->priv); } + + array = rcu_dereference_protected(priv->array, true); + if (array) + __nft_array_free(array); + if (priv->array_next) + __nft_array_free(priv->array_next); } static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, @@ -752,12 +1022,105 @@ static bool nft_rbtree_estimate(const struct nft_set_desc *desc, u32 features, return true; } +static void nft_array_free_rcu(struct rcu_head *rcu_head) +{ + struct nft_array *array = container_of(rcu_head, struct nft_array, rcu_head); + + __nft_array_free(array); +} + static void nft_rbtree_commit(struct nft_set *set) { struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree_elem *rbe, *prev_rbe; + struct nft_array *old; + u32 num_intervals = 0; + struct rb_node *node; + /* No changes, skip, eg. elements updates only. 
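/* The garbage collection above is now split in two phases: a scan that only
 * unlinks expired elements onto a private list while holding the writer
 * lock, and a later pass that frees them once the rebuilt lookup blob is
 * visible to readers.  A generic sketch of that split, with a hypothetical
 * element type (gc_elem) standing in for the rbtree element:
 */
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct gc_elem {
	struct list_head list;
	bool expired;
};

/* Phase 1: under the writer lock, move expired entries off the live list. */
static void gc_scan(struct list_head *live, struct list_head *expired,
		    spinlock_t *lock)
{
	struct gc_elem *e, *next;

	spin_lock_bh(lock);
	list_for_each_entry_safe(e, next, live, list) {
		if (e->expired)
			list_move(&e->list, expired);
	}
	spin_unlock_bh(lock);
}

/* Phase 2: after the new lookup structure has been published, release them. */
static void gc_reap(struct list_head *expired)
{
	struct gc_elem *e, *next;

	list_for_each_entry_safe(e, next, expired, list) {
		list_del(&e->list);
		kfree(e);
	}
}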
*/ + if (!priv->array_next) + return; + + /* GC can be performed if the binary search blob is going + * to be rebuilt. It has to be done in two phases: first + * scan tree and move all expired elements to the expired + * list. + * + * Then, after blob has been re-built and published to other + * CPUs, queue collected entries for freeing. + */ if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set))) - nft_rbtree_gc(set); + nft_rbtree_gc_scan(set); + + /* Reverse walk to create an array from smaller to largest interval. */ + node = rb_last(&priv->root); + if (node) + prev_rbe = rb_entry(node, struct nft_rbtree_elem, node); + else + prev_rbe = NULL; + + while (prev_rbe) { + rbe = prev_rbe; + + if (nft_rbtree_interval_start(rbe)) + priv->array_next->intervals[num_intervals].from = &rbe->ext; + else if (nft_rbtree_interval_end(rbe)) + priv->array_next->intervals[num_intervals++].to = &rbe->ext; + + if (num_intervals >= priv->array_next->max_intervals) { + pr_warn_once("malformed interval set from userspace?"); + goto err_out; + } + + node = rb_prev(node); + if (!node) + break; + + prev_rbe = rb_entry(node, struct nft_rbtree_elem, node); + + /* For anonymous sets, when adjacent ranges are found, + * the end element is not added to the set to pack the set + * representation. Use next start element to complete this + * interval. + */ + if (nft_rbtree_interval_start(rbe) && + nft_rbtree_interval_start(prev_rbe) && + priv->array_next->intervals[num_intervals].from) + priv->array_next->intervals[num_intervals++].to = &prev_rbe->ext; + + if (num_intervals >= priv->array_next->max_intervals) { + pr_warn_once("malformed interval set from userspace?"); + goto err_out; + } + } + + if (priv->array_next->intervals[num_intervals].from) + num_intervals++; +err_out: + priv->array_next->num_intervals = num_intervals; + old = rcu_replace_pointer(priv->array, priv->array_next, + lockdep_is_held(&nft_pernet(read_pnet(&set->net))->commit_mutex)); + priv->array_next = NULL; + if (old) + call_rcu(&old->rcu_head, nft_array_free_rcu); + + /* New blob is public, queue collected entries for freeing. + * call_rcu ensures elements stay around until readers are done. 
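/* Publishing the rebuilt interval array follows the usual RCU
 * replace-then-reclaim pattern: swap the pointer while holding the writer
 * lock, then free the previous blob after a grace period.  Condensed sketch
 * with illustrative names (struct blob, cur_blob, blob_publish):
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct blob {
	unsigned int nelems;
	struct rcu_head rcu;
};

static struct blob __rcu *cur_blob;

static void blob_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct blob, rcu));
}

/* Caller holds the single-writer mutex protecting updates of cur_blob.
 * Readers use rcu_read_lock()/rcu_dereference(cur_blob) and never see a
 * half-built array.
 */
static void blob_publish(struct blob *next)
{
	struct blob *old;

	old = rcu_replace_pointer(cur_blob, next, true);
	if (old)
		call_rcu(&old->rcu, blob_free_rcu);
}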
+ */ + nft_rbtree_gc_queue(set); +} + +static void nft_rbtree_abort(const struct nft_set *set) +{ + struct nft_rbtree *priv = nft_set_priv(set); + struct nft_array *array_next; + + if (!priv->array_next) + return; + + array_next = priv->array_next; + priv->array_next = NULL; + __nft_array_free(array_next); } static void nft_rbtree_gc_init(const struct nft_set *set) @@ -821,6 +1184,7 @@ const struct nft_set_type nft_set_rbtree_type = { .flush = nft_rbtree_flush, .activate = nft_rbtree_activate, .commit = nft_rbtree_commit, + .abort = nft_rbtree_abort, .gc_init = nft_rbtree_gc_init, .lookup = nft_rbtree_lookup, .walk = nft_rbtree_walk, diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 4d3e5a31b412..b71ef18b0e8c 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -7,6 +7,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_synproxy.h> #include <net/netfilter/nf_synproxy.h> +#include <linux/netfilter_ipv4.h> #include <linux/netfilter/nf_tables.h> #include <linux/netfilter/nf_synproxy.h> diff --git a/net/netfilter/xt_tcpmss.c b/net/netfilter/xt_tcpmss.c index 37704ab01799..0d32d4841cb3 100644 --- a/net/netfilter/xt_tcpmss.c +++ b/net/netfilter/xt_tcpmss.c @@ -61,7 +61,7 @@ tcpmss_mt(const struct sk_buff *skb, struct xt_action_param *par) return (mssval >= info->mss_min && mssval <= info->mss_max) ^ info->invert; } - if (op[i] < 2) + if (op[i] < 2 || i == optlen - 1) i++; else i += op[i+1] ? : 1; diff --git a/net/netfilter/xt_time.c b/net/netfilter/xt_time.c index 6aa12d0f54e2..00319d2a54da 100644 --- a/net/netfilter/xt_time.c +++ b/net/netfilter/xt_time.c @@ -14,6 +14,7 @@ #include <linux/ktime.h> #include <linux/module.h> +#include <linux/rtc.h> #include <linux/skbuff.h> #include <linux/types.h> #include <linux/netfilter/x_tables.h> @@ -64,11 +65,6 @@ static const u_int16_t days_since_epoch[] = { 3287, 2922, 2557, 2191, 1826, 1461, 1096, 730, 365, 0, }; -static inline bool is_leap(unsigned int y) -{ - return y % 4 == 0 && (y % 100 != 0 || y % 400 == 0); -} - /* * Each network packet has a (nano)seconds-since-the-epoch (SSTE) timestamp. * Since we match against days and daytime, the SSTE value needs to be @@ -138,7 +134,7 @@ static void localtime_3(struct xtm *r, time64_t time) * (A different approach to use would be to subtract a monthlength * from w repeatedly while counting.) 
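/* The xt_tcpmss hunk above guards the option walk against reading a length
 * byte that lies one past the end of the header: when the cursor already
 * sits on the last byte, only single-byte advancement is allowed.  A
 * standalone walker with the same bound check; tcp_opt_find is a made-up
 * helper, not a kernel function.
 */
#include <linux/types.h>

static bool tcp_opt_find(const u8 *op, unsigned int optlen, u8 wanted)
{
	unsigned int i = 0;

	while (i < optlen) {
		if (op[i] == wanted)
			return true;
		if (op[i] < 2 || i == optlen - 1)
			i++;			/* EOL, NOP or trailing byte */
		else
			i += op[i + 1] ? : 1;	/* skip the whole TLV option */
	}
	return false;
}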
*/ - if (is_leap(year)) { + if (is_leap_year(year)) { /* use days_since_leapyear[] in a leap year */ for (i = ARRAY_SIZE(days_since_leapyear) - 1; i > 0 && days_since_leapyear[i] > w; --i) diff --git a/net/nfc/hci/llc_shdlc.c b/net/nfc/hci/llc_shdlc.c index 4fc37894860c..08c8aa1530d8 100644 --- a/net/nfc/hci/llc_shdlc.c +++ b/net/nfc/hci/llc_shdlc.c @@ -762,6 +762,14 @@ static void llc_shdlc_deinit(struct nfc_llc *llc) { struct llc_shdlc *shdlc = nfc_llc_get_data(llc); + timer_shutdown_sync(&shdlc->connect_timer); + timer_shutdown_sync(&shdlc->t1_timer); + timer_shutdown_sync(&shdlc->t2_timer); + shdlc->t1_active = false; + shdlc->t2_active = false; + + cancel_work_sync(&shdlc->sm_work); + skb_queue_purge(&shdlc->rcv_q); skb_queue_purge(&shdlc->send_q); skb_queue_purge(&shdlc->ack_pending_q); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 494d628d10a5..a1005359085a 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -572,8 +572,9 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) __be16 proto = skb->protocol; if (unlikely(eth_type_vlan(proto))) - proto = __vlan_get_protocol_offset(skb, proto, - skb_mac_offset(skb), NULL); + proto = vlan_get_protocol_offset_inline(skb, proto, + skb_mac_offset(skb), + NULL); return proto; } diff --git a/net/rds/cong.c b/net/rds/cong.c index 8b689ebbd5b5..ac1f120c10f9 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -242,7 +242,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map) * therefore trigger warnings. * Defer the xmit to rds_send_worker() instead. */ - queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0); } rcu_read_unlock(); } diff --git a/net/rds/connection.c b/net/rds/connection.c index 68bc88cce84e..185f73b01694 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -169,6 +169,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); struct rds_transport *loop_trans; + struct rds_conn_path *free_cp = NULL; unsigned long flags; int ret, i; int npaths = (trans->t_mp_capable ? 
RDS_MPATH_WORKERS : 1); @@ -269,6 +270,11 @@ static struct rds_connection *__rds_conn_create(struct net *net, __rds_conn_path_init(conn, &conn->c_path[i], is_outgoing); conn->c_path[i].cp_index = i; + conn->c_path[i].cp_wq = + alloc_ordered_workqueue("krds_cp_wq#%lu/%d", 0, + rds_conn_count, i); + if (!conn->c_path[i].cp_wq) + conn->c_path[i].cp_wq = rds_wq; } rcu_read_lock(); if (rds_destroy_pending(conn)) @@ -277,7 +283,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, ret = trans->conn_alloc(conn, GFP_ATOMIC); if (ret) { rcu_read_unlock(); - kfree(conn->c_path); + free_cp = conn->c_path; kmem_cache_free(rds_conn_slab, conn); conn = ERR_PTR(ret); goto out; @@ -300,7 +306,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, /* Creating passive conn */ if (parent->c_passive) { trans->conn_free(conn->c_path[0].cp_transport_data); - kfree(conn->c_path); + free_cp = conn->c_path; kmem_cache_free(rds_conn_slab, conn); conn = parent->c_passive; } else { @@ -327,7 +333,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, if (cp->cp_transport_data) trans->conn_free(cp->cp_transport_data); } - kfree(conn->c_path); + free_cp = conn->c_path; kmem_cache_free(rds_conn_slab, conn); conn = found; } else { @@ -342,6 +348,13 @@ static struct rds_connection *__rds_conn_create(struct net *net, rcu_read_unlock(); out: + if (free_cp) { + for (i = 0; i < npaths; i++) + if (free_cp[i].cp_wq != rds_wq) + destroy_workqueue(free_cp[i].cp_wq); + kfree(free_cp); + } + return conn; } @@ -382,6 +395,8 @@ void rds_conn_shutdown(struct rds_conn_path *cp) if (!rds_conn_path_transition(cp, RDS_CONN_UP, RDS_CONN_DISCONNECTING) && !rds_conn_path_transition(cp, RDS_CONN_ERROR, + RDS_CONN_DISCONNECTING) && + !rds_conn_path_transition(cp, RDS_CONN_RESETTING, RDS_CONN_DISCONNECTING)) { rds_conn_path_error(cp, "shutdown called in state %d\n", @@ -427,13 +442,21 @@ void rds_conn_shutdown(struct rds_conn_path *cp) * to the conn hash, so we never trigger a reconnect on this * conn - the reconnect is always triggered by the active peer. */ cancel_delayed_work_sync(&cp->cp_conn_w); + + clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); rcu_read_lock(); if (!hlist_unhashed(&conn->c_hash_node)) { rcu_read_unlock(); + if (conn->c_trans->t_mp_capable && + cp->cp_index == 0) + rds_send_ping(conn, 0); rds_queue_reconnect(cp); } else { rcu_read_unlock(); } + + if (conn->c_trans->conn_slots_available) + conn->c_trans->conn_slots_available(conn, false); } /* destroy a single rds_conn_path. 
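/* Each connection path above gains a private ordered workqueue, falling
 * back to the shared queue when allocation fails, and only privately owned
 * queues are destroyed on teardown.  Sketch of that ownership rule with
 * placeholder names (struct cpath, shared_wq):
 */
#include <linux/workqueue.h>

extern struct workqueue_struct *shared_wq;	/* the common fallback queue */

struct cpath {
	int index;
	struct workqueue_struct *wq;
};

static void cpath_init_wq(struct cpath *cp, unsigned long conn_id)
{
	cp->wq = alloc_ordered_workqueue("sketch_cp_wq#%lu/%d", 0,
					 conn_id, cp->index);
	if (!cp->wq)
		cp->wq = shared_wq;	/* degrade to the shared queue */
}

static void cpath_free_wq(struct cpath *cp)
{
	if (cp->wq != shared_wq)
		destroy_workqueue(cp->wq);
	cp->wq = NULL;
}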
rds_conn_destroy() iterates over @@ -469,6 +492,11 @@ static void rds_conn_path_destroy(struct rds_conn_path *cp) WARN_ON(delayed_work_pending(&cp->cp_conn_w)); WARN_ON(work_pending(&cp->cp_down_w)); + if (cp->cp_wq != rds_wq) { + destroy_workqueue(cp->cp_wq); + cp->cp_wq = NULL; + } + cp->cp_conn->c_trans->conn_free(cp->cp_transport_data); } @@ -884,7 +912,7 @@ void rds_conn_path_drop(struct rds_conn_path *cp, bool destroy) rcu_read_unlock(); return; } - queue_work(rds_wq, &cp->cp_down_w); + queue_work(cp->cp_wq, &cp->cp_down_w); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_drop); @@ -909,7 +937,7 @@ void rds_conn_path_connect_if_down(struct rds_conn_path *cp) } if (rds_conn_path_state(cp) == RDS_CONN_DOWN && !test_and_set_bit(RDS_RECONNECT_PENDING, &cp->cp_flags)) - queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_conn_w, 0); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_conn_path_connect_if_down); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 4248dfa816eb..357128d34a54 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -457,7 +457,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp) (must_wake || (can_wait && rds_ib_ring_low(&ic->i_recv_ring)) || rds_ib_ring_empty(&ic->i_recv_ring))) { - queue_delayed_work(rds_wq, &conn->c_recv_w, 1); + queue_delayed_work(conn->c_path->cp_wq, &conn->c_recv_w, 1); } if (can_wait) cond_resched(); diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 4190b90ff3b1..fcd04c29f543 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -297,7 +297,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc) if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || test_bit(0, &conn->c_map_queued)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(conn->c_path->cp_wq, &conn->c_send_w, 0); /* We expect errors as the qp is drained during shutdown */ if (wc->status != IB_WC_SUCCESS && rds_conn_up(conn)) { @@ -419,7 +419,7 @@ void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits) atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits); if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + queue_delayed_work(conn->c_path->cp_wq, &conn->c_send_w, 0); WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384); @@ -577,16 +577,42 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ if (rm->rdma.op_active) { - struct rds_ext_header_rdma ext_hdr; + struct rds_ext_header_rdma ext_hdr = {}; + struct rds_ext_header_rdma_bytes + rdma_bytes_ext_hdr = {}; ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); - rds_message_add_extension(&rm->m_inc.i_hdr, - RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); + if (rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_RDMA, + &ext_hdr)) { + /* prepare the rdma bytes ext header */ + rdma_bytes_ext_hdr.h_rflags = + rm->rdma.op_write ? 
+ RDS_FLAG_RDMA_WR_BYTES : + RDS_FLAG_RDMA_RD_BYTES; + rdma_bytes_ext_hdr.h_rdma_bytes = + cpu_to_be32(rm->rdma.op_bytes); + } else { + rdsdebug("RDS_EXTHDR_RDMA dropped"); + } + + if (rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_RDMA_BYTES, + &rdma_bytes_ext_hdr)) { + /* rdma bytes ext header was added successfully, + * notify the remote side via flag in header + */ + rm->m_inc.i_hdr.h_flags |= + RDS_FLAG_EXTHDR_EXTENSION; + } else { + rdsdebug("RDS_EXTHDR_RDMA_BYTES dropped"); + } } - if (rm->m_rdma_cookie) { - rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, - rds_rdma_cookie_key(rm->m_rdma_cookie), - rds_rdma_cookie_offset(rm->m_rdma_cookie)); + if (rm->m_rdma_cookie && + !rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr, + rds_rdma_cookie_key(rm->m_rdma_cookie), + rds_rdma_cookie_offset(rm->m_rdma_cookie))) { + rdsdebug("RDS_EXTHDR_RDMA_DEST dropped\n"); } /* Note - rds_ib_piggyb_ack clears the ACK_REQUIRED bit, so diff --git a/net/rds/message.c b/net/rds/message.c index 199a899a43e9..54fd000806ea 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -44,8 +44,10 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), +[RDS_EXTHDR_RDMA_BYTES] = sizeof(struct rds_ext_header_rdma_bytes), [RDS_EXTHDR_NPATHS] = sizeof(__be16), [RDS_EXTHDR_GEN_NUM] = sizeof(__be32), +[RDS_EXTHDR_SPORT_IDX] = 1, }; void rds_message_addref(struct rds_message *rm) @@ -191,31 +193,69 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport, hdr->h_sport = sport; hdr->h_dport = dport; hdr->h_sequence = cpu_to_be64(seq); - hdr->h_exthdr[0] = RDS_EXTHDR_NONE; + /* see rds_find_next_ext_space for reason why we memset the + * ext header + */ + memset(hdr->h_exthdr, RDS_EXTHDR_NONE, RDS_HEADER_EXT_SPACE); } EXPORT_SYMBOL_GPL(rds_message_populate_header); -int rds_message_add_extension(struct rds_header *hdr, unsigned int type, - const void *data, unsigned int len) +/* + * Find the next place we can add an RDS header extension with + * specific length. Extension headers are pushed one after the + * other. In the following, the number after the colon is the number + * of bytes: + * + * [ type1:1 dta1:len1 [ type2:1 dta2:len2 ] ... ] RDS_EXTHDR_NONE + * + * If the extension headers fill the complete extension header space + * (16 bytes), the trailing RDS_EXTHDR_NONE is omitted. + */ +static int rds_find_next_ext_space(struct rds_header *hdr, unsigned int len, + u8 **ext_start) { - unsigned int ext_len = sizeof(u8) + len; - unsigned char *dst; + unsigned int ext_len; + unsigned int type; + int ind = 0; + + while ((ind + 1 + len) <= RDS_HEADER_EXT_SPACE) { + if (hdr->h_exthdr[ind] == RDS_EXTHDR_NONE) { + *ext_start = hdr->h_exthdr + ind; + return 0; + } - /* For now, refuse to add more than one extension header */ - if (hdr->h_exthdr[0] != RDS_EXTHDR_NONE) - return 0; + type = hdr->h_exthdr[ind]; + + ext_len = (type < __RDS_EXTHDR_MAX) ? 
rds_exthdr_size[type] : 0; + WARN_ONCE(!ext_len, "Unknown ext hdr type %d\n", type); + if (!ext_len) + return -EINVAL; + + /* ind points to a valid ext hdr with known length */ + ind += 1 + ext_len; + } + + /* no room for extension */ + return -ENOSPC; +} + +/* The ext hdr space is prefilled with zero from the kzalloc() */ +int rds_message_add_extension(struct rds_header *hdr, + unsigned int type, const void *data) +{ + unsigned char *dst; + unsigned int len; - if (type >= __RDS_EXTHDR_MAX || len != rds_exthdr_size[type]) + len = (type < __RDS_EXTHDR_MAX) ? rds_exthdr_size[type] : 0; + if (!len) return 0; - if (ext_len >= RDS_HEADER_EXT_SPACE) + if (rds_find_next_ext_space(hdr, len, &dst)) return 0; - dst = hdr->h_exthdr; *dst++ = type; memcpy(dst, data, len); - dst[len] = RDS_EXTHDR_NONE; return 1; } EXPORT_SYMBOL_GPL(rds_message_add_extension); @@ -272,7 +312,7 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o ext_hdr.h_rdma_rkey = cpu_to_be32(r_key); ext_hdr.h_rdma_offset = cpu_to_be32(offset); - return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr, sizeof(ext_hdr)); + return rds_message_add_extension(hdr, RDS_EXTHDR_RDMA_DEST, &ext_hdr); } EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); diff --git a/net/rds/rds.h b/net/rds/rds.h index a029e5fcdea7..6e0790e4b570 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -118,6 +118,7 @@ struct rds_conn_path { void *cp_transport_data; + struct workqueue_struct *cp_wq; atomic_t cp_state; unsigned long cp_send_gen; unsigned long cp_flags; @@ -146,6 +147,7 @@ struct rds_connection { c_ping_triggered:1, c_pad_to_32:29; int c_npaths; + bool c_with_sport_idx; struct rds_connection *c_passive; struct rds_transport *c_trans; @@ -168,6 +170,8 @@ struct rds_connection { u32 c_my_gen_num; u32 c_peer_gen_num; + + u64 c_cp0_mprds_catchup_tx_seq; }; static inline @@ -182,10 +186,11 @@ void rds_conn_net_set(struct rds_connection *conn, struct net *net) write_pnet(&conn->c_net, net); } -#define RDS_FLAG_CONG_BITMAP 0x01 -#define RDS_FLAG_ACK_REQUIRED 0x02 -#define RDS_FLAG_RETRANSMITTED 0x04 -#define RDS_MAX_ADV_CREDIT 255 +#define RDS_FLAG_CONG_BITMAP 0x01 +#define RDS_FLAG_ACK_REQUIRED 0x02 +#define RDS_FLAG_RETRANSMITTED 0x04 +#define RDS_FLAG_EXTHDR_EXTENSION 0x20 +#define RDS_MAX_ADV_CREDIT 255 /* RDS_FLAG_PROBE_PORT is the reserved sport used for sending a ping * probe to exchange control information before establishing a connection. @@ -257,13 +262,29 @@ struct rds_ext_header_rdma_dest { __be32 h_rdma_offset; }; +/* + * This extension header tells the peer about delivered RDMA byte count. + */ +#define RDS_EXTHDR_RDMA_BYTES 4 + +struct rds_ext_header_rdma_bytes { + __be32 h_rdma_bytes; /* byte count */ + u8 h_rflags; /* direction of RDMA, write or read */ + u8 h_pad[3]; +}; + +#define RDS_FLAG_RDMA_WR_BYTES 0x01 +#define RDS_FLAG_RDMA_RD_BYTES 0x02 + /* Extension header announcing number of paths. * Implicit length = 2 bytes. */ #define RDS_EXTHDR_NPATHS 5 #define RDS_EXTHDR_GEN_NUM 6 +#define RDS_EXTHDR_SPORT_IDX 8 #define __RDS_EXTHDR_MAX 16 /* for now */ + #define RDS_RX_MAX_TRACES (RDS_MSG_RX_DGRAM_TRACE_MAX + 1) #define RDS_MSG_RX_HDR 0 #define RDS_MSG_RX_START 1 @@ -505,33 +526,6 @@ struct rds_notifier { */ #define RDS_TRANS_LOOP 3 -/** - * struct rds_transport - transport specific behavioural hooks - * - * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send - * part of a message. 
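/* rds_find_next_ext_space() above scans the fixed 16-byte extension area as
 * a packed sequence of [type:1][payload:size(type)] records terminated by
 * RDS_EXTHDR_NONE (or by simply running out of room).  Simplified scanner
 * over the same layout; ext_size[] is a made-up size table standing in for
 * rds_exthdr_size[].
 */
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/types.h>

#define EXT_SPACE	16
#define EXT_NONE	0

static const u8 ext_size[] = { 0, 2, 4, 8 };	/* payload bytes per type */

/* Return the offset where a record with @len payload bytes fits, or an error. */
static int ext_find_space(const u8 *area, unsigned int len)
{
	unsigned int off = 0;

	while (off + 1 + len <= EXT_SPACE) {
		u8 type = area[off];

		if (type == EXT_NONE)
			return off;
		if (type >= ARRAY_SIZE(ext_size) || !ext_size[type])
			return -EINVAL;		/* unknown record, cannot skip */
		off += 1 + ext_size[type];	/* hop over type byte + payload */
	}
	return -ENOSPC;
}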
The caller serializes on the send_sem so this - * doesn't need to be reentrant for a given conn. The header must be - * sent before the data payload. .xmit must be prepared to send a - * message with no data payload. .xmit should return the number of - * bytes that were sent down the connection, including header bytes. - * Returning 0 tells the caller that it doesn't need to perform any - * additional work now. This is usually the case when the transport has - * filled the sending queue for its connection and will handle - * triggering the rds thread to continue the send when space becomes - * available. Returning -EAGAIN tells the caller to retry the send - * immediately. Returning -ENOMEM tells the caller to retry the send at - * some point in the future. - * - * @conn_shutdown: conn_shutdown stops traffic on the given connection. Once - * it returns the connection can not call rds_recv_incoming(). - * This will only be called once after conn_connect returns - * non-zero success and will The caller serializes this with - * the send and connecting paths (xmit_* and conn_*). The - * transport is responsible for other serialization, including - * rds_recv_incoming(). This is called in process context but - * should try hard not to block. - */ - struct rds_transport { char t_name[TRANSNAMSIZ]; struct list_head t_item; @@ -544,10 +538,49 @@ struct rds_transport { __u32 scope_id); int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp); void (*conn_free)(void *data); + + /* + * conn_slots_available is invoked when a previously unavailable + * connection slot becomes available again. rds_tcp_accept_one_path may + * return -ENOBUFS if it cannot find an available slot, and then stashes + * the new socket in "rds_tcp_accepted_sock". This function re-issues + * `rds_tcp_accept_one_path`, which picks up the stashed socket and + * continuing where it left with "-ENOBUFS" last time. This ensures + * messages received on the new socket are not discarded when no + * connection path was available at the time. + */ + void (*conn_slots_available)(struct rds_connection *conn, bool fan_out); int (*conn_path_connect)(struct rds_conn_path *cp); + + /* + * conn_shutdown stops traffic on the given connection. Once + * it returns the connection can not call rds_recv_incoming(). + * This will only be called once after conn_connect returns + * non-zero success and will The caller serializes this with + * the send and connecting paths (xmit_* and conn_*). The + * transport is responsible for other serialization, including + * rds_recv_incoming(). This is called in process context but + * should try hard not to block. + */ void (*conn_path_shutdown)(struct rds_conn_path *conn); void (*xmit_path_prepare)(struct rds_conn_path *cp); void (*xmit_path_complete)(struct rds_conn_path *cp); + + /* + * .xmit is called by rds_send_xmit() to tell the transport to send + * part of a message. The caller serializes on the send_sem so this + * doesn't need to be reentrant for a given conn. The header must be + * sent before the data payload. .xmit must be prepared to send a + * message with no data payload. .xmit should return the number of + * bytes that were sent down the connection, including header bytes. + * Returning 0 tells the caller that it doesn't need to perform any + * additional work now. This is usually the case when the transport has + * filled the sending queue for its connection and will handle + * triggering the rds thread to continue the send when space becomes + * available. 
Returning -EAGAIN tells the caller to retry the send + * immediately. Returning -ENOMEM tells the caller to retry the send at + * some point in the future. + */ int (*xmit)(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); @@ -682,42 +715,43 @@ static inline int rds_sk_rcvbuf(struct rds_sock *rs) } struct rds_statistics { - uint64_t s_conn_reset; - uint64_t s_recv_drop_bad_checksum; - uint64_t s_recv_drop_old_seq; - uint64_t s_recv_drop_no_sock; - uint64_t s_recv_drop_dead_sock; - uint64_t s_recv_deliver_raced; - uint64_t s_recv_delivered; - uint64_t s_recv_queued; - uint64_t s_recv_immediate_retry; - uint64_t s_recv_delayed_retry; - uint64_t s_recv_ack_required; - uint64_t s_recv_rdma_bytes; - uint64_t s_recv_ping; - uint64_t s_send_queue_empty; - uint64_t s_send_queue_full; - uint64_t s_send_lock_contention; - uint64_t s_send_lock_queue_raced; - uint64_t s_send_immediate_retry; - uint64_t s_send_delayed_retry; - uint64_t s_send_drop_acked; - uint64_t s_send_ack_required; - uint64_t s_send_queued; - uint64_t s_send_rdma; - uint64_t s_send_rdma_bytes; - uint64_t s_send_pong; - uint64_t s_page_remainder_hit; - uint64_t s_page_remainder_miss; - uint64_t s_copy_to_user; - uint64_t s_copy_from_user; - uint64_t s_cong_update_queued; - uint64_t s_cong_update_received; - uint64_t s_cong_send_error; - uint64_t s_cong_send_blocked; - uint64_t s_recv_bytes_added_to_socket; - uint64_t s_recv_bytes_removed_from_socket; - uint64_t s_send_stuck_rm; + u64 s_conn_reset; + u64 s_recv_drop_bad_checksum; + u64 s_recv_drop_old_seq; + u64 s_recv_drop_no_sock; + u64 s_recv_drop_dead_sock; + u64 s_recv_deliver_raced; + u64 s_recv_delivered; + u64 s_recv_queued; + u64 s_recv_immediate_retry; + u64 s_recv_delayed_retry; + u64 s_recv_ack_required; + u64 s_recv_rdma_bytes; + u64 s_recv_ping; + u64 s_send_queue_empty; + u64 s_send_queue_full; + u64 s_send_lock_contention; + u64 s_send_lock_queue_raced; + u64 s_send_immediate_retry; + u64 s_send_delayed_retry; + u64 s_send_drop_acked; + u64 s_send_ack_required; + u64 s_send_queued; + u64 s_send_rdma; + u64 s_send_rdma_bytes; + u64 s_send_pong; + u64 s_page_remainder_hit; + u64 s_page_remainder_miss; + u64 s_copy_to_user; + u64 s_copy_from_user; + u64 s_cong_update_queued; + u64 s_cong_update_received; + u64 s_cong_send_error; + u64 s_cong_send_blocked; + u64 s_recv_bytes_added_to_socket; + u64 s_recv_bytes_removed_from_socket; + u64 s_send_stuck_rm; + u64 s_mprds_catchup_tx0_retries; }; /* af_rds.c */ @@ -858,7 +892,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq); int rds_message_add_extension(struct rds_header *hdr, - unsigned int type, const void *data, unsigned int len); + unsigned int type, const void *data); int rds_message_next_extension(struct rds_header *hdr, unsigned int *pos, void *buf, unsigned int *buflen); int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); diff --git a/net/rds/recv.c b/net/rds/recv.c index 66205d6924bf..4b3f9e4a8bfd 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -204,8 +204,14 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, struct rds_ext_header_version version; __be16 rds_npaths; __be32 rds_gen_num; + u8 dummy; } buffer; + bool new_with_sport_idx = false; u32 new_peer_gen_num = 0; + int new_npaths; + bool fan_out; + + 
new_npaths = conn->c_npaths; while (1) { len = sizeof(buffer); @@ -215,21 +221,48 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, /* Process extension header here */ switch (type) { case RDS_EXTHDR_NPATHS: - conn->c_npaths = min_t(int, RDS_MPATH_WORKERS, - be16_to_cpu(buffer.rds_npaths)); + new_npaths = min_t(int, RDS_MPATH_WORKERS, + be16_to_cpu(buffer.rds_npaths)); break; case RDS_EXTHDR_GEN_NUM: new_peer_gen_num = be32_to_cpu(buffer.rds_gen_num); break; + case RDS_EXTHDR_SPORT_IDX: + new_with_sport_idx = true; + break; default: pr_warn_ratelimited("ignoring unknown exthdr type " "0x%x\n", type); } } + + conn->c_with_sport_idx = new_with_sport_idx; + + if (new_npaths > 1 && new_npaths != conn->c_npaths) { + /* We're about to fan-out. + * Make sure that messages from cp_index#0 + * are sent prior to handling other lanes. + */ + struct rds_conn_path *cp0 = conn->c_path; + unsigned long flags; + + spin_lock_irqsave(&cp0->cp_lock, flags); + conn->c_cp0_mprds_catchup_tx_seq = cp0->cp_next_tx_seq; + spin_unlock_irqrestore(&cp0->cp_lock, flags); + fan_out = true; + } else { + fan_out = false; + } + /* if RDS_EXTHDR_NPATHS was not found, default to a single-path */ - conn->c_npaths = max_t(int, conn->c_npaths, 1); + conn->c_npaths = max_t(int, new_npaths, 1); + conn->c_ping_triggered = 0; rds_conn_peer_gen_update(conn, new_peer_gen_num); + + if (conn->c_npaths > 1 && + conn->c_trans->conn_slots_available) + conn->c_trans->conn_slots_available(conn, fan_out); } /* rds_start_mprds() will synchronously start multiple paths when appropriate. diff --git a/net/rds/send.c b/net/rds/send.c index 0b3d0ef2f008..6e96f108473e 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -120,6 +120,57 @@ static void release_in_xmit(struct rds_conn_path *cp) } /* + * Helper function for multipath fanout to ensure lane 0 transmits queued + * messages before other lanes to prevent out-of-order delivery. + * + * Returns true if lane 0 still has messages or false otherwise + */ +static bool rds_mprds_cp0_catchup(struct rds_connection *conn) +{ + struct rds_conn_path *cp0 = conn->c_path; + struct rds_message *rm0; + unsigned long flags; + bool ret = false; + + spin_lock_irqsave(&cp0->cp_lock, flags); + + /* the oldest / first message in the retransmit queue + * has to be at or beyond c_cp0_mprds_catchup_tx_seq + */ + if (!list_empty(&cp0->cp_retrans)) { + rm0 = list_entry(cp0->cp_retrans.next, struct rds_message, + m_conn_item); + if (be64_to_cpu(rm0->m_inc.i_hdr.h_sequence) < + conn->c_cp0_mprds_catchup_tx_seq) { + /* the retransmit queue of cp_index#0 has not + * quite caught up yet + */ + ret = true; + goto unlock; + } + } + + /* the oldest / first message of the send queue + * has to be at or beyond c_cp0_mprds_catchup_tx_seq + */ + rm0 = cp0->cp_xmit_rm; + if (!rm0 && !list_empty(&cp0->cp_send_queue)) + rm0 = list_entry(cp0->cp_send_queue.next, struct rds_message, + m_conn_item); + if (rm0 && be64_to_cpu(rm0->m_inc.i_hdr.h_sequence) < + conn->c_cp0_mprds_catchup_tx_seq) { + /* the send queue of cp_index#0 has not quite + * caught up yet + */ + ret = true; + } + +unlock: + spin_unlock_irqrestore(&cp0->cp_lock, flags); + return ret; +} + +/* * We're making the conscious trade-off here to only send one message * down the connection at a time. 
* Pro: @@ -248,6 +299,14 @@ restart: if (batch_count >= send_batch_count) goto over_batch; + /* make sure cp_index#0 caught up during fan-out in + * order to avoid lane races + */ + if (cp->cp_index > 0 && rds_mprds_cp0_catchup(conn)) { + rds_stats_inc(s_mprds_catchup_tx0_retries); + goto over_batch; + } + spin_lock_irqsave(&cp->cp_lock, flags); if (!list_empty(&cp->cp_send_queue)) { @@ -458,7 +517,8 @@ over_batch: if (rds_destroy_pending(cp->cp_conn)) ret = -ENETUNREACH; else - queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + queue_delayed_work(cp->cp_wq, + &cp->cp_send_w, 1); rcu_read_unlock(); } else if (raced) { rds_stats_inc(s_send_lock_queue_raced); @@ -1041,39 +1101,6 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, return ret; } -static int rds_send_mprds_hash(struct rds_sock *rs, - struct rds_connection *conn, int nonblock) -{ - int hash; - - if (conn->c_npaths == 0) - hash = RDS_MPATH_HASH(rs, RDS_MPATH_WORKERS); - else - hash = RDS_MPATH_HASH(rs, conn->c_npaths); - if (conn->c_npaths == 0 && hash != 0) { - rds_send_ping(conn, 0); - - /* The underlying connection is not up yet. Need to wait - * until it is up to be sure that the non-zero c_path can be - * used. But if we are interrupted, we have to use the zero - * c_path in case the connection ends up being non-MP capable. - */ - if (conn->c_npaths == 0) { - /* Cannot wait for the connection be made, so just use - * the base c_path. - */ - if (nonblock) - return 0; - if (wait_event_interruptible(conn->c_hs_waitq, - conn->c_npaths != 0)) - hash = 0; - } - if (conn->c_npaths == 1) - hash = 0; - } - return hash; -} - static int rds_rdma_bytes(struct msghdr *msg, size_t *rdma_bytes) { struct rds_rdma_args *args; @@ -1303,10 +1330,32 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) rs->rs_conn = conn; } - if (conn->c_trans->t_mp_capable) - cpath = &conn->c_path[rds_send_mprds_hash(rs, conn, nonblock)]; - else + if (conn->c_trans->t_mp_capable) { + /* Use c_path[0] until we learn that + * the peer supports more (c_npaths > 1) + */ + cpath = &conn->c_path[RDS_MPATH_HASH(rs, conn->c_npaths ? : 1)]; + } else { cpath = &conn->c_path[0]; + } + + /* If we're multipath capable and path 0 is down, queue reconnect + * and send a ping. This initiates the multipath handshake through + * rds_send_probe(), which sends RDS_EXTHDR_NPATHS to the peer, + * starting multipath capability negotiation. + */ + if (conn->c_trans->t_mp_capable && + !rds_conn_path_up(&conn->c_path[0])) { + /* Ensures that only one request is queued. And + * rds_send_ping() ensures that only one ping is + * outstanding. 
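/* The sender above relies on an atomic test_and_set_bit() so that, however
 * many senders race on a down path, only one reconnect work item is ever
 * queued (the bit is cleared again once the connect work has run).  The
 * guard in isolation, with placeholder names (struct lane, lane_kick_*):
 */
#include <linux/bitops.h>
#include <linux/workqueue.h>

#define RECONNECT_PENDING	0		/* bit number in ->flags */

struct lane {
	unsigned long flags;
	struct delayed_work conn_work;
	struct workqueue_struct *wq;
};

static void lane_kick_reconnect(struct lane *l)
{
	/* Atomic read-modify-write: only the first caller to flip the bit
	 * queues the work; later callers see it set and back off.
	 */
	if (!test_and_set_bit(RECONNECT_PENDING, &l->flags))
		queue_delayed_work(l->wq, &l->conn_work, 0);
}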
+ */ + if (!test_and_set_bit(RDS_RECONNECT_PENDING, + &conn->c_path[0].cp_flags)) + queue_delayed_work(conn->c_path[0].cp_wq, + &conn->c_path[0].cp_conn_w, 0); + rds_send_ping(conn, 0); + } rm->m_conn_path = cpath; @@ -1380,7 +1429,7 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) if (rds_destroy_pending(cpath->cp_conn)) ret = -ENETUNREACH; else - queue_delayed_work(rds_wq, &cpath->cp_send_w, 1); + queue_delayed_work(cpath->cp_wq, &cpath->cp_send_w, 1); rcu_read_unlock(); } if (ret) @@ -1456,24 +1505,26 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport, cp->cp_conn->c_trans->t_mp_capable) { __be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); __be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); + u8 dummy = 0; rds_message_add_extension(&rm->m_inc.i_hdr, - RDS_EXTHDR_NPATHS, &npaths, - sizeof(npaths)); + RDS_EXTHDR_NPATHS, &npaths); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_GEN_NUM, - &my_gen_num, - sizeof(u32)); + &my_gen_num); + rds_message_add_extension(&rm->m_inc.i_hdr, + RDS_EXTHDR_SPORT_IDX, + &dummy); } spin_unlock_irqrestore(&cp->cp_lock, flags); rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); - /* schedule the send work on rds_wq */ + /* schedule the send work on cp_wq */ rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) - queue_delayed_work(rds_wq, &cp->cp_send_w, 1); + queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 1); rcu_read_unlock(); rds_message_put(rm); diff --git a/net/rds/stats.c b/net/rds/stats.c index cb2e3d2cdf73..24ee22d09e8c 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -79,6 +79,7 @@ static const char *const rds_stat_names[] = { "recv_bytes_added_to_sock", "recv_bytes_freed_fromsock", "send_stuck_rm", + "mprds_catchup_tx0_retries", }; void rds_stats_info_copy(struct rds_info_iterator *iter, diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 3cc2f303bf78..45484a93d75f 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -213,6 +213,8 @@ void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp) sock->sk->sk_data_ready = sock->sk->sk_user_data; tc->t_sock = sock; + if (!tc->t_rtn) + tc->t_rtn = net_generic(sock_net(sock->sk), rds_tcp_netid); tc->t_cpath = cp; tc->t_orig_data_ready = sock->sk->sk_data_ready; tc->t_orig_write_space = sock->sk->sk_write_space; @@ -378,9 +380,11 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) } mutex_init(&tc->t_conn_path_lock); tc->t_sock = NULL; + tc->t_rtn = NULL; tc->t_tinc = NULL; tc->t_tinc_hdr_rem = sizeof(struct rds_header); tc->t_tinc_data_rem = 0; + init_waitqueue_head(&tc->t_recv_done_waitq); conn->c_path[i].cp_transport_data = tc; tc->t_cpath = &conn->c_path[i]; @@ -458,6 +462,7 @@ struct rds_transport rds_tcp_transport = { .recv_path = rds_tcp_recv_path, .conn_alloc = rds_tcp_conn_alloc, .conn_free = rds_tcp_conn_free, + .conn_slots_available = rds_tcp_conn_slots_available, .conn_path_connect = rds_tcp_conn_path_connect, .conn_path_shutdown = rds_tcp_conn_path_shutdown, .inc_copy_to_user = rds_tcp_inc_copy_to_user, @@ -473,17 +478,7 @@ struct rds_transport rds_tcp_transport = { .t_unloading = rds_tcp_is_unloading, }; -static unsigned int rds_tcp_netid; - -/* per-network namespace private data for this module */ -struct rds_tcp_net { - struct socket *rds_tcp_listen_sock; - struct work_struct rds_tcp_accept_w; - struct ctl_table_header *rds_tcp_sysctl; - struct ctl_table *ctl_table; - int sndbuf_size; - int rcvbuf_size; -}; +int rds_tcp_netid; /* All module specific customizations to the RDS-TCP socket should be done 
in * rds_tcp_tune() and applied after socket creation. @@ -526,15 +521,12 @@ static void rds_tcp_accept_worker(struct work_struct *work) struct rds_tcp_net, rds_tcp_accept_w); - while (rds_tcp_accept_one(rtn->rds_tcp_listen_sock) == 0) + while (rds_tcp_accept_one(rtn) == 0) cond_resched(); } -void rds_tcp_accept_work(struct sock *sk) +void rds_tcp_accept_work(struct rds_tcp_net *rtn) { - struct net *net = sock_net(sk); - struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid); - queue_work(rds_wq, &rtn->rds_tcp_accept_w); } @@ -546,6 +538,8 @@ static __net_init int rds_tcp_init_net(struct net *net) memset(rtn, 0, sizeof(*rtn)); + mutex_init(&rtn->rds_tcp_accept_lock); + /* {snd, rcv}buf_size default to 0, which implies we let the * stack pick the value, and permit auto-tuning of buffer size. */ @@ -609,6 +603,8 @@ static void rds_tcp_kill_sock(struct net *net) rtn->rds_tcp_listen_sock = NULL; rds_tcp_listen_stop(lsock, &rtn->rds_tcp_accept_w); + if (rtn->rds_tcp_accepted_sock) + sock_release(rtn->rds_tcp_accepted_sock); spin_lock_irq(&rds_tcp_conn_lock); list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) { struct net *c_net = read_pnet(&tc->t_cpath->cp_conn->c_net); diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 053aa7da87ef..39c86347188c 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -4,6 +4,21 @@ #define RDS_TCP_PORT 16385 +/* per-network namespace private data for this module */ +struct rds_tcp_net { + /* serialize "rds_tcp_accept_one" with "rds_tcp_accept_lock" + * to protect "rds_tcp_accepted_sock" + */ + struct mutex rds_tcp_accept_lock; + struct socket *rds_tcp_listen_sock; + struct socket *rds_tcp_accepted_sock; + struct work_struct rds_tcp_accept_w; + struct ctl_table_header *rds_tcp_sysctl; + const struct ctl_table *ctl_table; + int sndbuf_size; + int rcvbuf_size; +}; + struct rds_tcp_incoming { struct rds_incoming ti_inc; struct sk_buff_head ti_skb_list; @@ -19,6 +34,8 @@ struct rds_tcp_connection { */ struct mutex t_conn_path_lock; struct socket *t_sock; + u32 t_client_port_group; + struct rds_tcp_net *t_rtn; void *t_orig_write_space; void *t_orig_data_ready; void *t_orig_state_change; @@ -38,6 +55,9 @@ struct rds_tcp_connection { u32 t_last_sent_nxt; u32 t_last_expected_una; u32 t_last_seen_una; + + /* for rds_tcp_conn_path_shutdown */ + wait_queue_head_t t_recv_done_waitq; }; struct rds_tcp_statistics { @@ -49,6 +69,7 @@ struct rds_tcp_statistics { }; /* tcp.c */ +extern int rds_tcp_netid; bool rds_tcp_tune(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_conn_path *cp); void rds_tcp_reset_callbacks(struct socket *sock, struct rds_conn_path *cp); @@ -57,7 +78,7 @@ void rds_tcp_restore_callbacks(struct socket *sock, u32 rds_tcp_write_seq(struct rds_tcp_connection *tc); u32 rds_tcp_snd_una(struct rds_tcp_connection *tc); extern struct rds_transport rds_tcp_transport; -void rds_tcp_accept_work(struct sock *sk); +void rds_tcp_accept_work(struct rds_tcp_net *rtn); int rds_tcp_laddr_check(struct net *net, const struct in6_addr *addr, __u32 scope_id); /* tcp_connect.c */ @@ -69,7 +90,8 @@ void rds_tcp_state_change(struct sock *sk); struct socket *rds_tcp_listen_init(struct net *net, bool isv6); void rds_tcp_listen_stop(struct socket *sock, struct work_struct *acceptor); void rds_tcp_listen_data_ready(struct sock *sk); -int rds_tcp_accept_one(struct socket *sock); +void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out); +int rds_tcp_accept_one(struct rds_tcp_net *rtn); void rds_tcp_keepalive(struct socket 
*sock); void *rds_tcp_listen_sock_def_readable(struct net *net); @@ -86,6 +108,7 @@ void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp); void rds_tcp_xmit_path_complete(struct rds_conn_path *cp); int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); +int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack); void rds_tcp_write_space(struct sock *sk); /* tcp_stats.c */ diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 92891b0d224d..b77c88ffb199 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -75,8 +75,16 @@ void rds_tcp_state_change(struct sock *sk) rds_connect_path_complete(cp, RDS_CONN_CONNECTING); } break; + case TCP_CLOSING: + case TCP_TIME_WAIT: + if (wq_has_sleeper(&tc->t_recv_done_waitq)) + wake_up(&tc->t_recv_done_waitq); + break; case TCP_CLOSE_WAIT: + case TCP_LAST_ACK: case TCP_CLOSE: + if (wq_has_sleeper(&tc->t_recv_done_waitq)) + wake_up(&tc->t_recv_done_waitq); rds_conn_path_drop(cp, false); break; default: @@ -93,6 +101,8 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) struct sockaddr_in6 sin6; struct sockaddr_in sin; struct sockaddr *addr; + int port_low, port_high, port; + int port_groups, groups_left; int addrlen; bool isv6; int ret; @@ -145,7 +155,26 @@ int rds_tcp_conn_path_connect(struct rds_conn_path *cp) addrlen = sizeof(sin); } - ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, addrlen); + /* encode cp->cp_index in lowest bits of source-port */ + inet_get_local_port_range(rds_conn_net(conn), &port_low, &port_high); + port_low = ALIGN(port_low, RDS_MPATH_WORKERS); + port_groups = (port_high - port_low + 1) / RDS_MPATH_WORKERS; + ret = -EADDRINUSE; + groups_left = port_groups; + while (groups_left-- > 0 && ret) { + if (++tc->t_client_port_group >= port_groups) + tc->t_client_port_group = 0; + port = port_low + + tc->t_client_port_group * RDS_MPATH_WORKERS + + cp->cp_index; + + if (isv6) + sin6.sin6_port = htons(port); + else + sin.sin_port = htons(port); + ret = kernel_bind(sock, (struct sockaddr_unsized *)addr, + addrlen); + } if (ret) { rdsdebug("bind failed with %d at address %pI6c\n", ret, &conn->c_laddr); @@ -205,18 +234,58 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) { struct rds_tcp_connection *tc = cp->cp_transport_data; struct socket *sock = tc->t_sock; + struct sock *sk; + unsigned int rounds; rdsdebug("shutting down conn %p tc %p sock %p\n", cp->cp_conn, tc, sock); if (sock) { + sk = sock->sk; if (rds_destroy_pending(cp->cp_conn)) - sock_no_linger(sock->sk); - sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); - lock_sock(sock->sk); + sock_no_linger(sk); + + sock->ops->shutdown(sock, SHUT_WR); + + /* after sending FIN, + * wait until we processed all incoming messages + * and we're sure that there won't be any more: + * i.e. state CLOSING, TIME_WAIT, CLOSE_WAIT, + * LAST_ACK, or CLOSE (RFC 793). + * + * Give up waiting after 5 seconds and allow messages + * to theoretically get dropped, if the TCP transition + * didn't happen. + */ + rounds = 0; + do { + /* we need to ensure messages are dequeued here + * since "rds_recv_worker" only dispatches messages + * while the connection is still in RDS_CONN_UP + * and there is no guarantee that "rds_tcp_data_ready" + * was called nor that "sk_data_ready" still points to + * it. 
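The bind loop in rds_tcp_conn_path_connect() encodes cp_index in the low bits of the client's source port: the local port range is aligned up to RDS_MPATH_WORKERS, carved into groups of that size, and the offset inside a group is the path index, so the passive side can recover the lane via sport modulo the path count. A standalone sketch of the arithmetic, assuming a typical ip_local_port_range (the numbers are illustrative, not taken from the patch):

#include <stdio.h>

#define RDS_MPATH_WORKERS 8

/* align v up to a multiple of a */
static int align_up(int v, int a)
{
	return ((v + a - 1) / a) * a;
}

int main(void)
{
	int port_low = 32768, port_high = 60999;   /* assumed local port range */
	int cp_index = 3;                          /* lane being connected */
	int group, port;

	port_low = align_up(port_low, RDS_MPATH_WORKERS);
	int port_groups = (port_high - port_low + 1) / RDS_MPATH_WORKERS;

	/* the connect path walks the groups until a bind succeeds; show a few */
	for (group = 0; group < 3 && group < port_groups; group++) {
		port = port_low + group * RDS_MPATH_WORKERS + cp_index;
		printf("group %d -> source port %d (low bits = %d)\n",
		       group, port, port % RDS_MPATH_WORKERS);
	}
	return 0;
}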
+ */ + rds_tcp_recv_path(cp); + } while (!wait_event_timeout(tc->t_recv_done_waitq, + (sk->sk_state == TCP_CLOSING || + sk->sk_state == TCP_TIME_WAIT || + sk->sk_state == TCP_CLOSE_WAIT || + sk->sk_state == TCP_LAST_ACK || + sk->sk_state == TCP_CLOSE) && + skb_queue_empty_lockless(&sk->sk_receive_queue), + msecs_to_jiffies(100)) && + ++rounds < 50); + lock_sock(sk); + + /* discard messages that the peer received already */ + tc->t_last_seen_una = rds_tcp_snd_una(tc); + rds_send_path_drop_acked(cp, rds_tcp_snd_una(tc), + rds_tcp_is_acked); + rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ - release_sock(sock->sk); + release_sock(sk); sock_release(sock); } diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 820d3e20de19..6fb5c928b8fd 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -35,6 +35,8 @@ #include <linux/in.h> #include <net/tcp.h> #include <trace/events/sock.h> +#include <net/net_namespace.h> +#include <net/netns/generic.h> #include "rds.h" #include "tcp.h" @@ -54,49 +56,120 @@ void rds_tcp_keepalive(struct socket *sock) tcp_sock_set_keepintvl(sock->sk, keepidle); } +static int +rds_tcp_get_peer_sport(struct socket *sock) +{ + union { + struct sockaddr_storage storage; + struct sockaddr addr; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; + } saddr; + int sport; + + if (kernel_getpeername(sock, &saddr.addr) >= 0) { + switch (saddr.addr.sa_family) { + case AF_INET: + sport = ntohs(saddr.sin.sin_port); + break; + case AF_INET6: + sport = ntohs(saddr.sin6.sin6_port); + break; + default: + sport = -1; + } + } else { + sport = -1; + } + + return sport; +} + /* rds_tcp_accept_one_path(): if accepting on cp_index > 0, make sure the * client's ipaddr < server's ipaddr. Otherwise, close the accepted * socket and force a reconneect from smaller -> larger ip addr. The reason * we special case cp_index 0 is to allow the rds probe ping itself to itself * get through efficiently. - * Since reconnects are only initiated from the node with the numerically - * smaller ip address, we recycle conns in RDS_CONN_ERROR on the passive side - * by moving them to CONNECTING in this function. */ -static -struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) +static struct rds_tcp_connection * +rds_tcp_accept_one_path(struct rds_connection *conn, struct socket *sock) { - int i; - int npaths = max_t(int, 1, conn->c_npaths); + int sport, npaths, i_min, i_max, i; - /* for mprds, all paths MUST be initiated by the peer - * with the smaller address. - */ - if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) >= 0) { - /* Make sure we initiate at least one path if this - * has not already been done; rds_start_mprds() will - * take care of additional paths, if necessary. 
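rds_tcp_get_peer_sport() recovers that encoded lane on the accepting side by asking the socket for its peer address. The same shape in plain userspace C with getpeername(2); this is a sketch of the pattern, not the kernel helper:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* return the peer's TCP source port in host order, or -1 on error */
int peer_source_port(int fd)
{
	struct sockaddr_storage ss;
	socklen_t len = sizeof(ss);

	if (getpeername(fd, (struct sockaddr *)&ss, &len) < 0)
		return -1;

	switch (ss.ss_family) {
	case AF_INET:
		return ntohs(((struct sockaddr_in *)&ss)->sin_port);
	case AF_INET6:
		return ntohs(((struct sockaddr_in6 *)&ss)->sin6_port);
	default:
		return -1;
	}
}

The caller would hand in a connected TCP descriptor; -1 signals an unusable or unexpected address family, which in the patch makes the listener fall back to scanning all slots.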
- */ - if (npaths == 1) - rds_conn_path_connect_if_down(&conn->c_path[0]); - return NULL; + if (conn->c_with_sport_idx) + /* cp->cp_index is encoded in lowest bits of source-port */ + sport = rds_tcp_get_peer_sport(sock); + else + sport = -1; + + npaths = max_t(int, 1, conn->c_npaths); + + if (sport >= 0) { + i_min = sport % npaths; + i_max = i_min; + } else { + i_min = 0; + i_max = npaths - 1; } - for (i = 0; i < npaths; i++) { + for (i = i_min; i <= i_max; i++) { struct rds_conn_path *cp = &conn->c_path[i]; if (rds_conn_path_transition(cp, RDS_CONN_DOWN, - RDS_CONN_CONNECTING) || - rds_conn_path_transition(cp, RDS_CONN_ERROR, - RDS_CONN_CONNECTING)) { + RDS_CONN_CONNECTING)) return cp->cp_transport_data; - } } + return NULL; } -int rds_tcp_accept_one(struct socket *sock) +void rds_tcp_conn_slots_available(struct rds_connection *conn, bool fan_out) +{ + struct rds_tcp_connection *tc; + struct rds_tcp_net *rtn; + struct socket *sock; + int sport, npaths; + + if (rds_destroy_pending(conn)) + return; + + tc = conn->c_path->cp_transport_data; + rtn = tc->t_rtn; + if (!rtn) + return; + + sock = tc->t_sock; + + /* During fan-out, check that the connection we already + * accepted in slot#0 carried the proper source port modulo. + */ + if (fan_out && conn->c_with_sport_idx && sock && + rds_addr_cmp(&conn->c_laddr, &conn->c_faddr) > 0) { + /* cp->cp_index is encoded in lowest bits of source-port */ + sport = rds_tcp_get_peer_sport(sock); + npaths = max_t(int, 1, conn->c_npaths); + if (sport >= 0 && sport % npaths != 0) + /* peer initiated with a non-#0 lane first */ + rds_conn_path_drop(conn->c_path, 0); + } + + /* As soon as a connection went down, + * it is safe to schedule a "rds_tcp_accept_one" + * attempt even if there are no connections pending: + * Function "rds_tcp_accept_one" won't block + * but simply return -EAGAIN in that case. + * + * Doing so is necessary to address the case where an + * incoming connection on "rds_tcp_listen_sock" is ready + * to be acccepted prior to a free slot being available: + * the -ENOBUFS case in "rds_tcp_accept_one". 
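rds_tcp_accept_one_path() narrows the slot search to a single index when the peer advertised the source-port encoding, and otherwise scans every path for a free slot. A compact sketch of that range selection, with a boolean array standing in for the DOWN to CONNECTING transition (illustrative only):

#include <stdbool.h>
#include <stdio.h>

#define MAX_PATHS 8

static bool slot_free[MAX_PATHS];

/* return the first claimable slot, restricted to sport % npaths when known */
static int pick_accept_slot(int sport, int npaths)
{
	int i_min = 0, i_max = npaths - 1, i;

	if (sport >= 0) {
		i_min = sport % npaths;   /* peer told us the lane */
		i_max = i_min;
	}
	for (i = i_min; i <= i_max; i++) {
		if (slot_free[i]) {
			slot_free[i] = false; /* DOWN -> CONNECTING in the real code */
			return i;
		}
	}
	return -1; /* no slot: the caller stashes the socket and retries later */
}

int main(void)
{
	for (int i = 0; i < MAX_PATHS; i++)
		slot_free[i] = true;

	printf("sport 45059, 4 paths -> slot %d\n", pick_accept_slot(45059, 4));
	printf("unknown sport, 4 paths -> slot %d\n", pick_accept_slot(-1, 4));
	return 0;
}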
+ */ + rds_tcp_accept_work(rtn); +} + +int rds_tcp_accept_one(struct rds_tcp_net *rtn) { + struct socket *listen_sock = rtn->rds_tcp_listen_sock; struct socket *new_sock = NULL; struct rds_connection *conn; int ret; @@ -110,17 +183,23 @@ int rds_tcp_accept_one(struct socket *sock) #endif int dev_if = 0; - if (!sock) /* module unload or netns delete in progress */ + if (!listen_sock) /* module unload or netns delete in progress */ return -ENETUNREACH; - ret = kernel_accept(sock, &new_sock, O_NONBLOCK); - if (ret) - return ret; + mutex_lock(&rtn->rds_tcp_accept_lock); + new_sock = rtn->rds_tcp_accepted_sock; + rtn->rds_tcp_accepted_sock = NULL; - rds_tcp_keepalive(new_sock); - if (!rds_tcp_tune(new_sock)) { - ret = -EINVAL; - goto out; + if (!new_sock) { + ret = kernel_accept(listen_sock, &new_sock, O_NONBLOCK); + if (ret) + goto out; + + rds_tcp_keepalive(new_sock); + if (!rds_tcp_tune(new_sock)) { + ret = -EINVAL; + goto out; + } } inet = inet_sk(new_sock->sk); @@ -135,7 +214,7 @@ int rds_tcp_accept_one(struct socket *sock) peer_addr = &daddr; #endif rdsdebug("accepted family %d tcp %pI6c:%u -> %pI6c:%u\n", - sock->sk->sk_family, + listen_sock->sk->sk_family, my_addr, ntohs(inet->inet_sport), peer_addr, ntohs(inet->inet_dport)); @@ -155,13 +234,13 @@ int rds_tcp_accept_one(struct socket *sock) } #endif - if (!rds_tcp_laddr_check(sock_net(sock->sk), peer_addr, dev_if)) { + if (!rds_tcp_laddr_check(sock_net(listen_sock->sk), peer_addr, dev_if)) { /* local address connection is only allowed via loopback */ ret = -EOPNOTSUPP; goto out; } - conn = rds_conn_create(sock_net(sock->sk), + conn = rds_conn_create(sock_net(listen_sock->sk), my_addr, peer_addr, &rds_tcp_transport, 0, GFP_KERNEL, dev_if); @@ -174,15 +253,51 @@ int rds_tcp_accept_one(struct socket *sock) * If the client reboots, this conn will need to be cleaned up. * rds_tcp_state_change() will do that cleanup */ - rs_tcp = rds_tcp_accept_one_path(conn); - if (!rs_tcp) + if (rds_addr_cmp(&conn->c_faddr, &conn->c_laddr) < 0) { + /* Try to obtain a free connection slot. + * If unsuccessful, we need to preserve "new_sock" + * that we just accepted, since its "sk_receive_queue" + * may contain messages already that have been acknowledged + * to and discarded by the sender. + * We must not throw those away! + */ + rs_tcp = rds_tcp_accept_one_path(conn, new_sock); + if (!rs_tcp) { + /* It's okay to stash "new_sock", since + * "rds_tcp_conn_slots_available" triggers + * "rds_tcp_accept_one" again as soon as one of the + * connection slots becomes available again + */ + rtn->rds_tcp_accepted_sock = new_sock; + new_sock = NULL; + ret = -ENOBUFS; + goto out; + } + } else { + /* This connection request came from a peer with + * a larger address. + * Function "rds_tcp_state_change" makes sure + * that the connection doesn't transition + * to state "RDS_CONN_UP", and therefore + * we should not have received any messages + * on this socket yet. + * This is the only case where it's okay to + * not dequeue messages from "sk_receive_queue". 
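The accept path stashes the freshly accepted socket in rds_tcp_accepted_sock when no slot is free, so data already acknowledged to (and discarded by) the sender is not thrown away, and the accept is retried once rds_tcp_conn_slots_available() fires. A minimal userspace sketch of that stash-and-retry shape, using a pthread mutex and an fd where the kernel keeps a struct socket under rds_tcp_accept_lock:

#include <pthread.h>
#include <stdio.h>

struct acceptor {
	pthread_mutex_t lock;
	int pending_fd;        /* -1 when nothing is stashed */
};

/* returns a previously stashed fd to service, or -1 meaning "accept a new one" */
static int take_pending(struct acceptor *a)
{
	pthread_mutex_lock(&a->lock);
	int fd = a->pending_fd;
	a->pending_fd = -1;
	pthread_mutex_unlock(&a->lock);
	return fd;
}

/* called when no path slot was free: keep the socket for the next attempt */
static void stash_pending(struct acceptor *a, int fd)
{
	pthread_mutex_lock(&a->lock);
	a->pending_fd = fd;
	pthread_mutex_unlock(&a->lock);
}

int main(void)
{
	struct acceptor a = { PTHREAD_MUTEX_INITIALIZER, -1 };

	stash_pending(&a, 7);                              /* the -ENOBUFS case */
	printf("retry services fd %d\n", take_pending(&a));    /* 7 */
	printf("then accepts anew: %d\n", take_pending(&a));   /* -1 */
	return 0;
}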
+ */ + if (conn->c_npaths <= 1) + rds_conn_path_connect_if_down(&conn->c_path[0]); + rs_tcp = NULL; goto rst_nsk; + } + mutex_lock(&rs_tcp->t_conn_path_lock); cp = rs_tcp->t_cpath; conn_state = rds_conn_path_state(cp); WARN_ON(conn_state == RDS_CONN_UP); - if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) + if (conn_state != RDS_CONN_CONNECTING && conn_state != RDS_CONN_ERROR) { + rds_conn_path_drop(cp, 0); goto rst_nsk; + } if (rs_tcp->t_sock) { /* Duelling SYN has been handled in rds_tcp_accept_one() */ rds_tcp_reset_callbacks(new_sock, cp); @@ -192,6 +307,22 @@ int rds_tcp_accept_one(struct socket *sock) rds_tcp_set_callbacks(new_sock, cp); rds_connect_path_complete(cp, RDS_CONN_CONNECTING); } + + /* Since "rds_tcp_set_callbacks" happens this late + * the connection may already have been closed without + * "rds_tcp_state_change" doing its due diligence. + * + * If that's the case, we simply drop the path, + * knowing that "rds_tcp_conn_path_shutdown" will + * dequeue pending messages. + */ + if (new_sock->sk->sk_state == TCP_CLOSE_WAIT || + new_sock->sk->sk_state == TCP_LAST_ACK || + new_sock->sk->sk_state == TCP_CLOSE) + rds_conn_path_drop(cp, 0); + else + queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0); + new_sock = NULL; ret = 0; if (conn->c_npaths == 0) @@ -212,6 +343,9 @@ out: mutex_unlock(&rs_tcp->t_conn_path_lock); if (new_sock) sock_release(new_sock); + + mutex_unlock(&rtn->rds_tcp_accept_lock); + return ret; } @@ -239,7 +373,7 @@ void rds_tcp_listen_data_ready(struct sock *sk) * the listen socket is being torn down. */ if (sk->sk_state == TCP_LISTEN) - rds_tcp_accept_work(sk); + rds_tcp_accept_work(net_generic(sock_net(sk), rds_tcp_netid)); else ready = rds_tcp_listen_sock_def_readable(sock_net(sk)); diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 7997a19d1da3..49f96ee0c40f 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -278,6 +278,10 @@ static int rds_tcp_read_sock(struct rds_conn_path *cp, gfp_t gfp) rdsdebug("tcp_read_sock for tc %p gfp 0x%x returned %d\n", tc, gfp, desc.error); + if (skb_queue_empty_lockless(&sock->sk->sk_receive_queue) && + wq_has_sleeper(&tc->t_recv_done_waitq)) + wake_up(&tc->t_recv_done_waitq); + return desc.error; } @@ -327,7 +331,7 @@ void rds_tcp_data_ready(struct sock *sk) if (rds_tcp_read_sock(cp, GFP_ATOMIC) == -ENOMEM) { rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) - queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0); rcu_read_unlock(); } out: diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 7d284ac7e81a..7c52acc749cf 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -169,7 +169,7 @@ out: * unacked byte of the TCP sequence space. We have to do very careful * wrapping 32bit comparisons here. 
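The comment closing the hunk above ("very careful wrapping 32bit comparisons") concerns the newly exported rds_tcp_is_acked(): TCP sequence numbers wrap at 2^32, so ordering has to be decided with a signed difference rather than a direct comparison. A small standalone illustration of that idiom:

#include <stdint.h>
#include <stdio.h>

/* true if sequence number a is at or after b, modulo 2^32 */
static int seq_after_eq(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) >= 0;
}

int main(void)
{
	uint32_t una = 0xfffffff0u;      /* last unacked byte, close to wrap */
	uint32_t acked_to = 0x00000010u; /* ack that already wrapped past 0 */

	printf("naive compare: %d\n", acked_to >= una);             /* 0: wrong */
	printf("wrap-safe:     %d\n", seq_after_eq(acked_to, una)); /* 1: right */
	return 0;
}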
*/ -static int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack) +int rds_tcp_is_acked(struct rds_message *rm, uint64_t ack) { if (!test_bit(RDS_MSG_HAS_ACK_SEQ, &rm->m_flags)) return 0; @@ -201,7 +201,7 @@ void rds_tcp_write_space(struct sock *sk) rcu_read_lock(); if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf && !rds_destroy_pending(cp->cp_conn)) - queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0); rcu_read_unlock(); out: diff --git a/net/rds/threads.c b/net/rds/threads.c index 1f424cbfcbb4..639302bab51e 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -89,8 +89,8 @@ void rds_connect_path_complete(struct rds_conn_path *cp, int curr) set_bit(0, &cp->cp_conn->c_map_queued); rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) { - queue_delayed_work(rds_wq, &cp->cp_send_w, 0); - queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0); } rcu_read_unlock(); cp->cp_conn->c_proposed_version = RDS_PROTOCOL_VERSION; @@ -140,7 +140,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp) cp->cp_reconnect_jiffies = rds_sysctl_reconnect_min_jiffies; rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) - queue_delayed_work(rds_wq, &cp->cp_conn_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_conn_w, 0); rcu_read_unlock(); return; } @@ -151,7 +151,7 @@ void rds_queue_reconnect(struct rds_conn_path *cp) conn, &conn->c_laddr, &conn->c_faddr); rcu_read_lock(); if (!rds_destroy_pending(cp->cp_conn)) - queue_delayed_work(rds_wq, &cp->cp_conn_w, + queue_delayed_work(cp->cp_wq, &cp->cp_conn_w, rand % cp->cp_reconnect_jiffies); rcu_read_unlock(); @@ -203,11 +203,11 @@ void rds_send_worker(struct work_struct *work) switch (ret) { case -EAGAIN: rds_stats_inc(s_send_immediate_retry); - queue_delayed_work(rds_wq, &cp->cp_send_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 0); break; case -ENOMEM: rds_stats_inc(s_send_delayed_retry); - queue_delayed_work(rds_wq, &cp->cp_send_w, 2); + queue_delayed_work(cp->cp_wq, &cp->cp_send_w, 2); break; default: break; @@ -228,11 +228,11 @@ void rds_recv_worker(struct work_struct *work) switch (ret) { case -EAGAIN: rds_stats_inc(s_recv_immediate_retry); - queue_delayed_work(rds_wq, &cp->cp_recv_w, 0); + queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 0); break; case -ENOMEM: rds_stats_inc(s_recv_delayed_retry); - queue_delayed_work(rds_wq, &cp->cp_recv_w, 2); + queue_delayed_work(cp->cp_wq, &cp->cp_recv_w, 2); break; default: break; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 2b6ac7069dc1..81d488655793 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -13,9 +13,11 @@ #include <linux/skbuff.h> #include <linux/rtnetlink.h> #include <linux/pkt_cls.h> +#include <linux/if_tunnel.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/rhashtable.h> +#include <net/gre.h> #include <net/netlink.h> #include <net/pkt_sched.h> #include <net/pkt_cls.h> diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 71efe04d00b5..d2c750bab1d3 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -16,6 +16,7 @@ #include <net/pkt_sched.h> #include <net/act_api.h> #include <net/pkt_cls.h> +#include <net/inet_ecn.h> #include <uapi/linux/tc_act/tc_ctinfo.h> #include <net/tc_act/tc_ctinfo.h> #include <net/tc_wrapper.h> diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index f56b18c8aebf..443c116e8663 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ 
-1353,7 +1353,7 @@ err_out4: ops->destroy(sch); qdisc_put_stab(rtnl_dereference(sch->stab)); err_out3: - lockdep_unregister_key(&sch->root_lock_key); + qdisc_lock_uninit(sch, ops); netdev_put(dev, &sch->dev_tracker); qdisc_free(sch); err_out2: diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 4a64d6397b6f..d2bbd5654d5b 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -67,6 +67,7 @@ #include <linux/if_vlan.h> #include <net/gso.h> #include <net/pkt_sched.h> +#include <net/sch_priv.h> #include <net/pkt_cls.h> #include <net/tcp.h> #include <net/flow_dissector.h> @@ -197,40 +198,45 @@ struct cake_tin_data { u32 way_collisions; }; /* number of tins is small, so size of this struct doesn't matter much */ +struct cake_sched_config { + u64 rate_bps; + u64 interval; + u64 target; + u64 sync_time; + u32 buffer_config_limit; + u32 fwmark_mask; + u16 fwmark_shft; + s16 rate_overhead; + u16 rate_mpu; + u16 rate_flags; + u8 tin_mode; + u8 flow_mode; + u8 atm_mode; + u8 ack_filter; + u8 is_shared; +}; + struct cake_sched_data { struct tcf_proto __rcu *filter_list; /* optional external classifier */ struct tcf_block *block; struct cake_tin_data *tins; + struct cake_sched_config *config; + struct cake_sched_config initial_config; struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS]; - u16 overflow_timeout; - - u16 tin_cnt; - u8 tin_mode; - u8 flow_mode; - u8 ack_filter; - u8 atm_mode; - - u32 fwmark_mask; - u16 fwmark_shft; /* time_next = time_this + ((len * rate_ns) >> rate_shft) */ - u16 rate_shft; ktime_t time_next_packet; ktime_t failsafe_next_packet; u64 rate_ns; - u64 rate_bps; - u16 rate_flags; - s16 rate_overhead; - u16 rate_mpu; - u64 interval; - u64 target; + u16 rate_shft; + u16 overflow_timeout; + u16 tin_cnt; /* resource tracking */ u32 buffer_used; u32 buffer_max_used; u32 buffer_limit; - u32 buffer_config_limit; /* indices for dequeue */ u16 cur_tin; @@ -254,6 +260,11 @@ struct cake_sched_data { u16 max_adjlen; u16 min_netlen; u16 min_adjlen; + + /* mq sync state */ + u64 last_checked_active; + u64 last_active; + u32 active_queues; }; enum { @@ -380,6 +391,8 @@ static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = { 1239850263, 1191209601, 1147878294, 1108955788 }; +static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, + u64 target_ns, u64 rtt_est_ns); /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2) * @@ -1198,7 +1211,7 @@ static bool cake_tcph_may_drop(const struct tcphdr *tcph, static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, struct cake_flow *flow) { - bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE; + bool aggressive = q->config->ack_filter == CAKE_ACK_AGGRESSIVE; struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL; struct sk_buff *skb_check, *skb_prev = NULL; const struct ipv6hdr *ipv6h, *ipv6h_check; @@ -1266,7 +1279,7 @@ static struct sk_buff *cake_ack_filter(struct cake_sched_data *q, ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr)) continue; - seglen = ntohs(ipv6h_check->payload_len); + seglen = ipv6_payload_len(skb, ipv6h_check); } else { WARN_ON(1); /* shouldn't happen */ continue; @@ -1358,15 +1371,17 @@ static u64 cake_ewma(u64 avg, u64 sample, u32 shift) return avg; } -static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) +static u32 cake_calc_overhead(struct cake_sched_data *qd, u32 len, u32 off) { + struct cake_sched_config *q = qd->config; + if (q->rate_flags & CAKE_FLAG_OVERHEAD) len -= off; - if 
(q->max_netlen < len) - q->max_netlen = len; - if (q->min_netlen > len) - q->min_netlen = len; + if (qd->max_netlen < len) + qd->max_netlen = len; + if (qd->min_netlen > len) + qd->min_netlen = len; len += q->rate_overhead; @@ -1385,10 +1400,10 @@ static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off) len += (len + 63) / 64; } - if (q->max_adjlen < len) - q->max_adjlen = len; - if (q->min_adjlen > len) - q->min_adjlen = len; + if (qd->max_adjlen < len) + qd->max_adjlen = len; + if (qd->min_adjlen > len) + qd->min_adjlen = len; return len; } @@ -1586,7 +1601,7 @@ static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free) flow->dropped++; b->tin_dropped++; - if (q->rate_flags & CAKE_FLAG_INGRESS) + if (q->config->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, skb, now, true); qdisc_drop_reason(skb, sch, to_free, SKB_DROP_REASON_QDISC_OVERLIMIT); @@ -1656,7 +1671,8 @@ static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash) static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, struct sk_buff *skb) { - struct cake_sched_data *q = qdisc_priv(sch); + struct cake_sched_data *qd = qdisc_priv(sch); + struct cake_sched_config *q = qd->config; u32 tin, mark; bool wash; u8 dscp; @@ -1673,24 +1689,24 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT) tin = 0; - else if (mark && mark <= q->tin_cnt) - tin = q->tin_order[mark - 1]; + else if (mark && mark <= qd->tin_cnt) + tin = qd->tin_order[mark - 1]; else if (TC_H_MAJ(skb->priority) == sch->handle && TC_H_MIN(skb->priority) > 0 && - TC_H_MIN(skb->priority) <= q->tin_cnt) - tin = q->tin_order[TC_H_MIN(skb->priority) - 1]; + TC_H_MIN(skb->priority) <= qd->tin_cnt) + tin = qd->tin_order[TC_H_MIN(skb->priority) - 1]; else { if (!wash) dscp = cake_handle_diffserv(skb, wash); - tin = q->tin_index[dscp]; + tin = qd->tin_index[dscp]; - if (unlikely(tin >= q->tin_cnt)) + if (unlikely(tin >= qd->tin_cnt)) tin = 0; } - return &q->tins[tin]; + return &qd->tins[tin]; } static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t, @@ -1746,7 +1762,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, bool same_flow = false; /* choose flow to insert into */ - idx = cake_classify(sch, &b, skb, q->flow_mode, &ret); + idx = cake_classify(sch, &b, skb, q->config->flow_mode, &ret); if (idx == 0) { if (ret & __NET_XMIT_BYPASS) qdisc_qstats_drop(sch); @@ -1781,7 +1797,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (unlikely(len > b->max_skblen)) b->max_skblen = len; - if (qdisc_pkt_segs(skb) > 1 && q->rate_flags & CAKE_FLAG_SPLIT_GSO) { + if (qdisc_pkt_segs(skb) > 1 && q->config->rate_flags & CAKE_FLAG_SPLIT_GSO) { struct sk_buff *segs, *nskb; netdev_features_t features = netif_skb_features(skb); unsigned int slen = 0, numsegs = 0; @@ -1823,7 +1839,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb); flow_queue_add(flow, skb); - if (q->ack_filter) + if (q->config->ack_filter) ack = cake_ack_filter(q, flow); if (ack) { @@ -1832,7 +1848,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, ack_pkt_len = qdisc_pkt_len(ack); b->bytes += ack_pkt_len; q->buffer_used += skb->truesize - ack->truesize; - if (q->rate_flags & CAKE_FLAG_INGRESS) + if (q->config->rate_flags & CAKE_FLAG_INGRESS) cake_advance_shaper(q, b, ack, now, true); qdisc_tree_reduce_backlog(sch, 1, ack_pkt_len); @@ -1855,7 +1871,7 @@ static s32 
cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, cake_heapify_up(q, b->overflow_idx[idx]); /* incoming bandwidth capacity estimate */ - if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { + if (q->config->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) { u64 packet_interval = \ ktime_to_ns(ktime_sub(now, q->last_packet_time)); @@ -1887,7 +1903,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, if (ktime_after(now, ktime_add_ms(q->last_reconfig_time, 250))) { - q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; + q->config->rate_bps = (q->avg_peak_bandwidth * 15) >> 4; cake_reconfigure(sch); } } @@ -1907,7 +1923,7 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, flow->set = CAKE_SET_SPARSE; b->sparse_flow_count++; - flow->deficit = cake_get_flow_quantum(b, flow, q->flow_mode); + flow->deficit = cake_get_flow_quantum(b, flow, q->config->flow_mode); } else if (flow->set == CAKE_SET_SPARSE_WAIT) { /* this flow was empty, accounted as a sparse flow, but actually * in the bulk rotation. @@ -1916,8 +1932,8 @@ static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch, b->sparse_flow_count--; b->bulk_flow_count++; - cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); - cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); } if (q->buffer_used > q->buffer_max_used) @@ -1997,6 +2013,40 @@ static struct sk_buff *cake_dequeue(struct Qdisc *sch) u64 delay; u32 len; + if (q->config->is_shared && now - q->last_checked_active >= q->config->sync_time) { + struct net_device *dev = qdisc_dev(sch); + struct cake_sched_data *other_priv; + u64 new_rate = q->config->rate_bps; + u64 other_qlen, other_last_active; + struct Qdisc *other_sch; + u32 num_active_qs = 1; + unsigned int ntx; + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + other_sch = rcu_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); + other_priv = qdisc_priv(other_sch); + + if (other_priv == q) + continue; + + other_qlen = READ_ONCE(other_sch->q.qlen); + other_last_active = READ_ONCE(other_priv->last_active); + + if (other_qlen || other_last_active > q->last_checked_active) + num_active_qs++; + } + + if (num_active_qs > 1) + new_rate = div64_u64(q->config->rate_bps, num_active_qs); + + /* mtu = 0 is used to only update the rate and not mess with cobalt params */ + cake_set_rate(b, new_rate, 0, 0, 0); + q->last_checked_active = now; + q->active_queues = num_active_qs; + q->rate_ns = b->tin_rate_ns; + q->rate_shft = b->tin_rate_shft; + } + begin: if (!sch->q.qlen) return NULL; @@ -2104,8 +2154,8 @@ retry: b->sparse_flow_count--; b->bulk_flow_count++; - cake_inc_srchost_bulk_flow_count(b, flow, q->flow_mode); - cake_inc_dsthost_bulk_flow_count(b, flow, q->flow_mode); + cake_inc_srchost_bulk_flow_count(b, flow, q->config->flow_mode); + cake_inc_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); flow->set = CAKE_SET_BULK; } else { @@ -2117,7 +2167,7 @@ retry: } } - flow->deficit += cake_get_flow_quantum(b, flow, q->flow_mode); + flow->deficit += cake_get_flow_quantum(b, flow, q->config->flow_mode); list_move_tail(&flow->flowchain, &b->old_flows); goto retry; @@ -2141,8 +2191,8 @@ retry: if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); - cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, 
flow, q->config->flow_mode); b->decaying_flow_count++; } else if (flow->set == CAKE_SET_SPARSE || @@ -2160,8 +2210,8 @@ retry: else if (flow->set == CAKE_SET_BULK) { b->bulk_flow_count--; - cake_dec_srchost_bulk_flow_count(b, flow, q->flow_mode); - cake_dec_dsthost_bulk_flow_count(b, flow, q->flow_mode); + cake_dec_srchost_bulk_flow_count(b, flow, q->config->flow_mode); + cake_dec_dsthost_bulk_flow_count(b, flow, q->config->flow_mode); } else b->decaying_flow_count--; @@ -2172,14 +2222,14 @@ retry: reason = cobalt_should_drop(&flow->cvars, &b->cparams, now, skb, (b->bulk_flow_count * - !!(q->rate_flags & + !!(q->config->rate_flags & CAKE_FLAG_INGRESS))); /* Last packet in queue may be marked, shouldn't be dropped */ if (reason == SKB_NOT_DROPPED_YET || !flow->head) break; /* drop this packet, get another one */ - if (q->rate_flags & CAKE_FLAG_INGRESS) { + if (q->config->rate_flags & CAKE_FLAG_INGRESS) { len = cake_advance_shaper(q, b, skb, now, true); flow->deficit -= len; @@ -2190,12 +2240,13 @@ retry: qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb)); qdisc_qstats_drop(sch); qdisc_dequeue_drop(sch, skb, reason); - if (q->rate_flags & CAKE_FLAG_INGRESS) + if (q->config->rate_flags & CAKE_FLAG_INGRESS) goto retry; } b->tin_ecn_mark += !!flow->cvars.ecn_marked; qdisc_bstats_update(sch, skb); + WRITE_ONCE(q->last_active, now); /* collect delay stats */ delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb))); @@ -2296,6 +2347,9 @@ static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu, b->tin_rate_ns = rate_ns; b->tin_rate_shft = rate_shft; + if (mtu == 0) + return; + byte_target_ns = (byte_target * rate_ns) >> rate_shft; b->cparams.target = max((byte_target_ns * 3) / 2, target_ns); @@ -2312,7 +2366,7 @@ static int cake_config_besteffort(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); struct cake_tin_data *b = &q->tins[0]; u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->rate_bps; + u64 rate = q->config->rate_bps; q->tin_cnt = 1; @@ -2320,7 +2374,7 @@ static int cake_config_besteffort(struct Qdisc *sch) q->tin_order = normal_order; cake_set_rate(b, rate, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); b->tin_quantum = 65535; return 0; @@ -2331,7 +2385,7 @@ static int cake_config_precedence(struct Qdisc *sch) /* convert high-level (user visible) parameters into internal format */ struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->rate_bps; + u64 rate = q->config->rate_bps; u32 quantum = 256; u32 i; @@ -2342,8 +2396,8 @@ static int cake_config_precedence(struct Qdisc *sch) for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[i]; - cake_set_rate(b, rate, mtu, us_to_ns(q->target), - us_to_ns(q->interval)); + cake_set_rate(b, rate, mtu, us_to_ns(q->config->target), + us_to_ns(q->config->interval)); b->tin_quantum = max_t(u16, 1U, quantum); @@ -2420,7 +2474,7 @@ static int cake_config_diffserv8(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->rate_bps; + u64 rate = q->config->rate_bps; u32 quantum = 256; u32 i; @@ -2434,8 +2488,8 @@ static int cake_config_diffserv8(struct Qdisc *sch) for (i = 0; i < q->tin_cnt; i++) { struct cake_tin_data *b = &q->tins[i]; - cake_set_rate(b, rate, mtu, us_to_ns(q->target), - us_to_ns(q->interval)); + cake_set_rate(b, rate, mtu, us_to_ns(q->config->target), + us_to_ns(q->config->interval)); b->tin_quantum = max_t(u16, 
1U, quantum); @@ -2464,7 +2518,7 @@ static int cake_config_diffserv4(struct Qdisc *sch) struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->rate_bps; + u64 rate = q->config->rate_bps; u32 quantum = 1024; q->tin_cnt = 4; @@ -2475,13 +2529,13 @@ static int cake_config_diffserv4(struct Qdisc *sch) /* class characteristics */ cake_set_rate(&q->tins[0], rate, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); cake_set_rate(&q->tins[1], rate >> 4, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); cake_set_rate(&q->tins[2], rate >> 1, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); cake_set_rate(&q->tins[3], rate >> 2, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); /* bandwidth-sharing weights */ q->tins[0].tin_quantum = quantum; @@ -2501,7 +2555,7 @@ static int cake_config_diffserv3(struct Qdisc *sch) */ struct cake_sched_data *q = qdisc_priv(sch); u32 mtu = psched_mtu(qdisc_dev(sch)); - u64 rate = q->rate_bps; + u64 rate = q->config->rate_bps; u32 quantum = 1024; q->tin_cnt = 3; @@ -2512,11 +2566,11 @@ static int cake_config_diffserv3(struct Qdisc *sch) /* class characteristics */ cake_set_rate(&q->tins[0], rate, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); cake_set_rate(&q->tins[1], rate >> 4, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); cake_set_rate(&q->tins[2], rate >> 2, mtu, - us_to_ns(q->target), us_to_ns(q->interval)); + us_to_ns(q->config->target), us_to_ns(q->config->interval)); /* bandwidth-sharing weights */ q->tins[0].tin_quantum = quantum; @@ -2528,7 +2582,8 @@ static int cake_config_diffserv3(struct Qdisc *sch) static void cake_reconfigure(struct Qdisc *sch) { - struct cake_sched_data *q = qdisc_priv(sch); + struct cake_sched_data *qd = qdisc_priv(sch); + struct cake_sched_config *q = qd->config; int c, ft; switch (q->tin_mode) { @@ -2554,39 +2609,38 @@ static void cake_reconfigure(struct Qdisc *sch) break; } - for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) { + for (c = qd->tin_cnt; c < CAKE_MAX_TINS; c++) { cake_clear_tin(sch, c); - q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time; + qd->tins[c].cparams.mtu_time = qd->tins[ft].cparams.mtu_time; } - q->rate_ns = q->tins[ft].tin_rate_ns; - q->rate_shft = q->tins[ft].tin_rate_shft; + qd->rate_ns = qd->tins[ft].tin_rate_ns; + qd->rate_shft = qd->tins[ft].tin_rate_shft; if (q->buffer_config_limit) { - q->buffer_limit = q->buffer_config_limit; + qd->buffer_limit = q->buffer_config_limit; } else if (q->rate_bps) { u64 t = q->rate_bps * q->interval; do_div(t, USEC_PER_SEC / 4); - q->buffer_limit = max_t(u32, t, 4U << 20); + qd->buffer_limit = max_t(u32, t, 4U << 20); } else { - q->buffer_limit = ~0; + qd->buffer_limit = ~0; } sch->flags &= ~TCQ_F_CAN_BYPASS; - q->buffer_limit = min(q->buffer_limit, - max(sch->limit * psched_mtu(qdisc_dev(sch)), - q->buffer_config_limit)); + qd->buffer_limit = min(qd->buffer_limit, + max(sch->limit * psched_mtu(qdisc_dev(sch)), + q->buffer_config_limit)); } -static int cake_change(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) +static int cake_config_change(struct cake_sched_config *q, struct nlattr *opt, + struct 
netlink_ext_ack *extack, bool *overhead_changed) { - struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_CAKE_MAX + 1]; - u16 rate_flags; - u8 flow_mode; + u16 rate_flags = q->rate_flags; + u8 flow_mode = q->flow_mode; int err; err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy, @@ -2594,7 +2648,6 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; - flow_mode = q->flow_mode; if (tb[TCA_CAKE_NAT]) { #if IS_ENABLED(CONFIG_NF_CONNTRACK) flow_mode &= ~CAKE_FLOW_NAT_FLAG; @@ -2607,6 +2660,19 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, #endif } + if (tb[TCA_CAKE_AUTORATE]) { + if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) { + if (q->is_shared) { + NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_AUTORATE], + "Can't use autorate-ingress with cake_mq"); + return -EOPNOTSUPP; + } + rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; + } else { + rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; + } + } + if (tb[TCA_CAKE_BASE_RATE64]) WRITE_ONCE(q->rate_bps, nla_get_u64(tb[TCA_CAKE_BASE_RATE64])); @@ -2615,7 +2681,6 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->tin_mode, nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE])); - rate_flags = q->rate_flags; if (tb[TCA_CAKE_WASH]) { if (!!nla_get_u32(tb[TCA_CAKE_WASH])) rate_flags |= CAKE_FLAG_WASH; @@ -2636,20 +2701,12 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->rate_overhead, nla_get_s32(tb[TCA_CAKE_OVERHEAD])); rate_flags |= CAKE_FLAG_OVERHEAD; - - q->max_netlen = 0; - q->max_adjlen = 0; - q->min_netlen = ~0; - q->min_adjlen = ~0; + *overhead_changed = true; } if (tb[TCA_CAKE_RAW]) { rate_flags &= ~CAKE_FLAG_OVERHEAD; - - q->max_netlen = 0; - q->max_adjlen = 0; - q->min_netlen = ~0; - q->min_adjlen = ~0; + *overhead_changed = true; } if (tb[TCA_CAKE_MPU]) @@ -2668,13 +2725,6 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->target, max(target, 1U)); } - if (tb[TCA_CAKE_AUTORATE]) { - if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE])) - rate_flags |= CAKE_FLAG_AUTORATE_INGRESS; - else - rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS; - } - if (tb[TCA_CAKE_INGRESS]) { if (!!nla_get_u32(tb[TCA_CAKE_INGRESS])) rate_flags |= CAKE_FLAG_INGRESS; @@ -2705,7 +2755,35 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, WRITE_ONCE(q->rate_flags, rate_flags); WRITE_ONCE(q->flow_mode, flow_mode); - if (q->tins) { + + return 0; +} + +static int cake_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *qd = qdisc_priv(sch); + struct cake_sched_config *q = qd->config; + bool overhead_changed = false; + int ret; + + if (q->is_shared) { + NL_SET_ERR_MSG(extack, "can't reconfigure cake_mq sub-qdiscs"); + return -EOPNOTSUPP; + } + + ret = cake_config_change(q, opt, extack, &overhead_changed); + if (ret) + return ret; + + if (overhead_changed) { + qd->max_netlen = 0; + qd->max_adjlen = 0; + qd->min_netlen = ~0; + qd->min_adjlen = ~0; + } + + if (qd->tins) { sch_tree_lock(sch); cake_reconfigure(sch); sch_tree_unlock(sch); @@ -2723,15 +2801,8 @@ static void cake_destroy(struct Qdisc *sch) kvfree(q->tins); } -static int cake_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) +static void cake_config_init(struct cake_sched_config *q, bool is_shared) { - struct cake_sched_data *q = qdisc_priv(sch); - int i, j, err; - - sch->limit = 10240; - sch->flags |= TCQ_F_DEQUEUE_DROPS; - q->tin_mode = CAKE_DIFFSERV_DIFFSERV3; q->flow_mode = CAKE_FLOW_TRIPLE; @@ 
-2742,19 +2813,35 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, * for 5 to 10% of interval */ q->rate_flags |= CAKE_FLAG_SPLIT_GSO; - q->cur_tin = 0; - q->cur_flow = 0; + q->is_shared = is_shared; + q->sync_time = 200 * NSEC_PER_USEC; +} + +static int cake_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_sched_data *qd = qdisc_priv(sch); + struct cake_sched_config *q = &qd->initial_config; + int i, j, err; + + cake_config_init(q, false); + + sch->limit = 10240; + sch->flags |= TCQ_F_DEQUEUE_DROPS; + + qd->cur_tin = 0; + qd->cur_flow = 0; + qd->config = q; - qdisc_watchdog_init(&q->watchdog, sch); + qdisc_watchdog_init(&qd->watchdog, sch); if (opt) { err = cake_change(sch, opt, extack); - if (err) return err; } - err = tcf_block_get(&q->block, &q->filter_list, sch, extack); + err = tcf_block_get(&qd->block, &qd->filter_list, sch, extack); if (err) return err; @@ -2762,13 +2849,13 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, for (i = 1; i <= CAKE_QUEUES; i++) quantum_div[i] = 65535 / i; - q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), - GFP_KERNEL); - if (!q->tins) + qd->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data), + GFP_KERNEL); + if (!qd->tins) return -ENOMEM; for (i = 0; i < CAKE_MAX_TINS; i++) { - struct cake_tin_data *b = q->tins + i; + struct cake_tin_data *b = qd->tins + i; INIT_LIST_HEAD(&b->new_flows); INIT_LIST_HEAD(&b->old_flows); @@ -2784,22 +2871,32 @@ static int cake_init(struct Qdisc *sch, struct nlattr *opt, INIT_LIST_HEAD(&flow->flowchain); cobalt_vars_init(&flow->cvars); - q->overflow_heap[k].t = i; - q->overflow_heap[k].b = j; + qd->overflow_heap[k].t = i; + qd->overflow_heap[k].b = j; b->overflow_idx[j] = k; } } cake_reconfigure(sch); - q->avg_peak_bandwidth = q->rate_bps; - q->min_netlen = ~0; - q->min_adjlen = ~0; + qd->avg_peak_bandwidth = q->rate_bps; + qd->min_netlen = ~0; + qd->min_adjlen = ~0; + qd->active_queues = 0; + qd->last_checked_active = 0; + return 0; } -static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) +static void cake_config_replace(struct Qdisc *sch, struct cake_sched_config *cfg) +{ + struct cake_sched_data *qd = qdisc_priv(sch); + + qd->config = cfg; + cake_reconfigure(sch); +} + +static int cake_config_dump(struct cake_sched_config *q, struct sk_buff *skb) { - struct cake_sched_data *q = qdisc_priv(sch); struct nlattr *opts; u16 rate_flags; u8 flow_mode; @@ -2875,6 +2972,13 @@ nla_put_failure: return -1; } +static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cake_sched_data *qd = qdisc_priv(sch); + + return cake_config_dump(qd->config, skb); +} + static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) { struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP); @@ -2903,6 +3007,7 @@ static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d) PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen); PUT_STAT_U32(MIN_NETLEN, q->min_netlen); PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen); + PUT_STAT_U32(ACTIVE_QUEUES, q->active_queues); #undef PUT_STAT_U32 #undef PUT_STAT_U64 @@ -3136,14 +3241,133 @@ static struct Qdisc_ops cake_qdisc_ops __read_mostly = { }; MODULE_ALIAS_NET_SCH("cake"); +struct cake_mq_sched { + struct mq_sched mq_priv; /* must be first */ + struct cake_sched_config cake_config; +}; + +static void cake_mq_destroy(struct Qdisc *sch) +{ + mq_destroy_common(sch); +} + +static int cake_mq_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_mq_sched 
*priv = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int ret, ntx; + bool _unused; + + cake_config_init(&priv->cake_config, true); + if (opt) { + ret = cake_config_change(&priv->cake_config, opt, extack, &_unused); + if (ret) + return ret; + } + + ret = mq_init_common(sch, opt, extack, &cake_qdisc_ops); + if (ret) + return ret; + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) + cake_config_replace(priv->mq_priv.qdiscs[ntx], &priv->cake_config); + + return 0; +} + +static int cake_mq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cake_mq_sched *priv = qdisc_priv(sch); + + mq_dump_common(sch, skb); + return cake_config_dump(&priv->cake_config, skb); +} + +static int cake_mq_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct cake_mq_sched *priv = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + bool overhead_changed = false; + unsigned int ntx; + int ret; + + ret = cake_config_change(&priv->cake_config, opt, extack, &overhead_changed); + if (ret) + return ret; + + for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { + struct Qdisc *chld = rtnl_dereference(netdev_get_tx_queue(dev, ntx)->qdisc_sleeping); + struct cake_sched_data *qd = qdisc_priv(chld); + + if (overhead_changed) { + qd->max_netlen = 0; + qd->max_adjlen = 0; + qd->min_netlen = ~0; + qd->min_adjlen = ~0; + } + + if (qd->tins) { + sch_tree_lock(chld); + cake_reconfigure(chld); + sch_tree_unlock(chld); + } + } + + return 0; +} + +static int cake_mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, + struct Qdisc **old, struct netlink_ext_ack *extack) +{ + NL_SET_ERR_MSG(extack, "can't replace cake_mq sub-qdiscs"); + return -EOPNOTSUPP; +} + +static const struct Qdisc_class_ops cake_mq_class_ops = { + .select_queue = mq_select_queue, + .graft = cake_mq_graft, + .leaf = mq_leaf, + .find = mq_find, + .walk = mq_walk, + .dump = mq_dump_class, + .dump_stats = mq_dump_class_stats, +}; + +static struct Qdisc_ops cake_mq_qdisc_ops __read_mostly = { + .cl_ops = &cake_mq_class_ops, + .id = "cake_mq", + .priv_size = sizeof(struct cake_mq_sched), + .init = cake_mq_init, + .destroy = cake_mq_destroy, + .attach = mq_attach, + .change = cake_mq_change, + .change_real_num_tx = mq_change_real_num_tx, + .dump = cake_mq_dump, + .owner = THIS_MODULE, +}; +MODULE_ALIAS_NET_SCH("cake_mq"); + static int __init cake_module_init(void) { - return register_qdisc(&cake_qdisc_ops); + int ret; + + ret = register_qdisc(&cake_qdisc_ops); + if (ret) + return ret; + + ret = register_qdisc(&cake_mq_qdisc_ops); + if (ret) + unregister_qdisc(&cake_qdisc_ops); + + return ret; } static void __exit cake_module_exit(void) { unregister_qdisc(&cake_qdisc_ops); + unregister_qdisc(&cake_mq_qdisc_ops); } module_init(cake_module_init) @@ -3151,3 +3375,4 @@ module_exit(cake_module_exit) MODULE_AUTHOR("Jonathan Morton"); MODULE_LICENSE("Dual BSD/GPL"); MODULE_DESCRIPTION("The CAKE shaper."); +MODULE_IMPORT_NS("NET_SCHED_INTERNAL"); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index 6e5f2f4f2415..80235e85f844 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -245,8 +245,6 @@ static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f) static struct kmem_cache *fq_flow_cachep __read_mostly; -/* limit number of collected flows per round */ -#define FQ_GC_MAX 8 #define FQ_GC_AGE (3*HZ) static bool fq_gc_candidate(const struct fq_flow *f) @@ -259,10 +257,9 @@ static void fq_gc(struct fq_sched_data *q, struct rb_root *root, struct sock *sk) { + struct fq_flow *f, 
*tofree = NULL; struct rb_node **p, *parent; - void *tofree[FQ_GC_MAX]; - struct fq_flow *f; - int i, fcnt = 0; + int fcnt; p = &root->rb_node; parent = NULL; @@ -274,9 +271,8 @@ static void fq_gc(struct fq_sched_data *q, break; if (fq_gc_candidate(f)) { - tofree[fcnt++] = f; - if (fcnt == FQ_GC_MAX) - break; + f->next = tofree; + tofree = f; } if (f->sk > sk) @@ -285,18 +281,20 @@ static void fq_gc(struct fq_sched_data *q, p = &parent->rb_left; } - if (!fcnt) + if (!tofree) return; - for (i = fcnt; i > 0; ) { - f = tofree[--i]; + fcnt = 0; + while (tofree) { + f = tofree; + tofree = f->next; rb_erase(&f->fq_node, root); + kmem_cache_free(fq_flow_cachep, f); + fcnt++; } q->flows -= fcnt; q->inactive_flows -= fcnt; q->stat_gc_flows += fcnt; - - kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree); } /* Fast path can be used if : @@ -665,7 +663,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) return NULL; skb = fq_peek(&q->internal); - if (unlikely(skb)) { + if (skb) { q->internal.qlen--; fq_dequeue_skb(sch, &q->internal, skb); goto out; @@ -716,7 +714,7 @@ begin: } prefetch(&skb->end); fq_dequeue_skb(sch, f, skb); - if ((s64)(now - time_next_packet - q->ce_threshold) > 0) { + if (unlikely((s64)(now - time_next_packet - q->ce_threshold) > 0)) { INET_ECN_set_ce(skb); q->stat_ce_mark++; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 852e603c1755..98ffe64de51f 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -955,9 +955,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, __skb_queue_head_init(&sch->gso_skb); __skb_queue_head_init(&sch->skb_bad_txq); gnet_stats_basic_sync_init(&sch->bstats); - lockdep_register_key(&sch->root_lock_key); - spin_lock_init(&sch->q.lock); - lockdep_set_class(&sch->q.lock, &sch->root_lock_key); + qdisc_lock_init(sch, ops); if (ops->static_flags & TCQ_F_CPUSTATS) { sch->cpu_bstats = @@ -987,7 +985,7 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, return sch; errout1: - lockdep_unregister_key(&sch->root_lock_key); + qdisc_lock_uninit(sch, ops); kfree(sch); errout: return ERR_PTR(err); @@ -1076,7 +1074,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc) if (ops->destroy) ops->destroy(qdisc); - lockdep_unregister_key(&qdisc->root_lock_key); + qdisc_lock_uninit(qdisc, ops); bpf_module_put(ops, ops->owner); netdev_put(dev, &qdisc->dev_tracker); diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index c860119a8f09..bb94cd577943 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -15,11 +15,7 @@ #include <net/netlink.h> #include <net/pkt_cls.h> #include <net/pkt_sched.h> -#include <net/sch_generic.h> - -struct mq_sched { - struct Qdisc **qdiscs; -}; +#include <net/sch_priv.h> static int mq_offload(struct Qdisc *sch, enum tc_mq_command cmd) { @@ -49,23 +45,29 @@ static int mq_offload_stats(struct Qdisc *sch) return qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_MQ, &opt); } -static void mq_destroy(struct Qdisc *sch) +void mq_destroy_common(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); unsigned int ntx; - mq_offload(sch, TC_MQ_DESTROY); - if (!priv->qdiscs) return; for (ntx = 0; ntx < dev->num_tx_queues && priv->qdiscs[ntx]; ntx++) qdisc_put(priv->qdiscs[ntx]); kfree(priv->qdiscs); } +EXPORT_SYMBOL_NS_GPL(mq_destroy_common, "NET_SCHED_INTERNAL"); -static int mq_init(struct Qdisc *sch, struct nlattr *opt, - struct netlink_ext_ack *extack) +static void mq_destroy(struct Qdisc *sch) +{ + mq_offload(sch, TC_MQ_DESTROY); + 
mq_destroy_common(sch); +} + +int mq_init_common(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack, + const struct Qdisc_ops *qdisc_ops) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); @@ -87,7 +89,8 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt, for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { dev_queue = netdev_get_tx_queue(dev, ntx); - qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx), + qdisc = qdisc_create_dflt(dev_queue, + qdisc_ops ?: get_default_qdisc_ops(dev, ntx), TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1)), extack); @@ -98,12 +101,24 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt, } sch->flags |= TCQ_F_MQROOT; + return 0; +} +EXPORT_SYMBOL_NS_GPL(mq_init_common, "NET_SCHED_INTERNAL"); + +static int mq_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + int ret; + + ret = mq_init_common(sch, opt, extack, NULL); + if (ret) + return ret; mq_offload(sch, TC_MQ_CREATE); return 0; } -static void mq_attach(struct Qdisc *sch) +void mq_attach(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mq_sched *priv = qdisc_priv(sch); @@ -124,8 +139,9 @@ static void mq_attach(struct Qdisc *sch) kfree(priv->qdiscs); priv->qdiscs = NULL; } +EXPORT_SYMBOL_NS_GPL(mq_attach, "NET_SCHED_INTERNAL"); -static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) +void mq_dump_common(struct Qdisc *sch, struct sk_buff *skb) { struct net_device *dev = qdisc_dev(sch); struct Qdisc *qdisc; @@ -152,7 +168,12 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) spin_unlock_bh(qdisc_lock(qdisc)); } +} +EXPORT_SYMBOL_NS_GPL(mq_dump_common, "NET_SCHED_INTERNAL"); +static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + mq_dump_common(sch, skb); return mq_offload_stats(sch); } @@ -166,11 +187,12 @@ static struct netdev_queue *mq_queue_get(struct Qdisc *sch, unsigned long cl) return netdev_get_tx_queue(dev, ntx); } -static struct netdev_queue *mq_select_queue(struct Qdisc *sch, - struct tcmsg *tcm) +struct netdev_queue *mq_select_queue(struct Qdisc *sch, + struct tcmsg *tcm) { return mq_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); } +EXPORT_SYMBOL_NS_GPL(mq_select_queue, "NET_SCHED_INTERNAL"); static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, struct Qdisc **old, struct netlink_ext_ack *extack) @@ -198,14 +220,15 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new, return 0; } -static struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl) +struct Qdisc *mq_leaf(struct Qdisc *sch, unsigned long cl) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); return rtnl_dereference(dev_queue->qdisc_sleeping); } +EXPORT_SYMBOL_NS_GPL(mq_leaf, "NET_SCHED_INTERNAL"); -static unsigned long mq_find(struct Qdisc *sch, u32 classid) +unsigned long mq_find(struct Qdisc *sch, u32 classid) { unsigned int ntx = TC_H_MIN(classid); @@ -213,9 +236,10 @@ static unsigned long mq_find(struct Qdisc *sch, u32 classid) return 0; return ntx; } +EXPORT_SYMBOL_NS_GPL(mq_find, "NET_SCHED_INTERNAL"); -static int mq_dump_class(struct Qdisc *sch, unsigned long cl, - struct sk_buff *skb, struct tcmsg *tcm) +int mq_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); @@ -224,9 +248,10 @@ static int mq_dump_class(struct Qdisc *sch, unsigned long cl, tcm->tcm_info = rtnl_dereference(dev_queue->qdisc_sleeping)->handle; return 0; } 
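The cake_mq pieces above (a cake_sched_config shared by every per-txq cake instance, the active-queue scan in cake_dequeue(), and mq_init_common() taking an explicit child qdisc_ops) come down to one rule: divide the configured rate evenly among the queues that were recently busy, re-evaluated every sync_time. The core arithmetic reduces to a sketch like this, with illustrative numbers; the kernel uses div64_u64 and ktime rather than plain division:

#include <stdint.h>
#include <stdio.h>

/* share the configured rate among the queues seen active in the last window */
static uint64_t per_queue_rate(uint64_t rate_bps, unsigned int active_queues)
{
	if (active_queues < 1)
		active_queues = 1;
	return rate_bps / active_queues;
}

int main(void)
{
	uint64_t rate = 1000000000ull;	/* 1 Gbit/s configured on cake_mq */

	printf("1 active queue:  %llu bps\n",
	       (unsigned long long)per_queue_rate(rate, 1));
	printf("4 active queues: %llu bps\n",
	       (unsigned long long)per_queue_rate(rate, 4));
	return 0;
}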
+EXPORT_SYMBOL_NS_GPL(mq_dump_class, "NET_SCHED_INTERNAL"); -static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, - struct gnet_dump *d) +int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct gnet_dump *d) { struct netdev_queue *dev_queue = mq_queue_get(sch, cl); @@ -236,8 +261,9 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, return -1; return 0; } +EXPORT_SYMBOL_NS_GPL(mq_dump_class_stats, "NET_SCHED_INTERNAL"); -static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg) { struct net_device *dev = qdisc_dev(sch); unsigned int ntx; @@ -251,6 +277,7 @@ static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg) break; } } +EXPORT_SYMBOL_NS_GPL(mq_walk, "NET_SCHED_INTERNAL"); static const struct Qdisc_class_ops mq_class_ops = { .select_queue = mq_select_queue, diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index f97f77b041d9..d8201eb3ac5f 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -3357,11 +3357,10 @@ int smc_create_clcsk(struct net *net, struct sock *sk, int family) return 0; } -static int __smc_create(struct net *net, struct socket *sock, int protocol, - int kern, struct socket *clcsock) +static int smc_create(struct net *net, struct socket *sock, int protocol, + int kern) { int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET; - struct smc_sock *smc; struct sock *sk; int rc; @@ -3380,15 +3379,7 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, if (!sk) goto out; - /* create internal TCP socket for CLC handshake and fallback */ - smc = smc_sk(sk); - - rc = 0; - if (clcsock) - smc->clcsock = clcsock; - else - rc = smc_create_clcsk(net, sk, family); - + rc = smc_create_clcsk(net, sk, family); if (rc) { sk_common_release(sk); sock->sk = NULL; @@ -3397,76 +3388,12 @@ out: return rc; } -static int smc_create(struct net *net, struct socket *sock, int protocol, - int kern) -{ - return __smc_create(net, sock, protocol, kern, NULL); -} - static const struct net_proto_family smc_sock_family_ops = { .family = PF_SMC, .owner = THIS_MODULE, .create = smc_create, }; -static int smc_ulp_init(struct sock *sk) -{ - struct socket *tcp = sk->sk_socket; - struct net *net = sock_net(sk); - struct socket *smcsock; - int protocol, ret; - - /* only TCP can be replaced */ - if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP || - (sk->sk_family != AF_INET && sk->sk_family != AF_INET6)) - return -ESOCKTNOSUPPORT; - /* don't handle wq now */ - if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list) - return -ENOTCONN; - - if (sk->sk_family == AF_INET) - protocol = SMCPROTO_SMC; - else - protocol = SMCPROTO_SMC6; - - smcsock = sock_alloc(); - if (!smcsock) - return -ENFILE; - - smcsock->type = SOCK_STREAM; - __module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */ - ret = __smc_create(net, smcsock, protocol, 1, tcp); - if (ret) { - sock_release(smcsock); /* module_put() which ops won't be NULL */ - return ret; - } - - /* replace tcp socket to smc */ - smcsock->file = tcp->file; - smcsock->file->private_data = smcsock; - smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode when sock_close */ - smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */ - tcp->file = NULL; - - return ret; -} - -static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk, - const gfp_t priority) -{ - struct inet_connection_sock *icsk = inet_csk(newsk); - - /* don't inherit ulp 
ops to child when listen */ - icsk->icsk_ulp_ops = NULL; -} - -static struct tcp_ulp_ops smc_ulp_ops __read_mostly = { - .name = "smc", - .owner = THIS_MODULE, - .init = smc_ulp_init, - .clone = smc_ulp_clone, -}; - unsigned int smc_net_id; static __net_init int smc_net_init(struct net *net) @@ -3589,16 +3516,10 @@ static int __init smc_init(void) pr_err("%s: ib_register fails with %d\n", __func__, rc); goto out_sock; } - - rc = tcp_register_ulp(&smc_ulp_ops); - if (rc) { - pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_ib; - } rc = smc_inet_init(); if (rc) { pr_err("%s: smc_inet_init fails with %d\n", __func__, rc); - goto out_ulp; + goto out_ib; } rc = bpf_smc_hs_ctrl_init(); if (rc) { @@ -3610,8 +3531,6 @@ static int __init smc_init(void) return 0; out_inet: smc_inet_exit(); -out_ulp: - tcp_unregister_ulp(&smc_ulp_ops); out_ib: smc_ib_unregister_client(); out_sock: @@ -3647,7 +3566,6 @@ static void __exit smc_exit(void) { static_branch_disable(&tcp_have_smc); smc_inet_exit(); - tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); smc_ib_unregister_client(); @@ -3672,7 +3590,6 @@ MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>"); MODULE_DESCRIPTION("smc socket address family"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_SMC); -MODULE_ALIAS_TCP_ULP("smc"); /* 256 for IPPROTO_SMC and 1 for SOCK_STREAM */ MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_INET, 256, 1); #if IS_ENABLED(CONFIG_IPV6) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 970db62bd029..a3f9ca28c3d5 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -460,7 +460,7 @@ static void tipc_aead_users_dec(struct tipc_aead __rcu *aead, int lim) rcu_read_lock(); tmp = rcu_dereference(aead); if (tmp) - atomic_add_unless(&rcu_dereference(aead)->users, -1, lim); + atomic_add_unless(&tmp->users, -1, lim); rcu_read_unlock(); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d0511225799b..f6d56e70c7a2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1650,10 +1650,9 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr_unsized *uad timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); - /* First of all allocate resources. - * If we will make it after state is locked, - * we will have to recheck all again in any case. - */ + err = prepare_peercred(&peercred); + if (err) + goto out; /* create new sock for complete connection */ newsk = unix_create1(net, NULL, 0, sock->type); @@ -1662,10 +1661,6 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr_unsized *uad goto out; } - err = prepare_peercred(&peercred); - if (err) - goto out; - /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (!skb) { diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index a3505a4dcee0..20ad2b2dc17b 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -83,6 +83,50 @@ * TCP_ESTABLISHED - connected * TCP_CLOSING - disconnecting * TCP_LISTEN - listening + * + * - Namespaces in vsock support two different modes: "local" and "global". + * Each mode defines how the namespace interacts with CIDs. + * Each namespace exposes two sysctl files: + * + * - /proc/sys/net/vsock/ns_mode (read-only) reports the current namespace's + * mode, which is set at namespace creation and immutable thereafter. + * - /proc/sys/net/vsock/child_ns_mode (writable) controls what mode future + * child namespaces will inherit when created. The default is "global". 
+ * + * Changing child_ns_mode only affects newly created namespaces, not the + * current namespace or existing children. At namespace creation, ns_mode + * is inherited from the parent's child_ns_mode. + * + * The init_net mode is "global" and cannot be modified. + * + * The modes affect the allocation and accessibility of CIDs as follows: + * + * - global - access and allocation are all system-wide + * - all CID allocation from global namespaces draw from the same + * system-wide pool. + * - if one global namespace has already allocated some CID, another + * global namespace will not be able to allocate the same CID. + * - global mode AF_VSOCK sockets can reach any VM or socket in any global + * namespace, they are not contained to only their own namespace. + * - AF_VSOCK sockets in a global mode namespace cannot reach VMs or + * sockets in any local mode namespace. + * - local - access and allocation are contained within the namespace + * - CID allocation draws only from a private pool local only to the + * namespace, and does not affect the CIDs available for allocation in any + * other namespace (global or local). + * - VMs in a local namespace do not collide with CIDs in any other local + * namespace or any global namespace. For example, if a VM in a local mode + * namespace is given CID 10, then CID 10 is still available for + * allocation in any other namespace, but not in the same namespace. + * - AF_VSOCK sockets in a local mode namespace can connect only to VMs or + * other sockets within their own namespace. + * - sockets bound to VMADDR_CID_ANY in local namespaces will never resolve + * to any transport that is not compatible with local mode. There is no + * error that propagates to the user (as there is for connection attempts) + * because it is possible for some packet to reach this socket from + * a different transport that *does* support local mode. For + * example, virtio-vsock may not support local mode, but the socket + * may still accept a connection from vhost-vsock which does. */ #include <linux/compat.h> @@ -100,20 +144,31 @@ #include <linux/module.h> #include <linux/mutex.h> #include <linux/net.h> +#include <linux/proc_fs.h> #include <linux/poll.h> #include <linux/random.h> #include <linux/skbuff.h> #include <linux/smp.h> #include <linux/socket.h> #include <linux/stddef.h> +#include <linux/sysctl.h> #include <linux/unistd.h> #include <linux/wait.h> #include <linux/workqueue.h> #include <net/sock.h> #include <net/af_vsock.h> +#include <net/netns/vsock.h> #include <uapi/linux/vm_sockets.h> #include <uapi/asm-generic/ioctls.h> +#define VSOCK_NET_MODE_STR_GLOBAL "global" +#define VSOCK_NET_MODE_STR_LOCAL "local" + +/* 6 chars for "global", 1 for null-terminator, and 1 more for '\n'. + * The newline is added by proc_dostring() for read operations. 
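A short user-space illustration of the two sysctls documented above (run as root; the paths are exactly those from the comment, everything else here is illustrative): read the namespace's fixed ns_mode, switch child_ns_mode to "local", then create a child namespace and observe that it inherited local mode.

/* gcc -o vsock_ns_mode vsock_ns_mode.c ; unshare() needs CAP_SYS_ADMIN */
#define _GNU_SOURCE
#include <fcntl.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void show(const char *path)
{
	char buf[16] = {0};
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return;
	if (read(fd, buf, sizeof(buf) - 1) > 0)
		printf("%s: %s", path, buf);	/* value already ends in '\n' */
	close(fd);
}

int main(void)
{
	int fd;

	show("/proc/sys/net/vsock/ns_mode");	/* "global" in init_net */

	/* Make namespaces created from here on start out in local mode. */
	fd = open("/proc/sys/net/vsock/child_ns_mode", O_WRONLY);
	if (fd >= 0) {
		write(fd, "local", strlen("local"));
		close(fd);
	}

	/* The new namespace takes its ns_mode from the parent's
	 * child_ns_mode at creation time and cannot change it later.
	 */
	if (unshare(CLONE_NEWNET) == 0)
		show("/proc/sys/net/vsock/ns_mode");	/* now "local" */

	return 0;
}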
+ */ +#define VSOCK_NET_MODE_STR_MAX 8 + static int __vsock_bind(struct sock *sk, struct sockaddr_vm *addr); static void vsock_sk_destruct(struct sock *sk); static int vsock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); @@ -235,33 +290,42 @@ static void __vsock_remove_connected(struct vsock_sock *vsk) sock_put(&vsk->sk); } -static struct sock *__vsock_find_bound_socket(struct sockaddr_vm *addr) +static struct sock *__vsock_find_bound_socket_net(struct sockaddr_vm *addr, + struct net *net) { struct vsock_sock *vsk; list_for_each_entry(vsk, vsock_bound_sockets(addr), bound_table) { - if (vsock_addr_equals_addr(addr, &vsk->local_addr)) - return sk_vsock(vsk); + struct sock *sk = sk_vsock(vsk); + + if (vsock_addr_equals_addr(addr, &vsk->local_addr) && + vsock_net_check_mode(sock_net(sk), net)) + return sk; if (addr->svm_port == vsk->local_addr.svm_port && (vsk->local_addr.svm_cid == VMADDR_CID_ANY || - addr->svm_cid == VMADDR_CID_ANY)) - return sk_vsock(vsk); + addr->svm_cid == VMADDR_CID_ANY) && + vsock_net_check_mode(sock_net(sk), net)) + return sk; } return NULL; } -static struct sock *__vsock_find_connected_socket(struct sockaddr_vm *src, - struct sockaddr_vm *dst) +static struct sock * +__vsock_find_connected_socket_net(struct sockaddr_vm *src, + struct sockaddr_vm *dst, struct net *net) { struct vsock_sock *vsk; list_for_each_entry(vsk, vsock_connected_sockets(src, dst), connected_table) { + struct sock *sk = sk_vsock(vsk); + if (vsock_addr_equals_addr(src, &vsk->remote_addr) && - dst->svm_port == vsk->local_addr.svm_port) { - return sk_vsock(vsk); + dst->svm_port == vsk->local_addr.svm_port && + vsock_net_check_mode(sock_net(sk), net)) { + return sk; } } @@ -304,12 +368,18 @@ void vsock_remove_connected(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(vsock_remove_connected); -struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) +/* Find a bound socket, filtering by namespace and namespace mode. + * + * Use this in transports that are namespace-aware and can provide the + * network namespace context. + */ +struct sock *vsock_find_bound_socket_net(struct sockaddr_vm *addr, + struct net *net) { struct sock *sk; spin_lock_bh(&vsock_table_lock); - sk = __vsock_find_bound_socket(addr); + sk = __vsock_find_bound_socket_net(addr, net); if (sk) sock_hold(sk); @@ -317,15 +387,32 @@ struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) return sk; } +EXPORT_SYMBOL_GPL(vsock_find_bound_socket_net); + +/* Find a bound socket without namespace filtering. + * + * Use this in transports that lack namespace context. All sockets are + * treated as if in global mode. + */ +struct sock *vsock_find_bound_socket(struct sockaddr_vm *addr) +{ + return vsock_find_bound_socket_net(addr, NULL); +} EXPORT_SYMBOL_GPL(vsock_find_bound_socket); -struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, - struct sockaddr_vm *dst) +/* Find a connected socket, filtering by namespace and namespace mode. + * + * Use this in transports that are namespace-aware and can provide the + * network namespace context. 
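The lookup helpers above additionally filter on vsock_net_check_mode(sock_net(sk), net), whose body is outside this hunk. The following is only a plausible reading of the documented rules (local mode never crosses a namespace, global sockets see each other system-wide, and the NULL passed by the namespace-unaware wrappers degrades to global-only matching); the _sketch name is not part of the patch.

static bool vsock_net_check_mode_sketch(struct net *sk_net, struct net *pkt_net)
{
	/* Namespace-unaware transports pass a NULL net: only sockets living
	 * in a global-mode namespace may match them.
	 */
	if (!pkt_net)
		return vsock_net_mode(sk_net) == VSOCK_NET_MODE_GLOBAL;

	/* Local mode is confined to its own namespace, in both directions. */
	if (vsock_net_mode(sk_net) == VSOCK_NET_MODE_LOCAL ||
	    vsock_net_mode(pkt_net) == VSOCK_NET_MODE_LOCAL)
		return net_eq(sk_net, pkt_net);

	/* Both ends are global: reachable from any global namespace. */
	return true;
}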
+ */ +struct sock *vsock_find_connected_socket_net(struct sockaddr_vm *src, + struct sockaddr_vm *dst, + struct net *net) { struct sock *sk; spin_lock_bh(&vsock_table_lock); - sk = __vsock_find_connected_socket(src, dst); + sk = __vsock_find_connected_socket_net(src, dst, net); if (sk) sock_hold(sk); @@ -333,6 +420,18 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, return sk; } +EXPORT_SYMBOL_GPL(vsock_find_connected_socket_net); + +/* Find a connected socket without namespace filtering. + * + * Use this in transports that lack namespace context. All sockets are + * treated as if in global mode. + */ +struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, + struct sockaddr_vm *dst) +{ + return vsock_find_connected_socket_net(src, dst, NULL); +} EXPORT_SYMBOL_GPL(vsock_find_connected_socket); void vsock_remove_sock(struct vsock_sock *vsk) @@ -528,7 +627,7 @@ int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk) if (sk->sk_type == SOCK_SEQPACKET) { if (!new_transport->seqpacket_allow || - !new_transport->seqpacket_allow(remote_cid)) { + !new_transport->seqpacket_allow(vsk, remote_cid)) { module_put(new_transport->module); return -ESOCKTNOSUPPORT; } @@ -676,11 +775,11 @@ out: static int __vsock_bind_connectible(struct vsock_sock *vsk, struct sockaddr_vm *addr) { - static u32 port; + struct net *net = sock_net(sk_vsock(vsk)); struct sockaddr_vm new_addr; - if (!port) - port = get_random_u32_above(LAST_RESERVED_PORT); + if (!net->vsock.port) + net->vsock.port = get_random_u32_above(LAST_RESERVED_PORT); vsock_addr_init(&new_addr, addr->svm_cid, addr->svm_port); @@ -689,13 +788,13 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk, unsigned int i; for (i = 0; i < MAX_PORT_RETRIES; i++) { - if (port == VMADDR_PORT_ANY || - port <= LAST_RESERVED_PORT) - port = LAST_RESERVED_PORT + 1; + if (net->vsock.port == VMADDR_PORT_ANY || + net->vsock.port <= LAST_RESERVED_PORT) + net->vsock.port = LAST_RESERVED_PORT + 1; - new_addr.svm_port = port++; + new_addr.svm_port = net->vsock.port++; - if (!__vsock_find_bound_socket(&new_addr)) { + if (!__vsock_find_bound_socket_net(&new_addr, net)) { found = true; break; } @@ -712,7 +811,7 @@ static int __vsock_bind_connectible(struct vsock_sock *vsk, return -EACCES; } - if (__vsock_find_bound_socket(&new_addr)) + if (__vsock_find_bound_socket_net(&new_addr, net)) return -EADDRINUSE; } @@ -1314,7 +1413,7 @@ static int vsock_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if (!transport->dgram_allow(remote_addr->svm_cid, + if (!transport->dgram_allow(vsk, remote_addr->svm_cid, remote_addr->svm_port)) { err = -EINVAL; goto out; @@ -1355,7 +1454,7 @@ static int vsock_dgram_connect(struct socket *sock, if (err) goto out; - if (!vsk->transport->dgram_allow(remote_addr->svm_cid, + if (!vsk->transport->dgram_allow(vsk, remote_addr->svm_cid, remote_addr->svm_port)) { err = -EINVAL; goto out; @@ -1585,7 +1684,7 @@ static int vsock_connect(struct socket *sock, struct sockaddr_unsized *addr, * endpoints. 
*/ if (!transport || - !transport->stream_allow(remote_addr->svm_cid, + !transport->stream_allow(vsk, remote_addr->svm_cid, remote_addr->svm_port)) { err = -ENETUNREACH; goto out; @@ -2662,6 +2761,180 @@ static struct miscdevice vsock_device = { .fops = &vsock_device_ops, }; +static int __vsock_net_mode_string(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos, + enum vsock_net_mode mode, + enum vsock_net_mode *new_mode) +{ + char data[VSOCK_NET_MODE_STR_MAX] = {0}; + struct ctl_table tmp; + int ret; + + if (!table->data || !table->maxlen || !*lenp) { + *lenp = 0; + return 0; + } + + tmp = *table; + tmp.data = data; + + if (!write) { + const char *p; + + switch (mode) { + case VSOCK_NET_MODE_GLOBAL: + p = VSOCK_NET_MODE_STR_GLOBAL; + break; + case VSOCK_NET_MODE_LOCAL: + p = VSOCK_NET_MODE_STR_LOCAL; + break; + default: + WARN_ONCE(true, "netns has invalid vsock mode"); + *lenp = 0; + return 0; + } + + strscpy(data, p, sizeof(data)); + tmp.maxlen = strlen(p); + } + + ret = proc_dostring(&tmp, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + + if (*lenp >= sizeof(data)) + return -EINVAL; + + if (!strncmp(data, VSOCK_NET_MODE_STR_GLOBAL, sizeof(data))) + *new_mode = VSOCK_NET_MODE_GLOBAL; + else if (!strncmp(data, VSOCK_NET_MODE_STR_LOCAL, sizeof(data))) + *new_mode = VSOCK_NET_MODE_LOCAL; + else + return -EINVAL; + + return 0; +} + +static int vsock_net_mode_string(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net; + + if (write) + return -EPERM; + + net = current->nsproxy->net_ns; + + return __vsock_net_mode_string(table, write, buffer, lenp, ppos, + vsock_net_mode(net), NULL); +} + +static int vsock_net_child_mode_string(const struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + enum vsock_net_mode new_mode; + struct net *net; + int ret; + + net = current->nsproxy->net_ns; + + ret = __vsock_net_mode_string(table, write, buffer, lenp, ppos, + vsock_net_child_mode(net), &new_mode); + if (ret) + return ret; + + if (write) + vsock_net_set_child_mode(net, new_mode); + + return 0; +} + +static struct ctl_table vsock_table[] = { + { + .procname = "ns_mode", + .data = &init_net.vsock.mode, + .maxlen = VSOCK_NET_MODE_STR_MAX, + .mode = 0444, + .proc_handler = vsock_net_mode_string + }, + { + .procname = "child_ns_mode", + .data = &init_net.vsock.child_ns_mode, + .maxlen = VSOCK_NET_MODE_STR_MAX, + .mode = 0644, + .proc_handler = vsock_net_child_mode_string + }, +}; + +static int __net_init vsock_sysctl_register(struct net *net) +{ + struct ctl_table *table; + + if (net_eq(net, &init_net)) { + table = vsock_table; + } else { + table = kmemdup(vsock_table, sizeof(vsock_table), GFP_KERNEL); + if (!table) + goto err_alloc; + + table[0].data = &net->vsock.mode; + table[1].data = &net->vsock.child_ns_mode; + } + + net->vsock.sysctl_hdr = register_net_sysctl_sz(net, "net/vsock", table, + ARRAY_SIZE(vsock_table)); + if (!net->vsock.sysctl_hdr) + goto err_reg; + + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void vsock_sysctl_unregister(struct net *net) +{ + const struct ctl_table *table; + + table = net->vsock.sysctl_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->vsock.sysctl_hdr); + if (!net_eq(net, &init_net)) + kfree(table); +} + +static void vsock_net_init(struct net *net) +{ + if (net_eq(net, &init_net)) + net->vsock.mode = VSOCK_NET_MODE_GLOBAL; + else + net->vsock.mode = 
vsock_net_child_mode(current->nsproxy->net_ns); + + net->vsock.child_ns_mode = VSOCK_NET_MODE_GLOBAL; +} + +static __net_init int vsock_sysctl_init_net(struct net *net) +{ + vsock_net_init(net); + + if (vsock_sysctl_register(net)) + return -ENOMEM; + + return 0; +} + +static __net_exit void vsock_sysctl_exit_net(struct net *net) +{ + vsock_sysctl_unregister(net); +} + +static struct pernet_operations vsock_sysctl_ops = { + .init = vsock_sysctl_init_net, + .exit = vsock_sysctl_exit_net, +}; + static int __init vsock_init(void) { int err = 0; @@ -2689,10 +2962,17 @@ static int __init vsock_init(void) goto err_unregister_proto; } + if (register_pernet_subsys(&vsock_sysctl_ops)) { + err = -ENOMEM; + goto err_unregister_sock; + } + vsock_bpf_build_proto(); return 0; +err_unregister_sock: + sock_unregister(AF_VSOCK); err_unregister_proto: proto_unregister(&vsock_proto); err_deregister_misc: @@ -2706,6 +2986,7 @@ static void __exit vsock_exit(void) misc_deregister(&vsock_device); sock_unregister(AF_VSOCK); proto_unregister(&vsock_proto); + unregister_pernet_subsys(&vsock_sysctl_ops); } const struct vsock_transport *vsock_core_get_transport(struct vsock_sock *vsk) diff --git a/net/vmw_vsock/hyperv_transport.c b/net/vmw_vsock/hyperv_transport.c index 432fcbbd14d4..c3010c874308 100644 --- a/net/vmw_vsock/hyperv_transport.c +++ b/net/vmw_vsock/hyperv_transport.c @@ -570,7 +570,7 @@ static int hvs_dgram_enqueue(struct vsock_sock *vsk, return -EOPNOTSUPP; } -static bool hvs_dgram_allow(u32 cid, u32 port) +static bool hvs_dgram_allow(struct vsock_sock *vsk, u32 cid, u32 port) { return false; } @@ -745,8 +745,11 @@ static bool hvs_stream_is_active(struct vsock_sock *vsk) return hvs->chan != NULL; } -static bool hvs_stream_allow(u32 cid, u32 port) +static bool hvs_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port) { + if (!vsock_net_mode_global(vsk)) + return false; + if (cid == VMADDR_CID_HOST) return true; diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 8c867023a2e5..3f7ea2db9bd7 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -231,7 +231,7 @@ static int virtio_transport_send_skb_fast_path(struct virtio_vsock *vsock, struc } static int -virtio_transport_send_pkt(struct sk_buff *skb) +virtio_transport_send_pkt(struct sk_buff *skb, struct net *net) { struct virtio_vsock_hdr *hdr; struct virtio_vsock *vsock; @@ -536,7 +536,13 @@ static bool virtio_transport_msgzerocopy_allow(void) return true; } -static bool virtio_transport_seqpacket_allow(u32 remote_cid); +bool virtio_transport_stream_allow(struct vsock_sock *vsk, u32 cid, u32 port) +{ + return vsock_net_mode_global(vsk); +} + +static bool virtio_transport_seqpacket_allow(struct vsock_sock *vsk, + u32 remote_cid); static struct virtio_transport virtio_transport = { .transport = { @@ -593,11 +599,15 @@ static struct virtio_transport virtio_transport = { .can_msgzerocopy = virtio_transport_can_msgzerocopy, }; -static bool virtio_transport_seqpacket_allow(u32 remote_cid) +static bool +virtio_transport_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid) { struct virtio_vsock *vsock; bool seqpacket_allow; + if (!vsock_net_mode_global(vsk)) + return false; + seqpacket_allow = false; rcu_read_lock(); vsock = rcu_dereference(the_virtio_vsock); @@ -660,7 +670,11 @@ static void virtio_transport_rx_work(struct work_struct *work) virtio_vsock_skb_put(skb, payload_len); virtio_transport_deliver_tap_pkt(skb); - virtio_transport_recv_pkt(&virtio_transport, skb); + + /* Force 
virtio-transport into global mode since it + * does not yet support local-mode namespacing. + */ + virtio_transport_recv_pkt(&virtio_transport, skb, NULL); } } while (!virtqueue_enable_cb(vq)); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index d3e26025ef58..d017ab318a7e 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -414,7 +414,7 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, virtio_transport_inc_tx_pkt(vvs, skb); - ret = t_ops->send_pkt(skb); + ret = t_ops->send_pkt(skb, info->net); if (ret < 0) break; @@ -526,6 +526,7 @@ static int virtio_transport_send_credit_update(struct vsock_sock *vsk) struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_CREDIT_UPDATE, .vsk = vsk, + .net = sock_net(sk_vsock(vsk)), }; return virtio_transport_send_pkt_info(vsk, &info); @@ -1055,12 +1056,6 @@ bool virtio_transport_stream_is_active(struct vsock_sock *vsk) } EXPORT_SYMBOL_GPL(virtio_transport_stream_is_active); -bool virtio_transport_stream_allow(u32 cid, u32 port) -{ - return true; -} -EXPORT_SYMBOL_GPL(virtio_transport_stream_allow); - int virtio_transport_dgram_bind(struct vsock_sock *vsk, struct sockaddr_vm *addr) { @@ -1068,7 +1063,7 @@ int virtio_transport_dgram_bind(struct vsock_sock *vsk, } EXPORT_SYMBOL_GPL(virtio_transport_dgram_bind); -bool virtio_transport_dgram_allow(u32 cid, u32 port) +bool virtio_transport_dgram_allow(struct vsock_sock *vsk, u32 cid, u32 port) { return false; } @@ -1079,6 +1074,7 @@ int virtio_transport_connect(struct vsock_sock *vsk) struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_REQUEST, .vsk = vsk, + .net = sock_net(sk_vsock(vsk)), }; return virtio_transport_send_pkt_info(vsk, &info); @@ -1094,6 +1090,7 @@ int virtio_transport_shutdown(struct vsock_sock *vsk, int mode) (mode & SEND_SHUTDOWN ? VIRTIO_VSOCK_SHUTDOWN_SEND : 0), .vsk = vsk, + .net = sock_net(sk_vsock(vsk)), }; return virtio_transport_send_pkt_info(vsk, &info); @@ -1120,6 +1117,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk, .msg = msg, .pkt_len = len, .vsk = vsk, + .net = sock_net(sk_vsock(vsk)), }; return virtio_transport_send_pkt_info(vsk, &info); @@ -1157,6 +1155,7 @@ static int virtio_transport_reset(struct vsock_sock *vsk, .op = VIRTIO_VSOCK_OP_RST, .reply = !!skb, .vsk = vsk, + .net = sock_net(sk_vsock(vsk)), }; /* Send RST only if the original pkt is not a RST pkt */ @@ -1168,15 +1167,31 @@ static int virtio_transport_reset(struct vsock_sock *vsk, /* Normally packets are associated with a socket. There may be no socket if an * attempt was made to connect to a socket that does not exist. + * + * net refers to the namespace of whoever sent the invalid message. For + * loopback, this is the namespace of the socket. For vhost, this is the + * namespace of the VM (i.e., vhost_vsock). */ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, - struct sk_buff *skb) + struct sk_buff *skb, struct net *net) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct virtio_vsock_pkt_info info = { .op = VIRTIO_VSOCK_OP_RST, .type = le16_to_cpu(hdr->type), .reply = true, + + /* Set sk owner to socket we are replying to (may be NULL for + * non-loopback). This keeps a reference to the sock and + * sock_net(sk) until the reply skb is freed. + */ + .vsk = vsock_sk(skb->sk), + + /* net is not defined here because we pass it directly to + * t->send_pkt(), instead of relying on + * virtio_transport_send_pkt_info() to pass it. 
It is not needed + * by virtio_transport_alloc_skb(). + */ }; struct sk_buff *reply; @@ -1195,7 +1210,7 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, if (!reply) return -ENOMEM; - return t->send_pkt(reply); + return t->send_pkt(reply, net); } /* This function should be called with sk_lock held and SOCK_DONE set */ @@ -1479,6 +1494,7 @@ virtio_transport_send_response(struct vsock_sock *vsk, .remote_port = le32_to_cpu(hdr->src_port), .reply = true, .vsk = vsk, + .net = sock_net(sk_vsock(vsk)), }; return virtio_transport_send_pkt_info(vsk, &info); @@ -1521,12 +1537,12 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, int ret; if (le16_to_cpu(hdr->op) != VIRTIO_VSOCK_OP_REQUEST) { - virtio_transport_reset_no_sock(t, skb); + virtio_transport_reset_no_sock(t, skb, sock_net(sk)); return -EINVAL; } if (sk_acceptq_is_full(sk)) { - virtio_transport_reset_no_sock(t, skb); + virtio_transport_reset_no_sock(t, skb, sock_net(sk)); return -ENOMEM; } @@ -1534,13 +1550,13 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, * Subsequent enqueues would lead to a memory leak. */ if (sk->sk_shutdown == SHUTDOWN_MASK) { - virtio_transport_reset_no_sock(t, skb); + virtio_transport_reset_no_sock(t, skb, sock_net(sk)); return -ESHUTDOWN; } child = vsock_create_connected(sk); if (!child) { - virtio_transport_reset_no_sock(t, skb); + virtio_transport_reset_no_sock(t, skb, sock_net(sk)); return -ENOMEM; } @@ -1562,7 +1578,7 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, */ if (ret || vchild->transport != &t->transport) { release_sock(child); - virtio_transport_reset_no_sock(t, skb); + virtio_transport_reset_no_sock(t, skb, sock_net(sk)); sock_put(child); return ret; } @@ -1590,7 +1606,7 @@ static bool virtio_transport_valid_type(u16 type) * lock. 
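Every virtio_vsock_pkt_info initializer above now carries a .net field, which virtio_transport_send_pkt_info() hands to the transport as t_ops->send_pkt(skb, info->net); any hand-built control packet has to follow the same rule. A minimal sketch mirroring virtio_transport_connect(); foo_send_probe is hypothetical.

static int foo_send_probe(struct vsock_sock *vsk)
{
	struct virtio_vsock_pkt_info info = {
		.op	= VIRTIO_VSOCK_OP_REQUEST,
		.vsk	= vsk,
		/* Namespace of the sending socket; the receive side uses it
		 * (or NULL from namespace-unaware transports) to scope the
		 * socket lookup in virtio_transport_recv_pkt().
		 */
		.net	= sock_net(sk_vsock(vsk)),
	};

	return virtio_transport_send_pkt_info(vsk, &info);
}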
*/ void virtio_transport_recv_pkt(struct virtio_transport *t, - struct sk_buff *skb) + struct sk_buff *skb, struct net *net) { struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); struct sockaddr_vm src, dst; @@ -1613,24 +1629,24 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, le32_to_cpu(hdr->fwd_cnt)); if (!virtio_transport_valid_type(le16_to_cpu(hdr->type))) { - (void)virtio_transport_reset_no_sock(t, skb); + (void)virtio_transport_reset_no_sock(t, skb, net); goto free_pkt; } /* The socket must be in connected or bound table * otherwise send reset back */ - sk = vsock_find_connected_socket(&src, &dst); + sk = vsock_find_connected_socket_net(&src, &dst, net); if (!sk) { - sk = vsock_find_bound_socket(&dst); + sk = vsock_find_bound_socket_net(&dst, net); if (!sk) { - (void)virtio_transport_reset_no_sock(t, skb); + (void)virtio_transport_reset_no_sock(t, skb, net); goto free_pkt; } } if (virtio_transport_get_type(sk) != le16_to_cpu(hdr->type)) { - (void)virtio_transport_reset_no_sock(t, skb); + (void)virtio_transport_reset_no_sock(t, skb, net); sock_put(sk); goto free_pkt; } @@ -1649,7 +1665,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, */ if (sock_flag(sk, SOCK_DONE) || (sk->sk_state != TCP_LISTEN && vsk->transport != &t->transport)) { - (void)virtio_transport_reset_no_sock(t, skb); + (void)virtio_transport_reset_no_sock(t, skb, net); release_sock(sk); sock_put(sk); goto free_pkt; @@ -1681,7 +1697,7 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, kfree_skb(skb); break; default: - (void)virtio_transport_reset_no_sock(t, skb); + (void)virtio_transport_reset_no_sock(t, skb, net); kfree_skb(skb); break; } diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 7eccd6708d66..a64522be1bad 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -161,7 +161,7 @@ vmci_transport_packet_init(struct vmci_transport_packet *pkt, case VMCI_TRANSPORT_PACKET_TYPE_WAITING_READ: case VMCI_TRANSPORT_PACKET_TYPE_WAITING_WRITE: - memcpy(&pkt->u.wait, wait, sizeof(pkt->u.wait)); + pkt->u.wait = *wait; break; case VMCI_TRANSPORT_PACKET_TYPE_REQUEST2: @@ -646,13 +646,17 @@ static int vmci_transport_recv_dgram_cb(void *data, struct vmci_datagram *dg) return VMCI_SUCCESS; } -static bool vmci_transport_stream_allow(u32 cid, u32 port) +static bool vmci_transport_stream_allow(struct vsock_sock *vsk, u32 cid, + u32 port) { static const u32 non_socket_contexts[] = { VMADDR_CID_LOCAL, }; int i; + if (!vsock_net_mode_global(vsk)) + return false; + BUILD_BUG_ON(sizeof(cid) != sizeof(*non_socket_contexts)); for (i = 0; i < ARRAY_SIZE(non_socket_contexts); i++) { @@ -682,12 +686,10 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg) err = VMCI_SUCCESS; bh_process_pkt = false; - /* Ignore incoming packets from contexts without sockets, or resources - * that aren't vsock implementations. + /* Ignore incoming packets from resources that aren't vsock + * implementations. */ - - if (!vmci_transport_stream_allow(dg->src.context, -1) - || vmci_transport_peer_rid(dg->src.context) != dg->src.resource) + if (vmci_transport_peer_rid(dg->src.context) != dg->src.resource) return VMCI_ERROR_NO_ACCESS; if (VMCI_DG_SIZE(dg) < sizeof(*pkt)) @@ -749,6 +751,12 @@ static int vmci_transport_recv_stream_cb(void *data, struct vmci_datagram *dg) goto out; } + /* Ignore incoming packets from contexts without sockets. 
*/ + if (!vmci_transport_stream_allow(vsk, dg->src.context, -1)) { + err = VMCI_ERROR_NO_ACCESS; + goto out; + } + /* We do most everything in a work queue, but let's fast path the * notification of reads and writes to help data transfer performance. * We can only do this if there is no process context code executing @@ -1784,8 +1792,12 @@ out: return err; } -static bool vmci_transport_dgram_allow(u32 cid, u32 port) +static bool vmci_transport_dgram_allow(struct vsock_sock *vsk, u32 cid, + u32 port) { + if (!vsock_net_mode_global(vsk)) + return false; + if (cid == VMADDR_CID_HYPERVISOR) { /* Registrations of PBRPC Servers do not modify VMX/Hypervisor * state and are allowed. diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index bc2ff918b315..8068d1b6e851 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -26,7 +26,7 @@ static u32 vsock_loopback_get_local_cid(void) return VMADDR_CID_LOCAL; } -static int vsock_loopback_send_pkt(struct sk_buff *skb) +static int vsock_loopback_send_pkt(struct sk_buff *skb, struct net *net) { struct vsock_loopback *vsock = &the_vsock_loopback; int len = skb->len; @@ -46,7 +46,15 @@ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) return 0; } -static bool vsock_loopback_seqpacket_allow(u32 remote_cid); +static bool vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, + u32 remote_cid); + +static bool vsock_loopback_stream_allow(struct vsock_sock *vsk, u32 cid, + u32 port) +{ + return true; +} + static bool vsock_loopback_msgzerocopy_allow(void) { return true; @@ -76,7 +84,7 @@ static struct virtio_transport loopback_transport = { .stream_has_space = virtio_transport_stream_has_space, .stream_rcvhiwat = virtio_transport_stream_rcvhiwat, .stream_is_active = virtio_transport_stream_is_active, - .stream_allow = virtio_transport_stream_allow, + .stream_allow = vsock_loopback_stream_allow, .seqpacket_dequeue = virtio_transport_seqpacket_dequeue, .seqpacket_enqueue = virtio_transport_seqpacket_enqueue, @@ -106,9 +114,10 @@ static struct virtio_transport loopback_transport = { .send_pkt = vsock_loopback_send_pkt, }; -static bool vsock_loopback_seqpacket_allow(u32 remote_cid) +static bool +vsock_loopback_seqpacket_allow(struct vsock_sock *vsk, u32 remote_cid) { - return true; + return vsock_net_mode_global(vsk); } static void vsock_loopback_work(struct work_struct *work) @@ -130,7 +139,8 @@ static void vsock_loopback_work(struct work_struct *work) */ virtio_transport_consume_skb_sent(skb, false); virtio_transport_deliver_tap_pkt(skb); - virtio_transport_recv_pkt(&loopback_transport, skb); + virtio_transport_recv_pkt(&loopback_transport, skb, + sock_net(skb->sk)); } } diff --git a/net/wireless/core.c b/net/wireless/core.c index 9a420d627d3c..9af85d655027 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -265,6 +265,8 @@ void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, rdev_stop_nan(rdev, wdev); wdev->is_running = false; + eth_zero_addr(wdev->u.nan.cluster_id); + rdev->opencount--; } @@ -347,7 +349,7 @@ void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) guard(wiphy)(&rdev->wiphy); - cfg80211_leave(rdev, wdev); + cfg80211_leave(rdev, wdev, -1); cfg80211_remove_virtual_intf(rdev, wdev); } } @@ -661,12 +663,8 @@ int wiphy_verify_iface_combinations(struct wiphy *wiphy, c->limits[j].max > 1)) return -EINVAL; - /* Only a single NAN can be allowed, avoid this - * check for multi-radio global combination, since it - * hold the capabilities of all radio 
combinations. - */ - if (!combined_radio && - WARN_ON(types & BIT(NL80211_IFTYPE_NAN) && + /* Only a single NAN can be allowed */ + if (WARN_ON(types & BIT(NL80211_IFTYPE_NAN) && c->limits[j].max > 1)) return -EINVAL; @@ -1371,7 +1369,8 @@ void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, } void cfg80211_leave(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev) + struct wireless_dev *wdev, + int link_id) { struct net_device *dev = wdev->netdev; struct cfg80211_sched_scan_request *pos, *tmp; @@ -1409,14 +1408,16 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev, break; case NL80211_IFTYPE_AP: case NL80211_IFTYPE_P2P_GO: - cfg80211_stop_ap(rdev, dev, -1, true); + cfg80211_stop_ap(rdev, dev, link_id, true); break; case NL80211_IFTYPE_OCB: cfg80211_leave_ocb(rdev, dev); break; case NL80211_IFTYPE_P2P_DEVICE: + cfg80211_stop_p2p_device(rdev, wdev); + break; case NL80211_IFTYPE_NAN: - /* cannot happen, has no netdev */ + cfg80211_stop_nan(rdev, wdev); break; case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_MONITOR: @@ -1430,27 +1431,34 @@ void cfg80211_leave(struct cfg80211_registered_device *rdev, } } -void cfg80211_stop_iface(struct wiphy *wiphy, struct wireless_dev *wdev, - gfp_t gfp) +void cfg80211_stop_link(struct wiphy *wiphy, struct wireless_dev *wdev, + int link_id, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct cfg80211_event *ev; unsigned long flags; - trace_cfg80211_stop_iface(wiphy, wdev); + /* Only AP/GO interfaces may have a specific link_id */ + if (WARN_ON_ONCE(link_id != -1 && + wdev->iftype != NL80211_IFTYPE_AP && + wdev->iftype != NL80211_IFTYPE_P2P_GO)) + link_id = -1; + + trace_cfg80211_stop_link(wiphy, wdev, link_id); ev = kzalloc(sizeof(*ev), gfp); if (!ev) return; ev->type = EVENT_STOPPED; + ev->link_id = link_id; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } -EXPORT_SYMBOL(cfg80211_stop_iface); +EXPORT_SYMBOL(cfg80211_stop_link); void cfg80211_init_wdev(struct wireless_dev *wdev) { @@ -1589,7 +1597,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, break; case NETDEV_GOING_DOWN: scoped_guard(wiphy, &rdev->wiphy) { - cfg80211_leave(rdev, wdev); + cfg80211_leave(rdev, wdev, -1); cfg80211_remove_links(wdev); } /* since we just did cfg80211_leave() nothing to do there */ diff --git a/net/wireless/core.h b/net/wireless/core.h index 63dcf315dba7..6ac57b7b2615 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -289,6 +289,7 @@ struct cfg80211_event { u8 td_bitmap_len; } pa; }; + int link_id; }; struct cfg80211_cached_keys { @@ -537,7 +538,8 @@ void cfg80211_update_iface_num(struct cfg80211_registered_device *rdev, enum nl80211_iftype iftype, int num); void cfg80211_leave(struct cfg80211_registered_device *rdev, - struct wireless_dev *wdev); + struct wireless_dev *wdev, + int link_id); void cfg80211_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 03efd45c007f..6e58b238a1f8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5,7 +5,7 @@ * Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2015-2017 Intel Deutschland GmbH - * Copyright (C) 2018-2025 Intel Corporation + * Copyright (C) 2018-2026 Intel Corporation */ #include 
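Driver-facing view of the rework above: what used to be cfg80211_stop_iface(wiphy, wdev, gfp) is now cfg80211_stop_link() with a link_id, where -1 keeps the old whole-interface teardown and a specific id (AP/P2P-GO only) tears down just that link. The surrounding driver context, drv_abandon_link, is made up.

static void drv_abandon_link(struct wiphy *wiphy, struct wireless_dev *wdev,
			     int link_id)
{
	/* Per-link teardown is only honoured for AP/GO; for anything else
	 * cfg80211_stop_link() forces the link_id back to -1 itself.
	 */
	if (wdev->iftype == NL80211_IFTYPE_AP ||
	    wdev->iftype == NL80211_IFTYPE_P2P_GO)
		cfg80211_stop_link(wiphy, wdev, link_id, GFP_KERNEL);
	else
		cfg80211_stop_link(wiphy, wdev, -1, GFP_KERNEL);
}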
<linux/if.h> @@ -332,6 +332,15 @@ static int validate_nan_cluster_id(const struct nlattr *attr, return 0; } +static int validate_uhr_capa(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + + return ieee80211_uhr_capa_size_ok(data, len, false); +} + /* policy for the attributes */ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; @@ -361,6 +370,7 @@ nl80211_pmsr_ftm_req_attr_policy[NL80211_PMSR_FTM_REQ_ATTR_MAX + 1] = { [NL80211_PMSR_FTM_REQ_ATTR_NON_TRIGGER_BASED] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK] = { .type = NLA_FLAG }, [NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR] = { .type = NLA_U8 }, + [NL80211_PMSR_FTM_REQ_ATTR_RSTA] = { .type = NLA_FLAG }, }; static const struct nla_policy @@ -932,6 +942,10 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { NLA_POLICY_NESTED(nl80211_s1g_short_beacon), [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, [NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG }, + [NL80211_ATTR_EPP_PEER] = { .type = NLA_FLAG }, + [NL80211_ATTR_UHR_CAPABILITY] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_uhr_capa, 255), + [NL80211_ATTR_DISABLE_UHR] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -1314,6 +1328,12 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_16MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_16MHZ)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_S1G_NO_PRIMARY)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_UHR) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_UHR)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -1675,7 +1695,9 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) return -ENOLINK; case NL80211_IFTYPE_STATION: case NL80211_IFTYPE_P2P_CLIENT: - if (wdev->connected) + if (wdev->connected || + (wiphy_ext_feature_isset(wdev->wiphy, + NL80211_EXT_FEATURE_ASSOC_FRAME_ENCRYPTION))) return 0; return -ENOLINK; case NL80211_IFTYPE_NAN: @@ -1947,6 +1969,7 @@ nl80211_send_iftype_data(struct sk_buff *msg, { const struct ieee80211_sta_he_cap *he_cap = &iftdata->he_cap; const struct ieee80211_sta_eht_cap *eht_cap = &iftdata->eht_cap; + const struct ieee80211_sta_uhr_cap *uhr_cap = &iftdata->uhr_cap; if (nl80211_put_iftypes(msg, NL80211_BAND_IFTYPE_ATTR_IFTYPES, iftdata->types_mask)) @@ -1998,6 +2021,14 @@ nl80211_send_iftype_data(struct sk_buff *msg, return -ENOBUFS; } + if (uhr_cap->has_uhr) { + if (nla_put(msg, NL80211_BAND_IFTYPE_ATTR_UHR_CAP_MAC, + sizeof(uhr_cap->mac), &uhr_cap->mac) || + nla_put(msg, NL80211_BAND_IFTYPE_ATTR_UHR_CAP_PHY, + sizeof(uhr_cap->phy), &uhr_cap->phy)) + return -ENOBUFS; + } + if (sband->band == NL80211_BAND_6GHZ && nla_put(msg, NL80211_BAND_IFTYPE_ATTR_HE_6GHZ_CAPA, sizeof(iftdata->he_6ghz_capa), @@ -2307,6 +2338,32 @@ nl80211_send_pmsr_ftm_capa(const struct cfg80211_pmsr_capabilities *cap, if (cap->ftm.non_trigger_based && nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_NON_TRIGGER_BASED)) return -ENOBUFS; + if (cap->ftm.support_6ghz && + nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_6GHZ_SUPPORT)) + return -ENOBUFS; + if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_LTF_REP, + cap->ftm.max_tx_ltf_rep)) + return -ENOBUFS; + if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_LTF_REP, + cap->ftm.max_rx_ltf_rep)) + return -ENOBUFS; + if (nla_put_u32(msg, 
NL80211_PMSR_FTM_CAPA_ATTR_MAX_TX_STS, + cap->ftm.max_tx_sts)) + return -ENOBUFS; + if (nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_RX_STS, + cap->ftm.max_rx_sts)) + return -ENOBUFS; + if (cap->ftm.max_total_ltf_tx > 0 && + nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_TX, + cap->ftm.max_total_ltf_tx)) + return -ENOBUFS; + if (cap->ftm.max_total_ltf_rx > 0 && + nla_put_u32(msg, NL80211_PMSR_FTM_CAPA_ATTR_MAX_TOTAL_LTF_RX, + cap->ftm.max_total_ltf_rx)) + return -ENOBUFS; + if (cap->ftm.support_rsta && + nla_put_flag(msg, NL80211_PMSR_FTM_CAPA_ATTR_RSTA_SUPPORT)) + return -ENOBUFS; nla_nest_end(msg, ftm); return 0; @@ -6429,6 +6486,17 @@ static int nl80211_calculate_ap_params(struct cfg80211_ap_settings *params) cap->datalen - 1)) return -EINVAL; } + + cap = cfg80211_find_ext_elem(WLAN_EID_EXT_UHR_OPER, ies, ies_len); + if (cap) { + if (!cap->datalen) + return -EINVAL; + params->uhr_oper = (void *)(cap->data + 1); + if (!ieee80211_uhr_oper_size_ok((const u8 *)params->uhr_oper, + cap->datalen - 1, true)) + return -EINVAL; + } + return 0; } @@ -6470,6 +6538,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || auth_type == NL80211_AUTHTYPE_FILS_PK)) return false; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_EPPKE) && + auth_type == NL80211_AUTHTYPE_EPPKE) + return false; return true; case NL80211_CMD_CONNECT: if (!(rdev->wiphy.features & NL80211_FEATURE_SAE) && @@ -6487,6 +6559,10 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_FILS_SK_OFFLOAD) && auth_type == NL80211_AUTHTYPE_FILS_SK) return false; + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_EPPKE) && + auth_type == NL80211_AUTHTYPE_EPPKE) + return false; return true; case NL80211_CMD_START_AP: if (!wiphy_ext_feature_isset(&rdev->wiphy, @@ -6552,6 +6628,9 @@ static int nl80211_validate_ap_phy_operation(struct cfg80211_ap_settings *params (channel->flags & IEEE80211_CHAN_NO_EHT)) return -EOPNOTSUPP; + if (params->uhr_oper && (channel->flags & IEEE80211_CHAN_NO_UHR)) + return -EOPNOTSUPP; + return 0; } @@ -7134,7 +7213,8 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) break; case RATE_INFO_BW_EHT_RU: rate_flg = 0; - WARN_ON(!(info->flags & RATE_INFO_FLAGS_EHT_MCS)); + WARN_ON(!(info->flags & RATE_INFO_FLAGS_EHT_MCS) && + !(info->flags & RATE_INFO_FLAGS_UHR_MCS)); break; } @@ -7187,6 +7267,23 @@ bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr) nla_put_u8(msg, NL80211_RATE_INFO_EHT_RU_ALLOC, info->eht_ru_alloc)) return false; + } else if (info->flags & RATE_INFO_FLAGS_UHR_MCS) { + if (nla_put_u8(msg, NL80211_RATE_INFO_UHR_MCS, info->mcs)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_NSS, info->nss)) + return false; + if (nla_put_u8(msg, NL80211_RATE_INFO_EHT_GI, info->eht_gi)) + return false; + if (info->bw == RATE_INFO_BW_EHT_RU && + nla_put_u8(msg, NL80211_RATE_INFO_EHT_RU_ALLOC, + info->eht_ru_alloc)) + return false; + if (info->flags & RATE_INFO_FLAGS_UHR_ELR_MCS && + nla_put_flag(msg, NL80211_RATE_INFO_UHR_ELR)) + return false; + if (info->flags & RATE_INFO_FLAGS_UHR_IM && + nla_put_flag(msg, NL80211_RATE_INFO_UHR_IM)) + return false; } nla_nest_end(msg, rate); @@ -8060,7 +8157,8 @@ int cfg80211_check_station_change(struct wiphy *wiphy, if (params->ext_capab || params->link_sta_params.ht_capa || params->link_sta_params.vht_capa || params->link_sta_params.he_capa || - 
params->link_sta_params.eht_capa) + params->link_sta_params.eht_capa || + params->link_sta_params.uhr_capa) return -EINVAL; if (params->sta_flags_mask & BIT(NL80211_STA_FLAG_SPP_AMSDU)) return -EINVAL; @@ -8280,6 +8378,16 @@ static int nl80211_set_station_tdls(struct genl_info *info, } } + if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) { + if (!params->link_sta_params.eht_capa) + return -EINVAL; + + params->link_sta_params.uhr_capa = + nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + params->link_sta_params.uhr_capa_len = + nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_S1G_CAPABILITY]) params->link_sta_params.s1g_capa = nla_data(info->attrs[NL80211_ATTR_S1G_CAPABILITY]); @@ -8600,6 +8708,16 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) } } + if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) { + if (!params.link_sta_params.eht_capa) + return -EINVAL; + + params.link_sta_params.uhr_capa = + nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + params.link_sta_params.uhr_capa_len = + nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_EML_CAPABILITY]) { params.eml_cap_present = true; params.eml_cap = @@ -8659,10 +8777,11 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.link_sta_params.ht_capa = NULL; params.link_sta_params.vht_capa = NULL; - /* HE and EHT require WME */ + /* HE, EHT and UHR require WME */ if (params.link_sta_params.he_capa_len || params.link_sta_params.he_6ghz_capa || - params.link_sta_params.eht_capa_len) + params.link_sta_params.eht_capa_len || + params.link_sta_params.uhr_capa_len) return -EINVAL; } @@ -8779,6 +8898,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) goto out; } } + + params.epp_peer = + nla_get_flag(info->attrs[NL80211_ATTR_EPP_PEER]); + err = rdev_add_station(rdev, dev, mac_addr, ¶ms); out: dev_put(params.vlan); @@ -11953,7 +12076,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) if ((auth_type == NL80211_AUTHTYPE_SAE || auth_type == NL80211_AUTHTYPE_FILS_SK || auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || - auth_type == NL80211_AUTHTYPE_FILS_PK) && + auth_type == NL80211_AUTHTYPE_FILS_PK || + auth_type == NL80211_AUTHTYPE_EPPKE) && !info->attrs[NL80211_ATTR_AUTH_DATA]) return -EINVAL; @@ -11961,7 +12085,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) if (auth_type != NL80211_AUTHTYPE_SAE && auth_type != NL80211_AUTHTYPE_FILS_SK && auth_type != NL80211_AUTHTYPE_FILS_SK_PFS && - auth_type != NL80211_AUTHTYPE_FILS_PK) + auth_type != NL80211_AUTHTYPE_FILS_PK && + auth_type != NL80211_AUTHTYPE_EPPKE) return -EINVAL; req.auth_data = nla_data(info->attrs[NL80211_ATTR_AUTH_DATA]); req.auth_data_len = nla_len(info->attrs[NL80211_ATTR_AUTH_DATA]); @@ -12329,6 +12454,9 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT])) req.flags |= ASSOC_REQ_DISABLE_EHT; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_UHR])) + req.flags |= ASSOC_REQ_DISABLE_UHR; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&req.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), @@ -13201,6 +13329,9 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_EHT])) connect.flags |= ASSOC_REQ_DISABLE_EHT; + if (nla_get_flag(info->attrs[NL80211_ATTR_DISABLE_UHR])) + connect.flags 
|= ASSOC_REQ_DISABLE_UHR; + if (info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]) memcpy(&connect.vht_capa_mask, nla_data(info->attrs[NL80211_ATTR_VHT_CAPABILITY_MASK]), @@ -15553,7 +15684,8 @@ static int nl80211_parse_nan_band_config(struct wiphy *wiphy, static int nl80211_parse_nan_conf(struct wiphy *wiphy, struct genl_info *info, struct cfg80211_nan_conf *conf, - u32 *changed_flags) + u32 *changed_flags, + bool start) { struct nlattr *attrs[NL80211_NAN_CONF_ATTR_MAX + 1]; int err, rem; @@ -15600,7 +15732,7 @@ static int nl80211_parse_nan_conf(struct wiphy *wiphy, return err; changed |= CFG80211_NAN_CONF_CHANGED_CONFIG; - if (attrs[NL80211_NAN_CONF_CLUSTER_ID]) + if (attrs[NL80211_NAN_CONF_CLUSTER_ID] && start) conf->cluster_id = nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]); @@ -15711,7 +15843,7 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) return -EINVAL; - err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL); + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL, true); if (err) return err; @@ -16077,7 +16209,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb, if (!wdev_running(wdev)) return -ENOTCONN; - err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed); + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed, false); if (err) return err; @@ -16096,6 +16228,9 @@ void cfg80211_nan_match(struct wireless_dev *wdev, struct sk_buff *msg; void *hdr; + if (WARN_ON(wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE)) + return; + if (WARN_ON(!match->inst_id || !match->peer_inst_id || !match->addr)) return; @@ -16178,6 +16313,9 @@ void cfg80211_nan_func_terminated(struct wireless_dev *wdev, struct nlattr *func_attr; void *hdr; + if (WARN_ON(wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE)) + return; + if (WARN_ON(!inst_id)) return; @@ -17626,6 +17764,16 @@ nl80211_add_mod_link_station(struct sk_buff *skb, struct genl_info *info, } } + if (info->attrs[NL80211_ATTR_UHR_CAPABILITY]) { + if (!params.eht_capa) + return -EINVAL; + + params.uhr_capa = + nla_data(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + params.uhr_capa_len = + nla_len(info->attrs[NL80211_ATTR_UHR_CAPABILITY]); + } + if (info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]) params.he_6ghz_capa = nla_data(info->attrs[NL80211_ATTR_HE_6GHZ_CAPABILITY]); diff --git a/net/wireless/pmsr.c b/net/wireless/pmsr.c index a117f5093ca2..60e1e31c2185 100644 --- a/net/wireless/pmsr.c +++ b/net/wireless/pmsr.c @@ -85,11 +85,6 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, return -EINVAL; } - out->ftm.burst_duration = 15; - if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]) - out->ftm.burst_duration = - nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]); - out->ftm.ftms_per_burst = 0; if (tb[NL80211_PMSR_FTM_REQ_ATTR_FTMS_PER_BURST]) out->ftm.ftms_per_burst = @@ -164,6 +159,12 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, return -EINVAL; } + if (tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]) + out->ftm.burst_duration = + nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BURST_DURATION]); + else if (!out->ftm.non_trigger_based && !out->ftm.trigger_based) + out->ftm.burst_duration = 15; + out->ftm.lmr_feedback = !!tb[NL80211_PMSR_FTM_REQ_ATTR_LMR_FEEDBACK]; if (!out->ftm.trigger_based && !out->ftm.non_trigger_based && @@ -186,6 +187,21 @@ static int pmsr_parse_ftm(struct cfg80211_registered_device *rdev, nla_get_u8(tb[NL80211_PMSR_FTM_REQ_ATTR_BSS_COLOR]); } + out->ftm.rsta = 
!!tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA]; + if (out->ftm.rsta && !capa->ftm.support_rsta) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA], + "FTM: RSTA not supported by device"); + return -EOPNOTSUPP; + } + + if (out->ftm.rsta && !out->ftm.lmr_feedback) { + NL_SET_ERR_MSG_ATTR(info->extack, + tb[NL80211_PMSR_FTM_REQ_ATTR_RSTA], + "FTM: RSTA set without LMR feedback"); + return -EINVAL; + } + return 0; } @@ -453,6 +469,7 @@ static int nl80211_pmsr_send_ftm_res(struct sk_buff *msg, PUT(u8, NUM_BURSTS_EXP, num_bursts_exp); PUT(u8, BURST_DURATION, burst_duration); PUT(u8, FTMS_PER_BURST, ftms_per_burst); + PUT(u16, BURST_PERIOD, burst_period); PUTOPT(s32, RSSI_AVG, rssi_avg); PUTOPT(s32, RSSI_SPREAD, rssi_spread); if (res->ftm.tx_rate_valid && diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 73cab51f6379..139cb27e5a81 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez <mcgrof@qca.qualcomm.com> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2025 Intel Corporation + * Copyright (C) 2018 - 2026 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -1605,6 +1605,8 @@ static u32 map_regdom_flags(u32 rd_flags) channel_flags |= IEEE80211_CHAN_ALLOW_6GHZ_VLP_AP; if (rd_flags & NL80211_RRF_ALLOW_20MHZ_ACTIVITY) channel_flags |= IEEE80211_CHAN_ALLOW_20MHZ_ACTIVITY; + if (rd_flags & NL80211_RRF_NO_UHR) + channel_flags |= IEEE80211_CHAN_NO_UHR; return channel_flags; } @@ -2332,8 +2334,17 @@ static void reg_process_ht_flags(struct wiphy *wiphy) if (!wiphy) return; - for (band = 0; band < NUM_NL80211_BANDS; band++) + for (band = 0; band < NUM_NL80211_BANDS; band++) { + /* + * Don't apply HT flags to channels within the S1G band. + * Each bonded channel will instead be validated individually + * within cfg80211_s1g_usable(). 
+ */ + if (band == NL80211_BAND_S1GHZ) + continue; + reg_process_ht_flags_band(wiphy, wiphy->bands[band]); + } } static bool reg_wdev_chan_valid(struct wiphy *wiphy, struct wireless_dev *wdev) @@ -2442,7 +2453,7 @@ static void reg_leave_invalid_chans(struct wiphy *wiphy) list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) if (!reg_wdev_chan_valid(wiphy, wdev)) - cfg80211_leave(rdev, wdev); + cfg80211_leave(rdev, wdev, -1); } static void reg_check_chans_work(struct work_struct *work) diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 7546647752fd..eb0e77813d46 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1959,7 +1959,7 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, ether_addr_copy(known->parent_bssid, new->parent_bssid); known->pub.max_bssid_indicator = new->pub.max_bssid_indicator; known->pub.bssid_index = new->pub.bssid_index; - known->pub.use_for &= new->pub.use_for; + known->pub.use_for = new->pub.use_for; known->pub.cannot_use_reasons = new->pub.cannot_use_reasons; known->bss_source = new->bss_source; diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 8d142856e385..2e0ea69b9604 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -88,7 +88,7 @@ static void cfg80211_leave_all(struct cfg80211_registered_device *rdev) struct wireless_dev *wdev; list_for_each_entry(wdev, &rdev->wiphy.wdev_list, list) - cfg80211_leave(rdev, wdev); + cfg80211_leave(rdev, wdev, -1); } static int wiphy_suspend(struct device *dev) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 2b71f1d867a0..643ccf4f0227 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3915,19 +3915,22 @@ TRACE_EVENT(cfg80211_ft_event, WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->target_ap) ); -TRACE_EVENT(cfg80211_stop_iface, - TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), - TP_ARGS(wiphy, wdev), +TRACE_EVENT(cfg80211_stop_link, + TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, + int link_id), + TP_ARGS(wiphy, wdev, link_id), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY + __field(int, link_id) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; + __entry->link_id = link_id; ), - TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, - WIPHY_PR_ARG, WDEV_PR_ARG) + TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", link_id: %d", + WIPHY_PR_ARG, WDEV_PR_ARG, __entry->link_id) ); TRACE_EVENT(cfg80211_pmsr_report, diff --git a/net/wireless/util.c b/net/wireless/util.c index 4f581aed45b7..404fe604a8db 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -5,7 +5,7 @@ * Copyright 2007-2009 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018-2023, 2025 Intel Corporation + * Copyright (C) 2018-2023, 2025-2026 Intel Corporation */ #include <linux/export.h> #include <linux/bitops.h> @@ -1144,7 +1144,8 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) ev->ij.channel); break; case EVENT_STOPPED: - cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); + cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev, + ev->link_id); break; case EVENT_PORT_AUTHORIZED: __cfg80211_port_authorized(wdev, ev->pa.peer_addr, @@ -1203,7 +1204,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, dev->ieee80211_ptr->use_4addr = false; rdev_set_qos_map(rdev, dev, NULL); - cfg80211_leave(rdev, dev->ieee80211_ptr); + cfg80211_leave(rdev, dev->ieee80211_ptr, -1); cfg80211_process_rdev_events(rdev); cfg80211_mlme_purge_registrations(dev->ieee80211_ptr); @@ 
@@ -1573,26 +1574,30 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate)
 	return result / 10000;
 }
 
-static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
+static u32 _cfg80211_calculate_bitrate_eht_uhr(struct rate_info *rate)
 {
 #define SCALE 6144
-	static const u32 mcs_divisors[16] = {
-		102399, /* 16.666666... */
-		51201, /* 8.333333... */
-		34134, /* 5.555555... */
-		25599, /* 4.166666... */
-		17067, /* 2.777777... */
-		12801, /* 2.083333... */
-		11377, /* 1.851725... */
-		10239, /* 1.666666... */
-		8532, /* 1.388888... */
-		7680, /* 1.250000... */
-		6828, /* 1.111111... */
-		6144, /* 1.000000... */
-		5690, /* 0.926106... */
-		5120, /* 0.833333... */
-		409600, /* 66.666666... */
-		204800, /* 33.333333... */
+	static const u32 mcs_divisors[] = {
+		[ 0] = 102399, /* 16.666666... */
+		[ 1] = 51201, /* 8.333333... */
+		[ 2] = 34134, /* 5.555555... */
+		[ 3] = 25599, /* 4.166666... */
+		[ 4] = 17067, /* 2.777777... */
+		[ 5] = 12801, /* 2.083333... */
+		[ 6] = 11377, /* 1.851725... */
+		[ 7] = 10239, /* 1.666666... */
+		[ 8] = 8532, /* 1.388888... */
+		[ 9] = 7680, /* 1.250000... */
+		[10] = 6828, /* 1.111111... */
+		[11] = 6144, /* 1.000000... */
+		[12] = 5690, /* 0.926106... */
+		[13] = 5120, /* 0.833333... */
+		[14] = 409600, /* 66.666666... */
+		[15] = 204800, /* 33.333333... */
+		[17] = 38400, /* 6.250180... */
+		[19] = 19200, /* 3.125090... */
+		[20] = 15360, /* 2.500000... */
+		[23] = 9600, /* 1.562545... */
 	};
 	static const u32 rates_996[3] =  { 480388888, 453700000, 408333333 };
 	static const u32 rates_484[3] =  { 229411111, 216666666, 195000000 };
@@ -1603,8 +1608,6 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
 	u64 tmp;
 	u32 result;
 
-	if (WARN_ON_ONCE(rate->mcs > 15))
-		return 0;
 	if (WARN_ON_ONCE(rate->eht_gi > NL80211_RATE_INFO_EHT_GI_3_2))
 		return 0;
 	if (WARN_ON_ONCE(rate->eht_ru_alloc >
@@ -1685,7 +1688,7 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
 		 rate->eht_ru_alloc == NL80211_RATE_INFO_EHT_RU_ALLOC_26)
 		result = rates_26[rate->eht_gi];
 	else {
-		WARN(1, "invalid EHT MCS: bw:%d, ru:%d\n",
+		WARN(1, "invalid EHT or UHR MCS: bw:%d, ru:%d\n",
 		     rate->bw, rate->eht_ru_alloc);
 		return 0;
 	}
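The new mcs_divisors[] entries extend the existing EHT scaling scheme to the UHR MCS indices 17, 19, 20 and 23: each divisor encodes the ratio SCALE/divisor applied to a per-RU/GI base rate (rates_996[], rates_484[], ... above), and the hunk that follows adds a further 0.9x factor for UHR interference mitigation. The scaling itself sits in unchanged context between these two hunks, so the standalone user-space sketch below only mirrors the structure visible here (base rate -> *SCALE/divisor -> *nss -> /8 -> /10000, result in units of 100 kbps); treat the exact rounding as an approximation:

#include <stdint.h>
#include <stdio.h>

#define SCALE 6144

/* Mirrors the scaling tail of _cfg80211_calculate_bitrate_eht_uhr() as far
 * as it is visible in this diff; plain C division stands in for do_div().
 */
static uint32_t eht_uhr_rate_100kbps(uint64_t base_rate, uint32_t divisor,
				     uint32_t nss, int uhr_im)
{
	uint64_t tmp = base_rate;

	tmp *= SCALE;		/* scale the base rate ...                */
	tmp /= divisor;		/* ... by the per-MCS ratio SCALE/divisor */
	tmp *= nss;		/* spatial streams                        */
	tmp /= 8;

	if (uhr_im) {		/* UHR interference mitigation: 0.9x      */
		tmp *= 9000;
		tmp /= 10000;
	}

	return tmp / 10000;	/* units of 100 kbps                      */
}

int main(void)
{
	/* 480388888 is rates_996[0]: 996-tone RU with 0.8 us GI.         */
	printf("%u\n", eht_uhr_rate_100kbps(480388888ULL, 5120, 1, 0));
	/* -> 7205, i.e. ~720.5 Mbit/s (EHT MCS 13, 1 SS)                 */
	printf("%u\n", eht_uhr_rate_100kbps(480388888ULL, 15360, 1, 1));
	/* -> 2161, i.e. ~216.1 Mbit/s: the new [20] divisor with the     */
	/*    0.9x interference-mitigation factor applied                 */
	return 0;
}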
@@ -1699,11 +1702,64 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
 	tmp *= rate->nss;
 	do_div(tmp, 8);
 
+	/* and handle interference mitigation - 0.9x */
+	if (rate->flags & RATE_INFO_FLAGS_UHR_IM) {
+		if (WARN(rate->nss != 1 || rate->mcs == 15,
+			 "invalid NSS or MCS for UHR IM\n"))
+			return 0;
+		tmp *= 9000;
+		do_div(tmp, 10000);
+	}
+
 	result = tmp;
 
 	return result / 10000;
 }
 
+static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate)
+{
+	if (WARN_ONCE(rate->mcs > 15, "bad EHT MCS %d\n", rate->mcs))
+		return 0;
+
+	if (WARN_ONCE(rate->flags & (RATE_INFO_FLAGS_UHR_ELR_MCS |
+				     RATE_INFO_FLAGS_UHR_IM),
+		      "bad EHT MCS flags 0x%x\n", rate->flags))
+		return 0;
+
+	return _cfg80211_calculate_bitrate_eht_uhr(rate);
+}
+
+static u32 cfg80211_calculate_bitrate_uhr(struct rate_info *rate)
+{
+	if (rate->flags & RATE_INFO_FLAGS_UHR_ELR_MCS) {
+		WARN_ONCE(rate->eht_gi != NL80211_RATE_INFO_EHT_GI_1_6,
+			  "bad UHR ELR guard interval %d\n",
+			  rate->eht_gi);
+		WARN_ONCE(rate->mcs > 1, "bad UHR ELR MCS %d\n", rate->mcs);
+		WARN_ONCE(rate->nss != 1, "bad UHR ELR NSS %d\n", rate->nss);
+		WARN_ONCE(rate->bw != RATE_INFO_BW_20,
+			  "bad UHR ELR bandwidth %d\n",
+			  rate->bw);
+		WARN_ONCE(rate->flags & RATE_INFO_FLAGS_UHR_IM,
+			  "bad UHR MCS flags 0x%x\n", rate->flags);
+		if (rate->mcs == 0)
+			return 17;
+		return 33;
+	}
+
+	switch (rate->mcs) {
+	case 0 ... 15:
+	case 17:
+	case 19:
+	case 20:
+	case 23:
+		return _cfg80211_calculate_bitrate_eht_uhr(rate);
+	}
+
+	WARN_ONCE(1, "bad UHR MCS %d\n", rate->mcs);
+	return 0;
+}
+
 static u32 cfg80211_calculate_bitrate_s1g(struct rate_info *rate)
 {
 	/* For 1, 2, 4, 8 and 16 MHz channels */
@@ -1828,6 +1884,8 @@ u32 cfg80211_calculate_bitrate(struct rate_info *rate)
 		return cfg80211_calculate_bitrate_he(rate);
 	if (rate->flags & RATE_INFO_FLAGS_EHT_MCS)
 		return cfg80211_calculate_bitrate_eht(rate);
+	if (rate->flags & RATE_INFO_FLAGS_UHR_MCS)
+		return cfg80211_calculate_bitrate_uhr(rate);
 	if (rate->flags & RATE_INFO_FLAGS_S1G_MCS)
 		return cfg80211_calculate_bitrate_s1g(rate);
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index f093c3453f64..3b46bc635c43 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -543,9 +543,9 @@ static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
 {
 	int ret;
 
-	spin_lock(&pool->cq_cached_prod_lock);
+	spin_lock(&pool->cq->cq_cached_prod_lock);
 	ret = xskq_prod_reserve(pool->cq);
-	spin_unlock(&pool->cq_cached_prod_lock);
+	spin_unlock(&pool->cq->cq_cached_prod_lock);
 
 	return ret;
 }
@@ -619,9 +619,9 @@ static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
 
 static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
 {
-	spin_lock(&pool->cq_cached_prod_lock);
+	spin_lock(&pool->cq->cq_cached_prod_lock);
 	xskq_prod_cancel_n(pool->cq, n);
-	spin_unlock(&pool->cq_cached_prod_lock);
+	spin_unlock(&pool->cq->cq_cached_prod_lock);
 }
 
 INDIRECT_CALLABLE_SCOPE
@@ -1349,6 +1349,13 @@ static int xsk_bind(struct socket *sock, struct sockaddr_unsized *addr, int addr
 		}
 
 		if (umem_xs->queue_id != qid || umem_xs->dev != dev) {
+			/* One fill and completion ring required for each queue id. */
+			if (!xsk_validate_queues(xs)) {
+				err = -EINVAL;
+				sockfd_put(sock);
+				goto out_unlock;
+			}
+
 			/* Share the umem with another socket on another qid
 			 * and/or device.
 			 */
diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
index 51526034c42a..cd5125b6af53 100644
--- a/net/xdp/xsk_buff_pool.c
+++ b/net/xdp/xsk_buff_pool.c
@@ -91,7 +91,7 @@ struct xsk_buff_pool *xp_create_and_assign_umem(struct xdp_sock *xs,
 	INIT_LIST_HEAD(&pool->xsk_tx_list);
 	spin_lock_init(&pool->xsk_tx_list_lock);
 	spin_lock_init(&pool->cq_prod_lock);
-	spin_lock_init(&pool->cq_cached_prod_lock);
+	spin_lock_init(&xs->cq_tmp->cq_cached_prod_lock);
 	refcount_set(&pool->users, 1);
 
 	pool->fq = xs->fq_tmp;
@@ -247,10 +247,6 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs,
 	u16 flags;
 	struct xdp_umem *umem = umem_xs->umem;
 
-	/* One fill and completion ring required for each queue id. */
-	if (!pool->fq || !pool->cq)
-		return -EINVAL;
-
 	flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY;
 	if (umem_xs->pool->uses_need_wakeup)
 		flags |= XDP_USE_NEED_WAKEUP;
diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h
index 1eb8d9f8b104..ec08d9c102b1 100644
--- a/net/xdp/xsk_queue.h
+++ b/net/xdp/xsk_queue.h
@@ -46,6 +46,11 @@ struct xsk_queue {
 	u64 invalid_descs;
 	u64 queue_empty_descs;
 	size_t ring_vmalloc_size;
+	/* Mutual exclusion for the completion ring in SKB mode.
+	 * Protects the cached producer index when sockets sharing the
+	 * same netdev and queue id share a single cq.
+	 */
+	spinlock_t cq_cached_prod_lock;
 };
 
 struct parsed_desc {
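The xsk_bind() hunk moves the "one fill and completion ring per queue id" requirement ahead of the umem-sharing path, replacing the pool->fq/pool->cq test deleted from xp_assign_dev_shared(). The helper's body is not included in this diff, so the version below is a hypothetical reconstruction from the old check and from the fact that, at bind time, the rings still sit in the socket's temporary pointers:

/* Hypothetical reconstruction of xsk_validate_queues() -- the real helper
 * is not shown in this diff.  At this point in xsk_bind() the rings are
 * still parked in fq_tmp/cq_tmp, before xp_create_and_assign_umem() moves
 * them into the pool (see pool->fq = xs->fq_tmp above).
 */
static bool xsk_validate_queues(struct xdp_sock *xs)
{
	return xs->fq_tmp && xs->cq_tmp;
}

Moving cq_cached_prod_lock from struct xsk_buff_pool into struct xsk_queue means that whichever sockets end up sharing one completion ring also share the lock that serializes its cached producer index, rather than relying on a lock embedded in a per-pool structure.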
