diff options
Diffstat (limited to 'net')
313 files changed, 8055 insertions, 6832 deletions
diff --git a/net/Kconfig b/net/Kconfig index d5865cf19799..1d3f757d4b07 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -82,11 +82,13 @@ config NET_CRC32C menu "Networking options" source "net/packet/Kconfig" +source "net/psp/Kconfig" source "net/unix/Kconfig" source "net/tls/Kconfig" source "net/xfrm/Kconfig" source "net/iucv/Kconfig" source "net/smc/Kconfig" +source "drivers/dibs/Kconfig" source "net/xdp/Kconfig" config NET_HANDSHAKE diff --git a/net/Makefile b/net/Makefile index aac960c41db6..90e3d72bf58b 100644 --- a/net/Makefile +++ b/net/Makefile @@ -18,6 +18,7 @@ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ obj-$(CONFIG_UNIX) += unix/ +obj-$(CONFIG_INET_PSP) += psp/ obj-y += ipv6/ obj-$(CONFIG_PACKET) += packet/ obj-$(CONFIG_NET_KEY) += key/ diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index 20b316207f9a..c299e2bc87ed 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -53,19 +53,6 @@ config BATMAN_ADV_DAT mesh networks. If you think that your network does not need this option you can safely remove it and save some space. -config BATMAN_ADV_NC - bool "Network Coding" - depends on BATMAN_ADV - help - This option enables network coding, a mechanism that aims to - increase the overall network throughput by fusing multiple - packets in one transmission. - Note that interfaces controlled by batman-adv must be manually - configured to have promiscuous mode enabled in order to make - network coding work. - If you think that your network does not need this feature you - can safely disable it and save some space. - config BATMAN_ADV_MCAST bool "Multicast optimisation" depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y) diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 1cc9be6de456..d3c4d4143c14 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -23,7 +23,6 @@ batman-adv-y += mesh-interface.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast_forw.o batman-adv-y += netlink.o -batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o batman-adv-y += originator.o batman-adv-y += routing.o batman-adv-y += send.o diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 54fe38b3b2fd..b75c2228e69a 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -52,7 +52,6 @@ #include "hash.h" #include "log.h" #include "netlink.h" -#include "network-coding.h" #include "originator.h" #include "routing.h" #include "send.h" @@ -1406,10 +1405,6 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset, if (!orig_neigh_node) goto out; - /* Update nc_nodes of the originator */ - batadv_nc_update_nc_node(bat_priv, orig_node, orig_neigh_node, - ogm_packet, is_single_hop_neigh); - orig_neigh_router = batadv_orig_router_get(orig_neigh_node, if_outgoing); diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 747755647c6a..b992ba12aa24 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -12,6 +12,7 @@ #include <linux/compiler.h> #include <linux/container_of.h> #include <linux/crc16.h> +#include <linux/crc32.h> #include <linux/err.h> #include <linux/errno.h> #include <linux/etherdevice.h> @@ -1585,6 +1586,39 @@ int batadv_bla_init(struct batadv_priv *bat_priv) } /** + * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in + * the header + * @skb: skb pointing to fragmented socket buffers + * @payload_ptr: Pointer to position inside the head buffer of the skb + * marking the start of the data to be CRC'ed + * + * payload_ptr must always point to an address in the skb head buffer and not to + * a fragment. + * + * Return: big endian crc32c of the checksummed data + */ +static __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) +{ + unsigned int to = skb->len; + unsigned int consumed = 0; + struct skb_seq_state st; + unsigned int from; + unsigned int len; + const u8 *data; + u32 crc = 0; + + from = (unsigned int)(payload_ptr - skb->data); + + skb_prepare_seq_read(skb, from, to, &st); + while ((len = skb_seq_read(consumed, &data, &st)) != 0) { + crc = crc32c(crc, data, len); + consumed += len; + } + + return htonl(crc); +} + +/** * batadv_bla_check_duplist() - Check if a frame is in the broadcast dup. * @bat_priv: the bat priv with all the mesh interface information * @skb: contains the multicast packet to be checked diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index bace57e4f9a5..5113f879736b 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -22,6 +22,7 @@ #include <linux/minmax.h> #include <linux/mutex.h> #include <linux/netdevice.h> +#include <linux/notifier.h> #include <linux/printk.h> #include <linux/rculist.h> #include <linux/rtnetlink.h> diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index 262a78364742..9db8a310961e 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -12,7 +12,6 @@ #include <linux/compiler.h> #include <linux/kref.h> #include <linux/netdevice.h> -#include <linux/notifier.h> #include <linux/rcupdate.h> #include <linux/stddef.h> #include <linux/types.h> diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 567afaa8df99..225b747a2048 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -51,9 +51,6 @@ enum batadv_dbg_level { /** @BATADV_DBG_DAT: ARP snooping and DAT related messages */ BATADV_DBG_DAT = BIT(4), - /** @BATADV_DBG_NC: network coding related messages */ - BATADV_DBG_NC = BIT(5), - /** @BATADV_DBG_MCAST: multicast related messages */ BATADV_DBG_MCAST = BIT(6), diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 20346d7b6b69..3a35aadd8b41 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -11,7 +11,6 @@ #include <linux/build_bug.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> -#include <linux/crc32.h> #include <linux/device.h> #include <linux/errno.h> #include <linux/gfp.h> @@ -53,7 +52,6 @@ #include "mesh-interface.h" #include "multicast.h" #include "netlink.h" -#include "network-coding.h" #include "originator.h" #include "routing.h" #include "send.h" @@ -103,7 +101,6 @@ static int __init batadv_init(void) batadv_v_init(); batadv_iv_init(); - batadv_nc_init(); batadv_tp_meter_init(); batadv_event_workqueue = create_singlethread_workqueue("bat_events"); @@ -218,12 +215,6 @@ int batadv_mesh_init(struct net_device *mesh_iface) goto err_dat; } - ret = batadv_nc_mesh_init(bat_priv); - if (ret < 0) { - atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING); - goto err_nc; - } - batadv_gw_init(bat_priv); batadv_mcast_init(bat_priv); @@ -232,8 +223,6 @@ int batadv_mesh_init(struct net_device *mesh_iface) return 0; -err_nc: - batadv_dat_free(bat_priv); err_dat: batadv_bla_free(bat_priv); err_bla: @@ -264,7 +253,6 @@ void batadv_mesh_free(struct net_device *mesh_iface) batadv_gw_node_free(bat_priv); batadv_v_mesh_free(bat_priv); - batadv_nc_mesh_free(bat_priv); batadv_dat_free(bat_priv); batadv_bla_free(bat_priv); @@ -336,11 +324,6 @@ int batadv_max_header_len(void) header_len = max_t(int, header_len, sizeof(struct batadv_bcast_packet)); -#ifdef CONFIG_BATMAN_ADV_NC - header_len = max_t(int, header_len, - sizeof(struct batadv_coded_packet)); -#endif - return header_len + ETH_HLEN; } @@ -578,39 +561,6 @@ void batadv_recv_handler_unregister(u8 packet_type) } /** - * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in - * the header - * @skb: skb pointing to fragmented socket buffers - * @payload_ptr: Pointer to position inside the head buffer of the skb - * marking the start of the data to be CRC'ed - * - * payload_ptr must always point to an address in the skb head buffer and not to - * a fragment. - * - * Return: big endian crc32c of the checksummed data - */ -__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr) -{ - u32 crc = 0; - unsigned int from; - unsigned int to = skb->len; - struct skb_seq_state st; - const u8 *data; - unsigned int len; - unsigned int consumed = 0; - - from = (unsigned int)(payload_ptr - skb->data); - - skb_prepare_seq_read(skb, from, to, &st); - while ((len = skb_seq_read(consumed, &data, &st)) != 0) { - crc = crc32c(crc, data, len); - consumed += len; - } - - return htonl(crc); -} - -/** * batadv_get_vid() - extract the VLAN identifier from skb if any * @skb: the buffer containing the packet * @header_len: length of the batman header preceding the ethernet header diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 1481eb2bacee..2be1ac17acaa 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2025.3" +#define BATADV_SOURCE_VERSION "2025.4" #endif /* B.A.T.M.A.N. parameters */ @@ -121,8 +121,6 @@ #define BATADV_RESET_PROTECTION_MS 30000 #define BATADV_EXPECTED_SEQNO_RANGE 65536 -#define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */ - /** * BATADV_TP_MAX_NUM - maximum number of simultaneously active tp sessions */ @@ -250,7 +248,6 @@ batadv_recv_handler_register(u8 packet_type, int (*recv_handler)(struct sk_buff *, struct batadv_hard_iface *)); void batadv_recv_handler_unregister(u8 packet_type); -__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr); /** * batadv_compare_eth() - Compare two not u16 aligned Ethernet addresses diff --git a/net/batman-adv/mesh-interface.c b/net/batman-adv/mesh-interface.c index de2c2d9c6e4d..df7e95811ef5 100644 --- a/net/batman-adv/mesh-interface.c +++ b/net/batman-adv/mesh-interface.c @@ -37,6 +37,7 @@ #include <linux/string.h> #include <linux/types.h> #include <net/netlink.h> +#include <net/rtnetlink.h> #include <uapi/linux/batadv_packet.h> #include <uapi/linux/batman_adv.h> @@ -46,7 +47,6 @@ #include "gateway_client.h" #include "hard-interface.h" #include "multicast.h" -#include "network-coding.h" #include "send.h" #include "translation-table.h" @@ -802,8 +802,6 @@ static int batadv_meshif_init_late(struct net_device *dev) bat_priv->primary_if = NULL; - batadv_nc_init_bat_priv(bat_priv); - if (!bat_priv->algo_ops) { ret = batadv_algo_select(bat_priv, batadv_routing_algo); if (ret < 0) @@ -947,17 +945,6 @@ static const struct { { "dat_put_rx" }, { "dat_cached_reply_tx" }, #endif -#ifdef CONFIG_BATMAN_ADV_NC - { "nc_code" }, - { "nc_code_bytes" }, - { "nc_recode" }, - { "nc_recode_bytes" }, - { "nc_buffer" }, - { "nc_decode" }, - { "nc_decode_bytes" }, - { "nc_decode_failed" }, - { "nc_sniffed" }, -#endif }; static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data) diff --git a/net/batman-adv/mesh-interface.h b/net/batman-adv/mesh-interface.h index 7ba055b2bc26..53756c5a45e0 100644 --- a/net/batman-adv/mesh-interface.h +++ b/net/batman-adv/mesh-interface.h @@ -13,7 +13,6 @@ #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/types.h> -#include <net/rtnetlink.h> int batadv_skb_head_push(struct sk_buff *skb, unsigned int len); void batadv_interface_rx(struct net_device *mesh_iface, diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index beb181b3a7d8..78c651f634cd 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -44,7 +44,6 @@ #include "log.h" #include "mesh-interface.h" #include "multicast.h" -#include "network-coding.h" #include "originator.h" #include "tp_meter.h" #include "translation-table.h" @@ -144,7 +143,6 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 }, [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_MULTICAST_FANOUT] = { .type = NLA_U32 }, - [BATADV_ATTR_NETWORK_CODING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_ELP_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_THROUGHPUT_OVERRIDE] = { .type = NLA_U32 }, @@ -345,12 +343,6 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_MCAST */ -#ifdef CONFIG_BATMAN_ADV_NC - if (nla_put_u8(msg, BATADV_ATTR_NETWORK_CODING_ENABLED, - !!atomic_read(&bat_priv->network_coding))) - goto nla_put_failure; -#endif /* CONFIG_BATMAN_ADV_NC */ - if (nla_put_u32(msg, BATADV_ATTR_ORIG_INTERVAL, atomic_read(&bat_priv->orig_interval))) goto nla_put_failure; @@ -588,15 +580,6 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) } #endif /* CONFIG_BATMAN_ADV_MCAST */ -#ifdef CONFIG_BATMAN_ADV_NC - if (info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]) { - attr = info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]; - - atomic_set(&bat_priv->network_coding, !!nla_get_u8(attr)); - batadv_nc_status_update(bat_priv->mesh_iface); - } -#endif /* CONFIG_BATMAN_ADV_NC */ - if (info->attrs[BATADV_ATTR_ORIG_INTERVAL]) { u32 orig_interval; diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index fe4548b974bb..4eae9e5ff135 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -11,7 +11,6 @@ #include <linux/netlink.h> #include <linux/types.h> -#include <net/genetlink.h> void batadv_netlink_register(void); void batadv_netlink_unregister(void); diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c deleted file mode 100644 index af97d077369f..000000000000 --- a/net/batman-adv/network-coding.c +++ /dev/null @@ -1,1878 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) B.A.T.M.A.N. contributors: - * - * Martin Hundebøll, Jeppe Ledet-Pedersen - */ - -#include "network-coding.h" -#include "main.h" - -#include <linux/atomic.h> -#include <linux/bitops.h> -#include <linux/byteorder/generic.h> -#include <linux/compiler.h> -#include <linux/container_of.h> -#include <linux/errno.h> -#include <linux/etherdevice.h> -#include <linux/gfp.h> -#include <linux/if_ether.h> -#include <linux/if_packet.h> -#include <linux/init.h> -#include <linux/jhash.h> -#include <linux/jiffies.h> -#include <linux/kref.h> -#include <linux/list.h> -#include <linux/lockdep.h> -#include <linux/net.h> -#include <linux/netdevice.h> -#include <linux/printk.h> -#include <linux/random.h> -#include <linux/rculist.h> -#include <linux/rcupdate.h> -#include <linux/skbuff.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/stddef.h> -#include <linux/string.h> -#include <linux/workqueue.h> -#include <uapi/linux/batadv_packet.h> - -#include "hash.h" -#include "log.h" -#include "originator.h" -#include "routing.h" -#include "send.h" -#include "tvlv.h" - -static struct lock_class_key batadv_nc_coding_hash_lock_class_key; -static struct lock_class_key batadv_nc_decoding_hash_lock_class_key; - -static void batadv_nc_worker(struct work_struct *work); -static int batadv_nc_recv_coded_packet(struct sk_buff *skb, - struct batadv_hard_iface *recv_if); - -/** - * batadv_nc_init() - one-time initialization for network coding - * - * Return: 0 on success or negative error number in case of failure - */ -int __init batadv_nc_init(void) -{ - /* Register our packet type */ - return batadv_recv_handler_register(BATADV_CODED, - batadv_nc_recv_coded_packet); -} - -/** - * batadv_nc_start_timer() - initialise the nc periodic worker - * @bat_priv: the bat priv with all the mesh interface information - */ -static void batadv_nc_start_timer(struct batadv_priv *bat_priv) -{ - queue_delayed_work(batadv_event_workqueue, &bat_priv->nc.work, - msecs_to_jiffies(10)); -} - -/** - * batadv_nc_tvlv_container_update() - update the network coding tvlv container - * after network coding setting change - * @bat_priv: the bat priv with all the mesh interface information - */ -static void batadv_nc_tvlv_container_update(struct batadv_priv *bat_priv) -{ - char nc_mode; - - nc_mode = atomic_read(&bat_priv->network_coding); - - switch (nc_mode) { - case 0: - batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1); - break; - case 1: - batadv_tvlv_container_register(bat_priv, BATADV_TVLV_NC, 1, - NULL, 0); - break; - } -} - -/** - * batadv_nc_status_update() - update the network coding tvlv container after - * network coding setting change - * @net_dev: the mesh interface net device - */ -void batadv_nc_status_update(struct net_device *net_dev) -{ - struct batadv_priv *bat_priv = netdev_priv(net_dev); - - batadv_nc_tvlv_container_update(bat_priv); -} - -/** - * batadv_nc_tvlv_ogm_handler_v1() - process incoming nc tvlv container - * @bat_priv: the bat priv with all the mesh interface information - * @orig: the orig_node of the ogm - * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags) - * @tvlv_value: tvlv buffer containing the gateway data - * @tvlv_value_len: tvlv buffer length - */ -static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig, - u8 flags, - void *tvlv_value, u16 tvlv_value_len) -{ - if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND) - clear_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); - else - set_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities); -} - -/** - * batadv_nc_mesh_init() - initialise coding hash table and start housekeeping - * @bat_priv: the bat priv with all the mesh interface information - * - * Return: 0 on success or negative error number in case of failure - */ -int batadv_nc_mesh_init(struct batadv_priv *bat_priv) -{ - bat_priv->nc.timestamp_fwd_flush = jiffies; - bat_priv->nc.timestamp_sniffed_purge = jiffies; - - if (bat_priv->nc.coding_hash || bat_priv->nc.decoding_hash) - return 0; - - bat_priv->nc.coding_hash = batadv_hash_new(128); - if (!bat_priv->nc.coding_hash) - goto err; - - batadv_hash_set_lock_class(bat_priv->nc.coding_hash, - &batadv_nc_coding_hash_lock_class_key); - - bat_priv->nc.decoding_hash = batadv_hash_new(128); - if (!bat_priv->nc.decoding_hash) { - batadv_hash_destroy(bat_priv->nc.coding_hash); - goto err; - } - - batadv_hash_set_lock_class(bat_priv->nc.decoding_hash, - &batadv_nc_decoding_hash_lock_class_key); - - INIT_DELAYED_WORK(&bat_priv->nc.work, batadv_nc_worker); - batadv_nc_start_timer(bat_priv); - - batadv_tvlv_handler_register(bat_priv, batadv_nc_tvlv_ogm_handler_v1, - NULL, NULL, BATADV_TVLV_NC, 1, - BATADV_TVLV_HANDLER_OGM_CIFNOTFND); - batadv_nc_tvlv_container_update(bat_priv); - return 0; - -err: - return -ENOMEM; -} - -/** - * batadv_nc_init_bat_priv() - initialise the nc specific bat_priv variables - * @bat_priv: the bat priv with all the mesh interface information - */ -void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) -{ - atomic_set(&bat_priv->network_coding, 0); - bat_priv->nc.min_tq = 200; - bat_priv->nc.max_fwd_delay = 10; - bat_priv->nc.max_buffer_time = 200; -} - -/** - * batadv_nc_init_orig() - initialise the nc fields of an orig_node - * @orig_node: the orig_node which is going to be initialised - */ -void batadv_nc_init_orig(struct batadv_orig_node *orig_node) -{ - INIT_LIST_HEAD(&orig_node->in_coding_list); - INIT_LIST_HEAD(&orig_node->out_coding_list); - spin_lock_init(&orig_node->in_coding_list_lock); - spin_lock_init(&orig_node->out_coding_list_lock); -} - -/** - * batadv_nc_node_release() - release nc_node from lists and queue for free - * after rcu grace period - * @ref: kref pointer of the nc_node - */ -static void batadv_nc_node_release(struct kref *ref) -{ - struct batadv_nc_node *nc_node; - - nc_node = container_of(ref, struct batadv_nc_node, refcount); - - batadv_orig_node_put(nc_node->orig_node); - kfree_rcu(nc_node, rcu); -} - -/** - * batadv_nc_node_put() - decrement the nc_node refcounter and possibly - * release it - * @nc_node: nc_node to be free'd - */ -static void batadv_nc_node_put(struct batadv_nc_node *nc_node) -{ - if (!nc_node) - return; - - kref_put(&nc_node->refcount, batadv_nc_node_release); -} - -/** - * batadv_nc_path_release() - release nc_path from lists and queue for free - * after rcu grace period - * @ref: kref pointer of the nc_path - */ -static void batadv_nc_path_release(struct kref *ref) -{ - struct batadv_nc_path *nc_path; - - nc_path = container_of(ref, struct batadv_nc_path, refcount); - - kfree_rcu(nc_path, rcu); -} - -/** - * batadv_nc_path_put() - decrement the nc_path refcounter and possibly - * release it - * @nc_path: nc_path to be free'd - */ -static void batadv_nc_path_put(struct batadv_nc_path *nc_path) -{ - if (!nc_path) - return; - - kref_put(&nc_path->refcount, batadv_nc_path_release); -} - -/** - * batadv_nc_packet_free() - frees nc packet - * @nc_packet: the nc packet to free - * @dropped: whether the packet is freed because is dropped - */ -static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet, - bool dropped) -{ - if (dropped) - kfree_skb(nc_packet->skb); - else - consume_skb(nc_packet->skb); - - batadv_nc_path_put(nc_packet->nc_path); - kfree(nc_packet); -} - -/** - * batadv_nc_to_purge_nc_node() - checks whether an nc node has to be purged - * @bat_priv: the bat priv with all the mesh interface information - * @nc_node: the nc node to check - * - * Return: true if the entry has to be purged now, false otherwise - */ -static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv, - struct batadv_nc_node *nc_node) -{ - if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) - return true; - - return batadv_has_timed_out(nc_node->last_seen, BATADV_NC_NODE_TIMEOUT); -} - -/** - * batadv_nc_to_purge_nc_path_coding() - checks whether an nc path has timed out - * @bat_priv: the bat priv with all the mesh interface information - * @nc_path: the nc path to check - * - * Return: true if the entry has to be purged now, false otherwise - */ -static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv, - struct batadv_nc_path *nc_path) -{ - if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) - return true; - - /* purge the path when no packets has been added for 10 times the - * max_fwd_delay time - */ - return batadv_has_timed_out(nc_path->last_valid, - bat_priv->nc.max_fwd_delay * 10); -} - -/** - * batadv_nc_to_purge_nc_path_decoding() - checks whether an nc path has timed - * out - * @bat_priv: the bat priv with all the mesh interface information - * @nc_path: the nc path to check - * - * Return: true if the entry has to be purged now, false otherwise - */ -static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv, - struct batadv_nc_path *nc_path) -{ - if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) - return true; - - /* purge the path when no packets has been added for 10 times the - * max_buffer time - */ - return batadv_has_timed_out(nc_path->last_valid, - bat_priv->nc.max_buffer_time * 10); -} - -/** - * batadv_nc_purge_orig_nc_nodes() - go through list of nc nodes and purge stale - * entries - * @bat_priv: the bat priv with all the mesh interface information - * @list: list of nc nodes - * @lock: nc node list lock - * @to_purge: function in charge to decide whether an entry has to be purged or - * not. This function takes the nc node as argument and has to return - * a boolean value: true if the entry has to be deleted, false - * otherwise - */ -static void -batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv, - struct list_head *list, - spinlock_t *lock, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_node *)) -{ - struct batadv_nc_node *nc_node, *nc_node_tmp; - - /* For each nc_node in list */ - spin_lock_bh(lock); - list_for_each_entry_safe(nc_node, nc_node_tmp, list, list) { - /* if an helper function has been passed as parameter, - * ask it if the entry has to be purged or not - */ - if (to_purge && !to_purge(bat_priv, nc_node)) - continue; - - batadv_dbg(BATADV_DBG_NC, bat_priv, - "Removing nc_node %pM -> %pM\n", - nc_node->addr, nc_node->orig_node->orig); - list_del_rcu(&nc_node->list); - batadv_nc_node_put(nc_node); - } - spin_unlock_bh(lock); -} - -/** - * batadv_nc_purge_orig() - purges all nc node data attached of the given - * originator - * @bat_priv: the bat priv with all the mesh interface information - * @orig_node: orig_node with the nc node entries to be purged - * @to_purge: function in charge to decide whether an entry has to be purged or - * not. This function takes the nc node as argument and has to return - * a boolean value: true is the entry has to be deleted, false - * otherwise - */ -void batadv_nc_purge_orig(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_node *)) -{ - /* Check ingoing nc_node's of this orig_node */ - batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->in_coding_list, - &orig_node->in_coding_list_lock, - to_purge); - - /* Check outgoing nc_node's of this orig_node */ - batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->out_coding_list, - &orig_node->out_coding_list_lock, - to_purge); -} - -/** - * batadv_nc_purge_orig_hash() - traverse entire originator hash to check if - * they have timed out nc nodes - * @bat_priv: the bat priv with all the mesh interface information - */ -static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv) -{ - struct batadv_hashtable *hash = bat_priv->orig_hash; - struct hlist_head *head; - struct batadv_orig_node *orig_node; - u32 i; - - if (!hash) - return; - - /* For each orig_node */ - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(orig_node, head, hash_entry) - batadv_nc_purge_orig(bat_priv, orig_node, - batadv_nc_to_purge_nc_node); - rcu_read_unlock(); - } -} - -/** - * batadv_nc_purge_paths() - traverse all nc paths part of the hash and remove - * unused ones - * @bat_priv: the bat priv with all the mesh interface information - * @hash: hash table containing the nc paths to check - * @to_purge: function in charge to decide whether an entry has to be purged or - * not. This function takes the nc node as argument and has to return - * a boolean value: true is the entry has to be deleted, false - * otherwise - */ -static void batadv_nc_purge_paths(struct batadv_priv *bat_priv, - struct batadv_hashtable *hash, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_path *)) -{ - struct hlist_head *head; - struct hlist_node *node_tmp; - struct batadv_nc_path *nc_path; - spinlock_t *lock; /* Protects lists in hash */ - u32 i; - - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - lock = &hash->list_locks[i]; - - /* For each nc_path in this bin */ - spin_lock_bh(lock); - hlist_for_each_entry_safe(nc_path, node_tmp, head, hash_entry) { - /* if an helper function has been passed as parameter, - * ask it if the entry has to be purged or not - */ - if (to_purge && !to_purge(bat_priv, nc_path)) - continue; - - /* purging an non-empty nc_path should never happen, but - * is observed under high CPU load. Delay the purging - * until next iteration to allow the packet_list to be - * emptied first. - */ - if (!unlikely(list_empty(&nc_path->packet_list))) { - net_ratelimited_function(printk, - KERN_WARNING - "Skipping free of non-empty nc_path (%pM -> %pM)!\n", - nc_path->prev_hop, - nc_path->next_hop); - continue; - } - - /* nc_path is unused, so remove it */ - batadv_dbg(BATADV_DBG_NC, bat_priv, - "Remove nc_path %pM -> %pM\n", - nc_path->prev_hop, nc_path->next_hop); - hlist_del_rcu(&nc_path->hash_entry); - batadv_nc_path_put(nc_path); - } - spin_unlock_bh(lock); - } -} - -/** - * batadv_nc_hash_key_gen() - computes the nc_path hash key - * @key: buffer to hold the final hash key - * @src: source ethernet mac address going into the hash key - * @dst: destination ethernet mac address going into the hash key - */ -static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src, - const char *dst) -{ - memcpy(key->prev_hop, src, sizeof(key->prev_hop)); - memcpy(key->next_hop, dst, sizeof(key->next_hop)); -} - -/** - * batadv_nc_hash_choose() - compute the hash value for an nc path - * @data: data to hash - * @size: size of the hash table - * - * Return: the selected index in the hash table for the given data. - */ -static u32 batadv_nc_hash_choose(const void *data, u32 size) -{ - const struct batadv_nc_path *nc_path = data; - u32 hash = 0; - - hash = jhash(&nc_path->prev_hop, sizeof(nc_path->prev_hop), hash); - hash = jhash(&nc_path->next_hop, sizeof(nc_path->next_hop), hash); - - return hash % size; -} - -/** - * batadv_nc_hash_compare() - comparing function used in the network coding hash - * tables - * @node: node in the local table - * @data2: second object to compare the node to - * - * Return: true if the two entry are the same, false otherwise - */ -static bool batadv_nc_hash_compare(const struct hlist_node *node, - const void *data2) -{ - const struct batadv_nc_path *nc_path1, *nc_path2; - - nc_path1 = container_of(node, struct batadv_nc_path, hash_entry); - nc_path2 = data2; - - /* Return 1 if the two keys are identical */ - if (!batadv_compare_eth(nc_path1->prev_hop, nc_path2->prev_hop)) - return false; - - if (!batadv_compare_eth(nc_path1->next_hop, nc_path2->next_hop)) - return false; - - return true; -} - -/** - * batadv_nc_hash_find() - search for an existing nc path and return it - * @hash: hash table containing the nc path - * @data: search key - * - * Return: the nc_path if found, NULL otherwise. - */ -static struct batadv_nc_path * -batadv_nc_hash_find(struct batadv_hashtable *hash, - void *data) -{ - struct hlist_head *head; - struct batadv_nc_path *nc_path, *nc_path_tmp = NULL; - int index; - - if (!hash) - return NULL; - - index = batadv_nc_hash_choose(data, hash->size); - head = &hash->table[index]; - - rcu_read_lock(); - hlist_for_each_entry_rcu(nc_path, head, hash_entry) { - if (!batadv_nc_hash_compare(&nc_path->hash_entry, data)) - continue; - - if (!kref_get_unless_zero(&nc_path->refcount)) - continue; - - nc_path_tmp = nc_path; - break; - } - rcu_read_unlock(); - - return nc_path_tmp; -} - -/** - * batadv_nc_send_packet() - send non-coded packet and free nc_packet struct - * @nc_packet: the nc packet to send - */ -static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet) -{ - batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node); - nc_packet->skb = NULL; - batadv_nc_packet_free(nc_packet, false); -} - -/** - * batadv_nc_sniffed_purge() - Checks timestamp of given sniffed nc_packet. - * @bat_priv: the bat priv with all the mesh interface information - * @nc_path: the nc path the packet belongs to - * @nc_packet: the nc packet to be checked - * - * Checks whether the given sniffed (overheard) nc_packet has hit its buffering - * timeout. If so, the packet is no longer kept and the entry deleted from the - * queue. Has to be called with the appropriate locks. - * - * Return: false as soon as the entry in the fifo queue has not been timed out - * yet and true otherwise. - */ -static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv, - struct batadv_nc_path *nc_path, - struct batadv_nc_packet *nc_packet) -{ - unsigned long timeout = bat_priv->nc.max_buffer_time; - bool res = false; - - lockdep_assert_held(&nc_path->packet_list_lock); - - /* Packets are added to tail, so the remaining packets did not time - * out and we can stop processing the current queue - */ - if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE && - !batadv_has_timed_out(nc_packet->timestamp, timeout)) - goto out; - - /* purge nc packet */ - list_del(&nc_packet->list); - batadv_nc_packet_free(nc_packet, true); - - res = true; - -out: - return res; -} - -/** - * batadv_nc_fwd_flush() - Checks the timestamp of the given nc packet. - * @bat_priv: the bat priv with all the mesh interface information - * @nc_path: the nc path the packet belongs to - * @nc_packet: the nc packet to be checked - * - * Checks whether the given nc packet has hit its forward timeout. If so, the - * packet is no longer delayed, immediately sent and the entry deleted from the - * queue. Has to be called with the appropriate locks. - * - * Return: false as soon as the entry in the fifo queue has not been timed out - * yet and true otherwise. - */ -static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv, - struct batadv_nc_path *nc_path, - struct batadv_nc_packet *nc_packet) -{ - unsigned long timeout = bat_priv->nc.max_fwd_delay; - - lockdep_assert_held(&nc_path->packet_list_lock); - - /* Packets are added to tail, so the remaining packets did not time - * out and we can stop processing the current queue - */ - if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE && - !batadv_has_timed_out(nc_packet->timestamp, timeout)) - return false; - - /* Send packet */ - batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD); - batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES, - nc_packet->skb->len + ETH_HLEN); - list_del(&nc_packet->list); - batadv_nc_send_packet(nc_packet); - - return true; -} - -/** - * batadv_nc_process_nc_paths() - traverse given nc packet pool and free timed - * out nc packets - * @bat_priv: the bat priv with all the mesh interface information - * @hash: to be processed hash table - * @process_fn: Function called to process given nc packet. Should return true - * to encourage this function to proceed with the next packet. - * Otherwise the rest of the current queue is skipped. - */ -static void -batadv_nc_process_nc_paths(struct batadv_priv *bat_priv, - struct batadv_hashtable *hash, - bool (*process_fn)(struct batadv_priv *, - struct batadv_nc_path *, - struct batadv_nc_packet *)) -{ - struct hlist_head *head; - struct batadv_nc_packet *nc_packet, *nc_packet_tmp; - struct batadv_nc_path *nc_path; - bool ret; - int i; - - if (!hash) - return; - - /* Loop hash table bins */ - for (i = 0; i < hash->size; i++) { - head = &hash->table[i]; - - /* Loop coding paths */ - rcu_read_lock(); - hlist_for_each_entry_rcu(nc_path, head, hash_entry) { - /* Loop packets */ - spin_lock_bh(&nc_path->packet_list_lock); - list_for_each_entry_safe(nc_packet, nc_packet_tmp, - &nc_path->packet_list, list) { - ret = process_fn(bat_priv, nc_path, nc_packet); - if (!ret) - break; - } - spin_unlock_bh(&nc_path->packet_list_lock); - } - rcu_read_unlock(); - } -} - -/** - * batadv_nc_worker() - periodic task for housekeeping related to network - * coding - * @work: kernel work struct - */ -static void batadv_nc_worker(struct work_struct *work) -{ - struct delayed_work *delayed_work; - struct batadv_priv_nc *priv_nc; - struct batadv_priv *bat_priv; - unsigned long timeout; - - delayed_work = to_delayed_work(work); - priv_nc = container_of(delayed_work, struct batadv_priv_nc, work); - bat_priv = container_of(priv_nc, struct batadv_priv, nc); - - batadv_nc_purge_orig_hash(bat_priv); - batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, - batadv_nc_to_purge_nc_path_coding); - batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, - batadv_nc_to_purge_nc_path_decoding); - - timeout = bat_priv->nc.max_fwd_delay; - - if (batadv_has_timed_out(bat_priv->nc.timestamp_fwd_flush, timeout)) { - batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.coding_hash, - batadv_nc_fwd_flush); - bat_priv->nc.timestamp_fwd_flush = jiffies; - } - - if (batadv_has_timed_out(bat_priv->nc.timestamp_sniffed_purge, - bat_priv->nc.max_buffer_time)) { - batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.decoding_hash, - batadv_nc_sniffed_purge); - bat_priv->nc.timestamp_sniffed_purge = jiffies; - } - - /* Schedule a new check */ - batadv_nc_start_timer(bat_priv); -} - -/** - * batadv_can_nc_with_orig() - checks whether the given orig node is suitable - * for coding or not - * @bat_priv: the bat priv with all the mesh interface information - * @orig_node: neighboring orig node which may be used as nc candidate - * @ogm_packet: incoming ogm packet also used for the checks - * - * Return: true if: - * 1) The OGM must have the most recent sequence number. - * 2) The TTL must be decremented by one and only one. - * 3) The OGM must be received from the first hop from orig_node. - * 4) The TQ value of the OGM must be above bat_priv->nc.min_tq. - */ -static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_ogm_packet *ogm_packet) -{ - struct batadv_orig_ifinfo *orig_ifinfo; - u32 last_real_seqno; - u8 last_ttl; - - orig_ifinfo = batadv_orig_ifinfo_get(orig_node, BATADV_IF_DEFAULT); - if (!orig_ifinfo) - return false; - - last_ttl = orig_ifinfo->last_ttl; - last_real_seqno = orig_ifinfo->last_real_seqno; - batadv_orig_ifinfo_put(orig_ifinfo); - - if (last_real_seqno != ntohl(ogm_packet->seqno)) - return false; - if (last_ttl != ogm_packet->ttl + 1) - return false; - if (!batadv_compare_eth(ogm_packet->orig, ogm_packet->prev_sender)) - return false; - if (ogm_packet->tq < bat_priv->nc.min_tq) - return false; - - return true; -} - -/** - * batadv_nc_find_nc_node() - search for an existing nc node and return it - * @orig_node: orig node originating the ogm packet - * @orig_neigh_node: neighboring orig node from which we received the ogm packet - * (can be equal to orig_node) - * @in_coding: traverse incoming or outgoing network coding list - * - * Return: the nc_node if found, NULL otherwise. - */ -static struct batadv_nc_node * -batadv_nc_find_nc_node(struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - bool in_coding) -{ - struct batadv_nc_node *nc_node, *nc_node_out = NULL; - struct list_head *list; - - if (in_coding) - list = &orig_neigh_node->in_coding_list; - else - list = &orig_neigh_node->out_coding_list; - - /* Traverse list of nc_nodes to orig_node */ - rcu_read_lock(); - list_for_each_entry_rcu(nc_node, list, list) { - if (!batadv_compare_eth(nc_node->addr, orig_node->orig)) - continue; - - if (!kref_get_unless_zero(&nc_node->refcount)) - continue; - - /* Found a match */ - nc_node_out = nc_node; - break; - } - rcu_read_unlock(); - - return nc_node_out; -} - -/** - * batadv_nc_get_nc_node() - retrieves an nc node or creates the entry if it was - * not found - * @bat_priv: the bat priv with all the mesh interface information - * @orig_node: orig node originating the ogm packet - * @orig_neigh_node: neighboring orig node from which we received the ogm packet - * (can be equal to orig_node) - * @in_coding: traverse incoming or outgoing network coding list - * - * Return: the nc_node if found or created, NULL in case of an error. - */ -static struct batadv_nc_node * -batadv_nc_get_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - bool in_coding) -{ - struct batadv_nc_node *nc_node; - spinlock_t *lock; /* Used to lock list selected by "int in_coding" */ - struct list_head *list; - - /* Select ingoing or outgoing coding node */ - if (in_coding) { - lock = &orig_neigh_node->in_coding_list_lock; - list = &orig_neigh_node->in_coding_list; - } else { - lock = &orig_neigh_node->out_coding_list_lock; - list = &orig_neigh_node->out_coding_list; - } - - spin_lock_bh(lock); - - /* Check if nc_node is already added */ - nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding); - - /* Node found */ - if (nc_node) - goto unlock; - - nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC); - if (!nc_node) - goto unlock; - - /* Initialize nc_node */ - INIT_LIST_HEAD(&nc_node->list); - kref_init(&nc_node->refcount); - ether_addr_copy(nc_node->addr, orig_node->orig); - kref_get(&orig_neigh_node->refcount); - nc_node->orig_node = orig_neigh_node; - - batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n", - nc_node->addr, nc_node->orig_node->orig); - - /* Add nc_node to orig_node */ - kref_get(&nc_node->refcount); - list_add_tail_rcu(&nc_node->list, list); - -unlock: - spin_unlock_bh(lock); - - return nc_node; -} - -/** - * batadv_nc_update_nc_node() - updates stored incoming and outgoing nc node - * structs (best called on incoming OGMs) - * @bat_priv: the bat priv with all the mesh interface information - * @orig_node: orig node originating the ogm packet - * @orig_neigh_node: neighboring orig node from which we received the ogm packet - * (can be equal to orig_node) - * @ogm_packet: incoming ogm packet - * @is_single_hop_neigh: orig_node is a single hop neighbor - */ -void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - struct batadv_ogm_packet *ogm_packet, - int is_single_hop_neigh) -{ - struct batadv_nc_node *in_nc_node = NULL; - struct batadv_nc_node *out_nc_node = NULL; - - /* Check if network coding is enabled */ - if (!atomic_read(&bat_priv->network_coding)) - goto out; - - /* check if orig node is network coding enabled */ - if (!test_bit(BATADV_ORIG_CAPA_HAS_NC, &orig_node->capabilities)) - goto out; - - /* accept ogms from 'good' neighbors and single hop neighbors */ - if (!batadv_can_nc_with_orig(bat_priv, orig_node, ogm_packet) && - !is_single_hop_neigh) - goto out; - - /* Add orig_node as in_nc_node on hop */ - in_nc_node = batadv_nc_get_nc_node(bat_priv, orig_node, - orig_neigh_node, true); - if (!in_nc_node) - goto out; - - in_nc_node->last_seen = jiffies; - - /* Add hop as out_nc_node on orig_node */ - out_nc_node = batadv_nc_get_nc_node(bat_priv, orig_neigh_node, - orig_node, false); - if (!out_nc_node) - goto out; - - out_nc_node->last_seen = jiffies; - -out: - batadv_nc_node_put(in_nc_node); - batadv_nc_node_put(out_nc_node); -} - -/** - * batadv_nc_get_path() - get existing nc_path or allocate a new one - * @bat_priv: the bat priv with all the mesh interface information - * @hash: hash table containing the nc path - * @src: ethernet source address - first half of the nc path search key - * @dst: ethernet destination address - second half of the nc path search key - * - * Return: pointer to nc_path if the path was found or created, returns NULL - * on error. - */ -static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv, - struct batadv_hashtable *hash, - u8 *src, - u8 *dst) -{ - int hash_added; - struct batadv_nc_path *nc_path, nc_path_key; - - batadv_nc_hash_key_gen(&nc_path_key, src, dst); - - /* Search for existing nc_path */ - nc_path = batadv_nc_hash_find(hash, (void *)&nc_path_key); - - if (nc_path) { - /* Set timestamp to delay removal of nc_path */ - nc_path->last_valid = jiffies; - return nc_path; - } - - /* No existing nc_path was found; create a new */ - nc_path = kzalloc(sizeof(*nc_path), GFP_ATOMIC); - - if (!nc_path) - return NULL; - - /* Initialize nc_path */ - INIT_LIST_HEAD(&nc_path->packet_list); - spin_lock_init(&nc_path->packet_list_lock); - kref_init(&nc_path->refcount); - nc_path->last_valid = jiffies; - ether_addr_copy(nc_path->next_hop, dst); - ether_addr_copy(nc_path->prev_hop, src); - - batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_path %pM -> %pM\n", - nc_path->prev_hop, - nc_path->next_hop); - - /* Add nc_path to hash table */ - kref_get(&nc_path->refcount); - hash_added = batadv_hash_add(hash, batadv_nc_hash_compare, - batadv_nc_hash_choose, &nc_path_key, - &nc_path->hash_entry); - - if (hash_added < 0) { - kfree(nc_path); - return NULL; - } - - return nc_path; -} - -/** - * batadv_nc_random_weight_tq() - scale the receivers TQ-value to avoid unfair - * selection of a receiver with slightly lower TQ than the other - * @tq: to be weighted tq value - * - * Return: scaled tq value - */ -static u8 batadv_nc_random_weight_tq(u8 tq) -{ - /* randomize the estimated packet loss (max TQ - estimated TQ) */ - u8 rand_tq = get_random_u32_below(BATADV_TQ_MAX_VALUE + 1 - tq); - - /* convert to (randomized) estimated tq again */ - return BATADV_TQ_MAX_VALUE - rand_tq; -} - -/** - * batadv_nc_memxor() - XOR destination with source - * @dst: byte array to XOR into - * @src: byte array to XOR from - * @len: length of destination array - */ -static void batadv_nc_memxor(char *dst, const char *src, unsigned int len) -{ - unsigned int i; - - for (i = 0; i < len; ++i) - dst[i] ^= src[i]; -} - -/** - * batadv_nc_code_packets() - code a received unicast_packet with an nc packet - * into a coded_packet and send it - * @bat_priv: the bat priv with all the mesh interface information - * @skb: data skb to forward - * @ethhdr: pointer to the ethernet header inside the skb - * @nc_packet: structure containing the packet to the skb can be coded with - * @neigh_node: next hop to forward packet to - * - * Return: true if both packets are consumed, false otherwise. - */ -static bool batadv_nc_code_packets(struct batadv_priv *bat_priv, - struct sk_buff *skb, - struct ethhdr *ethhdr, - struct batadv_nc_packet *nc_packet, - struct batadv_neigh_node *neigh_node) -{ - u8 tq_weighted_neigh, tq_weighted_coding, tq_tmp; - struct sk_buff *skb_dest, *skb_src; - struct batadv_unicast_packet *packet1; - struct batadv_unicast_packet *packet2; - struct batadv_coded_packet *coded_packet; - struct batadv_neigh_node *neigh_tmp, *router_neigh, *first_dest; - struct batadv_neigh_node *router_coding = NULL, *second_dest; - struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL; - struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL; - u8 *first_source, *second_source; - __be32 packet_id1, packet_id2; - size_t count; - bool res = false; - int coding_len; - int unicast_size = sizeof(*packet1); - int coded_size = sizeof(*coded_packet); - int header_add = coded_size - unicast_size; - - /* TODO: do we need to consider the outgoing interface for - * coded packets? - */ - router_neigh = batadv_orig_router_get(neigh_node->orig_node, - BATADV_IF_DEFAULT); - if (!router_neigh) - goto out; - - router_neigh_ifinfo = batadv_neigh_ifinfo_get(router_neigh, - BATADV_IF_DEFAULT); - if (!router_neigh_ifinfo) - goto out; - - neigh_tmp = nc_packet->neigh_node; - router_coding = batadv_orig_router_get(neigh_tmp->orig_node, - BATADV_IF_DEFAULT); - if (!router_coding) - goto out; - - router_coding_ifinfo = batadv_neigh_ifinfo_get(router_coding, - BATADV_IF_DEFAULT); - if (!router_coding_ifinfo) - goto out; - - tq_tmp = router_neigh_ifinfo->bat_iv.tq_avg; - tq_weighted_neigh = batadv_nc_random_weight_tq(tq_tmp); - tq_tmp = router_coding_ifinfo->bat_iv.tq_avg; - tq_weighted_coding = batadv_nc_random_weight_tq(tq_tmp); - - /* Select one destination for the MAC-header dst-field based on - * weighted TQ-values. - */ - if (tq_weighted_neigh >= tq_weighted_coding) { - /* Destination from nc_packet is selected for MAC-header */ - first_dest = nc_packet->neigh_node; - first_source = nc_packet->nc_path->prev_hop; - second_dest = neigh_node; - second_source = ethhdr->h_source; - packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data; - packet2 = (struct batadv_unicast_packet *)skb->data; - packet_id1 = nc_packet->packet_id; - packet_id2 = batadv_skb_crc32(skb, - skb->data + sizeof(*packet2)); - } else { - /* Destination for skb is selected for MAC-header */ - first_dest = neigh_node; - first_source = ethhdr->h_source; - second_dest = nc_packet->neigh_node; - second_source = nc_packet->nc_path->prev_hop; - packet1 = (struct batadv_unicast_packet *)skb->data; - packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data; - packet_id1 = batadv_skb_crc32(skb, - skb->data + sizeof(*packet1)); - packet_id2 = nc_packet->packet_id; - } - - /* Instead of zero padding the smallest data buffer, we - * code into the largest. - */ - if (skb->len <= nc_packet->skb->len) { - skb_dest = nc_packet->skb; - skb_src = skb; - } else { - skb_dest = skb; - skb_src = nc_packet->skb; - } - - /* coding_len is used when decoding the packet shorter packet */ - coding_len = skb_src->len - unicast_size; - - if (skb_linearize(skb_dest) < 0 || skb_linearize(skb_src) < 0) - goto out; - - skb_push(skb_dest, header_add); - - coded_packet = (struct batadv_coded_packet *)skb_dest->data; - skb_reset_mac_header(skb_dest); - - coded_packet->packet_type = BATADV_CODED; - coded_packet->version = BATADV_COMPAT_VERSION; - coded_packet->ttl = packet1->ttl; - - /* Info about first unicast packet */ - ether_addr_copy(coded_packet->first_source, first_source); - ether_addr_copy(coded_packet->first_orig_dest, packet1->dest); - coded_packet->first_crc = packet_id1; - coded_packet->first_ttvn = packet1->ttvn; - - /* Info about second unicast packet */ - ether_addr_copy(coded_packet->second_dest, second_dest->addr); - ether_addr_copy(coded_packet->second_source, second_source); - ether_addr_copy(coded_packet->second_orig_dest, packet2->dest); - coded_packet->second_crc = packet_id2; - coded_packet->second_ttl = packet2->ttl; - coded_packet->second_ttvn = packet2->ttvn; - coded_packet->coded_len = htons(coding_len); - - /* This is where the magic happens: Code skb_src into skb_dest */ - batadv_nc_memxor(skb_dest->data + coded_size, - skb_src->data + unicast_size, coding_len); - - /* Update counters accordingly */ - if (BATADV_SKB_CB(skb_src)->decoded && - BATADV_SKB_CB(skb_dest)->decoded) { - /* Both packets are recoded */ - count = skb_src->len + ETH_HLEN; - count += skb_dest->len + ETH_HLEN; - batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE, 2); - batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, count); - } else if (!BATADV_SKB_CB(skb_src)->decoded && - !BATADV_SKB_CB(skb_dest)->decoded) { - /* Both packets are newly coded */ - count = skb_src->len + ETH_HLEN; - count += skb_dest->len + ETH_HLEN; - batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE, 2); - batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, count); - } else if (BATADV_SKB_CB(skb_src)->decoded && - !BATADV_SKB_CB(skb_dest)->decoded) { - /* skb_src recoded and skb_dest is newly coded */ - batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, - skb_src->len + ETH_HLEN); - batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, - skb_dest->len + ETH_HLEN); - } else if (!BATADV_SKB_CB(skb_src)->decoded && - BATADV_SKB_CB(skb_dest)->decoded) { - /* skb_src is newly coded and skb_dest is recoded */ - batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, - skb_src->len + ETH_HLEN); - batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, - skb_dest->len + ETH_HLEN); - } - - /* skb_src is now coded into skb_dest, so free it */ - consume_skb(skb_src); - - /* avoid duplicate free of skb from nc_packet */ - nc_packet->skb = NULL; - batadv_nc_packet_free(nc_packet, false); - - /* Send the coded packet and return true */ - batadv_send_unicast_skb(skb_dest, first_dest); - res = true; -out: - batadv_neigh_node_put(router_neigh); - batadv_neigh_node_put(router_coding); - batadv_neigh_ifinfo_put(router_neigh_ifinfo); - batadv_neigh_ifinfo_put(router_coding_ifinfo); - return res; -} - -/** - * batadv_nc_skb_coding_possible() - true if a decoded skb is available at dst. - * @skb: data skb to forward - * @dst: destination mac address of the other skb to code with - * @src: source mac address of skb - * - * Whenever we network code a packet we have to check whether we received it in - * a network coded form. If so, we may not be able to use it for coding because - * some neighbors may also have received (overheard) the packet in the network - * coded form without being able to decode it. It is hard to know which of the - * neighboring nodes was able to decode the packet, therefore we can only - * re-code the packet if the source of the previous encoded packet is involved. - * Since the source encoded the packet we can be certain it has all necessary - * decode information. - * - * Return: true if coding of a decoded packet is allowed. - */ -static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src) -{ - if (BATADV_SKB_CB(skb)->decoded && !batadv_compare_eth(dst, src)) - return false; - return true; -} - -/** - * batadv_nc_path_search() - Find the coding path matching in_nc_node and - * out_nc_node to retrieve a buffered packet that can be used for coding. - * @bat_priv: the bat priv with all the mesh interface information - * @in_nc_node: pointer to skb next hop's neighbor nc node - * @out_nc_node: pointer to skb source's neighbor nc node - * @skb: data skb to forward - * @eth_dst: next hop mac address of skb - * - * Return: true if coding of a decoded skb is allowed. - */ -static struct batadv_nc_packet * -batadv_nc_path_search(struct batadv_priv *bat_priv, - struct batadv_nc_node *in_nc_node, - struct batadv_nc_node *out_nc_node, - struct sk_buff *skb, - u8 *eth_dst) -{ - struct batadv_nc_path *nc_path, nc_path_key; - struct batadv_nc_packet *nc_packet_out = NULL; - struct batadv_nc_packet *nc_packet, *nc_packet_tmp; - struct batadv_hashtable *hash = bat_priv->nc.coding_hash; - int idx; - - if (!hash) - return NULL; - - /* Create almost path key */ - batadv_nc_hash_key_gen(&nc_path_key, in_nc_node->addr, - out_nc_node->addr); - idx = batadv_nc_hash_choose(&nc_path_key, hash->size); - - /* Check for coding opportunities in this nc_path */ - rcu_read_lock(); - hlist_for_each_entry_rcu(nc_path, &hash->table[idx], hash_entry) { - if (!batadv_compare_eth(nc_path->prev_hop, in_nc_node->addr)) - continue; - - if (!batadv_compare_eth(nc_path->next_hop, out_nc_node->addr)) - continue; - - spin_lock_bh(&nc_path->packet_list_lock); - if (list_empty(&nc_path->packet_list)) { - spin_unlock_bh(&nc_path->packet_list_lock); - continue; - } - - list_for_each_entry_safe(nc_packet, nc_packet_tmp, - &nc_path->packet_list, list) { - if (!batadv_nc_skb_coding_possible(nc_packet->skb, - eth_dst, - in_nc_node->addr)) - continue; - - /* Coding opportunity is found! */ - list_del(&nc_packet->list); - nc_packet_out = nc_packet; - break; - } - - spin_unlock_bh(&nc_path->packet_list_lock); - break; - } - rcu_read_unlock(); - - return nc_packet_out; -} - -/** - * batadv_nc_skb_src_search() - Loops through the list of neighboring nodes of - * the skb's sender (may be equal to the originator). - * @bat_priv: the bat priv with all the mesh interface information - * @skb: data skb to forward - * @eth_dst: next hop mac address of skb - * @eth_src: source mac address of skb - * @in_nc_node: pointer to skb next hop's neighbor nc node - * - * Return: an nc packet if a suitable coding packet was found, NULL otherwise. - */ -static struct batadv_nc_packet * -batadv_nc_skb_src_search(struct batadv_priv *bat_priv, - struct sk_buff *skb, - u8 *eth_dst, - u8 *eth_src, - struct batadv_nc_node *in_nc_node) -{ - struct batadv_orig_node *orig_node; - struct batadv_nc_node *out_nc_node; - struct batadv_nc_packet *nc_packet = NULL; - - orig_node = batadv_orig_hash_find(bat_priv, eth_src); - if (!orig_node) - return NULL; - - rcu_read_lock(); - list_for_each_entry_rcu(out_nc_node, - &orig_node->out_coding_list, list) { - /* Check if the skb is decoded and if recoding is possible */ - if (!batadv_nc_skb_coding_possible(skb, - out_nc_node->addr, eth_src)) - continue; - - /* Search for an opportunity in this nc_path */ - nc_packet = batadv_nc_path_search(bat_priv, in_nc_node, - out_nc_node, skb, eth_dst); - if (nc_packet) - break; - } - rcu_read_unlock(); - - batadv_orig_node_put(orig_node); - return nc_packet; -} - -/** - * batadv_nc_skb_store_before_coding() - set the ethernet src and dst of the - * unicast skb before it is stored for use in later decoding - * @bat_priv: the bat priv with all the mesh interface information - * @skb: data skb to store - * @eth_dst_new: new destination mac address of skb - */ -static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv, - struct sk_buff *skb, - u8 *eth_dst_new) -{ - struct ethhdr *ethhdr; - - /* Copy skb header to change the mac header */ - skb = pskb_copy_for_clone(skb, GFP_ATOMIC); - if (!skb) - return; - - /* Set the mac header as if we actually sent the packet uncoded */ - ethhdr = eth_hdr(skb); - ether_addr_copy(ethhdr->h_source, ethhdr->h_dest); - ether_addr_copy(ethhdr->h_dest, eth_dst_new); - - /* Set data pointer to MAC header to mimic packets from our tx path */ - skb_push(skb, ETH_HLEN); - - /* Add the packet to the decoding packet pool */ - batadv_nc_skb_store_for_decoding(bat_priv, skb); - - /* batadv_nc_skb_store_for_decoding() clones the skb, so we must free - * our ref - */ - consume_skb(skb); -} - -/** - * batadv_nc_skb_dst_search() - Loops through list of neighboring nodes to dst. - * @skb: data skb to forward - * @neigh_node: next hop to forward packet to - * @ethhdr: pointer to the ethernet header inside the skb - * - * Loops through the list of neighboring nodes the next hop has a good - * connection to (receives OGMs with a sufficient quality). We need to find a - * neighbor of our next hop that potentially sent a packet which our next hop - * also received (overheard) and has stored for later decoding. - * - * Return: true if the skb was consumed (encoded packet sent) or false otherwise - */ -static bool batadv_nc_skb_dst_search(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node, - struct ethhdr *ethhdr) -{ - struct net_device *netdev = neigh_node->if_incoming->mesh_iface; - struct batadv_priv *bat_priv = netdev_priv(netdev); - struct batadv_orig_node *orig_node = neigh_node->orig_node; - struct batadv_nc_node *nc_node; - struct batadv_nc_packet *nc_packet = NULL; - - rcu_read_lock(); - list_for_each_entry_rcu(nc_node, &orig_node->in_coding_list, list) { - /* Search for coding opportunity with this in_nc_node */ - nc_packet = batadv_nc_skb_src_search(bat_priv, skb, - neigh_node->addr, - ethhdr->h_source, nc_node); - - /* Opportunity was found, so stop searching */ - if (nc_packet) - break; - } - rcu_read_unlock(); - - if (!nc_packet) - return false; - - /* Save packets for later decoding */ - batadv_nc_skb_store_before_coding(bat_priv, skb, - neigh_node->addr); - batadv_nc_skb_store_before_coding(bat_priv, nc_packet->skb, - nc_packet->neigh_node->addr); - - /* Code and send packets */ - if (batadv_nc_code_packets(bat_priv, skb, ethhdr, nc_packet, - neigh_node)) - return true; - - /* out of mem ? Coding failed - we have to free the buffered packet - * to avoid memleaks. The skb passed as argument will be dealt with - * by the calling function. - */ - batadv_nc_send_packet(nc_packet); - return false; -} - -/** - * batadv_nc_skb_add_to_path() - buffer skb for later encoding / decoding - * @skb: skb to add to path - * @nc_path: path to add skb to - * @neigh_node: next hop to forward packet to - * @packet_id: checksum to identify packet - * - * Return: true if the packet was buffered or false in case of an error. - */ -static bool batadv_nc_skb_add_to_path(struct sk_buff *skb, - struct batadv_nc_path *nc_path, - struct batadv_neigh_node *neigh_node, - __be32 packet_id) -{ - struct batadv_nc_packet *nc_packet; - - nc_packet = kzalloc(sizeof(*nc_packet), GFP_ATOMIC); - if (!nc_packet) - return false; - - /* Initialize nc_packet */ - nc_packet->timestamp = jiffies; - nc_packet->packet_id = packet_id; - nc_packet->skb = skb; - nc_packet->neigh_node = neigh_node; - nc_packet->nc_path = nc_path; - - /* Add coding packet to list */ - spin_lock_bh(&nc_path->packet_list_lock); - list_add_tail(&nc_packet->list, &nc_path->packet_list); - spin_unlock_bh(&nc_path->packet_list_lock); - - return true; -} - -/** - * batadv_nc_skb_forward() - try to code a packet or add it to the coding packet - * buffer - * @skb: data skb to forward - * @neigh_node: next hop to forward packet to - * - * Return: true if the skb was consumed (encoded packet sent) or false otherwise - */ -bool batadv_nc_skb_forward(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node) -{ - const struct net_device *netdev = neigh_node->if_incoming->mesh_iface; - struct batadv_priv *bat_priv = netdev_priv(netdev); - struct batadv_unicast_packet *packet; - struct batadv_nc_path *nc_path; - struct ethhdr *ethhdr = eth_hdr(skb); - __be32 packet_id; - u8 *payload; - - /* Check if network coding is enabled */ - if (!atomic_read(&bat_priv->network_coding)) - goto out; - - /* We only handle unicast packets */ - payload = skb_network_header(skb); - packet = (struct batadv_unicast_packet *)payload; - if (packet->packet_type != BATADV_UNICAST) - goto out; - - /* Try to find a coding opportunity and send the skb if one is found */ - if (batadv_nc_skb_dst_search(skb, neigh_node, ethhdr)) - return true; - - /* Find or create a nc_path for this src-dst pair */ - nc_path = batadv_nc_get_path(bat_priv, - bat_priv->nc.coding_hash, - ethhdr->h_source, - neigh_node->addr); - - if (!nc_path) - goto out; - - /* Add skb to nc_path */ - packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet)); - if (!batadv_nc_skb_add_to_path(skb, nc_path, neigh_node, packet_id)) - goto free_nc_path; - - /* Packet is consumed */ - return true; - -free_nc_path: - batadv_nc_path_put(nc_path); -out: - /* Packet is not consumed */ - return false; -} - -/** - * batadv_nc_skb_store_for_decoding() - save a clone of the skb which can be - * used when decoding coded packets - * @bat_priv: the bat priv with all the mesh interface information - * @skb: data skb to store - */ -void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, - struct sk_buff *skb) -{ - struct batadv_unicast_packet *packet; - struct batadv_nc_path *nc_path; - struct ethhdr *ethhdr = eth_hdr(skb); - __be32 packet_id; - u8 *payload; - - /* Check if network coding is enabled */ - if (!atomic_read(&bat_priv->network_coding)) - goto out; - - /* Check for supported packet type */ - payload = skb_network_header(skb); - packet = (struct batadv_unicast_packet *)payload; - if (packet->packet_type != BATADV_UNICAST) - goto out; - - /* Find existing nc_path or create a new */ - nc_path = batadv_nc_get_path(bat_priv, - bat_priv->nc.decoding_hash, - ethhdr->h_source, - ethhdr->h_dest); - - if (!nc_path) - goto out; - - /* Clone skb and adjust skb->data to point at batman header */ - skb = skb_clone(skb, GFP_ATOMIC); - if (unlikely(!skb)) - goto free_nc_path; - - if (unlikely(!pskb_may_pull(skb, ETH_HLEN))) - goto free_skb; - - if (unlikely(!skb_pull_rcsum(skb, ETH_HLEN))) - goto free_skb; - - /* Add skb to nc_path */ - packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet)); - if (!batadv_nc_skb_add_to_path(skb, nc_path, NULL, packet_id)) - goto free_skb; - - batadv_inc_counter(bat_priv, BATADV_CNT_NC_BUFFER); - return; - -free_skb: - kfree_skb(skb); -free_nc_path: - batadv_nc_path_put(nc_path); -out: - return; -} - -/** - * batadv_nc_skb_store_sniffed_unicast() - check if a received unicast packet - * should be saved in the decoding buffer and, if so, store it there - * @bat_priv: the bat priv with all the mesh interface information - * @skb: unicast skb to store - */ -void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, - struct sk_buff *skb) -{ - struct ethhdr *ethhdr = eth_hdr(skb); - - if (batadv_is_my_mac(bat_priv, ethhdr->h_dest)) - return; - - /* Set data pointer to MAC header to mimic packets from our tx path */ - skb_push(skb, ETH_HLEN); - - batadv_nc_skb_store_for_decoding(bat_priv, skb); -} - -/** - * batadv_nc_skb_decode_packet() - decode given skb using the decode data stored - * in nc_packet - * @bat_priv: the bat priv with all the mesh interface information - * @skb: unicast skb to decode - * @nc_packet: decode data needed to decode the skb - * - * Return: pointer to decoded unicast packet if the packet was decoded or NULL - * in case of an error. - */ -static struct batadv_unicast_packet * -batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb, - struct batadv_nc_packet *nc_packet) -{ - const int h_size = sizeof(struct batadv_unicast_packet); - const int h_diff = sizeof(struct batadv_coded_packet) - h_size; - struct batadv_unicast_packet *unicast_packet; - struct batadv_coded_packet coded_packet_tmp; - struct ethhdr *ethhdr, ethhdr_tmp; - u8 *orig_dest, ttl, ttvn; - unsigned int coding_len; - int err; - - /* Save headers temporarily */ - memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp)); - memcpy(ðhdr_tmp, skb_mac_header(skb), sizeof(ethhdr_tmp)); - - if (skb_cow(skb, 0) < 0) - return NULL; - - if (unlikely(!skb_pull_rcsum(skb, h_diff))) - return NULL; - - /* Data points to batman header, so set mac header 14 bytes before - * and network to data - */ - skb_set_mac_header(skb, -ETH_HLEN); - skb_reset_network_header(skb); - - /* Reconstruct original mac header */ - ethhdr = eth_hdr(skb); - *ethhdr = ethhdr_tmp; - - /* Select the correct unicast header information based on the location - * of our mac address in the coded_packet header - */ - if (batadv_is_my_mac(bat_priv, coded_packet_tmp.second_dest)) { - /* If we are the second destination the packet was overheard, - * so the Ethernet address must be copied to h_dest and - * pkt_type changed from PACKET_OTHERHOST to PACKET_HOST - */ - ether_addr_copy(ethhdr->h_dest, coded_packet_tmp.second_dest); - skb->pkt_type = PACKET_HOST; - - orig_dest = coded_packet_tmp.second_orig_dest; - ttl = coded_packet_tmp.second_ttl; - ttvn = coded_packet_tmp.second_ttvn; - } else { - orig_dest = coded_packet_tmp.first_orig_dest; - ttl = coded_packet_tmp.ttl; - ttvn = coded_packet_tmp.first_ttvn; - } - - coding_len = ntohs(coded_packet_tmp.coded_len); - - /* ensure dst buffer is large enough (payload only) */ - if (coding_len + h_size > skb->len) - return NULL; - - /* ensure src buffer is large enough (payload only) */ - if (coding_len + h_size > nc_packet->skb->len) - return NULL; - - /* Here the magic is reversed: - * extract the missing packet from the received coded packet - */ - batadv_nc_memxor(skb->data + h_size, - nc_packet->skb->data + h_size, - coding_len); - - /* Resize decoded skb if decoded with larger packet */ - if (nc_packet->skb->len > coding_len + h_size) { - err = pskb_trim_rcsum(skb, coding_len + h_size); - if (err) - return NULL; - } - - /* Create decoded unicast packet */ - unicast_packet = (struct batadv_unicast_packet *)skb->data; - unicast_packet->packet_type = BATADV_UNICAST; - unicast_packet->version = BATADV_COMPAT_VERSION; - unicast_packet->ttl = ttl; - ether_addr_copy(unicast_packet->dest, orig_dest); - unicast_packet->ttvn = ttvn; - - batadv_nc_packet_free(nc_packet, false); - return unicast_packet; -} - -/** - * batadv_nc_find_decoding_packet() - search through buffered decoding data to - * find the data needed to decode the coded packet - * @bat_priv: the bat priv with all the mesh interface information - * @ethhdr: pointer to the ethernet header inside the coded packet - * @coded: coded packet we try to find decode data for - * - * Return: pointer to nc packet if the needed data was found or NULL otherwise. - */ -static struct batadv_nc_packet * -batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv, - struct ethhdr *ethhdr, - struct batadv_coded_packet *coded) -{ - struct batadv_hashtable *hash = bat_priv->nc.decoding_hash; - struct batadv_nc_packet *tmp_nc_packet, *nc_packet = NULL; - struct batadv_nc_path *nc_path, nc_path_key; - u8 *dest, *source; - __be32 packet_id; - int index; - - if (!hash) - return NULL; - - /* Select the correct packet id based on the location of our mac-addr */ - dest = ethhdr->h_source; - if (!batadv_is_my_mac(bat_priv, coded->second_dest)) { - source = coded->second_source; - packet_id = coded->second_crc; - } else { - source = coded->first_source; - packet_id = coded->first_crc; - } - - batadv_nc_hash_key_gen(&nc_path_key, source, dest); - index = batadv_nc_hash_choose(&nc_path_key, hash->size); - - /* Search for matching coding path */ - rcu_read_lock(); - hlist_for_each_entry_rcu(nc_path, &hash->table[index], hash_entry) { - /* Find matching nc_packet */ - spin_lock_bh(&nc_path->packet_list_lock); - list_for_each_entry(tmp_nc_packet, - &nc_path->packet_list, list) { - if (packet_id == tmp_nc_packet->packet_id) { - list_del(&tmp_nc_packet->list); - - nc_packet = tmp_nc_packet; - break; - } - } - spin_unlock_bh(&nc_path->packet_list_lock); - - if (nc_packet) - break; - } - rcu_read_unlock(); - - if (!nc_packet) - batadv_dbg(BATADV_DBG_NC, bat_priv, - "No decoding packet found for %u\n", packet_id); - - return nc_packet; -} - -/** - * batadv_nc_recv_coded_packet() - try to decode coded packet and enqueue the - * resulting unicast packet - * @skb: incoming coded packet - * @recv_if: pointer to interface this packet was received on - * - * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP - * otherwise. - */ -static int batadv_nc_recv_coded_packet(struct sk_buff *skb, - struct batadv_hard_iface *recv_if) -{ - struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface); - struct batadv_unicast_packet *unicast_packet; - struct batadv_coded_packet *coded_packet; - struct batadv_nc_packet *nc_packet; - struct ethhdr *ethhdr; - int hdr_size = sizeof(*coded_packet); - - /* Check if network coding is enabled */ - if (!atomic_read(&bat_priv->network_coding)) - goto free_skb; - - /* Make sure we can access (and remove) header */ - if (unlikely(!pskb_may_pull(skb, hdr_size))) - goto free_skb; - - coded_packet = (struct batadv_coded_packet *)skb->data; - ethhdr = eth_hdr(skb); - - /* Verify frame is destined for us */ - if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) && - !batadv_is_my_mac(bat_priv, coded_packet->second_dest)) - goto free_skb; - - /* Update stat counter */ - if (batadv_is_my_mac(bat_priv, coded_packet->second_dest)) - batadv_inc_counter(bat_priv, BATADV_CNT_NC_SNIFFED); - - nc_packet = batadv_nc_find_decoding_packet(bat_priv, ethhdr, - coded_packet); - if (!nc_packet) { - batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED); - goto free_skb; - } - - /* Make skb's linear, because decoding accesses the entire buffer */ - if (skb_linearize(skb) < 0) - goto free_nc_packet; - - if (skb_linearize(nc_packet->skb) < 0) - goto free_nc_packet; - - /* Decode the packet */ - unicast_packet = batadv_nc_skb_decode_packet(bat_priv, skb, nc_packet); - if (!unicast_packet) { - batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED); - goto free_nc_packet; - } - - /* Mark packet as decoded to do correct recoding when forwarding */ - BATADV_SKB_CB(skb)->decoded = true; - batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE); - batadv_add_counter(bat_priv, BATADV_CNT_NC_DECODE_BYTES, - skb->len + ETH_HLEN); - return batadv_recv_unicast_packet(skb, recv_if); - -free_nc_packet: - batadv_nc_packet_free(nc_packet, true); -free_skb: - kfree_skb(skb); - - return NET_RX_DROP; -} - -/** - * batadv_nc_mesh_free() - clean up network coding memory - * @bat_priv: the bat priv with all the mesh interface information - */ -void batadv_nc_mesh_free(struct batadv_priv *bat_priv) -{ - batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1); - batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_NC, 1); - cancel_delayed_work_sync(&bat_priv->nc.work); - - batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, NULL); - batadv_hash_destroy(bat_priv->nc.coding_hash); - batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, NULL); - batadv_hash_destroy(bat_priv->nc.decoding_hash); -} diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h deleted file mode 100644 index 368cc3130e4c..000000000000 --- a/net/batman-adv/network-coding.h +++ /dev/null @@ -1,106 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) B.A.T.M.A.N. contributors: - * - * Martin Hundebøll, Jeppe Ledet-Pedersen - */ - -#ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_ -#define _NET_BATMAN_ADV_NETWORK_CODING_H_ - -#include "main.h" - -#include <linux/netdevice.h> -#include <linux/skbuff.h> -#include <linux/types.h> -#include <uapi/linux/batadv_packet.h> - -#ifdef CONFIG_BATMAN_ADV_NC - -void batadv_nc_status_update(struct net_device *net_dev); -int batadv_nc_init(void); -int batadv_nc_mesh_init(struct batadv_priv *bat_priv); -void batadv_nc_mesh_free(struct batadv_priv *bat_priv); -void batadv_nc_update_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - struct batadv_ogm_packet *ogm_packet, - int is_single_hop_neigh); -void batadv_nc_purge_orig(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_node *)); -void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv); -void batadv_nc_init_orig(struct batadv_orig_node *orig_node); -bool batadv_nc_skb_forward(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node); -void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, - struct sk_buff *skb); -void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, - struct sk_buff *skb); - -#else /* ifdef CONFIG_BATMAN_ADV_NC */ - -static inline void batadv_nc_status_update(struct net_device *net_dev) -{ -} - -static inline int batadv_nc_init(void) -{ - return 0; -} - -static inline int batadv_nc_mesh_init(struct batadv_priv *bat_priv) -{ - return 0; -} - -static inline void batadv_nc_mesh_free(struct batadv_priv *bat_priv) -{ -} - -static inline void -batadv_nc_update_nc_node(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - struct batadv_orig_node *orig_neigh_node, - struct batadv_ogm_packet *ogm_packet, - int is_single_hop_neigh) -{ -} - -static inline void -batadv_nc_purge_orig(struct batadv_priv *bat_priv, - struct batadv_orig_node *orig_node, - bool (*to_purge)(struct batadv_priv *, - struct batadv_nc_node *)) -{ -} - -static inline void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv) -{ -} - -static inline void batadv_nc_init_orig(struct batadv_orig_node *orig_node) -{ -} - -static inline bool batadv_nc_skb_forward(struct sk_buff *skb, - struct batadv_neigh_node *neigh_node) -{ - return false; -} - -static inline void -batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv, - struct sk_buff *skb) -{ -} - -static inline void -batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv, - struct sk_buff *skb) -{ -} - -#endif /* ifdef CONFIG_BATMAN_ADV_NC */ - -#endif /* _NET_BATMAN_ADV_NETWORK_CODING_H_ */ diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index a464ff96b929..c84420cb410d 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -37,7 +37,6 @@ #include "log.h" #include "multicast.h" #include "netlink.h" -#include "network-coding.h" #include "routing.h" #include "translation-table.h" @@ -883,9 +882,6 @@ void batadv_orig_node_release(struct kref *ref) } spin_unlock_bh(&orig_node->vlan_list_lock); - /* Free nc_nodes */ - batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL); - call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu); } @@ -959,8 +955,6 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv, spin_lock_init(&orig_node->tt_lock); spin_lock_init(&orig_node->vlan_list_lock); - batadv_nc_init_orig(orig_node); - /* extra reference for return */ kref_init(&orig_node->refcount); diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 35d8c5783999..12c16f81cc51 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -31,7 +31,6 @@ #include "hard-interface.h" #include "log.h" #include "mesh-interface.h" -#include "network-coding.h" #include "originator.h" #include "send.h" #include "tp_meter.h" @@ -956,15 +955,9 @@ int batadv_recv_unicast_packet(struct sk_buff *skb, /* function returns -EREMOTE for promiscuous packets */ check = batadv_check_unicast_packet(bat_priv, skb, hdr_size); - - /* Even though the packet is not for us, we might save it to use for - * decoding a later received coded packet - */ - if (check == -EREMOTE) - batadv_nc_skb_store_sniffed_unicast(bat_priv, skb); - if (check < 0) goto free_skb; + if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size)) goto free_skb; diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 95849ba004e7..20d85c681064 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -34,7 +34,6 @@ #include "hard-interface.h" #include "log.h" #include "mesh-interface.h" -#include "network-coding.h" #include "originator.h" #include "routing.h" #include "translation-table.h" @@ -63,12 +62,9 @@ int batadv_send_skb_packet(struct sk_buff *skb, struct batadv_hard_iface *hard_iface, const u8 *dst_addr) { - struct batadv_priv *bat_priv; struct ethhdr *ethhdr; int ret; - bat_priv = netdev_priv(hard_iface->mesh_iface); - if (hard_iface->if_status != BATADV_IF_ACTIVE) goto send_skb_err; @@ -97,9 +93,6 @@ int batadv_send_skb_packet(struct sk_buff *skb, skb->dev = hard_iface->net_dev; - /* Save a clone of the skb to use when decoding coded packets */ - batadv_nc_skb_store_for_decoding(bat_priv, skb); - /* dev_queue_xmit() returns a negative result on error. However on * congestion and traffic shaping, it drops and returns NET_XMIT_DROP * (which is > 0). This will not be treated as an error. @@ -202,14 +195,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb, goto put_neigh_node; } - /* try to network code the packet, if it is received on an interface - * (i.e. being forwarded). If the packet originates from this node or if - * network coding fails, then send the packet as usual. - */ - if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) - ret = -EINPROGRESS; - else - ret = batadv_send_unicast_skb(skb, neigh_node); + ret = batadv_send_unicast_skb(skb, neigh_node); /* skb was consumed */ skb = NULL; diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 8d0e04e770cb..6e95e883c2bf 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -212,7 +212,7 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr, /** * batadv_tt_local_entry_release() - release tt_local_entry from lists and queue * for free after rcu grace period - * @ref: kref pointer of the nc_node + * @ref: kref pointer of the batadv_tt_local_entry */ static void batadv_tt_local_entry_release(struct kref *ref) { @@ -244,7 +244,7 @@ batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry) /** * batadv_tt_global_entry_release() - release tt_global_entry from lists and * queue for free after rcu grace period - * @ref: kref pointer of the nc_node + * @ref: kref pointer of the batadv_tt_global_entry */ void batadv_tt_global_entry_release(struct kref *ref) { diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 0ca0fc072fc9..ae1d7a8dc480 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -505,20 +505,6 @@ struct batadv_orig_node { /** @rcu: struct used for freeing in an RCU-safe manner */ struct rcu_head rcu; -#ifdef CONFIG_BATMAN_ADV_NC - /** @in_coding_list: list of nodes this orig can hear */ - struct list_head in_coding_list; - - /** @out_coding_list: list of nodes that can hear this orig */ - struct list_head out_coding_list; - - /** @in_coding_list_lock: protects in_coding_list */ - spinlock_t in_coding_list_lock; - - /** @out_coding_list_lock: protects out_coding_list */ - spinlock_t out_coding_list_lock; -#endif - /** @fragments: array with heads for fragment chains */ struct batadv_frag_table_entry fragments[BATADV_FRAG_BUFFER_COUNT]; @@ -545,9 +531,6 @@ enum batadv_orig_capabilities { */ BATADV_ORIG_CAPA_HAS_DAT, - /** @BATADV_ORIG_CAPA_HAS_NC: orig node has network coding enabled */ - BATADV_ORIG_CAPA_HAS_NC, - /** @BATADV_ORIG_CAPA_HAS_TT: orig node has tt capability */ BATADV_ORIG_CAPA_HAS_TT, @@ -953,60 +936,6 @@ enum batadv_counters { BATADV_CNT_DAT_CACHED_REPLY_TX, #endif -#ifdef CONFIG_BATMAN_ADV_NC - /** - * @BATADV_CNT_NC_CODE: transmitted nc-combined traffic packet counter - */ - BATADV_CNT_NC_CODE, - - /** - * @BATADV_CNT_NC_CODE_BYTES: transmitted nc-combined traffic bytes - * counter - */ - BATADV_CNT_NC_CODE_BYTES, - - /** - * @BATADV_CNT_NC_RECODE: transmitted nc-recombined traffic packet - * counter - */ - BATADV_CNT_NC_RECODE, - - /** - * @BATADV_CNT_NC_RECODE_BYTES: transmitted nc-recombined traffic bytes - * counter - */ - BATADV_CNT_NC_RECODE_BYTES, - - /** - * @BATADV_CNT_NC_BUFFER: counter for packets buffered for later nc - * decoding - */ - BATADV_CNT_NC_BUFFER, - - /** - * @BATADV_CNT_NC_DECODE: received and nc-decoded traffic packet counter - */ - BATADV_CNT_NC_DECODE, - - /** - * @BATADV_CNT_NC_DECODE_BYTES: received and nc-decoded traffic bytes - * counter - */ - BATADV_CNT_NC_DECODE_BYTES, - - /** - * @BATADV_CNT_NC_DECODE_FAILED: received and decode-failed traffic - * packet counter - */ - BATADV_CNT_NC_DECODE_FAILED, - - /** - * @BATADV_CNT_NC_SNIFFED: counter for nc-decoded packets received in - * promisc mode. - */ - BATADV_CNT_NC_SNIFFED, -#endif - /** @BATADV_CNT_NUM: number of traffic counters */ BATADV_CNT_NUM, }; @@ -1340,56 +1269,6 @@ struct batadv_priv_mcast { #endif /** - * struct batadv_priv_nc - per mesh interface network coding private data - */ -struct batadv_priv_nc { - /** @work: work queue callback item for cleanup */ - struct delayed_work work; - - /** - * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq - */ - u8 min_tq; - - /** - * @max_fwd_delay: maximum packet forward delay to allow coding of - * packets - */ - u32 max_fwd_delay; - - /** - * @max_buffer_time: buffer time for sniffed packets used to decoding - */ - u32 max_buffer_time; - - /** - * @timestamp_fwd_flush: timestamp of last forward packet queue flush - */ - unsigned long timestamp_fwd_flush; - - /** - * @timestamp_sniffed_purge: timestamp of last sniffed packet queue - * purge - */ - unsigned long timestamp_sniffed_purge; - - /** - * @coding_hash: Hash table used to buffer skbs while waiting for - * another incoming skb to code it with. Skbs are added to the buffer - * just before being forwarded in routing.c - */ - struct batadv_hashtable *coding_hash; - - /** - * @decoding_hash: Hash table used to buffer skbs that might be needed - * to decode a received coded skb. The buffer is used for 1) skbs - * arriving on the mesh-interface; 2) skbs overheard on the - * hard-interface; and 3) skbs forwarded by batman-adv. - */ - struct batadv_hashtable *decoding_hash; -}; - -/** * struct batadv_tp_unacked - unacked packet meta-information * * This struct is supposed to represent a buffer unacked packet. However, since @@ -1775,16 +1654,6 @@ struct batadv_priv { struct batadv_priv_mcast mcast; #endif -#ifdef CONFIG_BATMAN_ADV_NC - /** - * @network_coding: bool indicating whether network coding is enabled - */ - atomic_t network_coding; - - /** @nc: network coding data */ - struct batadv_priv_nc nc; -#endif /* CONFIG_BATMAN_ADV_NC */ - #ifdef CONFIG_BATMAN_ADV_BATMAN_V /** @bat_v: B.A.T.M.A.N. V per mesh-interface private data */ struct batadv_priv_bat_v bat_v; @@ -2017,95 +1886,10 @@ struct batadv_tt_roam_node { }; /** - * struct batadv_nc_node - network coding node - */ -struct batadv_nc_node { - /** @list: next and prev pointer for the list handling */ - struct list_head list; - - /** @addr: the node's mac address */ - u8 addr[ETH_ALEN]; - - /** @refcount: number of contexts the object is used by */ - struct kref refcount; - - /** @rcu: struct used for freeing in an RCU-safe manner */ - struct rcu_head rcu; - - /** @orig_node: pointer to corresponding orig node struct */ - struct batadv_orig_node *orig_node; - - /** @last_seen: timestamp of last ogm received from this node */ - unsigned long last_seen; -}; - -/** - * struct batadv_nc_path - network coding path - */ -struct batadv_nc_path { - /** @hash_entry: next and prev pointer for the list handling */ - struct hlist_node hash_entry; - - /** @rcu: struct used for freeing in an RCU-safe manner */ - struct rcu_head rcu; - - /** @refcount: number of contexts the object is used by */ - struct kref refcount; - - /** @packet_list: list of buffered packets for this path */ - struct list_head packet_list; - - /** @packet_list_lock: access lock for packet list */ - spinlock_t packet_list_lock; - - /** @next_hop: next hop (destination) of path */ - u8 next_hop[ETH_ALEN]; - - /** @prev_hop: previous hop (source) of path */ - u8 prev_hop[ETH_ALEN]; - - /** @last_valid: timestamp for last validation of path */ - unsigned long last_valid; -}; - -/** - * struct batadv_nc_packet - network coding packet used when coding and - * decoding packets - */ -struct batadv_nc_packet { - /** @list: next and prev pointer for the list handling */ - struct list_head list; - - /** @packet_id: crc32 checksum of skb data */ - __be32 packet_id; - - /** - * @timestamp: field containing the info when the packet was added to - * path - */ - unsigned long timestamp; - - /** @neigh_node: pointer to original next hop neighbor of skb */ - struct batadv_neigh_node *neigh_node; - - /** @skb: skb which can be encoded or used for decoding */ - struct sk_buff *skb; - - /** @nc_path: pointer to path this nc packet is attached to */ - struct batadv_nc_path *nc_path; -}; - -/** * struct batadv_skb_cb - control buffer structure used to store private data * relevant to batman-adv in the skb->cb buffer in skbs. */ struct batadv_skb_cb { - /** - * @decoded: Marks a skb as decoded, which is checked when searching for - * coding opportunities in network-coding.c - */ - unsigned char decoded:1; - /** @num_bcasts: Counter for broadcast packet retransmissions */ unsigned char num_bcasts; }; diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index e524bb59bff2..111f0e37b672 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -924,10 +924,9 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t case CIS_LINK: case BIS_LINK: case PA_LINK: - if (hdev->iso_mtu) - /* Dedicated ISO Buffer exists */ - break; - fallthrough; + if (!hdev->iso_mtu) + return ERR_PTR(-ECONNREFUSED); + break; case LE_LINK: if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU) return ERR_PTR(-ECONNREFUSED); @@ -1540,7 +1539,7 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos) /* This function requires the caller holds hdev->lock */ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; int err; @@ -1582,6 +1581,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst, conn->state = BT_CONNECT; conn->sid = sid; + conn->conn_timeout = timeout; hci_conn_hold(conn); return conn; @@ -1922,7 +1922,8 @@ done: } struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos) + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout) { struct hci_conn *cis; @@ -1937,6 +1938,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst, cis->dst_type = dst_type; cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET; cis->iso_qos.ucast.cis = BT_ISO_QOS_CIS_UNSET; + cis->conn_timeout = timeout; } if (cis->state == BT_CONNECTED) @@ -2176,7 +2178,7 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err) struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; struct hci_conn *parent; @@ -2197,7 +2199,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid, base, base_len); /* We need hci_conn object using the BDADDR_ANY as dst */ - conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir); + conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir, timeout); if (IS_ERR(conn)) return conn; @@ -2250,13 +2252,13 @@ static void bis_mark_per_adv(struct hci_conn *conn, void *data) struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 dst_type, __u8 sid, struct bt_iso_qos *qos, - __u8 base_len, __u8 *base) + __u8 base_len, __u8 *base, u16 timeout) { struct hci_conn *conn; int err; struct iso_list_data data; - conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base); + conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base, timeout); if (IS_ERR(conn)) return conn; @@ -2299,7 +2301,8 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst, } struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, - __u8 dst_type, struct bt_iso_qos *qos) + __u8 dst_type, struct bt_iso_qos *qos, + u16 timeout) { struct hci_conn *le; struct hci_conn *cis; @@ -2323,7 +2326,7 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, hci_iso_qos_setup(hdev, le, &qos->ucast.in, le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys); - cis = hci_bind_cis(hdev, dst, dst_type, qos); + cis = hci_bind_cis(hdev, dst, dst_type, qos, timeout); if (IS_ERR(cis)) { hci_conn_drop(le); return cis; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 55e0722fd066..3418d7b964a1 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3267,6 +3267,8 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue, spin_unlock_bh(&queue->lock); } + + bt_dev_dbg(hdev, "chan %p queued %d", chan, skb_queue_len(queue)); } void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags) @@ -3298,6 +3300,10 @@ void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) hci_skb_pkt_type(skb) = HCI_SCODATA_PKT; skb_queue_tail(&conn->data_q, skb); + + bt_dev_dbg(hdev, "hcon %p queued %d", conn, + skb_queue_len(&conn->data_q)); + queue_work(hdev->workqueue, &hdev->tx_work); } @@ -3357,6 +3363,8 @@ static void hci_queue_iso(struct hci_conn *conn, struct sk_buff_head *queue, __skb_queue_tail(queue, skb); } while (list); } + + bt_dev_dbg(hdev, "hcon %p queued %d", conn, skb_queue_len(queue)); } void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb) @@ -3399,8 +3407,7 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote) case CIS_LINK: case BIS_LINK: case PA_LINK: - cnt = hdev->iso_mtu ? hdev->iso_cnt : - hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt; + cnt = hdev->iso_cnt; break; default: cnt = 0; @@ -3428,6 +3435,10 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type, skb_queue_empty(&c->data_q)) continue; + bt_dev_dbg(hdev, "hcon %p state %s queued %d", c, + state_to_string(c->state), + skb_queue_len(&c->data_q)); + if (c->state != BT_CONNECTED && c->state != BT_CONFIG) continue; @@ -3586,24 +3597,37 @@ static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type) static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) { - unsigned long last_tx; + unsigned long timeout; if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) return; switch (type) { + case ACL_LINK: + /* tx timeout must be longer than maximum link supervision + * timeout (40.9 seconds) + */ + timeout = hdev->acl_last_tx + HCI_ACL_TX_TIMEOUT; + break; case LE_LINK: - last_tx = hdev->le_last_tx; + /* tx timeout must be longer than maximum link supervision + * timeout (40.9 seconds) + */ + timeout = hdev->le_last_tx + HCI_ACL_TX_TIMEOUT; break; - default: - last_tx = hdev->acl_last_tx; + case CIS_LINK: + case BIS_LINK: + case PA_LINK: + /* tx timeout must be longer than the maximum transport latency + * (8.388607 seconds) + */ + timeout = hdev->iso_last_tx + HCI_ISO_TX_TIMEOUT; break; + default: + return; } - /* tx timeout must be longer than maximum link supervision timeout - * (40.9 seconds) - */ - if (!cnt && time_after(jiffies, last_tx + HCI_ACL_TX_TIMEOUT)) + if (!cnt && time_after(jiffies, timeout)) hci_link_tx_to(hdev, type); } @@ -3759,12 +3783,16 @@ static void hci_sched_iso(struct hci_dev *hdev, __u8 type) if (!hci_conn_num(hdev, type)) return; - cnt = hdev->iso_pkts ? &hdev->iso_cnt : - hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt; + cnt = &hdev->iso_cnt; + + __check_timeout(hdev, *cnt, type); + while (*cnt && (conn = hci_low_sent(hdev, type, "e))) { while (quote-- && (skb = skb_dequeue(&conn->data_q))) { BT_DBG("skb %p len %d", skb, skb->len); + hci_send_conn_frame(hdev, conn, skb); + hdev->iso_last_tx = jiffies; conn->sent++; if (conn->sent == ~0) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index fe49e8a7969f..d790b0d4eb9a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -4461,19 +4461,9 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data, case CIS_LINK: case BIS_LINK: case PA_LINK: - if (hdev->iso_pkts) { - hdev->iso_cnt += count; - if (hdev->iso_cnt > hdev->iso_pkts) - hdev->iso_cnt = hdev->iso_pkts; - } else if (hdev->le_pkts) { - hdev->le_cnt += count; - if (hdev->le_cnt > hdev->le_pkts) - hdev->le_cnt = hdev->le_pkts; - } else { - hdev->acl_cnt += count; - if (hdev->acl_cnt > hdev->acl_pkts) - hdev->acl_cnt = hdev->acl_pkts; - } + hdev->iso_cnt += count; + if (hdev->iso_cnt > hdev->iso_pkts) + hdev->iso_cnt = hdev->iso_pkts; break; default: diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 7a7d49890858..eefdb6134ca5 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -1325,7 +1325,7 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) { struct hci_cp_le_set_ext_adv_params cp; struct hci_rp_le_set_ext_adv_params rp; - bool connectable; + bool connectable, require_privacy; u32 flags; bdaddr_t random_addr; u8 own_addr_type; @@ -1363,10 +1363,12 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance) return -EPERM; /* Set require_privacy to true only when non-connectable - * advertising is used. In that case it is fine to use a - * non-resolvable private address. + * advertising is used and it is not periodic. + * In that case it is fine to use a non-resolvable private address. */ - err = hci_get_random_address(hdev, !connectable, + require_privacy = !connectable && !(adv && adv->periodic); + + err = hci_get_random_address(hdev, require_privacy, adv_use_rpa(hdev, flags), adv, &own_addr_type, &random_addr); if (err < 0) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 5ce823ca3aaf..9b263d061e05 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -91,8 +91,8 @@ static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst, iso_sock_match_t match, void *data); /* ---- ISO timers ---- */ -#define ISO_CONN_TIMEOUT (HZ * 40) -#define ISO_DISCONN_TIMEOUT (HZ * 2) +#define ISO_CONN_TIMEOUT secs_to_jiffies(20) +#define ISO_DISCONN_TIMEOUT secs_to_jiffies(2) static void iso_conn_free(struct kref *ref) { @@ -111,6 +111,8 @@ static void iso_conn_free(struct kref *ref) /* Ensure no more work items will run since hci_conn has been dropped */ disable_delayed_work_sync(&conn->timeout_work); + kfree_skb(conn->rx_skb); + kfree(conn); } @@ -367,7 +369,8 @@ static int iso_connect_bis(struct sock *sk) if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, iso_pi(sk)->base_len, - iso_pi(sk)->base); + iso_pi(sk)->base, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -376,7 +379,8 @@ static int iso_connect_bis(struct sock *sk) hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), iso_pi(sk)->bc_sid, &iso_pi(sk)->qos, - iso_pi(sk)->base_len, iso_pi(sk)->base); + iso_pi(sk)->base_len, iso_pi(sk)->base, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -458,11 +462,19 @@ static int iso_connect_cis(struct sock *sk) goto unlock; } + /* Check if there are available buffers for output/TX. */ + if (iso_pi(sk)->qos.ucast.out.sdu && !hci_iso_count(hdev) && + (hdev->iso_pkts && !hdev->iso_cnt)) { + err = -ENOBUFS; + goto unlock; + } + /* Just bind if DEFER_SETUP has been set */ if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos); + &iso_pi(sk)->qos, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -470,7 +482,8 @@ static int iso_connect_cis(struct sock *sk) } else { hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst, le_addr_type(iso_pi(sk)->dst_type), - &iso_pi(sk)->qos); + &iso_pi(sk)->qos, + READ_ONCE(sk->sk_sndtimeo)); if (IS_ERR(hcon)) { err = PTR_ERR(hcon); goto unlock; @@ -750,6 +763,13 @@ static void iso_sock_kill(struct sock *sk) BT_DBG("sk %p state %d", sk, sk->sk_state); + /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */ + if (iso_pi(sk)->conn) { + iso_conn_lock(iso_pi(sk)->conn); + iso_pi(sk)->conn->sk = NULL; + iso_conn_unlock(iso_pi(sk)->conn); + } + /* Kill poor orphan */ bt_sock_unlink(&iso_sk_list, sk); sock_set_flag(sk, SOCK_DEAD); @@ -2407,7 +2427,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), skb->len); conn->rx_len -= skb->len; - return; + break; case ISO_END: skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len), diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 225140fcb3d6..a3d16eece0d2 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -4542,13 +4542,11 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, return -ENOMEM; #ifdef CONFIG_BT_FEATURE_DEBUG - if (!hdev) { - flags = bt_dbg_get() ? BIT(0) : 0; + flags = bt_dbg_get() ? BIT(0) : 0; - memcpy(rp->features[idx].uuid, debug_uuid, 16); - rp->features[idx].flags = cpu_to_le32(flags); - idx++; - } + memcpy(rp->features[idx].uuid, debug_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; #endif if (hdev && hci_dev_le_state_simultaneous(hdev)) { diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c index 6ef701c27da4..c4063d200c0a 100644 --- a/net/bluetooth/mgmt_config.c +++ b/net/bluetooth/mgmt_config.c @@ -13,13 +13,13 @@ #define HDEV_PARAM_U16(_param_name_) \ struct {\ - struct mgmt_tlv entry; \ + struct mgmt_tlv_hdr entry; \ __le16 value; \ } __packed _param_name_ #define HDEV_PARAM_U8(_param_name_) \ struct {\ - struct mgmt_tlv entry; \ + struct mgmt_tlv_hdr entry; \ __u8 value; \ } __packed _param_name_ diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index d382d980fd9a..ab0cf442d57b 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -498,6 +498,13 @@ static void sco_sock_kill(struct sock *sk) BT_DBG("sk %p state %d", sk, sk->sk_state); + /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */ + if (sco_pi(sk)->conn) { + sco_conn_lock(sco_pi(sk)->conn); + sco_pi(sk)->conn->sk = NULL; + sco_conn_unlock(sco_pi(sk)->conn); + } + /* Kill poor orphan */ bt_sock_unlink(&sco_sk_list, sk); sock_set_flag(sk, SOCK_DEAD); diff --git a/net/bridge/br.c b/net/bridge/br.c index c683baa3847f..c37e52e2f29a 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -37,6 +37,11 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v int err; if (netif_is_bridge_master(dev)) { + struct net_bridge *br = netdev_priv(dev); + + if (event == NETDEV_REGISTER) + br_fdb_change_mac_address(br, dev->dev_addr); + err = br_vlan_bridge_event(dev, event, ptr); if (err) return notifier_from_errno(err); @@ -259,6 +264,23 @@ static struct notifier_block br_switchdev_blocking_notifier = { .notifier_call = br_switchdev_blocking_event, }; +static int +br_toggle_fdb_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + int err; + + if (br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0) == on) + return 0; + + err = br_fdb_toggle_local_vlan_0(br, on, extack); + if (err) + return err; + + br_opt_toggle(br, BROPT_FDB_LOCAL_VLAN_0, on); + return 0; +} + /* br_boolopt_toggle - change user-controlled boolean option * * @br: bridge device @@ -287,6 +309,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on, case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on); break; + case BR_BOOLOPT_FDB_LOCAL_VLAN_0: + err = br_toggle_fdb_local_vlan_0(br, on, extack); + break; default: /* shouldn't be called with unsupported options */ WARN_ON(1); @@ -307,6 +332,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt) return br_opt_get(br, BROPT_MST_ENABLED); case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION: return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION); + case BR_BOOLOPT_FDB_LOCAL_VLAN_0: + return br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0); default: /* shouldn't be called with unsupported options */ WARN_ON(1); diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c index a3c755d0a09d..c2c1c7d44c61 100644 --- a/net/bridge/br_cfm.c +++ b/net/bridge/br_cfm.c @@ -134,7 +134,7 @@ static void ccm_rx_timer_start(struct br_cfm_peer_mep *peer_mep) * of the configured CC 'expected_interval' * in order to detect CCM defect after 3.25 interval. */ - queue_delayed_work(system_wq, &peer_mep->ccm_rx_dwork, + queue_delayed_work(system_percpu_wq, &peer_mep->ccm_rx_dwork, usecs_to_jiffies(interval_us / 4)); } @@ -285,7 +285,7 @@ static void ccm_tx_work_expired(struct work_struct *work) ccm_frame_tx(skb); interval_us = interval_to_us(mep->cc_config.exp_interval); - queue_delayed_work(system_wq, &mep->ccm_tx_dwork, + queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork, usecs_to_jiffies(interval_us)); } @@ -809,7 +809,7 @@ int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance, * to send first frame immediately */ mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000); - queue_delayed_work(system_wq, &mep->ccm_tx_dwork, 0); + queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork, 0); save: mep->cc_ccm_tx_info = *tx_info; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 902694c0ce64..58d22e2b85fc 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -459,6 +459,9 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) struct net_bridge_fdb_entry *f; struct net_bridge *br = p->br; struct net_bridge_vlan *v; + bool local_vlan_0; + + local_vlan_0 = br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0); spin_lock_bh(&br->hash_lock); vg = nbp_vlan_group(p); @@ -468,11 +471,11 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) /* delete old one */ fdb_delete_local(br, p, f); - /* if this port has no vlan information - * configured, we can safely be done at - * this point. + /* if this port has no vlan information configured, or + * local entries are only kept on VLAN 0, we can safely + * be done at this point. */ - if (!vg || !vg->num_vlans) + if (!vg || !vg->num_vlans || local_vlan_0) goto insert; } } @@ -481,7 +484,7 @@ insert: /* insert new address, may fail if invalid address or dup. */ fdb_add_local(br, p, newaddr, 0); - if (!vg || !vg->num_vlans) + if (!vg || !vg->num_vlans || local_vlan_0) goto done; /* Now add entries for every VLAN configured on the port. @@ -500,6 +503,9 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) struct net_bridge_vlan_group *vg; struct net_bridge_fdb_entry *f; struct net_bridge_vlan *v; + bool local_vlan_0; + + local_vlan_0 = br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0); spin_lock_bh(&br->hash_lock); @@ -511,7 +517,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr) fdb_add_local(br, NULL, newaddr, 0); vg = br_vlan_group(br); - if (!vg || !vg->num_vlans) + if (!vg || !vg->num_vlans || local_vlan_0) goto out; /* Now remove and add entries for every VLAN configured on the * bridge. This function runs under RTNL so the bitmap will not @@ -576,6 +582,102 @@ void br_fdb_cleanup(struct work_struct *work) mod_delayed_work(system_long_wq, &br->gc_work, work_delay); } +static void br_fdb_delete_locals_per_vlan_port(struct net_bridge *br, + struct net_bridge_port *p) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_device *dev; + + if (p) { + vg = nbp_vlan_group(p); + dev = p->dev; + } else { + vg = br_vlan_group(br); + dev = br->dev; + } + + list_for_each_entry(v, &vg->vlan_list, vlist) + br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid); +} + +static void br_fdb_delete_locals_per_vlan(struct net_bridge *br) +{ + struct net_bridge_port *p; + + ASSERT_RTNL(); + + list_for_each_entry(p, &br->port_list, list) + br_fdb_delete_locals_per_vlan_port(br, p); + + br_fdb_delete_locals_per_vlan_port(br, NULL); +} + +static int br_fdb_insert_locals_per_vlan_port(struct net_bridge *br, + struct net_bridge_port *p, + struct netlink_ext_ack *extack) +{ + struct net_bridge_vlan_group *vg; + struct net_bridge_vlan *v; + struct net_device *dev; + int err; + + if (p) { + vg = nbp_vlan_group(p); + dev = p->dev; + } else { + vg = br_vlan_group(br); + dev = br->dev; + } + + list_for_each_entry(v, &vg->vlan_list, vlist) { + if (!br_vlan_should_use(v)) + continue; + + err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); + if (err) + return err; + } + + return 0; +} + +static int br_fdb_insert_locals_per_vlan(struct net_bridge *br, + struct netlink_ext_ack *extack) +{ + struct net_bridge_port *p; + int err; + + ASSERT_RTNL(); + + list_for_each_entry(p, &br->port_list, list) { + err = br_fdb_insert_locals_per_vlan_port(br, p, extack); + if (err) + goto rollback; + } + + err = br_fdb_insert_locals_per_vlan_port(br, NULL, extack); + if (err) + goto rollback; + + return 0; + +rollback: + NL_SET_ERR_MSG_MOD(extack, "fdb_local_vlan_0 toggle: FDB entry insertion failed"); + br_fdb_delete_locals_per_vlan(br); + return err; +} + +int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack) +{ + if (!on) + return br_fdb_insert_locals_per_vlan(br, extack); + + br_fdb_delete_locals_per_vlan(br); + return 0; +} + static bool __fdb_flush_matches(const struct net_bridge *br, const struct net_bridge_fdb_entry *f, const struct net_bridge_fdb_flush_desc *desc) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 29097e984b4f..870bdf2e082c 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -148,7 +148,8 @@ void br_forward(const struct net_bridge_port *to, goto out; /* redirect to backup link if the destination port is down */ - if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) { + if (rcu_access_pointer(to->backup_port) && + (!netif_carrier_ok(to->dev) || !netif_running(to->dev))) { struct net_bridge_port *backup_port; backup_port = rcu_dereference(to->backup_port); diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 5f6ac9bf1527..67b4c905e49a 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -202,6 +202,14 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb break; case BR_PKT_UNICAST: dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid); + if (unlikely(!dst && vid && + br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0))) { + dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, 0); + if (dst && + (!test_bit(BR_FDB_LOCAL, &dst->flags) || + test_bit(BR_FDB_ADDED_BY_USER, &dst->flags))) + dst = NULL; + } break; default: break; diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c index fd2de35ffb3c..3c36fa24bc05 100644 --- a/net/bridge/br_mrp.c +++ b/net/bridge/br_mrp.c @@ -341,7 +341,7 @@ static void br_mrp_test_work_expired(struct work_struct *work) out: rcu_read_unlock(); - queue_delayed_work(system_wq, &mrp->test_work, + queue_delayed_work(system_percpu_wq, &mrp->test_work, usecs_to_jiffies(mrp->test_interval)); } @@ -418,7 +418,7 @@ static void br_mrp_in_test_work_expired(struct work_struct *work) out: rcu_read_unlock(); - queue_delayed_work(system_wq, &mrp->in_test_work, + queue_delayed_work(system_percpu_wq, &mrp->in_test_work, usecs_to_jiffies(mrp->in_test_interval)); } @@ -725,7 +725,7 @@ int br_mrp_start_test(struct net_bridge *br, mrp->test_max_miss = test->max_miss; mrp->test_monitor = test->monitor; mrp->test_count_miss = 0; - queue_delayed_work(system_wq, &mrp->test_work, + queue_delayed_work(system_percpu_wq, &mrp->test_work, usecs_to_jiffies(test->interval)); return 0; @@ -865,7 +865,7 @@ int br_mrp_start_in_test(struct net_bridge *br, mrp->in_test_end = jiffies + usecs_to_jiffies(in_test->period); mrp->in_test_max_miss = in_test->max_miss; mrp->in_test_count_miss = 0; - queue_delayed_work(system_wq, &mrp->in_test_work, + queue_delayed_work(system_percpu_wq, &mrp->in_test_work, usecs_to_jiffies(in_test->interval)); return 0; diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 8ce145938b02..22d12e545966 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4049,8 +4049,7 @@ int br_multicast_rcv(struct net_bridge_mcast **brmctx, } static void br_multicast_query_expired(struct net_bridge_mcast *brmctx, - struct bridge_mcast_own_query *query, - struct bridge_mcast_querier *querier) + struct bridge_mcast_own_query *query) { spin_lock(&brmctx->br->multicast_lock); if (br_multicast_ctx_vlan_disabled(brmctx)) @@ -4069,8 +4068,7 @@ static void br_ip4_multicast_query_expired(struct timer_list *t) struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, ip4_own_query.timer); - br_multicast_query_expired(brmctx, &brmctx->ip4_own_query, - &brmctx->ip4_querier); + br_multicast_query_expired(brmctx, &brmctx->ip4_own_query); } #if IS_ENABLED(CONFIG_IPV6) @@ -4079,8 +4077,7 @@ static void br_ip6_multicast_query_expired(struct timer_list *t) struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t, ip6_own_query.timer); - br_multicast_query_expired(brmctx, &brmctx->ip6_own_query, - &brmctx->ip6_querier); + br_multicast_query_expired(brmctx, &brmctx->ip6_own_query); } #endif diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 8de0904b9627..16be5d250402 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -487,6 +487,7 @@ enum net_bridge_opts { BROPT_MCAST_VLAN_SNOOPING_ENABLED, BROPT_MST_ENABLED, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, + BROPT_FDB_LOCAL_VLAN_0, }; struct net_bridge { @@ -843,6 +844,8 @@ void br_fdb_find_delete_local(struct net_bridge *br, void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr); void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr); void br_fdb_cleanup(struct work_struct *work); +int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on, + struct netlink_ext_ack *extack); void br_fdb_delete_by_port(struct net_bridge *br, const struct net_bridge_port *p, u16 vid, int do_all); struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br, diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 939a3aa78d5c..ae911220cb3c 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -331,10 +331,12 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags, /* Add the dev mac and count the vlan only if it's usable */ if (br_vlan_should_use(v)) { - err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); - if (err) { - br_err(br, "failed insert local address into bridge forwarding table\n"); - goto out_filt; + if (!br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0)) { + err = br_fdb_add_local(br, p, dev->dev_addr, v->vid); + if (err) { + br_err(br, "failed insert local address into bridge forwarding table\n"); + goto out_filt; + } } vg->num_vlans++; } diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 3e67d4aff419..5697e3949a36 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -920,8 +920,8 @@ static int translate_table(struct net *net, const char *name, * if an error occurs */ newinfo->chainstack = - vmalloc(array_size(nr_cpu_ids, - sizeof(*(newinfo->chainstack)))); + vmalloc_array(nr_cpu_ids, + sizeof(*(newinfo->chainstack))); if (!newinfo->chainstack) return -ENOMEM; for_each_possible_cpu(i) { @@ -938,7 +938,7 @@ static int translate_table(struct net *net, const char *name, } } - cl_s = vmalloc(array_size(udc_cnt, sizeof(*cl_s))); + cl_s = vmalloc_array(udc_cnt, sizeof(*cl_s)); if (!cl_s) return -ENOMEM; i = 0; /* the i'th udc */ @@ -1018,8 +1018,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, * the check on the size is done later, when we have the lock */ if (repl->num_counters) { - unsigned long size = repl->num_counters * sizeof(*counterstmp); - counterstmp = vmalloc(size); + counterstmp = vmalloc_array(repl->num_counters, + sizeof(*counterstmp)); if (!counterstmp) return -ENOMEM; } @@ -1386,7 +1386,7 @@ static int do_update_counters(struct net *net, const char *name, if (num_counters == 0) return -EINVAL; - tmp = vmalloc(array_size(num_counters, sizeof(*tmp))); + tmp = vmalloc_array(num_counters, sizeof(*tmp)); if (!tmp) return -ENOMEM; @@ -1526,7 +1526,7 @@ static int copy_counters_to_user(struct ebt_table *t, if (num_counters != nentries) return -EINVAL; - counterstmp = vmalloc(array_size(nentries, sizeof(*counterstmp))); + counterstmp = vmalloc_array(nentries, sizeof(*counterstmp)); if (!counterstmp) return -ENOMEM; diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 5adced1e7d0c..b7af36bbd306 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -59,6 +59,13 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr, nft_reg_store_be16(dest, htons(p_proto)); return; } + case NFT_META_BRI_IIFHWADDR: + br_dev = nft_meta_get_bridge(in); + if (!br_dev) + goto err; + + memcpy(dest, br_dev->dev_addr, ETH_ALEN); + return; default: return nft_meta_get_eval(expr, regs, pkt); } @@ -86,6 +93,9 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx, case NFT_META_BRI_IIFVPROTO: len = sizeof(u16); break; + case NFT_META_BRI_IIFHWADDR: + len = ETH_ALEN; + break; default: return nft_meta_get_init(ctx, expr, tb); } @@ -175,6 +185,7 @@ static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx, switch (priv->key) { case NFT_META_BRI_BROUTE: + case NFT_META_BRI_IIFHWADDR: hooks = 1 << NF_BR_PRE_ROUTING; break; default: diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index 06b604cf9d58..2aa1e7d46eb2 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -257,9 +257,7 @@ int cfctrl_linkup_request(struct cflayer *layer, cfpkt_add_body(pkt, &tmp16, 2); tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs); cfpkt_add_body(pkt, &tmp16, 2); - memset(utility_name, 0, sizeof(utility_name)); - strscpy(utility_name, param->u.utility.name, - UTILITY_NAME_LENGTH); + strscpy_pad(utility_name, param->u.utility.name); cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH); tmp8 = param->u.utility.paramlen; cfpkt_add_body(pkt, &tmp8, 1); diff --git a/net/can/af_can.c b/net/can/af_can.c index b2387a46794a..770173d8db42 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -221,7 +221,7 @@ int can_send(struct sk_buff *skb, int loop) } /* Make sure the CAN frame can pass the selected CAN netdevice. */ - if (unlikely(skb->len > skb->dev->mtu)) { + if (unlikely(skb->len > READ_ONCE(skb->dev->mtu))) { err = -EMSGSIZE; goto inval_skb; } diff --git a/net/can/isotp.c b/net/can/isotp.c index dee1412b3c9c..74ee1e52249b 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -1313,7 +1313,7 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len) err = -ENODEV; goto out; } - if (dev->mtu < so->ll.mtu) { + if (READ_ONCE(dev->mtu) < so->ll.mtu) { dev_put(dev); err = -EINVAL; goto out; diff --git a/net/can/raw.c b/net/can/raw.c index 76b867d21def..a53853f5e9af 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -75,31 +75,31 @@ MODULE_ALIAS("can-proto-1"); */ struct uniqframe { - int skbcnt; const struct sk_buff *skb; + int skbcnt; unsigned int join_rx_count; }; struct raw_sock { struct sock sk; - int bound; - int ifindex; struct net_device *dev; netdevice_tracker dev_tracker; struct list_head notifier; - int loopback; - int recv_own_msgs; - int fd_frames; - int xl_frames; + int ifindex; + unsigned int bound:1; + unsigned int loopback:1; + unsigned int recv_own_msgs:1; + unsigned int fd_frames:1; + unsigned int xl_frames:1; + unsigned int join_filters:1; struct can_raw_vcid_options raw_vcid_opts; canid_t tx_vcid_shifted; canid_t rx_vcid_shifted; canid_t rx_vcid_mask_shifted; - int join_filters; + can_err_mask_t err_mask; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ struct can_filter *filter; /* pointer to filter(s) */ - can_err_mask_t err_mask; struct uniqframe __percpu *uniq; }; @@ -560,8 +560,8 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, struct can_filter sfilter; /* single filter */ struct net_device *dev = NULL; can_err_mask_t err_mask = 0; - int fd_frames; int count = 0; + int flag; int err = 0; if (level != SOL_CAN_RAW) @@ -682,44 +682,48 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, break; case CAN_RAW_LOOPBACK: - if (optlen != sizeof(ro->loopback)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&ro->loopback, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; + ro->loopback = !!flag; break; case CAN_RAW_RECV_OWN_MSGS: - if (optlen != sizeof(ro->recv_own_msgs)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&ro->recv_own_msgs, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; + ro->recv_own_msgs = !!flag; break; case CAN_RAW_FD_FRAMES: - if (optlen != sizeof(fd_frames)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&fd_frames, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; /* Enabling CAN XL includes CAN FD */ - if (ro->xl_frames && !fd_frames) + if (ro->xl_frames && !flag) return -EINVAL; - ro->fd_frames = fd_frames; + ro->fd_frames = !!flag; break; case CAN_RAW_XL_FRAMES: - if (optlen != sizeof(ro->xl_frames)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&ro->xl_frames, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; + ro->xl_frames = !!flag; + /* Enabling CAN XL includes CAN FD */ if (ro->xl_frames) ro->fd_frames = ro->xl_frames; @@ -739,12 +743,13 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, break; case CAN_RAW_JOIN_FILTERS: - if (optlen != sizeof(ro->join_filters)) + if (optlen != sizeof(flag)) return -EINVAL; - if (copy_from_sockptr(&ro->join_filters, optval, optlen)) + if (copy_from_sockptr(&flag, optval, optlen)) return -EFAULT; + ro->join_filters = !!flag; break; default: @@ -758,6 +763,7 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); + int flag; int len; void *val; @@ -806,25 +812,29 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, case CAN_RAW_LOOPBACK: if (len > sizeof(int)) len = sizeof(int); - val = &ro->loopback; + flag = ro->loopback; + val = &flag; break; case CAN_RAW_RECV_OWN_MSGS: if (len > sizeof(int)) len = sizeof(int); - val = &ro->recv_own_msgs; + flag = ro->recv_own_msgs; + val = &flag; break; case CAN_RAW_FD_FRAMES: if (len > sizeof(int)) len = sizeof(int); - val = &ro->fd_frames; + flag = ro->fd_frames; + val = &flag; break; case CAN_RAW_XL_FRAMES: if (len > sizeof(int)) len = sizeof(int); - val = &ro->xl_frames; + flag = ro->xl_frames; + val = &flag; break; case CAN_RAW_XL_VCID_OPTS: { @@ -849,7 +859,8 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, case CAN_RAW_JOIN_FILTERS: if (len > sizeof(int)) len = sizeof(int); - val = &ro->join_filters; + flag = ro->join_filters; + val = &flag; break; default: @@ -950,7 +961,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) err = -EINVAL; /* check for valid CAN (CC/FD/XL) frame content */ - txmtu = raw_check_txframe(ro, skb, dev->mtu); + txmtu = raw_check_txframe(ro, skb, READ_ONCE(dev->mtu)); if (!txmtu) goto free_skb; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9f6d860411cb..1fbec4853f00 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -252,7 +252,8 @@ int __init ceph_msgr_init(void) * The number of active work items is limited by the number of * connections, so leave @max_active at default. */ - ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM, 0); + ceph_msgr_wq = alloc_workqueue("ceph-msgr", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (ceph_msgr_wq) return 0; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index ab66b599ac47..c227ececa925 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -314,7 +314,7 @@ static void __schedule_delayed(struct ceph_mon_client *monc) delay = CEPH_MONC_PING_INTERVAL; dout("__schedule_delayed after %lu\n", delay); - mod_delayed_work(system_wq, &monc->delayed_work, + mod_delayed_work(system_percpu_wq, &monc->delayed_work, round_jiffies_relative(delay)); } diff --git a/net/core/Makefile b/net/core/Makefile index b2a76ce33932..9ef2099c5426 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-y += hotdata.o obj-y += netdev_rx_queue.o +obj-y += netdev_queues.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o obj-$(CONFIG_PROC_FS) += net-procfs.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o diff --git a/net/core/datagram.c b/net/core/datagram.c index f474b9b120f9..cb4b9ef2e4e3 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -345,7 +345,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, spin_unlock_bh(&sk_queue->lock); } - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return err; } EXPORT_SYMBOL(__sk_queue_drop_skb); diff --git a/net/core/dev.c b/net/core/dev.c index 8d49b2198d07..a64cef2c537e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3768,8 +3768,14 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL)) features &= ~dev->gso_partial_features; - /* Make sure to clear the IPv4 ID mangling feature if the - * IPv4 header has the potential to be fragmented. + /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header + * has the potential to be fragmented so that TSO does not generate + * segments with the same ID. For encapsulated packets, the ID mangling + * feature is guaranteed not to use the same ID for the outer IPv4 + * headers of the generated segments if the headers have the potential + * to be fragmented, so there is no need to clear the IPv4 ID mangling + * feature (see the section about NETIF_F_TSO_MANGLEID in + * segmentation-offloads.rst). */ if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) { struct iphdr *iph = skb->encapsulation ? @@ -3907,6 +3913,38 @@ sw_checksum: } EXPORT_SYMBOL(skb_csum_hwoffload_help); +/* Checks if this SKB belongs to an HW offloaded socket + * and whether any SW fallbacks are required based on dev. + * Check decrypted mark in case skb_orphan() cleared socket. + */ +static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, + struct net_device *dev) +{ +#ifdef CONFIG_SOCK_VALIDATE_XMIT + struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev, + struct sk_buff *skb); + struct sock *sk = skb->sk; + + sk_validate = NULL; + if (sk) { + if (sk_fullsock(sk)) + sk_validate = sk->sk_validate_xmit_skb; + else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT) + sk_validate = inet_twsk(sk)->tw_validate_xmit_skb; + } + + if (sk_validate) { + skb = sk_validate(sk, dev, skb); + } else if (unlikely(skb_is_decrypted(skb))) { + pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); + kfree_skb(skb); + skb = NULL; + } +#endif + + return skb; +} + static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb, struct net_device *dev) { @@ -4849,9 +4887,40 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table) return hash_32(hash, flow_table->log); } +#ifdef CONFIG_RFS_ACCEL +/** + * rps_flow_is_active - check whether the flow is recently active. + * @rflow: Specific flow to check activity. + * @flow_table: per-queue flowtable that @rflow belongs to. + * @cpu: CPU saved in @rflow. + * + * If the CPU has processed many packets since the flow's last activity + * (beyond 10 times the table size), the flow is considered stale. + * + * Return: true if flow was recently active. + */ +static bool rps_flow_is_active(struct rps_dev_flow *rflow, + struct rps_dev_flow_table *flow_table, + unsigned int cpu) +{ + unsigned int flow_last_active; + unsigned int sd_input_head; + + if (cpu >= nr_cpu_ids) + return false; + + sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head); + flow_last_active = READ_ONCE(rflow->last_qtail); + + return (int)(sd_input_head - flow_last_active) < + (int)(10 << flow_table->log); +} +#endif + static struct rps_dev_flow * set_rps_cpu(struct net_device *dev, struct sk_buff *skb, - struct rps_dev_flow *rflow, u16 next_cpu) + struct rps_dev_flow *rflow, u16 next_cpu, u32 hash, + u32 flow_id) { if (next_cpu < nr_cpu_ids) { u32 head; @@ -4859,8 +4928,9 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct netdev_rx_queue *rxqueue; struct rps_dev_flow_table *flow_table; struct rps_dev_flow *old_rflow; + struct rps_dev_flow *tmp_rflow; + unsigned int tmp_cpu; u16 rxq_index; - u32 flow_id; int rc; /* Should we steer this flow to a different hardware queue? */ @@ -4875,14 +4945,29 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb, flow_table = rcu_dereference(rxqueue->rps_flow_table); if (!flow_table) goto out; - flow_id = rfs_slot(skb_get_hash(skb), flow_table); + + tmp_rflow = &flow_table->flows[flow_id]; + tmp_cpu = READ_ONCE(tmp_rflow->cpu); + + if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) { + if (rps_flow_is_active(tmp_rflow, flow_table, + tmp_cpu)) { + if (hash != READ_ONCE(tmp_rflow->hash) || + next_cpu == tmp_cpu) + goto out; + } + } + rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb, rxq_index, flow_id); if (rc < 0) goto out; + old_rflow = rflow; - rflow = &flow_table->flows[flow_id]; + rflow = tmp_rflow; WRITE_ONCE(rflow->filter, rc); + WRITE_ONCE(rflow->hash, hash); + if (old_rflow->filter == rc) WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER); out: @@ -4908,6 +4993,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow_table *flow_table; struct rps_map *map; int cpu = -1; + u32 flow_id; u32 tcpu; u32 hash; @@ -4954,7 +5040,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* OK, now we know there is a match, * we can look at the local (per receive queue) flow table */ - rflow = &flow_table->flows[rfs_slot(hash, flow_table)]; + flow_id = rfs_slot(hash, flow_table); + rflow = &flow_table->flows[flow_id]; tcpu = rflow->cpu; /* @@ -4973,7 +5060,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, ((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) - rflow->last_qtail)) >= 0)) { tcpu = next_cpu; - rflow = set_rps_cpu(dev, skb, rflow, next_cpu); + rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash, + flow_id); } if (tcpu < nr_cpu_ids && cpu_online(tcpu)) { @@ -5017,17 +5105,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index, struct rps_dev_flow_table *flow_table; struct rps_dev_flow *rflow; bool expire = true; - unsigned int cpu; rcu_read_lock(); flow_table = rcu_dereference(rxqueue->rps_flow_table); if (flow_table && flow_id < (1UL << flow_table->log)) { + unsigned int cpu; + rflow = &flow_table->flows[flow_id]; cpu = READ_ONCE(rflow->cpu); - if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids && - ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) - - READ_ONCE(rflow->last_qtail)) < - (int)(10 << flow_table->log))) + if (READ_ONCE(rflow->filter) == filter_id && + rps_flow_is_active(rflow, flow_table, cpu)) expire = false; } rcu_read_unlock(); @@ -5093,8 +5180,9 @@ static void napi_schedule_rps(struct softnet_data *sd) __napi_schedule_irqoff(&mysd->backlog); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu) +void kick_defer_list_purge(unsigned int cpu) { + struct softnet_data *sd = &per_cpu(softnet_data, cpu); unsigned long flags; if (use_backlog_threads()) { @@ -5199,7 +5287,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, backlog_unlock_irq_restore(sd, &flags); cpu_backlog_drop: - atomic_inc(&sd->dropped); + numa_drop_add(&sd->drop_counters, 1); bad_dev: dev_core_stats_rx_dropped_inc(skb->dev); kfree_skb_reason(skb, reason); @@ -6628,24 +6716,24 @@ bool napi_complete_done(struct napi_struct *n, int work_done) } EXPORT_SYMBOL(napi_complete_done); -static void skb_defer_free_flush(struct softnet_data *sd) +static void skb_defer_free_flush(void) { + struct llist_node *free_list; struct sk_buff *skb, *next; + struct skb_defer_node *sdn; + int node; - /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */ - if (!READ_ONCE(sd->defer_list)) - return; + for_each_node(node) { + sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node; - spin_lock(&sd->defer_lock); - skb = sd->defer_list; - sd->defer_list = NULL; - sd->defer_count = 0; - spin_unlock(&sd->defer_lock); + if (llist_empty(&sdn->defer_list)) + continue; + atomic_long_set(&sdn->defer_count, 0); + free_list = llist_del_all(&sdn->defer_list); - while (skb != NULL) { - next = skb->next; - napi_consume_skb(skb, 1); - skb = next; + llist_for_each_entry_safe(skb, next, free_list, ll_node) { + napi_consume_skb(skb, 1); + } } } @@ -6773,7 +6861,7 @@ count: if (work > 0) __NET_ADD_STATS(dev_net(napi->dev), LINUX_MIB_BUSYPOLLRXPACKETS, work); - skb_defer_free_flush(this_cpu_ptr(&softnet_data)); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); @@ -7632,7 +7720,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi) local_irq_disable(); net_rps_action_and_irq_enable(sd); } - skb_defer_free_flush(sd); + skb_defer_free_flush(); bpf_net_ctx_clear(bpf_net_ctx); local_bh_enable(); @@ -7674,7 +7762,7 @@ start: for (;;) { struct napi_struct *n; - skb_defer_free_flush(sd); + skb_defer_free_flush(); if (list_empty(&list)) { if (list_empty(&repoll)) { @@ -12908,7 +12996,6 @@ static int __init net_dev_init(void) sd->cpu = i; #endif INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd); - spin_lock_init(&sd->defer_lock); gro_init(&sd->backlog.gro); sd->backlog.poll = process_backlog; @@ -12918,6 +13005,11 @@ static int __init net_dev_init(void) if (net_page_pool_create(i)) goto out; } + net_hotdata.skb_defer_nodes = + __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids, + __alignof__(struct skb_defer_node)); + if (!net_hotdata.skb_defer_nodes) + goto out; if (use_backlog_threads()) smpboot_register_percpu_thread(&backlog_threads); diff --git a/net/core/dev.h b/net/core/dev.h index d6b08d435479..900880e8b5b4 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -357,7 +357,7 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi) WARN_ON(READ_ONCE(napi->list_owner) != -1); } -void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu); +void kick_defer_list_purge(unsigned int cpu); #define XMIT_RECURSION_LIMIT 8 diff --git a/net/core/devmem.c b/net/core/devmem.c index 24c591ab38ae..d9de31a6cc7f 100644 --- a/net/core/devmem.c +++ b/net/core/devmem.c @@ -176,6 +176,7 @@ err_close_rxq: struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, enum dma_data_direction direction, unsigned int dmabuf_fd, struct netdev_nl_sock *priv, struct netlink_ext_ack *extack) @@ -188,6 +189,11 @@ net_devmem_bind_dmabuf(struct net_device *dev, unsigned long virtual; int err; + if (!dma_dev) { + NL_SET_ERR_MSG(extack, "Device doesn't support DMA"); + return ERR_PTR(-EOPNOTSUPP); + } + dmabuf = dma_buf_get(dmabuf_fd); if (IS_ERR(dmabuf)) return ERR_CAST(dmabuf); @@ -209,7 +215,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, binding->dmabuf = dmabuf; binding->direction = direction; - binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent); + binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev); if (IS_ERR(binding->attachment)) { err = PTR_ERR(binding->attachment); NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); diff --git a/net/core/devmem.h b/net/core/devmem.h index 41cd6e1c9141..101150d761af 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -85,6 +85,7 @@ struct dmabuf_genpool_chunk_owner { void __net_devmem_dmabuf_binding_free(struct work_struct *wq); struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, enum dma_data_direction direction, unsigned int dmabuf_fd, struct netdev_nl_sock *priv, struct netlink_ext_ack *extack); @@ -170,6 +171,7 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov) static inline struct net_devmem_dmabuf_binding * net_devmem_bind_dmabuf(struct net_device *dev, + struct device *dma_dev, enum dma_data_direction direction, unsigned int dmabuf_fd, struct netdev_nl_sock *priv, diff --git a/net/core/dst.c b/net/core/dst.c index e2de8b68c41d..e9d35f49c9e7 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -150,7 +150,7 @@ void dst_dev_put(struct dst_entry *dst) dst->ops->ifdown(dst, dev); WRITE_ONCE(dst->input, dst_discard); WRITE_ONCE(dst->output, dst_discard_out); - WRITE_ONCE(dst->dev, blackhole_netdev); + rcu_assign_pointer(dst->dev_rcu, blackhole_netdev); netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker, GFP_ATOMIC); } diff --git a/net/core/filter.c b/net/core/filter.c index 2af0a5f1d748..5d1838ff1ab9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2373,7 +2373,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, struct flowi4 fl4 = { .flowi4_flags = FLOWI_FLAG_ANYSRC, .flowi4_mark = skb->mark, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)), + .flowi4_dscp = ip4h_dscp(ip4h), .flowi4_oif = dev->ifindex, .flowi4_proto = ip4h->protocol, .daddr = ip4h->daddr, @@ -6028,7 +6028,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params, fl4.flowi4_iif = params->ifindex; fl4.flowi4_oif = 0; } - fl4.flowi4_tos = params->tos & INET_DSCP_MASK; + fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; @@ -6775,7 +6775,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; bool refcounted = false; struct sock *sk = NULL; @@ -6784,7 +6783,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, hinfo, NULL, 0, + sk = __inet_lookup(net, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); @@ -6798,7 +6797,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, hinfo, NULL, 0, + sk = __inet6_lookup(net, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); diff --git a/net/core/gro.c b/net/core/gro.c index b350e5b69549..5ba4504cfd28 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later +#include <net/psp.h> #include <net/gro.h> #include <net/dst_metadata.h> #include <net/busy_poll.h> @@ -376,6 +377,7 @@ static void gro_list_prepare(const struct list_head *head, diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb); diffs |= gro_list_prepare_tc_ext(skb, p, diffs); + diffs |= __psp_skb_coalesce_diff(skb, p, diffs); } NAPI_GRO_CB(p)->same_flow = !diffs; diff --git a/net/core/link_watch.c b/net/core/link_watch.c index 864f3bbc3a4c..212cde35affa 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -157,9 +157,9 @@ static void linkwatch_schedule_work(int urgent) * override the existing timer. */ if (test_bit(LW_URGENT, &linkwatch_flags)) - mod_delayed_work(system_unbound_wq, &linkwatch_work, 0); + mod_delayed_work(system_dfl_wq, &linkwatch_work, 0); else - queue_delayed_work(system_unbound_wq, &linkwatch_work, delay); + queue_delayed_work(system_dfl_wq, &linkwatch_work, delay); } diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index ae74634310a3..9f40be0c3e71 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -8,12 +8,12 @@ #include <linux/skbuff.h> #include <linux/types.h> #include <linux/bpf.h> +#include <net/flow.h> #include <net/lwtunnel.h> #include <net/gre.h> #include <net/ip.h> #include <net/ip6_route.h> #include <net/ipv6_stubs.h> -#include <net/inet_dscp.h> struct bpf_lwt_prog { struct bpf_prog *prog; @@ -209,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb) fl4.flowi4_oif = oif; fl4.flowi4_mark = skb->mark; fl4.flowi4_uid = sock_net_uid(net, sk); - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_flags = FLOWI_FLAG_ANYSRC; fl4.flowi4_proto = iph->protocol; fl4.daddr = iph->daddr; diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 4f0f0709a1cb..70e0e9a3b650 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -145,7 +145,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x " "%08x %08x\n", - READ_ONCE(sd->processed), atomic_read(&sd->dropped), + READ_ONCE(sd->processed), + numa_drop_read(&sd->drop_counters), READ_ONCE(sd->time_squeeze), 0, 0, 0, 0, 0, /* was fastroute */ 0, /* was cpu_collision */ diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 3c2dc4c5e683..ca878525ad7c 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1120,8 +1120,10 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, return -ENOMEM; table->log = ilog2(mask) + 1; - for (count = 0; count <= mask; count++) + for (count = 0; count <= mask; count++) { table->flows[count].cpu = RPS_NO_CPU; + table->flows[count].filter = RPS_NO_FILTER; + } } else { table = NULL; } diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 6314eb7bdf69..470fabbeacd9 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -869,16 +869,79 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb, return err; } -int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) +static int netdev_nl_read_rxq_bitmap(struct genl_info *info, + u32 rxq_bitmap_len, + unsigned long *rxq_bitmap) { + const int maxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1; struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)]; + struct nlattr *attr; + int rem, err = 0; + u32 rxq_idx; + + nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, + genlmsg_data(info->genlhdr), + genlmsg_len(info->genlhdr), rem) { + err = nla_parse_nested(tb, maxtype, attr, + netdev_queue_id_nl_policy, info->extack); + if (err < 0) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || + NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) + return -EINVAL; + + if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { + NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); + return -EINVAL; + } + + rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); + if (rxq_idx >= rxq_bitmap_len) { + NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]); + return -EINVAL; + } + + bitmap_set(rxq_bitmap, rxq_idx, 1); + } + + return 0; +} + +static struct device * +netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap, + struct netlink_ext_ack *extack) +{ + struct device *dma_dev = NULL; + u32 rxq_idx, prev_rxq_idx; + + for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { + struct device *rxq_dma_dev; + + rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx); + if (dma_dev && rxq_dma_dev != dma_dev) { + NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)", + rxq_idx, prev_rxq_idx); + return ERR_PTR(-EOPNOTSUPP); + } + + dma_dev = rxq_dma_dev; + prev_rxq_idx = rxq_idx; + } + + return dma_dev; +} + +int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) +{ struct net_devmem_dmabuf_binding *binding; u32 ifindex, dmabuf_fd, rxq_idx; struct netdev_nl_sock *priv; struct net_device *netdev; + unsigned long *rxq_bitmap; + struct device *dma_dev; struct sk_buff *rsp; - struct nlattr *attr; - int rem, err = 0; + int err = 0; void *hdr; if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) || @@ -921,36 +984,31 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock; } - binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd, - priv, info->extack); - if (IS_ERR(binding)) { - err = PTR_ERR(binding); + rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL); + if (!rxq_bitmap) { + err = -ENOMEM; goto err_unlock; } - nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES, - genlmsg_data(info->genlhdr), - genlmsg_len(info->genlhdr), rem) { - err = nla_parse_nested( - tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr, - netdev_queue_id_nl_policy, info->extack); - if (err < 0) - goto err_unbind; - - if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) || - NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) { - err = -EINVAL; - goto err_unbind; - } + err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues, + rxq_bitmap); + if (err) + goto err_rxq_bitmap; - if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) { - NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]); - err = -EINVAL; - goto err_unbind; - } + dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack); + if (IS_ERR(dma_dev)) { + err = PTR_ERR(dma_dev); + goto err_rxq_bitmap; + } - rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]); + binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE, + dmabuf_fd, priv, info->extack); + if (IS_ERR(binding)) { + err = PTR_ERR(binding); + goto err_rxq_bitmap; + } + for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) { err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding, info->extack); if (err) @@ -964,6 +1022,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) if (err) goto err_unbind; + bitmap_free(rxq_bitmap); + netdev_unlock(netdev); mutex_unlock(&priv->lock); @@ -972,6 +1032,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) err_unbind: net_devmem_unbind_dmabuf(binding); +err_rxq_bitmap: + bitmap_free(rxq_bitmap); err_unlock: netdev_unlock(netdev); err_unlock_sock: @@ -986,6 +1048,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) struct net_devmem_dmabuf_binding *binding; struct netdev_nl_sock *priv; struct net_device *netdev; + struct device *dma_dev; u32 ifindex, dmabuf_fd; struct sk_buff *rsp; int err = 0; @@ -1032,8 +1095,9 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info) goto err_unlock_netdev; } - binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv, - info->extack); + dma_dev = netdev_queue_get_dma_dev(netdev, 0); + binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE, + dmabuf_fd, priv, info->extack); if (IS_ERR(binding)) { err = PTR_ERR(binding); goto err_unlock_netdev; diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c new file mode 100644 index 000000000000..251f27a8307f --- /dev/null +++ b/net/core/netdev_queues.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#include <net/netdev_queues.h> + +/** + * netdev_queue_get_dma_dev() - get dma device for zero-copy operations + * @dev: net_device + * @idx: queue index + * + * Get dma device for zero-copy operations to be used for this queue. + * When such device is not available or valid, the function will return NULL. + * + * Return: Device or NULL on error + */ +struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx) +{ + const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops; + struct device *dma_dev; + + if (queue_ops && queue_ops->ndo_queue_get_dma_dev) + dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx); + else + dma_dev = dev->dev.parent; + + return dma_dev && dma_dev->dma_mask ? dma_dev : NULL; +} + diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index 3bf1151d8061..c7d9341b7630 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -9,6 +9,15 @@ #include "page_pool_priv.h" +/* See also page_pool_is_unreadable() */ +bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) +{ + struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx); + + return !!rxq->mp_params.mp_ops; +} +EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); + int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) { struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 5f65b62346d4..60a05d3b7c24 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -591,7 +591,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev) np->dev = ndev; strscpy(np->dev_name, ndev->name, IFNAMSIZ); - npinfo->netpoll = np; /* fill up the skb queue */ refill_skbs(np); @@ -835,7 +834,7 @@ void __netpoll_free(struct netpoll *np) ASSERT_RTNL(); /* Wait for transmitting packets to finish before freeing. */ - synchronize_rcu(); + synchronize_net(); __netpoll_cleanup(np); kfree(np); } diff --git a/net/core/page_pool.c b/net/core/page_pool.c index ba70569bd4b0..492728f9e021 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -211,11 +211,7 @@ static int page_pool_init(struct page_pool *pool, return -EINVAL; if (pool->p.pool_size) - ring_qsize = pool->p.pool_size; - - /* Sanity limit mem that can be pinned down */ - if (ring_qsize > 32768) - return -E2BIG; + ring_qsize = min(pool->p.pool_size, 16384); /* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL. * DMA_BIDIRECTIONAL is for allowing page used for DMA sending, @@ -555,6 +551,12 @@ static noinline netmem_ref __page_pool_alloc_netmems_slow(struct page_pool *pool netmem_ref netmem; int i, nr_pages; + /* Unconditionally set NOWARN if allocating from NAPI. + * Drivers forget to set it, and OOM reports on packet Rx are useless. + */ + if ((gfp & GFP_ATOMIC) == GFP_ATOMIC) + gfp |= __GFP_NOWARN; + /* Don't support bulk alloc for high-order pages */ if (unlikely(pp_order)) return page_to_netmem(__page_pool_alloc_page_order(pool, gfp)); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 0ebe5461d4d9..d41b03fd1f63 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -114,6 +114,7 @@ #include <linux/sys.h> #include <linux/types.h> +#include <linux/minmax.h> #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/kernel.h> @@ -2841,8 +2842,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, } i = 0; - frag_len = (datalen/frags) < PAGE_SIZE ? - (datalen/frags) : PAGE_SIZE; + frag_len = min_t(int, datalen / frags, PAGE_SIZE); while (datalen > 0) { if (unlikely(!pkt_dev->page)) { int node = numa_node_id(); @@ -2859,8 +2859,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb, if (i == (frags - 1)) skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], pkt_dev->page, 0, - (datalen < PAGE_SIZE ? - datalen : PAGE_SIZE)); + min(datalen, PAGE_SIZE)); else skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i], pkt_dev->page, 0, frag_len); diff --git a/net/core/request_sock.c b/net/core/request_sock.c index 63de5c635842..897a8f01a67b 100644 --- a/net/core/request_sock.c +++ b/net/core/request_sock.c @@ -77,9 +77,7 @@ void reqsk_queue_alloc(struct request_sock_queue *queue) * a simple spin lock - one must consider sock_owned_by_user() and arrange * to use sk_add_backlog() stuff. But what really makes it infeasible is the * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to - * acquire a child's lock while holding listener's socket lock. A corner - * case might also exist in tcp_v4_hnd_req() that will trigger this locking - * order. + * acquire a child's lock while holding listener's socket lock. * * This function also sets "treq->tfo_listener" to false. * treq->tfo_listener is used by the listener so it is protected by the diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 094b085cff20..8040ff7c356e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -478,7 +478,7 @@ static int rtnl_unregister(int protocol, int msgtype) * rtnl_unregister_all - Unregister all rtnetlink message type of a protocol * @protocol : Protocol family or PF_UNSPEC * - * Identical to calling rtnl_unregster() for all registered message types + * Identical to calling rtnl_unregister() for all registered message types * of a certain protocol family. */ void rtnl_unregister_all(int protocol) @@ -1326,6 +1326,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_devlink_port_size(dev) + rtnl_dpll_pin_size(dev) + nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */ + + nla_total_size(2) /* IFLA_HEADROOM */ + + nla_total_size(2) /* IFLA_TAILROOM */ + 0; } @@ -2091,7 +2093,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, atomic_read(&dev->carrier_up_count)) || nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, - atomic_read(&dev->carrier_down_count))) + atomic_read(&dev->carrier_down_count)) || + nla_put_u16(skb, IFLA_HEADROOM, + READ_ONCE(dev->needed_headroom)) || + nla_put_u16(skb, IFLA_TAILROOM, + READ_ONCE(dev->needed_tailroom))) goto nla_put_failure; if (rtnl_fill_proto_down(skb, dev)) @@ -2243,6 +2249,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1), [IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT }, + [IFLA_HEADROOM] = { .type = NLA_REJECT }, + [IFLA_TAILROOM] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { diff --git a/net/core/scm.c b/net/core/scm.c index 072d5742440a..66eaee783e8b 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -273,7 +273,9 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) check_object_size(data, cmlen - sizeof(*cm), true); - if (!user_write_access_begin(cm, cmlen)) + if (can_do_masked_user_access()) + cm = masked_user_access_begin(cm); + else if (!user_write_access_begin(cm, cmlen)) goto efault; unsafe_put_user(cmlen, &cm->cmsg_len, efault_end); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1c0279b9cb9f..bc12790017b0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -79,6 +79,7 @@ #include <net/mptcp.h> #include <net/mctp.h> #include <net/page_pool/helpers.h> +#include <net/psp/types.h> #include <net/dropreason.h> #include <linux/uaccess.h> @@ -3112,7 +3113,9 @@ static bool __splice_segment(struct page *page, unsigned int poff, poff += flen; plen -= flen; *len -= flen; - } while (*len && plen); + if (!*len) + return true; + } while (plen); return false; } @@ -5060,6 +5063,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_MCTP_FLOWS) [SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow), #endif +#if IS_ENABLED(CONFIG_INET_PSP) + [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -7042,6 +7048,7 @@ void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, skb->active_extensions = 1 << id; return skb_ext_get_ptr(ext, id); } +EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL"); /** * skb_ext_add - allocate space for given extension, COW if needed @@ -7178,8 +7185,9 @@ static void kfree_skb_napi_cache(struct sk_buff *skb) */ void skb_attempt_defer_free(struct sk_buff *skb) { + struct skb_defer_node *sdn; + unsigned long defer_count; int cpu = skb->alloc_cpu; - struct softnet_data *sd; unsigned int defer_max; bool kick; @@ -7193,27 +7201,24 @@ nodefer: kfree_skb_napi_cache(skb); DEBUG_NET_WARN_ON_ONCE(skb_dst(skb)); DEBUG_NET_WARN_ON_ONCE(skb->destructor); - sd = &per_cpu(softnet_data, cpu); + sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id(); + defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max); - if (READ_ONCE(sd->defer_count) >= defer_max) + defer_count = atomic_long_inc_return(&sdn->defer_count); + + if (defer_count >= defer_max) goto nodefer; - spin_lock_bh(&sd->defer_lock); - /* Send an IPI every time queue reaches half capacity. */ - kick = sd->defer_count == (defer_max >> 1); - /* Paired with the READ_ONCE() few lines above */ - WRITE_ONCE(sd->defer_count, sd->defer_count + 1); + llist_add(&skb->ll_node, &sdn->defer_list); - skb->next = sd->defer_list; - /* Paired with READ_ONCE() in skb_defer_free_flush() */ - WRITE_ONCE(sd->defer_list, skb); - spin_unlock_bh(&sd->defer_lock); + /* Send an IPI every time queue reaches half capacity. */ + kick = (defer_count - 1) == (defer_max >> 1); /* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU * if we are unlucky enough (this seems very unlikely). */ if (unlikely(kick)) - kick_defer_list_purge(sd, cpu); + kick_defer_list_purge(cpu); } static void skb_splice_csum_page(struct sk_buff *skb, struct page *page, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 83c78379932e..2ac7731e1e0a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -876,7 +876,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop(psock); INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); - queue_rcu_work(system_wq, &psock->rwork); + queue_rcu_work(system_percpu_wq, &psock->rwork); } EXPORT_SYMBOL_GPL(sk_psock_drop); diff --git a/net/core/sock.c b/net/core/sock.c index 158bddd23134..dc03d4b5909a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -281,12 +281,12 @@ static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Run time adjustable parameters. */ -__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; +__u32 sysctl_wmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_wmem_max); -__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; +__u32 sysctl_rmem_max __read_mostly = 4 << 20; EXPORT_SYMBOL(sysctl_rmem_max); -__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; -__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; +__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT; +__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT; DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); EXPORT_SYMBOL_GPL(memalloc_socks_key); @@ -491,13 +491,13 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) struct sk_buff_head *list = &sk->sk_receive_queue; if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); trace_sock_rcvqueue_full(sk, skb); return -ENOMEM; } if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); return -ENOBUFS; } @@ -562,7 +562,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, skb->dev = NULL; if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); reason = SKB_DROP_REASON_SOCKET_RCVBUFF; goto discard_and_relse; } @@ -585,7 +585,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, reason = SKB_DROP_REASON_PFMEMALLOC; if (err == -ENOBUFS) reason = SKB_DROP_REASON_SOCKET_BACKLOG; - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); goto discard_and_relse; } @@ -1032,7 +1032,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) bool charged; int pages; - if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk)) + if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk)) return -EOPNOTSUPP; if (!bytes) @@ -1041,8 +1041,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes) pages = sk_mem_pages(bytes); /* pre-charge to memcg */ - charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages, - GFP_KERNEL | __GFP_RETRY_MAYFAIL); + charged = mem_cgroup_sk_charge(sk, pages, + GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!charged) return -ENOMEM; @@ -1054,7 +1054,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes) */ if (allocated > sk_prot_mem_limits(sk, 1)) { sk_memory_allocated_sub(sk, pages); - mem_cgroup_uncharge_skmem(sk->sk_memcg, pages); + mem_cgroup_sk_uncharge(sk, pages); return -ENOMEM; } sk_forward_alloc_add(sk, pages << PAGE_SHIFT); @@ -2505,15 +2505,18 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_wmem_queued = 0; newsk->sk_forward_alloc = 0; newsk->sk_reserved_mem = 0; - atomic_set(&newsk->sk_drops, 0); + DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); + sk_drops_reset(newsk); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); +#ifdef CONFIG_MEMCG /* sk->sk_memcg will be populated at accept() time */ newsk->sk_memcg = NULL; +#endif cgroup_sk_clone(&newsk->sk_cgrp_data); @@ -2584,7 +2587,7 @@ free: } EXPORT_SYMBOL_GPL(sk_clone_lock); -static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) +static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) { bool is_ipv6 = false; u32 max_size; @@ -2594,8 +2597,8 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); #endif /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ - max_size = is_ipv6 ? READ_ONCE(dst_dev(dst)->gso_max_size) : - READ_ONCE(dst_dev(dst)->gso_ipv4_max_size); + max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) : + READ_ONCE(dev->gso_ipv4_max_size); if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) max_size = GSO_LEGACY_MAX_SIZE; @@ -2604,9 +2607,12 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst) void sk_setup_caps(struct sock *sk, struct dst_entry *dst) { + const struct net_device *dev; u32 max_segs = 1; - sk->sk_route_caps = dst_dev(dst)->features; + rcu_read_lock(); + dev = dst_dev_rcu(dst); + sk->sk_route_caps = dev->features; if (sk_is_tcp(sk)) { struct inet_connection_sock *icsk = inet_csk(sk); @@ -2622,13 +2628,14 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst) sk->sk_route_caps &= ~NETIF_F_GSO_MASK; } else { sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst); + sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ - max_segs = max_t(u32, READ_ONCE(dst_dev(dst)->gso_max_segs), 1); + max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); } } sk->sk_gso_max_segs = max_segs; sk_dst_set(sk, dst); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(sk_setup_caps); @@ -3158,23 +3165,27 @@ void __release_sock(struct sock *sk) __acquires(&sk->sk_lock.slock) { struct sk_buff *skb, *next; + int nb = 0; while ((skb = sk->sk_backlog.head) != NULL) { sk->sk_backlog.head = sk->sk_backlog.tail = NULL; spin_unlock_bh(&sk->sk_lock.slock); - do { + while (1) { next = skb->next; prefetch(next); DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); skb_mark_not_on_list(skb); sk_backlog_rcv(sk, skb); - cond_resched(); - skb = next; - } while (skb != NULL); + if (!skb) + break; + + if (!(++nb & 15)) + cond_resched(); + } spin_lock_bh(&sk->sk_lock.slock); } @@ -3241,16 +3252,16 @@ EXPORT_SYMBOL(sk_wait_data); */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) { - struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL; + bool memcg_enabled = false, charged = false; struct proto *prot = sk->sk_prot; - bool charged = true; long allocated; sk_memory_allocated_add(sk, amt); allocated = sk_memory_allocated(sk); - if (memcg) { - charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()); + if (mem_cgroup_sk_enabled(sk)) { + memcg_enabled = true; + charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge()); if (!charged) goto suppress_allocation; } @@ -3324,21 +3335,19 @@ suppress_allocation: */ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { /* Force charge with __GFP_NOFAIL */ - if (memcg && !charged) { - mem_cgroup_charge_skmem(memcg, amt, - gfp_memcg_charge() | __GFP_NOFAIL); - } + if (memcg_enabled && !charged) + mem_cgroup_sk_charge(sk, amt, + gfp_memcg_charge() | __GFP_NOFAIL); return 1; } } - if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) - trace_sock_exceed_buf_limit(sk, prot, allocated, kind); + trace_sock_exceed_buf_limit(sk, prot, allocated, kind); sk_memory_allocated_sub(sk, amt); - if (memcg && charged) - mem_cgroup_uncharge_skmem(memcg, amt); + if (charged) + mem_cgroup_sk_uncharge(sk, amt); return 0; } @@ -3376,8 +3385,8 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount) { sk_memory_allocated_sub(sk, amount); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) - mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); + if (mem_cgroup_sk_enabled(sk)) + mem_cgroup_sk_uncharge(sk, amount); if (sk_under_global_memory_pressure(sk) && (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) @@ -3691,7 +3700,7 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) */ smp_wmb(); refcount_set(&sk->sk_refcnt, 1); - atomic_set(&sk->sk_drops, 0); + sk_drops_reset(sk); } EXPORT_SYMBOL(sock_init_data_uid); @@ -3951,7 +3960,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); } #ifdef CONFIG_PROC_FS @@ -4432,7 +4441,9 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); +#ifdef CONFIG_MEMCG CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); +#endif CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); @@ -4441,7 +4452,7 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); - CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); @@ -4460,12 +4471,15 @@ static int __init sock_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); + CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index b23594c767f2..026ce9bd9e5e 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -348,7 +348,7 @@ static struct pernet_operations diag_net_ops = { static int __init sock_diag_init(void) { - broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0); + broadcast_wq = alloc_workqueue("sock_diag_events", WQ_PERCPU, 0); BUG_ON(!broadcast_wq); return register_pernet_subsys(&diag_net_ops); } diff --git a/net/core/xdp.c b/net/core/xdp.c index 491334b9b8be..9100e160113a 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -663,9 +663,8 @@ struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp) u32 tsize; tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz; - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, tsize, - xdp_buff_is_frag_pfmemalloc(xdp)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + tsize, xdp_buff_get_skb_flags(xdp)); } skb->protocol = eth_type_trans(skb, rxq->dev); @@ -692,7 +691,7 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, struct skb_shared_info *sinfo = skb_shinfo(skb); const struct skb_shared_info *xinfo; u32 nr_frags, tsize = 0; - bool pfmemalloc = false; + u32 flags = 0; xinfo = xdp_get_shared_info_from_buff(xdp); nr_frags = xinfo->nr_frags; @@ -714,11 +713,12 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb, __skb_fill_page_desc_noacc(sinfo, i, page, offset, len); tsize += truesize; - pfmemalloc |= page_is_pfmemalloc(page); + if (page_is_pfmemalloc(page)) + flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC; } - xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size, - tsize, pfmemalloc); + xdp_update_skb_frags_info(skb, nr_frags, xinfo->xdp_frags_size, tsize, + flags); return true; } @@ -823,10 +823,9 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, skb_metadata_set(skb, xdpf->metasize); if (unlikely(xdp_frame_has_frags(xdpf))) - xdp_update_skb_shared_info(skb, nr_frags, - sinfo->xdp_frags_size, - nr_frags * xdpf->frame_sz, - xdp_frame_is_frag_pfmemalloc(xdpf)); + xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size, + nr_frags * xdpf->frame_sz, + xdp_frame_get_skb_flags(xdpf)); /* Essential SKB info: protocol and skb->dev */ skb->protocol = eth_type_trans(skb, dev); diff --git a/net/devlink/core.c b/net/devlink/core.c index 7203c39532fc..58093f49c090 100644 --- a/net/devlink/core.c +++ b/net/devlink/core.c @@ -320,7 +320,7 @@ static void devlink_release(struct work_struct *work) void devlink_put(struct devlink *devlink) { if (refcount_dec_and_test(&devlink->refcount)) - queue_rcu_work(system_wq, &devlink->rwork); + queue_rcu_work(system_percpu_wq, &devlink->rwork); } struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp) diff --git a/net/devlink/health.c b/net/devlink/health.c index b3ce8ecbb7fb..136a67c36a20 100644 --- a/net/devlink/health.c +++ b/net/devlink/health.c @@ -60,6 +60,7 @@ struct devlink_health_reporter { struct devlink_port *devlink_port; struct devlink_fmsg *dump_fmsg; u64 graceful_period; + u64 burst_period; bool auto_recover; bool auto_dump; u8 health_state; @@ -108,11 +109,14 @@ devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port, static struct devlink_health_reporter * __devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; - if (WARN_ON(graceful_period && !ops->recover)) + if (WARN_ON(ops->default_graceful_period && !ops->recover)) + return ERR_PTR(-EINVAL); + + if (WARN_ON(ops->default_burst_period && !ops->default_graceful_period)) return ERR_PTR(-EINVAL); reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); @@ -122,7 +126,8 @@ __devlink_health_reporter_create(struct devlink *devlink, reporter->priv = priv; reporter->ops = ops; reporter->devlink = devlink; - reporter->graceful_period = graceful_period; + reporter->graceful_period = ops->default_graceful_period; + reporter->burst_period = ops->default_burst_period; reporter->auto_recover = !!ops->recover; reporter->auto_dump = !!ops->dump; return reporter; @@ -134,13 +139,12 @@ __devlink_health_reporter_create(struct devlink *devlink, * * @port: devlink_port to which health reports will relate * @ops: devlink health reporter ops - * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; @@ -150,8 +154,7 @@ devl_port_health_reporter_create(struct devlink_port *port, ops->name)) return ERR_PTR(-EEXIST); - reporter = __devlink_health_reporter_create(port->devlink, ops, - graceful_period, priv); + reporter = __devlink_health_reporter_create(port->devlink, ops, priv); if (IS_ERR(reporter)) return reporter; @@ -164,14 +167,13 @@ EXPORT_SYMBOL_GPL(devl_port_health_reporter_create); struct devlink_health_reporter * devlink_port_health_reporter_create(struct devlink_port *port, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; struct devlink *devlink = port->devlink; devl_lock(devlink); - reporter = devl_port_health_reporter_create(port, ops, - graceful_period, priv); + reporter = devl_port_health_reporter_create(port, ops, priv); devl_unlock(devlink); return reporter; } @@ -182,13 +184,12 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create); * * @devlink: devlink instance which the health reports will relate * @ops: devlink health reporter ops - * @graceful_period: min time (in msec) between recovery attempts * @priv: driver priv pointer */ struct devlink_health_reporter * devl_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; @@ -197,8 +198,7 @@ devl_health_reporter_create(struct devlink *devlink, if (devlink_health_reporter_find_by_name(devlink, ops->name)) return ERR_PTR(-EEXIST); - reporter = __devlink_health_reporter_create(devlink, ops, - graceful_period, priv); + reporter = __devlink_health_reporter_create(devlink, ops, priv); if (IS_ERR(reporter)) return reporter; @@ -210,13 +210,12 @@ EXPORT_SYMBOL_GPL(devl_health_reporter_create); struct devlink_health_reporter * devlink_health_reporter_create(struct devlink *devlink, const struct devlink_health_reporter_ops *ops, - u64 graceful_period, void *priv) + void *priv) { struct devlink_health_reporter *reporter; devl_lock(devlink); - reporter = devl_health_reporter_create(devlink, ops, - graceful_period, priv); + reporter = devl_health_reporter_create(devlink, ops, priv); devl_unlock(devlink); return reporter; } @@ -298,6 +297,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, reporter->graceful_period)) goto reporter_nest_cancel; if (reporter->ops->recover && + devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, + reporter->burst_period)) + goto reporter_nest_cancel; + if (reporter->ops->recover && nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, reporter->auto_recover)) goto reporter_nest_cancel; @@ -462,16 +465,33 @@ int devlink_nl_health_reporter_set_doit(struct sk_buff *skb, if (!reporter->ops->recover && (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] || + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD])) return -EOPNOTSUPP; if (!reporter->ops->dump && info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP]) return -EOPNOTSUPP; - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) { reporter->graceful_period = nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); + if (!reporter->graceful_period) + reporter->burst_period = 0; + } + + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]) { + u64 burst_period = + nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]); + + if (!reporter->graceful_period && burst_period) { + NL_SET_ERR_MSG_MOD(info->extack, + "Cannot set burst period without a grace period."); + return -EINVAL; + } + + reporter->burst_period = burst_period; + } if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) reporter->auto_recover = @@ -514,11 +534,25 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter, devlink_nl_notify_send_desc(devlink, msg, &desc); } +static bool +devlink_health_reporter_in_burst(struct devlink_health_reporter *reporter) +{ + unsigned long burst_threshold = reporter->last_recovery_ts + + msecs_to_jiffies(reporter->burst_period); + + return time_is_after_jiffies(burst_threshold); +} + void devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter) { reporter->recovery_count++; - reporter->last_recovery_ts = jiffies; + if (!devlink_health_reporter_in_burst(reporter)) + /* When burst period is set, last_recovery_ts marks the first + * recovery within the burst period, not necessarily the last + * one. + */ + reporter->last_recovery_ts = jiffies; } EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done); @@ -592,12 +626,37 @@ dump_err: return err; } +static bool +devlink_health_recover_abort(struct devlink_health_reporter *reporter, + enum devlink_health_reporter_state prev_state) +{ + unsigned long recover_ts_threshold; + + if (!reporter->auto_recover) + return false; + + /* abort if the previous error wasn't recovered */ + if (prev_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY) + return true; + + if (devlink_health_reporter_in_burst(reporter)) + return false; + + recover_ts_threshold = reporter->last_recovery_ts + + msecs_to_jiffies(reporter->burst_period) + + msecs_to_jiffies(reporter->graceful_period); + if (reporter->last_recovery_ts && reporter->recovery_count && + time_is_after_jiffies(recover_ts_threshold)) + return true; + + return false; +} + int devlink_health_report(struct devlink_health_reporter *reporter, const char *msg, void *priv_ctx) { enum devlink_health_reporter_state prev_health_state; struct devlink *devlink = reporter->devlink; - unsigned long recover_ts_threshold; int ret; /* write a log message of the current error */ @@ -608,13 +667,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter, reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER); - /* abort if the previous error wasn't recovered */ - recover_ts_threshold = reporter->last_recovery_ts + - msecs_to_jiffies(reporter->graceful_period); - if (reporter->auto_recover && - (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY || - (reporter->last_recovery_ts && reporter->recovery_count && - time_is_after_jiffies(recover_ts_threshold)))) { + if (devlink_health_recover_abort(reporter, prev_health_state)) { trace_devlink_health_recover_aborted(devlink, reporter->ops->name, reporter->health_state, diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c index d97c326a9045..9fd00977d59e 100644 --- a/net/devlink/netlink_gen.c +++ b/net/devlink/netlink_gen.c @@ -389,7 +389,7 @@ static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLIN }; /* DEVLINK_CMD_HEALTH_REPORTER_SET - do */ -static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP + 1] = { +static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, }, [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, }, @@ -397,6 +397,7 @@ static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATT [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64, }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8, }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8, }, + [DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = { .type = NLA_U64, }, }; /* DEVLINK_CMD_HEALTH_REPORTER_RECOVER - do */ @@ -1032,7 +1033,7 @@ const struct genl_split_ops devlink_nl_ops[74] = { .doit = devlink_nl_health_reporter_set_doit, .post_doit = devlink_nl_post_doit, .policy = devlink_health_reporter_set_nl_policy, - .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP, + .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD, .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, }, { diff --git a/net/devlink/param.c b/net/devlink/param.c index 41dcc86cfd94..70e69523412c 100644 --- a/net/devlink/param.c +++ b/net/devlink/param.c @@ -102,6 +102,16 @@ static const struct devlink_param devlink_param_generic[] = { .name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME, .type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE, }, + { + .id = DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS, + .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME, + .type = DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE, + }, + { + .id = DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS, + .name = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME, + .type = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE, + }, }; static int devlink_param_generic_verify(const struct devlink_param *param) diff --git a/net/devlink/port.c b/net/devlink/port.c index cb8d4df61619..93d8a25bb920 100644 --- a/net/devlink/port.c +++ b/net/devlink/port.c @@ -1333,8 +1333,8 @@ int devlink_port_netdevice_event(struct notifier_block *nb, return NOTIFY_OK; } -static int __devlink_port_attrs_set(struct devlink_port *devlink_port, - enum devlink_port_flavour flavour) +static void __devlink_port_attrs_set(struct devlink_port *devlink_port, + enum devlink_port_flavour flavour) { struct devlink_port_attrs *attrs = &devlink_port->attrs; @@ -1347,7 +1347,6 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port, } else { devlink_port->switch_port = false; } - return 0; } /** @@ -1357,17 +1356,13 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port, * @attrs: devlink port attrs */ void devlink_port_attrs_set(struct devlink_port *devlink_port, - struct devlink_port_attrs *attrs) + const struct devlink_port_attrs *attrs) { - int ret; - ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + WARN_ON(attrs->splittable && attrs->split); devlink_port->attrs = *attrs; - ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); - if (ret) - return; - WARN_ON(attrs->splittable && attrs->split); + __devlink_port_attrs_set(devlink_port, attrs->flavour); } EXPORT_SYMBOL_GPL(devlink_port_attrs_set); @@ -1383,14 +1378,10 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro u16 pf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_PF); - if (ret) - return; + __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); attrs->pci_pf.controller = controller; attrs->pci_pf.pf = pf; attrs->pci_pf.external = external; @@ -1411,14 +1402,10 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro u16 pf, u16 vf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_VF); - if (ret) - return; + __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); attrs->pci_vf.controller = controller; attrs->pci_vf.pf = pf; attrs->pci_vf.vf = vf; @@ -1439,14 +1426,10 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; - int ret; ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); - ret = __devlink_port_attrs_set(devlink_port, - DEVLINK_PORT_FLAVOUR_PCI_SF); - if (ret) - return; + __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 4e3651101b86..43e211e611b1 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -613,7 +613,10 @@ EXPORT_SYMBOL(fwnode_get_mac_address); */ int device_get_mac_address(struct device *dev, char *addr) { - return fwnode_get_mac_address(dev_fwnode(dev), addr); + if (!fwnode_get_mac_address(dev_fwnode(dev), addr)) + return 0; + + return nvmem_get_mac_address(dev, addr); } EXPORT_SYMBOL(device_get_mac_address); diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index a1490c4afe6b..1e493553b977 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -8,5 +8,5 @@ ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \ - module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o mm.o \ + module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o \ phy.o tsconfig.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 92e6a681c797..55223ebc2a7e 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -577,6 +577,26 @@ int __ethtool_get_link(struct net_device *dev) return netif_running(dev) && dev->ethtool_ops->get_link(dev); } +int ethtool_get_rx_ring_count(struct net_device *dev) +{ + const struct ethtool_ops *ops = dev->ethtool_ops; + struct ethtool_rxnfc rx_rings = {}; + int ret; + + if (ops->get_rx_ring_count) + return ops->get_rx_ring_count(dev); + + if (!ops->get_rxnfc) + return -EOPNOTSUPP; + + rx_rings.cmd = ETHTOOL_GRXRINGS; + ret = ops->get_rxnfc(dev, &rx_rings, NULL); + if (ret < 0) + return ret; + + return rx_rings.data; +} + static int ethtool_get_rxnfc_rule_count(struct net_device *dev) { const struct ethtool_ops *ops = dev->ethtool_ops; diff --git a/net/ethtool/common.h b/net/ethtool/common.h index c4d084dde5bf..1609cf4e53eb 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -54,6 +54,8 @@ void ethtool_ringparam_get_cfg(struct net_device *dev, struct kernel_ethtool_ringparam *kparam, struct netlink_ext_ack *extack); +int ethtool_get_rx_ring_count(struct net_device *dev); + int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info); int ethtool_get_ts_info_by_phc(struct net_device *dev, struct kernel_ethtool_ts_info *info, diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c index e7d3f2c352a3..4669e74cbcaa 100644 --- a/net/ethtool/fec.c +++ b/net/ethtool/fec.c @@ -17,6 +17,7 @@ struct fec_reply_data { u64 stats[1 + ETHTOOL_MAX_LANES]; u8 cnt; } corr, uncorr, corr_bits; + struct ethtool_fec_hist fec_stat_hist; }; #define FEC_REPDATA(__reply_base) \ @@ -113,7 +114,10 @@ static int fec_prepare_data(const struct ethnl_req_info *req_base, struct ethtool_fec_stats stats; ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8); - dev->ethtool_ops->get_fec_stats(dev, &stats); + ethtool_stats_init((u64 *)data->fec_stat_hist.values, + sizeof(data->fec_stat_hist.values) / 8); + dev->ethtool_ops->get_fec_stats(dev, &stats, + &data->fec_stat_hist); fec_stats_recalc(&data->corr, &stats.corrected_blocks); fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks); @@ -157,13 +161,77 @@ static int fec_reply_size(const struct ethnl_req_info *req_base, len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ - if (req_base->flags & ETHTOOL_FLAG_STATS) + if (req_base->flags & ETHTOOL_FLAG_STATS) { len += 3 * nla_total_size_64bit(sizeof(u64) * (1 + ETHTOOL_MAX_LANES)); + /* add FEC bins information */ + len += (nla_total_size(0) + /* _A_FEC_HIST */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_LOW */ + nla_total_size(4) + /* _A_FEC_HIST_BIN_HI */ + /* _A_FEC_HIST_BIN_VAL + per-lane values */ + nla_total_size_64bit(sizeof(u64)) + + nla_total_size_64bit(sizeof(u64) * ETHTOOL_MAX_LANES)) * + ETHTOOL_FEC_HIST_MAX; + } return len; } +static int fec_put_hist(struct sk_buff *skb, + const struct ethtool_fec_hist *hist) +{ + const struct ethtool_fec_hist_range *ranges = hist->ranges; + const struct ethtool_fec_hist_value *values = hist->values; + struct nlattr *nest; + int i, j; + u64 sum; + + if (!ranges) + return 0; + + for (i = 0; i < ETHTOOL_FEC_HIST_MAX; i++) { + if (i && !ranges[i].low && !ranges[i].high) + break; + + if (WARN_ON_ONCE(values[i].sum == ETHTOOL_STAT_NOT_SET && + values[i].per_lane[0] == ETHTOOL_STAT_NOT_SET)) + break; + + nest = nla_nest_start(skb, ETHTOOL_A_FEC_STAT_HIST); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_LOW, + ranges[i].low) || + nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_HIGH, + ranges[i].high)) + goto err_cancel_hist; + sum = 0; + for (j = 0; j < ETHTOOL_MAX_LANES; j++) { + if (values[i].per_lane[j] == ETHTOOL_STAT_NOT_SET) + break; + sum += values[i].per_lane[j]; + } + if (nla_put_uint(skb, ETHTOOL_A_FEC_HIST_BIN_VAL, + values[i].sum == ETHTOOL_STAT_NOT_SET ? + sum : values[i].sum)) + goto err_cancel_hist; + if (j && nla_put_64bit(skb, ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE, + sizeof(u64) * j, + values[i].per_lane, + ETHTOOL_A_FEC_HIST_PAD)) + goto err_cancel_hist; + + nla_nest_end(skb, nest); + } + + return 0; + +err_cancel_hist: + nla_nest_cancel(skb, nest); + return -EMSGSIZE; +} + static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) { struct nlattr *nest; @@ -183,6 +251,9 @@ static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data) data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD)) goto err_cancel; + if (fec_put_hist(skb, &data->fec_stat_hist)) + goto err_cancel; + nla_nest_end(skb, nest); return 0; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 43a7854e784e..fa83ddade4f8 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -1014,6 +1014,28 @@ static bool flow_type_hashable(u32 flow_type) return false; } +static bool flow_type_v6(u32 flow_type) +{ + switch (flow_type) { + case TCP_V6_FLOW: + case UDP_V6_FLOW: + case SCTP_V6_FLOW: + case AH_ESP_V6_FLOW: + case AH_V6_FLOW: + case ESP_V6_FLOW: + case IPV6_FLOW: + case GTPU_V6_FLOW: + case GTPC_V6_FLOW: + case GTPC_TEID_V6_FLOW: + case GTPU_EH_V6_FLOW: + case GTPU_UL_V6_FLOW: + case GTPU_DL_V6_FLOW: + return true; + } + + return false; +} + /* When adding a new type, update the assert and, if it's hashable, add it to * the flow_type_hashable switch case. */ @@ -1077,6 +1099,9 @@ ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr) if (rc) return rc; + if (info.data & RXH_IP6_FL && !flow_type_v6(info.flow_type)) + return -EINVAL; + if (info.flow_type & FLOW_RSS && info.rss_context && !ops->rxfh_per_ctx_fields) return -EINVAL; @@ -1183,18 +1208,41 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev, return 0; } +static noinline_for_stack int ethtool_get_rxrings(struct net_device *dev, + u32 cmd, + void __user *useraddr) +{ + struct ethtool_rxnfc info; + size_t info_size; + int ret; + + info_size = sizeof(info); + ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); + if (ret) + return ret; + + ret = ethtool_get_rx_ring_count(dev); + if (ret < 0) + return ret; + + info.data = ret; + + return ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL); +} + static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev, u32 cmd, void __user *useraddr) { - struct ethtool_rxnfc info; - size_t info_size = sizeof(info); const struct ethtool_ops *ops = dev->ethtool_ops; - int ret; + struct ethtool_rxnfc info; void *rule_buf = NULL; + size_t info_size; + int ret; if (!ops->get_rxnfc) return -EOPNOTSUPP; + info_size = sizeof(info); ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr); if (ret) return ret; @@ -1221,8 +1269,8 @@ err_out: } static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, - struct ethtool_rxnfc *rx_rings, - u32 size) + int num_rx_rings, + u32 size) { int i; @@ -1231,7 +1279,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr, /* Validate ring indices */ for (i = 0; i < size; i++) - if (indir[i] >= rx_rings->data) + if (indir[i] >= num_rx_rings) return -EINVAL; return 0; @@ -1302,13 +1350,12 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, const struct ethtool_ops *ops = dev->ethtool_ops; struct ethtool_rxfh_param rxfh_dev = {}; struct netlink_ext_ack *extack = NULL; - struct ethtool_rxnfc rx_rings; + int num_rx_rings; u32 user_size, i; int ret; u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]); - if (!ops->get_rxfh_indir_size || !ops->set_rxfh || - !ops->get_rxnfc) + if (!ops->get_rxfh_indir_size || !ops->set_rxfh) return -EOPNOTSUPP; rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev); @@ -1328,20 +1375,21 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev, if (!rxfh_dev.indir) return -ENOMEM; - rx_rings.cmd = ETHTOOL_GRXRINGS; - ret = ops->get_rxnfc(dev, &rx_rings, NULL); - if (ret) + num_rx_rings = ethtool_get_rx_ring_count(dev); + if (num_rx_rings < 0) { + ret = num_rx_rings; goto out; + } if (user_size == 0) { u32 *indir = rxfh_dev.indir; for (i = 0; i < rxfh_dev.indir_size; i++) - indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + indir[i] = ethtool_rxfh_indir_default(i, num_rx_rings); } else { ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + ringidx_offset, - &rx_rings, + num_rx_rings, rxfh_dev.indir_size); if (ret) goto out; @@ -1483,14 +1531,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param rxfh_dev = {}; struct ethtool_rxfh_context *ctx = NULL; struct netlink_ext_ack *extack = NULL; - struct ethtool_rxnfc rx_rings; struct ethtool_rxfh rxfh; bool create = false; + int num_rx_rings; u8 *rss_config; int ntf = 0; int ret; - if (!ops->get_rxnfc || !ops->set_rxfh) + if (!ops->set_rxfh) return -EOPNOTSUPP; if (ops->get_rxfh_indir_size) @@ -1546,10 +1594,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, if (!rss_config) return -ENOMEM; - rx_rings.cmd = ETHTOOL_GRXRINGS; - ret = ops->get_rxnfc(dev, &rx_rings, NULL); - if (ret) + num_rx_rings = ethtool_get_rx_ring_count(dev); + if (num_rx_rings < 0) { + ret = num_rx_rings; goto out_free; + } /* rxfh.indir_size == 0 means reset the indir table to default (master * context) or delete the context (other RSS contexts). @@ -1562,7 +1611,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh_dev.indir_size = dev_indir_size; ret = ethtool_copy_validate_indir(rxfh_dev.indir, useraddr + rss_cfg_offset, - &rx_rings, + num_rx_rings, rxfh.indir_size); if (ret) goto out_free; @@ -1574,7 +1623,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev, rxfh_dev.indir_size = dev_indir_size; indir = rxfh_dev.indir; for (i = 0; i < dev_indir_size; i++) - indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data); + indir[i] = + ethtool_rxfh_indir_default(i, num_rx_rings); } else { rxfh_dev.rss_delete = true; } @@ -3352,6 +3402,8 @@ __dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr, rc = ethtool_set_rxfh_fields(dev, ethcmd, useraddr); break; case ETHTOOL_GRXRINGS: + rc = ethtool_get_rxrings(dev, ethcmd, useraddr); + break; case ETHTOOL_GRXCLSRLCNT: case ETHTOOL_GRXCLSRULE: case ETHTOOL_GRXCLSRLALL: diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c index 992e98abe9dd..4dced53be4b3 100644 --- a/net/ethtool/rss.c +++ b/net/ethtool/rss.c @@ -536,35 +536,36 @@ void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context) #define RFH_MASK (RXH_L2DA | RXH_VLAN | RXH_IP_SRC | RXH_IP_DST | \ RXH_L3_PROTO | RXH_L4_B_0_1 | RXH_L4_B_2_3 | \ RXH_GTP_TEID | RXH_DISCARD) +#define RFH_MASKv6 (RFH_MASK | RXH_IP6_FL) static const struct nla_policy ethnl_rss_flows_policy[] = { [ETHTOOL_A_FLOW_ETHER] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_IP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_TCP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_UDP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_SCTP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_AH_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_AH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), [ETHTOOL_A_FLOW_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), + [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPC4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPC_TEID4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_EH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_UL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), [ETHTOOL_A_FLOW_GTPU_DL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), - [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK), + [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6), }; const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1] = { @@ -619,23 +620,22 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info, struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh, bool *reset, bool *mod) { - const struct ethtool_ops *ops = dev->ethtool_ops; struct netlink_ext_ack *extack = info->extack; struct nlattr **tb = info->attrs; - struct ethtool_rxnfc rx_rings; size_t alloc_size; + int num_rx_rings; u32 user_size; int i, err; if (!tb[ETHTOOL_A_RSS_INDIR]) return 0; - if (!data->indir_size || !ops->get_rxnfc) + if (!data->indir_size) return -EOPNOTSUPP; - rx_rings.cmd = ETHTOOL_GRXRINGS; - err = ops->get_rxnfc(dev, &rx_rings, NULL); - if (err) + err = ethtool_get_rx_ring_count(dev); + if (err < 0) return err; + num_rx_rings = err; if (nla_len(tb[ETHTOOL_A_RSS_INDIR]) % 4) { NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_INDIR]); @@ -664,7 +664,7 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info, nla_memcpy(rxfh->indir, tb[ETHTOOL_A_RSS_INDIR], alloc_size); for (i = 0; i < user_size; i++) { - if (rxfh->indir[i] < rx_rings.data) + if (rxfh->indir[i] < num_rx_rings) continue; NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR], @@ -681,7 +681,7 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info, } else { for (i = 0; i < data->indir_size; i++) rxfh->indir[i] = - ethtool_rxfh_indir_default(i, rx_rings.data); + ethtool_rxfh_indir_default(i, num_rx_rings); } *mod |= memcmp(rxfh->indir, data->indir_table, data->indir_size); diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c index 2be356bdfe87..169b413b31fc 100644 --- a/net/ethtool/tsconfig.c +++ b/net/ethtool/tsconfig.c @@ -423,13 +423,11 @@ static int ethnl_set_tsconfig(struct ethnl_req_info *req_base, return ret; } - if (hwprov_mod || config_mod) { - ret = tsconfig_send_reply(dev, info); - if (ret && ret != -EOPNOTSUPP) { - NL_SET_ERR_MSG(info->extack, - "error while reading the new configuration set"); - return ret; - } + ret = tsconfig_send_reply(dev, info); + if (ret && ret != -EOPNOTSUPP) { + NL_SET_ERR_MSG(info->extack, + "error while reading the new configuration set"); + return ret; } /* tsconfig has no notification */ diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index 102eccf5ead7..8177ac6c2d26 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -143,6 +143,7 @@ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, struct netlink_ext_ack *extack) { + struct netdev_lag_upper_info lag_upper_info; struct net_device *hsr_dev; struct hsr_port *master; int res; @@ -159,7 +160,9 @@ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev, master = hsr_port_get_hsr(hsr, HSR_PT_MASTER); hsr_dev = master->dev; - res = netdev_upper_dev_link(dev, hsr_dev, extack); + lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_BROADCAST; + lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN; + res = netdev_master_upper_dev_link(dev, hsr_dev, NULL, &lag_upper_info, extack); if (res) goto fail_upper_dev_link; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 76e38092cd8a..3109c5ec38f3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -102,6 +102,7 @@ #include <net/gro.h> #include <net/gso.h> #include <net/tcp.h> +#include <net/psp.h> #include <net/udp.h> #include <net/udplite.h> #include <net/ping.h> @@ -158,6 +159,7 @@ void inet_sock_destruct(struct sock *sk) kfree(rcu_dereference_protected(inet->inet_opt, 1)); dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1)); + psp_sk_assoc_free(sk); } EXPORT_SYMBOL(inet_sock_destruct); @@ -1393,14 +1395,10 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb, segs = ERR_PTR(-EPROTONOSUPPORT); - if (!skb->encapsulation || encap) { - udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); - fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID); + fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap)); - /* fixed ID is invalid if DF bit is not set */ - if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF))) - goto out; - } + if (!skb->encapsulation || encap) + udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); ops = rcu_dereference(inet_offloads[proto]); if (likely(ops && ops->callbacks.gso_segment)) { diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 5cfc1c939673..833f2cf97178 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -170,7 +170,7 @@ struct neigh_table arp_tbl = { [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 740af8541d2f..709021197e1c 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -1715,8 +1715,7 @@ validate_return: */ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) { - unsigned char optbuf[sizeof(struct ip_options) + 40]; - struct ip_options *opt = (struct ip_options *)optbuf; + struct inet_skb_parm parm; int res; if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES) @@ -1727,19 +1726,19 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway) * so we can not use icmp_send and IPCB here. */ - memset(opt, 0, sizeof(struct ip_options)); - opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); + memset(&parm, 0, sizeof(parm)); + parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); - res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL); + res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL); rcu_read_unlock(); if (res) return; if (gateway) - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &parm); else - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm); } /** diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index f14a41ee4aa1..2c922afadb8f 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -132,8 +132,8 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x) dport = encap->encap_dport; spin_unlock_bh(&x->lock); - sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4, - dport, x->props.saddr.a4, sport, 0); + sk = inet_lookup_established(net, x->id.daddr.a4, dport, + x->props.saddr.a4, sport, 0); if (!sk) return ERR_PTR(-ENOENT); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6e1b94796f67..1dab44e13d3b 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -32,6 +32,7 @@ #include <linux/list.h> #include <linux/slab.h> +#include <net/flow.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/protocol.h> @@ -293,7 +294,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb) .flowi4_iif = LOOPBACK_IFINDEX, .flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev), .daddr = ip_hdr(skb)->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))), + .flowi4_dscp = ip4h_dscp(ip_hdr(skb)), .flowi4_scope = scope, .flowi4_mark = vmark ? skb->mark : 0, }; @@ -358,7 +359,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_tun_key.tun_id = 0; fl4.flowi4_flags = 0; @@ -1372,7 +1373,7 @@ static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn) struct flowi4 fl4 = { .flowi4_mark = frn->fl_mark, .daddr = frn->fl_addr, - .flowi4_tos = frn->fl_tos & INET_DSCP_MASK, + .flowi4_dscp = inet_dsfield_to_dscp(frn->fl_tos), .flowi4_scope = frn->fl_scope, }; struct fib_table *tb; diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index fa58d6620ed6..51f0193092f0 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -23,6 +23,7 @@ #include <linux/list.h> #include <linux/rcupdate.h> #include <linux/export.h> +#include <net/flow.h> #include <net/inet_dscp.h> #include <net/ip.h> #include <net/route.h> @@ -193,8 +194,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule, * to mask the upper three DSCP bits prior to matching to maintain * legacy behavior. */ - if (r->dscp_full && - (r->dscp ^ inet_dsfield_to_dscp(fl4->flowi4_tos)) & r->dscp_mask) + if (r->dscp_full && (r->dscp ^ fl4->flowi4_dscp) & r->dscp_mask) return 0; else if (!r->dscp_full && r->dscp && !fib_dscp_masked_match(r->dscp, fl4)) diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c index 3e30745e2c09..3970b6b7ace5 100644 --- a/net/ipv4/fou_core.c +++ b/net/ipv4/fou_core.c @@ -228,21 +228,27 @@ drop: return 0; } +static const struct net_offload *fou_gro_ops(const struct sock *sk, + int proto) +{ + const struct net_offload __rcu **offloads; + + /* FOU doesn't allow IPv4 on IPv6 sockets. */ + offloads = sk->sk_family == AF_INET6 ? inet6_offloads : inet_offloads; + return rcu_dereference(offloads[proto]); +} + static struct sk_buff *fou_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { - const struct net_offload __rcu **offloads; struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; struct sk_buff *pp = NULL; - u8 proto; if (!fou) goto out; - proto = fou->protocol; - /* We can clear the encap_mark for FOU as we are essentially doing * one of two possible things. We are either adding an L4 tunnel * header to the outer L3 tunnel header, or we are simply @@ -254,8 +260,7 @@ static struct sk_buff *fou_gro_receive(struct sock *sk, /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, fou->protocol); if (!ops || !ops->callbacks.gro_receive) goto out; @@ -268,10 +273,8 @@ out: static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { - const struct net_offload __rcu **offloads; struct fou *fou = fou_from_sock(sk); const struct net_offload *ops; - u8 proto; int err; if (!fou) { @@ -279,10 +282,7 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb, goto out; } - proto = fou->protocol; - - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, fou->protocol); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) { err = -ENOSYS; goto out; @@ -323,7 +323,6 @@ static struct sk_buff *gue_gro_receive(struct sock *sk, struct list_head *head, struct sk_buff *skb) { - const struct net_offload __rcu **offloads; const struct net_offload *ops; struct sk_buff *pp = NULL; struct sk_buff *p; @@ -450,8 +449,7 @@ next_proto: /* Flag this frame as already having an outer encap header */ NAPI_GRO_CB(skb)->is_fou = 1; - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, proto); if (!ops || !ops->callbacks.gro_receive) goto out; @@ -467,7 +465,6 @@ out: static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) { struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff); - const struct net_offload __rcu **offloads; const struct net_offload *ops; unsigned int guehlen = 0; u8 proto; @@ -494,8 +491,7 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff) return err; } - offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads; - ops = rcu_dereference(offloads[proto]); + ops = fou_gro_ops(sk, proto); if (WARN_ON(!ops || !ops->callbacks.gro_complete)) goto out; diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c index 3d9614609b2d..506260b4a4dc 100644 --- a/net/ipv4/fou_nl.c +++ b/net/ipv4/fou_nl.c @@ -18,9 +18,9 @@ const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = { [FOU_ATTR_TYPE] = { .type = NLA_U8, }, [FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, }, [FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, }, - [FOU_ATTR_LOCAL_V6] = { .len = 16, }, + [FOU_ATTR_LOCAL_V6] = NLA_POLICY_EXACT_LEN(16), [FOU_ATTR_PEER_V4] = { .type = NLA_U32, }, - [FOU_ATTR_PEER_V6] = { .len = 16, }, + [FOU_ATTR_PEER_V6] = NLA_POLICY_EXACT_LEN(16), [FOU_ATTR_PEER_PORT] = { .type = NLA_BE16, }, [FOU_ATTR_IFINDEX] = { .type = NLA_S32, }, }; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index c48c572f024d..1b7fb5d935ed 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -72,6 +72,7 @@ #include <linux/string.h> #include <linux/netfilter_ipv4.h> #include <linux/slab.h> +#include <net/flow.h> #include <net/snmp.h> #include <net/ip.h> #include <net/route.h> @@ -318,17 +319,17 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, return true; /* No rate limit on loopback */ - dev = dst_dev(dst); + rcu_read_lock(); + dev = dst_dev_rcu(dst); if (dev && (dev->flags & IFF_LOOPBACK)) goto out; - rcu_read_lock(); peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr, l3mdev_master_ifindex_rcu(dev)); rc = inet_peer_xrlim_allow(peer, READ_ONCE(net->ipv4.sysctl_icmp_ratelimit)); - rcu_read_unlock(); out: + rcu_read_unlock(); if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); else @@ -444,7 +445,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.saddr = saddr; fl4.flowi4_mark = mark; fl4.flowi4_uid = sock_net_uid(net, NULL); - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))); + fl4.flowi4_dscp = ip4h_dscp(ip_hdr(skb)); fl4.flowi4_proto = IPPROTO_ICMP; fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev); security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4)); @@ -495,7 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, fl4->saddr = saddr; fl4->flowi4_mark = mark; fl4->flowi4_uid = sock_net_uid(net, NULL); - fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4->flowi4_dscp = dscp; fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; @@ -544,14 +545,15 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, goto relookup_failed; } /* Ugh! */ - orefdst = skb_in->_skb_refdst; /* save old refdst */ - skb_dst_set(skb_in, NULL); + orefdst = skb_dstref_steal(skb_in); err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr, dscp, rt2->dst.dev) ? -EINVAL : 0; dst_release(&rt2->dst); rt2 = skb_rtable(skb_in); - skb_in->_skb_refdst = orefdst; /* restore old refdst */ + /* steal dst entry from skb_in, don't drop refcnt */ + skb_dstref_steal(skb_in); + skb_dstref_restore(skb_in, orefdst); } if (err) @@ -592,7 +594,7 @@ relookup_failed: */ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, - const struct ip_options *opt) + const struct inet_skb_parm *parm) { struct iphdr *iph; int room; @@ -708,7 +710,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, rcu_read_lock(); if (rt_is_input_route(rt) && READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr)) - dev = dev_get_by_index_rcu(net, inet_iif(skb_in)); + dev = dev_get_by_index_rcu(net, parm->iif ? parm->iif : + inet_iif(skb_in)); if (dev) saddr = inet_select_addr(dev, iph->saddr, @@ -723,7 +726,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, iph->tos; mark = IP4_REPLY_MARK(net, skb_in->mark); - if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt)) + if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, + &parm->opt)) goto out_unlock; @@ -797,15 +801,16 @@ EXPORT_SYMBOL(__icmp_send); void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct sk_buff *cloned_skb = NULL; - struct ip_options opts = { 0 }; enum ip_conntrack_info ctinfo; enum ip_conntrack_dir dir; + struct inet_skb_parm parm; struct nf_conn *ct; __be32 orig_ip; + memset(&parm, 0, sizeof(parm)); ct = nf_ct_get(skb_in, &ctinfo); if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) { - __icmp_send(skb_in, type, code, info, &opts); + __icmp_send(skb_in, type, code, info, &parm); return; } @@ -821,7 +826,7 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) orig_ip = ip_hdr(skb_in)->saddr; dir = CTINFO2DIR(ctinfo); ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip; - __icmp_send(skb_in, type, code, info, &opts); + __icmp_send(skb_in, type, code, info, &parm); ip_hdr(skb_in)->saddr = orig_ip; out: consume_skb(cloned_skb); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 1e2df51427fe..cdd1e12aac8c 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -423,7 +423,7 @@ success: } static inline int sk_reuseport_match(struct inet_bind_bucket *tb, - struct sock *sk) + const struct sock *sk) { if (tb->fastreuseport <= 0) return 0; @@ -453,8 +453,9 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb, ipv6_only_sock(sk), true, false); } -void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, - struct sock *sk) +void inet_csk_update_fastreuse(const struct sock *sk, + struct inet_bind_bucket *tb, + struct inet_bind2_bucket *tb2) { bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; @@ -501,6 +502,9 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, tb->fastreuseport = 0; } } + + tb2->fastreuse = tb->fastreuse; + tb2->fastreuseport = tb->fastreuseport; } /* Obtain a reference to a local port for the given sock, @@ -582,7 +586,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum) } success: - inet_csk_update_fastreuse(tb, sk); + inet_csk_update_fastreuse(sk, tb, tb2); if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, tb2, port); @@ -706,9 +710,9 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg) spin_unlock_bh(&queue->fastopenq.lock); } -out: release_sock(sk); - if (newsk && mem_cgroup_sockets_enabled) { + + if (mem_cgroup_sockets_enabled) { gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL; int amt = 0; @@ -718,7 +722,7 @@ out: lock_sock(newsk); mem_cgroup_sk_alloc(newsk); - if (newsk->sk_memcg) { + if (mem_cgroup_from_sk(newsk)) { /* The socket has not been accepted yet, no need * to look at newsk->sk_wmem_queued. */ @@ -727,23 +731,22 @@ out: } if (amt) - mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp); + mem_cgroup_sk_charge(newsk, amt, gfp); kmem_cache_charge(newsk, gfp); release_sock(newsk); } + if (req) reqsk_put(req); - if (newsk) - inet_init_csk_locks(newsk); - + inet_init_csk_locks(newsk); return newsk; + out_err: - newsk = NULL; - req = NULL; + release_sock(sk); arg->err = error; - goto out; + return NULL; } EXPORT_SYMBOL(inet_csk_accept); @@ -1297,12 +1300,19 @@ void inet_csk_destroy_sock(struct sock *sk) xfrm_sk_free_policy(sk); - this_cpu_dec(*sk->sk_prot->orphan_count); + tcp_orphan_count_dec(); sock_put(sk); } EXPORT_SYMBOL(inet_csk_destroy_sock); +void inet_csk_prepare_for_destroy_sock(struct sock *sk) +{ + /* The below has to be done to allow calling inet_csk_destroy_sock */ + sock_set_flag(sk, SOCK_DEAD); + tcp_orphan_count_inc(); +} + /* This function allows to force a closure of a socket after the call to * tcp_create_openreq_child(). */ @@ -1370,7 +1380,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req, sock_orphan(child); - this_cpu_inc(*sk->sk_prot->orphan_count); + tcp_orphan_count_inc(); if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) { BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 2fa53b16fe77..f0b6c5a411a2 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -20,9 +20,6 @@ #include <net/ipv6.h> #include <net/inet_common.h> #include <net/inet_connection_sock.h> -#include <net/inet_hashtables.h> -#include <net/inet_timewait_sock.h> -#include <net/inet6_hashtables.h> #include <net/bpf_sk_storage.h> #include <net/netlink.h> @@ -74,54 +71,29 @@ static void inet_diag_unlock_handler(const struct inet_diag_handler *handler) void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk) { - r->idiag_family = sk->sk_family; + r->idiag_family = READ_ONCE(sk->sk_family); - r->id.idiag_sport = htons(sk->sk_num); - r->id.idiag_dport = sk->sk_dport; - r->id.idiag_if = sk->sk_bound_dev_if; + r->id.idiag_sport = htons(READ_ONCE(sk->sk_num)); + r->id.idiag_dport = READ_ONCE(sk->sk_dport); + r->id.idiag_if = READ_ONCE(sk->sk_bound_dev_if); sock_diag_save_cookie(sk, r->id.idiag_cookie); #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { - *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr; - *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr; + if (r->idiag_family == AF_INET6) { + data_race(*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr); + data_race(*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr); } else #endif { memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src)); memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst)); - r->id.idiag_src[0] = sk->sk_rcv_saddr; - r->id.idiag_dst[0] = sk->sk_daddr; + r->id.idiag_src[0] = READ_ONCE(sk->sk_rcv_saddr); + r->id.idiag_dst[0] = READ_ONCE(sk->sk_daddr); } } EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill); -static size_t inet_sk_attr_size(struct sock *sk, - const struct inet_diag_req_v2 *req, - bool net_admin) -{ - const struct inet_diag_handler *handler; - size_t aux = 0; - - rcu_read_lock(); - handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]); - DEBUG_NET_WARN_ON_ONCE(!handler); - if (handler && handler->idiag_get_aux_size) - aux = handler->idiag_get_aux_size(sk, net_admin); - rcu_read_unlock(); - - return nla_total_size(sizeof(struct tcp_info)) - + nla_total_size(sizeof(struct inet_diag_msg)) - + inet_diag_msg_attrs_size() - + nla_total_size(sizeof(struct inet_diag_meminfo)) - + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) - + nla_total_size(TCP_CA_NAME_MAX) - + nla_total_size(sizeof(struct tcpvegas_info)) - + aux - + 64; -} - int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb, struct inet_diag_msg *r, int ext, struct user_namespace *user_ns, @@ -313,17 +285,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, icsk_pending == ICSK_TIME_REO_TIMEOUT || icsk_pending == ICSK_TIME_LOSS_PROBE) { r->idiag_timer = 1; - r->idiag_retrans = icsk->icsk_retransmits; + r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits); r->idiag_expires = jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); } else if (icsk_pending == ICSK_TIME_PROBE0) { r->idiag_timer = 4; - r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); r->idiag_expires = jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies); } else if (timer_pending(&sk->sk_timer)) { r->idiag_timer = 2; - r->idiag_retrans = icsk->icsk_probes_out; + r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out); r->idiag_expires = jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies); } @@ -422,183 +394,6 @@ errout: } EXPORT_SYMBOL_GPL(inet_sk_diag_fill); -static int inet_twsk_diag_fill(struct sock *sk, - struct sk_buff *skb, - struct netlink_callback *cb, - u16 nlmsg_flags, bool net_admin) -{ - struct inet_timewait_sock *tw = inet_twsk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, - sizeof(*r), nlmsg_flags); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - BUG_ON(tw->tw_state != TCP_TIME_WAIT); - - inet_diag_msg_common_fill(r, sk); - r->idiag_retrans = 0; - - r->idiag_state = READ_ONCE(tw->tw_substate); - r->idiag_timer = 3; - tmo = tw->tw_timer.expires - jiffies; - r->idiag_expires = jiffies_delta_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = 0; - r->idiag_inode = 0; - - if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, - tw->tw_mark)) { - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; - } - - nlmsg_end(skb, nlh); - return 0; -} - -static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb, - struct netlink_callback *cb, - u16 nlmsg_flags, bool net_admin) -{ - struct request_sock *reqsk = inet_reqsk(sk); - struct inet_diag_msg *r; - struct nlmsghdr *nlh; - long tmo; - - nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); - if (!nlh) - return -EMSGSIZE; - - r = nlmsg_data(nlh); - inet_diag_msg_common_fill(r, sk); - r->idiag_state = TCP_SYN_RECV; - r->idiag_timer = 1; - r->idiag_retrans = reqsk->num_retrans; - - BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != - offsetof(struct sock, sk_cookie)); - - tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies; - r->idiag_expires = jiffies_delta_to_msecs(tmo); - r->idiag_rqueue = 0; - r->idiag_wqueue = 0; - r->idiag_uid = 0; - r->idiag_inode = 0; - - if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, - inet_rsk(reqsk)->ir_mark)) { - nlmsg_cancel(skb, nlh); - return -EMSGSIZE; - } - - nlmsg_end(skb, nlh); - return 0; -} - -static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r, - u16 nlmsg_flags, bool net_admin) -{ - if (sk->sk_state == TCP_TIME_WAIT) - return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); - - if (sk->sk_state == TCP_NEW_SYN_RECV) - return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); - - return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, - net_admin); -} - -struct sock *inet_diag_find_one_icsk(struct net *net, - struct inet_hashinfo *hashinfo, - const struct inet_diag_req_v2 *req) -{ - struct sock *sk; - - rcu_read_lock(); - if (req->sdiag_family == AF_INET) - sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0], - req->id.idiag_dport, req->id.idiag_src[0], - req->id.idiag_sport, req->id.idiag_if); -#if IS_ENABLED(CONFIG_IPV6) - else if (req->sdiag_family == AF_INET6) { - if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && - ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) - sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3], - req->id.idiag_dport, req->id.idiag_src[3], - req->id.idiag_sport, req->id.idiag_if); - else - sk = inet6_lookup(net, hashinfo, NULL, 0, - (struct in6_addr *)req->id.idiag_dst, - req->id.idiag_dport, - (struct in6_addr *)req->id.idiag_src, - req->id.idiag_sport, - req->id.idiag_if); - } -#endif - else { - rcu_read_unlock(); - return ERR_PTR(-EINVAL); - } - rcu_read_unlock(); - if (!sk) - return ERR_PTR(-ENOENT); - - if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { - sock_gen_put(sk); - return ERR_PTR(-ENOENT); - } - - return sk; -} -EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk); - -int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *req) -{ - struct sk_buff *in_skb = cb->skb; - bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); - struct net *net = sock_net(in_skb->sk); - struct sk_buff *rep; - struct sock *sk; - int err; - - sk = inet_diag_find_one_icsk(net, hashinfo, req); - if (IS_ERR(sk)) - return PTR_ERR(sk); - - rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL); - if (!rep) { - err = -ENOMEM; - goto out; - } - - err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); - if (err < 0) { - WARN_ON(err == -EMSGSIZE); - nlmsg_free(rep); - goto out; - } - err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); - -out: - if (sk) - sock_gen_put(sk); - - return err; -} -EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk); - static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb, const struct nlmsghdr *nlh, int hdrlen, @@ -785,7 +580,7 @@ static void entry_fill_addrs(struct inet_diag_entry *entry, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) - if (sk->sk_family == AF_INET6) { + if (entry->family == AF_INET6) { entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32; entry->daddr = sk->sk_v6_daddr.s6_addr32; } else @@ -796,31 +591,36 @@ static void entry_fill_addrs(struct inet_diag_entry *entry, } } -int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk) +int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk) { - struct inet_sock *inet = inet_sk(sk); + const struct nlattr *bc = cb_data->inet_diag_nla_bc; + const struct inet_sock *inet = inet_sk(sk); struct inet_diag_entry entry; if (!bc) return 1; - entry.family = sk->sk_family; + entry.family = READ_ONCE(sk->sk_family); entry_fill_addrs(&entry, sk); - entry.sport = inet->inet_num; - entry.dport = ntohs(inet->inet_dport); - entry.ifindex = sk->sk_bound_dev_if; - entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0; - if (sk_fullsock(sk)) - entry.mark = READ_ONCE(sk->sk_mark); - else if (sk->sk_state == TCP_NEW_SYN_RECV) - entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; - else if (sk->sk_state == TCP_TIME_WAIT) - entry.mark = inet_twsk(sk)->tw_mark; - else - entry.mark = 0; + entry.sport = READ_ONCE(inet->inet_num); + entry.dport = ntohs(READ_ONCE(inet->inet_dport)); + entry.ifindex = READ_ONCE(sk->sk_bound_dev_if); + if (cb_data->userlocks_needed) + entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0; + if (cb_data->mark_needed) { + if (sk_fullsock(sk)) + entry.mark = READ_ONCE(sk->sk_mark); + else if (sk->sk_state == TCP_NEW_SYN_RECV) + entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark; + else if (sk->sk_state == TCP_TIME_WAIT) + entry.mark = inet_twsk(sk)->tw_mark; + else + entry.mark = 0; + } #ifdef CONFIG_SOCK_CGROUP_DATA - entry.cgroup_id = sk_fullsock(sk) ? - cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0; + if (cb_data->cgroup_needed) + entry.cgroup_id = sk_fullsock(sk) ? + cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0; #endif return inet_diag_bc_run(bc, &entry); @@ -920,16 +720,21 @@ static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len, } #endif -static int inet_diag_bc_audit(const struct nlattr *attr, +static int inet_diag_bc_audit(struct inet_diag_dump_data *cb_data, const struct sk_buff *skb) { - bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); + const struct nlattr *attr = cb_data->inet_diag_nla_bc; const void *bytecode, *bc; int bytecode_len, len; + bool net_admin; + + if (!attr) + return 0; - if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op)) + if (nla_len(attr) < sizeof(struct inet_diag_bc_op)) return -EINVAL; + net_admin = netlink_net_capable(skb, CAP_NET_ADMIN); bytecode = bc = nla_data(attr); len = bytecode_len = nla_len(attr); @@ -961,14 +766,18 @@ static int inet_diag_bc_audit(const struct nlattr *attr, return -EPERM; if (!valid_markcond(bc, len, &min_len)) return -EINVAL; + cb_data->mark_needed = true; break; #ifdef CONFIG_SOCK_CGROUP_DATA case INET_DIAG_BC_CGROUP_COND: if (!valid_cgroupcond(bc, len, &min_len)) return -EINVAL; + cb_data->cgroup_needed = true; break; #endif case INET_DIAG_BC_AUTO: + cb_data->userlocks_needed = true; + fallthrough; case INET_DIAG_BC_JMP: case INET_DIAG_BC_NOP: break; @@ -992,280 +801,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr, return len == 0 ? 0 : -EINVAL; } -static void twsk_build_assert(void) -{ - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != - offsetof(struct sock, sk_family)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != - offsetof(struct inet_sock, inet_num)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != - offsetof(struct inet_sock, inet_dport)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != - offsetof(struct inet_sock, inet_rcv_saddr)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != - offsetof(struct inet_sock, inet_daddr)); - -#if IS_ENABLED(CONFIG_IPV6) - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != - offsetof(struct sock, sk_v6_rcv_saddr)); - - BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != - offsetof(struct sock, sk_v6_daddr)); -#endif -} - -void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, - struct netlink_callback *cb, - const struct inet_diag_req_v2 *r) -{ - bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); - struct inet_diag_dump_data *cb_data = cb->data; - struct net *net = sock_net(skb->sk); - u32 idiag_states = r->idiag_states; - int i, num, s_i, s_num; - struct nlattr *bc; - struct sock *sk; - - bc = cb_data->inet_diag_nla_bc; - if (idiag_states & TCPF_SYN_RECV) - idiag_states |= TCPF_NEW_SYN_RECV; - s_i = cb->args[1]; - s_num = num = cb->args[2]; - - if (cb->args[0] == 0) { - if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) - goto skip_listen_ht; - - for (i = s_i; i <= hashinfo->lhash2_mask; i++) { - struct inet_listen_hashbucket *ilb; - struct hlist_nulls_node *node; - - num = 0; - ilb = &hashinfo->lhash2[i]; - - if (hlist_nulls_empty(&ilb->nulls_head)) { - s_num = 0; - continue; - } - spin_lock(&ilb->lock); - sk_nulls_for_each(sk, node, &ilb->nulls_head) { - struct inet_sock *inet = inet_sk(sk); - - if (!net_eq(sock_net(sk), net)) - continue; - - if (num < s_num) { - num++; - continue; - } - - if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) - goto next_listen; - - if (r->id.idiag_sport != inet->inet_sport && - r->id.idiag_sport) - goto next_listen; - - if (!inet_diag_bc_sk(bc, sk)) - goto next_listen; - - if (inet_sk_diag_fill(sk, inet_csk(sk), skb, - cb, r, NLM_F_MULTI, - net_admin) < 0) { - spin_unlock(&ilb->lock); - goto done; - } - -next_listen: - ++num; - } - spin_unlock(&ilb->lock); - - s_num = 0; - } -skip_listen_ht: - cb->args[0] = 1; - s_i = num = s_num = 0; - } - -/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets - * with bh disabled. - */ -#define SKARR_SZ 16 - - /* Dump bound but inactive (not listening, connecting, etc.) sockets */ - if (cb->args[0] == 1) { - if (!(idiag_states & TCPF_BOUND_INACTIVE)) - goto skip_bind_ht; - - for (i = s_i; i < hashinfo->bhash_size; i++) { - struct inet_bind_hashbucket *ibb; - struct inet_bind2_bucket *tb2; - struct sock *sk_arr[SKARR_SZ]; - int num_arr[SKARR_SZ]; - int idx, accum, res; - -resume_bind_walk: - num = 0; - accum = 0; - ibb = &hashinfo->bhash2[i]; - - if (hlist_empty(&ibb->chain)) { - s_num = 0; - continue; - } - spin_lock_bh(&ibb->lock); - inet_bind_bucket_for_each(tb2, &ibb->chain) { - if (!net_eq(ib2_net(tb2), net)) - continue; - - sk_for_each_bound(sk, &tb2->owners) { - struct inet_sock *inet = inet_sk(sk); - - if (num < s_num) - goto next_bind; - - if (sk->sk_state != TCP_CLOSE || - !inet->inet_num) - goto next_bind; - - if (r->sdiag_family != AF_UNSPEC && - r->sdiag_family != sk->sk_family) - goto next_bind; - - if (!inet_diag_bc_sk(bc, sk)) - goto next_bind; - - sock_hold(sk); - num_arr[accum] = num; - sk_arr[accum] = sk; - if (++accum == SKARR_SZ) - goto pause_bind_walk; -next_bind: - num++; - } - } -pause_bind_walk: - spin_unlock_bh(&ibb->lock); - - res = 0; - for (idx = 0; idx < accum; idx++) { - if (res >= 0) { - res = inet_sk_diag_fill(sk_arr[idx], - NULL, skb, cb, - r, NLM_F_MULTI, - net_admin); - if (res < 0) - num = num_arr[idx]; - } - sock_put(sk_arr[idx]); - } - if (res < 0) - goto done; - - cond_resched(); - - if (accum == SKARR_SZ) { - s_num = num + 1; - goto resume_bind_walk; - } - - s_num = 0; - } -skip_bind_ht: - cb->args[0] = 2; - s_i = num = s_num = 0; - } - - if (!(idiag_states & ~TCPF_LISTEN)) - goto out; - - for (i = s_i; i <= hashinfo->ehash_mask; i++) { - struct inet_ehash_bucket *head = &hashinfo->ehash[i]; - spinlock_t *lock = inet_ehash_lockp(hashinfo, i); - struct hlist_nulls_node *node; - struct sock *sk_arr[SKARR_SZ]; - int num_arr[SKARR_SZ]; - int idx, accum, res; - - if (hlist_nulls_empty(&head->chain)) - continue; - - if (i > s_i) - s_num = 0; - -next_chunk: - num = 0; - accum = 0; - spin_lock_bh(lock); - sk_nulls_for_each(sk, node, &head->chain) { - int state; - - if (!net_eq(sock_net(sk), net)) - continue; - if (num < s_num) - goto next_normal; - state = (sk->sk_state == TCP_TIME_WAIT) ? - READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state; - if (!(idiag_states & (1 << state))) - goto next_normal; - if (r->sdiag_family != AF_UNSPEC && - sk->sk_family != r->sdiag_family) - goto next_normal; - if (r->id.idiag_sport != htons(sk->sk_num) && - r->id.idiag_sport) - goto next_normal; - if (r->id.idiag_dport != sk->sk_dport && - r->id.idiag_dport) - goto next_normal; - twsk_build_assert(); - - if (!inet_diag_bc_sk(bc, sk)) - goto next_normal; - - if (!refcount_inc_not_zero(&sk->sk_refcnt)) - goto next_normal; - - num_arr[accum] = num; - sk_arr[accum] = sk; - if (++accum == SKARR_SZ) - break; -next_normal: - ++num; - } - spin_unlock_bh(lock); - res = 0; - for (idx = 0; idx < accum; idx++) { - if (res >= 0) { - res = sk_diag_fill(sk_arr[idx], skb, cb, r, - NLM_F_MULTI, net_admin); - if (res < 0) - num = num_arr[idx]; - } - sock_gen_put(sk_arr[idx]); - } - if (res < 0) - break; - cond_resched(); - if (accum == SKARR_SZ) { - s_num = num + 1; - goto next_chunk; - } - } - -done: - cb->args[1] = i; - cb->args[2] = num; -out: - ; -} -EXPORT_SYMBOL_GPL(inet_diag_dump_icsk); - static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { @@ -1319,13 +854,10 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen) kfree(cb_data); return err; } - nla = cb_data->inet_diag_nla_bc; - if (nla) { - err = inet_diag_bc_audit(nla, skb); - if (err) { - kfree(cb_data); - return err; - } + err = inet_diag_bc_audit(cb_data, skb); + if (err) { + kfree(cb_data); + return err; } nla = cb_data->inet_diag_nla_bpf_stgs; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 470ab17ceb51..025895eb6ec5 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -183,7 +183,7 @@ static void fqdir_work_fn(struct work_struct *work) rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL); if (llist_add(&fqdir->free_list, &fqdir_free_list)) - queue_delayed_work(system_wq, &fqdir_free_work, HZ); + queue_delayed_work(system_percpu_wq, &fqdir_free_work, HZ); } int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index ceeeec9b7290..b7024e3d9ac3 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -58,6 +58,14 @@ static u32 sk_ehashfn(const struct sock *sk) sk->sk_daddr, sk->sk_dport); } +static bool sk_is_connect_bind(const struct sock *sk) +{ + if (sk->sk_state == TCP_TIME_WAIT) + return inet_twsk(sk)->tw_connect_bind; + else + return sk->sk_userlocks & SOCK_CONNECT_BIND; +} + /* * Allocate and initialize a new local port bind bucket. * The bindhash mutex for snum's hash chain must be held here. @@ -87,10 +95,22 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, */ void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) { + const struct inet_bind2_bucket *tb2; + if (hlist_empty(&tb->bhash2)) { hlist_del_rcu(&tb->node); kfree_rcu(tb, rcu); + return; + } + + if (tb->fastreuse == -1 && tb->fastreuseport == -1) + return; + hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) { + if (tb2->fastreuse != -1 || tb2->fastreuseport != -1) + return; } + tb->fastreuse = -1; + tb->fastreuseport = -1; } bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, @@ -121,6 +141,8 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2, #else tb2->rcv_saddr = sk->sk_rcv_saddr; #endif + tb2->fastreuse = 0; + tb2->fastreuseport = 0; INIT_HLIST_HEAD(&tb2->owners); hlist_add_head(&tb2->node, &head->chain); hlist_add_head(&tb2->bhash_node, &tb->bhash2); @@ -143,11 +165,23 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep, /* Caller must hold hashbucket lock for this tb with local BH disabled */ void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb) { + const struct sock *sk; + if (hlist_empty(&tb->owners)) { __hlist_del(&tb->node); __hlist_del(&tb->bhash_node); kmem_cache_free(cachep, tb); + return; + } + + if (tb->fastreuse == -1 && tb->fastreuseport == -1) + return; + sk_for_each_bound(sk, &tb->owners) { + if (!sk_is_connect_bind(sk)) + return; } + tb->fastreuse = -1; + tb->fastreuseport = -1; } static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, @@ -191,6 +225,7 @@ static void __inet_put_port(struct sock *sk) tb = inet_csk(sk)->icsk_bind_hash; inet_csk(sk)->icsk_bind_hash = NULL; inet_sk(sk)->inet_num = 0; + sk->sk_userlocks &= ~SOCK_CONNECT_BIND; spin_lock(&head2->lock); if (inet_csk(sk)->icsk_bind2_hash) { @@ -277,7 +312,7 @@ bhash2_find: } } if (update_fastreuse) - inet_csk_update_fastreuse(tb, child); + inet_csk_update_fastreuse(child, tb, tb2); inet_bind_hash(child, tb, tb2, port); spin_unlock(&head2->lock); spin_unlock(&head->lock); @@ -425,19 +460,18 @@ struct sock *inet_lookup_run_sk_lookup(const struct net *net, } struct sock *__inet_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const __be32 saddr, __be16 sport, const __be32 daddr, const unsigned short hnum, const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; + struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled) && - hashinfo == net->ipv4.tcp_death_row.hashinfo) { + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet_ehashfn); @@ -445,6 +479,7 @@ struct sock *__inet_lookup_listener(const struct net *net, goto done; } + hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv4_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); @@ -490,21 +525,22 @@ void sock_edemux(struct sk_buff *skb) EXPORT_SYMBOL(sock_edemux); struct sock *__inet_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, - const __be32 saddr, const __be16 sport, - const __be32 daddr, const u16 hnum, - const int dif, const int sdif) + const __be32 saddr, const __be16 sport, + const __be32 daddr, const u16 hnum, + const int dif, const int sdif) { - INET_ADDR_COOKIE(acookie, saddr, daddr); const __portpair ports = INET_COMBINED_PORTS(sport, hnum); - struct sock *sk; + INET_ADDR_COOKIE(acookie, saddr, daddr); const struct hlist_nulls_node *node; - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport); - unsigned int slot = hash & hashinfo->ehash_mask; - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; + struct inet_ehash_bucket *head; + struct inet_hashinfo *hashinfo; + unsigned int hash, slot; + struct sock *sk; + + hashinfo = net->ipv4.tcp_death_row.hashinfo; + hash = inet_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash & hashinfo->ehash_mask; + head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { @@ -579,8 +615,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); - if (sk->sk_protocol == IPPROTO_TCP && - tcp_twsk_unique(sk, sk2, twp)) + if (tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; @@ -707,7 +742,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk) if (ok) { sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); } else { - this_cpu_inc(*sk->sk_prot->orphan_count); + tcp_orphan_count_inc(); inet_sk_set_state(sk, TCP_CLOSE); sock_set_flag(sk, SOCK_DEAD); inet_csk_destroy_sock(sk); @@ -739,15 +774,18 @@ static int inet_reuseport_add_sock(struct sock *sk, return reuseport_alloc(sk, inet_rcv_saddr_any(sk)); } -int __inet_hash(struct sock *sk, struct sock *osk) +int inet_hash(struct sock *sk) { struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; + if (sk->sk_state == TCP_CLOSE) + return 0; + if (sk->sk_state != TCP_LISTEN) { local_bh_disable(); - inet_ehash_nolisten(sk, osk, NULL); + inet_ehash_nolisten(sk, NULL, NULL); local_bh_enable(); return 0; } @@ -772,17 +810,7 @@ unlock: return err; } -EXPORT_IPV6_MOD(__inet_hash); - -int inet_hash(struct sock *sk) -{ - int err = 0; - - if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk, NULL); - - return err; -} +EXPORT_IPV6_MOD(inet_hash); void inet_unhash(struct sock *sk) { @@ -800,11 +828,6 @@ void inet_unhash(struct sock *sk) * avoid circular locking dependency on PREEMPT_RT. */ spin_lock(&ilb2->lock); - if (sk_unhashed(sk)) { - spin_unlock(&ilb2->lock); - return; - } - if (rcu_access_pointer(sk->sk_reuseport_cb)) reuseport_stop_listen_sock(sk); @@ -815,10 +838,6 @@ void inet_unhash(struct sock *sk) spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash); spin_lock_bh(lock); - if (sk_unhashed(sk)) { - spin_unlock_bh(lock); - return; - } __sk_nulls_del_node_init_rcu(sk); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock_bh(lock); @@ -966,6 +985,10 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, if (!tb2) { tb2 = new_tb2; inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk); + if (sk_is_connect_bind(sk)) { + tb2->fastreuse = -1; + tb2->fastreuseport = -1; + } } inet_csk(sk)->icsk_bind2_hash = tb2; sk_add_bind_node(sk, &tb2->owners); @@ -1136,6 +1159,8 @@ ok: head2, tb, sk); if (!tb2) goto error; + tb2->fastreuse = -1; + tb2->fastreuseport = -1; } /* Here we want to add a little bit of randomness to the next source @@ -1148,6 +1173,7 @@ ok: /* Head lock still held and bh's disabled */ inet_bind_hash(sk, tb, tb2, port); + sk->sk_userlocks |= SOCK_CONNECT_BIND; if (sk_unhashed(sk)) { inet_sk(sk)->inet_sport = htons(port); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 56a117560c0c..c96d61d08854 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -15,7 +15,8 @@ #include <net/inet_hashtables.h> #include <net/inet_timewait_sock.h> #include <net/ip.h> - +#include <net/tcp.h> +#include <net/psp.h> /** * inet_twsk_bind_unhash - unhash a timewait socket from bind hash @@ -74,7 +75,8 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) void inet_twsk_free(struct inet_timewait_sock *tw) { struct module *owner = tw->tw_prot->owner; - twsk_destructor((struct sock *)tw); + + tcp_twsk_destructor((struct sock *)tw); kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } @@ -206,10 +208,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, tw->tw_hash = sk->sk_hash; tw->tw_ipv6only = 0; tw->tw_transparent = inet_test_bit(TRANSPARENT, sk); + tw->tw_connect_bind = !!(sk->sk_userlocks & SOCK_CONNECT_BIND); tw->tw_prot = sk->sk_prot_creator; atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie)); twsk_net_set(tw, sock_net(sk)); timer_setup(&tw->tw_timer, tw_timer_handler, 0); +#ifdef CONFIG_SOCK_VALIDATE_XMIT + tw->tw_validate_xmit_skb = NULL; +#endif /* * Because we use RCU lookups, we should not set tw_refcnt * to a non null value before everything is setup for this @@ -218,6 +224,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, refcount_set(&tw->tw_refcnt, 0); __module_get(tw->tw_prot->owner); + psp_twsk_init(tw, sk); } return tw; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b2584cce90ae..f7012479713b 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -476,14 +476,16 @@ out_fail: /* Process an incoming IP datagram fragment. */ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) { - struct net_device *dev = skb->dev ? : skb_dst_dev(skb); - int vif = l3mdev_master_ifindex_rcu(dev); + struct net_device *dev; struct ipq *qp; + int vif; __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); /* Lookup (or create) queue header */ rcu_read_lock(); + dev = skb->dev ? : skb_dst_dev_rcu(skb); + vif = l3mdev_master_ifindex_rcu(dev); qp = ip_find(net, ip_hdr(skb), user, vif); if (qp) { int ret, refs = 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f5b9004d6938..761a53c6a89a 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -28,6 +28,7 @@ #include <linux/etherdevice.h> #include <linux/if_ether.h> +#include <net/flow.h> #include <net/sock.h> #include <net/ip.h> #include <net/icmp.h> @@ -44,7 +45,6 @@ #include <net/gre.h> #include <net/dst_metadata.h> #include <net/erspan.h> -#include <net/inet_dscp.h> /* Problems & solutions @@ -930,7 +930,7 @@ static int ipgre_open(struct net_device *dev) if (ipv4_is_multicast(t->parms.iph.daddr)) { struct flowi4 fl4 = { .flowi4_oif = t->parms.link, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(&t->parms.iph)), + .flowi4_dscp = ip4h_dscp(&t->parms.iph), .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_proto = IPPROTO_GRE, .saddr = t->parms.iph.saddr, diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index fc323994b1fa..273578579a6b 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -263,10 +263,11 @@ int ip_local_deliver(struct sk_buff *skb) } EXPORT_SYMBOL(ip_local_deliver); -static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) +static inline enum skb_drop_reason +ip_rcv_options(struct sk_buff *skb, struct net_device *dev) { - struct ip_options *opt; const struct iphdr *iph; + struct ip_options *opt; /* It looks as overkill, because not all IP options require packet mangling. @@ -277,7 +278,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) */ if (skb_cow(skb, skb_headroom(skb))) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS); - goto drop; + return SKB_DROP_REASON_NOMEM; } iph = ip_hdr(skb); @@ -286,7 +287,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) if (ip_options_compile(dev_net(dev), opt, skb)) { __IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS); - goto drop; + return SKB_DROP_REASON_IP_INHDR; } if (unlikely(opt->srr)) { @@ -298,17 +299,15 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev) net_info_ratelimited("source route option %pI4 -> %pI4\n", &iph->saddr, &iph->daddr); - goto drop; + return SKB_DROP_REASON_NOT_SPECIFIED; } } if (ip_options_rcv_srr(skb, dev)) - goto drop; + return SKB_DROP_REASON_NOT_SPECIFIED; } - return false; -drop: - return true; + return SKB_NOT_DROPPED_YET; } static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, @@ -319,7 +318,7 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph, } int tcp_v4_early_demux(struct sk_buff *skb); -int udp_v4_early_demux(struct sk_buff *skb); +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb); static int ip_rcv_finish_core(struct net *net, struct sk_buff *skb, struct net_device *dev, const struct sk_buff *hint) @@ -335,7 +334,6 @@ static int ip_rcv_finish_core(struct net *net, goto drop_error; } - drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) && !skb_dst(skb) && !skb->sk && @@ -354,7 +352,6 @@ static int ip_rcv_finish_core(struct net *net, drop_reason = udp_v4_early_demux(skb); if (unlikely(drop_reason)) goto drop_error; - drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); @@ -372,7 +369,6 @@ static int ip_rcv_finish_core(struct net *net, ip4h_dscp(iph), dev); if (unlikely(drop_reason)) goto drop_error; - drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; } else { struct in_device *in_dev = __in_dev_get_rcu(dev); @@ -391,8 +387,11 @@ static int ip_rcv_finish_core(struct net *net, } #endif - if (iph->ihl > 5 && ip_rcv_options(skb, dev)) - goto drop; + if (iph->ihl > 5) { + drop_reason = ip_rcv_options(skb, dev); + if (drop_reason) + goto drop; + } rt = skb_rtable(skb); if (rt->rt_type == RTN_MULTICAST) { @@ -587,9 +586,13 @@ static void ip_sublist_rcv_finish(struct list_head *head) } static struct sk_buff *ip_extract_route_hint(const struct net *net, - struct sk_buff *skb, int rt_type) + struct sk_buff *skb) { - if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST || + const struct iphdr *iph = ip_hdr(skb); + + if (fib4_has_custom_rules(net) || + ipv4_is_lbcast(iph->daddr) || + ipv4_is_zeronet(iph->daddr) || IPCB(skb)->flags & IPSKB_MULTIPATH) return NULL; @@ -618,8 +621,7 @@ static void ip_list_rcv_finish(struct net *net, struct list_head *head) dst = skb_dst(skb); if (curr_dst != dst) { - hint = ip_extract_route_hint(net, skb, - dst_rtable(dst)->rt_type); + hint = ip_extract_route_hint(net, skb); /* dispatch old sublist */ if (!list_empty(&sublist)) diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index e3321932bec0..be8815ce3ac2 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -615,14 +615,13 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev) } memcpy(&nexthop, &optptr[srrptr-1], 4); - orefdst = skb->_skb_refdst; - skb_dst_set(skb, NULL); + orefdst = skb_dstref_steal(skb); err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph), dev) ? -EINVAL : 0; rt2 = skb_rtable(skb); if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { skb_dst_drop(skb); - skb->_skb_refdst = orefdst; + skb_dstref_restore(skb, orefdst); return -EINVAL; } refdst_drop(orefdst); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 84e7f8a2f50f..5ca97ede979c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -63,6 +63,7 @@ #include <linux/stat.h> #include <linux/init.h> +#include <net/flow.h> #include <net/snmp.h> #include <net/ip.h> #include <net/protocol.h> @@ -83,6 +84,7 @@ #include <linux/netfilter_bridge.h> #include <linux/netlink.h> #include <linux/tcp.h> +#include <net/psp.h> static int ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, @@ -485,7 +487,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, inet_sk_init_flowi4(inet, fl4); /* sctp_v4_xmit() uses its own DSCP value */ - fl4->flowi4_tos = tos & INET_DSCP_MASK; + fl4->flowi4_dscp = inet_dsfield_to_dscp(tos); /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times @@ -1664,8 +1666,10 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk, arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csum)); nskb->ip_summed = CHECKSUM_NONE; - if (orig_sk) + if (orig_sk) { skb_set_owner_edemux(nskb, (struct sock *)orig_sk); + psp_reply_set_decrypted(nskb); + } if (transmit_time) nskb->tstamp_type = SKB_CLOCK_MONOTONIC; if (txhash) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e86a8a862c41..ca9eaee4c2ef 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -42,6 +42,7 @@ #include <linux/init.h> #include <linux/if_ether.h> #include <linux/slab.h> +#include <net/flow.h> #include <net/net_namespace.h> #include <net/ip.h> #include <net/protocol.h> @@ -1904,7 +1905,7 @@ static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt, return -1; } - encap += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len; + encap += LL_RESERVED_SPACE(dst_dev_rcu(&rt->dst)) + rt->dst.header_len; if (skb_cow(skb, encap)) { ip_rt_put(rt); @@ -1957,7 +1958,7 @@ static void ipmr_queue_fwd_xmit(struct net *net, struct mr_table *mrt, * result in receiving multiple packets. */ NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, - net, NULL, skb, skb->dev, rt->dst.dev, + net, NULL, skb, skb->dev, dst_dev_rcu(&rt->dst), ipmr_forward_finish); return; @@ -2120,7 +2121,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), + .flowi4_dscp = ip4h_dscp(iph), .flowi4_oif = (rt_is_output_route(rt) ? skb->dev->ifindex : 0), .flowi4_iif = (rt_is_output_route(rt) ? @@ -2301,7 +2302,7 @@ int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb) guard(rcu)(); - dev = rt->dst.dev; + dev = dst_dev_rcu(&rt->dst); if (IPCB(skb)->flags & IPSKB_FORWARDED) goto mc_output; diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c index 0565f001120d..ce310eb779e0 100644 --- a/net/ipv4/netfilter.c +++ b/net/ipv4/netfilter.c @@ -11,10 +11,10 @@ #include <linux/skbuff.h> #include <linux/gfp.h> #include <linux/export.h> +#include <net/flow.h> #include <net/route.h> #include <net/xfrm.h> #include <net/ip.h> -#include <net/inet_dscp.h> #include <net/netfilter/nf_queue.h> /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ @@ -44,7 +44,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un */ fl4.daddr = iph->daddr; fl4.saddr = saddr; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0; fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev); fl4.flowi4_mark = skb->mark; @@ -65,7 +65,10 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) && xfrm_decode_session(net, skb, flowi4_to_flowi(&fl4), AF_INET) == 0) { struct dst_entry *dst = skb_dst(skb); - skb_dst_set(skb, NULL); + /* ignore return value from skb_dstref_steal, xfrm_lookup takes + * care of dropping the refcnt if needed. + */ + skb_dstref_steal(skb); dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index a27782d7653e..6d9bf5106868 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -8,8 +8,8 @@ #include <linux/module.h> #include <linux/skbuff.h> #include <linux/netdevice.h> -#include <net/inet_dscp.h> #include <linux/ip.h> +#include <net/flow.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/route.h> @@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; - flow.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + flow.flowi4_dscp = ip4h_dscp(iph); flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par)); flow.flowi4_uid = sock_net_uid(xt_net(par), NULL); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index ed08fb78cfa8..9a773502f10a 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -12,10 +12,10 @@ #include <linux/skbuff.h> #include <linux/netfilter.h> #include <net/checksum.h> +#include <net/flow.h> #include <net/icmp.h> #include <net/ip.h> #include <net/route.h> -#include <net/inet_dscp.h> #include <net/netfilter/ipv4/nf_dup_ipv4.h> #if IS_ENABLED(CONFIG_NF_CONNTRACK) #include <net/netfilter/nf_conntrack.h> @@ -33,7 +33,7 @@ static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb, fl4.flowi4_oif = oif; fl4.daddr = gw->s_addr; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH; rt = ip_route_output_key(net, &fl4); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 0d3cb2ba6fc8..fae4aa4a5f09 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -12,6 +12,15 @@ #include <linux/netfilter_ipv4.h> #include <linux/netfilter_bridge.h> +static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int ttl); +static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth); +static const struct tcphdr * +nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook); + static int nf_reject_iphdr_validate(struct sk_buff *skb) { struct iphdr *iph; @@ -71,6 +80,27 @@ struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v4_tcp_reset); +static bool nf_skb_is_icmp_unreach(const struct sk_buff *skb) +{ + const struct iphdr *iph = ip_hdr(skb); + u8 *tp, _type; + int thoff; + + if (iph->protocol != IPPROTO_ICMP) + return false; + + thoff = skb_network_offset(skb) + sizeof(*iph); + + tp = skb_header_pointer(skb, + thoff + offsetof(struct icmphdr, type), + sizeof(_type), &_type); + + if (!tp) + return false; + + return *tp == ICMP_DEST_UNREACH; +} + struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, @@ -91,6 +121,10 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET)) return NULL; + /* don't reply to ICMP_DEST_UNREACH with ICMP_DEST_UNREACH. */ + if (nf_skb_is_icmp_unreach(oldskb)) + return NULL; + /* RFC says return as much as we can without exceeding 576 bytes. */ len = min_t(unsigned int, 536, oldskb->len); @@ -136,8 +170,9 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach); -const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *_oth, int hook) +static const struct tcphdr * +nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *_oth, int hook) { const struct tcphdr *oth; @@ -163,11 +198,10 @@ const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb, return oth; } -EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get); -struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int ttl) +static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int ttl) { struct iphdr *niph, *oiph = ip_hdr(oldskb); @@ -188,10 +222,9 @@ struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb, return niph; } -EXPORT_SYMBOL_GPL(nf_reject_iphdr_put); -void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, - const struct tcphdr *oth) +static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, + const struct tcphdr *oth) { struct iphdr *niph = ip_hdr(nskb); struct tcphdr *tcph; @@ -218,7 +251,6 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb, nskb->csum_start = (unsigned char *)tcph - nskb->head; nskb->csum_offset = offsetof(struct tcphdr, check); } -EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); static int nf_reject_fill_skb_dst(struct sk_buff *skb_in) { diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index a1350fc25838..5080fa5fbf6a 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -71,8 +71,7 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, { switch (protocol) { case IPPROTO_TCP: - return inet_lookup(net, net->ipv4.tcp_death_row.hashinfo, - skb, doff, saddr, sport, daddr, dport, + return inet_lookup(net, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp4_lib_lookup(net, saddr, sport, daddr, dport, diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index 73e66a088e25..041c3f37f237 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -81,7 +81,6 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; struct sock *sk; switch (protocol) { @@ -95,7 +94,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, hinfo, skb, + sk = inet_lookup_listener(net, skb, ip_hdrlen(skb) + __tcp_hdrlen(hp), saddr, sport, daddr, dport, in->ifindex, 0); @@ -109,7 +108,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, */ break; case NF_TPROXY_LOOKUP_ESTABLISHED: - sk = inet_lookup_established(net, hinfo, saddr, sport, + sk = inet_lookup_established(net, saddr, sport, daddr, dport, in->ifindex); break; default: diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 7e7c49535e3f..82af6cd76d13 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -10,7 +10,7 @@ #include <net/netfilter/nf_tables.h> #include <net/netfilter/nft_fib.h> -#include <net/inet_dscp.h> +#include <net/flow.h> #include <net/ip.h> #include <net/ip_fib.h> #include <net/route.h> @@ -114,7 +114,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (priv->flags & NFTA_FIB_F_MARK) fl4.flowi4_mark = pkt->skb->mark; - fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)); + fl4.flowi4_dscp = ip4h_dscp(iph); if (priv->flags & NFTA_FIB_F_DADDR) { fl4.daddr = iph->daddr; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 34137768e7f9..7b9d70f9b31c 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -2087,6 +2087,12 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh, { struct nh_grp_entry *nhge, *tmp; + /* If there is nothing to do, let's avoid the costly call to + * synchronize_net() + */ + if (list_empty(&nh->grp_list)) + return; + list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) remove_nh_grp_entry(net, nhge, nlinfo); @@ -3518,12 +3524,42 @@ static int rtm_dump_walk_nexthops(struct sk_buff *skb, int err; s_idx = ctx->idx; - for (node = rb_first(root); node; node = rb_next(node)) { + + /* If this is not the first invocation, ctx->idx will contain the id of + * the last nexthop we processed. Instead of starting from the very + * first element of the red/black tree again and linearly skipping the + * (potentially large) set of nodes with an id smaller than s_idx, walk + * the tree and find the left-most node whose id is >= s_idx. This + * provides an efficient O(log n) starting point for the dump + * continuation. + */ + if (s_idx != 0) { + struct rb_node *tmp = root->rb_node; + + node = NULL; + while (tmp) { + struct nexthop *nh; + + nh = rb_entry(tmp, struct nexthop, rb_node); + if (nh->id < s_idx) { + tmp = tmp->rb_right; + } else { + /* Track current candidate and keep looking on + * the left side to find the left-most + * (smallest id) that is still >= s_idx. + */ + node = tmp; + tmp = tmp->rb_left; + } + } + } else { + node = rb_first(root); + } + + for (; node; node = rb_next(node)) { struct nexthop *nh; nh = rb_entry(node, struct nexthop, rb_node); - if (nh->id < s_idx) - continue; ctx->idx = nh->id; err = nh_cb(skb, cb, nh, data); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 031df4c19fcc..5321c5801c64 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -56,9 +56,7 @@ struct ping_table { static struct ping_table ping_table; struct pingv6_ops pingv6_ops; -EXPORT_SYMBOL_GPL(pingv6_ops); - -static u16 ping_port_rover; +EXPORT_IPV6_MOD_GPL(pingv6_ops); static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) { @@ -67,7 +65,6 @@ static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask) pr_debug("hash(%u) = %u\n", num, res); return res; } -EXPORT_SYMBOL_GPL(ping_hash); static inline struct hlist_head *ping_hashslot(struct ping_table *table, struct net *net, unsigned int num) @@ -77,6 +74,7 @@ static inline struct hlist_head *ping_hashslot(struct ping_table *table, int ping_get_port(struct sock *sk, unsigned short ident) { + struct net *net = sock_net(sk); struct inet_sock *isk, *isk2; struct hlist_head *hlist; struct sock *sk2 = NULL; @@ -84,15 +82,16 @@ int ping_get_port(struct sock *sk, unsigned short ident) isk = inet_sk(sk); spin_lock(&ping_table.lock); if (ident == 0) { + u16 result = net->ipv4.ping_port_rover + 1; u32 i; - u16 result = ping_port_rover + 1; for (i = 0; i < (1L << 16); i++, result++) { if (!result) - result++; /* avoid zero */ - hlist = ping_hashslot(&ping_table, sock_net(sk), - result); + continue; /* avoid zero */ + hlist = ping_hashslot(&ping_table, net, result); sk_for_each(sk2, hlist) { + if (!net_eq(sock_net(sk2), net)) + continue; isk2 = inet_sk(sk2); if (isk2->inet_num == result) @@ -100,7 +99,7 @@ int ping_get_port(struct sock *sk, unsigned short ident) } /* found */ - ping_port_rover = ident = result; + net->ipv4.ping_port_rover = ident = result; break; next_port: ; @@ -108,8 +107,10 @@ next_port: if (i >= (1L << 16)) goto fail; } else { - hlist = ping_hashslot(&ping_table, sock_net(sk), ident); + hlist = ping_hashslot(&ping_table, net, ident); sk_for_each(sk2, hlist) { + if (!net_eq(sock_net(sk2), net)) + continue; isk2 = inet_sk(sk2); /* BUG? Why is this reuse and not reuseaddr? ping.c @@ -129,7 +130,7 @@ next_port: pr_debug("was not hashed\n"); sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + sock_prot_inuse_add(net, sk->sk_prot, 1); } spin_unlock(&ping_table.lock); return 0; @@ -138,15 +139,7 @@ fail: spin_unlock(&ping_table.lock); return -EADDRINUSE; } -EXPORT_SYMBOL_GPL(ping_get_port); - -int ping_hash(struct sock *sk) -{ - pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num); - BUG(); /* "Please do not press this button again." */ - - return 0; -} +EXPORT_IPV6_MOD_GPL(ping_get_port); void ping_unhash(struct sock *sk) { @@ -161,7 +154,7 @@ void ping_unhash(struct sock *sk) } spin_unlock(&ping_table.lock); } -EXPORT_SYMBOL_GPL(ping_unhash); +EXPORT_IPV6_MOD_GPL(ping_unhash); /* Called under rcu_read_lock() */ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) @@ -188,6 +181,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) } sk_for_each_rcu(sk, hslot) { + if (!net_eq(sock_net(sk), net)) + continue; isk = inet_sk(sk); pr_debug("iterate\n"); @@ -279,7 +274,7 @@ out_release_group: put_group_info(group_info); return ret; } -EXPORT_SYMBOL_GPL(ping_init_sock); +EXPORT_IPV6_MOD_GPL(ping_init_sock); void ping_close(struct sock *sk, long timeout) { @@ -289,7 +284,7 @@ void ping_close(struct sock *sk, long timeout) sk_common_release(sk); } -EXPORT_SYMBOL_GPL(ping_close); +EXPORT_IPV6_MOD_GPL(ping_close); static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) @@ -467,7 +462,7 @@ out: pr_debug("ping_v4_bind -> %d\n", err); return err; } -EXPORT_SYMBOL_GPL(ping_bind); +EXPORT_IPV6_MOD_GPL(ping_bind); /* * Is this a supported type of ICMP message? @@ -600,7 +595,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info) out: return; } -EXPORT_SYMBOL_GPL(ping_err); +EXPORT_IPV6_MOD_GPL(ping_err); /* * Copy and checksum an ICMP Echo packet from user space into a buffer @@ -630,7 +625,7 @@ int ping_getfrag(void *from, char *to, return 0; } -EXPORT_SYMBOL_GPL(ping_getfrag); +EXPORT_IPV6_MOD_GPL(ping_getfrag); static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh, struct flowi4 *fl4) @@ -691,7 +686,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len, return 0; } -EXPORT_SYMBOL_GPL(ping_common_sendmsg); +EXPORT_IPV6_MOD_GPL(ping_common_sendmsg); static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { @@ -936,7 +931,7 @@ out: pr_debug("ping_recvmsg -> %d\n", err); return err; } -EXPORT_SYMBOL_GPL(ping_recvmsg); +EXPORT_IPV6_MOD_GPL(ping_recvmsg); static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) @@ -957,7 +952,7 @@ int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) { return __ping_queue_rcv_skb(sk, skb) ? -1 : 0; } -EXPORT_SYMBOL_GPL(ping_queue_rcv_skb); +EXPORT_IPV6_MOD_GPL(ping_queue_rcv_skb); /* @@ -985,7 +980,7 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb) kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET); return SKB_DROP_REASON_NO_SOCKET; } -EXPORT_SYMBOL_GPL(ping_rcv); +EXPORT_IPV6_MOD_GPL(ping_rcv); struct proto ping_prot = { .name = "PING", @@ -1002,13 +997,12 @@ struct proto ping_prot = { .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, .release_cb = ip4_datagram_release_cb, - .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, .obj_size = sizeof(struct inet_sock), }; -EXPORT_SYMBOL(ping_prot); +EXPORT_IPV6_MOD(ping_prot); #ifdef CONFIG_PROC_FS @@ -1073,7 +1067,7 @@ void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } -EXPORT_SYMBOL_GPL(ping_seq_start); +EXPORT_IPV6_MOD_GPL(ping_seq_start); static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos) { @@ -1092,14 +1086,14 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) ++*pos; return sk; } -EXPORT_SYMBOL_GPL(ping_seq_next); +EXPORT_IPV6_MOD_GPL(ping_seq_next); void ping_seq_stop(struct seq_file *seq, void *v) __releases(ping_table.lock) { spin_unlock(&ping_table.lock); } -EXPORT_SYMBOL_GPL(ping_seq_stop); +EXPORT_IPV6_MOD_GPL(ping_seq_stop); static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, int bucket) @@ -1119,7 +1113,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } static int ping_v4_seq_show(struct seq_file *seq, void *v) @@ -1150,6 +1144,8 @@ static int __net_init ping_v4_proc_init_net(struct net *net) if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops, sizeof(struct ping_iter_state))) return -ENOMEM; + + net->ipv4.ping_port_rover = get_random_u16(); return 0; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 65b0d0ab0084..974afc4ecbe2 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -95,7 +95,6 @@ static const struct snmp_mib snmp4_ipstats_list[] = { SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS), SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES), SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS), - SNMP_MIB_SENTINEL }; /* Following items are displayed in /proc/net/netstat */ @@ -119,7 +118,6 @@ static const struct snmp_mib snmp4_ipextstats_list[] = { SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS), - SNMP_MIB_SENTINEL }; static const struct { @@ -157,7 +155,6 @@ static const struct snmp_mib snmp4_tcp_list[] = { SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS), SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS), SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp4_udp_list[] = { @@ -170,7 +167,6 @@ static const struct snmp_mib snmp4_udp_list[] = { SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp4_net_list[] = { @@ -309,7 +305,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND), SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD), SNMP_MIB_ITEM("TCPAODroppedIcmps", LINUX_MIB_TCPAODROPPEDICMPS), - SNMP_MIB_SENTINEL }; static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals, @@ -389,14 +384,15 @@ static void icmp_put(struct seq_file *seq) */ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) { + const int cnt = ARRAY_SIZE(snmp4_ipstats_list); + u64 buff64[ARRAY_SIZE(snmp4_ipstats_list)]; struct net *net = seq->private; - u64 buff64[IPSTATS_MIB_MAX]; int i; - memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64)); + memset(buff64, 0, sizeof(buff64)); seq_puts(seq, "Ip: Forwarding DefaultTTL"); - for (i = 0; snmp4_ipstats_list[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, " %s", snmp4_ipstats_list[i].name); seq_printf(seq, "\nIp: %d %d", @@ -404,10 +400,10 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) READ_ONCE(net->ipv4.sysctl_ip_default_ttl)); BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0); - snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list, - net->mib.ip_statistics, - offsetof(struct ipstats_mib, syncp)); - for (i = 0; snmp4_ipstats_list[i].name; i++) + snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipstats_list, cnt, + net->mib.ip_statistics, + offsetof(struct ipstats_mib, syncp)); + for (i = 0; i < cnt; i++) seq_printf(seq, " %llu", buff64[i]); return 0; @@ -415,20 +411,23 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v) static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) { + const int udp_cnt = ARRAY_SIZE(snmp4_udp_list); + const int tcp_cnt = ARRAY_SIZE(snmp4_tcp_list); unsigned long buff[TCPUDP_MIB_MAX]; struct net *net = seq->private; int i; - memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); + memset(buff, 0, tcp_cnt * sizeof(unsigned long)); seq_puts(seq, "\nTcp:"); - for (i = 0; snmp4_tcp_list[i].name; i++) + for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %s", snmp4_tcp_list[i].name); seq_puts(seq, "\nTcp:"); - snmp_get_cpu_field_batch(buff, snmp4_tcp_list, - net->mib.tcp_statistics); - for (i = 0; snmp4_tcp_list[i].name; i++) { + snmp_get_cpu_field_batch_cnt(buff, snmp4_tcp_list, + tcp_cnt, + net->mib.tcp_statistics); + for (i = 0; i < tcp_cnt; i++) { /* MaxConn field is signed, RFC 2012 */ if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) seq_printf(seq, " %ld", buff[i]); @@ -436,27 +435,29 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v) seq_printf(seq, " %lu", buff[i]); } - memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); + memset(buff, 0, udp_cnt * sizeof(unsigned long)); - snmp_get_cpu_field_batch(buff, snmp4_udp_list, - net->mib.udp_statistics); + snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list, + udp_cnt, + net->mib.udp_statistics); seq_puts(seq, "\nUdp:"); - for (i = 0; snmp4_udp_list[i].name; i++) + for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdp:"); - for (i = 0; snmp4_udp_list[i].name; i++) + for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %lu", buff[i]); - memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long)); + memset(buff, 0, udp_cnt * sizeof(unsigned long)); /* the UDP and UDP-Lite MIBs are the same */ seq_puts(seq, "\nUdpLite:"); - snmp_get_cpu_field_batch(buff, snmp4_udp_list, - net->mib.udplite_statistics); - for (i = 0; snmp4_udp_list[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list, + udp_cnt, + net->mib.udplite_statistics); + for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %s", snmp4_udp_list[i].name); seq_puts(seq, "\nUdpLite:"); - for (i = 0; snmp4_udp_list[i].name; i++) + for (i = 0; i < udp_cnt; i++) seq_printf(seq, " %lu", buff[i]); seq_putc(seq, '\n'); @@ -480,8 +481,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v) */ static int netstat_seq_show(struct seq_file *seq, void *v) { - const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1; - const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1; + const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list); + const int tcp_cnt = ARRAY_SIZE(snmp4_net_list); struct net *net = seq->private; unsigned long *buff; int i; @@ -494,8 +495,8 @@ static int netstat_seq_show(struct seq_file *seq, void *v) buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)), GFP_KERNEL); if (buff) { - snmp_get_cpu_field_batch(buff, snmp4_net_list, - net->mib.net_statistics); + snmp_get_cpu_field_batch_cnt(buff, snmp4_net_list, tcp_cnt, + net->mib.net_statistics); for (i = 0; i < tcp_cnt; i++) seq_printf(seq, " %lu", buff[i]); } else { @@ -513,7 +514,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v) u64 *buff64 = (u64 *)buff; memset(buff64, 0, ip_cnt * sizeof(u64)); - snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list, + snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipextstats_list, ip_cnt, net->mib.ip_statistics, offsetof(struct ipstats_mib, syncp)); for (i = 0; i < ip_cnt; i++) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 1d2c89d63cc7..d54ebb7df966 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -178,7 +178,7 @@ static int raw_v4_input(struct net *net, struct sk_buff *skb, if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); continue; } @@ -311,7 +311,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb) int raw_rcv(struct sock *sk, struct sk_buff *skb) { if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -793,6 +793,7 @@ static int raw_sk_init(struct sock *sk) { struct raw_sock *rp = raw_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; if (inet_sk(sk)->inet_num == IPPROTO_ICMP) memset(&rp->filter, 0, sizeof(rp->filter)); return 0; @@ -1045,7 +1046,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) 0, 0L, 0, from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), 0, sock_i_ino(sp), - refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops)); + refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp)); } static int raw_seq_show(struct seq_file *seq, void *v) diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index cc793bd8de25..943e5998e0ad 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -126,9 +126,9 @@ static int raw_diag_dump_one(struct netlink_callback *cb, static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, - struct nlattr *bc, bool net_admin) + bool net_admin) { - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin); @@ -140,17 +140,13 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); - struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; struct hlist_head *hlist; struct sock *sk = NULL; - struct nlattr *bc; if (IS_ERR(hashinfo)) return; - cb_data = cb->data; - bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -174,7 +170,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, if (r->id.idiag_dport != inet->inet_dport && r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) + if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) goto out_unlock; next: num++; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index baa43e5966b1..6d27d3610c1c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -84,6 +84,7 @@ #include <linux/jhash.h> #include <net/dst.h> #include <net/dst_metadata.h> +#include <net/flow.h> #include <net/inet_dscp.h> #include <net/net_namespace.h> #include <net/ip.h> @@ -413,11 +414,11 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr) { const struct rtable *rt = container_of(dst, struct rtable, dst); - struct net_device *dev = dst_dev(dst); + struct net_device *dev; struct neighbour *n; rcu_read_lock(); - + dev = dst_dev_rcu(dst); if (likely(rt->rt_gw_family == AF_INET)) { n = ip_neigh_gw4(dev, rt->rt_gw4); } else if (rt->rt_gw_family == AF_INET6) { @@ -1026,7 +1027,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu) return; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); if (mtu < net->ipv4.ip_rt_min_pmtu) { lock = true; mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu); @@ -1221,8 +1222,8 @@ EXPORT_INDIRECT_CALLABLE(ipv4_dst_check); static void ipv4_send_dest_unreach(struct sk_buff *skb) { + struct inet_skb_parm parm; struct net_device *dev; - struct ip_options opt; int res; /* Recompile ip options since IPCB may not be valid anymore. @@ -1232,21 +1233,21 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb) ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5) return; - memset(&opt, 0, sizeof(opt)); + memset(&parm, 0, sizeof(parm)); if (ip_hdr(skb)->ihl > 5) { if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4)) return; - opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); + parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr); rcu_read_lock(); dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev; - res = __ip_options_compile(dev_net(dev), &opt, skb, NULL); + res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL); rcu_read_unlock(); if (res) return; } - __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm); } static void ipv4_link_failure(struct sk_buff *skb) @@ -1291,7 +1292,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) struct flowi4 fl4 = { .daddr = iph->daddr, .saddr = iph->saddr, - .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)), + .flowi4_dscp = ip4h_dscp(iph), .flowi4_oif = rt->dst.dev->ifindex, .flowi4_iif = skb->dev->ifindex, .flowi4_mark = skb->mark, @@ -1326,7 +1327,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst) struct net *net; rcu_read_lock(); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size, net->ipv4.ip_rt_min_advmss); rcu_read_unlock(); @@ -2210,7 +2211,7 @@ ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto martian_source; } - if (rt->rt_type != RTN_LOCAL) + if (!(rt->rt_flags & RTCF_LOCAL)) goto skip_validate_source; reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev, @@ -2331,7 +2332,7 @@ ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, fl4.flowi4_oif = 0; fl4.flowi4_iif = dev->ifindex; fl4.flowi4_mark = skb->mark; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; fl4.flowi4_flags = 0; fl4.daddr = daddr; @@ -2694,7 +2695,6 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, struct rtable *rth; fl4->flowi4_iif = LOOPBACK_IFINDEX; - fl4->flowi4_tos &= INET_DSCP_MASK; rcu_read_lock(); rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); @@ -3337,7 +3337,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, fl4.daddr = dst; fl4.saddr = src; - fl4.flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4.flowi4_dscp = dscp; fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0); fl4.flowi4_mark = mark; fl4.flowi4_uid = uid; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index eb0819463fae..569befcf021b 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <net/secure_seq.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/route.h> static siphash_aligned_key_t syncookie_secret[2]; @@ -403,6 +404,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) struct tcp_sock *tp = tcp_sk(sk); struct inet_request_sock *ireq; struct net *net = sock_net(sk); + struct tcp_request_sock *treq; struct request_sock *req; struct sock *ret = sk; struct flowi4 fl4; @@ -428,6 +430,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) } ireq = inet_rsk(req); + treq = tcp_rsk(req); sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); @@ -483,6 +486,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst); + treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); /* ip_queue_xmit() depends on our flow being setup diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 3a43010d726f..24dbc603cc44 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -47,6 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC; +static int tcp_ecn_mode_max = 2; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -728,9 +729,27 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_ecn_mode_max, + }, + { + .procname = "tcp_ecn_option", + .data = &init_net.ipv4.sysctl_tcp_ecn_option, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, { + .procname = "tcp_ecn_option_beacon", + .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_THREE, + }, + { .procname = "tcp_ecn_fallback", .data = &init_net.ipv4.sysctl_tcp_ecn_fallback, .maxlen = sizeof(u8), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ad76556800f2..7949d16506a4 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -270,11 +270,14 @@ #include <net/icmp.h> #include <net/inet_common.h> +#include <net/inet_ecn.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/mptcp.h> #include <net/proto_memory.h> #include <net/xfrm.h> #include <net/ip.h> +#include <net/psp.h> #include <net/sock.h> #include <net/rstreason.h> @@ -412,6 +415,22 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp) return rate64; } +#ifdef CONFIG_TCP_MD5SIG +void tcp_md5_destruct_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->md5sig_info) { + + tcp_clear_md5_list(sk); + kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1)); + static_branch_slow_dec_deferred(&tcp_md5_needed); + tcp_md5_release_sigpool(); + } +} +EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock); +#endif + /* Address-family independent initialization for a tcp_sock. * * NOTE: A lot of things set to zero explicitly by call to @@ -687,6 +706,7 @@ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb) tcb->seq = tcb->end_seq = tp->write_seq; tcb->tcp_flags = TCPHDR_ACK; __skb_header_release(skb); + psp_enqueue_set_decrypted(sk, skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); @@ -2818,9 +2838,9 @@ found_ok_skb: err = tcp_recvmsg_dmabuf(sk, skb, offset, msg, used); - if (err <= 0) { + if (err < 0) { if (!copied) - copied = -EFAULT; + copied = err; break; } @@ -3099,8 +3119,8 @@ bool tcp_check_oom(const struct sock *sk, int shift) void __tcp_close(struct sock *sk, long timeout) { + bool data_was_unread = false; struct sk_buff *skb; - int data_was_unread = 0; int state; WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); @@ -3118,13 +3138,14 @@ void __tcp_close(struct sock *sk, long timeout) * descriptor close, not protocol-sourced closes, because the * reader process may not have drained the data yet! */ - while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { - u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq; + while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) { + u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) - len--; - data_was_unread += len; - __kfree_skb(skb); + end_seq--; + if (after(end_seq, tcp_sk(sk)->copied_seq)) + data_was_unread = true; + tcp_eat_recv_skb(sk, skb); } /* If socket has been already reset (e.g. in tcp_reset()) - kill it. */ @@ -3195,7 +3216,7 @@ adjudge_to_death: /* remove backlog if any, without releasing ownership. */ __release_sock(sk); - this_cpu_inc(tcp_orphan_count); + tcp_orphan_count_inc(); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) @@ -3377,7 +3398,7 @@ int tcp_disconnect(struct sock *sk, int flags) WRITE_ONCE(tp->write_seq, seq); icsk->icsk_backoff = 0; - icsk->icsk_probes_out = 0; + WRITE_ONCE(icsk->icsk_probes_out, 0); icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN); @@ -3390,6 +3411,11 @@ int tcp_disconnect(struct sock *sk, int flags) tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; + tp->accecn_fail_mode = 0; + tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; + tcp_accecn_init_counters(tp); + tp->prev_ecnfield = 0; + tp->accecn_opt_tstamp = 0; if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release) icsk->icsk_ca_ops->release(sk); memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); @@ -3765,7 +3791,7 @@ int tcp_sock_set_maxseg(struct sock *sk, int val) if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) return -EINVAL; - tcp_sk(sk)->rx_opt.user_mss = val; + WRITE_ONCE(tcp_sk(sk)->rx_opt.user_mss, val); return 0; } @@ -3895,15 +3921,13 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max); return 0; } + case TCP_MAXSEG: + return tcp_sock_set_maxseg(sk, val); } sockopt_lock_sock(sk); switch (optname) { - case TCP_MAXSEG: - err = tcp_sock_set_maxseg(sk, val); - break; - case TCP_NODELAY: __tcp_sock_set_nodelay(sk, val); break; @@ -4142,6 +4166,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) { const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */ const struct inet_connection_sock *icsk = inet_csk(sk); + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ce_idx = INET_ECN_CE - 1; unsigned long rate; u32 now; u64 rate64; @@ -4268,6 +4295,16 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rto_stamp) info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp; + info->tcpi_accecn_fail_mode = tp->accecn_fail_mode; + info->tcpi_accecn_opt_seen = tp->saw_accecn_opt; + info->tcpi_received_ce = tp->received_ce; + info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx]; + info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx]; + info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx]; + info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx]; + info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx]; + info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx]; + unlock_sock_fast(sk, slow); } EXPORT_SYMBOL_GPL(tcp_get_info); @@ -4353,7 +4390,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); - nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); + nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, + READ_ONCE(inet_csk(sk)->icsk_retransmits)); nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh); nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered); @@ -4388,6 +4426,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); + int user_mss; int val, len; if (copy_from_sockptr(&len, optlen, sizeof(int))) @@ -4401,9 +4440,10 @@ int do_tcp_getsockopt(struct sock *sk, int level, switch (optname) { case TCP_MAXSEG: val = tp->mss_cache; - if (tp->rx_opt.user_mss && + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) - val = tp->rx_opt.user_mss; + val = user_mss; if (tp->repair) val = tp->rx_opt.mss_clamp; break; @@ -5061,7 +5101,9 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 32); +#if IS_ENABLED(CONFIG_TLS_DEVICE) + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked); +#endif /* TXRX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset); @@ -5072,11 +5114,9 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 32); /* RX read-mostly hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq); - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us); @@ -5087,12 +5127,6 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh); -#if IS_ENABLED(CONFIG_TLS_DEVICE) - CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tcp_clean_acked); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 77); -#else - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69); -#endif /* TX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out); @@ -5106,11 +5140,11 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89); /* TXRX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags); @@ -5125,15 +5159,13 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt); - /* 32bit arches with 8byte alignment on u64 fields might need padding - * before tcp_clock_cache. - */ - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92 + 4); - /* RX read-write hotpath cache lines */ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in); @@ -5144,12 +5176,12 @@ static void __init tcp_struct_check(void) CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr); + CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est); CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space); - CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99); } void __init tcp_init(void) diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c index 3338b6cc85c4..34b8450829d0 100644 --- a/net/ipv4/tcp_ao.c +++ b/net/ipv4/tcp_ao.c @@ -268,9 +268,8 @@ static void tcp_ao_key_free_rcu(struct rcu_head *head) kfree_sensitive(key); } -static void tcp_ao_info_free_rcu(struct rcu_head *head) +static void tcp_ao_info_free(struct tcp_ao_info *ao) { - struct tcp_ao_info *ao = container_of(head, struct tcp_ao_info, rcu); struct tcp_ao_key *key; struct hlist_node *n; @@ -310,7 +309,7 @@ void tcp_ao_destroy_sock(struct sock *sk, bool twsk) if (!twsk) tcp_ao_sk_omem_free(sk, ao); - call_rcu(&ao->rcu, tcp_ao_info_free_rcu); + tcp_ao_info_free(ao); } void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp) diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index ba4d98e510e0..fbad6c35dee9 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -379,7 +379,7 @@ static void tcp_cdg_init(struct sock *sk) /* We silently fall back to window = 1 if allocation fails. */ if (window > 1) ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), - GFP_NOWAIT | __GFP_NOWARN); + GFP_NOWAIT); ca->rtt_seq = tp->snd_nxt; ca->shadow_wnd = tcp_snd_cwnd(tp); } diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 45e174b8cd22..d83efd91f461 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -12,6 +12,9 @@ #include <linux/tcp.h> +#include <net/inet_hashtables.h> +#include <net/inet6_hashtables.h> +#include <net/inet_timewait_sock.h> #include <net/netlink.h> #include <net/tcp.h> @@ -174,27 +177,465 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) size += ulp_ops->get_info_size(sk, net_admin); } } - return size; + + return size + + nla_total_size(sizeof(struct tcp_info)) + + nla_total_size(sizeof(struct inet_diag_msg)) + + inet_diag_msg_attrs_size() + + nla_total_size(sizeof(struct inet_diag_meminfo)) + + nla_total_size(SK_MEMINFO_VARS * sizeof(u32)) + + nla_total_size(TCP_CA_NAME_MAX) + + nla_total_size(sizeof(struct tcpvegas_info)) + + 64; +} + +static int tcp_twsk_diag_fill(struct sock *sk, + struct sk_buff *skb, + struct netlink_callback *cb, + u16 nlmsg_flags, bool net_admin) +{ + struct inet_timewait_sock *tw = inet_twsk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, + sizeof(*r), nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + DEBUG_NET_WARN_ON_ONCE(tw->tw_state != TCP_TIME_WAIT); + + inet_diag_msg_common_fill(r, sk); + r->idiag_retrans = 0; + + r->idiag_state = READ_ONCE(tw->tw_substate); + r->idiag_timer = 3; + tmo = tw->tw_timer.expires - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; + + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + tw->tw_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + nlmsg_end(skb, nlh); + return 0; +} + +static int tcp_req_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + u16 nlmsg_flags, bool net_admin) +{ + struct request_sock *reqsk = inet_reqsk(sk); + struct inet_diag_msg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags); + if (!nlh) + return -EMSGSIZE; + + r = nlmsg_data(nlh); + inet_diag_msg_common_fill(r, sk); + r->idiag_state = TCP_SYN_RECV; + r->idiag_timer = 1; + r->idiag_retrans = READ_ONCE(reqsk->num_retrans); + + BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) != + offsetof(struct sock, sk_cookie)); + + tmo = READ_ONCE(inet_reqsk(sk)->rsk_timer.expires) - jiffies; + r->idiag_expires = jiffies_delta_to_msecs(tmo); + r->idiag_rqueue = 0; + r->idiag_wqueue = 0; + r->idiag_uid = 0; + r->idiag_inode = 0; + + if (net_admin && nla_put_u32(skb, INET_DIAG_MARK, + inet_rsk(reqsk)->ir_mark)) { + nlmsg_cancel(skb, nlh); + return -EMSGSIZE; + } + + nlmsg_end(skb, nlh); + return 0; +} + +static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, + struct netlink_callback *cb, + const struct inet_diag_req_v2 *r, + u16 nlmsg_flags, bool net_admin) +{ + if (sk->sk_state == TCP_TIME_WAIT) + return tcp_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); + + if (sk->sk_state == TCP_NEW_SYN_RECV) + return tcp_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin); + + return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags, + net_admin); +} + +static void twsk_build_assert(void) +{ + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) != + offsetof(struct sock, sk_family)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) != + offsetof(struct inet_sock, inet_num)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) != + offsetof(struct inet_sock, inet_dport)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) != + offsetof(struct inet_sock, inet_rcv_saddr)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) != + offsetof(struct inet_sock, inet_daddr)); + +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) != + offsetof(struct sock, sk_v6_rcv_saddr)); + + BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) != + offsetof(struct sock, sk_v6_daddr)); +#endif } static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { - struct inet_hashinfo *hinfo; + bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); + struct inet_diag_dump_data *cb_data = cb->data; + struct net *net = sock_net(skb->sk); + u32 idiag_states = r->idiag_states; + struct inet_hashinfo *hashinfo; + int i, num, s_i, s_num; + struct sock *sk; - hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; + hashinfo = net->ipv4.tcp_death_row.hashinfo; + if (idiag_states & TCPF_SYN_RECV) + idiag_states |= TCPF_NEW_SYN_RECV; + s_i = cb->args[1]; + s_num = num = cb->args[2]; + + if (cb->args[0] == 0) { + if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport) + goto skip_listen_ht; + + for (i = s_i; i <= hashinfo->lhash2_mask; i++) { + struct inet_listen_hashbucket *ilb; + struct hlist_nulls_node *node; + + num = 0; + ilb = &hashinfo->lhash2[i]; + + if (hlist_nulls_empty(&ilb->nulls_head)) { + s_num = 0; + continue; + } + spin_lock(&ilb->lock); + sk_nulls_for_each(sk, node, &ilb->nulls_head) { + struct inet_sock *inet = inet_sk(sk); + + if (!net_eq(sock_net(sk), net)) + continue; + + if (num < s_num) { + num++; + continue; + } + + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next_listen; + + if (r->id.idiag_sport != inet->inet_sport && + r->id.idiag_sport) + goto next_listen; + + if (!inet_diag_bc_sk(cb_data, sk)) + goto next_listen; + + if (inet_sk_diag_fill(sk, inet_csk(sk), skb, + cb, r, NLM_F_MULTI, + net_admin) < 0) { + spin_unlock(&ilb->lock); + goto done; + } + +next_listen: + ++num; + } + spin_unlock(&ilb->lock); + + s_num = 0; + } +skip_listen_ht: + cb->args[0] = 1; + s_i = num = s_num = 0; + } + +/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets + * with bh disabled. + */ +#define SKARR_SZ 16 + + /* Dump bound but inactive (not listening, connecting, etc.) sockets */ + if (cb->args[0] == 1) { + if (!(idiag_states & TCPF_BOUND_INACTIVE)) + goto skip_bind_ht; + + for (i = s_i; i < hashinfo->bhash_size; i++) { + struct inet_bind_hashbucket *ibb; + struct inet_bind2_bucket *tb2; + struct sock *sk_arr[SKARR_SZ]; + int num_arr[SKARR_SZ]; + int idx, accum, res; + +resume_bind_walk: + num = 0; + accum = 0; + ibb = &hashinfo->bhash2[i]; + + if (hlist_empty(&ibb->chain)) { + s_num = 0; + continue; + } + spin_lock_bh(&ibb->lock); + inet_bind_bucket_for_each(tb2, &ibb->chain) { + if (!net_eq(ib2_net(tb2), net)) + continue; + + sk_for_each_bound(sk, &tb2->owners) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_bind; + + if (sk->sk_state != TCP_CLOSE || + !inet->inet_num) + goto next_bind; + + if (r->sdiag_family != AF_UNSPEC && + r->sdiag_family != sk->sk_family) + goto next_bind; + + if (!inet_diag_bc_sk(cb_data, sk)) + goto next_bind; + + sock_hold(sk); + num_arr[accum] = num; + sk_arr[accum] = sk; + if (++accum == SKARR_SZ) + goto pause_bind_walk; +next_bind: + num++; + } + } +pause_bind_walk: + spin_unlock_bh(&ibb->lock); + + res = 0; + for (idx = 0; idx < accum; idx++) { + if (res >= 0) { + res = inet_sk_diag_fill(sk_arr[idx], + NULL, skb, cb, + r, NLM_F_MULTI, + net_admin); + if (res < 0) + num = num_arr[idx]; + } + sock_put(sk_arr[idx]); + } + if (res < 0) + goto done; + + cond_resched(); + + if (accum == SKARR_SZ) { + s_num = num + 1; + goto resume_bind_walk; + } + + s_num = 0; + } +skip_bind_ht: + cb->args[0] = 2; + s_i = num = s_num = 0; + } - inet_diag_dump_icsk(hinfo, skb, cb, r); + if (!(idiag_states & ~TCPF_LISTEN)) + goto out; + + for (i = s_i; i <= hashinfo->ehash_mask; i++) { + struct inet_ehash_bucket *head = &hashinfo->ehash[i]; + spinlock_t *lock = inet_ehash_lockp(hashinfo, i); + struct hlist_nulls_node *node; + struct sock *sk_arr[SKARR_SZ]; + int num_arr[SKARR_SZ]; + int idx, accum, res; + + if (hlist_nulls_empty(&head->chain)) + continue; + + if (i > s_i) + s_num = 0; + +next_chunk: + num = 0; + accum = 0; + spin_lock_bh(lock); + sk_nulls_for_each(sk, node, &head->chain) { + int state; + + if (!net_eq(sock_net(sk), net)) + continue; + if (num < s_num) + goto next_normal; + state = (sk->sk_state == TCP_TIME_WAIT) ? + READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state; + if (!(idiag_states & (1 << state))) + goto next_normal; + if (r->sdiag_family != AF_UNSPEC && + sk->sk_family != r->sdiag_family) + goto next_normal; + if (r->id.idiag_sport != htons(sk->sk_num) && + r->id.idiag_sport) + goto next_normal; + if (r->id.idiag_dport != sk->sk_dport && + r->id.idiag_dport) + goto next_normal; + twsk_build_assert(); + + if (!inet_diag_bc_sk(cb_data, sk)) + goto next_normal; + + if (!refcount_inc_not_zero(&sk->sk_refcnt)) + goto next_normal; + + num_arr[accum] = num; + sk_arr[accum] = sk; + if (++accum == SKARR_SZ) + break; +next_normal: + ++num; + } + spin_unlock_bh(lock); + + res = 0; + for (idx = 0; idx < accum; idx++) { + if (res >= 0) { + res = sk_diag_fill(sk_arr[idx], skb, cb, r, + NLM_F_MULTI, net_admin); + if (res < 0) + num = num_arr[idx]; + } + sock_gen_put(sk_arr[idx]); + } + if (res < 0) + break; + + cond_resched(); + + if (accum == SKARR_SZ) { + s_num = num + 1; + goto next_chunk; + } + } + +done: + cb->args[1] = i; + cb->args[2] = num; +out: + ; +} + +static struct sock *tcp_diag_find_one_icsk(struct net *net, + const struct inet_diag_req_v2 *req) +{ + struct sock *sk; + + rcu_read_lock(); + if (req->sdiag_family == AF_INET) { + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0], + req->id.idiag_dport, req->id.idiag_src[0], + req->id.idiag_sport, req->id.idiag_if); +#if IS_ENABLED(CONFIG_IPV6) + } else if (req->sdiag_family == AF_INET6) { + if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) && + ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src)) + sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3], + req->id.idiag_dport, req->id.idiag_src[3], + req->id.idiag_sport, req->id.idiag_if); + else + sk = inet6_lookup(net, NULL, 0, + (struct in6_addr *)req->id.idiag_dst, + req->id.idiag_dport, + (struct in6_addr *)req->id.idiag_src, + req->id.idiag_sport, + req->id.idiag_if); +#endif + } else { + rcu_read_unlock(); + return ERR_PTR(-EINVAL); + } + rcu_read_unlock(); + if (!sk) + return ERR_PTR(-ENOENT); + + if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) { + sock_gen_put(sk); + return ERR_PTR(-ENOENT); + } + + return sk; } static int tcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - struct inet_hashinfo *hinfo; + struct sk_buff *in_skb = cb->skb; + struct sk_buff *rep; + struct sock *sk; + struct net *net; + bool net_admin; + int err; - hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; + net = sock_net(in_skb->sk); + sk = tcp_diag_find_one_icsk(net, req); + if (IS_ERR(sk)) + return PTR_ERR(sk); - return inet_diag_dump_one_icsk(hinfo, cb, req); + net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN); + rep = nlmsg_new(tcp_diag_get_aux_size(sk, net_admin), GFP_KERNEL); + if (!rep) { + err = -ENOMEM; + goto out; + } + + err = sk_diag_fill(sk, rep, cb, req, 0, net_admin); + if (err < 0) { + WARN_ON(err == -EMSGSIZE); + nlmsg_free(rep); + goto out; + } + err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid); + +out: + if (sk) + sock_gen_put(sk); + + return err; } #ifdef CONFIG_INET_DIAG_DESTROY @@ -202,13 +643,10 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { struct net *net = sock_net(in_skb->sk); - struct inet_hashinfo *hinfo; struct sock *sk; int err; - hinfo = net->ipv4.tcp_death_row.hashinfo; - sk = inet_diag_find_one_icsk(net, hinfo, req); - + sk = tcp_diag_find_one_icsk(net, req); if (IS_ERR(sk)) return PTR_ERR(sk); @@ -226,7 +664,6 @@ static const struct inet_diag_handler tcp_diag_handler = { .dump_one = tcp_diag_dump_one, .idiag_get_info = tcp_diag_get_info, .idiag_get_aux = tcp_diag_get_aux, - .idiag_get_aux_size = tcp_diag_get_aux_size, .idiag_type = IPPROTO_TCP, .idiag_info_size = sizeof(struct tcp_info), #ifdef CONFIG_INET_DIAG_DESTROY diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index f1884f0c9e52..7d945a527daf 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -576,11 +576,12 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) } } else if (tp->syn_fastopen_ch && atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) { - dst = sk_dst_get(sk); - dev = dst ? dst_dev(dst) : NULL; + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; if (!(dev && (dev->flags & IFF_LOOPBACK))) atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0); - dst_release(dst); + rcu_read_unlock(); } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 71b76e98371a..b44fdc309633 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -70,8 +70,10 @@ #include <linux/sysctl.h> #include <linux/kernel.h> #include <linux/prefetch.h> +#include <linux/bitops.h> #include <net/dst.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/proto_memory.h> #include <net/inet_common.h> #include <linux/ipsec.h> @@ -339,31 +341,6 @@ static bool tcp_in_quickack_mode(struct sock *sk) (icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk)); } -static void tcp_ecn_queue_cwr(struct tcp_sock *tp) -{ - if (tcp_ecn_mode_rfc3168(tp)) - tp->ecn_flags |= TCP_ECN_QUEUE_CWR; -} - -static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb) -{ - if (tcp_hdr(skb)->cwr) { - tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR; - - /* If the sender is telling us it has entered CWR, then its - * cwnd may be very low (even just 1 packet), so we should ACK - * immediately. - */ - if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq) - inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; - } -} - -static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp) -{ - tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; -} - static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -384,38 +361,117 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); - if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && + tcp_ecn_mode_rfc3168(tp)) { /* Better not delay acks, sender can have a very low cwnd */ tcp_enter_quickack_mode(sk, 2); tp->ecn_flags |= TCP_ECN_DEMAND_CWR; } + /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by + * tcp_data_ecn_check() when the ECN codepoint of + * received TCP data contains ECT(0), ECT(1), or CE. + */ + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; default: if (tcp_ca_needs_ecn(sk)) tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + if (!tcp_ecn_mode_rfc3168(tp)) + break; tp->ecn_flags |= TCP_ECN_SEEN; break; } } -static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th) +/* Returns true if the byte counters can be used */ +static bool tcp_accecn_process_option(struct tcp_sock *tp, + const struct sk_buff *skb, + u32 delivered_bytes, int flag) { - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); -} + u8 estimate_ecnfield = tp->est_ecnfield; + bool ambiguous_ecn_bytes_incr = false; + bool first_changed = false; + unsigned int optlen; + bool order1, res; + unsigned int i; + u8 *ptr; -static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th) -{ - if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr)) - tcp_ecn_mode_set(tp, TCP_ECN_DISABLED); -} + if (tcp_accecn_opt_fail_recv(tp)) + return false; -static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th) -{ - if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp)) - return true; - return false; + if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) { + if (!tp->saw_accecn_opt) { + /* Too late to enable after this point due to + * potential counter wraps + */ + if (tp->bytes_sent >= (1 << 23) - 1) { + u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN; + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + } + return false; + } + + if (estimate_ecnfield) { + u8 ecnfield = estimate_ecnfield - 1; + + tp->delivered_ecn_bytes[ecnfield] += delivered_bytes; + return true; + } + return false; + } + + ptr = skb_transport_header(skb) + tp->rx_opt.accecn; + optlen = ptr[1] - 2; + if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1)) + return false; + order1 = (ptr[0] == TCPOPT_ACCECN1); + ptr += 2; + + if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + tp->saw_accecn_opt = tcp_accecn_option_init(skb, + tp->rx_opt.accecn); + if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) + tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV); + } + + res = !!estimate_ecnfield; + for (i = 0; i < 3; i++) { + u32 init_offset; + u8 ecnfield; + s32 delta; + u32 *cnt; + + if (optlen < TCPOLEN_ACCECN_PERFIELD) + break; + + ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1); + init_offset = tcp_accecn_field_init_offset(ecnfield); + cnt = &tp->delivered_ecn_bytes[ecnfield - 1]; + delta = tcp_update_ecn_bytes(cnt, ptr, init_offset); + if (delta && delta < 0) { + res = false; + ambiguous_ecn_bytes_incr = true; + } + if (delta && ecnfield != estimate_ecnfield) { + if (!first_changed) { + tp->est_ecnfield = ecnfield; + first_changed = true; + } else { + res = false; + ambiguous_ecn_bytes_incr = true; + } + } + + optlen -= TCPOLEN_ACCECN_PERFIELD; + ptr += TCPOLEN_ACCECN_PERFIELD; + } + if (ambiguous_ecn_bytes_incr) + tp->est_ecnfield = 0; + + return res; } static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) @@ -428,10 +484,101 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, bool ece_ack) { tp->delivered += delivered; - if (ece_ack) + if (tcp_ecn_mode_rfc3168(tp) && ece_ack) tcp_count_delivered_ce(tp, delivered); } +/* Returns the ECN CE delta */ +static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, u32 delivered_bytes, + int flag) +{ + u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1]; + const struct tcphdr *th = tcp_hdr(skb); + struct tcp_sock *tp = tcp_sk(sk); + u32 delta, safe_delta, d_ceb; + bool opt_deltas_valid; + u32 corrected_ace; + + /* Reordered ACK or uncertain due to lack of data to send and ts */ + if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS))) + return 0; + + opt_deltas_valid = tcp_accecn_process_option(tp, skb, + delivered_bytes, flag); + + if (!(flag & FLAG_SLOWPATH)) { + /* AccECN counter might overflow on large ACKs */ + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return 0; + } + + /* ACE field is not available during handshake */ + if (flag & FLAG_SYN_ACKED) + return 0; + + if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA) + inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW; + + corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET; + delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK; + if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK) + return delta; + + safe_delta = delivered_pkts - + ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK); + + if (opt_deltas_valid) { + d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb; + if (!d_ceb) + return delta; + + if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) && + (tcp_is_sack(tp) || + ((1 << inet_csk(sk)->icsk_ca_state) & + (TCPF_CA_Open | TCPF_CA_CWR)))) { + u32 est_d_cep; + + if (delivered_bytes <= d_ceb) + return safe_delta; + + est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb * + delivered_pkts, + delivered_bytes); + return min(safe_delta, + delta + + (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK)); + } + + if (d_ceb > delta * tp->mss_cache) + return safe_delta; + if (d_ceb < + safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT) + return delta; + } + + return safe_delta; +} + +static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb, + u32 delivered_pkts, u32 delivered_bytes, + int *flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 delta; + + delta = __tcp_accecn_process(sk, skb, delivered_pkts, + delivered_bytes, *flag); + if (delta > 0) { + tcp_count_delivered_ce(tp, delta); + *flag |= FLAG_ECE; + /* Recalculate header predictor */ + if (tp->pred_flags) + tcp_fast_path_on(tp); + } + return delta; +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -744,7 +891,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, } } -static void tcp_rcvbuf_grow(struct sock *sk) +void tcp_rcvbuf_grow(struct sock *sk) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -1030,6 +1177,7 @@ struct tcp_sacktag_state { u64 last_sackt; u32 reord; u32 sack_delivered; + u32 delivered_bytes; int flag; unsigned int mss_now; struct rate_sample *rate; @@ -1391,7 +1539,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, static u8 tcp_sacktag_one(struct sock *sk, struct tcp_sacktag_state *state, u8 sacked, u32 start_seq, u32 end_seq, - int dup_sack, int pcount, + int dup_sack, int pcount, u32 plen, u64 xmit_time) { struct tcp_sock *tp = tcp_sk(sk); @@ -1451,6 +1599,7 @@ static u8 tcp_sacktag_one(struct sock *sk, tp->sacked_out += pcount; /* Out-of-order packets delivered */ state->sack_delivered += pcount; + state->delivered_bytes += plen; } /* D-SACK. We can detect redundant retransmission in S|R and plain R @@ -1487,7 +1636,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, * tcp_highest_sack_seq() when skb is highest_sack. */ tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, - start_seq, end_seq, dup_sack, pcount, + start_seq, end_seq, dup_sack, pcount, skb->len, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); @@ -1772,6 +1921,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, TCP_SKB_CB(skb)->end_seq, dup_sack, tcp_skb_pcount(skb), + skb->len, tcp_skb_timestamp_us(skb)); tcp_rate_skb_delivered(sk, skb, state->rate); if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) @@ -2569,7 +2719,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) if (frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); - inet_csk(sk)->icsk_retransmits = 0; + WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); if (tcp_is_non_sack_preventing_reopen(sk)) return true; if (frto_undo || tcp_is_sack(tp)) { @@ -3280,6 +3430,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (sacked & TCPCB_SACKED_ACKED) { tp->sacked_out -= acked_pcount; + /* snd_una delta covers these skbs */ + sack->delivered_bytes -= skb->len; } else if (tcp_is_sack(tp)) { tcp_count_delivered(tp, acked_pcount, ece_ack); if (!tcp_skb_spurious_retrans(tp, skb)) @@ -3376,6 +3528,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb, if (before(reord, prior_fack)) tcp_check_sack_reordering(sk, reord, 0); } + + sack->delivered_bytes = (skb ? + TCP_SKB_CB(skb)->seq : tp->snd_una) - + prior_snd_una; } else if (skb && rtt_update && sack_rtt_us >= 0 && sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp, tcp_skb_timestamp_us(skb))) { @@ -3645,8 +3801,18 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time); } +static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector) +{ + struct tcp_sock *tp = tcp_sk(sk); + u16 flags = 0; + + if (accecn_reflector) + flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv); + __tcp_send_ack(sk, tp->rcv_nxt, flags); +} + /* RFC 5961 7 [ACK Throttling] */ -static void tcp_send_challenge_ack(struct sock *sk) +static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector) { struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); @@ -3676,7 +3842,7 @@ static void tcp_send_challenge_ack(struct sock *sk) WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1); send_ack: NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK); - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, accecn_reflector); } } @@ -3787,7 +3953,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit) } /* Returns the number of packets newly acked or sacked by the current ACK */ -static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) +static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, + u32 ecn_count, int flag) { const struct net *net = sock_net(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -3795,8 +3962,12 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag) delivered = tp->delivered - prior_delivered; NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered); - if (flag & FLAG_ECE) - NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered); + + if (flag & FLAG_ECE) { + if (tcp_ecn_mode_rfc3168(tp)) + ecn_count = delivered; + NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count); + } return delivered; } @@ -3817,11 +3988,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) u32 delivered = tp->delivered; u32 lost = tp->lost; int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */ + u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? */ u32 prior_fack; sack_state.first_sackt = 0; sack_state.rate = &rs; sack_state.sack_delivered = 0; + sack_state.delivered_bytes = 0; /* We very likely will need to access rtx queue. */ prefetch(sk->tcp_rtx_queue.rb_node); @@ -3837,7 +4010,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */ if (before(ack, prior_snd_una - max_window)) { if (!(flag & FLAG_NO_CHALLENGE_ACK)) - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); return -SKB_DROP_REASON_TCP_TOO_OLD_ACK; } goto old_ack; @@ -3851,7 +4024,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, prior_snd_una)) { flag |= FLAG_SND_UNA_ADVANCED; - icsk->icsk_retransmits = 0; + WRITE_ONCE(icsk->icsk_retransmits, 0); #if IS_ENABLED(CONFIG_TLS_DEVICE) if (static_branch_unlikely(&clean_acked_data_enabled.key)) @@ -3912,8 +4085,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) /* We passed data and got it acked, remove any soft error * log. Something worked... */ - WRITE_ONCE(sk->sk_err_soft, 0); - icsk->icsk_probes_out = 0; + if (READ_ONCE(sk->sk_err_soft)) + WRITE_ONCE(sk->sk_err_soft, 0); + WRITE_ONCE(icsk->icsk_probes_out, 0); tp->rcv_tstamp = tcp_jiffies32; if (!prior_packets) goto no_queue; @@ -3924,6 +4098,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + sack_state.delivered_bytes, + &flag); + tcp_in_ack_event(sk, flag); if (tp->tlp_high_seq) @@ -3948,7 +4128,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) sk_dst_confirm(sk); - delivered = tcp_newly_delivered(sk, delivered, flag); + delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag); + lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); @@ -3957,12 +4138,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + if (tcp_ecn_mode_accecn(tp)) + ecn_count = tcp_accecn_process(sk, skb, + tp->delivered - delivered, + sack_state.delivered_bytes, + &flag); tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); } /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than @@ -3983,7 +4169,7 @@ old_ack: &sack_state); tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, &rexmit); - tcp_newly_delivered(sk, delivered, flag); + tcp_newly_delivered(sk, delivered, ecn_count, flag); tcp_xmit_recovery(sk, rexmit); } @@ -4083,6 +4269,7 @@ void tcp_parse_options(const struct net *net, ptr = (const unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; + opt_rx->accecn = 0; opt_rx->saw_unknown = 0; while (length > 0) { @@ -4174,6 +4361,12 @@ void tcp_parse_options(const struct net *net, ptr, th->syn, foc, false); break; + case TCPOPT_ACCECN0: + case TCPOPT_ACCECN1: + /* Save offset of AccECN option in TCP header */ + opt_rx->accecn = (ptr - 2) - (__u8 *)th; + break; + case TCPOPT_EXP: /* Fast Open option shares code 254 using a * 16 bits magic number. @@ -4234,11 +4427,14 @@ static bool tcp_fast_parse_options(const struct net *net, */ if (th->doff == (sizeof(*th) / 4)) { tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; return false; } else if (tp->rx_opt.tstamp_ok && th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) { - if (tcp_parse_aligned_timestamp(tp, th)) + if (tcp_parse_aligned_timestamp(tp, th)) { + tp->rx_opt.accecn = 0; return true; + } } tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL); @@ -4830,7 +5026,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk, noinline_for_tracing static void tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); sk_skb_reason_drop(sk, skb, reason); } @@ -4890,12 +5086,23 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb); /* Check if this incoming skb can be added to socket receive queues * while satisfying sk->sk_rcvbuf limit. + * + * In theory we should use skb->truesize, but this can cause problems + * when applications use too small SO_RCVBUF values. + * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio, + * allowing RWIN to be close to available space. + * Whenever the receive queue gets full, we can receive a small packet + * filling RWIN, but with a high skb->truesize, because most NIC use 4K page + * plus sk_buff metadata even when receiving less than 1500 bytes of payload. + * + * Note that we use skb->len to decide to accept or drop this packet, + * but sk->sk_rmem_alloc is the sum of all skb->truesize. */ static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb) { - unsigned int new_mem = atomic_read(&sk->sk_rmem_alloc) + skb->truesize; + unsigned int rmem = atomic_read(&sk->sk_rmem_alloc); - return new_mem <= sk->sk_rcvbuf; + return rmem + skb->len <= sk->sk_rcvbuf; } static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb, @@ -5871,6 +6078,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th, int syn_inerr) { struct tcp_sock *tp = tcp_sk(sk); + bool accecn_reflector = false; SKB_DR(reason); /* RFC1323: H1. Apply PAWS check first. */ @@ -5968,7 +6176,7 @@ step1: if (tp->syn_fastopen && !tp->data_segs_in && sk->sk_state == TCP_ESTABLISHED) tcp_fastopen_active_disable(sk); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); SKB_DR_SET(reason, TCP_RESET); goto discard; } @@ -5979,6 +6187,16 @@ step1: * RFC 5961 4.2 : Send a challenge ack */ if (th->syn) { + if (tcp_ecn_mode_accecn(tp)) { + accecn_reflector = true; + if (tp->rx_opt.accecn && + tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn); + + tcp_accecn_saw_opt_fail_recv(tp, saw_opt); + tcp_accecn_opt_demand_min(sk, 1); + } + } if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack && TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq && TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt && @@ -5988,7 +6206,7 @@ syn_challenge: if (syn_inerr) TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE); - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, accecn_reflector); SKB_DR_SET(reason, TCP_INVALID_SYN); goto discard; } @@ -6060,6 +6278,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) */ tp->rx_opt.saw_tstamp = 0; + tp->rx_opt.accecn = 0; /* pred_flags is 0xS?10 << 16 + snd_wnd * if header_prediction is to be made @@ -6114,6 +6333,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) flag |= __tcp_replace_ts_recent(tp, delta); + tcp_ecn_received_counters(sk, skb, 0); + /* We know that such packets are checksummed * on entry. */ @@ -6162,6 +6383,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) /* Bulk data transfer: receiver */ tcp_cleanup_skb(skb); __skb_pull(skb, tcp_header_len); + tcp_ecn_received_counters(sk, skb, + len - tcp_header_len); eaten = tcp_queue_rcv(sk, skb, &fragstolen); tcp_event_data_recv(sk, skb); @@ -6202,6 +6425,8 @@ validate: return; step5: + tcp_ecn_received_counters_payload(sk, skb); + reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT); if ((int)reason < 0) { reason = -reason; @@ -6297,7 +6522,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, u16 mss = tp->rx_opt.mss_clamp, try_exp = 0; bool syn_drop = false; - if (mss == tp->rx_opt.user_mss) { + if (mss == READ_ONCE(tp->rx_opt.user_mss)) { struct tcp_options_received opt; /* Get original SYNACK MSS value if user MSS sets mss_clamp */ @@ -6452,7 +6677,9 @@ consume: * state to ESTABLISHED..." */ - tcp_ecn_rcv_synack(tp, th); + if (tcp_ecn_mode_any(tp)) + tcp_ecn_rcv_synack(sk, skb, th, + TCP_SKB_CB(skb)->ip_dsfield); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); tcp_try_undo_spurious_syn(sk); @@ -6524,7 +6751,7 @@ consume: TCP_DELACK_MAX, false); goto consume; } - tcp_send_ack(sk); + tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp)); return -1; } @@ -6583,7 +6810,7 @@ consume: tp->snd_wl1 = TCP_SKB_CB(skb)->seq; tp->max_window = tp->snd_wnd; - tcp_ecn_rcv_syn(tp, th); + tcp_ecn_rcv_syn(tp, th, skb); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -6636,7 +6863,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk) tcp_try_undo_recovery(sk); tcp_update_rto_time(tp); - inet_csk(sk)->icsk_retransmits = 0; + WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); /* In tcp_fastopen_synack_timer() on the first SYNACK RTO we set * retrans_stamp but don't enter CA_Loss, so in case that happened we * need to zero retrans_stamp here to prevent spurious @@ -6765,7 +6992,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) } /* accept old ack during closing */ if ((int)reason < 0) { - tcp_send_challenge_ack(sk); + tcp_send_challenge_ack(sk, false); reason = -reason; goto discard; } @@ -6812,9 +7039,12 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); + if (tcp_ecn_mode_accecn(tp)) + tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt); tcp_fast_path_on(tp); if (sk->sk_shutdown & SEND_SHUTDOWN) tcp_shutdown(sk, SEND_SHUTDOWN); + break; case TCP_FIN_WAIT1: { @@ -6984,6 +7214,15 @@ static void tcp_ecn_create_request(struct request_sock *req, bool ect, ecn_ok; u32 ecn_ok_dst; + if (tcp_accecn_syn_requested(th) && + READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) { + inet_rsk(req)->ecn_ok = 1; + tcp_rsk(req)->accecn_ok = 1; + tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield & + INET_ECN_MASK; + return; + } + if (!th_ecn) return; @@ -6991,7 +7230,8 @@ static void tcp_ecn_create_request(struct request_sock *req, ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK); ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst; - if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) || + if (((!ect || th->res1 || th->ae) && ecn_ok) || + tcp_ca_needs_ecn(listen_sk) || (ecn_ok_dst & DST_FEATURE_ECN_CA) || tcp_bpf_ca_needs_ecn((struct sock *)req)) inet_rsk(req)->ecn_ok = 1; @@ -7009,6 +7249,11 @@ static void tcp_openreq_init(struct request_sock *req, tcp_rsk(req)->snt_synack = 0; tcp_rsk(req)->snt_tsval_first = 0; tcp_rsk(req)->last_oow_ack_time = 0; + tcp_rsk(req)->accecn_ok = 0; + tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN; + tcp_rsk(req)->accecn_fail_mode = 0; + tcp_rsk(req)->syn_ect_rcv = 0; + tcp_rsk(req)->syn_ect_snt = 0; req->mss = rx_opt->mss_clamp; req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0; ireq->tstamp_ok = rx_opt->tstamp_ok; @@ -7048,8 +7293,8 @@ static bool tcp_syn_flood_action(struct sock *sk, const char *proto) #endif __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); - if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 && - xchg(&queue->synflood_warned, 1) == 0) { + if (syncookies != 2 && !READ_ONCE(queue->synflood_warned)) { + WRITE_ONCE(queue->synflood_warned, 1); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) { net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n", proto, inet6_rcv_saddr(sk), @@ -7117,7 +7362,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, return 0; } - mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss); + mss = tcp_parse_mss_option(th, READ_ONCE(tp->rx_opt.user_mss)); if (!mss) mss = af_ops->mss_clamp; @@ -7131,7 +7376,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, { struct tcp_fastopen_cookie foc = { .len = -1 }; struct tcp_options_received tmp_opt; - struct tcp_sock *tp = tcp_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); struct sock *fastopen_sk = NULL; struct request_sock *req; @@ -7182,7 +7427,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_clear_options(&tmp_opt); tmp_opt.mss_clamp = af_ops->mss_clamp; - tmp_opt.user_mss = tp->rx_opt.user_mss; + tmp_opt.user_mss = READ_ONCE(tp->rx_opt.user_mss); tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, want_cookie ? NULL : &foc); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 84d3d556ed80..b1fcf3e4e1ce 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -65,6 +65,7 @@ #include <net/icmp.h> #include <net/inet_hashtables.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/transp_v6.h> #include <net/ipv6.h> #include <net/inet_common.h> @@ -74,6 +75,7 @@ #include <net/secure_seq.h> #include <net/busy_poll.h> #include <net/rstreason.h> +#include <net/psp.h> #include <linux/inet.h> #include <linux/ipv6.h> @@ -292,9 +294,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_dport = usin->sin_port; sk_daddr_set(sk, daddr); - inet_csk(sk)->icsk_ext_hdr_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk); if (inet_opt) - inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen; + inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen; tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT; @@ -506,8 +508,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) struct sock *sk; int err; - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->daddr, th->dest, iph->saddr, + sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr, ntohs(th->source), inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); @@ -823,8 +824,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb, * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - NULL, 0, ip_hdr(skb)->saddr, + sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), dif, sdif); /* don't send rst if it can't find key */ @@ -1191,7 +1191,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, enum tcp_synack_type synack_type, struct sk_buff *syn_skb) { - const struct inet_request_sock *ireq = inet_rsk(req); + struct inet_request_sock *ireq = inet_rsk(req); struct flowi4 fl4; int err = -1; struct sk_buff *skb; @@ -1204,6 +1204,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { + tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); tos = READ_ONCE(inet_sk(sk)->tos); @@ -1505,9 +1506,9 @@ void tcp_clear_md5_list(struct sock *sk) md5sig = rcu_dereference_protected(tp->md5sig_info, 1); hlist_for_each_entry_safe(key, n, &md5sig->head, node) { - hlist_del_rcu(&key->node); + hlist_del(&key->node); atomic_sub(sizeof(*key), &sk->sk_omem_alloc); - kfree_rcu(key, rcu); + kfree(key); } } @@ -1907,6 +1908,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) enum skb_drop_reason reason; struct sock *rsk; + reason = psp_sk_rx_policy_check(sk, skb); + if (reason) + goto err_discard; + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ struct dst_entry *dst; @@ -1968,6 +1973,7 @@ csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); +err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; } @@ -1992,8 +1998,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) if (th->doff < sizeof(struct tcphdr) / 4) return 0; - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, + sk = __inet_lookup_established(net, iph->saddr, th->source, iph->daddr, ntohs(th->dest), skb->skb_iif, inet_sdif(skb)); if (sk) { @@ -2070,7 +2075,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, (TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) || !tcp_skb_can_collapse_rx(tail, skb) || thtail->doff != th->doff || - memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th))) + memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) || + /* prior to PSP Rx policy check, retain exact PSP metadata */ + psp_skb_coalesce_diff(tail, skb)) goto no_coalesce; __skb_pull(skb, hdrlen); @@ -2236,8 +2243,7 @@ int tcp_v4_rcv(struct sk_buff *skb) th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); lookup: - sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), th->source, + sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted); if (!sk) goto no_tcp_socket; @@ -2258,7 +2264,7 @@ lookup: &iph->saddr, &iph->daddr, AF_INET, dif, sdif); if (unlikely(drop_reason)) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } @@ -2403,7 +2409,7 @@ discard_it: return 0; discard_and_relse: - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; @@ -2426,9 +2432,7 @@ do_time_wait: &drop_reason); switch (tw_status) { case TCP_TW_SYN: { - struct sock *sk2 = inet_lookup_listener(net, - net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), + struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb), @@ -2441,6 +2445,10 @@ do_time_wait: __this_cpu_write(tcp_tw_isn, isn); goto process; } + + drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); + if (drop_reason) + break; } /* to ACK */ fallthrough; @@ -2459,7 +2467,6 @@ do_time_wait: static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp_timewait_sock), - .twsk_destructor= tcp_twsk_destructor, }; void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) @@ -2501,6 +2508,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = { .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; + +static void tcp4_destruct_sock(struct sock *sk) +{ + tcp_md5_destruct_sock(sk); + tcp_ao_destroy_sock(sk, false); + inet_sock_destruct(sk); +} #endif /* NOTE: A lot of things set to zero explicitly by call to @@ -2516,23 +2530,12 @@ static int tcp_v4_init_sock(struct sock *sk) #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific; + sk->sk_destruct = tcp4_destruct_sock; #endif return 0; } -#ifdef CONFIG_TCP_MD5SIG -static void tcp_md5sig_info_free_rcu(struct rcu_head *head) -{ - struct tcp_md5sig_info *md5sig; - - md5sig = container_of(head, struct tcp_md5sig_info, rcu); - kfree(md5sig); - static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); -} -#endif - static void tcp_release_user_frags(struct sock *sk) { #ifdef CONFIG_PAGE_POOL @@ -2569,19 +2572,6 @@ void tcp_v4_destroy_sock(struct sock *sk) /* Cleans up our, hopefully empty, out_of_order_queue. */ skb_rbtree_purge(&tp->out_of_order_queue); -#ifdef CONFIG_TCP_MD5SIG - /* Clean up the MD5 key list, if any */ - if (tp->md5sig_info) { - struct tcp_md5sig_info *md5sig; - - md5sig = rcu_dereference_protected(tp->md5sig_info, 1); - tcp_clear_md5_list(sk); - call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu); - rcu_assign_pointer(tp->md5sig_info, NULL); - } -#endif - tcp_ao_destroy_sock(sk, false); - /* Clean up a referenced TCP bind bucket. */ if (inet_csk(sk)->icsk_bind_hash) inet_put_port(sk); @@ -2958,9 +2948,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i) rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), - icsk->icsk_retransmits, + READ_ONCE(icsk->icsk_retransmits), from_kuid_munged(seq_user_ns(f), sk_uid(sk)), - icsk->icsk_probes_out, + READ_ONCE(icsk->icsk_probes_out), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, jiffies_to_clock_t(icsk->icsk_rto), @@ -3524,7 +3514,6 @@ struct proto tcp_prot = { .leave_memory_pressure = tcp_leave_memory_pressure, .stream_memory_free = tcp_stream_memory_free, .sockets_allocated = &tcp_sockets_allocated, - .orphan_count = &tcp_orphan_count, .memory_allocated = &net_aligned_data.tcp_memory_allocated, .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, @@ -3583,7 +3572,9 @@ fallback: static int __net_init tcp_sk_init(struct net *net) { - net->ipv4.sysctl_tcp_ecn = 2; + net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN; + net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL; + net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON; net->ipv4.sysctl_tcp_ecn_fallback = 1; net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS; diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index b67f94c60f9f..45b6ecd16412 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -170,7 +170,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst, struct net *net; spin_lock_bh(&tcp_metrics_lock); - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); /* While waiting for the spin-lock the cache might have been populated * with this entry and so we have to check again. @@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, return NULL; } - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); @@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, else return NULL; - net = dev_net_rcu(dst_dev(dst)); + net = dst_dev_net_rcu(dst); hash ^= net_hash_mix(net); hash = hash_32(hash, tcp_metrics_hash_log); diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 2994c9222c9c..2ec8c6f1cdcc 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -20,9 +20,11 @@ */ #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/xfrm.h> #include <net/busy_poll.h> #include <net/rstreason.h> +#include <net/psp.h> static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) { @@ -103,9 +105,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt); struct tcp_options_received tmp_opt; + enum skb_drop_reason psp_drop; bool paws_reject = false; int ts_recent_stamp; + /* Instead of dropping immediately, wait to see what value is + * returned. We will accept a non psp-encapsulated syn in the + * case where TCP_TW_SYN is returned. + */ + psp_drop = psp_twsk_rx_policy_check(tw, skb); + tmp_opt.saw_tstamp = 0; ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp); if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) { @@ -123,6 +132,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) { /* Just repeat all the checks of tcp_rcv_state_process() */ + if (psp_drop) + goto out_put; + /* Out of window, send ACK */ if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, @@ -193,6 +205,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { /* In window segment, it may be only reset or bare ack. */ + if (psp_drop) + goto out_put; + if (th->rst) { /* This is TIME_WAIT assassination, in two flavors. * Oh well... nobody has a sufficient solution to this @@ -246,6 +261,9 @@ kill: return TCP_TW_SYN; } + if (psp_drop) + goto out_put; + if (paws_reject) { *drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS; __NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED); @@ -264,6 +282,8 @@ kill: return tcp_timewait_check_oow_rate_limit( tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); } + +out_put: inet_twsk_put(tw); return TCP_TW_SUCCESS; } @@ -377,31 +397,22 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) } EXPORT_SYMBOL(tcp_time_wait); -#ifdef CONFIG_TCP_MD5SIG -static void tcp_md5_twsk_free_rcu(struct rcu_head *head) -{ - struct tcp_md5sig_key *key; - - key = container_of(head, struct tcp_md5sig_key, rcu); - kfree(key); - static_branch_slow_dec_deferred(&tcp_md5_needed); - tcp_md5_release_sigpool(); -} -#endif - void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); - if (twsk->tw_md5_key) - call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu); + if (twsk->tw_md5_key) { + kfree(twsk->tw_md5_key); + static_branch_slow_dec_deferred(&tcp_md5_needed); + tcp_md5_release_sigpool(); + } } #endif tcp_ao_destroy_sock(sk, true); + psp_twsk_assoc_free(inet_twsk(sk)); } -EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor); void tcp_twsk_purge(struct list_head *net_exit_list) { @@ -461,12 +472,26 @@ void tcp_openreq_init_rwin(struct request_sock *req, ireq->rcv_wscale = rcv_wscale; } -static void tcp_ecn_openreq_child(struct tcp_sock *tp, - const struct request_sock *req) +static void tcp_ecn_openreq_child(struct sock *sk, + const struct request_sock *req, + const struct sk_buff *skb) { - tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? - TCP_ECN_MODE_RFC3168 : - TCP_ECN_DISABLED); + const struct tcp_request_sock *treq = tcp_rsk(req); + struct tcp_sock *tp = tcp_sk(sk); + + if (treq->accecn_ok) { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN); + tp->syn_ect_snt = treq->syn_ect_snt; + tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt); + tp->saw_accecn_opt = treq->saw_accecn_opt; + tp->prev_ecnfield = treq->syn_ect_rcv; + tp->accecn_opt_demand = 1; + tcp_ecn_received_counters_payload(sk, skb); + } else { + tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ? + TCP_ECN_MODE_RFC3168 : + TCP_ECN_DISABLED); + } } void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) @@ -631,7 +656,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len) newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; - tcp_ecn_openreq_child(newtp, req); + tcp_ecn_openreq_child(newsk, req, skb); newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); @@ -674,6 +699,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, bool own_req; tmp_opt.saw_tstamp = 0; + tmp_opt.accecn = 0; if (th->doff > (sizeof(struct tcphdr)>>2)) { tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); @@ -851,6 +877,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, if (!(flg & TCP_FLAG_ACK)) return NULL; + if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn && + tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) { + u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn); + + tcp_rsk(req)->saw_accecn_opt = saw_opt; + if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) { + u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV; + + tcp_rsk(req)->accecn_fail_mode |= fail_mode; + } + } + /* For Fast Open no more processing is needed (sk is the * child socket). */ diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index be5c2294610e..2cb93da93abc 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -434,8 +434,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet_get_iif_sdif(skb, &iif, &sdif); iph = skb_gro_network_header(skb); net = dev_net_rcu(skb->dev); - sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - iph->saddr, th->source, + sk = __inet_lookup_established(net, iph->saddr, th->source, iph->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; @@ -485,6 +484,7 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff) th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr, iph->daddr, 0); + BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID << 1 != SKB_GSO_TCP_FIXEDID_INNER); skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 | (NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index caf11920a878..bb3576ac0ad7 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -38,8 +38,10 @@ #define pr_fmt(fmt) "TCP: " fmt #include <net/tcp.h> +#include <net/tcp_ecn.h> #include <net/mptcp.h> #include <net/proto_memory.h> +#include <net/psp.h> #include <linux/compiler.h> #include <linux/gfp.h> @@ -319,60 +321,6 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } -/* Packet ECN state for a SYN-ACK */ -static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) -{ - const struct tcp_sock *tp = tcp_sk(sk); - - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR; - if (tcp_ecn_disabled(tp)) - TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE; - else if (tcp_ca_needs_ecn(sk) || - tcp_bpf_ca_needs_ecn(sk)) - INET_ECN_xmit(sk); -} - -/* Packet ECN state for a SYN. */ -static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); - bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || - tcp_ca_needs_ecn(sk) || bpf_needs_ecn; - - if (!use_ecn) { - const struct dst_entry *dst = __sk_dst_get(sk); - - if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) - use_ecn = true; - } - - tp->ecn_flags = 0; - - if (use_ecn) { - TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; - tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); - if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) - INET_ECN_xmit(sk); - } -} - -static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb) -{ - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback)) - /* tp->ecn_flags are cleared at a later point in time when - * SYN ACK is ultimatively being received. - */ - TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR); -} - -static void -tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th) -{ - if (inet_rsk(req)->ecn_ok) - th->ece = 1; -} - /* Set up ECN state for a packet on a ESTABLISHED socket that is about to * be sent. */ @@ -381,7 +329,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_ecn_mode_rfc3168(tp)) { + if (!tcp_ecn_mode_any(tp)) + return; + + if (tcp_ecn_mode_accecn(tp)) { + if (!tcp_accecn_ace_fail_recv(tp)) + INET_ECN_xmit(sk); + tcp_accecn_set_ace(tp, skb, th); + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN; + } else { /* Not-retransmitted data segment: set ECT and inject CWR. */ if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { @@ -403,13 +359,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, /* Constructs common control bits of non-data skb. If SYN/FIN is present, * auto increment end seqno. */ -static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u16 flags) +static void tcp_init_nondata_skb(struct sk_buff *skb, struct sock *sk, + u32 seq, u16 flags) { skb->ip_summed = CHECKSUM_PARTIAL; TCP_SKB_CB(skb)->tcp_flags = flags; tcp_skb_pcount_set(skb, 1); + psp_enqueue_set_decrypted(sk, skb); TCP_SKB_CB(skb)->seq = seq; if (flags & (TCPHDR_SYN | TCPHDR_FIN)) @@ -430,6 +388,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp) #define OPTION_SMC BIT(9) #define OPTION_MPTCP BIT(10) #define OPTION_AO BIT(11) +#define OPTION_ACCECN BIT(12) static void smc_options_write(__be32 *ptr, u16 *options) { @@ -451,6 +410,8 @@ struct tcp_out_options { u16 mss; /* 0 to disable */ u8 ws; /* window scale, 0 to disable */ u8 num_sack_blocks; /* number of SACK blocks to include */ + u8 num_accecn_fields:7, /* number of AccECN fields needed */ + use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */ u8 hash_size; /* bytes in hash_location */ u8 bpf_opt_len; /* length of BPF hdr option */ __u8 *hash_location; /* temporary pointer, overloaded */ @@ -648,6 +609,11 @@ static __be32 *process_tcp_ao_options(struct tcp_sock *tp, return ptr; } +/* Initial values for AccECN option, ordered is based on ECN field bits + * similar to received_ecn_bytes. Used for SYN/ACK AccECN option. + */ +static const u32 synack_ecn_bytes[3] = { 0, 0, 0 }; + /* Write previously computed TCP options to the packet. * * Beware: Something in the Internet is very sensitive to the ordering of @@ -666,6 +632,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, struct tcp_out_options *opts, struct tcp_key *key) { + u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */ + u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */ __be32 *ptr = (__be32 *)(th + 1); u16 options = opts->options; /* mungable copy */ @@ -701,15 +669,75 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, *ptr++ = htonl(opts->tsecr); } + if (OPTION_ACCECN & options) { + const u32 *ecn_bytes = opts->use_synack_ecn_bytes ? + synack_ecn_bytes : + tp->received_ecn_bytes; + const u8 ect0_idx = INET_ECN_ECT_0 - 1; + const u8 ect1_idx = INET_ECN_ECT_1 - 1; + const u8 ce_idx = INET_ECN_CE - 1; + u32 e0b; + u32 e1b; + u32 ceb; + u8 len; + + e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET; + e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET; + ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET; + len = TCPOLEN_ACCECN_BASE + + opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD; + + if (opts->num_accecn_fields == 2) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + } else if (opts->num_accecn_fields == 1) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + leftover_highbyte = e1b & 0xff; + leftover_lowbyte = TCPOPT_NOP; + } else if (opts->num_accecn_fields == 0) { + leftover_highbyte = TCPOPT_ACCECN1; + leftover_lowbyte = len; + } else if (opts->num_accecn_fields == 3) { + *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) | + ((e1b >> 8) & 0xffff)); + *ptr++ = htonl(((e1b & 0xff) << 24) | + (ceb & 0xffffff)); + *ptr++ = htonl(((e0b & 0xffffff) << 8) | + TCPOPT_NOP); + } + if (tp) { + tp->accecn_minlen = 0; + tp->accecn_opt_tstamp = tp->tcp_mstamp; + if (tp->accecn_opt_demand) + tp->accecn_opt_demand--; + } + } + if (unlikely(OPTION_SACK_ADVERTISE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_WSCALE & options)) { - *ptr++ = htonl((TCPOPT_NOP << 24) | + u8 highbyte = TCPOPT_NOP; + + /* Do not split the leftover 2-byte to fit into a single + * NOP, i.e., replace this NOP only when 1 byte is leftover + * within leftover_highbyte. + */ + if (unlikely(leftover_highbyte != TCPOPT_NOP && + leftover_lowbyte == TCPOPT_NOP)) { + highbyte = leftover_highbyte; + leftover_highbyte = TCPOPT_NOP; + } + *ptr++ = htonl((highbyte << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | opts->ws); @@ -720,11 +748,13 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, tp->duplicate_sack : tp->selective_acks; int this_sack; - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | (TCPOPT_SACK << 8) | (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK))); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; for (this_sack = 0; this_sack < opts->num_sack_blocks; ++this_sack) { @@ -733,6 +763,14 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp, } tp->rx_opt.dsack = 0; + } else if (unlikely(leftover_highbyte != TCPOPT_NOP || + leftover_lowbyte != TCPOPT_NOP)) { + *ptr++ = htonl((leftover_highbyte << 24) | + (leftover_lowbyte << 16) | + (TCPOPT_NOP << 8) | + TCPOPT_NOP); + leftover_highbyte = TCPOPT_NOP; + leftover_lowbyte = TCPOPT_NOP; } if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) { @@ -813,6 +851,80 @@ static void mptcp_set_option_cond(const struct request_sock *req, } } +static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts) +{ + /* How much there's room for combining with the alignment padding? */ + if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) == + OPTION_SACK_ADVERTISE) + return 2; + else if (opts->options & OPTION_WSCALE) + return 1; + return 0; +} + +/* Calculates how long AccECN option will fit to @remaining option space. + * + * AccECN option can sometimes replace NOPs used for alignment of other + * TCP options (up to @max_combine_saving available). + * + * Only solutions with at least @required AccECN fields are accepted. + * + * Returns: The size of the AccECN option excluding space repurposed from + * the alignment of the other options. + */ +static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required, + int remaining) +{ + int size = TCP_ACCECN_MAXSIZE; + int sack_blocks_reduce = 0; + int max_combine_saving; + int rem = remaining; + int align_size; + + if (opts->use_synack_ecn_bytes) + max_combine_saving = tcp_synack_options_combine_saving(opts); + else + max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0; + opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; + while (opts->num_accecn_fields >= required) { + /* Pad to dword if cannot combine */ + if ((size & 0x3) > max_combine_saving) + align_size = ALIGN(size, 4); + else + align_size = ALIGN_DOWN(size, 4); + + if (rem >= align_size) { + size = align_size; + break; + } else if (opts->num_accecn_fields == required && + opts->num_sack_blocks > 2 && + required > 0) { + /* Try to fit the option by removing one SACK block */ + opts->num_sack_blocks--; + sack_blocks_reduce++; + rem = rem + TCPOLEN_SACK_PERBLOCK; + + opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS; + size = TCP_ACCECN_MAXSIZE; + continue; + } + + opts->num_accecn_fields--; + size -= TCPOLEN_ACCECN_PERFIELD; + } + if (sack_blocks_reduce > 0) { + if (opts->num_accecn_fields >= required) + size -= sack_blocks_reduce * TCPOLEN_SACK_PERBLOCK; + else + opts->num_sack_blocks += sack_blocks_reduce; + } + if (opts->num_accecn_fields < required) + return 0; + + opts->options |= OPTION_ACCECN; + return size; +} + /* Compute TCP options for SYN packets. This is not the final * network wire format yet. */ @@ -895,6 +1007,20 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, } } + /* Simultaneous open SYN/ACK needs AccECN option but not SYN. + * It is attempted to negotiate the use of AccECN also on the first + * retransmitted SYN, as mentioned in "3.1.4.1. Retransmitted SYNs" + * of AccECN draft. + */ + if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) && + tcp_ecn_mode_accecn(tp) && + inet_csk(sk)->icsk_retransmits < 2 && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + remaining >= TCPOLEN_ACCECN_BASE)) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining); return MAX_TCP_OPTION_SPACE - remaining; @@ -912,6 +1038,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, { struct inet_request_sock *ireq = inet_rsk(req); unsigned int remaining = MAX_TCP_OPTION_SPACE; + struct tcp_request_sock *treq = tcp_rsk(req); if (tcp_key_is_md5(key)) { opts->options |= OPTION_MD5; @@ -974,6 +1101,13 @@ static unsigned int tcp_synack_options(const struct sock *sk, smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining); + if (treq->accecn_ok && + READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) && + req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) { + opts->use_synack_ecn_bytes = 1; + remaining -= tcp_options_fit_accecn(opts, 0, remaining); + } + bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb, synack_type, opts, &remaining); @@ -1030,17 +1164,32 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack; if (unlikely(eff_sacks)) { const unsigned int remaining = MAX_TCP_OPTION_SPACE - size; - if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED + - TCPOLEN_SACK_PERBLOCK)) - return size; + if (likely(remaining >= TCPOLEN_SACK_BASE_ALIGNED + + TCPOLEN_SACK_PERBLOCK)) { + opts->num_sack_blocks = + min_t(unsigned int, eff_sacks, + (remaining - TCPOLEN_SACK_BASE_ALIGNED) / + TCPOLEN_SACK_PERBLOCK); + + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + } else { + opts->num_sack_blocks = 0; + } + } else { + opts->num_sack_blocks = 0; + } - opts->num_sack_blocks = - min_t(unsigned int, eff_sacks, - (remaining - TCPOLEN_SACK_BASE_ALIGNED) / - TCPOLEN_SACK_PERBLOCK); + if (tcp_ecn_mode_accecn(tp)) { + int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option); - size += TCPOLEN_SACK_BASE_ALIGNED + - opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) && + (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand || + tcp_accecn_option_beacon_check(sk))) { + opts->use_synack_ecn_bytes = 0; + size += tcp_options_fit_accecn(opts, tp->accecn_minlen, + MAX_TCP_OPTION_SPACE - size); + } } if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp, @@ -1437,7 +1586,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th, opts.hash_location); if (err) { - kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED); + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_NOT_SPECIFIED); return -ENOMEM; } } @@ -1510,6 +1659,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) /* Advance write_seq and place onto the write_queue. */ WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq); __skb_header_release(skb); + psp_enqueue_set_decrypted(sk, skb); tcp_add_write_queue_tail(sk, skb); sk_wmem_queued_add(sk, skb->truesize); sk_mem_charge(sk, skb->truesize); @@ -2750,6 +2900,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, sent_pkts = 0; tcp_mstamp_refresh(tp); + + /* AccECN option beacon depends on mstamp, it may change mss */ + if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk)) + mss_now = tcp_current_mss(sk); + if (!push_one) { /* Do MTU probing. */ result = tcp_mtu_probe(sk); @@ -3402,7 +3557,10 @@ start: tcp_retrans_try_collapse(sk, skb, avail_wnd); } - /* RFC3168, section 6.1.1.1. ECN fallback */ + /* RFC3168, section 6.1.1.1. ECN fallback + * As AccECN uses the same SYN flags (+ AE), this check covers both + * cases. + */ if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN) tcp_ecn_clear_syn(sk, skb); @@ -3578,9 +3736,8 @@ void sk_forced_mem_schedule(struct sock *sk, int size) sk_forward_alloc_add(sk, amt << PAGE_SHIFT); sk_memory_allocated_add(sk, amt); - if (mem_cgroup_sockets_enabled && sk->sk_memcg) - mem_cgroup_charge_skmem(sk->sk_memcg, amt, - gfp_memcg_charge() | __GFP_NOFAIL); + if (mem_cgroup_sk_enabled(sk)) + mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL); } /* Send a FIN. The caller locks the socket for us. @@ -3625,7 +3782,7 @@ void tcp_send_fin(struct sock *sk) skb_reserve(skb, MAX_TCP_HEADER); sk_forced_mem_schedule(sk, skb->truesize); /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ - tcp_init_nondata_skb(skb, tp->write_seq, + tcp_init_nondata_skb(skb, sk, tp->write_seq, TCPHDR_ACK | TCPHDR_FIN); tcp_queue_skb(sk, skb); } @@ -3653,7 +3810,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority, /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), + tcp_init_nondata_skb(skb, sk, tcp_acceptable_seq(sk), TCPHDR_ACK | TCPHDR_RST); tcp_mstamp_refresh(tcp_sk(sk)); /* Send it off. */ @@ -3891,6 +4048,7 @@ static void tcp_connect_init(struct sock *sk) const struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); __u8 rcv_wscale; + u16 user_mss; u32 rcv_wnd; /* We'll fix this up when we get a response from the other end. @@ -3903,8 +4061,9 @@ static void tcp_connect_init(struct sock *sk) tcp_ao_connect_init(sk); /* If user gave his TCP_MAXSEG, record it to clamp */ - if (tp->rx_opt.user_mss) - tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + user_mss = READ_ONCE(tp->rx_opt.user_mss); + if (user_mss) + tp->rx_opt.mss_clamp = user_mss; tp->max_window = 0; tcp_mtup_init(sk); tcp_sync_mss(sk, dst_mtu(dst)); @@ -3955,7 +4114,7 @@ static void tcp_connect_init(struct sock *sk) WRITE_ONCE(tp->copied_seq, tp->rcv_nxt); inet_csk(sk)->icsk_rto = tcp_timeout_init(sk); - inet_csk(sk)->icsk_retransmits = 0; + WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0); tcp_clear_retrans(tp); } @@ -4148,7 +4307,7 @@ int tcp_connect(struct sock *sk) /* SYN eats a sequence byte, write_seq updated by * tcp_connect_queue_skb(). */ - tcp_init_nondata_skb(buff, tp->write_seq, TCPHDR_SYN); + tcp_init_nondata_skb(buff, sk, tp->write_seq, TCPHDR_SYN); tcp_mstamp_refresh(tp); tp->retrans_stamp = tcp_time_stamp_ts(tp); tcp_connect_queue_skb(sk, buff); @@ -4273,7 +4432,8 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags) /* Reserve space for headers and prepare control bits. */ skb_reserve(buff, MAX_TCP_HEADER); - tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK | flags); + tcp_init_nondata_skb(buff, sk, + tcp_acceptable_seq(sk), TCPHDR_ACK | flags); /* We do not want pure acks influencing TCP Small Queues or fq/pacing * too much. @@ -4319,7 +4479,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib) * end to send an ack. Don't queue or clone SKB, just * send it. */ - tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK); + tcp_init_nondata_skb(skb, sk, tp->snd_una - !urgent, TCPHDR_ACK); NET_INC_STATS(sock_net(sk), mib); return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0); } @@ -4393,13 +4553,13 @@ void tcp_send_probe0(struct sock *sk) if (tp->packets_out || tcp_write_queue_empty(sk)) { /* Cancel probe timer, if it is not required. */ - icsk->icsk_probes_out = 0; + WRITE_ONCE(icsk->icsk_probes_out, 0); icsk->icsk_backoff = 0; icsk->icsk_probes_tstamp = 0; return; } - icsk->icsk_probes_out++; + WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1); if (err <= 0) { if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2)) icsk->icsk_backoff++; @@ -4437,7 +4597,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) tcp_sk_rw(sk)->total_retrans++; } trace_tcp_retransmit_synack(sk, req); - req->num_retrans++; + WRITE_ONCE(req->num_retrans, req->num_retrans + 1); } return res; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a207877270fb..2dd73a4e8e51 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -392,7 +392,7 @@ static void tcp_probe_timer(struct sock *sk) int max_probes; if (tp->packets_out || !skb) { - icsk->icsk_probes_out = 0; + WRITE_ONCE(icsk->icsk_probes_out, 0); icsk->icsk_probes_tstamp = 0; return; } @@ -444,7 +444,7 @@ static void tcp_update_rto_stats(struct sock *sk) tp->total_rto_recoveries++; tp->rto_stamp = tcp_time_stamp_ms(tp); } - icsk->icsk_retransmits++; + WRITE_ONCE(icsk->icsk_retransmits, icsk->icsk_retransmits + 1); tp->total_rto++; } @@ -839,7 +839,7 @@ static void tcp_keepalive_timer(struct timer_list *t) goto out; } if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) { - icsk->icsk_probes_out++; + WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1); elapsed = keepalive_intvl_when(tp); } else { /* If keepalive was lost due to local congestion, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cc3ce0f762ec..95241093b7f0 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -68,7 +68,7 @@ * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind * a single port at the same time. - * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support + * Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support * James Chapman : Add L2TP encapsulation type. */ @@ -509,7 +509,7 @@ rescore: /* compute_score is too long of a function to be * inlined, and calling it again here yields - * measureable overhead for some + * measurable overhead for some * workloads. Work around it by jumping * backwards to rescore 'result'. */ @@ -1685,31 +1685,6 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb) udp_rmem_release(sk, udp_skb_truesize(skb), 1, true); } -/* Idea of busylocks is to let producers grab an extra spinlock - * to relieve pressure on the receive_queue spinlock shared by consumer. - * Under flood, this means that only one producer can be in line - * trying to acquire the receive_queue spinlock. - * These busylock can be allocated on a per cpu manner, instead of a - * per socket one (that would consume a cache line per socket) - */ -static int udp_busylocks_log __read_mostly; -static spinlock_t *udp_busylocks __read_mostly; - -static spinlock_t *busylock_acquire(void *ptr) -{ - spinlock_t *busy; - - busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log); - spin_lock(busy); - return busy; -} - -static void busylock_release(spinlock_t *busy) -{ - if (busy) - spin_unlock(busy); -} - static int udp_rmem_schedule(struct sock *sk, int size) { int delta; @@ -1724,14 +1699,24 @@ static int udp_rmem_schedule(struct sock *sk, int size) int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) { struct sk_buff_head *list = &sk->sk_receive_queue; + struct udp_prod_queue *udp_prod_queue; + struct sk_buff *next, *to_drop = NULL; + struct llist_node *ll_list; unsigned int rmem, rcvbuf; - spinlock_t *busy = NULL; int size, err = -ENOMEM; + int total_size = 0; + int q_size = 0; + int dropcount; + int nb = 0; rmem = atomic_read(&sk->sk_rmem_alloc); rcvbuf = READ_ONCE(sk->sk_rcvbuf); size = skb->truesize; + udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()]; + + rmem += atomic_read(&udp_prod_queue->rmem_alloc); + /* Immediately drop when the receive queue is full. * Cast to unsigned int performs the boundary check for INT_MAX. */ @@ -1739,8 +1724,8 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (rcvbuf > INT_MAX >> 1) goto drop; - /* Always allow at least one packet for small buffer. */ - if (rmem > rcvbuf) + /* Accept the packet if queue is empty. */ + if (rmem) goto drop; } @@ -1753,42 +1738,77 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb) if (rmem > (rcvbuf >> 1)) { skb_condense(skb); size = skb->truesize; - busy = busylock_acquire(sk); } udp_set_dev_scratch(skb); - atomic_add(size, &sk->sk_rmem_alloc); + atomic_add(size, &udp_prod_queue->rmem_alloc); + + if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root)) + return 0; + + dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0; spin_lock(&list->lock); - err = udp_rmem_schedule(sk, size); - if (err) { - spin_unlock(&list->lock); - goto uncharge_drop; - } - sk_forward_alloc_add(sk, -size); + ll_list = llist_del_all(&udp_prod_queue->ll_root); - /* no need to setup a destructor, we will explicitly release the - * forward allocated memory on dequeue - */ - sock_skb_set_dropcount(sk, skb); + ll_list = llist_reverse_order(ll_list); + + llist_for_each_entry_safe(skb, next, ll_list, ll_node) { + size = udp_skb_truesize(skb); + total_size += size; + err = udp_rmem_schedule(sk, size); + if (unlikely(err)) { + /* Free the skbs outside of locked section. */ + skb->next = to_drop; + to_drop = skb; + continue; + } + + q_size += size; + sk_forward_alloc_add(sk, -size); + + /* no need to setup a destructor, we will explicitly release the + * forward allocated memory on dequeue + */ + SOCK_SKB_CB(skb)->dropcount = dropcount; + nb++; + __skb_queue_tail(list, skb); + } + + atomic_add(q_size, &sk->sk_rmem_alloc); - __skb_queue_tail(list, skb); spin_unlock(&list->lock); - if (!sock_flag(sk, SOCK_DEAD)) - INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk); + if (!sock_flag(sk, SOCK_DEAD)) { + /* Multiple threads might be blocked in recvmsg(), + * using prepare_to_wait_exclusive(). + */ + while (nb) { + INDIRECT_CALL_1(sk->sk_data_ready, + sock_def_readable, sk); + nb--; + } + } - busylock_release(busy); - return 0; + if (unlikely(to_drop)) { + for (nb = 0; to_drop != NULL; nb++) { + skb = to_drop; + to_drop = skb->next; + skb_mark_not_on_list(skb); + /* TODO: update SNMP values. */ + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM); + } + numa_drop_add(&udp_sk(sk)->drop_counters, nb); + } -uncharge_drop: - atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + atomic_sub(total_size, &udp_prod_queue->rmem_alloc); + + return 0; drop: - atomic_inc(&sk->sk_drops); - busylock_release(busy); + udp_drops_inc(sk); return err; } EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb); @@ -1806,6 +1826,7 @@ void udp_destruct_common(struct sock *sk) kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); + kfree(up->udp_prod_queue); } EXPORT_IPV6_MOD_GPL(udp_destruct_common); @@ -1817,10 +1838,11 @@ static void udp_destruct_sock(struct sock *sk) int udp_init_sock(struct sock *sk) { - udp_lib_init_sock(sk); + int res = udp_lib_init_sock(sk); + sk->sk_destruct = udp_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); - return 0; + return res; } void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) @@ -1828,6 +1850,13 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset))) sk_peek_offset_bwd(sk, len); + if (!skb_shared(skb)) { + if (unlikely(udp_skb_has_head_state(skb))) + skb_release_head_state(skb); + skb_attempt_defer_free(skb); + return; + } + if (!skb_unref(skb)) return; @@ -1852,7 +1881,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk, IS_UDPLITE(sk)); __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, IS_UDPLITE(sk)); - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); __skb_unlink(skb, rcvq); *total += skb->truesize; kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); @@ -2008,7 +2037,7 @@ try_again: __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM); goto try_again; } @@ -2078,7 +2107,7 @@ try_again: if (unlikely(err)) { if (!peeking) { - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); } @@ -2449,7 +2478,7 @@ csum_error: __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -2534,7 +2563,7 @@ start_lookup: nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); __UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP_INC_STATS(net, UDP_MIB_INERRORS, @@ -2609,7 +2638,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, return 0; } -/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and +/* wrapper for udp_queue_rcv_skb taking care of csum conversion and * return code conversion for ip layer consumption */ static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, @@ -2807,7 +2836,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net, return NULL; } -int udp_v4_early_demux(struct sk_buff *skb) +enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb) { struct net *net = dev_net(skb->dev); struct in_device *in_dev = NULL; @@ -2821,7 +2850,7 @@ int udp_v4_early_demux(struct sk_buff *skb) /* validate the packet */ if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr))) - return 0; + return SKB_NOT_DROPPED_YET; iph = ip_hdr(skb); uh = udp_hdr(skb); @@ -2830,12 +2859,12 @@ int udp_v4_early_demux(struct sk_buff *skb) in_dev = __in_dev_get_rcu(skb->dev); if (!in_dev) - return 0; + return SKB_NOT_DROPPED_YET; ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr, iph->protocol); if (!ours) - return 0; + return SKB_NOT_DROPPED_YET; sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr, uh->source, iph->saddr, @@ -2846,7 +2875,7 @@ int udp_v4_early_demux(struct sk_buff *skb) } if (!sk) - return 0; + return SKB_NOT_DROPPED_YET; skb->sk = sk; DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk)); @@ -2873,7 +2902,7 @@ int udp_v4_early_demux(struct sk_buff *skb) ip4h_dscp(iph), skb->dev, in_dev, &itag); } - return 0; + return SKB_NOT_DROPPED_YET; } int udp_rcv(struct sk_buff *skb) @@ -3386,7 +3415,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f, from_kuid_munged(seq_user_ns(f), sk_uid(sp)), 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } int udp4_seq_show(struct seq_file *seq, void *v) @@ -3994,7 +4023,6 @@ static void __init bpf_iter_register(void) void __init udp_init(void) { unsigned long limit; - unsigned int i; udp_table_init(&udp_table, "UDP"); limit = nr_free_buffer_pages() / 8; @@ -4003,15 +4031,6 @@ void __init udp_init(void) sysctl_udp_mem[1] = limit; sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2; - /* 16 spinlocks per cpu */ - udp_busylocks_log = ilog2(nr_cpu_ids) + 4; - udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log, - GFP_KERNEL); - if (!udp_busylocks) - panic("UDP: failed to alloc udp_busylocks\n"); - for (i = 0; i < (1U << udp_busylocks_log); i++) - spin_lock_init(udp_busylocks + i); - if (register_pernet_subsys(&udp_sysctl_ops)) panic("UDP: failed to init sysctl parameters.\n"); diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index 38cb3a28e4ed..6e491c720c90 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -16,9 +16,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc, bool net_admin) + bool net_admin) { - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI, @@ -92,12 +92,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, { bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct net *net = sock_net(skb->sk); - struct inet_diag_dump_data *cb_data; int num, s_num, slot, s_slot; - struct nlattr *bc; - cb_data = cb->data; - bc = cb_data->inet_diag_nla_bc; s_slot = cb->args[0]; num = s_num = cb->args[1]; @@ -130,7 +126,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb, r->id.idiag_dport) goto next; - if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) { + if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) { spin_unlock_bh(&hslot->lock); goto done; } diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b1f3fd302e9d..19d0b5b09ffa 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -891,8 +891,6 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) skb_gro_checksum_try_convert(skb, IPPROTO_UDP, inet_gro_compute_pseudo); skip: - NAPI_GRO_CB(skb)->is_ipv6 = 0; - if (static_branch_unlikely(&udp_encap_needed_key)) sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest); diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index fce945f23069..54386e06a813 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -4,6 +4,7 @@ #include <linux/socket.h> #include <linux/kernel.h> #include <net/dst_metadata.h> +#include <net/flow.h> #include <net/udp.h> #include <net/udp_tunnel.h> #include <net/inet_dscp.h> @@ -253,7 +254,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb, fl4.saddr = key->u.ipv4.src; fl4.fl4_dport = dport; fl4.fl4_sport = sport; - fl4.flowi4_tos = tos & INET_DSCP_MASK; + fl4.flowi4_dscp = inet_dsfield_to_dscp(tos); fl4.flowi4_flags = key->flow_flags; rt = ip_route_output_key(net, &fl4); diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c index ff66db48453c..944b3cf25468 100644 --- a/net/ipv4/udp_tunnel_nic.c +++ b/net/ipv4/udp_tunnel_nic.c @@ -930,7 +930,7 @@ udp_tunnel_nic_netdevice_event(struct notifier_block *unused, err = udp_tunnel_nic_register(dev); if (err) - netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err); + netdev_warn(dev, "failed to register for UDP tunnel offloads: %d", err); return notifier_from_errno(err); } /* All other events will need the udp_tunnel_nic state */ diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7fb6205619e7..58faf1ddd2b1 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -14,7 +14,7 @@ #include <linux/inetdevice.h> #include <net/dst.h> #include <net/xfrm.h> -#include <net/inet_dscp.h> +#include <net/flow.h> #include <net/ip.h> #include <net/l3mdev.h> @@ -25,7 +25,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = params->daddr->a4; - fl4->flowi4_tos = inet_dscp_to_dsfield(params->dscp); + fl4->flowi4_dscp = params->dscp; fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net, params->oif); fl4->flowi4_mark = params->mark; diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 1c9c686d9522..b8f9a8c0302e 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -304,10 +304,9 @@ config IPV6_SEG6_LWTUNNEL config IPV6_SEG6_HMAC bool "IPv6: Segment Routing HMAC support" depends on IPV6 - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 - select CRYPTO_SHA256 + select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_UTILS help Support for HMAC signature generation and verification of SR-enabled packets. diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index f17a5dd4789f..40e9c336f6c5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -7238,7 +7238,9 @@ static const struct ctl_table addrconf_sysctl[] = { .data = &ipv6_devconf.rpl_seg_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, }, { .procname = "ioam6_enabled", diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 1992621e3f3f..1b0314644e0c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -857,7 +857,7 @@ int inet6_sk_rebuild_header(struct sock *sk) return PTR_ERR(dst); } - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, false, false); } return 0; diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index eb474f0987ae..95372e0f1d21 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -46,6 +46,34 @@ struct ah_skb_cb { #define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0])) +/* Helper to save IPv6 addresses and extension headers to temporary storage */ +static inline void ah6_save_hdrs(struct tmp_ext *iph_ext, + struct ipv6hdr *top_iph, int extlen) +{ + if (!extlen) + return; + +#if IS_ENABLED(CONFIG_IPV6_MIP6) + iph_ext->saddr = top_iph->saddr; +#endif + iph_ext->daddr = top_iph->daddr; + memcpy(&iph_ext->hdrs, top_iph + 1, extlen - sizeof(*iph_ext)); +} + +/* Helper to restore IPv6 addresses and extension headers from temporary storage */ +static inline void ah6_restore_hdrs(struct ipv6hdr *top_iph, + struct tmp_ext *iph_ext, int extlen) +{ + if (!extlen) + return; + +#if IS_ENABLED(CONFIG_IPV6_MIP6) + top_iph->saddr = iph_ext->saddr; +#endif + top_iph->daddr = iph_ext->daddr; + memcpy(top_iph + 1, &iph_ext->hdrs, extlen - sizeof(*iph_ext)); +} + static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags, unsigned int size) { @@ -301,13 +329,7 @@ static void ah6_output_done(void *data, int err) memcpy(ah->auth_data, icv, ahp->icv_trunc_len); memcpy(top_iph, iph_base, IPV6HDR_BASELEN); - if (extlen) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - memcpy(&top_iph->saddr, iph_ext, extlen); -#else - memcpy(&top_iph->daddr, iph_ext, extlen); -#endif - } + ah6_restore_hdrs(top_iph, iph_ext, extlen); kfree(AH_SKB_CB(skb)->tmp); xfrm_output_resume(skb->sk, skb, err); @@ -378,12 +400,8 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) */ memcpy(iph_base, top_iph, IPV6HDR_BASELEN); + ah6_save_hdrs(iph_ext, top_iph, extlen); if (extlen) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - memcpy(iph_ext, &top_iph->saddr, extlen); -#else - memcpy(iph_ext, &top_iph->daddr, extlen); -#endif err = ipv6_clear_mutable_options(top_iph, extlen - sizeof(*iph_ext) + sizeof(*top_iph), @@ -434,13 +452,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) memcpy(ah->auth_data, icv, ahp->icv_trunc_len); memcpy(top_iph, iph_base, IPV6HDR_BASELEN); - if (extlen) { -#if IS_ENABLED(CONFIG_IPV6_MIP6) - memcpy(&top_iph->saddr, iph_ext, extlen); -#else - memcpy(&top_iph->daddr, iph_ext, extlen); -#endif - } + ah6_restore_hdrs(top_iph, iph_ext, extlen); out_free: kfree(iph_base); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c index f8a8e46286b8..52599584422b 100644 --- a/net/ipv6/anycast.c +++ b/net/ipv6/anycast.c @@ -104,7 +104,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr) rcu_read_lock(); rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); if (rt) { - dev = dst_dev(&rt->dst); + dev = dst_dev_rcu(&rt->dst); netdev_hold(dev, &dev_tracker, GFP_ATOMIC); ip6_rt_put(rt); } else if (ishost) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 972bf0426d59..33ebe93d80e3 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -1068,5 +1068,5 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, 0, sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, - atomic_read(&sp->sk_drops)); + sk_drops_read(sp)); } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 72adfc107b55..e75da98f5283 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -149,8 +149,8 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) dport = encap->encap_dport; spin_unlock_bh(&x->lock); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6, - dport, &x->props.saddr.in6, ntohs(sport), 0, 0); + sk = __inet6_lookup_established(net, &x->id.daddr.in6, dport, + &x->props.saddr.in6, ntohs(sport), 0, 0); if (!sk) return ERR_PTR(-ENOENT); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 44550957fd4e..56c974cf75d1 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -209,7 +209,8 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, * this lookup should be more aggressive (not longer than timeout). */ dst = ip6_route_output(net, sk, fl6); - dev = dst_dev(dst); + rcu_read_lock(); + dev = dst_dev_rcu(dst); if (dst->error) { IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); @@ -224,14 +225,12 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (rt->rt6i_dst.plen < 128) tmo >>= ((128 - rt->rt6i_dst.plen)>>5); - rcu_read_lock(); peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr); res = inet_peer_xrlim_allow(peer, tmo); - rcu_read_unlock(); } + rcu_read_unlock(); if (!res) - __ICMP6_INC_STATS(net, ip6_dst_idev(dst), - ICMP6_MIB_RATELIMITHOST); + __ICMP6_INC_STATS(net, NULL, ICMP6_MIB_RATELIMITHOST); else icmp_global_consume(net); dst_release(dst); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 333e43434dd7..ea5cf3fdfdd6 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -91,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk, dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p); if (!IS_ERR(dst)) - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, false, false); } return dst; } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 76ee521189eb..5e1da088d8e1 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -47,24 +47,23 @@ EXPORT_SYMBOL_GPL(inet6_ehashfn); * The sockhash lock must be held as a reader here. */ struct sock *__inet6_lookup_established(const struct net *net, - struct inet_hashinfo *hashinfo, - const struct in6_addr *saddr, - const __be16 sport, - const struct in6_addr *daddr, - const u16 hnum, - const int dif, const int sdif) + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const u16 hnum, + const int dif, const int sdif) { - struct sock *sk; - const struct hlist_nulls_node *node; const __portpair ports = INET_COMBINED_PORTS(sport, hnum); - /* Optimize here for direct hit, only listening connections can - * have wildcards anyways. - */ - unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); - unsigned int slot = hash & hashinfo->ehash_mask; - struct inet_ehash_bucket *head = &hashinfo->ehash[slot]; - + const struct hlist_nulls_node *node; + struct inet_ehash_bucket *head; + struct inet_hashinfo *hashinfo; + unsigned int hash, slot; + struct sock *sk; + hashinfo = net->ipv4.tcp_death_row.hashinfo; + hash = inet6_ehashfn(net, daddr, hnum, saddr, sport); + slot = hash & hashinfo->ehash_mask; + head = &hashinfo->ehash[slot]; begin: sk_nulls_for_each_rcu(sk, node, &head->chain) { if (sk->sk_hash != hash) @@ -200,19 +199,20 @@ struct sock *inet6_lookup_run_sk_lookup(const struct net *net, EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup); struct sock *inet6_lookup_listener(const struct net *net, - struct inet_hashinfo *hashinfo, - struct sk_buff *skb, int doff, - const struct in6_addr *saddr, - const __be16 sport, const struct in6_addr *daddr, - const unsigned short hnum, const int dif, const int sdif) + struct sk_buff *skb, int doff, + const struct in6_addr *saddr, + const __be16 sport, + const struct in6_addr *daddr, + const unsigned short hnum, + const int dif, const int sdif) { struct inet_listen_hashbucket *ilb2; + struct inet_hashinfo *hashinfo; struct sock *result = NULL; unsigned int hash2; /* Lookup redirect from BPF */ - if (static_branch_unlikely(&bpf_sk_lookup_enabled) && - hashinfo == net->ipv4.tcp_death_row.hashinfo) { + if (static_branch_unlikely(&bpf_sk_lookup_enabled)) { result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff, saddr, sport, daddr, hnum, dif, inet6_ehashfn); @@ -220,6 +220,7 @@ struct sock *inet6_lookup_listener(const struct net *net, goto done; } + hashinfo = net->ipv4.tcp_death_row.hashinfo; hash2 = ipv6_portaddr_hash(net, daddr, hnum); ilb2 = inet_lhash2_bucket(hashinfo, hash2); @@ -244,7 +245,6 @@ done: EXPORT_SYMBOL_GPL(inet6_lookup_listener); struct sock *inet6_lookup(const struct net *net, - struct inet_hashinfo *hashinfo, struct sk_buff *skb, int doff, const struct in6_addr *saddr, const __be16 sport, const struct in6_addr *daddr, const __be16 dport, @@ -253,7 +253,7 @@ struct sock *inet6_lookup(const struct net *net, struct sock *sk; bool refcounted; - sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr, + sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr, ntohs(dport), dif, 0, &refcounted); if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; @@ -305,8 +305,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, dif, sdif))) { if (sk2->sk_state == TCP_TIME_WAIT) { tw = inet_twsk(sk2); - if (sk->sk_protocol == IPPROTO_TCP && - tcp_twsk_unique(sk, sk2, twp)) + if (tcp_twsk_unique(sk, sk2, twp)) break; } goto not_unique; @@ -369,14 +368,3 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row, __inet6_check_established); } EXPORT_SYMBOL_GPL(inet6_hash_connect); - -int inet6_hash(struct sock *sk) -{ - int err = 0; - - if (sk->sk_state != TCP_CLOSE) - err = __inet_hash(sk, NULL); - - return err; -} -EXPORT_SYMBOL_GPL(inet6_hash); diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 74d49dd6124d..c82a75510c0e 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -329,9 +329,9 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, if (parms->name[0]) { if (!dev_valid_name(parms->name)) return NULL; - strscpy(name, parms->name, IFNAMSIZ); + strscpy(name, parms->name); } else { - strcpy(name, "ip6gre%d"); + strscpy(name, "ip6gre%d"); } dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN, ip6gre_tunnel_setup); @@ -1469,7 +1469,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); + strscpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) @@ -1529,7 +1529,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) tunnel->dev = dev; tunnel->net = dev_net(dev); - strcpy(tunnel->parms.name, dev->name); + strscpy(tunnel->parms.name, dev->name); tunnel->hlen = sizeof(struct ipv6hdr) + 4; } @@ -1842,7 +1842,7 @@ static int ip6erspan_tap_init(struct net_device *dev) tunnel = netdev_priv(dev); tunnel->dev = dev; - strcpy(tunnel->parms.name, dev->name); + strscpy(tunnel->parms.name, dev->name); ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL); if (ret) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1e1410237b6e..f904739e99b9 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -60,7 +60,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst_dev(dst); + struct net_device *dev = dst_dev_rcu(dst); struct inet6_dev *idev = ip6_dst_idev(dst); unsigned int hh_len = LL_RESERVED_SPACE(dev); const struct in6_addr *daddr, *nexthop; @@ -70,15 +70,12 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * /* Be paranoid, rather than too clever. */ if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { - /* Make sure idev stays alive */ - rcu_read_lock(); + /* idev stays alive because we hold rcu_read_lock(). */ skb = skb_expand_head(skb, hh_len); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); - rcu_read_unlock(); return -ENOMEM; } - rcu_read_unlock(); } hdr = ipv6_hdr(skb); @@ -123,7 +120,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); - rcu_read_lock(); nexthop = rt6_nexthop(dst_rt6_info(dst), daddr); neigh = __ipv6_neigh_lookup_noref(dev, nexthop); @@ -131,7 +127,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * if (unlikely(!neigh)) neigh = __neigh_create(&nd_tbl, nexthop, dev, false); if (IS_ERR(neigh)) { - rcu_read_unlock(); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); return -EINVAL; @@ -139,7 +134,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff * } sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); - rcu_read_unlock(); return ret; } @@ -233,22 +227,29 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst_dev(dst), *indev = skb->dev; - struct inet6_dev *idev = ip6_dst_idev(dst); + struct net_device *dev, *indev = skb->dev; + struct inet6_dev *idev; + int ret; skb->protocol = htons(ETH_P_IPV6); + rcu_read_lock(); + dev = dst_dev_rcu(dst); + idev = ip6_dst_idev(dst); skb->dev = dev; if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); + rcu_read_unlock(); kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); return 0; } - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, indev, dev, - ip6_finish_output, - !(IP6CB(skb)->flags & IP6SKB_REROUTED)); + ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, + net, sk, skb, indev, dev, + ip6_finish_output, + !(IP6CB(skb)->flags & IP6SKB_REROUTED)); + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ip6_output); @@ -268,35 +269,36 @@ bool ip6_autoflowlabel(struct net *net, const struct sock *sk) int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) { - struct net *net = sock_net(sk); const struct ipv6_pinfo *np = inet6_sk(sk); struct in6_addr *first_hop = &fl6->daddr; struct dst_entry *dst = skb_dst(skb); - struct net_device *dev = dst_dev(dst); struct inet6_dev *idev = ip6_dst_idev(dst); struct hop_jumbo_hdr *hop_jumbo; int hoplen = sizeof(*hop_jumbo); + struct net *net = sock_net(sk); unsigned int head_room; + struct net_device *dev; struct ipv6hdr *hdr; u8 proto = fl6->flowi6_proto; int seg_len = skb->len; - int hlimit = -1; + int ret, hlimit = -1; u32 mtu; + rcu_read_lock(); + + dev = dst_dev_rcu(dst); head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); if (opt) head_room += opt->opt_nflen + opt->opt_flen; if (unlikely(head_room > skb_headroom(skb))) { - /* Make sure idev stays alive */ - rcu_read_lock(); + /* idev stays alive while we hold rcu_read_lock(). */ skb = skb_expand_head(skb, head_room); if (!skb) { IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); - rcu_read_unlock(); - return -ENOBUFS; + ret = -ENOBUFS; + goto unlock; } - rcu_read_unlock(); } if (opt) { @@ -358,17 +360,21 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, * skb to its handler for processing */ skb = l3mdev_ip6_out((struct sock *)sk, skb); - if (unlikely(!skb)) - return 0; + if (unlikely(!skb)) { + ret = 0; + goto unlock; + } /* hooks should never assume socket lock is held. * we promote our socket to non const */ - return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, - net, (struct sock *)sk, skb, NULL, dev, - dst_output); + ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, + net, (struct sock *)sk, skb, NULL, dev, + dst_output); + goto unlock; } + ret = -EMSGSIZE; skb->dev = dev; /* ipv6_local_error() does not require socket lock, * we promote our socket to non const @@ -377,7 +383,9 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); kfree_skb(skb); - return -EMSGSIZE; +unlock: + rcu_read_unlock(); + return ret; } EXPORT_SYMBOL(ip6_xmit); @@ -1092,9 +1100,11 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk, * sockets. * 2. oif also should be the same. */ - if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || + if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, + np->daddr_cache ? &sk->sk_v6_daddr : NULL) || #ifdef CONFIG_IPV6_SUBTREES - ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || + ip6_rt_check(&rt->rt6i_src, &fl6->saddr, + np->saddr_cache ? &np->saddr : NULL) || #endif (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { dst_release(dst); diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e66ec623972e..a61e742794f9 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -49,6 +49,7 @@ #include <net/xfrm.h> #include <net/compat.h> #include <net/seg6.h> +#include <net/psp.h> #include <linux/uaccess.h> @@ -107,7 +108,10 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk, !((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) { struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen; + + icsk->icsk_ext_hdr_len = + psp_sk_overhead(sk) + + opt->opt_flen + opt->opt_nflen; icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 36ca27496b3c..016b572e7d6f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -169,6 +169,29 @@ static int unsolicited_report_interval(struct inet6_dev *idev) return iv > 0 ? iv : 1; } +static struct net_device *ip6_mc_find_dev(struct net *net, + const struct in6_addr *group, + int ifindex) +{ + struct net_device *dev = NULL; + struct rt6_info *rt; + + if (ifindex == 0) { + rcu_read_lock(); + rt = rt6_lookup(net, group, NULL, 0, NULL, 0); + if (rt) { + dev = dst_dev_rcu(&rt->dst); + dev_hold(dev); + ip6_rt_put(rt); + } + rcu_read_unlock(); + } else { + dev = dev_get_by_index(net, ifindex); + } + + return dev; +} + /* * socket join on multicast group */ @@ -191,28 +214,13 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex, } mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); - if (!mc_lst) return -ENOMEM; mc_lst->next = NULL; mc_lst->addr = *addr; - if (ifindex == 0) { - struct rt6_info *rt; - - rcu_read_lock(); - rt = rt6_lookup(net, addr, NULL, 0, NULL, 0); - if (rt) { - dev = dst_dev(&rt->dst); - dev_hold(dev); - ip6_rt_put(rt); - } - rcu_read_unlock(); - } else { - dev = dev_get_by_index(net, ifindex); - } - + dev = ip6_mc_find_dev(net, addr, ifindex); if (!dev) { sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); return -ENODEV; @@ -302,27 +310,14 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr) } EXPORT_SYMBOL(ipv6_sock_mc_drop); -static struct inet6_dev *ip6_mc_find_dev(struct net *net, - const struct in6_addr *group, - int ifindex) +static struct inet6_dev *ip6_mc_find_idev(struct net *net, + const struct in6_addr *group, + int ifindex) { - struct net_device *dev = NULL; + struct net_device *dev; struct inet6_dev *idev; - if (ifindex == 0) { - struct rt6_info *rt; - - rcu_read_lock(); - rt = rt6_lookup(net, group, NULL, 0, NULL, 0); - if (rt) { - dev = dst_dev(&rt->dst); - dev_hold(dev); - ip6_rt_put(rt); - } - rcu_read_unlock(); - } else { - dev = dev_get_by_index(net, ifindex); - } + dev = ip6_mc_find_dev(net, group, ifindex); if (!dev) return NULL; @@ -374,7 +369,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk, if (!ipv6_addr_is_multicast(group)) return -EINVAL; - idev = ip6_mc_find_dev(net, group, pgsr->gsr_interface); + idev = ip6_mc_find_idev(net, group, pgsr->gsr_interface); if (!idev) return -ENODEV; @@ -509,7 +504,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, gsf->gf_fmode != MCAST_EXCLUDE) return -EINVAL; - idev = ip6_mc_find_dev(net, group, gsf->gf_interface); + idev = ip6_mc_find_idev(net, group, gsf->gf_interface); if (!idev) return -ENODEV; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7d5abb3158ec..f427e41e9c49 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -130,7 +130,7 @@ struct neigh_table nd_tbl = { [NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ, [NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ, [NEIGH_VAR_GC_STALETIME] = 60 * HZ, - [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX, + [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT, [NEIGH_VAR_PROXY_QLEN] = 64, [NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ, [NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10, @@ -505,7 +505,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr, ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len); - dev = dst_dev(dst); + dev = dst_dev_rcu(dst); idev = __in6_dev_get(dev); IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c index 45f9105f9ac1..46540a5a4331 100644 --- a/net/ipv6/netfilter.c +++ b/net/ipv6/netfilter.c @@ -63,7 +63,10 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff #ifdef CONFIG_XFRM if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) && xfrm_decode_session(net, skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) { - skb_dst_set(skb, NULL); + /* ignore return value from skb_dstref_steal, xfrm_lookup takes + * care of dropping the refcnt if needed. + */ + skb_dstref_steal(skb); dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0); if (IS_ERR(dst)) return PTR_ERR(dst); diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c index cb2d38e80de9..ef5b7e85cffa 100644 --- a/net/ipv6/netfilter/nf_reject_ipv6.c +++ b/net/ipv6/netfilter/nf_reject_ipv6.c @@ -12,6 +12,19 @@ #include <linux/netfilter_ipv6.h> #include <linux/netfilter_bridge.h> +static struct ipv6hdr * +nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int hoplimit); +static void +nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen); +static const struct tcphdr * +nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook); + static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook) { const struct ipv6hdr *ip6h = ipv6_hdr(skb); @@ -91,6 +104,32 @@ struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset); +static bool nf_skb_is_icmp6_unreach(const struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + u8 proto = ip6h->nexthdr; + u8 _type, *tp; + int thoff; + __be16 fo; + + thoff = ipv6_skip_exthdr(skb, ((u8 *)(ip6h + 1) - skb->data), &proto, &fo); + + if (thoff < 0 || thoff >= skb->len || fo != 0) + return false; + + if (proto != IPPROTO_ICMPV6) + return false; + + tp = skb_header_pointer(skb, + thoff + offsetof(struct icmp6hdr, icmp6_type), + sizeof(_type), &_type); + + if (!tp) + return false; + + return *tp == ICMPV6_DEST_UNREACH; +} + struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, struct sk_buff *oldskb, const struct net_device *dev, @@ -104,6 +143,10 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, if (!nf_reject_ip6hdr_validate(oldskb)) return NULL; + /* Don't reply to ICMPV6_DEST_UNREACH with ICMPV6_DEST_UNREACH */ + if (nf_skb_is_icmp6_unreach(oldskb)) + return NULL; + /* Include "As much of invoking packet as possible without the ICMPv6 * packet exceeding the minimum IPv6 MTU" in the ICMP payload. */ @@ -146,9 +189,10 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net, } EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach); -const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, - struct tcphdr *otcph, - unsigned int *otcplen, int hook) +static const struct tcphdr * +nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, + struct tcphdr *otcph, + unsigned int *otcplen, int hook) { const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); u8 proto; @@ -192,11 +236,11 @@ const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb, return otcph; } -EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get); -struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - __u8 protocol, int hoplimit) +static struct ipv6hdr * +nf_reject_ip6hdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + __u8 protocol, int hoplimit) { struct ipv6hdr *ip6h; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); @@ -216,11 +260,11 @@ struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb, return ip6h; } -EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put); -void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, - const struct sk_buff *oldskb, - const struct tcphdr *oth, unsigned int otcplen) +static void +nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, + const struct sk_buff *oldskb, + const struct tcphdr *oth, unsigned int otcplen) { struct tcphdr *tcph; @@ -248,7 +292,6 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb, csum_partial(tcph, sizeof(struct tcphdr), 0)); } -EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put); static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in) { diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index 9ea5ef56cb27..ced8bd44828e 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -83,8 +83,7 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, { switch (protocol) { case IPPROTO_TCP: - return inet6_lookup(net, net->ipv4.tcp_death_row.hashinfo, - skb, doff, saddr, sport, daddr, dport, + return inet6_lookup(net, skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp6_lib_lookup(net, saddr, sport, daddr, dport, diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index 52f828bb5a83..b2f59ed9d7cc 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -80,7 +80,6 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { - struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; struct sock *sk; switch (protocol) { @@ -94,7 +93,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, hinfo, skb, + sk = inet6_lookup_listener(net, skb, thoff + __tcp_hdrlen(hp), saddr, sport, daddr, ntohs(dport), @@ -109,7 +108,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, */ break; case NF_TPROXY_LOOKUP_ESTABLISHED: - sk = __inet6_lookup_established(net, hinfo, saddr, sport, daddr, + sk = __inet6_lookup_established(net, saddr, sport, daddr, ntohs(dport), in->ifindex, 0); break; default: diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index d21fe27fe21e..1c9b283a4132 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -104,18 +104,20 @@ EXPORT_SYMBOL(ip6_find_1stfragopt); int ip6_dst_hoplimit(struct dst_entry *dst) { int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); + + rcu_read_lock(); if (hoplimit == 0) { - struct net_device *dev = dst_dev(dst); + struct net_device *dev = dst_dev_rcu(dst); struct inet6_dev *idev; - rcu_read_lock(); idev = __in6_dev_get(dev); if (idev) hoplimit = READ_ONCE(idev->cnf.hop_limit); else hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit); - rcu_read_unlock(); } + rcu_read_unlock(); + return hoplimit; } EXPORT_SYMBOL(ip6_dst_hoplimit); diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 82b0492923d4..d7a2cdaa2631 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -208,7 +208,6 @@ struct proto pingv6_prot = { .recvmsg = ping_recvmsg, .bind = ping_bind, .backlog_rcv = ping_queue_rcv_skb, - .hash = ping_hash, .unhash = ping_unhash, .get_port = ping_get_port, .put_port = ping_unhash, diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index 752327b10dde..73296f38c252 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -85,7 +85,6 @@ static const struct snmp_mib snmp6_ipstats_list[] = { SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS), SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS), SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_icmp6_list[] = { @@ -95,30 +94,10 @@ static const struct snmp_mib snmp6_icmp6_list[] = { SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), +/* ICMP6_MIB_RATELIMITHOST needs to be last, see snmp6_dev_seq_show(). */ SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), - SNMP_MIB_SENTINEL }; -/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */ -static const char *const icmp6type2name[256] = { - [ICMPV6_DEST_UNREACH] = "DestUnreachs", - [ICMPV6_PKT_TOOBIG] = "PktTooBigs", - [ICMPV6_TIME_EXCEED] = "TimeExcds", - [ICMPV6_PARAMPROB] = "ParmProblems", - [ICMPV6_ECHO_REQUEST] = "Echos", - [ICMPV6_ECHO_REPLY] = "EchoReplies", - [ICMPV6_MGM_QUERY] = "GroupMembQueries", - [ICMPV6_MGM_REPORT] = "GroupMembResponses", - [ICMPV6_MGM_REDUCTION] = "GroupMembReductions", - [ICMPV6_MLD2_REPORT] = "MLDv2Reports", - [NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements", - [NDISC_ROUTER_SOLICITATION] = "RouterSolicits", - [NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements", - [NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits", - [NDISC_REDIRECT] = "Redirects", -}; - - static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS), SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), @@ -129,7 +108,6 @@ static const struct snmp_mib snmp6_udp6_list[] = { SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI), SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static const struct snmp_mib snmp6_udplite6_list[] = { @@ -141,7 +119,6 @@ static const struct snmp_mib snmp6_udplite6_list[] = { SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS), SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS), SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS), - SNMP_MIB_SENTINEL }; static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) @@ -151,11 +128,31 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) /* print by name -- deprecated items */ for (i = 0; i < ICMP6MSG_MIB_MAX; i++) { + const char *p = NULL; int icmptype; - const char *p; + +#define CASE(TYP, STR) case TYP: p = STR; break; icmptype = i & 0xff; - p = icmp6type2name[icmptype]; + switch (icmptype) { +/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */ + CASE(ICMPV6_DEST_UNREACH, "DestUnreachs") + CASE(ICMPV6_PKT_TOOBIG, "PktTooBigs") + CASE(ICMPV6_TIME_EXCEED, "TimeExcds") + CASE(ICMPV6_PARAMPROB, "ParmProblems") + CASE(ICMPV6_ECHO_REQUEST, "Echos") + CASE(ICMPV6_ECHO_REPLY, "EchoReplies") + CASE(ICMPV6_MGM_QUERY, "GroupMembQueries") + CASE(ICMPV6_MGM_REPORT, "GroupMembResponses") + CASE(ICMPV6_MGM_REDUCTION, "GroupMembReductions") + CASE(ICMPV6_MLD2_REPORT, "MLDv2Reports") + CASE(NDISC_ROUTER_ADVERTISEMENT, "RouterAdvertisements") + CASE(NDISC_ROUTER_SOLICITATION, "RouterSolicits") + CASE(NDISC_NEIGHBOUR_ADVERTISEMENT, "NeighborAdvertisements") + CASE(NDISC_NEIGHBOUR_SOLICITATION, "NeighborSolicits") + CASE(NDISC_REDIRECT, "Redirects") + } +#undef CASE if (!p) /* don't print un-named types here */ continue; snprintf(name, sizeof(name), "Icmp6%s%s", @@ -182,35 +179,37 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib) */ static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib, atomic_long_t *smib, - const struct snmp_mib *itemlist) + const struct snmp_mib *itemlist, + int cnt) { unsigned long buff[SNMP_MIB_MAX]; int i; if (pcpumib) { - memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX); + memset(buff, 0, sizeof(unsigned long) * cnt); - snmp_get_cpu_field_batch(buff, itemlist, pcpumib); - for (i = 0; itemlist[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, itemlist, cnt, pcpumib); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, buff[i]); } else { - for (i = 0; itemlist[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, atomic_long_read(smib + itemlist[i].entry)); } } static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib, - const struct snmp_mib *itemlist, size_t syncpoff) + const struct snmp_mib *itemlist, + int cnt, size_t syncpoff) { u64 buff64[SNMP_MIB_MAX]; int i; - memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX); + memset(buff64, 0, sizeof(u64) * cnt); - snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff); - for (i = 0; itemlist[i].name; i++) + snmp_get_cpu_field64_batch_cnt(buff64, itemlist, cnt, mib, syncpoff); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]); } @@ -219,14 +218,19 @@ static int snmp6_seq_show(struct seq_file *seq, void *v) struct net *net = (struct net *)seq->private; snmp6_seq_show_item64(seq, net->mib.ipv6_statistics, - snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_ipstats_list, + ARRAY_SIZE(snmp6_ipstats_list), + offsetof(struct ipstats_mib, syncp)); snmp6_seq_show_item(seq, net->mib.icmpv6_statistics, - NULL, snmp6_icmp6_list); + NULL, snmp6_icmp6_list, + ARRAY_SIZE(snmp6_icmp6_list)); snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs); snmp6_seq_show_item(seq, net->mib.udp_stats_in6, - NULL, snmp6_udp6_list); + NULL, snmp6_udp6_list, + ARRAY_SIZE(snmp6_udp6_list)); snmp6_seq_show_item(seq, net->mib.udplite_stats_in6, - NULL, snmp6_udplite6_list); + NULL, snmp6_udplite6_list, + ARRAY_SIZE(snmp6_udplite6_list)); return 0; } @@ -236,9 +240,14 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); snmp6_seq_show_item64(seq, idev->stats.ipv6, - snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp)); + snmp6_ipstats_list, + ARRAY_SIZE(snmp6_ipstats_list), + offsetof(struct ipstats_mib, syncp)); + + /* Per idev icmp stats do not have ICMP6_MIB_RATELIMITHOST */ snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs, - snmp6_icmp6_list); + snmp6_icmp6_list, ARRAY_SIZE(snmp6_icmp6_list) - 1); + snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs); return 0; } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 4c3f8245c40f..e369f54844dd 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -163,7 +163,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); continue; } @@ -361,7 +361,7 @@ static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb) if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) && skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } @@ -389,7 +389,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) struct raw6_sock *rp = raw6_sk(sk); if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY); return NET_RX_DROP; } @@ -414,7 +414,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb) if (inet_test_bit(HDRINCL, sk)) { if (skb_checksum_complete(skb)) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM); return NET_RX_DROP; } @@ -445,7 +445,7 @@ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); - if (np->rxpmtu && np->rxopt.bits.rxpmtu) + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); skb = skb_recv_datagram(sk, flags, &err); @@ -1175,6 +1175,7 @@ static int rawv6_init_sk(struct sock *sk) { struct raw6_sock *rp = raw6_sk(sk); + sk->sk_drop_counters = &rp->drop_counters; switch (inet_sk(sk)->inet_num) { case IPPROTO_ICMPV6: rp->checksum = 1; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 3299cfa12e21..aee6a10b112a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2943,7 +2943,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (res.f6i->nh) { struct fib6_nh_match_arg arg = { - .dev = dst_dev(dst), + .dev = dst_dev_rcu(dst), .gw = &rt6->rt6i_gateway, }; @@ -3032,13 +3032,12 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, #endif ip6_dst_store(sk, dst, - ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ? - &sk->sk_v6_daddr : NULL, + ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr), #ifdef CONFIG_IPV6_SUBTREES ipv6_addr_equal(&fl6->saddr, &np->saddr) ? - &np->saddr : + true : #endif - NULL); + false); } static bool ip6_redirect_nh_match(const struct fib6_result *res, @@ -3238,7 +3237,6 @@ EXPORT_SYMBOL_GPL(ip6_sk_redirect); static unsigned int ip6_default_advmss(const struct dst_entry *dst) { - struct net_device *dev = dst_dev(dst); unsigned int mtu = dst_mtu(dst); struct net *net; @@ -3246,7 +3244,7 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst) rcu_read_lock(); - net = dev_net_rcu(dev); + net = dst_dev_net_rcu(dst); if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss) mtu = net->ipv6.sysctl.ip6_rt_min_advmss; @@ -4301,7 +4299,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu if (res.f6i->nh) { struct fib6_nh_match_arg arg = { - .dev = dst_dev(dst), + .dev = dst_dev_rcu(dst), .gw = &rt->rt6i_gateway, }; diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 180da19c148c..a5c4c629b788 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -522,16 +522,10 @@ int __init seg6_init(void) if (err) goto out_unregister_iptun; - err = seg6_hmac_init(); - if (err) - goto out_unregister_seg6; - pr_info("Segment Routing with IPv6\n"); out: return err; -out_unregister_seg6: - seg6_local_exit(); out_unregister_iptun: seg6_iptunnel_exit(); out_unregister_genl: @@ -543,7 +537,6 @@ out_unregister_pernet: void seg6_exit(void) { - seg6_hmac_exit(); seg6_local_exit(); seg6_iptunnel_exit(); genl_unregister_family(&seg6_genl_family); diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index fd58426f222b..ee6bac0160ac 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -16,7 +16,6 @@ #include <linux/in6.h> #include <linux/icmpv6.h> #include <linux/mroute6.h> -#include <linux/slab.h> #include <linux/rhashtable.h> #include <linux/netfilter.h> @@ -34,7 +33,8 @@ #include <net/addrconf.h> #include <net/xfrm.h> -#include <crypto/hash.h> +#include <crypto/sha1.h> +#include <crypto/sha2.h> #include <crypto/utils.h> #include <net/seg6.h> #include <net/genetlink.h> @@ -78,17 +78,6 @@ static const struct rhashtable_params rht_params = { .obj_cmpfn = seg6_hmac_cmpfn, }; -static struct seg6_hmac_algo hmac_algos[] = { - { - .alg_id = SEG6_HMAC_ALGO_SHA1, - .name = "hmac(sha1)", - }, - { - .alg_id = SEG6_HMAC_ALGO_SHA256, - .name = "hmac(sha256)", - }, -}; - static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) { struct sr6_tlv_hmac *tlv; @@ -108,75 +97,13 @@ static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh) return tlv; } -static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id) -{ - struct seg6_hmac_algo *algo; - int i, alg_count; - - alg_count = ARRAY_SIZE(hmac_algos); - for (i = 0; i < alg_count; i++) { - algo = &hmac_algos[i]; - if (algo->alg_id == alg_id) - return algo; - } - - return NULL; -} - -static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize, - u8 *output, int outlen) -{ - struct seg6_hmac_algo *algo; - struct crypto_shash *tfm; - struct shash_desc *shash; - int ret, dgsize; - - algo = __hmac_get_algo(hinfo->alg_id); - if (!algo) - return -ENOENT; - - tfm = *this_cpu_ptr(algo->tfms); - - dgsize = crypto_shash_digestsize(tfm); - if (dgsize > outlen) { - pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n", - dgsize, outlen); - return -ENOMEM; - } - - ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen); - if (ret < 0) { - pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret); - goto failed; - } - - shash = *this_cpu_ptr(algo->shashs); - shash->tfm = tfm; - - ret = crypto_shash_digest(shash, text, psize, output); - if (ret < 0) { - pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret); - goto failed; - } - - return dgsize; - -failed: - return ret; -} - int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, struct in6_addr *saddr, u8 *output) { __be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid); - u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE]; - int plen, i, dgsize, wrsize; + int plen, i, ret = 0; char *ring, *off; - /* a 160-byte buffer for digest output allows to store highest known - * hash function (RadioGatun) with up to 1216 bits - */ - /* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */ plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16; @@ -219,22 +146,25 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr, off += 16; } - dgsize = __do_hmac(hinfo, ring, plen, tmp_out, - SEG6_HMAC_MAX_DIGESTSIZE); + switch (hinfo->alg_id) { + case SEG6_HMAC_ALGO_SHA1: + hmac_sha1(&hinfo->key.sha1, ring, plen, output); + static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE); + memset(&output[SHA1_DIGEST_SIZE], 0, + SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE); + break; + case SEG6_HMAC_ALGO_SHA256: + hmac_sha256(&hinfo->key.sha256, ring, plen, output); + static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE); + break; + default: + WARN_ON_ONCE(1); + ret = -EINVAL; + break; + } local_unlock_nested_bh(&hmac_storage.bh_lock); local_bh_enable(); - - if (dgsize < 0) - return dgsize; - - wrsize = SEG6_HMAC_FIELD_LEN; - if (wrsize > dgsize) - wrsize = dgsize; - - memset(output, 0, SEG6_HMAC_FIELD_LEN); - memcpy(output, tmp_out, wrsize); - - return 0; + return ret; } EXPORT_SYMBOL(seg6_hmac_compute); @@ -305,8 +235,18 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo) struct seg6_pernet_data *sdata = seg6_pernet(net); int err; - if (!__hmac_get_algo(hinfo->alg_id)) + switch (hinfo->alg_id) { + case SEG6_HMAC_ALGO_SHA1: + hmac_sha1_preparekey(&hinfo->key.sha1, + hinfo->secret, hinfo->slen); + break; + case SEG6_HMAC_ALGO_SHA256: + hmac_sha256_preparekey(&hinfo->key.sha256, + hinfo->secret, hinfo->slen); + break; + default: return -EINVAL; + } err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node, rht_params); @@ -363,65 +303,6 @@ out: } EXPORT_SYMBOL(seg6_push_hmac); -static int seg6_hmac_init_algo(void) -{ - struct seg6_hmac_algo *algo; - struct crypto_shash *tfm; - struct shash_desc *shash; - int i, alg_count, cpu; - int ret = -ENOMEM; - - alg_count = ARRAY_SIZE(hmac_algos); - - for (i = 0; i < alg_count; i++) { - struct crypto_shash **p_tfm; - int shsize; - - algo = &hmac_algos[i]; - algo->tfms = alloc_percpu(struct crypto_shash *); - if (!algo->tfms) - goto error_out; - - for_each_possible_cpu(cpu) { - tfm = crypto_alloc_shash(algo->name, 0, 0); - if (IS_ERR(tfm)) { - ret = PTR_ERR(tfm); - goto error_out; - } - p_tfm = per_cpu_ptr(algo->tfms, cpu); - *p_tfm = tfm; - } - - p_tfm = raw_cpu_ptr(algo->tfms); - tfm = *p_tfm; - - shsize = sizeof(*shash) + crypto_shash_descsize(tfm); - - algo->shashs = alloc_percpu(struct shash_desc *); - if (!algo->shashs) - goto error_out; - - for_each_possible_cpu(cpu) { - shash = kzalloc_node(shsize, GFP_KERNEL, - cpu_to_node(cpu)); - if (!shash) - goto error_out; - *per_cpu_ptr(algo->shashs, cpu) = shash; - } - } - - return 0; - -error_out: - seg6_hmac_exit(); - return ret; -} - -int __init seg6_hmac_init(void) -{ - return seg6_hmac_init_algo(); -} - int __net_init seg6_hmac_net_init(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); @@ -429,36 +310,6 @@ int __net_init seg6_hmac_net_init(struct net *net) return rhashtable_init(&sdata->hmac_infos, &rht_params); } -void seg6_hmac_exit(void) -{ - struct seg6_hmac_algo *algo = NULL; - struct crypto_shash *tfm; - struct shash_desc *shash; - int i, alg_count, cpu; - - alg_count = ARRAY_SIZE(hmac_algos); - for (i = 0; i < alg_count; i++) { - algo = &hmac_algos[i]; - - if (algo->shashs) { - for_each_possible_cpu(cpu) { - shash = *per_cpu_ptr(algo->shashs, cpu); - kfree(shash); - } - free_percpu(algo->shashs); - } - - if (algo->tfms) { - for_each_possible_cpu(cpu) { - tfm = *per_cpu_ptr(algo->tfms, cpu); - crypto_free_shash(tfm); - } - free_percpu(algo->tfms); - } - } -} -EXPORT_SYMBOL(seg6_hmac_exit); - void __net_exit seg6_hmac_net_exit(struct net *net) { struct seg6_pernet_data *sdata = seg6_pernet(net); diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 12496ba1b7d4..cf37ad9686e6 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -848,6 +848,49 @@ static inline __be32 try_6rd(struct ip_tunnel *tunnel, return dst; } +static bool ipip6_tunnel_dst_find(struct sk_buff *skb, __be32 *dst, + bool is_isatap) +{ + const struct ipv6hdr *iph6 = ipv6_hdr(skb); + struct neighbour *neigh = NULL; + const struct in6_addr *addr6; + bool found = false; + int addr_type; + + if (skb_dst(skb)) + neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); + + if (!neigh) { + net_dbg_ratelimited("nexthop == NULL\n"); + return false; + } + + addr6 = (const struct in6_addr *)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (is_isatap) { + if ((addr_type & IPV6_ADDR_UNICAST) && + ipv6_addr_is_isatap(addr6)) { + *dst = addr6->s6_addr32[3]; + found = true; + } + } else { + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &ipv6_hdr(skb)->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) != 0) { + *dst = addr6->s6_addr32[3]; + found = true; + } + } + + neigh_release(neigh); + + return found; +} + /* * This function assumes it is being called from dev_queue_xmit() * and that skb is filled properly by that function. @@ -867,8 +910,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, __be32 dst = tiph->daddr; struct flowi4 fl4; int mtu; - const struct in6_addr *addr6; - int addr_type; u8 ttl; u8 protocol = IPPROTO_IPV6; int t_hlen = tunnel->hlen + sizeof(struct iphdr); @@ -877,64 +918,15 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, tos = ipv6_get_dsfield(iph6); /* ISATAP (RFC4214) - must come before 6to4 */ - if (dev->priv_flags & IFF_ISATAP) { - struct neighbour *neigh = NULL; - bool do_tx_error = false; - - if (skb_dst(skb)) - neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); - - if (!neigh) { - net_dbg_ratelimited("nexthop == NULL\n"); - goto tx_error; - } - - addr6 = (const struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); - - if ((addr_type & IPV6_ADDR_UNICAST) && - ipv6_addr_is_isatap(addr6)) - dst = addr6->s6_addr32[3]; - else - do_tx_error = true; - - neigh_release(neigh); - if (do_tx_error) - goto tx_error; - } + if ((dev->priv_flags & IFF_ISATAP) && + !ipip6_tunnel_dst_find(skb, &dst, true)) + goto tx_error; if (!dst) dst = try_6rd(tunnel, &iph6->daddr); - if (!dst) { - struct neighbour *neigh = NULL; - bool do_tx_error = false; - - if (skb_dst(skb)) - neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr); - - if (!neigh) { - net_dbg_ratelimited("nexthop == NULL\n"); - goto tx_error; - } - - addr6 = (const struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); - - if (addr_type == IPV6_ADDR_ANY) { - addr6 = &ipv6_hdr(skb)->daddr; - addr_type = ipv6_addr_type(addr6); - } - - if ((addr_type & IPV6_ADDR_COMPATv4) != 0) - dst = addr6->s6_addr32[3]; - else - do_tx_error = true; - - neigh_release(neigh); - if (do_tx_error) - goto tx_error; - } + if (!dst && !ipip6_tunnel_dst_find(skb, &dst, false)) + goto tx_error; flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark, tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE, diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index f0ee1a909771..7e007f013ec8 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -16,6 +16,7 @@ #include <net/secure_seq.h> #include <net/ipv6.h> #include <net/tcp.h> +#include <net/tcp_ecn.h> #define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) @@ -264,6 +265,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) if (!req->syncookie) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok &= cookie_ecn_ok(net, dst); + tcp_rsk(req)->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th); ret = tcp_get_cookie_sock(sk, skb, req, dst); if (!ret) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e885629312a4..9622c2776ade 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -62,6 +62,7 @@ #include <net/hotdata.h> #include <net/busy_poll.h> #include <net/rstreason.h> +#include <net/psp.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -299,12 +300,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_rcv_saddr = LOOPBACK4_IPV6; sk->sk_gso_type = SKB_GSO_TCPV6; - ip6_dst_store(sk, dst, NULL, NULL); + ip6_dst_store(sk, dst, false, false); - icsk->icsk_ext_hdr_len = 0; + icsk->icsk_ext_hdr_len = psp_sk_overhead(sk); if (opt) - icsk->icsk_ext_hdr_len = opt->opt_flen + - opt->opt_nflen; + icsk->icsk_ext_hdr_len += opt->opt_flen + + opt->opt_nflen; tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); @@ -388,8 +389,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, bool fatal; int err; - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->daddr, th->dest, + sk = __inet6_lookup_established(net, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), skb->dev->ifindex, inet6_sdif(skb)); @@ -545,6 +545,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb); if (skb) { + tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK; __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, &ireq->ir_v6_rmt_addr); @@ -973,6 +974,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 if (sk) { /* unconstify the socket only to attach it to buff with care. */ skb_set_owner_edemux(buff, (struct sock *)sk); + psp_reply_set_decrypted(buff); if (sk->sk_state == TCP_TIME_WAIT) mark = inet_twsk(sk)->tw_mark; @@ -1073,8 +1075,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb, * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - NULL, 0, &ipv6h->saddr, th->source, + sk1 = inet6_lookup_listener(net, NULL, 0, &ipv6h->saddr, th->source, &ipv6h->daddr, ntohs(th->source), dif, sdif); if (!sk1) @@ -1460,7 +1461,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * memcpy(newnp, np, sizeof(struct ipv6_pinfo)); - ip6_dst_store(newsk, dst, NULL, NULL); + ip6_dst_store(newsk, dst, false, false); newnp->saddr = ireq->ir_v6_loc_addr; @@ -1606,6 +1607,10 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); + reason = psp_sk_rx_policy_check(sk, skb); + if (reason) + goto err_discard; + /* * socket locking is here for SMP purposes as backlog rcv * is currently called with bh processing disabled. @@ -1685,6 +1690,7 @@ csum_err: reason = SKB_DROP_REASON_TCP_CSUM; trace_tcp_bad_csum(skb); TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS); +err_discard: TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); goto discard; @@ -1787,7 +1793,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) hdr = ipv6_hdr(skb); lookup: - sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), + sk = __inet6_lookup_skb(skb, __tcp_hdrlen(th), th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) @@ -1809,7 +1815,7 @@ lookup: &hdr->saddr, &hdr->daddr, AF_INET6, dif, sdif); if (drop_reason) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); reqsk_put(req); goto discard_it; } @@ -1948,7 +1954,7 @@ discard_it: return 0; discard_and_relse: - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); if (refcounted) sock_put(sk); goto discard_it; @@ -1974,8 +1980,7 @@ do_time_wait: { struct sock *sk2; - sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, - skb, __tcp_hdrlen(th), + sk2 = inet6_lookup_listener(net, skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, ntohs(th->dest), @@ -1990,6 +1995,10 @@ do_time_wait: __this_cpu_write(tcp_tw_isn, isn); goto process; } + + drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb); + if (drop_reason) + break; } /* to ACK */ fallthrough; @@ -2027,8 +2036,7 @@ void tcp_v6_early_demux(struct sk_buff *skb) return; /* Note : We use inet6_iif() here, not tcp_v6_iif() */ - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->saddr, th->source, + sk = __inet6_lookup_established(net, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), inet6_iif(skb), inet6_sdif(skb)); if (sk) { @@ -2048,7 +2056,6 @@ void tcp_v6_early_demux(struct sk_buff *skb) static struct timewait_sock_ops tcp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct tcp6_timewait_sock), - .twsk_destructor = tcp_twsk_destructor, }; INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb) @@ -2115,6 +2122,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = { .ao_calc_key_sk = tcp_v4_ao_calc_key_sk, #endif }; + +static void tcp6_destruct_sock(struct sock *sk) +{ + tcp_md5_destruct_sock(sk); + tcp_ao_destroy_sock(sk, false); + inet6_sock_destruct(sk); +} #endif /* NOTE: A lot of things set to zero explicitly by call to @@ -2130,6 +2144,7 @@ static int tcp_v6_init_sock(struct sock *sk) #if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO) tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific; + sk->sk_destruct = tcp6_destruct_sock; #endif return 0; @@ -2228,9 +2243,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) rx_queue, timer_active, jiffies_delta_to_clock_t(timer_expires - jiffies), - icsk->icsk_retransmits, + READ_ONCE(icsk->icsk_retransmits), from_kuid_munged(seq_user_ns(seq), sk_uid(sp)), - icsk->icsk_probes_out, + READ_ONCE(icsk->icsk_probes_out), sock_i_ino(sp), refcount_read(&sp->sk_refcnt), sp, jiffies_to_clock_t(icsk->icsk_rto), @@ -2340,7 +2355,7 @@ struct proto tcpv6_prot = { .splice_eof = tcp_splice_eof, .backlog_rcv = tcp_v6_do_rcv, .release_cb = tcp_release_cb, - .hash = inet6_hash, + .hash = inet_hash, .unhash = inet_unhash, .get_port = inet_csk_get_port, .put_port = inet_put_port, @@ -2356,7 +2371,6 @@ struct proto tcpv6_prot = { .per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc, .memory_pressure = &tcp_memory_pressure, - .orphan_count = &tcp_orphan_count, .sysctl_mem = sysctl_tcp_mem, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c index a8a04f441e78..effeba58630b 100644 --- a/net/ipv6/tcpv6_offload.c +++ b/net/ipv6/tcpv6_offload.c @@ -36,8 +36,7 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb, inet6_get_iif_sdif(skb, &iif, &sdif); hdr = skb_gro_network_header(skb); net = dev_net_rcu(skb->dev); - sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, - &hdr->saddr, th->source, + sk = __inet6_lookup_established(net, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), iif, sdif); NAPI_GRO_CB(skb)->is_flist = !sk; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 6a68f77da44b..813a2ba75824 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -67,10 +67,11 @@ static void udpv6_destruct_sock(struct sock *sk) int udpv6_init_sock(struct sock *sk) { - udp_lib_init_sock(sk); + int res = udp_lib_init_sock(sk); + sk->sk_destruct = udpv6_destruct_sock; set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); - return 0; + return res; } INDIRECT_CALLABLE_SCOPE @@ -260,7 +261,7 @@ rescore: /* compute_score is too long of a function to be * inlined, and calling it again here yields - * measureable overhead for some + * measurable overhead for some * workloads. Work around it by jumping * backwards to rescore 'result'. */ @@ -449,7 +450,7 @@ struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr EXPORT_SYMBOL_GPL(udp6_lib_lookup); #endif -/* do not use the scratch area len for jumbogram: their length execeeds the +/* do not use the scratch area len for jumbogram: their length exceeds the * scratch area space; note that the IP6CB flags is still in the first * cacheline, so checking for jumbograms is cheap */ @@ -479,7 +480,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, if (flags & MSG_ERRQUEUE) return ipv6_recv_error(sk, msg, len, addr_len); - if (np->rxpmtu && np->rxopt.bits.rxpmtu) + if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu)) return ipv6_recv_rxpmtu(sk, msg, len, addr_len); try_again: @@ -524,7 +525,7 @@ try_again: } if (unlikely(err)) { if (!peeking) { - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); SNMP_INC_STATS(mib, UDP_MIB_INERRORS); } kfree_skb(skb); @@ -908,7 +909,7 @@ csum_error: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); sk_skb_reason_drop(sk, skb, drop_reason); return -1; } @@ -1013,7 +1014,7 @@ start_lookup: } nskb = skb_clone(skb, GFP_ATOMIC); if (unlikely(!nskb)) { - atomic_inc(&sk->sk_drops); + udp_drops_inc(sk); __UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS, IS_UDPLITE(sk)); __UDP6_INC_STATS(net, UDP_MIB_INERRORS, @@ -1048,7 +1049,7 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst) sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst)); } -/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and +/* wrapper for udp_queue_rcv_skb taking care of csum conversion and * return code conversion for ip layer consumption */ static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb, diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index d8445ac1b2e4..046f13b1d77a 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -154,8 +154,6 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) ip6_gro_compute_pseudo); skip: - NAPI_GRO_CB(skb)->is_ipv6 = 1; - if (static_branch_unlikely(&udpv6_encap_needed_key)) sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index cc2b3c44bc05..6c717a7ef292 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1187,7 +1187,7 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb, IUCV_SKB_CB(skb)->offset = 0; if (sk_filter(sk, skb)) { - atomic_inc(&sk->sk_drops); /* skb rejected by filter */ + sk_drops_inc(sk); /* skb rejected by filter */ kfree_skb(skb); return; } @@ -2011,7 +2011,7 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb) skb_reset_network_header(skb); IUCV_SKB_CB(skb)->offset = 0; if (sk_filter(sk, skb)) { - atomic_inc(&sk->sk_drops); /* skb rejected by filter */ + sk_drops_inc(sk); /* skb rejected by filter */ kfree_skb(skb); return NET_RX_SUCCESS; } diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 2ed07fa121ab..d9aca1c3c097 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -311,6 +311,96 @@ static void ieee80211_stop_p2p_device(struct wiphy *wiphy, ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev)); } +static void ieee80211_nan_conf_free(struct cfg80211_nan_conf *conf) +{ + kfree(conf->cluster_id); + kfree(conf->extra_nan_attrs); + kfree(conf->vendor_elems); + memset(conf, 0, sizeof(*conf)); +} + +static void ieee80211_stop_nan(struct wiphy *wiphy, + struct wireless_dev *wdev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + + if (!sdata->u.nan.started) + return; + + drv_stop_nan(sdata->local, sdata); + sdata->u.nan.started = false; + + ieee80211_nan_conf_free(&sdata->u.nan.conf); + + ieee80211_sdata_stop(sdata); + ieee80211_recalc_idle(sdata->local); +} + +static int ieee80211_nan_conf_copy(struct cfg80211_nan_conf *dst, + struct cfg80211_nan_conf *src, + u32 changes) +{ + if (changes & CFG80211_NAN_CONF_CHANGED_PREF) + dst->master_pref = src->master_pref; + + if (changes & CFG80211_NAN_CONF_CHANGED_BANDS) + dst->bands = src->bands; + + if (changes & CFG80211_NAN_CONF_CHANGED_CONFIG) { + dst->scan_period = src->scan_period; + dst->scan_dwell_time = src->scan_dwell_time; + dst->discovery_beacon_interval = + src->discovery_beacon_interval; + dst->enable_dw_notification = src->enable_dw_notification; + memcpy(&dst->band_cfgs, &src->band_cfgs, + sizeof(dst->band_cfgs)); + + kfree(dst->cluster_id); + dst->cluster_id = NULL; + + kfree(dst->extra_nan_attrs); + dst->extra_nan_attrs = NULL; + dst->extra_nan_attrs_len = 0; + + kfree(dst->vendor_elems); + dst->vendor_elems = NULL; + dst->vendor_elems_len = 0; + + if (src->cluster_id) { + dst->cluster_id = kmemdup(src->cluster_id, ETH_ALEN, + GFP_KERNEL); + if (!dst->cluster_id) + goto no_mem; + } + + if (src->extra_nan_attrs && src->extra_nan_attrs_len) { + dst->extra_nan_attrs = kmemdup(src->extra_nan_attrs, + src->extra_nan_attrs_len, + GFP_KERNEL); + if (!dst->extra_nan_attrs) + goto no_mem; + + dst->extra_nan_attrs_len = src->extra_nan_attrs_len; + } + + if (src->vendor_elems && src->vendor_elems_len) { + dst->vendor_elems = kmemdup(src->vendor_elems, + src->vendor_elems_len, + GFP_KERNEL); + if (!dst->vendor_elems) + goto no_mem; + + dst->vendor_elems_len = src->vendor_elems_len; + } + } + + return 0; + +no_mem: + ieee80211_nan_conf_free(dst); + return -ENOMEM; +} + static int ieee80211_start_nan(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf) @@ -320,6 +410,9 @@ static int ieee80211_start_nan(struct wiphy *wiphy, lockdep_assert_wiphy(sdata->local->hw.wiphy); + if (sdata->u.nan.started) + return -EALREADY; + ret = ieee80211_check_combinations(sdata, NULL, 0, 0, -1); if (ret < 0) return ret; @@ -329,21 +422,21 @@ static int ieee80211_start_nan(struct wiphy *wiphy, return ret; ret = drv_start_nan(sdata->local, sdata, conf); - if (ret) + if (ret) { ieee80211_sdata_stop(sdata); + return ret; + } - sdata->u.nan.conf = *conf; - - return ret; -} + sdata->u.nan.started = true; + ieee80211_recalc_idle(sdata->local); -static void ieee80211_stop_nan(struct wiphy *wiphy, - struct wireless_dev *wdev) -{ - struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + ret = ieee80211_nan_conf_copy(&sdata->u.nan.conf, conf, 0xFFFFFFFF); + if (ret) { + ieee80211_stop_nan(wiphy, wdev); + return ret; + } - drv_stop_nan(sdata->local, sdata); - ieee80211_sdata_stop(sdata); + return 0; } static int ieee80211_nan_change_conf(struct wiphy *wiphy, @@ -352,7 +445,7 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy, u32 changes) { struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); - struct cfg80211_nan_conf new_conf; + struct cfg80211_nan_conf new_conf = {}; int ret = 0; if (sdata->vif.type != NL80211_IFTYPE_NAN) @@ -361,17 +454,28 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy, if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; - new_conf = sdata->u.nan.conf; + if (!changes) + return 0; - if (changes & CFG80211_NAN_CONF_CHANGED_PREF) - new_conf.master_pref = conf->master_pref; + /* First make a full copy of the previous configuration and then apply + * the changes. This might be a little wasteful, but it is simpler. + */ + ret = ieee80211_nan_conf_copy(&new_conf, &sdata->u.nan.conf, + 0xFFFFFFFF); + if (ret < 0) + return ret; - if (changes & CFG80211_NAN_CONF_CHANGED_BANDS) - new_conf.bands = conf->bands; + ret = ieee80211_nan_conf_copy(&new_conf, conf, changes); + if (ret < 0) + return ret; ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes); - if (!ret) + if (ret) { + ieee80211_nan_conf_free(&new_conf); + } else { + ieee80211_nan_conf_free(&sdata->u.nan.conf); sdata->u.nan.conf = new_conf; + } return ret; } @@ -2171,10 +2275,16 @@ static int sta_apply_parameters(struct ieee80211_local *local, /* * cfg80211 validates this (1-2007) and allows setting the AID - * only when creating a new station entry + * only when creating a new station entry. For S1G APs, the current + * implementation supports a maximum of 1600 AIDs. */ - if (params->aid) + if (params->aid) { + if (sdata->vif.cfg.s1g && + params->aid > IEEE80211_MAX_SUPPORTED_S1G_AID) + return -EINVAL; + sta->sta.aid = params->aid; + } /* * Some of the following updates would be racy if called on an @@ -3001,6 +3111,9 @@ static int ieee80211_scan(struct wiphy *wiphy, struct cfg80211_scan_request *req) { struct ieee80211_sub_if_data *sdata; + struct ieee80211_link_data *link; + struct ieee80211_channel *chan; + int radio_idx; sdata = IEEE80211_WDEV_TO_SUB_IF(req->wdev); @@ -3028,10 +3141,20 @@ static int ieee80211_scan(struct wiphy *wiphy, * the frames sent while scanning on other channel will be * lost) */ - if (ieee80211_num_beaconing_links(sdata) && - (!(wiphy->features & NL80211_FEATURE_AP_SCAN) || - !(req->flags & NL80211_SCAN_FLAG_AP))) - return -EOPNOTSUPP; + for_each_link_data(sdata, link) { + /* if the link is not beaconing, ignore it */ + if (!sdata_dereference(link->u.ap.beacon, sdata)) + continue; + + chan = link->conf->chanreq.oper.chan; + radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan); + + if (ieee80211_is_radio_idx_in_scan_req(wiphy, req, + radio_idx) && + (!(wiphy->features & NL80211_FEATURE_AP_SCAN) || + !(req->flags & NL80211_SCAN_FLAG_AP))) + return -EOPNOTSUPP; + } break; case NL80211_IFTYPE_NAN: default: @@ -3677,12 +3800,7 @@ static bool ieee80211_is_scan_ongoing(struct wiphy *wiphy, if (list_empty(&local->roc_list) && !local->scanning) return false; - if (wiphy->n_radio < 2) - return true; - req_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chandef->chan); - if (req_radio_idx < 0) - return true; if (local->scanning) { scan_req = wiphy_dereference(wiphy, local->scan_req); @@ -3701,14 +3819,6 @@ static bool ieee80211_is_scan_ongoing(struct wiphy *wiphy, list_for_each_entry(roc, &local->roc_list, list) { chan_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, roc->chan); - /* - * The roc work is added but chan_radio_idx is invalid. - * Should not happen but if it does, let's not take - * risk and return true. - */ - if (chan_radio_idx < 0) - return true; - if (chan_radio_idx == req_radio_idx) return true; } @@ -4825,7 +4935,6 @@ static int ieee80211_get_txq_stats(struct wiphy *wiphy, int ret = 0; spin_lock_bh(&local->fq.lock); - rcu_read_lock(); if (wdev) { sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); @@ -4851,7 +4960,6 @@ static int ieee80211_get_txq_stats(struct wiphy *wiphy, } out: - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return ret; diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index c9cea0e7ac16..57065714cf8c 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -659,19 +659,8 @@ bool ieee80211_is_radar_required(struct ieee80211_local *local, for_each_sdata_link(local, link) { if (link->radar_required) { - if (wiphy->n_radio < 2) - return true; - chan = link->conf->chanreq.oper.chan; radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan); - /* - * The radio index (radio_idx) is expected to be valid, - * as it's derived from a channel tied to a link. If - * it's invalid (i.e., negative), return true to avoid - * potential issues with radar-sensitive operations. - */ - if (radio_idx < 0) - return true; if (ieee80211_is_radio_idx_in_scan_req(wiphy, req, radio_idx)) diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index e8b78ec682da..d02f07368c51 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -82,7 +82,6 @@ static ssize_t aqm_read(struct file *file, int len = 0; spin_lock_bh(&local->fq.lock); - rcu_read_lock(); len = scnprintf(buf, sizeof(buf), "access name value\n" @@ -105,7 +104,6 @@ static ssize_t aqm_read(struct file *file, fq->limit, fq->quantum); - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return simple_read_from_buffer(user_buf, count, ppos, @@ -717,7 +715,6 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_STATS_ADD(dot11ReceivedFragmentCount); DEBUGFS_STATS_ADD(dot11MulticastReceivedFrameCount); DEBUGFS_STATS_ADD(dot11TransmittedFrameCount); - DEBUGFS_STATS_ADD(tx_handlers_drop); DEBUGFS_STATS_ADD(tx_handlers_queued); DEBUGFS_STATS_ADD(tx_handlers_drop_wep); DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc); diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 1dac78271045..30a5a978a678 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -625,7 +625,6 @@ static ssize_t ieee80211_if_fmt_aqm( txqi = to_txq_info(sdata->vif.txq); spin_lock_bh(&local->fq.lock); - rcu_read_lock(); len = scnprintf(buf, buflen, @@ -642,7 +641,6 @@ static ssize_t ieee80211_if_fmt_aqm( txqi->tin.tx_bytes, txqi->tin.tx_packets); - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); return len; diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 49061bd4151b..ef75255d47d5 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -148,7 +148,6 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, return -ENOMEM; spin_lock_bh(&local->fq.lock); - rcu_read_lock(); p += scnprintf(p, bufsz + buf - p, @@ -178,7 +177,6 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf, test_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) ? " DIRTY" : ""); } - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf); diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c index 0397755a3bd1..3d365626faa4 100644 --- a/net/mac80211/ethtool.c +++ b/net/mac80211/ethtool.c @@ -48,8 +48,8 @@ static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = { "rx_duplicates", "rx_fragments", "rx_dropped", "tx_packets", "tx_bytes", "tx_filtered", "tx_retry_failed", "tx_retries", - "sta_state", "txrate", "rxrate", "signal", - "channel", "noise", "ch_time", "ch_time_busy", + "tx_handlers_drop", "sta_state", "txrate", "rxrate", + "signal", "channel", "noise", "ch_time", "ch_time_busy", "ch_time_ext_busy", "ch_time_rx", "ch_time_tx" }; #define STA_STATS_LEN ARRAY_SIZE(ieee80211_gstrings_sta_stats) @@ -120,6 +120,7 @@ static void ieee80211_get_stats(struct net_device *dev, i = 0; ADD_STA_STATS(&sta->deflink); + data[i++] = sdata->tx_handlers_drop; data[i++] = sta->sta_state; @@ -145,6 +146,7 @@ static void ieee80211_get_stats(struct net_device *dev, sta_set_sinfo(sta, &sinfo, false); i = 0; ADD_STA_STATS(&sta->deflink); + data[i++] = sdata->tx_handlers_drop; } } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 8afa2404eaa8..73fd86ec1bce 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -86,6 +86,14 @@ extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS]; #define IEEE80211_MAX_NAN_INSTANCE_ID 255 +/* + * Current mac80211 implementation supports a maximum of 1600 AIDS + * for S1G interfaces. With regards to an S1G TIM, this covers 25 blocks + * as each block is 64 AIDs. + */ +#define IEEE80211_MAX_SUPPORTED_S1G_AID 1600 +#define IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS 25 + enum ieee80211_status_data { IEEE80211_STATUS_TYPE_MASK = 0x00f, IEEE80211_STATUS_TYPE_INVALID = 0, @@ -977,11 +985,13 @@ struct ieee80211_if_mntr { * struct ieee80211_if_nan - NAN state * * @conf: current NAN configuration + * @started: true iff NAN is started * @func_lock: lock for @func_inst_ids * @function_inst_ids: a bitmap of available instance_id's */ struct ieee80211_if_nan { struct cfg80211_nan_conf conf; + bool started; /* protects function_inst_ids */ spinlock_t func_lock; @@ -1210,6 +1220,7 @@ struct ieee80211_sub_if_data { } debugfs; #endif + u32 tx_handlers_drop; /* must be last, dynamically sized area in this! */ struct ieee80211_vif vif; }; @@ -1599,7 +1610,6 @@ struct ieee80211_local { u32 dot11TransmittedFrameCount; /* TX/RX handler statistics */ - unsigned int tx_handlers_drop; unsigned int tx_handlers_queued; unsigned int tx_handlers_drop_wep; unsigned int tx_handlers_drop_not_assoc; @@ -1665,8 +1675,6 @@ struct ieee80211_local { struct idr ack_status_frames; spinlock_t ack_status_lock; - struct ieee80211_sub_if_data __rcu *p2p_sdata; - /* virtual monitor interface */ struct ieee80211_sub_if_data __rcu *monitor_sdata; struct ieee80211_chan_req monitor_chanreq; @@ -2702,7 +2710,8 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, const struct ieee80211_he_operation *he_oper, const struct ieee80211_eht_operation *eht_oper, struct cfg80211_chan_def *chandef); -bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, +bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local, + const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef); void ieee80211_chandef_downgrade(struct cfg80211_chan_def *chandef, struct ieee80211_conn_settings *conn); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 07ba68f7cd81..a7873832d4fa 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -107,6 +107,7 @@ static u32 __ieee80211_recalc_idle(struct ieee80211_local *local, { bool working, scanning, active; unsigned int led_trig_start = 0, led_trig_stop = 0; + struct ieee80211_sub_if_data *iter; lockdep_assert_wiphy(local->hw.wiphy); @@ -117,6 +118,14 @@ static u32 __ieee80211_recalc_idle(struct ieee80211_local *local, working = !local->ops->remain_on_channel && !list_empty(&local->roc_list); + list_for_each_entry(iter, &local->interfaces, list) { + if (iter->vif.type == NL80211_IFTYPE_NAN && + iter->u.nan.started) { + working = true; + break; + } + } + scanning = test_bit(SCAN_SW_SCANNING, &local->scanning) || test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning); @@ -611,10 +620,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do spin_unlock_bh(&sdata->u.nan.func_lock); break; - case NL80211_IFTYPE_P2P_DEVICE: - /* relies on synchronize_rcu() below */ - RCU_INIT_POINTER(local->p2p_sdata, NULL); - fallthrough; default: wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->work); /* @@ -1405,6 +1410,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) ieee80211_recalc_idle(local); netif_carrier_on(dev); + list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list); break; default: if (coming_up) { @@ -1468,17 +1474,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) sdata->vif.type != NL80211_IFTYPE_STATION); } - switch (sdata->vif.type) { - case NL80211_IFTYPE_P2P_DEVICE: - rcu_assign_pointer(local->p2p_sdata, sdata); - break; - case NL80211_IFTYPE_MONITOR: - list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list); - break; - default: - break; - } - /* * set_multicast_list will be invoked by the networking core * which will check whether any increments here were done in diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 3ae6104e5cb2..eefa6f7e899b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -746,6 +746,11 @@ ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = { BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | BIT(IEEE80211_STYPE_AUTH >> 4), }, + [NL80211_IFTYPE_NAN] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4), + }, }; static const struct ieee80211_ht_cap mac80211_ht_capa_mod_mask = { @@ -862,6 +867,14 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, if (emulate_chanctx || ops->remain_on_channel) wiphy->flags |= WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL; + wiphy->bss_param_support = WIPHY_BSS_PARAM_CTS_PROT | + WIPHY_BSS_PARAM_SHORT_PREAMBLE | + WIPHY_BSS_PARAM_SHORT_SLOT_TIME | + WIPHY_BSS_PARAM_BASIC_RATES | + WIPHY_BSS_PARAM_AP_ISOLATE | + WIPHY_BSS_PARAM_HT_OPMODE | + WIPHY_BSS_PARAM_P2P_CTWINDOW | + WIPHY_BSS_PARAM_P2P_OPPPS; wiphy->features |= NL80211_FEATURE_SK_TX_STATUS | NL80211_FEATURE_SAE | NL80211_FEATURE_HT_IBSS | @@ -1164,9 +1177,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (WARN_ON(!ieee80211_hw_check(hw, MFP_CAPABLE))) return -EINVAL; - if (WARN_ON(!ieee80211_hw_check(hw, CONNECTION_MONITOR))) - return -EINVAL; - if (WARN_ON(ieee80211_hw_check(hw, NEED_DTIM_BEFORE_ASSOC))) return -EINVAL; @@ -1239,11 +1249,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (!dflt_chandef.chan) { /* * Assign the first enabled channel to dflt_chandef - * from the list of channels + * from the list of channels. For S1G interfaces + * ensure it can be used as a primary. */ for (i = 0; i < sband->n_channels; i++) if (!(sband->channels[i].flags & - IEEE80211_CHAN_DISABLED)) + (IEEE80211_CHAN_DISABLED | + IEEE80211_CHAN_S1G_NO_PRIMARY))) break; /* if none found then use the first anyway */ if (i == sband->n_channels) diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index a4a715f6f1c3..f37068a533f4 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -624,6 +624,9 @@ int mesh_add_he_6ghz_cap_ie(struct ieee80211_sub_if_data *sdata, if (!sband) return -EINVAL; + if (sband->band != NL80211_BAND_6GHZ) + return 0; + iftd = ieee80211_get_sband_iftype_data(sband, NL80211_IFTYPE_MESH_POINT); /* The device doesn't support HE in mesh mode or at all */ diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c index 20e022a03933..ebab1f0a0138 100644 --- a/net/mac80211/mesh_ps.c +++ b/net/mac80211/mesh_ps.c @@ -586,7 +586,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta, if (sta->mesh->plink_state == NL80211_PLINK_ESTAB) has_buffered = ieee80211_check_tim(elems->tim, elems->tim_len, - sta->mesh->aid); + sta->mesh->aid, false); if (has_buffered) mps_dbg(sta->sdata, "%pM indicates buffered frames\n", diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index dd650a127a31..3b5827ea438e 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -180,10 +180,11 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata, /* get special S1G case out of the way */ if (sband->band == NL80211_BAND_S1GHZ) { - if (!ieee80211_chandef_s1g_oper(elems->s1g_oper, chandef)) { - sdata_info(sdata, - "Missing S1G Operation Element? Trying operating == primary\n"); - chandef->width = ieee80211_s1g_channel_width(channel); + if (!ieee80211_chandef_s1g_oper(sdata->local, elems->s1g_oper, + chandef)) { + /* Fallback to default 1MHz */ + chandef->width = NL80211_CHAN_WIDTH_1; + chandef->s1g_primary_2mhz = false; } return IEEE80211_CONN_MODE_S1G; @@ -1046,6 +1047,14 @@ again: ret = -EINVAL; goto free; } + + chanreq->oper = *ap_chandef; + if (!cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper, + IEEE80211_CHAN_DISABLED)) { + ret = -EINVAL; + goto free; + } + return elems; case NL80211_BAND_6GHZ: if (ap_mode < IEEE80211_CONN_MODE_HE) { @@ -1850,7 +1859,8 @@ ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata, ieee80211_put_he_cap(skb, sdata, sband, &assoc_data->link[link_id].conn); ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_HE_CAPABILITY); - ieee80211_put_he_6ghz_cap(skb, sdata, smps_mode); + if (sband->band == NL80211_BAND_6GHZ) + ieee80211_put_he_6ghz_cap(skb, sdata, smps_mode); } /* @@ -2112,8 +2122,11 @@ ieee80211_link_common_elems_size(struct ieee80211_sub_if_data *sdata, sizeof(struct ieee80211_he_mcs_nss_supp) + IEEE80211_HE_PPE_THRES_MAX_LEN; - if (sband->band == NL80211_BAND_6GHZ) + if (sband->band == NL80211_BAND_6GHZ) { size += 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa); + /* reg connection */ + size += 4; + } size += 2 + 1 + sizeof(struct ieee80211_eht_cap_elem) + sizeof(struct ieee80211_eht_mcs_nss_supp) + @@ -2187,11 +2200,7 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata) 2 + /* ext capa & op */ 2; /* EML capa */ - /* - * The capability elements were already considered above; - * note this over-estimates a bit because there's no - * STA profile for the assoc link. - */ + /* The capability elements were already considered above */ size += (n_links - 1) * (1 + 1 + /* subelement ID/length */ 2 + /* STA control */ @@ -5729,7 +5738,7 @@ static u8 ieee80211_max_rx_chains(struct ieee80211_link_data *link, he_cap_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, ies->data, ies->len); - if (!he_cap_elem || he_cap_elem->datalen < sizeof(*he_cap)) + if (!he_cap_elem || he_cap_elem->datalen < sizeof(*he_cap) + 1) return chains; /* skip one byte ext_tag_id */ @@ -6356,6 +6365,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, }; u8 ap_mld_addr[ETH_ALEN] __aligned(2); unsigned int link_id; + u16 max_aid = IEEE80211_MAX_AID; lockdep_assert_wiphy(sdata->local->hw.wiphy); @@ -6382,10 +6392,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control); capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); - if (assoc_data->s1g) + if (assoc_data->s1g) { elem_start = mgmt->u.s1g_assoc_resp.variable; - else + max_aid = IEEE80211_MAX_SUPPORTED_S1G_AID; + } else { elem_start = mgmt->u.assoc_resp.variable; + } /* * Note: this may not be perfect, AP might misbehave - if @@ -6409,16 +6421,15 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, if (elems->aid_resp) aid = le16_to_cpu(elems->aid_resp->aid); - else if (assoc_data->s1g) - aid = 0; /* TODO */ else aid = le16_to_cpu(mgmt->u.assoc_resp.aid); /* - * The 5 MSB of the AID field are reserved - * (802.11-2016 9.4.1.8 AID field) + * The 5 MSB of the AID field are reserved for a non-S1G STA. For + * an S1G STA the 3 MSBs are reserved. + * (802.11-2016 9.4.1.8 AID field). */ - aid &= 0x7ff; + aid &= assoc_data->s1g ? 0x1fff : 0x7ff; sdata_info(sdata, "RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n", @@ -6455,7 +6466,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, event.u.mlme.reason = status_code; drv_event_callback(sdata->local, sdata, &event); } else { - if (aid == 0 || aid > IEEE80211_MAX_AID) { + if (aid == 0 || aid > max_aid) { sdata_info(sdata, "invalid AID value %d (out of range), turn off PS\n", aid); @@ -6493,6 +6504,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, } sdata->vif.cfg.aid = aid; + sdata->vif.cfg.s1g = assoc_data->s1g; if (!ieee80211_assoc_success(sdata, mgmt, elems, elem_start, elem_len)) { @@ -7289,6 +7301,38 @@ static bool ieee80211_mgd_ssid_mismatch(struct ieee80211_sub_if_data *sdata, return memcmp(elems->ssid, cfg->ssid, cfg->ssid_len); } +static bool +ieee80211_rx_beacon_freq_valid(struct ieee80211_local *local, + struct ieee80211_mgmt *mgmt, + struct ieee80211_rx_status *rx_status, + struct ieee80211_chanctx_conf *chanctx) +{ + u32 pri_2mhz_khz; + struct ieee80211_channel *s1g_sibling_1mhz; + u32 pri_khz = ieee80211_channel_to_khz(chanctx->def.chan); + u32 rx_khz = ieee80211_rx_status_to_khz(rx_status); + + if (rx_khz == pri_khz) + return true; + + if (!chanctx->def.s1g_primary_2mhz) + return false; + + /* + * If we have an S1G interface with a 2MHz primary, beacons are + * sent on the center frequency of the 2MHz primary. Find the sibling + * 1MHz channel and calculate the 2MHz primary center frequency. + */ + s1g_sibling_1mhz = cfg80211_s1g_get_primary_sibling(local->hw.wiphy, + &chanctx->def); + if (!s1g_sibling_1mhz) + return false; + + pri_2mhz_khz = + (pri_khz + ieee80211_channel_to_khz(s1g_sibling_1mhz)) / 2; + return rx_khz == pri_2mhz_khz; +} + static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee80211_hdr *hdr, size_t len, struct ieee80211_rx_status *rx_status) @@ -7343,8 +7387,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, return; } - if (ieee80211_rx_status_to_khz(rx_status) != - ieee80211_channel_to_khz(chanctx_conf->def.chan)) { + if (!ieee80211_rx_beacon_freq_valid(local, mgmt, rx_status, + chanctx_conf)) { rcu_read_unlock(); return; } @@ -7440,7 +7484,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, ncrc = elems->crc; if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) && - ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid)) { + ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid, + vif_cfg->s1g)) { if (local->hw.conf.dynamic_ps_timeout > 0) { if (local->hw.conf.flags & IEEE80211_CONF_PS) { local->hw.conf.flags &= ~IEEE80211_CONF_PS; diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 13df6321634d..ae82533e3c02 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -8,7 +8,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2007, Michael Wu <flamingice@sourmilk.net> * Copyright 2009 Johannes Berg <johannes@sipsolutions.net> - * Copyright (C) 2019, 2022-2024 Intel Corporation + * Copyright (C) 2019, 2022-2025 Intel Corporation */ #include <linux/export.h> #include <net/mac80211.h> @@ -897,6 +897,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, need_offchan = true; break; case NL80211_IFTYPE_NAN: + break; default: return -EOPNOTSUPP; } @@ -910,6 +911,8 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev, /* Check if the operating channel is the requested channel */ if (!params->chan && mlo_sta) { need_offchan = false; + } else if (sdata->vif.type == NL80211_IFTYPE_NAN) { + /* Frames can be sent during NAN schedule */ } else if (!need_offchan) { struct ieee80211_chanctx_conf *chanctx_conf = NULL; int i; diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 3cb2ad6d0b28..e441f8541603 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -4,7 +4,7 @@ * Copyright 2005-2006, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc <jbenc@suse.cz> * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2019, 2022-2024 Intel Corporation + * Copyright (C) 2019, 2022-2025 Intel Corporation */ #include <linux/kernel.h> @@ -98,6 +98,9 @@ void rate_control_tx_status(struct ieee80211_local *local, if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) return; + if (st->info->band >= NUM_NL80211_BANDS) + return; + sband = local->hw.wiphy->bands[st->info->band]; spin_lock_bh(&sta->rate_ctrl_lock); @@ -419,6 +422,9 @@ static bool rate_control_send_low(struct ieee80211_sta *pubsta, int mcast_rate; bool use_basicrate = false; + if (!sband) + return false; + if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) { __rate_control_send_low(txrc->hw, sband, pubsta, info, txrc->rate_idx_mask); @@ -898,6 +904,9 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, return; sdata = vif_to_sdata(vif); + if (info->band >= NUM_NL80211_BANDS) + return; + sband = sdata->local->hw.wiphy->bands[info->band]; if (ieee80211_is_tx_data(skb)) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 4d4ff4d4917a..6af43dfefdd6 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4502,8 +4502,16 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) (ieee80211_is_auth(hdr->frame_control) && ether_addr_equal(sdata->vif.addr, hdr->addr1)); case NL80211_IFTYPE_NAN: - /* Currently no frames on NAN interface are allowed */ - return false; + /* Accept only frames that are addressed to the NAN cluster + * (based on the Cluster ID). From these frames, accept only + * action frames or authentication frames that are addressed to + * the local NAN interface. + */ + return memcmp(sdata->wdev.u.nan.cluster_id, + hdr->addr3, ETH_ALEN) == 0 && + (ieee80211_is_public_action(hdr, skb->len) || + (ieee80211_is_auth(hdr->frame_control) && + ether_addr_equal(sdata->vif.addr, hdr->addr1))); default: break; } @@ -5230,12 +5238,20 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, } rx.sdata = prev_sta->sdata; + if (!status->link_valid && prev_sta->sta.mlo) { + struct link_sta_info *link_sta; + + link_sta = link_sta_info_get_bss(rx.sdata, + hdr->addr2); + if (!link_sta) + continue; + + link_id = link_sta->link_id; + } + if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id)) goto out; - if (!status->link_valid && prev_sta->sta.mlo) - continue; - ieee80211_prepare_and_rx_handle(&rx, skb, false); prev_sta = sta; @@ -5243,10 +5259,18 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, if (prev_sta) { rx.sdata = prev_sta->sdata; - if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id)) - goto out; + if (!status->link_valid && prev_sta->sta.mlo) { + struct link_sta_info *link_sta; + + link_sta = link_sta_info_get_bss(rx.sdata, + hdr->addr2); + if (!link_sta) + goto out; - if (!status->link_valid && prev_sta->sta.mlo) + link_id = link_sta->link_id; + } + + if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id)) goto out; if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index dbf98aa4cd67..bb9563f50e7b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -996,15 +996,15 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local, local->scan_chandef.freq1_offset = chan->freq_offset; local->scan_chandef.center_freq2 = 0; - /* For scanning on the S1G band, detect the channel width according to - * the channel being scanned. - */ + /* For S1G, only scan the 1MHz primaries. */ if (chan->band == NL80211_BAND_S1GHZ) { - local->scan_chandef.width = ieee80211_s1g_channel_width(chan); + local->scan_chandef.width = NL80211_CHAN_WIDTH_1; + local->scan_chandef.s1g_primary_2mhz = false; goto set_channel; } - /* If scanning on oper channel, use whatever channel-type + /* + * If scanning on oper channel, use whatever channel-type * is currently in use. */ if (chan == local->hw.conf.chandef.chan) @@ -1213,7 +1213,8 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata, for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band] || - band == NL80211_BAND_6GHZ) + band == NL80211_BAND_6GHZ || + band == NL80211_BAND_S1GHZ) continue; max_n = local->hw.wiphy->bands[band]->n_channels; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 8c550aab9bdc..f4d3b67fda06 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2637,13 +2637,11 @@ static void sta_set_tidstats(struct sta_info *sta, if (link_id < 0 && tid < IEEE80211_NUM_TIDS) { spin_lock_bh(&local->fq.lock); - rcu_read_lock(); tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS); ieee80211_fill_txq_stats(&tidstats->txq_stats, to_txq_info(sta->sta.txq[tid])); - rcu_read_unlock(); spin_unlock_bh(&local->fq.lock); } } @@ -2962,7 +2960,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, struct ieee80211_sub_if_data *sdata = sta->sdata; struct ieee80211_local *local = sdata->local; u32 thr = 0; - int i, ac, cpu, link_id; + int i, ac, cpu; struct ieee80211_sta_rx_stats *last_rxstats; last_rxstats = sta_get_last_rx_stats(sta, -1); @@ -3204,18 +3202,23 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, if (sta->sta.valid_links) { struct ieee80211_link_data *link; struct link_sta_info *link_sta; + int link_id; ether_addr_copy(sinfo->mld_addr, sta->addr); + + /* assign valid links first for iteration */ + sinfo->valid_links = sta->sta.valid_links; + for_each_valid_link(sinfo, link_id) { link_sta = wiphy_dereference(sta->local->hw.wiphy, sta->link[link_id]); link = wiphy_dereference(sdata->local->hw.wiphy, sdata->link[link_id]); - if (!link_sta || !sinfo->links[link_id] || !link) + if (!link_sta || !sinfo->links[link_id] || !link) { + sinfo->valid_links &= ~BIT(link_id); continue; - - sinfo->valid_links = sta->sta.valid_links; + } sta_set_link_sinfo(sta, sinfo->links[link_id], link, tidstats); } diff --git a/net/mac80211/status.c b/net/mac80211/status.c index a362254b310c..4b38aa0e902a 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc <jbenc@suse.cz> * Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright 2021-2024 Intel Corporation + * Copyright 2021-2025 Intel Corporation */ #include <linux/export.h> @@ -572,6 +572,7 @@ static struct ieee80211_sub_if_data * ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) { struct ieee80211_sub_if_data *sdata; + struct ieee80211_hdr *hdr = (void *)skb->data; if (skb->dev) { list_for_each_entry_rcu(sdata, &local->interfaces, list) { @@ -585,7 +586,23 @@ ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb) return NULL; } - return rcu_dereference(local->p2p_sdata); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + switch (sdata->vif.type) { + case NL80211_IFTYPE_P2P_DEVICE: + break; + case NL80211_IFTYPE_NAN: + if (sdata->u.nan.started) + break; + fallthrough; + default: + continue; + } + + if (ether_addr_equal(sdata->vif.addr, hdr->addr2)) + return sdata; + } + + return NULL; } static void ieee80211_report_ack_skb(struct ieee80211_local *local, diff --git a/net/mac80211/tests/Makefile b/net/mac80211/tests/Makefile index 3b0c08356fc5..3c7f874e5c41 100644 --- a/net/mac80211/tests/Makefile +++ b/net/mac80211/tests/Makefile @@ -1,3 +1,3 @@ -mac80211-tests-y += module.o util.o elems.o mfp.o tpe.o chan-mode.o +mac80211-tests-y += module.o util.o elems.o mfp.o tpe.o chan-mode.o s1g_tim.o obj-$(CONFIG_MAC80211_KUNIT_TEST) += mac80211-tests.o diff --git a/net/mac80211/tests/s1g_tim.c b/net/mac80211/tests/s1g_tim.c new file mode 100644 index 000000000000..642fa4ece89f --- /dev/null +++ b/net/mac80211/tests/s1g_tim.c @@ -0,0 +1,356 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * KUnit tests for S1G TIM PVB decoding. This test suite covers + * IEEE80211-2024 Annex L figures 8, 9, 10, 12, 13, 14. ADE mode + * is not covered as it is an optional encoding format and is not + * currently supported by mac80211. + * + * Copyright (C) 2025 Morse Micro + */ +#include <linux/ieee80211.h> +#include <kunit/test.h> +#include <kunit/test-bug.h> + +#define MAX_AID 128 + +#define BC(enc_mode, inverse, blk_off) \ + ((((blk_off) & 0x1f) << 3) | ((inverse) ? BIT(2) : 0) | \ + ((enc_mode) & 0x3)) + +static void byte_to_bitstr(u8 v, char *out) +{ + for (int b = 7; b >= 0; b--) + *out++ = (v & BIT(b)) ? '1' : '0'; + *out = '\0'; +} + +static void dump_tim_bits(struct kunit *test, + const struct ieee80211_tim_ie *tim, u8 tim_len) +{ + const u8 *ptr = tim->virtual_map; + const u8 *end = (const u8 *)tim + tim_len; + unsigned int oct = 1; + unsigned int blk = 0; + char bits[9]; + + while (ptr < end) { + u8 ctrl = *ptr++; + u8 mode = ctrl & 0x03; + bool inverse = ctrl & BIT(2); + u8 blk_off = ctrl >> 3; + + kunit_info( + test, "Block %u (ENC=%s, blk_off=%u, inverse=%u)", blk, + (mode == IEEE80211_S1G_TIM_ENC_MODE_BLOCK) ? "BLOCK" : + (mode == IEEE80211_S1G_TIM_ENC_MODE_SINGLE) ? "SINGLE" : + "OLB", + blk_off, inverse); + + byte_to_bitstr(ctrl, bits); + kunit_info(test, " octet %2u (ctrl) : %s (0x%02x)", oct, + bits, ctrl); + ++oct; + + switch (mode) { + case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: { + u8 blkmap = *ptr++; + + byte_to_bitstr(blkmap, bits); + kunit_info(test, " octet %2u (blk-map) : %s (0x%02x)", + oct, bits, blkmap); + ++oct; + + for (u8 sb = 0; sb < 8; sb++) { + if (!(blkmap & BIT(sb))) + continue; + u8 sub = *ptr++; + + byte_to_bitstr(sub, bits); + kunit_info( + test, + " octet %2u (SB %2u) : %s (0x%02x)", + oct, sb, bits, sub); + ++oct; + } + break; + } + case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: { + u8 single = *ptr++; + + byte_to_bitstr(single, bits); + kunit_info(test, " octet %2u (single) : %s (0x%02x)", + oct, bits, single); + ++oct; + break; + } + case IEEE80211_S1G_TIM_ENC_MODE_OLB: { + u8 len = *ptr++; + + byte_to_bitstr(len, bits); + kunit_info(test, " octet %2u (len=%2u) : %s (0x%02x)", + oct, len, bits, len); + ++oct; + + for (u8 i = 0; i < len && ptr < end; i++) { + u8 sub = *ptr++; + + byte_to_bitstr(sub, bits); + kunit_info( + test, + " octet %2u (SB %2u) : %s (0x%02x)", + oct, i, bits, sub); + ++oct; + } + break; + } + default: + kunit_info(test, " ** unknown encoding 0x%x **", mode); + return; + } + blk++; + } +} + +static void tim_push(u8 **p, u8 v) +{ + *(*p)++ = v; +} + +static void tim_begin(struct ieee80211_tim_ie *tim, u8 **p) +{ + tim->dtim_count = 0; + tim->dtim_period = 1; + tim->bitmap_ctrl = 0; + *p = tim->virtual_map; +} + +static u8 tim_end(struct ieee80211_tim_ie *tim, u8 *tail) +{ + return tail - (u8 *)tim; +} + +static void pvb_add_block_bitmap(u8 **p, u8 blk_off, bool inverse, u8 blk_bmap, + const u8 *subblocks) +{ + u8 enc = IEEE80211_S1G_TIM_ENC_MODE_BLOCK; + u8 n = hweight8(blk_bmap); + + tim_push(p, BC(enc, inverse, blk_off)); + tim_push(p, blk_bmap); + + for (u8 i = 0; i < n; i++) + tim_push(p, subblocks[i]); +} + +static void pvb_add_single_aid(u8 **p, u8 blk_off, bool inverse, u8 single6) +{ + u8 enc = IEEE80211_S1G_TIM_ENC_MODE_SINGLE; + + tim_push(p, BC(enc, inverse, blk_off)); + tim_push(p, single6 & GENMASK(5, 0)); +} + +static void pvb_add_olb(u8 **p, u8 blk_off, bool inverse, const u8 *subblocks, + u8 len) +{ + u8 enc = IEEE80211_S1G_TIM_ENC_MODE_OLB; + + tim_push(p, BC(enc, inverse, blk_off)); + tim_push(p, len); + for (u8 i = 0; i < len; i++) + tim_push(p, subblocks[i]); +} + +static void check_all_aids(struct kunit *test, + const struct ieee80211_tim_ie *tim, u8 tim_len, + const unsigned long *expected) +{ + for (u16 aid = 1; aid <= MAX_AID; aid++) { + bool want = test_bit(aid, expected); + bool got = ieee80211_s1g_check_tim(tim, tim_len, aid); + + KUNIT_ASSERT_EQ_MSG(test, got, want, + "AID %u mismatch (got=%d want=%d)", aid, + got, want); + } +} + +static void fill_bitmap(unsigned long *bm, const u16 *list, size_t n) +{ + size_t i; + + bitmap_zero(bm, MAX_AID + 1); + for (i = 0; i < n; i++) + __set_bit(list[i], bm); +} + +static void fill_bitmap_inverse(unsigned long *bm, u16 max_aid, + const u16 *except, size_t n_except) +{ + bitmap_zero(bm, MAX_AID + 1); + for (u16 aid = 1; aid <= max_aid; aid++) + __set_bit(aid, bm); + + for (size_t i = 0; i < n_except; i++) + if (except[i] <= max_aid) + __clear_bit(except[i], bm); +} + +static void s1g_tim_block_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + static const u8 subblocks[] = { + 0x42, /* SB m=0: AIDs 1,6 */ + 0xA0, /* SB m=2: AIDs 21,23 */ + }; + u8 blk_bmap = 0x05; /* bits 0 and 2 set */ + bool inverse = false; + static const u16 set_list[] = { 1, 6, 21, 23 }; + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_block_bitmap(&p, 0, inverse, blk_bmap, subblocks); + tim_len = tim_end(tim, p); + + fill_bitmap(exp, set_list, ARRAY_SIZE(set_list)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_single_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + bool inverse = false; + u8 blk_off = 0; + u8 single6 = 0x1f; /* 31 */ + static const u16 set_list[] = { 31 }; + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_single_aid(&p, blk_off, inverse, single6); + tim_len = tim_end(tim, p); + + fill_bitmap(exp, set_list, ARRAY_SIZE(set_list)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_olb_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + bool inverse = false; + u8 blk_off = 0; + static const u16 set_list[] = { 1, 6, 13, 15, 17, 22, 29, 31, 33, + 38, 45, 47, 49, 54, 61, 63, 65, 70 }; + static const u8 subblocks[] = { 0x42, 0xA0, 0x42, 0xA0, 0x42, + 0xA0, 0x42, 0xA0, 0x42 }; + u8 len = ARRAY_SIZE(subblocks); + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_olb(&p, blk_off, inverse, subblocks, len); + tim_len = tim_end(tim, p); + + fill_bitmap(exp, set_list, ARRAY_SIZE(set_list)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_inverse_block_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + /* Same sub-block content as Figure L-8, but inverse = true */ + static const u8 subblocks[] = { + 0x42, /* SB m=0: AIDs 1,6 */ + 0xA0, /* SB m=2: AIDs 21,23 */ + }; + u8 blk_bmap = 0x05; + bool inverse = true; + /* All AIDs except 1,6,21,23 are set */ + static const u16 except[] = { 1, 6, 21, 23 }; + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_block_bitmap(&p, 0, inverse, blk_bmap, subblocks); + tim_len = tim_end(tim, p); + + fill_bitmap_inverse(exp, 63, except, ARRAY_SIZE(except)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_inverse_single_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + bool inverse = true; + u8 blk_off = 0; + u8 single6 = 0x1f; /* 31 */ + /* All AIDs except 31 are set */ + static const u16 except[] = { 31 }; + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_single_aid(&p, blk_off, inverse, single6); + tim_len = tim_end(tim, p); + + fill_bitmap_inverse(exp, 63, except, ARRAY_SIZE(except)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static void s1g_tim_inverse_olb_test(struct kunit *test) +{ + u8 buf[256] = {}; + struct ieee80211_tim_ie *tim = (void *)buf; + u8 *p, tim_len; + bool inverse = true; + u8 blk_off = 0, len; + /* All AIDs except the list below are set */ + static const u16 except[] = { 1, 6, 13, 15, 17, 22, 29, 31, 33, + 38, 45, 47, 49, 54, 61, 63, 65, 70 }; + static const u8 subblocks[] = { 0x42, 0xA0, 0x42, 0xA0, 0x42, + 0xA0, 0x42, 0xA0, 0x42 }; + len = ARRAY_SIZE(subblocks); + DECLARE_BITMAP(exp, MAX_AID + 1); + + tim_begin(tim, &p); + pvb_add_olb(&p, blk_off, inverse, subblocks, len); + tim_len = tim_end(tim, p); + + fill_bitmap_inverse(exp, 127, except, ARRAY_SIZE(except)); + + dump_tim_bits(test, tim, tim_len); + check_all_aids(test, tim, tim_len, exp); +} + +static struct kunit_case s1g_tim_test_cases[] = { + KUNIT_CASE(s1g_tim_block_test), + KUNIT_CASE(s1g_tim_single_test), + KUNIT_CASE(s1g_tim_olb_test), + KUNIT_CASE(s1g_tim_inverse_block_test), + KUNIT_CASE(s1g_tim_inverse_single_test), + KUNIT_CASE(s1g_tim_inverse_olb_test), + {} +}; + +static struct kunit_suite s1g_tim = { + .name = "mac80211-s1g-tim", + .test_cases = s1g_tim_test_cases, +}; + +kunit_test_suite(s1g_tim); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 00671ae45b2f..e7b141c55f7a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -59,6 +59,9 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx, if (WARN_ON_ONCE(tx->rate.idx < 0)) return 0; + if (info->band >= NUM_NL80211_BANDS) + return 0; + sband = local->hw.wiphy->bands[info->band]; txrate = &sband->bitrates[tx->rate.idx]; @@ -683,7 +686,10 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) memset(&txrc, 0, sizeof(txrc)); - sband = tx->local->hw.wiphy->bands[info->band]; + if (info->band < NUM_NL80211_BANDS) + sband = tx->local->hw.wiphy->bands[info->band]; + else + return TX_CONTINUE; len = min_t(u32, tx->skb->len + FCS_LEN, tx->local->hw.wiphy->frag_threshold); @@ -1814,7 +1820,7 @@ static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx) txh_done: if (unlikely(res == TX_DROP)) { - I802_DEBUG_INC(tx->local->tx_handlers_drop); + tx->sdata->tx_handlers_drop++; if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); else @@ -1858,7 +1864,7 @@ static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx) txh_done: if (unlikely(res == TX_DROP)) { - I802_DEBUG_INC(tx->local->tx_handlers_drop); + tx->sdata->tx_handlers_drop++; if (tx->skb) ieee80211_free_txskb(&tx->local->hw, tx->skb); else @@ -1942,6 +1948,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata, if (unlikely(res_prepare == TX_DROP)) { ieee80211_free_txskb(&local->hw, skb); + tx.sdata->tx_handlers_drop++; return true; } else if (unlikely(res_prepare == TX_QUEUED)) { return true; @@ -3728,8 +3735,10 @@ void __ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, r = ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs, fast_tx->key, &tx); tx.skb = NULL; - if (r == TX_DROP) + if (r == TX_DROP) { + tx.sdata->tx_handlers_drop++; goto free; + } if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, @@ -4882,15 +4891,114 @@ void ieee80211_tx_pending(struct tasklet_struct *t) /* functions for drivers to get certain frames */ +static void ieee80211_beacon_add_tim_pvb(struct ps_data *ps, + struct sk_buff *skb, + bool mcast_traffic) +{ + int i, n1 = 0, n2; + + /* + * Find largest even number N1 so that bits numbered 1 through + * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits + * (N2 + 1) x 8 through 2007 are 0. + */ + for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { + if (ps->tim[i]) { + n1 = i & 0xfe; + break; + } + } + n2 = n1; + for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { + if (ps->tim[i]) { + n2 = i; + break; + } + } + + /* Bitmap control */ + skb_put_u8(skb, n1 | mcast_traffic); + /* Part Virt Bitmap */ + skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); +} + +/* + * mac80211 currently supports encoding using block bitmap mode, non + * inversed. The current implementation supports up to 1600 AIDs. + * + * Block bitmap encoding breaks down the AID bitmap into blocks of 64 + * AIDs. Each block contains between 0 and 8 subblocks. Each subblock + * describes 8 AIDs and the presence of a subblock is determined by + * the block bitmap. + */ +static void ieee80211_s1g_beacon_add_tim_pvb(struct ps_data *ps, + struct sk_buff *skb, + bool mcast_traffic) +{ + int blk; + + /* + * Emit a bitmap control block with a page slice number of 31 and a + * page index of 0 which indicates as per IEEE80211-2024 9.4.2.5.1 + * that the entire page (2048 bits) indicated by the page index + * is encoded in the partial virtual bitmap. + */ + skb_put_u8(skb, mcast_traffic | (31 << 1)); + + /* Emit an encoded block for each non-zero sub-block */ + for (blk = 0; blk < IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS; blk++) { + u8 blk_bmap = 0; + int sblk; + + for (sblk = 0; sblk < 8; sblk++) { + int sblk_idx = blk * 8 + sblk; + + /* + * If the current subblock is non-zero, increase the + * number of subblocks to emit for the current block. + */ + if (ps->tim[sblk_idx]) + blk_bmap |= BIT(sblk); + } + + /* If the current block contains no non-zero sublocks */ + if (!blk_bmap) + continue; + + /* + * Emit a block control byte for the current encoded block + * with an encoding mode of block bitmap (0x0), not inverse + * (0x0) and the current block offset (5 bits) + */ + skb_put_u8(skb, blk << 3); + + /* + * Emit the block bitmap for the current encoded block which + * contains the present subblocks. + */ + skb_put_u8(skb, blk_bmap); + + /* Emit the present subblocks */ + for (sblk = 0; sblk < 8; sblk++) { + int sblk_idx = blk * 8 + sblk; + + if (!(blk_bmap & BIT(sblk))) + continue; + + skb_put_u8(skb, ps->tim[sblk_idx]); + } + } +} + static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link, struct ps_data *ps, struct sk_buff *skb, bool is_template) { - u8 *pos, *tim; - int aid0 = 0; - int i, have_bits = 0, n1, n2; + struct element *tim; + bool mcast_traffic = false, have_bits = false; struct ieee80211_bss_conf *link_conf = link->conf; + bool s1g = ieee80211_get_link_sband(link)->band == NL80211_BAND_S1GHZ; /* Generate bitmap for TIM only if there are any STAs in power save * mode. */ @@ -4898,7 +5006,8 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, /* in the hope that this is faster than * checking byte-for-byte */ have_bits = !bitmap_empty((unsigned long *)ps->tim, - IEEE80211_MAX_AID+1); + IEEE80211_MAX_AID + 1); + if (!is_template) { if (ps->dtim_count == 0) ps->dtim_count = link_conf->dtim_period - 1; @@ -4906,51 +5015,39 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, ps->dtim_count--; } - tim = pos = skb_put(skb, 5); - *pos++ = WLAN_EID_TIM; - *pos++ = 3; - *pos++ = ps->dtim_count; - *pos++ = link_conf->dtim_period; + /* Length is set after parsing the AID bitmap */ + tim = skb_put(skb, sizeof(struct element)); + tim->id = WLAN_EID_TIM; + skb_put_u8(skb, ps->dtim_count); + skb_put_u8(skb, link_conf->dtim_period); if (ps->dtim_count == 0 && !skb_queue_empty(&ps->bc_buf)) - aid0 = 1; + mcast_traffic = true; - ps->dtim_bc_mc = aid0 == 1; + ps->dtim_bc_mc = mcast_traffic; if (have_bits) { - /* Find largest even number N1 so that bits numbered 1 through - * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits - * (N2 + 1) x 8 through 2007 are 0. */ - n1 = 0; - for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) { - if (ps->tim[i]) { - n1 = i & 0xfe; - break; - } - } - n2 = n1; - for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) { - if (ps->tim[i]) { - n2 = i; - break; - } - } - - /* Bitmap control */ - *pos++ = n1 | aid0; - /* Part Virt Bitmap */ - skb_put_data(skb, ps->tim + n1, n2 - n1 + 1); - - tim[1] = n2 - n1 + 4; + if (s1g) + ieee80211_s1g_beacon_add_tim_pvb(ps, skb, + mcast_traffic); + else + ieee80211_beacon_add_tim_pvb(ps, skb, mcast_traffic); } else { - *pos++ = aid0; /* Bitmap control */ - - if (ieee80211_get_link_sband(link)->band != NL80211_BAND_S1GHZ) { - tim[1] = 4; + /* + * If there is no buffered unicast traffic for an S1G + * interface, we can exclude the bitmap control. This is in + * contrast to other phy types as they do include the bitmap + * control and pvb even when there is no buffered traffic. + */ + if (!s1g) { + /* Bitmap control */ + skb_put_u8(skb, mcast_traffic); /* Part Virt Bitmap */ skb_put_u8(skb, 0); } } + + tim->datalen = skb_tail_pointer(skb) - tim->data; } static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata, @@ -6195,7 +6292,9 @@ void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata, enum nl80211_band band; rcu_read_lock(); - if (!ieee80211_vif_is_mld(&sdata->vif)) { + if (sdata->vif.type == NL80211_IFTYPE_NAN) { + band = NUM_NL80211_BANDS; + } else if (!ieee80211_vif_is_mld(&sdata->vif)) { WARN_ON(link_id >= 0); chanctx_conf = rcu_dereference(sdata->vif.bss_conf.chanctx_conf); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 32f1bc5908c5..c9931537f9d2 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1756,7 +1756,6 @@ int ieee80211_reconfig(struct ieee80211_local *local) bool sched_scan_stopped = false; bool suspended = local->suspended; bool in_reconfig = false; - u32 rts_threshold; lockdep_assert_wiphy(local->hw.wiphy); @@ -1832,7 +1831,9 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* setup RTS threshold */ if (hw->wiphy->n_radio > 0) { for (i = 0; i < hw->wiphy->n_radio; i++) { - rts_threshold = hw->wiphy->radio_cfg[i].rts_threshold; + u32 rts_threshold = + hw->wiphy->radio_cfg[i].rts_threshold; + drv_set_rts_threshold(local, i, rts_threshold); } } else { @@ -3198,10 +3199,11 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local, return true; } -bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, +bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local, + const struct ieee80211_s1g_oper_ie *oper, struct cfg80211_chan_def *chandef) { - u32 oper_freq; + u32 oper_khz, pri_1mhz_khz, pri_2mhz_khz; if (!oper) return false; @@ -3226,12 +3228,36 @@ bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper, return false; } - oper_freq = ieee80211_channel_to_freq_khz(oper->oper_ch, - NL80211_BAND_S1GHZ); - chandef->center_freq1 = KHZ_TO_MHZ(oper_freq); - chandef->freq1_offset = oper_freq % 1000; + chandef->s1g_primary_2mhz = false; - return true; + switch (u8_get_bits(oper->ch_width, S1G_OPER_CH_WIDTH_PRIMARY)) { + case IEEE80211_S1G_PRI_CHANWIDTH_1MHZ: + pri_1mhz_khz = ieee80211_channel_to_freq_khz( + oper->primary_ch, NL80211_BAND_S1GHZ); + break; + case IEEE80211_S1G_PRI_CHANWIDTH_2MHZ: + chandef->s1g_primary_2mhz = true; + pri_2mhz_khz = ieee80211_channel_to_freq_khz( + oper->primary_ch, NL80211_BAND_S1GHZ); + + if (u8_get_bits(oper->ch_width, S1G_OPER_CH_PRIMARY_LOCATION) == + S1G_2M_PRIMARY_LOCATION_LOWER) + pri_1mhz_khz = pri_2mhz_khz - 500; + else + pri_1mhz_khz = pri_2mhz_khz + 500; + break; + default: + return false; + } + + oper_khz = ieee80211_channel_to_freq_khz(oper->oper_ch, + NL80211_BAND_S1GHZ); + chandef->center_freq1 = KHZ_TO_MHZ(oper_khz); + chandef->freq1_offset = oper_khz % 1000; + chandef->chan = + ieee80211_get_channel_khz(local->hw.wiphy, pri_1mhz_khz); + + return chandef->chan; } int ieee80211_put_srates_elem(struct sk_buff *skb, @@ -4022,16 +4048,13 @@ bool ieee80211_is_radio_idx_in_scan_req(struct wiphy *wiphy, for (i = 0; i < scan_req->n_channels; i++) { chan = scan_req->channels[i]; chan_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan); - /* - * The chan_radio_idx should be valid since it's taken from a - * valid scan request. - * However, if chan_radio_idx is unexpectedly invalid (negative), - * we take a conservative approach and assume the scan request - * might use the specified radio_idx. Hence, return true. - */ - if (WARN_ON(chan_radio_idx < 0)) - return true; + /* The radio index either matched successfully, or an error + * occurred. For example, if radio-level information is + * missing, the same error value is returned. This + * typically implies a single-radio setup, in which case + * the operation should not be allowed. + */ if (chan_radio_idx == radio_idx) return true; } @@ -4514,3 +4537,11 @@ void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe) sizeof(tpe->psd_reg_client[i].power)); } } + +bool ieee80211_vif_nan_started(struct ieee80211_vif *vif) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + return vif->type == NL80211_IFTYPE_NAN && sdata->u.nan.started; +} +EXPORT_SYMBOL_GPL(ieee80211_vif_nan_started); diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index 685524800d70..b99ba14f39d2 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -256,7 +256,7 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) skb_reserve(skb, hlen); - /* set type as fist byte in payload */ + /* set type as first byte in payload */ *(u8 *)skb_put(skb, 1) = addr->smctp_type; rc = memcpy_from_msg((void *)skb_put(skb, len), msg, len); diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index b08ba959ac4f..31948e18d97d 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -22,7 +22,6 @@ #include <linux/kernel.h> #include <crypto/sha2.h> -#include <linux/unaligned.h> #include "protocol.h" @@ -43,39 +42,9 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn) void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac) { - u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE]; - u8 key1be[8]; - u8 key2be[8]; - int i; + __be64 key[2] = { cpu_to_be64(key1), cpu_to_be64(key2) }; - if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE)) - len = SHA256_DIGEST_SIZE; - - put_unaligned_be64(key1, key1be); - put_unaligned_be64(key2, key2be); - - /* Generate key xored with ipad */ - memset(input, 0x36, SHA256_BLOCK_SIZE); - for (i = 0; i < 8; i++) - input[i] ^= key1be[i]; - for (i = 0; i < 8; i++) - input[i + 8] ^= key2be[i]; - - memcpy(&input[SHA256_BLOCK_SIZE], msg, len); - - /* emit sha256(K1 || msg) on the second input block, so we can - * reuse 'input' for the last hashing - */ - sha256(input, SHA256_BLOCK_SIZE + len, &input[SHA256_BLOCK_SIZE]); - - /* Prepare second part of hmac */ - memset(input, 0x5C, SHA256_BLOCK_SIZE); - for (i = 0; i < 8; i++) - input[i] ^= key1be[i]; - for (i = 0; i < 8; i++) - input[i + 8] ^= key2be[i]; - - sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac); + hmac_sha256_usingrawkey((const u8 *)key, sizeof(key), msg, len, hmac); } #if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST) diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c index fed40dae5583..d96130e49942 100644 --- a/net/mptcp/ctrl.c +++ b/net/mptcp/ctrl.c @@ -501,10 +501,15 @@ void mptcp_active_enable(struct sock *sk) struct mptcp_pernet *pernet = mptcp_get_pernet(sock_net(sk)); if (atomic_read(&pernet->active_disable_times)) { - struct dst_entry *dst = sk_dst_get(sk); + struct net_device *dev; + struct dst_entry *dst; - if (dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK)) + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + if (!(dev && (dev->flags & IFF_LOOPBACK))) atomic_set(&pernet->active_disable_times, 0); + rcu_read_unlock(); } } diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c index cf879c188ca2..6003e47c770a 100644 --- a/net/mptcp/mib.c +++ b/net/mptcp/mib.c @@ -85,7 +85,6 @@ static const struct snmp_mib mptcp_snmp_list[] = { SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK), SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK), SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED), - SNMP_MIB_SENTINEL }; /* mptcp_mib_alloc - allocate percpu mib counters @@ -108,22 +107,23 @@ bool mptcp_mib_alloc(struct net *net) void mptcp_seq_show(struct seq_file *seq) { - unsigned long sum[ARRAY_SIZE(mptcp_snmp_list) - 1]; + unsigned long sum[ARRAY_SIZE(mptcp_snmp_list)]; + const int cnt = ARRAY_SIZE(mptcp_snmp_list); struct net *net = seq->private; int i; seq_puts(seq, "MPTcpExt:"); - for (i = 0; mptcp_snmp_list[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, " %s", mptcp_snmp_list[i].name); seq_puts(seq, "\nMPTcpExt:"); memset(sum, 0, sizeof(sum)); if (net->mib.mptcp_statistics) - snmp_get_cpu_field_batch(sum, mptcp_snmp_list, - net->mib.mptcp_statistics); + snmp_get_cpu_field_batch_cnt(sum, mptcp_snmp_list, cnt, + net->mib.mptcp_statistics); - for (i = 0; mptcp_snmp_list[i].name; i++) + for (i = 0; i < cnt; i++) seq_printf(seq, " %lu", sum[i]); seq_putc(seq, '\n'); diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 0566dd793810..ac974299de71 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -15,9 +15,9 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *req, - struct nlattr *bc, bool net_admin) + bool net_admin) { - if (!inet_diag_bc_sk(bc, sk)) + if (!inet_diag_bc_sk(cb->data, sk)) return 0; return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI, @@ -76,9 +76,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba const struct inet_diag_req_v2 *r, bool net_admin) { - struct inet_diag_dump_data *cb_data = cb->data; struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; - struct nlattr *bc = cb_data->inet_diag_nla_bc; struct net *net = sock_net(skb->sk); struct inet_hashinfo *hinfo; int i; @@ -121,7 +119,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba if (!refcount_inc_not_zero(&sk->sk_refcnt)) goto next_listen; - ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); + ret = sk_diag_dump(sk, skb, cb, r, net_admin); sock_put(sk); @@ -154,15 +152,10 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN); struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; struct net *net = sock_net(skb->sk); - struct inet_diag_dump_data *cb_data; struct mptcp_sock *msk; - struct nlattr *bc; BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx)); - cb_data = cb->data; - bc = cb_data->inet_diag_nla_bc; - while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot, &diag_ctx->s_num)) != NULL) { struct inet_sock *inet = (struct inet_sock *)msk; @@ -181,7 +174,7 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, r->id.idiag_dport) goto next; - ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin); + ret = sk_diag_dump(sk, skb, cb, r, net_admin); next: sock_put(sk); if (ret < 0) { diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 136a380602ca..2ff1b9499568 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -268,6 +268,27 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk, return -EINVAL; } +static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk) +{ + const struct net *net = sock_net((struct sock *)msk); + unsigned int rto = mptcp_get_add_addr_timeout(net); + struct mptcp_subflow_context *subflow; + unsigned int max = 0; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + struct inet_connection_sock *icsk = inet_csk(ssk); + + if (icsk->icsk_rto > max) + max = icsk->icsk_rto; + } + + if (max && max < rto) + rto = max; + + return rto; +} + static void mptcp_pm_add_timer(struct timer_list *timer) { struct mptcp_pm_add_entry *entry = timer_container_of(entry, timer, @@ -292,7 +313,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) goto out; } - timeout = mptcp_get_add_addr_timeout(sock_net(sk)); + timeout = mptcp_adjust_add_addr_timeout(msk); if (!timeout) goto out; @@ -307,7 +328,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer) if (entry->retrans_times < ADD_ADDR_RETRANS_MAX) sk_reset_timer(sk, timer, - jiffies + timeout); + jiffies + (timeout << entry->retrans_times)); spin_unlock_bh(&msk->pm.lock); @@ -348,7 +369,6 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, { struct mptcp_pm_add_entry *add_entry = NULL; struct sock *sk = (struct sock *)msk; - struct net *net = sock_net(sk); unsigned int timeout; lockdep_assert_held(&msk->pm.lock); @@ -374,7 +394,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); reset_timer: - timeout = mptcp_get_add_addr_timeout(net); + timeout = mptcp_adjust_add_addr_timeout(msk); if (timeout) sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout); @@ -463,23 +483,24 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) { struct mptcp_pm_data *pm = &msk->pm; - unsigned int subflows_max; + unsigned int limit_extra_subflows; int ret = 0; if (mptcp_pm_is_userspace(msk)) { if (mptcp_userspace_pm_active(msk)) { spin_lock_bh(&pm->lock); - pm->subflows++; + pm->extra_subflows++; spin_unlock_bh(&pm->lock); return true; } return false; } - subflows_max = mptcp_pm_get_subflows_max(msk); + limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); - pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->subflows, - subflows_max, READ_ONCE(pm->accept_subflow)); + pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, + pm->extra_subflows, limit_extra_subflows, + READ_ONCE(pm->accept_subflow)); /* try to avoid acquiring the lock below */ if (!READ_ONCE(pm->accept_subflow)) @@ -487,8 +508,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk) spin_lock_bh(&pm->lock); if (READ_ONCE(pm->accept_subflow)) { - ret = pm->subflows < subflows_max; - if (ret && ++pm->subflows == subflows_max) + ret = pm->extra_subflows < limit_extra_subflows; + if (ret && ++pm->extra_subflows == limit_extra_subflows) WRITE_ONCE(pm->accept_subflow, false); } spin_unlock_bh(&pm->lock); @@ -574,7 +595,7 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, if (mptcp_pm_is_userspace(msk)) { if (update_subflows) { spin_lock_bh(&pm->lock); - pm->subflows--; + pm->extra_subflows--; spin_unlock_bh(&pm->lock); } return; @@ -617,9 +638,12 @@ void mptcp_pm_add_addr_received(const struct sock *ssk, } else { __MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP); } - /* id0 should not have a different address */ + /* - id0 should not have a different address + * - special case for C-flag: linked to fill_local_addresses_vec() + */ } else if ((addr->id == 0 && !mptcp_pm_is_init_remote_addr(msk, addr)) || - (addr->id > 0 && !READ_ONCE(pm->accept_addr))) { + (addr->id > 0 && !READ_ONCE(pm->accept_addr) && + !mptcp_pm_add_addr_c_flag_case(msk))) { mptcp_pm_announce_addr(msk, addr, true); mptcp_pm_add_addr_send_ack(msk); } else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) { @@ -1005,17 +1029,17 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk) WRITE_ONCE(pm->pm_type, pm_type); if (pm_type == MPTCP_PM_TYPE_KERNEL) { - bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk); + bool subflows_allowed = !!mptcp_pm_get_limit_extra_subflows(msk); /* pm->work_pending must be only be set to 'true' when * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL */ WRITE_ONCE(pm->work_pending, - (!!mptcp_pm_get_local_addr_max(msk) && + (!!mptcp_pm_get_endp_subflow_max(msk) && subflows_allowed) || - !!mptcp_pm_get_add_addr_signal_max(msk)); + !!mptcp_pm_get_endp_signal_max(msk)); WRITE_ONCE(pm->accept_addr, - !!mptcp_pm_get_add_addr_accept_max(msk) && + !!mptcp_pm_get_limit_add_addr_accepted(msk) && subflows_allowed); WRITE_ONCE(pm->accept_subflow, subflows_allowed); diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c index 667803d72b64..e0f44dc232aa 100644 --- a/net/mptcp/pm_kernel.c +++ b/net/mptcp/pm_kernel.c @@ -17,14 +17,14 @@ static int pm_nl_pernet_id; struct pm_nl_pernet { /* protects pernet updates */ spinlock_t lock; - struct list_head local_addr_list; - unsigned int addrs; - unsigned int stale_loss_cnt; - unsigned int add_addr_signal_max; - unsigned int add_addr_accept_max; - unsigned int local_addr_max; - unsigned int subflows_max; - unsigned int next_id; + struct list_head endp_list; + u8 endpoints; + u8 endp_signal_max; + u8 endp_subflow_max; + u8 endp_laminar_max; + u8 limit_add_addr_accepted; + u8 limit_extra_subflows; + u8 next_id; DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); }; @@ -46,37 +46,45 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info) return pm_nl_get_pernet(genl_info_net(info)); } -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk) +u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk) { const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->add_addr_signal_max); + return READ_ONCE(pernet->endp_signal_max); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max); -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk) +u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->add_addr_accept_max); + return READ_ONCE(pernet->endp_subflow_max); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max); -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk) +u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->subflows_max); + return READ_ONCE(pernet->endp_laminar_max); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_laminar_max); -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk) +u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - return READ_ONCE(pernet->local_addr_max); + return READ_ONCE(pernet->limit_add_addr_accepted); } -EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); +EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_add_addr_accepted); + +u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + + return READ_ONCE(pernet->limit_extra_subflows); +} +EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows); static bool lookup_subflow_by_daddr(const struct list_head *list, const struct mptcp_addr_info *daddr) @@ -110,7 +118,7 @@ select_local_address(const struct pm_nl_pernet *pernet, msk_owned_by_me(msk); rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) continue; @@ -141,7 +149,7 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, * Note: removal from the local address list during the msk life-cycle * can lead to additional addresses not being announced. */ - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) continue; @@ -159,80 +167,96 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk, return found; } -/* Fill all the remote addresses into the array addrs[], - * and return the array size. - */ -static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, - struct mptcp_addr_info *local, - bool fullmesh, - struct mptcp_addr_info *addrs) +static unsigned int +fill_remote_addr(struct mptcp_sock *msk, struct mptcp_addr_info *local, + struct mptcp_addr_info *addrs) { bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); - struct sock *sk = (struct sock *)msk, *ssk; - struct mptcp_subflow_context *subflow; struct mptcp_addr_info remote = { 0 }; - unsigned int subflows_max; - int i = 0; + struct sock *sk = (struct sock *)msk; + + if (deny_id0) + return 0; - subflows_max = mptcp_pm_get_subflows_max(msk); mptcp_remote_address((struct sock_common *)sk, &remote); - /* Non-fullmesh endpoint, fill in the single entry - * corresponding to the primary MPC subflow remote address - */ - if (!fullmesh) { - if (deny_id0) - return 0; + if (!mptcp_pm_addr_families_match(sk, local, &remote)) + return 0; - if (!mptcp_pm_addr_families_match(sk, local, &remote)) - return 0; + msk->pm.extra_subflows++; + *addrs = remote; - msk->pm.subflows++; - addrs[i++] = remote; - } else { - DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + return 1; +} - /* Forbid creation of new subflows matching existing - * ones, possibly already created by incoming ADD_ADDR - */ - bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); - mptcp_for_each_subflow(msk, subflow) - if (READ_ONCE(subflow->local_id) == local->id) - __set_bit(subflow->remote_id, unavail_id); - - mptcp_for_each_subflow(msk, subflow) { - ssk = mptcp_subflow_tcp_sock(subflow); - mptcp_remote_address((struct sock_common *)ssk, &addrs[i]); - addrs[i].id = READ_ONCE(subflow->remote_id); - if (deny_id0 && !addrs[i].id) - continue; +static unsigned int +fill_remote_addresses_fullmesh(struct mptcp_sock *msk, + struct mptcp_addr_info *local, + struct mptcp_addr_info *addrs) +{ + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); + bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0); + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + struct sock *sk = (struct sock *)msk, *ssk; + struct mptcp_subflow_context *subflow; + int i = 0; - if (test_bit(addrs[i].id, unavail_id)) - continue; + /* Forbid creation of new subflows matching existing ones, possibly + * already created by incoming ADD_ADDR + */ + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) + if (READ_ONCE(subflow->local_id) == local->id) + __set_bit(subflow->remote_id, unavail_id); + + mptcp_for_each_subflow(msk, subflow) { + ssk = mptcp_subflow_tcp_sock(subflow); + mptcp_remote_address((struct sock_common *)ssk, &addrs[i]); + addrs[i].id = READ_ONCE(subflow->remote_id); + if (deny_id0 && !addrs[i].id) + continue; - if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) - continue; + if (test_bit(addrs[i].id, unavail_id)) + continue; - if (msk->pm.subflows < subflows_max) { - /* forbid creating multiple address towards - * this id - */ - __set_bit(addrs[i].id, unavail_id); - msk->pm.subflows++; - i++; - } - } + if (!mptcp_pm_addr_families_match(sk, local, &addrs[i])) + continue; + + /* forbid creating multiple address towards this id */ + __set_bit(addrs[i].id, unavail_id); + msk->pm.extra_subflows++; + i++; + + if (msk->pm.extra_subflows >= limit_extra_subflows) + break; } return i; } +/* Fill all the remote addresses into the array addrs[], + * and return the array size. + */ +static unsigned int +fill_remote_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *local, + bool fullmesh, struct mptcp_addr_info *addrs) +{ + /* Non-fullmesh: fill in the single entry corresponding to the primary + * MPC subflow remote address, and return 1, corresponding to 1 entry. + */ + if (!fullmesh) + return fill_remote_addr(msk, local, addrs); + + /* Fullmesh endpoint: fill all possible remote addresses */ + return fill_remote_addresses_fullmesh(msk, local, addrs); +} + static struct mptcp_pm_addr_entry * __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + list_for_each_entry_rcu(entry, &pernet->endp_list, list, lockdep_is_held(&pernet->lock)) { if (entry->addr.id == id) return entry; @@ -245,7 +269,7 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + list_for_each_entry_rcu(entry, &pernet->endp_list, list, lockdep_is_held(&pernet->lock)) { if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) return entry; @@ -253,52 +277,65 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) return NULL; } -static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) +static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, + const struct mptcp_addr_info *addr) { - struct sock *sk = (struct sock *)msk; - unsigned int add_addr_signal_max; - bool signal_and_subflow = false; - unsigned int local_addr_max; - struct pm_nl_pernet *pernet; - struct mptcp_pm_local local; - unsigned int subflows_max; - - pernet = pm_nl_get_pernet(sock_net(sk)); + return msk->mpc_endpoint_id == addr->id ? 0 : addr->id; +} - add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk); - local_addr_max = mptcp_pm_get_local_addr_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); +/* Set mpc_endpoint_id, and send MP_PRIO for ID0 if needed */ +static void mptcp_mpc_endpoint_setup(struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_pm_addr_entry *entry; + struct mptcp_addr_info mpc_addr; + struct pm_nl_pernet *pernet; + bool backup = false; /* do lazy endpoint usage accounting for the MPC subflows */ - if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first); - struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info mpc_addr; - bool backup = false; - - mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); - rcu_read_lock(); - entry = __lookup_addr(pernet, &mpc_addr); - if (entry) { - __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); - msk->mpc_endpoint_id = entry->addr.id; - backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); - } - rcu_read_unlock(); + if (likely(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED)) || + !msk->first) + return; - if (backup) - mptcp_pm_send_ack(msk, subflow, true, backup); + subflow = mptcp_subflow_ctx(msk->first); + pernet = pm_nl_get_pernet_from_msk(msk); - msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); + mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); + rcu_read_lock(); + entry = __lookup_addr(pernet, &mpc_addr); + if (entry) { + __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); + msk->mpc_endpoint_id = entry->addr.id; + backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP); } + rcu_read_unlock(); + + /* Send MP_PRIO */ + if (backup) + mptcp_pm_send_ack(msk, subflow, true, backup); + + msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); +} + +static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) +{ + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); + u8 endp_signal_max = mptcp_pm_get_endp_signal_max(msk); + struct sock *sk = (struct sock *)msk; + bool signal_and_subflow = false; + struct mptcp_pm_local local; + + mptcp_mpc_endpoint_setup(msk); pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", - msk->pm.local_addr_used, local_addr_max, - msk->pm.add_addr_signaled, add_addr_signal_max, - msk->pm.subflows, subflows_max); + msk->pm.local_addr_used, endp_subflow_max, + msk->pm.add_addr_signaled, endp_signal_max, + msk->pm.extra_subflows, limit_extra_subflows); /* check first for announce */ - if (msk->pm.add_addr_signaled < add_addr_signal_max) { + if (msk->pm.add_addr_signaled < endp_signal_max) { /* due to racing events on both ends we can reach here while * previous add address is still running: if we invoke now * mptcp_pm_announce_addr(), that will fail and the @@ -334,8 +371,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) subflow: /* check if should create a new subflow */ - while (msk->pm.local_addr_used < local_addr_max && - msk->pm.subflows < subflows_max) { + while (msk->pm.local_addr_used < endp_subflow_max && + msk->pm.extra_subflows < limit_extra_subflows) { struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX]; bool fullmesh; int i, nr; @@ -377,90 +414,225 @@ static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk) mptcp_pm_create_subflow_or_signal_addr(msk); } -/* Fill all the local addresses into the array addrs[], - * and return the array size. - */ -static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk, - struct mptcp_addr_info *remote, - struct mptcp_pm_local *locals) +static unsigned int +fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk, + struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals, + bool c_flag_case) { + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); struct sock *sk = (struct sock *)msk; struct mptcp_pm_addr_entry *entry; - struct mptcp_addr_info mpc_addr; - struct pm_nl_pernet *pernet; - unsigned int subflows_max; + struct mptcp_pm_local *local; int i = 0; - pernet = pm_nl_get_pernet_from_msk(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - - mptcp_local_address((struct sock_common *)msk, &mpc_addr); - rcu_read_lock(); - list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { + bool is_id0; + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH)) continue; if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) continue; - if (msk->pm.subflows < subflows_max) { - locals[i].addr = entry->addr; - locals[i].flags = entry->flags; - locals[i].ifindex = entry->ifindex; + local = &locals[i]; + local->addr = entry->addr; + local->flags = entry->flags; + local->ifindex = entry->ifindex; - /* Special case for ID0: set the correct ID */ - if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port)) - locals[i].addr.id = 0; + is_id0 = local->addr.id == msk->mpc_endpoint_id; - msk->pm.subflows++; - i++; + if (c_flag_case && + (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) { + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + + if (!is_id0) + msk->pm.local_addr_used++; } + + /* Special case for ID0: set the correct ID */ + if (is_id0) + local->addr.id = 0; + + msk->pm.extra_subflows++; + i++; + + if (msk->pm.extra_subflows >= limit_extra_subflows) + break; } rcu_read_unlock(); - /* If the array is empty, fill in the single - * 'IPADDRANY' local address + return i; +} + +static unsigned int +fill_local_laminar_endp(struct mptcp_sock *msk, struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + struct mptcp_subflow_context *subflow; + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_addr_entry *entry; + struct mptcp_pm_local *local; + int found = 0; + + /* Forbid creation of new subflows matching existing ones, possibly + * already created by 'subflow' endpoints */ - if (!i) { - memset(&locals[i], 0, sizeof(locals[i])); - locals[i].addr.family = -#if IS_ENABLED(CONFIG_MPTCP_IPV6) - remote->family == AF_INET6 && - ipv6_addr_v4mapped(&remote->addr6) ? AF_INET : -#endif - remote->family; + bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1); + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + if ((1 << inet_sk_state_load(ssk)) & + (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | + TCPF_CLOSE)) + continue; + + __set_bit(subflow_get_local_id(subflow), unavail_id); + } + + rcu_read_lock(); + list_for_each_entry_rcu(entry, &pernet->endp_list, list) { + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR)) + continue; + + if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote)) + continue; + + if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr), + unavail_id)) + continue; + + local = &locals[0]; + local->addr = entry->addr; + local->flags = entry->flags; + local->ifindex = entry->ifindex; + + if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + + if (local->addr.id != msk->mpc_endpoint_id) + msk->pm.local_addr_used++; + } + + msk->pm.extra_subflows++; + found = 1; + break; + } + rcu_read_unlock(); + + return found; +} + +static unsigned int +fill_local_addresses_vec_c_flag(struct mptcp_sock *msk, + struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); + struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); + u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk); + struct sock *sk = (struct sock *)msk; + struct mptcp_pm_local *local; + int i = 0; + + while (msk->pm.local_addr_used < endp_subflow_max) { + local = &locals[i]; - if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote)) - return 0; + if (!select_local_address(pernet, msk, local)) + break; + + __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); - msk->pm.subflows++; + if (!mptcp_pm_addr_families_match(sk, &local->addr, remote)) + continue; + + if (local->addr.id == msk->mpc_endpoint_id) + continue; + + msk->pm.local_addr_used++; + msk->pm.extra_subflows++; i++; + + if (msk->pm.extra_subflows >= limit_extra_subflows) + break; } return i; } +static unsigned int +fill_local_address_any(struct mptcp_sock *msk, struct mptcp_addr_info *remote, + struct mptcp_pm_local *local) +{ + struct sock *sk = (struct sock *)msk; + + memset(local, 0, sizeof(*local)); + local->addr.family = +#if IS_ENABLED(CONFIG_MPTCP_IPV6) + remote->family == AF_INET6 && + ipv6_addr_v4mapped(&remote->addr6) ? AF_INET : +#endif + remote->family; + + if (!mptcp_pm_addr_families_match(sk, &local->addr, remote)) + return 0; + + msk->pm.extra_subflows++; + + return 1; +} + +/* Fill all the local addresses into the array addrs[], + * and return the array size. + */ +static unsigned int +fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote, + struct mptcp_pm_local *locals) +{ + bool c_flag_case = remote->id && mptcp_pm_add_addr_c_flag_case(msk); + int i; + + /* If there is at least one MPTCP endpoint with a fullmesh flag */ + i = fill_local_addresses_vec_fullmesh(msk, remote, locals, c_flag_case); + if (i) + return i; + + /* If there is at least one MPTCP endpoint with a laminar flag */ + if (mptcp_pm_get_endp_laminar_max(msk)) + return fill_local_laminar_endp(msk, remote, locals); + + /* Special case: peer sets the C flag, accept one ADD_ADDR if default + * limits are used -- accepting no ADD_ADDR -- and use subflow endpoints + */ + if (c_flag_case) + return fill_local_addresses_vec_c_flag(msk, remote, locals); + + /* No special case: fill in the single 'IPADDRANY' local address */ + return fill_local_address_any(msk, remote, &locals[0]); +} + static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) { + u8 limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk); + u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk); struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX]; struct sock *sk = (struct sock *)msk; - unsigned int add_addr_accept_max; struct mptcp_addr_info remote; - unsigned int subflows_max; bool sf_created = false; int i, nr; - add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk); - subflows_max = mptcp_pm_get_subflows_max(msk); - pr_debug("accepted %d:%d remote family %d\n", - msk->pm.add_addr_accepted, add_addr_accept_max, + msk->pm.add_addr_accepted, limit_add_addr_accepted, msk->pm.remote.family); remote = msk->pm.remote; mptcp_pm_announce_addr(msk, &remote, true); mptcp_pm_addr_send_ack(msk); + mptcp_mpc_endpoint_setup(msk); if (lookup_subflow_by_daddr(&msk->conn_list, &remote)) return; @@ -486,8 +658,8 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) /* add_addr_accepted is not decr for ID 0 */ if (remote.id) msk->pm.add_addr_accepted++; - if (msk->pm.add_addr_accepted >= add_addr_accept_max || - msk->pm.subflows >= subflows_max) + if (msk->pm.add_addr_accepted >= limit_add_addr_accepted || + msk->pm.extra_subflows >= limit_extra_subflows) WRITE_ONCE(msk->pm.accept_addr, false); } } @@ -495,10 +667,13 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk) void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id) { if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) { + u8 limit_add_addr_accepted = + mptcp_pm_get_limit_add_addr_accepted(msk); + /* Note: if the subflow has been closed before, this * add_addr_accepted counter will not be decremented. */ - if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk)) + if (--msk->pm.add_addr_accepted < limit_add_addr_accepted) WRITE_ONCE(msk->pm.accept_addr, true); } } @@ -523,8 +698,8 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, bool needs_id, bool replace) { struct mptcp_pm_addr_entry *cur, *del_entry = NULL; - unsigned int addr_max; int ret = -EINVAL; + u8 addr_max; spin_lock_bh(&pernet->lock); /* to keep the code simple, don't do IDR-like allocation for address ID, @@ -532,7 +707,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, */ if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) pernet->next_id = 1; - if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { + if (pernet->endpoints >= MPTCP_PM_ADDR_MAX) { ret = -ERANGE; goto out; } @@ -546,7 +721,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, */ if (!address_use_port(entry)) entry->addr.port = 0; - list_for_each_entry(cur, &pernet->local_addr_list, list) { + list_for_each_entry(cur, &pernet->endp_list, list) { if (mptcp_addresses_equal(&cur->addr, &entry->addr, cur->addr.port || entry->addr.port)) { /* allow replacing the exiting endpoint only if such @@ -571,7 +746,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, goto out; } - pernet->addrs--; + pernet->endpoints--; entry->addr.id = cur->addr.id; list_del_rcu(&cur->list); del_entry = cur; @@ -598,19 +773,23 @@ find_next: pernet->next_id = entry->addr.id; if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1); + addr_max = pernet->endp_signal_max; + WRITE_ONCE(pernet->endp_signal_max, addr_max + 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max + 1); + addr_max = pernet->endp_subflow_max; + WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1); + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) { + addr_max = pernet->endp_laminar_max; + WRITE_ONCE(pernet->endp_laminar_max, addr_max + 1); } - pernet->addrs++; + pernet->endpoints++; if (!entry->addr.port) - list_add_tail_rcu(&entry->list, &pernet->local_addr_list); + list_add_tail_rcu(&entry->list, &pernet->endp_list); else - list_add_rcu(&entry->list, &pernet->local_addr_list); + list_add_rcu(&entry->list, &pernet->endp_list); ret = entry->addr.id; out: @@ -845,12 +1024,6 @@ out_free: return ret; } -static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk, - const struct mptcp_addr_info *addr) -{ - return msk->mpc_endpoint_id == addr->id ? 0 : addr->id; -} - static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk, const struct mptcp_addr_info *addr, bool force) @@ -969,8 +1142,8 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) { struct pm_nl_pernet *pernet = genl_info_pm_nl(info); struct mptcp_pm_addr_entry addr, *entry; - unsigned int addr_max; struct nlattr *attr; + u8 addr_max; int ret; if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR)) @@ -997,15 +1170,19 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) { - addr_max = pernet->add_addr_signal_max; - WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1); + addr_max = pernet->endp_signal_max; + WRITE_ONCE(pernet->endp_signal_max, addr_max - 1); } if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) { - addr_max = pernet->local_addr_max; - WRITE_ONCE(pernet->local_addr_max, addr_max - 1); + addr_max = pernet->endp_subflow_max; + WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1); + } + if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) { + addr_max = pernet->endp_laminar_max; + WRITE_ONCE(pernet->endp_laminar_max, addr_max - 1); } - pernet->addrs--; + pernet->endpoints--; list_del_rcu(&entry->list); __clear_bit(entry->addr.id, pernet->id_bitmap); spin_unlock_bh(&pernet->lock); @@ -1084,9 +1261,10 @@ static void __flush_addrs(struct list_head *list) static void __reset_counters(struct pm_nl_pernet *pernet) { - WRITE_ONCE(pernet->add_addr_signal_max, 0); - WRITE_ONCE(pernet->local_addr_max, 0); - pernet->addrs = 0; + WRITE_ONCE(pernet->endp_signal_max, 0); + WRITE_ONCE(pernet->endp_subflow_max, 0); + WRITE_ONCE(pernet->endp_laminar_max, 0); + pernet->endpoints = 0; } int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) @@ -1095,7 +1273,7 @@ int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info) LIST_HEAD(free_list); spin_lock_bh(&pernet->lock); - list_splice_init(&pernet->local_addr_list, &free_list); + list_splice_init(&pernet->endp_list, &free_list); __reset_counters(pernet); pernet->next_id = 1; bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); @@ -1181,18 +1359,18 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info) int ret; spin_lock_bh(&pernet->lock); - rcv_addrs = pernet->add_addr_accept_max; + rcv_addrs = pernet->limit_add_addr_accepted; ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs); if (ret) goto unlock; - subflows = pernet->subflows_max; + subflows = pernet->limit_extra_subflows; ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows); if (ret) goto unlock; - WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs); - WRITE_ONCE(pernet->subflows_max, subflows); + WRITE_ONCE(pernet->limit_add_addr_accepted, rcv_addrs); + WRITE_ONCE(pernet->limit_extra_subflows, subflows); unlock: spin_unlock_bh(&pernet->lock); @@ -1215,11 +1393,11 @@ int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS, - READ_ONCE(pernet->add_addr_accept_max))) + READ_ONCE(pernet->limit_add_addr_accepted))) goto fail; if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS, - READ_ONCE(pernet->subflows_max))) + READ_ONCE(pernet->limit_extra_subflows))) goto fail; genlmsg_end(msg, reply); @@ -1328,7 +1506,7 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk) { struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk); - if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || + if (msk->pm.extra_subflows == mptcp_pm_get_limit_extra_subflows(msk) || (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) { WRITE_ONCE(msk->pm.work_pending, false); @@ -1360,12 +1538,11 @@ static int __net_init pm_nl_init_net(struct net *net) { struct pm_nl_pernet *pernet = pm_nl_get_pernet(net); - INIT_LIST_HEAD_RCU(&pernet->local_addr_list); + INIT_LIST_HEAD_RCU(&pernet->endp_list); /* Cit. 2 subflows ought to be enough for anybody. */ - pernet->subflows_max = 2; + pernet->limit_extra_subflows = 2; pernet->next_id = 1; - pernet->stale_loss_cnt = 4; spin_lock_init(&pernet->lock); /* No need to initialize other pernet fields, the struct is zeroed at @@ -1386,7 +1563,7 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list) * other modifiers, also netns core already waited for a * RCU grace period. */ - __flush_addrs(&pernet->local_addr_list); + __flush_addrs(&pernet->endp_list); } } diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index ce7d42d3bd00..d5b383870f79 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -113,7 +113,7 @@ int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info, return err; if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) { - u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); + s32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]); entry->ifindex = val; } @@ -413,8 +413,13 @@ static int mptcp_event_created(struct sk_buff *skb, if (err) return err; - if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side))) - return -EMSGSIZE; + if (READ_ONCE(msk->pm.server_side)) { + flags |= MPTCP_PM_EV_FLAG_SERVER_SIDE; + + /* Deprecated, and only set when it is the server side */ + if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1)) + return -EMSGSIZE; + } if (READ_ONCE(msk->pm.remote_deny_join_id0)) flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0; diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index a715dcbe0146..8cbc1920afb4 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -419,7 +419,7 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info) if (err) mptcp_userspace_pm_delete_local_addr(msk, &entry); else - msk->pm.subflows++; + msk->pm.extra_subflows++; spin_unlock_bh(&msk->pm.lock); create_err: diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 5e497a83e967..0292162a14ee 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -12,6 +12,7 @@ #include <linux/sched/signal.h> #include <linux/atomic.h> #include <net/aligned_data.h> +#include <net/rps.h> #include <net/sock.h> #include <net/inet_common.h> #include <net/inet_hashtables.h> @@ -137,26 +138,37 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk) static void mptcp_drop(struct sock *sk, struct sk_buff *skb) { - sk_drops_add(sk, skb); + sk_drops_skbadd(sk, skb); __kfree_skb(skb); } -static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, - struct sk_buff *from) +static bool __mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, + struct sk_buff *from, bool *fragstolen, + int *delta) { - bool fragstolen; - int delta; + int limit = READ_ONCE(sk->sk_rcvbuf); if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) || MPTCP_SKB_CB(from)->offset || - ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || - !skb_try_coalesce(to, from, &fragstolen, &delta)) + ((to->len + from->len) > (limit >> 3)) || + !skb_try_coalesce(to, from, fragstolen, delta)) return false; pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n", MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, to->len, MPTCP_SKB_CB(from)->end_seq); MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; + return true; +} + +static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, + struct sk_buff *from) +{ + bool fragstolen; + int delta; + + if (!__mptcp_try_coalesce(sk, to, from, &fragstolen, &delta)) + return false; /* note the fwd memory can reach a negative value after accounting * for the delta, but the later skb free will restore a non @@ -178,6 +190,35 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to, return mptcp_try_coalesce((struct sock *)msk, to, from); } +/* "inspired" by tcp_rcvbuf_grow(), main difference: + * - mptcp does not maintain a msk-level window clamp + * - returns true when the receive buffer is actually updated + */ +static bool mptcp_rcvbuf_grow(struct sock *sk) +{ + struct mptcp_sock *msk = mptcp_sk(sk); + const struct net *net = sock_net(sk); + int rcvwin, rcvbuf, cap; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) || + (sk->sk_userlocks & SOCK_RCVBUF_LOCK)) + return false; + + rcvwin = msk->rcvq_space.space << 1; + + if (!RB_EMPTY_ROOT(&msk->out_of_order_queue)) + rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq; + + cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]); + + rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap); + if (rcvbuf > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + return true; + } + return false; +} + /* "inspired" by tcp_data_queue_ofo(), main differences: * - use mptcp seqs * - don't cope with sacks @@ -291,29 +332,16 @@ merge_right: end: skb_condense(skb); skb_set_owner_r(skb, sk); + /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */ + if (sk->sk_socket) + mptcp_rcvbuf_grow(sk); } -static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, - struct sk_buff *skb, unsigned int offset, - size_t copy_len) +static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset, + int copy_len) { - struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); - struct sock *sk = (struct sock *)msk; - struct sk_buff *tail; - bool has_rxtstamp; - - __skb_unlink(skb, &ssk->sk_receive_queue); - - skb_ext_reset(skb); - skb_orphan(skb); - - /* try to fetch required memory from subflow */ - if (!sk_rmem_schedule(sk, skb, skb->truesize)) { - MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); - goto drop; - } - - has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; + const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); + bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; /* the skb map_seq accounts for the skb offset: * mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq @@ -325,6 +353,24 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; MPTCP_SKB_CB(skb)->cant_coalesce = 0; + __skb_unlink(skb, &ssk->sk_receive_queue); + + skb_ext_reset(skb); + skb_dst_drop(skb); +} + +static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb) +{ + u64 copy_len = MPTCP_SKB_CB(skb)->end_seq - MPTCP_SKB_CB(skb)->map_seq; + struct mptcp_sock *msk = mptcp_sk(sk); + struct sk_buff *tail; + + /* try to fetch required memory from subflow */ + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED); + goto drop; + } + if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { /* in sequence */ msk->bytes_received += copy_len; @@ -544,11 +590,10 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) } } -static bool mptcp_check_data_fin(struct sock *sk) +static void mptcp_check_data_fin(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); u64 rcv_data_fin_seq; - bool ret = false; /* Need to ack a DATA_FIN received from a peer while this side * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2. @@ -587,12 +632,10 @@ static bool mptcp_check_data_fin(struct sock *sk) break; } - ret = true; if (!__mptcp_check_fallback(msk)) mptcp_send_ack(msk); mptcp_close_wake_up(sk); } - return ret; } static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) @@ -648,7 +691,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, if (offset < skb->len) { size_t len = skb->len - offset; - ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret; + mptcp_init_skb(ssk, skb, offset, len); + skb_orphan(skb); + ret = __mptcp_move_skb(sk, skb) || ret; seq += len; if (unlikely(map_remaining < len)) { @@ -769,12 +814,8 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) moved = __mptcp_move_skbs_from_subflow(msk, ssk); __mptcp_ofo_queue(msk); - if (unlikely(ssk->sk_err)) { - if (!sock_owned_by_user(sk)) - __mptcp_error_report(sk); - else - __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags); - } + if (unlikely(ssk->sk_err)) + __mptcp_subflow_error_report(sk, ssk); /* If the moves have caught up with the DATA_FIN sequence number * it's time to ack the DATA_FIN and change socket state, but @@ -786,18 +827,10 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) return moved; } -static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk) -{ - if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf)) - WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf); -} - static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) { struct mptcp_sock *msk = mptcp_sk(sk); - __mptcp_rcvbuf_update(sk, ssk); - /* Wake-up the reader only for in-sequence data */ if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) sk->sk_data_ready(sk); @@ -1756,6 +1789,20 @@ static u32 mptcp_send_limit(const struct sock *sk) return limit - not_sent; } +static void mptcp_rps_record_subflows(const struct mptcp_sock *msk) +{ + struct mptcp_subflow_context *subflow; + + if (!rfs_is_needed()) + return; + + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk = mptcp_subflow_tcp_sock(subflow); + + sock_rps_record_flow(ssk); + } +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1769,6 +1816,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); + mptcp_rps_record_subflows(msk); + if (unlikely(inet_test_bit(DEFER_CONNECT, sk) || msg->msg_flags & MSG_FASTOPEN)) { int copied_syn = 0; @@ -1929,12 +1978,13 @@ static int __mptcp_recvmsg_mskq(struct sock *sk, } if (!(flags & MSG_PEEK)) { - /* avoid the indirect call, we know the destructor is sock_wfree */ + /* avoid the indirect call, we know the destructor is sock_rfree */ skb->destructor = NULL; + skb->sk = NULL; atomic_sub(skb->truesize, &sk->sk_rmem_alloc); sk_mem_uncharge(sk, skb->truesize); __skb_unlink(skb, &sk->sk_receive_queue); - __kfree_skb(skb); + skb_attempt_defer_free(skb); msk->bytes_consumed += count; } @@ -1999,48 +2049,26 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) if (msk->rcvq_space.copied <= msk->rcvq_space.space) goto new_measure; - if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && - !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - u64 rcvwin, grow; - int rcvbuf; - - rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss; - - grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space); - - do_div(grow, msk->rcvq_space.space); - rcvwin += (grow << 1); - - rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin), - READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); - - if (rcvbuf > sk->sk_rcvbuf) { - u32 window_clamp; - - window_clamp = mptcp_win_from_space(sk, rcvbuf); - WRITE_ONCE(sk->sk_rcvbuf, rcvbuf); + msk->rcvq_space.space = msk->rcvq_space.copied; + if (mptcp_rcvbuf_grow(sk)) { - /* Make subflows follow along. If we do not do this, we - * get drops at subflow level if skbs can't be moved to - * the mptcp rx queue fast enough (announced rcv_win can - * exceed ssk->sk_rcvbuf). - */ - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk; - bool slow; + /* Make subflows follow along. If we do not do this, we + * get drops at subflow level if skbs can't be moved to + * the mptcp rx queue fast enough (announced rcv_win can + * exceed ssk->sk_rcvbuf). + */ + mptcp_for_each_subflow(msk, subflow) { + struct sock *ssk; + bool slow; - ssk = mptcp_subflow_tcp_sock(subflow); - slow = lock_sock_fast(ssk); - WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); - WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp); - if (tcp_can_send_ack(ssk)) - tcp_cleanup_rbuf(ssk, 1); - unlock_sock_fast(ssk, slow); - } + ssk = mptcp_subflow_tcp_sock(subflow); + slow = lock_sock_fast(ssk); + tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied; + tcp_rcvbuf_grow(ssk); + unlock_sock_fast(ssk, slow); } } - msk->rcvq_space.space = msk->rcvq_space.copied; new_measure: msk->rcvq_space.copied = 0; msk->rcvq_space.time = mstamp; @@ -2069,11 +2097,6 @@ static bool __mptcp_move_skbs(struct sock *sk) if (list_empty(&msk->conn_list)) return false; - /* verify we can move any data from the subflow, eventually updating */ - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) - mptcp_for_each_subflow(msk, subflow) - __mptcp_rcvbuf_update(sk, subflow->tcp_sock); - subflow = list_first_entry(&msk->conn_list, struct mptcp_subflow_context, node); for (;;) { @@ -2147,6 +2170,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, goto out_err; } + mptcp_rps_record_subflows(msk); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); len = min_t(size_t, len, INT_MAX); @@ -2189,14 +2214,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, break; } - if (sk->sk_shutdown & RCV_SHUTDOWN) { - /* race breaker: the shutdown could be after the - * previous receive queue check - */ - if (__mptcp_move_skbs(sk)) - continue; + if (sk->sk_shutdown & RCV_SHUTDOWN) break; - } if (sk->sk_state == TCP_CLOSE) { copied = -ENOTCONN; @@ -2603,7 +2622,8 @@ static void __mptcp_retrans(struct sock *sk) if (mptcp_data_fin_enabled(msk)) { struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_retransmits++; + WRITE_ONCE(icsk->icsk_retransmits, + icsk->icsk_retransmits + 1); mptcp_set_datafin_timeout(sk); mptcp_send_ack(msk); @@ -3936,6 +3956,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, mptcp_sock_graft(ssk, newsock); } + mptcp_rps_record_subflows(msk); + /* Do late cleanup for the first subflow as necessary. Also * deal with bad peers not doing a complete shutdown. */ diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index b15d7fab5c4b..52f9cfa4ce95 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -235,7 +235,7 @@ struct mptcp_pm_data { u8 add_addr_accepted; u8 local_addr_used; u8 pm_type; - u8 subflows; + u8 extra_subflows; u8 status; ); @@ -341,8 +341,8 @@ struct mptcp_sock { struct mptcp_pm_data pm; struct mptcp_sched_ops *sched; struct { - u32 space; /* bytes copied in last measurement window */ - u32 copied; /* bytes copied in this measurement window */ + int space; /* bytes copied in last measurement window */ + int copied; /* bytes copied in this measurement window */ u64 time; /* start time of measurement window */ u64 rtt_us; /* last maximum rtt of subflows */ } rcvq_space; @@ -788,9 +788,7 @@ static inline bool mptcp_epollin_ready(const struct sock *sk) * as it can always coalesce them */ return (data_avail >= sk->sk_rcvlowat) || - (mem_cgroup_sockets_enabled && sk->sk_memcg && - mem_cgroup_under_socket_pressure(sk->sk_memcg)) || - READ_ONCE(tcp_memory_pressure); + tcp_under_memory_pressure(sk); } int mptcp_set_rcvlowat(struct sock *sk, int val); @@ -1182,15 +1180,16 @@ void __init mptcp_pm_userspace_register(void); void __init mptcp_pm_nl_init(void); void mptcp_pm_worker(struct mptcp_sock *msk); void __mptcp_pm_kernel_worker(struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk); -unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk); +u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk); +u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk); /* called under PM lock */ static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk) { - if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk)) + if (--msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk)) WRITE_ONCE(msk->pm.accept_subflow, true); } @@ -1201,6 +1200,14 @@ static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk) spin_unlock_bh(&msk->pm.lock); } +static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk) +{ + return READ_ONCE(msk->pm.remote_deny_join_id0) && + msk->pm.local_addr_used == 0 && + mptcp_pm_get_limit_add_addr_accepted(msk) == 0 && + msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk); +} + void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk); static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb) diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 2abe6f1e9940..a28a48385885 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -962,7 +962,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) memset(info, 0, sizeof(*info)); - info->mptcpi_subflows = READ_ONCE(msk->pm.subflows); + info->mptcpi_extra_subflows = READ_ONCE(msk->pm.extra_subflows); info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); @@ -972,14 +972,16 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) /* The following limits only make sense for the in-kernel PM */ if (mptcp_pm_is_kernel(msk)) { - info->mptcpi_subflows_max = - mptcp_pm_get_subflows_max(msk); - info->mptcpi_add_addr_signal_max = - mptcp_pm_get_add_addr_signal_max(msk); - info->mptcpi_add_addr_accepted_max = - mptcp_pm_get_add_addr_accept_max(msk); - info->mptcpi_local_addr_max = - mptcp_pm_get_local_addr_max(msk); + info->mptcpi_limit_extra_subflows = + mptcp_pm_get_limit_extra_subflows(msk); + info->mptcpi_endp_signal_max = + mptcp_pm_get_endp_signal_max(msk); + info->mptcpi_limit_add_addr_accepted = + mptcp_pm_get_limit_add_addr_accepted(msk); + info->mptcpi_endp_subflow_max = + mptcp_pm_get_endp_subflow_max(msk); + info->mptcpi_endp_laminar_max = + mptcp_pm_get_endp_laminar_max(msk); } if (__mptcp_check_fallback(msk)) @@ -996,7 +998,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_bytes_sent = msk->bytes_sent; info->mptcpi_bytes_received = msk->bytes_received; info->mptcpi_bytes_retrans = msk->bytes_retrans; - info->mptcpi_subflows_total = info->mptcpi_subflows + + info->mptcpi_subflows_total = info->mptcpi_extra_subflows + __mptcp_has_initial_subflow(msk); now = tcp_jiffies32; info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index f31a3a79531a..e8325890a322 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -1721,19 +1721,14 @@ static void mptcp_attach_cgroup(struct sock *parent, struct sock *child) /* only the additional subflows created by kworkers have to be modified */ if (cgroup_id(sock_cgroup_ptr(parent_skcd)) != cgroup_id(sock_cgroup_ptr(child_skcd))) { -#ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = parent->sk_memcg; - - mem_cgroup_sk_free(child); - if (memcg && css_tryget(&memcg->css)) - child->sk_memcg = memcg; -#endif /* CONFIG_MEMCG */ - cgroup_sk_free(child_skcd); *child_skcd = *parent_skcd; cgroup_sk_clone(child_skcd); } #endif /* CONFIG_SOCK_CGROUP_DATA */ + + if (mem_cgroup_sockets_enabled) + mem_cgroup_sk_inherit(parent, child); } static void mptcp_subflow_ops_override(struct sock *ssk) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 5251524b96af..5e4453e9ef8e 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -63,7 +63,7 @@ struct hbucket { : jhash_size((htable_bits) - HTABLE_REGION_BITS)) #define ahash_sizeof_regions(htable_bits) \ (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) -#define ahash_region(n, htable_bits) \ +#define ahash_region(n) \ ((n) / jhash_size(HTABLE_REGION_BITS)) #define ahash_bucket_start(h, htable_bits) \ ((htable_bits) < HTABLE_REGION_BITS ? 0 \ @@ -702,7 +702,7 @@ retry: #endif key = HKEY(data, h->initval, htable_bits); m = __ipset_dereference(hbucket(t, key)); - nr = ahash_region(key, htable_bits); + nr = ahash_region(key); if (!m) { m = kzalloc(sizeof(*m) + AHASH_INIT_SIZE * dsize, @@ -852,7 +852,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - r = ahash_region(key, t->htable_bits); + r = ahash_region(key); atomic_inc(&t->uref); elements = t->hregion[r].elements; maxelem = t->maxelem; @@ -1050,7 +1050,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, rcu_read_lock_bh(); t = rcu_dereference_bh(h->table); key = HKEY(value, h->initval, t->htable_bits); - r = ahash_region(key, t->htable_bits); + r = ahash_region(key); atomic_inc(&t->uref); rcu_read_unlock_bh(); diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 965f3c8e5089..37ebb0cb62b8 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -885,7 +885,7 @@ static void ip_vs_conn_expire(struct timer_list *t) * conntrack cleanup for the net. */ smp_rmb(); - if (ipvs->enable) + if (READ_ONCE(ipvs->enable)) ip_vs_conn_drop_conntrack(cp); } @@ -1439,7 +1439,7 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs) cond_resched_rcu(); /* netns clean up started, abort delayed work */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) break; } rcu_read_unlock(); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c7a8a08b7308..5ea7ab8bf4dc 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1353,9 +1353,6 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat if (unlikely(!skb_dst(skb))) return NF_ACCEPT; - if (!ipvs->enable) - return NF_ACCEPT; - ip_vs_fill_iph_skb(af, skb, false, &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1940,7 +1937,7 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state return NF_ACCEPT; } /* ipvs enabled in this netns ? */ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + if (unlikely(sysctl_backup_only(ipvs))) return NF_ACCEPT; ip_vs_fill_iph_skb(af, skb, false, &iph); @@ -2108,7 +2105,7 @@ ip_vs_forward_icmp(void *priv, struct sk_buff *skb, int r; /* ipvs enabled in this netns ? */ - if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable)) + if (unlikely(sysctl_backup_only(ipvs))) return NF_ACCEPT; if (state->pf == NFPROTO_IPV4) { @@ -2295,7 +2292,7 @@ static int __net_init __ip_vs_init(struct net *net) return -ENOMEM; /* Hold the beast until a service is registered */ - ipvs->enable = 0; + WRITE_ONCE(ipvs->enable, 0); ipvs->net = net; /* Counters used for creating unique names */ ipvs->gen = atomic_read(&ipvs_netns_cnt); @@ -2367,7 +2364,7 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list) ipvs = net_ipvs(net); ip_vs_unregister_hooks(ipvs, AF_INET); ip_vs_unregister_hooks(ipvs, AF_INET6); - ipvs->enable = 0; /* Disable packet reception */ + WRITE_ONCE(ipvs->enable, 0); /* Disable packet reception */ smp_wmb(); ip_vs_sync_net_cleanup(ipvs); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6a6fc4478533..4c8fa22be88a 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -256,7 +256,7 @@ static void est_reload_work_handler(struct work_struct *work) struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; /* netns clean up started, abort delayed work */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) goto unlock; if (!kd) continue; @@ -1483,9 +1483,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, *svc_p = svc; - if (!ipvs->enable) { + if (!READ_ONCE(ipvs->enable)) { /* Now there is a service - full throttle */ - ipvs->enable = 1; + WRITE_ONCE(ipvs->enable, 1); /* Start estimation for first time */ ip_vs_est_reload_start(ipvs); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 15049b826732..93a925f1ed9b 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -231,7 +231,7 @@ static int ip_vs_estimation_kthread(void *data) void ip_vs_est_reload_start(struct netns_ipvs *ipvs) { /* Ignore reloads before first service is added */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) return; ip_vs_est_stopped_recalc(ipvs); /* Bump the kthread configuration genid */ @@ -306,7 +306,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) int i; if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && - ipvs->enable && ipvs->est_max_threads) + READ_ONCE(ipvs->enable) && ipvs->est_max_threads) return -EINVAL; mutex_lock(&ipvs->est_mutex); @@ -343,7 +343,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) } /* Start kthread tasks only when services are present */ - if (ipvs->enable && !ip_vs_est_stopped(ipvs)) { + if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) { ret = ip_vs_est_kthread_start(ipvs, kd); if (ret < 0) goto out; @@ -486,7 +486,7 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) struct ip_vs_estimator *est = &stats->est; int ret; - if (!ipvs->est_max_threads && ipvs->enable) + if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable)) ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); est->ktid = -1; @@ -663,7 +663,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) /* Wait for cpufreq frequency transition */ wait_event_idle_timeout(wq, kthread_should_stop(), HZ / 50); - if (!ipvs->enable || kthread_should_stop()) + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) goto stop; } @@ -681,7 +681,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max) rcu_read_unlock(); local_bh_enable(); - if (!ipvs->enable || kthread_should_stop()) + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) goto stop; cond_resched(); @@ -757,7 +757,7 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) mutex_lock(&ipvs->est_mutex); for (id = 1; id < ipvs->est_kt_count; id++) { /* netns clean up started, abort */ - if (!ipvs->enable) + if (!READ_ONCE(ipvs->enable)) goto unlock2; kd = ipvs->est_kt_arr[id]; if (!kd) @@ -787,7 +787,7 @@ last_kt: id = ipvs->est_kt_count; next_kt: - if (!ipvs->enable || kthread_should_stop()) + if (!READ_ONCE(ipvs->enable) || kthread_should_stop()) goto unlock; id--; if (id < 0) diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index d8a284999544..206c6700e200 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -53,6 +53,7 @@ enum { IP_VS_FTP_EPSV, }; +static bool exiting_module; /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper * First port is set to the default port. @@ -605,7 +606,7 @@ static void __ip_vs_ftp_exit(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); - if (!ipvs) + if (!ipvs || !exiting_module) return; unregister_ip_vs_app(ipvs, &ip_vs_ftp); @@ -627,6 +628,7 @@ static int __init ip_vs_ftp_init(void) */ static void __exit ip_vs_ftp_exit(void) { + exiting_module = true; unregister_pernet_subsys(&ip_vs_ftp_ops); /* rcu_barrier() is called by netns */ } diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index af68c64acaab..81baf2082604 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -301,7 +301,7 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state) net->ct.ecache_dwork_pending = true; } else if (state == NFCT_ECACHE_DESTROY_SENT) { if (!hlist_nulls_empty(&cnet->ecache.dying_list)) - mod_delayed_work(system_wq, &cnet->ecache.dwork, 0); + mod_delayed_work(system_percpu_wq, &cnet->ecache.dwork, 0); else net->ct.ecache_dwork_pending = false; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 50fd6809380f..3a04665adf99 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -60,7 +60,7 @@ MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("List and change connection tracking table"); struct ctnetlink_list_dump_ctx { - struct nf_conn *last; + unsigned long last_id; unsigned int cpu; bool done; }; @@ -1733,16 +1733,6 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb, return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid); } -static int ctnetlink_done_list(struct netlink_callback *cb) -{ - struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; - - if (ctx->last) - nf_ct_put(ctx->last); - - return 0; -} - #ifdef CONFIG_NF_CONNTRACK_EVENTS static int ctnetlink_dump_one_entry(struct sk_buff *skb, struct netlink_callback *cb, @@ -1757,11 +1747,11 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb, if (l3proto && nf_ct_l3num(ct) != l3proto) return 0; - if (ctx->last) { - if (ct != ctx->last) + if (ctx->last_id) { + if (ctnetlink_get_id(ct) != ctx->last_id) return 0; - ctx->last = NULL; + ctx->last_id = 0; } /* We can't dump extension info for the unconfirmed @@ -1775,12 +1765,8 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb, cb->nlh->nlmsg_seq, NFNL_MSG_TYPE(cb->nlh->nlmsg_type), ct, dying, 0); - if (res < 0) { - if (!refcount_inc_not_zero(&ct->ct_general.use)) - return 0; - - ctx->last = ct; - } + if (res < 0) + ctx->last_id = ctnetlink_get_id(ct); return res; } @@ -1796,10 +1782,10 @@ static int ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) { struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx; - struct nf_conn *last = ctx->last; #ifdef CONFIG_NF_CONNTRACK_EVENTS const struct net *net = sock_net(skb->sk); struct nf_conntrack_net_ecache *ecache_net; + unsigned long last_id = ctx->last_id; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; #endif @@ -1807,7 +1793,7 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) if (ctx->done) return 0; - ctx->last = NULL; + ctx->last_id = 0; #ifdef CONFIG_NF_CONNTRACK_EVENTS ecache_net = nf_conn_pernet_ecache(net); @@ -1818,24 +1804,21 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb) int res; ct = nf_ct_tuplehash_to_ctrack(h); - if (last && last != ct) + if (last_id && last_id != ctnetlink_get_id(ct)) continue; res = ctnetlink_dump_one_entry(skb, cb, ct, true); if (res < 0) { spin_unlock_bh(&ecache_net->dying_lock); - nf_ct_put(last); return skb->len; } - nf_ct_put(last); - last = NULL; + last_id = 0; } spin_unlock_bh(&ecache_net->dying_lock); #endif ctx->done = true; - nf_ct_put(last); return skb->len; } @@ -1847,7 +1830,6 @@ static int ctnetlink_get_ct_dying(struct sk_buff *skb, if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_dying, - .done = ctnetlink_done_list, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } @@ -1862,7 +1844,6 @@ static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb, if (info->nlh->nlmsg_flags & NLM_F_DUMP) { struct netlink_dump_control c = { .dump = ctnetlink_dump_unconfirmed, - .done = ctnetlink_done_list, }; return netlink_dump_start(info->sk, skb, info->nlh, &c); } diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 1f14ef0436c6..708b79380f04 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -317,6 +317,9 @@ static int ct_seq_show(struct seq_file *s, void *v) smp_acquire__after_ctrl_dep(); if (nf_ct_should_gc(ct)) { + struct ct_iter_state *st = s->private; + + st->skip_elems--; nf_ct_kill(ct); goto release; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c3c73411c40c..eed434e0a970 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -151,12 +151,12 @@ static void nft_ctx_init(struct nft_ctx *ctx, bitmap_zero(ctx->reg_inited, NFT_REG32_NUM); } -static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, - int msg_type, u32 size, gfp_t gfp) +static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, + int msg_type, u32 size) { struct nft_trans *trans; - trans = kzalloc(size, gfp); + trans = kzalloc(size, GFP_KERNEL); if (trans == NULL) return NULL; @@ -172,12 +172,6 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx, return trans; } -static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx, - int msg_type, u32 size) -{ - return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL); -} - static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans) { switch (trans->msg_type) { @@ -442,8 +436,7 @@ static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a, static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, struct nft_trans_elem *tail, - struct nft_trans_elem *trans, - gfp_t gfp) + struct nft_trans_elem *trans) { unsigned int nelems, old_nelems = tail->nelems; struct nft_trans_elem *new_trans; @@ -466,9 +459,11 @@ static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, /* krealloc might free tail which invalidates list pointers */ list_del_init(&tail->nft_trans.list); - new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp); + new_trans = krealloc(tail, struct_size(tail, elems, nelems), + GFP_KERNEL); if (!new_trans) { - list_add_tail(&tail->nft_trans.list, &nft_net->commit_list); + list_add_tail(&tail->nft_trans.list, + &nft_net->commit_list); return false; } @@ -484,7 +479,7 @@ static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net, } static bool nft_trans_try_collapse(struct nftables_pernet *nft_net, - struct nft_trans *trans, gfp_t gfp) + struct nft_trans *trans) { struct nft_trans *tail; @@ -501,7 +496,7 @@ static bool nft_trans_try_collapse(struct nftables_pernet *nft_net, case NFT_MSG_DELSETELEM: return nft_trans_collapse_set_elem(nft_net, nft_trans_container_elem(tail), - nft_trans_container_elem(trans), gfp); + nft_trans_container_elem(trans)); } return false; @@ -537,17 +532,14 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr } } -static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans, - gfp_t gfp) +static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans) { struct nftables_pernet *nft_net = nft_pernet(net); WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM && trans->msg_type != NFT_MSG_DELSETELEM); - might_alloc(gfp); - - if (nft_trans_try_collapse(nft_net, trans, gfp)) { + if (nft_trans_try_collapse(nft_net, trans)) { kfree(trans); return; } @@ -7573,7 +7565,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } ue->priv = elem_priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); + nft_trans_commit_list_add_elem(ctx->net, trans); goto err_elem_free; } } @@ -7597,7 +7589,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, } nft_trans_container_elem(trans)->elems[0].priv = elem.priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; err_set_full: @@ -7863,7 +7855,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, nft_setelem_data_deactivate(ctx->net, set, elem.priv); nft_trans_container_elem(trans)->elems[0].priv = elem.priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; fail_ops: @@ -7888,9 +7880,8 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, if (!nft_set_elem_active(ext, iter->genmask)) return 0; - trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM, - struct_size_t(struct nft_trans_elem, elems, 1), - GFP_ATOMIC); + trans = nft_trans_alloc(ctx, NFT_MSG_DELSETELEM, + struct_size_t(struct nft_trans_elem, elems, 1)); if (!trans) return -ENOMEM; @@ -7901,7 +7892,7 @@ static int nft_setelem_flush(const struct nft_ctx *ctx, nft_trans_elem_set(trans) = set; nft_trans_container_elem(trans)->nelems = 1; nft_trans_container_elem(trans)->elems[0].priv = elem_priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_ATOMIC); + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; } @@ -7918,7 +7909,7 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx, nft_setelem_data_deactivate(ctx->net, set, elem_priv); nft_trans_container_elem(trans)->elems[0].priv = elem_priv; - nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL); + nft_trans_commit_list_add_elem(ctx->net, trans); return 0; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index e598a2a252b0..811d02b4c4f7 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -376,6 +376,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, const struct nfnetlink_subsystem *ss; const struct nfnl_callback *nc; struct netlink_ext_ack extack; + struct nlmsghdr *onlh = nlh; LIST_HEAD(err_list); u32 status; int err; @@ -386,6 +387,7 @@ replay: status = 0; replay_abort: skb = netlink_skb_clone(oskb, GFP_KERNEL); + nlh = onlh; if (!skb) return netlink_ack(oskb, nlh, -ENOMEM, NULL); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index 225ff293cd50..14dd1c0698c3 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -9,7 +9,7 @@ #include <linux/netfilter/nf_conntrack_common.h> #include <linux/netfilter/nf_tables.h> #include <net/ip.h> -#include <net/inet_dscp.h> +#include <net/flow.h> #include <net/netfilter/nf_tables.h> #include <net/netfilter/nf_tables_core.h> #include <net/netfilter/nf_conntrack_core.h> @@ -236,7 +236,7 @@ static int nft_flow_route(const struct nft_pktinfo *pkt, fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip; fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex; fl.u.ip4.flowi4_iif = this_dst->dev->ifindex; - fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb))); + fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb)); fl.u.ip4.flowi4_mark = pkt->skb->mark; fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC; break; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 7dfc5343dae4..b0214418f75a 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -40,7 +40,7 @@ static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off, /* add vlan header into the user buffer for if tag was removed by offloads */ static bool -nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len) +nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len) { int mac_off = skb_mac_header(skb) - skb->data; u8 *vlanh, *dst_u8 = (u8 *) d; @@ -212,7 +212,7 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_PAYLOAD_OFFSET] = { .type = NLA_BE32 }, [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), @@ -684,7 +684,7 @@ static const struct nft_expr_ops nft_payload_inner_ops = { static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) { - *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum)); + csum_replace4(sum, (__force __be32)fsum, (__force __be32)tsum); if (*sum == 0) *sum = CSUM_MANGLED_0; } @@ -797,7 +797,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, struct nft_payload_set { enum nft_payload_bases base:8; - u8 offset; + u16 offset; u8 len; u8 sreg; u8 csum_type; @@ -812,7 +812,7 @@ struct nft_payload_vlan_hdr { }; static bool -nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u8 offset, u8 len, +nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len, int *vlan_hlen) { struct nft_payload_vlan_hdr *vlanh; @@ -940,14 +940,18 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) { + u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE; struct nft_payload_set *priv = nft_expr_priv(expr); - u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE; int err; priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); - priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); + if (err < 0) + return err; + priv->offset = offset; + if (tb[NFTA_PAYLOAD_CSUM_TYPE]) csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE])); if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) { @@ -1069,7 +1073,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, if (tb[NFTA_PAYLOAD_DREG] == NULL) return ERR_PTR(-EINVAL); - err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset); + err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset); if (err < 0) return ERR_PTR(err); diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 266d0c637225..ba01ce75d6de 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -30,6 +30,7 @@ struct nft_rhash { struct nft_rhash_elem { struct nft_elem_priv priv; struct rhash_head node; + struct llist_node walk_node; u32 wq_gc_seq; struct nft_set_ext ext; }; @@ -144,6 +145,7 @@ nft_rhash_update(struct nft_set *set, const u32 *key, goto err1; he = nft_elem_priv_cast(elem_priv); + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -180,6 +182,7 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set, }; struct nft_rhash_elem *prev; + init_llist_node(&he->walk_node); prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, nft_rhash_params); if (IS_ERR(prev)) @@ -261,12 +264,12 @@ static bool nft_rhash_delete(const struct nft_set *set, return true; } -static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rhash_walk_ro(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rhash *priv = nft_set_priv(set); - struct nft_rhash_elem *he; struct rhashtable_iter hti; + struct nft_rhash_elem *he; rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_start(&hti); @@ -295,6 +298,97 @@ cont: rhashtable_walk_exit(&hti); } +static void nft_rhash_walk_update(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_rhash *priv = nft_set_priv(set); + struct nft_rhash_elem *he, *tmp; + struct llist_node *first_node; + struct rhashtable_iter hti; + LLIST_HEAD(walk_list); + + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + + if (set->in_update_walk) { + /* This can happen with bogus rulesets during ruleset validation + * when a verdict map causes a jump back to the same map. + * + * Without this extra check the walk_next loop below will see + * elems on the callers walk_list and skip (not validate) them. + */ + iter->err = -EMLINK; + return; + } + + /* walk happens under RCU. + * + * We create a snapshot list so ->iter callback can sleep. + * commit_mutex is held, elements can ... + * .. be added in parallel from dataplane (dynset) + * .. be marked as dead in parallel from dataplane (dynset). + * .. be queued for removal in parallel (gc timeout). + * .. not be freed: transaction mutex is held. + */ + rhashtable_walk_enter(&priv->ht, &hti); + rhashtable_walk_start(&hti); + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { + if (PTR_ERR(he) != -EAGAIN) { + iter->err = PTR_ERR(he); + break; + } + + continue; + } + + /* rhashtable resized during walk, skip */ + if (llist_on_list(&he->walk_node)) + continue; + + llist_add(&he->walk_node, &walk_list); + } + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + + first_node = __llist_del_all(&walk_list); + set->in_update_walk = true; + llist_for_each_entry_safe(he, tmp, first_node, walk_node) { + if (iter->err == 0) { + iter->err = iter->fn(ctx, set, iter, &he->priv); + if (iter->err == 0) + iter->count++; + } + + /* all entries must be cleared again, else next ->walk iteration + * will skip entries. + */ + init_llist_node(&he->walk_node); + } + set->in_update_walk = false; +} + +static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, + struct nft_set_iter *iter) +{ + switch (iter->type) { + case NFT_ITER_UPDATE: + /* only relevant for netlink dumps which use READ type */ + WARN_ON_ONCE(iter->skip != 0); + + nft_rhash_walk_update(ctx, set, iter); + break; + case NFT_ITER_READ: + nft_rhash_walk_ro(ctx, set, iter); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } +} + static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, struct nft_set_ext *ext) { diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index 793790d79d13..112fe46788b6 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -397,7 +397,7 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, } /** - * pipapo_get() - Get matching element reference given key data + * pipapo_get_slow() - Get matching element reference given key data * @m: storage containing the set elements * @data: Key data to be matched against existing elements * @genmask: If set, check that element is active in given genmask @@ -414,12 +414,12 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules, * * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. */ -static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, - const u8 *data, u8 genmask, - u64 tstamp) +static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { + unsigned long *res_map, *fill_map, *map; struct nft_pipapo_scratch *scratch; - unsigned long *res_map, *fill_map; const struct nft_pipapo_field *f; bool map_index; int i; @@ -429,11 +429,13 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, scratch = *raw_cpu_ptr(m->scratch); if (unlikely(!scratch)) goto out; + __local_lock_nested_bh(&scratch->bh_lock); map_index = scratch->map_index; - res_map = scratch->map + (map_index ? m->bsize_max : 0); - fill_map = scratch->map + (map_index ? 0 : m->bsize_max); + map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]); + res_map = map + (map_index ? m->bsize_max : 0); + fill_map = map + (map_index ? 0 : m->bsize_max); pipapo_resmap_init(m, res_map); @@ -464,6 +466,7 @@ next_match: last); if (b < 0) { scratch->map_index = map_index; + __local_unlock_nested_bh(&scratch->bh_lock); local_bh_enable(); return NULL; @@ -483,6 +486,7 @@ next_match: * *next* bitmap (not initial) for the next packet. */ scratch->map_index = map_index; + __local_unlock_nested_bh(&scratch->bh_lock); local_bh_enable(); return e; } @@ -497,12 +501,47 @@ next_match: data += NFT_PIPAPO_GROUPS_PADDING(f); } + __local_unlock_nested_bh(&scratch->bh_lock); out: local_bh_enable(); return NULL; } /** + * pipapo_get() - Get matching element reference given key data + * @m: Storage containing the set elements + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * @tstamp: Timestamp to check for expired elements + * + * This is a dispatcher function, either calling out the generic C + * implementation or, if available, the AVX2 one. + * This helper is only called from the control plane, with either RCU + * read lock or transaction mutex held. + * + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. + */ +static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) +{ + struct nft_pipapo_elem *e; + + local_bh_disable(); + +#if defined(CONFIG_X86_64) && !defined(CONFIG_UML) + if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable()) { + e = pipapo_get_avx2(m, data, genmask, tstamp); + local_bh_enable(); + return e; + } +#endif + e = pipapo_get_slow(m, data, genmask, tstamp); + local_bh_enable(); + return e; +} + +/** * nft_pipapo_lookup() - Dataplane fronted for main lookup function * @net: Network namespace * @set: nftables API set representation @@ -510,8 +549,7 @@ out: * * This function is called from the data path. It will search for * an element matching the given key in the current active copy. - * Unlike other set types, this uses NFT_GENMASK_ANY instead of - * nft_genmask_cur(). + * Unlike other set types, this uses 0 instead of nft_genmask_cur(). * * This is because new (future) elements are not reachable from * priv->match, they get added to priv->clone instead. @@ -521,8 +559,8 @@ out: * inconsistent state: matching old entries get skipped but thew * newly matching entries are unreachable. * - * GENMASK will still find the 'now old' entries which ensures consistent - * priv->match view. + * GENMASK_ANY doesn't work for the same reason: old-gen entries get + * skipped, new-gen entries are only reachable from priv->clone. * * nft_pipapo_commit swaps ->clone and ->match shortly after the * genbit flip. As ->clone doesn't contain the old entries in the first @@ -539,7 +577,7 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set, const struct nft_pipapo_elem *e; m = rcu_dereference(priv->match); - e = pipapo_get(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64()); + e = pipapo_get_slow(m, (const u8 *)key, 0, get_jiffies_64()); return e ? &e->ext : NULL; } @@ -1152,22 +1190,17 @@ static void pipapo_map(struct nft_pipapo_match *m, } /** - * pipapo_free_scratch() - Free per-CPU map at original (not aligned) address + * pipapo_free_scratch() - Free per-CPU map at original address * @m: Matching data * @cpu: CPU number */ static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu) { struct nft_pipapo_scratch *s; - void *mem; s = *per_cpu_ptr(m->scratch, cpu); - if (!s) - return; - mem = s; - mem -= s->align_off; - kvfree(mem); + kvfree(s); } /** @@ -1184,11 +1217,8 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, for_each_possible_cpu(i) { struct nft_pipapo_scratch *scratch; -#ifdef NFT_PIPAPO_ALIGN - void *scratch_aligned; - u32 align_off; -#endif - scratch = kvzalloc_node(struct_size(scratch, map, bsize_max * 2) + + + scratch = kvzalloc_node(struct_size(scratch, __map, bsize_max * 2) + NFT_PIPAPO_ALIGN_HEADROOM, GFP_KERNEL_ACCOUNT, cpu_to_node(i)); if (!scratch) { @@ -1203,23 +1233,7 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone, } pipapo_free_scratch(clone, i); - -#ifdef NFT_PIPAPO_ALIGN - /* Align &scratch->map (not the struct itself): the extra - * %NFT_PIPAPO_ALIGN_HEADROOM bytes passed to kzalloc_node() - * above guarantee we can waste up to those bytes in order - * to align the map field regardless of its offset within - * the struct. - */ - BUILD_BUG_ON(offsetof(struct nft_pipapo_scratch, map) > NFT_PIPAPO_ALIGN_HEADROOM); - - scratch_aligned = NFT_PIPAPO_LT_ALIGN(&scratch->map); - scratch_aligned -= offsetof(struct nft_pipapo_scratch, map); - align_off = scratch_aligned - (void *)scratch; - - scratch = scratch_aligned; - scratch->align_off = align_off; -#endif + local_lock_init(&scratch->bh_lock); *per_cpu_ptr(clone->scratch, i) = scratch; } diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h index 4a2ff85ce1c4..eaab422aa56a 100644 --- a/net/netfilter/nft_set_pipapo.h +++ b/net/netfilter/nft_set_pipapo.h @@ -124,14 +124,14 @@ struct nft_pipapo_field { /** * struct nft_pipapo_scratch - percpu data used for lookup and matching + * @bh_lock: PREEMPT_RT local spinlock * @map_index: Current working bitmap index, toggled between field matches - * @align_off: Offset to get the originally allocated address - * @map: store partial matching results during lookup + * @__map: store partial matching results during lookup */ struct nft_pipapo_scratch { + local_lock_t bh_lock; u8 map_index; - u32 align_off; - unsigned long map[]; + unsigned long __map[]; }; /** diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c index c0884fa68c79..7ff90325c97f 100644 --- a/net/netfilter/nft_set_pipapo_avx2.c +++ b/net/netfilter/nft_set_pipapo_avx2.c @@ -1099,7 +1099,7 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, desc->field_count < NFT_PIPAPO_MIN_FIELDS) return false; - if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX)) + if (!boot_cpu_has(X86_FEATURE_AVX2)) return false; est->size = pipapo_estimate_size(desc); @@ -1133,75 +1133,59 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, uns } /** - * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation - * @net: Network namespace - * @set: nftables API set representation - * @key: nftables API element representation containing key data + * pipapo_get_avx2() - Lookup function for AVX2 implementation + * @m: Storage containing the set elements + * @data: Key data to be matched against existing elements + * @genmask: If set, check that element is active in given genmask + * @tstamp: Timestamp to check for expired elements * * For more details, see DOC: Theory of Operation in nft_set_pipapo.c. * * This implementation exploits the repetitive characteristic of the algorithm * to provide a fast, vectorised version using the AVX2 SIMD instruction set. * - * Return: true on match, false otherwise. + * The caller must check that the FPU is usable. + * This function must be called with BH disabled. + * + * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise. */ -const struct nft_set_ext * -nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, - const u32 *key) +struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp) { - struct nft_pipapo *priv = nft_set_priv(set); - const struct nft_set_ext *ext = NULL; struct nft_pipapo_scratch *scratch; - const struct nft_pipapo_match *m; const struct nft_pipapo_field *f; - const u8 *rp = (const u8 *)key; - unsigned long *res, *fill; + unsigned long *res, *fill, *map; bool map_index; int i; - local_bh_disable(); - - if (unlikely(!irq_fpu_usable())) { - ext = nft_pipapo_lookup(net, set, key); + scratch = *raw_cpu_ptr(m->scratch); + if (unlikely(!scratch)) + return NULL; - local_bh_enable(); - return ext; - } + __local_lock_nested_bh(&scratch->bh_lock); + map_index = scratch->map_index; + map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]); + res = map + (map_index ? m->bsize_max : 0); + fill = map + (map_index ? 0 : m->bsize_max); - m = rcu_dereference(priv->match); + pipapo_resmap_init_avx2(m, res); - /* This also protects access to all data related to scratch maps. - * - * Note that we don't need a valid MXCSR state for any of the + /* Note that we don't need a valid MXCSR state for any of the * operations we use here, so pass 0 as mask and spare a LDMXCSR * instruction. */ kernel_fpu_begin_mask(0); - scratch = *raw_cpu_ptr(m->scratch); - if (unlikely(!scratch)) { - kernel_fpu_end(); - local_bh_enable(); - return NULL; - } - - map_index = scratch->map_index; - - res = scratch->map + (map_index ? m->bsize_max : 0); - fill = scratch->map + (map_index ? 0 : m->bsize_max); - - pipapo_resmap_init_avx2(m, res); - nft_pipapo_avx2_prepare(); -next_match: nft_pipapo_for_each_field(f, i, m) { bool last = i == m->field_count - 1, first = !i; int ret = 0; #define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \ (ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \ - ret, rp, \ + ret, data, \ first, last)) if (likely(f->bb == 8)) { @@ -1217,7 +1201,7 @@ next_match: NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16); } else { ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, - ret, rp, + ret, data, first, last); } } else { @@ -1233,7 +1217,7 @@ next_match: NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32); } else { ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f, - ret, rp, + ret, data, first, last); } } @@ -1241,28 +1225,78 @@ next_match: #undef NFT_SET_PIPAPO_AVX2_LOOKUP - if (ret < 0) - goto out; +next_match: + if (ret < 0) { + scratch->map_index = map_index; + kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); + return NULL; + } if (last) { - const struct nft_set_ext *e = &f->mt[ret].e->ext; + struct nft_pipapo_elem *e; - if (unlikely(nft_set_elem_expired(e))) + e = f->mt[ret].e; + if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) || + !nft_set_elem_active(&e->ext, genmask))) { + ret = pipapo_refill(res, f->bsize, f->rules, + fill, f->mt, last); goto next_match; + } - ext = e; - goto out; + scratch->map_index = map_index; + kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); + return e; } + map_index = !map_index; swap(res, fill); - rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); + data += NFT_PIPAPO_GROUPS_PADDED_SIZE(f); } -out: - if (i % 2) - scratch->map_index = !map_index; kernel_fpu_end(); + __local_unlock_nested_bh(&scratch->bh_lock); + return NULL; +} + +/** + * nft_pipapo_avx2_lookup() - Dataplane frontend for AVX2 implementation + * @net: Network namespace + * @set: nftables API set representation + * @key: nftables API element representation containing key data + * + * This function is called from the data path. It will search for + * an element matching the given key in the current active copy using + * the AVX2 routines if the FPU is usable or fall back to the generic + * implementation of the algorithm otherwise. + * + * Return: nftables API extension pointer or NULL if no match. + */ +const struct nft_set_ext * +nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set, + const u32 *key) +{ + struct nft_pipapo *priv = nft_set_priv(set); + const struct nft_pipapo_match *m; + const u8 *rp = (const u8 *)key; + const struct nft_pipapo_elem *e; + + local_bh_disable(); + + if (unlikely(!irq_fpu_usable())) { + const struct nft_set_ext *ext; + + ext = nft_pipapo_lookup(net, set, key); + + local_bh_enable(); + return ext; + } + + m = rcu_dereference(priv->match); + + e = pipapo_get_avx2(m, rp, 0, get_jiffies_64()); local_bh_enable(); - return ext; + return e ? &e->ext : NULL; } diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h index dbb6aaca8a7a..c2999b63da3f 100644 --- a/net/netfilter/nft_set_pipapo_avx2.h +++ b/net/netfilter/nft_set_pipapo_avx2.h @@ -5,8 +5,12 @@ #include <asm/fpu/xstate.h> #define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE) +struct nft_pipapo_match; bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features, struct nft_set_estimate *est); +struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m, + const u8 *data, u8 genmask, + u64 tstamp); #endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */ #endif /* _NFT_SET_PIPAPO_AVX2_H */ diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index b1f04168ec93..ca594161b840 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -584,15 +584,14 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set, return NULL; } -static void nft_rbtree_walk(const struct nft_ctx *ctx, - struct nft_set *set, - struct nft_set_iter *iter) +static void nft_rbtree_do_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) { struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct rb_node *node; - read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); @@ -600,14 +599,34 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, goto cont; iter->err = iter->fn(ctx, set, iter, &rbe->priv); - if (iter->err < 0) { - read_unlock_bh(&priv->lock); + if (iter->err < 0) return; - } cont: iter->count++; } - read_unlock_bh(&priv->lock); +} + +static void nft_rbtree_walk(const struct nft_ctx *ctx, + struct nft_set *set, + struct nft_set_iter *iter) +{ + struct nft_rbtree *priv = nft_set_priv(set); + + switch (iter->type) { + case NFT_ITER_UPDATE: + lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex); + nft_rbtree_do_walk(ctx, set, iter); + break; + case NFT_ITER_READ: + read_lock_bh(&priv->lock); + nft_rbtree_do_walk(ctx, set, iter); + read_unlock_bh(&priv->lock); + break; + default: + iter->err = -EINVAL; + WARN_ON_ONCE(1); + break; + } } static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index e2f7080dd5d7..2b46c0cd752a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -356,7 +356,7 @@ static void netlink_overrun(struct sock *sk) sk_error_report(sk); } } - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); } static void netlink_rcv_wake(struct sock *sk) @@ -2711,7 +2711,7 @@ static int netlink_native_seq_show(struct seq_file *seq, void *v) sk_wmem_alloc_get(s), READ_ONCE(nlk->cb_running), refcount_read(&s->sk_refcnt), - atomic_read(&s->sk_drops), + sk_drops_read(s), sock_i_ino(s) ); diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c index a818eff27e6b..418b84e2b260 100644 --- a/net/nfc/nci/ntf.c +++ b/net/nfc/nci/ntf.c @@ -27,11 +27,16 @@ /* Handle NCI Notification packets */ -static void nci_core_reset_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_core_reset_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { /* Handle NCI 2.x core reset notification */ - const struct nci_core_reset_ntf *ntf = (void *)skb->data; + const struct nci_core_reset_ntf *ntf; + + if (skb->len < sizeof(struct nci_core_reset_ntf)) + return -EINVAL; + + ntf = (struct nci_core_reset_ntf *)skb->data; ndev->nci_ver = ntf->nci_ver; pr_debug("nci_ver 0x%x, config_status 0x%x\n", @@ -42,15 +47,22 @@ static void nci_core_reset_ntf_packet(struct nci_dev *ndev, __le32_to_cpu(ntf->manufact_specific_info); nci_req_complete(ndev, NCI_STATUS_OK); + + return 0; } -static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) +static int nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, + struct sk_buff *skb) { - struct nci_core_conn_credit_ntf *ntf = (void *) skb->data; + struct nci_core_conn_credit_ntf *ntf; struct nci_conn_info *conn_info; int i; + if (skb->len < sizeof(struct nci_core_conn_credit_ntf)) + return -EINVAL; + + ntf = (struct nci_core_conn_credit_ntf *)skb->data; + pr_debug("num_entries %d\n", ntf->num_entries); if (ntf->num_entries > NCI_MAX_NUM_CONN) @@ -68,7 +80,7 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, conn_info = nci_get_conn_info_by_conn_id(ndev, ntf->conn_entries[i].conn_id); if (!conn_info) - return; + return 0; atomic_add(ntf->conn_entries[i].credits, &conn_info->credits_cnt); @@ -77,12 +89,19 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev, /* trigger the next tx */ if (!skb_queue_empty(&ndev->tx_q)) queue_work(ndev->tx_wq, &ndev->tx_work); + + return 0; } -static void nci_core_generic_error_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_core_generic_error_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { - __u8 status = skb->data[0]; + __u8 status; + + if (skb->len < 1) + return -EINVAL; + + status = skb->data[0]; pr_debug("status 0x%x\n", status); @@ -91,12 +110,19 @@ static void nci_core_generic_error_ntf_packet(struct nci_dev *ndev, (the state remains the same) */ nci_req_complete(ndev, status); } + + return 0; } -static void nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, - struct sk_buff *skb) +static int nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, + struct sk_buff *skb) { - struct nci_core_intf_error_ntf *ntf = (void *) skb->data; + struct nci_core_intf_error_ntf *ntf; + + if (skb->len < sizeof(struct nci_core_intf_error_ntf)) + return -EINVAL; + + ntf = (struct nci_core_intf_error_ntf *)skb->data; ntf->conn_id = nci_conn_id(&ntf->conn_id); @@ -105,6 +131,8 @@ static void nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev, /* complete the data exchange transaction, if exists */ if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags)) nci_data_exchange_complete(ndev, NULL, ntf->conn_id, -EIO); + + return 0; } static const __u8 * @@ -329,13 +357,18 @@ void nci_clear_target_list(struct nci_dev *ndev) ndev->n_targets = 0; } -static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_rf_discover_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { struct nci_rf_discover_ntf ntf; - const __u8 *data = skb->data; + const __u8 *data; bool add_target = true; + if (skb->len < sizeof(struct nci_rf_discover_ntf)) + return -EINVAL; + + data = skb->data; + ntf.rf_discovery_id = *data++; ntf.rf_protocol = *data++; ntf.rf_tech_and_mode = *data++; @@ -390,6 +423,8 @@ static void nci_rf_discover_ntf_packet(struct nci_dev *ndev, nfc_targets_found(ndev->nfc_dev, ndev->targets, ndev->n_targets); } + + return 0; } static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev, @@ -553,14 +588,19 @@ static int nci_store_ats_nfc_iso_dep(struct nci_dev *ndev, return NCI_STATUS_OK; } -static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { struct nci_conn_info *conn_info; struct nci_rf_intf_activated_ntf ntf; - const __u8 *data = skb->data; + const __u8 *data; int err = NCI_STATUS_OK; + if (skb->len < sizeof(struct nci_rf_intf_activated_ntf)) + return -EINVAL; + + data = skb->data; + ntf.rf_discovery_id = *data++; ntf.rf_interface = *data++; ntf.rf_protocol = *data++; @@ -667,7 +707,7 @@ exit: if (err == NCI_STATUS_OK) { conn_info = ndev->rf_conn_info; if (!conn_info) - return; + return 0; conn_info->max_pkt_payload_len = ntf.max_data_pkt_payload_size; conn_info->initial_num_credits = ntf.initial_num_credits; @@ -721,19 +761,26 @@ listen: pr_err("error when signaling tm activation\n"); } } + + return 0; } -static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { const struct nci_conn_info *conn_info; - const struct nci_rf_deactivate_ntf *ntf = (void *)skb->data; + const struct nci_rf_deactivate_ntf *ntf; + + if (skb->len < sizeof(struct nci_rf_deactivate_ntf)) + return -EINVAL; + + ntf = (struct nci_rf_deactivate_ntf *)skb->data; pr_debug("entry, type 0x%x, reason 0x%x\n", ntf->type, ntf->reason); conn_info = ndev->rf_conn_info; if (!conn_info) - return; + return 0; /* drop tx data queue */ skb_queue_purge(&ndev->tx_q); @@ -765,14 +812,20 @@ static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev, } nci_req_complete(ndev, NCI_STATUS_OK); + + return 0; } -static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, - const struct sk_buff *skb) +static int nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, + const struct sk_buff *skb) { u8 status = NCI_STATUS_OK; - const struct nci_nfcee_discover_ntf *nfcee_ntf = - (struct nci_nfcee_discover_ntf *)skb->data; + const struct nci_nfcee_discover_ntf *nfcee_ntf; + + if (skb->len < sizeof(struct nci_nfcee_discover_ntf)) + return -EINVAL; + + nfcee_ntf = (struct nci_nfcee_discover_ntf *)skb->data; /* NFCForum NCI 9.2.1 HCI Network Specific Handling * If the NFCC supports the HCI Network, it SHALL return one, @@ -783,6 +836,8 @@ static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev, ndev->cur_params.id = nfcee_ntf->nfcee_id; nci_req_complete(ndev, status); + + return 0; } void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) @@ -809,35 +864,43 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb) switch (ntf_opcode) { case NCI_OP_CORE_RESET_NTF: - nci_core_reset_ntf_packet(ndev, skb); + if (nci_core_reset_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_CORE_CONN_CREDITS_NTF: - nci_core_conn_credits_ntf_packet(ndev, skb); + if (nci_core_conn_credits_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_CORE_GENERIC_ERROR_NTF: - nci_core_generic_error_ntf_packet(ndev, skb); + if (nci_core_generic_error_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_CORE_INTF_ERROR_NTF: - nci_core_conn_intf_error_ntf_packet(ndev, skb); + if (nci_core_conn_intf_error_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_RF_DISCOVER_NTF: - nci_rf_discover_ntf_packet(ndev, skb); + if (nci_rf_discover_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_RF_INTF_ACTIVATED_NTF: - nci_rf_intf_activated_ntf_packet(ndev, skb); + if (nci_rf_intf_activated_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_RF_DEACTIVATE_NTF: - nci_rf_deactivate_ntf_packet(ndev, skb); + if (nci_rf_deactivate_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_NFCEE_DISCOVER_NTF: - nci_nfcee_discover_ntf_packet(ndev, skb); + if (nci_nfcee_discover_ntf_packet(ndev, skb)) + goto end; break; case NCI_OP_RF_NFCEE_ACTION_NTF: diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c index 7af0cde8b293..a2af90ee99af 100644 --- a/net/openvswitch/dp_notify.c +++ b/net/openvswitch/dp_notify.c @@ -75,7 +75,7 @@ static int dp_device_event(struct notifier_block *unused, unsigned long event, /* schedule vport destroy, dev_put and genl notification */ ovs_net = net_generic(dev_net(dev), ovs_net_id); - queue_work(system_wq, &ovs_net->dp_notify_work); + queue_work(system_percpu_wq, &ovs_net->dp_notify_work); } return NOTIFY_DONE; diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index b80bd3a90773..66366982f604 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -129,15 +129,13 @@ void ovs_flow_stats_get(const struct sw_flow *flow, struct ovs_flow_stats *ovs_stats, unsigned long *used, __be16 *tcp_flags) { - int cpu; + unsigned int cpu; *used = 0; *tcp_flags = 0; memset(ovs_stats, 0, sizeof(*ovs_stats)); - /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; - cpu = cpumask_next(cpu, flow->cpu_used_mask)) { + for_each_cpu(cpu, flow->cpu_used_mask) { struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]); if (stats) { @@ -158,11 +156,9 @@ void ovs_flow_stats_get(const struct sw_flow *flow, /* Called with ovs_mutex. */ void ovs_flow_stats_clear(struct sw_flow *flow) { - int cpu; + unsigned int cpu; - /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; - cpu = cpumask_next(cpu, flow->cpu_used_mask)) { + for_each_cpu(cpu, flow->cpu_used_mask) { struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]); if (stats) { diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index d108ae0bd0ee..ffc72a741a50 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -107,16 +107,15 @@ int ovs_flow_tbl_count(const struct flow_table *table) static void flow_free(struct sw_flow *flow) { - int cpu; + unsigned int cpu; if (ovs_identifier_is_key(&flow->id)) kfree(flow->id.unmasked_key); if (flow->sf_acts) ovs_nla_free_flow_actions((struct sw_flow_actions __force *) flow->sf_acts); - /* We open code this to make sure cpu 0 is always considered */ - for (cpu = 0; cpu < nr_cpu_ids; - cpu = cpumask_next(cpu, flow->cpu_used_mask)) { + + for_each_cpu(cpu, flow->cpu_used_mask) { if (flow->stats[cpu]) kmem_cache_free(flow_stats_cache, (struct sw_flow_stats __force *)flow->stats[cpu]); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index a7017d7f0927..173e6edda08f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *, static int prb_queue_frozen(struct tpacket_kbdq_core *); static void prb_open_block(struct tpacket_kbdq_core *, struct tpacket_block_desc *); -static void prb_retire_rx_blk_timer_expired(struct timer_list *); -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *); +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *); static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); static void prb_clear_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *); @@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb) return proto; } -static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc) -{ - timer_delete_sync(&pkc->retire_blk_timer); -} - static void prb_shutdown_retire_blk_timer(struct packet_sock *po, struct sk_buff_head *rb_queue) { struct tpacket_kbdq_core *pkc; pkc = GET_PBDQC_FROM_RB(&po->rx_ring); - - spin_lock_bh(&rb_queue->lock); - pkc->delete_blk_timer = 1; - spin_unlock_bh(&rb_queue->lock); - - prb_del_retire_blk_timer(pkc); -} - -static void prb_setup_retire_blk_timer(struct packet_sock *po) -{ - struct tpacket_kbdq_core *pkc; - - pkc = GET_PBDQC_FROM_RB(&po->rx_ring); - timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired, - 0); - pkc->retire_blk_timer.expires = jiffies; + hrtimer_cancel(&pkc->retire_blk_timer); } static int prb_calc_retire_blk_tmo(struct packet_sock *po, @@ -669,57 +648,36 @@ static void init_prb_bdqc(struct packet_sock *po, p1->knum_blocks = req_u->req3.tp_block_nr; p1->hdrlen = po->tp_hdrlen; p1->version = po->tp_version; - p1->last_kactive_blk_num = 0; po->stats.stats3.tp_freeze_q_cnt = 0; if (req_u->req3.tp_retire_blk_tov) - p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov; + p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov); else - p1->retire_blk_tov = prb_calc_retire_blk_tmo(po, - req_u->req3.tp_block_size); - p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov); + p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po, + req_u->req3.tp_block_size)); p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv; rwlock_init(&p1->blk_fill_in_prog_lock); p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv); prb_init_ft_ops(p1, req_u); - prb_setup_retire_blk_timer(po); + hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired, + CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT); + hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime, + HRTIMER_MODE_REL_SOFT); prb_open_block(p1, pbd); } -/* Do NOT update the last_blk_num first. - * Assumes sk_buff_head lock is held. - */ -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc) -{ - mod_timer(&pkc->retire_blk_timer, - jiffies + pkc->tov_in_jiffies); - pkc->last_kactive_blk_num = pkc->kactive_blk_num; -} - /* - * Timer logic: - * 1) We refresh the timer only when we open a block. - * By doing this we don't waste cycles refreshing the timer - * on packet-by-packet basis. - * * With a 1MB block-size, on a 1Gbps line, it will take * i) ~8 ms to fill a block + ii) memcpy etc. * In this cut we are not accounting for the memcpy time. * - * So, if the user sets the 'tmo' to 10ms then the timer - * will never fire while the block is still getting filled - * (which is what we want). However, the user could choose - * to close a block early and that's fine. - * - * But when the timer does fire, we check whether or not to refresh it. * Since the tmo granularity is in msecs, it is not too expensive * to refresh the timer, lets say every '8' msecs. * Either the user can set the 'tmo' or we can derive it based on * a) line-speed and b) block-size. * prb_calc_retire_blk_tmo() calculates the tmo. - * */ -static void prb_retire_rx_blk_timer_expired(struct timer_list *t) +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t) { struct packet_sock *po = timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer); @@ -732,9 +690,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) frozen = prb_queue_frozen(pkc); pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc); - if (unlikely(pkc->delete_blk_timer)) - goto out; - /* We only need to plug the race when the block is partially filled. * tpacket_rcv: * lock(); increment BLOCK_NUM_PKTS; unlock() @@ -750,46 +705,31 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t) write_unlock(&pkc->blk_fill_in_prog_lock); } - if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) { - if (!frozen) { - if (!BLOCK_NUM_PKTS(pbd)) { - /* An empty block. Just refresh the timer. */ - goto refresh_timer; - } + if (!frozen) { + if (BLOCK_NUM_PKTS(pbd)) { + /* Not an empty block. Need retire the block. */ prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO); - if (!prb_dispatch_next_block(pkc, po)) - goto refresh_timer; - else - goto out; - } else { - /* Case 1. Queue was frozen because user-space was - * lagging behind. + prb_dispatch_next_block(pkc, po); + } + } else { + /* Case 1. Queue was frozen because user-space was + * lagging behind. + */ + if (!prb_curr_blk_in_use(pbd)) { + /* Case 2. queue was frozen,user-space caught up, + * now the link went idle && the timer fired. + * We don't have a block to close.So we open this + * block and restart the timer. + * opening a block thaws the queue,restarts timer + * Thawing/timer-refresh is a side effect. */ - if (prb_curr_blk_in_use(pbd)) { - /* - * Ok, user-space is still behind. - * So just refresh the timer. - */ - goto refresh_timer; - } else { - /* Case 2. queue was frozen,user-space caught up, - * now the link went idle && the timer fired. - * We don't have a block to close.So we open this - * block and restart the timer. - * opening a block thaws the queue,restarts timer - * Thawing/timer-refresh is a side effect. - */ - prb_open_block(pkc, pbd); - goto out; - } + prb_open_block(pkc, pbd); } } -refresh_timer: - _prb_refresh_rx_retire_blk_timer(pkc); - -out: + hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime); spin_unlock(&po->sk.sk_receive_queue.lock); + return HRTIMER_RESTART; } static void prb_flush_block(struct tpacket_kbdq_core *pkc1, @@ -883,11 +823,18 @@ static void prb_thaw_queue(struct tpacket_kbdq_core *pkc) } /* - * Side effect of opening a block: + * prb_open_block is called by tpacket_rcv or timer callback. * - * 1) prb_queue is thawed. - * 2) retire_blk_timer is refreshed. + * Reasons why NOT update hrtimer in prb_open_block: + * 1) It will increase complexity to distinguish the two caller scenario. + * 2) hrtimer_cancel and hrtimer_start need to be called if you want to update + * TMO of an already enqueued hrtimer, leading to complex shutdown logic. * + * One side effect of NOT update hrtimer when called by tpacket_rcv is that + * a newly opened block triggered by tpacket_rcv may be retired earlier than + * expected. On the other hand, if timeout is updated in prb_open_block, the + * frequent reception of network packets that leads to prb_open_block being + * called may cause hrtimer to be removed and enqueued repeatedly. */ static void prb_open_block(struct tpacket_kbdq_core *pkc1, struct tpacket_block_desc *pbd1) @@ -921,7 +868,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1, pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size; prb_thaw_queue(pkc1); - _prb_refresh_rx_retire_blk_timer(pkc1); smp_wmb(); } @@ -2265,7 +2211,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, drop_n_acct: atomic_inc(&po->tp_drops); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR; drop_n_restore: diff --git a/net/packet/diag.c b/net/packet/diag.c index 6ce1dcc284d9..c8f43e0c1925 100644 --- a/net/packet/diag.c +++ b/net/packet/diag.c @@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type, pdr.pdr_frame_nr = ring->frame_max + 1; if (ver > TPACKET_V2) { - pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov; + pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime); pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv; pdr.pdr_features = ring->prb_bdqc.feature_req_word; } else { diff --git a/net/packet/internal.h b/net/packet/internal.h index 1e743d0316fd..b76e645cd78d 100644 --- a/net/packet/internal.h +++ b/net/packet/internal.h @@ -20,15 +20,10 @@ struct tpacket_kbdq_core { unsigned int feature_req_word; unsigned int hdrlen; unsigned char reset_pending_on_curr_blk; - unsigned char delete_blk_timer; unsigned short kactive_blk_num; unsigned short blk_sizeof_priv; - /* last_kactive_blk_num: - * trick to see if user-space has caught up - * in order to avoid refreshing timer when every single pkt arrives. - */ - unsigned short last_kactive_blk_num; + unsigned short version; char *pkblk_start; char *pkblk_end; @@ -38,6 +33,7 @@ struct tpacket_kbdq_core { uint64_t knxt_seq_num; char *prev; char *nxt_offset; + struct sk_buff *skb; rwlock_t blk_fill_in_prog_lock; @@ -45,12 +41,10 @@ struct tpacket_kbdq_core { /* Default is set to 8ms */ #define DEFAULT_PRB_RETIRE_TOV (8) - unsigned short retire_blk_tov; - unsigned short version; - unsigned long tov_in_jiffies; + ktime_t interval_ktime; /* timer to retire an outstanding block */ - struct timer_list retire_blk_timer; + struct hrtimer retire_blk_timer; }; struct pgv { diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index a27efa4faa4e..238a9638d2b0 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -22,7 +22,7 @@ #include <net/phonet/pn_dev.h> /* Transport protocol registration */ -static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly; +static const struct phonet_protocol __rcu *proto_tab[PHONET_NPROTO] __read_mostly; static const struct phonet_protocol *phonet_proto_get(unsigned int protocol) { @@ -482,7 +482,7 @@ void phonet_proto_unregister(unsigned int protocol, const struct phonet_protocol *pp) { mutex_lock(&proto_tab_lock); - BUG_ON(proto_tab[protocol] != pp); + BUG_ON(rcu_access_pointer(proto_tab[protocol]) != pp); RCU_INIT_POINTER(proto_tab[protocol], NULL); mutex_unlock(&proto_tab_lock); synchronize_rcu(); diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 62527e1ebb88..4db564d9d522 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -376,7 +376,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) case PNS_PEP_CTRL_REQ: if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); break; } __skb_pull(skb, 4); @@ -397,7 +397,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) } if (pn->rx_credits == 0) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = -ENOBUFS; break; } @@ -567,7 +567,7 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb) } if (pn->rx_credits == 0) { - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = NET_RX_DROP; break; } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index ea4d5e6533db..db2d552e9b32 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -587,7 +587,7 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v) from_kuid_munged(seq_user_ns(seq), sk_uid(sk)), sock_i_ino(sk), refcount_read(&sk->sk_refcnt), sk, - atomic_read(&sk->sk_drops)); + sk_drops_read(sk)); } seq_pad(seq, '\n'); return 0; @@ -602,7 +602,7 @@ const struct seq_operations pn_sock_seq_ops = { #endif static struct { - struct sock *sk[256]; + struct sock __rcu *sk[256]; } pnres; /* @@ -654,7 +654,7 @@ int pn_sock_unbind_res(struct sock *sk, u8 res) return -EPERM; mutex_lock(&resource_mutex); - if (pnres.sk[res] == sk) { + if (rcu_access_pointer(pnres.sk[res]) == sk) { RCU_INIT_POINTER(pnres.sk[res], NULL); ret = 0; } @@ -673,7 +673,7 @@ void pn_sock_unbind_all_res(struct sock *sk) mutex_lock(&resource_mutex); for (res = 0; res < 256; res++) { - if (pnres.sk[res] == sk) { + if (rcu_access_pointer(pnres.sk[res]) == sk) { RCU_INIT_POINTER(pnres.sk[res], NULL); match++; } @@ -688,7 +688,7 @@ void pn_sock_unbind_all_res(struct sock *sk) } #ifdef CONFIG_PROC_FS -static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos) +static struct sock __rcu **pn_res_get_idx(struct seq_file *seq, loff_t pos) { struct net *net = seq_file_net(seq); unsigned int i; @@ -697,7 +697,7 @@ static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos) return NULL; for (i = 0; i < 256; i++) { - if (pnres.sk[i] == NULL) + if (rcu_access_pointer(pnres.sk[i]) == NULL) continue; if (!pos) return pnres.sk + i; @@ -706,7 +706,7 @@ static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos) return NULL; } -static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk) +static struct sock __rcu **pn_res_get_next(struct seq_file *seq, struct sock __rcu **sk) { struct net *net = seq_file_net(seq); unsigned int i; @@ -728,7 +728,7 @@ static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos) static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos) { - struct sock **sk; + struct sock __rcu **sk; if (v == SEQ_START_TOKEN) sk = pn_res_get_idx(seq, 0); @@ -747,11 +747,12 @@ static void pn_res_seq_stop(struct seq_file *seq, void *v) static int pn_res_seq_show(struct seq_file *seq, void *v) { seq_setwidth(seq, 63); - if (v == SEQ_START_TOKEN) + if (v == SEQ_START_TOKEN) { seq_puts(seq, "rs uid inode"); - else { - struct sock **psk = v; - struct sock *sk = *psk; + } else { + struct sock __rcu **psk = v; + struct sock *sk = rcu_dereference_protected(*psk, + lockdep_is_held(&resource_mutex)); seq_printf(seq, "%02X %5u %lu", (int) (psk - pnres.sk), diff --git a/net/psp/Kconfig b/net/psp/Kconfig new file mode 100644 index 000000000000..371e8771f3bd --- /dev/null +++ b/net/psp/Kconfig @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# PSP configuration +# +config INET_PSP + bool "PSP Security Protocol support" + depends on INET + select SKB_DECRYPTED + select SOCK_VALIDATE_XMIT + help + Enable kernel support for the PSP Security Protocol (PSP). + For more information see: + https://raw.githubusercontent.com/google/psp/main/doc/PSP_Arch_Spec.pdf + + If unsure, say N. diff --git a/net/psp/Makefile b/net/psp/Makefile new file mode 100644 index 000000000000..eb5ff3c5bfb2 --- /dev/null +++ b/net/psp/Makefile @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: GPL-2.0-only + +obj-$(CONFIG_INET_PSP) += psp.o + +psp-y := psp_main.o psp_nl.o psp_sock.o psp-nl-gen.o diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c new file mode 100644 index 000000000000..9fdd6f831803 --- /dev/null +++ b/net/psp/psp-nl-gen.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "psp-nl-gen.h" + +#include <uapi/linux/psp.h> + +/* Common nested types */ +const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1] = { + [PSP_A_KEYS_KEY] = { .type = NLA_BINARY, }, + [PSP_A_KEYS_SPI] = { .type = NLA_U32, }, +}; + +/* PSP_CMD_DEV_GET - do */ +static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* PSP_CMD_DEV_SET - do */ +static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf), +}; + +/* PSP_CMD_KEY_ROTATE - do */ +static const struct nla_policy psp_key_rotate_nl_policy[PSP_A_DEV_ID + 1] = { + [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +/* PSP_CMD_RX_ASSOC - do */ +static const struct nla_policy psp_rx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + +/* PSP_CMD_TX_ASSOC - do */ +static const struct nla_policy psp_tx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = { + [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3), + [PSP_A_ASSOC_TX_KEY] = NLA_POLICY_NESTED(psp_keys_nl_policy), + [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, }, +}; + +/* Ops table for psp */ +static const struct genl_split_ops psp_nl_ops[] = { + { + .cmd = PSP_CMD_DEV_GET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_get_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_get_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_DEV_GET, + .dumpit = psp_nl_dev_get_dumpit, + .flags = GENL_CMD_CAP_DUMP, + }, + { + .cmd = PSP_CMD_DEV_SET, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_dev_set_doit, + .post_doit = psp_device_unlock, + .policy = psp_dev_set_nl_policy, + .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_KEY_ROTATE, + .pre_doit = psp_device_get_locked, + .doit = psp_nl_key_rotate_doit, + .post_doit = psp_device_unlock, + .policy = psp_key_rotate_nl_policy, + .maxattr = PSP_A_DEV_ID, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_RX_ASSOC, + .pre_doit = psp_assoc_device_get_locked, + .doit = psp_nl_rx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_rx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, + { + .cmd = PSP_CMD_TX_ASSOC, + .pre_doit = psp_assoc_device_get_locked, + .doit = psp_nl_tx_assoc_doit, + .post_doit = psp_device_unlock, + .policy = psp_tx_assoc_nl_policy, + .maxattr = PSP_A_ASSOC_SOCK_FD, + .flags = GENL_CMD_CAP_DO, + }, +}; + +static const struct genl_multicast_group psp_nl_mcgrps[] = { + [PSP_NLGRP_MGMT] = { "mgmt", }, + [PSP_NLGRP_USE] = { "use", }, +}; + +struct genl_family psp_nl_family __ro_after_init = { + .name = PSP_FAMILY_NAME, + .version = PSP_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = psp_nl_ops, + .n_split_ops = ARRAY_SIZE(psp_nl_ops), + .mcgrps = psp_nl_mcgrps, + .n_mcgrps = ARRAY_SIZE(psp_nl_mcgrps), +}; diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h new file mode 100644 index 000000000000..25268ed11fb5 --- /dev/null +++ b/net/psp/psp-nl-gen.h @@ -0,0 +1,39 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/psp.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_PSP_GEN_H +#define _LINUX_PSP_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/psp.h> + +/* Common nested types */ +extern const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1]; + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info); +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info); + +int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info); +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info); + +enum { + PSP_NLGRP_MGMT, + PSP_NLGRP_USE, +}; + +extern struct genl_family psp_nl_family; + +#endif /* _LINUX_PSP_GEN_H */ diff --git a/net/psp/psp.h b/net/psp/psp.h new file mode 100644 index 000000000000..9f19137593a0 --- /dev/null +++ b/net/psp/psp.h @@ -0,0 +1,54 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __PSP_PSP_H +#define __PSP_PSP_H + +#include <linux/list.h> +#include <linux/lockdep.h> +#include <linux/mutex.h> +#include <net/netns/generic.h> +#include <net/psp.h> +#include <net/sock.h> + +extern struct xarray psp_devs; +extern struct mutex psp_devs_lock; + +void psp_dev_free(struct psp_dev *psd); +int psp_dev_check_access(struct psp_dev *psd, struct net *net); + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd); + +struct psp_assoc *psp_assoc_create(struct psp_dev *psd); +struct psp_dev *psp_dev_get_for_sock(struct sock *sk); +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas); +int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack); +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack); +void psp_assocs_key_rotated(struct psp_dev *psd); + +static inline void psp_dev_get(struct psp_dev *psd) +{ + refcount_inc(&psd->refcnt); +} + +static inline bool psp_dev_tryget(struct psp_dev *psd) +{ + return refcount_inc_not_zero(&psd->refcnt); +} + +static inline void psp_dev_put(struct psp_dev *psd) +{ + if (refcount_dec_and_test(&psd->refcnt)) + psp_dev_free(psd); +} + +static inline bool psp_dev_is_registered(struct psp_dev *psd) +{ + lockdep_assert_held(&psd->lock); + return !!psd->ops; +} + +#endif /* __PSP_PSP_H */ diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c new file mode 100644 index 000000000000..481aaf0fc9fc --- /dev/null +++ b/net/psp/psp_main.c @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/bitfield.h> +#include <linux/list.h> +#include <linux/netdevice.h> +#include <linux/xarray.h> +#include <net/net_namespace.h> +#include <net/psp.h> +#include <net/udp.h> + +#include "psp.h" +#include "psp-nl-gen.h" + +DEFINE_XARRAY_ALLOC1(psp_devs); +struct mutex psp_devs_lock; + +/** + * DOC: PSP locking + * + * psp_devs_lock protects the psp_devs xarray. + * Ordering is take the psp_devs_lock and then the instance lock. + * Each instance is protected by RCU, and has a refcount. + * When driver unregisters the instance gets flushed, but struct sticks around. + */ + +/** + * psp_dev_check_access() - check if user in a given net ns can access PSP dev + * @psd: PSP device structure user is trying to access + * @net: net namespace user is in + * + * Return: 0 if PSP device should be visible in @net, errno otherwise. + */ +int psp_dev_check_access(struct psp_dev *psd, struct net *net) +{ + if (dev_net(psd->main_netdev) == net) + return 0; + return -ENOENT; +} + +/** + * psp_dev_create() - create and register PSP device + * @netdev: main netdevice + * @psd_ops: driver callbacks + * @psd_caps: device capabilities + * @priv_ptr: back-pointer to driver private data + * + * Return: pointer to allocated PSP device, or ERR_PTR. + */ +struct psp_dev * +psp_dev_create(struct net_device *netdev, + struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps, + void *priv_ptr) +{ + struct psp_dev *psd; + static u32 last_id; + int err; + + if (WARN_ON(!psd_caps->versions || + !psd_ops->set_config || + !psd_ops->key_rotate || + !psd_ops->rx_spi_alloc || + !psd_ops->tx_key_add || + !psd_ops->tx_key_del)) + return ERR_PTR(-EINVAL); + + psd = kzalloc(sizeof(*psd), GFP_KERNEL); + if (!psd) + return ERR_PTR(-ENOMEM); + + psd->main_netdev = netdev; + psd->ops = psd_ops; + psd->caps = psd_caps; + psd->drv_priv = priv_ptr; + + mutex_init(&psd->lock); + INIT_LIST_HEAD(&psd->active_assocs); + INIT_LIST_HEAD(&psd->prev_assocs); + INIT_LIST_HEAD(&psd->stale_assocs); + refcount_set(&psd->refcnt, 1); + + mutex_lock(&psp_devs_lock); + err = xa_alloc_cyclic(&psp_devs, &psd->id, psd, xa_limit_16b, + &last_id, GFP_KERNEL); + if (err) { + mutex_unlock(&psp_devs_lock); + kfree(psd); + return ERR_PTR(err); + } + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_ADD_NTF); + + rcu_assign_pointer(netdev->psp_dev, psd); + + mutex_unlock(&psd->lock); + + return psd; +} +EXPORT_SYMBOL(psp_dev_create); + +void psp_dev_free(struct psp_dev *psd) +{ + mutex_lock(&psp_devs_lock); + xa_erase(&psp_devs, psd->id); + mutex_unlock(&psp_devs_lock); + + mutex_destroy(&psd->lock); + kfree_rcu(psd, rcu); +} + +/** + * psp_dev_unregister() - unregister PSP device + * @psd: PSP device structure + */ +void psp_dev_unregister(struct psp_dev *psd) +{ + struct psp_assoc *pas, *next; + + mutex_lock(&psp_devs_lock); + mutex_lock(&psd->lock); + + psp_nl_notify_dev(psd, PSP_CMD_DEV_DEL_NTF); + + /* Wait until psp_dev_free() to call xa_erase() to prevent a + * different psd from being added to the xarray with this id, while + * there are still references to this psd being held. + */ + xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL); + mutex_unlock(&psp_devs_lock); + + list_splice_init(&psd->active_assocs, &psd->prev_assocs); + list_splice_init(&psd->prev_assocs, &psd->stale_assocs); + list_for_each_entry_safe(pas, next, &psd->stale_assocs, assocs_list) + psp_dev_tx_key_del(psd, pas); + + rcu_assign_pointer(psd->main_netdev->psp_dev, NULL); + + psd->ops = NULL; + psd->drv_priv = NULL; + + mutex_unlock(&psd->lock); + + psp_dev_put(psd); +} +EXPORT_SYMBOL(psp_dev_unregister); + +unsigned int psp_key_size(u32 version) +{ + switch (version) { + case PSP_VERSION_HDR0_AES_GCM_128: + case PSP_VERSION_HDR0_AES_GMAC_128: + return 16; + case PSP_VERSION_HDR0_AES_GCM_256: + case PSP_VERSION_HDR0_AES_GMAC_256: + return 32; + default: + return 0; + } +} +EXPORT_SYMBOL(psp_key_size); + +static void psp_write_headers(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, unsigned int udp_len, __be16 sport) +{ + struct udphdr *uh = udp_hdr(skb); + struct psphdr *psph = (struct psphdr *)(uh + 1); + + uh->dest = htons(PSP_DEFAULT_UDP_PORT); + uh->source = udp_flow_src_port(net, skb, 0, 0, false); + uh->check = 0; + uh->len = htons(udp_len); + + psph->nexthdr = IPPROTO_TCP; + psph->hdrlen = PSP_HDRLEN_NOOPT; + psph->crypt_offset = 0; + psph->verfl = FIELD_PREP(PSPHDR_VERFL_VERSION, ver) | + FIELD_PREP(PSPHDR_VERFL_ONE, 1); + psph->spi = spi; + memset(&psph->iv, 0, sizeof(psph->iv)); +} + +/* Encapsulate a TCP packet with PSP by adding the UDP+PSP headers and filling + * them in. + */ +bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi, + u8 ver, __be16 sport) +{ + u32 network_len = skb_network_header_len(skb); + u32 ethr_len = skb_mac_header_len(skb); + u32 bufflen = ethr_len + network_len; + + if (skb_cow_head(skb, PSP_ENCAP_HLEN)) + return false; + + skb_push(skb, PSP_ENCAP_HLEN); + skb->mac_header -= PSP_ENCAP_HLEN; + skb->network_header -= PSP_ENCAP_HLEN; + skb->transport_header -= PSP_ENCAP_HLEN; + memmove(skb->data, skb->data + PSP_ENCAP_HLEN, bufflen); + + if (skb->protocol == htons(ETH_P_IP)) { + ip_hdr(skb)->protocol = IPPROTO_UDP; + be16_add_cpu(&ip_hdr(skb)->tot_len, PSP_ENCAP_HLEN); + ip_hdr(skb)->check = 0; + ip_hdr(skb)->check = + ip_fast_csum((u8 *)ip_hdr(skb), ip_hdr(skb)->ihl); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + ipv6_hdr(skb)->nexthdr = IPPROTO_UDP; + be16_add_cpu(&ipv6_hdr(skb)->payload_len, PSP_ENCAP_HLEN); + } else { + return false; + } + + skb_set_inner_ipproto(skb, IPPROTO_TCP); + skb_set_inner_transport_header(skb, skb_transport_offset(skb) + + PSP_ENCAP_HLEN); + skb->encapsulation = 1; + psp_write_headers(net, skb, spi, ver, + skb->len - skb_transport_offset(skb), sport); + + return true; +} +EXPORT_SYMBOL(psp_dev_encapsulate); + +/* Receive handler for PSP packets. + * + * Presently it accepts only already-authenticated packets and does not + * support optional fields, such as virtualization cookies. The caller should + * ensure that skb->data is pointing to the mac header, and that skb->mac_len + * is set. This function does not currently adjust skb->csum (CHECKSUM_COMPLETE + * is not supported). + */ +int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv) +{ + int l2_hlen = 0, l3_hlen, encap; + struct psp_skb_ext *pse; + struct psphdr *psph; + struct ethhdr *eth; + struct udphdr *uh; + __be16 proto; + bool is_udp; + + eth = (struct ethhdr *)skb->data; + proto = __vlan_get_protocol(skb, eth->h_proto, &l2_hlen); + if (proto == htons(ETH_P_IP)) + l3_hlen = sizeof(struct iphdr); + else if (proto == htons(ETH_P_IPV6)) + l3_hlen = sizeof(struct ipv6hdr); + else + return -EINVAL; + + if (unlikely(!pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN))) + return -EINVAL; + + if (proto == htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen); + + is_udp = iph->protocol == IPPROTO_UDP; + l3_hlen = iph->ihl * 4; + if (l3_hlen != sizeof(struct iphdr) && + !pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN)) + return -EINVAL; + } else { + struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen); + + is_udp = ipv6h->nexthdr == IPPROTO_UDP; + } + + if (unlikely(!is_udp)) + return -EINVAL; + + uh = (struct udphdr *)(skb->data + l2_hlen + l3_hlen); + if (unlikely(uh->dest != htons(PSP_DEFAULT_UDP_PORT))) + return -EINVAL; + + pse = skb_ext_add(skb, SKB_EXT_PSP); + if (!pse) + return -EINVAL; + + psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen + + sizeof(struct udphdr)); + pse->spi = psph->spi; + pse->dev_id = dev_id; + pse->generation = generation; + pse->version = FIELD_GET(PSPHDR_VERFL_VERSION, psph->verfl); + + encap = PSP_ENCAP_HLEN; + encap += strip_icv ? PSP_TRL_SIZE : 0; + + if (proto == htons(ETH_P_IP)) { + struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen); + + iph->protocol = psph->nexthdr; + iph->tot_len = htons(ntohs(iph->tot_len) - encap); + iph->check = 0; + iph->check = ip_fast_csum((u8 *)iph, iph->ihl); + } else { + struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen); + + ipv6h->nexthdr = psph->nexthdr; + ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) - encap); + } + + memmove(skb->data + PSP_ENCAP_HLEN, skb->data, l2_hlen + l3_hlen); + skb_pull(skb, PSP_ENCAP_HLEN); + + if (strip_icv) + pskb_trim(skb, skb->len - PSP_TRL_SIZE); + + return 0; +} +EXPORT_SYMBOL(psp_dev_rcv); + +static int __init psp_init(void) +{ + mutex_init(&psp_devs_lock); + + return genl_register_family(&psp_nl_family); +} + +subsys_initcall(psp_init); diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c new file mode 100644 index 000000000000..8aaca62744c3 --- /dev/null +++ b/net/psp/psp_nl.c @@ -0,0 +1,505 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/skbuff.h> +#include <linux/xarray.h> +#include <net/genetlink.h> +#include <net/psp.h> +#include <net/sock.h> + +#include "psp-nl-gen.h" +#include "psp.h" + +/* Netlink helpers */ + +static struct sk_buff *psp_nl_reply_new(struct genl_info *info) +{ + struct sk_buff *rsp; + void *hdr; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return NULL; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) { + nlmsg_free(rsp); + return NULL; + } + + return rsp; +} + +static int psp_nl_reply_send(struct sk_buff *rsp, struct genl_info *info) +{ + /* Note that this *only* works with a single message per skb! */ + nlmsg_end(rsp, (struct nlmsghdr *)rsp->data); + + return genlmsg_reply(rsp, info); +} + +/* Device stuff */ + +static struct psp_dev * +psp_device_get_and_lock(struct net *net, struct nlattr *dev_id) +{ + struct psp_dev *psd; + int err; + + mutex_lock(&psp_devs_lock); + psd = xa_load(&psp_devs, nla_get_u32(dev_id)); + if (!psd) { + mutex_unlock(&psp_devs_lock); + return ERR_PTR(-ENODEV); + } + + mutex_lock(&psd->lock); + mutex_unlock(&psp_devs_lock); + + err = psp_dev_check_access(psd, net); + if (err) { + mutex_unlock(&psd->lock); + return ERR_PTR(err); + } + + return psd; +} + +int psp_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_ID)) + return -EINVAL; + + info->user_ptr[0] = psp_device_get_and_lock(genl_info_net(info), + info->attrs[PSP_A_DEV_ID]); + return PTR_ERR_OR_ZERO(info->user_ptr[0]); +} + +void +psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + + mutex_unlock(&psd->lock); + if (socket) + sockfd_put(socket); +} + +static int +psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp, + const struct genl_info *info) +{ + void *hdr; + + hdr = genlmsg_iput(rsp, info); + if (!hdr) + return -EMSGSIZE; + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(rsp, PSP_A_DEV_IFINDEX, psd->main_netdev->ifindex) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_CAP, psd->caps->versions) || + nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_ENA, psd->config.versions)) + goto err_cancel_msg; + + genlmsg_end(rsp, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(rsp, hdr); + return -EMSGSIZE; +} + +void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd) +{ + struct genl_info info; + struct sk_buff *ntf; + + if (!genl_has_listeners(&psp_nl_family, dev_net(psd->main_netdev), + PSP_NLGRP_MGMT)) + return; + + ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!ntf) + return; + + genl_info_init_ntf(&info, &psp_nl_family, cmd); + if (psp_nl_dev_fill(psd, ntf, &info)) { + nlmsg_free(ntf); + return; + } + + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_MGMT, GFP_KERNEL); +} + +int psp_nl_dev_get_doit(struct sk_buff *req, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct sk_buff *rsp; + int err; + + rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!rsp) + return -ENOMEM; + + err = psp_nl_dev_fill(psd, rsp, info); + if (err) + goto err_free_msg; + + return genlmsg_reply(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} + +static int +psp_nl_dev_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb, + struct psp_dev *psd) +{ + if (psp_dev_check_access(psd, sock_net(rsp->sk))) + return 0; + + return psp_nl_dev_fill(psd, rsp, genl_info_dump(cb)); +} + +int psp_nl_dev_get_dumpit(struct sk_buff *rsp, struct netlink_callback *cb) +{ + struct psp_dev *psd; + int err = 0; + + mutex_lock(&psp_devs_lock); + xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) { + mutex_lock(&psd->lock); + err = psp_nl_dev_get_dumpit_one(rsp, cb, psd); + mutex_unlock(&psd->lock); + if (err) + break; + } + mutex_unlock(&psp_devs_lock); + + return err; +} + +int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct psp_dev_config new_config; + struct sk_buff *rsp; + int err; + + memcpy(&new_config, &psd->config, sizeof(new_config)); + + if (info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]) { + new_config.versions = + nla_get_u32(info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]); + if (new_config.versions & ~psd->caps->versions) { + NL_SET_ERR_MSG(info->extack, "Requested PSP versions not supported by the device"); + return -EINVAL; + } + } else { + NL_SET_ERR_MSG(info->extack, "No settings present"); + return -EINVAL; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + if (memcmp(&new_config, &psd->config, sizeof(new_config))) { + err = psd->ops->set_config(psd, &new_config, info->extack); + if (err) + goto err_free_rsp; + + memcpy(&psd->config, &new_config, sizeof(new_config)); + } + + psp_nl_notify_dev(psd, PSP_CMD_DEV_CHANGE_NTF); + + return psp_nl_reply_send(rsp, info); + +err_free_rsp: + nlmsg_free(rsp); + return err; +} + +int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct psp_dev *psd = info->user_ptr[0]; + struct genl_info ntf_info; + struct sk_buff *ntf, *rsp; + u8 prev_gen; + int err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + genl_info_init_ntf(&ntf_info, &psp_nl_family, PSP_CMD_KEY_ROTATE_NTF); + ntf = psp_nl_reply_new(&ntf_info); + if (!ntf) { + err = -ENOMEM; + goto err_free_rsp; + } + + if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) || + nla_put_u32(ntf, PSP_A_DEV_ID, psd->id)) { + err = -EMSGSIZE; + goto err_free_ntf; + } + + /* suggest the next gen number, driver can override */ + prev_gen = psd->generation; + psd->generation = (prev_gen + 1) & PSP_GEN_VALID_MASK; + + err = psd->ops->key_rotate(psd, info->extack); + if (err) + goto err_free_ntf; + + WARN_ON_ONCE((psd->generation && psd->generation == prev_gen) || + psd->generation & ~PSP_GEN_VALID_MASK); + + psp_assocs_key_rotated(psd); + + nlmsg_end(ntf, (struct nlmsghdr *)ntf->data); + genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf, + 0, PSP_NLGRP_USE, GFP_KERNEL); + return psp_nl_reply_send(rsp, info); + +err_free_ntf: + nlmsg_free(ntf); +err_free_rsp: + nlmsg_free(rsp); + return err; +} + +/* Key etc. */ + +int psp_assoc_device_get_locked(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket; + struct psp_dev *psd; + struct nlattr *id; + int fd, err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_SOCK_FD)) + return -EINVAL; + + fd = nla_get_u32(info->attrs[PSP_A_ASSOC_SOCK_FD]); + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + if (!sk_is_tcp(socket->sk)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[PSP_A_ASSOC_SOCK_FD], + "Unsupported socket family and type"); + err = -EOPNOTSUPP; + goto err_sock_put; + } + + psd = psp_dev_get_for_sock(socket->sk); + if (psd) { + err = psp_dev_check_access(psd, genl_info_net(info)); + if (err) { + psp_dev_put(psd); + psd = NULL; + } + } + + if (!psd && GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_DEV_ID)) { + err = -EINVAL; + goto err_sock_put; + } + + id = info->attrs[PSP_A_ASSOC_DEV_ID]; + if (psd) { + mutex_lock(&psd->lock); + if (id && psd->id != nla_get_u32(id)) { + mutex_unlock(&psd->lock); + NL_SET_ERR_MSG_ATTR(info->extack, id, + "Device id vs socket mismatch"); + err = -EINVAL; + goto err_psd_put; + } + + psp_dev_put(psd); + } else { + psd = psp_device_get_and_lock(genl_info_net(info), id); + if (IS_ERR(psd)) { + err = PTR_ERR(psd); + goto err_sock_put; + } + } + + info->user_ptr[0] = psd; + info->user_ptr[1] = socket; + + return 0; + +err_psd_put: + psp_dev_put(psd); +err_sock_put: + sockfd_put(socket); + return err; +} + +static int +psp_nl_parse_key(struct genl_info *info, u32 attr, struct psp_key_parsed *key, + unsigned int key_sz) +{ + struct nlattr *nest = info->attrs[attr]; + struct nlattr *tb[PSP_A_KEYS_SPI + 1]; + u32 spi; + int err; + + err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest, + psp_keys_nl_policy, info->extack); + if (err) + return err; + + if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_KEY) || + NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_SPI)) + return -EINVAL; + + if (nla_len(tb[PSP_A_KEYS_KEY]) != key_sz) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "incorrect key length"); + return -EINVAL; + } + + spi = nla_get_u32(tb[PSP_A_KEYS_SPI]); + if (!(spi & PSP_SPI_KEY_ID)) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY], + "invalid SPI: lower 31b must be non-zero"); + return -EINVAL; + } + + key->spi = cpu_to_be32(spi); + memcpy(key->key, nla_data(tb[PSP_A_KEYS_KEY]), key_sz); + + return 0; +} + +static int +psp_nl_put_key(struct sk_buff *skb, u32 attr, u32 version, + struct psp_key_parsed *key) +{ + int key_sz = psp_key_size(version); + void *nest; + + nest = nla_nest_start(skb, attr); + + if (nla_put_u32(skb, PSP_A_KEYS_SPI, be32_to_cpu(key->spi)) || + nla_put(skb, PSP_A_KEYS_KEY, key_sz, key->key)) { + nla_nest_cancel(skb, nest); + return -EMSGSIZE; + } + + nla_nest_end(skb, nest); + + return 0; +} + +int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct psp_assoc *pas; + struct sk_buff *rsp; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + pas = psp_assoc_create(psd); + if (!pas) { + err = -ENOMEM; + goto err_free_rsp; + } + pas->version = version; + + err = psd->ops->rx_spi_alloc(psd, version, &key, info->extack); + if (err) + goto err_free_pas; + + if (nla_put_u32(rsp, PSP_A_ASSOC_DEV_ID, psd->id) || + psp_nl_put_key(rsp, PSP_A_ASSOC_RX_KEY, version, &key)) { + err = -EMSGSIZE; + goto err_free_pas; + } + + err = psp_sock_assoc_set_rx(socket->sk, pas, &key, info->extack); + if (err) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_SOCK_FD]); + goto err_free_pas; + } + psp_assoc_put(pas); + + return psp_nl_reply_send(rsp, info); + +err_free_pas: + psp_assoc_put(pas); +err_free_rsp: + nlmsg_free(rsp); + return err; +} + +int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct socket *socket = info->user_ptr[1]; + struct psp_dev *psd = info->user_ptr[0]; + struct psp_key_parsed key; + struct sk_buff *rsp; + unsigned int key_sz; + u32 version; + int err; + + if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION) || + GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_TX_KEY)) + return -EINVAL; + + version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]); + if (!(psd->caps->versions & (1 << version))) { + NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]); + return -EOPNOTSUPP; + } + + key_sz = psp_key_size(version); + if (!key_sz) + return -EINVAL; + + err = psp_nl_parse_key(info, PSP_A_ASSOC_TX_KEY, &key, key_sz); + if (err < 0) + return err; + + rsp = psp_nl_reply_new(info); + if (!rsp) + return -ENOMEM; + + err = psp_sock_assoc_set_tx(socket->sk, psd, version, &key, + info->extack); + if (err) + goto err_free_msg; + + return psp_nl_reply_send(rsp, info); + +err_free_msg: + nlmsg_free(rsp); + return err; +} diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c new file mode 100644 index 000000000000..5324a7603bed --- /dev/null +++ b/net/psp/psp_sock.c @@ -0,0 +1,292 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/file.h> +#include <linux/net.h> +#include <linux/rcupdate.h> +#include <linux/tcp.h> + +#include <net/ip.h> +#include <net/psp.h> +#include "psp.h" + +struct psp_dev *psp_dev_get_for_sock(struct sock *sk) +{ + struct psp_dev *psd = NULL; + struct dst_entry *dst; + + rcu_read_lock(); + dst = __sk_dst_get(sk); + if (dst) { + psd = rcu_dereference(dst_dev_rcu(dst)->psp_dev); + if (psd && !psp_dev_tryget(psd)) + psd = NULL; + } + rcu_read_unlock(); + + return psd; +} + +static struct sk_buff * +psp_validate_xmit(struct sock *sk, struct net_device *dev, struct sk_buff *skb) +{ + struct psp_assoc *pas; + bool good; + + rcu_read_lock(); + pas = psp_skb_get_assoc_rcu(skb); + good = !pas || rcu_access_pointer(dev->psp_dev) == pas->psd; + rcu_read_unlock(); + if (!good) { + sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PSP_OUTPUT); + return NULL; + } + + return skb; +} + +struct psp_assoc *psp_assoc_create(struct psp_dev *psd) +{ + struct psp_assoc *pas; + + lockdep_assert_held(&psd->lock); + + pas = kzalloc(struct_size(pas, drv_data, psd->caps->assoc_drv_spc), + GFP_KERNEL_ACCOUNT); + if (!pas) + return NULL; + + pas->psd = psd; + pas->dev_id = psd->id; + pas->generation = psd->generation; + psp_dev_get(psd); + refcount_set(&pas->refcnt, 1); + + list_add_tail(&pas->assocs_list, &psd->active_assocs); + + return pas; +} + +static struct psp_assoc *psp_assoc_dummy(struct psp_assoc *pas) +{ + struct psp_dev *psd = pas->psd; + size_t sz; + + lockdep_assert_held(&psd->lock); + + sz = struct_size(pas, drv_data, psd->caps->assoc_drv_spc); + return kmemdup(pas, sz, GFP_KERNEL); +} + +static int psp_dev_tx_key_add(struct psp_dev *psd, struct psp_assoc *pas, + struct netlink_ext_ack *extack) +{ + return psd->ops->tx_key_add(psd, pas, extack); +} + +void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas) +{ + if (pas->tx.spi) + psd->ops->tx_key_del(psd, pas); + list_del(&pas->assocs_list); +} + +static void psp_assoc_free(struct work_struct *work) +{ + struct psp_assoc *pas = container_of(work, struct psp_assoc, work); + struct psp_dev *psd = pas->psd; + + mutex_lock(&psd->lock); + if (psd->ops) + psp_dev_tx_key_del(psd, pas); + mutex_unlock(&psd->lock); + psp_dev_put(psd); + kfree(pas); +} + +static void psp_assoc_free_queue(struct rcu_head *head) +{ + struct psp_assoc *pas = container_of(head, struct psp_assoc, rcu); + + INIT_WORK(&pas->work, psp_assoc_free); + schedule_work(&pas->work); +} + +/** + * psp_assoc_put() - release a reference on a PSP association + * @pas: association to release + */ +void psp_assoc_put(struct psp_assoc *pas) +{ + if (pas && refcount_dec_and_test(&pas->refcnt)) + call_rcu(&pas->rcu, psp_assoc_free_queue); +} + +void psp_sk_assoc_free(struct sock *sk) +{ + struct psp_assoc *pas = rcu_dereference_protected(sk->psp_assoc, 1); + + rcu_assign_pointer(sk->psp_assoc, NULL); + psp_assoc_put(pas); +} + +int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas, + struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + int err; + + memcpy(&pas->rx, key, sizeof(*key)); + + lock_sock(sk); + + if (psp_sk_assoc(sk)) { + NL_SET_ERR_MSG(extack, "Socket already has PSP state"); + err = -EBUSY; + goto exit_unlock; + } + + refcount_inc(&pas->refcnt); + rcu_assign_pointer(sk->psp_assoc, pas); + err = 0; + +exit_unlock: + release_sock(sk); + + return err; +} + +static int psp_sock_recv_queue_check(struct sock *sk, struct psp_assoc *pas) +{ + struct psp_skb_ext *pse; + struct sk_buff *skb; + + skb_rbtree_walk(skb, &tcp_sk(sk)->out_of_order_queue) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + + skb_queue_walk(&sk->sk_receive_queue, skb) { + pse = skb_ext_find(skb, SKB_EXT_PSP); + if (!psp_pse_matches_pas(pse, pas)) + return -EBUSY; + } + return 0; +} + +int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd, + u32 version, struct psp_key_parsed *key, + struct netlink_ext_ack *extack) +{ + struct inet_connection_sock *icsk; + struct psp_assoc *pas, *dummy; + int err; + + lock_sock(sk); + + pas = psp_sk_assoc(sk); + if (!pas) { + NL_SET_ERR_MSG(extack, "Socket has no Rx key"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->psd != psd) { + NL_SET_ERR_MSG(extack, "Rx key from different device"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->version != version) { + NL_SET_ERR_MSG(extack, + "PSP version mismatch with existing state"); + err = -EINVAL; + goto exit_unlock; + } + if (pas->tx.spi) { + NL_SET_ERR_MSG(extack, "Tx key already set"); + err = -EBUSY; + goto exit_unlock; + } + + err = psp_sock_recv_queue_check(sk, pas); + if (err) { + NL_SET_ERR_MSG(extack, "Socket has incompatible segments already in the recv queue"); + goto exit_unlock; + } + + /* Pass a fake association to drivers to make sure they don't + * try to store pointers to it. For re-keying we'll need to + * re-allocate the assoc structures. + */ + dummy = psp_assoc_dummy(pas); + if (!dummy) { + err = -ENOMEM; + goto exit_unlock; + } + + memcpy(&dummy->tx, key, sizeof(*key)); + err = psp_dev_tx_key_add(psd, dummy, extack); + if (err) + goto exit_free_dummy; + + memcpy(pas->drv_data, dummy->drv_data, psd->caps->assoc_drv_spc); + memcpy(&pas->tx, key, sizeof(*key)); + + WRITE_ONCE(sk->sk_validate_xmit_skb, psp_validate_xmit); + tcp_write_collapse_fence(sk); + pas->upgrade_seq = tcp_sk(sk)->rcv_nxt; + + icsk = inet_csk(sk); + icsk->icsk_ext_hdr_len += psp_sk_overhead(sk); + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); + +exit_free_dummy: + kfree(dummy); +exit_unlock: + release_sock(sk); + return err; +} + +void psp_assocs_key_rotated(struct psp_dev *psd) +{ + struct psp_assoc *pas, *next; + + /* Mark the stale associations as invalid, they will no longer + * be able to Rx any traffic. + */ + list_for_each_entry_safe(pas, next, &psd->prev_assocs, assocs_list) + pas->generation |= ~PSP_GEN_VALID_MASK; + list_splice_init(&psd->prev_assocs, &psd->stale_assocs); + list_splice_init(&psd->active_assocs, &psd->prev_assocs); + + /* TODO: we should inform the sockets that got shut down */ +} + +void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk) +{ + struct psp_assoc *pas = psp_sk_assoc(sk); + + if (pas) + refcount_inc(&pas->refcnt); + rcu_assign_pointer(tw->psp_assoc, pas); + tw->tw_validate_xmit_skb = psp_validate_xmit; +} + +void psp_twsk_assoc_free(struct inet_timewait_sock *tw) +{ + struct psp_assoc *pas = rcu_dereference_protected(tw->psp_assoc, 1); + + rcu_assign_pointer(tw->psp_assoc, NULL); + psp_assoc_put(pas); +} + +void psp_reply_set_decrypted(struct sk_buff *skb) +{ + struct psp_assoc *pas; + + rcu_read_lock(); + pas = psp_sk_get_assoc_rcu(skb->sk); + if (pas && pas->tx.spi) + skb->decrypted = 1; + rcu_read_unlock(); +} +EXPORT_IPV6_MOD_GPL(psp_reply_set_decrypted); diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 086a13170e09..4a7217fbeab6 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -242,7 +242,7 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, if (rs->rs_snd_bytes < rds_sk_sndbuf(rs)) mask |= (EPOLLOUT | EPOLLWRNORM); if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) - mask |= POLLERR; + mask |= EPOLLERR; read_unlock_irqrestore(&rs->rs_recv_lock, flags); /* clear state any time we wake a seen-congested socket */ diff --git a/net/rds/connection.c b/net/rds/connection.c index d62f486ab29f..68bc88cce84e 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -57,16 +57,17 @@ static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr, static u32 rds6_hash_secret __read_mostly; static u32 rds_hash_secret __read_mostly; - u32 lhash, fhash, hash; + __be32 lhash, fhash; + u32 hash; net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret)); net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret)); - lhash = (__force u32)laddr->s6_addr32[3]; + lhash = laddr->s6_addr32[3]; #if IS_ENABLED(CONFIG_IPV6) - fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret); + fhash = (__force __be32)__ipv6_addr_jhash(faddr, rds6_hash_secret); #else - fhash = (__force u32)faddr->s6_addr32[3]; + fhash = faddr->s6_addr32[3]; #endif hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret); diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index ea5e9aee4959..5884de8c6f45 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -108,7 +108,6 @@ struct rds_ib_mr_pool { }; extern struct workqueue_struct *rds_ib_mr_wq; -extern bool prefer_frmr; struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev, int npages); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index d1cfceeff133..6585164c7059 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -672,7 +672,8 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev, int rds_ib_mr_init(void) { - rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0); + rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", + WQ_MEM_RECLAIM | WQ_PERCPU, 0); if (!rds_ib_mr_wq) return -ENOMEM; return 0; diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index e53b7f266bd7..4248dfa816eb 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1034,7 +1034,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic, rds_ib_stats_inc(s_ib_rx_ring_empty); if (rds_ib_ring_low(&ic->i_recv_ring)) { - rds_ib_recv_refill(conn, 0, GFP_NOWAIT | __GFP_NOWARN); + rds_ib_recv_refill(conn, 0, GFP_NOWAIT); rds_ib_stats_inc(s_ib_rx_refill_from_cq); } } diff --git a/net/rds/message.c b/net/rds/message.c index 7af59d2443e5..199a899a43e9 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -44,8 +44,8 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), [RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma), [RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest), -[RDS_EXTHDR_NPATHS] = sizeof(u16), -[RDS_EXTHDR_GEN_NUM] = sizeof(u32), +[RDS_EXTHDR_NPATHS] = sizeof(__be16), +[RDS_EXTHDR_GEN_NUM] = sizeof(__be32), }; void rds_message_addref(struct rds_message *rm) diff --git a/net/rds/rds.h b/net/rds/rds.h index dc360252c515..5b1c072e2e7f 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -93,7 +93,7 @@ enum { /* Max number of multipaths per RDS connection. Must be a power of 2 */ #define RDS_MPATH_WORKERS 8 -#define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \ +#define RDS_MPATH_HASH(rs, n) (jhash_1word(ntohs((rs)->rs_bound_port), \ (rs)->rs_hash_initval) & ((n) - 1)) #define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr)) diff --git a/net/rds/recv.c b/net/rds/recv.c index 5627f80013f8..66205d6924bf 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -202,8 +202,8 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr, unsigned int pos = 0, type, len; union { struct rds_ext_header_version version; - u16 rds_npaths; - u32 rds_gen_num; + __be16 rds_npaths; + __be32 rds_gen_num; } buffer; u32 new_peer_gen_num = 0; diff --git a/net/rds/send.c b/net/rds/send.c index 42d991bc8543..0b3d0ef2f008 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1454,8 +1454,8 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport, if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) && cp->cp_conn->c_trans->t_mp_capable) { - u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); - u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); + __be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS); + __be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_NPATHS, &npaths, diff --git a/net/rfkill/input.c b/net/rfkill/input.c index 598d0a61bda7..53d286b10843 100644 --- a/net/rfkill/input.c +++ b/net/rfkill/input.c @@ -159,7 +159,7 @@ static void rfkill_schedule_global_op(enum rfkill_sched_op op) rfkill_op_pending = true; if (op == RFKILL_GLOBAL_OP_EPO && !rfkill_is_epo_lock_active()) { /* bypass the limiter for EPO */ - mod_delayed_work(system_wq, &rfkill_op_work, 0); + mod_delayed_work(system_percpu_wq, &rfkill_op_work, 0); rfkill_last_scheduled = jiffies; } else rfkill_schedule_ratelimited(); diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c index 0377301156b0..2ea71e3831f7 100644 --- a/net/rxrpc/rxperf.c +++ b/net/rxrpc/rxperf.c @@ -630,7 +630,7 @@ static int __init rxperf_init(void) pr_info("Server registering\n"); - rxperf_workqueue = alloc_workqueue("rxperf", 0, 0); + rxperf_workqueue = alloc_workqueue("rxperf", WQ_PERCPU, 0); if (!rxperf_workqueue) goto error_workqueue; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 9e468e463467..ff6be5cfe2b0 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -1585,7 +1585,7 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, } _bstats_update(&a->tcfa_bstats, bytes, packets); - a->tcfa_qstats.drops += drops; + atomic_add(drops, &a->tcfa_drops); if (hw) _bstats_update(&a->tcfa_bstats_hw, bytes, packets); } @@ -1594,8 +1594,9 @@ EXPORT_SYMBOL(tcf_action_update_stats); int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, int compat_mode) { - int err = 0; + struct gnet_stats_queue qstats = {0}; struct gnet_dump d; + int err = 0; if (p == NULL) goto errout; @@ -1619,14 +1620,17 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, if (err < 0) goto errout; + qstats.drops = atomic_read(&p->tcfa_drops); + qstats.overlimits = atomic_read(&p->tcfa_overlimits); + if (gnet_stats_copy_basic(&d, p->cpu_bstats, &p->tcfa_bstats, false) < 0 || gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, &p->tcfa_bstats_hw, false) < 0 || gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || gnet_stats_copy_queue(&d, p->cpu_qstats, - &p->tcfa_qstats, - p->tcfa_qstats.qlen) < 0) + &qstats, + qstats.qlen) < 0) goto errout; if (gnet_stats_finish_copy(&d) < 0) diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index f3abe0545989..8e69a919b4fe 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -72,7 +72,6 @@ static int reset_policy(struct tc_action *a, const struct nlattr *defdata, d = to_defact(a); spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch); - memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); if (goto_ch) diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index dc0229693461..a9e0c1326e2a 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -27,19 +27,18 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb, struct tcf_result *res) { struct tcf_skbmod *d = to_skbmod(a); - int action, max_edit_len, err; struct tcf_skbmod_params *p; + int max_edit_len, err; u64 flags; tcf_lastuse_update(&d->tcf_tm); bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); - action = READ_ONCE(d->tcf_action); - if (unlikely(action == TC_ACT_SHOT)) + p = rcu_dereference_bh(d->skbmod_p); + if (unlikely(p->action == TC_ACT_SHOT)) goto drop; max_edit_len = skb_mac_header_len(skb); - p = rcu_dereference_bh(d->skbmod_p); flags = p->flags; /* tcf_skbmod_init() guarantees "flags" to be one of the following: @@ -85,7 +84,7 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb, INET_ECN_set_ce(skb); out: - return action; + return p->action; drop: qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats)); @@ -193,7 +192,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, } p->flags = lflags; - + p->action = parm->action; if (ovr) spin_lock_bh(&d->tcf_lock); /* Protected by tcf_lock if overwriting existing action. */ @@ -248,10 +247,9 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, opt.index = d->tcf_index; opt.refcnt = refcount_read(&d->tcf_refcnt) - ref; opt.bindcnt = atomic_read(&d->tcf_bindcnt) - bind; - spin_lock_bh(&d->tcf_lock); - opt.action = d->tcf_action; - p = rcu_dereference_protected(d->skbmod_p, - lockdep_is_held(&d->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(d->skbmod_p); + opt.action = p->action; opt.flags = p->flags; if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -269,10 +267,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD)) goto nla_put_failure; - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&d->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 2cef4b08befb..876b30c5709e 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -29,13 +29,11 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb, { struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params; - int action; params = rcu_dereference_bh(t->params); tcf_lastuse_update(&t->tcf_tm); tcf_action_update_bstats(&t->common, skb); - action = READ_ONCE(t->tcf_action); switch (params->tcft_action) { case TCA_TUNNEL_KEY_ACT_RELEASE: @@ -51,7 +49,7 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb, break; } - return action; + return params->action; } static const struct nla_policy @@ -532,6 +530,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, params_new->tcft_action = parm->t_action; params_new->tcft_enc_metadata = metadata; + params_new->action = parm->action; spin_lock_bh(&t->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); params_new = rcu_replace_pointer(t->params, params_new, @@ -726,10 +725,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t tm; - spin_lock_bh(&t->tcf_lock); - params = rcu_dereference_protected(t->params, - lockdep_is_held(&t->tcf_lock)); - opt.action = t->tcf_action; + rcu_read_lock(); + params = rcu_dereference(t->params); + opt.action = params->action; opt.t_action = params->tcft_action; if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt)) @@ -766,12 +764,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a, if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm), &tm, TCA_TUNNEL_KEY_PAD)) goto nla_put_failure; - spin_unlock_bh(&t->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&t->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 383bf18b6862..a74621797d69 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -25,7 +25,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb, { struct tcf_vlan *v = to_vlan(a); struct tcf_vlan_params *p; - int action; int err; u16 tci; @@ -38,8 +37,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb, if (skb_at_tc_ingress(skb)) skb_push_rcsum(skb, skb->mac_len); - action = READ_ONCE(v->tcf_action); - p = rcu_dereference_bh(v->vlan_p); switch (p->tcfv_action) { @@ -97,7 +94,7 @@ out: skb_pull_rcsum(skb, skb->mac_len); skb_reset_mac_len(skb); - return action; + return p->action; drop: tcf_action_inc_drop_qstats(&v->common); @@ -255,6 +252,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, ETH_ALEN); } + p->action = parm->action; spin_lock_bh(&v->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock)); @@ -297,9 +295,9 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, }; struct tcf_t t; - spin_lock_bh(&v->tcf_lock); - opt.action = v->tcf_action; - p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock)); + rcu_read_lock(); + p = rcu_dereference(v->vlan_p); + opt.action = p->action; opt.v_action = p->tcfv_action; if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt)) goto nla_put_failure; @@ -325,12 +323,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a, tcf_tm_dump(&t, &v->tcf_tm); if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD)) goto nla_put_failure; - spin_unlock_bh(&v->tcf_lock); + rcu_read_unlock(); return skb->len; nla_put_failure: - spin_unlock_bh(&v->tcf_lock); + rcu_read_unlock(); nlmsg_trim(skb, b); return -1; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index d7c767b861a4..1e058b46d3e1 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -431,7 +431,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) && - !memcmp(&rtab->data, nla_data(tab), 1024)) { + !memcmp(&rtab->data, nla_data(tab), TC_RTAB_SIZE)) { rtab->refcnt++; return rtab; } @@ -441,7 +441,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, if (rtab) { rtab->rate = *r; rtab->refcnt = 1; - memcpy(rtab->data, nla_data(tab), 1024); + memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE); if (r->linklayer == TC_LINKLAYER_UNAWARE) r->linklayer = __detect_linklayer(r, rtab->data); rtab->next = qdisc_rtab_list; diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig index 24d5a35ce894..e947646a380c 100644 --- a/net/sctp/Kconfig +++ b/net/sctp/Kconfig @@ -7,9 +7,9 @@ menuconfig IP_SCTP tristate "The SCTP Protocol" depends on INET depends on IPV6 || IPV6=n - select CRYPTO - select CRYPTO_HMAC - select CRYPTO_SHA1 + select CRYPTO_LIB_SHA1 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_UTILS select NET_CRC32C select NET_UDP_TUNNEL help @@ -49,46 +49,25 @@ config SCTP_DBG_OBJCNT 'cat /proc/net/sctp/sctp_dbg_objcnt' If unsure, say N + choice - prompt "Default SCTP cookie HMAC encoding" - default SCTP_DEFAULT_COOKIE_HMAC_MD5 + prompt "Default SCTP cookie authentication method" + default SCTP_DEFAULT_COOKIE_HMAC_SHA256 help - This option sets the default sctp cookie hmac algorithm - when in doubt select 'md5' + This option sets the default SCTP cookie authentication method, for + when a method hasn't been explicitly selected via the + net.sctp.cookie_hmac_alg sysctl. -config SCTP_DEFAULT_COOKIE_HMAC_MD5 - bool "Enable optional MD5 hmac cookie generation" - help - Enable optional MD5 hmac based SCTP cookie generation - select SCTP_COOKIE_HMAC_MD5 + If unsure, choose the default (HMAC-SHA256). -config SCTP_DEFAULT_COOKIE_HMAC_SHA1 - bool "Enable optional SHA1 hmac cookie generation" - help - Enable optional SHA1 hmac based SCTP cookie generation - select SCTP_COOKIE_HMAC_SHA1 +config SCTP_DEFAULT_COOKIE_HMAC_SHA256 + bool "HMAC-SHA256" config SCTP_DEFAULT_COOKIE_HMAC_NONE - bool "Use no hmac alg in SCTP cookie generation" - help - Use no hmac algorithm in SCTP cookie generation + bool "None" endchoice -config SCTP_COOKIE_HMAC_MD5 - bool "Enable optional MD5 hmac cookie generation" - help - Enable optional MD5 hmac based SCTP cookie generation - select CRYPTO_HMAC if SCTP_COOKIE_HMAC_MD5 - select CRYPTO_MD5 if SCTP_COOKIE_HMAC_MD5 - -config SCTP_COOKIE_HMAC_SHA1 - bool "Enable optional SHA1 hmac cookie generation" - help - Enable optional SHA1 hmac based SCTP cookie generation - select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1 - select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1 - config INET_SCTP_DIAG depends on INET_DIAG def_tristate INET_DIAG diff --git a/net/sctp/auth.c b/net/sctp/auth.c index c58fffc86a0c..82aad477590e 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -12,36 +12,37 @@ * Vlad Yasevich <vladislav.yasevich@hp.com> */ -#include <crypto/hash.h> +#include <crypto/sha1.h> +#include <crypto/sha2.h> #include <linux/slab.h> #include <linux/types.h> -#include <linux/scatterlist.h> #include <net/sctp/sctp.h> #include <net/sctp/auth.h> -static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { +static const struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = { { /* id 0 is reserved. as all 0 */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0, }, { .hmac_id = SCTP_AUTH_HMAC_ID_SHA1, - .hmac_name = "hmac(sha1)", - .hmac_len = SCTP_SHA1_SIG_SIZE, + .hmac_len = SHA1_DIGEST_SIZE, }, { /* id 2 is reserved as well */ .hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2, }, -#if IS_ENABLED(CONFIG_CRYPTO_SHA256) { .hmac_id = SCTP_AUTH_HMAC_ID_SHA256, - .hmac_name = "hmac(sha256)", - .hmac_len = SCTP_SHA256_SIG_SIZE, + .hmac_len = SHA256_DIGEST_SIZE, } -#endif }; +static bool sctp_hmac_supported(__u16 hmac_id) +{ + return hmac_id < ARRAY_SIZE(sctp_hmac_list) && + sctp_hmac_list[hmac_id].hmac_len != 0; +} void sctp_auth_key_put(struct sctp_auth_bytes *key) { @@ -444,76 +445,7 @@ struct sctp_shared_key *sctp_auth_get_shkey( return NULL; } -/* - * Initialize all the possible digest transforms that we can use. Right - * now, the supported digests are SHA1 and SHA256. We do this here once - * because of the restrictiong that transforms may only be allocated in - * user context. This forces us to pre-allocated all possible transforms - * at the endpoint init time. - */ -int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp) -{ - struct crypto_shash *tfm = NULL; - __u16 id; - - /* If the transforms are already allocated, we are done */ - if (ep->auth_hmacs) - return 0; - - /* Allocated the array of pointers to transorms */ - ep->auth_hmacs = kcalloc(SCTP_AUTH_NUM_HMACS, - sizeof(struct crypto_shash *), - gfp); - if (!ep->auth_hmacs) - return -ENOMEM; - - for (id = 0; id < SCTP_AUTH_NUM_HMACS; id++) { - - /* See is we support the id. Supported IDs have name and - * length fields set, so that we can allocated and use - * them. We can safely just check for name, for without the - * name, we can't allocate the TFM. - */ - if (!sctp_hmac_list[id].hmac_name) - continue; - - /* If this TFM has been allocated, we are all set */ - if (ep->auth_hmacs[id]) - continue; - - /* Allocate the ID */ - tfm = crypto_alloc_shash(sctp_hmac_list[id].hmac_name, 0, 0); - if (IS_ERR(tfm)) - goto out_err; - - ep->auth_hmacs[id] = tfm; - } - - return 0; - -out_err: - /* Clean up any successful allocations */ - sctp_auth_destroy_hmacs(ep->auth_hmacs); - ep->auth_hmacs = NULL; - return -ENOMEM; -} - -/* Destroy the hmac tfm array */ -void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[]) -{ - int i; - - if (!auth_hmacs) - return; - - for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) { - crypto_free_shash(auth_hmacs[i]); - } - kfree(auth_hmacs); -} - - -struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) +const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) { return &sctp_hmac_list[hmac_id]; } @@ -521,7 +453,8 @@ struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id) /* Get an hmac description information that we can use to build * the AUTH chunk */ -struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) +const struct sctp_hmac * +sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) { struct sctp_hmac_algo_param *hmacs; __u16 n_elt; @@ -543,26 +476,10 @@ struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc) sizeof(struct sctp_paramhdr)) >> 1; for (i = 0; i < n_elt; i++) { id = ntohs(hmacs->hmac_ids[i]); - - /* Check the id is in the supported range. And - * see if we support the id. Supported IDs have name and - * length fields set, so that we can allocate and use - * them. We can safely just check for name, for without the - * name, we can't allocate the TFM. - */ - if (id > SCTP_AUTH_HMAC_ID_MAX || - !sctp_hmac_list[id].hmac_name) { - id = 0; - continue; - } - - break; + if (sctp_hmac_supported(id)) + return &sctp_hmac_list[id]; } - - if (id == 0) - return NULL; - - return &sctp_hmac_list[id]; + return NULL; } static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id) @@ -606,7 +523,6 @@ int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc, void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, struct sctp_hmac_algo_param *hmacs) { - struct sctp_endpoint *ep; __u16 id; int i; int n_params; @@ -617,16 +533,9 @@ void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc, n_params = (ntohs(hmacs->param_hdr.length) - sizeof(struct sctp_paramhdr)) >> 1; - ep = asoc->ep; for (i = 0; i < n_params; i++) { id = ntohs(hmacs->hmac_ids[i]); - - /* Check the id is in the supported range */ - if (id > SCTP_AUTH_HMAC_ID_MAX) - continue; - - /* If this TFM has been allocated, use this id */ - if (ep->auth_hmacs[id]) { + if (sctp_hmac_supported(id)) { asoc->default_hmac_id = id; break; } @@ -709,10 +618,9 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, struct sctp_shared_key *ep_key, gfp_t gfp) { struct sctp_auth_bytes *asoc_key; - struct crypto_shash *tfm; __u16 key_id, hmac_id; - unsigned char *end; int free_key = 0; + size_t data_len; __u8 *digest; /* Extract the info we need: @@ -733,19 +641,17 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc, free_key = 1; } - /* set up scatter list */ - end = skb_tail_pointer(skb); - - tfm = asoc->ep->auth_hmacs[hmac_id]; - + data_len = skb_tail_pointer(skb) - (unsigned char *)auth; digest = (u8 *)(&auth->auth_hdr + 1); - if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len)) - goto free; - - crypto_shash_tfm_digest(tfm, (u8 *)auth, end - (unsigned char *)auth, - digest); + if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1) { + hmac_sha1_usingrawkey(asoc_key->data, asoc_key->len, + (const u8 *)auth, data_len, digest); + } else { + WARN_ON_ONCE(hmac_id != SCTP_AUTH_HMAC_ID_SHA256); + hmac_sha256_usingrawkey(asoc_key->data, asoc_key->len, + (const u8 *)auth, data_len, digest); + } -free: if (free_key) sctp_auth_key_put(asoc_key); } @@ -788,14 +694,11 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep, for (i = 0; i < hmacs->shmac_num_idents; i++) { id = hmacs->shmac_idents[i]; - if (id > SCTP_AUTH_HMAC_ID_MAX) + if (!sctp_hmac_supported(id)) return -EOPNOTSUPP; if (SCTP_AUTH_HMAC_ID_SHA1 == id) has_sha1 = 1; - - if (!sctp_hmac_list[id].hmac_name) - return -EOPNOTSUPP; } if (!has_sha1) @@ -1021,8 +924,6 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep, int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) { - int err = -ENOMEM; - /* Allocate space for HMACS and CHUNKS authentication * variables. There are arrays that we encode directly * into parameters to make the rest of the operations easier. @@ -1060,13 +961,6 @@ int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp) ep->auth_chunk_list = auth_chunks; } - /* Allocate and initialize transorms arrays for supported - * HMACs. - */ - err = sctp_auth_init_hmacs(ep, gfp); - if (err) - goto nomem; - return 0; nomem: @@ -1075,7 +969,7 @@ nomem: kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; - return err; + return -ENOMEM; } void sctp_auth_free(struct sctp_endpoint *ep) @@ -1084,6 +978,4 @@ void sctp_auth_free(struct sctp_endpoint *ep) kfree(ep->auth_chunk_list); ep->auth_hmacs_list = NULL; ep->auth_chunk_list = NULL; - sctp_auth_destroy_hmacs(ep->auth_hmacs); - ep->auth_hmacs = NULL; } diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index fd4f8243cc35..c655b571ca01 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -184,7 +184,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, * DATA. */ if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) { - struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc); + const struct sctp_hmac *hmac_desc = + sctp_auth_asoc_get_hmac(asoc); if (hmac_desc) max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) + diff --git a/net/sctp/diag.c b/net/sctp/diag.c index 23359e522273..996c2018f0e6 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -173,7 +173,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc, mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0) goto errout; diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index 7e77b450697c..31e989dfe846 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -35,6 +35,15 @@ /* Forward declarations for internal helpers. */ static void sctp_endpoint_bh_rcv(struct work_struct *work); +static void gen_cookie_auth_key(struct hmac_sha256_key *key) +{ + u8 raw_key[SCTP_COOKIE_KEY_SIZE]; + + get_random_bytes(raw_key, sizeof(raw_key)); + hmac_sha256_preparekey(key, raw_key, sizeof(raw_key)); + memzero_explicit(raw_key, sizeof(raw_key)); +} + /* * Initialize the base fields of the endpoint structure. */ @@ -45,10 +54,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, struct net *net = sock_net(sk); struct sctp_shared_key *null_key; - ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp); - if (!ep->digest) - return NULL; - ep->asconf_enable = net->sctp.addip_enable; ep->auth_enable = net->sctp.auth_enable; if (ep->auth_enable) { @@ -90,8 +95,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, /* Get the receive buffer policy for this endpoint */ ep->rcvbuf_policy = net->sctp.rcvbuf_policy; - /* Initialize the secret key used with cookie. */ - get_random_bytes(ep->secret_key, sizeof(ep->secret_key)); + /* Generate the cookie authentication key. */ + gen_cookie_auth_key(&ep->cookie_auth_key); /* SCTP-AUTH extensions*/ INIT_LIST_HEAD(&ep->endpoint_shared_keys); @@ -118,7 +123,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, nomem_shkey: sctp_auth_free(ep); nomem: - kfree(ep->digest); return NULL; } @@ -205,9 +209,6 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) return; } - /* Free the digest buffer */ - kfree(ep->digest); - /* SCTP-AUTH: Free up AUTH releated data such as shared keys * chunks and hmacs arrays that were allocated */ @@ -218,7 +219,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep) sctp_inq_free(&ep->base.inqueue); sctp_bind_addr_free(&ep->base.bind_addr); - memset(ep->secret_key, 0, sizeof(ep->secret_key)); + memzero_explicit(&ep->cookie_auth_key, sizeof(ep->cookie_auth_key)); sk = ep->base.sk; /* Remove and free the port */ diff --git a/net/sctp/proc.c b/net/sctp/proc.c index 74bff317e205..1ed281f3c355 100644 --- a/net/sctp/proc.c +++ b/net/sctp/proc.c @@ -52,21 +52,21 @@ static const struct snmp_mib sctp_snmp_list[] = { SNMP_MIB_ITEM("SctpInPktBacklog", SCTP_MIB_IN_PKT_BACKLOG), SNMP_MIB_ITEM("SctpInPktDiscards", SCTP_MIB_IN_PKT_DISCARDS), SNMP_MIB_ITEM("SctpInDataChunkDiscards", SCTP_MIB_IN_DATA_CHUNK_DISCARDS), - SNMP_MIB_SENTINEL }; /* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */ static int sctp_snmp_seq_show(struct seq_file *seq, void *v) { - unsigned long buff[SCTP_MIB_MAX]; + unsigned long buff[ARRAY_SIZE(sctp_snmp_list)]; + const int cnt = ARRAY_SIZE(sctp_snmp_list); struct net *net = seq->private; int i; - memset(buff, 0, sizeof(unsigned long) * SCTP_MIB_MAX); + memset(buff, 0, sizeof(buff)); - snmp_get_cpu_field_batch(buff, sctp_snmp_list, - net->sctp.sctp_statistics); - for (i = 0; sctp_snmp_list[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, sctp_snmp_list, cnt, + net->sctp.sctp_statistics); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name, buff[i]); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index a5ccada55f2b..9dbc24af749b 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -34,6 +34,7 @@ #include <linux/memblock.h> #include <linux/highmem.h> #include <linux/slab.h> +#include <net/flow.h> #include <net/net_namespace.h> #include <net/protocol.h> #include <net/ip.h> @@ -437,7 +438,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, fl4->fl4_dport = daddr->v4.sin_port; fl4->flowi4_proto = IPPROTO_SCTP; if (asoc) { - fl4->flowi4_tos = inet_dscp_to_dsfield(dscp); + fl4->flowi4_dscp = dscp; fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk); fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if; fl4->fl4_sport = htons(asoc->base.bind_addr.port); @@ -1334,14 +1335,9 @@ static int __net_init sctp_defaults_init(struct net *net) /* Whether Cookie Preservative is enabled(1) or not(0) */ net->sctp.cookie_preserve_enable = 1; - /* Default sctp sockets to use md5 as their hmac alg */ -#if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5) - net->sctp.sctp_hmac_alg = "md5"; -#elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1) - net->sctp.sctp_hmac_alg = "sha1"; -#else - net->sctp.sctp_hmac_alg = NULL; -#endif + /* Whether cookie authentication is enabled(1) or not(0) */ + net->sctp.cookie_auth_enable = + !IS_ENABLED(CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE); /* Max.Burst - 4 */ net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST; diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 3ead591c72fd..2c0017d058d4 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -30,7 +30,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <crypto/hash.h> +#include <crypto/utils.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> @@ -1319,7 +1319,7 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc, __u16 key_id) { struct sctp_authhdr auth_hdr; - struct sctp_hmac *hmac_desc; + const struct sctp_hmac *hmac_desc; struct sctp_chunk *retval; /* Get the first hmac that the peer told us to use */ @@ -1674,8 +1674,10 @@ static struct sctp_cookie_param *sctp_pack_cookie( * out on the network. */ retval = kzalloc(*cookie_len, GFP_ATOMIC); - if (!retval) - goto nodata; + if (!retval) { + *cookie_len = 0; + return NULL; + } cookie = (struct sctp_signed_cookie *) retval->body; @@ -1706,26 +1708,14 @@ static struct sctp_cookie_param *sctp_pack_cookie( memcpy((__u8 *)(cookie + 1) + ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len); - if (sctp_sk(ep->base.sk)->hmac) { - struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; - int err; - - /* Sign the message. */ - err = crypto_shash_setkey(tfm, ep->secret_key, - sizeof(ep->secret_key)) ?: - crypto_shash_tfm_digest(tfm, (u8 *)&cookie->c, bodysize, - cookie->signature); - if (err) - goto free_cookie; + /* Sign the cookie, if cookie authentication is enabled. */ + if (sctp_sk(ep->base.sk)->cookie_auth_enable) { + static_assert(sizeof(cookie->mac) == SHA256_DIGEST_SIZE); + hmac_sha256(&ep->cookie_auth_key, (const u8 *)&cookie->c, + bodysize, cookie->mac); } return retval; - -free_cookie: - kfree(retval); -nodata: - *cookie_len = 0; - return NULL; } /* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */ @@ -1740,7 +1730,6 @@ struct sctp_association *sctp_unpack_cookie( struct sctp_signed_cookie *cookie; struct sk_buff *skb = chunk->skb; struct sctp_cookie *bear_cookie; - __u8 *digest = ep->digest; enum sctp_scope scope; unsigned int len; ktime_t kt; @@ -1770,30 +1759,19 @@ struct sctp_association *sctp_unpack_cookie( cookie = chunk->subh.cookie_hdr; bear_cookie = &cookie->c; - if (!sctp_sk(ep->base.sk)->hmac) - goto no_hmac; + /* Verify the cookie's MAC, if cookie authentication is enabled. */ + if (sctp_sk(ep->base.sk)->cookie_auth_enable) { + u8 mac[SHA256_DIGEST_SIZE]; - /* Check the signature. */ - { - struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac; - int err; - - err = crypto_shash_setkey(tfm, ep->secret_key, - sizeof(ep->secret_key)) ?: - crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize, - digest); - if (err) { - *error = -SCTP_IERROR_NOMEM; + hmac_sha256(&ep->cookie_auth_key, (const u8 *)bear_cookie, + bodysize, mac); + static_assert(sizeof(cookie->mac) == sizeof(mac)); + if (crypto_memneq(mac, cookie->mac, sizeof(mac))) { + *error = -SCTP_IERROR_BAD_SIG; goto fail; } } - if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { - *error = -SCTP_IERROR_BAD_SIG; - goto fail; - } - -no_hmac: /* IG Section 2.35.2: * 3) Compare the port numbers and the verification tag contained * within the COOKIE ECHO chunk to the actual port numbers and the diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index a0524ba8d787..4cb8f393434d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -30,6 +30,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <crypto/utils.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> @@ -4361,7 +4362,7 @@ static enum sctp_ierror sctp_sf_authenticate( struct sctp_shared_key *sh_key = NULL; struct sctp_authhdr *auth_hdr; __u8 *save_digest, *digest; - struct sctp_hmac *hmac; + const struct sctp_hmac *hmac; unsigned int sig_len; __u16 key_id; @@ -4416,7 +4417,7 @@ static enum sctp_ierror sctp_sf_authenticate( sh_key, GFP_ATOMIC); /* Discard the packet if the digests do not match */ - if (memcmp(save_digest, digest, sig_len)) { + if (crypto_memneq(save_digest, digest, sig_len)) { kfree(save_digest); return SCTP_IERROR_BAD_SIG; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 4921416434f9..ed8293a34240 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -37,7 +37,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <crypto/hash.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> @@ -4987,7 +4986,7 @@ static int sctp_init_sock(struct sock *sk) sp->default_rcv_context = 0; sp->max_burst = net->sctp.max_burst; - sp->sctp_hmac_alg = net->sctp.sctp_hmac_alg; + sp->cookie_auth_enable = net->sctp.cookie_auth_enable; /* Initialize default setup parameters. These parameters * can be modified with the SCTP_INITMSG socket option or @@ -5079,8 +5078,6 @@ static int sctp_init_sock(struct sock *sk) if (!sp->ep) return -ENOMEM; - sp->hmac = NULL; - sk->sk_destruct = sctp_destruct_sock; SCTP_DBG_OBJCNT_INC(sock); @@ -5117,18 +5114,8 @@ static void sctp_destroy_sock(struct sock *sk) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); } -/* Triggered when there are no references on the socket anymore */ -static void sctp_destruct_common(struct sock *sk) -{ - struct sctp_sock *sp = sctp_sk(sk); - - /* Free up the HMAC transform. */ - crypto_free_shash(sp->hmac); -} - static void sctp_destruct_sock(struct sock *sk) { - sctp_destruct_common(sk); inet_sock_destruct(sk); } @@ -8530,22 +8517,8 @@ static int sctp_listen_start(struct sock *sk, int backlog) { struct sctp_sock *sp = sctp_sk(sk); struct sctp_endpoint *ep = sp->ep; - struct crypto_shash *tfm = NULL; - char alg[32]; int err; - /* Allocate HMAC for generating cookie. */ - if (!sp->hmac && sp->sctp_hmac_alg) { - sprintf(alg, "hmac(%s)", sp->sctp_hmac_alg); - tfm = crypto_alloc_shash(alg, 0, 0); - if (IS_ERR(tfm)) { - net_info_ratelimited("failed to load transform for %s: %ld\n", - sp->sctp_hmac_alg, PTR_ERR(tfm)); - return -ENOSYS; - } - sctp_sk(sk)->hmac = tfm; - } - /* * If a bind() or sctp_bindx() is not called prior to a listen() * call that allows new associations to be accepted, the system @@ -9561,7 +9534,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, * copy. */ newsp->ep = newep; - newsp->hmac = NULL; /* Hook this new socket in to the bind_hash list. */ head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk), @@ -9581,16 +9553,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, if (err) return err; - /* New ep's auth_hmacs should be set if old ep's is set, in case - * that net->sctp.auth_enable has been changed to 0 by users and - * new ep's auth_hmacs couldn't be set in sctp_endpoint_init(). - */ - if (oldsp->ep->auth_hmacs) { - err = sctp_auth_init_hmacs(newsp->ep, GFP_KERNEL); - if (err) - return err; - } - sctp_auto_asconf_init(newsp); /* Move any messages in the old socket's receive queue that are for the @@ -9723,7 +9685,6 @@ struct proto sctp_prot = { static void sctp_v6_destruct_sock(struct sock *sk) { - sctp_destruct_common(sk); inet6_sock_destruct(sk); } diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index ee3eac338a9d..15e7db9a3ab2 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -174,7 +174,7 @@ static struct ctl_table sctp_net_table[] = { }, { .procname = "cookie_hmac_alg", - .data = &init_net.sctp.sctp_hmac_alg, + .data = &init_net.sctp.cookie_auth_enable, .maxlen = 8, .mode = 0644, .proc_handler = proc_sctp_do_hmac_alg, @@ -388,10 +388,8 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, - sctp.sctp_hmac_alg); + sctp.cookie_auth_enable); struct ctl_table tbl; - bool changed = false; - char *none = "none"; char tmp[8] = {0}; int ret; @@ -399,35 +397,26 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write, if (write) { tbl.data = tmp; - tbl.maxlen = sizeof(tmp); - } else { - tbl.data = net->sctp.sctp_hmac_alg ? : none; - tbl.maxlen = strlen(tbl.data); - } - - ret = proc_dostring(&tbl, write, buffer, lenp, ppos); - if (write && ret == 0) { -#ifdef CONFIG_CRYPTO_MD5 - if (!strncmp(tmp, "md5", 3)) { - net->sctp.sctp_hmac_alg = "md5"; - changed = true; + tbl.maxlen = sizeof(tmp) - 1; + ret = proc_dostring(&tbl, 1, buffer, lenp, ppos); + if (ret) + return ret; + if (!strcmp(tmp, "sha256")) { + net->sctp.cookie_auth_enable = 1; + return 0; } -#endif -#ifdef CONFIG_CRYPTO_SHA1 - if (!strncmp(tmp, "sha1", 4)) { - net->sctp.sctp_hmac_alg = "sha1"; - changed = true; + if (!strcmp(tmp, "none")) { + net->sctp.cookie_auth_enable = 0; + return 0; } -#endif - if (!strncmp(tmp, "none", 4)) { - net->sctp.sctp_hmac_alg = NULL; - changed = true; - } - if (!changed) - ret = -EINVAL; + return -EINVAL; } - - return ret; + if (net->sctp.cookie_auth_enable) + tbl.data = (char *)"sha256"; + else + tbl.data = (char *)"none"; + tbl.maxlen = strlen(tbl.data); + return proc_dostring(&tbl, 0, buffer, lenp, ppos); } static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write, diff --git a/net/smc/Kconfig b/net/smc/Kconfig index ba5e6a2dd2fd..99ecd59d1f4b 100644 --- a/net/smc/Kconfig +++ b/net/smc/Kconfig @@ -1,8 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config SMC tristate "SMC socket protocol family" - depends on INET && INFINIBAND - depends on m || ISM != m + depends on INET && INFINIBAND && DIBS help SMC-R provides a "sockets over RDMA" solution making use of RDMA over Converged Ethernet (RoCE) technology to upgrade @@ -20,16 +19,3 @@ config SMC_DIAG smcss. if unsure, say Y. - -config SMC_LO - bool "SMC intra-OS shortcut with loopback-ism" - depends on SMC - default n - help - SMC_LO enables the creation of an Emulated-ISM device named - loopback-ism in SMC and makes use of it for transferring data - when communication occurs within the same OS. This helps in - convenient testing of SMC-D since loopback-ism is independent - of architecture or hardware. - - if unsure, say N. diff --git a/net/smc/Makefile b/net/smc/Makefile index 60f1c87d5212..0e754cbc38f9 100644 --- a/net/smc/Makefile +++ b/net/smc/Makefile @@ -6,4 +6,3 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o smc-y += smc_tracepoint.o smc_inet.o smc-$(CONFIG_SYSCTL) += smc_sysctl.o -smc-$(CONFIG_SMC_LO) += smc_loopback.o diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e0e48f24cd61..77b99e8ef35a 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -57,7 +57,6 @@ #include "smc_stats.h" #include "smc_tracepoint.h" #include "smc_sysctl.h" -#include "smc_loopback.h" #include "smc_inet.h" static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group @@ -1097,8 +1096,7 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc, } /* Check for VLAN ID and register it on ISM device just for CLC handshake */ -static int smc_connect_ism_vlan_setup(struct smc_sock *smc, - struct smc_init_info *ini) +static int smc_connect_ism_vlan_setup(struct smc_init_info *ini) { if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id)) return SMC_CLC_DECL_ISMVLANERR; @@ -1113,7 +1111,7 @@ static int smc_find_proposal_devices(struct smc_sock *smc, /* check if there is an ism device available */ if (!(ini->smcd_version & SMC_V1) || smc_find_ism_device(smc, ini) || - smc_connect_ism_vlan_setup(smc, ini)) + smc_connect_ism_vlan_setup(ini)) ini->smcd_version &= ~SMC_V1; /* else ISM V1 is supported for this connection */ @@ -1158,8 +1156,7 @@ static int smc_find_proposal_devices(struct smc_sock *smc, /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is * used, the VLAN ID will be registered again during the connection setup. */ -static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, - struct smc_init_info *ini) +static int smc_connect_ism_vlan_cleanup(struct smc_init_info *ini) { if (!smcd_indicated(ini->smc_type_v1)) return 0; @@ -1582,13 +1579,13 @@ static int __smc_connect(struct smc_sock *smc) goto vlan_cleanup; SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc); - smc_connect_ism_vlan_cleanup(smc, ini); + smc_connect_ism_vlan_cleanup(ini); kfree(buf); kfree(ini); return 0; vlan_cleanup: - smc_connect_ism_vlan_cleanup(smc, ini); + smc_connect_ism_vlan_cleanup(ini); kfree(buf); fallback: kfree(ini); @@ -3537,15 +3534,15 @@ static int __init smc_init(void) rc = -ENOMEM; - smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0); + smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", WQ_PERCPU, 0); if (!smc_tcp_ls_wq) goto out_pnet; - smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0); + smc_hs_wq = alloc_workqueue("smc_hs_wq", WQ_PERCPU, 0); if (!smc_hs_wq) goto out_alloc_tcp_ls_wq; - smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0); + smc_close_wq = alloc_workqueue("smc_close_wq", WQ_PERCPU, 0); if (!smc_close_wq) goto out_alloc_hs_wq; @@ -3593,16 +3590,10 @@ static int __init smc_init(void) goto out_sock; } - rc = smc_loopback_init(); - if (rc) { - pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc); - goto out_ib; - } - rc = tcp_register_ulp(&smc_ulp_ops); if (rc) { pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc); - goto out_lo; + goto out_ib; } rc = smc_inet_init(); if (rc) { @@ -3613,8 +3604,6 @@ static int __init smc_init(void) return 0; out_ulp: tcp_unregister_ulp(&smc_ulp_ops); -out_lo: - smc_loopback_exit(); out_ib: smc_ib_unregister_client(); out_sock: @@ -3653,7 +3642,6 @@ static void __exit smc_exit(void) tcp_unregister_ulp(&smc_ulp_ops); sock_unregister(PF_SMC); smc_core_exit(); - smc_loopback_exit(); smc_ib_unregister_client(); smc_ism_exit(); destroy_workqueue(smc_close_wq); diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c index 08be56dfb3f2..157aace169d4 100644 --- a/net/smc/smc_clc.c +++ b/net/smc/smc_clc.c @@ -509,10 +509,10 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl) } /* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */ -static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, +static int smc_clc_prfx_set4_rcu(struct net_device *dev, __be32 ipv4, struct smc_clc_msg_proposal_prefix *prop) { - struct in_device *in_dev = __in_dev_get_rcu(dst->dev); + struct in_device *in_dev = __in_dev_get_rcu(dev); const struct in_ifaddr *ifa; if (!in_dev) @@ -530,12 +530,12 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4, } /* fill CLC proposal msg with ipv6 prefixes from device */ -static int smc_clc_prfx_set6_rcu(struct dst_entry *dst, +static int smc_clc_prfx_set6_rcu(struct net_device *dev, struct smc_clc_msg_proposal_prefix *prop, struct smc_clc_ipv6_prefix *ipv6_prfx) { #if IS_ENABLED(CONFIG_IPV6) - struct inet6_dev *in6_dev = __in6_dev_get(dst->dev); + struct inet6_dev *in6_dev = __in6_dev_get(dev); struct inet6_ifaddr *ifa; int cnt = 0; @@ -564,41 +564,44 @@ static int smc_clc_prfx_set(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop, struct smc_clc_ipv6_prefix *ipv6_prfx) { - struct dst_entry *dst = sk_dst_get(clcsock->sk); struct sockaddr_storage addrs; struct sockaddr_in6 *addr6; struct sockaddr_in *addr; + struct net_device *dev; + struct dst_entry *dst; int rc = -ENOENT; - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { - rc = -ENODEV; - goto out_rel; - } /* get address to which the internal TCP socket is bound */ if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0) - goto out_rel; + goto out; + /* analyze IP specific data of net_device belonging to TCP socket */ addr6 = (struct sockaddr_in6 *)&addrs; + rcu_read_lock(); + + dst = __sk_dst_get(clcsock->sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + if (!dev) { + rc = -ENODEV; + goto out_unlock; + } + if (addrs.ss_family == PF_INET) { /* IPv4 */ addr = (struct sockaddr_in *)&addrs; - rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop); + rc = smc_clc_prfx_set4_rcu(dev, addr->sin_addr.s_addr, prop); } else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) { /* mapped IPv4 address - peer is IPv4 only */ - rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3], + rc = smc_clc_prfx_set4_rcu(dev, addr6->sin6_addr.s6_addr32[3], prop); } else { /* IPv6 */ - rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx); + rc = smc_clc_prfx_set6_rcu(dev, prop, ipv6_prfx); } + +out_unlock: rcu_read_unlock(); -out_rel: - dst_release(dst); out: return rc; } @@ -654,26 +657,26 @@ static int smc_clc_prfx_match6_rcu(struct net_device *dev, int smc_clc_prfx_match(struct socket *clcsock, struct smc_clc_msg_proposal_prefix *prop) { - struct dst_entry *dst = sk_dst_get(clcsock->sk); + struct net_device *dev; + struct dst_entry *dst; int rc; - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { + rcu_read_lock(); + + dst = __sk_dst_get(clcsock->sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + if (!dev) { rc = -ENODEV; - goto out_rel; + goto out; } - rcu_read_lock(); + if (!prop->ipv6_prefixes_cnt) - rc = smc_clc_prfx_match4_rcu(dst->dev, prop); + rc = smc_clc_prfx_match4_rcu(dev, prop); else - rc = smc_clc_prfx_match6_rcu(dst->dev, prop); - rcu_read_unlock(); -out_rel: - dst_release(dst); + rc = smc_clc_prfx_match6_rcu(dev, prop); out: + rcu_read_unlock(); + return rc; } @@ -913,7 +916,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) /* add SMC-D specifics */ if (ini->ism_dev[0]) { smcd = ini->ism_dev[0]; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); pclc_smcd->ism.gid = htonll(smcd_gid.gid); pclc_smcd->ism.chid = htons(smc_ism_get_chid(ini->ism_dev[0])); @@ -963,7 +966,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini) if (ini->ism_offered_cnt) { for (i = 1; i <= ini->ism_offered_cnt; i++) { smcd = ini->ism_dev[i]; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); gidchids[entry].chid = htons(smc_ism_get_chid(ini->ism_dev[i])); gidchids[entry].gid = htonll(smcd_gid.gid); @@ -1056,7 +1059,7 @@ smcd_clc_prep_confirm_accept(struct smc_connection *conn, /* SMC-D specific settings */ memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER, sizeof(SMCD_EYECATCHER)); - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); clc->hdr.typev1 = SMC_TYPE_D; clc->d0.gid = htonll(smcd_gid.gid); clc->d0.token = htonll(conn->rmb_desc->token); diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 262746e304dd..be0c2da83d2b 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -85,7 +85,7 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr) * otherwise there is a risk of out-of-sync link groups. */ if (!lgr->freeing) { - mod_delayed_work(system_wq, &lgr->free_work, + mod_delayed_work(system_percpu_wq, &lgr->free_work, (!lgr->is_smcd && lgr->role == SMC_CLNT) ? SMC_LGR_FREE_DELAY_CLNT : SMC_LGR_FREE_DELAY_SERV); @@ -555,7 +555,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr, if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id))) goto errattr; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID, smcd_gid.gid, SMC_NLA_LGR_D_PAD)) goto errattr; @@ -896,7 +896,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) rc = SMC_CLC_DECL_MEM; goto ism_put_vlan; } - lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0, + lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", WQ_PERCPU, 0, SMC_LGR_ID_SIZE, &lgr->id); if (!lgr->tx_wq) { rc = -ENOMEM; @@ -924,7 +924,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) if (ini->is_smcd) { /* SMC-D specific settings */ smcd = ini->ism_dev[ini->ism_selected]; - get_device(smcd->ops->get_dev(smcd)); + get_device(&smcd->dibs->dev); lgr->peer_gid.gid = ini->ism_peer_gid[ini->ism_selected].gid; lgr->peer_gid.gid_ext = @@ -1474,7 +1474,7 @@ static void smc_lgr_free(struct smc_link_group *lgr) destroy_workqueue(lgr->tx_wq); if (lgr->is_smcd) { smc_ism_put_vlan(lgr->smcd, lgr->vlan_id); - put_device(lgr->smcd->ops->get_dev(lgr->smcd)); + put_device(&lgr->smcd->dibs->dev); } smc_lgr_put(lgr); /* theoretically last lgr_put */ } @@ -1883,35 +1883,32 @@ static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev, /* Determine vlan of internal TCP socket. */ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini) { - struct dst_entry *dst = sk_dst_get(clcsock->sk); struct netdev_nested_priv priv; struct net_device *ndev; + struct dst_entry *dst; int rc = 0; ini->vlan_id = 0; - if (!dst) { - rc = -ENOTCONN; - goto out; - } - if (!dst->dev) { + + rcu_read_lock(); + + dst = __sk_dst_get(clcsock->sk); + ndev = dst ? dst_dev_rcu(dst) : NULL; + if (!ndev) { rc = -ENODEV; - goto out_rel; + goto out; } - ndev = dst->dev; if (is_vlan_dev(ndev)) { ini->vlan_id = vlan_dev_vlan_id(ndev); - goto out_rel; + goto out; } priv.data = (void *)&ini->vlan_id; - rtnl_lock(); - netdev_walk_all_lower_dev(ndev, smc_vlan_by_tcpsk_walk, &priv); - rtnl_unlock(); - -out_rel: - dst_release(dst); + netdev_walk_all_lower_dev_rcu(ndev, smc_vlan_by_tcpsk_walk, &priv); out: + rcu_read_unlock(); + return rc; } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 48a1b1dcb576..a5a78cbff341 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -13,6 +13,7 @@ #define _SMC_CORE_H #include <linux/atomic.h> +#include <linux/types.h> #include <linux/smc.h> #include <linux/pci.h> #include <rdma/ib_verbs.h> @@ -221,6 +222,10 @@ struct smc_buf_desc { /* virtually contiguous */ }; struct { /* SMC-D */ + /* SMC-D tx buffer */ + bool is_attached; + /* no need for explicit writes */ + /* SMC-D rx buffer: */ unsigned short sba_idx; /* SBA index number */ u64 token; diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c index 8ed2f6689b01..bf0beaa23bdb 100644 --- a/net/smc/smc_diag.c +++ b/net/smc/smc_diag.c @@ -175,7 +175,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, dinfo.linkid = *((u32 *)conn->lgr->id); dinfo.peer_gid = conn->lgr->peer_gid.gid; dinfo.peer_gid_ext = conn->lgr->peer_gid.gid_ext; - smcd->ops->get_local_gid(smcd, &smcd_gid); + copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid); dinfo.my_gid = smcd_gid.gid; dinfo.my_gid_ext = smcd_gid.gid_ext; dinfo.token = conn->rmb_desc->token; diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index a42ef3f77b96..0052f02756eb 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -974,13 +974,17 @@ static int smc_ib_add_dev(struct ib_device *ibdev) smcibdev->pnetid[i])) smc_pnetid_by_table_ib(smcibdev, i + 1); smc_copy_netdev_ifindex(smcibdev, i); - pr_warn_ratelimited("smc: ib device %s port %d has pnetid " - "%.16s%s\n", - smcibdev->ibdev->name, i + 1, - smcibdev->pnetid[i], - smcibdev->pnetid_by_user[i] ? - " (user defined)" : - ""); + if (smc_pnet_is_pnetid_set(smcibdev->pnetid[i])) + pr_warn_ratelimited("smc: ib device %s port %d has pnetid %.16s%s\n", + smcibdev->ibdev->name, i + 1, + smcibdev->pnetid[i], + smcibdev->pnetid_by_user[i] ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: ib device %s port %d has no pnetid\n", + smcibdev->ibdev->name, i + 1); + } schedule_work(&smcibdev->port_event_work); return 0; diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c index 84f98e18c7db..7b228ca2f96a 100644 --- a/net/smc/smc_ism.c +++ b/net/smc/smc_ism.c @@ -17,7 +17,7 @@ #include "smc_ism.h" #include "smc_pnet.h" #include "smc_netlink.h" -#include "linux/ism.h" +#include "linux/dibs.h" struct smcd_dev_list smcd_dev_list = { .list = LIST_HEAD_INIT(smcd_dev_list.list), @@ -27,21 +27,24 @@ struct smcd_dev_list smcd_dev_list = { static bool smc_ism_v2_capable; static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN]; -#if IS_ENABLED(CONFIG_ISM) -static void smcd_register_dev(struct ism_dev *ism); -static void smcd_unregister_dev(struct ism_dev *ism); -static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event); -static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, +static void smcd_register_dev(struct dibs_dev *dibs); +static void smcd_unregister_dev(struct dibs_dev *dibs); +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event); +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, u16 dmbemask); -static struct ism_client smc_ism_client = { - .name = "SMC-D", - .add = smcd_register_dev, - .remove = smcd_unregister_dev, +static struct dibs_client_ops smc_client_ops = { + .add_dev = smcd_register_dev, + .del_dev = smcd_unregister_dev, .handle_event = smcd_handle_event, .handle_irq = smcd_handle_irq, }; -#endif + +static struct dibs_client smc_dibs_client = { + .name = "SMC-D", + .ops = &smc_client_ops, +}; static void smc_ism_create_system_eid(void) { @@ -68,8 +71,12 @@ static void smc_ism_create_system_eid(void) int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id, struct smcd_dev *smcd) { - return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0, - vlan_id); + struct dibs_dev *dibs = smcd->dibs; + uuid_t ism_rgid; + + copy_to_dibsgid(&ism_rgid, peer_gid); + return dibs->ops->query_remote_gid(dibs, &ism_rgid, vlan_id ? 1 : 0, + vlan_id); } void smc_ism_get_system_eid(u8 **eid) @@ -82,7 +89,7 @@ void smc_ism_get_system_eid(u8 **eid) u16 smc_ism_get_chid(struct smcd_dev *smcd) { - return smcd->ops->get_chid(smcd); + return smcd->dibs->ops->get_fabric_id(smcd->dibs); } /* HW supports ISM V2 and thus System EID is defined */ @@ -131,7 +138,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; - if (!smcd->ops->add_vlan_id) + if (!smcd->dibs->ops->add_vlan_id) return -EOPNOTSUPP; /* create new vlan entry, in case we need it */ @@ -154,7 +161,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid) /* no existing entry found. * add new entry to device; might fail, e.g., if HW limit reached */ - if (smcd->ops->add_vlan_id(smcd, vlanid)) { + if (smcd->dibs->ops->add_vlan_id(smcd->dibs, vlanid)) { kfree(new_vlan); rc = -EIO; goto out; @@ -178,7 +185,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) if (!vlanid) /* No valid vlan id */ return -EINVAL; - if (!smcd->ops->del_vlan_id) + if (!smcd->dibs->ops->del_vlan_id) return -EOPNOTSUPP; spin_lock_irqsave(&smcd->lock, flags); @@ -196,7 +203,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid) } /* Found and the last reference just gone */ - if (smcd->ops->del_vlan_id(smcd, vlanid)) + if (smcd->dibs->ops->del_vlan_id(smcd->dibs, vlanid)) rc = -EIO; list_del(&vlan->list); kfree(vlan); @@ -205,43 +212,43 @@ out: return rc; } -int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc) +void smc_ism_unregister_dmb(struct smcd_dev *smcd, + struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; - int rc = 0; + struct dibs_dmb dmb; if (!dmb_desc->dma_addr) - return rc; + return; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = dmb_desc->token; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.cpu_addr = dmb_desc->cpu_addr; dmb.dma_addr = dmb_desc->dma_addr; dmb.dmb_len = dmb_desc->len; - rc = smcd->ops->unregister_dmb(smcd, &dmb); - if (!rc || rc == ISM_ERROR) { - dmb_desc->cpu_addr = NULL; - dmb_desc->dma_addr = 0; - } - return rc; + smcd->dibs->ops->unregister_dmb(smcd->dibs, &dmb); + + return; } int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dev *dibs; + struct dibs_dmb dmb; int rc; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_len = dmb_len; - dmb.sba_idx = dmb_desc->sba_idx; + dmb.idx = dmb_desc->sba_idx; dmb.vlan_id = lgr->vlan_id; - dmb.rgid = lgr->peer_gid.gid; - rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client); + copy_to_dibsgid(&dmb.rgid, &lgr->peer_gid); + + dibs = lgr->smcd->dibs; + rc = dibs->ops->register_dmb(dibs, &dmb, &smc_dibs_client); if (!rc) { - dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->sba_idx = dmb.idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; @@ -256,38 +263,39 @@ bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd) * merging sndbuf with peer DMB to avoid * data copies between them. */ - return (smcd->ops->support_dmb_nocopy && - smcd->ops->support_dmb_nocopy(smcd)); + return (smcd->dibs->ops->support_mmapped_rdmb && + smcd->dibs->ops->support_mmapped_rdmb(smcd->dibs)); } int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc) { - struct smcd_dmb dmb; + struct dibs_dmb dmb; int rc = 0; - if (!dev->ops->attach_dmb) + if (!dev->dibs->ops->attach_dmb) return -EINVAL; memset(&dmb, 0, sizeof(dmb)); dmb.dmb_tok = token; - rc = dev->ops->attach_dmb(dev, &dmb); + rc = dev->dibs->ops->attach_dmb(dev->dibs, &dmb); if (!rc) { - dmb_desc->sba_idx = dmb.sba_idx; + dmb_desc->sba_idx = dmb.idx; dmb_desc->token = dmb.dmb_tok; dmb_desc->cpu_addr = dmb.cpu_addr; dmb_desc->dma_addr = dmb.dma_addr; dmb_desc->len = dmb.dmb_len; + dmb_desc->is_attached = true; } return rc; } int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token) { - if (!dev->ops->detach_dmb) + if (!dev->dibs->ops->detach_dmb) return -EINVAL; - return dev->ops->detach_dmb(dev, token); + return dev->dibs->ops->detach_dmb(dev->dibs, token); } static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, @@ -297,12 +305,12 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, char smc_pnet[SMC_MAX_PNETID_LEN + 1]; struct smc_pci_dev smc_pci_dev; struct nlattr *port_attrs; + struct dibs_dev *dibs; struct nlattr *attrs; - struct ism_dev *ism; int use_cnt = 0; void *nlh; - ism = smcd->priv; + dibs = smcd->dibs; nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, &smc_gen_nl_family, NLM_F_MULTI, SMC_NETLINK_GET_DEV_SMCD); @@ -317,7 +325,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd, if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0)) goto errattr; memset(&smc_pci_dev, 0, sizeof(smc_pci_dev)); - smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev); + smc_set_pci_values(to_pci_dev(dibs->dev.parent), &smc_pci_dev); if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid)) goto errattr; if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid)) @@ -367,7 +375,7 @@ static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list, list_for_each_entry(smcd, &dev_list->list, list) { if (num < snum) goto next; - if (smc_ism_is_loopback(smcd)) + if (smc_ism_is_loopback(smcd->dibs)) goto next; if (smc_nl_handle_smcd_dev(smcd, skb, cb)) goto errout; @@ -385,11 +393,10 @@ int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -#if IS_ENABLED(CONFIG_ISM) struct smc_ism_event_work { struct work_struct work; struct smcd_dev *smcd; - struct ism_event event; + struct dibs_event event; }; #define ISM_EVENT_REQUEST 0x0001 @@ -409,25 +416,27 @@ union smcd_sw_event_info { static void smcd_handle_sw_event(struct smc_ism_event_work *wrk) { - struct smcd_gid peer_gid = { .gid = wrk->event.tok, - .gid_ext = 0 }; + struct dibs_dev *dibs = wrk->smcd->dibs; union smcd_sw_event_info ev_info; + struct smcd_gid peer_gid; + uuid_t ism_rgid; - ev_info.info = wrk->event.info; - switch (wrk->event.code) { + copy_to_smcdgid(&peer_gid, &wrk->event.gid); + ev_info.info = wrk->event.data; + switch (wrk->event.subtype) { case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */ smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id); break; case ISM_EVENT_CODE_TESTLINK: /* Activity timer */ if (ev_info.code == ISM_EVENT_REQUEST && - wrk->smcd->ops->signal_event) { + dibs->ops->signal_event) { ev_info.code = ISM_EVENT_RESPONSE; - wrk->smcd->ops->signal_event(wrk->smcd, - &peer_gid, - ISM_EVENT_REQUEST_IR, - ISM_EVENT_CODE_TESTLINK, - ev_info.info); - } + copy_to_dibsgid(&ism_rgid, &peer_gid); + dibs->ops->signal_event(dibs, &ism_rgid, + ISM_EVENT_REQUEST_IR, + ISM_EVENT_CODE_TESTLINK, + ev_info.info); + } break; } } @@ -437,41 +446,39 @@ static void smc_ism_event_work(struct work_struct *work) { struct smc_ism_event_work *wrk = container_of(work, struct smc_ism_event_work, work); - struct smcd_gid smcd_gid = { .gid = wrk->event.tok, - .gid_ext = 0 }; + struct smcd_gid smcd_gid; + + copy_to_smcdgid(&smcd_gid, &wrk->event.gid); switch (wrk->event.type) { - case ISM_EVENT_GID: /* GID event, token is peer GID */ + case DIBS_DEV_EVENT: /* GID event, token is peer GID */ smc_smcd_terminate(wrk->smcd, &smcd_gid, VLAN_VID_MASK); break; - case ISM_EVENT_DMB: + case DIBS_BUF_EVENT: break; - case ISM_EVENT_SWR: /* Software defined event */ + case DIBS_SW_EVENT: /* Software defined event */ smcd_handle_sw_event(wrk); break; } kfree(wrk); } -static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, - const struct smcd_ops *ops, int max_dmbs) +static struct smcd_dev *smcd_alloc_dev(const char *name, int max_dmbs) { struct smcd_dev *smcd; - smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL); + smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); if (!smcd) return NULL; - smcd->conn = devm_kcalloc(parent, max_dmbs, - sizeof(struct smc_connection *), GFP_KERNEL); + smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), + GFP_KERNEL); if (!smcd->conn) - return NULL; + goto free_smcd; smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)", WQ_MEM_RECLAIM, name); if (!smcd->event_wq) - return NULL; - - smcd->ops = ops; + goto free_conn; spin_lock_init(&smcd->lock); spin_lock_init(&smcd->lgr_lock); @@ -479,28 +486,37 @@ static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name, INIT_LIST_HEAD(&smcd->lgr_list); init_waitqueue_head(&smcd->lgrs_deleted); return smcd; + +free_conn: + kfree(smcd->conn); +free_smcd: + kfree(smcd); + return NULL; } -static void smcd_register_dev(struct ism_dev *ism) +static void smcd_register_dev(struct dibs_dev *dibs) { - const struct smcd_ops *ops = ism_get_smcd_ops(); struct smcd_dev *smcd, *fentry; + int max_dmbs; - if (!ops) - return; + max_dmbs = dibs->ops->max_dmbs(); - smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops, - ISM_NR_DMBS); + smcd = smcd_alloc_dev(dev_name(&dibs->dev), max_dmbs); if (!smcd) return; - smcd->priv = ism; - smcd->client = &smc_ism_client; - ism_set_priv(ism, &smc_ism_client, smcd); - if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid)) + + smcd->dibs = dibs; + dibs_set_priv(dibs, &smc_dibs_client, smcd); + + if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid)) smc_pnetid_by_table_smcd(smcd); - if (smcd->ops->supports_v2()) + if (smc_ism_is_loopback(dibs) || + (dibs->ops->add_vlan_id && + !dibs->ops->add_vlan_id(dibs, ISM_RESERVED_VLANID))) { smc_ism_set_v2_capable(); + } + mutex_lock(&smcd_dev_list.mutex); /* sort list: * - devices without pnetid before devices with pnetid; @@ -509,7 +525,7 @@ static void smcd_register_dev(struct ism_dev *ism) if (!smcd->pnetid[0]) { fentry = list_first_entry_or_null(&smcd_dev_list.list, struct smcd_dev, list); - if (fentry && smc_ism_is_loopback(fentry)) + if (fentry && smc_ism_is_loopback(fentry->dibs)) list_add(&smcd->list, &fentry->list); else list_add(&smcd->list, &smcd_dev_list.list); @@ -518,25 +534,32 @@ static void smcd_register_dev(struct ism_dev *ism) } mutex_unlock(&smcd_dev_list.mutex); - pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", - dev_name(&ism->dev), smcd->pnetid, - smcd->pnetid_by_user ? " (user defined)" : ""); - + if (smc_pnet_is_pnetid_set(smcd->pnetid)) + pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n", + dev_name(&dibs->dev), smcd->pnetid, + smcd->pnetid_by_user ? + " (user defined)" : + ""); + else + pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n", + dev_name(&dibs->dev)); return; } -static void smcd_unregister_dev(struct ism_dev *ism) +static void smcd_unregister_dev(struct dibs_dev *dibs) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ism->dev)); + dev_name(&dibs->dev)); smcd->going_away = 1; smc_smcd_terminate_all(smcd); mutex_lock(&smcd_dev_list.mutex); list_del_init(&smcd->list); mutex_unlock(&smcd_dev_list.mutex); destroy_workqueue(smcd->event_wq); + kfree(smcd->conn); + kfree(smcd); } /* SMCD Device event handler. Called from ISM device interrupt handler. @@ -550,9 +573,10 @@ static void smcd_unregister_dev(struct ism_dev *ism) * Context: * - Function called in IRQ context from ISM device driver event handler. */ -static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) +static void smcd_handle_event(struct dibs_dev *dibs, + const struct dibs_event *event) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_ism_event_work *wrk; if (smcd->going_away) @@ -574,10 +598,10 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event) * Context: * - Function called in IRQ context from ISM device driver IRQ handler. */ -static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, +static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno, u16 dmbemask) { - struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client); + struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client); struct smc_connection *conn = NULL; unsigned long flags; @@ -587,27 +611,26 @@ static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno, tasklet_schedule(&conn->rx_tsklet); spin_unlock_irqrestore(&smcd->lock, flags); } -#endif int smc_ism_signal_shutdown(struct smc_link_group *lgr) { int rc = 0; -#if IS_ENABLED(CONFIG_ISM) union smcd_sw_event_info ev_info; + uuid_t ism_rgid; if (lgr->peer_shutdown) return 0; - if (!lgr->smcd->ops->signal_event) + if (!lgr->smcd->dibs->ops->signal_event) return 0; memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE); ev_info.vlan_id = lgr->vlan_id; ev_info.code = ISM_EVENT_REQUEST; - rc = lgr->smcd->ops->signal_event(lgr->smcd, &lgr->peer_gid, + copy_to_dibsgid(&ism_rgid, &lgr->peer_gid); + rc = lgr->smcd->dibs->ops->signal_event(lgr->smcd->dibs, &ism_rgid, ISM_EVENT_REQUEST_IR, ISM_EVENT_CODE_SHUTDOWN, ev_info.info); -#endif return rc; } @@ -618,15 +641,11 @@ int smc_ism_init(void) smc_ism_v2_capable = false; smc_ism_create_system_eid(); -#if IS_ENABLED(CONFIG_ISM) - rc = ism_register_client(&smc_ism_client); -#endif + rc = dibs_register_client(&smc_dibs_client); return rc; } void smc_ism_exit(void) { -#if IS_ENABLED(CONFIG_ISM) - ism_unregister_client(&smc_ism_client); -#endif + dibs_unregister_client(&smc_dibs_client); } diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h index 6763133dd8d0..a1575e31df73 100644 --- a/net/smc/smc_ism.h +++ b/net/smc/smc_ism.h @@ -12,6 +12,7 @@ #include <linux/uio.h> #include <linux/types.h> #include <linux/mutex.h> +#include <linux/dibs.h> #include "smc.h" @@ -47,7 +48,8 @@ int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id); int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size, struct smc_buf_desc *dmb_desc); -int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc); +void smc_ism_unregister_dmb(struct smcd_dev *dev, + struct smc_buf_desc *dmb_desc); bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd); int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token, struct smc_buf_desc *dmb_desc); @@ -67,7 +69,9 @@ static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok, { int rc; - rc = smcd->ops->move_data(smcd, dmb_tok, idx, sf, offset, data, len); + rc = smcd->dibs->ops->move_data(smcd->dibs, dmb_tok, idx, sf, offset, + data, len); + return rc < 0 ? rc : 0; } @@ -84,14 +88,36 @@ static inline bool __smc_ism_is_emulated(u16 chid) static inline bool smc_ism_is_emulated(struct smcd_dev *smcd) { - u16 chid = smcd->ops->get_chid(smcd); + u16 chid = smcd->dibs->ops->get_fabric_id(smcd->dibs); return __smc_ism_is_emulated(chid); } -static inline bool smc_ism_is_loopback(struct smcd_dev *smcd) +static inline bool smc_ism_is_loopback(struct dibs_dev *dibs) { - return (smcd->ops->get_chid(smcd) == 0xFFFF); + return (dibs->ops->get_fabric_id(dibs) == DIBS_LOOPBACK_FABRIC); +} + +static inline void copy_to_smcdgid(struct smcd_gid *sgid, uuid_t *dibs_gid) +{ + __be64 temp; + + memcpy(&temp, dibs_gid, sizeof(sgid->gid)); + sgid->gid = ntohll(temp); + memcpy(&temp, (uint8_t *)dibs_gid + sizeof(sgid->gid), + sizeof(sgid->gid_ext)); + sgid->gid_ext = ntohll(temp); +} + +static inline void copy_to_dibsgid(uuid_t *dibs_gid, struct smcd_gid *sgid) +{ + __be64 temp; + + temp = htonll(sgid->gid); + memcpy(dibs_gid, &temp, sizeof(sgid->gid)); + temp = htonll(sgid->gid_ext); + memcpy((uint8_t *)dibs_gid + sizeof(sgid->gid), &temp, + sizeof(sgid->gid_ext)); } #endif diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c deleted file mode 100644 index 77cc1c6dc3e9..000000000000 --- a/net/smc/smc_loopback.c +++ /dev/null @@ -1,425 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Shared Memory Communications Direct over loopback-ism device. - * - * Functions for loopback-ism device. - * - * Copyright (c) 2024, Alibaba Inc. - * - * Author: Wen Gu <guwen@linux.alibaba.com> - * Tony Lu <tonylu@linux.alibaba.com> - * - */ - -#include <linux/device.h> -#include <linux/types.h> -#include <net/smc.h> - -#include "smc_cdc.h" -#include "smc_ism.h" -#include "smc_loopback.h" - -#define SMC_LO_V2_CAPABLE 0x1 /* loopback-ism acts as ISMv2 */ -#define SMC_LO_SUPPORT_NOCOPY 0x1 -#define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0) - -static const char smc_lo_dev_name[] = "loopback-ism"; -static struct smc_lo_dev *lo_dev; - -static void smc_lo_generate_ids(struct smc_lo_dev *ldev) -{ - struct smcd_gid *lgid = &ldev->local_gid; - uuid_t uuid; - - uuid_gen(&uuid); - memcpy(&lgid->gid, &uuid, sizeof(lgid->gid)); - memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid), - sizeof(lgid->gid_ext)); - - ldev->chid = SMC_LO_RESERVED_CHID; -} - -static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid, - u32 vid_valid, u32 vid) -{ - struct smc_lo_dev *ldev = smcd->priv; - - /* rgid should be the same as lgid */ - if (!ldev || rgid->gid != ldev->local_gid.gid || - rgid->gid_ext != ldev->local_gid.gid_ext) - return -ENETUNREACH; - return 0; -} - -static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb, - void *client_priv) -{ - struct smc_lo_dmb_node *dmb_node, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - struct folio *folio; - int sba_idx, rc; - - /* check space for new dmb */ - for_each_clear_bit(sba_idx, ldev->sba_idx_mask, SMC_LO_MAX_DMBS) { - if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask)) - break; - } - if (sba_idx == SMC_LO_MAX_DMBS) - return -ENOSPC; - - dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL); - if (!dmb_node) { - rc = -ENOMEM; - goto err_bit; - } - - dmb_node->sba_idx = sba_idx; - dmb_node->len = dmb->dmb_len; - - /* not critical; fail under memory pressure and fallback to TCP */ - folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC | - __GFP_NORETRY | __GFP_ZERO, - get_order(dmb_node->len)); - if (!folio) { - rc = -ENOMEM; - goto err_node; - } - dmb_node->cpu_addr = folio_address(folio); - dmb_node->dma_addr = SMC_DMA_ADDR_INVALID; - refcount_set(&dmb_node->refcnt, 1); - -again: - /* add new dmb into hash table */ - get_random_bytes(&dmb_node->token, sizeof(dmb_node->token)); - write_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) { - if (tmp_node->token == dmb_node->token) { - write_unlock_bh(&ldev->dmb_ht_lock); - goto again; - } - } - hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token); - write_unlock_bh(&ldev->dmb_ht_lock); - atomic_inc(&ldev->dmb_cnt); - - dmb->sba_idx = dmb_node->sba_idx; - dmb->dmb_tok = dmb_node->token; - dmb->cpu_addr = dmb_node->cpu_addr; - dmb->dma_addr = dmb_node->dma_addr; - dmb->dmb_len = dmb_node->len; - - return 0; - -err_node: - kfree(dmb_node); -err_bit: - clear_bit(sba_idx, ldev->sba_idx_mask); - return rc; -} - -static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev, - struct smc_lo_dmb_node *dmb_node) -{ - /* remove dmb from hash table */ - write_lock_bh(&ldev->dmb_ht_lock); - hash_del(&dmb_node->list); - write_unlock_bh(&ldev->dmb_ht_lock); - - clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask); - folio_put(virt_to_folio(dmb_node->cpu_addr)); - kfree(dmb_node); - - if (atomic_dec_and_test(&ldev->dmb_cnt)) - wake_up(&ldev->ldev_release); -} - -static int smc_lo_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb from hash table */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { - if (tmp_node->token == dmb->dmb_tok) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (refcount_dec_and_test(&dmb_node->refcnt)) - __smc_lo_unregister_dmb(ldev, dmb_node); - return 0; -} - -static int smc_lo_support_dmb_nocopy(struct smcd_dev *smcd) -{ - return SMC_LO_SUPPORT_NOCOPY; -} - -static int smc_lo_attach_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb_node according to dmb->dmb_tok */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) { - if (tmp_node->token == dmb->dmb_tok) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (!refcount_inc_not_zero(&dmb_node->refcnt)) - /* the dmb is being unregistered, but has - * not been removed from the hash table. - */ - return -EINVAL; - - /* provide dmb information */ - dmb->sba_idx = dmb_node->sba_idx; - dmb->dmb_tok = dmb_node->token; - dmb->cpu_addr = dmb_node->cpu_addr; - dmb->dma_addr = dmb_node->dma_addr; - dmb->dmb_len = dmb_node->len; - return 0; -} - -static int smc_lo_detach_dmb(struct smcd_dev *smcd, u64 token) -{ - struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - - /* find dmb_node according to dmb->dmb_tok */ - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) { - if (tmp_node->token == token) { - dmb_node = tmp_node; - break; - } - } - if (!dmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - read_unlock_bh(&ldev->dmb_ht_lock); - - if (refcount_dec_and_test(&dmb_node->refcnt)) - __smc_lo_unregister_dmb(ldev, dmb_node); - return 0; -} - -static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok, - unsigned int idx, bool sf, unsigned int offset, - void *data, unsigned int size) -{ - struct smc_lo_dmb_node *rmb_node = NULL, *tmp_node; - struct smc_lo_dev *ldev = smcd->priv; - struct smc_connection *conn; - - if (!sf) - /* since sndbuf is merged with peer DMB, there is - * no need to copy data from sndbuf to peer DMB. - */ - return 0; - - read_lock_bh(&ldev->dmb_ht_lock); - hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) { - if (tmp_node->token == dmb_tok) { - rmb_node = tmp_node; - break; - } - } - if (!rmb_node) { - read_unlock_bh(&ldev->dmb_ht_lock); - return -EINVAL; - } - memcpy((char *)rmb_node->cpu_addr + offset, data, size); - read_unlock_bh(&ldev->dmb_ht_lock); - - conn = smcd->conn[rmb_node->sba_idx]; - if (!conn || conn->killed) - return -EPIPE; - tasklet_schedule(&conn->rx_tsklet); - return 0; -} - -static void smc_lo_get_local_gid(struct smcd_dev *smcd, - struct smcd_gid *smcd_gid) -{ - struct smc_lo_dev *ldev = smcd->priv; - - smcd_gid->gid = ldev->local_gid.gid; - smcd_gid->gid_ext = ldev->local_gid.gid_ext; -} - -static u16 smc_lo_get_chid(struct smcd_dev *smcd) -{ - return ((struct smc_lo_dev *)smcd->priv)->chid; -} - -static struct device *smc_lo_get_dev(struct smcd_dev *smcd) -{ - return &((struct smc_lo_dev *)smcd->priv)->dev; -} - -static const struct smcd_ops lo_ops = { - .query_remote_gid = smc_lo_query_rgid, - .register_dmb = smc_lo_register_dmb, - .unregister_dmb = smc_lo_unregister_dmb, - .support_dmb_nocopy = smc_lo_support_dmb_nocopy, - .attach_dmb = smc_lo_attach_dmb, - .detach_dmb = smc_lo_detach_dmb, - .add_vlan_id = NULL, - .del_vlan_id = NULL, - .set_vlan_required = NULL, - .reset_vlan_required = NULL, - .signal_event = NULL, - .move_data = smc_lo_move_data, - .get_local_gid = smc_lo_get_local_gid, - .get_chid = smc_lo_get_chid, - .get_dev = smc_lo_get_dev, -}; - -static struct smcd_dev *smcd_lo_alloc_dev(const struct smcd_ops *ops, - int max_dmbs) -{ - struct smcd_dev *smcd; - - smcd = kzalloc(sizeof(*smcd), GFP_KERNEL); - if (!smcd) - return NULL; - - smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *), - GFP_KERNEL); - if (!smcd->conn) - goto out_smcd; - - smcd->ops = ops; - - spin_lock_init(&smcd->lock); - spin_lock_init(&smcd->lgr_lock); - INIT_LIST_HEAD(&smcd->vlan); - INIT_LIST_HEAD(&smcd->lgr_list); - init_waitqueue_head(&smcd->lgrs_deleted); - return smcd; - -out_smcd: - kfree(smcd); - return NULL; -} - -static int smcd_lo_register_dev(struct smc_lo_dev *ldev) -{ - struct smcd_dev *smcd; - - smcd = smcd_lo_alloc_dev(&lo_ops, SMC_LO_MAX_DMBS); - if (!smcd) - return -ENOMEM; - ldev->smcd = smcd; - smcd->priv = ldev; - smc_ism_set_v2_capable(); - mutex_lock(&smcd_dev_list.mutex); - list_add(&smcd->list, &smcd_dev_list.list); - mutex_unlock(&smcd_dev_list.mutex); - pr_warn_ratelimited("smc: adding smcd device %s\n", - dev_name(&ldev->dev)); - return 0; -} - -static void smcd_lo_unregister_dev(struct smc_lo_dev *ldev) -{ - struct smcd_dev *smcd = ldev->smcd; - - pr_warn_ratelimited("smc: removing smcd device %s\n", - dev_name(&ldev->dev)); - smcd->going_away = 1; - smc_smcd_terminate_all(smcd); - mutex_lock(&smcd_dev_list.mutex); - list_del_init(&smcd->list); - mutex_unlock(&smcd_dev_list.mutex); - kfree(smcd->conn); - kfree(smcd); -} - -static int smc_lo_dev_init(struct smc_lo_dev *ldev) -{ - smc_lo_generate_ids(ldev); - rwlock_init(&ldev->dmb_ht_lock); - hash_init(ldev->dmb_ht); - atomic_set(&ldev->dmb_cnt, 0); - init_waitqueue_head(&ldev->ldev_release); - - return smcd_lo_register_dev(ldev); -} - -static void smc_lo_dev_exit(struct smc_lo_dev *ldev) -{ - smcd_lo_unregister_dev(ldev); - if (atomic_read(&ldev->dmb_cnt)) - wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt)); -} - -static void smc_lo_dev_release(struct device *dev) -{ - struct smc_lo_dev *ldev = - container_of(dev, struct smc_lo_dev, dev); - - kfree(ldev); -} - -static int smc_lo_dev_probe(void) -{ - struct smc_lo_dev *ldev; - int ret; - - ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); - if (!ldev) - return -ENOMEM; - - ldev->dev.parent = NULL; - ldev->dev.release = smc_lo_dev_release; - device_initialize(&ldev->dev); - dev_set_name(&ldev->dev, smc_lo_dev_name); - - ret = smc_lo_dev_init(ldev); - if (ret) - goto free_dev; - - lo_dev = ldev; /* global loopback device */ - return 0; - -free_dev: - put_device(&ldev->dev); - return ret; -} - -static void smc_lo_dev_remove(void) -{ - if (!lo_dev) - return; - - smc_lo_dev_exit(lo_dev); - put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */ -} - -int smc_loopback_init(void) -{ - return smc_lo_dev_probe(); -} - -void smc_loopback_exit(void) -{ - smc_lo_dev_remove(); -} diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h deleted file mode 100644 index 04dc6808d2e1..000000000000 --- a/net/smc/smc_loopback.h +++ /dev/null @@ -1,60 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Shared Memory Communications Direct over loopback-ism device. - * - * SMC-D loopback-ism device structure definitions. - * - * Copyright (c) 2024, Alibaba Inc. - * - * Author: Wen Gu <guwen@linux.alibaba.com> - * Tony Lu <tonylu@linux.alibaba.com> - * - */ - -#ifndef _SMC_LOOPBACK_H -#define _SMC_LOOPBACK_H - -#include <linux/device.h> -#include <net/smc.h> - -#if IS_ENABLED(CONFIG_SMC_LO) -#define SMC_LO_MAX_DMBS 5000 -#define SMC_LO_DMBS_HASH_BITS 12 -#define SMC_LO_RESERVED_CHID 0xFFFF - -struct smc_lo_dmb_node { - struct hlist_node list; - u64 token; - u32 len; - u32 sba_idx; - void *cpu_addr; - dma_addr_t dma_addr; - refcount_t refcnt; -}; - -struct smc_lo_dev { - struct smcd_dev *smcd; - struct device dev; - u16 chid; - struct smcd_gid local_gid; - atomic_t dmb_cnt; - rwlock_t dmb_ht_lock; - DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS); - DECLARE_HASHTABLE(dmb_ht, SMC_LO_DMBS_HASH_BITS); - wait_queue_head_t ldev_release; -}; - -int smc_loopback_init(void); -void smc_loopback_exit(void); -#else -static inline int smc_loopback_init(void) -{ - return 0; -} - -static inline void smc_loopback_exit(void) -{ -} -#endif - -#endif /* _SMC_LOOPBACK_H */ diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 76ad29e31d60..a3a1e1fde8eb 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -169,7 +169,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name) pr_warn_ratelimited("smc: smcd device %s " "erased user defined pnetid " "%.16s\n", - dev_name(smcd->ops->get_dev(smcd)), + dev_name(&smcd->dibs->dev), smcd->pnetid); memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN); smcd->pnetid_by_user = false; @@ -332,8 +332,11 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name) mutex_lock(&smcd_dev_list.mutex); list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) { - if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)), - smcd_name, IB_DEVICE_NAME_MAX - 1)) + if (!strncmp(dev_name(&smcd_dev->dibs->dev), smcd_name, + IB_DEVICE_NAME_MAX - 1) || + (smcd_dev->dibs->dev.parent && + !strncmp(dev_name(smcd_dev->dibs->dev.parent), smcd_name, + IB_DEVICE_NAME_MAX - 1))) goto out; } smcd_dev = NULL; @@ -413,7 +416,6 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, bool smcddev_applied = true; bool ibdev_applied = true; struct smcd_dev *smcd; - struct device *dev; bool new_ibdev; /* try to apply the pnetid to active devices */ @@ -431,10 +433,8 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, if (smcd) { smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name); if (smcddev_applied) { - dev = smcd->ops->get_dev(smcd); - pr_warn_ratelimited("smc: smcd device %s " - "applied user defined pnetid " - "%.16s\n", dev_name(dev), + pr_warn_ratelimited("smc: smcd device %s applied user defined pnetid %.16s\n", + dev_name(&smcd->dibs->dev), smcd->pnetid); } } @@ -450,7 +450,7 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name, return -ENOMEM; new_pe->type = SMC_PNET_IB; memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN); - strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX); + strscpy(new_pe->ib_name, ib_name); new_pe->ib_port = ib_port; new_ibdev = true; @@ -1126,37 +1126,38 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev, */ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini) { - struct dst_entry *dst = sk_dst_get(sk); - - if (!dst) - goto out; - if (!dst->dev) - goto out_rel; + struct net_device *dev; + struct dst_entry *dst; - smc_pnet_find_roce_by_pnetid(dst->dev, ini); + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + dev_hold(dev); + rcu_read_unlock(); -out_rel: - dst_release(dst); -out: - return; + if (dev) { + smc_pnet_find_roce_by_pnetid(dev, ini); + dev_put(dev); + } } void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini) { - struct dst_entry *dst = sk_dst_get(sk); + struct net_device *dev; + struct dst_entry *dst; ini->ism_dev[0] = NULL; - if (!dst) - goto out; - if (!dst->dev) - goto out_rel; - smc_pnet_find_ism_by_pnetid(dst->dev, ini); + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + dev_hold(dev); + rcu_read_unlock(); -out_rel: - dst_release(dst); -out: - return; + if (dev) { + smc_pnet_find_ism_by_pnetid(dev, ini); + dev_put(dev); + } } /* Lookup and apply a pnet table entry to the given ib device. @@ -1192,7 +1193,6 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port) */ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) { - const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev)); struct smc_pnettable *pnettable; struct smc_pnetentry *tmp_pe; struct smc_net *sn; @@ -1205,7 +1205,13 @@ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev) mutex_lock(&pnettable->lock); list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) { if (tmp_pe->type == SMC_PNET_IB && - !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) { + (!strncmp(tmp_pe->ib_name, + dev_name(&smcddev->dibs->dev), + sizeof(tmp_pe->ib_name)) || + (smcddev->dibs->dev.parent && + !strncmp(tmp_pe->ib_name, + dev_name(smcddev->dibs->dev.parent), + sizeof(tmp_pe->ib_name))))) { smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name); rc = 0; break; diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 214ac3cbcf9a..3144b4b1fe29 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -426,6 +426,9 @@ static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len, int srcchunk, dstchunk; int rc; + if (conn->sndbuf_desc->is_attached) + return 0; + for (dstchunk = 0; dstchunk < 2; dstchunk++) { for (srcchunk = 0; srcchunk < 2; srcchunk++) { void *data = conn->sndbuf_desc->cpu_addr + src_off; diff --git a/net/socket.c b/net/socket.c index bac335ecee4c..e8892b218708 100644 --- a/net/socket.c +++ b/net/socket.c @@ -276,28 +276,41 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *k static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen, void __user *uaddr, int __user *ulen) { - int err; int len; BUG_ON(klen > sizeof(struct sockaddr_storage)); - err = get_user(len, ulen); - if (err) - return err; + + if (can_do_masked_user_access()) + ulen = masked_user_access_begin(ulen); + else if (!user_access_begin(ulen, 4)) + return -EFAULT; + + unsafe_get_user(len, ulen, efault_end); + if (len > klen) len = klen; - if (len < 0) - return -EINVAL; + /* + * "fromlen shall refer to the value before truncation.." + * 1003.1g + */ + if (len >= 0) + unsafe_put_user(klen, ulen, efault_end); + + user_access_end(); + if (len) { + if (len < 0) + return -EINVAL; if (audit_sockaddr(klen, kaddr)) return -ENOMEM; if (copy_to_user(uaddr, kaddr, len)) return -EFAULT; } - /* - * "fromlen shall refer to the value before truncation.." - * 1003.1g - */ - return __put_user(klen, ulen); + return 0; + +efault_end: + user_access_end(); + return -EFAULT; } static struct kmem_cache *sock_inode_cachep __ro_after_init; diff --git a/net/tipc/addr.c b/net/tipc/addr.c index fd0796269eed..6f5c54cbf8d9 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -79,7 +79,7 @@ void tipc_set_node_addr(struct net *net, u32 addr) pr_info("Node number set to %u\n", addr); } -char *tipc_nodeid2string(char *str, u8 *id) +int tipc_nodeid2string(char *str, u8 *id) { int i; u8 c; @@ -109,7 +109,7 @@ char *tipc_nodeid2string(char *str, u8 *id) if (i == NODE_ID_LEN) { memcpy(str, id, NODE_ID_LEN); str[NODE_ID_LEN] = 0; - return str; + return i; } /* Translate to hex string */ @@ -120,5 +120,5 @@ char *tipc_nodeid2string(char *str, u8 *id) for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--) str[i] = 0; - return str; + return i + 1; } diff --git a/net/tipc/addr.h b/net/tipc/addr.h index 93f82398283d..a113cf7e1f89 100644 --- a/net/tipc/addr.h +++ b/net/tipc/addr.h @@ -130,6 +130,6 @@ static inline int in_own_node(struct net *net, u32 addr) bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr); void tipc_set_node_id(struct net *net, u8 *id); void tipc_set_node_addr(struct net *net, u32 addr); -char *tipc_nodeid2string(char *str, u8 *id); +int tipc_nodeid2string(char *str, u8 *id); #endif diff --git a/net/tipc/link.c b/net/tipc/link.c index 3ee44d731700..931f55f781a1 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -495,11 +495,9 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id, /* Set link name for unicast links only */ if (peer_id) { - tipc_nodeid2string(self_str, tipc_own_id(net)); - if (strlen(self_str) > 16) + if (tipc_nodeid2string(self_str, tipc_own_id(net)) > NODE_ID_LEN) sprintf(self_str, "%x", self); - tipc_nodeid2string(peer_str, peer_id); - if (strlen(peer_str) > 16) + if (tipc_nodeid2string(peer_str, peer_id) > NODE_ID_LEN) sprintf(peer_str, "%x", peer); } /* Peer i/f name will be completed by reset/activate message */ @@ -570,8 +568,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id, if (peer_id) { char peer_str[NODE_ID_STR_LEN] = {0,}; - tipc_nodeid2string(peer_str, peer_id); - if (strlen(peer_str) > 16) + if (tipc_nodeid2string(peer_str, peer_id) > NODE_ID_LEN) sprintf(peer_str, "%x", peer); /* Broadcast receiver link name: "broadcast-link:<peer>" */ snprintf(l->name, sizeof(l->name), "%s:%s", tipc_bclink_name, diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e028bf658499..1574a83384f8 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -2366,7 +2366,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb, else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) { trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload2!"); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); err = TIPC_ERR_OVERLOAD; } @@ -2458,7 +2458,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk, trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload!"); /* Overload => reject message back to sender */ onode = tipc_own_addr(sock_net(sk)); - atomic_inc(&sk->sk_drops); + sk_drops_inc(sk); if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) { trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_ALL, "@sk_enqueue!"); @@ -3657,7 +3657,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb, nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ, skb_queue_len(&sk->sk_write_queue)) || nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP, - atomic_read(&sk->sk_drops))) + sk_drops_read(sk))) goto stat_msg_cancel; if (tsk->cong_link_cnt && diff --git a/net/tls/tls.h b/net/tls/tls.h index e4c42731ce39..2f86baeb71fc 100644 --- a/net/tls/tls.h +++ b/net/tls/tls.h @@ -128,8 +128,9 @@ struct tls_rec { char aad_space[TLS_AAD_SPACE_SIZE]; u8 iv_data[TLS_MAX_IV_SIZE]; + + /* Must be last --ends in a flexible-array member. */ struct aead_request aead_req; - u8 aead_req_ctx[]; }; int __net_init tls_proc_init(struct net *net); diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index f672a62a9a52..a64ae15b1a60 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -123,17 +123,19 @@ static void tls_device_queue_ctx_destruction(struct tls_context *ctx) /* We assume that the socket is already connected */ static struct net_device *get_netdev_for_sock(struct sock *sk) { - struct dst_entry *dst = sk_dst_get(sk); - struct net_device *netdev = NULL; + struct net_device *dev, *lowest_dev = NULL; + struct dst_entry *dst; - if (likely(dst)) { - netdev = netdev_sk_get_lowest_dev(dst->dev, sk); - dev_hold(netdev); + rcu_read_lock(); + dst = __sk_dst_get(sk); + dev = dst ? dst_dev_rcu(dst) : NULL; + if (likely(dev)) { + lowest_dev = netdev_sk_get_lowest_dev(dev, sk); + dev_hold(lowest_dev); } + rcu_read_unlock(); - dst_release(dst); - - return netdev; + return lowest_dev; } static void destroy_record(struct tls_record_info *record) @@ -1410,7 +1412,7 @@ int __init tls_device_init(void) if (!dummy_page) return -ENOMEM; - destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0); + destruct_wq = alloc_workqueue("ktls_device_destruct", WQ_PERCPU, 0); if (!destruct_wq) { err = -ENOMEM; goto err_free_dummy; diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c index 367666aa07b8..4012c4372d4c 100644 --- a/net/tls/tls_proc.c +++ b/net/tls/tls_proc.c @@ -27,17 +27,19 @@ static const struct snmp_mib tls_mib_list[] = { SNMP_MIB_ITEM("TlsTxRekeyOk", LINUX_MIB_TLSTXREKEYOK), SNMP_MIB_ITEM("TlsTxRekeyError", LINUX_MIB_TLSTXREKEYERROR), SNMP_MIB_ITEM("TlsRxRekeyReceived", LINUX_MIB_TLSRXREKEYRECEIVED), - SNMP_MIB_SENTINEL }; static int tls_statistics_seq_show(struct seq_file *seq, void *v) { - unsigned long buf[LINUX_MIB_TLSMAX] = {}; + unsigned long buf[ARRAY_SIZE(tls_mib_list)]; + const int cnt = ARRAY_SIZE(tls_mib_list); struct net *net = seq->private; int i; - snmp_get_cpu_field_batch(buf, tls_mib_list, net->mib.tls_statistics); - for (i = 0; tls_mib_list[i].name; i++) + memset(buf, 0, sizeof(buf)); + snmp_get_cpu_field_batch_cnt(buf, tls_mib_list, cnt, + net->mib.tls_statistics); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]); return 0; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 01e2b9452c75..684ab03137b6 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -592,7 +592,7 @@ static DECLARE_WORK(unix_gc_work, __unix_gc); void unix_gc(void) { WRITE_ONCE(gc_in_progress, true); - queue_work(system_unbound_wq, &unix_gc_work); + queue_work(system_dfl_wq, &unix_gc_work); } #define UNIX_INFLIGHT_TRIGGER_GC 16000 diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c index bebb355f3ffe..4c2db6cca557 100644 --- a/net/vmw_vsock/af_vsock.c +++ b/net/vmw_vsock/af_vsock.c @@ -1029,12 +1029,7 @@ static int vsock_getname(struct socket *sock, vm_addr = &vsk->local_addr; } - /* sys_getsockname() and sys_getpeername() pass us a - * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately - * that macro is defined in socket.c instead of .h, so we hardcode its - * value here. - */ - BUILD_BUG_ON(sizeof(*vm_addr) > 128); + BUILD_BUG_ON(sizeof(*vm_addr) > sizeof(struct sockaddr_storage)); memcpy(addr, vm_addr, sizeof(*vm_addr)); err = sizeof(*vm_addr); @@ -1654,7 +1649,7 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr, * reschedule it, then ungrab the socket refcount to * keep it balanced. */ - if (mod_delayed_work(system_wq, &vsk->connect_work, + if (mod_delayed_work(system_percpu_wq, &vsk->connect_work, timeout)) sock_put(sk); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index b6569b0ca2bb..8c867023a2e5 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -926,7 +926,7 @@ static int __init virtio_vsock_init(void) { int ret; - virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0); + virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", WQ_PERCPU, 0); if (!virtio_vsock_workqueue) return -ENOMEM; diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c index 6e78927a598e..bc2ff918b315 100644 --- a/net/vmw_vsock/vsock_loopback.c +++ b/net/vmw_vsock/vsock_loopback.c @@ -139,7 +139,7 @@ static int __init vsock_loopback_init(void) struct vsock_loopback *vsock = &the_vsock_loopback; int ret; - vsock->workqueue = alloc_workqueue("vsock-loopback", 0, 0); + vsock->workqueue = alloc_workqueue("vsock-loopback", WQ_PERCPU, 0); if (!vsock->workqueue) return -ENOMEM; diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 193734b7f9dc..68221b1ab45e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -100,6 +100,11 @@ static u32 cfg80211_get_end_freq(const struct cfg80211_chan_def *chandef, punctured = 0) : (punctured >>= 1))) \ if (!(punctured & 1)) +#define for_each_s1g_subchan(chandef, freq_khz) \ + for (freq_khz = cfg80211_s1g_get_start_freq_khz(chandef); \ + freq_khz <= cfg80211_s1g_get_end_freq_khz(chandef); \ + freq_khz += MHZ_TO_KHZ(1)) + struct cfg80211_per_bw_puncturing_values { u8 len; const u16 *valid_values; @@ -336,8 +341,7 @@ static bool cfg80211_valid_center_freq(u32 center, bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) { - u32 control_freq, oper_freq; - int oper_width, control_width; + u32 control_freq, control_freq_khz, start_khz, end_khz; if (!chandef->chan) return false; @@ -363,27 +367,16 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) case NL80211_CHAN_WIDTH_4: case NL80211_CHAN_WIDTH_8: case NL80211_CHAN_WIDTH_16: - if (chandef->chan->band != NL80211_BAND_S1GHZ) - return false; - - control_freq = ieee80211_channel_to_khz(chandef->chan); - oper_freq = ieee80211_chandef_to_khz(chandef); - control_width = nl80211_chan_width_to_mhz( - ieee80211_s1g_channel_width( - chandef->chan)); - oper_width = cfg80211_chandef_get_width(chandef); - - if (oper_width < 0 || control_width < 0) + if (!cfg80211_chandef_is_s1g(chandef)) return false; if (chandef->center_freq2) return false; - if (control_freq + MHZ_TO_KHZ(control_width) / 2 > - oper_freq + MHZ_TO_KHZ(oper_width) / 2) - return false; + control_freq_khz = ieee80211_channel_to_khz(chandef->chan); + start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + end_khz = cfg80211_s1g_get_end_freq_khz(chandef); - if (control_freq - MHZ_TO_KHZ(control_width) / 2 < - oper_freq - MHZ_TO_KHZ(oper_width) / 2) + if (control_freq_khz < start_khz || control_freq_khz > end_khz) return false; break; case NL80211_CHAN_WIDTH_80P80: @@ -461,6 +454,9 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef) !cfg80211_edmg_chandef_valid(chandef)) return false; + if (!cfg80211_chandef_is_s1g(chandef) && chandef->s1g_primary_2mhz) + return false; + return valid_puncturing_bitmap(chandef); } EXPORT_SYMBOL(cfg80211_chandef_valid); @@ -725,6 +721,10 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy, { struct ieee80211_channel *c; + /* DFS is not required for S1G */ + if (cfg80211_chandef_is_s1g(chandef)) + return 0; + for_each_subchan(chandef, freq, cf) { c = ieee80211_get_channel_khz(wiphy, freq); if (!c) @@ -1130,6 +1130,55 @@ static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels, return true; } +static bool cfg80211_s1g_usable(struct wiphy *wiphy, + const struct cfg80211_chan_def *chandef) +{ + u32 freq_khz; + const struct ieee80211_channel *chan; + u32 pri_khz = ieee80211_channel_to_khz(chandef->chan); + u32 end_khz = cfg80211_s1g_get_end_freq_khz(chandef); + u32 start_khz = cfg80211_s1g_get_start_freq_khz(chandef); + int width_mhz = cfg80211_chandef_get_width(chandef); + u32 prohibited_flags = IEEE80211_CHAN_DISABLED; + + if (width_mhz >= 16) + prohibited_flags |= IEEE80211_CHAN_NO_16MHZ; + if (width_mhz >= 8) + prohibited_flags |= IEEE80211_CHAN_NO_8MHZ; + if (width_mhz >= 4) + prohibited_flags |= IEEE80211_CHAN_NO_4MHZ; + + if (chandef->chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + if (pri_khz < start_khz || pri_khz > end_khz) + return false; + + for_each_s1g_subchan(chandef, freq_khz) { + chan = ieee80211_get_channel_khz(wiphy, freq_khz); + if (!chan || (chan->flags & prohibited_flags)) + return false; + } + + if (chandef->s1g_primary_2mhz) { + u32 sib_khz; + const struct ieee80211_channel *sibling; + + sibling = cfg80211_s1g_get_primary_sibling(wiphy, chandef); + if (!sibling) + return false; + + if (sibling->flags & IEEE80211_CHAN_S1G_NO_PRIMARY) + return false; + + sib_khz = ieee80211_channel_to_khz(sibling); + if (sib_khz < start_khz || sib_khz > end_khz) + return false; + } + + return true; +} + bool _cfg80211_chandef_usable(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef, u32 prohibited_flags, @@ -1154,6 +1203,9 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, ext_nss_cap = __le16_to_cpu(vht_cap->vht_mcs.tx_highest) & IEEE80211_VHT_EXT_NSS_BW_CAPABLE; + if (cfg80211_chandef_is_s1g(chandef)) + return cfg80211_s1g_usable(wiphy, chandef); + if (edmg_cap->channels && !cfg80211_edmg_usable(wiphy, chandef->edmg.channels, @@ -1165,21 +1217,6 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy, control_freq = chandef->chan->center_freq; switch (chandef->width) { - case NL80211_CHAN_WIDTH_1: - width = 1; - break; - case NL80211_CHAN_WIDTH_2: - width = 2; - break; - case NL80211_CHAN_WIDTH_4: - width = 4; - break; - case NL80211_CHAN_WIDTH_8: - width = 8; - break; - case NL80211_CHAN_WIDTH_16: - width = 16; - break; case NL80211_CHAN_WIDTH_5: width = 5; break; diff --git a/net/wireless/core.c b/net/wireless/core.c index a7e2931ffb2e..797f9f2004a6 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -1018,6 +1018,15 @@ int wiphy_register(struct wiphy *wiphy) rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_CTWIN; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_CTWINDOW; + if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS) + rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_OPPPS; + else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS) + rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_OPPPS; + rtnl_lock(); wiphy_lock(&rdev->wiphy); res = device_add(&rdev->wiphy.dev); diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c index 2613d6ac0fda..46e4317cbd7e 100644 --- a/net/wireless/ethtool.c +++ b/net/wireless/ethtool.c @@ -23,7 +23,7 @@ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info) else strscpy(info->fw_version, "N/A", sizeof(info->fw_version)); - strscpy(info->bus_info, dev_name(wiphy_dev(wdev->wiphy)), + strscpy(info->bus_info, dev_name(pdev), sizeof(info->bus_info)); } EXPORT_SYMBOL(cfg80211_get_drvinfo); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 852573423e52..346dfd2bd987 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -312,6 +312,26 @@ static int validate_supported_selectors(const struct nlattr *attr, return 0; } +static int validate_nan_cluster_id(const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + const u8 *data = nla_data(attr); + unsigned int len = nla_len(attr); + static const u8 cluster_id_prefix[4] = {0x50, 0x6f, 0x9a, 0x1}; + + if (len != ETH_ALEN) { + NL_SET_ERR_MSG_ATTR(extack, attr, "bad cluster id length"); + return -EINVAL; + } + + if (memcmp(data, cluster_id_prefix, sizeof(cluster_id_prefix))) { + NL_SET_ERR_MSG_ATTR(extack, attr, "invalid cluster id prefix"); + return -EINVAL; + } + + return 0; +} + /* policy for the attributes */ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR]; @@ -411,6 +431,14 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = { [NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8, NL80211_RATE_INFO_HE_1XLTF, NL80211_RATE_INFO_HE_4XLTF), + [NL80211_TXRATE_EHT] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_eht)), + [NL80211_TXRATE_EHT_GI] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_GI_0_8, + NL80211_RATE_INFO_EHT_GI_3_2), + [NL80211_TXRATE_EHT_LTF] = NLA_POLICY_RANGE(NLA_U8, + NL80211_RATE_INFO_EHT_1XLTF, + NL80211_RATE_INFO_EHT_8XLTF), + }; static const struct nla_policy @@ -492,6 +520,36 @@ nl80211_s1g_short_beacon[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1] = { IEEE80211_MAX_DATA_LEN), }; +static const struct nla_policy +nl80211_nan_band_conf_policy[NL80211_NAN_BAND_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_BAND_CONF_BAND] = NLA_POLICY_MAX(NLA_U8, + NUM_NL80211_BANDS - 1), + [NL80211_NAN_BAND_CONF_FREQ] = { .type = NLA_U16 }, + [NL80211_NAN_BAND_CONF_RSSI_CLOSE] = NLA_POLICY_MIN(NLA_S8, -59), + [NL80211_NAN_BAND_CONF_RSSI_MIDDLE] = NLA_POLICY_MIN(NLA_S8, -74), + [NL80211_NAN_BAND_CONF_WAKE_DW] = NLA_POLICY_MAX(NLA_U8, 5), + [NL80211_NAN_BAND_CONF_DISABLE_SCAN] = { .type = NLA_FLAG }, +}; + +static const struct nla_policy +nl80211_nan_conf_policy[NL80211_NAN_CONF_ATTR_MAX + 1] = { + [NL80211_NAN_CONF_CLUSTER_ID] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_nan_cluster_id, + ETH_ALEN), + [NL80211_NAN_CONF_EXTRA_ATTRS] = { .type = NLA_BINARY, + .len = IEEE80211_MAX_DATA_LEN}, + [NL80211_NAN_CONF_VENDOR_ELEMS] = + NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr, + IEEE80211_MAX_DATA_LEN), + [NL80211_NAN_CONF_BAND_CONFIGS] = + NLA_POLICY_NESTED_ARRAY(nl80211_nan_band_conf_policy), + [NL80211_NAN_CONF_SCAN_PERIOD] = { .type = NLA_U16 }, + [NL80211_NAN_CONF_SCAN_DWELL_TIME] = NLA_POLICY_RANGE(NLA_U16, 50, 512), + [NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL] = + NLA_POLICY_RANGE(NLA_U8, 50, 200), + [NL80211_NAN_CONF_NOTIFY_DW] = { .type = NLA_FLAG }, +}; + static const struct netlink_range_validation nl80211_punct_bitmap_range = { .min = 0, .max = 0xffff, @@ -761,6 +819,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN), [NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1), [NL80211_ATTR_BANDS] = { .type = NLA_U32 }, + [NL80211_ATTR_NAN_CONFIG] = NLA_POLICY_NESTED(nl80211_nan_conf_policy), [NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED }, [NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY, .len = FILS_MAX_KEK_LEN }, @@ -871,6 +930,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_S1G_LONG_BEACON_PERIOD] = NLA_POLICY_MIN(NLA_U8, 2), [NL80211_ATTR_S1G_SHORT_BEACON] = NLA_POLICY_NESTED(nl80211_s1g_short_beacon), + [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG }, + [NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -1219,21 +1280,6 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, if ((chan->flags & IEEE80211_CHAN_NO_HE) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HE)) goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_1MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_1MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_2MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_2MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_4MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_4MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_8MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_8MHZ)) - goto nla_put_failure; - if ((chan->flags & IEEE80211_CHAN_16MHZ) && - nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ)) - goto nla_put_failure; if ((chan->flags & IEEE80211_CHAN_NO_320MHZ) && nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_320MHZ)) goto nla_put_failure; @@ -1259,6 +1305,15 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy, nla_put_flag(msg, NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY)) goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_4MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_4MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_8MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_8MHZ)) + goto nla_put_failure; + if ((chan->flags & IEEE80211_CHAN_NO_16MHZ) && + nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_16MHZ)) + goto nla_put_failure; } if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER, @@ -2545,6 +2600,41 @@ fail: return -ENOBUFS; } +static int nl80211_put_nan_capa(struct wiphy *wiphy, struct sk_buff *msg) +{ + struct nlattr *nan_caps; + + nan_caps = nla_nest_start(msg, NL80211_ATTR_NAN_CAPABILITIES); + if (!nan_caps) + return -ENOBUFS; + + if (wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC && + nla_put_flag(msg, NL80211_NAN_CAPA_CONFIGURABLE_SYNC)) + goto fail; + + if ((wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE) && + nla_put_flag(msg, NL80211_NAN_CAPA_USERSPACE_DE)) + goto fail; + + if (nla_put_u8(msg, NL80211_NAN_CAPA_OP_MODE, + wiphy->nan_capa.op_mode) || + nla_put_u8(msg, NL80211_NAN_CAPA_NUM_ANTENNAS, + wiphy->nan_capa.n_antennas) || + nla_put_u16(msg, NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME, + wiphy->nan_capa.max_channel_switch_time) || + nla_put_u8(msg, NL80211_NAN_CAPA_CAPABILITIES, + wiphy->nan_capa.dev_capabilities)) + goto fail; + + nla_nest_end(msg, nan_caps); + + return 0; + +fail: + nla_nest_cancel(msg, nan_caps); + return -ENOBUFS; +} + struct nl80211_dump_wiphy_state { s64 filter_wiphy; long start; @@ -3019,6 +3109,40 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, rdev->wiphy.ext_features)) goto nla_put_failure; + if (rdev->wiphy.bss_param_support) { + struct nlattr *nested; + u32 parsup = rdev->wiphy.bss_param_support; + + nested = nla_nest_start(msg, NL80211_ATTR_BSS_PARAM); + if (!nested) + goto nla_put_failure; + + if ((parsup & WIPHY_BSS_PARAM_CTS_PROT) && + nla_put_flag(msg, NL80211_ATTR_BSS_CTS_PROT)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_PREAMBLE) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_PREAMBLE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_SHORT_SLOT_TIME) && + nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_SLOT_TIME)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_BASIC_RATES) && + nla_put_flag(msg, NL80211_ATTR_BSS_BASIC_RATES)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_AP_ISOLATE) && + nla_put_flag(msg, NL80211_ATTR_AP_ISOLATE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_HT_OPMODE) && + nla_put_flag(msg, NL80211_ATTR_BSS_HT_OPMODE)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_CTWINDOW) && + nla_put_flag(msg, NL80211_ATTR_P2P_CTWINDOW)) + goto nla_put_failure; + if ((parsup & WIPHY_BSS_PARAM_P2P_OPPPS) && + nla_put_flag(msg, NL80211_ATTR_P2P_OPPPS)) + goto nla_put_failure; + nla_nest_end(msg, nested); + } if (rdev->wiphy.bss_select_support) { struct nlattr *nested; u32 bss_select_support = rdev->wiphy.bss_select_support; @@ -3163,6 +3287,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (nl80211_put_radios(&rdev->wiphy, msg)) goto nla_put_failure; + state->split_start++; + break; + case 18: + if (nl80211_put_nan_capa(&rdev->wiphy, msg)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; @@ -3406,6 +3536,7 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, chandef->center_freq1 = KHZ_TO_MHZ(control_freq); chandef->freq1_offset = control_freq % 1000; chandef->center_freq2 = 0; + chandef->s1g_primary_2mhz = false; if (!chandef->chan) { NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ], @@ -3449,27 +3580,20 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev, return -EINVAL; } } else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) { - chandef->width = - nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); - if (chandef->chan->band == NL80211_BAND_S1GHZ) { - /* User input error for channel width doesn't match channel */ - if (chandef->width != ieee80211_s1g_channel_width(chandef->chan)) { - NL_SET_ERR_MSG_ATTR(extack, - attrs[NL80211_ATTR_CHANNEL_WIDTH], - "bad channel width"); - return -EINVAL; - } - } + chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]); if (attrs[NL80211_ATTR_CENTER_FREQ1]) { chandef->center_freq1 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]); - chandef->freq1_offset = - nla_get_u32_default(attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], - 0); + chandef->freq1_offset = nla_get_u32_default( + attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], 0); } + if (attrs[NL80211_ATTR_CENTER_FREQ2]) chandef->center_freq2 = nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]); + + chandef->s1g_primary_2mhz = nla_get_flag( + attrs[NL80211_ATTR_S1G_PRIMARY_2MHZ]); } if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) { @@ -5393,6 +5517,164 @@ static bool he_set_mcs_mask(struct genl_info *info, return true; } +static void eht_build_mcs_mask(struct genl_info *info, + const struct ieee80211_sta_eht_cap *eht_cap, + u8 mcs_nss_len, u16 *mcs_mask) +{ + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + u8 nss, mcs_7 = 0, mcs_9 = 0, mcs_11 = 0, mcs_13 = 0; + unsigned int link_id = nl80211_link_id(info->attrs); + + if (mcs_nss_len == 4) { + const struct ieee80211_eht_mcs_nss_supp_20mhz_only *mcs = + &eht_cap->eht_mcs_nss_supp.only_20mhz; + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs7_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + + } else { + const struct ieee80211_eht_mcs_nss_supp_bw *mcs; + enum nl80211_chan_width width; + + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + width = wdev->u.ibss.chandef.width; + break; + case NL80211_IFTYPE_MESH_POINT: + width = wdev->u.mesh.chandef.width; + break; + case NL80211_IFTYPE_OCB: + width = wdev->u.ocb.chandef.width; + break; + default: + if (wdev->valid_links) + width = wdev->links[link_id].ap.chandef.width; + else + width = wdev->u.ap.preset_chandef.width; + break; + } + + switch (width) { + case NL80211_CHAN_WIDTH_320: + mcs = &eht_cap->eht_mcs_nss_supp.bw._320; + break; + case NL80211_CHAN_WIDTH_160: + mcs = &eht_cap->eht_mcs_nss_supp.bw._160; + break; + default: + mcs = &eht_cap->eht_mcs_nss_supp.bw._80; + break; + } + + mcs_7 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss, + IEEE80211_EHT_MCS_NSS_TX); + } + + /* Enable MCS 14 for NSS 0 */ + if (eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP) + mcs_mask[0] |= 0x4000; + + /* Enable MCS 15 for NSS 0 */ + mcs_mask[0] |= 0x8000; + + for (nss = 0; nss < NL80211_EHT_NSS_MAX; nss++) { + if (!mcs_7) + continue; + mcs_mask[nss] |= 0x00FF; + mcs_7--; + + if (!mcs_9) + continue; + mcs_mask[nss] |= 0x0300; + mcs_9--; + + if (!mcs_11) + continue; + mcs_mask[nss] |= 0x0C00; + mcs_11--; + + if (!mcs_13) + continue; + mcs_mask[nss] |= 0x3000; + mcs_13--; + } +} + +static bool eht_set_mcs_mask(struct genl_info *info, struct wireless_dev *wdev, + struct ieee80211_supported_band *sband, + struct nl80211_txrate_eht *txrate, + u16 mcs[NL80211_EHT_NSS_MAX]) +{ + const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u16 tx_mcs_mask[NL80211_EHT_NSS_MAX] = { 0 }; + u8 i, mcs_nss_len; + + he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype); + if (!he_cap) + return false; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + return false; + + /* Checks for MCS 14 */ + if (txrate->mcs[0] & 0x4000) { + if (sband->band != NL80211_BAND_6GHZ) + return false; + + if (!(eht_cap->eht_cap_elem.phy_cap_info[6] & + IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP)) + return false; + } + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + if (mcs_nss_len == 3) { + /* Supported iftypes for setting non-20 MHZ only EHT MCS */ + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_OCB: + break; + default: + return false; + } + } + + /* Build eht_mcs_mask from EHT and HE capabilities */ + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, tx_mcs_mask); + + memset(mcs, 0, sizeof(u16) * NL80211_EHT_NSS_MAX); + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i]) + mcs[i] = txrate->mcs[i]; + else + return false; + } + + return true; +} + static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, struct nlattr *attrs[], enum nl80211_attrs attr, @@ -5413,6 +5695,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, /* Default to all rates enabled */ for (i = 0; i < NUM_NL80211_BANDS; i++) { const struct ieee80211_sta_he_cap *he_cap; + const struct ieee80211_sta_eht_cap *eht_cap; + u8 mcs_nss_len; if (!default_all_enabled) break; @@ -5441,6 +5725,21 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[i].he_gi = 0xFF; mask->control[i].he_ltf = 0xFF; + + eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype); + if (!eht_cap) + continue; + + mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem, + &eht_cap->eht_cap_elem, + wdev->iftype == + NL80211_IFTYPE_STATION); + + eht_build_mcs_mask(info, eht_cap, mcs_nss_len, + mask->control[i].eht_mcs); + + mask->control[i].eht_gi = 0xFF; + mask->control[i].eht_ltf = 0xFF; } /* if no rates are given set it back to the defaults */ @@ -5512,13 +5811,27 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, mask->control[band].he_ltf = nla_get_u8(tb[NL80211_TXRATE_HE_LTF]); + if (tb[NL80211_TXRATE_EHT] && + !eht_set_mcs_mask(info, wdev, sband, + nla_data(tb[NL80211_TXRATE_EHT]), + mask->control[band].eht_mcs)) + return -EINVAL; + + if (tb[NL80211_TXRATE_EHT_GI]) + mask->control[band].eht_gi = + nla_get_u8(tb[NL80211_TXRATE_EHT_GI]); + if (tb[NL80211_TXRATE_EHT_LTF]) + mask->control[band].eht_ltf = + nla_get_u8(tb[NL80211_TXRATE_EHT_LTF]); + if (mask->control[band].legacy == 0) { - /* don't allow empty legacy rates if HT, VHT or HE + /* don't allow empty legacy rates if HT, VHT, HE or EHT * are not even supported. */ if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported || rdev->wiphy.bands[band]->vht_cap.vht_supported || - ieee80211_get_he_iftype_cap(sband, wdev->iftype))) + ieee80211_get_he_iftype_cap(sband, wdev->iftype) || + ieee80211_get_eht_iftype_cap(sband, wdev->iftype))) return -EINVAL; for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++) @@ -5533,6 +5846,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (mask->control[band].he_mcs[i]) goto out; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) + if (mask->control[band].eht_mcs[i]) + goto out; + /* legacy and mcs rates may not be both empty */ return -EINVAL; } @@ -5546,7 +5863,7 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, enum nl80211_band band, struct cfg80211_bitrate_mask *beacon_rate) { - u32 count_ht, count_vht, count_he, i; + u32 count_ht, count_vht, count_he, count_eht, i; u32 rate = beacon_rate->control[band].legacy; /* Allow only one rate */ @@ -5592,8 +5909,21 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, return -EINVAL; } - if ((count_ht && count_vht && count_he) || - (!rate && !count_ht && !count_vht && !count_he)) + count_eht = 0; + for (i = 0; i < NL80211_EHT_NSS_MAX; i++) { + if (hweight16(beacon_rate->control[band].eht_mcs[i]) > 1) { + return -EINVAL; + } else if (beacon_rate->control[band].eht_mcs[i]) { + count_eht++; + if (count_eht > 1) + return -EINVAL; + } + if (count_eht && rate) + return -EINVAL; + } + + if ((count_ht && count_vht && count_he && count_eht) || + (!rate && !count_ht && !count_vht && !count_he && !count_eht)) return -EINVAL; if (rate && @@ -5613,6 +5943,11 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, NL80211_EXT_FEATURE_BEACON_RATE_HE)) return -EINVAL; + if (count_eht && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_BEACON_RATE_EHT)) + return -EINVAL; + return 0; } @@ -8830,6 +9165,9 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct bss_parameters params; + u32 bss_param_support = rdev->wiphy.bss_param_support; + u32 changed = 0; + bool strict; memset(¶ms, 0, sizeof(params)); params.link_id = nl80211_link_id_or_invalid(info->attrs); @@ -8842,26 +9180,54 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) params.p2p_ctwindow = -1; params.p2p_opp_ps = -1; - if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) + strict = nla_get_flag(info->attrs[NL80211_ATTR_BSS_PARAM]); + if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_CTS_PROT)) + return -EINVAL; params.use_cts_prot = nla_get_u8(info->attrs[NL80211_ATTR_BSS_CTS_PROT]); - if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) + changed |= WIPHY_BSS_PARAM_CTS_PROT; + } + if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_SHORT_PREAMBLE)) + return -EINVAL; params.use_short_preamble = nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]); - if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) + changed |= WIPHY_BSS_PARAM_SHORT_PREAMBLE; + } + if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_SHORT_SLOT_TIME)) + return -EINVAL; params.use_short_slot_time = nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]); + changed |= WIPHY_BSS_PARAM_SHORT_SLOT_TIME; + } if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) { + if (strict && + !(bss_param_support & WIPHY_BSS_PARAM_BASIC_RATES)) + return -EINVAL; params.basic_rates = nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); params.basic_rates_len = nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); + changed |= WIPHY_BSS_PARAM_BASIC_RATES; + } + if (info->attrs[NL80211_ATTR_AP_ISOLATE]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_AP_ISOLATE)) + return -EINVAL; + params.ap_isolate = + !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); + changed |= WIPHY_BSS_PARAM_AP_ISOLATE; } - if (info->attrs[NL80211_ATTR_AP_ISOLATE]) - params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); - if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) + if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) { + if (strict && !(bss_param_support & WIPHY_BSS_PARAM_HT_OPMODE)) + return -EINVAL; params.ht_opmode = nla_get_u16(info->attrs[NL80211_ATTR_BSS_HT_OPMODE]); + changed |= WIPHY_BSS_PARAM_HT_OPMODE; + } if (info->attrs[NL80211_ATTR_P2P_CTWINDOW]) { if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) @@ -8869,8 +9235,9 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) params.p2p_ctwindow = nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]); if (params.p2p_ctwindow != 0 && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)) + !(bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW)) return -EINVAL; + changed |= WIPHY_BSS_PARAM_P2P_CTWINDOW; } if (info->attrs[NL80211_ATTR_P2P_OPPPS]) { @@ -8879,9 +9246,11 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EINVAL; tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]); + if (tmp && !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) + return -EINVAL; params.p2p_opp_ps = tmp; if (params.p2p_opp_ps && - !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)) + !(rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)) return -EINVAL; } @@ -8892,6 +9261,10 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; + changed &= rdev->wiphy.bss_param_support; + if (!changed) + return 0; + return rdev_change_bss(rdev, dev, ¶ms); } @@ -10071,8 +10444,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) goto out_free; } - /* ignore disabled channels */ + /* Ignore disabled / no primary channels */ if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; @@ -10094,6 +10468,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) chan = &wiphy->bands[band]->channels[j]; if (chan->flags & IEEE80211_CHAN_DISABLED || + chan->flags & + IEEE80211_CHAN_S1G_NO_PRIMARY || !cfg80211_wdev_channel_allowed(wdev, chan)) continue; @@ -13398,7 +13774,9 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) break; case NL80211_IFTYPE_NAN: if (!wiphy_ext_feature_isset(wdev->wiphy, - NL80211_EXT_FEATURE_SECURE_NAN)) + NL80211_EXT_FEATURE_SECURE_NAN) && + !(wdev->wiphy->nan_capa.flags & + WIPHY_NAN_FLAGS_USERSPACE_DE)) return -EOPNOTSUPP; break; default: @@ -13459,7 +13837,9 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) break; case NL80211_IFTYPE_NAN: if (!wiphy_ext_feature_isset(wdev->wiphy, - NL80211_EXT_FEATURE_SECURE_NAN)) + NL80211_EXT_FEATURE_SECURE_NAN) && + !(wdev->wiphy->nan_capa.flags & + WIPHY_NAN_FLAGS_USERSPACE_DE)) return -EOPNOTSUPP; break; default: @@ -15106,6 +15486,216 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info) return 0; } +static struct ieee80211_channel *nl80211_get_nan_channel(struct wiphy *wiphy, + int freq) +{ + struct ieee80211_channel *chan; + struct cfg80211_chan_def def; + + /* Check if the frequency is valid for NAN */ + if (freq != 5220 && freq != 5745 && freq != 2437) + return NULL; + + chan = ieee80211_get_channel(wiphy, freq); + if (!chan) + return NULL; + + cfg80211_chandef_create(&def, chan, NL80211_CHAN_NO_HT); + + /* Check if the channel is allowed */ + if (cfg80211_reg_can_beacon(wiphy, &def, NL80211_IFTYPE_NAN)) + return chan; + + return NULL; +} + +static int nl80211_parse_nan_band_config(struct wiphy *wiphy, + struct nlattr **tb, + struct cfg80211_nan_band_config *cfg, + enum nl80211_band band) +{ + if (BIT(band) & ~(u32)wiphy->nan_supported_bands) + return -EINVAL; + + if (tb[NL80211_NAN_BAND_CONF_FREQ]) { + u16 freq = nla_get_u16(tb[NL80211_NAN_BAND_CONF_FREQ]); + + if (band != NL80211_BAND_5GHZ) + return -EINVAL; + + cfg->chan = nl80211_get_nan_channel(wiphy, freq); + if (!cfg->chan) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]) { + cfg->rssi_close = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]); + if (!tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) { + cfg->rssi_middle = + nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]); + if (!cfg->rssi_close || cfg->rssi_middle >= cfg->rssi_close) + return -EINVAL; + } + + if (tb[NL80211_NAN_BAND_CONF_WAKE_DW]) { + cfg->awake_dw_interval = + nla_get_u8(tb[NL80211_NAN_BAND_CONF_WAKE_DW]); + + if (band == NL80211_BAND_2GHZ && cfg->awake_dw_interval == 0) + return -EINVAL; + } + + cfg->disable_scan = + nla_get_flag(tb[NL80211_NAN_BAND_CONF_DISABLE_SCAN]); + return 0; +} + +static int nl80211_parse_nan_conf(struct wiphy *wiphy, + struct genl_info *info, + struct cfg80211_nan_conf *conf, + u32 *changed_flags) +{ + struct nlattr *attrs[NL80211_NAN_CONF_ATTR_MAX + 1]; + int err, rem; + u32 changed = 0; + struct nlattr *band_config; + + if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { + conf->master_pref = + nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); + + changed |= CFG80211_NAN_CONF_CHANGED_PREF; + } + + if (info->attrs[NL80211_ATTR_BANDS]) { + u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); + + if (bands & ~(u32)wiphy->nan_supported_bands) + return -EOPNOTSUPP; + + if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) + return -EINVAL; + + conf->bands = bands; + changed |= CFG80211_NAN_CONF_CHANGED_BANDS; + } + + conf->band_cfgs[NL80211_BAND_2GHZ].awake_dw_interval = 1; + if (conf->bands & BIT(NL80211_BAND_5GHZ) || !conf->bands) + conf->band_cfgs[NL80211_BAND_5GHZ].awake_dw_interval = 1; + + /* On 2.4 GHz band use channel 6 */ + conf->band_cfgs[NL80211_BAND_2GHZ].chan = + nl80211_get_nan_channel(wiphy, 2437); + if (!conf->band_cfgs[NL80211_BAND_2GHZ].chan) + return -EINVAL; + + if (!info->attrs[NL80211_ATTR_NAN_CONFIG]) + goto out; + + err = nla_parse_nested(attrs, NL80211_NAN_CONF_ATTR_MAX, + info->attrs[NL80211_ATTR_NAN_CONFIG], NULL, + info->extack); + if (err) + return err; + + changed |= CFG80211_NAN_CONF_CHANGED_CONFIG; + if (attrs[NL80211_NAN_CONF_CLUSTER_ID]) + conf->cluster_id = + nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]); + + if (attrs[NL80211_NAN_CONF_EXTRA_ATTRS]) { + conf->extra_nan_attrs = + nla_data(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + conf->extra_nan_attrs_len = + nla_len(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]); + } + + if (attrs[NL80211_NAN_CONF_VENDOR_ELEMS]) { + conf->vendor_elems = + nla_data(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + conf->vendor_elems_len = + nla_len(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]); + } + + if (attrs[NL80211_NAN_CONF_BAND_CONFIGS]) { + nla_for_each_nested(band_config, + attrs[NL80211_NAN_CONF_BAND_CONFIGS], + rem) { + enum nl80211_band band; + struct cfg80211_nan_band_config *cfg; + struct nlattr *tb[NL80211_NAN_BAND_CONF_ATTR_MAX + 1]; + + err = nla_parse_nested(tb, + NL80211_NAN_BAND_CONF_ATTR_MAX, + band_config, NULL, + info->extack); + if (err) + return err; + + if (!tb[NL80211_NAN_BAND_CONF_BAND]) + return -EINVAL; + + band = nla_get_u8(tb[NL80211_NAN_BAND_CONF_BAND]); + if (conf->bands && !(conf->bands & BIT(band))) + return -EINVAL; + + cfg = &conf->band_cfgs[band]; + + err = nl80211_parse_nan_band_config(wiphy, tb, cfg, + band); + if (err) + return err; + } + } + + if (attrs[NL80211_NAN_CONF_SCAN_PERIOD]) + conf->scan_period = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_PERIOD]); + + if (attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]) + conf->scan_dwell_time = + nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]); + + if (attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]) + conf->discovery_beacon_interval = + nla_get_u8(attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]); + + if (attrs[NL80211_NAN_CONF_NOTIFY_DW]) + conf->enable_dw_notification = + nla_get_flag(attrs[NL80211_NAN_CONF_NOTIFY_DW]); + +out: + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + (!conf->bands || conf->bands & BIT(NL80211_BAND_5GHZ))) { + /* If no 5GHz channel is specified use default, if possible */ + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5745); + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan) + conf->band_cfgs[NL80211_BAND_5GHZ].chan = + nl80211_get_nan_channel(wiphy, 5220); + + /* Return error if user space asked explicitly for 5 GHz */ + if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan && + conf->bands & BIT(NL80211_BAND_5GHZ)) { + NL_SET_ERR_MSG_ATTR(info->extack, + info->attrs[NL80211_ATTR_BANDS], + "5 GHz band operation is not allowed"); + return -EINVAL; + } + } + + if (changed_flags) + *changed_flags = changed; + + return 0; +} + static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -15122,23 +15712,13 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info) if (rfkill_blocked(rdev->wiphy.rfkill)) return -ERFKILL; + /* Master preference is mandatory for START_NAN */ if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) return -EINVAL; - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); - - if (bands & ~(u32)wdev->wiphy->nan_supported_bands) - return -EOPNOTSUPP; - - if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) - return -EINVAL; - - conf.bands = bands; - } + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL); + if (err) + return err; err = rdev_start_nan(rdev, wdev, &conf); if (err) @@ -15494,6 +16074,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb, struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_nan_conf conf = {}; u32 changed = 0; + int err; if (wdev->iftype != NL80211_IFTYPE_NAN) return -EOPNOTSUPP; @@ -15501,27 +16082,9 @@ static int nl80211_nan_change_config(struct sk_buff *skb, if (!wdev_running(wdev)) return -ENOTCONN; - if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) { - conf.master_pref = - nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]); - if (conf.master_pref <= 1 || conf.master_pref == 255) - return -EINVAL; - - changed |= CFG80211_NAN_CONF_CHANGED_PREF; - } - - if (info->attrs[NL80211_ATTR_BANDS]) { - u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]); - - if (bands & ~(u32)wdev->wiphy->nan_supported_bands) - return -EOPNOTSUPP; - - if (bands && !(bands & BIT(NL80211_BAND_2GHZ))) - return -EINVAL; - - conf.bands = bands; - changed |= CFG80211_NAN_CONF_CHANGED_BANDS; - } + err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed); + if (err) + return err; if (!changed) return -EINVAL; @@ -21244,6 +21807,88 @@ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled) } EXPORT_SYMBOL(cfg80211_epcs_changed); +void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev, + struct ieee80211_channel *chan, gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_next_nan_dw_notif(wdev, chan); + + if (!wdev->owner_nlportid) + return; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, + NL80211_CMD_NAN_NEXT_DW_NOTIFICATION); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + genlmsg_unicast(wiphy_net(wiphy), msg, wdev->owner_nlportid); + + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_next_nan_dw_notif); + +void cfg80211_nan_cluster_joined(struct wireless_dev *wdev, + const u8 *cluster_id, bool new_cluster, + gfp_t gfp) +{ + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct sk_buff *msg; + void *hdr; + + trace_cfg80211_nan_cluster_joined(wdev, cluster_id, new_cluster); + + memcpy(wdev->u.nan.cluster_id, cluster_id, ETH_ALEN); + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp); + if (!msg) + return; + + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_CLUSTER_JOINED); + if (!hdr) + goto nla_put_failure; + + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || + nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev), + NL80211_ATTR_PAD) || + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, cluster_id) || + (new_cluster && nla_put_flag(msg, NL80211_ATTR_NAN_NEW_CLUSTER))) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + + if (!wdev->owner_nlportid) + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(wiphy), + msg, 0, NL80211_MCGRP_NAN, gfp); + else + genlmsg_unicast(wiphy_net(wiphy), msg, + wdev->owner_nlportid); + return; + + nla_put_failure: + nlmsg_free(msg); +} +EXPORT_SYMBOL(cfg80211_nan_cluster_joined); + /* initialisation/exit functions */ int __init nl80211_init(void) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 3b0ac3437f81..73cab51f6379 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1707,6 +1707,16 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd if (reg_rule->flags & NL80211_RRF_AUTO_BW) max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule); + if (is_s1g) { + if (max_bandwidth_khz < MHZ_TO_KHZ(16)) + bw_flags |= IEEE80211_CHAN_NO_16MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(8)) + bw_flags |= IEEE80211_CHAN_NO_8MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(4)) + bw_flags |= IEEE80211_CHAN_NO_4MHZ; + return bw_flags; + } + /* If we get a reg_rule we can assume that at least 5Mhz fit */ if (!cfg80211_does_bw_fit_range(freq_range, center_freq_khz, @@ -1717,59 +1727,19 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd MHZ_TO_KHZ(20))) bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (is_s1g) { - /* S1G is strict about non overlapping channels. We can - * calculate which bandwidth is allowed per channel by finding - * the largest bandwidth which cleanly divides the freq_range. - */ - int edge_offset; - int ch_bw = max_bandwidth_khz; - - while (ch_bw) { - edge_offset = (center_freq_khz - ch_bw / 2) - - freq_range->start_freq_khz; - if (edge_offset % ch_bw == 0) { - switch (KHZ_TO_MHZ(ch_bw)) { - case 1: - bw_flags |= IEEE80211_CHAN_1MHZ; - break; - case 2: - bw_flags |= IEEE80211_CHAN_2MHZ; - break; - case 4: - bw_flags |= IEEE80211_CHAN_4MHZ; - break; - case 8: - bw_flags |= IEEE80211_CHAN_8MHZ; - break; - case 16: - bw_flags |= IEEE80211_CHAN_16MHZ; - break; - default: - /* If we got here, no bandwidths fit on - * this frequency, ie. band edge. - */ - bw_flags |= IEEE80211_CHAN_DISABLED; - break; - } - break; - } - ch_bw /= 2; - } - } else { - if (max_bandwidth_khz < MHZ_TO_KHZ(10)) - bw_flags |= IEEE80211_CHAN_NO_10MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(20)) - bw_flags |= IEEE80211_CHAN_NO_20MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(40)) - bw_flags |= IEEE80211_CHAN_NO_HT40; - if (max_bandwidth_khz < MHZ_TO_KHZ(80)) - bw_flags |= IEEE80211_CHAN_NO_80MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(160)) - bw_flags |= IEEE80211_CHAN_NO_160MHZ; - if (max_bandwidth_khz < MHZ_TO_KHZ(320)) - bw_flags |= IEEE80211_CHAN_NO_320MHZ; - } + if (max_bandwidth_khz < MHZ_TO_KHZ(10)) + bw_flags |= IEEE80211_CHAN_NO_10MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(20)) + bw_flags |= IEEE80211_CHAN_NO_20MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(40)) + bw_flags |= IEEE80211_CHAN_NO_HT40; + if (max_bandwidth_khz < MHZ_TO_KHZ(80)) + bw_flags |= IEEE80211_CHAN_NO_80MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(160)) + bw_flags |= IEEE80211_CHAN_NO_160MHZ; + if (max_bandwidth_khz < MHZ_TO_KHZ(320)) + bw_flags |= IEEE80211_CHAN_NO_320MHZ; + return bw_flags; } diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 6c7b7c3828a4..90a9187a6b13 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -1816,6 +1816,9 @@ static void cfg80211_update_hidden_bsses(struct cfg80211_internal_bss *known, WARN_ON(ies != old_ies); rcu_assign_pointer(bss->pub.beacon_ies, new_ies); + + bss->ts = known->ts; + bss->pub.ts_boottime = known->pub.ts_boottime; } } @@ -1882,6 +1885,10 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, { lockdep_assert_held(&rdev->bss_lock); + /* Update time stamps */ + known->ts = new->ts; + known->pub.ts_boottime = new->pub.ts_boottime; + /* Update IEs */ if (rcu_access_pointer(new->pub.proberesp_ies)) { const struct cfg80211_bss_ies *old; @@ -1945,8 +1952,6 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, if (signal_valid) known->pub.signal = new->pub.signal; known->pub.capability = new->pub.capability; - known->ts = new->ts; - known->pub.ts_boottime = new->pub.ts_boottime; known->parent_tsf = new->parent_tsf; known->pub.chains = new->pub.chains; memcpy(known->pub.chain_signal, new->pub.chain_signal, diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 34c584a215e5..8a4c34112eb5 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3137,23 +3137,6 @@ DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_notify_new_peer_candidate, TP_ARGS(netdev, macaddr) ); -DECLARE_EVENT_CLASS(netdev_evt_only, - TP_PROTO(struct net_device *netdev), - TP_ARGS(netdev), - TP_STRUCT__entry( - NETDEV_ENTRY - ), - TP_fast_assign( - NETDEV_ASSIGN; - ), - TP_printk(NETDEV_PR_FMT , NETDEV_PR_ARG) -); - -DEFINE_EVENT(netdev_evt_only, cfg80211_send_rx_auth, - TP_PROTO(struct net_device *netdev), - TP_ARGS(netdev) -); - TRACE_EVENT(cfg80211_send_rx_assoc, TP_PROTO(struct net_device *netdev, const struct cfg80211_rx_assoc_resp_data *data), @@ -3480,21 +3463,6 @@ TRACE_EVENT(cfg80211_reg_can_beacon, __entry->prohibited_flags, __entry->permitting_flags) ); -TRACE_EVENT(cfg80211_chandef_dfs_required, - TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef), - TP_ARGS(wiphy, chandef), - TP_STRUCT__entry( - WIPHY_ENTRY - CHAN_DEF_ENTRY - ), - TP_fast_assign( - WIPHY_ASSIGN; - CHAN_DEF_ASSIGN(chandef); - ), - TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT, - WIPHY_PR_ARG, CHAN_DEF_PR_ARG) -); - TRACE_EVENT(cfg80211_ch_switch_notify, TP_PROTO(struct net_device *netdev, struct cfg80211_chan_def *chandef, @@ -3862,30 +3830,6 @@ DEFINE_EVENT(cfg80211_bss_evt, cfg80211_return_bss, TP_ARGS(pub) ); -TRACE_EVENT(cfg80211_return_uint, - TP_PROTO(unsigned int ret), - TP_ARGS(ret), - TP_STRUCT__entry( - __field(unsigned int, ret) - ), - TP_fast_assign( - __entry->ret = ret; - ), - TP_printk("ret: %d", __entry->ret) -); - -TRACE_EVENT(cfg80211_return_u32, - TP_PROTO(u32 ret), - TP_ARGS(ret), - TP_STRUCT__entry( - __field(u32, ret) - ), - TP_fast_assign( - __entry->ret = ret; - ), - TP_printk("ret: %u", __entry->ret) -); - TRACE_EVENT(cfg80211_report_wowlan_wakeup, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, struct cfg80211_wowlan_wakeup *wakeup), @@ -4222,6 +4166,41 @@ TRACE_EVENT(cfg80211_epcs_changed, WDEV_PR_ARG, __entry->enabled) ); +TRACE_EVENT(cfg80211_next_nan_dw_notif, + TP_PROTO(struct wireless_dev *wdev, + struct ieee80211_channel *chan), + TP_ARGS(wdev, chan), + TP_STRUCT__entry( + WDEV_ENTRY + CHAN_ENTRY + ), + TP_fast_assign( + WDEV_ASSIGN; + CHAN_ASSIGN(chan); + ), + TP_printk(WDEV_PR_FMT " " CHAN_PR_FMT, + WDEV_PR_ARG, CHAN_PR_ARG) +); + +TRACE_EVENT(cfg80211_nan_cluster_joined, + TP_PROTO(struct wireless_dev *wdev, + const u8 *cluster_id, + bool new_cluster), + TP_ARGS(wdev, cluster_id, new_cluster), + TP_STRUCT__entry( + WDEV_ENTRY + MAC_ENTRY(cluster_id) + __field(bool, new_cluster) + ), + TP_fast_assign( + WDEV_ASSIGN; + MAC_ASSIGN(cluster_id, cluster_id); + __entry->new_cluster = new_cluster; + ), + TP_printk(WDEV_PR_FMT " cluster_id %pMF%s", + WDEV_PR_ARG, __entry->cluster_id, + __entry->new_cluster ? " [new]" : "") +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH diff --git a/net/wireless/util.c b/net/wireless/util.c index 240c68baa3d1..56724b33af04 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -106,33 +106,6 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band) } EXPORT_SYMBOL(ieee80211_channel_to_freq_khz); -enum nl80211_chan_width -ieee80211_s1g_channel_width(const struct ieee80211_channel *chan) -{ - if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ)) - return NL80211_CHAN_WIDTH_20_NOHT; - - /*S1G defines a single allowed channel width per channel. - * Extract that width here. - */ - if (chan->flags & IEEE80211_CHAN_1MHZ) - return NL80211_CHAN_WIDTH_1; - else if (chan->flags & IEEE80211_CHAN_2MHZ) - return NL80211_CHAN_WIDTH_2; - else if (chan->flags & IEEE80211_CHAN_4MHZ) - return NL80211_CHAN_WIDTH_4; - else if (chan->flags & IEEE80211_CHAN_8MHZ) - return NL80211_CHAN_WIDTH_8; - else if (chan->flags & IEEE80211_CHAN_16MHZ) - return NL80211_CHAN_WIDTH_16; - - pr_err("unknown channel width for channel at %dKHz?\n", - ieee80211_channel_to_khz(chan)); - - return NL80211_CHAN_WIDTH_1; -} -EXPORT_SYMBOL(ieee80211_s1g_channel_width); - int ieee80211_freq_khz_to_channel(u32 freq) { /* TODO: just handle MHz for now */ @@ -2584,7 +2557,7 @@ int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy, } } - return -ENOENT; + return -EINVAL; } EXPORT_SYMBOL(cfg80211_get_radio_idx_by_chan); @@ -2992,7 +2965,7 @@ bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio, u32 freq, width; freq = ieee80211_chandef_to_khz(chandef); - width = cfg80211_chandef_get_width(chandef); + width = MHZ_TO_KHZ(cfg80211_chandef_get_width(chandef)); if (!ieee80211_radio_freq_range_valid(radio, freq, width)) return false; diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 72e34bd2d925..7b0c68a70888 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -618,11 +618,16 @@ static void xsk_destruct_skb(struct sk_buff *skb) sock_wfree(skb); } -static void xsk_set_destructor_arg(struct sk_buff *skb, u64 addr) +static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs, + u64 addr) { BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb)); INIT_LIST_HEAD(&XSKCB(skb)->addrs_list); + skb->dev = xs->dev; + skb->priority = READ_ONCE(xs->sk.sk_priority); + skb->mark = READ_ONCE(xs->sk.sk_mark); XSKCB(skb)->num_descs = 0; + skb->destructor = xsk_destruct_skb; skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr; } @@ -652,6 +657,45 @@ static void xsk_drop_skb(struct sk_buff *skb) xsk_consume_skb(skb); } +static int xsk_skb_metadata(struct sk_buff *skb, void *buffer, + struct xdp_desc *desc, struct xsk_buff_pool *pool, + u32 hr) +{ + struct xsk_tx_metadata *meta = NULL; + + if (unlikely(pool->tx_metadata_len == 0)) + return -EINVAL; + + meta = buffer - pool->tx_metadata_len; + if (unlikely(!xsk_buff_valid_tx_metadata(meta))) + return -EINVAL; + + if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { + if (unlikely(meta->request.csum_start + + meta->request.csum_offset + + sizeof(__sum16) > desc->len)) + return -EINVAL; + + skb->csum_start = hr + meta->request.csum_start; + skb->csum_offset = meta->request.csum_offset; + skb->ip_summed = CHECKSUM_PARTIAL; + + if (unlikely(pool->tx_sw_csum)) { + int err; + + err = skb_checksum_help(skb); + if (err) + return err; + } + } + + if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) + skb->skb_mstamp_ns = meta->request.launch_time; + xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); + + return 0; +} + static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, struct xdp_desc *desc) { @@ -664,6 +708,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, int err, i; u64 addr; + addr = desc->addr; + buffer = xsk_buff_raw_get_data(pool, addr); + if (!skb) { hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom)); @@ -673,7 +720,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, skb_reserve(skb, hr); - xsk_set_destructor_arg(skb, desc->addr); + xsk_skb_init_misc(skb, xs, desc->addr); + if (desc->options & XDP_TX_METADATA) { + err = xsk_skb_metadata(skb, buffer, desc, pool, hr); + if (unlikely(err)) + return ERR_PTR(err); + } } else { xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL); if (!xsk_addr) @@ -687,11 +739,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); } - addr = desc->addr; len = desc->len; ts = pool->unaligned ? len : pool->chunk_size; - buffer = xsk_buff_raw_get_data(pool, addr); offset = offset_in_page(buffer); addr = buffer - pool->addrs; @@ -722,16 +772,15 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs, static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, struct xdp_desc *desc) { - struct xsk_tx_metadata *meta = NULL; struct net_device *dev = xs->dev; struct sk_buff *skb = xs->skb; - bool first_frag = false; int err; if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) { skb = xsk_build_skb_zerocopy(xs, desc); if (IS_ERR(skb)) { err = PTR_ERR(skb); + skb = NULL; goto free_err; } } else { @@ -742,8 +791,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, len = desc->len; if (!skb) { - first_frag = true; - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)); tr = dev->needed_tailroom; skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err); @@ -757,7 +804,13 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, if (unlikely(err)) goto free_err; - xsk_set_destructor_arg(skb, desc->addr); + xsk_skb_init_misc(skb, xs, desc->addr); + if (desc->options & XDP_TX_METADATA) { + err = xsk_skb_metadata(skb, buffer, desc, + xs->pool, hr); + if (unlikely(err)) + goto free_err; + } } else { int nr_frags = skb_shinfo(skb)->nr_frags; struct xsk_addr_node *xsk_addr; @@ -792,54 +845,14 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs, xsk_addr->addr = desc->addr; list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list); } - - if (first_frag && desc->options & XDP_TX_METADATA) { - if (unlikely(xs->pool->tx_metadata_len == 0)) { - err = -EINVAL; - goto free_err; - } - - meta = buffer - xs->pool->tx_metadata_len; - if (unlikely(!xsk_buff_valid_tx_metadata(meta))) { - err = -EINVAL; - goto free_err; - } - - if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) { - if (unlikely(meta->request.csum_start + - meta->request.csum_offset + - sizeof(__sum16) > len)) { - err = -EINVAL; - goto free_err; - } - - skb->csum_start = hr + meta->request.csum_start; - skb->csum_offset = meta->request.csum_offset; - skb->ip_summed = CHECKSUM_PARTIAL; - - if (unlikely(xs->pool->tx_sw_csum)) { - err = skb_checksum_help(skb); - if (err) - goto free_err; - } - } - - if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME) - skb->skb_mstamp_ns = meta->request.launch_time; - } } - skb->dev = dev; - skb->priority = READ_ONCE(xs->sk.sk_priority); - skb->mark = READ_ONCE(xs->sk.sk_mark); - skb->destructor = xsk_destruct_skb; - xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta); xsk_inc_num_desc(skb); return skb; free_err: - if (first_frag && skb) + if (skb && !skb_shinfo(skb)->nr_frags) kfree_skb(skb); if (err == -EOVERFLOW) { diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index c5035a9bc3bb..62486f866975 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -2594,7 +2594,7 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, static dscp_t xfrm_get_dscp(const struct flowi *fl, int family) { if (family == AF_INET) - return inet_dsfield_to_dscp(fl->u.ip4.flowi4_tos); + return fl->u.ip4.flowi4_dscp; return 0; } @@ -3462,7 +3462,7 @@ decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reve } fl4->flowi4_proto = flkeys->basic.ip_proto; - fl4->flowi4_tos = flkeys->ip.tos & ~INET_ECN_MASK; + fl4->flowi4_dscp = inet_dsfield_to_dscp(flkeys->ip.tos); } #if IS_ENABLED(CONFIG_IPV6) @@ -3594,7 +3594,7 @@ static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family, fl1->flowi_oif = fl->flowi_oif; fl1->flowi_mark = fl->flowi_mark; - fl1->flowi_tos = fl->flowi_tos; + fl1->flowi_dscp = fl->flowi_dscp; nf_nat_decode_session(newskb, fl1, family); ret = false; @@ -3881,12 +3881,18 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) } skb_dst_force(skb); - if (!skb_dst(skb)) { + dst = skb_dst(skb); + if (!dst) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } - dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); + /* ignore return value from skb_dstref_steal, xfrm_lookup takes + * care of dropping the refcnt if needed. + */ + skb_dstref_steal(skb); + + dst = xfrm_lookup(net, dst, &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { res = 0; dst = NULL; diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c index 8e07dd614b0b..5e1fd6b1d503 100644 --- a/net/xfrm/xfrm_proc.c +++ b/net/xfrm/xfrm_proc.c @@ -45,21 +45,21 @@ static const struct snmp_mib xfrm_mib_list[] = { SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR), SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR), SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE), - SNMP_MIB_SENTINEL }; static int xfrm_statistics_seq_show(struct seq_file *seq, void *v) { - unsigned long buff[LINUX_MIB_XFRMMAX]; + unsigned long buff[ARRAY_SIZE(xfrm_mib_list)]; + const int cnt = ARRAY_SIZE(xfrm_mib_list); struct net *net = seq->private; int i; - memset(buff, 0, sizeof(unsigned long) * LINUX_MIB_XFRMMAX); + memset(buff, 0, sizeof(buff)); xfrm_state_update_stats(net); - snmp_get_cpu_field_batch(buff, xfrm_mib_list, - net->mib.xfrm_statistics); - for (i = 0; xfrm_mib_list[i].name; i++) + snmp_get_cpu_field_batch_cnt(buff, xfrm_mib_list, cnt, + net->mib.xfrm_statistics); + for (i = 0; i < cnt; i++) seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name, buff[i]); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 684239018bec..010c9e6638c0 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -593,7 +593,7 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); *algpp = p; return 0; } @@ -620,7 +620,7 @@ static int attach_crypt(struct xfrm_state *x, struct nlattr *rta, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); x->ealg = p; x->geniv = algo->uinfo.encr.geniv; return 0; @@ -649,7 +649,7 @@ static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); p->alg_key_len = ualg->alg_key_len; p->alg_trunc_len = algo->uinfo.auth.icv_truncbits; memcpy(p->alg_key, ualg->alg_key, (ualg->alg_key_len + 7) / 8); @@ -684,7 +684,7 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); if (!p->alg_trunc_len) p->alg_trunc_len = algo->uinfo.auth.icv_truncbits; @@ -714,7 +714,7 @@ static int attach_aead(struct xfrm_state *x, struct nlattr *rta, if (!p) return -ENOMEM; - strcpy(p->alg_name, algo->name); + strscpy(p->alg_name, algo->name); x->aead = p; x->geniv = algo->uinfo.aead.geniv; return 0; |