Diffstat (limited to 'net')
-rw-r--r--  net/Kconfig | 2
-rw-r--r--  net/Makefile | 1
-rw-r--r--  net/batman-adv/Kconfig | 13
-rw-r--r--  net/batman-adv/Makefile | 1
-rw-r--r--  net/batman-adv/bat_iv_ogm.c | 5
-rw-r--r--  net/batman-adv/bridge_loop_avoidance.c | 34
-rw-r--r--  net/batman-adv/hard-interface.c | 1
-rw-r--r--  net/batman-adv/hard-interface.h | 1
-rw-r--r--  net/batman-adv/log.h | 3
-rw-r--r--  net/batman-adv/main.c | 50
-rw-r--r--  net/batman-adv/main.h | 5
-rw-r--r--  net/batman-adv/mesh-interface.c | 15
-rw-r--r--  net/batman-adv/mesh-interface.h | 1
-rw-r--r--  net/batman-adv/netlink.c | 17
-rw-r--r--  net/batman-adv/netlink.h | 1
-rw-r--r--  net/batman-adv/network-coding.c | 1878
-rw-r--r--  net/batman-adv/network-coding.h | 106
-rw-r--r--  net/batman-adv/originator.c | 6
-rw-r--r--  net/batman-adv/routing.c | 9
-rw-r--r--  net/batman-adv/send.c | 16
-rw-r--r--  net/batman-adv/translation-table.c | 4
-rw-r--r--  net/batman-adv/types.h | 216
-rw-r--r--  net/bluetooth/hci_conn.c | 27
-rw-r--r--  net/bluetooth/hci_core.c | 52
-rw-r--r--  net/bluetooth/hci_event.c | 16
-rw-r--r--  net/bluetooth/hci_sync.c | 10
-rw-r--r--  net/bluetooth/iso.c | 34
-rw-r--r--  net/bluetooth/mgmt.c | 10
-rw-r--r--  net/bluetooth/mgmt_config.c | 4
-rw-r--r--  net/bluetooth/sco.c | 7
-rw-r--r--  net/bridge/br.c | 27
-rw-r--r--  net/bridge/br_cfm.c | 6
-rw-r--r--  net/bridge/br_fdb.c | 114
-rw-r--r--  net/bridge/br_forward.c | 3
-rw-r--r--  net/bridge/br_input.c | 8
-rw-r--r--  net/bridge/br_mrp.c | 8
-rw-r--r--  net/bridge/br_multicast.c | 9
-rw-r--r--  net/bridge/br_private.h | 3
-rw-r--r--  net/bridge/br_vlan.c | 10
-rw-r--r--  net/bridge/netfilter/ebtables.c | 14
-rw-r--r--  net/bridge/netfilter/nft_meta_bridge.c | 11
-rw-r--r--  net/caif/cfctrl.c | 4
-rw-r--r--  net/can/af_can.c | 2
-rw-r--r--  net/can/isotp.c | 2
-rw-r--r--  net/can/raw.c | 67
-rw-r--r--  net/ceph/messenger.c | 3
-rw-r--r--  net/ceph/mon_client.c | 2
-rw-r--r--  net/core/Makefile | 1
-rw-r--r--  net/core/datagram.c | 2
-rw-r--r--  net/core/dev.c | 156
-rw-r--r--  net/core/dev.h | 2
-rw-r--r--  net/core/devmem.c | 8
-rw-r--r--  net/core/devmem.h | 2
-rw-r--r--  net/core/dst.c | 2
-rw-r--r--  net/core/filter.c | 9
-rw-r--r--  net/core/gro.c | 2
-rw-r--r--  net/core/link_watch.c | 4
-rw-r--r--  net/core/lwt_bpf.c | 4
-rw-r--r--  net/core/net-procfs.c | 3
-rw-r--r--  net/core/net-sysfs.c | 4
-rw-r--r--  net/core/netdev-genl.c | 122
-rw-r--r--  net/core/netdev_queues.c | 27
-rw-r--r--  net/core/netdev_rx_queue.c | 9
-rw-r--r--  net/core/netpoll.c | 3
-rw-r--r--  net/core/page_pool.c | 12
-rw-r--r--  net/core/pktgen.c | 7
-rw-r--r--  net/core/request_sock.c | 4
-rw-r--r--  net/core/rtnetlink.c | 12
-rw-r--r--  net/core/scm.c | 4
-rw-r--r--  net/core/skbuff.c | 33
-rw-r--r--  net/core/skmsg.c | 2
-rw-r--r--  net/core/sock.c | 94
-rw-r--r--  net/core/sock_diag.c | 2
-rw-r--r--  net/core/xdp.c | 21
-rw-r--r--  net/devlink/core.c | 2
-rw-r--r--  net/devlink/health.c | 109
-rw-r--r--  net/devlink/netlink_gen.c | 5
-rw-r--r--  net/devlink/param.c | 10
-rw-r--r--  net/devlink/port.c | 33
-rw-r--r--  net/ethernet/eth.c | 5
-rw-r--r--  net/ethtool/Makefile | 2
-rw-r--r--  net/ethtool/common.c | 20
-rw-r--r--  net/ethtool/common.h | 2
-rw-r--r--  net/ethtool/fec.c | 75
-rw-r--r--  net/ethtool/ioctl.c | 94
-rw-r--r--  net/ethtool/rss.c | 42
-rw-r--r--  net/ethtool/tsconfig.c | 12
-rw-r--r--  net/hsr/hsr_slave.c | 5
-rw-r--r--  net/ipv4/af_inet.c | 12
-rw-r--r--  net/ipv4/arp.c | 2
-rw-r--r--  net/ipv4/cipso_ipv4.c | 13
-rw-r--r--  net/ipv4/esp4.c | 4
-rw-r--r--  net/ipv4/fib_frontend.c | 7
-rw-r--r--  net/ipv4/fib_rules.c | 4
-rw-r--r--  net/ipv4/fou_core.c | 32
-rw-r--r--  net/ipv4/fou_nl.c | 4
-rw-r--r--  net/ipv4/icmp.c | 33
-rw-r--r--  net/ipv4/inet_connection_sock.c | 42
-rw-r--r--  net/ipv4/inet_diag.c | 570
-rw-r--r--  net/ipv4/inet_fragment.c | 2
-rw-r--r--  net/ipv4/inet_hashtables.c | 108
-rw-r--r--  net/ipv4/inet_timewait_sock.c | 11
-rw-r--r--  net/ipv4/ip_fragment.c | 6
-rw-r--r--  net/ipv4/ip_gre.c | 4
-rw-r--r--  net/ipv4/ip_input.c | 40
-rw-r--r--  net/ipv4/ip_options.c | 5
-rw-r--r--  net/ipv4/ip_output.c | 8
-rw-r--r--  net/ipv4/ipmr.c | 9
-rw-r--r--  net/ipv4/netfilter.c | 9
-rw-r--r--  net/ipv4/netfilter/ipt_rpfilter.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_dup_ipv4.c | 4
-rw-r--r--  net/ipv4/netfilter/nf_reject_ipv4.c | 52
-rw-r--r--  net/ipv4/netfilter/nf_socket_ipv4.c | 3
-rw-r--r--  net/ipv4/netfilter/nf_tproxy_ipv4.c | 5
-rw-r--r--  net/ipv4/netfilter/nft_fib_ipv4.c | 4
-rw-r--r--  net/ipv4/nexthop.c | 42
-rw-r--r--  net/ipv4/ping.c | 68
-rw-r--r--  net/ipv4/proc.c | 65
-rw-r--r--  net/ipv4/raw.c | 7
-rw-r--r--  net/ipv4/raw_diag.c | 10
-rw-r--r--  net/ipv4/route.c | 28
-rw-r--r--  net/ipv4/syncookies.c | 4
-rw-r--r--  net/ipv4/sysctl_net_ipv4.c | 19
-rw-r--r--  net/ipv4/tcp.c | 100
-rw-r--r--  net/ipv4/tcp_ao.c | 5
-rw-r--r--  net/ipv4/tcp_cdg.c | 2
-rw-r--r--  net/ipv4/tcp_diag.c | 461
-rw-r--r--  net/ipv4/tcp_fastopen.c | 7
-rw-r--r--  net/ipv4/tcp_input.c | 395
-rw-r--r--  net/ipv4/tcp_ipv4.c | 89
-rw-r--r--  net/ipv4/tcp_metrics.c | 6
-rw-r--r--  net/ipv4/tcp_minisocks.c | 80
-rw-r--r--  net/ipv4/tcp_offload.c | 4
-rw-r--r--  net/ipv4/tcp_output.c | 332
-rw-r--r--  net/ipv4/tcp_timer.c | 6
-rw-r--r--  net/ipv4/udp.c | 171
-rw-r--r--  net/ipv4/udp_diag.c | 10
-rw-r--r--  net/ipv4/udp_offload.c | 2
-rw-r--r--  net/ipv4/udp_tunnel_core.c | 3
-rw-r--r--  net/ipv4/udp_tunnel_nic.c | 2
-rw-r--r--  net/ipv4/xfrm4_policy.c | 4
-rw-r--r--  net/ipv6/Kconfig | 7
-rw-r--r--  net/ipv6/addrconf.c | 4
-rw-r--r--  net/ipv6/af_inet6.c | 2
-rw-r--r--  net/ipv6/ah6.c | 50
-rw-r--r--  net/ipv6/anycast.c | 2
-rw-r--r--  net/ipv6/datagram.c | 2
-rw-r--r--  net/ipv6/esp6.c | 4
-rw-r--r--  net/ipv6/icmp.c | 9
-rw-r--r--  net/ipv6/inet6_connection_sock.c | 2
-rw-r--r--  net/ipv6/inet6_hashtables.c | 62
-rw-r--r--  net/ipv6/ip6_gre.c | 10
-rw-r--r--  net/ipv6/ip6_output.c | 70
-rw-r--r--  net/ipv6/ipv6_sockglue.c | 6
-rw-r--r--  net/ipv6/mcast.c | 67
-rw-r--r--  net/ipv6/ndisc.c | 4
-rw-r--r--  net/ipv6/netfilter.c | 5
-rw-r--r--  net/ipv6/netfilter/nf_reject_ipv6.c | 67
-rw-r--r--  net/ipv6/netfilter/nf_socket_ipv6.c | 3
-rw-r--r--  net/ipv6/netfilter/nf_tproxy_ipv6.c | 5
-rw-r--r--  net/ipv6/output_core.c | 8
-rw-r--r--  net/ipv6/ping.c | 1
-rw-r--r--  net/ipv6/proc.c | 91
-rw-r--r--  net/ipv6/raw.c | 11
-rw-r--r--  net/ipv6/route.c | 14
-rw-r--r--  net/ipv6/seg6.c | 7
-rw-r--r--  net/ipv6/seg6_hmac.c | 211
-rw-r--r--  net/ipv6/sit.c | 104
-rw-r--r--  net/ipv6/syncookies.c | 2
-rw-r--r--  net/ipv6/tcp_ipv6.c | 56
-rw-r--r--  net/ipv6/tcpv6_offload.c | 3
-rw-r--r--  net/ipv6/udp.c | 19
-rw-r--r--  net/ipv6/udp_offload.c | 2
-rw-r--r--  net/iucv/af_iucv.c | 4
-rw-r--r--  net/mac80211/cfg.c | 186
-rw-r--r--  net/mac80211/chan.c | 11
-rw-r--r--  net/mac80211/debugfs.c | 3
-rw-r--r--  net/mac80211/debugfs_netdev.c | 2
-rw-r--r--  net/mac80211/debugfs_sta.c | 2
-rw-r--r--  net/mac80211/ethtool.c | 6
-rw-r--r--  net/mac80211/ieee80211_i.h | 17
-rw-r--r--  net/mac80211/iface.c | 25
-rw-r--r--  net/mac80211/main.c | 22
-rw-r--r--  net/mac80211/mesh.c | 3
-rw-r--r--  net/mac80211/mesh_ps.c | 2
-rw-r--r--  net/mac80211/mlme.c | 91
-rw-r--r--  net/mac80211/offchannel.c | 5
-rw-r--r--  net/mac80211/rate.c | 11
-rw-r--r--  net/mac80211/rx.c | 40
-rw-r--r--  net/mac80211/scan.c | 13
-rw-r--r--  net/mac80211/sta_info.c | 15
-rw-r--r--  net/mac80211/status.c | 21
-rw-r--r--  net/mac80211/tests/Makefile | 2
-rw-r--r--  net/mac80211/tests/s1g_tim.c | 356
-rw-r--r--  net/mac80211/tx.c | 187
-rw-r--r--  net/mac80211/util.c | 67
-rw-r--r--  net/mctp/af_mctp.c | 2
-rw-r--r--  net/mptcp/crypto.c | 35
-rw-r--r--  net/mptcp/ctrl.c | 9
-rw-r--r--  net/mptcp/mib.c | 12
-rw-r--r--  net/mptcp/mptcp_diag.c | 15
-rw-r--r--  net/mptcp/pm.c | 60
-rw-r--r--  net/mptcp/pm_kernel.c | 569
-rw-r--r--  net/mptcp/pm_netlink.c | 11
-rw-r--r--  net/mptcp/pm_userspace.c | 2
-rw-r--r--  net/mptcp/protocol.c | 218
-rw-r--r--  net/mptcp/protocol.h | 29
-rw-r--r--  net/mptcp/sockopt.c | 22
-rw-r--r--  net/mptcp/subflow.c | 11
-rw-r--r--  net/netfilter/ipset/ip_set_hash_gen.h | 8
-rw-r--r--  net/netfilter/ipvs/ip_vs_conn.c | 4
-rw-r--r--  net/netfilter/ipvs/ip_vs_core.c | 11
-rw-r--r--  net/netfilter/ipvs/ip_vs_ctl.c | 6
-rw-r--r--  net/netfilter/ipvs/ip_vs_est.c | 16
-rw-r--r--  net/netfilter/ipvs/ip_vs_ftp.c | 4
-rw-r--r--  net/netfilter/nf_conntrack_ecache.c | 2
-rw-r--r--  net/netfilter/nf_conntrack_netlink.c | 39
-rw-r--r--  net/netfilter/nf_conntrack_standalone.c | 3
-rw-r--r--  net/netfilter/nf_tables_api.c | 47
-rw-r--r--  net/netfilter/nfnetlink.c | 2
-rw-r--r--  net/netfilter/nft_flow_offload.c | 4
-rw-r--r--  net/netfilter/nft_payload.c | 20
-rw-r--r--  net/netfilter/nft_set_hash.c | 100
-rw-r--r--  net/netfilter/nft_set_pipapo.c | 96
-rw-r--r--  net/netfilter/nft_set_pipapo.h | 8
-rw-r--r--  net/netfilter/nft_set_pipapo_avx2.c | 142
-rw-r--r--  net/netfilter/nft_set_pipapo_avx2.h | 4
-rw-r--r--  net/netfilter/nft_set_rbtree.c | 35
-rw-r--r--  net/netlink/af_netlink.c | 4
-rw-r--r--  net/nfc/nci/ntf.c | 135
-rw-r--r--  net/openvswitch/dp_notify.c | 2
-rw-r--r--  net/openvswitch/flow.c | 12
-rw-r--r--  net/openvswitch/flow_table.c | 7
-rw-r--r--  net/packet/af_packet.c | 134
-rw-r--r--  net/packet/diag.c | 2
-rw-r--r--  net/packet/internal.h | 14
-rw-r--r--  net/phonet/af_phonet.c | 4
-rw-r--r--  net/phonet/pep.c | 6
-rw-r--r--  net/phonet/socket.c | 25
-rw-r--r--  net/psp/Kconfig | 15
-rw-r--r--  net/psp/Makefile | 5
-rw-r--r--  net/psp/psp-nl-gen.c | 119
-rw-r--r--  net/psp/psp-nl-gen.h | 39
-rw-r--r--  net/psp/psp.h | 54
-rw-r--r--  net/psp/psp_main.c | 322
-rw-r--r--  net/psp/psp_nl.c | 505
-rw-r--r--  net/psp/psp_sock.c | 292
-rw-r--r--  net/rds/af_rds.c | 2
-rw-r--r--  net/rds/connection.c | 9
-rw-r--r--  net/rds/ib_mr.h | 1
-rw-r--r--  net/rds/ib_rdma.c | 3
-rw-r--r--  net/rds/ib_recv.c | 2
-rw-r--r--  net/rds/message.c | 4
-rw-r--r--  net/rds/rds.h | 2
-rw-r--r--  net/rds/recv.c | 4
-rw-r--r--  net/rds/send.c | 4
-rw-r--r--  net/rfkill/input.c | 2
-rw-r--r--  net/rxrpc/rxperf.c | 2
-rw-r--r--  net/sched/act_api.c | 12
-rw-r--r--  net/sched/act_simple.c | 1
-rw-r--r--  net/sched/act_skbmod.c | 22
-rw-r--r--  net/sched/act_tunnel_key.c | 16
-rw-r--r--  net/sched/act_vlan.c | 16
-rw-r--r--  net/sched/sch_api.c | 4
-rw-r--r--  net/sctp/Kconfig | 47
-rw-r--r--  net/sctp/auth.c | 166
-rw-r--r--  net/sctp/chunk.c | 3
-rw-r--r--  net/sctp/diag.c | 2
-rw-r--r--  net/sctp/endpointola.c | 23
-rw-r--r--  net/sctp/proc.c | 12
-rw-r--r--  net/sctp/protocol.c | 14
-rw-r--r--  net/sctp/sm_make_chunk.c | 60
-rw-r--r--  net/sctp/sm_statefuns.c | 5
-rw-r--r--  net/sctp/socket.c | 41
-rw-r--r--  net/sctp/sysctl.c | 49
-rw-r--r--  net/smc/Kconfig | 16
-rw-r--r--  net/smc/Makefile | 1
-rw-r--r--  net/smc/af_smc.c | 30
-rw-r--r--  net/smc/smc_clc.c | 73
-rw-r--r--  net/smc/smc_core.c | 37
-rw-r--r--  net/smc/smc_core.h | 5
-rw-r--r--  net/smc/smc_diag.c | 2
-rw-r--r--  net/smc/smc_ib.c | 18
-rw-r--r--  net/smc/smc_ism.c | 233
-rw-r--r--  net/smc/smc_ism.h | 36
-rw-r--r--  net/smc/smc_loopback.c | 425
-rw-r--r--  net/smc/smc_loopback.h | 60
-rw-r--r--  net/smc/smc_pnet.c | 70
-rw-r--r--  net/smc/smc_tx.c | 3
-rw-r--r--  net/socket.c | 35
-rw-r--r--  net/tipc/addr.c | 6
-rw-r--r--  net/tipc/addr.h | 2
-rw-r--r--  net/tipc/link.c | 9
-rw-r--r--  net/tipc/socket.c | 6
-rw-r--r--  net/tls/tls.h | 3
-rw-r--r--  net/tls/tls_device.c | 20
-rw-r--r--  net/tls/tls_proc.c | 10
-rw-r--r--  net/unix/garbage.c | 2
-rw-r--r--  net/vmw_vsock/af_vsock.c | 9
-rw-r--r--  net/vmw_vsock/virtio_transport.c | 2
-rw-r--r--  net/vmw_vsock/vsock_loopback.c | 2
-rw-r--r--  net/wireless/chan.c | 103
-rw-r--r--  net/wireless/core.c | 9
-rw-r--r--  net/wireless/ethtool.c | 2
-rw-r--r--  net/wireless/nl80211.c | 805
-rw-r--r--  net/wireless/reg.c | 76
-rw-r--r--  net/wireless/scan.c | 9
-rw-r--r--  net/wireless/trace.h | 91
-rw-r--r--  net/wireless/util.c | 31
-rw-r--r--  net/xdp/xsk.c | 113
-rw-r--r--  net/xfrm/xfrm_policy.c | 16
-rw-r--r--  net/xfrm/xfrm_proc.c | 12
-rw-r--r--  net/xfrm/xfrm_user.c | 10
313 files changed, 8055 insertions, 6832 deletions
diff --git a/net/Kconfig b/net/Kconfig
index d5865cf19799..1d3f757d4b07 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -82,11 +82,13 @@ config NET_CRC32C
menu "Networking options"
source "net/packet/Kconfig"
+source "net/psp/Kconfig"
source "net/unix/Kconfig"
source "net/tls/Kconfig"
source "net/xfrm/Kconfig"
source "net/iucv/Kconfig"
source "net/smc/Kconfig"
+source "drivers/dibs/Kconfig"
source "net/xdp/Kconfig"
config NET_HANDSHAKE
diff --git a/net/Makefile b/net/Makefile
index aac960c41db6..90e3d72bf58b 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_INET) += ipv4/
obj-$(CONFIG_TLS) += tls/
obj-$(CONFIG_XFRM) += xfrm/
obj-$(CONFIG_UNIX) += unix/
+obj-$(CONFIG_INET_PSP) += psp/
obj-y += ipv6/
obj-$(CONFIG_PACKET) += packet/
obj-$(CONFIG_NET_KEY) += key/
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index 20b316207f9a..c299e2bc87ed 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -53,19 +53,6 @@ config BATMAN_ADV_DAT
mesh networks. If you think that your network does not need
this option you can safely remove it and save some space.
-config BATMAN_ADV_NC
- bool "Network Coding"
- depends on BATMAN_ADV
- help
- This option enables network coding, a mechanism that aims to
- increase the overall network throughput by fusing multiple
- packets in one transmission.
- Note that interfaces controlled by batman-adv must be manually
- configured to have promiscuous mode enabled in order to make
- network coding work.
- If you think that your network does not need this feature you
- can safely disable it and save some space.
-
config BATMAN_ADV_MCAST
bool "Multicast optimisation"
depends on BATMAN_ADV && INET && !(BRIDGE=m && BATMAN_ADV=y)
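
The option removed above enabled batman-adv's inter-flow network coding: a relay XORs two unicast payloads travelling in opposite directions into a single coded transmission, and each receiver cancels the copy it already overheard (which is why the help text demands promiscuous mode) to recover the packet addressed to it. Below is a minimal userspace sketch of that XOR step, mirroring the semantics of the batadv_nc_memxor() helper that this series also deletes; the buffers and their contents are purely illustrative, not taken from the patch.

#include <stdio.h>
#include <string.h>

/* XOR src into dst, as the removed batadv_nc_memxor() did; the shorter
 * payload is effectively zero-padded to the longer one's length.
 */
static void memxor(unsigned char *dst, const unsigned char *src, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	unsigned char a[8] = "packetA";	/* flow 1, overheard by node B */
	unsigned char b[8] = "packetB";	/* flow 2, overheard by node A */
	unsigned char coded[8];

	/* relay: one coded transmission instead of two unicasts */
	memcpy(coded, a, sizeof(coded));
	memxor(coded, b, sizeof(coded));

	/* a receiver that already holds 'a' recovers 'b' (and vice versa) */
	memxor(coded, a, sizeof(coded));
	printf("recovered: %s\n", coded);	/* prints "packetB" */
	return 0;
}
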
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index 1cc9be6de456..d3c4d4143c14 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -23,7 +23,6 @@ batman-adv-y += mesh-interface.o
batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast.o
batman-adv-$(CONFIG_BATMAN_ADV_MCAST) += multicast_forw.o
batman-adv-y += netlink.o
-batman-adv-$(CONFIG_BATMAN_ADV_NC) += network-coding.o
batman-adv-y += originator.o
batman-adv-y += routing.o
batman-adv-y += send.o
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 54fe38b3b2fd..b75c2228e69a 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -52,7 +52,6 @@
#include "hash.h"
#include "log.h"
#include "netlink.h"
-#include "network-coding.h"
#include "originator.h"
#include "routing.h"
#include "send.h"
@@ -1406,10 +1405,6 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
if (!orig_neigh_node)
goto out;
- /* Update nc_nodes of the originator */
- batadv_nc_update_nc_node(bat_priv, orig_node, orig_neigh_node,
- ogm_packet, is_single_hop_neigh);
-
orig_neigh_router = batadv_orig_router_get(orig_neigh_node,
if_outgoing);
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 747755647c6a..b992ba12aa24 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -12,6 +12,7 @@
#include <linux/compiler.h>
#include <linux/container_of.h>
#include <linux/crc16.h>
+#include <linux/crc32.h>
#include <linux/err.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
@@ -1585,6 +1586,39 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
}
/**
+ * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in
+ * the header
+ * @skb: skb pointing to fragmented socket buffers
+ * @payload_ptr: Pointer to position inside the head buffer of the skb
+ * marking the start of the data to be CRC'ed
+ *
+ * payload_ptr must always point to an address in the skb head buffer and not to
+ * a fragment.
+ *
+ * Return: big endian crc32c of the checksummed data
+ */
+static __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr)
+{
+ unsigned int to = skb->len;
+ unsigned int consumed = 0;
+ struct skb_seq_state st;
+ unsigned int from;
+ unsigned int len;
+ const u8 *data;
+ u32 crc = 0;
+
+ from = (unsigned int)(payload_ptr - skb->data);
+
+ skb_prepare_seq_read(skb, from, to, &st);
+ while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
+ crc = crc32c(crc, data, len);
+ consumed += len;
+ }
+
+ return htonl(crc);
+}
+
+/**
* batadv_bla_check_duplist() - Check if a frame is in the broadcast dup.
* @bat_priv: the bat priv with all the mesh interface information
* @skb: contains the multicast packet to be checked
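
The batadv_skb_crc32() helper moved in above walks a possibly fragmented skb with skb_prepare_seq_read()/skb_seq_read() and folds each chunk into a running crc32c() value, so the result equals the CRC of the linearized payload. The following self-contained sketch demonstrates that chunk-chaining property under the assumption of a raw CRC32C register (reflected polynomial 0x82F63B78, zero seed, no final inversion); crc32c_step is a hypothetical stand-in, and the kernel's crc32c() may apply different pre/post-conditioning internally.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Raw bitwise CRC32C update: no pre/post inversion, so calls can be
 * chained chunk by chunk exactly like the loop in batadv_skb_crc32().
 */
static uint32_t crc32c_step(uint32_t crc, const uint8_t *data, size_t len)
{
	while (len--) {
		crc ^= *data++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78u & -(crc & 1u));
	}
	return crc;
}

int main(void)
{
	const uint8_t payload[] = "batman-adv payload";
	size_t mid = 7;	/* pretend the skb is split into two fragments */

	uint32_t whole = crc32c_step(0, payload, sizeof(payload));
	uint32_t chunked = crc32c_step(0, payload, mid);

	chunked = crc32c_step(chunked, payload + mid, sizeof(payload) - mid);

	/* both values match: chunked CRC == CRC of the linear buffer */
	printf("%08x %08x\n", (unsigned)whole, (unsigned)chunked);
	return 0;
}

Returning htonl(crc) at the end, as the kernel helper does, merely fixes the on-wire byte order of the final value.
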
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index bace57e4f9a5..5113f879736b 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -22,6 +22,7 @@
#include <linux/minmax.h>
#include <linux/mutex.h>
#include <linux/netdevice.h>
+#include <linux/notifier.h>
#include <linux/printk.h>
#include <linux/rculist.h>
#include <linux/rtnetlink.h>
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index 262a78364742..9db8a310961e 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -12,7 +12,6 @@
#include <linux/compiler.h>
#include <linux/kref.h>
#include <linux/netdevice.h>
-#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/stddef.h>
#include <linux/types.h>
diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h
index 567afaa8df99..225b747a2048 100644
--- a/net/batman-adv/log.h
+++ b/net/batman-adv/log.h
@@ -51,9 +51,6 @@ enum batadv_dbg_level {
/** @BATADV_DBG_DAT: ARP snooping and DAT related messages */
BATADV_DBG_DAT = BIT(4),
- /** @BATADV_DBG_NC: network coding related messages */
- BATADV_DBG_NC = BIT(5),
-
/** @BATADV_DBG_MCAST: multicast related messages */
BATADV_DBG_MCAST = BIT(6),
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index 20346d7b6b69..3a35aadd8b41 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -11,7 +11,6 @@
#include <linux/build_bug.h>
#include <linux/byteorder/generic.h>
#include <linux/container_of.h>
-#include <linux/crc32.h>
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/gfp.h>
@@ -53,7 +52,6 @@
#include "mesh-interface.h"
#include "multicast.h"
#include "netlink.h"
-#include "network-coding.h"
#include "originator.h"
#include "routing.h"
#include "send.h"
@@ -103,7 +101,6 @@ static int __init batadv_init(void)
batadv_v_init();
batadv_iv_init();
- batadv_nc_init();
batadv_tp_meter_init();
batadv_event_workqueue = create_singlethread_workqueue("bat_events");
@@ -218,12 +215,6 @@ int batadv_mesh_init(struct net_device *mesh_iface)
goto err_dat;
}
- ret = batadv_nc_mesh_init(bat_priv);
- if (ret < 0) {
- atomic_set(&bat_priv->mesh_state, BATADV_MESH_DEACTIVATING);
- goto err_nc;
- }
-
batadv_gw_init(bat_priv);
batadv_mcast_init(bat_priv);
@@ -232,8 +223,6 @@ int batadv_mesh_init(struct net_device *mesh_iface)
return 0;
-err_nc:
- batadv_dat_free(bat_priv);
err_dat:
batadv_bla_free(bat_priv);
err_bla:
@@ -264,7 +253,6 @@ void batadv_mesh_free(struct net_device *mesh_iface)
batadv_gw_node_free(bat_priv);
batadv_v_mesh_free(bat_priv);
- batadv_nc_mesh_free(bat_priv);
batadv_dat_free(bat_priv);
batadv_bla_free(bat_priv);
@@ -336,11 +324,6 @@ int batadv_max_header_len(void)
header_len = max_t(int, header_len,
sizeof(struct batadv_bcast_packet));
-#ifdef CONFIG_BATMAN_ADV_NC
- header_len = max_t(int, header_len,
- sizeof(struct batadv_coded_packet));
-#endif
-
return header_len + ETH_HLEN;
}
@@ -578,39 +561,6 @@ void batadv_recv_handler_unregister(u8 packet_type)
}
/**
- * batadv_skb_crc32() - calculate CRC32 of the whole packet and skip bytes in
- * the header
- * @skb: skb pointing to fragmented socket buffers
- * @payload_ptr: Pointer to position inside the head buffer of the skb
- * marking the start of the data to be CRC'ed
- *
- * payload_ptr must always point to an address in the skb head buffer and not to
- * a fragment.
- *
- * Return: big endian crc32c of the checksummed data
- */
-__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr)
-{
- u32 crc = 0;
- unsigned int from;
- unsigned int to = skb->len;
- struct skb_seq_state st;
- const u8 *data;
- unsigned int len;
- unsigned int consumed = 0;
-
- from = (unsigned int)(payload_ptr - skb->data);
-
- skb_prepare_seq_read(skb, from, to, &st);
- while ((len = skb_seq_read(consumed, &data, &st)) != 0) {
- crc = crc32c(crc, data, len);
- consumed += len;
- }
-
- return htonl(crc);
-}
-
-/**
* batadv_get_vid() - extract the VLAN identifier from skb if any
* @skb: the buffer containing the packet
* @header_len: length of the batman header preceding the ethernet header
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 1481eb2bacee..2be1ac17acaa 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -13,7 +13,7 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2025.3"
+#define BATADV_SOURCE_VERSION "2025.4"
#endif
/* B.A.T.M.A.N. parameters */
@@ -121,8 +121,6 @@
#define BATADV_RESET_PROTECTION_MS 30000
#define BATADV_EXPECTED_SEQNO_RANGE 65536
-#define BATADV_NC_NODE_TIMEOUT 10000 /* Milliseconds */
-
/**
* BATADV_TP_MAX_NUM - maximum number of simultaneously active tp sessions
*/
@@ -250,7 +248,6 @@ batadv_recv_handler_register(u8 packet_type,
int (*recv_handler)(struct sk_buff *,
struct batadv_hard_iface *));
void batadv_recv_handler_unregister(u8 packet_type);
-__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr);
/**
* batadv_compare_eth() - Compare two not u16 aligned Ethernet addresses
diff --git a/net/batman-adv/mesh-interface.c b/net/batman-adv/mesh-interface.c
index de2c2d9c6e4d..df7e95811ef5 100644
--- a/net/batman-adv/mesh-interface.c
+++ b/net/batman-adv/mesh-interface.c
@@ -37,6 +37,7 @@
#include <linux/string.h>
#include <linux/types.h>
#include <net/netlink.h>
+#include <net/rtnetlink.h>
#include <uapi/linux/batadv_packet.h>
#include <uapi/linux/batman_adv.h>
@@ -46,7 +47,6 @@
#include "gateway_client.h"
#include "hard-interface.h"
#include "multicast.h"
-#include "network-coding.h"
#include "send.h"
#include "translation-table.h"
@@ -802,8 +802,6 @@ static int batadv_meshif_init_late(struct net_device *dev)
bat_priv->primary_if = NULL;
- batadv_nc_init_bat_priv(bat_priv);
-
if (!bat_priv->algo_ops) {
ret = batadv_algo_select(bat_priv, batadv_routing_algo);
if (ret < 0)
@@ -947,17 +945,6 @@ static const struct {
{ "dat_put_rx" },
{ "dat_cached_reply_tx" },
#endif
-#ifdef CONFIG_BATMAN_ADV_NC
- { "nc_code" },
- { "nc_code_bytes" },
- { "nc_recode" },
- { "nc_recode_bytes" },
- { "nc_buffer" },
- { "nc_decode" },
- { "nc_decode_bytes" },
- { "nc_decode_failed" },
- { "nc_sniffed" },
-#endif
};
static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data)
diff --git a/net/batman-adv/mesh-interface.h b/net/batman-adv/mesh-interface.h
index 7ba055b2bc26..53756c5a45e0 100644
--- a/net/batman-adv/mesh-interface.h
+++ b/net/batman-adv/mesh-interface.h
@@ -13,7 +13,6 @@
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/types.h>
-#include <net/rtnetlink.h>
int batadv_skb_head_push(struct sk_buff *skb, unsigned int len);
void batadv_interface_rx(struct net_device *mesh_iface,
diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c
index beb181b3a7d8..78c651f634cd 100644
--- a/net/batman-adv/netlink.c
+++ b/net/batman-adv/netlink.c
@@ -44,7 +44,6 @@
#include "log.h"
#include "mesh-interface.h"
#include "multicast.h"
-#include "network-coding.h"
#include "originator.h"
#include "tp_meter.h"
#include "translation-table.h"
@@ -144,7 +143,6 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = {
[BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 },
[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 },
[BATADV_ATTR_MULTICAST_FANOUT] = { .type = NLA_U32 },
- [BATADV_ATTR_NETWORK_CODING_ENABLED] = { .type = NLA_U8 },
[BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 },
[BATADV_ATTR_ELP_INTERVAL] = { .type = NLA_U32 },
[BATADV_ATTR_THROUGHPUT_OVERRIDE] = { .type = NLA_U32 },
@@ -345,12 +343,6 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg,
goto nla_put_failure;
#endif /* CONFIG_BATMAN_ADV_MCAST */
-#ifdef CONFIG_BATMAN_ADV_NC
- if (nla_put_u8(msg, BATADV_ATTR_NETWORK_CODING_ENABLED,
- !!atomic_read(&bat_priv->network_coding)))
- goto nla_put_failure;
-#endif /* CONFIG_BATMAN_ADV_NC */
-
if (nla_put_u32(msg, BATADV_ATTR_ORIG_INTERVAL,
atomic_read(&bat_priv->orig_interval)))
goto nla_put_failure;
@@ -588,15 +580,6 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info)
}
#endif /* CONFIG_BATMAN_ADV_MCAST */
-#ifdef CONFIG_BATMAN_ADV_NC
- if (info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]) {
- attr = info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED];
-
- atomic_set(&bat_priv->network_coding, !!nla_get_u8(attr));
- batadv_nc_status_update(bat_priv->mesh_iface);
- }
-#endif /* CONFIG_BATMAN_ADV_NC */
-
if (info->attrs[BATADV_ATTR_ORIG_INTERVAL]) {
u32 orig_interval;
diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h
index fe4548b974bb..4eae9e5ff135 100644
--- a/net/batman-adv/netlink.h
+++ b/net/batman-adv/netlink.h
@@ -11,7 +11,6 @@
#include <linux/netlink.h>
#include <linux/types.h>
-#include <net/genetlink.h>
void batadv_netlink_register(void);
void batadv_netlink_unregister(void);
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
deleted file mode 100644
index af97d077369f..000000000000
--- a/net/batman-adv/network-coding.c
+++ /dev/null
@@ -1,1878 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/* Copyright (C) B.A.T.M.A.N. contributors:
- *
- * Martin Hundebøll, Jeppe Ledet-Pedersen
- */
-
-#include "network-coding.h"
-#include "main.h"
-
-#include <linux/atomic.h>
-#include <linux/bitops.h>
-#include <linux/byteorder/generic.h>
-#include <linux/compiler.h>
-#include <linux/container_of.h>
-#include <linux/errno.h>
-#include <linux/etherdevice.h>
-#include <linux/gfp.h>
-#include <linux/if_ether.h>
-#include <linux/if_packet.h>
-#include <linux/init.h>
-#include <linux/jhash.h>
-#include <linux/jiffies.h>
-#include <linux/kref.h>
-#include <linux/list.h>
-#include <linux/lockdep.h>
-#include <linux/net.h>
-#include <linux/netdevice.h>
-#include <linux/printk.h>
-#include <linux/random.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/skbuff.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/stddef.h>
-#include <linux/string.h>
-#include <linux/workqueue.h>
-#include <uapi/linux/batadv_packet.h>
-
-#include "hash.h"
-#include "log.h"
-#include "originator.h"
-#include "routing.h"
-#include "send.h"
-#include "tvlv.h"
-
-static struct lock_class_key batadv_nc_coding_hash_lock_class_key;
-static struct lock_class_key batadv_nc_decoding_hash_lock_class_key;
-
-static void batadv_nc_worker(struct work_struct *work);
-static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
- struct batadv_hard_iface *recv_if);
-
-/**
- * batadv_nc_init() - one-time initialization for network coding
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int __init batadv_nc_init(void)
-{
- /* Register our packet type */
- return batadv_recv_handler_register(BATADV_CODED,
- batadv_nc_recv_coded_packet);
-}
-
-/**
- * batadv_nc_start_timer() - initialise the nc periodic worker
- * @bat_priv: the bat priv with all the mesh interface information
- */
-static void batadv_nc_start_timer(struct batadv_priv *bat_priv)
-{
- queue_delayed_work(batadv_event_workqueue, &bat_priv->nc.work,
- msecs_to_jiffies(10));
-}
-
-/**
- * batadv_nc_tvlv_container_update() - update the network coding tvlv container
- * after network coding setting change
- * @bat_priv: the bat priv with all the mesh interface information
- */
-static void batadv_nc_tvlv_container_update(struct batadv_priv *bat_priv)
-{
- char nc_mode;
-
- nc_mode = atomic_read(&bat_priv->network_coding);
-
- switch (nc_mode) {
- case 0:
- batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1);
- break;
- case 1:
- batadv_tvlv_container_register(bat_priv, BATADV_TVLV_NC, 1,
- NULL, 0);
- break;
- }
-}
-
-/**
- * batadv_nc_status_update() - update the network coding tvlv container after
- * network coding setting change
- * @net_dev: the mesh interface net device
- */
-void batadv_nc_status_update(struct net_device *net_dev)
-{
- struct batadv_priv *bat_priv = netdev_priv(net_dev);
-
- batadv_nc_tvlv_container_update(bat_priv);
-}
-
-/**
- * batadv_nc_tvlv_ogm_handler_v1() - process incoming nc tvlv container
- * @bat_priv: the bat priv with all the mesh interface information
- * @orig: the orig_node of the ogm
- * @flags: flags indicating the tvlv state (see batadv_tvlv_handler_flags)
- * @tvlv_value: tvlv buffer containing the gateway data
- * @tvlv_value_len: tvlv buffer length
- */
-static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig,
- u8 flags,
- void *tvlv_value, u16 tvlv_value_len)
-{
- if (flags & BATADV_TVLV_HANDLER_OGM_CIFNOTFND)
- clear_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities);
- else
- set_bit(BATADV_ORIG_CAPA_HAS_NC, &orig->capabilities);
-}
-
-/**
- * batadv_nc_mesh_init() - initialise coding hash table and start housekeeping
- * @bat_priv: the bat priv with all the mesh interface information
- *
- * Return: 0 on success or negative error number in case of failure
- */
-int batadv_nc_mesh_init(struct batadv_priv *bat_priv)
-{
- bat_priv->nc.timestamp_fwd_flush = jiffies;
- bat_priv->nc.timestamp_sniffed_purge = jiffies;
-
- if (bat_priv->nc.coding_hash || bat_priv->nc.decoding_hash)
- return 0;
-
- bat_priv->nc.coding_hash = batadv_hash_new(128);
- if (!bat_priv->nc.coding_hash)
- goto err;
-
- batadv_hash_set_lock_class(bat_priv->nc.coding_hash,
- &batadv_nc_coding_hash_lock_class_key);
-
- bat_priv->nc.decoding_hash = batadv_hash_new(128);
- if (!bat_priv->nc.decoding_hash) {
- batadv_hash_destroy(bat_priv->nc.coding_hash);
- goto err;
- }
-
- batadv_hash_set_lock_class(bat_priv->nc.decoding_hash,
- &batadv_nc_decoding_hash_lock_class_key);
-
- INIT_DELAYED_WORK(&bat_priv->nc.work, batadv_nc_worker);
- batadv_nc_start_timer(bat_priv);
-
- batadv_tvlv_handler_register(bat_priv, batadv_nc_tvlv_ogm_handler_v1,
- NULL, NULL, BATADV_TVLV_NC, 1,
- BATADV_TVLV_HANDLER_OGM_CIFNOTFND);
- batadv_nc_tvlv_container_update(bat_priv);
- return 0;
-
-err:
- return -ENOMEM;
-}
-
-/**
- * batadv_nc_init_bat_priv() - initialise the nc specific bat_priv variables
- * @bat_priv: the bat priv with all the mesh interface information
- */
-void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv)
-{
- atomic_set(&bat_priv->network_coding, 0);
- bat_priv->nc.min_tq = 200;
- bat_priv->nc.max_fwd_delay = 10;
- bat_priv->nc.max_buffer_time = 200;
-}
-
-/**
- * batadv_nc_init_orig() - initialise the nc fields of an orig_node
- * @orig_node: the orig_node which is going to be initialised
- */
-void batadv_nc_init_orig(struct batadv_orig_node *orig_node)
-{
- INIT_LIST_HEAD(&orig_node->in_coding_list);
- INIT_LIST_HEAD(&orig_node->out_coding_list);
- spin_lock_init(&orig_node->in_coding_list_lock);
- spin_lock_init(&orig_node->out_coding_list_lock);
-}
-
-/**
- * batadv_nc_node_release() - release nc_node from lists and queue for free
- * after rcu grace period
- * @ref: kref pointer of the nc_node
- */
-static void batadv_nc_node_release(struct kref *ref)
-{
- struct batadv_nc_node *nc_node;
-
- nc_node = container_of(ref, struct batadv_nc_node, refcount);
-
- batadv_orig_node_put(nc_node->orig_node);
- kfree_rcu(nc_node, rcu);
-}
-
-/**
- * batadv_nc_node_put() - decrement the nc_node refcounter and possibly
- * release it
- * @nc_node: nc_node to be free'd
- */
-static void batadv_nc_node_put(struct batadv_nc_node *nc_node)
-{
- if (!nc_node)
- return;
-
- kref_put(&nc_node->refcount, batadv_nc_node_release);
-}
-
-/**
- * batadv_nc_path_release() - release nc_path from lists and queue for free
- * after rcu grace period
- * @ref: kref pointer of the nc_path
- */
-static void batadv_nc_path_release(struct kref *ref)
-{
- struct batadv_nc_path *nc_path;
-
- nc_path = container_of(ref, struct batadv_nc_path, refcount);
-
- kfree_rcu(nc_path, rcu);
-}
-
-/**
- * batadv_nc_path_put() - decrement the nc_path refcounter and possibly
- * release it
- * @nc_path: nc_path to be free'd
- */
-static void batadv_nc_path_put(struct batadv_nc_path *nc_path)
-{
- if (!nc_path)
- return;
-
- kref_put(&nc_path->refcount, batadv_nc_path_release);
-}
-
-/**
- * batadv_nc_packet_free() - frees nc packet
- * @nc_packet: the nc packet to free
- * @dropped: whether the packet is freed because is dropped
- */
-static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet,
- bool dropped)
-{
- if (dropped)
- kfree_skb(nc_packet->skb);
- else
- consume_skb(nc_packet->skb);
-
- batadv_nc_path_put(nc_packet->nc_path);
- kfree(nc_packet);
-}
-
-/**
- * batadv_nc_to_purge_nc_node() - checks whether an nc node has to be purged
- * @bat_priv: the bat priv with all the mesh interface information
- * @nc_node: the nc node to check
- *
- * Return: true if the entry has to be purged now, false otherwise
- */
-static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv,
- struct batadv_nc_node *nc_node)
-{
- if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
- return true;
-
- return batadv_has_timed_out(nc_node->last_seen, BATADV_NC_NODE_TIMEOUT);
-}
-
-/**
- * batadv_nc_to_purge_nc_path_coding() - checks whether an nc path has timed out
- * @bat_priv: the bat priv with all the mesh interface information
- * @nc_path: the nc path to check
- *
- * Return: true if the entry has to be purged now, false otherwise
- */
-static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv,
- struct batadv_nc_path *nc_path)
-{
- if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
- return true;
-
- /* purge the path when no packets has been added for 10 times the
- * max_fwd_delay time
- */
- return batadv_has_timed_out(nc_path->last_valid,
- bat_priv->nc.max_fwd_delay * 10);
-}
-
-/**
- * batadv_nc_to_purge_nc_path_decoding() - checks whether an nc path has timed
- * out
- * @bat_priv: the bat priv with all the mesh interface information
- * @nc_path: the nc path to check
- *
- * Return: true if the entry has to be purged now, false otherwise
- */
-static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv,
- struct batadv_nc_path *nc_path)
-{
- if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE)
- return true;
-
- /* purge the path when no packets has been added for 10 times the
- * max_buffer time
- */
- return batadv_has_timed_out(nc_path->last_valid,
- bat_priv->nc.max_buffer_time * 10);
-}
-
-/**
- * batadv_nc_purge_orig_nc_nodes() - go through list of nc nodes and purge stale
- * entries
- * @bat_priv: the bat priv with all the mesh interface information
- * @list: list of nc nodes
- * @lock: nc node list lock
- * @to_purge: function in charge to decide whether an entry has to be purged or
- * not. This function takes the nc node as argument and has to return
- * a boolean value: true if the entry has to be deleted, false
- * otherwise
- */
-static void
-batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv,
- struct list_head *list,
- spinlock_t *lock,
- bool (*to_purge)(struct batadv_priv *,
- struct batadv_nc_node *))
-{
- struct batadv_nc_node *nc_node, *nc_node_tmp;
-
- /* For each nc_node in list */
- spin_lock_bh(lock);
- list_for_each_entry_safe(nc_node, nc_node_tmp, list, list) {
- /* if an helper function has been passed as parameter,
- * ask it if the entry has to be purged or not
- */
- if (to_purge && !to_purge(bat_priv, nc_node))
- continue;
-
- batadv_dbg(BATADV_DBG_NC, bat_priv,
- "Removing nc_node %pM -> %pM\n",
- nc_node->addr, nc_node->orig_node->orig);
- list_del_rcu(&nc_node->list);
- batadv_nc_node_put(nc_node);
- }
- spin_unlock_bh(lock);
-}
-
-/**
- * batadv_nc_purge_orig() - purges all nc node data attached of the given
- * originator
- * @bat_priv: the bat priv with all the mesh interface information
- * @orig_node: orig_node with the nc node entries to be purged
- * @to_purge: function in charge to decide whether an entry has to be purged or
- * not. This function takes the nc node as argument and has to return
- * a boolean value: true is the entry has to be deleted, false
- * otherwise
- */
-void batadv_nc_purge_orig(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- bool (*to_purge)(struct batadv_priv *,
- struct batadv_nc_node *))
-{
- /* Check ingoing nc_node's of this orig_node */
- batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->in_coding_list,
- &orig_node->in_coding_list_lock,
- to_purge);
-
- /* Check outgoing nc_node's of this orig_node */
- batadv_nc_purge_orig_nc_nodes(bat_priv, &orig_node->out_coding_list,
- &orig_node->out_coding_list_lock,
- to_purge);
-}
-
-/**
- * batadv_nc_purge_orig_hash() - traverse entire originator hash to check if
- * they have timed out nc nodes
- * @bat_priv: the bat priv with all the mesh interface information
- */
-static void batadv_nc_purge_orig_hash(struct batadv_priv *bat_priv)
-{
- struct batadv_hashtable *hash = bat_priv->orig_hash;
- struct hlist_head *head;
- struct batadv_orig_node *orig_node;
- u32 i;
-
- if (!hash)
- return;
-
- /* For each orig_node */
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(orig_node, head, hash_entry)
- batadv_nc_purge_orig(bat_priv, orig_node,
- batadv_nc_to_purge_nc_node);
- rcu_read_unlock();
- }
-}
-
-/**
- * batadv_nc_purge_paths() - traverse all nc paths part of the hash and remove
- * unused ones
- * @bat_priv: the bat priv with all the mesh interface information
- * @hash: hash table containing the nc paths to check
- * @to_purge: function in charge to decide whether an entry has to be purged or
- * not. This function takes the nc node as argument and has to return
- * a boolean value: true is the entry has to be deleted, false
- * otherwise
- */
-static void batadv_nc_purge_paths(struct batadv_priv *bat_priv,
- struct batadv_hashtable *hash,
- bool (*to_purge)(struct batadv_priv *,
- struct batadv_nc_path *))
-{
- struct hlist_head *head;
- struct hlist_node *node_tmp;
- struct batadv_nc_path *nc_path;
- spinlock_t *lock; /* Protects lists in hash */
- u32 i;
-
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
- lock = &hash->list_locks[i];
-
- /* For each nc_path in this bin */
- spin_lock_bh(lock);
- hlist_for_each_entry_safe(nc_path, node_tmp, head, hash_entry) {
- /* if an helper function has been passed as parameter,
- * ask it if the entry has to be purged or not
- */
- if (to_purge && !to_purge(bat_priv, nc_path))
- continue;
-
- /* purging an non-empty nc_path should never happen, but
- * is observed under high CPU load. Delay the purging
- * until next iteration to allow the packet_list to be
- * emptied first.
- */
- if (!unlikely(list_empty(&nc_path->packet_list))) {
- net_ratelimited_function(printk,
- KERN_WARNING
- "Skipping free of non-empty nc_path (%pM -> %pM)!\n",
- nc_path->prev_hop,
- nc_path->next_hop);
- continue;
- }
-
- /* nc_path is unused, so remove it */
- batadv_dbg(BATADV_DBG_NC, bat_priv,
- "Remove nc_path %pM -> %pM\n",
- nc_path->prev_hop, nc_path->next_hop);
- hlist_del_rcu(&nc_path->hash_entry);
- batadv_nc_path_put(nc_path);
- }
- spin_unlock_bh(lock);
- }
-}
-
-/**
- * batadv_nc_hash_key_gen() - computes the nc_path hash key
- * @key: buffer to hold the final hash key
- * @src: source ethernet mac address going into the hash key
- * @dst: destination ethernet mac address going into the hash key
- */
-static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src,
- const char *dst)
-{
- memcpy(key->prev_hop, src, sizeof(key->prev_hop));
- memcpy(key->next_hop, dst, sizeof(key->next_hop));
-}
-
-/**
- * batadv_nc_hash_choose() - compute the hash value for an nc path
- * @data: data to hash
- * @size: size of the hash table
- *
- * Return: the selected index in the hash table for the given data.
- */
-static u32 batadv_nc_hash_choose(const void *data, u32 size)
-{
- const struct batadv_nc_path *nc_path = data;
- u32 hash = 0;
-
- hash = jhash(&nc_path->prev_hop, sizeof(nc_path->prev_hop), hash);
- hash = jhash(&nc_path->next_hop, sizeof(nc_path->next_hop), hash);
-
- return hash % size;
-}
-
-/**
- * batadv_nc_hash_compare() - comparing function used in the network coding hash
- * tables
- * @node: node in the local table
- * @data2: second object to compare the node to
- *
- * Return: true if the two entry are the same, false otherwise
- */
-static bool batadv_nc_hash_compare(const struct hlist_node *node,
- const void *data2)
-{
- const struct batadv_nc_path *nc_path1, *nc_path2;
-
- nc_path1 = container_of(node, struct batadv_nc_path, hash_entry);
- nc_path2 = data2;
-
- /* Return 1 if the two keys are identical */
- if (!batadv_compare_eth(nc_path1->prev_hop, nc_path2->prev_hop))
- return false;
-
- if (!batadv_compare_eth(nc_path1->next_hop, nc_path2->next_hop))
- return false;
-
- return true;
-}
-
-/**
- * batadv_nc_hash_find() - search for an existing nc path and return it
- * @hash: hash table containing the nc path
- * @data: search key
- *
- * Return: the nc_path if found, NULL otherwise.
- */
-static struct batadv_nc_path *
-batadv_nc_hash_find(struct batadv_hashtable *hash,
- void *data)
-{
- struct hlist_head *head;
- struct batadv_nc_path *nc_path, *nc_path_tmp = NULL;
- int index;
-
- if (!hash)
- return NULL;
-
- index = batadv_nc_hash_choose(data, hash->size);
- head = &hash->table[index];
-
- rcu_read_lock();
- hlist_for_each_entry_rcu(nc_path, head, hash_entry) {
- if (!batadv_nc_hash_compare(&nc_path->hash_entry, data))
- continue;
-
- if (!kref_get_unless_zero(&nc_path->refcount))
- continue;
-
- nc_path_tmp = nc_path;
- break;
- }
- rcu_read_unlock();
-
- return nc_path_tmp;
-}
-
-/**
- * batadv_nc_send_packet() - send non-coded packet and free nc_packet struct
- * @nc_packet: the nc packet to send
- */
-static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
-{
- batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node);
- nc_packet->skb = NULL;
- batadv_nc_packet_free(nc_packet, false);
-}
-
-/**
- * batadv_nc_sniffed_purge() - Checks timestamp of given sniffed nc_packet.
- * @bat_priv: the bat priv with all the mesh interface information
- * @nc_path: the nc path the packet belongs to
- * @nc_packet: the nc packet to be checked
- *
- * Checks whether the given sniffed (overheard) nc_packet has hit its buffering
- * timeout. If so, the packet is no longer kept and the entry deleted from the
- * queue. Has to be called with the appropriate locks.
- *
- * Return: false as soon as the entry in the fifo queue has not been timed out
- * yet and true otherwise.
- */
-static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv,
- struct batadv_nc_path *nc_path,
- struct batadv_nc_packet *nc_packet)
-{
- unsigned long timeout = bat_priv->nc.max_buffer_time;
- bool res = false;
-
- lockdep_assert_held(&nc_path->packet_list_lock);
-
- /* Packets are added to tail, so the remaining packets did not time
- * out and we can stop processing the current queue
- */
- if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE &&
- !batadv_has_timed_out(nc_packet->timestamp, timeout))
- goto out;
-
- /* purge nc packet */
- list_del(&nc_packet->list);
- batadv_nc_packet_free(nc_packet, true);
-
- res = true;
-
-out:
- return res;
-}
-
-/**
- * batadv_nc_fwd_flush() - Checks the timestamp of the given nc packet.
- * @bat_priv: the bat priv with all the mesh interface information
- * @nc_path: the nc path the packet belongs to
- * @nc_packet: the nc packet to be checked
- *
- * Checks whether the given nc packet has hit its forward timeout. If so, the
- * packet is no longer delayed, immediately sent and the entry deleted from the
- * queue. Has to be called with the appropriate locks.
- *
- * Return: false as soon as the entry in the fifo queue has not been timed out
- * yet and true otherwise.
- */
-static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv,
- struct batadv_nc_path *nc_path,
- struct batadv_nc_packet *nc_packet)
-{
- unsigned long timeout = bat_priv->nc.max_fwd_delay;
-
- lockdep_assert_held(&nc_path->packet_list_lock);
-
- /* Packets are added to tail, so the remaining packets did not time
- * out and we can stop processing the current queue
- */
- if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE &&
- !batadv_has_timed_out(nc_packet->timestamp, timeout))
- return false;
-
- /* Send packet */
- batadv_inc_counter(bat_priv, BATADV_CNT_FORWARD);
- batadv_add_counter(bat_priv, BATADV_CNT_FORWARD_BYTES,
- nc_packet->skb->len + ETH_HLEN);
- list_del(&nc_packet->list);
- batadv_nc_send_packet(nc_packet);
-
- return true;
-}
-
-/**
- * batadv_nc_process_nc_paths() - traverse given nc packet pool and free timed
- * out nc packets
- * @bat_priv: the bat priv with all the mesh interface information
- * @hash: to be processed hash table
- * @process_fn: Function called to process given nc packet. Should return true
- * to encourage this function to proceed with the next packet.
- * Otherwise the rest of the current queue is skipped.
- */
-static void
-batadv_nc_process_nc_paths(struct batadv_priv *bat_priv,
- struct batadv_hashtable *hash,
- bool (*process_fn)(struct batadv_priv *,
- struct batadv_nc_path *,
- struct batadv_nc_packet *))
-{
- struct hlist_head *head;
- struct batadv_nc_packet *nc_packet, *nc_packet_tmp;
- struct batadv_nc_path *nc_path;
- bool ret;
- int i;
-
- if (!hash)
- return;
-
- /* Loop hash table bins */
- for (i = 0; i < hash->size; i++) {
- head = &hash->table[i];
-
- /* Loop coding paths */
- rcu_read_lock();
- hlist_for_each_entry_rcu(nc_path, head, hash_entry) {
- /* Loop packets */
- spin_lock_bh(&nc_path->packet_list_lock);
- list_for_each_entry_safe(nc_packet, nc_packet_tmp,
- &nc_path->packet_list, list) {
- ret = process_fn(bat_priv, nc_path, nc_packet);
- if (!ret)
- break;
- }
- spin_unlock_bh(&nc_path->packet_list_lock);
- }
- rcu_read_unlock();
- }
-}
-
-/**
- * batadv_nc_worker() - periodic task for housekeeping related to network
- * coding
- * @work: kernel work struct
- */
-static void batadv_nc_worker(struct work_struct *work)
-{
- struct delayed_work *delayed_work;
- struct batadv_priv_nc *priv_nc;
- struct batadv_priv *bat_priv;
- unsigned long timeout;
-
- delayed_work = to_delayed_work(work);
- priv_nc = container_of(delayed_work, struct batadv_priv_nc, work);
- bat_priv = container_of(priv_nc, struct batadv_priv, nc);
-
- batadv_nc_purge_orig_hash(bat_priv);
- batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash,
- batadv_nc_to_purge_nc_path_coding);
- batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash,
- batadv_nc_to_purge_nc_path_decoding);
-
- timeout = bat_priv->nc.max_fwd_delay;
-
- if (batadv_has_timed_out(bat_priv->nc.timestamp_fwd_flush, timeout)) {
- batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.coding_hash,
- batadv_nc_fwd_flush);
- bat_priv->nc.timestamp_fwd_flush = jiffies;
- }
-
- if (batadv_has_timed_out(bat_priv->nc.timestamp_sniffed_purge,
- bat_priv->nc.max_buffer_time)) {
- batadv_nc_process_nc_paths(bat_priv, bat_priv->nc.decoding_hash,
- batadv_nc_sniffed_purge);
- bat_priv->nc.timestamp_sniffed_purge = jiffies;
- }
-
- /* Schedule a new check */
- batadv_nc_start_timer(bat_priv);
-}
-
-/**
- * batadv_can_nc_with_orig() - checks whether the given orig node is suitable
- * for coding or not
- * @bat_priv: the bat priv with all the mesh interface information
- * @orig_node: neighboring orig node which may be used as nc candidate
- * @ogm_packet: incoming ogm packet also used for the checks
- *
- * Return: true if:
- * 1) The OGM must have the most recent sequence number.
- * 2) The TTL must be decremented by one and only one.
- * 3) The OGM must be received from the first hop from orig_node.
- * 4) The TQ value of the OGM must be above bat_priv->nc.min_tq.
- */
-static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- struct batadv_ogm_packet *ogm_packet)
-{
- struct batadv_orig_ifinfo *orig_ifinfo;
- u32 last_real_seqno;
- u8 last_ttl;
-
- orig_ifinfo = batadv_orig_ifinfo_get(orig_node, BATADV_IF_DEFAULT);
- if (!orig_ifinfo)
- return false;
-
- last_ttl = orig_ifinfo->last_ttl;
- last_real_seqno = orig_ifinfo->last_real_seqno;
- batadv_orig_ifinfo_put(orig_ifinfo);
-
- if (last_real_seqno != ntohl(ogm_packet->seqno))
- return false;
- if (last_ttl != ogm_packet->ttl + 1)
- return false;
- if (!batadv_compare_eth(ogm_packet->orig, ogm_packet->prev_sender))
- return false;
- if (ogm_packet->tq < bat_priv->nc.min_tq)
- return false;
-
- return true;
-}
-
-/**
- * batadv_nc_find_nc_node() - search for an existing nc node and return it
- * @orig_node: orig node originating the ogm packet
- * @orig_neigh_node: neighboring orig node from which we received the ogm packet
- * (can be equal to orig_node)
- * @in_coding: traverse incoming or outgoing network coding list
- *
- * Return: the nc_node if found, NULL otherwise.
- */
-static struct batadv_nc_node *
-batadv_nc_find_nc_node(struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh_node,
- bool in_coding)
-{
- struct batadv_nc_node *nc_node, *nc_node_out = NULL;
- struct list_head *list;
-
- if (in_coding)
- list = &orig_neigh_node->in_coding_list;
- else
- list = &orig_neigh_node->out_coding_list;
-
- /* Traverse list of nc_nodes to orig_node */
- rcu_read_lock();
- list_for_each_entry_rcu(nc_node, list, list) {
- if (!batadv_compare_eth(nc_node->addr, orig_node->orig))
- continue;
-
- if (!kref_get_unless_zero(&nc_node->refcount))
- continue;
-
- /* Found a match */
- nc_node_out = nc_node;
- break;
- }
- rcu_read_unlock();
-
- return nc_node_out;
-}
-
-/**
- * batadv_nc_get_nc_node() - retrieves an nc node or creates the entry if it was
- * not found
- * @bat_priv: the bat priv with all the mesh interface information
- * @orig_node: orig node originating the ogm packet
- * @orig_neigh_node: neighboring orig node from which we received the ogm packet
- * (can be equal to orig_node)
- * @in_coding: traverse incoming or outgoing network coding list
- *
- * Return: the nc_node if found or created, NULL in case of an error.
- */
-static struct batadv_nc_node *
-batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh_node,
- bool in_coding)
-{
- struct batadv_nc_node *nc_node;
- spinlock_t *lock; /* Used to lock list selected by "int in_coding" */
- struct list_head *list;
-
- /* Select ingoing or outgoing coding node */
- if (in_coding) {
- lock = &orig_neigh_node->in_coding_list_lock;
- list = &orig_neigh_node->in_coding_list;
- } else {
- lock = &orig_neigh_node->out_coding_list_lock;
- list = &orig_neigh_node->out_coding_list;
- }
-
- spin_lock_bh(lock);
-
- /* Check if nc_node is already added */
- nc_node = batadv_nc_find_nc_node(orig_node, orig_neigh_node, in_coding);
-
- /* Node found */
- if (nc_node)
- goto unlock;
-
- nc_node = kzalloc(sizeof(*nc_node), GFP_ATOMIC);
- if (!nc_node)
- goto unlock;
-
- /* Initialize nc_node */
- INIT_LIST_HEAD(&nc_node->list);
- kref_init(&nc_node->refcount);
- ether_addr_copy(nc_node->addr, orig_node->orig);
- kref_get(&orig_neigh_node->refcount);
- nc_node->orig_node = orig_neigh_node;
-
- batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_node %pM -> %pM\n",
- nc_node->addr, nc_node->orig_node->orig);
-
- /* Add nc_node to orig_node */
- kref_get(&nc_node->refcount);
- list_add_tail_rcu(&nc_node->list, list);
-
-unlock:
- spin_unlock_bh(lock);
-
- return nc_node;
-}
-
-/**
- * batadv_nc_update_nc_node() - updates stored incoming and outgoing nc node
- * structs (best called on incoming OGMs)
- * @bat_priv: the bat priv with all the mesh interface information
- * @orig_node: orig node originating the ogm packet
- * @orig_neigh_node: neighboring orig node from which we received the ogm packet
- * (can be equal to orig_node)
- * @ogm_packet: incoming ogm packet
- * @is_single_hop_neigh: orig_node is a single hop neighbor
- */
-void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh_node,
- struct batadv_ogm_packet *ogm_packet,
- int is_single_hop_neigh)
-{
- struct batadv_nc_node *in_nc_node = NULL;
- struct batadv_nc_node *out_nc_node = NULL;
-
- /* Check if network coding is enabled */
- if (!atomic_read(&bat_priv->network_coding))
- goto out;
-
- /* check if orig node is network coding enabled */
- if (!test_bit(BATADV_ORIG_CAPA_HAS_NC, &orig_node->capabilities))
- goto out;
-
- /* accept ogms from 'good' neighbors and single hop neighbors */
- if (!batadv_can_nc_with_orig(bat_priv, orig_node, ogm_packet) &&
- !is_single_hop_neigh)
- goto out;
-
- /* Add orig_node as in_nc_node on hop */
- in_nc_node = batadv_nc_get_nc_node(bat_priv, orig_node,
- orig_neigh_node, true);
- if (!in_nc_node)
- goto out;
-
- in_nc_node->last_seen = jiffies;
-
- /* Add hop as out_nc_node on orig_node */
- out_nc_node = batadv_nc_get_nc_node(bat_priv, orig_neigh_node,
- orig_node, false);
- if (!out_nc_node)
- goto out;
-
- out_nc_node->last_seen = jiffies;
-
-out:
- batadv_nc_node_put(in_nc_node);
- batadv_nc_node_put(out_nc_node);
-}
-
-/**
- * batadv_nc_get_path() - get existing nc_path or allocate a new one
- * @bat_priv: the bat priv with all the mesh interface information
- * @hash: hash table containing the nc path
- * @src: ethernet source address - first half of the nc path search key
- * @dst: ethernet destination address - second half of the nc path search key
- *
- * Return: pointer to nc_path if the path was found or created, returns NULL
- * on error.
- */
-static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
- struct batadv_hashtable *hash,
- u8 *src,
- u8 *dst)
-{
- int hash_added;
- struct batadv_nc_path *nc_path, nc_path_key;
-
- batadv_nc_hash_key_gen(&nc_path_key, src, dst);
-
- /* Search for existing nc_path */
- nc_path = batadv_nc_hash_find(hash, (void *)&nc_path_key);
-
- if (nc_path) {
- /* Set timestamp to delay removal of nc_path */
- nc_path->last_valid = jiffies;
- return nc_path;
- }
-
- /* No existing nc_path was found; create a new */
- nc_path = kzalloc(sizeof(*nc_path), GFP_ATOMIC);
-
- if (!nc_path)
- return NULL;
-
- /* Initialize nc_path */
- INIT_LIST_HEAD(&nc_path->packet_list);
- spin_lock_init(&nc_path->packet_list_lock);
- kref_init(&nc_path->refcount);
- nc_path->last_valid = jiffies;
- ether_addr_copy(nc_path->next_hop, dst);
- ether_addr_copy(nc_path->prev_hop, src);
-
- batadv_dbg(BATADV_DBG_NC, bat_priv, "Adding nc_path %pM -> %pM\n",
- nc_path->prev_hop,
- nc_path->next_hop);
-
- /* Add nc_path to hash table */
- kref_get(&nc_path->refcount);
- hash_added = batadv_hash_add(hash, batadv_nc_hash_compare,
- batadv_nc_hash_choose, &nc_path_key,
- &nc_path->hash_entry);
-
- if (hash_added < 0) {
- kfree(nc_path);
- return NULL;
- }
-
- return nc_path;
-}
-
-/**
- * batadv_nc_random_weight_tq() - scale the receivers TQ-value to avoid unfair
- * selection of a receiver with slightly lower TQ than the other
- * @tq: to be weighted tq value
- *
- * Return: scaled tq value
- */
-static u8 batadv_nc_random_weight_tq(u8 tq)
-{
- /* randomize the estimated packet loss (max TQ - estimated TQ) */
- u8 rand_tq = get_random_u32_below(BATADV_TQ_MAX_VALUE + 1 - tq);
-
- /* convert to (randomized) estimated tq again */
- return BATADV_TQ_MAX_VALUE - rand_tq;
-}
-
-/**
- * batadv_nc_memxor() - XOR destination with source
- * @dst: byte array to XOR into
- * @src: byte array to XOR from
- * @len: length of destination array
- */
-static void batadv_nc_memxor(char *dst, const char *src, unsigned int len)
-{
- unsigned int i;
-
- for (i = 0; i < len; ++i)
- dst[i] ^= src[i];
-}
-
-/**
- * batadv_nc_code_packets() - code a received unicast_packet with an nc packet
- * into a coded_packet and send it
- * @bat_priv: the bat priv with all the mesh interface information
- * @skb: data skb to forward
- * @ethhdr: pointer to the ethernet header inside the skb
- * @nc_packet: structure containing the packet to the skb can be coded with
- * @neigh_node: next hop to forward packet to
- *
- * Return: true if both packets are consumed, false otherwise.
- */
-static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
- struct sk_buff *skb,
- struct ethhdr *ethhdr,
- struct batadv_nc_packet *nc_packet,
- struct batadv_neigh_node *neigh_node)
-{
- u8 tq_weighted_neigh, tq_weighted_coding, tq_tmp;
- struct sk_buff *skb_dest, *skb_src;
- struct batadv_unicast_packet *packet1;
- struct batadv_unicast_packet *packet2;
- struct batadv_coded_packet *coded_packet;
- struct batadv_neigh_node *neigh_tmp, *router_neigh, *first_dest;
- struct batadv_neigh_node *router_coding = NULL, *second_dest;
- struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL;
- struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL;
- u8 *first_source, *second_source;
- __be32 packet_id1, packet_id2;
- size_t count;
- bool res = false;
- int coding_len;
- int unicast_size = sizeof(*packet1);
- int coded_size = sizeof(*coded_packet);
- int header_add = coded_size - unicast_size;
-
- /* TODO: do we need to consider the outgoing interface for
- * coded packets?
- */
- router_neigh = batadv_orig_router_get(neigh_node->orig_node,
- BATADV_IF_DEFAULT);
- if (!router_neigh)
- goto out;
-
- router_neigh_ifinfo = batadv_neigh_ifinfo_get(router_neigh,
- BATADV_IF_DEFAULT);
- if (!router_neigh_ifinfo)
- goto out;
-
- neigh_tmp = nc_packet->neigh_node;
- router_coding = batadv_orig_router_get(neigh_tmp->orig_node,
- BATADV_IF_DEFAULT);
- if (!router_coding)
- goto out;
-
- router_coding_ifinfo = batadv_neigh_ifinfo_get(router_coding,
- BATADV_IF_DEFAULT);
- if (!router_coding_ifinfo)
- goto out;
-
- tq_tmp = router_neigh_ifinfo->bat_iv.tq_avg;
- tq_weighted_neigh = batadv_nc_random_weight_tq(tq_tmp);
- tq_tmp = router_coding_ifinfo->bat_iv.tq_avg;
- tq_weighted_coding = batadv_nc_random_weight_tq(tq_tmp);
-
- /* Select one destination for the MAC-header dst-field based on
- * weighted TQ-values.
- */
- if (tq_weighted_neigh >= tq_weighted_coding) {
- /* Destination from nc_packet is selected for MAC-header */
- first_dest = nc_packet->neigh_node;
- first_source = nc_packet->nc_path->prev_hop;
- second_dest = neigh_node;
- second_source = ethhdr->h_source;
- packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data;
- packet2 = (struct batadv_unicast_packet *)skb->data;
- packet_id1 = nc_packet->packet_id;
- packet_id2 = batadv_skb_crc32(skb,
- skb->data + sizeof(*packet2));
- } else {
- /* Destination for skb is selected for MAC-header */
- first_dest = neigh_node;
- first_source = ethhdr->h_source;
- second_dest = nc_packet->neigh_node;
- second_source = nc_packet->nc_path->prev_hop;
- packet1 = (struct batadv_unicast_packet *)skb->data;
- packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data;
- packet_id1 = batadv_skb_crc32(skb,
- skb->data + sizeof(*packet1));
- packet_id2 = nc_packet->packet_id;
- }
-
- /* Instead of zero padding the smallest data buffer, we
- * code into the largest.
- */
- if (skb->len <= nc_packet->skb->len) {
- skb_dest = nc_packet->skb;
- skb_src = skb;
- } else {
- skb_dest = skb;
- skb_src = nc_packet->skb;
- }
-
- /* coding_len is used when decoding the shorter packet */
- coding_len = skb_src->len - unicast_size;
-
- if (skb_linearize(skb_dest) < 0 || skb_linearize(skb_src) < 0)
- goto out;
-
- skb_push(skb_dest, header_add);
-
- coded_packet = (struct batadv_coded_packet *)skb_dest->data;
- skb_reset_mac_header(skb_dest);
-
- coded_packet->packet_type = BATADV_CODED;
- coded_packet->version = BATADV_COMPAT_VERSION;
- coded_packet->ttl = packet1->ttl;
-
- /* Info about first unicast packet */
- ether_addr_copy(coded_packet->first_source, first_source);
- ether_addr_copy(coded_packet->first_orig_dest, packet1->dest);
- coded_packet->first_crc = packet_id1;
- coded_packet->first_ttvn = packet1->ttvn;
-
- /* Info about second unicast packet */
- ether_addr_copy(coded_packet->second_dest, second_dest->addr);
- ether_addr_copy(coded_packet->second_source, second_source);
- ether_addr_copy(coded_packet->second_orig_dest, packet2->dest);
- coded_packet->second_crc = packet_id2;
- coded_packet->second_ttl = packet2->ttl;
- coded_packet->second_ttvn = packet2->ttvn;
- coded_packet->coded_len = htons(coding_len);
-
- /* This is where the magic happens: Code skb_src into skb_dest */
- batadv_nc_memxor(skb_dest->data + coded_size,
- skb_src->data + unicast_size, coding_len);
-
- /* Update counters accordingly */
- if (BATADV_SKB_CB(skb_src)->decoded &&
- BATADV_SKB_CB(skb_dest)->decoded) {
- /* Both packets are recoded */
- count = skb_src->len + ETH_HLEN;
- count += skb_dest->len + ETH_HLEN;
- batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE, 2);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES, count);
- } else if (!BATADV_SKB_CB(skb_src)->decoded &&
- !BATADV_SKB_CB(skb_dest)->decoded) {
- /* Both packets are newly coded */
- count = skb_src->len + ETH_HLEN;
- count += skb_dest->len + ETH_HLEN;
- batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE, 2);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES, count);
- } else if (BATADV_SKB_CB(skb_src)->decoded &&
- !BATADV_SKB_CB(skb_dest)->decoded) {
- /* skb_src recoded and skb_dest is newly coded */
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES,
- skb_src->len + ETH_HLEN);
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES,
- skb_dest->len + ETH_HLEN);
- } else if (!BATADV_SKB_CB(skb_src)->decoded &&
- BATADV_SKB_CB(skb_dest)->decoded) {
- /* skb_src is newly coded and skb_dest is recoded */
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_CODE);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_CODE_BYTES,
- skb_src->len + ETH_HLEN);
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_RECODE);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_RECODE_BYTES,
- skb_dest->len + ETH_HLEN);
- }
-
- /* skb_src is now coded into skb_dest, so free it */
- consume_skb(skb_src);
-
- /* avoid duplicate free of skb from nc_packet */
- nc_packet->skb = NULL;
- batadv_nc_packet_free(nc_packet, false);
-
- /* Send the coded packet and return true */
- batadv_send_unicast_skb(skb_dest, first_dest);
- res = true;
-out:
- batadv_neigh_node_put(router_neigh);
- batadv_neigh_node_put(router_coding);
- batadv_neigh_ifinfo_put(router_neigh_ifinfo);
- batadv_neigh_ifinfo_put(router_coding_ifinfo);
- return res;
-}
-
-/**
- * batadv_nc_skb_coding_possible() - check if the skb may be coded towards dst
- * @skb: data skb to forward
- * @dst: destination mac address of the other skb to code with
- * @src: source mac address of skb
- *
- * Whenever we network code a packet we have to check whether we received it in
- * a network coded form. If so, we may not be able to use it for coding because
- * some neighbors may also have received (overheard) the packet in the network
- * coded form without being able to decode it. It is hard to know which of the
- * neighboring nodes was able to decode the packet, therefore we can only
- * re-code the packet if the source of the previous encoded packet is involved.
- * Since the source encoded the packet we can be certain it has all necessary
- * decode information.
- *
- * Return: true if coding of a decoded packet is allowed.
- */
-static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src)
-{
- if (BATADV_SKB_CB(skb)->decoded && !batadv_compare_eth(dst, src))
- return false;
- return true;
-}
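
The rule from the comment above reduces to a one-line predicate. A hedged userspace restatement (compare_eth() is a stand-in for batadv_compare_eth(), the addresses are made up):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool compare_eth(const unsigned char *a, const unsigned char *b)
{
	return memcmp(a, b, 6) == 0;
}

static bool coding_possible(bool decoded, const unsigned char *dst,
			    const unsigned char *src)
{
	/* A previously decoded packet may only be re-coded when it
	 * heads back to the node that coded it (dst == src), since only
	 * that node is guaranteed to hold the decode information.
	 */
	return !decoded || compare_eth(dst, src);
}

int main(void)
{
	unsigned char n1[6] = { 0x02, 0, 0, 0, 0, 0x01 };
	unsigned char n2[6] = { 0x02, 0, 0, 0, 0, 0x02 };

	printf("fresh pkt, any dst:   %d\n", coding_possible(false, n2, n1));
	printf("decoded, new dst:     %d\n", coding_possible(true, n2, n1));
	printf("decoded, back to src: %d\n", coding_possible(true, n1, n1));
	return 0;
}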
-
-/**
- * batadv_nc_path_search() - Find the coding path matching in_nc_node and
- * out_nc_node to retrieve a buffered packet that can be used for coding.
- * @bat_priv: the bat priv with all the mesh interface information
- * @in_nc_node: pointer to skb next hop's neighbor nc node
- * @out_nc_node: pointer to skb source's neighbor nc node
- * @skb: data skb to forward
- * @eth_dst: next hop mac address of skb
- *
- * Return: nc packet with a coding opportunity if one was found, NULL otherwise.
- */
-static struct batadv_nc_packet *
-batadv_nc_path_search(struct batadv_priv *bat_priv,
- struct batadv_nc_node *in_nc_node,
- struct batadv_nc_node *out_nc_node,
- struct sk_buff *skb,
- u8 *eth_dst)
-{
- struct batadv_nc_path *nc_path, nc_path_key;
- struct batadv_nc_packet *nc_packet_out = NULL;
- struct batadv_nc_packet *nc_packet, *nc_packet_tmp;
- struct batadv_hashtable *hash = bat_priv->nc.coding_hash;
- int idx;
-
- if (!hash)
- return NULL;
-
- /* Generate the path key to select the hash bucket */
- batadv_nc_hash_key_gen(&nc_path_key, in_nc_node->addr,
- out_nc_node->addr);
- idx = batadv_nc_hash_choose(&nc_path_key, hash->size);
-
- /* Check for coding opportunities in this nc_path */
- rcu_read_lock();
- hlist_for_each_entry_rcu(nc_path, &hash->table[idx], hash_entry) {
- if (!batadv_compare_eth(nc_path->prev_hop, in_nc_node->addr))
- continue;
-
- if (!batadv_compare_eth(nc_path->next_hop, out_nc_node->addr))
- continue;
-
- spin_lock_bh(&nc_path->packet_list_lock);
- if (list_empty(&nc_path->packet_list)) {
- spin_unlock_bh(&nc_path->packet_list_lock);
- continue;
- }
-
- list_for_each_entry_safe(nc_packet, nc_packet_tmp,
- &nc_path->packet_list, list) {
- if (!batadv_nc_skb_coding_possible(nc_packet->skb,
- eth_dst,
- in_nc_node->addr))
- continue;
-
- /* Coding opportunity is found! */
- list_del(&nc_packet->list);
- nc_packet_out = nc_packet;
- break;
- }
-
- spin_unlock_bh(&nc_path->packet_list_lock);
- break;
- }
- rcu_read_unlock();
-
- return nc_packet_out;
-}
-
-/**
- * batadv_nc_skb_src_search() - Loops through the list of neighboring nodes of
- * the skb's sender (may be equal to the originator).
- * @bat_priv: the bat priv with all the mesh interface information
- * @skb: data skb to forward
- * @eth_dst: next hop mac address of skb
- * @eth_src: source mac address of skb
- * @in_nc_node: pointer to skb next hop's neighbor nc node
- *
- * Return: an nc packet if a suitable coding packet was found, NULL otherwise.
- */
-static struct batadv_nc_packet *
-batadv_nc_skb_src_search(struct batadv_priv *bat_priv,
- struct sk_buff *skb,
- u8 *eth_dst,
- u8 *eth_src,
- struct batadv_nc_node *in_nc_node)
-{
- struct batadv_orig_node *orig_node;
- struct batadv_nc_node *out_nc_node;
- struct batadv_nc_packet *nc_packet = NULL;
-
- orig_node = batadv_orig_hash_find(bat_priv, eth_src);
- if (!orig_node)
- return NULL;
-
- rcu_read_lock();
- list_for_each_entry_rcu(out_nc_node,
- &orig_node->out_coding_list, list) {
- /* Check if the skb is decoded and if recoding is possible */
- if (!batadv_nc_skb_coding_possible(skb,
- out_nc_node->addr, eth_src))
- continue;
-
- /* Search for an opportunity in this nc_path */
- nc_packet = batadv_nc_path_search(bat_priv, in_nc_node,
- out_nc_node, skb, eth_dst);
- if (nc_packet)
- break;
- }
- rcu_read_unlock();
-
- batadv_orig_node_put(orig_node);
- return nc_packet;
-}
-
-/**
- * batadv_nc_skb_store_before_coding() - set the ethernet src and dst of the
- * unicast skb before it is stored for use in later decoding
- * @bat_priv: the bat priv with all the mesh interface information
- * @skb: data skb to store
- * @eth_dst_new: new destination mac address of skb
- */
-static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv,
- struct sk_buff *skb,
- u8 *eth_dst_new)
-{
- struct ethhdr *ethhdr;
-
- /* Copy skb header to change the mac header */
- skb = pskb_copy_for_clone(skb, GFP_ATOMIC);
- if (!skb)
- return;
-
- /* Set the mac header as if we actually sent the packet uncoded */
- ethhdr = eth_hdr(skb);
- ether_addr_copy(ethhdr->h_source, ethhdr->h_dest);
- ether_addr_copy(ethhdr->h_dest, eth_dst_new);
-
- /* Set data pointer to MAC header to mimic packets from our tx path */
- skb_push(skb, ETH_HLEN);
-
- /* Add the packet to the decoding packet pool */
- batadv_nc_skb_store_for_decoding(bat_priv, skb);
-
- /* batadv_nc_skb_store_for_decoding() clones the skb, so we must free
- * our ref
- */
- consume_skb(skb);
-}
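
The MAC rewrite above makes the stored copy look like a frame we transmitted ourselves, so batadv_nc_find_decoding_packet() can later match on it. In plain C terms, a sketch with a toy header instead of an skb (the addresses are invented):

#include <stdio.h>
#include <string.h>

#define ETH_ALEN 6

struct toy_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
};

int main(void)
{
	struct toy_ethhdr eth = {
		.h_dest   = { 0x02, 0, 0, 0, 0, 0xaa },	/* us */
		.h_source = { 0x02, 0, 0, 0, 0, 0xbb },	/* prev hop */
	};
	unsigned char eth_dst_new[ETH_ALEN] = { 0x02, 0, 0, 0, 0, 0xcc };

	/* Pretend we sent the frame uncoded: the old destination (us)
	 * becomes the source and the real next hop the destination.
	 */
	memcpy(eth.h_source, eth.h_dest, ETH_ALEN);
	memcpy(eth.h_dest, eth_dst_new, ETH_ALEN);

	printf("stored as ...:%02x -> ...:%02x\n",
	       eth.h_source[ETH_ALEN - 1], eth.h_dest[ETH_ALEN - 1]);
	return 0;
}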
-
-/**
- * batadv_nc_skb_dst_search() - loop through the neighboring nodes of dst
- * @skb: data skb to forward
- * @neigh_node: next hop to forward packet to
- * @ethhdr: pointer to the ethernet header inside the skb
- *
- * Loops through the list of neighboring nodes the next hop has a good
- * connection to (receives OGMs with a sufficient quality). We need to find a
- * neighbor of our next hop that potentially sent a packet which our next hop
- * also received (overheard) and has stored for later decoding.
- *
- * Return: true if the skb was consumed (encoded packet sent) or false otherwise
- */
-static bool batadv_nc_skb_dst_search(struct sk_buff *skb,
- struct batadv_neigh_node *neigh_node,
- struct ethhdr *ethhdr)
-{
- struct net_device *netdev = neigh_node->if_incoming->mesh_iface;
- struct batadv_priv *bat_priv = netdev_priv(netdev);
- struct batadv_orig_node *orig_node = neigh_node->orig_node;
- struct batadv_nc_node *nc_node;
- struct batadv_nc_packet *nc_packet = NULL;
-
- rcu_read_lock();
- list_for_each_entry_rcu(nc_node, &orig_node->in_coding_list, list) {
- /* Search for coding opportunity with this in_nc_node */
- nc_packet = batadv_nc_skb_src_search(bat_priv, skb,
- neigh_node->addr,
- ethhdr->h_source, nc_node);
-
- /* Opportunity was found, so stop searching */
- if (nc_packet)
- break;
- }
- rcu_read_unlock();
-
- if (!nc_packet)
- return false;
-
- /* Save packets for later decoding */
- batadv_nc_skb_store_before_coding(bat_priv, skb,
- neigh_node->addr);
- batadv_nc_skb_store_before_coding(bat_priv, nc_packet->skb,
- nc_packet->neigh_node->addr);
-
- /* Code and send packets */
- if (batadv_nc_code_packets(bat_priv, skb, ethhdr, nc_packet,
- neigh_node))
- return true;
-
- /* out of memory? Coding failed - we have to free the buffered packet
- * to avoid memleaks. The skb passed as argument will be dealt with
- * by the calling function.
- */
- batadv_nc_send_packet(nc_packet);
- return false;
-}
-
-/**
- * batadv_nc_skb_add_to_path() - buffer skb for later encoding / decoding
- * @skb: skb to add to path
- * @nc_path: path to add skb to
- * @neigh_node: next hop to forward packet to
- * @packet_id: checksum to identify packet
- *
- * Return: true if the packet was buffered or false in case of an error.
- */
-static bool batadv_nc_skb_add_to_path(struct sk_buff *skb,
- struct batadv_nc_path *nc_path,
- struct batadv_neigh_node *neigh_node,
- __be32 packet_id)
-{
- struct batadv_nc_packet *nc_packet;
-
- nc_packet = kzalloc(sizeof(*nc_packet), GFP_ATOMIC);
- if (!nc_packet)
- return false;
-
- /* Initialize nc_packet */
- nc_packet->timestamp = jiffies;
- nc_packet->packet_id = packet_id;
- nc_packet->skb = skb;
- nc_packet->neigh_node = neigh_node;
- nc_packet->nc_path = nc_path;
-
- /* Add coding packet to list */
- spin_lock_bh(&nc_path->packet_list_lock);
- list_add_tail(&nc_packet->list, &nc_path->packet_list);
- spin_unlock_bh(&nc_path->packet_list_lock);
-
- return true;
-}
-
-/**
- * batadv_nc_skb_forward() - try to code a packet or add it to the coding packet
- * buffer
- * @skb: data skb to forward
- * @neigh_node: next hop to forward packet to
- *
- * Return: true if the skb was consumed (encoded packet sent) or false otherwise
- */
-bool batadv_nc_skb_forward(struct sk_buff *skb,
- struct batadv_neigh_node *neigh_node)
-{
- const struct net_device *netdev = neigh_node->if_incoming->mesh_iface;
- struct batadv_priv *bat_priv = netdev_priv(netdev);
- struct batadv_unicast_packet *packet;
- struct batadv_nc_path *nc_path;
- struct ethhdr *ethhdr = eth_hdr(skb);
- __be32 packet_id;
- u8 *payload;
-
- /* Check if network coding is enabled */
- if (!atomic_read(&bat_priv->network_coding))
- goto out;
-
- /* We only handle unicast packets */
- payload = skb_network_header(skb);
- packet = (struct batadv_unicast_packet *)payload;
- if (packet->packet_type != BATADV_UNICAST)
- goto out;
-
- /* Try to find a coding opportunity and send the skb if one is found */
- if (batadv_nc_skb_dst_search(skb, neigh_node, ethhdr))
- return true;
-
- /* Find or create a nc_path for this src-dst pair */
- nc_path = batadv_nc_get_path(bat_priv,
- bat_priv->nc.coding_hash,
- ethhdr->h_source,
- neigh_node->addr);
-
- if (!nc_path)
- goto out;
-
- /* Add skb to nc_path */
- packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet));
- if (!batadv_nc_skb_add_to_path(skb, nc_path, neigh_node, packet_id))
- goto free_nc_path;
-
- /* Packet is consumed */
- return true;
-
-free_nc_path:
- batadv_nc_path_put(nc_path);
-out:
- /* Packet is not consumed */
- return false;
-}
-
-/**
- * batadv_nc_skb_store_for_decoding() - save a clone of the skb which can be
- * used when decoding coded packets
- * @bat_priv: the bat priv with all the mesh interface information
- * @skb: data skb to store
- */
-void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
- struct sk_buff *skb)
-{
- struct batadv_unicast_packet *packet;
- struct batadv_nc_path *nc_path;
- struct ethhdr *ethhdr = eth_hdr(skb);
- __be32 packet_id;
- u8 *payload;
-
- /* Check if network coding is enabled */
- if (!atomic_read(&bat_priv->network_coding))
- goto out;
-
- /* Check for supported packet type */
- payload = skb_network_header(skb);
- packet = (struct batadv_unicast_packet *)payload;
- if (packet->packet_type != BATADV_UNICAST)
- goto out;
-
- /* Find existing nc_path or create a new one */
- nc_path = batadv_nc_get_path(bat_priv,
- bat_priv->nc.decoding_hash,
- ethhdr->h_source,
- ethhdr->h_dest);
-
- if (!nc_path)
- goto out;
-
- /* Clone skb and adjust skb->data to point at batman header */
- skb = skb_clone(skb, GFP_ATOMIC);
- if (unlikely(!skb))
- goto free_nc_path;
-
- if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
- goto free_skb;
-
- if (unlikely(!skb_pull_rcsum(skb, ETH_HLEN)))
- goto free_skb;
-
- /* Add skb to nc_path */
- packet_id = batadv_skb_crc32(skb, payload + sizeof(*packet));
- if (!batadv_nc_skb_add_to_path(skb, nc_path, NULL, packet_id))
- goto free_skb;
-
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_BUFFER);
- return;
-
-free_skb:
- kfree_skb(skb);
-free_nc_path:
- batadv_nc_path_put(nc_path);
-out:
- return;
-}
-
-/**
- * batadv_nc_skb_store_sniffed_unicast() - check if a received unicast packet
- * should be saved in the decoding buffer and, if so, store it there
- * @bat_priv: the bat priv with all the mesh interface information
- * @skb: unicast skb to store
- */
-void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
- struct sk_buff *skb)
-{
- struct ethhdr *ethhdr = eth_hdr(skb);
-
- if (batadv_is_my_mac(bat_priv, ethhdr->h_dest))
- return;
-
- /* Set data pointer to MAC header to mimic packets from our tx path */
- skb_push(skb, ETH_HLEN);
-
- batadv_nc_skb_store_for_decoding(bat_priv, skb);
-}
-
-/**
- * batadv_nc_skb_decode_packet() - decode given skb using the decode data stored
- * in nc_packet
- * @bat_priv: the bat priv with all the mesh interface information
- * @skb: unicast skb to decode
- * @nc_packet: decode data needed to decode the skb
- *
- * Return: pointer to decoded unicast packet if the packet was decoded or NULL
- * in case of an error.
- */
-static struct batadv_unicast_packet *
-batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
- struct batadv_nc_packet *nc_packet)
-{
- const int h_size = sizeof(struct batadv_unicast_packet);
- const int h_diff = sizeof(struct batadv_coded_packet) - h_size;
- struct batadv_unicast_packet *unicast_packet;
- struct batadv_coded_packet coded_packet_tmp;
- struct ethhdr *ethhdr, ethhdr_tmp;
- u8 *orig_dest, ttl, ttvn;
- unsigned int coding_len;
- int err;
-
- /* Save headers temporarily */
- memcpy(&coded_packet_tmp, skb->data, sizeof(coded_packet_tmp));
- memcpy(&ethhdr_tmp, skb_mac_header(skb), sizeof(ethhdr_tmp));
-
- if (skb_cow(skb, 0) < 0)
- return NULL;
-
- if (unlikely(!skb_pull_rcsum(skb, h_diff)))
- return NULL;
-
- /* Data points to the batman header, so set the mac header ETH_HLEN
- * bytes before it and the network header to data
- */
- skb_set_mac_header(skb, -ETH_HLEN);
- skb_reset_network_header(skb);
-
- /* Reconstruct original mac header */
- ethhdr = eth_hdr(skb);
- *ethhdr = ethhdr_tmp;
-
- /* Select the correct unicast header information based on the location
- * of our mac address in the coded_packet header
- */
- if (batadv_is_my_mac(bat_priv, coded_packet_tmp.second_dest)) {
- /* If we are the second destination the packet was overheard,
- * so the Ethernet address must be copied to h_dest and
- * pkt_type changed from PACKET_OTHERHOST to PACKET_HOST
- */
- ether_addr_copy(ethhdr->h_dest, coded_packet_tmp.second_dest);
- skb->pkt_type = PACKET_HOST;
-
- orig_dest = coded_packet_tmp.second_orig_dest;
- ttl = coded_packet_tmp.second_ttl;
- ttvn = coded_packet_tmp.second_ttvn;
- } else {
- orig_dest = coded_packet_tmp.first_orig_dest;
- ttl = coded_packet_tmp.ttl;
- ttvn = coded_packet_tmp.first_ttvn;
- }
-
- coding_len = ntohs(coded_packet_tmp.coded_len);
-
- /* ensure dst buffer is large enough (payload only) */
- if (coding_len + h_size > skb->len)
- return NULL;
-
- /* ensure src buffer is large enough (payload only) */
- if (coding_len + h_size > nc_packet->skb->len)
- return NULL;
-
- /* Here the magic is reversed:
- * extract the missing packet from the received coded packet
- */
- batadv_nc_memxor(skb->data + h_size,
- nc_packet->skb->data + h_size,
- coding_len);
-
- /* Resize decoded skb if decoded with larger packet */
- if (nc_packet->skb->len > coding_len + h_size) {
- err = pskb_trim_rcsum(skb, coding_len + h_size);
- if (err)
- return NULL;
- }
-
- /* Create decoded unicast packet */
- unicast_packet = (struct batadv_unicast_packet *)skb->data;
- unicast_packet->packet_type = BATADV_UNICAST;
- unicast_packet->version = BATADV_COMPAT_VERSION;
- unicast_packet->ttl = ttl;
- ether_addr_copy(unicast_packet->dest, orig_dest);
- unicast_packet->ttvn = ttvn;
-
- batadv_nc_packet_free(nc_packet, false);
- return unicast_packet;
-}
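
The length bookkeeping in the decode path is easy to lose among the skb helpers. A hedged sketch with plain integers - the header sizes below are illustrative placeholders, not the real sizeof(struct batadv_unicast_packet) / sizeof(struct batadv_coded_packet) values - of what skb_pull_rcsum() and pskb_trim_rcsum() accomplish above:

#include <assert.h>

int main(void)
{
	const int h_size = 10;			/* "unicast header" */
	const int h_diff = 36;			/* coded - unicast header */
	int skb_len = h_size + h_diff + 1500;	/* received coded packet */
	int stored_len = h_size + 1500;		/* buffered decoding pkt */
	int coding_len = 1200;			/* coded_len from header */

	/* skb_pull_rcsum(skb, h_diff): shed the header-size difference
	 * so a plain unicast header sits in front of the payload again.
	 */
	skb_len -= h_diff;

	/* both buffers must cover header + coded payload */
	assert(coding_len + h_size <= skb_len);
	assert(coding_len + h_size <= stored_len);

	/* pskb_trim_rcsum(): when coded with a larger packet, cut the
	 * decoded skb back down to our own payload length.
	 */
	if (stored_len > coding_len + h_size)
		skb_len = coding_len + h_size;

	assert(skb_len == coding_len + h_size);
	return 0;
}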
-
-/**
- * batadv_nc_find_decoding_packet() - search through buffered decoding data to
- * find the data needed to decode the coded packet
- * @bat_priv: the bat priv with all the mesh interface information
- * @ethhdr: pointer to the ethernet header inside the coded packet
- * @coded: coded packet we try to find decode data for
- *
- * Return: pointer to nc packet if the needed data was found or NULL otherwise.
- */
-static struct batadv_nc_packet *
-batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv,
- struct ethhdr *ethhdr,
- struct batadv_coded_packet *coded)
-{
- struct batadv_hashtable *hash = bat_priv->nc.decoding_hash;
- struct batadv_nc_packet *tmp_nc_packet, *nc_packet = NULL;
- struct batadv_nc_path *nc_path, nc_path_key;
- u8 *dest, *source;
- __be32 packet_id;
- int index;
-
- if (!hash)
- return NULL;
-
- /* Select the correct packet id based on the location of our mac-addr */
- dest = ethhdr->h_source;
- if (!batadv_is_my_mac(bat_priv, coded->second_dest)) {
- source = coded->second_source;
- packet_id = coded->second_crc;
- } else {
- source = coded->first_source;
- packet_id = coded->first_crc;
- }
-
- batadv_nc_hash_key_gen(&nc_path_key, source, dest);
- index = batadv_nc_hash_choose(&nc_path_key, hash->size);
-
- /* Search for matching coding path */
- rcu_read_lock();
- hlist_for_each_entry_rcu(nc_path, &hash->table[index], hash_entry) {
- /* Find matching nc_packet */
- spin_lock_bh(&nc_path->packet_list_lock);
- list_for_each_entry(tmp_nc_packet,
- &nc_path->packet_list, list) {
- if (packet_id == tmp_nc_packet->packet_id) {
- list_del(&tmp_nc_packet->list);
-
- nc_packet = tmp_nc_packet;
- break;
- }
- }
- spin_unlock_bh(&nc_path->packet_list_lock);
-
- if (nc_packet)
- break;
- }
- rcu_read_unlock();
-
- if (!nc_packet)
- batadv_dbg(BATADV_DBG_NC, bat_priv,
- "No decoding packet found for %u\n", packet_id);
-
- return nc_packet;
-}
-
-/**
- * batadv_nc_recv_coded_packet() - try to decode coded packet and enqueue the
- * resulting unicast packet
- * @skb: incoming coded packet
- * @recv_if: pointer to interface this packet was received on
- *
- * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
- * otherwise.
- */
-static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
- struct batadv_hard_iface *recv_if)
-{
- struct batadv_priv *bat_priv = netdev_priv(recv_if->mesh_iface);
- struct batadv_unicast_packet *unicast_packet;
- struct batadv_coded_packet *coded_packet;
- struct batadv_nc_packet *nc_packet;
- struct ethhdr *ethhdr;
- int hdr_size = sizeof(*coded_packet);
-
- /* Check if network coding is enabled */
- if (!atomic_read(&bat_priv->network_coding))
- goto free_skb;
-
- /* Make sure we can access (and remove) header */
- if (unlikely(!pskb_may_pull(skb, hdr_size)))
- goto free_skb;
-
- coded_packet = (struct batadv_coded_packet *)skb->data;
- ethhdr = eth_hdr(skb);
-
- /* Verify frame is destined for us */
- if (!batadv_is_my_mac(bat_priv, ethhdr->h_dest) &&
- !batadv_is_my_mac(bat_priv, coded_packet->second_dest))
- goto free_skb;
-
- /* Update stat counter */
- if (batadv_is_my_mac(bat_priv, coded_packet->second_dest))
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_SNIFFED);
-
- nc_packet = batadv_nc_find_decoding_packet(bat_priv, ethhdr,
- coded_packet);
- if (!nc_packet) {
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED);
- goto free_skb;
- }
-
- /* Make skbs linear, because decoding accesses the entire buffer */
- if (skb_linearize(skb) < 0)
- goto free_nc_packet;
-
- if (skb_linearize(nc_packet->skb) < 0)
- goto free_nc_packet;
-
- /* Decode the packet */
- unicast_packet = batadv_nc_skb_decode_packet(bat_priv, skb, nc_packet);
- if (!unicast_packet) {
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE_FAILED);
- goto free_nc_packet;
- }
-
- /* Mark packet as decoded to do correct recoding when forwarding */
- BATADV_SKB_CB(skb)->decoded = true;
- batadv_inc_counter(bat_priv, BATADV_CNT_NC_DECODE);
- batadv_add_counter(bat_priv, BATADV_CNT_NC_DECODE_BYTES,
- skb->len + ETH_HLEN);
- return batadv_recv_unicast_packet(skb, recv_if);
-
-free_nc_packet:
- batadv_nc_packet_free(nc_packet, true);
-free_skb:
- kfree_skb(skb);
-
- return NET_RX_DROP;
-}
-
-/**
- * batadv_nc_mesh_free() - clean up network coding memory
- * @bat_priv: the bat priv with all the mesh interface information
- */
-void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
-{
- batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_NC, 1);
- batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_NC, 1);
- cancel_delayed_work_sync(&bat_priv->nc.work);
-
- batadv_nc_purge_paths(bat_priv, bat_priv->nc.coding_hash, NULL);
- batadv_hash_destroy(bat_priv->nc.coding_hash);
- batadv_nc_purge_paths(bat_priv, bat_priv->nc.decoding_hash, NULL);
- batadv_hash_destroy(bat_priv->nc.decoding_hash);
-}
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
deleted file mode 100644
index 368cc3130e4c..000000000000
--- a/net/batman-adv/network-coding.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/* Copyright (C) B.A.T.M.A.N. contributors:
- *
- * Martin Hundebøll, Jeppe Ledet-Pedersen
- */
-
-#ifndef _NET_BATMAN_ADV_NETWORK_CODING_H_
-#define _NET_BATMAN_ADV_NETWORK_CODING_H_
-
-#include "main.h"
-
-#include <linux/netdevice.h>
-#include <linux/skbuff.h>
-#include <linux/types.h>
-#include <uapi/linux/batadv_packet.h>
-
-#ifdef CONFIG_BATMAN_ADV_NC
-
-void batadv_nc_status_update(struct net_device *net_dev);
-int batadv_nc_init(void);
-int batadv_nc_mesh_init(struct batadv_priv *bat_priv);
-void batadv_nc_mesh_free(struct batadv_priv *bat_priv);
-void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh_node,
- struct batadv_ogm_packet *ogm_packet,
- int is_single_hop_neigh);
-void batadv_nc_purge_orig(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- bool (*to_purge)(struct batadv_priv *,
- struct batadv_nc_node *));
-void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv);
-void batadv_nc_init_orig(struct batadv_orig_node *orig_node);
-bool batadv_nc_skb_forward(struct sk_buff *skb,
- struct batadv_neigh_node *neigh_node);
-void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
- struct sk_buff *skb);
-void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
- struct sk_buff *skb);
-
-#else /* ifdef CONFIG_BATMAN_ADV_NC */
-
-static inline void batadv_nc_status_update(struct net_device *net_dev)
-{
-}
-
-static inline int batadv_nc_init(void)
-{
- return 0;
-}
-
-static inline int batadv_nc_mesh_init(struct batadv_priv *bat_priv)
-{
- return 0;
-}
-
-static inline void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
-{
-}
-
-static inline void
-batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- struct batadv_orig_node *orig_neigh_node,
- struct batadv_ogm_packet *ogm_packet,
- int is_single_hop_neigh)
-{
-}
-
-static inline void
-batadv_nc_purge_orig(struct batadv_priv *bat_priv,
- struct batadv_orig_node *orig_node,
- bool (*to_purge)(struct batadv_priv *,
- struct batadv_nc_node *))
-{
-}
-
-static inline void batadv_nc_init_bat_priv(struct batadv_priv *bat_priv)
-{
-}
-
-static inline void batadv_nc_init_orig(struct batadv_orig_node *orig_node)
-{
-}
-
-static inline bool batadv_nc_skb_forward(struct sk_buff *skb,
- struct batadv_neigh_node *neigh_node)
-{
- return false;
-}
-
-static inline void
-batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
- struct sk_buff *skb)
-{
-}
-
-static inline void
-batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
- struct sk_buff *skb)
-{
-}
-
-#endif /* ifdef CONFIG_BATMAN_ADV_NC */
-
-#endif /* _NET_BATMAN_ADV_NETWORK_CODING_H_ */
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index a464ff96b929..c84420cb410d 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -37,7 +37,6 @@
#include "log.h"
#include "multicast.h"
#include "netlink.h"
-#include "network-coding.h"
#include "routing.h"
#include "translation-table.h"
@@ -883,9 +882,6 @@ void batadv_orig_node_release(struct kref *ref)
}
spin_unlock_bh(&orig_node->vlan_list_lock);
- /* Free nc_nodes */
- batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL);
-
call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu);
}
@@ -959,8 +955,6 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
spin_lock_init(&orig_node->tt_lock);
spin_lock_init(&orig_node->vlan_list_lock);
- batadv_nc_init_orig(orig_node);
-
/* extra reference for return */
kref_init(&orig_node->refcount);
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 35d8c5783999..12c16f81cc51 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -31,7 +31,6 @@
#include "hard-interface.h"
#include "log.h"
#include "mesh-interface.h"
-#include "network-coding.h"
#include "originator.h"
#include "send.h"
#include "tp_meter.h"
@@ -956,15 +955,9 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
/* function returns -EREMOTE for promiscuous packets */
check = batadv_check_unicast_packet(bat_priv, skb, hdr_size);
-
- /* Even though the packet is not for us, we might save it to use for
- * decoding a later received coded packet
- */
- if (check == -EREMOTE)
- batadv_nc_skb_store_sniffed_unicast(bat_priv, skb);
-
if (check < 0)
goto free_skb;
+
if (!batadv_check_unicast_ttvn(bat_priv, skb, hdr_size))
goto free_skb;
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index 95849ba004e7..20d85c681064 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -34,7 +34,6 @@
#include "hard-interface.h"
#include "log.h"
#include "mesh-interface.h"
-#include "network-coding.h"
#include "originator.h"
#include "routing.h"
#include "translation-table.h"
@@ -63,12 +62,9 @@ int batadv_send_skb_packet(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface,
const u8 *dst_addr)
{
- struct batadv_priv *bat_priv;
struct ethhdr *ethhdr;
int ret;
- bat_priv = netdev_priv(hard_iface->mesh_iface);
-
if (hard_iface->if_status != BATADV_IF_ACTIVE)
goto send_skb_err;
@@ -97,9 +93,6 @@ int batadv_send_skb_packet(struct sk_buff *skb,
skb->dev = hard_iface->net_dev;
- /* Save a clone of the skb to use when decoding coded packets */
- batadv_nc_skb_store_for_decoding(bat_priv, skb);
-
/* dev_queue_xmit() returns a negative result on error. However on
* congestion and traffic shaping, it drops and returns NET_XMIT_DROP
* (which is > 0). This will not be treated as an error.
@@ -202,14 +195,7 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
goto put_neigh_node;
}
- /* try to network code the packet, if it is received on an interface
- * (i.e. being forwarded). If the packet originates from this node or if
- * network coding fails, then send the packet as usual.
- */
- if (recv_if && batadv_nc_skb_forward(skb, neigh_node))
- ret = -EINPROGRESS;
- else
- ret = batadv_send_unicast_skb(skb, neigh_node);
+ ret = batadv_send_unicast_skb(skb, neigh_node);
/* skb was consumed */
skb = NULL;
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 8d0e04e770cb..6e95e883c2bf 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -212,7 +212,7 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
/**
* batadv_tt_local_entry_release() - release tt_local_entry from lists and queue
* for free after rcu grace period
- * @ref: kref pointer of the nc_node
+ * @ref: kref pointer of the batadv_tt_local_entry
*/
static void batadv_tt_local_entry_release(struct kref *ref)
{
@@ -244,7 +244,7 @@ batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry)
/**
* batadv_tt_global_entry_release() - release tt_global_entry from lists and
* queue for free after rcu grace period
- * @ref: kref pointer of the nc_node
+ * @ref: kref pointer of the batadv_tt_global_entry
*/
void batadv_tt_global_entry_release(struct kref *ref)
{
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index 0ca0fc072fc9..ae1d7a8dc480 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -505,20 +505,6 @@ struct batadv_orig_node {
/** @rcu: struct used for freeing in an RCU-safe manner */
struct rcu_head rcu;
-#ifdef CONFIG_BATMAN_ADV_NC
- /** @in_coding_list: list of nodes this orig can hear */
- struct list_head in_coding_list;
-
- /** @out_coding_list: list of nodes that can hear this orig */
- struct list_head out_coding_list;
-
- /** @in_coding_list_lock: protects in_coding_list */
- spinlock_t in_coding_list_lock;
-
- /** @out_coding_list_lock: protects out_coding_list */
- spinlock_t out_coding_list_lock;
-#endif
-
/** @fragments: array with heads for fragment chains */
struct batadv_frag_table_entry fragments[BATADV_FRAG_BUFFER_COUNT];
@@ -545,9 +531,6 @@ enum batadv_orig_capabilities {
*/
BATADV_ORIG_CAPA_HAS_DAT,
- /** @BATADV_ORIG_CAPA_HAS_NC: orig node has network coding enabled */
- BATADV_ORIG_CAPA_HAS_NC,
-
/** @BATADV_ORIG_CAPA_HAS_TT: orig node has tt capability */
BATADV_ORIG_CAPA_HAS_TT,
@@ -953,60 +936,6 @@ enum batadv_counters {
BATADV_CNT_DAT_CACHED_REPLY_TX,
#endif
-#ifdef CONFIG_BATMAN_ADV_NC
- /**
- * @BATADV_CNT_NC_CODE: transmitted nc-combined traffic packet counter
- */
- BATADV_CNT_NC_CODE,
-
- /**
- * @BATADV_CNT_NC_CODE_BYTES: transmitted nc-combined traffic bytes
- * counter
- */
- BATADV_CNT_NC_CODE_BYTES,
-
- /**
- * @BATADV_CNT_NC_RECODE: transmitted nc-recombined traffic packet
- * counter
- */
- BATADV_CNT_NC_RECODE,
-
- /**
- * @BATADV_CNT_NC_RECODE_BYTES: transmitted nc-recombined traffic bytes
- * counter
- */
- BATADV_CNT_NC_RECODE_BYTES,
-
- /**
- * @BATADV_CNT_NC_BUFFER: counter for packets buffered for later nc
- * decoding
- */
- BATADV_CNT_NC_BUFFER,
-
- /**
- * @BATADV_CNT_NC_DECODE: received and nc-decoded traffic packet counter
- */
- BATADV_CNT_NC_DECODE,
-
- /**
- * @BATADV_CNT_NC_DECODE_BYTES: received and nc-decoded traffic bytes
- * counter
- */
- BATADV_CNT_NC_DECODE_BYTES,
-
- /**
- * @BATADV_CNT_NC_DECODE_FAILED: received and decode-failed traffic
- * packet counter
- */
- BATADV_CNT_NC_DECODE_FAILED,
-
- /**
- * @BATADV_CNT_NC_SNIFFED: counter for nc-decoded packets received in
- * promisc mode.
- */
- BATADV_CNT_NC_SNIFFED,
-#endif
-
/** @BATADV_CNT_NUM: number of traffic counters */
BATADV_CNT_NUM,
};
@@ -1340,56 +1269,6 @@ struct batadv_priv_mcast {
#endif
/**
- * struct batadv_priv_nc - per mesh interface network coding private data
- */
-struct batadv_priv_nc {
- /** @work: work queue callback item for cleanup */
- struct delayed_work work;
-
- /**
- * @min_tq: only consider neighbors for encoding if neigh_tq > min_tq
- */
- u8 min_tq;
-
- /**
- * @max_fwd_delay: maximum packet forward delay to allow coding of
- * packets
- */
- u32 max_fwd_delay;
-
- /**
- * @max_buffer_time: buffer time for sniffed packets used for decoding
- */
- u32 max_buffer_time;
-
- /**
- * @timestamp_fwd_flush: timestamp of last forward packet queue flush
- */
- unsigned long timestamp_fwd_flush;
-
- /**
- * @timestamp_sniffed_purge: timestamp of last sniffed packet queue
- * purge
- */
- unsigned long timestamp_sniffed_purge;
-
- /**
- * @coding_hash: Hash table used to buffer skbs while waiting for
- * another incoming skb to code it with. Skbs are added to the buffer
- * just before being forwarded in routing.c
- */
- struct batadv_hashtable *coding_hash;
-
- /**
- * @decoding_hash: Hash table used to buffer skbs that might be needed
- * to decode a received coded skb. The buffer is used for 1) skbs
- * arriving on the mesh-interface; 2) skbs overheard on the
- * hard-interface; and 3) skbs forwarded by batman-adv.
- */
- struct batadv_hashtable *decoding_hash;
-};
-
-/**
* struct batadv_tp_unacked - unacked packet meta-information
*
* This struct is supposed to represent a buffer unacked packet. However, since
@@ -1775,16 +1654,6 @@ struct batadv_priv {
struct batadv_priv_mcast mcast;
#endif
-#ifdef CONFIG_BATMAN_ADV_NC
- /**
- * @network_coding: bool indicating whether network coding is enabled
- */
- atomic_t network_coding;
-
- /** @nc: network coding data */
- struct batadv_priv_nc nc;
-#endif /* CONFIG_BATMAN_ADV_NC */
-
#ifdef CONFIG_BATMAN_ADV_BATMAN_V
/** @bat_v: B.A.T.M.A.N. V per mesh-interface private data */
struct batadv_priv_bat_v bat_v;
@@ -2017,95 +1886,10 @@ struct batadv_tt_roam_node {
};
/**
- * struct batadv_nc_node - network coding node
- */
-struct batadv_nc_node {
- /** @list: next and prev pointer for the list handling */
- struct list_head list;
-
- /** @addr: the node's mac address */
- u8 addr[ETH_ALEN];
-
- /** @refcount: number of contexts the object is used by */
- struct kref refcount;
-
- /** @rcu: struct used for freeing in an RCU-safe manner */
- struct rcu_head rcu;
-
- /** @orig_node: pointer to corresponding orig node struct */
- struct batadv_orig_node *orig_node;
-
- /** @last_seen: timestamp of last ogm received from this node */
- unsigned long last_seen;
-};
-
-/**
- * struct batadv_nc_path - network coding path
- */
-struct batadv_nc_path {
- /** @hash_entry: next and prev pointer for the list handling */
- struct hlist_node hash_entry;
-
- /** @rcu: struct used for freeing in an RCU-safe manner */
- struct rcu_head rcu;
-
- /** @refcount: number of contexts the object is used by */
- struct kref refcount;
-
- /** @packet_list: list of buffered packets for this path */
- struct list_head packet_list;
-
- /** @packet_list_lock: access lock for packet list */
- spinlock_t packet_list_lock;
-
- /** @next_hop: next hop (destination) of path */
- u8 next_hop[ETH_ALEN];
-
- /** @prev_hop: previous hop (source) of path */
- u8 prev_hop[ETH_ALEN];
-
- /** @last_valid: timestamp for last validation of path */
- unsigned long last_valid;
-};
-
-/**
- * struct batadv_nc_packet - network coding packet used when coding and
- * decoding packets
- */
-struct batadv_nc_packet {
- /** @list: next and prev pointer for the list handling */
- struct list_head list;
-
- /** @packet_id: crc32 checksum of skb data */
- __be32 packet_id;
-
- /**
- * @timestamp: field containing the info when the packet was added to
- * path
- */
- unsigned long timestamp;
-
- /** @neigh_node: pointer to original next hop neighbor of skb */
- struct batadv_neigh_node *neigh_node;
-
- /** @skb: skb which can be encoded or used for decoding */
- struct sk_buff *skb;
-
- /** @nc_path: pointer to path this nc packet is attached to */
- struct batadv_nc_path *nc_path;
-};
-
-/**
* struct batadv_skb_cb - control buffer structure used to store private data
* relevant to batman-adv in the skb->cb buffer in skbs.
*/
struct batadv_skb_cb {
- /**
- * @decoded: Marks a skb as decoded, which is checked when searching for
- * coding opportunities in network-coding.c
- */
- unsigned char decoded:1;
-
/** @num_bcasts: Counter for broadcast packet retransmissions */
unsigned char num_bcasts;
};
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index e524bb59bff2..111f0e37b672 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -924,10 +924,9 @@ static struct hci_conn *__hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t
case CIS_LINK:
case BIS_LINK:
case PA_LINK:
- if (hdev->iso_mtu)
- /* Dedicated ISO Buffer exists */
- break;
- fallthrough;
+ if (!hdev->iso_mtu)
+ return ERR_PTR(-ECONNREFUSED);
+ break;
case LE_LINK:
if (hdev->le_mtu && hdev->le_mtu < HCI_MIN_LE_MTU)
return ERR_PTR(-ECONNREFUSED);
@@ -1540,7 +1539,7 @@ static int qos_set_bis(struct hci_dev *hdev, struct bt_iso_qos *qos)
/* This function requires the caller holds hdev->lock */
static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst,
__u8 sid, struct bt_iso_qos *qos,
- __u8 base_len, __u8 *base)
+ __u8 base_len, __u8 *base, u16 timeout)
{
struct hci_conn *conn;
int err;
@@ -1582,6 +1581,7 @@ static struct hci_conn *hci_add_bis(struct hci_dev *hdev, bdaddr_t *dst,
conn->state = BT_CONNECT;
conn->sid = sid;
+ conn->conn_timeout = timeout;
hci_conn_hold(conn);
return conn;
@@ -1922,7 +1922,8 @@ done:
}
struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
- __u8 dst_type, struct bt_iso_qos *qos)
+ __u8 dst_type, struct bt_iso_qos *qos,
+ u16 timeout)
{
struct hci_conn *cis;
@@ -1937,6 +1938,7 @@ struct hci_conn *hci_bind_cis(struct hci_dev *hdev, bdaddr_t *dst,
cis->dst_type = dst_type;
cis->iso_qos.ucast.cig = BT_ISO_QOS_CIG_UNSET;
cis->iso_qos.ucast.cis = BT_ISO_QOS_CIS_UNSET;
+ cis->conn_timeout = timeout;
}
if (cis->state == BT_CONNECTED)
@@ -2176,7 +2178,7 @@ static void create_big_complete(struct hci_dev *hdev, void *data, int err)
struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid,
struct bt_iso_qos *qos,
- __u8 base_len, __u8 *base)
+ __u8 base_len, __u8 *base, u16 timeout)
{
struct hci_conn *conn;
struct hci_conn *parent;
@@ -2197,7 +2199,7 @@ struct hci_conn *hci_bind_bis(struct hci_dev *hdev, bdaddr_t *dst, __u8 sid,
base, base_len);
/* We need hci_conn object using the BDADDR_ANY as dst */
- conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir);
+ conn = hci_add_bis(hdev, dst, sid, qos, base_len, eir, timeout);
if (IS_ERR(conn))
return conn;
@@ -2250,13 +2252,13 @@ static void bis_mark_per_adv(struct hci_conn *conn, void *data)
struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
__u8 dst_type, __u8 sid,
struct bt_iso_qos *qos,
- __u8 base_len, __u8 *base)
+ __u8 base_len, __u8 *base, u16 timeout)
{
struct hci_conn *conn;
int err;
struct iso_list_data data;
- conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base);
+ conn = hci_bind_bis(hdev, dst, sid, qos, base_len, base, timeout);
if (IS_ERR(conn))
return conn;
@@ -2299,7 +2301,8 @@ struct hci_conn *hci_connect_bis(struct hci_dev *hdev, bdaddr_t *dst,
}
struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
- __u8 dst_type, struct bt_iso_qos *qos)
+ __u8 dst_type, struct bt_iso_qos *qos,
+ u16 timeout)
{
struct hci_conn *le;
struct hci_conn *cis;
@@ -2323,7 +2326,7 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst,
hci_iso_qos_setup(hdev, le, &qos->ucast.in,
le->le_rx_phy ? le->le_rx_phy : hdev->le_rx_def_phys);
- cis = hci_bind_cis(hdev, dst, dst_type, qos);
+ cis = hci_bind_cis(hdev, dst, dst_type, qos, timeout);
if (IS_ERR(cis)) {
hci_conn_drop(le);
return cis;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 55e0722fd066..3418d7b964a1 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -3267,6 +3267,8 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue,
spin_unlock_bh(&queue->lock);
}
+
+ bt_dev_dbg(hdev, "chan %p queued %d", chan, skb_queue_len(queue));
}
void hci_send_acl(struct hci_chan *chan, struct sk_buff *skb, __u16 flags)
@@ -3298,6 +3300,10 @@ void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb)
hci_skb_pkt_type(skb) = HCI_SCODATA_PKT;
skb_queue_tail(&conn->data_q, skb);
+
+ bt_dev_dbg(hdev, "hcon %p queued %d", conn,
+ skb_queue_len(&conn->data_q));
+
queue_work(hdev->workqueue, &hdev->tx_work);
}
@@ -3357,6 +3363,8 @@ static void hci_queue_iso(struct hci_conn *conn, struct sk_buff_head *queue,
__skb_queue_tail(queue, skb);
} while (list);
}
+
+ bt_dev_dbg(hdev, "hcon %p queued %d", conn, skb_queue_len(queue));
}
void hci_send_iso(struct hci_conn *conn, struct sk_buff *skb)
@@ -3399,8 +3407,7 @@ static inline void hci_quote_sent(struct hci_conn *conn, int num, int *quote)
case CIS_LINK:
case BIS_LINK:
case PA_LINK:
- cnt = hdev->iso_mtu ? hdev->iso_cnt :
- hdev->le_mtu ? hdev->le_cnt : hdev->acl_cnt;
+ cnt = hdev->iso_cnt;
break;
default:
cnt = 0;
@@ -3428,6 +3435,10 @@ static struct hci_conn *hci_low_sent(struct hci_dev *hdev, __u8 type,
skb_queue_empty(&c->data_q))
continue;
+ bt_dev_dbg(hdev, "hcon %p state %s queued %d", c,
+ state_to_string(c->state),
+ skb_queue_len(&c->data_q));
+
if (c->state != BT_CONNECTED && c->state != BT_CONFIG)
continue;
@@ -3586,24 +3597,37 @@ static void hci_prio_recalculate(struct hci_dev *hdev, __u8 type)
static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type)
{
- unsigned long last_tx;
+ unsigned long timeout;
if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED))
return;
switch (type) {
+ case ACL_LINK:
+ /* tx timeout must be longer than maximum link supervision
+ * timeout (40.9 seconds)
+ */
+ timeout = hdev->acl_last_tx + HCI_ACL_TX_TIMEOUT;
+ break;
case LE_LINK:
- last_tx = hdev->le_last_tx;
+ /* tx timeout must be longer than maximum link supervision
+ * timeout (40.9 seconds)
+ */
+ timeout = hdev->le_last_tx + HCI_ACL_TX_TIMEOUT;
break;
- default:
- last_tx = hdev->acl_last_tx;
+ case CIS_LINK:
+ case BIS_LINK:
+ case PA_LINK:
+ /* tx timeout must be longer than the maximum transport latency
+ * (8.388607 seconds)
+ */
+ timeout = hdev->iso_last_tx + HCI_ISO_TX_TIMEOUT;
break;
+ default:
+ return;
}
- /* tx timeout must be longer than maximum link supervision timeout
- * (40.9 seconds)
- */
- if (!cnt && time_after(jiffies, last_tx + HCI_ACL_TX_TIMEOUT))
+ if (!cnt && time_after(jiffies, timeout))
hci_link_tx_to(hdev, type);
}
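
The rewritten __check_timeout() applies one pattern to every link type: declare a stall only when no controller buffer credits remain and a jiffies deadline derived from the last TX has passed. A userspace sketch of that predicate (toy timeout constant; the kernel's time_after() additionally carries type checks omitted here):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins for jiffies and the kernel's time_after(), which is
 * overflow-safe because it compares via signed subtraction.
 */
static unsigned long jiffies;
#define time_after(a, b) ((long)((b) - (a)) < 0)

#define TX_TIMEOUT 100	/* illustrative; HCI uses per-link-type values */

static bool link_stalled(unsigned int cnt, unsigned long last_tx)
{
	/* only a stall if no credits remain AND the deadline passed */
	return !cnt && time_after(jiffies, last_tx + TX_TIMEOUT);
}

int main(void)
{
	unsigned long last_tx = 0;

	jiffies = 150;
	printf("no credits, past deadline:   %d\n", link_stalled(0, last_tx));
	printf("credits left, past deadline: %d\n", link_stalled(3, last_tx));
	return 0;
}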
@@ -3759,12 +3783,16 @@ static void hci_sched_iso(struct hci_dev *hdev, __u8 type)
if (!hci_conn_num(hdev, type))
return;
- cnt = hdev->iso_pkts ? &hdev->iso_cnt :
- hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt;
+ cnt = &hdev->iso_cnt;
+
+ __check_timeout(hdev, *cnt, type);
+
while (*cnt && (conn = hci_low_sent(hdev, type, &quote))) {
while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
BT_DBG("skb %p len %d", skb, skb->len);
+
hci_send_conn_frame(hdev, conn, skb);
+ hdev->iso_last_tx = jiffies;
conn->sent++;
if (conn->sent == ~0)
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index fe49e8a7969f..d790b0d4eb9a 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -4461,19 +4461,9 @@ static void hci_num_comp_pkts_evt(struct hci_dev *hdev, void *data,
case CIS_LINK:
case BIS_LINK:
case PA_LINK:
- if (hdev->iso_pkts) {
- hdev->iso_cnt += count;
- if (hdev->iso_cnt > hdev->iso_pkts)
- hdev->iso_cnt = hdev->iso_pkts;
- } else if (hdev->le_pkts) {
- hdev->le_cnt += count;
- if (hdev->le_cnt > hdev->le_pkts)
- hdev->le_cnt = hdev->le_pkts;
- } else {
- hdev->acl_cnt += count;
- if (hdev->acl_cnt > hdev->acl_pkts)
- hdev->acl_cnt = hdev->acl_pkts;
- }
+ hdev->iso_cnt += count;
+ if (hdev->iso_cnt > hdev->iso_pkts)
+ hdev->iso_cnt = hdev->iso_pkts;
break;
default:
diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c
index 7a7d49890858..eefdb6134ca5 100644
--- a/net/bluetooth/hci_sync.c
+++ b/net/bluetooth/hci_sync.c
@@ -1325,7 +1325,7 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance)
{
struct hci_cp_le_set_ext_adv_params cp;
struct hci_rp_le_set_ext_adv_params rp;
- bool connectable;
+ bool connectable, require_privacy;
u32 flags;
bdaddr_t random_addr;
u8 own_addr_type;
@@ -1363,10 +1363,12 @@ int hci_setup_ext_adv_instance_sync(struct hci_dev *hdev, u8 instance)
return -EPERM;
/* Set require_privacy to true only when non-connectable
- * advertising is used. In that case it is fine to use a
- * non-resolvable private address.
+ * advertising is used and it is not periodic.
+ * In that case it is fine to use a non-resolvable private address.
*/
- err = hci_get_random_address(hdev, !connectable,
+ require_privacy = !connectable && !(adv && adv->periodic);
+
+ err = hci_get_random_address(hdev, require_privacy,
adv_use_rpa(hdev, flags), adv,
&own_addr_type, &random_addr);
if (err < 0)
diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c
index 5ce823ca3aaf..9b263d061e05 100644
--- a/net/bluetooth/iso.c
+++ b/net/bluetooth/iso.c
@@ -91,8 +91,8 @@ static struct sock *iso_get_sock(bdaddr_t *src, bdaddr_t *dst,
iso_sock_match_t match, void *data);
/* ---- ISO timers ---- */
-#define ISO_CONN_TIMEOUT (HZ * 40)
-#define ISO_DISCONN_TIMEOUT (HZ * 2)
+#define ISO_CONN_TIMEOUT secs_to_jiffies(20)
+#define ISO_DISCONN_TIMEOUT secs_to_jiffies(2)
static void iso_conn_free(struct kref *ref)
{
@@ -111,6 +111,8 @@ static void iso_conn_free(struct kref *ref)
/* Ensure no more work items will run since hci_conn has been dropped */
disable_delayed_work_sync(&conn->timeout_work);
+ kfree_skb(conn->rx_skb);
+
kfree(conn);
}
@@ -367,7 +369,8 @@ static int iso_connect_bis(struct sock *sk)
if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
hcon = hci_bind_bis(hdev, &iso_pi(sk)->dst, iso_pi(sk)->bc_sid,
&iso_pi(sk)->qos, iso_pi(sk)->base_len,
- iso_pi(sk)->base);
+ iso_pi(sk)->base,
+ READ_ONCE(sk->sk_sndtimeo));
if (IS_ERR(hcon)) {
err = PTR_ERR(hcon);
goto unlock;
@@ -376,7 +379,8 @@ static int iso_connect_bis(struct sock *sk)
hcon = hci_connect_bis(hdev, &iso_pi(sk)->dst,
le_addr_type(iso_pi(sk)->dst_type),
iso_pi(sk)->bc_sid, &iso_pi(sk)->qos,
- iso_pi(sk)->base_len, iso_pi(sk)->base);
+ iso_pi(sk)->base_len, iso_pi(sk)->base,
+ READ_ONCE(sk->sk_sndtimeo));
if (IS_ERR(hcon)) {
err = PTR_ERR(hcon);
goto unlock;
@@ -458,11 +462,19 @@ static int iso_connect_cis(struct sock *sk)
goto unlock;
}
+ /* Check if there are available buffers for output/TX. */
+ if (iso_pi(sk)->qos.ucast.out.sdu && !hci_iso_count(hdev) &&
+ (hdev->iso_pkts && !hdev->iso_cnt)) {
+ err = -ENOBUFS;
+ goto unlock;
+ }
+
/* Just bind if DEFER_SETUP has been set */
if (test_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) {
hcon = hci_bind_cis(hdev, &iso_pi(sk)->dst,
le_addr_type(iso_pi(sk)->dst_type),
- &iso_pi(sk)->qos);
+ &iso_pi(sk)->qos,
+ READ_ONCE(sk->sk_sndtimeo));
if (IS_ERR(hcon)) {
err = PTR_ERR(hcon);
goto unlock;
@@ -470,7 +482,8 @@ static int iso_connect_cis(struct sock *sk)
} else {
hcon = hci_connect_cis(hdev, &iso_pi(sk)->dst,
le_addr_type(iso_pi(sk)->dst_type),
- &iso_pi(sk)->qos);
+ &iso_pi(sk)->qos,
+ READ_ONCE(sk->sk_sndtimeo));
if (IS_ERR(hcon)) {
err = PTR_ERR(hcon);
goto unlock;
@@ -750,6 +763,13 @@ static void iso_sock_kill(struct sock *sk)
BT_DBG("sk %p state %d", sk, sk->sk_state);
+ /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */
+ if (iso_pi(sk)->conn) {
+ iso_conn_lock(iso_pi(sk)->conn);
+ iso_pi(sk)->conn->sk = NULL;
+ iso_conn_unlock(iso_pi(sk)->conn);
+ }
+
/* Kill poor orphan */
bt_sock_unlink(&iso_sk_list, sk);
sock_set_flag(sk, SOCK_DEAD);
@@ -2407,7 +2427,7 @@ void iso_recv(struct hci_conn *hcon, struct sk_buff *skb, u16 flags)
skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
skb->len);
conn->rx_len -= skb->len;
- return;
+ break;
case ISO_END:
skb_copy_from_linear_data(skb, skb_put(conn->rx_skb, skb->len),
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 225140fcb3d6..a3d16eece0d2 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -4542,13 +4542,11 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev,
return -ENOMEM;
#ifdef CONFIG_BT_FEATURE_DEBUG
- if (!hdev) {
- flags = bt_dbg_get() ? BIT(0) : 0;
+ flags = bt_dbg_get() ? BIT(0) : 0;
- memcpy(rp->features[idx].uuid, debug_uuid, 16);
- rp->features[idx].flags = cpu_to_le32(flags);
- idx++;
- }
+ memcpy(rp->features[idx].uuid, debug_uuid, 16);
+ rp->features[idx].flags = cpu_to_le32(flags);
+ idx++;
#endif
if (hdev && hci_dev_le_state_simultaneous(hdev)) {
diff --git a/net/bluetooth/mgmt_config.c b/net/bluetooth/mgmt_config.c
index 6ef701c27da4..c4063d200c0a 100644
--- a/net/bluetooth/mgmt_config.c
+++ b/net/bluetooth/mgmt_config.c
@@ -13,13 +13,13 @@
#define HDEV_PARAM_U16(_param_name_) \
struct {\
- struct mgmt_tlv entry; \
+ struct mgmt_tlv_hdr entry; \
__le16 value; \
} __packed _param_name_
#define HDEV_PARAM_U8(_param_name_) \
struct {\
- struct mgmt_tlv entry; \
+ struct mgmt_tlv_hdr entry; \
__u8 value; \
} __packed _param_name_
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index d382d980fd9a..ab0cf442d57b 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -498,6 +498,13 @@ static void sco_sock_kill(struct sock *sk)
BT_DBG("sk %p state %d", sk, sk->sk_state);
+ /* Sock is dead, so set conn->sk to NULL to avoid possible UAF */
+ if (sco_pi(sk)->conn) {
+ sco_conn_lock(sco_pi(sk)->conn);
+ sco_pi(sk)->conn->sk = NULL;
+ sco_conn_unlock(sco_pi(sk)->conn);
+ }
+
/* Kill poor orphan */
bt_sock_unlink(&sco_sk_list, sk);
sock_set_flag(sk, SOCK_DEAD);
diff --git a/net/bridge/br.c b/net/bridge/br.c
index c683baa3847f..c37e52e2f29a 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -37,6 +37,11 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v
int err;
if (netif_is_bridge_master(dev)) {
+ struct net_bridge *br = netdev_priv(dev);
+
+ if (event == NETDEV_REGISTER)
+ br_fdb_change_mac_address(br, dev->dev_addr);
+
err = br_vlan_bridge_event(dev, event, ptr);
if (err)
return notifier_from_errno(err);
@@ -259,6 +264,23 @@ static struct notifier_block br_switchdev_blocking_notifier = {
.notifier_call = br_switchdev_blocking_event,
};
+static int
+br_toggle_fdb_local_vlan_0(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ if (br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0) == on)
+ return 0;
+
+ err = br_fdb_toggle_local_vlan_0(br, on, extack);
+ if (err)
+ return err;
+
+ br_opt_toggle(br, BROPT_FDB_LOCAL_VLAN_0, on);
+ return 0;
+}
+
/* br_boolopt_toggle - change user-controlled boolean option
*
* @br: bridge device
@@ -287,6 +309,9 @@ int br_boolopt_toggle(struct net_bridge *br, enum br_boolopt_id opt, bool on,
case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION:
br_opt_toggle(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION, on);
break;
+ case BR_BOOLOPT_FDB_LOCAL_VLAN_0:
+ err = br_toggle_fdb_local_vlan_0(br, on, extack);
+ break;
default:
/* shouldn't be called with unsupported options */
WARN_ON(1);
@@ -307,6 +332,8 @@ int br_boolopt_get(const struct net_bridge *br, enum br_boolopt_id opt)
return br_opt_get(br, BROPT_MST_ENABLED);
case BR_BOOLOPT_MDB_OFFLOAD_FAIL_NOTIFICATION:
return br_opt_get(br, BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION);
+ case BR_BOOLOPT_FDB_LOCAL_VLAN_0:
+ return br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0);
default:
/* shouldn't be called with unsupported options */
WARN_ON(1);
diff --git a/net/bridge/br_cfm.c b/net/bridge/br_cfm.c
index a3c755d0a09d..c2c1c7d44c61 100644
--- a/net/bridge/br_cfm.c
+++ b/net/bridge/br_cfm.c
@@ -134,7 +134,7 @@ static void ccm_rx_timer_start(struct br_cfm_peer_mep *peer_mep)
* of the configured CC 'expected_interval'
* in order to detect CCM defects after 3.25 intervals.
*/
- queue_delayed_work(system_wq, &peer_mep->ccm_rx_dwork,
+ queue_delayed_work(system_percpu_wq, &peer_mep->ccm_rx_dwork,
usecs_to_jiffies(interval_us / 4));
}
@@ -285,7 +285,7 @@ static void ccm_tx_work_expired(struct work_struct *work)
ccm_frame_tx(skb);
interval_us = interval_to_us(mep->cc_config.exp_interval);
- queue_delayed_work(system_wq, &mep->ccm_tx_dwork,
+ queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork,
usecs_to_jiffies(interval_us));
}
@@ -809,7 +809,7 @@ int br_cfm_cc_ccm_tx(struct net_bridge *br, const u32 instance,
* to send first frame immediately
*/
mep->ccm_tx_end = jiffies + usecs_to_jiffies(tx_info->period * 1000000);
- queue_delayed_work(system_wq, &mep->ccm_tx_dwork, 0);
+ queue_delayed_work(system_percpu_wq, &mep->ccm_tx_dwork, 0);
save:
mep->cc_ccm_tx_info = *tx_info;
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 902694c0ce64..58d22e2b85fc 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -459,6 +459,9 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
struct net_bridge_fdb_entry *f;
struct net_bridge *br = p->br;
struct net_bridge_vlan *v;
+ bool local_vlan_0;
+
+ local_vlan_0 = br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0);
spin_lock_bh(&br->hash_lock);
vg = nbp_vlan_group(p);
@@ -468,11 +471,11 @@ void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr)
/* delete old one */
fdb_delete_local(br, p, f);
- /* if this port has no vlan information
- * configured, we can safely be done at
- * this point.
+ /* if this port has no vlan information configured, or
+ * local entries are only kept on VLAN 0, we can safely
+ * be done at this point.
*/
- if (!vg || !vg->num_vlans)
+ if (!vg || !vg->num_vlans || local_vlan_0)
goto insert;
}
}
@@ -481,7 +484,7 @@ insert:
/* insert new address, may fail if invalid address or dup. */
fdb_add_local(br, p, newaddr, 0);
- if (!vg || !vg->num_vlans)
+ if (!vg || !vg->num_vlans || local_vlan_0)
goto done;
/* Now add entries for every VLAN configured on the port.
@@ -500,6 +503,9 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
struct net_bridge_vlan_group *vg;
struct net_bridge_fdb_entry *f;
struct net_bridge_vlan *v;
+ bool local_vlan_0;
+
+ local_vlan_0 = br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0);
spin_lock_bh(&br->hash_lock);
@@ -511,7 +517,7 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
fdb_add_local(br, NULL, newaddr, 0);
vg = br_vlan_group(br);
- if (!vg || !vg->num_vlans)
+ if (!vg || !vg->num_vlans || local_vlan_0)
goto out;
/* Now remove and add entries for every VLAN configured on the
* bridge. This function runs under RTNL so the bitmap will not
@@ -576,6 +582,102 @@ void br_fdb_cleanup(struct work_struct *work)
mod_delayed_work(system_long_wq, &br->gc_work, work_delay);
}
+static void br_fdb_delete_locals_per_vlan_port(struct net_bridge *br,
+ struct net_bridge_port *p)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_device *dev;
+
+ if (p) {
+ vg = nbp_vlan_group(p);
+ dev = p->dev;
+ } else {
+ vg = br_vlan_group(br);
+ dev = br->dev;
+ }
+
+ list_for_each_entry(v, &vg->vlan_list, vlist)
+ br_fdb_find_delete_local(br, p, dev->dev_addr, v->vid);
+}
+
+static void br_fdb_delete_locals_per_vlan(struct net_bridge *br)
+{
+ struct net_bridge_port *p;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(p, &br->port_list, list)
+ br_fdb_delete_locals_per_vlan_port(br, p);
+
+ br_fdb_delete_locals_per_vlan_port(br, NULL);
+}
+
+static int br_fdb_insert_locals_per_vlan_port(struct net_bridge *br,
+ struct net_bridge_port *p,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_vlan_group *vg;
+ struct net_bridge_vlan *v;
+ struct net_device *dev;
+ int err;
+
+ if (p) {
+ vg = nbp_vlan_group(p);
+ dev = p->dev;
+ } else {
+ vg = br_vlan_group(br);
+ dev = br->dev;
+ }
+
+ list_for_each_entry(v, &vg->vlan_list, vlist) {
+ if (!br_vlan_should_use(v))
+ continue;
+
+ err = br_fdb_add_local(br, p, dev->dev_addr, v->vid);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int br_fdb_insert_locals_per_vlan(struct net_bridge *br,
+ struct netlink_ext_ack *extack)
+{
+ struct net_bridge_port *p;
+ int err;
+
+ ASSERT_RTNL();
+
+ list_for_each_entry(p, &br->port_list, list) {
+ err = br_fdb_insert_locals_per_vlan_port(br, p, extack);
+ if (err)
+ goto rollback;
+ }
+
+ err = br_fdb_insert_locals_per_vlan_port(br, NULL, extack);
+ if (err)
+ goto rollback;
+
+ return 0;
+
+rollback:
+ NL_SET_ERR_MSG_MOD(extack, "fdb_local_vlan_0 toggle: FDB entry insertion failed");
+ br_fdb_delete_locals_per_vlan(br);
+ return err;
+}
+
+int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack)
+{
+ if (!on)
+ return br_fdb_insert_locals_per_vlan(br, extack);
+
+ br_fdb_delete_locals_per_vlan(br);
+ return 0;
+}
+
static bool __fdb_flush_matches(const struct net_bridge *br,
const struct net_bridge_fdb_entry *f,
const struct net_bridge_fdb_flush_desc *desc)
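For orientation, bridge boolopts such as the new BR_BOOLOPT_FDB_LOCAL_VLAN_0 are toggled from userspace through the IFLA_BR_MULTI_BOOLOPT attribute. A hypothetical sketch of the payload (netlink message assembly omitted):

	#include <linux/if_bridge.h>

	struct br_boolopt_multi bm = {
		.optval  = 1 << BR_BOOLOPT_FDB_LOCAL_VLAN_0, /* desired value: on */
		.optmask = 1 << BR_BOOLOPT_FDB_LOCAL_VLAN_0, /* which bit to change */
	};
	/* nested under IFLA_LINKINFO -> IFLA_INFO_DATA in an RTM_NEWLINK request */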
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 29097e984b4f..870bdf2e082c 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -148,7 +148,8 @@ void br_forward(const struct net_bridge_port *to,
goto out;
/* redirect to backup link if the destination port is down */
- if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) {
+ if (rcu_access_pointer(to->backup_port) &&
+ (!netif_carrier_ok(to->dev) || !netif_running(to->dev))) {
struct net_bridge_port *backup_port;
backup_port = rcu_dereference(to->backup_port);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 5f6ac9bf1527..67b4c905e49a 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -202,6 +202,14 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
break;
case BR_PKT_UNICAST:
dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, vid);
+ if (unlikely(!dst && vid &&
+ br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0))) {
+ dst = br_fdb_find_rcu(br, eth_hdr(skb)->h_dest, 0);
+ if (dst &&
+ (!test_bit(BR_FDB_LOCAL, &dst->flags) ||
+ test_bit(BR_FDB_ADDED_BY_USER, &dst->flags)))
+ dst = NULL;
+ }
break;
default:
break;
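Condensed, the fallback rule above says a VLAN-0 hit may only substitute for a missing per-VLAN entry when it is a kernel-managed local entry. As a predicate (hypothetical helper, equivalent to the inline test in the hunk):

	static bool br_vlan0_fallback_ok(const struct net_bridge_fdb_entry *f)
	{
		return test_bit(BR_FDB_LOCAL, &f->flags) &&
		       !test_bit(BR_FDB_ADDED_BY_USER, &f->flags);
	}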
diff --git a/net/bridge/br_mrp.c b/net/bridge/br_mrp.c
index fd2de35ffb3c..3c36fa24bc05 100644
--- a/net/bridge/br_mrp.c
+++ b/net/bridge/br_mrp.c
@@ -341,7 +341,7 @@ static void br_mrp_test_work_expired(struct work_struct *work)
out:
rcu_read_unlock();
- queue_delayed_work(system_wq, &mrp->test_work,
+ queue_delayed_work(system_percpu_wq, &mrp->test_work,
usecs_to_jiffies(mrp->test_interval));
}
@@ -418,7 +418,7 @@ static void br_mrp_in_test_work_expired(struct work_struct *work)
out:
rcu_read_unlock();
- queue_delayed_work(system_wq, &mrp->in_test_work,
+ queue_delayed_work(system_percpu_wq, &mrp->in_test_work,
usecs_to_jiffies(mrp->in_test_interval));
}
@@ -725,7 +725,7 @@ int br_mrp_start_test(struct net_bridge *br,
mrp->test_max_miss = test->max_miss;
mrp->test_monitor = test->monitor;
mrp->test_count_miss = 0;
- queue_delayed_work(system_wq, &mrp->test_work,
+ queue_delayed_work(system_percpu_wq, &mrp->test_work,
usecs_to_jiffies(test->interval));
return 0;
@@ -865,7 +865,7 @@ int br_mrp_start_in_test(struct net_bridge *br,
mrp->in_test_end = jiffies + usecs_to_jiffies(in_test->period);
mrp->in_test_max_miss = in_test->max_miss;
mrp->in_test_count_miss = 0;
- queue_delayed_work(system_wq, &mrp->in_test_work,
+ queue_delayed_work(system_percpu_wq, &mrp->in_test_work,
usecs_to_jiffies(in_test->interval));
return 0;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 8ce145938b02..22d12e545966 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -4049,8 +4049,7 @@ int br_multicast_rcv(struct net_bridge_mcast **brmctx,
}
static void br_multicast_query_expired(struct net_bridge_mcast *brmctx,
- struct bridge_mcast_own_query *query,
- struct bridge_mcast_querier *querier)
+ struct bridge_mcast_own_query *query)
{
spin_lock(&brmctx->br->multicast_lock);
if (br_multicast_ctx_vlan_disabled(brmctx))
@@ -4069,8 +4068,7 @@ static void br_ip4_multicast_query_expired(struct timer_list *t)
struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
ip4_own_query.timer);
- br_multicast_query_expired(brmctx, &brmctx->ip4_own_query,
- &brmctx->ip4_querier);
+ br_multicast_query_expired(brmctx, &brmctx->ip4_own_query);
}
#if IS_ENABLED(CONFIG_IPV6)
@@ -4079,8 +4077,7 @@ static void br_ip6_multicast_query_expired(struct timer_list *t)
struct net_bridge_mcast *brmctx = timer_container_of(brmctx, t,
ip6_own_query.timer);
- br_multicast_query_expired(brmctx, &brmctx->ip6_own_query,
- &brmctx->ip6_querier);
+ br_multicast_query_expired(brmctx, &brmctx->ip6_own_query);
}
#endif
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 8de0904b9627..16be5d250402 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -487,6 +487,7 @@ enum net_bridge_opts {
BROPT_MCAST_VLAN_SNOOPING_ENABLED,
BROPT_MST_ENABLED,
BROPT_MDB_OFFLOAD_FAIL_NOTIFICATION,
+ BROPT_FDB_LOCAL_VLAN_0,
};
struct net_bridge {
@@ -843,6 +844,8 @@ void br_fdb_find_delete_local(struct net_bridge *br,
void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr);
void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
void br_fdb_cleanup(struct work_struct *work);
+int br_fdb_toggle_local_vlan_0(struct net_bridge *br, bool on,
+ struct netlink_ext_ack *extack);
void br_fdb_delete_by_port(struct net_bridge *br,
const struct net_bridge_port *p, u16 vid, int do_all);
struct net_bridge_fdb_entry *br_fdb_find_rcu(struct net_bridge *br,
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 939a3aa78d5c..ae911220cb3c 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -331,10 +331,12 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags,
/* Add the dev mac and count the vlan only if it's usable */
if (br_vlan_should_use(v)) {
- err = br_fdb_add_local(br, p, dev->dev_addr, v->vid);
- if (err) {
- br_err(br, "failed insert local address into bridge forwarding table\n");
- goto out_filt;
+ if (!br_opt_get(br, BROPT_FDB_LOCAL_VLAN_0)) {
+ err = br_fdb_add_local(br, p, dev->dev_addr, v->vid);
+ if (err) {
+ br_err(br, "failed insert local address into bridge forwarding table\n");
+ goto out_filt;
+ }
}
vg->num_vlans++;
}
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index 3e67d4aff419..5697e3949a36 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -920,8 +920,8 @@ static int translate_table(struct net *net, const char *name,
* if an error occurs
*/
newinfo->chainstack =
- vmalloc(array_size(nr_cpu_ids,
- sizeof(*(newinfo->chainstack))));
+ vmalloc_array(nr_cpu_ids,
+ sizeof(*(newinfo->chainstack)));
if (!newinfo->chainstack)
return -ENOMEM;
for_each_possible_cpu(i) {
@@ -938,7 +938,7 @@ static int translate_table(struct net *net, const char *name,
}
}
- cl_s = vmalloc(array_size(udc_cnt, sizeof(*cl_s)));
+ cl_s = vmalloc_array(udc_cnt, sizeof(*cl_s));
if (!cl_s)
return -ENOMEM;
i = 0; /* the i'th udc */
@@ -1018,8 +1018,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
* the check on the size is done later, when we have the lock
*/
if (repl->num_counters) {
- unsigned long size = repl->num_counters * sizeof(*counterstmp);
- counterstmp = vmalloc(size);
+ counterstmp = vmalloc_array(repl->num_counters,
+ sizeof(*counterstmp));
if (!counterstmp)
return -ENOMEM;
}
@@ -1386,7 +1386,7 @@ static int do_update_counters(struct net *net, const char *name,
if (num_counters == 0)
return -EINVAL;
- tmp = vmalloc(array_size(num_counters, sizeof(*tmp)));
+ tmp = vmalloc_array(num_counters, sizeof(*tmp));
if (!tmp)
return -ENOMEM;
@@ -1526,7 +1526,7 @@ static int copy_counters_to_user(struct ebt_table *t,
if (num_counters != nentries)
return -EINVAL;
- counterstmp = vmalloc(array_size(nentries, sizeof(*counterstmp)));
+ counterstmp = vmalloc_array(nentries, sizeof(*counterstmp));
if (!counterstmp)
return -ENOMEM;
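These conversions are behavior-preserving: array_size() saturates to SIZE_MAX on multiplication overflow (making the old vmalloc() fail), and vmalloc_array() performs the same overflow check internally. The new spelling is just shorter and harder to get wrong:

	buf = vmalloc(array_size(n, sizeof(*buf))); /* old: saturating size helper */
	buf = vmalloc_array(n, sizeof(*buf));       /* new: overflow check built in */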
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index 5adced1e7d0c..b7af36bbd306 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -59,6 +59,13 @@ static void nft_meta_bridge_get_eval(const struct nft_expr *expr,
nft_reg_store_be16(dest, htons(p_proto));
return;
}
+ case NFT_META_BRI_IIFHWADDR:
+ br_dev = nft_meta_get_bridge(in);
+ if (!br_dev)
+ goto err;
+
+ memcpy(dest, br_dev->dev_addr, ETH_ALEN);
+ return;
default:
return nft_meta_get_eval(expr, regs, pkt);
}
@@ -86,6 +93,9 @@ static int nft_meta_bridge_get_init(const struct nft_ctx *ctx,
case NFT_META_BRI_IIFVPROTO:
len = sizeof(u16);
break;
+ case NFT_META_BRI_IIFHWADDR:
+ len = ETH_ALEN;
+ break;
default:
return nft_meta_get_init(ctx, expr, tb);
}
@@ -175,6 +185,7 @@ static int nft_meta_bridge_set_validate(const struct nft_ctx *ctx,
switch (priv->key) {
case NFT_META_BRI_BROUTE:
+ case NFT_META_BRI_IIFHWADDR:
hooks = 1 << NF_BR_PRE_ROUTING;
break;
default:
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
index 06b604cf9d58..2aa1e7d46eb2 100644
--- a/net/caif/cfctrl.c
+++ b/net/caif/cfctrl.c
@@ -257,9 +257,7 @@ int cfctrl_linkup_request(struct cflayer *layer,
cfpkt_add_body(pkt, &tmp16, 2);
tmp16 = cpu_to_le16(param->u.utility.fifosize_bufs);
cfpkt_add_body(pkt, &tmp16, 2);
- memset(utility_name, 0, sizeof(utility_name));
- strscpy(utility_name, param->u.utility.name,
- UTILITY_NAME_LENGTH);
+ strscpy_pad(utility_name, param->u.utility.name);
cfpkt_add_body(pkt, utility_name, UTILITY_NAME_LENGTH);
tmp8 = param->u.utility.paramlen;
cfpkt_add_body(pkt, &tmp8, 1);
diff --git a/net/can/af_can.c b/net/can/af_can.c
index b2387a46794a..770173d8db42 100644
--- a/net/can/af_can.c
+++ b/net/can/af_can.c
@@ -221,7 +221,7 @@ int can_send(struct sk_buff *skb, int loop)
}
/* Make sure the CAN frame can pass the selected CAN netdevice. */
- if (unlikely(skb->len > skb->dev->mtu)) {
+ if (unlikely(skb->len > READ_ONCE(skb->dev->mtu))) {
err = -EMSGSIZE;
goto inval_skb;
}
diff --git a/net/can/isotp.c b/net/can/isotp.c
index dee1412b3c9c..74ee1e52249b 100644
--- a/net/can/isotp.c
+++ b/net/can/isotp.c
@@ -1313,7 +1313,7 @@ static int isotp_bind(struct socket *sock, struct sockaddr *uaddr, int len)
err = -ENODEV;
goto out;
}
- if (dev->mtu < so->ll.mtu) {
+ if (READ_ONCE(dev->mtu) < so->ll.mtu) {
dev_put(dev);
err = -EINVAL;
goto out;
diff --git a/net/can/raw.c b/net/can/raw.c
index 76b867d21def..a53853f5e9af 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -75,31 +75,31 @@ MODULE_ALIAS("can-proto-1");
*/
struct uniqframe {
- int skbcnt;
const struct sk_buff *skb;
+ int skbcnt;
unsigned int join_rx_count;
};
struct raw_sock {
struct sock sk;
- int bound;
- int ifindex;
struct net_device *dev;
netdevice_tracker dev_tracker;
struct list_head notifier;
- int loopback;
- int recv_own_msgs;
- int fd_frames;
- int xl_frames;
+ int ifindex;
+ unsigned int bound:1;
+ unsigned int loopback:1;
+ unsigned int recv_own_msgs:1;
+ unsigned int fd_frames:1;
+ unsigned int xl_frames:1;
+ unsigned int join_filters:1;
struct can_raw_vcid_options raw_vcid_opts;
canid_t tx_vcid_shifted;
canid_t rx_vcid_shifted;
canid_t rx_vcid_mask_shifted;
- int join_filters;
+ can_err_mask_t err_mask;
int count; /* number of active filters */
struct can_filter dfilter; /* default/single filter */
struct can_filter *filter; /* pointer to filter(s) */
- can_err_mask_t err_mask;
struct uniqframe __percpu *uniq;
};
@@ -560,8 +560,8 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
struct can_filter sfilter; /* single filter */
struct net_device *dev = NULL;
can_err_mask_t err_mask = 0;
- int fd_frames;
int count = 0;
+ int flag;
int err = 0;
if (level != SOL_CAN_RAW)
@@ -682,44 +682,48 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
break;
case CAN_RAW_LOOPBACK:
- if (optlen != sizeof(ro->loopback))
+ if (optlen != sizeof(flag))
return -EINVAL;
- if (copy_from_sockptr(&ro->loopback, optval, optlen))
+ if (copy_from_sockptr(&flag, optval, optlen))
return -EFAULT;
+ ro->loopback = !!flag;
break;
case CAN_RAW_RECV_OWN_MSGS:
- if (optlen != sizeof(ro->recv_own_msgs))
+ if (optlen != sizeof(flag))
return -EINVAL;
- if (copy_from_sockptr(&ro->recv_own_msgs, optval, optlen))
+ if (copy_from_sockptr(&flag, optval, optlen))
return -EFAULT;
+ ro->recv_own_msgs = !!flag;
break;
case CAN_RAW_FD_FRAMES:
- if (optlen != sizeof(fd_frames))
+ if (optlen != sizeof(flag))
return -EINVAL;
- if (copy_from_sockptr(&fd_frames, optval, optlen))
+ if (copy_from_sockptr(&flag, optval, optlen))
return -EFAULT;
/* Enabling CAN XL includes CAN FD */
- if (ro->xl_frames && !fd_frames)
+ if (ro->xl_frames && !flag)
return -EINVAL;
- ro->fd_frames = fd_frames;
+ ro->fd_frames = !!flag;
break;
case CAN_RAW_XL_FRAMES:
- if (optlen != sizeof(ro->xl_frames))
+ if (optlen != sizeof(flag))
return -EINVAL;
- if (copy_from_sockptr(&ro->xl_frames, optval, optlen))
+ if (copy_from_sockptr(&flag, optval, optlen))
return -EFAULT;
+ ro->xl_frames = !!flag;
+
/* Enabling CAN XL includes CAN FD */
if (ro->xl_frames)
ro->fd_frames = ro->xl_frames;
@@ -739,12 +743,13 @@ static int raw_setsockopt(struct socket *sock, int level, int optname,
break;
case CAN_RAW_JOIN_FILTERS:
- if (optlen != sizeof(ro->join_filters))
+ if (optlen != sizeof(flag))
return -EINVAL;
- if (copy_from_sockptr(&ro->join_filters, optval, optlen))
+ if (copy_from_sockptr(&flag, optval, optlen))
return -EFAULT;
+ ro->join_filters = !!flag;
break;
default:
@@ -758,6 +763,7 @@ static int raw_getsockopt(struct socket *sock, int level, int optname,
{
struct sock *sk = sock->sk;
struct raw_sock *ro = raw_sk(sk);
+ int flag;
int len;
void *val;
@@ -806,25 +812,29 @@ static int raw_getsockopt(struct socket *sock, int level, int optname,
case CAN_RAW_LOOPBACK:
if (len > sizeof(int))
len = sizeof(int);
- val = &ro->loopback;
+ flag = ro->loopback;
+ val = &flag;
break;
case CAN_RAW_RECV_OWN_MSGS:
if (len > sizeof(int))
len = sizeof(int);
- val = &ro->recv_own_msgs;
+ flag = ro->recv_own_msgs;
+ val = &flag;
break;
case CAN_RAW_FD_FRAMES:
if (len > sizeof(int))
len = sizeof(int);
- val = &ro->fd_frames;
+ flag = ro->fd_frames;
+ val = &flag;
break;
case CAN_RAW_XL_FRAMES:
if (len > sizeof(int))
len = sizeof(int);
- val = &ro->xl_frames;
+ flag = ro->xl_frames;
+ val = &flag;
break;
case CAN_RAW_XL_VCID_OPTS: {
@@ -849,7 +859,8 @@ static int raw_getsockopt(struct socket *sock, int level, int optname,
case CAN_RAW_JOIN_FILTERS:
if (len > sizeof(int))
len = sizeof(int);
- val = &ro->join_filters;
+ flag = ro->join_filters;
+ val = &flag;
break;
default:
@@ -950,7 +961,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
err = -EINVAL;
/* check for valid CAN (CC/FD/XL) frame content */
- txmtu = raw_check_txframe(ro, skb, dev->mtu);
+ txmtu = raw_check_txframe(ro, skb, READ_ONCE(dev->mtu));
if (!txmtu)
goto free_skb;
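The !!flag normalization matters once the storage is a one-bit field: assigning an arbitrary int to a :1 bitfield keeps only bit 0, so e.g. optval == 2 would silently read back as disabled. A minimal illustration:

	struct opts { unsigned int loopback:1; };
	struct opts o;
	int flag = 2;        /* sockopt convention: any non-zero enables */

	o.loopback = flag;   /* truncated to bit 0 => 0, wrong */
	o.loopback = !!flag; /* normalized => 1, as intended */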
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 9f6d860411cb..1fbec4853f00 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -252,7 +252,8 @@ int __init ceph_msgr_init(void)
* The number of active work items is limited by the number of
* connections, so leave @max_active at default.
*/
- ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_MEM_RECLAIM, 0);
+ ceph_msgr_wq = alloc_workqueue("ceph-msgr",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (ceph_msgr_wq)
return 0;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index ab66b599ac47..c227ececa925 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -314,7 +314,7 @@ static void __schedule_delayed(struct ceph_mon_client *monc)
delay = CEPH_MONC_PING_INTERVAL;
dout("__schedule_delayed after %lu\n", delay);
- mod_delayed_work(system_wq, &monc->delayed_work,
+ mod_delayed_work(system_percpu_wq, &monc->delayed_work,
round_jiffies_relative(delay));
}
diff --git a/net/core/Makefile b/net/core/Makefile
index b2a76ce33932..9ef2099c5426 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
obj-y += hotdata.o
obj-y += netdev_rx_queue.o
+obj-y += netdev_queues.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
obj-$(CONFIG_PROC_FS) += net-procfs.o
obj-$(CONFIG_NET_PKTGEN) += pktgen.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index f474b9b120f9..cb4b9ef2e4e3 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -345,7 +345,7 @@ int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
spin_unlock_bh(&sk_queue->lock);
}
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
return err;
}
EXPORT_SYMBOL(__sk_queue_drop_skb);
diff --git a/net/core/dev.c b/net/core/dev.c
index 8d49b2198d07..a64cef2c537e 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3768,8 +3768,14 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
features &= ~dev->gso_partial_features;
- /* Make sure to clear the IPv4 ID mangling feature if the
- * IPv4 header has the potential to be fragmented.
+ /* Make sure to clear the IPv4 ID mangling feature if the IPv4 header
+ * has the potential to be fragmented so that TSO does not generate
+ * segments with the same ID. For encapsulated packets, the ID mangling
+ * feature is guaranteed not to use the same ID for the outer IPv4
+ * headers of the generated segments if the headers have the potential
+ * to be fragmented, so there is no need to clear the IPv4 ID mangling
+ * feature (see the section about NETIF_F_TSO_MANGLEID in
+ * segmentation-offloads.rst).
*/
if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
struct iphdr *iph = skb->encapsulation ?
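For reference, the "potential to be fragmented" test the comment refers to is the DF bit; the truncated branch continues roughly as follows (sketch of existing code, unchanged by this hunk):

		inner_ip_hdr(skb) : ip_hdr(skb);

		/* header may be fragmented: don't let TSO mangle IPv4 IDs */
		if (!(iph->frag_off & htons(IP_DF)))
			features &= ~NETIF_F_TSO_MANGLEID;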
@@ -3907,6 +3913,38 @@ sw_checksum:
}
EXPORT_SYMBOL(skb_csum_hwoffload_help);
+/* Checks if this SKB belongs to an HW offloaded socket
+ * and whether any SW fallbacks are required based on dev.
+ * Check decrypted mark in case skb_orphan() cleared socket.
+ */
+static struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb,
+ struct net_device *dev)
+{
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+ struct sk_buff *(*sk_validate)(struct sock *sk, struct net_device *dev,
+ struct sk_buff *skb);
+ struct sock *sk = skb->sk;
+
+ sk_validate = NULL;
+ if (sk) {
+ if (sk_fullsock(sk))
+ sk_validate = sk->sk_validate_xmit_skb;
+ else if (sk_is_inet(sk) && sk->sk_state == TCP_TIME_WAIT)
+ sk_validate = inet_twsk(sk)->tw_validate_xmit_skb;
+ }
+
+ if (sk_validate) {
+ skb = sk_validate(sk, dev, skb);
+ } else if (unlikely(skb_is_decrypted(skb))) {
+ pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n");
+ kfree_skb(skb);
+ skb = NULL;
+ }
+#endif
+
+ return skb;
+}
+
static struct sk_buff *validate_xmit_unreadable_skb(struct sk_buff *skb,
struct net_device *dev)
{
@@ -4849,9 +4887,40 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
return hash_32(hash, flow_table->log);
}
+#ifdef CONFIG_RFS_ACCEL
+/**
+ * rps_flow_is_active - check whether the flow was recently active.
+ * @rflow: Specific flow to check activity.
+ * @flow_table: per-queue flowtable that @rflow belongs to.
+ * @cpu: CPU saved in @rflow.
+ *
+ * If the CPU has processed many packets since the flow's last activity
+ * (beyond 10 times the table size), the flow is considered stale.
+ *
+ * Return: true if flow was recently active.
+ */
+static bool rps_flow_is_active(struct rps_dev_flow *rflow,
+ struct rps_dev_flow_table *flow_table,
+ unsigned int cpu)
+{
+ unsigned int flow_last_active;
+ unsigned int sd_input_head;
+
+ if (cpu >= nr_cpu_ids)
+ return false;
+
+ sd_input_head = READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head);
+ flow_last_active = READ_ONCE(rflow->last_qtail);
+
+ return (int)(sd_input_head - flow_last_active) <
+ (int)(10 << flow_table->log);
+}
+#endif
+
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
- struct rps_dev_flow *rflow, u16 next_cpu)
+ struct rps_dev_flow *rflow, u16 next_cpu, u32 hash,
+ u32 flow_id)
{
if (next_cpu < nr_cpu_ids) {
u32 head;
@@ -4859,8 +4928,9 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *old_rflow;
+ struct rps_dev_flow *tmp_rflow;
+ unsigned int tmp_cpu;
u16 rxq_index;
- u32 flow_id;
int rc;
/* Should we steer this flow to a different hardware queue? */
@@ -4875,14 +4945,29 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (!flow_table)
goto out;
- flow_id = rfs_slot(skb_get_hash(skb), flow_table);
+
+ tmp_rflow = &flow_table->flows[flow_id];
+ tmp_cpu = READ_ONCE(tmp_rflow->cpu);
+
+ if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) {
+ if (rps_flow_is_active(tmp_rflow, flow_table,
+ tmp_cpu)) {
+ if (hash != READ_ONCE(tmp_rflow->hash) ||
+ next_cpu == tmp_cpu)
+ goto out;
+ }
+ }
+
rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
rxq_index, flow_id);
if (rc < 0)
goto out;
+
old_rflow = rflow;
- rflow = &flow_table->flows[flow_id];
+ rflow = tmp_rflow;
WRITE_ONCE(rflow->filter, rc);
+ WRITE_ONCE(rflow->hash, hash);
+
if (old_rflow->filter == rc)
WRITE_ONCE(old_rflow->filter, RPS_NO_FILTER);
out:
@@ -4908,6 +4993,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow_table *flow_table;
struct rps_map *map;
int cpu = -1;
+ u32 flow_id;
u32 tcpu;
u32 hash;
@@ -4954,7 +5040,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
/* OK, now we know there is a match,
* we can look at the local (per receive queue) flow table
*/
- rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
+ flow_id = rfs_slot(hash, flow_table);
+ rflow = &flow_table->flows[flow_id];
tcpu = rflow->cpu;
/*
@@ -4973,7 +5060,8 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
((int)(READ_ONCE(per_cpu(softnet_data, tcpu).input_queue_head) -
rflow->last_qtail)) >= 0)) {
tcpu = next_cpu;
- rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
+ rflow = set_rps_cpu(dev, skb, rflow, next_cpu, hash,
+ flow_id);
}
if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
@@ -5017,17 +5105,16 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
struct rps_dev_flow_table *flow_table;
struct rps_dev_flow *rflow;
bool expire = true;
- unsigned int cpu;
rcu_read_lock();
flow_table = rcu_dereference(rxqueue->rps_flow_table);
if (flow_table && flow_id < (1UL << flow_table->log)) {
+ unsigned int cpu;
+
rflow = &flow_table->flows[flow_id];
cpu = READ_ONCE(rflow->cpu);
- if (READ_ONCE(rflow->filter) == filter_id && cpu < nr_cpu_ids &&
- ((int)(READ_ONCE(per_cpu(softnet_data, cpu).input_queue_head) -
- READ_ONCE(rflow->last_qtail)) <
- (int)(10 << flow_table->log)))
+ if (READ_ONCE(rflow->filter) == filter_id &&
+ rps_flow_is_active(rflow, flow_table, cpu))
expire = false;
}
rcu_read_unlock();
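The signed-difference idiom inside rps_flow_is_active() keeps the staleness test correct across u32 wraparound. A worked example with log = 9 (a 512-slot table, threshold 10 << 9 = 5120 packets):

	u32 head = 10;            /* input_queue_head wrapped past 2^32 */
	u32 last = 0xfffffff0u;   /* last_qtail recorded just before the wrap */

	int delta = (int)(head - last);  /* 26: flow still counts as active */
	bool active = delta < (10 << 9); /* true, no spurious expiry */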
@@ -5093,8 +5180,9 @@ static void napi_schedule_rps(struct softnet_data *sd)
__napi_schedule_irqoff(&mysd->backlog);
}
-void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu)
+void kick_defer_list_purge(unsigned int cpu)
{
+ struct softnet_data *sd = &per_cpu(softnet_data, cpu);
unsigned long flags;
if (use_backlog_threads()) {
@@ -5199,7 +5287,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
backlog_unlock_irq_restore(sd, &flags);
cpu_backlog_drop:
- atomic_inc(&sd->dropped);
+ numa_drop_add(&sd->drop_counters, 1);
bad_dev:
dev_core_stats_rx_dropped_inc(skb->dev);
kfree_skb_reason(skb, reason);
@@ -6628,24 +6716,24 @@ bool napi_complete_done(struct napi_struct *n, int work_done)
}
EXPORT_SYMBOL(napi_complete_done);
-static void skb_defer_free_flush(struct softnet_data *sd)
+static void skb_defer_free_flush(void)
{
+ struct llist_node *free_list;
struct sk_buff *skb, *next;
+ struct skb_defer_node *sdn;
+ int node;
- /* Paired with WRITE_ONCE() in skb_attempt_defer_free() */
- if (!READ_ONCE(sd->defer_list))
- return;
+ for_each_node(node) {
+ sdn = this_cpu_ptr(net_hotdata.skb_defer_nodes) + node;
- spin_lock(&sd->defer_lock);
- skb = sd->defer_list;
- sd->defer_list = NULL;
- sd->defer_count = 0;
- spin_unlock(&sd->defer_lock);
+ if (llist_empty(&sdn->defer_list))
+ continue;
+ atomic_long_set(&sdn->defer_count, 0);
+ free_list = llist_del_all(&sdn->defer_list);
- while (skb != NULL) {
- next = skb->next;
- napi_consume_skb(skb, 1);
- skb = next;
+ llist_for_each_entry_safe(skb, next, free_list, ll_node) {
+ napi_consume_skb(skb, 1);
+ }
}
}
@@ -6773,7 +6861,7 @@ count:
if (work > 0)
__NET_ADD_STATS(dev_net(napi->dev),
LINUX_MIB_BUSYPOLLRXPACKETS, work);
- skb_defer_free_flush(this_cpu_ptr(&softnet_data));
+ skb_defer_free_flush();
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
@@ -7632,7 +7720,7 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
local_irq_disable();
net_rps_action_and_irq_enable(sd);
}
- skb_defer_free_flush(sd);
+ skb_defer_free_flush();
bpf_net_ctx_clear(bpf_net_ctx);
local_bh_enable();
@@ -7674,7 +7762,7 @@ start:
for (;;) {
struct napi_struct *n;
- skb_defer_free_flush(sd);
+ skb_defer_free_flush();
if (list_empty(&list)) {
if (list_empty(&repoll)) {
@@ -12908,7 +12996,6 @@ static int __init net_dev_init(void)
sd->cpu = i;
#endif
INIT_CSD(&sd->defer_csd, trigger_rx_softirq, sd);
- spin_lock_init(&sd->defer_lock);
gro_init(&sd->backlog.gro);
sd->backlog.poll = process_backlog;
@@ -12918,6 +13005,11 @@ static int __init net_dev_init(void)
if (net_page_pool_create(i))
goto out;
}
+ net_hotdata.skb_defer_nodes =
+ __alloc_percpu(sizeof(struct skb_defer_node) * nr_node_ids,
+ __alignof__(struct skb_defer_node));
+ if (!net_hotdata.skb_defer_nodes)
+ goto out;
if (use_backlog_threads())
smpboot_register_percpu_thread(&backlog_threads);
diff --git a/net/core/dev.h b/net/core/dev.h
index d6b08d435479..900880e8b5b4 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -357,7 +357,7 @@ static inline void napi_assert_will_not_race(const struct napi_struct *napi)
WARN_ON(READ_ONCE(napi->list_owner) != -1);
}
-void kick_defer_list_purge(struct softnet_data *sd, unsigned int cpu);
+void kick_defer_list_purge(unsigned int cpu);
#define XMIT_RECURSION_LIMIT 8
diff --git a/net/core/devmem.c b/net/core/devmem.c
index 24c591ab38ae..d9de31a6cc7f 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -176,6 +176,7 @@ err_close_rxq:
struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack)
@@ -188,6 +189,11 @@ net_devmem_bind_dmabuf(struct net_device *dev,
unsigned long virtual;
int err;
+ if (!dma_dev) {
+ NL_SET_ERR_MSG(extack, "Device doesn't support DMA");
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
dmabuf = dma_buf_get(dmabuf_fd);
if (IS_ERR(dmabuf))
return ERR_CAST(dmabuf);
@@ -209,7 +215,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
binding->dmabuf = dmabuf;
binding->direction = direction;
- binding->attachment = dma_buf_attach(binding->dmabuf, dev->dev.parent);
+ binding->attachment = dma_buf_attach(binding->dmabuf, dma_dev);
if (IS_ERR(binding->attachment)) {
err = PTR_ERR(binding->attachment);
NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 41cd6e1c9141..101150d761af 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -85,6 +85,7 @@ struct dmabuf_genpool_chunk_owner {
void __net_devmem_dmabuf_binding_free(struct work_struct *wq);
struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd, struct netdev_nl_sock *priv,
struct netlink_ext_ack *extack);
@@ -170,6 +171,7 @@ static inline void net_devmem_put_net_iov(struct net_iov *niov)
static inline struct net_devmem_dmabuf_binding *
net_devmem_bind_dmabuf(struct net_device *dev,
+ struct device *dma_dev,
enum dma_data_direction direction,
unsigned int dmabuf_fd,
struct netdev_nl_sock *priv,
diff --git a/net/core/dst.c b/net/core/dst.c
index e2de8b68c41d..e9d35f49c9e7 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -150,7 +150,7 @@ void dst_dev_put(struct dst_entry *dst)
dst->ops->ifdown(dst, dev);
WRITE_ONCE(dst->input, dst_discard);
WRITE_ONCE(dst->output, dst_discard_out);
- WRITE_ONCE(dst->dev, blackhole_netdev);
+ rcu_assign_pointer(dst->dev_rcu, blackhole_netdev);
netdev_ref_replace(dev, blackhole_netdev, &dst->dev_tracker,
GFP_ATOMIC);
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 2af0a5f1d748..5d1838ff1ab9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2373,7 +2373,7 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev,
struct flowi4 fl4 = {
.flowi4_flags = FLOWI_FLAG_ANYSRC,
.flowi4_mark = skb->mark,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip4h)),
+ .flowi4_dscp = ip4h_dscp(ip4h),
.flowi4_oif = dev->ifindex,
.flowi4_proto = ip4h->protocol,
.daddr = ip4h->daddr,
@@ -6028,7 +6028,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
fl4.flowi4_iif = params->ifindex;
fl4.flowi4_oif = 0;
}
- fl4.flowi4_tos = params->tos & INET_DSCP_MASK;
+ fl4.flowi4_dscp = inet_dsfield_to_dscp(params->tos);
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = 0;
@@ -6775,7 +6775,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
int dif, int sdif, u8 family, u8 proto)
{
- struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
bool refcounted = false;
struct sock *sk = NULL;
@@ -6784,7 +6783,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
__be32 dst4 = tuple->ipv4.daddr;
if (proto == IPPROTO_TCP)
- sk = __inet_lookup(net, hinfo, NULL, 0,
+ sk = __inet_lookup(net, NULL, 0,
src4, tuple->ipv4.sport,
dst4, tuple->ipv4.dport,
dif, sdif, &refcounted);
@@ -6798,7 +6797,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr;
if (proto == IPPROTO_TCP)
- sk = __inet6_lookup(net, hinfo, NULL, 0,
+ sk = __inet6_lookup(net, NULL, 0,
src6, tuple->ipv6.sport,
dst6, ntohs(tuple->ipv6.dport),
dif, sdif, &refcounted);
diff --git a/net/core/gro.c b/net/core/gro.c
index b350e5b69549..5ba4504cfd28 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -1,4 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
+#include <net/psp.h>
#include <net/gro.h>
#include <net/dst_metadata.h>
#include <net/busy_poll.h>
@@ -376,6 +377,7 @@ static void gro_list_prepare(const struct list_head *head,
diffs |= skb_get_nfct(p) ^ skb_get_nfct(skb);
diffs |= gro_list_prepare_tc_ext(skb, p, diffs);
+ diffs |= __psp_skb_coalesce_diff(skb, p, diffs);
}
NAPI_GRO_CB(p)->same_flow = !diffs;
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 864f3bbc3a4c..212cde35affa 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -157,9 +157,9 @@ static void linkwatch_schedule_work(int urgent)
* override the existing timer.
*/
if (test_bit(LW_URGENT, &linkwatch_flags))
- mod_delayed_work(system_unbound_wq, &linkwatch_work, 0);
+ mod_delayed_work(system_dfl_wq, &linkwatch_work, 0);
else
- queue_delayed_work(system_unbound_wq, &linkwatch_work, delay);
+ queue_delayed_work(system_dfl_wq, &linkwatch_work, delay);
}
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index ae74634310a3..9f40be0c3e71 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -8,12 +8,12 @@
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
+#include <net/flow.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/ipv6_stubs.h>
-#include <net/inet_dscp.h>
struct bpf_lwt_prog {
struct bpf_prog *prog;
@@ -209,7 +209,7 @@ static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
fl4.flowi4_oif = oif;
fl4.flowi4_mark = skb->mark;
fl4.flowi4_uid = sock_net_uid(net, sk);
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ fl4.flowi4_dscp = ip4h_dscp(iph);
fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
fl4.flowi4_proto = iph->protocol;
fl4.daddr = iph->daddr;
diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c
index 4f0f0709a1cb..70e0e9a3b650 100644
--- a/net/core/net-procfs.c
+++ b/net/core/net-procfs.c
@@ -145,7 +145,8 @@ static int softnet_seq_show(struct seq_file *seq, void *v)
seq_printf(seq,
"%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x "
"%08x %08x\n",
- READ_ONCE(sd->processed), atomic_read(&sd->dropped),
+ READ_ONCE(sd->processed),
+ numa_drop_read(&sd->drop_counters),
READ_ONCE(sd->time_squeeze), 0,
0, 0, 0, 0, /* was fastroute */
0, /* was cpu_collision */
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 3c2dc4c5e683..ca878525ad7c 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1120,8 +1120,10 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
return -ENOMEM;
table->log = ilog2(mask) + 1;
- for (count = 0; count <= mask; count++)
+ for (count = 0; count <= mask; count++) {
table->flows[count].cpu = RPS_NO_CPU;
+ table->flows[count].filter = RPS_NO_FILTER;
+ }
} else {
table = NULL;
}
diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c
index 6314eb7bdf69..470fabbeacd9 100644
--- a/net/core/netdev-genl.c
+++ b/net/core/netdev-genl.c
@@ -869,16 +869,79 @@ int netdev_nl_qstats_get_dumpit(struct sk_buff *skb,
return err;
}
-int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
+static int netdev_nl_read_rxq_bitmap(struct genl_info *info,
+ u32 rxq_bitmap_len,
+ unsigned long *rxq_bitmap)
{
+ const int maxtype = ARRAY_SIZE(netdev_queue_id_nl_policy) - 1;
struct nlattr *tb[ARRAY_SIZE(netdev_queue_id_nl_policy)];
+ struct nlattr *attr;
+ int rem, err = 0;
+ u32 rxq_idx;
+
+ nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
+ genlmsg_data(info->genlhdr),
+ genlmsg_len(info->genlhdr), rem) {
+ err = nla_parse_nested(tb, maxtype, attr,
+ netdev_queue_id_nl_policy, info->extack);
+ if (err < 0)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
+ NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE))
+ return -EINVAL;
+
+ if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
+ NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
+ return -EINVAL;
+ }
+
+ rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
+ if (rxq_idx >= rxq_bitmap_len) {
+ NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_ID]);
+ return -EINVAL;
+ }
+
+ bitmap_set(rxq_bitmap, rxq_idx, 1);
+ }
+
+ return 0;
+}
+
+static struct device *
+netdev_nl_get_dma_dev(struct net_device *netdev, unsigned long *rxq_bitmap,
+ struct netlink_ext_ack *extack)
+{
+ struct device *dma_dev = NULL;
+ u32 rxq_idx, prev_rxq_idx;
+
+ for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
+ struct device *rxq_dma_dev;
+
+ rxq_dma_dev = netdev_queue_get_dma_dev(netdev, rxq_idx);
+ if (dma_dev && rxq_dma_dev != dma_dev) {
+ NL_SET_ERR_MSG_FMT(extack, "DMA device mismatch between queue %u and %u (multi-PF device?)",
+ rxq_idx, prev_rxq_idx);
+ return ERR_PTR(-EOPNOTSUPP);
+ }
+
+ dma_dev = rxq_dma_dev;
+ prev_rxq_idx = rxq_idx;
+ }
+
+ return dma_dev;
+}
+
+int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
+{
struct net_devmem_dmabuf_binding *binding;
u32 ifindex, dmabuf_fd, rxq_idx;
struct netdev_nl_sock *priv;
struct net_device *netdev;
+ unsigned long *rxq_bitmap;
+ struct device *dma_dev;
struct sk_buff *rsp;
- struct nlattr *attr;
- int rem, err = 0;
+ int err = 0;
void *hdr;
if (GENL_REQ_ATTR_CHECK(info, NETDEV_A_DEV_IFINDEX) ||
@@ -921,36 +984,31 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unlock;
}
- binding = net_devmem_bind_dmabuf(netdev, DMA_FROM_DEVICE, dmabuf_fd,
- priv, info->extack);
- if (IS_ERR(binding)) {
- err = PTR_ERR(binding);
+ rxq_bitmap = bitmap_zalloc(netdev->real_num_rx_queues, GFP_KERNEL);
+ if (!rxq_bitmap) {
+ err = -ENOMEM;
goto err_unlock;
}
- nla_for_each_attr_type(attr, NETDEV_A_DMABUF_QUEUES,
- genlmsg_data(info->genlhdr),
- genlmsg_len(info->genlhdr), rem) {
- err = nla_parse_nested(
- tb, ARRAY_SIZE(netdev_queue_id_nl_policy) - 1, attr,
- netdev_queue_id_nl_policy, info->extack);
- if (err < 0)
- goto err_unbind;
-
- if (NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_ID) ||
- NL_REQ_ATTR_CHECK(info->extack, attr, tb, NETDEV_A_QUEUE_TYPE)) {
- err = -EINVAL;
- goto err_unbind;
- }
+ err = netdev_nl_read_rxq_bitmap(info, netdev->real_num_rx_queues,
+ rxq_bitmap);
+ if (err)
+ goto err_rxq_bitmap;
- if (nla_get_u32(tb[NETDEV_A_QUEUE_TYPE]) != NETDEV_QUEUE_TYPE_RX) {
- NL_SET_BAD_ATTR(info->extack, tb[NETDEV_A_QUEUE_TYPE]);
- err = -EINVAL;
- goto err_unbind;
- }
+ dma_dev = netdev_nl_get_dma_dev(netdev, rxq_bitmap, info->extack);
+ if (IS_ERR(dma_dev)) {
+ err = PTR_ERR(dma_dev);
+ goto err_rxq_bitmap;
+ }
- rxq_idx = nla_get_u32(tb[NETDEV_A_QUEUE_ID]);
+ binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_FROM_DEVICE,
+ dmabuf_fd, priv, info->extack);
+ if (IS_ERR(binding)) {
+ err = PTR_ERR(binding);
+ goto err_rxq_bitmap;
+ }
+ for_each_set_bit(rxq_idx, rxq_bitmap, netdev->real_num_rx_queues) {
err = net_devmem_bind_dmabuf_to_queue(netdev, rxq_idx, binding,
info->extack);
if (err)
@@ -964,6 +1022,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
if (err)
goto err_unbind;
+ bitmap_free(rxq_bitmap);
+
netdev_unlock(netdev);
mutex_unlock(&priv->lock);
@@ -972,6 +1032,8 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info)
err_unbind:
net_devmem_unbind_dmabuf(binding);
+err_rxq_bitmap:
+ bitmap_free(rxq_bitmap);
err_unlock:
netdev_unlock(netdev);
err_unlock_sock:
@@ -986,6 +1048,7 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
struct net_devmem_dmabuf_binding *binding;
struct netdev_nl_sock *priv;
struct net_device *netdev;
+ struct device *dma_dev;
u32 ifindex, dmabuf_fd;
struct sk_buff *rsp;
int err = 0;
@@ -1032,8 +1095,9 @@ int netdev_nl_bind_tx_doit(struct sk_buff *skb, struct genl_info *info)
goto err_unlock_netdev;
}
- binding = net_devmem_bind_dmabuf(netdev, DMA_TO_DEVICE, dmabuf_fd, priv,
- info->extack);
+ dma_dev = netdev_queue_get_dma_dev(netdev, 0);
+ binding = net_devmem_bind_dmabuf(netdev, dma_dev, DMA_TO_DEVICE,
+ dmabuf_fd, priv, info->extack);
if (IS_ERR(binding)) {
err = PTR_ERR(binding);
goto err_unlock_netdev;
diff --git a/net/core/netdev_queues.c b/net/core/netdev_queues.c
new file mode 100644
index 000000000000..251f27a8307f
--- /dev/null
+++ b/net/core/netdev_queues.c
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <net/netdev_queues.h>
+
+/**
+ * netdev_queue_get_dma_dev() - get dma device for zero-copy operations
+ * @dev: net_device
+ * @idx: queue index
+ *
+ * Get the DMA device to be used for zero-copy operations on this queue.
+ * When no such device is available or valid, the function returns NULL.
+ *
+ * Return: Device or NULL on error
+ */
+struct device *netdev_queue_get_dma_dev(struct net_device *dev, int idx)
+{
+ const struct netdev_queue_mgmt_ops *queue_ops = dev->queue_mgmt_ops;
+ struct device *dma_dev;
+
+ if (queue_ops && queue_ops->ndo_queue_get_dma_dev)
+ dma_dev = queue_ops->ndo_queue_get_dma_dev(dev, idx);
+ else
+ dma_dev = dev->dev.parent;
+
+ return dma_dev && dma_dev->dma_mask ? dma_dev : NULL;
+}
+
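Callers shown earlier use this helper in two ways: the RX bind resolves one DMA device across the whole requested queue set (rejecting mismatches), while the TX bind simply takes queue 0. Minimal usage sketch:

	struct device *dma_dev = netdev_queue_get_dma_dev(netdev, 0);

	if (!dma_dev)
		return -EOPNOTSUPP; /* queue cannot DMA; refuse the binding */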
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index 3bf1151d8061..c7d9341b7630 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -9,6 +9,15 @@
#include "page_pool_priv.h"
+/* See also page_pool_is_unreadable() */
+bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx)
+{
+ struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, idx);
+
+ return !!rxq->mp_params.mp_ops;
+}
+EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
+
int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
{
struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 5f65b62346d4..60a05d3b7c24 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -591,7 +591,6 @@ int __netpoll_setup(struct netpoll *np, struct net_device *ndev)
np->dev = ndev;
strscpy(np->dev_name, ndev->name, IFNAMSIZ);
- npinfo->netpoll = np;
/* fill up the skb queue */
refill_skbs(np);
@@ -835,7 +834,7 @@ void __netpoll_free(struct netpoll *np)
ASSERT_RTNL();
/* Wait for transmitting packets to finish before freeing. */
- synchronize_rcu();
+ synchronize_net();
__netpoll_cleanup(np);
kfree(np);
}
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index ba70569bd4b0..492728f9e021 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -211,11 +211,7 @@ static int page_pool_init(struct page_pool *pool,
return -EINVAL;
if (pool->p.pool_size)
- ring_qsize = pool->p.pool_size;
-
- /* Sanity limit mem that can be pinned down */
- if (ring_qsize > 32768)
- return -E2BIG;
+ ring_qsize = min(pool->p.pool_size, 16384);
/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
* DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
@@ -555,6 +551,12 @@ static noinline netmem_ref __page_pool_alloc_netmems_slow(struct page_pool *pool
netmem_ref netmem;
int i, nr_pages;
+ /* Unconditionally set NOWARN if allocating from NAPI.
+ * Drivers forget to set it, and OOM reports on packet Rx are useless.
+ */
+ if ((gfp & GFP_ATOMIC) == GFP_ATOMIC)
+ gfp |= __GFP_NOWARN;
+
/* Don't support bulk alloc for high-order pages */
if (unlikely(pp_order))
return page_to_netmem(__page_pool_alloc_page_order(pool, gfp));
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 0ebe5461d4d9..d41b03fd1f63 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -114,6 +114,7 @@
#include <linux/sys.h>
#include <linux/types.h>
+#include <linux/minmax.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/kernel.h>
@@ -2841,8 +2842,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
}
i = 0;
- frag_len = (datalen/frags) < PAGE_SIZE ?
- (datalen/frags) : PAGE_SIZE;
+ frag_len = min_t(int, datalen / frags, PAGE_SIZE);
while (datalen > 0) {
if (unlikely(!pkt_dev->page)) {
int node = numa_node_id();
@@ -2859,8 +2859,7 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
if (i == (frags - 1))
skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
pkt_dev->page, 0,
- (datalen < PAGE_SIZE ?
- datalen : PAGE_SIZE));
+ min(datalen, PAGE_SIZE));
else
skb_frag_fill_page_desc(&skb_shinfo(skb)->frags[i],
pkt_dev->page, 0, frag_len);
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 63de5c635842..897a8f01a67b 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -77,9 +77,7 @@ void reqsk_queue_alloc(struct request_sock_queue *queue)
* a simple spin lock - one must consider sock_owned_by_user() and arrange
* to use sk_add_backlog() stuff. But what really makes it infeasible is the
* locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
- * acquire a child's lock while holding listener's socket lock. A corner
- * case might also exist in tcp_v4_hnd_req() that will trigger this locking
- * order.
+ * acquire a child's lock while holding listener's socket lock.
*
* This function also sets "treq->tfo_listener" to false.
* treq->tfo_listener is used by the listener so it is protected by the
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 094b085cff20..8040ff7c356e 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -478,7 +478,7 @@ static int rtnl_unregister(int protocol, int msgtype)
* rtnl_unregister_all - Unregister all rtnetlink message type of a protocol
* @protocol : Protocol family or PF_UNSPEC
*
- * Identical to calling rtnl_unregster() for all registered message types
+ * Identical to calling rtnl_unregister() for all registered message types
* of a certain protocol family.
*/
void rtnl_unregister_all(int protocol)
@@ -1326,6 +1326,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ rtnl_devlink_port_size(dev)
+ rtnl_dpll_pin_size(dev)
+ nla_total_size(8) /* IFLA_MAX_PACING_OFFLOAD_HORIZON */
+ + nla_total_size(2) /* IFLA_HEADROOM */
+ + nla_total_size(2) /* IFLA_TAILROOM */
+ 0;
}
@@ -2091,7 +2093,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
nla_put_u32(skb, IFLA_CARRIER_UP_COUNT,
atomic_read(&dev->carrier_up_count)) ||
nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT,
- atomic_read(&dev->carrier_down_count)))
+ atomic_read(&dev->carrier_down_count)) ||
+ nla_put_u16(skb, IFLA_HEADROOM,
+ READ_ONCE(dev->needed_headroom)) ||
+ nla_put_u16(skb, IFLA_TAILROOM,
+ READ_ONCE(dev->needed_tailroom)))
goto nla_put_failure;
if (rtnl_fill_proto_down(skb, dev))
@@ -2243,6 +2249,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
[IFLA_GSO_IPV4_MAX_SIZE] = NLA_POLICY_MIN(NLA_U32, MAX_TCP_HEADER + 1),
[IFLA_GRO_IPV4_MAX_SIZE] = { .type = NLA_U32 },
[IFLA_NETNS_IMMUTABLE] = { .type = NLA_REJECT },
+ [IFLA_HEADROOM] = { .type = NLA_REJECT },
+ [IFLA_TAILROOM] = { .type = NLA_REJECT },
};
static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
diff --git a/net/core/scm.c b/net/core/scm.c
index 072d5742440a..66eaee783e8b 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -273,7 +273,9 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data)
check_object_size(data, cmlen - sizeof(*cm), true);
- if (!user_write_access_begin(cm, cmlen))
+ if (can_do_masked_user_access())
+ cm = masked_user_access_begin(cm);
+ else if (!user_write_access_begin(cm, cmlen))
goto efault;
unsafe_put_user(cmlen, &cm->cmsg_len, efault_end);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 1c0279b9cb9f..bc12790017b0 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -79,6 +79,7 @@
#include <net/mptcp.h>
#include <net/mctp.h>
#include <net/page_pool/helpers.h>
+#include <net/psp/types.h>
#include <net/dropreason.h>
#include <linux/uaccess.h>
@@ -3112,7 +3113,9 @@ static bool __splice_segment(struct page *page, unsigned int poff,
poff += flen;
plen -= flen;
*len -= flen;
- } while (*len && plen);
+ if (!*len)
+ return true;
+ } while (plen);
return false;
}
@@ -5060,6 +5063,9 @@ static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_MCTP_FLOWS)
[SKB_EXT_MCTP] = SKB_EXT_CHUNKSIZEOF(struct mctp_flow),
#endif
+#if IS_ENABLED(CONFIG_INET_PSP)
+ [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext),
+#endif
};
static __always_inline unsigned int skb_ext_total_length(void)
@@ -7042,6 +7048,7 @@ void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id,
skb->active_extensions = 1 << id;
return skb_ext_get_ptr(ext, id);
}
+EXPORT_SYMBOL_NS_GPL(__skb_ext_set, "NETDEV_INTERNAL");
/**
* skb_ext_add - allocate space for given extension, COW if needed
@@ -7178,8 +7185,9 @@ static void kfree_skb_napi_cache(struct sk_buff *skb)
*/
void skb_attempt_defer_free(struct sk_buff *skb)
{
+ struct skb_defer_node *sdn;
+ unsigned long defer_count;
int cpu = skb->alloc_cpu;
- struct softnet_data *sd;
unsigned int defer_max;
bool kick;
@@ -7193,27 +7201,24 @@ nodefer: kfree_skb_napi_cache(skb);
DEBUG_NET_WARN_ON_ONCE(skb_dst(skb));
DEBUG_NET_WARN_ON_ONCE(skb->destructor);
- sd = &per_cpu(softnet_data, cpu);
+ sdn = per_cpu_ptr(net_hotdata.skb_defer_nodes, cpu) + numa_node_id();
+
defer_max = READ_ONCE(net_hotdata.sysctl_skb_defer_max);
- if (READ_ONCE(sd->defer_count) >= defer_max)
+ defer_count = atomic_long_inc_return(&sdn->defer_count);
+
+ if (defer_count >= defer_max)
goto nodefer;
- spin_lock_bh(&sd->defer_lock);
- /* Send an IPI every time queue reaches half capacity. */
- kick = sd->defer_count == (defer_max >> 1);
- /* Paired with the READ_ONCE() few lines above */
- WRITE_ONCE(sd->defer_count, sd->defer_count + 1);
+ llist_add(&skb->ll_node, &sdn->defer_list);
- skb->next = sd->defer_list;
- /* Paired with READ_ONCE() in skb_defer_free_flush() */
- WRITE_ONCE(sd->defer_list, skb);
- spin_unlock_bh(&sd->defer_lock);
+ /* Send an IPI every time queue reaches half capacity. */
+ kick = (defer_count - 1) == (defer_max >> 1);
/* Make sure to trigger NET_RX_SOFTIRQ on the remote CPU
* if we are unlucky enough (this seems very unlikely).
*/
if (unlikely(kick))
- kick_defer_list_purge(sd, cpu);
+ kick_defer_list_purge(cpu);
}
static void skb_splice_csum_page(struct sk_buff *skb, struct page *page,
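With the spinlock and hand-rolled list gone, producers and the flushing CPU now meet on a lock-free llist. The scheme reduced to its two primitives (fragments of the code above):

	/* producer side (skb_attempt_defer_free), any CPU, no lock: */
	llist_add(&skb->ll_node, &sdn->defer_list);

	/* consumer side (skb_defer_free_flush) grabs the whole batch at once: */
	free_list = llist_del_all(&sdn->defer_list);
	llist_for_each_entry_safe(skb, next, free_list, ll_node)
		napi_consume_skb(skb, 1);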
diff --git a/net/core/skmsg.c b/net/core/skmsg.c
index 83c78379932e..2ac7731e1e0a 100644
--- a/net/core/skmsg.c
+++ b/net/core/skmsg.c
@@ -876,7 +876,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
sk_psock_stop(psock);
INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
- queue_rcu_work(system_wq, &psock->rwork);
+ queue_rcu_work(system_percpu_wq, &psock->rwork);
}
EXPORT_SYMBOL_GPL(sk_psock_drop);
diff --git a/net/core/sock.c b/net/core/sock.c
index 158bddd23134..dc03d4b5909a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -281,12 +281,12 @@ static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
/* Run time adjustable parameters. */
-__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_wmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_wmem_max);
-__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+__u32 sysctl_rmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_rmem_max);
-__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
-__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
+__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
+__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
@@ -491,13 +491,13 @@ int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
struct sk_buff_head *list = &sk->sk_receive_queue;
if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
trace_sock_rcvqueue_full(sk, skb);
return -ENOMEM;
}
if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
return -ENOBUFS;
}
@@ -562,7 +562,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
skb->dev = NULL;
if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
goto discard_and_relse;
}
@@ -585,7 +585,7 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
reason = SKB_DROP_REASON_PFMEMALLOC;
if (err == -ENOBUFS)
reason = SKB_DROP_REASON_SOCKET_BACKLOG;
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
goto discard_and_relse;
}
@@ -1032,7 +1032,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
bool charged;
int pages;
- if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
+ if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk))
return -EOPNOTSUPP;
if (!bytes)
@@ -1041,8 +1041,8 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
pages = sk_mem_pages(bytes);
/* pre-charge to memcg */
- charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
- GFP_KERNEL | __GFP_RETRY_MAYFAIL);
+ charged = mem_cgroup_sk_charge(sk, pages,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!charged)
return -ENOMEM;
@@ -1054,7 +1054,7 @@ static int sock_reserve_memory(struct sock *sk, int bytes)
*/
if (allocated > sk_prot_mem_limits(sk, 1)) {
sk_memory_allocated_sub(sk, pages);
- mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
+ mem_cgroup_sk_uncharge(sk, pages);
return -ENOMEM;
}
sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
@@ -2505,15 +2505,18 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk->sk_wmem_queued = 0;
newsk->sk_forward_alloc = 0;
newsk->sk_reserved_mem = 0;
- atomic_set(&newsk->sk_drops, 0);
+ DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
+ sk_drops_reset(newsk);
newsk->sk_send_head = NULL;
newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
atomic_set(&newsk->sk_zckey, 0);
sock_reset_flag(newsk, SOCK_DONE);
+#ifdef CONFIG_MEMCG
/* sk->sk_memcg will be populated at accept() time */
newsk->sk_memcg = NULL;
+#endif
cgroup_sk_clone(&newsk->sk_cgrp_data);
@@ -2584,7 +2587,7 @@ free:
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
-static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
+static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
{
bool is_ipv6 = false;
u32 max_size;
@@ -2594,8 +2597,8 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
#endif
/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
- max_size = is_ipv6 ? READ_ONCE(dst_dev(dst)->gso_max_size) :
- READ_ONCE(dst_dev(dst)->gso_ipv4_max_size);
+ max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) :
+ READ_ONCE(dev->gso_ipv4_max_size);
if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
max_size = GSO_LEGACY_MAX_SIZE;
@@ -2604,9 +2607,12 @@ static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
+ const struct net_device *dev;
u32 max_segs = 1;
- sk->sk_route_caps = dst_dev(dst)->features;
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
+ sk->sk_route_caps = dev->features;
if (sk_is_tcp(sk)) {
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -2622,13 +2628,14 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
} else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
- sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
+ sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
- max_segs = max_t(u32, READ_ONCE(dst_dev(dst)->gso_max_segs), 1);
+ max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
}
}
sk->sk_gso_max_segs = max_segs;
sk_dst_set(sk, dst);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
@@ -3158,23 +3165,27 @@ void __release_sock(struct sock *sk)
__acquires(&sk->sk_lock.slock)
{
struct sk_buff *skb, *next;
+ int nb = 0;
while ((skb = sk->sk_backlog.head) != NULL) {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
spin_unlock_bh(&sk->sk_lock.slock);
- do {
+ while (1) {
next = skb->next;
prefetch(next);
DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
skb_mark_not_on_list(skb);
sk_backlog_rcv(sk, skb);
- cond_resched();
-
skb = next;
- } while (skb != NULL);
+ if (!skb)
+ break;
+
+ if (!(++nb & 15))
+ cond_resched();
+ }
spin_lock_bh(&sk->sk_lock.slock);
}
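The mask test batches the yield: (++nb & 15) is zero on every 16th packet, so the backlog drain now calls cond_resched() once per 16 skbs rather than after every single one. Equivalent modulo form, for clarity:

	if (++nb % 16 == 0) /* same as !(++nb & 15); 16 is a power of two */
		cond_resched();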
@@ -3241,16 +3252,16 @@ EXPORT_SYMBOL(sk_wait_data);
*/
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
- struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
+ bool memcg_enabled = false, charged = false;
struct proto *prot = sk->sk_prot;
- bool charged = true;
long allocated;
sk_memory_allocated_add(sk, amt);
allocated = sk_memory_allocated(sk);
- if (memcg) {
- charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge());
+ if (mem_cgroup_sk_enabled(sk)) {
+ memcg_enabled = true;
+ charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
if (!charged)
goto suppress_allocation;
}
@@ -3324,21 +3335,19 @@ suppress_allocation:
*/
if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
/* Force charge with __GFP_NOFAIL */
- if (memcg && !charged) {
- mem_cgroup_charge_skmem(memcg, amt,
- gfp_memcg_charge() | __GFP_NOFAIL);
- }
+ if (memcg_enabled && !charged)
+ mem_cgroup_sk_charge(sk, amt,
+ gfp_memcg_charge() | __GFP_NOFAIL);
return 1;
}
}
- if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
- trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
+ trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
sk_memory_allocated_sub(sk, amt);
- if (memcg && charged)
- mem_cgroup_uncharge_skmem(memcg, amt);
+ if (charged)
+ mem_cgroup_sk_uncharge(sk, amt);
return 0;
}
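The __sk_mem_raise_allocated() change keeps the original ordering while dropping the cached memcg pointer: bump the global protocol counter first, attempt the memcg charge second, and on failure unwind in reverse. The skeleton, using the accessors introduced above (other exit paths elided):

	sk_memory_allocated_add(sk, amt);
	if (mem_cgroup_sk_enabled(sk)) {
		charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
		if (!charged)
			goto suppress_allocation;	/* may force-charge */
	}
	/* ... */
	/* failure: undo both steps, newest first */
	sk_memory_allocated_sub(sk, amt);
	if (charged)
		mem_cgroup_sk_uncharge(sk, amt);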
@@ -3376,8 +3385,8 @@ void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
sk_memory_allocated_sub(sk, amount);
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
- mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_uncharge(sk, amount);
if (sk_under_global_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
@@ -3691,7 +3700,7 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
*/
smp_wmb();
refcount_set(&sk->sk_refcnt, 1);
- atomic_set(&sk->sk_drops, 0);
+ sk_drops_reset(sk);
}
EXPORT_SYMBOL(sock_init_data_uid);
@@ -3951,7 +3960,7 @@ void sk_get_meminfo(const struct sock *sk, u32 *mem)
mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
- mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
+ mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
}
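sk_drops_reset()/sk_drops_read() replace the open-coded atomic accesses here and in sock_init_data_uid() above. The diff does not show their bodies; a plausible shape, assuming they still wrap the atomic counter one-to-one (an assumption, since such wrappers typically exist to allow a different representation later):

	/* Assumed one-to-one wrappers; not shown in this diff. */
	static inline void sk_drops_reset(struct sock *sk)
	{
		atomic_set(&sk->sk_drops, 0);
	}

	static inline int sk_drops_read(const struct sock *sk)
	{
		return atomic_read(&sk->sk_drops);
	}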
#ifdef CONFIG_PROC_FS
@@ -4432,7 +4441,9 @@ static int __init sock_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
+#ifdef CONFIG_MEMCG
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
+#endif
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
@@ -4441,7 +4452,7 @@ static int __init sock_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
- CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
@@ -4460,12 +4471,15 @@ static int __init sock_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index b23594c767f2..026ce9bd9e5e 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -348,7 +348,7 @@ static struct pernet_operations diag_net_ops = {
static int __init sock_diag_init(void)
{
- broadcast_wq = alloc_workqueue("sock_diag_events", 0, 0);
+ broadcast_wq = alloc_workqueue("sock_diag_events", WQ_PERCPU, 0);
BUG_ON(!broadcast_wq);
return register_pernet_subsys(&diag_net_ops);
}
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 491334b9b8be..9100e160113a 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -663,9 +663,8 @@ struct sk_buff *xdp_build_skb_from_buff(const struct xdp_buff *xdp)
u32 tsize;
tsize = sinfo->xdp_frags_truesize ? : nr_frags * xdp->frame_sz;
- xdp_update_skb_shared_info(skb, nr_frags,
- sinfo->xdp_frags_size, tsize,
- xdp_buff_is_frag_pfmemalloc(xdp));
+ xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size,
+ tsize, xdp_buff_get_skb_flags(xdp));
}
skb->protocol = eth_type_trans(skb, rxq->dev);
@@ -692,7 +691,7 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
struct skb_shared_info *sinfo = skb_shinfo(skb);
const struct skb_shared_info *xinfo;
u32 nr_frags, tsize = 0;
- bool pfmemalloc = false;
+ u32 flags = 0;
xinfo = xdp_get_shared_info_from_buff(xdp);
nr_frags = xinfo->nr_frags;
@@ -714,11 +713,12 @@ static noinline bool xdp_copy_frags_from_zc(struct sk_buff *skb,
__skb_fill_page_desc_noacc(sinfo, i, page, offset, len);
tsize += truesize;
- pfmemalloc |= page_is_pfmemalloc(page);
+ if (page_is_pfmemalloc(page))
+ flags |= XDP_FLAGS_FRAGS_PF_MEMALLOC;
}
- xdp_update_skb_shared_info(skb, nr_frags, xinfo->xdp_frags_size,
- tsize, pfmemalloc);
+ xdp_update_skb_frags_info(skb, nr_frags, xinfo->xdp_frags_size, tsize,
+ flags);
return true;
}
@@ -823,10 +823,9 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
skb_metadata_set(skb, xdpf->metasize);
if (unlikely(xdp_frame_has_frags(xdpf)))
- xdp_update_skb_shared_info(skb, nr_frags,
- sinfo->xdp_frags_size,
- nr_frags * xdpf->frame_sz,
- xdp_frame_is_frag_pfmemalloc(xdpf));
+ xdp_update_skb_frags_info(skb, nr_frags, sinfo->xdp_frags_size,
+ nr_frags * xdpf->frame_sz,
+ xdp_frame_get_skb_flags(xdpf));
/* Essential SKB info: protocol and skb->dev */
skb->protocol = eth_type_trans(skb, dev);
diff --git a/net/devlink/core.c b/net/devlink/core.c
index 7203c39532fc..58093f49c090 100644
--- a/net/devlink/core.c
+++ b/net/devlink/core.c
@@ -320,7 +320,7 @@ static void devlink_release(struct work_struct *work)
void devlink_put(struct devlink *devlink)
{
if (refcount_dec_and_test(&devlink->refcount))
- queue_rcu_work(system_wq, &devlink->rwork);
+ queue_rcu_work(system_percpu_wq, &devlink->rwork);
}
struct devlink *devlinks_xa_find_get(struct net *net, unsigned long *indexp)
diff --git a/net/devlink/health.c b/net/devlink/health.c
index b3ce8ecbb7fb..136a67c36a20 100644
--- a/net/devlink/health.c
+++ b/net/devlink/health.c
@@ -60,6 +60,7 @@ struct devlink_health_reporter {
struct devlink_port *devlink_port;
struct devlink_fmsg *dump_fmsg;
u64 graceful_period;
+ u64 burst_period;
bool auto_recover;
bool auto_dump;
u8 health_state;
@@ -108,11 +109,14 @@ devlink_port_health_reporter_find_by_name(struct devlink_port *devlink_port,
static struct devlink_health_reporter *
__devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
- if (WARN_ON(graceful_period && !ops->recover))
+ if (WARN_ON(ops->default_graceful_period && !ops->recover))
+ return ERR_PTR(-EINVAL);
+
+ if (WARN_ON(ops->default_burst_period && !ops->default_graceful_period))
return ERR_PTR(-EINVAL);
reporter = kzalloc(sizeof(*reporter), GFP_KERNEL);
@@ -122,7 +126,8 @@ __devlink_health_reporter_create(struct devlink *devlink,
reporter->priv = priv;
reporter->ops = ops;
reporter->devlink = devlink;
- reporter->graceful_period = graceful_period;
+ reporter->graceful_period = ops->default_graceful_period;
+ reporter->burst_period = ops->default_burst_period;
reporter->auto_recover = !!ops->recover;
reporter->auto_dump = !!ops->dump;
return reporter;
@@ -134,13 +139,12 @@ __devlink_health_reporter_create(struct devlink *devlink,
*
* @port: devlink_port to which health reports will relate
* @ops: devlink health reporter ops
- * @graceful_period: min time (in msec) between recovery attempts
* @priv: driver priv pointer
*/
struct devlink_health_reporter *
devl_port_health_reporter_create(struct devlink_port *port,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
@@ -150,8 +154,7 @@ devl_port_health_reporter_create(struct devlink_port *port,
ops->name))
return ERR_PTR(-EEXIST);
- reporter = __devlink_health_reporter_create(port->devlink, ops,
- graceful_period, priv);
+ reporter = __devlink_health_reporter_create(port->devlink, ops, priv);
if (IS_ERR(reporter))
return reporter;
@@ -164,14 +167,13 @@ EXPORT_SYMBOL_GPL(devl_port_health_reporter_create);
struct devlink_health_reporter *
devlink_port_health_reporter_create(struct devlink_port *port,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
struct devlink *devlink = port->devlink;
devl_lock(devlink);
- reporter = devl_port_health_reporter_create(port, ops,
- graceful_period, priv);
+ reporter = devl_port_health_reporter_create(port, ops, priv);
devl_unlock(devlink);
return reporter;
}
@@ -182,13 +184,12 @@ EXPORT_SYMBOL_GPL(devlink_port_health_reporter_create);
*
* @devlink: devlink instance to which the health reports will relate
* @ops: devlink health reporter ops
- * @graceful_period: min time (in msec) between recovery attempts
* @priv: driver priv pointer
*/
struct devlink_health_reporter *
devl_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
@@ -197,8 +198,7 @@ devl_health_reporter_create(struct devlink *devlink,
if (devlink_health_reporter_find_by_name(devlink, ops->name))
return ERR_PTR(-EEXIST);
- reporter = __devlink_health_reporter_create(devlink, ops,
- graceful_period, priv);
+ reporter = __devlink_health_reporter_create(devlink, ops, priv);
if (IS_ERR(reporter))
return reporter;
@@ -210,13 +210,12 @@ EXPORT_SYMBOL_GPL(devl_health_reporter_create);
struct devlink_health_reporter *
devlink_health_reporter_create(struct devlink *devlink,
const struct devlink_health_reporter_ops *ops,
- u64 graceful_period, void *priv)
+ void *priv)
{
struct devlink_health_reporter *reporter;
devl_lock(devlink);
- reporter = devl_health_reporter_create(devlink, ops,
- graceful_period, priv);
+ reporter = devl_health_reporter_create(devlink, ops, priv);
devl_unlock(devlink);
return reporter;
}
@@ -298,6 +297,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg,
reporter->graceful_period))
goto reporter_nest_cancel;
if (reporter->ops->recover &&
+ devlink_nl_put_u64(msg, DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
+ reporter->burst_period))
+ goto reporter_nest_cancel;
+ if (reporter->ops->recover &&
nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER,
reporter->auto_recover))
goto reporter_nest_cancel;
@@ -462,16 +465,33 @@ int devlink_nl_health_reporter_set_doit(struct sk_buff *skb,
if (!reporter->ops->recover &&
(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] ||
- info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]))
+ info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] ||
+ info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]))
return -EOPNOTSUPP;
if (!reporter->ops->dump &&
info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP])
return -EOPNOTSUPP;
- if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD])
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) {
reporter->graceful_period =
nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]);
+ if (!reporter->graceful_period)
+ reporter->burst_period = 0;
+ }
+
+ if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]) {
+ u64 burst_period =
+ nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD]);
+
+ if (!reporter->graceful_period && burst_period) {
+ NL_SET_ERR_MSG_MOD(info->extack,
+ "Cannot set burst period without a grace period.");
+ return -EINVAL;
+ }
+
+ reporter->burst_period = burst_period;
+ }
if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])
reporter->auto_recover =
@@ -514,11 +534,25 @@ static void devlink_recover_notify(struct devlink_health_reporter *reporter,
devlink_nl_notify_send_desc(devlink, msg, &desc);
}
+static bool
+devlink_health_reporter_in_burst(struct devlink_health_reporter *reporter)
+{
+ unsigned long burst_threshold = reporter->last_recovery_ts +
+ msecs_to_jiffies(reporter->burst_period);
+
+ return time_is_after_jiffies(burst_threshold);
+}
+
void
devlink_health_reporter_recovery_done(struct devlink_health_reporter *reporter)
{
reporter->recovery_count++;
- reporter->last_recovery_ts = jiffies;
+ if (!devlink_health_reporter_in_burst(reporter))
+ /* When burst period is set, last_recovery_ts marks the first
+ * recovery within the burst period, not necessarily the last
+ * one.
+ */
+ reporter->last_recovery_ts = jiffies;
}
EXPORT_SYMBOL_GPL(devlink_health_reporter_recovery_done);
@@ -592,12 +626,37 @@ dump_err:
return err;
}
+static bool
+devlink_health_recover_abort(struct devlink_health_reporter *reporter,
+ enum devlink_health_reporter_state prev_state)
+{
+ unsigned long recover_ts_threshold;
+
+ if (!reporter->auto_recover)
+ return false;
+
+ /* abort if the previous error wasn't recovered */
+ if (prev_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY)
+ return true;
+
+ if (devlink_health_reporter_in_burst(reporter))
+ return false;
+
+ recover_ts_threshold = reporter->last_recovery_ts +
+ msecs_to_jiffies(reporter->burst_period) +
+ msecs_to_jiffies(reporter->graceful_period);
+ if (reporter->last_recovery_ts && reporter->recovery_count &&
+ time_is_after_jiffies(recover_ts_threshold))
+ return true;
+
+ return false;
+}
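Both helpers compare a millisecond window against jiffies: a recovery inside [last_recovery_ts, last_recovery_ts + burst_period) counts as part of the same burst and does not advance the timestamp, and a new auto-recovery is aborted while jiffies is still before last_recovery_ts + burst_period + graceful_period. A self-contained sketch of that test, using only the jiffies helpers already seen in the hunk:

	static bool window_open(unsigned long last_ts, u64 burst_ms, u64 grace_ms)
	{
		unsigned long end = last_ts + msecs_to_jiffies(burst_ms) +
				    msecs_to_jiffies(grace_ms);

		return time_is_after_jiffies(end);	/* end still in the future */
	}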
+
int devlink_health_report(struct devlink_health_reporter *reporter,
const char *msg, void *priv_ctx)
{
enum devlink_health_reporter_state prev_health_state;
struct devlink *devlink = reporter->devlink;
- unsigned long recover_ts_threshold;
int ret;
/* write a log message of the current error */
@@ -608,13 +667,7 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR;
devlink_recover_notify(reporter, DEVLINK_CMD_HEALTH_REPORTER_RECOVER);
- /* abort if the previous error wasn't recovered */
- recover_ts_threshold = reporter->last_recovery_ts +
- msecs_to_jiffies(reporter->graceful_period);
- if (reporter->auto_recover &&
- (prev_health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY ||
- (reporter->last_recovery_ts && reporter->recovery_count &&
- time_is_after_jiffies(recover_ts_threshold)))) {
+ if (devlink_health_recover_abort(reporter, prev_health_state)) {
trace_devlink_health_recover_aborted(devlink,
reporter->ops->name,
reporter->health_state,
diff --git a/net/devlink/netlink_gen.c b/net/devlink/netlink_gen.c
index d97c326a9045..9fd00977d59e 100644
--- a/net/devlink/netlink_gen.c
+++ b/net/devlink/netlink_gen.c
@@ -389,7 +389,7 @@ static const struct nla_policy devlink_health_reporter_get_dump_nl_policy[DEVLIN
};
/* DEVLINK_CMD_HEALTH_REPORTER_SET - do */
-static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP + 1] = {
+static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD + 1] = {
[DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING, },
[DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32, },
@@ -397,6 +397,7 @@ static const struct nla_policy devlink_health_reporter_set_nl_policy[DEVLINK_ATT
[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64, },
[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8, },
[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP] = { .type = NLA_U8, },
+ [DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD] = { .type = NLA_U64, },
};
/* DEVLINK_CMD_HEALTH_REPORTER_RECOVER - do */
@@ -1032,7 +1033,7 @@ const struct genl_split_ops devlink_nl_ops[74] = {
.doit = devlink_nl_health_reporter_set_doit,
.post_doit = devlink_nl_post_doit,
.policy = devlink_health_reporter_set_nl_policy,
- .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_AUTO_DUMP,
+ .maxattr = DEVLINK_ATTR_HEALTH_REPORTER_BURST_PERIOD,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
},
{
diff --git a/net/devlink/param.c b/net/devlink/param.c
index 41dcc86cfd94..70e69523412c 100644
--- a/net/devlink/param.c
+++ b/net/devlink/param.c
@@ -102,6 +102,16 @@ static const struct devlink_param devlink_param_generic[] = {
.name = DEVLINK_PARAM_GENERIC_CLOCK_ID_NAME,
.type = DEVLINK_PARAM_GENERIC_CLOCK_ID_TYPE,
},
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_TOTAL_VFS,
+ .name = DEVLINK_PARAM_GENERIC_TOTAL_VFS_NAME,
+ .type = DEVLINK_PARAM_GENERIC_TOTAL_VFS_TYPE,
+ },
+ {
+ .id = DEVLINK_PARAM_GENERIC_ID_NUM_DOORBELLS,
+ .name = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_NAME,
+ .type = DEVLINK_PARAM_GENERIC_NUM_DOORBELLS_TYPE,
+ },
};
static int devlink_param_generic_verify(const struct devlink_param *param)
diff --git a/net/devlink/port.c b/net/devlink/port.c
index cb8d4df61619..93d8a25bb920 100644
--- a/net/devlink/port.c
+++ b/net/devlink/port.c
@@ -1333,8 +1333,8 @@ int devlink_port_netdevice_event(struct notifier_block *nb,
return NOTIFY_OK;
}
-static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
- enum devlink_port_flavour flavour)
+static void __devlink_port_attrs_set(struct devlink_port *devlink_port,
+ enum devlink_port_flavour flavour)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
@@ -1347,7 +1347,6 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
} else {
devlink_port->switch_port = false;
}
- return 0;
}
/**
@@ -1357,17 +1356,13 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
* @attrs: devlink port attrs
*/
void devlink_port_attrs_set(struct devlink_port *devlink_port,
- struct devlink_port_attrs *attrs)
+ const struct devlink_port_attrs *attrs)
{
- int ret;
-
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
+ WARN_ON(attrs->splittable && attrs->split);
devlink_port->attrs = *attrs;
- ret = __devlink_port_attrs_set(devlink_port, attrs->flavour);
- if (ret)
- return;
- WARN_ON(attrs->splittable && attrs->split);
+ __devlink_port_attrs_set(devlink_port, attrs->flavour);
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
@@ -1383,14 +1378,10 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro
u16 pf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
- int ret;
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
- ret = __devlink_port_attrs_set(devlink_port,
- DEVLINK_PORT_FLAVOUR_PCI_PF);
- if (ret)
- return;
+ __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF);
attrs->pci_pf.controller = controller;
attrs->pci_pf.pf = pf;
attrs->pci_pf.external = external;
@@ -1411,14 +1402,10 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro
u16 pf, u16 vf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
- int ret;
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
- ret = __devlink_port_attrs_set(devlink_port,
- DEVLINK_PORT_FLAVOUR_PCI_VF);
- if (ret)
- return;
+ __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF);
attrs->pci_vf.controller = controller;
attrs->pci_vf.pf = pf;
attrs->pci_vf.vf = vf;
@@ -1439,14 +1426,10 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro
u16 pf, u32 sf, bool external)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
- int ret;
ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port);
- ret = __devlink_port_attrs_set(devlink_port,
- DEVLINK_PORT_FLAVOUR_PCI_SF);
- if (ret)
- return;
+ __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF);
attrs->pci_sf.controller = controller;
attrs->pci_sf.pf = pf;
attrs->pci_sf.sf = sf;
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 4e3651101b86..43e211e611b1 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -613,7 +613,10 @@ EXPORT_SYMBOL(fwnode_get_mac_address);
*/
int device_get_mac_address(struct device *dev, char *addr)
{
- return fwnode_get_mac_address(dev_fwnode(dev), addr);
+ if (!fwnode_get_mac_address(dev_fwnode(dev), addr))
+ return 0;
+
+ return nvmem_get_mac_address(dev, addr);
}
EXPORT_SYMBOL(device_get_mac_address);
diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile
index a1490c4afe6b..1e493553b977 100644
--- a/net/ethtool/Makefile
+++ b/net/ethtool/Makefile
@@ -8,5 +8,5 @@ ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o rss.o \
linkstate.o debug.o wol.o features.o privflags.o rings.o \
channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \
tunnels.o fec.o eeprom.o stats.o phc_vclocks.o mm.o \
- module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o mm.o \
+ module.o cmis_fw_update.o cmis_cdb.o pse-pd.o plca.o \
phy.o tsconfig.o
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 92e6a681c797..55223ebc2a7e 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -577,6 +577,26 @@ int __ethtool_get_link(struct net_device *dev)
return netif_running(dev) && dev->ethtool_ops->get_link(dev);
}
+int ethtool_get_rx_ring_count(struct net_device *dev)
+{
+ const struct ethtool_ops *ops = dev->ethtool_ops;
+ struct ethtool_rxnfc rx_rings = {};
+ int ret;
+
+ if (ops->get_rx_ring_count)
+ return ops->get_rx_ring_count(dev);
+
+ if (!ops->get_rxnfc)
+ return -EOPNOTSUPP;
+
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret < 0)
+ return ret;
+
+ return rx_rings.data;
+}
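ethtool_get_rx_ring_count() gives callers one entry point: the new get_rx_ring_count op when the driver provides it, otherwise the legacy ETHTOOL_GRXRINGS query through get_rxnfc. Caller-side shape (mirrors the ioctl and RSS conversions later in this diff; a negative return is an errno):

	int rings = ethtool_get_rx_ring_count(dev);

	if (rings < 0)
		return rings;		/* -EOPNOTSUPP or driver error */
	if (indir[i] >= rings)		/* validate an RSS indirection entry */
		return -EINVAL;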
+
static int ethtool_get_rxnfc_rule_count(struct net_device *dev)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
diff --git a/net/ethtool/common.h b/net/ethtool/common.h
index c4d084dde5bf..1609cf4e53eb 100644
--- a/net/ethtool/common.h
+++ b/net/ethtool/common.h
@@ -54,6 +54,8 @@ void ethtool_ringparam_get_cfg(struct net_device *dev,
struct kernel_ethtool_ringparam *kparam,
struct netlink_ext_ack *extack);
+int ethtool_get_rx_ring_count(struct net_device *dev);
+
int __ethtool_get_ts_info(struct net_device *dev, struct kernel_ethtool_ts_info *info);
int ethtool_get_ts_info_by_phc(struct net_device *dev,
struct kernel_ethtool_ts_info *info,
diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c
index e7d3f2c352a3..4669e74cbcaa 100644
--- a/net/ethtool/fec.c
+++ b/net/ethtool/fec.c
@@ -17,6 +17,7 @@ struct fec_reply_data {
u64 stats[1 + ETHTOOL_MAX_LANES];
u8 cnt;
} corr, uncorr, corr_bits;
+ struct ethtool_fec_hist fec_stat_hist;
};
#define FEC_REPDATA(__reply_base) \
@@ -113,7 +114,10 @@ static int fec_prepare_data(const struct ethnl_req_info *req_base,
struct ethtool_fec_stats stats;
ethtool_stats_init((u64 *)&stats, sizeof(stats) / 8);
- dev->ethtool_ops->get_fec_stats(dev, &stats);
+ ethtool_stats_init((u64 *)data->fec_stat_hist.values,
+ sizeof(data->fec_stat_hist.values) / 8);
+ dev->ethtool_ops->get_fec_stats(dev, &stats,
+ &data->fec_stat_hist);
fec_stats_recalc(&data->corr, &stats.corrected_blocks);
fec_stats_recalc(&data->uncorr, &stats.uncorrectable_blocks);
@@ -157,13 +161,77 @@ static int fec_reply_size(const struct ethnl_req_info *req_base,
len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */
nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */
- if (req_base->flags & ETHTOOL_FLAG_STATS)
+ if (req_base->flags & ETHTOOL_FLAG_STATS) {
len += 3 * nla_total_size_64bit(sizeof(u64) *
(1 + ETHTOOL_MAX_LANES));
+ /* add FEC bins information */
+ len += (nla_total_size(0) + /* _A_FEC_HIST */
+ nla_total_size(4) + /* _A_FEC_HIST_BIN_LOW */
+ nla_total_size(4) + /* _A_FEC_HIST_BIN_HIGH */
+ /* _A_FEC_HIST_BIN_VAL + per-lane values */
+ nla_total_size_64bit(sizeof(u64)) +
+ nla_total_size_64bit(sizeof(u64) * ETHTOOL_MAX_LANES)) *
+ ETHTOOL_FEC_HIST_MAX;
+ }
return len;
}
+static int fec_put_hist(struct sk_buff *skb,
+ const struct ethtool_fec_hist *hist)
+{
+ const struct ethtool_fec_hist_range *ranges = hist->ranges;
+ const struct ethtool_fec_hist_value *values = hist->values;
+ struct nlattr *nest;
+ int i, j;
+ u64 sum;
+
+ if (!ranges)
+ return 0;
+
+ for (i = 0; i < ETHTOOL_FEC_HIST_MAX; i++) {
+ if (i && !ranges[i].low && !ranges[i].high)
+ break;
+
+ if (WARN_ON_ONCE(values[i].sum == ETHTOOL_STAT_NOT_SET &&
+ values[i].per_lane[0] == ETHTOOL_STAT_NOT_SET))
+ break;
+
+ nest = nla_nest_start(skb, ETHTOOL_A_FEC_STAT_HIST);
+ if (!nest)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_LOW,
+ ranges[i].low) ||
+ nla_put_u32(skb, ETHTOOL_A_FEC_HIST_BIN_HIGH,
+ ranges[i].high))
+ goto err_cancel_hist;
+ sum = 0;
+ for (j = 0; j < ETHTOOL_MAX_LANES; j++) {
+ if (values[i].per_lane[j] == ETHTOOL_STAT_NOT_SET)
+ break;
+ sum += values[i].per_lane[j];
+ }
+ if (nla_put_uint(skb, ETHTOOL_A_FEC_HIST_BIN_VAL,
+ values[i].sum == ETHTOOL_STAT_NOT_SET ?
+ sum : values[i].sum))
+ goto err_cancel_hist;
+ if (j && nla_put_64bit(skb, ETHTOOL_A_FEC_HIST_BIN_VAL_PER_LANE,
+ sizeof(u64) * j,
+ values[i].per_lane,
+ ETHTOOL_A_FEC_HIST_PAD))
+ goto err_cancel_hist;
+
+ nla_nest_end(skb, nest);
+ }
+
+ return 0;
+
+err_cancel_hist:
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+}
+
static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data)
{
struct nlattr *nest;
@@ -183,6 +251,9 @@ static int fec_put_stats(struct sk_buff *skb, const struct fec_reply_data *data)
data->corr_bits.stats, ETHTOOL_A_FEC_STAT_PAD))
goto err_cancel;
+ if (fec_put_hist(skb, &data->fec_stat_hist))
+ goto err_cancel;
+
nla_nest_end(skb, nest);
return 0;
diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c
index 43a7854e784e..fa83ddade4f8 100644
--- a/net/ethtool/ioctl.c
+++ b/net/ethtool/ioctl.c
@@ -1014,6 +1014,28 @@ static bool flow_type_hashable(u32 flow_type)
return false;
}
+static bool flow_type_v6(u32 flow_type)
+{
+ switch (flow_type) {
+ case TCP_V6_FLOW:
+ case UDP_V6_FLOW:
+ case SCTP_V6_FLOW:
+ case AH_ESP_V6_FLOW:
+ case AH_V6_FLOW:
+ case ESP_V6_FLOW:
+ case IPV6_FLOW:
+ case GTPU_V6_FLOW:
+ case GTPC_V6_FLOW:
+ case GTPC_TEID_V6_FLOW:
+ case GTPU_EH_V6_FLOW:
+ case GTPU_UL_V6_FLOW:
+ case GTPU_DL_V6_FLOW:
+ return true;
+ }
+
+ return false;
+}
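flow_type_v6() backs the RXH_IP6_FL sanity check added a few lines below: hashing on the IPv6 flow label only makes sense for IPv6 flow types. A sketch of a request the check now rejects (ethtool UAPI constants; RXH_IP6_FL is introduced by this series):

	struct ethtool_rxnfc info = {
		.cmd	   = ETHTOOL_SRXFH,
		.flow_type = TCP_V4_FLOW,		/* an IPv4 flow type... */
		.data	   = RXH_IP_SRC | RXH_IP_DST |
			     RXH_IP6_FL,		/* ...plus a v6-only bit: -EINVAL */
	};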
+
/* When adding a new type, update the assert and, if it's hashable, add it to
* the flow_type_hashable switch case.
*/
@@ -1077,6 +1099,9 @@ ethtool_set_rxfh_fields(struct net_device *dev, u32 cmd, void __user *useraddr)
if (rc)
return rc;
+ if (info.data & RXH_IP6_FL && !flow_type_v6(info.flow_type))
+ return -EINVAL;
+
if (info.flow_type & FLOW_RSS && info.rss_context &&
!ops->rxfh_per_ctx_fields)
return -EINVAL;
@@ -1183,18 +1208,41 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
return 0;
}
+static noinline_for_stack int ethtool_get_rxrings(struct net_device *dev,
+ u32 cmd,
+ void __user *useraddr)
+{
+ struct ethtool_rxnfc info;
+ size_t info_size;
+ int ret;
+
+ info_size = sizeof(info);
+ ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
+ if (ret)
+ return ret;
+
+ ret = ethtool_get_rx_ring_count(dev);
+ if (ret < 0)
+ return ret;
+
+ info.data = ret;
+
+ return ethtool_rxnfc_copy_to_user(useraddr, &info, info_size, NULL);
+}
+
static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
u32 cmd, void __user *useraddr)
{
- struct ethtool_rxnfc info;
- size_t info_size = sizeof(info);
const struct ethtool_ops *ops = dev->ethtool_ops;
- int ret;
+ struct ethtool_rxnfc info;
void *rule_buf = NULL;
+ size_t info_size;
+ int ret;
if (!ops->get_rxnfc)
return -EOPNOTSUPP;
+ info_size = sizeof(info);
ret = ethtool_rxnfc_copy_struct(cmd, &info, &info_size, useraddr);
if (ret)
return ret;
@@ -1221,8 +1269,8 @@ err_out:
}
static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
- struct ethtool_rxnfc *rx_rings,
- u32 size)
+ int num_rx_rings,
+ u32 size)
{
int i;
@@ -1231,7 +1279,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
/* Validate ring indices */
for (i = 0; i < size; i++)
- if (indir[i] >= rx_rings->data)
+ if (indir[i] >= num_rx_rings)
return -EINVAL;
return 0;
@@ -1302,13 +1350,12 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
const struct ethtool_ops *ops = dev->ethtool_ops;
struct ethtool_rxfh_param rxfh_dev = {};
struct netlink_ext_ack *extack = NULL;
- struct ethtool_rxnfc rx_rings;
+ int num_rx_rings;
u32 user_size, i;
int ret;
u32 ringidx_offset = offsetof(struct ethtool_rxfh_indir, ring_index[0]);
- if (!ops->get_rxfh_indir_size || !ops->set_rxfh ||
- !ops->get_rxnfc)
+ if (!ops->get_rxfh_indir_size || !ops->set_rxfh)
return -EOPNOTSUPP;
rxfh_dev.indir_size = ops->get_rxfh_indir_size(dev);
@@ -1328,20 +1375,21 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
if (!rxfh_dev.indir)
return -ENOMEM;
- rx_rings.cmd = ETHTOOL_GRXRINGS;
- ret = ops->get_rxnfc(dev, &rx_rings, NULL);
- if (ret)
+ num_rx_rings = ethtool_get_rx_ring_count(dev);
+ if (num_rx_rings < 0) {
+ ret = num_rx_rings;
goto out;
+ }
if (user_size == 0) {
u32 *indir = rxfh_dev.indir;
for (i = 0; i < rxfh_dev.indir_size; i++)
- indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ indir[i] = ethtool_rxfh_indir_default(i, num_rx_rings);
} else {
ret = ethtool_copy_validate_indir(rxfh_dev.indir,
useraddr + ringidx_offset,
- &rx_rings,
+ num_rx_rings,
rxfh_dev.indir_size);
if (ret)
goto out;
@@ -1483,14 +1531,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
struct ethtool_rxfh_param rxfh_dev = {};
struct ethtool_rxfh_context *ctx = NULL;
struct netlink_ext_ack *extack = NULL;
- struct ethtool_rxnfc rx_rings;
struct ethtool_rxfh rxfh;
bool create = false;
+ int num_rx_rings;
u8 *rss_config;
int ntf = 0;
int ret;
- if (!ops->get_rxnfc || !ops->set_rxfh)
+ if (!ops->set_rxfh)
return -EOPNOTSUPP;
if (ops->get_rxfh_indir_size)
@@ -1546,10 +1594,11 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
if (!rss_config)
return -ENOMEM;
- rx_rings.cmd = ETHTOOL_GRXRINGS;
- ret = ops->get_rxnfc(dev, &rx_rings, NULL);
- if (ret)
+ num_rx_rings = ethtool_get_rx_ring_count(dev);
+ if (num_rx_rings < 0) {
+ ret = num_rx_rings;
goto out_free;
+ }
/* rxfh.indir_size == 0 means reset the indir table to default (master
* context) or delete the context (other RSS contexts).
@@ -1562,7 +1611,7 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
rxfh_dev.indir_size = dev_indir_size;
ret = ethtool_copy_validate_indir(rxfh_dev.indir,
useraddr + rss_cfg_offset,
- &rx_rings,
+ num_rx_rings,
rxfh.indir_size);
if (ret)
goto out_free;
@@ -1574,7 +1623,8 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
rxfh_dev.indir_size = dev_indir_size;
indir = rxfh_dev.indir;
for (i = 0; i < dev_indir_size; i++)
- indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ indir[i] =
+ ethtool_rxfh_indir_default(i, num_rx_rings);
} else {
rxfh_dev.rss_delete = true;
}
@@ -3352,6 +3402,8 @@ __dev_ethtool(struct net *net, struct ifreq *ifr, void __user *useraddr,
rc = ethtool_set_rxfh_fields(dev, ethcmd, useraddr);
break;
case ETHTOOL_GRXRINGS:
+ rc = ethtool_get_rxrings(dev, ethcmd, useraddr);
+ break;
case ETHTOOL_GRXCLSRLCNT:
case ETHTOOL_GRXCLSRULE:
case ETHTOOL_GRXCLSRLALL:
diff --git a/net/ethtool/rss.c b/net/ethtool/rss.c
index 992e98abe9dd..4dced53be4b3 100644
--- a/net/ethtool/rss.c
+++ b/net/ethtool/rss.c
@@ -536,35 +536,36 @@ void ethtool_rss_notify(struct net_device *dev, u32 type, u32 rss_context)
#define RFH_MASK (RXH_L2DA | RXH_VLAN | RXH_IP_SRC | RXH_IP_DST | \
RXH_L3_PROTO | RXH_L4_B_0_1 | RXH_L4_B_2_3 | \
RXH_GTP_TEID | RXH_DISCARD)
+#define RFH_MASKv6 (RFH_MASK | RXH_IP6_FL)
static const struct nla_policy ethnl_rss_flows_policy[] = {
[ETHTOOL_A_FLOW_ETHER] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
[ETHTOOL_A_FLOW_IP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_IP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_TCP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
[ETHTOOL_A_FLOW_UDP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
[ETHTOOL_A_FLOW_SCTP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
[ETHTOOL_A_FLOW_AH_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_TCP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_UDP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_SCTP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_AH_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_AH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
[ETHTOOL_A_FLOW_ESP4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_AH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
+ [ETHTOOL_A_FLOW_ESP6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_GTPU4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_GTPC4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPC6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_GTPC_TEID4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPC_TEID6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_GTPU_EH4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU_EH6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_GTPU_UL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU_UL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
[ETHTOOL_A_FLOW_GTPU_DL4] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
- [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASK),
+ [ETHTOOL_A_FLOW_GTPU_DL6] = NLA_POLICY_MASK(NLA_UINT, RFH_MASKv6),
};
const struct nla_policy ethnl_rss_set_policy[ETHTOOL_A_RSS_FLOW_HASH + 1] = {
@@ -619,23 +620,22 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info,
struct rss_reply_data *data, struct ethtool_rxfh_param *rxfh,
bool *reset, bool *mod)
{
- const struct ethtool_ops *ops = dev->ethtool_ops;
struct netlink_ext_ack *extack = info->extack;
struct nlattr **tb = info->attrs;
- struct ethtool_rxnfc rx_rings;
size_t alloc_size;
+ int num_rx_rings;
u32 user_size;
int i, err;
if (!tb[ETHTOOL_A_RSS_INDIR])
return 0;
- if (!data->indir_size || !ops->get_rxnfc)
+ if (!data->indir_size)
return -EOPNOTSUPP;
- rx_rings.cmd = ETHTOOL_GRXRINGS;
- err = ops->get_rxnfc(dev, &rx_rings, NULL);
- if (err)
+ err = ethtool_get_rx_ring_count(dev);
+ if (err < 0)
return err;
+ num_rx_rings = err;
if (nla_len(tb[ETHTOOL_A_RSS_INDIR]) % 4) {
NL_SET_BAD_ATTR(info->extack, tb[ETHTOOL_A_RSS_INDIR]);
@@ -664,7 +664,7 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info,
nla_memcpy(rxfh->indir, tb[ETHTOOL_A_RSS_INDIR], alloc_size);
for (i = 0; i < user_size; i++) {
- if (rxfh->indir[i] < rx_rings.data)
+ if (rxfh->indir[i] < num_rx_rings)
continue;
NL_SET_ERR_MSG_ATTR_FMT(extack, tb[ETHTOOL_A_RSS_INDIR],
@@ -681,7 +681,7 @@ rss_set_prep_indir(struct net_device *dev, struct genl_info *info,
} else {
for (i = 0; i < data->indir_size; i++)
rxfh->indir[i] =
- ethtool_rxfh_indir_default(i, rx_rings.data);
+ ethtool_rxfh_indir_default(i, num_rx_rings);
}
*mod |= memcmp(rxfh->indir, data->indir_table, data->indir_size);
diff --git a/net/ethtool/tsconfig.c b/net/ethtool/tsconfig.c
index 2be356bdfe87..169b413b31fc 100644
--- a/net/ethtool/tsconfig.c
+++ b/net/ethtool/tsconfig.c
@@ -423,13 +423,11 @@ static int ethnl_set_tsconfig(struct ethnl_req_info *req_base,
return ret;
}
- if (hwprov_mod || config_mod) {
- ret = tsconfig_send_reply(dev, info);
- if (ret && ret != -EOPNOTSUPP) {
- NL_SET_ERR_MSG(info->extack,
- "error while reading the new configuration set");
- return ret;
- }
+ ret = tsconfig_send_reply(dev, info);
+ if (ret && ret != -EOPNOTSUPP) {
+ NL_SET_ERR_MSG(info->extack,
+ "error while reading the new configuration set");
+ return ret;
}
/* tsconfig has no notification */
diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c
index 102eccf5ead7..8177ac6c2d26 100644
--- a/net/hsr/hsr_slave.c
+++ b/net/hsr/hsr_slave.c
@@ -143,6 +143,7 @@ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev,
struct netlink_ext_ack *extack)
{
+ struct netdev_lag_upper_info lag_upper_info;
struct net_device *hsr_dev;
struct hsr_port *master;
int res;
@@ -159,7 +160,9 @@ static int hsr_portdev_setup(struct hsr_priv *hsr, struct net_device *dev,
master = hsr_port_get_hsr(hsr, HSR_PT_MASTER);
hsr_dev = master->dev;
- res = netdev_upper_dev_link(dev, hsr_dev, extack);
+ lag_upper_info.tx_type = NETDEV_LAG_TX_TYPE_BROADCAST;
+ lag_upper_info.hash_type = NETDEV_LAG_HASH_UNKNOWN;
+ res = netdev_master_upper_dev_link(dev, hsr_dev, NULL, &lag_upper_info, extack);
if (res)
goto fail_upper_dev_link;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 76e38092cd8a..3109c5ec38f3 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -102,6 +102,7 @@
#include <net/gro.h>
#include <net/gso.h>
#include <net/tcp.h>
+#include <net/psp.h>
#include <net/udp.h>
#include <net/udplite.h>
#include <net/ping.h>
@@ -158,6 +159,7 @@ void inet_sock_destruct(struct sock *sk)
kfree(rcu_dereference_protected(inet->inet_opt, 1));
dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1));
dst_release(rcu_dereference_protected(sk->sk_rx_dst, 1));
+ psp_sk_assoc_free(sk);
}
EXPORT_SYMBOL(inet_sock_destruct);
@@ -1393,14 +1395,10 @@ struct sk_buff *inet_gso_segment(struct sk_buff *skb,
segs = ERR_PTR(-EPROTONOSUPPORT);
- if (!skb->encapsulation || encap) {
- udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
- fixedid = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TCP_FIXEDID);
+ fixedid = !!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCP_FIXEDID << encap));
- /* fixed ID is invalid if DF bit is not set */
- if (fixedid && !(ip_hdr(skb)->frag_off & htons(IP_DF)))
- goto out;
- }
+ if (!skb->encapsulation || encap)
+ udpfrag = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
ops = rcu_dereference(inet_offloads[proto]);
if (likely(ops && ops->callbacks.gso_segment)) {
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 5cfc1c939673..833f2cf97178 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -170,7 +170,7 @@ struct neigh_table arp_tbl = {
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
[NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
- [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT,
[NEIGH_VAR_PROXY_QLEN] = 64,
[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
[NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c
index 740af8541d2f..709021197e1c 100644
--- a/net/ipv4/cipso_ipv4.c
+++ b/net/ipv4/cipso_ipv4.c
@@ -1715,8 +1715,7 @@ validate_return:
*/
void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
{
- unsigned char optbuf[sizeof(struct ip_options) + 40];
- struct ip_options *opt = (struct ip_options *)optbuf;
+ struct inet_skb_parm parm;
int res;
if (ip_hdr(skb)->protocol == IPPROTO_ICMP || error != -EACCES)
@@ -1727,19 +1726,19 @@ void cipso_v4_error(struct sk_buff *skb, int error, u32 gateway)
* so we can not use icmp_send and IPCB here.
*/
- memset(opt, 0, sizeof(struct ip_options));
- opt->optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr);
+ memset(&parm, 0, sizeof(parm));
+ parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
rcu_read_lock();
- res = __ip_options_compile(dev_net(skb->dev), opt, skb, NULL);
+ res = __ip_options_compile(dev_net(skb->dev), &parm.opt, skb, NULL);
rcu_read_unlock();
if (res)
return;
if (gateway)
- __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, opt);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_NET_ANO, 0, &parm);
else
- __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, opt);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_ANO, 0, &parm);
}
/**
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index f14a41ee4aa1..2c922afadb8f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -132,8 +132,8 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x)
dport = encap->encap_dport;
spin_unlock_bh(&x->lock);
- sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4,
- dport, x->props.saddr.a4, sport, 0);
+ sk = inet_lookup_established(net, x->id.daddr.a4, dport,
+ x->props.saddr.a4, sport, 0);
if (!sk)
return ERR_PTR(-ENOENT);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6e1b94796f67..1dab44e13d3b 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -32,6 +32,7 @@
#include <linux/list.h>
#include <linux/slab.h>
+#include <net/flow.h>
#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -293,7 +294,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
.flowi4_iif = LOOPBACK_IFINDEX,
.flowi4_l3mdev = l3mdev_master_ifindex_rcu(dev),
.daddr = ip_hdr(skb)->saddr,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb))),
+ .flowi4_dscp = ip4h_dscp(ip_hdr(skb)),
.flowi4_scope = scope,
.flowi4_mark = vmark ? skb->mark : 0,
};
@@ -358,7 +359,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
fl4.daddr = src;
fl4.saddr = dst;
- fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4.flowi4_dscp = dscp;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_tun_key.tun_id = 0;
fl4.flowi4_flags = 0;
@@ -1372,7 +1373,7 @@ static void nl_fib_lookup(struct net *net, struct fib_result_nl *frn)
struct flowi4 fl4 = {
.flowi4_mark = frn->fl_mark,
.daddr = frn->fl_addr,
- .flowi4_tos = frn->fl_tos & INET_DSCP_MASK,
+ .flowi4_dscp = inet_dsfield_to_dscp(frn->fl_tos),
.flowi4_scope = frn->fl_scope,
};
struct fib_table *tb;
diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c
index fa58d6620ed6..51f0193092f0 100644
--- a/net/ipv4/fib_rules.c
+++ b/net/ipv4/fib_rules.c
@@ -23,6 +23,7 @@
#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
+#include <net/flow.h>
#include <net/inet_dscp.h>
#include <net/ip.h>
#include <net/route.h>
@@ -193,8 +194,7 @@ INDIRECT_CALLABLE_SCOPE int fib4_rule_match(struct fib_rule *rule,
* to mask the upper three DSCP bits prior to matching to maintain
* legacy behavior.
*/
- if (r->dscp_full &&
- (r->dscp ^ inet_dsfield_to_dscp(fl4->flowi4_tos)) & r->dscp_mask)
+ if (r->dscp_full && (r->dscp ^ fl4->flowi4_dscp) & r->dscp_mask)
return 0;
else if (!r->dscp_full && r->dscp &&
!fib_dscp_masked_match(r->dscp, fl4))
diff --git a/net/ipv4/fou_core.c b/net/ipv4/fou_core.c
index 3e30745e2c09..3970b6b7ace5 100644
--- a/net/ipv4/fou_core.c
+++ b/net/ipv4/fou_core.c
@@ -228,21 +228,27 @@ drop:
return 0;
}
+static const struct net_offload *fou_gro_ops(const struct sock *sk,
+ int proto)
+{
+ const struct net_offload __rcu **offloads;
+
+ /* FOU doesn't allow IPv4 on IPv6 sockets. */
+ offloads = sk->sk_family == AF_INET6 ? inet6_offloads : inet_offloads;
+ return rcu_dereference(offloads[proto]);
+}
+
static struct sk_buff *fou_gro_receive(struct sock *sk,
struct list_head *head,
struct sk_buff *skb)
{
- const struct net_offload __rcu **offloads;
struct fou *fou = fou_from_sock(sk);
const struct net_offload *ops;
struct sk_buff *pp = NULL;
- u8 proto;
if (!fou)
goto out;
- proto = fou->protocol;
-
/* We can clear the encap_mark for FOU as we are essentially doing
* one of two possible things. We are either adding an L4 tunnel
* header to the outer L3 tunnel header, or we are simply
@@ -254,8 +260,7 @@ static struct sk_buff *fou_gro_receive(struct sock *sk,
/* Flag this frame as already having an outer encap header */
NAPI_GRO_CB(skb)->is_fou = 1;
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
+ ops = fou_gro_ops(sk, fou->protocol);
if (!ops || !ops->callbacks.gro_receive)
goto out;
@@ -268,10 +273,8 @@ out:
static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
int nhoff)
{
- const struct net_offload __rcu **offloads;
struct fou *fou = fou_from_sock(sk);
const struct net_offload *ops;
- u8 proto;
int err;
if (!fou) {
@@ -279,10 +282,7 @@ static int fou_gro_complete(struct sock *sk, struct sk_buff *skb,
goto out;
}
- proto = fou->protocol;
-
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
+ ops = fou_gro_ops(sk, fou->protocol);
if (WARN_ON(!ops || !ops->callbacks.gro_complete)) {
err = -ENOSYS;
goto out;
@@ -323,7 +323,6 @@ static struct sk_buff *gue_gro_receive(struct sock *sk,
struct list_head *head,
struct sk_buff *skb)
{
- const struct net_offload __rcu **offloads;
const struct net_offload *ops;
struct sk_buff *pp = NULL;
struct sk_buff *p;
@@ -450,8 +449,7 @@ next_proto:
/* Flag this frame as already having an outer encap header */
NAPI_GRO_CB(skb)->is_fou = 1;
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
+ ops = fou_gro_ops(sk, proto);
if (!ops || !ops->callbacks.gro_receive)
goto out;
@@ -467,7 +465,6 @@ out:
static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
{
struct guehdr *guehdr = (struct guehdr *)(skb->data + nhoff);
- const struct net_offload __rcu **offloads;
const struct net_offload *ops;
unsigned int guehlen = 0;
u8 proto;
@@ -494,8 +491,7 @@ static int gue_gro_complete(struct sock *sk, struct sk_buff *skb, int nhoff)
return err;
}
- offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
- ops = rcu_dereference(offloads[proto]);
+ ops = fou_gro_ops(sk, proto);
if (WARN_ON(!ops || !ops->callbacks.gro_complete))
goto out;
diff --git a/net/ipv4/fou_nl.c b/net/ipv4/fou_nl.c
index 3d9614609b2d..506260b4a4dc 100644
--- a/net/ipv4/fou_nl.c
+++ b/net/ipv4/fou_nl.c
@@ -18,9 +18,9 @@ const struct nla_policy fou_nl_policy[FOU_ATTR_IFINDEX + 1] = {
[FOU_ATTR_TYPE] = { .type = NLA_U8, },
[FOU_ATTR_REMCSUM_NOPARTIAL] = { .type = NLA_FLAG, },
[FOU_ATTR_LOCAL_V4] = { .type = NLA_U32, },
- [FOU_ATTR_LOCAL_V6] = { .len = 16, },
+ [FOU_ATTR_LOCAL_V6] = NLA_POLICY_EXACT_LEN(16),
[FOU_ATTR_PEER_V4] = { .type = NLA_U32, },
- [FOU_ATTR_PEER_V6] = { .len = 16, },
+ [FOU_ATTR_PEER_V6] = NLA_POLICY_EXACT_LEN(16),
[FOU_ATTR_PEER_PORT] = { .type = NLA_BE16, },
[FOU_ATTR_IFINDEX] = { .type = NLA_S32, },
};
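The FOU policy change is a semantic fix, not a cleanup: for an attribute without a fixed netlink type, a bare .len effectively acts as a minimum length, so over-long payloads were accepted. NLA_POLICY_EXACT_LEN() pins the size. In policy terms:

	[FOU_ATTR_LOCAL_V6] = { .len = 16 },		/* >= 16 bytes accepted */
	[FOU_ATTR_LOCAL_V6] = NLA_POLICY_EXACT_LEN(16),	/* exactly 16 bytes */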
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c48c572f024d..1b7fb5d935ed 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -72,6 +72,7 @@
#include <linux/string.h>
#include <linux/netfilter_ipv4.h>
#include <linux/slab.h>
+#include <net/flow.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/route.h>
@@ -318,17 +319,17 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt,
return true;
/* No rate limit on loopback */
- dev = dst_dev(dst);
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
if (dev && (dev->flags & IFF_LOOPBACK))
goto out;
- rcu_read_lock();
peer = inet_getpeer_v4(net->ipv4.peers, fl4->daddr,
l3mdev_master_ifindex_rcu(dev));
rc = inet_peer_xrlim_allow(peer,
READ_ONCE(net->ipv4.sysctl_icmp_ratelimit));
- rcu_read_unlock();
out:
+ rcu_read_unlock();
if (!rc)
__ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST);
else
@@ -444,7 +445,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb)
fl4.saddr = saddr;
fl4.flowi4_mark = mark;
fl4.flowi4_uid = sock_net_uid(net, NULL);
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(skb)));
+ fl4.flowi4_dscp = ip4h_dscp(ip_hdr(skb));
fl4.flowi4_proto = IPPROTO_ICMP;
fl4.flowi4_oif = l3mdev_master_ifindex(skb->dev);
security_skb_classify_flow(skb, flowi4_to_flowi_common(&fl4));
@@ -495,7 +496,7 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
fl4->saddr = saddr;
fl4->flowi4_mark = mark;
fl4->flowi4_uid = sock_net_uid(net, NULL);
- fl4->flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4->flowi4_dscp = dscp;
fl4->flowi4_proto = IPPROTO_ICMP;
fl4->fl4_icmp_type = type;
fl4->fl4_icmp_code = code;
@@ -544,14 +545,15 @@ static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4,
goto relookup_failed;
}
/* Ugh! */
- orefdst = skb_in->_skb_refdst; /* save old refdst */
- skb_dst_set(skb_in, NULL);
+ orefdst = skb_dstref_steal(skb_in);
err = ip_route_input(skb_in, fl4_dec.daddr, fl4_dec.saddr,
dscp, rt2->dst.dev) ? -EINVAL : 0;
dst_release(&rt2->dst);
rt2 = skb_rtable(skb_in);
- skb_in->_skb_refdst = orefdst; /* restore old refdst */
+ /* steal dst entry from skb_in, don't drop refcnt */
+ skb_dstref_steal(skb_in);
+ skb_dstref_restore(skb_in, orefdst);
}
if (err)
@@ -592,7 +594,7 @@ relookup_failed:
*/
void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
- const struct ip_options *opt)
+ const struct inet_skb_parm *parm)
{
struct iphdr *iph;
int room;
@@ -708,7 +710,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
rcu_read_lock();
if (rt_is_input_route(rt) &&
READ_ONCE(net->ipv4.sysctl_icmp_errors_use_inbound_ifaddr))
- dev = dev_get_by_index_rcu(net, inet_iif(skb_in));
+ dev = dev_get_by_index_rcu(net, parm->iif ? parm->iif :
+ inet_iif(skb_in));
if (dev)
saddr = inet_select_addr(dev, iph->saddr,
@@ -723,7 +726,8 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info,
iph->tos;
mark = IP4_REPLY_MARK(net, skb_in->mark);
- if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in, opt))
+ if (__ip_options_echo(net, &icmp_param.replyopts.opt.opt, skb_in,
+ &parm->opt))
goto out_unlock;
@@ -797,15 +801,16 @@ EXPORT_SYMBOL(__icmp_send);
void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
{
struct sk_buff *cloned_skb = NULL;
- struct ip_options opts = { 0 };
enum ip_conntrack_info ctinfo;
enum ip_conntrack_dir dir;
+ struct inet_skb_parm parm;
struct nf_conn *ct;
__be32 orig_ip;
+ memset(&parm, 0, sizeof(parm));
ct = nf_ct_get(skb_in, &ctinfo);
if (!ct || !(READ_ONCE(ct->status) & IPS_NAT_MASK)) {
- __icmp_send(skb_in, type, code, info, &opts);
+ __icmp_send(skb_in, type, code, info, &parm);
return;
}
@@ -821,7 +826,7 @@ void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info)
orig_ip = ip_hdr(skb_in)->saddr;
dir = CTINFO2DIR(ctinfo);
ip_hdr(skb_in)->saddr = ct->tuplehash[dir].tuple.src.u3.ip;
- __icmp_send(skb_in, type, code, info, &opts);
+ __icmp_send(skb_in, type, code, info, &parm);
ip_hdr(skb_in)->saddr = orig_ip;
out:
consume_skb(cloned_skb);
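The icmp_route_lookup() hunk above replaces the open-coded _skb_refdst save/restore with the skb_dstref_steal()/skb_dstref_restore() pair. The shape of the pattern, with the helpers' signatures assumed from their use in that hunk:

	unsigned long orefdst;

	orefdst = skb_dstref_steal(skb);	/* save and clear skb->_skb_refdst */
	/* ... a route lookup installs a temporary dst on skb ... */
	skb_dstref_steal(skb);			/* discard it, refcount untouched */
	skb_dstref_restore(skb, orefdst);	/* put the original back */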
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 1e2df51427fe..cdd1e12aac8c 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -423,7 +423,7 @@ success:
}
static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
- struct sock *sk)
+ const struct sock *sk)
{
if (tb->fastreuseport <= 0)
return 0;
@@ -453,8 +453,9 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
ipv6_only_sock(sk), true, false);
}
-void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
- struct sock *sk)
+void inet_csk_update_fastreuse(const struct sock *sk,
+ struct inet_bind_bucket *tb,
+ struct inet_bind2_bucket *tb2)
{
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
@@ -501,6 +502,9 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
tb->fastreuseport = 0;
}
}
+
+ tb2->fastreuse = tb->fastreuse;
+ tb2->fastreuseport = tb->fastreuseport;
}
/* Obtain a reference to a local port for the given sock,
@@ -582,7 +586,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
}
success:
- inet_csk_update_fastreuse(tb, sk);
+ inet_csk_update_fastreuse(sk, tb, tb2);
if (!inet_csk(sk)->icsk_bind_hash)
inet_bind_hash(sk, tb, tb2, port);
@@ -706,9 +710,9 @@ struct sock *inet_csk_accept(struct sock *sk, struct proto_accept_arg *arg)
spin_unlock_bh(&queue->fastopenq.lock);
}
-out:
release_sock(sk);
- if (newsk && mem_cgroup_sockets_enabled) {
+
+ if (mem_cgroup_sockets_enabled) {
gfp_t gfp = GFP_KERNEL | __GFP_NOFAIL;
int amt = 0;
@@ -718,7 +722,7 @@ out:
lock_sock(newsk);
mem_cgroup_sk_alloc(newsk);
- if (newsk->sk_memcg) {
+ if (mem_cgroup_from_sk(newsk)) {
/* The socket has not been accepted yet, no need
* to look at newsk->sk_wmem_queued.
*/
@@ -727,23 +731,22 @@ out:
}
if (amt)
- mem_cgroup_charge_skmem(newsk->sk_memcg, amt, gfp);
+ mem_cgroup_sk_charge(newsk, amt, gfp);
kmem_cache_charge(newsk, gfp);
release_sock(newsk);
}
+
if (req)
reqsk_put(req);
- if (newsk)
- inet_init_csk_locks(newsk);
-
+ inet_init_csk_locks(newsk);
return newsk;
+
out_err:
- newsk = NULL;
- req = NULL;
+ release_sock(sk);
arg->err = error;
- goto out;
+ return NULL;
}
EXPORT_SYMBOL(inet_csk_accept);
@@ -1297,12 +1300,19 @@ void inet_csk_destroy_sock(struct sock *sk)
xfrm_sk_free_policy(sk);
- this_cpu_dec(*sk->sk_prot->orphan_count);
+ tcp_orphan_count_dec();
sock_put(sk);
}
EXPORT_SYMBOL(inet_csk_destroy_sock);
+void inet_csk_prepare_for_destroy_sock(struct sock *sk)
+{
+ /* The below has to be done to allow calling inet_csk_destroy_sock */
+ sock_set_flag(sk, SOCK_DEAD);
+ tcp_orphan_count_inc();
+}
+
/* This function allows to force a closure of a socket after the call to
* tcp_create_openreq_child().
*/
@@ -1370,7 +1380,7 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
sock_orphan(child);
- this_cpu_inc(*sk->sk_prot->orphan_count);
+ tcp_orphan_count_inc();
if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->tfo_listener) {
BUG_ON(rcu_access_pointer(tcp_sk(child)->fastopen_rsk) != req);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index 2fa53b16fe77..f0b6c5a411a2 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -20,9 +20,6 @@
#include <net/ipv6.h>
#include <net/inet_common.h>
#include <net/inet_connection_sock.h>
-#include <net/inet_hashtables.h>
-#include <net/inet_timewait_sock.h>
-#include <net/inet6_hashtables.h>
#include <net/bpf_sk_storage.h>
#include <net/netlink.h>
@@ -74,54 +71,29 @@ static void inet_diag_unlock_handler(const struct inet_diag_handler *handler)
void inet_diag_msg_common_fill(struct inet_diag_msg *r, struct sock *sk)
{
- r->idiag_family = sk->sk_family;
+ r->idiag_family = READ_ONCE(sk->sk_family);
- r->id.idiag_sport = htons(sk->sk_num);
- r->id.idiag_dport = sk->sk_dport;
- r->id.idiag_if = sk->sk_bound_dev_if;
+ r->id.idiag_sport = htons(READ_ONCE(sk->sk_num));
+ r->id.idiag_dport = READ_ONCE(sk->sk_dport);
+ r->id.idiag_if = READ_ONCE(sk->sk_bound_dev_if);
sock_diag_save_cookie(sk, r->id.idiag_cookie);
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
- *(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr;
- *(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr;
+ if (r->idiag_family == AF_INET6) {
+ data_race(*(struct in6_addr *)r->id.idiag_src = sk->sk_v6_rcv_saddr);
+ data_race(*(struct in6_addr *)r->id.idiag_dst = sk->sk_v6_daddr);
} else
#endif
{
memset(&r->id.idiag_src, 0, sizeof(r->id.idiag_src));
memset(&r->id.idiag_dst, 0, sizeof(r->id.idiag_dst));
- r->id.idiag_src[0] = sk->sk_rcv_saddr;
- r->id.idiag_dst[0] = sk->sk_daddr;
+ r->id.idiag_src[0] = READ_ONCE(sk->sk_rcv_saddr);
+ r->id.idiag_dst[0] = READ_ONCE(sk->sk_daddr);
}
}
EXPORT_SYMBOL_GPL(inet_diag_msg_common_fill);
-static size_t inet_sk_attr_size(struct sock *sk,
- const struct inet_diag_req_v2 *req,
- bool net_admin)
-{
- const struct inet_diag_handler *handler;
- size_t aux = 0;
-
- rcu_read_lock();
- handler = rcu_dereference(inet_diag_table[req->sdiag_protocol]);
- DEBUG_NET_WARN_ON_ONCE(!handler);
- if (handler && handler->idiag_get_aux_size)
- aux = handler->idiag_get_aux_size(sk, net_admin);
- rcu_read_unlock();
-
- return nla_total_size(sizeof(struct tcp_info))
- + nla_total_size(sizeof(struct inet_diag_msg))
- + inet_diag_msg_attrs_size()
- + nla_total_size(sizeof(struct inet_diag_meminfo))
- + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
- + nla_total_size(TCP_CA_NAME_MAX)
- + nla_total_size(sizeof(struct tcpvegas_info))
- + aux
- + 64;
-}
-
int inet_diag_msg_attrs_fill(struct sock *sk, struct sk_buff *skb,
struct inet_diag_msg *r, int ext,
struct user_namespace *user_ns,
@@ -313,17 +285,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
icsk_pending == ICSK_TIME_REO_TIMEOUT ||
icsk_pending == ICSK_TIME_LOSS_PROBE) {
r->idiag_timer = 1;
- r->idiag_retrans = icsk->icsk_retransmits;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_retransmits);
r->idiag_expires =
jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
} else if (icsk_pending == ICSK_TIME_PROBE0) {
r->idiag_timer = 4;
- r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
r->idiag_expires =
jiffies_delta_to_msecs(icsk_timeout(icsk) - jiffies);
} else if (timer_pending(&sk->sk_timer)) {
r->idiag_timer = 2;
- r->idiag_retrans = icsk->icsk_probes_out;
+ r->idiag_retrans = READ_ONCE(icsk->icsk_probes_out);
r->idiag_expires =
jiffies_delta_to_msecs(sk->sk_timer.expires - jiffies);
}
@@ -422,183 +394,6 @@ errout:
}
EXPORT_SYMBOL_GPL(inet_sk_diag_fill);
-static int inet_twsk_diag_fill(struct sock *sk,
- struct sk_buff *skb,
- struct netlink_callback *cb,
- u16 nlmsg_flags, bool net_admin)
-{
- struct inet_timewait_sock *tw = inet_twsk(sk);
- struct inet_diag_msg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type,
- sizeof(*r), nlmsg_flags);
- if (!nlh)
- return -EMSGSIZE;
-
- r = nlmsg_data(nlh);
- BUG_ON(tw->tw_state != TCP_TIME_WAIT);
-
- inet_diag_msg_common_fill(r, sk);
- r->idiag_retrans = 0;
-
- r->idiag_state = READ_ONCE(tw->tw_substate);
- r->idiag_timer = 3;
- tmo = tw->tw_timer.expires - jiffies;
- r->idiag_expires = jiffies_delta_to_msecs(tmo);
- r->idiag_rqueue = 0;
- r->idiag_wqueue = 0;
- r->idiag_uid = 0;
- r->idiag_inode = 0;
-
- if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
- tw->tw_mark)) {
- nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
- }
-
- nlmsg_end(skb, nlh);
- return 0;
-}
-
-static int inet_req_diag_fill(struct sock *sk, struct sk_buff *skb,
- struct netlink_callback *cb,
- u16 nlmsg_flags, bool net_admin)
-{
- struct request_sock *reqsk = inet_reqsk(sk);
- struct inet_diag_msg *r;
- struct nlmsghdr *nlh;
- long tmo;
-
- nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
- cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
- if (!nlh)
- return -EMSGSIZE;
-
- r = nlmsg_data(nlh);
- inet_diag_msg_common_fill(r, sk);
- r->idiag_state = TCP_SYN_RECV;
- r->idiag_timer = 1;
- r->idiag_retrans = reqsk->num_retrans;
-
- BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
- offsetof(struct sock, sk_cookie));
-
- tmo = inet_reqsk(sk)->rsk_timer.expires - jiffies;
- r->idiag_expires = jiffies_delta_to_msecs(tmo);
- r->idiag_rqueue = 0;
- r->idiag_wqueue = 0;
- r->idiag_uid = 0;
- r->idiag_inode = 0;
-
- if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
- inet_rsk(reqsk)->ir_mark)) {
- nlmsg_cancel(skb, nlh);
- return -EMSGSIZE;
- }
-
- nlmsg_end(skb, nlh);
- return 0;
-}
-
-static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r,
- u16 nlmsg_flags, bool net_admin)
-{
- if (sk->sk_state == TCP_TIME_WAIT)
- return inet_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
-
- if (sk->sk_state == TCP_NEW_SYN_RECV)
- return inet_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
-
- return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags,
- net_admin);
-}
-
-struct sock *inet_diag_find_one_icsk(struct net *net,
- struct inet_hashinfo *hashinfo,
- const struct inet_diag_req_v2 *req)
-{
- struct sock *sk;
-
- rcu_read_lock();
- if (req->sdiag_family == AF_INET)
- sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0],
- req->id.idiag_dport, req->id.idiag_src[0],
- req->id.idiag_sport, req->id.idiag_if);
-#if IS_ENABLED(CONFIG_IPV6)
- else if (req->sdiag_family == AF_INET6) {
- if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
- ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
- sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3],
- req->id.idiag_dport, req->id.idiag_src[3],
- req->id.idiag_sport, req->id.idiag_if);
- else
- sk = inet6_lookup(net, hashinfo, NULL, 0,
- (struct in6_addr *)req->id.idiag_dst,
- req->id.idiag_dport,
- (struct in6_addr *)req->id.idiag_src,
- req->id.idiag_sport,
- req->id.idiag_if);
- }
-#endif
- else {
- rcu_read_unlock();
- return ERR_PTR(-EINVAL);
- }
- rcu_read_unlock();
- if (!sk)
- return ERR_PTR(-ENOENT);
-
- if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
- sock_gen_put(sk);
- return ERR_PTR(-ENOENT);
- }
-
- return sk;
-}
-EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk);
-
-int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *req)
-{
- struct sk_buff *in_skb = cb->skb;
- bool net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
- struct net *net = sock_net(in_skb->sk);
- struct sk_buff *rep;
- struct sock *sk;
- int err;
-
- sk = inet_diag_find_one_icsk(net, hashinfo, req);
- if (IS_ERR(sk))
- return PTR_ERR(sk);
-
- rep = nlmsg_new(inet_sk_attr_size(sk, req, net_admin), GFP_KERNEL);
- if (!rep) {
- err = -ENOMEM;
- goto out;
- }
-
- err = sk_diag_fill(sk, rep, cb, req, 0, net_admin);
- if (err < 0) {
- WARN_ON(err == -EMSGSIZE);
- nlmsg_free(rep);
- goto out;
- }
- err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
-
-out:
- if (sk)
- sock_gen_put(sk);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
-
static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
const struct nlmsghdr *nlh,
int hdrlen,
@@ -785,7 +580,7 @@ static void entry_fill_addrs(struct inet_diag_entry *entry,
const struct sock *sk)
{
#if IS_ENABLED(CONFIG_IPV6)
- if (sk->sk_family == AF_INET6) {
+ if (entry->family == AF_INET6) {
entry->saddr = sk->sk_v6_rcv_saddr.s6_addr32;
entry->daddr = sk->sk_v6_daddr.s6_addr32;
} else
@@ -796,31 +591,36 @@ static void entry_fill_addrs(struct inet_diag_entry *entry,
}
}
-int inet_diag_bc_sk(const struct nlattr *bc, struct sock *sk)
+int inet_diag_bc_sk(const struct inet_diag_dump_data *cb_data, struct sock *sk)
{
- struct inet_sock *inet = inet_sk(sk);
+ const struct nlattr *bc = cb_data->inet_diag_nla_bc;
+ const struct inet_sock *inet = inet_sk(sk);
struct inet_diag_entry entry;
if (!bc)
return 1;
- entry.family = sk->sk_family;
+ entry.family = READ_ONCE(sk->sk_family);
entry_fill_addrs(&entry, sk);
- entry.sport = inet->inet_num;
- entry.dport = ntohs(inet->inet_dport);
- entry.ifindex = sk->sk_bound_dev_if;
- entry.userlocks = sk_fullsock(sk) ? sk->sk_userlocks : 0;
- if (sk_fullsock(sk))
- entry.mark = READ_ONCE(sk->sk_mark);
- else if (sk->sk_state == TCP_NEW_SYN_RECV)
- entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
- else if (sk->sk_state == TCP_TIME_WAIT)
- entry.mark = inet_twsk(sk)->tw_mark;
- else
- entry.mark = 0;
+ entry.sport = READ_ONCE(inet->inet_num);
+ entry.dport = ntohs(READ_ONCE(inet->inet_dport));
+ entry.ifindex = READ_ONCE(sk->sk_bound_dev_if);
+ if (cb_data->userlocks_needed)
+ entry.userlocks = sk_fullsock(sk) ? READ_ONCE(sk->sk_userlocks) : 0;
+ if (cb_data->mark_needed) {
+ if (sk_fullsock(sk))
+ entry.mark = READ_ONCE(sk->sk_mark);
+ else if (sk->sk_state == TCP_NEW_SYN_RECV)
+ entry.mark = inet_rsk(inet_reqsk(sk))->ir_mark;
+ else if (sk->sk_state == TCP_TIME_WAIT)
+ entry.mark = inet_twsk(sk)->tw_mark;
+ else
+ entry.mark = 0;
+ }
#ifdef CONFIG_SOCK_CGROUP_DATA
- entry.cgroup_id = sk_fullsock(sk) ?
- cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0;
+ if (cb_data->cgroup_needed)
+ entry.cgroup_id = sk_fullsock(sk) ?
+ cgroup_id(sock_cgroup_ptr(&sk->sk_cgrp_data)) : 0;
#endif
return inet_diag_bc_run(bc, &entry);
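Taken together with the audit changes further down, the rewrite above makes the filter path demand-driven: inet_diag_bc_sk() only fetches userlocks, marks, and cgroup ids when the audited bytecode actually tests them. A minimal sketch of that gating, with all names illustrative rather than the kernel's:

#include <stdbool.h>
#include <stdint.h>

struct dump_flags {
        bool mark_needed;
        bool userlocks_needed;
};

struct diag_entry {
        uint32_t mark;
        uint8_t  userlocks;
};

static void entry_fill(struct diag_entry *e, const struct dump_flags *f,
                       uint32_t sk_mark, uint8_t sk_userlocks)
{
        if (f->mark_needed)             /* skipped on most dumps */
                e->mark = sk_mark;
        if (f->userlocks_needed)
                e->userlocks = sk_userlocks;
}

int main(void)
{
        struct dump_flags f = { .mark_needed = true };
        struct diag_entry e = { 0 };

        entry_fill(&e, &f, 0x1u, 0);
        return e.mark == 0x1u ? 0 : 1;
}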
@@ -920,16 +720,21 @@ static bool valid_cgroupcond(const struct inet_diag_bc_op *op, int len,
}
#endif
-static int inet_diag_bc_audit(const struct nlattr *attr,
+static int inet_diag_bc_audit(struct inet_diag_dump_data *cb_data,
const struct sk_buff *skb)
{
- bool net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
+ const struct nlattr *attr = cb_data->inet_diag_nla_bc;
const void *bytecode, *bc;
int bytecode_len, len;
+ bool net_admin;
+
+ if (!attr)
+ return 0;
- if (!attr || nla_len(attr) < sizeof(struct inet_diag_bc_op))
+ if (nla_len(attr) < sizeof(struct inet_diag_bc_op))
return -EINVAL;
+ net_admin = netlink_net_capable(skb, CAP_NET_ADMIN);
bytecode = bc = nla_data(attr);
len = bytecode_len = nla_len(attr);
@@ -961,14 +766,18 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
return -EPERM;
if (!valid_markcond(bc, len, &min_len))
return -EINVAL;
+ cb_data->mark_needed = true;
break;
#ifdef CONFIG_SOCK_CGROUP_DATA
case INET_DIAG_BC_CGROUP_COND:
if (!valid_cgroupcond(bc, len, &min_len))
return -EINVAL;
+ cb_data->cgroup_needed = true;
break;
#endif
case INET_DIAG_BC_AUTO:
+ cb_data->userlocks_needed = true;
+ fallthrough;
case INET_DIAG_BC_JMP:
case INET_DIAG_BC_NOP:
break;
@@ -992,280 +801,6 @@ static int inet_diag_bc_audit(const struct nlattr *attr,
return len == 0 ? 0 : -EINVAL;
}
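The audit now doubles as a discovery pass: while validating the filter bytecode once per dump, it records which condition kinds occur so the per-socket path can skip the rest. A self-contained sketch of that shape, with made-up opcodes:

#include <stdbool.h>
#include <stddef.h>

enum bc_op { BC_NOP, BC_MARK_COND, BC_AUTO };

struct audit_flags {
        bool mark_needed;
        bool userlocks_needed;
};

static int bc_audit(const enum bc_op *ops, size_t n, struct audit_flags *f)
{
        for (size_t i = 0; i < n; i++) {
                switch (ops[i]) {
                case BC_MARK_COND:
                        f->mark_needed = true;
                        break;
                case BC_AUTO:
                        f->userlocks_needed = true;
                        break;
                case BC_NOP:
                        break;
                default:
                        return -1;      /* -EINVAL in the kernel */
                }
        }
        return 0;
}

int main(void)
{
        enum bc_op prog[] = { BC_NOP, BC_MARK_COND };
        struct audit_flags f = { false, false };

        return bc_audit(prog, 2, &f) == 0 && f.mark_needed ? 0 : 1;
}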
-static void twsk_build_assert(void)
-{
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
- offsetof(struct sock, sk_family));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
- offsetof(struct inet_sock, inet_num));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
- offsetof(struct inet_sock, inet_dport));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
- offsetof(struct inet_sock, inet_rcv_saddr));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
- offsetof(struct inet_sock, inet_daddr));
-
-#if IS_ENABLED(CONFIG_IPV6)
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
- offsetof(struct sock, sk_v6_rcv_saddr));
-
- BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
- offsetof(struct sock, sk_v6_daddr));
-#endif
-}
-
-void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
- struct netlink_callback *cb,
- const struct inet_diag_req_v2 *r)
-{
- bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
- struct inet_diag_dump_data *cb_data = cb->data;
- struct net *net = sock_net(skb->sk);
- u32 idiag_states = r->idiag_states;
- int i, num, s_i, s_num;
- struct nlattr *bc;
- struct sock *sk;
-
- bc = cb_data->inet_diag_nla_bc;
- if (idiag_states & TCPF_SYN_RECV)
- idiag_states |= TCPF_NEW_SYN_RECV;
- s_i = cb->args[1];
- s_num = num = cb->args[2];
-
- if (cb->args[0] == 0) {
- if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
- goto skip_listen_ht;
-
- for (i = s_i; i <= hashinfo->lhash2_mask; i++) {
- struct inet_listen_hashbucket *ilb;
- struct hlist_nulls_node *node;
-
- num = 0;
- ilb = &hashinfo->lhash2[i];
-
- if (hlist_nulls_empty(&ilb->nulls_head)) {
- s_num = 0;
- continue;
- }
- spin_lock(&ilb->lock);
- sk_nulls_for_each(sk, node, &ilb->nulls_head) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (!net_eq(sock_net(sk), net))
- continue;
-
- if (num < s_num) {
- num++;
- continue;
- }
-
- if (r->sdiag_family != AF_UNSPEC &&
- sk->sk_family != r->sdiag_family)
- goto next_listen;
-
- if (r->id.idiag_sport != inet->inet_sport &&
- r->id.idiag_sport)
- goto next_listen;
-
- if (!inet_diag_bc_sk(bc, sk))
- goto next_listen;
-
- if (inet_sk_diag_fill(sk, inet_csk(sk), skb,
- cb, r, NLM_F_MULTI,
- net_admin) < 0) {
- spin_unlock(&ilb->lock);
- goto done;
- }
-
-next_listen:
- ++num;
- }
- spin_unlock(&ilb->lock);
-
- s_num = 0;
- }
-skip_listen_ht:
- cb->args[0] = 1;
- s_i = num = s_num = 0;
- }
-
-/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
- * with bh disabled.
- */
-#define SKARR_SZ 16
-
- /* Dump bound but inactive (not listening, connecting, etc.) sockets */
- if (cb->args[0] == 1) {
- if (!(idiag_states & TCPF_BOUND_INACTIVE))
- goto skip_bind_ht;
-
- for (i = s_i; i < hashinfo->bhash_size; i++) {
- struct inet_bind_hashbucket *ibb;
- struct inet_bind2_bucket *tb2;
- struct sock *sk_arr[SKARR_SZ];
- int num_arr[SKARR_SZ];
- int idx, accum, res;
-
-resume_bind_walk:
- num = 0;
- accum = 0;
- ibb = &hashinfo->bhash2[i];
-
- if (hlist_empty(&ibb->chain)) {
- s_num = 0;
- continue;
- }
- spin_lock_bh(&ibb->lock);
- inet_bind_bucket_for_each(tb2, &ibb->chain) {
- if (!net_eq(ib2_net(tb2), net))
- continue;
-
- sk_for_each_bound(sk, &tb2->owners) {
- struct inet_sock *inet = inet_sk(sk);
-
- if (num < s_num)
- goto next_bind;
-
- if (sk->sk_state != TCP_CLOSE ||
- !inet->inet_num)
- goto next_bind;
-
- if (r->sdiag_family != AF_UNSPEC &&
- r->sdiag_family != sk->sk_family)
- goto next_bind;
-
- if (!inet_diag_bc_sk(bc, sk))
- goto next_bind;
-
- sock_hold(sk);
- num_arr[accum] = num;
- sk_arr[accum] = sk;
- if (++accum == SKARR_SZ)
- goto pause_bind_walk;
-next_bind:
- num++;
- }
- }
-pause_bind_walk:
- spin_unlock_bh(&ibb->lock);
-
- res = 0;
- for (idx = 0; idx < accum; idx++) {
- if (res >= 0) {
- res = inet_sk_diag_fill(sk_arr[idx],
- NULL, skb, cb,
- r, NLM_F_MULTI,
- net_admin);
- if (res < 0)
- num = num_arr[idx];
- }
- sock_put(sk_arr[idx]);
- }
- if (res < 0)
- goto done;
-
- cond_resched();
-
- if (accum == SKARR_SZ) {
- s_num = num + 1;
- goto resume_bind_walk;
- }
-
- s_num = 0;
- }
-skip_bind_ht:
- cb->args[0] = 2;
- s_i = num = s_num = 0;
- }
-
- if (!(idiag_states & ~TCPF_LISTEN))
- goto out;
-
- for (i = s_i; i <= hashinfo->ehash_mask; i++) {
- struct inet_ehash_bucket *head = &hashinfo->ehash[i];
- spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
- struct hlist_nulls_node *node;
- struct sock *sk_arr[SKARR_SZ];
- int num_arr[SKARR_SZ];
- int idx, accum, res;
-
- if (hlist_nulls_empty(&head->chain))
- continue;
-
- if (i > s_i)
- s_num = 0;
-
-next_chunk:
- num = 0;
- accum = 0;
- spin_lock_bh(lock);
- sk_nulls_for_each(sk, node, &head->chain) {
- int state;
-
- if (!net_eq(sock_net(sk), net))
- continue;
- if (num < s_num)
- goto next_normal;
- state = (sk->sk_state == TCP_TIME_WAIT) ?
- READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state;
- if (!(idiag_states & (1 << state)))
- goto next_normal;
- if (r->sdiag_family != AF_UNSPEC &&
- sk->sk_family != r->sdiag_family)
- goto next_normal;
- if (r->id.idiag_sport != htons(sk->sk_num) &&
- r->id.idiag_sport)
- goto next_normal;
- if (r->id.idiag_dport != sk->sk_dport &&
- r->id.idiag_dport)
- goto next_normal;
- twsk_build_assert();
-
- if (!inet_diag_bc_sk(bc, sk))
- goto next_normal;
-
- if (!refcount_inc_not_zero(&sk->sk_refcnt))
- goto next_normal;
-
- num_arr[accum] = num;
- sk_arr[accum] = sk;
- if (++accum == SKARR_SZ)
- break;
-next_normal:
- ++num;
- }
- spin_unlock_bh(lock);
- res = 0;
- for (idx = 0; idx < accum; idx++) {
- if (res >= 0) {
- res = sk_diag_fill(sk_arr[idx], skb, cb, r,
- NLM_F_MULTI, net_admin);
- if (res < 0)
- num = num_arr[idx];
- }
- sock_gen_put(sk_arr[idx]);
- }
- if (res < 0)
- break;
- cond_resched();
- if (accum == SKARR_SZ) {
- s_num = num + 1;
- goto next_chunk;
- }
- }
-
-done:
- cb->args[1] = i;
- cb->args[2] = num;
-out:
- ;
-}
-EXPORT_SYMBOL_GPL(inet_diag_dump_icsk);
-
static int __inet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r)
{
@@ -1319,13 +854,10 @@ static int __inet_diag_dump_start(struct netlink_callback *cb, int hdrlen)
kfree(cb_data);
return err;
}
- nla = cb_data->inet_diag_nla_bc;
- if (nla) {
- err = inet_diag_bc_audit(nla, skb);
- if (err) {
- kfree(cb_data);
- return err;
- }
+ err = inet_diag_bc_audit(cb_data, skb);
+ if (err) {
+ kfree(cb_data);
+ return err;
}
nla = cb_data->inet_diag_nla_bpf_stgs;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 470ab17ceb51..025895eb6ec5 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -183,7 +183,7 @@ static void fqdir_work_fn(struct work_struct *work)
rhashtable_free_and_destroy(&fqdir->rhashtable, inet_frags_free_cb, NULL);
if (llist_add(&fqdir->free_list, &fqdir_free_list))
- queue_delayed_work(system_wq, &fqdir_free_work, HZ);
+ queue_delayed_work(system_percpu_wq, &fqdir_free_work, HZ);
}
int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net)
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ceeeec9b7290..b7024e3d9ac3 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -58,6 +58,14 @@ static u32 sk_ehashfn(const struct sock *sk)
sk->sk_daddr, sk->sk_dport);
}
+static bool sk_is_connect_bind(const struct sock *sk)
+{
+ if (sk->sk_state == TCP_TIME_WAIT)
+ return inet_twsk(sk)->tw_connect_bind;
+ else
+ return sk->sk_userlocks & SOCK_CONNECT_BIND;
+}
+
/*
* Allocate and initialize a new local port bind bucket.
* The bindhash mutex for snum's hash chain must be held here.
@@ -87,10 +95,22 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
*/
void inet_bind_bucket_destroy(struct inet_bind_bucket *tb)
{
+ const struct inet_bind2_bucket *tb2;
+
if (hlist_empty(&tb->bhash2)) {
hlist_del_rcu(&tb->node);
kfree_rcu(tb, rcu);
+ return;
+ }
+
+ if (tb->fastreuse == -1 && tb->fastreuseport == -1)
+ return;
+ hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) {
+ if (tb2->fastreuse != -1 || tb2->fastreuseport != -1)
+ return;
}
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
}
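The new tail of inet_bind_bucket_destroy() invalidates the cached fastreuse state once no child bucket still relies on it, so the next bind recomputes it from scratch. A simplified model of that sweep, with the types reduced to the two cached fields:

#include <stdbool.h>
#include <stddef.h>

struct tb2 {
        int fastreuse, fastreuseport;
        struct tb2 *next;
};

struct tb {
        int fastreuse, fastreuseport;
        struct tb2 *children;
};

static void bind_bucket_sweep(struct tb *tb)
{
        for (struct tb2 *c = tb->children; c; c = c->next)
                if (c->fastreuse != -1 || c->fastreuseport != -1)
                        return;         /* some child still caches state */

        tb->fastreuse = -1;             /* force recomputation on next bind */
        tb->fastreuseport = -1;
}

int main(void)
{
        struct tb2 child = { .fastreuse = -1, .fastreuseport = -1 };
        struct tb parent = { .fastreuse = 0, .fastreuseport = 0,
                             .children = &child };

        bind_bucket_sweep(&parent);
        return parent.fastreuse == -1 ? 0 : 1;
}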
bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
@@ -121,6 +141,8 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2,
#else
tb2->rcv_saddr = sk->sk_rcv_saddr;
#endif
+ tb2->fastreuse = 0;
+ tb2->fastreuseport = 0;
INIT_HLIST_HEAD(&tb2->owners);
hlist_add_head(&tb2->node, &head->chain);
hlist_add_head(&tb2->bhash_node, &tb->bhash2);
@@ -143,11 +165,23 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
/* Caller must hold hashbucket lock for this tb with local BH disabled */
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
{
+ const struct sock *sk;
+
if (hlist_empty(&tb->owners)) {
__hlist_del(&tb->node);
__hlist_del(&tb->bhash_node);
kmem_cache_free(cachep, tb);
+ return;
+ }
+
+ if (tb->fastreuse == -1 && tb->fastreuseport == -1)
+ return;
+ sk_for_each_bound(sk, &tb->owners) {
+ if (!sk_is_connect_bind(sk))
+ return;
}
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
}
static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
@@ -191,6 +225,7 @@ static void __inet_put_port(struct sock *sk)
tb = inet_csk(sk)->icsk_bind_hash;
inet_csk(sk)->icsk_bind_hash = NULL;
inet_sk(sk)->inet_num = 0;
+ sk->sk_userlocks &= ~SOCK_CONNECT_BIND;
spin_lock(&head2->lock);
if (inet_csk(sk)->icsk_bind2_hash) {
@@ -277,7 +312,7 @@ bhash2_find:
}
}
if (update_fastreuse)
- inet_csk_update_fastreuse(tb, child);
+ inet_csk_update_fastreuse(child, tb, tb2);
inet_bind_hash(child, tb, tb2, port);
spin_unlock(&head2->lock);
spin_unlock(&head->lock);
@@ -425,19 +460,18 @@ struct sock *inet_lookup_run_sk_lookup(const struct net *net,
}
struct sock *__inet_lookup_listener(const struct net *net,
- struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif, const int sdif)
{
struct inet_listen_hashbucket *ilb2;
+ struct inet_hashinfo *hashinfo;
struct sock *result = NULL;
unsigned int hash2;
/* Lookup redirect from BPF */
- if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
- hashinfo == net->ipv4.tcp_death_row.hashinfo) {
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
result = inet_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
saddr, sport, daddr, hnum, dif,
inet_ehashfn);
@@ -445,6 +479,7 @@ struct sock *__inet_lookup_listener(const struct net *net,
goto done;
}
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
hash2 = ipv4_portaddr_hash(net, daddr, hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
@@ -490,21 +525,22 @@ void sock_edemux(struct sk_buff *skb)
EXPORT_SYMBOL(sock_edemux);
struct sock *__inet_lookup_established(const struct net *net,
- struct inet_hashinfo *hashinfo,
- const __be32 saddr, const __be16 sport,
- const __be32 daddr, const u16 hnum,
- const int dif, const int sdif)
+ const __be32 saddr, const __be16 sport,
+ const __be32 daddr, const u16 hnum,
+ const int dif, const int sdif)
{
- INET_ADDR_COOKIE(acookie, saddr, daddr);
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
- struct sock *sk;
+ INET_ADDR_COOKIE(acookie, saddr, daddr);
const struct hlist_nulls_node *node;
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
- unsigned int slot = hash & hashinfo->ehash_mask;
- struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
+ struct inet_ehash_bucket *head;
+ struct inet_hashinfo *hashinfo;
+ unsigned int hash, slot;
+ struct sock *sk;
+
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash & hashinfo->ehash_mask;
+ head = &hashinfo->ehash[slot];
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
@@ -579,8 +615,7 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
- if (sk->sk_protocol == IPPROTO_TCP &&
- tcp_twsk_unique(sk, sk2, twp))
+ if (tcp_twsk_unique(sk, sk2, twp))
break;
}
goto not_unique;
@@ -707,7 +742,7 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
if (ok) {
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
} else {
- this_cpu_inc(*sk->sk_prot->orphan_count);
+ tcp_orphan_count_inc();
inet_sk_set_state(sk, TCP_CLOSE);
sock_set_flag(sk, SOCK_DEAD);
inet_csk_destroy_sock(sk);
@@ -739,15 +774,18 @@ static int inet_reuseport_add_sock(struct sock *sk,
return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
}
-int __inet_hash(struct sock *sk, struct sock *osk)
+int inet_hash(struct sock *sk)
{
struct inet_hashinfo *hashinfo = tcp_get_hashinfo(sk);
struct inet_listen_hashbucket *ilb2;
int err = 0;
+ if (sk->sk_state == TCP_CLOSE)
+ return 0;
+
if (sk->sk_state != TCP_LISTEN) {
local_bh_disable();
- inet_ehash_nolisten(sk, osk, NULL);
+ inet_ehash_nolisten(sk, NULL, NULL);
local_bh_enable();
return 0;
}
@@ -772,17 +810,7 @@ unlock:
return err;
}
-EXPORT_IPV6_MOD(__inet_hash);
-
-int inet_hash(struct sock *sk)
-{
- int err = 0;
-
- if (sk->sk_state != TCP_CLOSE)
- err = __inet_hash(sk, NULL);
-
- return err;
-}
+EXPORT_IPV6_MOD(inet_hash);
void inet_unhash(struct sock *sk)
{
@@ -800,11 +828,6 @@ void inet_unhash(struct sock *sk)
* avoid circular locking dependency on PREEMPT_RT.
*/
spin_lock(&ilb2->lock);
- if (sk_unhashed(sk)) {
- spin_unlock(&ilb2->lock);
- return;
- }
-
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_stop_listen_sock(sk);
@@ -815,10 +838,6 @@ void inet_unhash(struct sock *sk)
spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock_bh(lock);
- if (sk_unhashed(sk)) {
- spin_unlock_bh(lock);
- return;
- }
__sk_nulls_del_node_init_rcu(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
spin_unlock_bh(lock);
@@ -966,6 +985,10 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family,
if (!tb2) {
tb2 = new_tb2;
inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk);
+ if (sk_is_connect_bind(sk)) {
+ tb2->fastreuse = -1;
+ tb2->fastreuseport = -1;
+ }
}
inet_csk(sk)->icsk_bind2_hash = tb2;
sk_add_bind_node(sk, &tb2->owners);
@@ -1136,6 +1159,8 @@ ok:
head2, tb, sk);
if (!tb2)
goto error;
+ tb2->fastreuse = -1;
+ tb2->fastreuseport = -1;
}
/* Here we want to add a little bit of randomness to the next source
@@ -1148,6 +1173,7 @@ ok:
/* Head lock still held and bh's disabled */
inet_bind_hash(sk, tb, tb2, port);
+ sk->sk_userlocks |= SOCK_CONNECT_BIND;
if (sk_unhashed(sk)) {
inet_sk(sk)->inet_sport = htons(port);
diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
index 56a117560c0c..c96d61d08854 100644
--- a/net/ipv4/inet_timewait_sock.c
+++ b/net/ipv4/inet_timewait_sock.c
@@ -15,7 +15,8 @@
#include <net/inet_hashtables.h>
#include <net/inet_timewait_sock.h>
#include <net/ip.h>
-
+#include <net/tcp.h>
+#include <net/psp.h>
/**
* inet_twsk_bind_unhash - unhash a timewait socket from bind hash
@@ -74,7 +75,8 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw)
void inet_twsk_free(struct inet_timewait_sock *tw)
{
struct module *owner = tw->tw_prot->owner;
- twsk_destructor((struct sock *)tw);
+
+ tcp_twsk_destructor((struct sock *)tw);
kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
module_put(owner);
}
@@ -206,10 +208,14 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
tw->tw_hash = sk->sk_hash;
tw->tw_ipv6only = 0;
tw->tw_transparent = inet_test_bit(TRANSPARENT, sk);
+ tw->tw_connect_bind = !!(sk->sk_userlocks & SOCK_CONNECT_BIND);
tw->tw_prot = sk->sk_prot_creator;
atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
twsk_net_set(tw, sock_net(sk));
timer_setup(&tw->tw_timer, tw_timer_handler, 0);
+#ifdef CONFIG_SOCK_VALIDATE_XMIT
+ tw->tw_validate_xmit_skb = NULL;
+#endif
/*
* Because we use RCU lookups, we should not set tw_refcnt
* to a non-null value before everything is set up for this
@@ -218,6 +224,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
refcount_set(&tw->tw_refcnt, 0);
__module_get(tw->tw_prot->owner);
+ psp_twsk_init(tw, sk);
}
return tw;
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index b2584cce90ae..f7012479713b 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -476,14 +476,16 @@ out_fail:
/* Process an incoming IP datagram fragment. */
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
- struct net_device *dev = skb->dev ? : skb_dst_dev(skb);
- int vif = l3mdev_master_ifindex_rcu(dev);
+ struct net_device *dev;
struct ipq *qp;
+ int vif;
__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
/* Lookup (or create) queue header */
rcu_read_lock();
+ dev = skb->dev ? : skb_dst_dev_rcu(skb);
+ vif = l3mdev_master_ifindex_rcu(dev);
qp = ip_find(net, ip_hdr(skb), user, vif);
if (qp) {
int ret, refs = 0;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index f5b9004d6938..761a53c6a89a 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -28,6 +28,7 @@
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
+#include <net/flow.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
@@ -44,7 +45,6 @@
#include <net/gre.h>
#include <net/dst_metadata.h>
#include <net/erspan.h>
-#include <net/inet_dscp.h>
/*
Problems & solutions
@@ -930,7 +930,7 @@ static int ipgre_open(struct net_device *dev)
if (ipv4_is_multicast(t->parms.iph.daddr)) {
struct flowi4 fl4 = {
.flowi4_oif = t->parms.link,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(&t->parms.iph)),
+ .flowi4_dscp = ip4h_dscp(&t->parms.iph),
.flowi4_scope = RT_SCOPE_UNIVERSE,
.flowi4_proto = IPPROTO_GRE,
.saddr = t->parms.iph.saddr,
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index fc323994b1fa..273578579a6b 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -263,10 +263,11 @@ int ip_local_deliver(struct sk_buff *skb)
}
EXPORT_SYMBOL(ip_local_deliver);
-static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
+static inline enum skb_drop_reason
+ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
{
- struct ip_options *opt;
const struct iphdr *iph;
+ struct ip_options *opt;
/* It looks like overkill, because not all
IP options require packet mangling.
@@ -277,7 +278,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
*/
if (skb_cow(skb, skb_headroom(skb))) {
__IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS);
- goto drop;
+ return SKB_DROP_REASON_NOMEM;
}
iph = ip_hdr(skb);
@@ -286,7 +287,7 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
if (ip_options_compile(dev_net(dev), opt, skb)) {
__IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
- goto drop;
+ return SKB_DROP_REASON_IP_INHDR;
}
if (unlikely(opt->srr)) {
@@ -298,17 +299,15 @@ static inline bool ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
net_info_ratelimited("source route option %pI4 -> %pI4\n",
&iph->saddr,
&iph->daddr);
- goto drop;
+ return SKB_DROP_REASON_NOT_SPECIFIED;
}
}
if (ip_options_rcv_srr(skb, dev))
- goto drop;
+ return SKB_DROP_REASON_NOT_SPECIFIED;
}
- return false;
-drop:
- return true;
+ return SKB_NOT_DROPPED_YET;
}
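The conversion above keeps callers cheap because SKB_NOT_DROPPED_YET is 0: the success test stays a plain `if (drop_reason)` while failures now carry a precise reason for drop tracing. A toy version of the same calling convention, with local enum values rather than the kernel's:

#include <stdio.h>

enum drop_reason {
        NOT_DROPPED_YET = 0,    /* must be zero: success tests stay cheap */
        DROP_NOMEM,
        DROP_IP_INHDR,
};

static enum drop_reason parse_options(int headroom_ok, int options_ok)
{
        if (!headroom_ok)
                return DROP_NOMEM;
        if (!options_ok)
                return DROP_IP_INHDR;
        return NOT_DROPPED_YET;
}

int main(void)
{
        enum drop_reason reason = parse_options(1, 0);

        if (reason)             /* same shape as the ip_rcv_finish_core test */
                printf("dropped, reason %d\n", reason);
        return 0;
}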
static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
@@ -319,7 +318,7 @@ static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
}
int tcp_v4_early_demux(struct sk_buff *skb);
-int udp_v4_early_demux(struct sk_buff *skb);
+enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb);
static int ip_rcv_finish_core(struct net *net,
struct sk_buff *skb, struct net_device *dev,
const struct sk_buff *hint)
@@ -335,7 +334,6 @@ static int ip_rcv_finish_core(struct net *net,
goto drop_error;
}
- drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
!skb_dst(skb) &&
!skb->sk &&
@@ -354,7 +352,6 @@ static int ip_rcv_finish_core(struct net *net,
drop_reason = udp_v4_early_demux(skb);
if (unlikely(drop_reason))
goto drop_error;
- drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
/* must reload iph, skb->head might have changed */
iph = ip_hdr(skb);
@@ -372,7 +369,6 @@ static int ip_rcv_finish_core(struct net *net,
ip4h_dscp(iph), dev);
if (unlikely(drop_reason))
goto drop_error;
- drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
} else {
struct in_device *in_dev = __in_dev_get_rcu(dev);
@@ -391,8 +387,11 @@ static int ip_rcv_finish_core(struct net *net,
}
#endif
- if (iph->ihl > 5 && ip_rcv_options(skb, dev))
- goto drop;
+ if (iph->ihl > 5) {
+ drop_reason = ip_rcv_options(skb, dev);
+ if (drop_reason)
+ goto drop;
+ }
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
@@ -587,9 +586,13 @@ static void ip_sublist_rcv_finish(struct list_head *head)
}
static struct sk_buff *ip_extract_route_hint(const struct net *net,
- struct sk_buff *skb, int rt_type)
+ struct sk_buff *skb)
{
- if (fib4_has_custom_rules(net) || rt_type == RTN_BROADCAST ||
+ const struct iphdr *iph = ip_hdr(skb);
+
+ if (fib4_has_custom_rules(net) ||
+ ipv4_is_lbcast(iph->daddr) ||
+ ipv4_is_zeronet(iph->daddr) ||
IPCB(skb)->flags & IPSKB_MULTIPATH)
return NULL;
@@ -618,8 +621,7 @@ static void ip_list_rcv_finish(struct net *net, struct list_head *head)
dst = skb_dst(skb);
if (curr_dst != dst) {
- hint = ip_extract_route_hint(net, skb,
- dst_rtable(dst)->rt_type);
+ hint = ip_extract_route_hint(net, skb);
/* dispatch old sublist */
if (!list_empty(&sublist))
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index e3321932bec0..be8815ce3ac2 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -615,14 +615,13 @@ int ip_options_rcv_srr(struct sk_buff *skb, struct net_device *dev)
}
memcpy(&nexthop, &optptr[srrptr-1], 4);
- orefdst = skb->_skb_refdst;
- skb_dst_set(skb, NULL);
+ orefdst = skb_dstref_steal(skb);
err = ip_route_input(skb, nexthop, iph->saddr, ip4h_dscp(iph),
dev) ? -EINVAL : 0;
rt2 = skb_rtable(skb);
if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) {
skb_dst_drop(skb);
- skb->_skb_refdst = orefdst;
+ skb_dstref_restore(skb, orefdst);
return -EINVAL;
}
refdst_drop(orefdst);
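skb_dstref_steal()/skb_dstref_restore() wrap what the old code spelled out with skb->_skb_refdst: detach the reference before the trial route lookup, and put it back verbatim if the lookup fails. A minimal ownership model, with a plain pointer standing in for the refdst word:

#include <stddef.h>

struct skb {
        void *refdst;           /* stand-in for skb->_skb_refdst */
};

static void *dstref_steal(struct skb *skb)
{
        void *old = skb->refdst;

        skb->refdst = NULL;     /* skb no longer owns the reference */
        return old;
}

static void dstref_restore(struct skb *skb, void *old)
{
        skb->refdst = old;      /* hand ownership back on failure */
}

int main(void)
{
        struct skb skb = { .refdst = &skb };    /* dummy reference */
        void *old = dstref_steal(&skb);

        dstref_restore(&skb, old);
        return skb.refdst == &skb ? 0 : 1;
}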
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 84e7f8a2f50f..5ca97ede979c 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -63,6 +63,7 @@
#include <linux/stat.h>
#include <linux/init.h>
+#include <net/flow.h>
#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -83,6 +84,7 @@
#include <linux/netfilter_bridge.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
+#include <net/psp.h>
static int
ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
@@ -485,7 +487,7 @@ int __ip_queue_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
inet_sk_init_flowi4(inet, fl4);
/* sctp_v4_xmit() uses its own DSCP value */
- fl4->flowi4_tos = tos & INET_DSCP_MASK;
+ fl4->flowi4_dscp = inet_dsfield_to_dscp(tos);
/* If this fails, retransmit mechanism of transport layer will
* keep trying until route appears or the connection times
@@ -1664,8 +1666,10 @@ void ip_send_unicast_reply(struct sock *sk, const struct sock *orig_sk,
arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum));
nskb->ip_summed = CHECKSUM_NONE;
- if (orig_sk)
+ if (orig_sk) {
skb_set_owner_edemux(nskb, (struct sock *)orig_sk);
+ psp_reply_set_decrypted(nskb);
+ }
if (transmit_time)
nskb->tstamp_type = SKB_CLOCK_MONOTONIC;
if (txhash)
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index e86a8a862c41..ca9eaee4c2ef 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -42,6 +42,7 @@
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/slab.h>
+#include <net/flow.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -1904,7 +1905,7 @@ static int ipmr_prepare_xmit(struct net *net, struct mr_table *mrt,
return -1;
}
- encap += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+ encap += LL_RESERVED_SPACE(dst_dev_rcu(&rt->dst)) + rt->dst.header_len;
if (skb_cow(skb, encap)) {
ip_rt_put(rt);
@@ -1957,7 +1958,7 @@ static void ipmr_queue_fwd_xmit(struct net *net, struct mr_table *mrt,
* result in receiving multiple packets.
*/
NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
- net, NULL, skb, skb->dev, rt->dst.dev,
+ net, NULL, skb, skb->dev, dst_dev_rcu(&rt->dst),
ipmr_forward_finish);
return;
@@ -2120,7 +2121,7 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
struct flowi4 fl4 = {
.daddr = iph->daddr,
.saddr = iph->saddr,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)),
+ .flowi4_dscp = ip4h_dscp(iph),
.flowi4_oif = (rt_is_output_route(rt) ?
skb->dev->ifindex : 0),
.flowi4_iif = (rt_is_output_route(rt) ?
@@ -2301,7 +2302,7 @@ int ip_mr_output(struct net *net, struct sock *sk, struct sk_buff *skb)
guard(rcu)();
- dev = rt->dst.dev;
+ dev = dst_dev_rcu(&rt->dst);
if (IPCB(skb)->flags & IPSKB_FORWARDED)
goto mc_output;
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 0565f001120d..ce310eb779e0 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -11,10 +11,10 @@
#include <linux/skbuff.h>
#include <linux/gfp.h>
#include <linux/export.h>
+#include <net/flow.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <net/ip.h>
-#include <net/inet_dscp.h>
#include <net/netfilter/nf_queue.h>
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
@@ -44,7 +44,7 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un
*/
fl4.daddr = iph->daddr;
fl4.saddr = saddr;
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ fl4.flowi4_dscp = ip4h_dscp(iph);
fl4.flowi4_oif = sk ? sk->sk_bound_dev_if : 0;
fl4.flowi4_l3mdev = l3mdev_master_ifindex(dev);
fl4.flowi4_mark = skb->mark;
@@ -65,7 +65,10 @@ int ip_route_me_harder(struct net *net, struct sock *sk, struct sk_buff *skb, un
if (!(IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED) &&
xfrm_decode_session(net, skb, flowi4_to_flowi(&fl4), AF_INET) == 0) {
struct dst_entry *dst = skb_dst(skb);
- skb_dst_set(skb, NULL);
+ /* ignore return value from skb_dstref_steal, xfrm_lookup takes
+ * care of dropping the refcnt if needed.
+ */
+ skb_dstref_steal(skb);
dst = xfrm_lookup(net, dst, flowi4_to_flowi(&fl4), sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c
index a27782d7653e..6d9bf5106868 100644
--- a/net/ipv4/netfilter/ipt_rpfilter.c
+++ b/net/ipv4/netfilter/ipt_rpfilter.c
@@ -8,8 +8,8 @@
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
-#include <net/inet_dscp.h>
#include <linux/ip.h>
+#include <net/flow.h>
#include <net/ip.h>
#include <net/ip_fib.h>
#include <net/route.h>
@@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par)
flow.daddr = iph->saddr;
flow.saddr = rpfilter_get_saddr(iph->daddr);
flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0;
- flow.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ flow.flowi4_dscp = ip4h_dscp(iph);
flow.flowi4_scope = RT_SCOPE_UNIVERSE;
flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par));
flow.flowi4_uid = sock_net_uid(xt_net(par), NULL);
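These flowi4_tos to flowi4_dscp conversions replace a raw DS-field byte with a typed DSCP value, so ECN bits can no longer leak into route lookups. A sketch of the conversion helper, assuming the usual DS-field layout (DSCP in the upper six bits, ECN in the lower two):

#include <stdint.h>
#include <stdio.h>

typedef uint8_t dscp_t;         /* stand-in for the kernel's typed wrapper */

static dscp_t dsfield_to_dscp(uint8_t dsfield)
{
        return dsfield & 0xfc;  /* mask the two ECN bits, keep DSCP */
}

int main(void)
{
        uint8_t dsfield = 0xb8 | 0x01;  /* EF DSCP with an ECN bit set */

        printf("dscp=0x%02x\n", dsfield_to_dscp(dsfield));      /* 0xb8 */
        return 0;
}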
diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c
index ed08fb78cfa8..9a773502f10a 100644
--- a/net/ipv4/netfilter/nf_dup_ipv4.c
+++ b/net/ipv4/netfilter/nf_dup_ipv4.c
@@ -12,10 +12,10 @@
#include <linux/skbuff.h>
#include <linux/netfilter.h>
#include <net/checksum.h>
+#include <net/flow.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/route.h>
-#include <net/inet_dscp.h>
#include <net/netfilter/ipv4/nf_dup_ipv4.h>
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
#include <net/netfilter/nf_conntrack.h>
@@ -33,7 +33,7 @@ static bool nf_dup_ipv4_route(struct net *net, struct sk_buff *skb,
fl4.flowi4_oif = oif;
fl4.daddr = gw->s_addr;
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ fl4.flowi4_dscp = ip4h_dscp(iph);
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = FLOWI_FLAG_KNOWN_NH;
rt = ip_route_output_key(net, &fl4);
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index 0d3cb2ba6fc8..fae4aa4a5f09 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -12,6 +12,15 @@
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
+static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int ttl);
+static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+ const struct tcphdr *oth);
+static const struct tcphdr *
+nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *_oth, int hook);
+
static int nf_reject_iphdr_validate(struct sk_buff *skb)
{
struct iphdr *iph;
@@ -71,6 +80,27 @@ struct sk_buff *nf_reject_skb_v4_tcp_reset(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_reject_skb_v4_tcp_reset);
+static bool nf_skb_is_icmp_unreach(const struct sk_buff *skb)
+{
+ const struct iphdr *iph = ip_hdr(skb);
+ u8 *tp, _type;
+ int thoff;
+
+ if (iph->protocol != IPPROTO_ICMP)
+ return false;
+
+ thoff = skb_network_offset(skb) + sizeof(*iph);
+
+ tp = skb_header_pointer(skb,
+ thoff + offsetof(struct icmphdr, type),
+ sizeof(_type), &_type);
+
+ if (!tp)
+ return false;
+
+ return *tp == ICMP_DEST_UNREACH;
+}
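nf_skb_is_icmp_unreach() leans on the skb_header_pointer() idiom: ask for a small field at an offset, get back either a direct pointer or a copy in the caller's buffer, and treat NULL as a truncated packet. A userspace model of that contract over a flat buffer:

#include <stddef.h>
#include <string.h>

static const void *header_pointer(const unsigned char *pkt, size_t pkt_len,
                                  size_t off, size_t len, void *buf)
{
        if (off + len > pkt_len)
                return NULL;            /* truncated packet */
        memcpy(buf, pkt + off, len);    /* bounce copy, as for a paged skb */
        return buf;
}

int main(void)
{
        unsigned char pkt[] = { 0x45, 0x00, 0x03 };     /* toy bytes */
        unsigned char type;
        const unsigned char *tp;

        tp = header_pointer(pkt, sizeof(pkt), 2, 1, &type);
        return tp && *tp == 0x03 ? 0 : 1;
}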
+
struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
struct sk_buff *oldskb,
const struct net_device *dev,
@@ -91,6 +121,10 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
if (ip_hdr(oldskb)->frag_off & htons(IP_OFFSET))
return NULL;
+ /* don't reply to ICMP_DEST_UNREACH with ICMP_DEST_UNREACH. */
+ if (nf_skb_is_icmp_unreach(oldskb))
+ return NULL;
+
/* RFC says return as much as we can without exceeding 576 bytes. */
len = min_t(unsigned int, 536, oldskb->len);
@@ -136,8 +170,9 @@ struct sk_buff *nf_reject_skb_v4_unreach(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_reject_skb_v4_unreach);
-const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
- struct tcphdr *_oth, int hook)
+static const struct tcphdr *
+nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *_oth, int hook)
{
const struct tcphdr *oth;
@@ -163,11 +198,10 @@ const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
return oth;
}
-EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_get);
-struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
- const struct sk_buff *oldskb,
- __u8 protocol, int ttl)
+static struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int ttl)
{
struct iphdr *niph, *oiph = ip_hdr(oldskb);
@@ -188,10 +222,9 @@ struct iphdr *nf_reject_iphdr_put(struct sk_buff *nskb,
return niph;
}
-EXPORT_SYMBOL_GPL(nf_reject_iphdr_put);
-void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
- const struct tcphdr *oth)
+static void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
+ const struct tcphdr *oth)
{
struct iphdr *niph = ip_hdr(nskb);
struct tcphdr *tcph;
@@ -218,7 +251,6 @@ void nf_reject_ip_tcphdr_put(struct sk_buff *nskb, const struct sk_buff *oldskb,
nskb->csum_start = (unsigned char *)tcph - nskb->head;
nskb->csum_offset = offsetof(struct tcphdr, check);
}
-EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put);
static int nf_reject_fill_skb_dst(struct sk_buff *skb_in)
{
diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c
index a1350fc25838..5080fa5fbf6a 100644
--- a/net/ipv4/netfilter/nf_socket_ipv4.c
+++ b/net/ipv4/netfilter/nf_socket_ipv4.c
@@ -71,8 +71,7 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
{
switch (protocol) {
case IPPROTO_TCP:
- return inet_lookup(net, net->ipv4.tcp_death_row.hashinfo,
- skb, doff, saddr, sport, daddr, dport,
+ return inet_lookup(net, skb, doff, saddr, sport, daddr, dport,
in->ifindex);
case IPPROTO_UDP:
return udp4_lib_lookup(net, saddr, sport, daddr, dport,
diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c
index 73e66a088e25..041c3f37f237 100644
--- a/net/ipv4/netfilter/nf_tproxy_ipv4.c
+++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c
@@ -81,7 +81,6 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
const struct net_device *in,
const enum nf_tproxy_lookup_t lookup_type)
{
- struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
struct sock *sk;
switch (protocol) {
@@ -95,7 +94,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
- sk = inet_lookup_listener(net, hinfo, skb,
+ sk = inet_lookup_listener(net, skb,
ip_hdrlen(skb) + __tcp_hdrlen(hp),
saddr, sport, daddr, dport,
in->ifindex, 0);
@@ -109,7 +108,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb,
*/
break;
case NF_TPROXY_LOOKUP_ESTABLISHED:
- sk = inet_lookup_established(net, hinfo, saddr, sport,
+ sk = inet_lookup_established(net, saddr, sport,
daddr, dport, in->ifindex);
break;
default:
diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c
index 7e7c49535e3f..82af6cd76d13 100644
--- a/net/ipv4/netfilter/nft_fib_ipv4.c
+++ b/net/ipv4/netfilter/nft_fib_ipv4.c
@@ -10,7 +10,7 @@
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nft_fib.h>
-#include <net/inet_dscp.h>
+#include <net/flow.h>
#include <net/ip.h>
#include <net/ip_fib.h>
#include <net/route.h>
@@ -114,7 +114,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs,
if (priv->flags & NFTA_FIB_F_MARK)
fl4.flowi4_mark = pkt->skb->mark;
- fl4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph));
+ fl4.flowi4_dscp = ip4h_dscp(iph);
if (priv->flags & NFTA_FIB_F_DADDR) {
fl4.daddr = iph->daddr;
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 34137768e7f9..7b9d70f9b31c 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -2087,6 +2087,12 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
{
struct nh_grp_entry *nhge, *tmp;
+ /* If there is nothing to do, let's avoid the costly call to
+ * synchronize_net()
+ */
+ if (list_empty(&nh->grp_list))
+ return;
+
list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
remove_nh_grp_entry(net, nhge, nlinfo);
@@ -3518,12 +3524,42 @@ static int rtm_dump_walk_nexthops(struct sk_buff *skb,
int err;
s_idx = ctx->idx;
- for (node = rb_first(root); node; node = rb_next(node)) {
+
+ /* If this is not the first invocation, ctx->idx will contain the id of
+ * the last nexthop we processed. Instead of starting from the very
+ * first element of the red/black tree again and linearly skipping the
+ * (potentially large) set of nodes with an id smaller than s_idx, walk
+ * the tree and find the left-most node whose id is >= s_idx. This
+ * provides an efficient O(log n) starting point for the dump
+ * continuation.
+ */
+ if (s_idx != 0) {
+ struct rb_node *tmp = root->rb_node;
+
+ node = NULL;
+ while (tmp) {
+ struct nexthop *nh;
+
+ nh = rb_entry(tmp, struct nexthop, rb_node);
+ if (nh->id < s_idx) {
+ tmp = tmp->rb_right;
+ } else {
+ /* Track current candidate and keep looking on
+ * the left side to find the left-most
+ * (smallest id) that is still >= s_idx.
+ */
+ node = tmp;
+ tmp = tmp->rb_left;
+ }
+ }
+ } else {
+ node = rb_first(root);
+ }
+
+ for (; node; node = rb_next(node)) {
struct nexthop *nh;
nh = rb_entry(node, struct nexthop, rb_node);
- if (nh->id < s_idx)
- continue;
ctx->idx = nh->id;
err = nh_cb(skb, cb, nh, data);
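The resume logic above is a textbook lower_bound descent: walk down from the root once, remembering the last node whose id is >= s_idx. A standalone sketch on a minimal binary search tree (not the kernel rb-tree API):

#include <stdint.h>
#include <stdio.h>

struct node {
        uint32_t id;
        struct node *left, *right;
};

static struct node *lower_bound(struct node *root, uint32_t s_idx)
{
        struct node *best = NULL;

        while (root) {
                if (root->id < s_idx) {
                        root = root->right;     /* left subtree is smaller */
                } else {
                        best = root;            /* candidate; look left */
                        root = root->left;
                }
        }
        return best;    /* NULL when every id < s_idx */
}

int main(void)
{
        struct node l = { 2, NULL, NULL }, r = { 9, NULL, NULL };
        struct node root = { 5, &l, &r };

        /* prints 5: the smallest id that is >= 4 */
        printf("resume at id %u\n", (unsigned)lower_bound(&root, 4)->id);
        return 0;
}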
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 031df4c19fcc..5321c5801c64 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -56,9 +56,7 @@ struct ping_table {
static struct ping_table ping_table;
struct pingv6_ops pingv6_ops;
-EXPORT_SYMBOL_GPL(pingv6_ops);
-
-static u16 ping_port_rover;
+EXPORT_IPV6_MOD_GPL(pingv6_ops);
static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
{
@@ -67,7 +65,6 @@ static inline u32 ping_hashfn(const struct net *net, u32 num, u32 mask)
pr_debug("hash(%u) = %u\n", num, res);
return res;
}
-EXPORT_SYMBOL_GPL(ping_hash);
static inline struct hlist_head *ping_hashslot(struct ping_table *table,
struct net *net, unsigned int num)
@@ -77,6 +74,7 @@ static inline struct hlist_head *ping_hashslot(struct ping_table *table,
int ping_get_port(struct sock *sk, unsigned short ident)
{
+ struct net *net = sock_net(sk);
struct inet_sock *isk, *isk2;
struct hlist_head *hlist;
struct sock *sk2 = NULL;
@@ -84,15 +82,16 @@ int ping_get_port(struct sock *sk, unsigned short ident)
isk = inet_sk(sk);
spin_lock(&ping_table.lock);
if (ident == 0) {
+ u16 result = net->ipv4.ping_port_rover + 1;
u32 i;
- u16 result = ping_port_rover + 1;
for (i = 0; i < (1L << 16); i++, result++) {
if (!result)
- result++; /* avoid zero */
- hlist = ping_hashslot(&ping_table, sock_net(sk),
- result);
+ continue; /* avoid zero */
+ hlist = ping_hashslot(&ping_table, net, result);
sk_for_each(sk2, hlist) {
+ if (!net_eq(sock_net(sk2), net))
+ continue;
isk2 = inet_sk(sk2);
if (isk2->inet_num == result)
@@ -100,7 +99,7 @@ int ping_get_port(struct sock *sk, unsigned short ident)
}
/* found */
- ping_port_rover = ident = result;
+ net->ipv4.ping_port_rover = ident = result;
break;
next_port:
;
@@ -108,8 +107,10 @@ next_port:
if (i >= (1L << 16))
goto fail;
} else {
- hlist = ping_hashslot(&ping_table, sock_net(sk), ident);
+ hlist = ping_hashslot(&ping_table, net, ident);
sk_for_each(sk2, hlist) {
+ if (!net_eq(sock_net(sk2), net))
+ continue;
isk2 = inet_sk(sk2);
/* BUG? Why is this reuse and not reuseaddr? ping.c
@@ -129,7 +130,7 @@ next_port:
pr_debug("was not hashed\n");
sk_add_node_rcu(sk, hlist);
sock_set_flag(sk, SOCK_RCU_FREE);
- sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+ sock_prot_inuse_add(net, sk->sk_prot, 1);
}
spin_unlock(&ping_table.lock);
return 0;
@@ -138,15 +139,7 @@ fail:
spin_unlock(&ping_table.lock);
return -EADDRINUSE;
}
-EXPORT_SYMBOL_GPL(ping_get_port);
-
-int ping_hash(struct sock *sk)
-{
- pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
- BUG(); /* "Please do not press this button again." */
-
- return 0;
-}
+EXPORT_IPV6_MOD_GPL(ping_get_port);
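The ident allocation above is a rover search: start one past the last value handed out (now per netns and randomly seeded), sweep the 16-bit space once with wraparound, and skip zero. A runnable sketch with a stubbed in-use test:

#include <stdbool.h>
#include <stdint.h>

static bool ident_in_use(uint16_t id)
{
        return id != 0 && id < 3;       /* stub: idents 1 and 2 taken */
}

static int alloc_ident(uint16_t *rover)
{
        uint16_t result = *rover + 1;
        uint32_t i;

        for (i = 0; i < (1u << 16); i++, result++) {
                if (!result)
                        continue;       /* avoid zero, as in the patch */
                if (!ident_in_use(result)) {
                        *rover = result;
                        return result;
                }
        }
        return -1;                      /* -EADDRINUSE */
}

int main(void)
{
        uint16_t rover = 0;

        return alloc_ident(&rover) == 3 ? 0 : 1;
}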
void ping_unhash(struct sock *sk)
{
@@ -161,7 +154,7 @@ void ping_unhash(struct sock *sk)
}
spin_unlock(&ping_table.lock);
}
-EXPORT_SYMBOL_GPL(ping_unhash);
+EXPORT_IPV6_MOD_GPL(ping_unhash);
/* Called under rcu_read_lock() */
static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
@@ -188,6 +181,8 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident)
}
sk_for_each_rcu(sk, hslot) {
+ if (!net_eq(sock_net(sk), net))
+ continue;
isk = inet_sk(sk);
pr_debug("iterate\n");
@@ -279,7 +274,7 @@ out_release_group:
put_group_info(group_info);
return ret;
}
-EXPORT_SYMBOL_GPL(ping_init_sock);
+EXPORT_IPV6_MOD_GPL(ping_init_sock);
void ping_close(struct sock *sk, long timeout)
{
@@ -289,7 +284,7 @@ void ping_close(struct sock *sk, long timeout)
sk_common_release(sk);
}
-EXPORT_SYMBOL_GPL(ping_close);
+EXPORT_IPV6_MOD_GPL(ping_close);
static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr,
int addr_len)
@@ -467,7 +462,7 @@ out:
pr_debug("ping_v4_bind -> %d\n", err);
return err;
}
-EXPORT_SYMBOL_GPL(ping_bind);
+EXPORT_IPV6_MOD_GPL(ping_bind);
/*
* Is this a supported type of ICMP message?
@@ -600,7 +595,7 @@ void ping_err(struct sk_buff *skb, int offset, u32 info)
out:
return;
}
-EXPORT_SYMBOL_GPL(ping_err);
+EXPORT_IPV6_MOD_GPL(ping_err);
/*
* Copy and checksum an ICMP Echo packet from user space into a buffer
@@ -630,7 +625,7 @@ int ping_getfrag(void *from, char *to,
return 0;
}
-EXPORT_SYMBOL_GPL(ping_getfrag);
+EXPORT_IPV6_MOD_GPL(ping_getfrag);
static int ping_v4_push_pending_frames(struct sock *sk, struct pingfakehdr *pfh,
struct flowi4 *fl4)
@@ -691,7 +686,7 @@ int ping_common_sendmsg(int family, struct msghdr *msg, size_t len,
return 0;
}
-EXPORT_SYMBOL_GPL(ping_common_sendmsg);
+EXPORT_IPV6_MOD_GPL(ping_common_sendmsg);
static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
@@ -936,7 +931,7 @@ out:
pr_debug("ping_recvmsg -> %d\n", err);
return err;
}
-EXPORT_SYMBOL_GPL(ping_recvmsg);
+EXPORT_IPV6_MOD_GPL(ping_recvmsg);
static enum skb_drop_reason __ping_queue_rcv_skb(struct sock *sk,
struct sk_buff *skb)
@@ -957,7 +952,7 @@ int ping_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
return __ping_queue_rcv_skb(sk, skb) ? -1 : 0;
}
-EXPORT_SYMBOL_GPL(ping_queue_rcv_skb);
+EXPORT_IPV6_MOD_GPL(ping_queue_rcv_skb);
/*
@@ -985,7 +980,7 @@ enum skb_drop_reason ping_rcv(struct sk_buff *skb)
kfree_skb_reason(skb, SKB_DROP_REASON_NO_SOCKET);
return SKB_DROP_REASON_NO_SOCKET;
}
-EXPORT_SYMBOL_GPL(ping_rcv);
+EXPORT_IPV6_MOD_GPL(ping_rcv);
struct proto ping_prot = {
.name = "PING",
@@ -1002,13 +997,12 @@ struct proto ping_prot = {
.bind = ping_bind,
.backlog_rcv = ping_queue_rcv_skb,
.release_cb = ip4_datagram_release_cb,
- .hash = ping_hash,
.unhash = ping_unhash,
.get_port = ping_get_port,
.put_port = ping_unhash,
.obj_size = sizeof(struct inet_sock),
};
-EXPORT_SYMBOL(ping_prot);
+EXPORT_IPV6_MOD(ping_prot);
#ifdef CONFIG_PROC_FS
@@ -1073,7 +1067,7 @@ void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN;
}
-EXPORT_SYMBOL_GPL(ping_seq_start);
+EXPORT_IPV6_MOD_GPL(ping_seq_start);
static void *ping_v4_seq_start(struct seq_file *seq, loff_t *pos)
{
@@ -1092,14 +1086,14 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos;
return sk;
}
-EXPORT_SYMBOL_GPL(ping_seq_next);
+EXPORT_IPV6_MOD_GPL(ping_seq_next);
void ping_seq_stop(struct seq_file *seq, void *v)
__releases(ping_table.lock)
{
spin_unlock(&ping_table.lock);
}
-EXPORT_SYMBOL_GPL(ping_seq_stop);
+EXPORT_IPV6_MOD_GPL(ping_seq_stop);
static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
int bucket)
@@ -1119,7 +1113,7 @@ static void ping_v4_format_sock(struct sock *sp, struct seq_file *f,
from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
0, sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops));
+ sk_drops_read(sp));
}
static int ping_v4_seq_show(struct seq_file *seq, void *v)
@@ -1150,6 +1144,8 @@ static int __net_init ping_v4_proc_init_net(struct net *net)
if (!proc_create_net("icmp", 0444, net->proc_net, &ping_v4_seq_ops,
sizeof(struct ping_iter_state)))
return -ENOMEM;
+
+ net->ipv4.ping_port_rover = get_random_u16();
return 0;
}
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 65b0d0ab0084..974afc4ecbe2 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -95,7 +95,6 @@ static const struct snmp_mib snmp4_ipstats_list[] = {
SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS),
SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES),
SNMP_MIB_ITEM("OutTransmits", IPSTATS_MIB_OUTPKTS),
- SNMP_MIB_SENTINEL
};
/* Following items are displayed in /proc/net/netstat */
@@ -119,7 +118,6 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
- SNMP_MIB_SENTINEL
};
static const struct {
@@ -157,7 +155,6 @@ static const struct snmp_mib snmp4_tcp_list[] = {
SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS),
SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS),
SNMP_MIB_ITEM("InCsumErrors", TCP_MIB_CSUMERRORS),
- SNMP_MIB_SENTINEL
};
static const struct snmp_mib snmp4_udp_list[] = {
@@ -170,7 +167,6 @@ static const struct snmp_mib snmp4_udp_list[] = {
SNMP_MIB_ITEM("InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_ITEM("IgnoredMulti", UDP_MIB_IGNOREDMULTI),
SNMP_MIB_ITEM("MemErrors", UDP_MIB_MEMERRORS),
- SNMP_MIB_SENTINEL
};
static const struct snmp_mib snmp4_net_list[] = {
@@ -309,7 +305,6 @@ static const struct snmp_mib snmp4_net_list[] = {
SNMP_MIB_ITEM("TCPAOKeyNotFound", LINUX_MIB_TCPAOKEYNOTFOUND),
SNMP_MIB_ITEM("TCPAOGood", LINUX_MIB_TCPAOGOOD),
SNMP_MIB_ITEM("TCPAODroppedIcmps", LINUX_MIB_TCPAODROPPEDICMPS),
- SNMP_MIB_SENTINEL
};
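Dropping SNMP_MIB_SENTINEL turns every name-terminated scan into an ARRAY_SIZE()-bounded loop and shaves one entry per table. The before/after loop shapes, in miniature:

#include <stddef.h>
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static const char *mib_names[] = { "InReceives", "OutRequests" };

int main(void)
{
        size_t i;

        /* before: for (i = 0; list[i].name; i++) needed a sentinel row */
        for (i = 0; i < ARRAY_SIZE(mib_names); i++)
                printf(" %s", mib_names[i]);
        printf("\n");
        return 0;
}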
static void icmpmsg_put_line(struct seq_file *seq, unsigned long *vals,
@@ -389,14 +384,15 @@ static void icmp_put(struct seq_file *seq)
*/
static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
{
+ const int cnt = ARRAY_SIZE(snmp4_ipstats_list);
+ u64 buff64[ARRAY_SIZE(snmp4_ipstats_list)];
struct net *net = seq->private;
- u64 buff64[IPSTATS_MIB_MAX];
int i;
- memset(buff64, 0, IPSTATS_MIB_MAX * sizeof(u64));
+ memset(buff64, 0, sizeof(buff64));
seq_puts(seq, "Ip: Forwarding DefaultTTL");
- for (i = 0; snmp4_ipstats_list[i].name; i++)
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %s", snmp4_ipstats_list[i].name);
seq_printf(seq, "\nIp: %d %d",
@@ -404,10 +400,10 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
READ_ONCE(net->ipv4.sysctl_ip_default_ttl));
BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
- snmp_get_cpu_field64_batch(buff64, snmp4_ipstats_list,
- net->mib.ip_statistics,
- offsetof(struct ipstats_mib, syncp));
- for (i = 0; snmp4_ipstats_list[i].name; i++)
+ snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipstats_list, cnt,
+ net->mib.ip_statistics,
+ offsetof(struct ipstats_mib, syncp));
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %llu", buff64[i]);
return 0;
@@ -415,20 +411,23 @@ static int snmp_seq_show_ipstats(struct seq_file *seq, void *v)
static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
{
+ const int udp_cnt = ARRAY_SIZE(snmp4_udp_list);
+ const int tcp_cnt = ARRAY_SIZE(snmp4_tcp_list);
unsigned long buff[TCPUDP_MIB_MAX];
struct net *net = seq->private;
int i;
- memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+ memset(buff, 0, tcp_cnt * sizeof(unsigned long));
seq_puts(seq, "\nTcp:");
- for (i = 0; snmp4_tcp_list[i].name; i++)
+ for (i = 0; i < tcp_cnt; i++)
seq_printf(seq, " %s", snmp4_tcp_list[i].name);
seq_puts(seq, "\nTcp:");
- snmp_get_cpu_field_batch(buff, snmp4_tcp_list,
- net->mib.tcp_statistics);
- for (i = 0; snmp4_tcp_list[i].name; i++) {
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_tcp_list,
+ tcp_cnt,
+ net->mib.tcp_statistics);
+ for (i = 0; i < tcp_cnt; i++) {
/* MaxConn field is signed, RFC 2012 */
if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
seq_printf(seq, " %ld", buff[i]);
@@ -436,27 +435,29 @@ static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
seq_printf(seq, " %lu", buff[i]);
}
- memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+ memset(buff, 0, udp_cnt * sizeof(unsigned long));
- snmp_get_cpu_field_batch(buff, snmp4_udp_list,
- net->mib.udp_statistics);
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list,
+ udp_cnt,
+ net->mib.udp_statistics);
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name; i++)
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
seq_puts(seq, "\nUdp:");
- for (i = 0; snmp4_udp_list[i].name; i++)
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %lu", buff[i]);
- memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
+ memset(buff, 0, udp_cnt * sizeof(unsigned long));
/* the UDP and UDP-Lite MIBs are the same */
seq_puts(seq, "\nUdpLite:");
- snmp_get_cpu_field_batch(buff, snmp4_udp_list,
- net->mib.udplite_statistics);
- for (i = 0; snmp4_udp_list[i].name; i++)
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_udp_list,
+ udp_cnt,
+ net->mib.udplite_statistics);
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %s", snmp4_udp_list[i].name);
seq_puts(seq, "\nUdpLite:");
- for (i = 0; snmp4_udp_list[i].name; i++)
+ for (i = 0; i < udp_cnt; i++)
seq_printf(seq, " %lu", buff[i]);
seq_putc(seq, '\n');
@@ -480,8 +481,8 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
*/
static int netstat_seq_show(struct seq_file *seq, void *v)
{
- const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list) - 1;
- const int tcp_cnt = ARRAY_SIZE(snmp4_net_list) - 1;
+ const int ip_cnt = ARRAY_SIZE(snmp4_ipextstats_list);
+ const int tcp_cnt = ARRAY_SIZE(snmp4_net_list);
struct net *net = seq->private;
unsigned long *buff;
int i;
@@ -494,8 +495,8 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
buff = kzalloc(max(tcp_cnt * sizeof(long), ip_cnt * sizeof(u64)),
GFP_KERNEL);
if (buff) {
- snmp_get_cpu_field_batch(buff, snmp4_net_list,
- net->mib.net_statistics);
+ snmp_get_cpu_field_batch_cnt(buff, snmp4_net_list, tcp_cnt,
+ net->mib.net_statistics);
for (i = 0; i < tcp_cnt; i++)
seq_printf(seq, " %lu", buff[i]);
} else {
@@ -513,7 +514,7 @@ static int netstat_seq_show(struct seq_file *seq, void *v)
u64 *buff64 = (u64 *)buff;
memset(buff64, 0, ip_cnt * sizeof(u64));
- snmp_get_cpu_field64_batch(buff64, snmp4_ipextstats_list,
+ snmp_get_cpu_field64_batch_cnt(buff64, snmp4_ipextstats_list, ip_cnt,
net->mib.ip_statistics,
offsetof(struct ipstats_mib, syncp));
for (i = 0; i < ip_cnt; i++)
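A note on the proc.c hunks above: they replace NULL-sentinel walks with explicit counts taken from ARRAY_SIZE(), which lets the MIB tables drop their terminating entries. A standalone sketch of the before/after iteration, with illustrative names rather than the kernel arrays:

	#include <stdio.h>

	struct snmp_mib {
		const char *name;
		int entry;
	};

	/* an explicit count removes the need for a { NULL, 0 } sentinel */
	static const struct snmp_mib demo_list[] = {
		{ "InReceives", 0 },
		{ "InDelivers", 1 },
	};

	int main(void)
	{
		const int cnt = sizeof(demo_list) / sizeof(demo_list[0]);

		/* was: for (i = 0; demo_list[i].name; i++) */
		for (int i = 0; i < cnt; i++)
			printf(" %s", demo_list[i].name);
		printf("\n");
		return 0;
	}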
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 1d2c89d63cc7..d54ebb7df966 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -178,7 +178,7 @@ static int raw_v4_input(struct net *net, struct sk_buff *skb,
if (atomic_read(&sk->sk_rmem_alloc) >=
READ_ONCE(sk->sk_rcvbuf)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
continue;
}
@@ -311,7 +311,7 @@ static int raw_rcv_skb(struct sock *sk, struct sk_buff *skb)
int raw_rcv(struct sock *sk, struct sk_buff *skb)
{
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY);
return NET_RX_DROP;
}
@@ -793,6 +793,7 @@ static int raw_sk_init(struct sock *sk)
{
struct raw_sock *rp = raw_sk(sk);
+ sk->sk_drop_counters = &rp->drop_counters;
if (inet_sk(sk)->inet_num == IPPROTO_ICMP)
memset(&rp->filter, 0, sizeof(rp->filter));
return 0;
@@ -1045,7 +1046,7 @@ static void raw_sock_seq_show(struct seq_file *seq, struct sock *sp, int i)
0, 0L, 0,
from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
0, sock_i_ino(sp),
- refcount_read(&sp->sk_refcnt), sp, atomic_read(&sp->sk_drops));
+ refcount_read(&sp->sk_refcnt), sp, sk_drops_read(sp));
}
static int raw_seq_show(struct seq_file *seq, void *v)
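The raw.c changes above funnel drop accounting through sk_drops_inc()/sk_drops_read(), with raw_sk_init() pointing sk->sk_drop_counters at per-socket storage. A userspace sketch of the idea; the two-bucket layout below is an assumption for illustration, not the kernel's actual structure:

	#include <stdatomic.h>
	#include <stdio.h>

	/* hypothetical two-bucket layout; the point is only that writers
	 * can be spread across counters while readers sum them up */
	struct demo_drop_counters {
		_Atomic long bucket[2];
	};

	static void demo_drops_inc(struct demo_drop_counters *dc, int node)
	{
		atomic_fetch_add(&dc->bucket[node & 1], 1);
	}

	static long demo_drops_read(struct demo_drop_counters *dc)
	{
		return atomic_load(&dc->bucket[0]) + atomic_load(&dc->bucket[1]);
	}

	int main(void)
	{
		struct demo_drop_counters dc = { 0 };

		demo_drops_inc(&dc, 0);
		demo_drops_inc(&dc, 1);
		printf("%ld drops\n", demo_drops_read(&dc));	/* 2 drops */
		return 0;
	}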
diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c
index cc793bd8de25..943e5998e0ad 100644
--- a/net/ipv4/raw_diag.c
+++ b/net/ipv4/raw_diag.c
@@ -126,9 +126,9 @@ static int raw_diag_dump_one(struct netlink_callback *cb,
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r,
- struct nlattr *bc, bool net_admin)
+ bool net_admin)
{
- if (!inet_diag_bc_sk(bc, sk))
+ if (!inet_diag_bc_sk(cb->data, sk))
return 0;
return inet_sk_diag_fill(sk, NULL, skb, cb, r, NLM_F_MULTI, net_admin);
@@ -140,17 +140,13 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
struct net *net = sock_net(skb->sk);
- struct inet_diag_dump_data *cb_data;
int num, s_num, slot, s_slot;
struct hlist_head *hlist;
struct sock *sk = NULL;
- struct nlattr *bc;
if (IS_ERR(hashinfo))
return;
- cb_data = cb->data;
- bc = cb_data->inet_diag_nla_bc;
s_slot = cb->args[0];
num = s_num = cb->args[1];
@@ -174,7 +170,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
if (r->id.idiag_dport != inet->inet_dport &&
r->id.idiag_dport)
goto next;
- if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0)
+ if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0)
goto out_unlock;
next:
num++;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index baa43e5966b1..6d27d3610c1c 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -84,6 +84,7 @@
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
+#include <net/flow.h>
#include <net/inet_dscp.h>
#include <net/net_namespace.h>
#include <net/ip.h>
@@ -413,11 +414,11 @@ static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
const void *daddr)
{
const struct rtable *rt = container_of(dst, struct rtable, dst);
- struct net_device *dev = dst_dev(dst);
+ struct net_device *dev;
struct neighbour *n;
rcu_read_lock();
-
+ dev = dst_dev_rcu(dst);
if (likely(rt->rt_gw_family == AF_INET)) {
n = ip_neigh_gw4(dev, rt->rt_gw4);
} else if (rt->rt_gw_family == AF_INET6) {
@@ -1026,7 +1027,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
return;
rcu_read_lock();
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
if (mtu < net->ipv4.ip_rt_min_pmtu) {
lock = true;
mtu = min(old_mtu, net->ipv4.ip_rt_min_pmtu);
@@ -1221,8 +1222,8 @@ EXPORT_INDIRECT_CALLABLE(ipv4_dst_check);
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
+ struct inet_skb_parm parm;
struct net_device *dev;
- struct ip_options opt;
int res;
/* Recompile ip options since IPCB may not be valid anymore.
@@ -1232,21 +1233,21 @@ static void ipv4_send_dest_unreach(struct sk_buff *skb)
ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
return;
- memset(&opt, 0, sizeof(opt));
+ memset(&parm, 0, sizeof(parm));
if (ip_hdr(skb)->ihl > 5) {
if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
return;
- opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
+ parm.opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
rcu_read_lock();
dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
- res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
+ res = __ip_options_compile(dev_net(dev), &parm.opt, skb, NULL);
rcu_read_unlock();
if (res)
return;
}
- __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
+ __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &parm);
}
static void ipv4_link_failure(struct sk_buff *skb)
@@ -1291,7 +1292,7 @@ void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
struct flowi4 fl4 = {
.daddr = iph->daddr,
.saddr = iph->saddr,
- .flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(iph)),
+ .flowi4_dscp = ip4h_dscp(iph),
.flowi4_oif = rt->dst.dev->ifindex,
.flowi4_iif = skb->dev->ifindex,
.flowi4_mark = skb->mark,
@@ -1326,7 +1327,7 @@ static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
struct net *net;
rcu_read_lock();
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
net->ipv4.ip_rt_min_advmss);
rcu_read_unlock();
@@ -2210,7 +2211,7 @@ ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
goto martian_source;
}
- if (rt->rt_type != RTN_LOCAL)
+ if (!(rt->rt_flags & RTCF_LOCAL))
goto skip_validate_source;
reason = fib_validate_source_reason(skb, saddr, daddr, dscp, 0, dev,
@@ -2331,7 +2332,7 @@ ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
fl4.flowi4_oif = 0;
fl4.flowi4_iif = dev->ifindex;
fl4.flowi4_mark = skb->mark;
- fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4.flowi4_dscp = dscp;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
fl4.flowi4_flags = 0;
fl4.daddr = daddr;
@@ -2694,7 +2695,6 @@ struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
struct rtable *rth;
fl4->flowi4_iif = LOOPBACK_IFINDEX;
- fl4->flowi4_tos &= INET_DSCP_MASK;
rcu_read_lock();
rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
@@ -3337,7 +3337,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
fl4.daddr = dst;
fl4.saddr = src;
- fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4.flowi4_dscp = dscp;
fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0);
fl4.flowi4_mark = mark;
fl4.flowi4_uid = uid;
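A pattern running through the route.c hunks above: the device is now resolved inside the RCU section via dst_dev_rcu() (and the netns via dst_dev_net_rcu()) instead of being fetched before rcu_read_lock(). A minimal kernel-style sketch, assuming only that dst_dev_rcu() requires RCU protection; this is an illustration, not standalone code:

	static unsigned int demo_dst_mtu(const struct dst_entry *dst)
	{
		struct net_device *dev;
		unsigned int mtu;

		rcu_read_lock();
		dev = dst_dev_rcu(dst);	/* dev must not be used after unlock */
		mtu = READ_ONCE(dev->mtu);
		rcu_read_unlock();

		return mtu;
	}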
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index eb0819463fae..569befcf021b 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -12,6 +12,7 @@
#include <linux/export.h>
#include <net/secure_seq.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/route.h>
static siphash_aligned_key_t syncookie_secret[2];
@@ -403,6 +404,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
struct tcp_sock *tp = tcp_sk(sk);
struct inet_request_sock *ireq;
struct net *net = sock_net(sk);
+ struct tcp_request_sock *treq;
struct request_sock *req;
struct sock *ret = sk;
struct flowi4 fl4;
@@ -428,6 +430,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
}
ireq = inet_rsk(req);
+ treq = tcp_rsk(req);
sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
@@ -483,6 +486,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
if (!req->syncookie)
ireq->rcv_wscale = rcv_wscale;
ireq->ecn_ok &= cookie_ecn_ok(net, &rt->dst);
+ treq->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th);
ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst);
/* ip_queue_xmit() depends on our flow being setup
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 3a43010d726f..24dbc603cc44 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -47,6 +47,7 @@ static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX;
static int tcp_plb_max_rounds = 31;
static int tcp_plb_max_cong_thresh = 256;
static unsigned int tcp_tw_reuse_delay_max = TCP_PAWS_MSL * MSEC_PER_SEC;
+static int tcp_ecn_mode_max = 5;
/* obsolete */
static int sysctl_tcp_low_latency __read_mostly;
@@ -728,9 +729,27 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dou8vec_minmax,
.extra1 = SYSCTL_ZERO,
+ .extra2 = &tcp_ecn_mode_max,
+ },
+ {
+ .procname = "tcp_ecn_option",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_option,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
},
{
+ .procname = "tcp_ecn_option_beacon",
+ .data = &init_net.ipv4.sysctl_tcp_ecn_option_beacon,
+ .maxlen = sizeof(u8),
+ .mode = 0644,
+ .proc_handler = proc_dou8vec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_THREE,
+ },
+ {
.procname = "tcp_ecn_fallback",
.data = &init_net.ipv4.sysctl_tcp_ecn_fallback,
.maxlen = sizeof(u8),
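The new tcp_ecn_option and tcp_ecn_option_beacon entries above lean on proc_dou8vec_minmax() to reject writes outside [extra1, extra2]. A standalone sketch of what that bounds check amounts to (the helper below is made up for illustration):

	#include <stdio.h>

	/* what .extra1/.extra2 enforce; the kernel returns -EINVAL instead */
	static int demo_set_u8_minmax(unsigned char *dst, int val, int min, int max)
	{
		if (val < min || val > max)
			return -1;
		*dst = (unsigned char)val;
		return 0;
	}

	int main(void)
	{
		unsigned char tcp_ecn_option = 2;

		printf("%d\n", demo_set_u8_minmax(&tcp_ecn_option, 3, 0, 2)); /* -1 */
		printf("%d\n", demo_set_u8_minmax(&tcp_ecn_option, 1, 0, 2)); /*  0 */
		return 0;
	}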
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index ad76556800f2..7949d16506a4 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -270,11 +270,14 @@
#include <net/icmp.h>
#include <net/inet_common.h>
+#include <net/inet_ecn.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/mptcp.h>
#include <net/proto_memory.h>
#include <net/xfrm.h>
#include <net/ip.h>
+#include <net/psp.h>
#include <net/sock.h>
#include <net/rstreason.h>
@@ -412,6 +415,22 @@ static u64 tcp_compute_delivery_rate(const struct tcp_sock *tp)
return rate64;
}
+#ifdef CONFIG_TCP_MD5SIG
+void tcp_md5_destruct_sock(struct sock *sk)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (tp->md5sig_info) {
+ tcp_clear_md5_list(sk);
+ kfree(rcu_replace_pointer(tp->md5sig_info, NULL, 1));
+ static_branch_slow_dec_deferred(&tcp_md5_needed);
+ tcp_md5_release_sigpool();
+ }
+}
+EXPORT_IPV6_MOD_GPL(tcp_md5_destruct_sock);
+#endif
+
/* Address-family independent initialization for a tcp_sock.
*
* NOTE: A lot of things set to zero explicitly by call to
@@ -687,6 +706,7 @@ void tcp_skb_entail(struct sock *sk, struct sk_buff *skb)
tcb->seq = tcb->end_seq = tp->write_seq;
tcb->tcp_flags = TCPHDR_ACK;
__skb_header_release(skb);
+ psp_enqueue_set_decrypted(sk, skb);
tcp_add_write_queue_tail(sk, skb);
sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
@@ -2818,9 +2838,9 @@ found_ok_skb:
err = tcp_recvmsg_dmabuf(sk, skb, offset, msg,
used);
- if (err <= 0) {
+ if (err < 0) {
if (!copied)
- copied = -EFAULT;
+ copied = err;
break;
}
@@ -3099,8 +3119,8 @@ bool tcp_check_oom(const struct sock *sk, int shift)
void __tcp_close(struct sock *sk, long timeout)
{
+ bool data_was_unread = false;
struct sk_buff *skb;
- int data_was_unread = 0;
int state;
WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
@@ -3118,13 +3138,14 @@ void __tcp_close(struct sock *sk, long timeout)
* descriptor close, not protocol-sourced closes, because the
* reader process may not have drained the data yet!
*/
- while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
- u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq;
+ while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
+ u32 end_seq = TCP_SKB_CB(skb)->end_seq;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
- len--;
- data_was_unread += len;
- __kfree_skb(skb);
+ end_seq--;
+ if (after(end_seq, tcp_sk(sk)->copied_seq))
+ data_was_unread = true;
+ tcp_eat_recv_skb(sk, skb);
}
/* If socket has been already reset (e.g. in tcp_reset()) - kill it. */
@@ -3195,7 +3216,7 @@ adjudge_to_death:
/* remove backlog if any, without releasing ownership. */
__release_sock(sk);
- this_cpu_inc(tcp_orphan_count);
+ tcp_orphan_count_inc();
/* Have we already been destroyed by a softirq or backlog? */
if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -3377,7 +3398,7 @@ int tcp_disconnect(struct sock *sk, int flags)
WRITE_ONCE(tp->write_seq, seq);
icsk->icsk_backoff = 0;
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
icsk->icsk_probes_tstamp = 0;
icsk->icsk_rto = TCP_TIMEOUT_INIT;
WRITE_ONCE(icsk->icsk_rto_min, TCP_RTO_MIN);
@@ -3390,6 +3411,11 @@ int tcp_disconnect(struct sock *sk, int flags)
tp->window_clamp = 0;
tp->delivered = 0;
tp->delivered_ce = 0;
+ tp->accecn_fail_mode = 0;
+ tp->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
+ tcp_accecn_init_counters(tp);
+ tp->prev_ecnfield = 0;
+ tp->accecn_opt_tstamp = 0;
if (icsk->icsk_ca_initialized && icsk->icsk_ca_ops->release)
icsk->icsk_ca_ops->release(sk);
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
@@ -3765,7 +3791,7 @@ int tcp_sock_set_maxseg(struct sock *sk, int val)
if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW))
return -EINVAL;
- tcp_sk(sk)->rx_opt.user_mss = val;
+ WRITE_ONCE(tcp_sk(sk)->rx_opt.user_mss, val);
return 0;
}
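In the setsockopt hunk that follows, TCP_MAXSEG moves before sockopt_lock_sock(), so tcp_sock_set_maxseg() now runs without the socket lock and the store/readers pair WRITE_ONCE()/READ_ONCE() on rx_opt.user_mss. A standalone sketch of that annotation, using plain-C stand-ins for the kernel macros:

	#include <stdio.h>

	/* userspace stand-ins for the kernel's WRITE_ONCE()/READ_ONCE() */
	#define WRITE_ONCE_(x, v) (*(volatile __typeof__(x) *)&(x) = (v))
	#define READ_ONCE_(x) (*(volatile __typeof__(x) *)&(x))

	static unsigned short user_mss;

	int main(void)
	{
		WRITE_ONCE_(user_mss, 1400);		/* lockless writer */
		printf("mss=%d\n", READ_ONCE_(user_mss)); /* concurrent reader */
		return 0;
	}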
@@ -3895,15 +3921,13 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
WRITE_ONCE(inet_csk(sk)->icsk_delack_max, delack_max);
return 0;
}
+ case TCP_MAXSEG:
+ return tcp_sock_set_maxseg(sk, val);
}
sockopt_lock_sock(sk);
switch (optname) {
- case TCP_MAXSEG:
- err = tcp_sock_set_maxseg(sk, val);
- break;
-
case TCP_NODELAY:
__tcp_sock_set_nodelay(sk, val);
break;
@@ -4142,6 +4166,9 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
{
const struct tcp_sock *tp = tcp_sk(sk); /* iff sk_type == SOCK_STREAM */
const struct inet_connection_sock *icsk = inet_csk(sk);
+ const u8 ect1_idx = INET_ECN_ECT_1 - 1;
+ const u8 ect0_idx = INET_ECN_ECT_0 - 1;
+ const u8 ce_idx = INET_ECN_CE - 1;
unsigned long rate;
u32 now;
u64 rate64;
@@ -4268,6 +4295,16 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
if (tp->rto_stamp)
info->tcpi_total_rto_time += tcp_clock_ms() - tp->rto_stamp;
+ info->tcpi_accecn_fail_mode = tp->accecn_fail_mode;
+ info->tcpi_accecn_opt_seen = tp->saw_accecn_opt;
+ info->tcpi_received_ce = tp->received_ce;
+ info->tcpi_delivered_e1_bytes = tp->delivered_ecn_bytes[ect1_idx];
+ info->tcpi_delivered_e0_bytes = tp->delivered_ecn_bytes[ect0_idx];
+ info->tcpi_delivered_ce_bytes = tp->delivered_ecn_bytes[ce_idx];
+ info->tcpi_received_e1_bytes = tp->received_ecn_bytes[ect1_idx];
+ info->tcpi_received_e0_bytes = tp->received_ecn_bytes[ect0_idx];
+ info->tcpi_received_ce_bytes = tp->received_ecn_bytes[ce_idx];
+
unlock_sock_fast(sk, slow);
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -4353,7 +4390,8 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk,
nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering);
nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp));
- nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits);
+ nla_put_u8(stats, TCP_NLA_RECUR_RETRANS,
+ READ_ONCE(inet_csk(sk)->icsk_retransmits));
nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited);
nla_put_u32(stats, TCP_NLA_SND_SSTHRESH, tp->snd_ssthresh);
nla_put_u32(stats, TCP_NLA_DELIVERED, tp->delivered);
@@ -4388,6 +4426,7 @@ int do_tcp_getsockopt(struct sock *sk, int level,
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
+ int user_mss;
int val, len;
if (copy_from_sockptr(&len, optlen, sizeof(int)))
@@ -4401,9 +4440,10 @@ int do_tcp_getsockopt(struct sock *sk, int level,
switch (optname) {
case TCP_MAXSEG:
val = tp->mss_cache;
- if (tp->rx_opt.user_mss &&
+ user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ if (user_mss &&
((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
- val = tp->rx_opt.user_mss;
+ val = user_mss;
if (tp->repair)
val = tp->rx_opt.mss_clamp;
break;
@@ -5061,7 +5101,9 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, notsent_lowat);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, gso_segs);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, retransmit_skb_hint);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_tx, 32);
+#if IS_ENABLED(CONFIG_TLS_DEVICE)
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_tx, tcp_clean_acked);
+#endif
/* TXRX read-mostly hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, tsoffset);
@@ -5072,11 +5114,9 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, lost_out);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, sacked_out);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_txrx, scaling_ratio);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_txrx, 32);
/* RX read-mostly hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, copied_seq);
- CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rcv_tstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_wl1);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tlp_high_seq);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rttvar_us);
@@ -5087,12 +5127,6 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, rtt_min);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, out_of_order_queue);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, snd_ssthresh);
-#if IS_ENABLED(CONFIG_TLS_DEVICE)
- CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_read_rx, tcp_clean_acked);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 77);
-#else
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_read_rx, 69);
-#endif
/* TX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, segs_out);
@@ -5106,11 +5140,11 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, lsndtime);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, mdev_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tcp_wstamp_ns);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, accecn_opt_tstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, rtt_seq);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, tsorted_sent_queue);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, highest_sack);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_tx, ecn_flags);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_tx, 89);
/* TXRX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, pred_flags);
@@ -5125,15 +5159,13 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, snd_up);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, delivered_ce);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ce);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, received_ecn_bytes);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, app_limited);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_wnd);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rcv_tstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_txrx, rx_opt);
- /* 32bit arches with 8byte alignment on u64 fields might need padding
- * before tcp_clock_cache.
- */
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_txrx, 92 + 4);
-
/* RX read-write hotpath cache lines */
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_received);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, segs_in);
@@ -5144,12 +5176,12 @@ static void __init tcp_struct_check(void)
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_delivered);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rate_interval_us);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_last_tsecr);
+ CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_ecn_bytes);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, first_tx_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, delivered_mstamp);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, bytes_acked);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcv_rtt_est);
CACHELINE_ASSERT_GROUP_MEMBER(struct tcp_sock, tcp_sock_write_rx, rcvq_space);
- CACHELINE_ASSERT_GROUP_SIZE(struct tcp_sock, tcp_sock_write_rx, 99);
}
void __init tcp_init(void)
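The __tcp_close() hunk above peeks the receive queue and reports unread data only when an skb's end_seq (minus any FIN) lies beyond copied_seq, so a pure FIN no longer counts as unread data on close. A standalone sketch of that sequence-space test:

	#include <stdint.h>
	#include <stdio.h>
	#include <stdbool.h>

	/* same trick as the kernel's after(): valid across sequence wrap */
	static bool seq_after(uint32_t a, uint32_t b)
	{
		return (int32_t)(a - b) > 0;
	}

	int main(void)
	{
		uint32_t copied_seq = 1000;

		/* pure FIN: seq 1000, end_seq 1001, FIN flag -> end_seq-- */
		uint32_t end_seq = 1001 - 1;
		printf("FIN only unread? %d\n", seq_after(end_seq, copied_seq)); /* 0 */

		/* 100 bytes of data the reader never consumed */
		end_seq = 1100;
		printf("data unread? %d\n", seq_after(end_seq, copied_seq)); /* 1 */
		return 0;
	}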
diff --git a/net/ipv4/tcp_ao.c b/net/ipv4/tcp_ao.c
index 3338b6cc85c4..34b8450829d0 100644
--- a/net/ipv4/tcp_ao.c
+++ b/net/ipv4/tcp_ao.c
@@ -268,9 +268,8 @@ static void tcp_ao_key_free_rcu(struct rcu_head *head)
kfree_sensitive(key);
}
-static void tcp_ao_info_free_rcu(struct rcu_head *head)
+static void tcp_ao_info_free(struct tcp_ao_info *ao)
{
- struct tcp_ao_info *ao = container_of(head, struct tcp_ao_info, rcu);
struct tcp_ao_key *key;
struct hlist_node *n;
@@ -310,7 +309,7 @@ void tcp_ao_destroy_sock(struct sock *sk, bool twsk)
if (!twsk)
tcp_ao_sk_omem_free(sk, ao);
- call_rcu(&ao->rcu, tcp_ao_info_free_rcu);
+ tcp_ao_info_free(ao);
}
void tcp_ao_time_wait(struct tcp_timewait_sock *tcptw, struct tcp_sock *tp)
diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c
index ba4d98e510e0..fbad6c35dee9 100644
--- a/net/ipv4/tcp_cdg.c
+++ b/net/ipv4/tcp_cdg.c
@@ -379,7 +379,7 @@ static void tcp_cdg_init(struct sock *sk)
/* We silently fall back to window = 1 if allocation fails. */
if (window > 1)
ca->gradients = kcalloc(window, sizeof(ca->gradients[0]),
- GFP_NOWAIT | __GFP_NOWARN);
+ GFP_NOWAIT);
ca->rtt_seq = tp->snd_nxt;
ca->shadow_wnd = tcp_snd_cwnd(tp);
}
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 45e174b8cd22..d83efd91f461 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -12,6 +12,9 @@
#include <linux/tcp.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
+#include <net/inet_timewait_sock.h>
#include <net/netlink.h>
#include <net/tcp.h>
@@ -174,27 +177,465 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin)
size += ulp_ops->get_info_size(sk, net_admin);
}
}
- return size;
+
+ return size
+ + nla_total_size(sizeof(struct tcp_info))
+ + nla_total_size(sizeof(struct inet_diag_msg))
+ + inet_diag_msg_attrs_size()
+ + nla_total_size(sizeof(struct inet_diag_meminfo))
+ + nla_total_size(SK_MEMINFO_VARS * sizeof(u32))
+ + nla_total_size(TCP_CA_NAME_MAX)
+ + nla_total_size(sizeof(struct tcpvegas_info))
+ + 64;
+}
+
+static int tcp_twsk_diag_fill(struct sock *sk,
+ struct sk_buff *skb,
+ struct netlink_callback *cb,
+ u16 nlmsg_flags, bool net_admin)
+{
+ struct inet_timewait_sock *tw = inet_twsk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type,
+ sizeof(*r), nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ DEBUG_NET_WARN_ON_ONCE(tw->tw_state != TCP_TIME_WAIT);
+
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_retrans = 0;
+
+ r->idiag_state = READ_ONCE(tw->tw_substate);
+ r->idiag_timer = 3;
+ tmo = tw->tw_timer.expires - jiffies;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ tw->tw_mark)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int tcp_req_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ u16 nlmsg_flags, bool net_admin)
+{
+ struct request_sock *reqsk = inet_reqsk(sk);
+ struct inet_diag_msg *r;
+ struct nlmsghdr *nlh;
+ long tmo;
+
+ nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ cb->nlh->nlmsg_type, sizeof(*r), nlmsg_flags);
+ if (!nlh)
+ return -EMSGSIZE;
+
+ r = nlmsg_data(nlh);
+ inet_diag_msg_common_fill(r, sk);
+ r->idiag_state = TCP_SYN_RECV;
+ r->idiag_timer = 1;
+ r->idiag_retrans = READ_ONCE(reqsk->num_retrans);
+
+ BUILD_BUG_ON(offsetof(struct inet_request_sock, ir_cookie) !=
+ offsetof(struct sock, sk_cookie));
+
+ tmo = READ_ONCE(inet_reqsk(sk)->rsk_timer.expires) - jiffies;
+ r->idiag_expires = jiffies_delta_to_msecs(tmo);
+ r->idiag_rqueue = 0;
+ r->idiag_wqueue = 0;
+ r->idiag_uid = 0;
+ r->idiag_inode = 0;
+
+ if (net_admin && nla_put_u32(skb, INET_DIAG_MARK,
+ inet_rsk(reqsk)->ir_mark)) {
+ nlmsg_cancel(skb, nlh);
+ return -EMSGSIZE;
+ }
+
+ nlmsg_end(skb, nlh);
+ return 0;
+}
+
+static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ const struct inet_diag_req_v2 *r,
+ u16 nlmsg_flags, bool net_admin)
+{
+ if (sk->sk_state == TCP_TIME_WAIT)
+ return tcp_twsk_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
+
+ if (sk->sk_state == TCP_NEW_SYN_RECV)
+ return tcp_req_diag_fill(sk, skb, cb, nlmsg_flags, net_admin);
+
+ return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, r, nlmsg_flags,
+ net_admin);
+}
+
+static void twsk_build_assert(void)
+{
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_family) !=
+ offsetof(struct sock, sk_family));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_num) !=
+ offsetof(struct inet_sock, inet_num));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_dport) !=
+ offsetof(struct inet_sock, inet_dport));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_rcv_saddr) !=
+ offsetof(struct inet_sock, inet_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_daddr) !=
+ offsetof(struct inet_sock, inet_daddr));
+
+#if IS_ENABLED(CONFIG_IPV6)
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_rcv_saddr) !=
+ offsetof(struct sock, sk_v6_rcv_saddr));
+
+ BUILD_BUG_ON(offsetof(struct inet_timewait_sock, tw_v6_daddr) !=
+ offsetof(struct sock, sk_v6_daddr));
+#endif
}
static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r)
{
- struct inet_hashinfo *hinfo;
+ bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
+ struct inet_diag_dump_data *cb_data = cb->data;
+ struct net *net = sock_net(skb->sk);
+ u32 idiag_states = r->idiag_states;
+ struct inet_hashinfo *hashinfo;
+ int i, num, s_i, s_num;
+ struct sock *sk;
- hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo;
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ if (idiag_states & TCPF_SYN_RECV)
+ idiag_states |= TCPF_NEW_SYN_RECV;
+ s_i = cb->args[1];
+ s_num = num = cb->args[2];
+
+ if (cb->args[0] == 0) {
+ if (!(idiag_states & TCPF_LISTEN) || r->id.idiag_dport)
+ goto skip_listen_ht;
+
+ for (i = s_i; i <= hashinfo->lhash2_mask; i++) {
+ struct inet_listen_hashbucket *ilb;
+ struct hlist_nulls_node *node;
+
+ num = 0;
+ ilb = &hashinfo->lhash2[i];
+
+ if (hlist_nulls_empty(&ilb->nulls_head)) {
+ s_num = 0;
+ continue;
+ }
+ spin_lock(&ilb->lock);
+ sk_nulls_for_each(sk, node, &ilb->nulls_head) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+
+ if (num < s_num) {
+ num++;
+ continue;
+ }
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_listen;
+
+ if (r->id.idiag_sport != inet->inet_sport &&
+ r->id.idiag_sport)
+ goto next_listen;
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_listen;
+
+ if (inet_sk_diag_fill(sk, inet_csk(sk), skb,
+ cb, r, NLM_F_MULTI,
+ net_admin) < 0) {
+ spin_unlock(&ilb->lock);
+ goto done;
+ }
+
+next_listen:
+ ++num;
+ }
+ spin_unlock(&ilb->lock);
+
+ s_num = 0;
+ }
+skip_listen_ht:
+ cb->args[0] = 1;
+ s_i = num = s_num = 0;
+ }
+
+/* Process a maximum of SKARR_SZ sockets at a time when walking hash buckets
+ * with bh disabled.
+ */
+#define SKARR_SZ 16
+
+ /* Dump bound but inactive (not listening, connecting, etc.) sockets */
+ if (cb->args[0] == 1) {
+ if (!(idiag_states & TCPF_BOUND_INACTIVE))
+ goto skip_bind_ht;
+
+ for (i = s_i; i < hashinfo->bhash_size; i++) {
+ struct inet_bind_hashbucket *ibb;
+ struct inet_bind2_bucket *tb2;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
+
+resume_bind_walk:
+ num = 0;
+ accum = 0;
+ ibb = &hashinfo->bhash2[i];
+
+ if (hlist_empty(&ibb->chain)) {
+ s_num = 0;
+ continue;
+ }
+ spin_lock_bh(&ibb->lock);
+ inet_bind_bucket_for_each(tb2, &ibb->chain) {
+ if (!net_eq(ib2_net(tb2), net))
+ continue;
+
+ sk_for_each_bound(sk, &tb2->owners) {
+ struct inet_sock *inet = inet_sk(sk);
+
+ if (num < s_num)
+ goto next_bind;
+
+ if (sk->sk_state != TCP_CLOSE ||
+ !inet->inet_num)
+ goto next_bind;
+
+ if (r->sdiag_family != AF_UNSPEC &&
+ r->sdiag_family != sk->sk_family)
+ goto next_bind;
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_bind;
+
+ sock_hold(sk);
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ goto pause_bind_walk;
+next_bind:
+ num++;
+ }
+ }
+pause_bind_walk:
+ spin_unlock_bh(&ibb->lock);
+
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = inet_sk_diag_fill(sk_arr[idx],
+ NULL, skb, cb,
+ r, NLM_F_MULTI,
+ net_admin);
+ if (res < 0)
+ num = num_arr[idx];
+ }
+ sock_put(sk_arr[idx]);
+ }
+ if (res < 0)
+ goto done;
+
+ cond_resched();
+
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto resume_bind_walk;
+ }
+
+ s_num = 0;
+ }
+skip_bind_ht:
+ cb->args[0] = 2;
+ s_i = num = s_num = 0;
+ }
- inet_diag_dump_icsk(hinfo, skb, cb, r);
+ if (!(idiag_states & ~TCPF_LISTEN))
+ goto out;
+
+ for (i = s_i; i <= hashinfo->ehash_mask; i++) {
+ struct inet_ehash_bucket *head = &hashinfo->ehash[i];
+ spinlock_t *lock = inet_ehash_lockp(hashinfo, i);
+ struct hlist_nulls_node *node;
+ struct sock *sk_arr[SKARR_SZ];
+ int num_arr[SKARR_SZ];
+ int idx, accum, res;
+
+ if (hlist_nulls_empty(&head->chain))
+ continue;
+
+ if (i > s_i)
+ s_num = 0;
+
+next_chunk:
+ num = 0;
+ accum = 0;
+ spin_lock_bh(lock);
+ sk_nulls_for_each(sk, node, &head->chain) {
+ int state;
+
+ if (!net_eq(sock_net(sk), net))
+ continue;
+ if (num < s_num)
+ goto next_normal;
+ state = (sk->sk_state == TCP_TIME_WAIT) ?
+ READ_ONCE(inet_twsk(sk)->tw_substate) : sk->sk_state;
+ if (!(idiag_states & (1 << state)))
+ goto next_normal;
+ if (r->sdiag_family != AF_UNSPEC &&
+ sk->sk_family != r->sdiag_family)
+ goto next_normal;
+ if (r->id.idiag_sport != htons(sk->sk_num) &&
+ r->id.idiag_sport)
+ goto next_normal;
+ if (r->id.idiag_dport != sk->sk_dport &&
+ r->id.idiag_dport)
+ goto next_normal;
+ twsk_build_assert();
+
+ if (!inet_diag_bc_sk(cb_data, sk))
+ goto next_normal;
+
+ if (!refcount_inc_not_zero(&sk->sk_refcnt))
+ goto next_normal;
+
+ num_arr[accum] = num;
+ sk_arr[accum] = sk;
+ if (++accum == SKARR_SZ)
+ break;
+next_normal:
+ ++num;
+ }
+ spin_unlock_bh(lock);
+
+ res = 0;
+ for (idx = 0; idx < accum; idx++) {
+ if (res >= 0) {
+ res = sk_diag_fill(sk_arr[idx], skb, cb, r,
+ NLM_F_MULTI, net_admin);
+ if (res < 0)
+ num = num_arr[idx];
+ }
+ sock_gen_put(sk_arr[idx]);
+ }
+ if (res < 0)
+ break;
+
+ cond_resched();
+
+ if (accum == SKARR_SZ) {
+ s_num = num + 1;
+ goto next_chunk;
+ }
+ }
+
+done:
+ cb->args[1] = i;
+ cb->args[2] = num;
+out:
+ ;
+}
+
+static struct sock *tcp_diag_find_one_icsk(struct net *net,
+ const struct inet_diag_req_v2 *req)
+{
+ struct sock *sk;
+
+ rcu_read_lock();
+ if (req->sdiag_family == AF_INET) {
+ sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[0],
+ req->id.idiag_dport, req->id.idiag_src[0],
+ req->id.idiag_sport, req->id.idiag_if);
+#if IS_ENABLED(CONFIG_IPV6)
+ } else if (req->sdiag_family == AF_INET6) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+ ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+ sk = inet_lookup(net, NULL, 0, req->id.idiag_dst[3],
+ req->id.idiag_dport, req->id.idiag_src[3],
+ req->id.idiag_sport, req->id.idiag_if);
+ else
+ sk = inet6_lookup(net, NULL, 0,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if);
+#endif
+ } else {
+ rcu_read_unlock();
+ return ERR_PTR(-EINVAL);
+ }
+ rcu_read_unlock();
+ if (!sk)
+ return ERR_PTR(-ENOENT);
+
+ if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
+ sock_gen_put(sk);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return sk;
}
static int tcp_diag_dump_one(struct netlink_callback *cb,
const struct inet_diag_req_v2 *req)
{
- struct inet_hashinfo *hinfo;
+ struct sk_buff *in_skb = cb->skb;
+ struct sk_buff *rep;
+ struct sock *sk;
+ struct net *net;
+ bool net_admin;
+ int err;
- hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo;
+ net = sock_net(in_skb->sk);
+ sk = tcp_diag_find_one_icsk(net, req);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
- return inet_diag_dump_one_icsk(hinfo, cb, req);
+ net_admin = netlink_net_capable(in_skb, CAP_NET_ADMIN);
+ rep = nlmsg_new(tcp_diag_get_aux_size(sk, net_admin), GFP_KERNEL);
+ if (!rep) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = sk_diag_fill(sk, rep, cb, req, 0, net_admin);
+ if (err < 0) {
+ WARN_ON(err == -EMSGSIZE);
+ nlmsg_free(rep);
+ goto out;
+ }
+ err = nlmsg_unicast(net->diag_nlsk, rep, NETLINK_CB(in_skb).portid);
+
+out:
+ if (sk)
+ sock_gen_put(sk);
+
+ return err;
}
#ifdef CONFIG_INET_DIAG_DESTROY
@@ -202,13 +643,10 @@ static int tcp_diag_destroy(struct sk_buff *in_skb,
const struct inet_diag_req_v2 *req)
{
struct net *net = sock_net(in_skb->sk);
- struct inet_hashinfo *hinfo;
struct sock *sk;
int err;
- hinfo = net->ipv4.tcp_death_row.hashinfo;
- sk = inet_diag_find_one_icsk(net, hinfo, req);
-
+ sk = tcp_diag_find_one_icsk(net, req);
if (IS_ERR(sk))
return PTR_ERR(sk);
@@ -226,7 +664,6 @@ static const struct inet_diag_handler tcp_diag_handler = {
.dump_one = tcp_diag_dump_one,
.idiag_get_info = tcp_diag_get_info,
.idiag_get_aux = tcp_diag_get_aux,
- .idiag_get_aux_size = tcp_diag_get_aux_size,
.idiag_type = IPPROTO_TCP,
.idiag_info_size = sizeof(struct tcp_info),
#ifdef CONFIG_INET_DIAG_DESTROY
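tcp_diag_dump() above takes references on at most SKARR_SZ sockets per pass while the bucket lock is held, fills the netlink messages only after unlocking, and resumes from the recorded position. A userspace-flavoured sketch of that batch-under-lock walk, with an array standing in for the hash chain:

	#include <pthread.h>
	#include <stdio.h>

	#define BATCH 16

	static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;
	static int bucket[100];	/* stand-in for a hash chain of sockets */

	static void dump_bucket(int len)
	{
		int start = 0;

		while (start < len) {
			int batch[BATCH], n = 0;

			pthread_mutex_lock(&bucket_lock);
			/* under the lock: only cheap ref-grabbing */
			while (start + n < len && n < BATCH) {
				batch[n] = bucket[start + n];
				n++;
			}
			pthread_mutex_unlock(&bucket_lock);

			/* out of the lock: the expensive message fill */
			for (int i = 0; i < n; i++)
				printf("dump %d\n", batch[i]);
			start += n;
		}
	}

	int main(void)
	{
		for (int i = 0; i < 40; i++)
			bucket[i] = i;
		dump_bucket(40);
		return 0;
	}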
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index f1884f0c9e52..7d945a527daf 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -576,11 +576,12 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk)
}
} else if (tp->syn_fastopen_ch &&
atomic_read(&sock_net(sk)->ipv4.tfo_active_disable_times)) {
- dst = sk_dst_get(sk);
- dev = dst ? dst_dev(dst) : NULL;
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ dev = dst ? dst_dev_rcu(dst) : NULL;
if (!(dev && (dev->flags & IFF_LOOPBACK)))
atomic_set(&sock_net(sk)->ipv4.tfo_active_disable_times, 0);
- dst_release(dst);
+ rcu_read_unlock();
}
}
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 71b76e98371a..b44fdc309633 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -70,8 +70,10 @@
#include <linux/sysctl.h>
#include <linux/kernel.h>
#include <linux/prefetch.h>
+#include <linux/bitops.h>
#include <net/dst.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/proto_memory.h>
#include <net/inet_common.h>
#include <linux/ipsec.h>
@@ -339,31 +341,6 @@ static bool tcp_in_quickack_mode(struct sock *sk)
(icsk->icsk_ack.quick && !inet_csk_in_pingpong_mode(sk));
}
-static void tcp_ecn_queue_cwr(struct tcp_sock *tp)
-{
- if (tcp_ecn_mode_rfc3168(tp))
- tp->ecn_flags |= TCP_ECN_QUEUE_CWR;
-}
-
-static void tcp_ecn_accept_cwr(struct sock *sk, const struct sk_buff *skb)
-{
- if (tcp_hdr(skb)->cwr) {
- tcp_sk(sk)->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
-
- /* If the sender is telling us it has entered CWR, then its
- * cwnd may be very low (even just 1 packet), so we should ACK
- * immediately.
- */
- if (TCP_SKB_CB(skb)->seq != TCP_SKB_CB(skb)->end_seq)
- inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
- }
-}
-
-static void tcp_ecn_withdraw_cwr(struct tcp_sock *tp)
-{
- tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR;
-}
-
static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -384,38 +361,117 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb)
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_IS_CE);
- if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) {
+ if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) &&
+ tcp_ecn_mode_rfc3168(tp)) {
/* Better not delay acks, sender can have a very low cwnd */
tcp_enter_quickack_mode(sk, 2);
tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
+ /* As for RFC3168 ECN, the TCP_ECN_SEEN flag is set by
+ * tcp_data_ecn_check() when the ECN codepoint of
+ * received TCP data contains ECT(0), ECT(1), or CE.
+ */
+ if (!tcp_ecn_mode_rfc3168(tp))
+ break;
tp->ecn_flags |= TCP_ECN_SEEN;
break;
default:
if (tcp_ca_needs_ecn(sk))
tcp_ca_event(sk, CA_EVENT_ECN_NO_CE);
+ if (!tcp_ecn_mode_rfc3168(tp))
+ break;
tp->ecn_flags |= TCP_ECN_SEEN;
break;
}
}
-static void tcp_ecn_rcv_synack(struct tcp_sock *tp, const struct tcphdr *th)
+/* Returns true if the byte counters can be used */
+static bool tcp_accecn_process_option(struct tcp_sock *tp,
+ const struct sk_buff *skb,
+ u32 delivered_bytes, int flag)
{
- if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || th->cwr))
- tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
-}
+ u8 estimate_ecnfield = tp->est_ecnfield;
+ bool ambiguous_ecn_bytes_incr = false;
+ bool first_changed = false;
+ unsigned int optlen;
+ bool order1, res;
+ unsigned int i;
+ u8 *ptr;
-static void tcp_ecn_rcv_syn(struct tcp_sock *tp, const struct tcphdr *th)
-{
- if (tcp_ecn_mode_rfc3168(tp) && (!th->ece || !th->cwr))
- tcp_ecn_mode_set(tp, TCP_ECN_DISABLED);
-}
+ if (tcp_accecn_opt_fail_recv(tp))
+ return false;
-static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr *th)
-{
- if (th->ece && !th->syn && tcp_ecn_mode_rfc3168(tp))
- return true;
- return false;
+ if (!(flag & FLAG_SLOWPATH) || !tp->rx_opt.accecn) {
+ if (!tp->saw_accecn_opt) {
+ /* Too late to enable after this point due to
+ * potential counter wraps
+ */
+ if (tp->bytes_sent >= (1 << 23) - 1) {
+ u8 saw_opt = TCP_ACCECN_OPT_FAIL_SEEN;
+
+ tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
+ }
+ return false;
+ }
+
+ if (estimate_ecnfield) {
+ u8 ecnfield = estimate_ecnfield - 1;
+
+ tp->delivered_ecn_bytes[ecnfield] += delivered_bytes;
+ return true;
+ }
+ return false;
+ }
+
+ ptr = skb_transport_header(skb) + tp->rx_opt.accecn;
+ optlen = ptr[1] - 2;
+ if (WARN_ON_ONCE(ptr[0] != TCPOPT_ACCECN0 && ptr[0] != TCPOPT_ACCECN1))
+ return false;
+ order1 = (ptr[0] == TCPOPT_ACCECN1);
+ ptr += 2;
+
+ if (tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ tp->saw_accecn_opt = tcp_accecn_option_init(skb,
+ tp->rx_opt.accecn);
+ if (tp->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN)
+ tcp_accecn_fail_mode_set(tp, TCP_ACCECN_OPT_FAIL_RECV);
+ }
+
+ res = !!estimate_ecnfield;
+ for (i = 0; i < 3; i++) {
+ u32 init_offset;
+ u8 ecnfield;
+ s32 delta;
+ u32 *cnt;
+
+ if (optlen < TCPOLEN_ACCECN_PERFIELD)
+ break;
+
+ ecnfield = tcp_accecn_optfield_to_ecnfield(i, order1);
+ init_offset = tcp_accecn_field_init_offset(ecnfield);
+ cnt = &tp->delivered_ecn_bytes[ecnfield - 1];
+ delta = tcp_update_ecn_bytes(cnt, ptr, init_offset);
+ if (delta && delta < 0) {
+ res = false;
+ ambiguous_ecn_bytes_incr = true;
+ }
+ if (delta && ecnfield != estimate_ecnfield) {
+ if (!first_changed) {
+ tp->est_ecnfield = ecnfield;
+ first_changed = true;
+ } else {
+ res = false;
+ ambiguous_ecn_bytes_incr = true;
+ }
+ }
+
+ optlen -= TCPOLEN_ACCECN_PERFIELD;
+ ptr += TCPOLEN_ACCECN_PERFIELD;
+ }
+ if (ambiguous_ecn_bytes_incr)
+ tp->est_ecnfield = 0;
+
+ return res;
}
static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count)
@@ -428,10 +484,101 @@ static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered,
bool ece_ack)
{
tp->delivered += delivered;
- if (ece_ack)
+ if (tcp_ecn_mode_rfc3168(tp) && ece_ack)
tcp_count_delivered_ce(tp, delivered);
}
+/* Returns the ECN CE delta */
+static u32 __tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
+ u32 delivered_pkts, u32 delivered_bytes,
+ int flag)
+{
+ u32 old_ceb = tcp_sk(sk)->delivered_ecn_bytes[INET_ECN_CE - 1];
+ const struct tcphdr *th = tcp_hdr(skb);
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delta, safe_delta, d_ceb;
+ bool opt_deltas_valid;
+ u32 corrected_ace;
+
+ /* Reordered ACK or uncertain due to lack of data to send and ts */
+ if (!(flag & (FLAG_FORWARD_PROGRESS | FLAG_TS_PROGRESS)))
+ return 0;
+
+ opt_deltas_valid = tcp_accecn_process_option(tp, skb,
+ delivered_bytes, flag);
+
+ if (!(flag & FLAG_SLOWPATH)) {
+ /* AccECN counter might overflow on large ACKs */
+ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
+ return 0;
+ }
+
+ /* ACE field is not available during handshake */
+ if (flag & FLAG_SYN_ACKED)
+ return 0;
+
+ if (tp->received_ce_pending >= TCP_ACCECN_ACE_MAX_DELTA)
+ inet_csk(sk)->icsk_ack.pending |= ICSK_ACK_NOW;
+
+ corrected_ace = tcp_accecn_ace(th) - TCP_ACCECN_CEP_INIT_OFFSET;
+ delta = (corrected_ace - tp->delivered_ce) & TCP_ACCECN_CEP_ACE_MASK;
+ if (delivered_pkts <= TCP_ACCECN_CEP_ACE_MASK)
+ return delta;
+
+ safe_delta = delivered_pkts -
+ ((delivered_pkts - delta) & TCP_ACCECN_CEP_ACE_MASK);
+
+ if (opt_deltas_valid) {
+ d_ceb = tp->delivered_ecn_bytes[INET_ECN_CE - 1] - old_ceb;
+ if (!d_ceb)
+ return delta;
+
+ if ((delivered_pkts >= (TCP_ACCECN_CEP_ACE_MASK + 1) * 2) &&
+ (tcp_is_sack(tp) ||
+ ((1 << inet_csk(sk)->icsk_ca_state) &
+ (TCPF_CA_Open | TCPF_CA_CWR)))) {
+ u32 est_d_cep;
+
+ if (delivered_bytes <= d_ceb)
+ return safe_delta;
+
+ est_d_cep = DIV_ROUND_UP_ULL((u64)d_ceb *
+ delivered_pkts,
+ delivered_bytes);
+ return min(safe_delta,
+ delta +
+ (est_d_cep & ~TCP_ACCECN_CEP_ACE_MASK));
+ }
+
+ if (d_ceb > delta * tp->mss_cache)
+ return safe_delta;
+ if (d_ceb <
+ safe_delta * tp->mss_cache >> TCP_ACCECN_SAFETY_SHIFT)
+ return delta;
+ }
+
+ return safe_delta;
+}
+
+static u32 tcp_accecn_process(struct sock *sk, const struct sk_buff *skb,
+ u32 delivered_pkts, u32 delivered_bytes,
+ int *flag)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u32 delta;
+
+ delta = __tcp_accecn_process(sk, skb, delivered_pkts,
+ delivered_bytes, *flag);
+ if (delta > 0) {
+ tcp_count_delivered_ce(tp, delta);
+ *flag |= FLAG_ECE;
+ /* Recalculate header predictor */
+ if (tp->pred_flags)
+ tcp_fast_path_on(tp);
+ }
+ return delta;
+}
+
/* Buffer size and advertised window tuning.
*
* 1. Tuning sk->sk_sndbuf, when connection enters established state.
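The ACE counter consumed by __tcp_accecn_process() above is only 3 bits wide on the wire, so the CE delta is recovered modulo 8 against the locally tracked delivered_ce, and larger ACKs fall back to the safe_delta estimate. A standalone sketch of the basic wrap arithmetic:

	#include <stdio.h>
	#include <stdint.h>

	#define ACE_MASK 7	/* the ACE counter is 3 bits on the wire */

	static uint32_t ace_delta(uint32_t ace, uint32_t delivered_ce)
	{
		return (ace - delivered_ce) & ACE_MASK;
	}

	int main(void)
	{
		/* receiver CE total went 6 -> 9; sender sees 9 mod 8 = 1 */
		printf("delta = %u\n", ace_delta(9 & ACE_MASK, 6)); /* delta = 3 */
		return 0;
	}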
@@ -744,7 +891,7 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
}
}
-static void tcp_rcvbuf_grow(struct sock *sk)
+void tcp_rcvbuf_grow(struct sock *sk)
{
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -1030,6 +1177,7 @@ struct tcp_sacktag_state {
u64 last_sackt;
u32 reord;
u32 sack_delivered;
+ u32 delivered_bytes;
int flag;
unsigned int mss_now;
struct rate_sample *rate;
@@ -1391,7 +1539,7 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb,
static u8 tcp_sacktag_one(struct sock *sk,
struct tcp_sacktag_state *state, u8 sacked,
u32 start_seq, u32 end_seq,
- int dup_sack, int pcount,
+ int dup_sack, int pcount, u32 plen,
u64 xmit_time)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -1451,6 +1599,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
tp->sacked_out += pcount;
/* Out-of-order packets delivered */
state->sack_delivered += pcount;
+ state->delivered_bytes += plen;
}
/* D-SACK. We can detect redundant retransmission in S|R and plain R
@@ -1487,7 +1636,7 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev,
* tcp_highest_sack_seq() when skb is highest_sack.
*/
tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked,
- start_seq, end_seq, dup_sack, pcount,
+ start_seq, end_seq, dup_sack, pcount, skb->len,
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
@@ -1772,6 +1921,7 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk,
TCP_SKB_CB(skb)->end_seq,
dup_sack,
tcp_skb_pcount(skb),
+ skb->len,
tcp_skb_timestamp_us(skb));
tcp_rate_skb_delivered(sk, skb, state->rate);
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
@@ -2569,7 +2719,7 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo)
if (frto_undo)
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPSPURIOUSRTOS);
- inet_csk(sk)->icsk_retransmits = 0;
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
if (tcp_is_non_sack_preventing_reopen(sk))
return true;
if (frto_undo || tcp_is_sack(tp)) {
@@ -3280,6 +3430,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (sacked & TCPCB_SACKED_ACKED) {
tp->sacked_out -= acked_pcount;
+ /* snd_una delta covers these skbs */
+ sack->delivered_bytes -= skb->len;
} else if (tcp_is_sack(tp)) {
tcp_count_delivered(tp, acked_pcount, ece_ack);
if (!tcp_skb_spurious_retrans(tp, skb))
@@ -3376,6 +3528,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, const struct sk_buff *ack_skb,
if (before(reord, prior_fack))
tcp_check_sack_reordering(sk, reord, 0);
}
+
+ sack->delivered_bytes = (skb ?
+ TCP_SKB_CB(skb)->seq : tp->snd_una) -
+ prior_snd_una;
} else if (skb && rtt_update && sack_rtt_us >= 0 &&
sack_rtt_us > tcp_stamp_us_delta(tp->tcp_mstamp,
tcp_skb_timestamp_us(skb))) {
@@ -3645,8 +3801,18 @@ bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb,
return __tcp_oow_rate_limited(net, mib_idx, last_oow_ack_time);
}
+static void tcp_send_ack_reflect_ect(struct sock *sk, bool accecn_reflector)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+ u16 flags = 0;
+
+ if (accecn_reflector)
+ flags = tcp_accecn_reflector_flags(tp->syn_ect_rcv);
+ __tcp_send_ack(sk, tp->rcv_nxt, flags);
+}
+
/* RFC 5961 7 [ACK Throttling] */
-static void tcp_send_challenge_ack(struct sock *sk)
+static void tcp_send_challenge_ack(struct sock *sk, bool accecn_reflector)
{
struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
@@ -3676,7 +3842,7 @@ static void tcp_send_challenge_ack(struct sock *sk)
WRITE_ONCE(net->ipv4.tcp_challenge_count, count - 1);
send_ack:
NET_INC_STATS(net, LINUX_MIB_TCPCHALLENGEACK);
- tcp_send_ack(sk);
+ tcp_send_ack_reflect_ect(sk, accecn_reflector);
}
}
@@ -3787,7 +3953,8 @@ static void tcp_xmit_recovery(struct sock *sk, int rexmit)
}
/* Returns the number of packets newly acked or sacked by the current ACK */
-static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
+static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered,
+ u32 ecn_count, int flag)
{
const struct net *net = sock_net(sk);
struct tcp_sock *tp = tcp_sk(sk);
@@ -3795,8 +3962,12 @@ static u32 tcp_newly_delivered(struct sock *sk, u32 prior_delivered, int flag)
delivered = tp->delivered - prior_delivered;
NET_ADD_STATS(net, LINUX_MIB_TCPDELIVERED, delivered);
- if (flag & FLAG_ECE)
- NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, delivered);
+
+ if (flag & FLAG_ECE) {
+ if (tcp_ecn_mode_rfc3168(tp))
+ ecn_count = delivered;
+ NET_ADD_STATS(net, LINUX_MIB_TCPDELIVEREDCE, ecn_count);
+ }
return delivered;
}
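tcp_newly_delivered() above now charges LINUX_MIB_TCPDELIVEREDCE with an exact AccECN count where available, keeping the old all-delivered assumption only for RFC 3168's single ECE bit. A standalone sketch of the accounting difference:

	#include <stdio.h>

	/* RFC 3168's ECE bit only says "some CE happened", so the stack
	 * must assume every newly delivered packet saw CE; AccECN feeds
	 * back an actual count instead. */
	int main(void)
	{
		unsigned int delivered = 10;	/* packets newly acked */
		unsigned int ace_delta = 3;	/* CE count from AccECN */

		printf("rfc3168 CE charge: %u\n", delivered);	/* coarse: 10 */
		printf("accecn  CE charge: %u\n", ace_delta);	/* exact:  3 */
		return 0;
	}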
@@ -3817,11 +3988,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
u32 delivered = tp->delivered;
u32 lost = tp->lost;
int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
+ u32 ecn_count = 0; /* Did we receive ECE/an AccECN ACE update? */
u32 prior_fack;
sack_state.first_sackt = 0;
sack_state.rate = &rs;
sack_state.sack_delivered = 0;
+ sack_state.delivered_bytes = 0;
/* We very likely will need to access rtx queue. */
prefetch(sk->tcp_rtx_queue.rb_node);
@@ -3837,7 +4010,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* RFC 5961 5.2 [Blind Data Injection Attack].[Mitigation] */
if (before(ack, prior_snd_una - max_window)) {
if (!(flag & FLAG_NO_CHALLENGE_ACK))
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, false);
return -SKB_DROP_REASON_TCP_TOO_OLD_ACK;
}
goto old_ack;
@@ -3851,7 +4024,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (after(ack, prior_snd_una)) {
flag |= FLAG_SND_UNA_ADVANCED;
- icsk->icsk_retransmits = 0;
+ WRITE_ONCE(icsk->icsk_retransmits, 0);
#if IS_ENABLED(CONFIG_TLS_DEVICE)
if (static_branch_unlikely(&clean_acked_data_enabled.key))
@@ -3912,8 +4085,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
/* We passed data and got it acked, remove any soft error
* log. Something worked...
*/
- WRITE_ONCE(sk->sk_err_soft, 0);
- icsk->icsk_probes_out = 0;
+ if (READ_ONCE(sk->sk_err_soft))
+ WRITE_ONCE(sk->sk_err_soft, 0);
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
tp->rcv_tstamp = tcp_jiffies32;
if (!prior_packets)
goto no_queue;
@@ -3924,6 +4098,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
tcp_rack_update_reo_wnd(sk, &rs);
+ if (tcp_ecn_mode_accecn(tp))
+ ecn_count = tcp_accecn_process(sk, skb,
+ tp->delivered - delivered,
+ sack_state.delivered_bytes,
+ &flag);
+
tcp_in_ack_event(sk, flag);
if (tp->tlp_high_seq)
@@ -3948,7 +4128,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP))
sk_dst_confirm(sk);
- delivered = tcp_newly_delivered(sk, delivered, flag);
+ delivered = tcp_newly_delivered(sk, delivered, ecn_count, flag);
+
lost = tp->lost - lost; /* freshly marked lost */
rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED);
tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
@@ -3957,12 +4138,17 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
return 1;
no_queue:
+ if (tcp_ecn_mode_accecn(tp))
+ ecn_count = tcp_accecn_process(sk, skb,
+ tp->delivered - delivered,
+ sack_state.delivered_bytes,
+ &flag);
tcp_in_ack_event(sk, flag);
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK) {
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
- tcp_newly_delivered(sk, delivered, flag);
+ tcp_newly_delivered(sk, delivered, ecn_count, flag);
}
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
@@ -3983,7 +4169,7 @@ old_ack:
&sack_state);
tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
&rexmit);
- tcp_newly_delivered(sk, delivered, flag);
+ tcp_newly_delivered(sk, delivered, ecn_count, flag);
tcp_xmit_recovery(sk, rexmit);
}
@@ -4083,6 +4269,7 @@ void tcp_parse_options(const struct net *net,
ptr = (const unsigned char *)(th + 1);
opt_rx->saw_tstamp = 0;
+ opt_rx->accecn = 0;
opt_rx->saw_unknown = 0;
while (length > 0) {
@@ -4174,6 +4361,12 @@ void tcp_parse_options(const struct net *net,
ptr, th->syn, foc, false);
break;
+ case TCPOPT_ACCECN0:
+ case TCPOPT_ACCECN1:
+ /* Save offset of AccECN option in TCP header */
+ opt_rx->accecn = (ptr - 2) - (__u8 *)th;
+ break;
+
case TCPOPT_EXP:
/* Fast Open option shares code 254 using a
* 16 bits magic number.
@@ -4234,11 +4427,14 @@ static bool tcp_fast_parse_options(const struct net *net,
*/
if (th->doff == (sizeof(*th) / 4)) {
tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.accecn = 0;
return false;
} else if (tp->rx_opt.tstamp_ok &&
th->doff == ((sizeof(*th) + TCPOLEN_TSTAMP_ALIGNED) / 4)) {
- if (tcp_parse_aligned_timestamp(tp, th))
+ if (tcp_parse_aligned_timestamp(tp, th)) {
+ tp->rx_opt.accecn = 0;
return true;
+ }
}
tcp_parse_options(net, skb, &tp->rx_opt, 1, NULL);
@@ -4830,7 +5026,7 @@ static bool tcp_ooo_try_coalesce(struct sock *sk,
noinline_for_tracing static void
tcp_drop_reason(struct sock *sk, struct sk_buff *skb, enum skb_drop_reason reason)
{
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
sk_skb_reason_drop(sk, skb, reason);
}
@@ -4890,12 +5086,23 @@ static int tcp_prune_queue(struct sock *sk, const struct sk_buff *in_skb);
/* Check if this incoming skb can be added to socket receive queues
* while satisfying sk->sk_rcvbuf limit.
+ *
+ * In theory we should use skb->truesize, but this can cause problems
+ * when applications use too small SO_RCVBUF values.
+ * When LRO / hw gro is used, the socket might have a high tp->scaling_ratio,
+ * allowing RWIN to be close to available space.
+ * Whenever the receive queue gets full, we can receive a small packet
+ * filling RWIN, but with a high skb->truesize, because most NIC use 4K page
+ * plus sk_buff metadata even when receiving less than 1500 bytes of payload.
+ *
+ * Note that we use skb->len to decide to accept or drop this packet,
+ * but sk->sk_rmem_alloc is the sum of all skb->truesize.
*/
static bool tcp_can_ingest(const struct sock *sk, const struct sk_buff *skb)
{
- unsigned int new_mem = atomic_read(&sk->sk_rmem_alloc) + skb->truesize;
+ unsigned int rmem = atomic_read(&sk->sk_rmem_alloc);
- return new_mem <= sk->sk_rcvbuf;
+ return rmem + skb->len <= sk->sk_rcvbuf;
}
static int tcp_try_rmem_schedule(struct sock *sk, const struct sk_buff *skb,
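tcp_can_ingest() above switches the acceptance test from skb->truesize to skb->len, since a near-full queue could otherwise reject a tiny in-window segment carried in a 4K page fragment. A standalone sketch of the two tests side by side (the numbers are made up):

	#include <stdio.h>

	int main(void)
	{
		unsigned int rcvbuf = 131072;		/* small-ish SO_RCVBUF */
		unsigned int rmem = 130000;		/* queue nearly full */
		unsigned int len = 500;			/* payload bytes */
		unsigned int truesize = 4096 + 768;	/* 4K frag + metadata */

		/* old test: even 500 payload bytes are refused near the limit */
		printf("truesize test accepts: %d\n", rmem + truesize <= rcvbuf);
		/* new test: accept what the advertised window promised */
		printf("len test accepts:      %d\n", rmem + len <= rcvbuf);
		return 0;
	}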
@@ -5871,6 +6078,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb,
const struct tcphdr *th, int syn_inerr)
{
struct tcp_sock *tp = tcp_sk(sk);
+ bool accecn_reflector = false;
SKB_DR(reason);
/* RFC1323: H1. Apply PAWS check first. */
@@ -5968,7 +6176,7 @@ step1:
if (tp->syn_fastopen && !tp->data_segs_in &&
sk->sk_state == TCP_ESTABLISHED)
tcp_fastopen_active_disable(sk);
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, false);
SKB_DR_SET(reason, TCP_RESET);
goto discard;
}
@@ -5979,6 +6187,16 @@ step1:
* RFC 5961 4.2 : Send a challenge ack
*/
if (th->syn) {
+ if (tcp_ecn_mode_accecn(tp)) {
+ accecn_reflector = true;
+ if (tp->rx_opt.accecn &&
+ tp->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ u8 saw_opt = tcp_accecn_option_init(skb, tp->rx_opt.accecn);
+
+ tcp_accecn_saw_opt_fail_recv(tp, saw_opt);
+ tcp_accecn_opt_demand_min(sk, 1);
+ }
+ }
if (sk->sk_state == TCP_SYN_RECV && sk->sk_socket && th->ack &&
TCP_SKB_CB(skb)->seq + 1 == TCP_SKB_CB(skb)->end_seq &&
TCP_SKB_CB(skb)->seq + 1 == tp->rcv_nxt &&
@@ -5988,7 +6206,7 @@ syn_challenge:
if (syn_inerr)
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSYNCHALLENGE);
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, accecn_reflector);
SKB_DR_SET(reason, TCP_INVALID_SYN);
goto discard;
}
@@ -6060,6 +6278,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
*/
tp->rx_opt.saw_tstamp = 0;
+ tp->rx_opt.accecn = 0;
/* pred_flags is 0xS?10 << 16 + snd_wnd
* if header_prediction is to be made
@@ -6114,6 +6333,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
flag |= __tcp_replace_ts_recent(tp,
delta);
+ tcp_ecn_received_counters(sk, skb, 0);
+
/* We know that such packets are checksummed
* on entry.
*/
@@ -6162,6 +6383,8 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb)
/* Bulk data transfer: receiver */
tcp_cleanup_skb(skb);
__skb_pull(skb, tcp_header_len);
+ tcp_ecn_received_counters(sk, skb,
+ len - tcp_header_len);
eaten = tcp_queue_rcv(sk, skb, &fragstolen);
tcp_event_data_recv(sk, skb);
@@ -6202,6 +6425,8 @@ validate:
return;
step5:
+ tcp_ecn_received_counters_payload(sk, skb);
+
reason = tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT);
if ((int)reason < 0) {
reason = -reason;
@@ -6297,7 +6522,7 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
u16 mss = tp->rx_opt.mss_clamp, try_exp = 0;
bool syn_drop = false;
- if (mss == tp->rx_opt.user_mss) {
+ if (mss == READ_ONCE(tp->rx_opt.user_mss)) {
struct tcp_options_received opt;
/* Get original SYNACK MSS value if user MSS sets mss_clamp */
@@ -6452,7 +6677,9 @@ consume:
* state to ESTABLISHED..."
*/
- tcp_ecn_rcv_synack(tp, th);
+ if (tcp_ecn_mode_any(tp))
+ tcp_ecn_rcv_synack(sk, skb, th,
+ TCP_SKB_CB(skb)->ip_dsfield);
tcp_init_wl(tp, TCP_SKB_CB(skb)->seq);
tcp_try_undo_spurious_syn(sk);
@@ -6524,7 +6751,7 @@ consume:
TCP_DELACK_MAX, false);
goto consume;
}
- tcp_send_ack(sk);
+ tcp_send_ack_reflect_ect(sk, tcp_ecn_mode_accecn(tp));
return -1;
}
@@ -6583,7 +6810,7 @@ consume:
tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
tp->max_window = tp->snd_wnd;
- tcp_ecn_rcv_syn(tp, th);
+ tcp_ecn_rcv_syn(tp, th, skb);
tcp_mtup_init(sk);
tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -6636,7 +6863,7 @@ static void tcp_rcv_synrecv_state_fastopen(struct sock *sk)
tcp_try_undo_recovery(sk);
tcp_update_rto_time(tp);
- inet_csk(sk)->icsk_retransmits = 0;
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
/* In tcp_fastopen_synack_timer() on the first SYNACK RTO we set
* retrans_stamp but don't enter CA_Loss, so in case that happened we
* need to zero retrans_stamp here to prevent spurious
@@ -6765,7 +6992,7 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
}
/* accept old ack during closing */
if ((int)reason < 0) {
- tcp_send_challenge_ack(sk);
+ tcp_send_challenge_ack(sk, false);
reason = -reason;
goto discard;
}
@@ -6812,9 +7039,12 @@ tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
tp->lsndtime = tcp_jiffies32;
tcp_initialize_rcv_mss(sk);
+ if (tcp_ecn_mode_accecn(tp))
+ tcp_accecn_third_ack(sk, skb, tp->syn_ect_snt);
tcp_fast_path_on(tp);
if (sk->sk_shutdown & SEND_SHUTDOWN)
tcp_shutdown(sk, SEND_SHUTDOWN);
+
break;
case TCP_FIN_WAIT1: {
@@ -6984,6 +7214,15 @@ static void tcp_ecn_create_request(struct request_sock *req,
bool ect, ecn_ok;
u32 ecn_ok_dst;
+ if (tcp_accecn_syn_requested(th) &&
+ READ_ONCE(net->ipv4.sysctl_tcp_ecn) >= 3) {
+ inet_rsk(req)->ecn_ok = 1;
+ tcp_rsk(req)->accecn_ok = 1;
+ tcp_rsk(req)->syn_ect_rcv = TCP_SKB_CB(skb)->ip_dsfield &
+ INET_ECN_MASK;
+ return;
+ }
+
if (!th_ecn)
return;
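tcp_accecn_syn_requested() is defined elsewhere in the series; a sketch of the flag test it is assumed to perform, following the AccECN draft's rule that any nonzero ACE value other than the RFC 3168 pattern counts as an AccECN offer:

static bool syn_offers_accecn(const struct tcphdr *th)
{
	u8 ace = (th->ae << 2) | (th->cwr << 1) | th->ece;

	/* 0b000: no ECN; 0b011: classic RFC 3168 SYN; the AccECN draft
	 * treats every other value as an AccECN offer (a conforming
	 * client sends 0b111).
	 */
	return ace && ace != 0x3;
}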
@@ -6991,7 +7230,8 @@ static void tcp_ecn_create_request(struct request_sock *req,
ecn_ok_dst = dst_feature(dst, DST_FEATURE_ECN_MASK);
ecn_ok = READ_ONCE(net->ipv4.sysctl_tcp_ecn) || ecn_ok_dst;
- if (((!ect || th->res1) && ecn_ok) || tcp_ca_needs_ecn(listen_sk) ||
+ if (((!ect || th->res1 || th->ae) && ecn_ok) ||
+ tcp_ca_needs_ecn(listen_sk) ||
(ecn_ok_dst & DST_FEATURE_ECN_CA) ||
tcp_bpf_ca_needs_ecn((struct sock *)req))
inet_rsk(req)->ecn_ok = 1;
@@ -7009,6 +7249,11 @@ static void tcp_openreq_init(struct request_sock *req,
tcp_rsk(req)->snt_synack = 0;
tcp_rsk(req)->snt_tsval_first = 0;
tcp_rsk(req)->last_oow_ack_time = 0;
+ tcp_rsk(req)->accecn_ok = 0;
+ tcp_rsk(req)->saw_accecn_opt = TCP_ACCECN_OPT_NOT_SEEN;
+ tcp_rsk(req)->accecn_fail_mode = 0;
+ tcp_rsk(req)->syn_ect_rcv = 0;
+ tcp_rsk(req)->syn_ect_snt = 0;
req->mss = rx_opt->mss_clamp;
req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
ireq->tstamp_ok = rx_opt->tstamp_ok;
@@ -7048,8 +7293,8 @@ static bool tcp_syn_flood_action(struct sock *sk, const char *proto)
#endif
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
- if (!READ_ONCE(queue->synflood_warned) && syncookies != 2 &&
- xchg(&queue->synflood_warned, 1) == 0) {
+ if (syncookies != 2 && !READ_ONCE(queue->synflood_warned)) {
+ WRITE_ONCE(queue->synflood_warned, 1);
if (IS_ENABLED(CONFIG_IPV6) && sk->sk_family == AF_INET6) {
net_info_ratelimited("%s: Possible SYN flooding on port [%pI6c]:%u. %s.\n",
proto, inet6_rcv_saddr(sk),
@@ -7117,7 +7362,7 @@ u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops,
return 0;
}
- mss = tcp_parse_mss_option(th, tp->rx_opt.user_mss);
+ mss = tcp_parse_mss_option(th, READ_ONCE(tp->rx_opt.user_mss));
if (!mss)
mss = af_ops->mss_clamp;
@@ -7131,7 +7376,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
{
struct tcp_fastopen_cookie foc = { .len = -1 };
struct tcp_options_received tmp_opt;
- struct tcp_sock *tp = tcp_sk(sk);
+ const struct tcp_sock *tp = tcp_sk(sk);
struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
struct request_sock *req;
@@ -7182,7 +7427,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_clear_options(&tmp_opt);
tmp_opt.mss_clamp = af_ops->mss_clamp;
- tmp_opt.user_mss = tp->rx_opt.user_mss;
+ tmp_opt.user_mss = READ_ONCE(tp->rx_opt.user_mss);
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0,
want_cookie ? NULL : &foc);
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 84d3d556ed80..b1fcf3e4e1ce 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -65,6 +65,7 @@
#include <net/icmp.h>
#include <net/inet_hashtables.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/transp_v6.h>
#include <net/ipv6.h>
#include <net/inet_common.h>
@@ -74,6 +75,7 @@
#include <net/secure_seq.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
+#include <net/psp.h>
#include <linux/inet.h>
#include <linux/ipv6.h>
@@ -292,9 +294,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
inet->inet_dport = usin->sin_port;
sk_daddr_set(sk, daddr);
- inet_csk(sk)->icsk_ext_hdr_len = 0;
+ inet_csk(sk)->icsk_ext_hdr_len = psp_sk_overhead(sk);
if (inet_opt)
- inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;
+ inet_csk(sk)->icsk_ext_hdr_len += inet_opt->opt.optlen;
tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
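icsk_ext_hdr_len feeds straight into the MSS calculation, so charging psp_sk_overhead() here shrinks the advertised MSS by the PSP encapsulation size on PSP-enabled sockets. A simplified sketch of that relationship (the real tcp_mtu_to_mss() also accounts for TCP options and probing bounds):

static u32 example_ipv4_mss(const struct sock *sk, u32 pmtu)
{
	/* every byte in icsk_ext_hdr_len (IP options, PSP overhead)
	 * comes straight out of the segment payload
	 */
	return pmtu - sizeof(struct iphdr) - sizeof(struct tcphdr) -
	       inet_csk(sk)->icsk_ext_hdr_len;
}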
@@ -506,8 +508,7 @@ int tcp_v4_err(struct sk_buff *skb, u32 info)
struct sock *sk;
int err;
- sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- iph->daddr, th->dest, iph->saddr,
+ sk = __inet_lookup_established(net, iph->daddr, th->dest, iph->saddr,
ntohs(th->source), inet_iif(skb), 0);
if (!sk) {
__ICMP_INC_STATS(net, ICMP_MIB_INERRORS);
@@ -823,8 +824,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb,
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
- NULL, 0, ip_hdr(skb)->saddr,
+ sk1 = __inet_lookup_listener(net, NULL, 0, ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
ntohs(th->source), dif, sdif);
/* don't send rst if it can't find key */
@@ -1191,7 +1191,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
enum tcp_synack_type synack_type,
struct sk_buff *syn_skb)
{
- const struct inet_request_sock *ireq = inet_rsk(req);
+ struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4;
int err = -1;
struct sk_buff *skb;
@@ -1204,6 +1204,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
+ tcp_rsk(req)->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK;
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
tos = READ_ONCE(inet_sk(sk)->tos);
@@ -1505,9 +1506,9 @@ void tcp_clear_md5_list(struct sock *sk)
md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
hlist_for_each_entry_safe(key, n, &md5sig->head, node) {
- hlist_del_rcu(&key->node);
+ hlist_del(&key->node);
atomic_sub(sizeof(*key), &sk->sk_omem_alloc);
- kfree_rcu(key, rcu);
+ kfree(key);
}
}
@@ -1907,6 +1908,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
enum skb_drop_reason reason;
struct sock *rsk;
+ reason = psp_sk_rx_policy_check(sk, skb);
+ if (reason)
+ goto err_discard;
+
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
struct dst_entry *dst;
@@ -1968,6 +1973,7 @@ csum_err:
reason = SKB_DROP_REASON_TCP_CSUM;
trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+err_discard:
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
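psp_sk_rx_policy_check() follows the usual drop-reason convention: zero (SKB_NOT_DROPPED_YET) means pass, and any other enum skb_drop_reason is both the verdict and the reason surfaced to kfree_skb tracing. A self-contained sketch of that calling convention; both predicates and the chosen reason are hypothetical placeholders, not the PSP series' actual API:

static bool example_sk_requires_psp(const struct sock *sk)
{
	return false;	/* hypothetical per-socket PSP policy */
}

static bool example_skb_psp_decrypted(const struct sk_buff *skb)
{
	return true;	/* hypothetical per-skb PSP state */
}

static enum skb_drop_reason
example_rx_policy_check(const struct sock *sk, const struct sk_buff *skb)
{
	if (example_sk_requires_psp(sk) && !example_skb_psp_decrypted(skb))
		return SKB_DROP_REASON_NOT_SPECIFIED; /* placeholder reason */
	return SKB_NOT_DROPPED_YET;
}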
@@ -1992,8 +1998,7 @@ int tcp_v4_early_demux(struct sk_buff *skb)
if (th->doff < sizeof(struct tcphdr) / 4)
return 0;
- sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- iph->saddr, th->source,
+ sk = __inet_lookup_established(net, iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
skb->skb_iif, inet_sdif(skb));
if (sk) {
@@ -2070,7 +2075,9 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb,
(TCPHDR_ECE | TCPHDR_CWR | TCPHDR_AE)) ||
!tcp_skb_can_collapse_rx(tail, skb) ||
thtail->doff != th->doff ||
- memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)))
+ memcmp(thtail + 1, th + 1, hdrlen - sizeof(*th)) ||
+ /* PSP Rx policy check runs later; only coalesce skbs whose PSP metadata matches exactly */
+ psp_skb_coalesce_diff(tail, skb))
goto no_coalesce;
__skb_pull(skb, hdrlen);
@@ -2236,8 +2243,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
lookup:
- sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo,
- skb, __tcp_hdrlen(th), th->source,
+ sk = __inet_lookup_skb(skb, __tcp_hdrlen(th), th->source,
th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
@@ -2258,7 +2264,7 @@ lookup:
&iph->saddr, &iph->daddr,
AF_INET, dif, sdif);
if (unlikely(drop_reason)) {
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
reqsk_put(req);
goto discard_it;
}
@@ -2403,7 +2409,7 @@ discard_it:
return 0;
discard_and_relse:
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
if (refcounted)
sock_put(sk);
goto discard_it;
@@ -2426,9 +2432,7 @@ do_time_wait:
&drop_reason);
switch (tw_status) {
case TCP_TW_SYN: {
- struct sock *sk2 = inet_lookup_listener(net,
- net->ipv4.tcp_death_row.hashinfo,
- skb, __tcp_hdrlen(th),
+ struct sock *sk2 = inet_lookup_listener(net, skb, __tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb),
@@ -2441,6 +2445,10 @@ do_time_wait:
__this_cpu_write(tcp_tw_isn, isn);
goto process;
}
+
+ drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
+ if (drop_reason)
+ break;
}
/* to ACK */
fallthrough;
@@ -2459,7 +2467,6 @@ do_time_wait:
static struct timewait_sock_ops tcp_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp_timewait_sock),
- .twsk_destructor= tcp_twsk_destructor,
};
void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
@@ -2501,6 +2508,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
.ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
#endif
};
+
+static void tcp4_destruct_sock(struct sock *sk)
+{
+ tcp_md5_destruct_sock(sk);
+ tcp_ao_destroy_sock(sk, false);
+ inet_sock_destruct(sk);
+}
#endif
/* NOTE: A lot of things set to zero explicitly by call to
@@ -2516,23 +2530,12 @@ static int tcp_v4_init_sock(struct sock *sk)
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
+ sk->sk_destruct = tcp4_destruct_sock;
#endif
return 0;
}
-#ifdef CONFIG_TCP_MD5SIG
-static void tcp_md5sig_info_free_rcu(struct rcu_head *head)
-{
- struct tcp_md5sig_info *md5sig;
-
- md5sig = container_of(head, struct tcp_md5sig_info, rcu);
- kfree(md5sig);
- static_branch_slow_dec_deferred(&tcp_md5_needed);
- tcp_md5_release_sigpool();
-}
-#endif
-
static void tcp_release_user_frags(struct sock *sk)
{
#ifdef CONFIG_PAGE_POOL
@@ -2569,19 +2572,6 @@ void tcp_v4_destroy_sock(struct sock *sk)
/* Cleans up our, hopefully empty, out_of_order_queue. */
skb_rbtree_purge(&tp->out_of_order_queue);
-#ifdef CONFIG_TCP_MD5SIG
- /* Clean up the MD5 key list, if any */
- if (tp->md5sig_info) {
- struct tcp_md5sig_info *md5sig;
-
- md5sig = rcu_dereference_protected(tp->md5sig_info, 1);
- tcp_clear_md5_list(sk);
- call_rcu(&md5sig->rcu, tcp_md5sig_info_free_rcu);
- rcu_assign_pointer(tp->md5sig_info, NULL);
- }
-#endif
- tcp_ao_destroy_sock(sk, false);
-
/* Clean up a referenced TCP bind bucket. */
if (inet_csk(sk)->icsk_bind_hash)
inet_put_port(sk);
@@ -2958,9 +2948,9 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
- icsk->icsk_retransmits,
+ READ_ONCE(icsk->icsk_retransmits),
from_kuid_munged(seq_user_ns(f), sk_uid(sk)),
- icsk->icsk_probes_out,
+ READ_ONCE(icsk->icsk_probes_out),
sock_i_ino(sk),
refcount_read(&sk->sk_refcnt), sk,
jiffies_to_clock_t(icsk->icsk_rto),
@@ -3524,7 +3514,6 @@ struct proto tcp_prot = {
.leave_memory_pressure = tcp_leave_memory_pressure,
.stream_memory_free = tcp_stream_memory_free,
.sockets_allocated = &tcp_sockets_allocated,
- .orphan_count = &tcp_orphan_count,
.memory_allocated = &net_aligned_data.tcp_memory_allocated,
.per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
@@ -3583,7 +3572,9 @@ fallback:
static int __net_init tcp_sk_init(struct net *net)
{
- net->ipv4.sysctl_tcp_ecn = 2;
+ net->ipv4.sysctl_tcp_ecn = TCP_ECN_IN_ECN_OUT_NOECN;
+ net->ipv4.sysctl_tcp_ecn_option = TCP_ACCECN_OPTION_FULL;
+ net->ipv4.sysctl_tcp_ecn_option_beacon = TCP_ACCECN_OPTION_BEACON;
net->ipv4.sysctl_tcp_ecn_fallback = 1;
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index b67f94c60f9f..45b6ecd16412 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -170,7 +170,7 @@ static struct tcp_metrics_block *tcpm_new(struct dst_entry *dst,
struct net *net;
spin_lock_bh(&tcp_metrics_lock);
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
/* While waiting for the spin-lock the cache might have been populated
* with this entry and so we have to check again.
@@ -273,7 +273,7 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req,
return NULL;
}
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
@@ -318,7 +318,7 @@ static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk,
else
return NULL;
- net = dev_net_rcu(dst_dev(dst));
+ net = dst_dev_net_rcu(dst);
hash ^= net_hash_mix(net);
hash = hash_32(hash, tcp_metrics_hash_log);
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 2994c9222c9c..2ec8c6f1cdcc 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -20,9 +20,11 @@
*/
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/xfrm.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
+#include <net/psp.h>
static bool tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
@@ -103,9 +105,16 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
u32 rcv_nxt = READ_ONCE(tcptw->tw_rcv_nxt);
struct tcp_options_received tmp_opt;
+ enum skb_drop_reason psp_drop;
bool paws_reject = false;
int ts_recent_stamp;
+ /* Instead of dropping immediately, defer and see which verdict is
+ * reached: a non-PSP-encapsulated SYN is still accepted when
+ * TCP_TW_SYN is returned.
+ */
+ psp_drop = psp_twsk_rx_policy_check(tw, skb);
+
tmp_opt.saw_tstamp = 0;
ts_recent_stamp = READ_ONCE(tcptw->tw_ts_recent_stamp);
if (th->doff > (sizeof(*th) >> 2) && ts_recent_stamp) {
@@ -123,6 +132,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
if (READ_ONCE(tw->tw_substate) == TCP_FIN_WAIT2) {
/* Just repeat all the checks of tcp_rcv_state_process() */
+ if (psp_drop)
+ goto out_put;
+
/* Out of window, send ACK */
if (paws_reject ||
!tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
@@ -193,6 +205,9 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
(TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
/* In window segment, it may be only reset or bare ack. */
+ if (psp_drop)
+ goto out_put;
+
if (th->rst) {
/* This is TIME_WAIT assassination, in two flavors.
* Oh well... nobody has a sufficient solution to this
@@ -246,6 +261,9 @@ kill:
return TCP_TW_SYN;
}
+ if (psp_drop)
+ goto out_put;
+
if (paws_reject) {
*drop_reason = SKB_DROP_REASON_TCP_RFC7323_TW_PAWS;
__NET_INC_STATS(twsk_net(tw), LINUX_MIB_PAWS_TW_REJECTED);
@@ -264,6 +282,8 @@ kill:
return tcp_timewait_check_oow_rate_limit(
tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT);
}
+
+out_put:
inet_twsk_put(tw);
return TCP_TW_SUCCESS;
}
@@ -377,31 +397,22 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
}
EXPORT_SYMBOL(tcp_time_wait);
-#ifdef CONFIG_TCP_MD5SIG
-static void tcp_md5_twsk_free_rcu(struct rcu_head *head)
-{
- struct tcp_md5sig_key *key;
-
- key = container_of(head, struct tcp_md5sig_key, rcu);
- kfree(key);
- static_branch_slow_dec_deferred(&tcp_md5_needed);
- tcp_md5_release_sigpool();
-}
-#endif
-
void tcp_twsk_destructor(struct sock *sk)
{
#ifdef CONFIG_TCP_MD5SIG
if (static_branch_unlikely(&tcp_md5_needed.key)) {
struct tcp_timewait_sock *twsk = tcp_twsk(sk);
- if (twsk->tw_md5_key)
- call_rcu(&twsk->tw_md5_key->rcu, tcp_md5_twsk_free_rcu);
+ if (twsk->tw_md5_key) {
+ kfree(twsk->tw_md5_key);
+ static_branch_slow_dec_deferred(&tcp_md5_needed);
+ tcp_md5_release_sigpool();
+ }
}
#endif
tcp_ao_destroy_sock(sk, true);
+ psp_twsk_assoc_free(inet_twsk(sk));
}
-EXPORT_IPV6_MOD_GPL(tcp_twsk_destructor);
void tcp_twsk_purge(struct list_head *net_exit_list)
{
@@ -461,12 +472,26 @@ void tcp_openreq_init_rwin(struct request_sock *req,
ireq->rcv_wscale = rcv_wscale;
}
-static void tcp_ecn_openreq_child(struct tcp_sock *tp,
- const struct request_sock *req)
+static void tcp_ecn_openreq_child(struct sock *sk,
+ const struct request_sock *req,
+ const struct sk_buff *skb)
{
- tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
- TCP_ECN_MODE_RFC3168 :
- TCP_ECN_DISABLED);
+ const struct tcp_request_sock *treq = tcp_rsk(req);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (treq->accecn_ok) {
+ tcp_ecn_mode_set(tp, TCP_ECN_MODE_ACCECN);
+ tp->syn_ect_snt = treq->syn_ect_snt;
+ tcp_accecn_third_ack(sk, skb, treq->syn_ect_snt);
+ tp->saw_accecn_opt = treq->saw_accecn_opt;
+ tp->prev_ecnfield = treq->syn_ect_rcv;
+ tp->accecn_opt_demand = 1;
+ tcp_ecn_received_counters_payload(sk, skb);
+ } else {
+ tcp_ecn_mode_set(tp, inet_rsk(req)->ecn_ok ?
+ TCP_ECN_MODE_RFC3168 :
+ TCP_ECN_DISABLED);
+ }
}
void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst)
@@ -631,7 +656,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
if (skb->len >= TCP_MSS_DEFAULT + newtp->tcp_header_len)
newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
newtp->rx_opt.mss_clamp = req->mss;
- tcp_ecn_openreq_child(newtp, req);
+ tcp_ecn_openreq_child(newsk, req, skb);
newtp->fastopen_req = NULL;
RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
@@ -674,6 +699,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
bool own_req;
tmp_opt.saw_tstamp = 0;
+ tmp_opt.accecn = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) {
tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL);
@@ -851,6 +877,18 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
if (!(flg & TCP_FLAG_ACK))
return NULL;
+ if (tcp_rsk(req)->accecn_ok && tmp_opt.accecn &&
+ tcp_rsk(req)->saw_accecn_opt < TCP_ACCECN_OPT_COUNTER_SEEN) {
+ u8 saw_opt = tcp_accecn_option_init(skb, tmp_opt.accecn);
+
+ tcp_rsk(req)->saw_accecn_opt = saw_opt;
+ if (tcp_rsk(req)->saw_accecn_opt == TCP_ACCECN_OPT_FAIL_SEEN) {
+ u8 fail_mode = TCP_ACCECN_OPT_FAIL_RECV;
+
+ tcp_rsk(req)->accecn_fail_mode |= fail_mode;
+ }
+ }
+
/* For Fast Open no more processing is needed (sk is the
* child socket).
*/
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index be5c2294610e..2cb93da93abc 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -434,8 +434,7 @@ static void tcp4_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
inet_get_iif_sdif(skb, &iif, &sdif);
iph = skb_gro_network_header(skb);
net = dev_net_rcu(skb->dev);
- sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- iph->saddr, th->source,
+ sk = __inet_lookup_established(net, iph->saddr, th->source,
iph->daddr, ntohs(th->dest),
iif, sdif);
NAPI_GRO_CB(skb)->is_flist = !sk;
@@ -485,6 +484,7 @@ INDIRECT_CALLABLE_SCOPE int tcp4_gro_complete(struct sk_buff *skb, int thoff)
th->check = ~tcp_v4_check(skb->len - thoff, iph->saddr,
iph->daddr, 0);
+ BUILD_BUG_ON(SKB_GSO_TCP_FIXEDID << 1 != SKB_GSO_TCP_FIXEDID_INNER);
skb_shinfo(skb)->gso_type |= SKB_GSO_TCPV4 |
(NAPI_GRO_CB(skb)->ip_fixedid * SKB_GSO_TCP_FIXEDID);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index caf11920a878..bb3576ac0ad7 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -38,8 +38,10 @@
#define pr_fmt(fmt) "TCP: " fmt
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#include <net/mptcp.h>
#include <net/proto_memory.h>
+#include <net/psp.h>
#include <linux/compiler.h>
#include <linux/gfp.h>
@@ -319,60 +321,6 @@ static u16 tcp_select_window(struct sock *sk)
return new_win;
}
-/* Packet ECN state for a SYN-ACK */
-static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
-{
- const struct tcp_sock *tp = tcp_sk(sk);
-
- TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_CWR;
- if (tcp_ecn_disabled(tp))
- TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_ECE;
- else if (tcp_ca_needs_ecn(sk) ||
- tcp_bpf_ca_needs_ecn(sk))
- INET_ECN_xmit(sk);
-}
-
-/* Packet ECN state for a SYN. */
-static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
-{
- struct tcp_sock *tp = tcp_sk(sk);
- bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk);
- bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 ||
- tcp_ca_needs_ecn(sk) || bpf_needs_ecn;
-
- if (!use_ecn) {
- const struct dst_entry *dst = __sk_dst_get(sk);
-
- if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
- use_ecn = true;
- }
-
- tp->ecn_flags = 0;
-
- if (use_ecn) {
- TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
- tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168);
- if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn)
- INET_ECN_xmit(sk);
- }
-}
-
-static void tcp_ecn_clear_syn(struct sock *sk, struct sk_buff *skb)
-{
- if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_fallback))
- /* tp->ecn_flags are cleared at a later point in time when
- * SYN ACK is ultimatively being received.
- */
- TCP_SKB_CB(skb)->tcp_flags &= ~(TCPHDR_ECE | TCPHDR_CWR);
-}
-
-static void
-tcp_ecn_make_synack(const struct request_sock *req, struct tcphdr *th)
-{
- if (inet_rsk(req)->ecn_ok)
- th->ece = 1;
-}
-
/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
* be sent.
*/
@@ -381,7 +329,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
{
struct tcp_sock *tp = tcp_sk(sk);
- if (tcp_ecn_mode_rfc3168(tp)) {
+ if (!tcp_ecn_mode_any(tp))
+ return;
+
+ if (tcp_ecn_mode_accecn(tp)) {
+ if (!tcp_accecn_ace_fail_recv(tp))
+ INET_ECN_xmit(sk);
+ tcp_accecn_set_ace(tp, skb, th);
+ skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ACCECN;
+ } else {
/* Not-retransmitted data segment: set ECT and inject CWR. */
if (skb->len != tcp_header_len &&
!before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) {
@@ -403,13 +359,15 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb,
/* Constructs common control bits of non-data skb. If SYN/FIN is present,
* auto increment end seqno.
*/
-static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u16 flags)
+static void tcp_init_nondata_skb(struct sk_buff *skb, struct sock *sk,
+ u32 seq, u16 flags)
{
skb->ip_summed = CHECKSUM_PARTIAL;
TCP_SKB_CB(skb)->tcp_flags = flags;
tcp_skb_pcount_set(skb, 1);
+ psp_enqueue_set_decrypted(sk, skb);
TCP_SKB_CB(skb)->seq = seq;
if (flags & (TCPHDR_SYN | TCPHDR_FIN))
@@ -430,6 +388,7 @@ static inline bool tcp_urg_mode(const struct tcp_sock *tp)
#define OPTION_SMC BIT(9)
#define OPTION_MPTCP BIT(10)
#define OPTION_AO BIT(11)
+#define OPTION_ACCECN BIT(12)
static void smc_options_write(__be32 *ptr, u16 *options)
{
@@ -451,6 +410,8 @@ struct tcp_out_options {
u16 mss; /* 0 to disable */
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
+ u8 num_accecn_fields:7, /* number of AccECN fields needed */
+ use_synack_ecn_bytes:1; /* Use synack_ecn_bytes or not */
u8 hash_size; /* bytes in hash_location */
u8 bpf_opt_len; /* length of BPF hdr option */
__u8 *hash_location; /* temporary pointer, overloaded */
@@ -648,6 +609,11 @@ static __be32 *process_tcp_ao_options(struct tcp_sock *tp,
return ptr;
}
+/* Initial values for the AccECN option; the ordering is based on the ECN
+ * field bits, as in received_ecn_bytes. Used for the SYN/ACK AccECN option.
+ */
+static const u32 synack_ecn_bytes[3] = { 0, 0, 0 };
+
/* Write previously computed TCP options to the packet.
*
* Beware: Something in the Internet is very sensitive to the ordering of
@@ -666,6 +632,8 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
struct tcp_out_options *opts,
struct tcp_key *key)
{
+ u8 leftover_highbyte = TCPOPT_NOP; /* replace 1st NOP if avail */
+ u8 leftover_lowbyte = TCPOPT_NOP; /* replace 2nd NOP in succession */
__be32 *ptr = (__be32 *)(th + 1);
u16 options = opts->options; /* mungable copy */
@@ -701,15 +669,75 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
*ptr++ = htonl(opts->tsecr);
}
+ if (OPTION_ACCECN & options) {
+ const u32 *ecn_bytes = opts->use_synack_ecn_bytes ?
+ synack_ecn_bytes :
+ tp->received_ecn_bytes;
+ const u8 ect0_idx = INET_ECN_ECT_0 - 1;
+ const u8 ect1_idx = INET_ECN_ECT_1 - 1;
+ const u8 ce_idx = INET_ECN_CE - 1;
+ u32 e0b;
+ u32 e1b;
+ u32 ceb;
+ u8 len;
+
+ e0b = ecn_bytes[ect0_idx] + TCP_ACCECN_E0B_INIT_OFFSET;
+ e1b = ecn_bytes[ect1_idx] + TCP_ACCECN_E1B_INIT_OFFSET;
+ ceb = ecn_bytes[ce_idx] + TCP_ACCECN_CEB_INIT_OFFSET;
+ len = TCPOLEN_ACCECN_BASE +
+ opts->num_accecn_fields * TCPOLEN_ACCECN_PERFIELD;
+
+ if (opts->num_accecn_fields == 2) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ *ptr++ = htonl(((e1b & 0xff) << 24) |
+ (ceb & 0xffffff));
+ } else if (opts->num_accecn_fields == 1) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ leftover_highbyte = e1b & 0xff;
+ leftover_lowbyte = TCPOPT_NOP;
+ } else if (opts->num_accecn_fields == 0) {
+ leftover_highbyte = TCPOPT_ACCECN1;
+ leftover_lowbyte = len;
+ } else if (opts->num_accecn_fields == 3) {
+ *ptr++ = htonl((TCPOPT_ACCECN1 << 24) | (len << 16) |
+ ((e1b >> 8) & 0xffff));
+ *ptr++ = htonl(((e1b & 0xff) << 24) |
+ (ceb & 0xffffff));
+ *ptr++ = htonl(((e0b & 0xffffff) << 8) |
+ TCPOPT_NOP);
+ }
+ if (tp) {
+ tp->accecn_minlen = 0;
+ tp->accecn_opt_tstamp = tp->tcp_mstamp;
+ if (tp->accecn_opt_demand)
+ tp->accecn_opt_demand--;
+ }
+ }
+
if (unlikely(OPTION_SACK_ADVERTISE & options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
}
if (unlikely(OPTION_WSCALE & options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
+ u8 highbyte = TCPOPT_NOP;
+
+ /* Do not split a 2-byte leftover across a single NOP,
+ * i.e., replace this NOP only when exactly one leftover
+ * byte sits in leftover_highbyte.
+ */
+ if (unlikely(leftover_highbyte != TCPOPT_NOP &&
+ leftover_lowbyte == TCPOPT_NOP)) {
+ highbyte = leftover_highbyte;
+ leftover_highbyte = TCPOPT_NOP;
+ }
+ *ptr++ = htonl((highbyte << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
opts->ws);
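A worked layout example for the block above: with num_accecn_fields == 3 the option is kind TCPOPT_ACCECN1, len = TCPOLEN_ACCECN_BASE + 3 * TCPOLEN_ACCECN_PERFIELD = 11 bytes, carrying three 24-bit byte counters in the order ECT(1), CE, ECT(0):

/*
 *	word 0:	kind (1 byte) | len (1 byte) | e1b bits 23..8
 *	word 1:	e1b bits 7..0 | ceb bits 23..0
 *	word 2:	e0b bits 23..0 | NOP	(pads 11 bytes to 12)
 *
 * With 0-2 fields the odd tail bytes are parked in leftover_highbyte/
 * leftover_lowbyte instead, so a following SACK_PERM, WINDOW or SACK
 * option absorbs them in place of its alignment NOPs.
 */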
@@ -720,11 +748,13 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
tp->duplicate_sack : tp->selective_acks;
int this_sack;
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
(TCPOPT_SACK << 8) |
(TCPOLEN_SACK_BASE + (opts->num_sack_blocks *
TCPOLEN_SACK_PERBLOCK)));
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
for (this_sack = 0; this_sack < opts->num_sack_blocks;
++this_sack) {
@@ -733,6 +763,14 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
}
tp->rx_opt.dsack = 0;
+ } else if (unlikely(leftover_highbyte != TCPOPT_NOP ||
+ leftover_lowbyte != TCPOPT_NOP)) {
+ *ptr++ = htonl((leftover_highbyte << 24) |
+ (leftover_lowbyte << 16) |
+ (TCPOPT_NOP << 8) |
+ TCPOPT_NOP);
+ leftover_highbyte = TCPOPT_NOP;
+ leftover_lowbyte = TCPOPT_NOP;
}
if (unlikely(OPTION_FAST_OPEN_COOKIE & options)) {
@@ -813,6 +851,80 @@ static void mptcp_set_option_cond(const struct request_sock *req,
}
}
+static u32 tcp_synack_options_combine_saving(struct tcp_out_options *opts)
+{
+ /* How much there's room for combining with the alignment padding? */
+ if ((opts->options & (OPTION_SACK_ADVERTISE | OPTION_TS)) ==
+ OPTION_SACK_ADVERTISE)
+ return 2;
+ else if (opts->options & OPTION_WSCALE)
+ return 1;
+ return 0;
+}
+
+/* Calculates how large an AccECN option will fit into @remaining option space.
+ *
+ * AccECN option can sometimes replace NOPs used for alignment of other
+ * TCP options (up to @max_combine_saving available).
+ *
+ * Only solutions with at least @required AccECN fields are accepted.
+ *
+ * Returns: The size of the AccECN option excluding space repurposed from
+ * the alignment of the other options.
+ */
+static int tcp_options_fit_accecn(struct tcp_out_options *opts, int required,
+ int remaining)
+{
+ int size = TCP_ACCECN_MAXSIZE;
+ int sack_blocks_reduce = 0;
+ int max_combine_saving;
+ int rem = remaining;
+ int align_size;
+
+ if (opts->use_synack_ecn_bytes)
+ max_combine_saving = tcp_synack_options_combine_saving(opts);
+ else
+ max_combine_saving = opts->num_sack_blocks > 0 ? 2 : 0;
+ opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS;
+ while (opts->num_accecn_fields >= required) {
+ /* Pad to dword if cannot combine */
+ if ((size & 0x3) > max_combine_saving)
+ align_size = ALIGN(size, 4);
+ else
+ align_size = ALIGN_DOWN(size, 4);
+
+ if (rem >= align_size) {
+ size = align_size;
+ break;
+ } else if (opts->num_accecn_fields == required &&
+ opts->num_sack_blocks > 2 &&
+ required > 0) {
+ /* Try to fit the option by removing one SACK block */
+ opts->num_sack_blocks--;
+ sack_blocks_reduce++;
+ rem = rem + TCPOLEN_SACK_PERBLOCK;
+
+ opts->num_accecn_fields = TCP_ACCECN_NUMFIELDS;
+ size = TCP_ACCECN_MAXSIZE;
+ continue;
+ }
+
+ opts->num_accecn_fields--;
+ size -= TCPOLEN_ACCECN_PERFIELD;
+ }
+ if (sack_blocks_reduce > 0) {
+ if (opts->num_accecn_fields >= required)
+ size -= sack_blocks_reduce * TCPOLEN_SACK_PERBLOCK;
+ else
+ opts->num_sack_blocks += sack_blocks_reduce;
+ }
+ if (opts->num_accecn_fields < required)
+ return 0;
+
+ opts->options |= OPTION_ACCECN;
+ return size;
+}
+
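A worked trace of the loop above: on a SYN/ACK carrying SACK_PERM but no timestamps, tcp_synack_options_combine_saving() allows 2 bytes of combining. With remaining = 6 and required = 0:

/*
 *	3 fields: size 11, (11 & 3) = 3 > 2 -> pad up to 12, too big
 *	2 fields: size  8, already aligned  -> 8, still too big
 *	1 field:  size  5, (5 & 3) = 1 <= 2 -> round down to 4, fits
 *
 * The function returns 4: a 5-byte option whose odd byte later replaces
 * one of the NOPs padding the SACK_PERM word in tcp_options_write().
 */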
/* Compute TCP options for SYN packets. This is not the final
* network wire format yet.
*/
@@ -895,6 +1007,20 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb,
}
}
+ /* A simultaneous-open SYN/ACK needs the AccECN option; a plain SYN
+ * does not. Negotiating AccECN is also attempted on the first
+ * retransmitted SYN, as described in "3.1.4.1. Retransmitted SYNs"
+ * of the AccECN draft.
+ */
+ if (unlikely((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) &&
+ tcp_ecn_mode_accecn(tp) &&
+ inet_csk(sk)->icsk_retransmits < 2 &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
+ remaining >= TCPOLEN_ACCECN_BASE)) {
+ opts->use_synack_ecn_bytes = 1;
+ remaining -= tcp_options_fit_accecn(opts, 0, remaining);
+ }
+
bpf_skops_hdr_opt_len(sk, skb, NULL, NULL, 0, opts, &remaining);
return MAX_TCP_OPTION_SPACE - remaining;
@@ -912,6 +1038,7 @@ static unsigned int tcp_synack_options(const struct sock *sk,
{
struct inet_request_sock *ireq = inet_rsk(req);
unsigned int remaining = MAX_TCP_OPTION_SPACE;
+ struct tcp_request_sock *treq = tcp_rsk(req);
if (tcp_key_is_md5(key)) {
opts->options |= OPTION_MD5;
@@ -974,6 +1101,13 @@ static unsigned int tcp_synack_options(const struct sock *sk,
smc_set_option_cond(tcp_sk(sk), ireq, opts, &remaining);
+ if (treq->accecn_ok &&
+ READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option) &&
+ req->num_timeout < 1 && remaining >= TCPOLEN_ACCECN_BASE) {
+ opts->use_synack_ecn_bytes = 1;
+ remaining -= tcp_options_fit_accecn(opts, 0, remaining);
+ }
+
bpf_skops_hdr_opt_len((struct sock *)sk, skb, req, syn_skb,
synack_type, opts, &remaining);
@@ -1030,17 +1164,32 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb
eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
if (unlikely(eff_sacks)) {
const unsigned int remaining = MAX_TCP_OPTION_SPACE - size;
- if (unlikely(remaining < TCPOLEN_SACK_BASE_ALIGNED +
- TCPOLEN_SACK_PERBLOCK))
- return size;
+ if (likely(remaining >= TCPOLEN_SACK_BASE_ALIGNED +
+ TCPOLEN_SACK_PERBLOCK)) {
+ opts->num_sack_blocks =
+ min_t(unsigned int, eff_sacks,
+ (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
+ TCPOLEN_SACK_PERBLOCK);
+
+ size += TCPOLEN_SACK_BASE_ALIGNED +
+ opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+ } else {
+ opts->num_sack_blocks = 0;
+ }
+ } else {
+ opts->num_sack_blocks = 0;
+ }
- opts->num_sack_blocks =
- min_t(unsigned int, eff_sacks,
- (remaining - TCPOLEN_SACK_BASE_ALIGNED) /
- TCPOLEN_SACK_PERBLOCK);
+ if (tcp_ecn_mode_accecn(tp)) {
+ int ecn_opt = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn_option);
- size += TCPOLEN_SACK_BASE_ALIGNED +
- opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK;
+ if (ecn_opt && tp->saw_accecn_opt && !tcp_accecn_opt_fail_send(tp) &&
+ (ecn_opt >= TCP_ACCECN_OPTION_FULL || tp->accecn_opt_demand ||
+ tcp_accecn_option_beacon_check(sk))) {
+ opts->use_synack_ecn_bytes = 0;
+ size += tcp_options_fit_accecn(opts, tp->accecn_minlen,
+ MAX_TCP_OPTION_SPACE - size);
+ }
}
if (unlikely(BPF_SOCK_OPS_TEST_FLAG(tp,
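tcp_accecn_option_beacon_check() is defined elsewhere in the series; a hedged sketch of the pacing rule it is assumed to apply when sysctl_tcp_ecn_option_beacon is set (the three-per-RTT constant is an assumption, not the kernel's exact arithmetic):

static bool example_accecn_beacon_due(const struct tcp_sock *tp, int beacon)
{
	u32 srtt_us = tp->srtt_us >> 3;	/* srtt_us stores srtt << 3 */

	if (!beacon)
		return false;
	/* due once at least srtt/3 has passed since the last option */
	return tcp_stamp_us_delta(tp->tcp_mstamp,
				  tp->accecn_opt_tstamp) * 3 >= srtt_us;
}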
@@ -1437,7 +1586,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
err = tcp_ao_transmit_skb(sk, skb, key.ao_key, th,
opts.hash_location);
if (err) {
- kfree_skb_reason(skb, SKB_DROP_REASON_NOT_SPECIFIED);
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_NOT_SPECIFIED);
return -ENOMEM;
}
}
@@ -1510,6 +1659,7 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
/* Advance write_seq and place onto the write_queue. */
WRITE_ONCE(tp->write_seq, TCP_SKB_CB(skb)->end_seq);
__skb_header_release(skb);
+ psp_enqueue_set_decrypted(sk, skb);
tcp_add_write_queue_tail(sk, skb);
sk_wmem_queued_add(sk, skb->truesize);
sk_mem_charge(sk, skb->truesize);
@@ -2750,6 +2900,11 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
sent_pkts = 0;
tcp_mstamp_refresh(tp);
+
+ /* The AccECN option beacon depends on mstamp and may change the MSS */
+ if (tcp_ecn_mode_accecn(tp) && tcp_accecn_option_beacon_check(sk))
+ mss_now = tcp_current_mss(sk);
+
if (!push_one) {
/* Do MTU probing. */
result = tcp_mtu_probe(sk);
@@ -3402,7 +3557,10 @@ start:
tcp_retrans_try_collapse(sk, skb, avail_wnd);
}
- /* RFC3168, section 6.1.1.1. ECN fallback */
+ /* RFC3168, section 6.1.1.1. ECN fallback
+ * As AccECN uses the same SYN flags (+ AE), this check covers both
+ * cases.
+ */
if ((TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN_ECN) == TCPHDR_SYN_ECN)
tcp_ecn_clear_syn(sk, skb);
@@ -3578,9 +3736,8 @@ void sk_forced_mem_schedule(struct sock *sk, int size)
sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
sk_memory_allocated_add(sk, amt);
- if (mem_cgroup_sockets_enabled && sk->sk_memcg)
- mem_cgroup_charge_skmem(sk->sk_memcg, amt,
- gfp_memcg_charge() | __GFP_NOFAIL);
+ if (mem_cgroup_sk_enabled(sk))
+ mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge() | __GFP_NOFAIL);
}
/* Send a FIN. The caller locks the socket for us.
@@ -3625,7 +3782,7 @@ void tcp_send_fin(struct sock *sk)
skb_reserve(skb, MAX_TCP_HEADER);
sk_forced_mem_schedule(sk, skb->truesize);
/* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
- tcp_init_nondata_skb(skb, tp->write_seq,
+ tcp_init_nondata_skb(skb, sk, tp->write_seq,
TCPHDR_ACK | TCPHDR_FIN);
tcp_queue_skb(sk, skb);
}
@@ -3653,7 +3810,7 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority,
/* Reserve space for headers and prepare control bits. */
skb_reserve(skb, MAX_TCP_HEADER);
- tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk),
+ tcp_init_nondata_skb(skb, sk, tcp_acceptable_seq(sk),
TCPHDR_ACK | TCPHDR_RST);
tcp_mstamp_refresh(tcp_sk(sk));
/* Send it off. */
@@ -3891,6 +4048,7 @@ static void tcp_connect_init(struct sock *sk)
const struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
__u8 rcv_wscale;
+ u16 user_mss;
u32 rcv_wnd;
/* We'll fix this up when we get a response from the other end.
@@ -3903,8 +4061,9 @@ static void tcp_connect_init(struct sock *sk)
tcp_ao_connect_init(sk);
/* If user gave his TCP_MAXSEG, record it to clamp */
- if (tp->rx_opt.user_mss)
- tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
+ user_mss = READ_ONCE(tp->rx_opt.user_mss);
+ if (user_mss)
+ tp->rx_opt.mss_clamp = user_mss;
tp->max_window = 0;
tcp_mtup_init(sk);
tcp_sync_mss(sk, dst_mtu(dst));
@@ -3955,7 +4114,7 @@ static void tcp_connect_init(struct sock *sk)
WRITE_ONCE(tp->copied_seq, tp->rcv_nxt);
inet_csk(sk)->icsk_rto = tcp_timeout_init(sk);
- inet_csk(sk)->icsk_retransmits = 0;
+ WRITE_ONCE(inet_csk(sk)->icsk_retransmits, 0);
tcp_clear_retrans(tp);
}
@@ -4148,7 +4307,7 @@ int tcp_connect(struct sock *sk)
/* SYN eats a sequence byte, write_seq updated by
* tcp_connect_queue_skb().
*/
- tcp_init_nondata_skb(buff, tp->write_seq, TCPHDR_SYN);
+ tcp_init_nondata_skb(buff, sk, tp->write_seq, TCPHDR_SYN);
tcp_mstamp_refresh(tp);
tp->retrans_stamp = tcp_time_stamp_ts(tp);
tcp_connect_queue_skb(sk, buff);
@@ -4273,7 +4432,8 @@ void __tcp_send_ack(struct sock *sk, u32 rcv_nxt, u16 flags)
/* Reserve space for headers and prepare control bits. */
skb_reserve(buff, MAX_TCP_HEADER);
- tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPHDR_ACK | flags);
+ tcp_init_nondata_skb(buff, sk,
+ tcp_acceptable_seq(sk), TCPHDR_ACK | flags);
/* We do not want pure acks influencing TCP Small Queues or fq/pacing
* too much.
@@ -4319,7 +4479,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
* end to send an ack. Don't queue or clone SKB, just
* send it.
*/
- tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
+ tcp_init_nondata_skb(skb, sk, tp->snd_una - !urgent, TCPHDR_ACK);
NET_INC_STATS(sock_net(sk), mib);
return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}
@@ -4393,13 +4553,13 @@ void tcp_send_probe0(struct sock *sk)
if (tp->packets_out || tcp_write_queue_empty(sk)) {
/* Cancel probe timer, if it is not required. */
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
icsk->icsk_backoff = 0;
icsk->icsk_probes_tstamp = 0;
return;
}
- icsk->icsk_probes_out++;
+ WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1);
if (err <= 0) {
if (icsk->icsk_backoff < READ_ONCE(net->ipv4.sysctl_tcp_retries2))
icsk->icsk_backoff++;
@@ -4437,7 +4597,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
tcp_sk_rw(sk)->total_retrans++;
}
trace_tcp_retransmit_synack(sk, req);
- req->num_retrans++;
+ WRITE_ONCE(req->num_retrans, req->num_retrans + 1);
}
return res;
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index a207877270fb..2dd73a4e8e51 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -392,7 +392,7 @@ static void tcp_probe_timer(struct sock *sk)
int max_probes;
if (tp->packets_out || !skb) {
- icsk->icsk_probes_out = 0;
+ WRITE_ONCE(icsk->icsk_probes_out, 0);
icsk->icsk_probes_tstamp = 0;
return;
}
@@ -444,7 +444,7 @@ static void tcp_update_rto_stats(struct sock *sk)
tp->total_rto_recoveries++;
tp->rto_stamp = tcp_time_stamp_ms(tp);
}
- icsk->icsk_retransmits++;
+ WRITE_ONCE(icsk->icsk_retransmits, icsk->icsk_retransmits + 1);
tp->total_rto++;
}
@@ -839,7 +839,7 @@ static void tcp_keepalive_timer(struct timer_list *t)
goto out;
}
if (tcp_write_wakeup(sk, LINUX_MIB_TCPKEEPALIVE) <= 0) {
- icsk->icsk_probes_out++;
+ WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1);
elapsed = keepalive_intvl_when(tp);
} else {
/* If keepalive was lost due to local congestion,
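The WRITE_ONCE()/READ_ONCE() annotations in this file pair with the lockless readers in /proc/net/tcp and inet_diag; they prevent store/load tearing and compiler refetches without adding CPU barriers. The pairing in miniature:

	/* writer, under the socket lock */
	WRITE_ONCE(icsk->icsk_probes_out, icsk->icsk_probes_out + 1);

	/* lockless reader, e.g. get_tcp4_sock() */
	u8 probes = READ_ONCE(icsk->icsk_probes_out);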
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cc3ce0f762ec..95241093b7f0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -68,7 +68,7 @@
* YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
* Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind
* a single port at the same time.
- * Derek Atkins <derek@ihtfp.com>: Add Encapulation Support
+ * Derek Atkins <derek@ihtfp.com>: Add Encapsulation Support
* James Chapman : Add L2TP encapsulation type.
*/
@@ -509,7 +509,7 @@ rescore:
/* compute_score is too long of a function to be
* inlined, and calling it again here yields
- * measureable overhead for some
+ * measurable overhead for some
* workloads. Work around it by jumping
* backwards to rescore 'result'.
*/
@@ -1685,31 +1685,6 @@ static void udp_skb_dtor_locked(struct sock *sk, struct sk_buff *skb)
udp_rmem_release(sk, udp_skb_truesize(skb), 1, true);
}
-/* Idea of busylocks is to let producers grab an extra spinlock
- * to relieve pressure on the receive_queue spinlock shared by consumer.
- * Under flood, this means that only one producer can be in line
- * trying to acquire the receive_queue spinlock.
- * These busylock can be allocated on a per cpu manner, instead of a
- * per socket one (that would consume a cache line per socket)
- */
-static int udp_busylocks_log __read_mostly;
-static spinlock_t *udp_busylocks __read_mostly;
-
-static spinlock_t *busylock_acquire(void *ptr)
-{
- spinlock_t *busy;
-
- busy = udp_busylocks + hash_ptr(ptr, udp_busylocks_log);
- spin_lock(busy);
- return busy;
-}
-
-static void busylock_release(spinlock_t *busy)
-{
- if (busy)
- spin_unlock(busy);
-}
-
static int udp_rmem_schedule(struct sock *sk, int size)
{
int delta;
@@ -1724,14 +1699,24 @@ static int udp_rmem_schedule(struct sock *sk, int size)
int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
{
struct sk_buff_head *list = &sk->sk_receive_queue;
+ struct udp_prod_queue *udp_prod_queue;
+ struct sk_buff *next, *to_drop = NULL;
+ struct llist_node *ll_list;
unsigned int rmem, rcvbuf;
- spinlock_t *busy = NULL;
int size, err = -ENOMEM;
+ int total_size = 0;
+ int q_size = 0;
+ int dropcount;
+ int nb = 0;
rmem = atomic_read(&sk->sk_rmem_alloc);
rcvbuf = READ_ONCE(sk->sk_rcvbuf);
size = skb->truesize;
+ udp_prod_queue = &udp_sk(sk)->udp_prod_queue[numa_node_id()];
+
+ rmem += atomic_read(&udp_prod_queue->rmem_alloc);
+
/* Immediately drop when the receive queue is full.
* Cast to unsigned int performs the boundary check for INT_MAX.
*/
@@ -1739,8 +1724,8 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
if (rcvbuf > INT_MAX >> 1)
goto drop;
- /* Always allow at least one packet for small buffer. */
- if (rmem > rcvbuf)
+ /* Accept the packet if queue is empty. */
+ if (rmem)
goto drop;
}
@@ -1753,42 +1738,77 @@ int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb)
if (rmem > (rcvbuf >> 1)) {
skb_condense(skb);
size = skb->truesize;
- busy = busylock_acquire(sk);
}
udp_set_dev_scratch(skb);
- atomic_add(size, &sk->sk_rmem_alloc);
+ atomic_add(size, &udp_prod_queue->rmem_alloc);
+
+ if (!llist_add(&skb->ll_node, &udp_prod_queue->ll_root))
+ return 0;
+
+ dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ? sk_drops_read(sk) : 0;
spin_lock(&list->lock);
- err = udp_rmem_schedule(sk, size);
- if (err) {
- spin_unlock(&list->lock);
- goto uncharge_drop;
- }
- sk_forward_alloc_add(sk, -size);
+ ll_list = llist_del_all(&udp_prod_queue->ll_root);
- /* no need to setup a destructor, we will explicitly release the
- * forward allocated memory on dequeue
- */
- sock_skb_set_dropcount(sk, skb);
+ ll_list = llist_reverse_order(ll_list);
+
+ llist_for_each_entry_safe(skb, next, ll_list, ll_node) {
+ size = udp_skb_truesize(skb);
+ total_size += size;
+ err = udp_rmem_schedule(sk, size);
+ if (unlikely(err)) {
+ /* Free the skbs outside of locked section. */
+ skb->next = to_drop;
+ to_drop = skb;
+ continue;
+ }
+
+ q_size += size;
+ sk_forward_alloc_add(sk, -size);
+
+ /* No need to set up a destructor; we explicitly release the
+ * forward-allocated memory on dequeue.
+ */
+ SOCK_SKB_CB(skb)->dropcount = dropcount;
+ nb++;
+ __skb_queue_tail(list, skb);
+ }
+
+ atomic_add(q_size, &sk->sk_rmem_alloc);
- __skb_queue_tail(list, skb);
spin_unlock(&list->lock);
- if (!sock_flag(sk, SOCK_DEAD))
- INDIRECT_CALL_1(sk->sk_data_ready, sock_def_readable, sk);
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ /* Multiple threads might be blocked in recvmsg(),
+ * using prepare_to_wait_exclusive().
+ */
+ while (nb) {
+ INDIRECT_CALL_1(sk->sk_data_ready,
+ sock_def_readable, sk);
+ nb--;
+ }
+ }
- busylock_release(busy);
- return 0;
+ if (unlikely(to_drop)) {
+ for (nb = 0; to_drop != NULL; nb++) {
+ skb = to_drop;
+ to_drop = skb->next;
+ skb_mark_not_on_list(skb);
+ /* TODO: update SNMP values. */
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PROTO_MEM);
+ }
+ numa_drop_add(&udp_sk(sk)->drop_counters, nb);
+ }
-uncharge_drop:
- atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ atomic_sub(total_size, &udp_prod_queue->rmem_alloc);
+
+ return 0;
drop:
- atomic_inc(&sk->sk_drops);
- busylock_release(busy);
+ udp_drops_inc(sk);
return err;
}
EXPORT_IPV6_MOD_GPL(__udp_enqueue_schedule_skb);
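The rework above leans on a core llist property: llist_add() returns true only when the list was previously empty, so exactly one producer per batch proceeds to take the receive-queue lock and splice everyone's skbs. A minimal sketch of the pattern (names illustrative):

struct example_prod_queue {
	struct llist_head ll_root;
	atomic_t rmem_alloc;	/* bytes parked on ll_root */
};

static bool example_produce(struct example_prod_queue *q, struct sk_buff *skb)
{
	atomic_add(skb->truesize, &q->rmem_alloc);
	/* true => this caller must flush the batch to the socket */
	return llist_add(&skb->ll_node, &q->ll_root);
}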
@@ -1806,6 +1826,7 @@ void udp_destruct_common(struct sock *sk)
kfree_skb(skb);
}
udp_rmem_release(sk, total, 0, true);
+ kfree(up->udp_prod_queue);
}
EXPORT_IPV6_MOD_GPL(udp_destruct_common);
@@ -1817,10 +1838,11 @@ static void udp_destruct_sock(struct sock *sk)
int udp_init_sock(struct sock *sk)
{
- udp_lib_init_sock(sk);
+ int res = udp_lib_init_sock(sk);
+
sk->sk_destruct = udp_destruct_sock;
set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
- return 0;
+ return res;
}
void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
@@ -1828,6 +1850,13 @@ void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len)
if (unlikely(READ_ONCE(udp_sk(sk)->peeking_with_offset)))
sk_peek_offset_bwd(sk, len);
+ if (!skb_shared(skb)) {
+ if (unlikely(udp_skb_has_head_state(skb)))
+ skb_release_head_state(skb);
+ skb_attempt_defer_free(skb);
+ return;
+ }
+
if (!skb_unref(skb))
return;
@@ -1852,7 +1881,7 @@ static struct sk_buff *__first_packet_length(struct sock *sk,
IS_UDPLITE(sk));
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS,
IS_UDPLITE(sk));
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
__skb_unlink(skb, rcvq);
*total += skb->truesize;
kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
@@ -2008,7 +2037,7 @@ try_again:
__UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite);
__UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite);
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
kfree_skb_reason(skb, SKB_DROP_REASON_UDP_CSUM);
goto try_again;
}
@@ -2078,7 +2107,7 @@ try_again:
if (unlikely(err)) {
if (!peeking) {
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
UDP_INC_STATS(sock_net(sk),
UDP_MIB_INERRORS, is_udplite);
}
@@ -2449,7 +2478,7 @@ csum_error:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
@@ -2534,7 +2563,7 @@ start_lookup:
nskb = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!nskb)) {
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
__UDP_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
IS_UDPLITE(sk));
__UDP_INC_STATS(net, UDP_MIB_INERRORS,
@@ -2609,7 +2638,7 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh,
return 0;
}
-/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
+/* wrapper for udp_queue_rcv_skb taking care of csum conversion and
* return code conversion for ip layer consumption
*/
static int udp_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
@@ -2807,7 +2836,7 @@ static struct sock *__udp4_lib_demux_lookup(struct net *net,
return NULL;
}
-int udp_v4_early_demux(struct sk_buff *skb)
+enum skb_drop_reason udp_v4_early_demux(struct sk_buff *skb)
{
struct net *net = dev_net(skb->dev);
struct in_device *in_dev = NULL;
@@ -2821,7 +2850,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
/* validate the packet */
if (!pskb_may_pull(skb, skb_transport_offset(skb) + sizeof(struct udphdr)))
- return 0;
+ return SKB_NOT_DROPPED_YET;
iph = ip_hdr(skb);
uh = udp_hdr(skb);
@@ -2830,12 +2859,12 @@ int udp_v4_early_demux(struct sk_buff *skb)
in_dev = __in_dev_get_rcu(skb->dev);
if (!in_dev)
- return 0;
+ return SKB_NOT_DROPPED_YET;
ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
iph->protocol);
if (!ours)
- return 0;
+ return SKB_NOT_DROPPED_YET;
sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
uh->source, iph->saddr,
@@ -2846,7 +2875,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
}
if (!sk)
- return 0;
+ return SKB_NOT_DROPPED_YET;
skb->sk = sk;
DEBUG_NET_WARN_ON_ONCE(sk_is_refcounted(sk));
@@ -2873,7 +2902,7 @@ int udp_v4_early_demux(struct sk_buff *skb)
ip4h_dscp(iph),
skb->dev, in_dev, &itag);
}
- return 0;
+ return SKB_NOT_DROPPED_YET;
}
int udp_rcv(struct sk_buff *skb)
@@ -3386,7 +3415,7 @@ static void udp4_format_sock(struct sock *sp, struct seq_file *f,
from_kuid_munged(seq_user_ns(f), sk_uid(sp)),
0, sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops));
+ sk_drops_read(sp));
}
int udp4_seq_show(struct seq_file *seq, void *v)
@@ -3994,7 +4023,6 @@ static void __init bpf_iter_register(void)
void __init udp_init(void)
{
unsigned long limit;
- unsigned int i;
udp_table_init(&udp_table, "UDP");
limit = nr_free_buffer_pages() / 8;
@@ -4003,15 +4031,6 @@ void __init udp_init(void)
sysctl_udp_mem[1] = limit;
sysctl_udp_mem[2] = sysctl_udp_mem[0] * 2;
- /* 16 spinlocks per cpu */
- udp_busylocks_log = ilog2(nr_cpu_ids) + 4;
- udp_busylocks = kmalloc(sizeof(spinlock_t) << udp_busylocks_log,
- GFP_KERNEL);
- if (!udp_busylocks)
- panic("UDP: failed to alloc udp_busylocks\n");
- for (i = 0; i < (1U << udp_busylocks_log); i++)
- spin_lock_init(udp_busylocks + i);
-
if (register_pernet_subsys(&udp_sysctl_ops))
panic("UDP: failed to init sysctl parameters.\n");
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 38cb3a28e4ed..6e491c720c90 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -16,9 +16,9 @@
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *req,
- struct nlattr *bc, bool net_admin)
+ bool net_admin)
{
- if (!inet_diag_bc_sk(bc, sk))
+ if (!inet_diag_bc_sk(cb->data, sk))
return 0;
return inet_sk_diag_fill(sk, NULL, skb, cb, req, NLM_F_MULTI,
@@ -92,12 +92,8 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
{
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct net *net = sock_net(skb->sk);
- struct inet_diag_dump_data *cb_data;
int num, s_num, slot, s_slot;
- struct nlattr *bc;
- cb_data = cb->data;
- bc = cb_data->inet_diag_nla_bc;
s_slot = cb->args[0];
num = s_num = cb->args[1];
@@ -130,7 +126,7 @@ static void udp_dump(struct udp_table *table, struct sk_buff *skb,
r->id.idiag_dport)
goto next;
- if (sk_diag_dump(sk, skb, cb, r, bc, net_admin) < 0) {
+ if (sk_diag_dump(sk, skb, cb, r, net_admin) < 0) {
spin_unlock_bh(&hslot->lock);
goto done;
}
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index b1f3fd302e9d..19d0b5b09ffa 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -891,8 +891,6 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb)
skb_gro_checksum_try_convert(skb, IPPROTO_UDP,
inet_gro_compute_pseudo);
skip:
- NAPI_GRO_CB(skb)->is_ipv6 = 0;
-
if (static_branch_unlikely(&udp_encap_needed_key))
sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest);
diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c
index fce945f23069..54386e06a813 100644
--- a/net/ipv4/udp_tunnel_core.c
+++ b/net/ipv4/udp_tunnel_core.c
@@ -4,6 +4,7 @@
#include <linux/socket.h>
#include <linux/kernel.h>
#include <net/dst_metadata.h>
+#include <net/flow.h>
#include <net/udp.h>
#include <net/udp_tunnel.h>
#include <net/inet_dscp.h>
@@ -253,7 +254,7 @@ struct rtable *udp_tunnel_dst_lookup(struct sk_buff *skb,
fl4.saddr = key->u.ipv4.src;
fl4.fl4_dport = dport;
fl4.fl4_sport = sport;
- fl4.flowi4_tos = tos & INET_DSCP_MASK;
+ fl4.flowi4_dscp = inet_dsfield_to_dscp(tos);
fl4.flowi4_flags = key->flow_flags;
rt = ip_route_output_key(net, &fl4);
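The conversion above keeps only the DSCP bits of the tos byte in the strictly-typed flowi4_dscp field; for reference, inet_dsfield_to_dscp() amounts to the following mask:

static inline dscp_t example_dsfield_to_dscp(__u8 dsfield)
{
	/* dscp_t holds DSCP in dsfield position, ECN bits cleared */
	return (__force dscp_t)(dsfield & INET_DSCP_MASK);
}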
diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
index ff66db48453c..944b3cf25468 100644
--- a/net/ipv4/udp_tunnel_nic.c
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -930,7 +930,7 @@ udp_tunnel_nic_netdevice_event(struct notifier_block *unused,
err = udp_tunnel_nic_register(dev);
if (err)
- netdev_WARN(dev, "failed to register for UDP tunnel offloads: %d", err);
+ netdev_warn(dev, "failed to register for UDP tunnel offloads: %d", err);
return notifier_from_errno(err);
}
/* All other events will need the udp_tunnel_nic state */
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 7fb6205619e7..58faf1ddd2b1 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -14,7 +14,7 @@
#include <linux/inetdevice.h>
#include <net/dst.h>
#include <net/xfrm.h>
-#include <net/inet_dscp.h>
+#include <net/flow.h>
#include <net/ip.h>
#include <net/l3mdev.h>
@@ -25,7 +25,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct flowi4 *fl4,
memset(fl4, 0, sizeof(*fl4));
fl4->daddr = params->daddr->a4;
- fl4->flowi4_tos = inet_dscp_to_dsfield(params->dscp);
+ fl4->flowi4_dscp = params->dscp;
fl4->flowi4_l3mdev = l3mdev_master_ifindex_by_index(params->net,
params->oif);
fl4->flowi4_mark = params->mark;
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 1c9c686d9522..b8f9a8c0302e 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -304,10 +304,9 @@ config IPV6_SEG6_LWTUNNEL
config IPV6_SEG6_HMAC
bool "IPv6: Segment Routing HMAC support"
depends on IPV6
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_SHA1
- select CRYPTO_SHA256
+ select CRYPTO_LIB_SHA1
+ select CRYPTO_LIB_SHA256
+ select CRYPTO_LIB_UTILS
help
Support for HMAC signature generation and verification
of SR-enabled packets.
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index f17a5dd4789f..40e9c336f6c5 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -7238,7 +7238,9 @@ static const struct ctl_table addrconf_sysctl[] = {
.data = &ipv6_devconf.rpl_seg_enabled,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = SYSCTL_ONE,
},
{
.procname = "ioam6_enabled",
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 1992621e3f3f..1b0314644e0c 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -857,7 +857,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
return PTR_ERR(dst);
}
- ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, false, false);
}
return 0;
diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c
index eb474f0987ae..95372e0f1d21 100644
--- a/net/ipv6/ah6.c
+++ b/net/ipv6/ah6.c
@@ -46,6 +46,34 @@ struct ah_skb_cb {
#define AH_SKB_CB(__skb) ((struct ah_skb_cb *)&((__skb)->cb[0]))
+/* Helper to save IPv6 addresses and extension headers to temporary storage */
+static inline void ah6_save_hdrs(struct tmp_ext *iph_ext,
+ struct ipv6hdr *top_iph, int extlen)
+{
+ if (!extlen)
+ return;
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ iph_ext->saddr = top_iph->saddr;
+#endif
+ iph_ext->daddr = top_iph->daddr;
+ memcpy(&iph_ext->hdrs, top_iph + 1, extlen - sizeof(*iph_ext));
+}
+
+/* Helper to restore IPv6 addresses and extension headers from temporary storage */
+static inline void ah6_restore_hdrs(struct ipv6hdr *top_iph,
+ struct tmp_ext *iph_ext, int extlen)
+{
+ if (!extlen)
+ return;
+
+#if IS_ENABLED(CONFIG_IPV6_MIP6)
+ top_iph->saddr = iph_ext->saddr;
+#endif
+ top_iph->daddr = iph_ext->daddr;
+ memcpy(top_iph + 1, &iph_ext->hdrs, extlen - sizeof(*iph_ext));
+}
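The scratch layout the two helpers imply, matching ah6.c's struct tmp_ext: the mutable source address is saved only with MIP6, then the destination address, then the raw extension headers:

struct example_tmp_ext {
#if IS_ENABLED(CONFIG_IPV6_MIP6)
	struct in6_addr saddr;
#endif
	struct in6_addr daddr;
	char hdrs[];
};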
+
static void *ah_alloc_tmp(struct crypto_ahash *ahash, int nfrags,
unsigned int size)
{
@@ -301,13 +329,7 @@ static void ah6_output_done(void *data, int err)
memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
memcpy(top_iph, iph_base, IPV6HDR_BASELEN);
- if (extlen) {
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- memcpy(&top_iph->saddr, iph_ext, extlen);
-#else
- memcpy(&top_iph->daddr, iph_ext, extlen);
-#endif
- }
+ ah6_restore_hdrs(top_iph, iph_ext, extlen);
kfree(AH_SKB_CB(skb)->tmp);
xfrm_output_resume(skb->sk, skb, err);
@@ -378,12 +400,8 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
*/
memcpy(iph_base, top_iph, IPV6HDR_BASELEN);
+ ah6_save_hdrs(iph_ext, top_iph, extlen);
if (extlen) {
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- memcpy(iph_ext, &top_iph->saddr, extlen);
-#else
- memcpy(iph_ext, &top_iph->daddr, extlen);
-#endif
err = ipv6_clear_mutable_options(top_iph,
extlen - sizeof(*iph_ext) +
sizeof(*top_iph),
@@ -434,13 +452,7 @@ static int ah6_output(struct xfrm_state *x, struct sk_buff *skb)
memcpy(ah->auth_data, icv, ahp->icv_trunc_len);
memcpy(top_iph, iph_base, IPV6HDR_BASELEN);
- if (extlen) {
-#if IS_ENABLED(CONFIG_IPV6_MIP6)
- memcpy(&top_iph->saddr, iph_ext, extlen);
-#else
- memcpy(&top_iph->daddr, iph_ext, extlen);
-#endif
- }
+ ah6_restore_hdrs(top_iph, iph_ext, extlen);
out_free:
kfree(iph_base);
diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c
index f8a8e46286b8..52599584422b 100644
--- a/net/ipv6/anycast.c
+++ b/net/ipv6/anycast.c
@@ -104,7 +104,7 @@ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
rcu_read_lock();
rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);
if (rt) {
- dev = dst_dev(&rt->dst);
+ dev = dst_dev_rcu(&rt->dst);
netdev_hold(dev, &dev_tracker, GFP_ATOMIC);
ip6_rt_put(rt);
} else if (ishost) {
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 972bf0426d59..33ebe93d80e3 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -1068,5 +1068,5 @@ void __ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp,
0,
sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
- atomic_read(&sp->sk_drops));
+ sk_drops_read(sp));
}
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 72adfc107b55..e75da98f5283 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -149,8 +149,8 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x)
dport = encap->encap_dport;
spin_unlock_bh(&x->lock);
- sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6,
- dport, &x->props.saddr.in6, ntohs(sport), 0, 0);
+ sk = __inet6_lookup_established(net, &x->id.daddr.in6, dport,
+ &x->props.saddr.in6, ntohs(sport), 0, 0);
if (!sk)
return ERR_PTR(-ENOENT);
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 44550957fd4e..56c974cf75d1 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -209,7 +209,8 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
* this lookup should be more aggressive (not longer than timeout).
*/
dst = ip6_route_output(net, sk, fl6);
- dev = dst_dev(dst);
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
if (dst->error) {
IP6_INC_STATS(net, ip6_dst_idev(dst),
IPSTATS_MIB_OUTNOROUTES);
@@ -224,14 +225,12 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type,
if (rt->rt6i_dst.plen < 128)
tmo >>= ((128 - rt->rt6i_dst.plen)>>5);
- rcu_read_lock();
peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr);
res = inet_peer_xrlim_allow(peer, tmo);
- rcu_read_unlock();
}
+ rcu_read_unlock();
if (!res)
- __ICMP6_INC_STATS(net, ip6_dst_idev(dst),
- ICMP6_MIB_RATELIMITHOST);
+ __ICMP6_INC_STATS(net, NULL, ICMP6_MIB_RATELIMITHOST);
else
icmp_global_consume(net);
dst_release(dst);
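dst_dev_rcu() is only safe inside an RCU read-side critical section, so the lock is now taken before the device pointer is fetched and held until its last use, replacing the narrower lock that used to cover just the peer lookup. The resulting shape, condensed from the hunk above:

  rcu_read_lock();
  dev = dst_dev_rcu(dst);         /* dev pointer valid only under RCU */
  ...
  peer = inet_getpeer_v6(net->ipv6.peers, &fl6->daddr);
  res = inet_peer_xrlim_allow(peer, tmo);
  rcu_read_unlock();              /* one unlock now covers both uses */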
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 333e43434dd7..ea5cf3fdfdd6 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -91,7 +91,7 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_p);
if (!IS_ERR(dst))
- ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, false, false);
}
return dst;
}
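ip6_dst_store() now takes two booleans ("is the cached daddr/saddr still valid") instead of pointers to the addresses to cache, so call sites change mechanically:

  ip6_dst_store(sk, dst, NULL, NULL);     /* old: pointer arguments */
  ip6_dst_store(sk, dst, false, false);   /* new: validity flags */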
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 76ee521189eb..5e1da088d8e1 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -47,24 +47,23 @@ EXPORT_SYMBOL_GPL(inet6_ehashfn);
* The sockhash lock must be held as a reader here.
*/
struct sock *__inet6_lookup_established(const struct net *net,
- struct inet_hashinfo *hashinfo,
- const struct in6_addr *saddr,
- const __be16 sport,
- const struct in6_addr *daddr,
- const u16 hnum,
- const int dif, const int sdif)
+ const struct in6_addr *saddr,
+ const __be16 sport,
+ const struct in6_addr *daddr,
+ const u16 hnum,
+ const int dif, const int sdif)
{
- struct sock *sk;
- const struct hlist_nulls_node *node;
const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- unsigned int hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
- unsigned int slot = hash & hashinfo->ehash_mask;
- struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
-
+ const struct hlist_nulls_node *node;
+ struct inet_ehash_bucket *head;
+ struct inet_hashinfo *hashinfo;
+ unsigned int hash, slot;
+ struct sock *sk;
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
+ hash = inet6_ehashfn(net, daddr, hnum, saddr, sport);
+ slot = hash & hashinfo->ehash_mask;
+ head = &hashinfo->ehash[slot];
begin:
sk_nulls_for_each_rcu(sk, node, &head->chain) {
if (sk->sk_hash != hash)
@@ -200,19 +199,20 @@ struct sock *inet6_lookup_run_sk_lookup(const struct net *net,
EXPORT_SYMBOL_GPL(inet6_lookup_run_sk_lookup);
struct sock *inet6_lookup_listener(const struct net *net,
- struct inet_hashinfo *hashinfo,
- struct sk_buff *skb, int doff,
- const struct in6_addr *saddr,
- const __be16 sport, const struct in6_addr *daddr,
- const unsigned short hnum, const int dif, const int sdif)
+ struct sk_buff *skb, int doff,
+ const struct in6_addr *saddr,
+ const __be16 sport,
+ const struct in6_addr *daddr,
+ const unsigned short hnum,
+ const int dif, const int sdif)
{
struct inet_listen_hashbucket *ilb2;
+ struct inet_hashinfo *hashinfo;
struct sock *result = NULL;
unsigned int hash2;
/* Lookup redirect from BPF */
- if (static_branch_unlikely(&bpf_sk_lookup_enabled) &&
- hashinfo == net->ipv4.tcp_death_row.hashinfo) {
+ if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
result = inet6_lookup_run_sk_lookup(net, IPPROTO_TCP, skb, doff,
saddr, sport, daddr, hnum, dif,
inet6_ehashfn);
@@ -220,6 +220,7 @@ struct sock *inet6_lookup_listener(const struct net *net,
goto done;
}
+ hashinfo = net->ipv4.tcp_death_row.hashinfo;
hash2 = ipv6_portaddr_hash(net, daddr, hnum);
ilb2 = inet_lhash2_bucket(hashinfo, hash2);
@@ -244,7 +245,6 @@ done:
EXPORT_SYMBOL_GPL(inet6_lookup_listener);
struct sock *inet6_lookup(const struct net *net,
- struct inet_hashinfo *hashinfo,
struct sk_buff *skb, int doff,
const struct in6_addr *saddr, const __be16 sport,
const struct in6_addr *daddr, const __be16 dport,
@@ -253,7 +253,7 @@ struct sock *inet6_lookup(const struct net *net,
struct sock *sk;
bool refcounted;
- sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
+ sk = __inet6_lookup(net, skb, doff, saddr, sport, daddr,
ntohs(dport), dif, 0, &refcounted);
if (sk && !refcounted && !refcount_inc_not_zero(&sk->sk_refcnt))
sk = NULL;
@@ -305,8 +305,7 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
dif, sdif))) {
if (sk2->sk_state == TCP_TIME_WAIT) {
tw = inet_twsk(sk2);
- if (sk->sk_protocol == IPPROTO_TCP &&
- tcp_twsk_unique(sk, sk2, twp))
+ if (tcp_twsk_unique(sk, sk2, twp))
break;
}
goto not_unique;
@@ -369,14 +368,3 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row,
__inet6_check_established);
}
EXPORT_SYMBOL_GPL(inet6_hash_connect);
-
-int inet6_hash(struct sock *sk)
-{
- int err = 0;
-
- if (sk->sk_state != TCP_CLOSE)
- err = __inet_hash(sk, NULL);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(inet6_hash);
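With the hashinfo parameter gone, the IPv6 lookup helpers fetch net->ipv4.tcp_death_row.hashinfo themselves and every caller shrinks by one argument, as in this before/after taken from the tcp_ipv6.c hunks later in the patch:

  sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
                                  &hdr->daddr, th->dest,
                                  &hdr->saddr, ntohs(th->source),
                                  skb->dev->ifindex, inet6_sdif(skb));

  sk = __inet6_lookup_established(net, &hdr->daddr, th->dest,
                                  &hdr->saddr, ntohs(th->source),
                                  skb->dev->ifindex, inet6_sdif(skb));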
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 74d49dd6124d..c82a75510c0e 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -329,9 +329,9 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net,
if (parms->name[0]) {
if (!dev_valid_name(parms->name))
return NULL;
- strscpy(name, parms->name, IFNAMSIZ);
+ strscpy(name, parms->name);
} else {
- strcpy(name, "ip6gre%d");
+ strscpy(name, "ip6gre%d");
}
dev = alloc_netdev(sizeof(*t), name, NET_NAME_UNKNOWN,
ip6gre_tunnel_setup);
@@ -1469,7 +1469,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
tunnel = netdev_priv(dev);
tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+ strscpy(tunnel->parms.name, dev->name);
ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
if (ret)
@@ -1529,7 +1529,7 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev)
tunnel->dev = dev;
tunnel->net = dev_net(dev);
- strcpy(tunnel->parms.name, dev->name);
+ strscpy(tunnel->parms.name, dev->name);
tunnel->hlen = sizeof(struct ipv6hdr) + 4;
}
@@ -1842,7 +1842,7 @@ static int ip6erspan_tap_init(struct net_device *dev)
tunnel = netdev_priv(dev);
tunnel->dev = dev;
- strcpy(tunnel->parms.name, dev->name);
+ strscpy(tunnel->parms.name, dev->name);
ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
if (ret)
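The two-argument strscpy() form infers the destination size from the array type, always NUL-terminates, and, unlike strcpy(), cannot overrun tunnel->parms.name; it only works when the destination is a real array rather than a pointer:

  char name[IFNAMSIZ];

  strscpy(name, parms->name);     /* bound = sizeof(name), NUL-terminated */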
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 1e1410237b6e..f904739e99b9 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -60,7 +60,7 @@
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- struct net_device *dev = dst_dev(dst);
+ struct net_device *dev = dst_dev_rcu(dst);
struct inet6_dev *idev = ip6_dst_idev(dst);
unsigned int hh_len = LL_RESERVED_SPACE(dev);
const struct in6_addr *daddr, *nexthop;
@@ -70,15 +70,12 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
/* Be paranoid, rather than too clever. */
if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
- /* Make sure idev stays alive */
- rcu_read_lock();
+ /* idev stays alive because we hold rcu_read_lock(). */
skb = skb_expand_head(skb, hh_len);
if (!skb) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
- rcu_read_unlock();
return -ENOMEM;
}
- rcu_read_unlock();
}
hdr = ipv6_hdr(skb);
@@ -123,7 +120,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
- rcu_read_lock();
nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
@@ -131,7 +127,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
if (unlikely(!neigh))
neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
if (IS_ERR(neigh)) {
- rcu_read_unlock();
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
return -EINVAL;
@@ -139,7 +134,6 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
}
sock_confirm_neigh(skb, neigh);
ret = neigh_output(neigh, skb, false);
- rcu_read_unlock();
return ret;
}
@@ -233,22 +227,29 @@ static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *s
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- struct net_device *dev = dst_dev(dst), *indev = skb->dev;
- struct inet6_dev *idev = ip6_dst_idev(dst);
+ struct net_device *dev, *indev = skb->dev;
+ struct inet6_dev *idev;
+ int ret;
skb->protocol = htons(ETH_P_IPV6);
+ rcu_read_lock();
+ dev = dst_dev_rcu(dst);
+ idev = ip6_dst_idev(dst);
skb->dev = dev;
if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
+ rcu_read_unlock();
kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
return 0;
}
- return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
- net, sk, skb, indev, dev,
- ip6_finish_output,
- !(IP6CB(skb)->flags & IP6SKB_REROUTED));
+ ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
+ net, sk, skb, indev, dev,
+ ip6_finish_output,
+ !(IP6CB(skb)->flags & IP6SKB_REROUTED));
+ rcu_read_unlock();
+ return ret;
}
EXPORT_SYMBOL(ip6_output);
@@ -268,35 +269,36 @@ bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
__u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
- struct net *net = sock_net(sk);
const struct ipv6_pinfo *np = inet6_sk(sk);
struct in6_addr *first_hop = &fl6->daddr;
struct dst_entry *dst = skb_dst(skb);
- struct net_device *dev = dst_dev(dst);
struct inet6_dev *idev = ip6_dst_idev(dst);
struct hop_jumbo_hdr *hop_jumbo;
int hoplen = sizeof(*hop_jumbo);
+ struct net *net = sock_net(sk);
unsigned int head_room;
+ struct net_device *dev;
struct ipv6hdr *hdr;
u8 proto = fl6->flowi6_proto;
int seg_len = skb->len;
- int hlimit = -1;
+ int ret, hlimit = -1;
u32 mtu;
+ rcu_read_lock();
+
+ dev = dst_dev_rcu(dst);
head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
if (opt)
head_room += opt->opt_nflen + opt->opt_flen;
if (unlikely(head_room > skb_headroom(skb))) {
- /* Make sure idev stays alive */
- rcu_read_lock();
+ /* idev stays alive while we hold rcu_read_lock(). */
skb = skb_expand_head(skb, head_room);
if (!skb) {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
- rcu_read_unlock();
- return -ENOBUFS;
+ ret = -ENOBUFS;
+ goto unlock;
}
- rcu_read_unlock();
}
if (opt) {
@@ -358,17 +360,21 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
* skb to its handler for processing
*/
skb = l3mdev_ip6_out((struct sock *)sk, skb);
- if (unlikely(!skb))
- return 0;
+ if (unlikely(!skb)) {
+ ret = 0;
+ goto unlock;
+ }
/* hooks should never assume socket lock is held.
* we promote our socket to non const
*/
- return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
- net, (struct sock *)sk, skb, NULL, dev,
- dst_output);
+ ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
+ net, (struct sock *)sk, skb, NULL, dev,
+ dst_output);
+ goto unlock;
}
+ ret = -EMSGSIZE;
skb->dev = dev;
/* ipv6_local_error() does not require socket lock,
* we promote our socket to non const
@@ -377,7 +383,9 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
kfree_skb(skb);
- return -EMSGSIZE;
+unlock:
+ rcu_read_unlock();
+ return ret;
}
EXPORT_SYMBOL(ip6_xmit);
@@ -1092,9 +1100,11 @@ static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
* sockets.
* 2. oif also should be the same.
*/
- if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
+ if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
+ np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
#ifdef CONFIG_IPV6_SUBTREES
- ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
+ ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
+ np->saddr_cache ? &np->saddr : NULL) ||
#endif
(fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
dst_release(dst);
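Once the RCU read lock spans the whole transmit path, every early return has to become a jump to a common exit, which is why ip6_xmit() grows a ret variable and an unlock label. The skeleton, abridged from the hunks above:

  rcu_read_lock();
  dev = dst_dev_rcu(dst);
  ...
  if (!skb) {                     /* any failure path */
          ret = -ENOBUFS;
          goto unlock;
  }
  ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                net, (struct sock *)sk, skb, NULL, dev, dst_output);
  unlock:
          rcu_read_unlock();
          return ret;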
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index e66ec623972e..a61e742794f9 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -49,6 +49,7 @@
#include <net/xfrm.h>
#include <net/compat.h>
#include <net/seg6.h>
+#include <net/psp.h>
#include <linux/uaccess.h>
@@ -107,7 +108,10 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) &&
inet_sk(sk)->inet_daddr != LOOPBACK4_IPV6) {
struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen;
+
+ icsk->icsk_ext_hdr_len =
+ psp_sk_overhead(sk) +
+ opt->opt_flen + opt->opt_nflen;
icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
}
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 36ca27496b3c..016b572e7d6f 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -169,6 +169,29 @@ static int unsolicited_report_interval(struct inet6_dev *idev)
return iv > 0 ? iv : 1;
}
+static struct net_device *ip6_mc_find_dev(struct net *net,
+ const struct in6_addr *group,
+ int ifindex)
+{
+ struct net_device *dev = NULL;
+ struct rt6_info *rt;
+
+ if (ifindex == 0) {
+ rcu_read_lock();
+ rt = rt6_lookup(net, group, NULL, 0, NULL, 0);
+ if (rt) {
+ dev = dst_dev_rcu(&rt->dst);
+ dev_hold(dev);
+ ip6_rt_put(rt);
+ }
+ rcu_read_unlock();
+ } else {
+ dev = dev_get_by_index(net, ifindex);
+ }
+
+ return dev;
+}
+
/*
* socket join on multicast group
*/
@@ -191,28 +214,13 @@ static int __ipv6_sock_mc_join(struct sock *sk, int ifindex,
}
mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL);
-
if (!mc_lst)
return -ENOMEM;
mc_lst->next = NULL;
mc_lst->addr = *addr;
- if (ifindex == 0) {
- struct rt6_info *rt;
-
- rcu_read_lock();
- rt = rt6_lookup(net, addr, NULL, 0, NULL, 0);
- if (rt) {
- dev = dst_dev(&rt->dst);
- dev_hold(dev);
- ip6_rt_put(rt);
- }
- rcu_read_unlock();
- } else {
- dev = dev_get_by_index(net, ifindex);
- }
-
+ dev = ip6_mc_find_dev(net, addr, ifindex);
if (!dev) {
sock_kfree_s(sk, mc_lst, sizeof(*mc_lst));
return -ENODEV;
@@ -302,27 +310,14 @@ int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr)
}
EXPORT_SYMBOL(ipv6_sock_mc_drop);
-static struct inet6_dev *ip6_mc_find_dev(struct net *net,
- const struct in6_addr *group,
- int ifindex)
+static struct inet6_dev *ip6_mc_find_idev(struct net *net,
+ const struct in6_addr *group,
+ int ifindex)
{
- struct net_device *dev = NULL;
+ struct net_device *dev;
struct inet6_dev *idev;
- if (ifindex == 0) {
- struct rt6_info *rt;
-
- rcu_read_lock();
- rt = rt6_lookup(net, group, NULL, 0, NULL, 0);
- if (rt) {
- dev = dst_dev(&rt->dst);
- dev_hold(dev);
- ip6_rt_put(rt);
- }
- rcu_read_unlock();
- } else {
- dev = dev_get_by_index(net, ifindex);
- }
+ dev = ip6_mc_find_dev(net, group, ifindex);
if (!dev)
return NULL;
@@ -374,7 +369,7 @@ int ip6_mc_source(int add, int omode, struct sock *sk,
if (!ipv6_addr_is_multicast(group))
return -EINVAL;
- idev = ip6_mc_find_dev(net, group, pgsr->gsr_interface);
+ idev = ip6_mc_find_idev(net, group, pgsr->gsr_interface);
if (!idev)
return -ENODEV;
@@ -509,7 +504,7 @@ int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf,
gsf->gf_fmode != MCAST_EXCLUDE)
return -EINVAL;
- idev = ip6_mc_find_dev(net, group, gsf->gf_interface);
+ idev = ip6_mc_find_idev(net, group, gsf->gf_interface);
if (!idev)
return -ENODEV;
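The rename splits device lookup from idev lookup: ip6_mc_find_dev() now returns the net_device (resolving via rt6_lookup() when ifindex is 0) so __ipv6_sock_mc_join() can share it, while ip6_mc_find_idev() layers the inet6_dev lookup on top. Roughly, with the wrapper body paraphrased rather than copied from the kernel:

  static struct inet6_dev *ip6_mc_find_idev(struct net *net,
                                            const struct in6_addr *group,
                                            int ifindex)
  {
          struct net_device *dev = ip6_mc_find_dev(net, group, ifindex);
          struct inet6_dev *idev;

          if (!dev)
                  return NULL;
          idev = in6_dev_get(dev);   /* take the idev reference */
          dev_put(dev);              /* drop the dev ref from the lookup */
          return idev;
  }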
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 7d5abb3158ec..f427e41e9c49 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -130,7 +130,7 @@ struct neigh_table nd_tbl = {
[NEIGH_VAR_DELAY_PROBE_TIME] = 5 * HZ,
[NEIGH_VAR_INTERVAL_PROBE_TIME_MS] = 5 * HZ,
[NEIGH_VAR_GC_STALETIME] = 60 * HZ,
- [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_MAX,
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = SK_WMEM_DEFAULT,
[NEIGH_VAR_PROXY_QLEN] = 64,
[NEIGH_VAR_ANYCAST_DELAY] = 1 * HZ,
[NEIGH_VAR_PROXY_DELAY] = (8 * HZ) / 10,
@@ -505,7 +505,7 @@ void ndisc_send_skb(struct sk_buff *skb, const struct in6_addr *daddr,
ip6_nd_hdr(skb, saddr, daddr, READ_ONCE(inet6_sk(sk)->hop_limit), skb->len);
- dev = dst_dev(dst);
+ dev = dst_dev_rcu(dst);
idev = __in6_dev_get(dev);
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 45f9105f9ac1..46540a5a4331 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -63,7 +63,10 @@ int ip6_route_me_harder(struct net *net, struct sock *sk_partial, struct sk_buff
#ifdef CONFIG_XFRM
if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
xfrm_decode_session(net, skb, flowi6_to_flowi(&fl6), AF_INET6) == 0) {
- skb_dst_set(skb, NULL);
+ /* ignore return value from skb_dstref_steal, xfrm_lookup takes
+ * care of dropping the refcnt if needed.
+ */
+ skb_dstref_steal(skb);
dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
if (IS_ERR(dst))
return PTR_ERR(dst);
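skb_dstref_steal() clears the skb's dst slot without touching the refcount, which is exactly what this call site wants: the reference it already holds is handed straight to xfrm_lookup(), which either returns a usable entry or takes care of dropping the input reference. Conceptually:

  skb_dstref_steal(skb);          /* detach dst from skb, refcount untouched */
  dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), sk, 0);
  /* xfrm_lookup() drops the input dst's reference when needed */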
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index cb2d38e80de9..ef5b7e85cffa 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -12,6 +12,19 @@
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>
+static struct ipv6hdr *
+nf_reject_ip6hdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int hoplimit);
+static void
+nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ const struct tcphdr *oth, unsigned int otcplen);
+static const struct tcphdr *
+nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *otcph,
+ unsigned int *otcplen, int hook);
+
static bool nf_reject_v6_csum_ok(struct sk_buff *skb, int hook)
{
const struct ipv6hdr *ip6h = ipv6_hdr(skb);
@@ -91,6 +104,32 @@ struct sk_buff *nf_reject_skb_v6_tcp_reset(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_reject_skb_v6_tcp_reset);
+static bool nf_skb_is_icmp6_unreach(const struct sk_buff *skb)
+{
+ const struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ u8 proto = ip6h->nexthdr;
+ u8 _type, *tp;
+ int thoff;
+ __be16 fo;
+
+ thoff = ipv6_skip_exthdr(skb, ((u8 *)(ip6h + 1) - skb->data), &proto, &fo);
+
+ if (thoff < 0 || thoff >= skb->len || fo != 0)
+ return false;
+
+ if (proto != IPPROTO_ICMPV6)
+ return false;
+
+ tp = skb_header_pointer(skb,
+ thoff + offsetof(struct icmp6hdr, icmp6_type),
+ sizeof(_type), &_type);
+
+ if (!tp)
+ return false;
+
+ return *tp == ICMPV6_DEST_UNREACH;
+}
+
struct sk_buff *nf_reject_skb_v6_unreach(struct net *net,
struct sk_buff *oldskb,
const struct net_device *dev,
@@ -104,6 +143,10 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net,
if (!nf_reject_ip6hdr_validate(oldskb))
return NULL;
+ /* Don't reply to ICMPV6_DEST_UNREACH with ICMPV6_DEST_UNREACH */
+ if (nf_skb_is_icmp6_unreach(oldskb))
+ return NULL;
+
/* Include "As much of invoking packet as possible without the ICMPv6
* packet exceeding the minimum IPv6 MTU" in the ICMP payload.
*/
@@ -146,9 +189,10 @@ struct sk_buff *nf_reject_skb_v6_unreach(struct net *net,
}
EXPORT_SYMBOL_GPL(nf_reject_skb_v6_unreach);
-const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
- struct tcphdr *otcph,
- unsigned int *otcplen, int hook)
+static const struct tcphdr *
+nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
+ struct tcphdr *otcph,
+ unsigned int *otcplen, int hook)
{
const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
u8 proto;
@@ -192,11 +236,11 @@ const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
return otcph;
}
-EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_get);
-struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb,
- const struct sk_buff *oldskb,
- __u8 protocol, int hoplimit)
+static struct ipv6hdr *
+nf_reject_ip6hdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ __u8 protocol, int hoplimit)
{
struct ipv6hdr *ip6h;
const struct ipv6hdr *oip6h = ipv6_hdr(oldskb);
@@ -216,11 +260,11 @@ struct ipv6hdr *nf_reject_ip6hdr_put(struct sk_buff *nskb,
return ip6h;
}
-EXPORT_SYMBOL_GPL(nf_reject_ip6hdr_put);
-void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
- const struct sk_buff *oldskb,
- const struct tcphdr *oth, unsigned int otcplen)
+static void
+nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
+ const struct sk_buff *oldskb,
+ const struct tcphdr *oth, unsigned int otcplen)
{
struct tcphdr *tcph;
@@ -248,7 +292,6 @@ void nf_reject_ip6_tcphdr_put(struct sk_buff *nskb,
csum_partial(tcph,
sizeof(struct tcphdr), 0));
}
-EXPORT_SYMBOL_GPL(nf_reject_ip6_tcphdr_put);
static int nf_reject6_fill_skb_dst(struct sk_buff *skb_in)
{
diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c
index 9ea5ef56cb27..ced8bd44828e 100644
--- a/net/ipv6/netfilter/nf_socket_ipv6.c
+++ b/net/ipv6/netfilter/nf_socket_ipv6.c
@@ -83,8 +83,7 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
{
switch (protocol) {
case IPPROTO_TCP:
- return inet6_lookup(net, net->ipv4.tcp_death_row.hashinfo,
- skb, doff, saddr, sport, daddr, dport,
+ return inet6_lookup(net, skb, doff, saddr, sport, daddr, dport,
in->ifindex);
case IPPROTO_UDP:
return udp6_lib_lookup(net, saddr, sport, daddr, dport,
diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c
index 52f828bb5a83..b2f59ed9d7cc 100644
--- a/net/ipv6/netfilter/nf_tproxy_ipv6.c
+++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c
@@ -80,7 +80,6 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
const struct net_device *in,
const enum nf_tproxy_lookup_t lookup_type)
{
- struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo;
struct sock *sk;
switch (protocol) {
@@ -94,7 +93,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
switch (lookup_type) {
case NF_TPROXY_LOOKUP_LISTENER:
- sk = inet6_lookup_listener(net, hinfo, skb,
+ sk = inet6_lookup_listener(net, skb,
thoff + __tcp_hdrlen(hp),
saddr, sport,
daddr, ntohs(dport),
@@ -109,7 +108,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff,
*/
break;
case NF_TPROXY_LOOKUP_ESTABLISHED:
- sk = __inet6_lookup_established(net, hinfo, saddr, sport, daddr,
+ sk = __inet6_lookup_established(net, saddr, sport, daddr,
ntohs(dport), in->ifindex, 0);
break;
default:
diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c
index d21fe27fe21e..1c9b283a4132 100644
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -104,18 +104,20 @@ EXPORT_SYMBOL(ip6_find_1stfragopt);
int ip6_dst_hoplimit(struct dst_entry *dst)
{
int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
+
+ rcu_read_lock();
if (hoplimit == 0) {
- struct net_device *dev = dst_dev(dst);
+ struct net_device *dev = dst_dev_rcu(dst);
struct inet6_dev *idev;
- rcu_read_lock();
idev = __in6_dev_get(dev);
if (idev)
hoplimit = READ_ONCE(idev->cnf.hop_limit);
else
hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit);
- rcu_read_unlock();
}
+ rcu_read_unlock();
+
return hoplimit;
}
EXPORT_SYMBOL(ip6_dst_hoplimit);
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 82b0492923d4..d7a2cdaa2631 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -208,7 +208,6 @@ struct proto pingv6_prot = {
.recvmsg = ping_recvmsg,
.bind = ping_bind,
.backlog_rcv = ping_queue_rcv_skb,
- .hash = ping_hash,
.unhash = ping_unhash,
.get_port = ping_get_port,
.put_port = ping_unhash,
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 752327b10dde..73296f38c252 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -85,7 +85,6 @@ static const struct snmp_mib snmp6_ipstats_list[] = {
SNMP_MIB_ITEM("Ip6InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
SNMP_MIB_ITEM("Ip6InCEPkts", IPSTATS_MIB_CEPKTS),
SNMP_MIB_ITEM("Ip6OutTransmits", IPSTATS_MIB_OUTPKTS),
- SNMP_MIB_SENTINEL
};
static const struct snmp_mib snmp6_icmp6_list[] = {
@@ -95,30 +94,10 @@ static const struct snmp_mib snmp6_icmp6_list[] = {
SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS),
SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS),
SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS),
+/* ICMP6_MIB_RATELIMITHOST needs to be last, see snmp6_dev_seq_show(). */
SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST),
- SNMP_MIB_SENTINEL
};
-/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */
-static const char *const icmp6type2name[256] = {
- [ICMPV6_DEST_UNREACH] = "DestUnreachs",
- [ICMPV6_PKT_TOOBIG] = "PktTooBigs",
- [ICMPV6_TIME_EXCEED] = "TimeExcds",
- [ICMPV6_PARAMPROB] = "ParmProblems",
- [ICMPV6_ECHO_REQUEST] = "Echos",
- [ICMPV6_ECHO_REPLY] = "EchoReplies",
- [ICMPV6_MGM_QUERY] = "GroupMembQueries",
- [ICMPV6_MGM_REPORT] = "GroupMembResponses",
- [ICMPV6_MGM_REDUCTION] = "GroupMembReductions",
- [ICMPV6_MLD2_REPORT] = "MLDv2Reports",
- [NDISC_ROUTER_ADVERTISEMENT] = "RouterAdvertisements",
- [NDISC_ROUTER_SOLICITATION] = "RouterSolicits",
- [NDISC_NEIGHBOUR_ADVERTISEMENT] = "NeighborAdvertisements",
- [NDISC_NEIGHBOUR_SOLICITATION] = "NeighborSolicits",
- [NDISC_REDIRECT] = "Redirects",
-};
-
-
static const struct snmp_mib snmp6_udp6_list[] = {
SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS),
SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS),
@@ -129,7 +108,6 @@ static const struct snmp_mib snmp6_udp6_list[] = {
SNMP_MIB_ITEM("Udp6InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_ITEM("Udp6IgnoredMulti", UDP_MIB_IGNOREDMULTI),
SNMP_MIB_ITEM("Udp6MemErrors", UDP_MIB_MEMERRORS),
- SNMP_MIB_SENTINEL
};
static const struct snmp_mib snmp6_udplite6_list[] = {
@@ -141,7 +119,6 @@ static const struct snmp_mib snmp6_udplite6_list[] = {
SNMP_MIB_ITEM("UdpLite6SndbufErrors", UDP_MIB_SNDBUFERRORS),
SNMP_MIB_ITEM("UdpLite6InCsumErrors", UDP_MIB_CSUMERRORS),
SNMP_MIB_ITEM("UdpLite6MemErrors", UDP_MIB_MEMERRORS),
- SNMP_MIB_SENTINEL
};
static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib)
@@ -151,11 +128,31 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib)
/* print by name -- deprecated items */
for (i = 0; i < ICMP6MSG_MIB_MAX; i++) {
+ const char *p = NULL;
int icmptype;
- const char *p;
+
+#define CASE(TYP, STR) case TYP: p = STR; break;
icmptype = i & 0xff;
- p = icmp6type2name[icmptype];
+ switch (icmptype) {
+/* RFC 4293 v6 ICMPMsgStatsTable; named items for RFC 2466 compatibility */
+ CASE(ICMPV6_DEST_UNREACH, "DestUnreachs")
+ CASE(ICMPV6_PKT_TOOBIG, "PktTooBigs")
+ CASE(ICMPV6_TIME_EXCEED, "TimeExcds")
+ CASE(ICMPV6_PARAMPROB, "ParmProblems")
+ CASE(ICMPV6_ECHO_REQUEST, "Echos")
+ CASE(ICMPV6_ECHO_REPLY, "EchoReplies")
+ CASE(ICMPV6_MGM_QUERY, "GroupMembQueries")
+ CASE(ICMPV6_MGM_REPORT, "GroupMembResponses")
+ CASE(ICMPV6_MGM_REDUCTION, "GroupMembReductions")
+ CASE(ICMPV6_MLD2_REPORT, "MLDv2Reports")
+ CASE(NDISC_ROUTER_ADVERTISEMENT, "RouterAdvertisements")
+ CASE(NDISC_ROUTER_SOLICITATION, "RouterSolicits")
+ CASE(NDISC_NEIGHBOUR_ADVERTISEMENT, "NeighborAdvertisements")
+ CASE(NDISC_NEIGHBOUR_SOLICITATION, "NeighborSolicits")
+ CASE(NDISC_REDIRECT, "Redirects")
+ }
+#undef CASE
if (!p) /* don't print un-named types here */
continue;
snprintf(name, sizeof(name), "Icmp6%s%s",
@@ -182,35 +179,37 @@ static void snmp6_seq_show_icmpv6msg(struct seq_file *seq, atomic_long_t *smib)
*/
static void snmp6_seq_show_item(struct seq_file *seq, void __percpu *pcpumib,
atomic_long_t *smib,
- const struct snmp_mib *itemlist)
+ const struct snmp_mib *itemlist,
+ int cnt)
{
unsigned long buff[SNMP_MIB_MAX];
int i;
if (pcpumib) {
- memset(buff, 0, sizeof(unsigned long) * SNMP_MIB_MAX);
+ memset(buff, 0, sizeof(unsigned long) * cnt);
- snmp_get_cpu_field_batch(buff, itemlist, pcpumib);
- for (i = 0; itemlist[i].name; i++)
+ snmp_get_cpu_field_batch_cnt(buff, itemlist, cnt, pcpumib);
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-32s\t%lu\n",
itemlist[i].name, buff[i]);
} else {
- for (i = 0; itemlist[i].name; i++)
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name,
atomic_long_read(smib + itemlist[i].entry));
}
}
static void snmp6_seq_show_item64(struct seq_file *seq, void __percpu *mib,
- const struct snmp_mib *itemlist, size_t syncpoff)
+ const struct snmp_mib *itemlist,
+ int cnt, size_t syncpoff)
{
u64 buff64[SNMP_MIB_MAX];
int i;
- memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX);
+ memset(buff64, 0, sizeof(u64) * cnt);
- snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff);
- for (i = 0; itemlist[i].name; i++)
+ snmp_get_cpu_field64_batch_cnt(buff64, itemlist, cnt, mib, syncpoff);
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-32s\t%llu\n", itemlist[i].name, buff64[i]);
}
@@ -219,14 +218,19 @@ static int snmp6_seq_show(struct seq_file *seq, void *v)
struct net *net = (struct net *)seq->private;
snmp6_seq_show_item64(seq, net->mib.ipv6_statistics,
- snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
+ snmp6_ipstats_list,
+ ARRAY_SIZE(snmp6_ipstats_list),
+ offsetof(struct ipstats_mib, syncp));
snmp6_seq_show_item(seq, net->mib.icmpv6_statistics,
- NULL, snmp6_icmp6_list);
+ NULL, snmp6_icmp6_list,
+ ARRAY_SIZE(snmp6_icmp6_list));
snmp6_seq_show_icmpv6msg(seq, net->mib.icmpv6msg_statistics->mibs);
snmp6_seq_show_item(seq, net->mib.udp_stats_in6,
- NULL, snmp6_udp6_list);
+ NULL, snmp6_udp6_list,
+ ARRAY_SIZE(snmp6_udp6_list));
snmp6_seq_show_item(seq, net->mib.udplite_stats_in6,
- NULL, snmp6_udplite6_list);
+ NULL, snmp6_udplite6_list,
+ ARRAY_SIZE(snmp6_udplite6_list));
return 0;
}
@@ -236,9 +240,14 @@ static int snmp6_dev_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex);
snmp6_seq_show_item64(seq, idev->stats.ipv6,
- snmp6_ipstats_list, offsetof(struct ipstats_mib, syncp));
+ snmp6_ipstats_list,
+ ARRAY_SIZE(snmp6_ipstats_list),
+ offsetof(struct ipstats_mib, syncp));
+
+ /* Per idev icmp stats do not have ICMP6_MIB_RATELIMITHOST */
snmp6_seq_show_item(seq, NULL, idev->stats.icmpv6dev->mibs,
- snmp6_icmp6_list);
+ snmp6_icmp6_list, ARRAY_SIZE(snmp6_icmp6_list) - 1);
+
snmp6_seq_show_icmpv6msg(seq, idev->stats.icmpv6msgdev->mibs);
return 0;
}
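Removing SNMP_MIB_SENTINEL means the tables are no longer terminated by a NULL name; every consumer iterates to an explicit count instead, which is what lets snmp6_dev_seq_show() pass ARRAY_SIZE(...) - 1 to skip the deliberately last ICMP6_MIB_RATELIMITHOST entry. A self-contained userspace illustration of the counted-table pattern (names and values invented):

  #include <stdio.h>

  #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

  struct mib_item { const char *name; unsigned long val; };

  static const struct mib_item icmp6[] = {
          { "Icmp6InMsgs",          10 },
          { "Icmp6OutMsgs",          7 },
          { "Icmp6OutRateLimitHost", 1 },   /* kept last on purpose */
  };

  static void show(const struct mib_item *items, int cnt)
  {
          for (int i = 0; i < cnt; i++)     /* bounded by cnt, no sentinel */
                  printf("%-32s\t%lu\n", items[i].name, items[i].val);
  }

  int main(void)
  {
          show(icmp6, ARRAY_SIZE(icmp6));       /* global stats: whole table */
          show(icmp6, ARRAY_SIZE(icmp6) - 1);   /* per-device: drop last item */
          return 0;
  }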
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 4c3f8245c40f..e369f54844dd 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -163,7 +163,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr)
if (atomic_read(&sk->sk_rmem_alloc) >=
READ_ONCE(sk->sk_rcvbuf)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
continue;
}
@@ -361,7 +361,7 @@ static inline int rawv6_rcv_skb(struct sock *sk, struct sk_buff *skb)
if ((raw6_sk(sk)->checksum || rcu_access_pointer(sk->sk_filter)) &&
skb_checksum_complete(skb)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM);
return NET_RX_DROP;
}
@@ -389,7 +389,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
struct raw6_sock *rp = raw6_sk(sk);
if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_XFRM_POLICY);
return NET_RX_DROP;
}
@@ -414,7 +414,7 @@ int rawv6_rcv(struct sock *sk, struct sk_buff *skb)
if (inet_test_bit(HDRINCL, sk)) {
if (skb_checksum_complete(skb)) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_SKB_CSUM);
return NET_RX_DROP;
}
@@ -445,7 +445,7 @@ static int rawv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (flags & MSG_ERRQUEUE)
return ipv6_recv_error(sk, msg, len, addr_len);
- if (np->rxpmtu && np->rxopt.bits.rxpmtu)
+ if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu))
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
skb = skb_recv_datagram(sk, flags, &err);
@@ -1175,6 +1175,7 @@ static int rawv6_init_sk(struct sock *sk)
{
struct raw6_sock *rp = raw6_sk(sk);
+ sk->sk_drop_counters = &rp->drop_counters;
switch (inet_sk(sk)->inet_num) {
case IPPROTO_ICMPV6:
rp->checksum = 1;
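Drop accounting now goes through sk_drops_inc()/sk_drops_read() (udp_drops_inc() for UDP) rather than open-coded atomics on sk->sk_drops, and rawv6_init_sk() points sk->sk_drop_counters at a per-protocol counter block. Hiding the counter behind accessors lets the backing storage change, for instance to per-CPU counters, in one place. A hypothetical minimal shape of such wrappers, not the kernel's actual definitions:

  static inline void sk_drops_inc(struct sock *sk)
  {
          atomic_inc(&sk->sk_drops);          /* storage strategy lives here */
  }

  static inline int sk_drops_read(const struct sock *sk)
  {
          return atomic_read(&sk->sk_drops);
  }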
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 3299cfa12e21..aee6a10b112a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2943,7 +2943,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (res.f6i->nh) {
struct fib6_nh_match_arg arg = {
- .dev = dst_dev(dst),
+ .dev = dst_dev_rcu(dst),
.gw = &rt6->rt6i_gateway,
};
@@ -3032,13 +3032,12 @@ void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
#endif
ip6_dst_store(sk, dst,
- ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
- &sk->sk_v6_daddr : NULL,
+ ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr),
#ifdef CONFIG_IPV6_SUBTREES
ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
- &np->saddr :
+ true :
#endif
- NULL);
+ false);
}
static bool ip6_redirect_nh_match(const struct fib6_result *res,
@@ -3238,7 +3237,6 @@ EXPORT_SYMBOL_GPL(ip6_sk_redirect);
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
- struct net_device *dev = dst_dev(dst);
unsigned int mtu = dst_mtu(dst);
struct net *net;
@@ -3246,7 +3244,7 @@ static unsigned int ip6_default_advmss(const struct dst_entry *dst)
rcu_read_lock();
- net = dev_net_rcu(dev);
+ net = dst_dev_net_rcu(dst);
if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
@@ -4301,7 +4299,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_bu
if (res.f6i->nh) {
struct fib6_nh_match_arg arg = {
- .dev = dst_dev(dst),
+ .dev = dst_dev_rcu(dst),
.gw = &rt->rt6i_gateway,
};
diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c
index 180da19c148c..a5c4c629b788 100644
--- a/net/ipv6/seg6.c
+++ b/net/ipv6/seg6.c
@@ -522,16 +522,10 @@ int __init seg6_init(void)
if (err)
goto out_unregister_iptun;
- err = seg6_hmac_init();
- if (err)
- goto out_unregister_seg6;
-
pr_info("Segment Routing with IPv6\n");
out:
return err;
-out_unregister_seg6:
- seg6_local_exit();
out_unregister_iptun:
seg6_iptunnel_exit();
out_unregister_genl:
@@ -543,7 +537,6 @@ out_unregister_pernet:
void seg6_exit(void)
{
- seg6_hmac_exit();
seg6_local_exit();
seg6_iptunnel_exit();
genl_unregister_family(&seg6_genl_family);
diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c
index fd58426f222b..ee6bac0160ac 100644
--- a/net/ipv6/seg6_hmac.c
+++ b/net/ipv6/seg6_hmac.c
@@ -16,7 +16,6 @@
#include <linux/in6.h>
#include <linux/icmpv6.h>
#include <linux/mroute6.h>
-#include <linux/slab.h>
#include <linux/rhashtable.h>
#include <linux/netfilter.h>
@@ -34,7 +33,8 @@
#include <net/addrconf.h>
#include <net/xfrm.h>
-#include <crypto/hash.h>
+#include <crypto/sha1.h>
+#include <crypto/sha2.h>
#include <crypto/utils.h>
#include <net/seg6.h>
#include <net/genetlink.h>
@@ -78,17 +78,6 @@ static const struct rhashtable_params rht_params = {
.obj_cmpfn = seg6_hmac_cmpfn,
};
-static struct seg6_hmac_algo hmac_algos[] = {
- {
- .alg_id = SEG6_HMAC_ALGO_SHA1,
- .name = "hmac(sha1)",
- },
- {
- .alg_id = SEG6_HMAC_ALGO_SHA256,
- .name = "hmac(sha256)",
- },
-};
-
static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh)
{
struct sr6_tlv_hmac *tlv;
@@ -108,75 +97,13 @@ static struct sr6_tlv_hmac *seg6_get_tlv_hmac(struct ipv6_sr_hdr *srh)
return tlv;
}
-static struct seg6_hmac_algo *__hmac_get_algo(u8 alg_id)
-{
- struct seg6_hmac_algo *algo;
- int i, alg_count;
-
- alg_count = ARRAY_SIZE(hmac_algos);
- for (i = 0; i < alg_count; i++) {
- algo = &hmac_algos[i];
- if (algo->alg_id == alg_id)
- return algo;
- }
-
- return NULL;
-}
-
-static int __do_hmac(struct seg6_hmac_info *hinfo, const char *text, u8 psize,
- u8 *output, int outlen)
-{
- struct seg6_hmac_algo *algo;
- struct crypto_shash *tfm;
- struct shash_desc *shash;
- int ret, dgsize;
-
- algo = __hmac_get_algo(hinfo->alg_id);
- if (!algo)
- return -ENOENT;
-
- tfm = *this_cpu_ptr(algo->tfms);
-
- dgsize = crypto_shash_digestsize(tfm);
- if (dgsize > outlen) {
- pr_debug("sr-ipv6: __do_hmac: digest size too big (%d / %d)\n",
- dgsize, outlen);
- return -ENOMEM;
- }
-
- ret = crypto_shash_setkey(tfm, hinfo->secret, hinfo->slen);
- if (ret < 0) {
- pr_debug("sr-ipv6: crypto_shash_setkey failed: err %d\n", ret);
- goto failed;
- }
-
- shash = *this_cpu_ptr(algo->shashs);
- shash->tfm = tfm;
-
- ret = crypto_shash_digest(shash, text, psize, output);
- if (ret < 0) {
- pr_debug("sr-ipv6: crypto_shash_digest failed: err %d\n", ret);
- goto failed;
- }
-
- return dgsize;
-
-failed:
- return ret;
-}
-
int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
struct in6_addr *saddr, u8 *output)
{
__be32 hmackeyid = cpu_to_be32(hinfo->hmackeyid);
- u8 tmp_out[SEG6_HMAC_MAX_DIGESTSIZE];
- int plen, i, dgsize, wrsize;
+ int plen, i, ret = 0;
char *ring, *off;
- /* a 160-byte buffer for digest output allows to store highest known
- * hash function (RadioGatun) with up to 1216 bits
- */
-
/* saddr(16) + first_seg(1) + flags(1) + keyid(4) + seglist(16n) */
plen = 16 + 1 + 1 + 4 + (hdr->first_segment + 1) * 16;
@@ -219,22 +146,25 @@ int seg6_hmac_compute(struct seg6_hmac_info *hinfo, struct ipv6_sr_hdr *hdr,
off += 16;
}
- dgsize = __do_hmac(hinfo, ring, plen, tmp_out,
- SEG6_HMAC_MAX_DIGESTSIZE);
+ switch (hinfo->alg_id) {
+ case SEG6_HMAC_ALGO_SHA1:
+ hmac_sha1(&hinfo->key.sha1, ring, plen, output);
+ static_assert(SEG6_HMAC_FIELD_LEN > SHA1_DIGEST_SIZE);
+ memset(&output[SHA1_DIGEST_SIZE], 0,
+ SEG6_HMAC_FIELD_LEN - SHA1_DIGEST_SIZE);
+ break;
+ case SEG6_HMAC_ALGO_SHA256:
+ hmac_sha256(&hinfo->key.sha256, ring, plen, output);
+ static_assert(SEG6_HMAC_FIELD_LEN == SHA256_DIGEST_SIZE);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ ret = -EINVAL;
+ break;
+ }
local_unlock_nested_bh(&hmac_storage.bh_lock);
local_bh_enable();
-
- if (dgsize < 0)
- return dgsize;
-
- wrsize = SEG6_HMAC_FIELD_LEN;
- if (wrsize > dgsize)
- wrsize = dgsize;
-
- memset(output, 0, SEG6_HMAC_FIELD_LEN);
- memcpy(output, tmp_out, wrsize);
-
- return 0;
+ return ret;
}
EXPORT_SYMBOL(seg6_hmac_compute);
@@ -305,8 +235,18 @@ int seg6_hmac_info_add(struct net *net, u32 key, struct seg6_hmac_info *hinfo)
struct seg6_pernet_data *sdata = seg6_pernet(net);
int err;
- if (!__hmac_get_algo(hinfo->alg_id))
+ switch (hinfo->alg_id) {
+ case SEG6_HMAC_ALGO_SHA1:
+ hmac_sha1_preparekey(&hinfo->key.sha1,
+ hinfo->secret, hinfo->slen);
+ break;
+ case SEG6_HMAC_ALGO_SHA256:
+ hmac_sha256_preparekey(&hinfo->key.sha256,
+ hinfo->secret, hinfo->slen);
+ break;
+ default:
return -EINVAL;
+ }
err = rhashtable_lookup_insert_fast(&sdata->hmac_infos, &hinfo->node,
rht_params);
@@ -363,65 +303,6 @@ out:
}
EXPORT_SYMBOL(seg6_push_hmac);
-static int seg6_hmac_init_algo(void)
-{
- struct seg6_hmac_algo *algo;
- struct crypto_shash *tfm;
- struct shash_desc *shash;
- int i, alg_count, cpu;
- int ret = -ENOMEM;
-
- alg_count = ARRAY_SIZE(hmac_algos);
-
- for (i = 0; i < alg_count; i++) {
- struct crypto_shash **p_tfm;
- int shsize;
-
- algo = &hmac_algos[i];
- algo->tfms = alloc_percpu(struct crypto_shash *);
- if (!algo->tfms)
- goto error_out;
-
- for_each_possible_cpu(cpu) {
- tfm = crypto_alloc_shash(algo->name, 0, 0);
- if (IS_ERR(tfm)) {
- ret = PTR_ERR(tfm);
- goto error_out;
- }
- p_tfm = per_cpu_ptr(algo->tfms, cpu);
- *p_tfm = tfm;
- }
-
- p_tfm = raw_cpu_ptr(algo->tfms);
- tfm = *p_tfm;
-
- shsize = sizeof(*shash) + crypto_shash_descsize(tfm);
-
- algo->shashs = alloc_percpu(struct shash_desc *);
- if (!algo->shashs)
- goto error_out;
-
- for_each_possible_cpu(cpu) {
- shash = kzalloc_node(shsize, GFP_KERNEL,
- cpu_to_node(cpu));
- if (!shash)
- goto error_out;
- *per_cpu_ptr(algo->shashs, cpu) = shash;
- }
- }
-
- return 0;
-
-error_out:
- seg6_hmac_exit();
- return ret;
-}
-
-int __init seg6_hmac_init(void)
-{
- return seg6_hmac_init_algo();
-}
-
int __net_init seg6_hmac_net_init(struct net *net)
{
struct seg6_pernet_data *sdata = seg6_pernet(net);
@@ -429,36 +310,6 @@ int __net_init seg6_hmac_net_init(struct net *net)
return rhashtable_init(&sdata->hmac_infos, &rht_params);
}
-void seg6_hmac_exit(void)
-{
- struct seg6_hmac_algo *algo = NULL;
- struct crypto_shash *tfm;
- struct shash_desc *shash;
- int i, alg_count, cpu;
-
- alg_count = ARRAY_SIZE(hmac_algos);
- for (i = 0; i < alg_count; i++) {
- algo = &hmac_algos[i];
-
- if (algo->shashs) {
- for_each_possible_cpu(cpu) {
- shash = *per_cpu_ptr(algo->shashs, cpu);
- kfree(shash);
- }
- free_percpu(algo->shashs);
- }
-
- if (algo->tfms) {
- for_each_possible_cpu(cpu) {
- tfm = *per_cpu_ptr(algo->tfms, cpu);
- crypto_free_shash(tfm);
- }
- free_percpu(algo->tfms);
- }
- }
-}
-EXPORT_SYMBOL(seg6_hmac_exit);
-
void __net_exit seg6_hmac_net_exit(struct net *net)
{
struct seg6_pernet_data *sdata = seg6_pernet(net);
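The crypto_shash plumbing (per-CPU tfms and descriptors, plus a setkey on every computation) is replaced by the synchronous HMAC library: keys are expanded once when the HMAC info is installed, and each packet needs a single call. The calling pattern, as used in the hunks above:

  /* once, at seg6_hmac_info_add() time */
  hmac_sha256_preparekey(&hinfo->key.sha256, hinfo->secret, hinfo->slen);

  /* per computation, in seg6_hmac_compute() */
  hmac_sha256(&hinfo->key.sha256, ring, plen, output);  /* SHA256_DIGEST_SIZE bytes out */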
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 12496ba1b7d4..cf37ad9686e6 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -848,6 +848,49 @@ static inline __be32 try_6rd(struct ip_tunnel *tunnel,
return dst;
}
+static bool ipip6_tunnel_dst_find(struct sk_buff *skb, __be32 *dst,
+ bool is_isatap)
+{
+ const struct ipv6hdr *iph6 = ipv6_hdr(skb);
+ struct neighbour *neigh = NULL;
+ const struct in6_addr *addr6;
+ bool found = false;
+ int addr_type;
+
+ if (skb_dst(skb))
+ neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
+
+ if (!neigh) {
+ net_dbg_ratelimited("nexthop == NULL\n");
+ return false;
+ }
+
+ addr6 = (const struct in6_addr *)&neigh->primary_key;
+ addr_type = ipv6_addr_type(addr6);
+
+ if (is_isatap) {
+ if ((addr_type & IPV6_ADDR_UNICAST) &&
+ ipv6_addr_is_isatap(addr6)) {
+ *dst = addr6->s6_addr32[3];
+ found = true;
+ }
+ } else {
+ if (addr_type == IPV6_ADDR_ANY) {
+ addr6 = &ipv6_hdr(skb)->daddr;
+ addr_type = ipv6_addr_type(addr6);
+ }
+
+ if ((addr_type & IPV6_ADDR_COMPATv4) != 0) {
+ *dst = addr6->s6_addr32[3];
+ found = true;
+ }
+ }
+
+ neigh_release(neigh);
+
+ return found;
+}
+
/*
* This function assumes it is being called from dev_queue_xmit()
* and that skb is filled properly by that function.
@@ -867,8 +910,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
__be32 dst = tiph->daddr;
struct flowi4 fl4;
int mtu;
- const struct in6_addr *addr6;
- int addr_type;
u8 ttl;
u8 protocol = IPPROTO_IPV6;
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
@@ -877,64 +918,15 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
tos = ipv6_get_dsfield(iph6);
/* ISATAP (RFC4214) - must come before 6to4 */
- if (dev->priv_flags & IFF_ISATAP) {
- struct neighbour *neigh = NULL;
- bool do_tx_error = false;
-
- if (skb_dst(skb))
- neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
-
- if (!neigh) {
- net_dbg_ratelimited("nexthop == NULL\n");
- goto tx_error;
- }
-
- addr6 = (const struct in6_addr *)&neigh->primary_key;
- addr_type = ipv6_addr_type(addr6);
-
- if ((addr_type & IPV6_ADDR_UNICAST) &&
- ipv6_addr_is_isatap(addr6))
- dst = addr6->s6_addr32[3];
- else
- do_tx_error = true;
-
- neigh_release(neigh);
- if (do_tx_error)
- goto tx_error;
- }
+ if ((dev->priv_flags & IFF_ISATAP) &&
+ !ipip6_tunnel_dst_find(skb, &dst, true))
+ goto tx_error;
if (!dst)
dst = try_6rd(tunnel, &iph6->daddr);
- if (!dst) {
- struct neighbour *neigh = NULL;
- bool do_tx_error = false;
-
- if (skb_dst(skb))
- neigh = dst_neigh_lookup(skb_dst(skb), &iph6->daddr);
-
- if (!neigh) {
- net_dbg_ratelimited("nexthop == NULL\n");
- goto tx_error;
- }
-
- addr6 = (const struct in6_addr *)&neigh->primary_key;
- addr_type = ipv6_addr_type(addr6);
-
- if (addr_type == IPV6_ADDR_ANY) {
- addr6 = &ipv6_hdr(skb)->daddr;
- addr_type = ipv6_addr_type(addr6);
- }
-
- if ((addr_type & IPV6_ADDR_COMPATv4) != 0)
- dst = addr6->s6_addr32[3];
- else
- do_tx_error = true;
-
- neigh_release(neigh);
- if (do_tx_error)
- goto tx_error;
- }
+ if (!dst && !ipip6_tunnel_dst_find(skb, &dst, false))
+ goto tx_error;
flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark,
tos & INET_DSCP_MASK, RT_SCOPE_UNIVERSE,
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index f0ee1a909771..7e007f013ec8 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -16,6 +16,7 @@
#include <net/secure_seq.h>
#include <net/ipv6.h>
#include <net/tcp.h>
+#include <net/tcp_ecn.h>
#define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
@@ -264,6 +265,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
if (!req->syncookie)
ireq->rcv_wscale = rcv_wscale;
ireq->ecn_ok &= cookie_ecn_ok(net, dst);
+ tcp_rsk(req)->accecn_ok = ireq->ecn_ok && cookie_accecn_ok(th);
ret = tcp_get_cookie_sock(sk, skb, req, dst);
if (!ret) {
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index e885629312a4..9622c2776ade 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -62,6 +62,7 @@
#include <net/hotdata.h>
#include <net/busy_poll.h>
#include <net/rstreason.h>
+#include <net/psp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -299,12 +300,12 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
inet->inet_rcv_saddr = LOOPBACK4_IPV6;
sk->sk_gso_type = SKB_GSO_TCPV6;
- ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, false, false);
- icsk->icsk_ext_hdr_len = 0;
+ icsk->icsk_ext_hdr_len = psp_sk_overhead(sk);
if (opt)
- icsk->icsk_ext_hdr_len = opt->opt_flen +
- opt->opt_nflen;
+ icsk->icsk_ext_hdr_len += opt->opt_flen +
+ opt->opt_nflen;
tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
@@ -388,8 +389,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
bool fatal;
int err;
- sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- &hdr->daddr, th->dest,
+ sk = __inet6_lookup_established(net, &hdr->daddr, th->dest,
&hdr->saddr, ntohs(th->source),
skb->dev->ifindex, inet6_sdif(skb));
@@ -545,6 +545,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
skb = tcp_make_synack(sk, dst, req, foc, synack_type, syn_skb);
if (skb) {
+ tcp_rsk(req)->syn_ect_snt = np->tclass & INET_ECN_MASK;
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
&ireq->ir_v6_rmt_addr);
@@ -973,6 +974,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
if (sk) {
/* unconstify the socket only to attach it to buff with care. */
skb_set_owner_edemux(buff, (struct sock *)sk);
+ psp_reply_set_decrypted(buff);
if (sk->sk_state == TCP_TIME_WAIT)
mark = inet_twsk(sk)->tw_mark;
@@ -1073,8 +1075,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb,
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
- NULL, 0, &ipv6h->saddr, th->source,
+ sk1 = inet6_lookup_listener(net, NULL, 0, &ipv6h->saddr, th->source,
&ipv6h->daddr, ntohs(th->source),
dif, sdif);
if (!sk1)
@@ -1460,7 +1461,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
- ip6_dst_store(newsk, dst, NULL, NULL);
+ ip6_dst_store(newsk, dst, false, false);
newnp->saddr = ireq->ir_v6_loc_addr;
@@ -1606,6 +1607,10 @@ int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
if (skb->protocol == htons(ETH_P_IP))
return tcp_v4_do_rcv(sk, skb);
+ reason = psp_sk_rx_policy_check(sk, skb);
+ if (reason)
+ goto err_discard;
+
/*
* socket locking is here for SMP purposes as backlog rcv
* is currently called with bh processing disabled.
@@ -1685,6 +1690,7 @@ csum_err:
reason = SKB_DROP_REASON_TCP_CSUM;
trace_tcp_bad_csum(skb);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
+err_discard:
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
@@ -1787,7 +1793,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
hdr = ipv6_hdr(skb);
lookup:
- sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th),
+ sk = __inet6_lookup_skb(skb, __tcp_hdrlen(th),
th->source, th->dest, inet6_iif(skb), sdif,
&refcounted);
if (!sk)
@@ -1809,7 +1815,7 @@ lookup:
&hdr->saddr, &hdr->daddr,
AF_INET6, dif, sdif);
if (drop_reason) {
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
reqsk_put(req);
goto discard_it;
}
@@ -1948,7 +1954,7 @@ discard_it:
return 0;
discard_and_relse:
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
if (refcounted)
sock_put(sk);
goto discard_it;
@@ -1974,8 +1980,7 @@ do_time_wait:
{
struct sock *sk2;
- sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo,
- skb, __tcp_hdrlen(th),
+ sk2 = inet6_lookup_listener(net, skb, __tcp_hdrlen(th),
&ipv6_hdr(skb)->saddr, th->source,
&ipv6_hdr(skb)->daddr,
ntohs(th->dest),
@@ -1990,6 +1995,10 @@ do_time_wait:
__this_cpu_write(tcp_tw_isn, isn);
goto process;
}
+
+ drop_reason = psp_twsk_rx_policy_check(inet_twsk(sk), skb);
+ if (drop_reason)
+ break;
}
/* to ACK */
fallthrough;
@@ -2027,8 +2036,7 @@ void tcp_v6_early_demux(struct sk_buff *skb)
return;
/* Note : We use inet6_iif() here, not tcp_v6_iif() */
- sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- &hdr->saddr, th->source,
+ sk = __inet6_lookup_established(net, &hdr->saddr, th->source,
&hdr->daddr, ntohs(th->dest),
inet6_iif(skb), inet6_sdif(skb));
if (sk) {
@@ -2048,7 +2056,6 @@ void tcp_v6_early_demux(struct sk_buff *skb)
static struct timewait_sock_ops tcp6_timewait_sock_ops = {
.twsk_obj_size = sizeof(struct tcp6_timewait_sock),
- .twsk_destructor = tcp_twsk_destructor,
};
INDIRECT_CALLABLE_SCOPE void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)
@@ -2115,6 +2122,13 @@ static const struct tcp_sock_af_ops tcp_sock_ipv6_mapped_specific = {
.ao_calc_key_sk = tcp_v4_ao_calc_key_sk,
#endif
};
+
+static void tcp6_destruct_sock(struct sock *sk)
+{
+ tcp_md5_destruct_sock(sk);
+ tcp_ao_destroy_sock(sk, false);
+ inet6_sock_destruct(sk);
+}
#endif
/* NOTE: A lot of things set to zero explicitly by call to
@@ -2130,6 +2144,7 @@ static int tcp_v6_init_sock(struct sock *sk)
#if defined(CONFIG_TCP_MD5SIG) || defined(CONFIG_TCP_AO)
tcp_sk(sk)->af_specific = &tcp_sock_ipv6_specific;
+ sk->sk_destruct = tcp6_destruct_sock;
#endif
return 0;
@@ -2228,9 +2243,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
- icsk->icsk_retransmits,
+ READ_ONCE(icsk->icsk_retransmits),
from_kuid_munged(seq_user_ns(seq), sk_uid(sp)),
- icsk->icsk_probes_out,
+ READ_ONCE(icsk->icsk_probes_out),
sock_i_ino(sp),
refcount_read(&sp->sk_refcnt), sp,
jiffies_to_clock_t(icsk->icsk_rto),
@@ -2340,7 +2355,7 @@ struct proto tcpv6_prot = {
.splice_eof = tcp_splice_eof,
.backlog_rcv = tcp_v6_do_rcv,
.release_cb = tcp_release_cb,
- .hash = inet6_hash,
+ .hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.put_port = inet_put_port,
@@ -2356,7 +2371,6 @@ struct proto tcpv6_prot = {
.per_cpu_fw_alloc = &tcp_memory_per_cpu_fw_alloc,
.memory_pressure = &tcp_memory_pressure,
- .orphan_count = &tcp_orphan_count,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem),
.sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem),
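tcp_v6_connect() now seeds icsk_ext_hdr_len with psp_sk_overhead() instead of zero and adds the IPv6 option lengths on top (ipv6_sockglue.c gets the same treatment), so PSP encapsulation is reflected in MSS clamping; presumably psp_sk_overhead() returns 0 when no PSP association is attached:

  icsk->icsk_ext_hdr_len = psp_sk_overhead(sk);   /* PSP encap bytes, if any */
  if (opt)
          icsk->icsk_ext_hdr_len += opt->opt_flen + opt->opt_nflen;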
diff --git a/net/ipv6/tcpv6_offload.c b/net/ipv6/tcpv6_offload.c
index a8a04f441e78..effeba58630b 100644
--- a/net/ipv6/tcpv6_offload.c
+++ b/net/ipv6/tcpv6_offload.c
@@ -36,8 +36,7 @@ static void tcp6_check_fraglist_gro(struct list_head *head, struct sk_buff *skb,
inet6_get_iif_sdif(skb, &iif, &sdif);
hdr = skb_gro_network_header(skb);
net = dev_net_rcu(skb->dev);
- sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo,
- &hdr->saddr, th->source,
+ sk = __inet6_lookup_established(net, &hdr->saddr, th->source,
&hdr->daddr, ntohs(th->dest),
iif, sdif);
NAPI_GRO_CB(skb)->is_flist = !sk;
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 6a68f77da44b..813a2ba75824 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -67,10 +67,11 @@ static void udpv6_destruct_sock(struct sock *sk)
int udpv6_init_sock(struct sock *sk)
{
- udp_lib_init_sock(sk);
+ int res = udp_lib_init_sock(sk);
+
sk->sk_destruct = udpv6_destruct_sock;
set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags);
- return 0;
+ return res;
}
INDIRECT_CALLABLE_SCOPE
@@ -260,7 +261,7 @@ rescore:
/* compute_score is too long of a function to be
* inlined, and calling it again here yields
- * measureable overhead for some
+ * measurable overhead for some
* workloads. Work around it by jumping
* backwards to rescore 'result'.
*/
@@ -449,7 +450,7 @@ struct sock *udp6_lib_lookup(const struct net *net, const struct in6_addr *saddr
EXPORT_SYMBOL_GPL(udp6_lib_lookup);
#endif
-/* do not use the scratch area len for jumbogram: their length execeeds the
+/* do not use the scratch area len for jumbogram: their length exceeds the
* scratch area space; note that the IP6CB flags is still in the first
* cacheline, so checking for jumbograms is cheap
*/
@@ -479,7 +480,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (flags & MSG_ERRQUEUE)
return ipv6_recv_error(sk, msg, len, addr_len);
- if (np->rxpmtu && np->rxopt.bits.rxpmtu)
+ if (np->rxopt.bits.rxpmtu && READ_ONCE(np->rxpmtu))
return ipv6_recv_rxpmtu(sk, msg, len, addr_len);
try_again:
@@ -524,7 +525,7 @@ try_again:
}
if (unlikely(err)) {
if (!peeking) {
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
SNMP_INC_STATS(mib, UDP_MIB_INERRORS);
}
kfree_skb(skb);
@@ -908,7 +909,7 @@ csum_error:
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
drop:
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
sk_skb_reason_drop(sk, skb, drop_reason);
return -1;
}
@@ -1013,7 +1014,7 @@ start_lookup:
}
nskb = skb_clone(skb, GFP_ATOMIC);
if (unlikely(!nskb)) {
- atomic_inc(&sk->sk_drops);
+ udp_drops_inc(sk);
__UDP6_INC_STATS(net, UDP_MIB_RCVBUFERRORS,
IS_UDPLITE(sk));
__UDP6_INC_STATS(net, UDP_MIB_INERRORS,
@@ -1048,7 +1049,7 @@ static void udp6_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst)
sk->sk_rx_dst_cookie = rt6_get_cookie(dst_rt6_info(dst));
}
-/* wrapper for udp_queue_rcv_skb tacking care of csum conversion and
+/* wrapper for udp_queue_rcv_skb taking care of csum conversion and
* return code conversion for ip layer consumption
*/
static int udp6_unicast_rcv_skb(struct sock *sk, struct sk_buff *skb,
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index d8445ac1b2e4..046f13b1d77a 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -154,8 +154,6 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb)
ip6_gro_compute_pseudo);
skip:
- NAPI_GRO_CB(skb)->is_ipv6 = 1;
-
if (static_branch_unlikely(&udpv6_encap_needed_key))
sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest);
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index cc2b3c44bc05..6c717a7ef292 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1187,7 +1187,7 @@ static void iucv_process_message(struct sock *sk, struct sk_buff *skb,
IUCV_SKB_CB(skb)->offset = 0;
if (sk_filter(sk, skb)) {
- atomic_inc(&sk->sk_drops); /* skb rejected by filter */
+ sk_drops_inc(sk); /* skb rejected by filter */
kfree_skb(skb);
return;
}
@@ -2011,7 +2011,7 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb)
skb_reset_network_header(skb);
IUCV_SKB_CB(skb)->offset = 0;
if (sk_filter(sk, skb)) {
- atomic_inc(&sk->sk_drops); /* skb rejected by filter */
+ sk_drops_inc(sk); /* skb rejected by filter */
kfree_skb(skb);
return NET_RX_SUCCESS;
}
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index 2ed07fa121ab..d9aca1c3c097 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -311,6 +311,96 @@ static void ieee80211_stop_p2p_device(struct wiphy *wiphy,
ieee80211_sdata_stop(IEEE80211_WDEV_TO_SUB_IF(wdev));
}
+static void ieee80211_nan_conf_free(struct cfg80211_nan_conf *conf)
+{
+ kfree(conf->cluster_id);
+ kfree(conf->extra_nan_attrs);
+ kfree(conf->vendor_elems);
+ memset(conf, 0, sizeof(*conf));
+}
+
+static void ieee80211_stop_nan(struct wiphy *wiphy,
+ struct wireless_dev *wdev)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+
+ if (!sdata->u.nan.started)
+ return;
+
+ drv_stop_nan(sdata->local, sdata);
+ sdata->u.nan.started = false;
+
+ ieee80211_nan_conf_free(&sdata->u.nan.conf);
+
+ ieee80211_sdata_stop(sdata);
+ ieee80211_recalc_idle(sdata->local);
+}
+
+static int ieee80211_nan_conf_copy(struct cfg80211_nan_conf *dst,
+ struct cfg80211_nan_conf *src,
+ u32 changes)
+{
+ if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
+ dst->master_pref = src->master_pref;
+
+ if (changes & CFG80211_NAN_CONF_CHANGED_BANDS)
+ dst->bands = src->bands;
+
+ if (changes & CFG80211_NAN_CONF_CHANGED_CONFIG) {
+ dst->scan_period = src->scan_period;
+ dst->scan_dwell_time = src->scan_dwell_time;
+ dst->discovery_beacon_interval =
+ src->discovery_beacon_interval;
+ dst->enable_dw_notification = src->enable_dw_notification;
+ memcpy(&dst->band_cfgs, &src->band_cfgs,
+ sizeof(dst->band_cfgs));
+
+ kfree(dst->cluster_id);
+ dst->cluster_id = NULL;
+
+ kfree(dst->extra_nan_attrs);
+ dst->extra_nan_attrs = NULL;
+ dst->extra_nan_attrs_len = 0;
+
+ kfree(dst->vendor_elems);
+ dst->vendor_elems = NULL;
+ dst->vendor_elems_len = 0;
+
+ if (src->cluster_id) {
+ dst->cluster_id = kmemdup(src->cluster_id, ETH_ALEN,
+ GFP_KERNEL);
+ if (!dst->cluster_id)
+ goto no_mem;
+ }
+
+ if (src->extra_nan_attrs && src->extra_nan_attrs_len) {
+ dst->extra_nan_attrs = kmemdup(src->extra_nan_attrs,
+ src->extra_nan_attrs_len,
+ GFP_KERNEL);
+ if (!dst->extra_nan_attrs)
+ goto no_mem;
+
+ dst->extra_nan_attrs_len = src->extra_nan_attrs_len;
+ }
+
+ if (src->vendor_elems && src->vendor_elems_len) {
+ dst->vendor_elems = kmemdup(src->vendor_elems,
+ src->vendor_elems_len,
+ GFP_KERNEL);
+ if (!dst->vendor_elems)
+ goto no_mem;
+
+ dst->vendor_elems_len = src->vendor_elems_len;
+ }
+ }
+
+ return 0;
+
+no_mem:
+ ieee80211_nan_conf_free(dst);
+ return -ENOMEM;
+}
+
static int ieee80211_start_nan(struct wiphy *wiphy,
struct wireless_dev *wdev,
struct cfg80211_nan_conf *conf)
@@ -320,6 +410,9 @@ static int ieee80211_start_nan(struct wiphy *wiphy,
lockdep_assert_wiphy(sdata->local->hw.wiphy);
+ if (sdata->u.nan.started)
+ return -EALREADY;
+
ret = ieee80211_check_combinations(sdata, NULL, 0, 0, -1);
if (ret < 0)
return ret;
@@ -329,21 +422,21 @@ static int ieee80211_start_nan(struct wiphy *wiphy,
return ret;
ret = drv_start_nan(sdata->local, sdata, conf);
- if (ret)
+ if (ret) {
ieee80211_sdata_stop(sdata);
+ return ret;
+ }
- sdata->u.nan.conf = *conf;
-
- return ret;
-}
+ sdata->u.nan.started = true;
+ ieee80211_recalc_idle(sdata->local);
-static void ieee80211_stop_nan(struct wiphy *wiphy,
- struct wireless_dev *wdev)
-{
- struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ ret = ieee80211_nan_conf_copy(&sdata->u.nan.conf, conf, 0xFFFFFFFF);
+ if (ret) {
+ ieee80211_stop_nan(wiphy, wdev);
+ return ret;
+ }
- drv_stop_nan(sdata->local, sdata);
- ieee80211_sdata_stop(sdata);
+ return 0;
}
static int ieee80211_nan_change_conf(struct wiphy *wiphy,
@@ -352,7 +445,7 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy,
u32 changes)
{
struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
- struct cfg80211_nan_conf new_conf;
+ struct cfg80211_nan_conf new_conf = {};
int ret = 0;
if (sdata->vif.type != NL80211_IFTYPE_NAN)
@@ -361,17 +454,28 @@ static int ieee80211_nan_change_conf(struct wiphy *wiphy,
if (!ieee80211_sdata_running(sdata))
return -ENETDOWN;
- new_conf = sdata->u.nan.conf;
+ if (!changes)
+ return 0;
- if (changes & CFG80211_NAN_CONF_CHANGED_PREF)
- new_conf.master_pref = conf->master_pref;
+ /* First make a full copy of the previous configuration and then apply
+ * the changes. This might be a little wasteful, but it is simpler.
+ */
+ ret = ieee80211_nan_conf_copy(&new_conf, &sdata->u.nan.conf,
+ 0xFFFFFFFF);
+ if (ret < 0)
+ return ret;
- if (changes & CFG80211_NAN_CONF_CHANGED_BANDS)
- new_conf.bands = conf->bands;
+ ret = ieee80211_nan_conf_copy(&new_conf, conf, changes);
+ if (ret < 0)
+ return ret;
ret = drv_nan_change_conf(sdata->local, sdata, &new_conf, changes);
- if (!ret)
+ if (ret) {
+ ieee80211_nan_conf_free(&new_conf);
+ } else {
+ ieee80211_nan_conf_free(&sdata->u.nan.conf);
sdata->u.nan.conf = new_conf;
+ }
return ret;
}
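
The change-conf path above deliberately builds a complete duplicate of the active configuration, overlays only the fields selected by the changes bitmask, and commits the new copy only if the driver accepted it, freeing whichever copy lost. A minimal userspace sketch of that copy-then-overlay pattern follows; the struct and flag names are invented for illustration and are not the kernel API:

/*
 * Sketch of the copy-then-overlay update pattern used by
 * ieee80211_nan_change_conf() above. Invented names, userspace C.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CHANGED_PREF  0x1u
#define CHANGED_ELEMS 0x2u

struct conf {
	int pref;
	unsigned char *elems;	/* heap-owned, may be NULL */
	size_t elems_len;
};

static void conf_free(struct conf *c)
{
	free(c->elems);
	memset(c, 0, sizeof(*c));
}

static int conf_copy(struct conf *dst, const struct conf *src,
		     unsigned int changes)
{
	if (changes & CHANGED_PREF)
		dst->pref = src->pref;

	if (changes & CHANGED_ELEMS) {
		free(dst->elems);
		dst->elems = NULL;
		dst->elems_len = 0;

		if (src->elems && src->elems_len) {
			dst->elems = malloc(src->elems_len);
			if (!dst->elems) {
				conf_free(dst);	/* leave dst empty on failure */
				return -1;
			}
			memcpy(dst->elems, src->elems, src->elems_len);
			dst->elems_len = src->elems_len;
		}
	}
	return 0;
}

static int conf_change(struct conf *cur, const struct conf *req,
		       unsigned int changes)
{
	struct conf next = { 0 };

	/* full copy of the current state, then overlay the changed fields */
	if (conf_copy(&next, cur, ~0u) || conf_copy(&next, req, changes)) {
		conf_free(&next);
		return -1;
	}

	/* commit only once everything succeeded */
	conf_free(cur);
	*cur = next;
	return 0;
}

int main(void)
{
	struct conf cur = { .pref = 1 };
	struct conf req = { .pref = 7 };

	if (!conf_change(&cur, &req, CHANGED_PREF))
		printf("pref is now %d\n", cur.pref);	/* 7 */
	conf_free(&cur);
	return 0;
}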
@@ -2171,10 +2275,16 @@ static int sta_apply_parameters(struct ieee80211_local *local,
/*
* cfg80211 validates this (1-2007) and allows setting the AID
- * only when creating a new station entry
+ * only when creating a new station entry. For S1G APs, the current
+ * implementation supports a maximum of 1600 AIDs.
*/
- if (params->aid)
+ if (params->aid) {
+ if (sdata->vif.cfg.s1g &&
+ params->aid > IEEE80211_MAX_SUPPORTED_S1G_AID)
+ return -EINVAL;
+
sta->sta.aid = params->aid;
+ }
/*
* Some of the following updates would be racy if called on an
@@ -3001,6 +3111,9 @@ static int ieee80211_scan(struct wiphy *wiphy,
struct cfg80211_scan_request *req)
{
struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_link_data *link;
+ struct ieee80211_channel *chan;
+ int radio_idx;
sdata = IEEE80211_WDEV_TO_SUB_IF(req->wdev);
@@ -3028,10 +3141,20 @@ static int ieee80211_scan(struct wiphy *wiphy,
* the frames sent while scanning on other channel will be
* lost)
*/
- if (ieee80211_num_beaconing_links(sdata) &&
- (!(wiphy->features & NL80211_FEATURE_AP_SCAN) ||
- !(req->flags & NL80211_SCAN_FLAG_AP)))
- return -EOPNOTSUPP;
+ for_each_link_data(sdata, link) {
+ /* if the link is not beaconing, ignore it */
+ if (!sdata_dereference(link->u.ap.beacon, sdata))
+ continue;
+
+ chan = link->conf->chanreq.oper.chan;
+ radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan);
+
+ if (ieee80211_is_radio_idx_in_scan_req(wiphy, req,
+ radio_idx) &&
+ (!(wiphy->features & NL80211_FEATURE_AP_SCAN) ||
+ !(req->flags & NL80211_SCAN_FLAG_AP)))
+ return -EOPNOTSUPP;
+ }
break;
case NL80211_IFTYPE_NAN:
default:
@@ -3677,12 +3800,7 @@ static bool ieee80211_is_scan_ongoing(struct wiphy *wiphy,
if (list_empty(&local->roc_list) && !local->scanning)
return false;
- if (wiphy->n_radio < 2)
- return true;
-
req_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chandef->chan);
- if (req_radio_idx < 0)
- return true;
if (local->scanning) {
scan_req = wiphy_dereference(wiphy, local->scan_req);
@@ -3701,14 +3819,6 @@ static bool ieee80211_is_scan_ongoing(struct wiphy *wiphy,
list_for_each_entry(roc, &local->roc_list, list) {
chan_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy,
roc->chan);
- /*
- * The roc work is added but chan_radio_idx is invalid.
- * Should not happen but if it does, let's not take
- * risk and return true.
- */
- if (chan_radio_idx < 0)
- return true;
-
if (chan_radio_idx == req_radio_idx)
return true;
}
@@ -4825,7 +4935,6 @@ static int ieee80211_get_txq_stats(struct wiphy *wiphy,
int ret = 0;
spin_lock_bh(&local->fq.lock);
- rcu_read_lock();
if (wdev) {
sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
@@ -4851,7 +4960,6 @@ static int ieee80211_get_txq_stats(struct wiphy *wiphy,
}
out:
- rcu_read_unlock();
spin_unlock_bh(&local->fq.lock);
return ret;
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index c9cea0e7ac16..57065714cf8c 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -659,19 +659,8 @@ bool ieee80211_is_radar_required(struct ieee80211_local *local,
for_each_sdata_link(local, link) {
if (link->radar_required) {
- if (wiphy->n_radio < 2)
- return true;
-
chan = link->conf->chanreq.oper.chan;
radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan);
- /*
- * The radio index (radio_idx) is expected to be valid,
- * as it's derived from a channel tied to a link. If
- * it's invalid (i.e., negative), return true to avoid
- * potential issues with radar-sensitive operations.
- */
- if (radio_idx < 0)
- return true;
if (ieee80211_is_radio_idx_in_scan_req(wiphy, req,
radio_idx))
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index e8b78ec682da..d02f07368c51 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -82,7 +82,6 @@ static ssize_t aqm_read(struct file *file,
int len = 0;
spin_lock_bh(&local->fq.lock);
- rcu_read_lock();
len = scnprintf(buf, sizeof(buf),
"access name value\n"
@@ -105,7 +104,6 @@ static ssize_t aqm_read(struct file *file,
fq->limit,
fq->quantum);
- rcu_read_unlock();
spin_unlock_bh(&local->fq.lock);
return simple_read_from_buffer(user_buf, count, ppos,
@@ -717,7 +715,6 @@ void debugfs_hw_add(struct ieee80211_local *local)
DEBUGFS_STATS_ADD(dot11ReceivedFragmentCount);
DEBUGFS_STATS_ADD(dot11MulticastReceivedFrameCount);
DEBUGFS_STATS_ADD(dot11TransmittedFrameCount);
- DEBUGFS_STATS_ADD(tx_handlers_drop);
DEBUGFS_STATS_ADD(tx_handlers_queued);
DEBUGFS_STATS_ADD(tx_handlers_drop_wep);
DEBUGFS_STATS_ADD(tx_handlers_drop_not_assoc);
diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c
index 1dac78271045..30a5a978a678 100644
--- a/net/mac80211/debugfs_netdev.c
+++ b/net/mac80211/debugfs_netdev.c
@@ -625,7 +625,6 @@ static ssize_t ieee80211_if_fmt_aqm(
txqi = to_txq_info(sdata->vif.txq);
spin_lock_bh(&local->fq.lock);
- rcu_read_lock();
len = scnprintf(buf,
buflen,
@@ -642,7 +641,6 @@ static ssize_t ieee80211_if_fmt_aqm(
txqi->tin.tx_bytes,
txqi->tin.tx_packets);
- rcu_read_unlock();
spin_unlock_bh(&local->fq.lock);
return len;
diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c
index 49061bd4151b..ef75255d47d5 100644
--- a/net/mac80211/debugfs_sta.c
+++ b/net/mac80211/debugfs_sta.c
@@ -148,7 +148,6 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
return -ENOMEM;
spin_lock_bh(&local->fq.lock);
- rcu_read_lock();
p += scnprintf(p,
bufsz + buf - p,
@@ -178,7 +177,6 @@ static ssize_t sta_aqm_read(struct file *file, char __user *userbuf,
test_bit(IEEE80211_TXQ_DIRTY, &txqi->flags) ? " DIRTY" : "");
}
- rcu_read_unlock();
spin_unlock_bh(&local->fq.lock);
rv = simple_read_from_buffer(userbuf, count, ppos, buf, p - buf);
diff --git a/net/mac80211/ethtool.c b/net/mac80211/ethtool.c
index 0397755a3bd1..3d365626faa4 100644
--- a/net/mac80211/ethtool.c
+++ b/net/mac80211/ethtool.c
@@ -48,8 +48,8 @@ static const char ieee80211_gstrings_sta_stats[][ETH_GSTRING_LEN] = {
"rx_duplicates", "rx_fragments", "rx_dropped",
"tx_packets", "tx_bytes",
"tx_filtered", "tx_retry_failed", "tx_retries",
- "sta_state", "txrate", "rxrate", "signal",
- "channel", "noise", "ch_time", "ch_time_busy",
+ "tx_handlers_drop", "sta_state", "txrate", "rxrate",
+ "signal", "channel", "noise", "ch_time", "ch_time_busy",
"ch_time_ext_busy", "ch_time_rx", "ch_time_tx"
};
#define STA_STATS_LEN ARRAY_SIZE(ieee80211_gstrings_sta_stats)
@@ -120,6 +120,7 @@ static void ieee80211_get_stats(struct net_device *dev,
i = 0;
ADD_STA_STATS(&sta->deflink);
+ data[i++] = sdata->tx_handlers_drop;
data[i++] = sta->sta_state;
@@ -145,6 +146,7 @@ static void ieee80211_get_stats(struct net_device *dev,
sta_set_sinfo(sta, &sinfo, false);
i = 0;
ADD_STA_STATS(&sta->deflink);
+ data[i++] = sdata->tx_handlers_drop;
}
}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index 8afa2404eaa8..73fd86ec1bce 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -86,6 +86,14 @@ extern const u8 ieee80211_ac_to_qos_mask[IEEE80211_NUM_ACS];
#define IEEE80211_MAX_NAN_INSTANCE_ID 255
+/*
+ * The current mac80211 implementation supports a maximum of 1600 AIDs
+ * for S1G interfaces. With regard to an S1G TIM, this covers 25 blocks
+ * as each block is 64 AIDs.
+ */
+#define IEEE80211_MAX_SUPPORTED_S1G_AID 1600
+#define IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS 25
+
enum ieee80211_status_data {
IEEE80211_STATUS_TYPE_MASK = 0x00f,
IEEE80211_STATUS_TYPE_INVALID = 0,
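
As a quick consistency check on the two limits just added: at 64 AIDs per block, 1600 AIDs is exactly 25 blocks. A trivial compile-time restatement (illustrative only, not part of the patch):

/* 1600 supported AIDs at 64 AIDs per TIM block is 25 blocks. */
#include <assert.h>

#define MAX_S1G_AID		1600
#define AIDS_PER_TIM_BLOCK	64

static_assert(MAX_S1G_AID / AIDS_PER_TIM_BLOCK == 25,
	      "1600 AIDs / 64 AIDs per block = 25 blocks");

int main(void) { return 0; }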
@@ -977,11 +985,13 @@ struct ieee80211_if_mntr {
* struct ieee80211_if_nan - NAN state
*
* @conf: current NAN configuration
+ * @started: true iff NAN is started
* @func_lock: lock for @func_inst_ids
* @function_inst_ids: a bitmap of available instance_id's
*/
struct ieee80211_if_nan {
struct cfg80211_nan_conf conf;
+ bool started;
/* protects function_inst_ids */
spinlock_t func_lock;
@@ -1210,6 +1220,7 @@ struct ieee80211_sub_if_data {
} debugfs;
#endif
+ u32 tx_handlers_drop;
/* must be last, dynamically sized area in this! */
struct ieee80211_vif vif;
};
@@ -1599,7 +1610,6 @@ struct ieee80211_local {
u32 dot11TransmittedFrameCount;
/* TX/RX handler statistics */
- unsigned int tx_handlers_drop;
unsigned int tx_handlers_queued;
unsigned int tx_handlers_drop_wep;
unsigned int tx_handlers_drop_not_assoc;
@@ -1665,8 +1675,6 @@ struct ieee80211_local {
struct idr ack_status_frames;
spinlock_t ack_status_lock;
- struct ieee80211_sub_if_data __rcu *p2p_sdata;
-
/* virtual monitor interface */
struct ieee80211_sub_if_data __rcu *monitor_sdata;
struct ieee80211_chan_req monitor_chanreq;
@@ -2702,7 +2710,8 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local,
const struct ieee80211_he_operation *he_oper,
const struct ieee80211_eht_operation *eht_oper,
struct cfg80211_chan_def *chandef);
-bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper,
+bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local,
+ const struct ieee80211_s1g_oper_ie *oper,
struct cfg80211_chan_def *chandef);
void ieee80211_chandef_downgrade(struct cfg80211_chan_def *chandef,
struct ieee80211_conn_settings *conn);
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index 07ba68f7cd81..a7873832d4fa 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -107,6 +107,7 @@ static u32 __ieee80211_recalc_idle(struct ieee80211_local *local,
{
bool working, scanning, active;
unsigned int led_trig_start = 0, led_trig_stop = 0;
+ struct ieee80211_sub_if_data *iter;
lockdep_assert_wiphy(local->hw.wiphy);
@@ -117,6 +118,14 @@ static u32 __ieee80211_recalc_idle(struct ieee80211_local *local,
working = !local->ops->remain_on_channel &&
!list_empty(&local->roc_list);
+ list_for_each_entry(iter, &local->interfaces, list) {
+ if (iter->vif.type == NL80211_IFTYPE_NAN &&
+ iter->u.nan.started) {
+ working = true;
+ break;
+ }
+ }
+
scanning = test_bit(SCAN_SW_SCANNING, &local->scanning) ||
test_bit(SCAN_ONCHANNEL_SCANNING, &local->scanning);
@@ -611,10 +620,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do
spin_unlock_bh(&sdata->u.nan.func_lock);
break;
- case NL80211_IFTYPE_P2P_DEVICE:
- /* relies on synchronize_rcu() below */
- RCU_INIT_POINTER(local->p2p_sdata, NULL);
- fallthrough;
default:
wiphy_work_cancel(sdata->local->hw.wiphy, &sdata->work);
/*
@@ -1405,6 +1410,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
ieee80211_recalc_idle(local);
netif_carrier_on(dev);
+ list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list);
break;
default:
if (coming_up) {
@@ -1468,17 +1474,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up)
sdata->vif.type != NL80211_IFTYPE_STATION);
}
- switch (sdata->vif.type) {
- case NL80211_IFTYPE_P2P_DEVICE:
- rcu_assign_pointer(local->p2p_sdata, sdata);
- break;
- case NL80211_IFTYPE_MONITOR:
- list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list);
- break;
- default:
- break;
- }
-
/*
* set_multicast_list will be invoked by the networking core
* which will check whether any increments here were done in
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 3ae6104e5cb2..eefa6f7e899b 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -746,6 +746,11 @@ ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = {
BIT(IEEE80211_STYPE_PROBE_REQ >> 4) |
BIT(IEEE80211_STYPE_AUTH >> 4),
},
+ [NL80211_IFTYPE_NAN] = {
+ .tx = 0xffff,
+ .rx = BIT(IEEE80211_STYPE_ACTION >> 4) |
+ BIT(IEEE80211_STYPE_AUTH >> 4),
+ },
};
static const struct ieee80211_ht_cap mac80211_ht_capa_mod_mask = {
@@ -862,6 +867,14 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len,
if (emulate_chanctx || ops->remain_on_channel)
wiphy->flags |= WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL;
+ wiphy->bss_param_support = WIPHY_BSS_PARAM_CTS_PROT |
+ WIPHY_BSS_PARAM_SHORT_PREAMBLE |
+ WIPHY_BSS_PARAM_SHORT_SLOT_TIME |
+ WIPHY_BSS_PARAM_BASIC_RATES |
+ WIPHY_BSS_PARAM_AP_ISOLATE |
+ WIPHY_BSS_PARAM_HT_OPMODE |
+ WIPHY_BSS_PARAM_P2P_CTWINDOW |
+ WIPHY_BSS_PARAM_P2P_OPPPS;
wiphy->features |= NL80211_FEATURE_SK_TX_STATUS |
NL80211_FEATURE_SAE |
NL80211_FEATURE_HT_IBSS |
@@ -1164,9 +1177,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
if (WARN_ON(!ieee80211_hw_check(hw, MFP_CAPABLE)))
return -EINVAL;
- if (WARN_ON(!ieee80211_hw_check(hw, CONNECTION_MONITOR)))
- return -EINVAL;
-
if (WARN_ON(ieee80211_hw_check(hw, NEED_DTIM_BEFORE_ASSOC)))
return -EINVAL;
@@ -1239,11 +1249,13 @@ int ieee80211_register_hw(struct ieee80211_hw *hw)
if (!dflt_chandef.chan) {
/*
* Assign the first enabled channel to dflt_chandef
- * from the list of channels
+ * from the list of channels. For S1G interfaces
+ * ensure it can be used as a primary.
*/
for (i = 0; i < sband->n_channels; i++)
if (!(sband->channels[i].flags &
- IEEE80211_CHAN_DISABLED))
+ (IEEE80211_CHAN_DISABLED |
+ IEEE80211_CHAN_S1G_NO_PRIMARY)))
break;
/* if none found then use the first anyway */
if (i == sband->n_channels)
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index a4a715f6f1c3..f37068a533f4 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -624,6 +624,9 @@ int mesh_add_he_6ghz_cap_ie(struct ieee80211_sub_if_data *sdata,
if (!sband)
return -EINVAL;
+ if (sband->band != NL80211_BAND_6GHZ)
+ return 0;
+
iftd = ieee80211_get_sband_iftype_data(sband,
NL80211_IFTYPE_MESH_POINT);
/* The device doesn't support HE in mesh mode or at all */
diff --git a/net/mac80211/mesh_ps.c b/net/mac80211/mesh_ps.c
index 20e022a03933..ebab1f0a0138 100644
--- a/net/mac80211/mesh_ps.c
+++ b/net/mac80211/mesh_ps.c
@@ -586,7 +586,7 @@ void ieee80211_mps_frame_release(struct sta_info *sta,
if (sta->mesh->plink_state == NL80211_PLINK_ESTAB)
has_buffered = ieee80211_check_tim(elems->tim, elems->tim_len,
- sta->mesh->aid);
+ sta->mesh->aid, false);
if (has_buffered)
mps_dbg(sta->sdata, "%pM indicates buffered frames\n",
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index dd650a127a31..3b5827ea438e 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -180,10 +180,11 @@ ieee80211_determine_ap_chan(struct ieee80211_sub_if_data *sdata,
/* get special S1G case out of the way */
if (sband->band == NL80211_BAND_S1GHZ) {
- if (!ieee80211_chandef_s1g_oper(elems->s1g_oper, chandef)) {
- sdata_info(sdata,
- "Missing S1G Operation Element? Trying operating == primary\n");
- chandef->width = ieee80211_s1g_channel_width(channel);
+ if (!ieee80211_chandef_s1g_oper(sdata->local, elems->s1g_oper,
+ chandef)) {
+ /* Fallback to default 1MHz */
+ chandef->width = NL80211_CHAN_WIDTH_1;
+ chandef->s1g_primary_2mhz = false;
}
return IEEE80211_CONN_MODE_S1G;
@@ -1046,6 +1047,14 @@ again:
ret = -EINVAL;
goto free;
}
+
+ chanreq->oper = *ap_chandef;
+ if (!cfg80211_chandef_usable(sdata->wdev.wiphy, &chanreq->oper,
+ IEEE80211_CHAN_DISABLED)) {
+ ret = -EINVAL;
+ goto free;
+ }
+
return elems;
case NL80211_BAND_6GHZ:
if (ap_mode < IEEE80211_CONN_MODE_HE) {
@@ -1850,7 +1859,8 @@ ieee80211_add_link_elems(struct ieee80211_sub_if_data *sdata,
ieee80211_put_he_cap(skb, sdata, sband,
&assoc_data->link[link_id].conn);
ADD_PRESENT_EXT_ELEM(WLAN_EID_EXT_HE_CAPABILITY);
- ieee80211_put_he_6ghz_cap(skb, sdata, smps_mode);
+ if (sband->band == NL80211_BAND_6GHZ)
+ ieee80211_put_he_6ghz_cap(skb, sdata, smps_mode);
}
/*
@@ -2112,8 +2122,11 @@ ieee80211_link_common_elems_size(struct ieee80211_sub_if_data *sdata,
sizeof(struct ieee80211_he_mcs_nss_supp) +
IEEE80211_HE_PPE_THRES_MAX_LEN;
- if (sband->band == NL80211_BAND_6GHZ)
+ if (sband->band == NL80211_BAND_6GHZ) {
size += 2 + 1 + sizeof(struct ieee80211_he_6ghz_capa);
+ /* reg connection */
+ size += 4;
+ }
size += 2 + 1 + sizeof(struct ieee80211_eht_cap_elem) +
sizeof(struct ieee80211_eht_mcs_nss_supp) +
@@ -2187,11 +2200,7 @@ static int ieee80211_send_assoc(struct ieee80211_sub_if_data *sdata)
2 + /* ext capa & op */
2; /* EML capa */
- /*
- * The capability elements were already considered above;
- * note this over-estimates a bit because there's no
- * STA profile for the assoc link.
- */
+ /* The capability elements were already considered above */
size += (n_links - 1) *
(1 + 1 + /* subelement ID/length */
2 + /* STA control */
@@ -5729,7 +5738,7 @@ static u8 ieee80211_max_rx_chains(struct ieee80211_link_data *link,
he_cap_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY,
ies->data, ies->len);
- if (!he_cap_elem || he_cap_elem->datalen < sizeof(*he_cap))
+ if (!he_cap_elem || he_cap_elem->datalen < sizeof(*he_cap) + 1)
return chains;
/* skip one byte ext_tag_id */
@@ -6356,6 +6365,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
};
u8 ap_mld_addr[ETH_ALEN] __aligned(2);
unsigned int link_id;
+ u16 max_aid = IEEE80211_MAX_AID;
lockdep_assert_wiphy(sdata->local->hw.wiphy);
@@ -6382,10 +6392,12 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
reassoc = ieee80211_is_reassoc_resp(mgmt->frame_control);
capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info);
status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code);
- if (assoc_data->s1g)
+ if (assoc_data->s1g) {
elem_start = mgmt->u.s1g_assoc_resp.variable;
- else
+ max_aid = IEEE80211_MAX_SUPPORTED_S1G_AID;
+ } else {
elem_start = mgmt->u.assoc_resp.variable;
+ }
/*
* Note: this may not be perfect, AP might misbehave - if
@@ -6409,16 +6421,15 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
if (elems->aid_resp)
aid = le16_to_cpu(elems->aid_resp->aid);
- else if (assoc_data->s1g)
- aid = 0; /* TODO */
else
aid = le16_to_cpu(mgmt->u.assoc_resp.aid);
/*
- * The 5 MSB of the AID field are reserved
- * (802.11-2016 9.4.1.8 AID field)
+	 * The 5 MSBs of the AID field are reserved for a non-S1G STA. For
+ * an S1G STA the 3 MSBs are reserved.
+ * (802.11-2016 9.4.1.8 AID field).
*/
- aid &= 0x7ff;
+ aid &= assoc_data->s1g ? 0x1fff : 0x7ff;
sdata_info(sdata,
"RX %sssocResp from %pM (capab=0x%x status=%d aid=%d)\n",
@@ -6455,7 +6466,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
event.u.mlme.reason = status_code;
drv_event_callback(sdata->local, sdata, &event);
} else {
- if (aid == 0 || aid > IEEE80211_MAX_AID) {
+ if (aid == 0 || aid > max_aid) {
sdata_info(sdata,
"invalid AID value %d (out of range), turn off PS\n",
aid);
@@ -6493,6 +6504,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata,
}
sdata->vif.cfg.aid = aid;
+ sdata->vif.cfg.s1g = assoc_data->s1g;
if (!ieee80211_assoc_success(sdata, mgmt, elems,
elem_start, elem_len)) {
@@ -7289,6 +7301,38 @@ static bool ieee80211_mgd_ssid_mismatch(struct ieee80211_sub_if_data *sdata,
return memcmp(elems->ssid, cfg->ssid, cfg->ssid_len);
}
+static bool
+ieee80211_rx_beacon_freq_valid(struct ieee80211_local *local,
+ struct ieee80211_mgmt *mgmt,
+ struct ieee80211_rx_status *rx_status,
+ struct ieee80211_chanctx_conf *chanctx)
+{
+ u32 pri_2mhz_khz;
+ struct ieee80211_channel *s1g_sibling_1mhz;
+ u32 pri_khz = ieee80211_channel_to_khz(chanctx->def.chan);
+ u32 rx_khz = ieee80211_rx_status_to_khz(rx_status);
+
+ if (rx_khz == pri_khz)
+ return true;
+
+ if (!chanctx->def.s1g_primary_2mhz)
+ return false;
+
+ /*
+ * If we have an S1G interface with a 2MHz primary, beacons are
+ * sent on the center frequency of the 2MHz primary. Find the sibling
+ * 1MHz channel and calculate the 2MHz primary center frequency.
+ */
+ s1g_sibling_1mhz = cfg80211_s1g_get_primary_sibling(local->hw.wiphy,
+ &chanctx->def);
+ if (!s1g_sibling_1mhz)
+ return false;
+
+ pri_2mhz_khz =
+ (pri_khz + ieee80211_channel_to_khz(s1g_sibling_1mhz)) / 2;
+ return rx_khz == pri_2mhz_khz;
+}
+
static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
struct ieee80211_hdr *hdr, size_t len,
struct ieee80211_rx_status *rx_status)
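
To make the midpoint computation in ieee80211_rx_beacon_freq_valid() concrete, with invented example frequencies: a 1 MHz primary at 902500 kHz whose sibling sits at 903500 kHz gives a 2 MHz primary centered at (902500 + 903500) / 2 = 903000 kHz, so a beacon received at 903000 kHz is accepted. A minimal sketch:

/* Worked example of the 2 MHz primary center computation above. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t pri_khz = 902500;	/* 1 MHz primary (assumed) */
	uint32_t sibling_khz = 903500;	/* sibling 1 MHz channel (assumed) */
	uint32_t pri_2mhz_khz = (pri_khz + sibling_khz) / 2;

	printf("2 MHz primary center: %u kHz\n", pri_2mhz_khz);	/* 903000 */
	return 0;
}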
@@ -7343,8 +7387,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
return;
}
- if (ieee80211_rx_status_to_khz(rx_status) !=
- ieee80211_channel_to_khz(chanctx_conf->def.chan)) {
+ if (!ieee80211_rx_beacon_freq_valid(local, mgmt, rx_status,
+ chanctx_conf)) {
rcu_read_unlock();
return;
}
@@ -7440,7 +7484,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link,
ncrc = elems->crc;
if (ieee80211_hw_check(&local->hw, PS_NULLFUNC_STACK) &&
- ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid)) {
+ ieee80211_check_tim(elems->tim, elems->tim_len, vif_cfg->aid,
+ vif_cfg->s1g)) {
if (local->hw.conf.dynamic_ps_timeout > 0) {
if (local->hw.conf.flags & IEEE80211_CONF_PS) {
local->hw.conf.flags &= ~IEEE80211_CONF_PS;
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index 13df6321634d..ae82533e3c02 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -8,7 +8,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2009 Johannes Berg <johannes@sipsolutions.net>
- * Copyright (C) 2019, 2022-2024 Intel Corporation
+ * Copyright (C) 2019, 2022-2025 Intel Corporation
*/
#include <linux/export.h>
#include <net/mac80211.h>
@@ -897,6 +897,7 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
need_offchan = true;
break;
case NL80211_IFTYPE_NAN:
+ break;
default:
return -EOPNOTSUPP;
}
@@ -910,6 +911,8 @@ int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
/* Check if the operating channel is the requested channel */
if (!params->chan && mlo_sta) {
need_offchan = false;
+ } else if (sdata->vif.type == NL80211_IFTYPE_NAN) {
+ /* Frames can be sent during NAN schedule */
} else if (!need_offchan) {
struct ieee80211_chanctx_conf *chanctx_conf = NULL;
int i;
diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c
index 3cb2ad6d0b28..e441f8541603 100644
--- a/net/mac80211/rate.c
+++ b/net/mac80211/rate.c
@@ -4,7 +4,7 @@
* Copyright 2005-2006, Devicescape Software, Inc.
* Copyright (c) 2006 Jiri Benc <jbenc@suse.cz>
* Copyright 2017 Intel Deutschland GmbH
- * Copyright (C) 2019, 2022-2024 Intel Corporation
+ * Copyright (C) 2019, 2022-2025 Intel Corporation
*/
#include <linux/kernel.h>
@@ -98,6 +98,9 @@ void rate_control_tx_status(struct ieee80211_local *local,
if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL))
return;
+ if (st->info->band >= NUM_NL80211_BANDS)
+ return;
+
sband = local->hw.wiphy->bands[st->info->band];
spin_lock_bh(&sta->rate_ctrl_lock);
@@ -419,6 +422,9 @@ static bool rate_control_send_low(struct ieee80211_sta *pubsta,
int mcast_rate;
bool use_basicrate = false;
+ if (!sband)
+ return false;
+
if (!pubsta || rc_no_data_or_no_ack_use_min(txrc)) {
__rate_control_send_low(txrc->hw, sband, pubsta, info,
txrc->rate_idx_mask);
@@ -898,6 +904,9 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif,
return;
sdata = vif_to_sdata(vif);
+ if (info->band >= NUM_NL80211_BANDS)
+ return;
+
sband = sdata->local->hw.wiphy->bands[info->band];
if (ieee80211_is_tx_data(skb))
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 4d4ff4d4917a..6af43dfefdd6 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4502,8 +4502,16 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
(ieee80211_is_auth(hdr->frame_control) &&
ether_addr_equal(sdata->vif.addr, hdr->addr1));
case NL80211_IFTYPE_NAN:
- /* Currently no frames on NAN interface are allowed */
- return false;
+ /* Accept only frames that are addressed to the NAN cluster
+ * (based on the Cluster ID). From these frames, accept only
+ * action frames or authentication frames that are addressed to
+ * the local NAN interface.
+ */
+ return memcmp(sdata->wdev.u.nan.cluster_id,
+ hdr->addr3, ETH_ALEN) == 0 &&
+ (ieee80211_is_public_action(hdr, skb->len) ||
+ (ieee80211_is_auth(hdr->frame_control) &&
+ ether_addr_equal(sdata->vif.addr, hdr->addr1)));
default:
break;
}
@@ -5230,12 +5238,20 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
}
rx.sdata = prev_sta->sdata;
+ if (!status->link_valid && prev_sta->sta.mlo) {
+ struct link_sta_info *link_sta;
+
+ link_sta = link_sta_info_get_bss(rx.sdata,
+ hdr->addr2);
+ if (!link_sta)
+ continue;
+
+ link_id = link_sta->link_id;
+ }
+
if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id))
goto out;
- if (!status->link_valid && prev_sta->sta.mlo)
- continue;
-
ieee80211_prepare_and_rx_handle(&rx, skb, false);
prev_sta = sta;
@@ -5243,10 +5259,18 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw,
if (prev_sta) {
rx.sdata = prev_sta->sdata;
- if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id))
- goto out;
+ if (!status->link_valid && prev_sta->sta.mlo) {
+ struct link_sta_info *link_sta;
+
+ link_sta = link_sta_info_get_bss(rx.sdata,
+ hdr->addr2);
+ if (!link_sta)
+ goto out;
- if (!status->link_valid && prev_sta->sta.mlo)
+ link_id = link_sta->link_id;
+ }
+
+ if (!ieee80211_rx_data_set_sta(&rx, prev_sta, link_id))
goto out;
if (ieee80211_prepare_and_rx_handle(&rx, skb, true))
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index dbf98aa4cd67..bb9563f50e7b 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -996,15 +996,15 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local,
local->scan_chandef.freq1_offset = chan->freq_offset;
local->scan_chandef.center_freq2 = 0;
- /* For scanning on the S1G band, detect the channel width according to
- * the channel being scanned.
- */
+ /* For S1G, only scan the 1MHz primaries. */
if (chan->band == NL80211_BAND_S1GHZ) {
- local->scan_chandef.width = ieee80211_s1g_channel_width(chan);
+ local->scan_chandef.width = NL80211_CHAN_WIDTH_1;
+ local->scan_chandef.s1g_primary_2mhz = false;
goto set_channel;
}
- /* If scanning on oper channel, use whatever channel-type
+ /*
+ * If scanning on oper channel, use whatever channel-type
* is currently in use.
*/
if (chan == local->hw.conf.chandef.chan)
@@ -1213,7 +1213,8 @@ int ieee80211_request_ibss_scan(struct ieee80211_sub_if_data *sdata,
for (band = 0; band < NUM_NL80211_BANDS; band++) {
if (!local->hw.wiphy->bands[band] ||
- band == NL80211_BAND_6GHZ)
+ band == NL80211_BAND_6GHZ ||
+ band == NL80211_BAND_S1GHZ)
continue;
max_n = local->hw.wiphy->bands[band]->n_channels;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index 8c550aab9bdc..f4d3b67fda06 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2637,13 +2637,11 @@ static void sta_set_tidstats(struct sta_info *sta,
if (link_id < 0 && tid < IEEE80211_NUM_TIDS) {
spin_lock_bh(&local->fq.lock);
- rcu_read_lock();
tidstats->filled |= BIT(NL80211_TID_STATS_TXQ_STATS);
ieee80211_fill_txq_stats(&tidstats->txq_stats,
to_txq_info(sta->sta.txq[tid]));
- rcu_read_unlock();
spin_unlock_bh(&local->fq.lock);
}
}
@@ -2962,7 +2960,7 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
struct ieee80211_sub_if_data *sdata = sta->sdata;
struct ieee80211_local *local = sdata->local;
u32 thr = 0;
- int i, ac, cpu, link_id;
+ int i, ac, cpu;
struct ieee80211_sta_rx_stats *last_rxstats;
last_rxstats = sta_get_last_rx_stats(sta, -1);
@@ -3204,18 +3202,23 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo,
if (sta->sta.valid_links) {
struct ieee80211_link_data *link;
struct link_sta_info *link_sta;
+ int link_id;
ether_addr_copy(sinfo->mld_addr, sta->addr);
+
+ /* assign valid links first for iteration */
+ sinfo->valid_links = sta->sta.valid_links;
+
for_each_valid_link(sinfo, link_id) {
link_sta = wiphy_dereference(sta->local->hw.wiphy,
sta->link[link_id]);
link = wiphy_dereference(sdata->local->hw.wiphy,
sdata->link[link_id]);
- if (!link_sta || !sinfo->links[link_id] || !link)
+ if (!link_sta || !sinfo->links[link_id] || !link) {
+ sinfo->valid_links &= ~BIT(link_id);
continue;
-
- sinfo->valid_links = sta->sta.valid_links;
+ }
sta_set_link_sinfo(sta, sinfo->links[link_id],
link, tidstats);
}
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index a362254b310c..4b38aa0e902a 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -5,7 +5,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2008-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright 2021-2024 Intel Corporation
+ * Copyright 2021-2025 Intel Corporation
*/
#include <linux/export.h>
@@ -572,6 +572,7 @@ static struct ieee80211_sub_if_data *
ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb)
{
struct ieee80211_sub_if_data *sdata;
+ struct ieee80211_hdr *hdr = (void *)skb->data;
if (skb->dev) {
list_for_each_entry_rcu(sdata, &local->interfaces, list) {
@@ -585,7 +586,23 @@ ieee80211_sdata_from_skb(struct ieee80211_local *local, struct sk_buff *skb)
return NULL;
}
- return rcu_dereference(local->p2p_sdata);
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_P2P_DEVICE:
+ break;
+ case NL80211_IFTYPE_NAN:
+ if (sdata->u.nan.started)
+ break;
+ fallthrough;
+ default:
+ continue;
+ }
+
+ if (ether_addr_equal(sdata->vif.addr, hdr->addr2))
+ return sdata;
+ }
+
+ return NULL;
}
static void ieee80211_report_ack_skb(struct ieee80211_local *local,
diff --git a/net/mac80211/tests/Makefile b/net/mac80211/tests/Makefile
index 3b0c08356fc5..3c7f874e5c41 100644
--- a/net/mac80211/tests/Makefile
+++ b/net/mac80211/tests/Makefile
@@ -1,3 +1,3 @@
-mac80211-tests-y += module.o util.o elems.o mfp.o tpe.o chan-mode.o
+mac80211-tests-y += module.o util.o elems.o mfp.o tpe.o chan-mode.o s1g_tim.o
obj-$(CONFIG_MAC80211_KUNIT_TEST) += mac80211-tests.o
diff --git a/net/mac80211/tests/s1g_tim.c b/net/mac80211/tests/s1g_tim.c
new file mode 100644
index 000000000000..642fa4ece89f
--- /dev/null
+++ b/net/mac80211/tests/s1g_tim.c
@@ -0,0 +1,356 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * KUnit tests for S1G TIM PVB decoding. This test suite covers
+ * IEEE80211-2024 Annex L figures 8, 9, 10, 12, 13, 14. ADE mode
+ * is not covered as it is an optional encoding format and is not
+ * currently supported by mac80211.
+ *
+ * Copyright (C) 2025 Morse Micro
+ */
+#include <linux/ieee80211.h>
+#include <kunit/test.h>
+#include <kunit/test-bug.h>
+
+#define MAX_AID 128
+
+#define BC(enc_mode, inverse, blk_off) \
+ ((((blk_off) & 0x1f) << 3) | ((inverse) ? BIT(2) : 0) | \
+ ((enc_mode) & 0x3))
+
+static void byte_to_bitstr(u8 v, char *out)
+{
+ for (int b = 7; b >= 0; b--)
+ *out++ = (v & BIT(b)) ? '1' : '0';
+ *out = '\0';
+}
+
+static void dump_tim_bits(struct kunit *test,
+ const struct ieee80211_tim_ie *tim, u8 tim_len)
+{
+ const u8 *ptr = tim->virtual_map;
+ const u8 *end = (const u8 *)tim + tim_len;
+ unsigned int oct = 1;
+ unsigned int blk = 0;
+ char bits[9];
+
+ while (ptr < end) {
+ u8 ctrl = *ptr++;
+ u8 mode = ctrl & 0x03;
+ bool inverse = ctrl & BIT(2);
+ u8 blk_off = ctrl >> 3;
+
+ kunit_info(
+ test, "Block %u (ENC=%s, blk_off=%u, inverse=%u)", blk,
+ (mode == IEEE80211_S1G_TIM_ENC_MODE_BLOCK) ? "BLOCK" :
+ (mode == IEEE80211_S1G_TIM_ENC_MODE_SINGLE) ? "SINGLE" :
+ "OLB",
+ blk_off, inverse);
+
+ byte_to_bitstr(ctrl, bits);
+ kunit_info(test, " octet %2u (ctrl) : %s (0x%02x)", oct,
+ bits, ctrl);
+ ++oct;
+
+ switch (mode) {
+ case IEEE80211_S1G_TIM_ENC_MODE_BLOCK: {
+ u8 blkmap = *ptr++;
+
+ byte_to_bitstr(blkmap, bits);
+ kunit_info(test, " octet %2u (blk-map) : %s (0x%02x)",
+ oct, bits, blkmap);
+ ++oct;
+
+ for (u8 sb = 0; sb < 8; sb++) {
+ if (!(blkmap & BIT(sb)))
+ continue;
+ u8 sub = *ptr++;
+
+ byte_to_bitstr(sub, bits);
+ kunit_info(
+ test,
+ " octet %2u (SB %2u) : %s (0x%02x)",
+ oct, sb, bits, sub);
+ ++oct;
+ }
+ break;
+ }
+ case IEEE80211_S1G_TIM_ENC_MODE_SINGLE: {
+ u8 single = *ptr++;
+
+ byte_to_bitstr(single, bits);
+ kunit_info(test, " octet %2u (single) : %s (0x%02x)",
+ oct, bits, single);
+ ++oct;
+ break;
+ }
+ case IEEE80211_S1G_TIM_ENC_MODE_OLB: {
+ u8 len = *ptr++;
+
+ byte_to_bitstr(len, bits);
+ kunit_info(test, " octet %2u (len=%2u) : %s (0x%02x)",
+ oct, len, bits, len);
+ ++oct;
+
+ for (u8 i = 0; i < len && ptr < end; i++) {
+ u8 sub = *ptr++;
+
+ byte_to_bitstr(sub, bits);
+ kunit_info(
+ test,
+ " octet %2u (SB %2u) : %s (0x%02x)",
+ oct, i, bits, sub);
+ ++oct;
+ }
+ break;
+ }
+ default:
+ kunit_info(test, " ** unknown encoding 0x%x **", mode);
+ return;
+ }
+ blk++;
+ }
+}
+
+static void tim_push(u8 **p, u8 v)
+{
+ *(*p)++ = v;
+}
+
+static void tim_begin(struct ieee80211_tim_ie *tim, u8 **p)
+{
+ tim->dtim_count = 0;
+ tim->dtim_period = 1;
+ tim->bitmap_ctrl = 0;
+ *p = tim->virtual_map;
+}
+
+static u8 tim_end(struct ieee80211_tim_ie *tim, u8 *tail)
+{
+ return tail - (u8 *)tim;
+}
+
+static void pvb_add_block_bitmap(u8 **p, u8 blk_off, bool inverse, u8 blk_bmap,
+ const u8 *subblocks)
+{
+ u8 enc = IEEE80211_S1G_TIM_ENC_MODE_BLOCK;
+ u8 n = hweight8(blk_bmap);
+
+ tim_push(p, BC(enc, inverse, blk_off));
+ tim_push(p, blk_bmap);
+
+ for (u8 i = 0; i < n; i++)
+ tim_push(p, subblocks[i]);
+}
+
+static void pvb_add_single_aid(u8 **p, u8 blk_off, bool inverse, u8 single6)
+{
+ u8 enc = IEEE80211_S1G_TIM_ENC_MODE_SINGLE;
+
+ tim_push(p, BC(enc, inverse, blk_off));
+ tim_push(p, single6 & GENMASK(5, 0));
+}
+
+static void pvb_add_olb(u8 **p, u8 blk_off, bool inverse, const u8 *subblocks,
+ u8 len)
+{
+ u8 enc = IEEE80211_S1G_TIM_ENC_MODE_OLB;
+
+ tim_push(p, BC(enc, inverse, blk_off));
+ tim_push(p, len);
+ for (u8 i = 0; i < len; i++)
+ tim_push(p, subblocks[i]);
+}
+
+static void check_all_aids(struct kunit *test,
+ const struct ieee80211_tim_ie *tim, u8 tim_len,
+ const unsigned long *expected)
+{
+ for (u16 aid = 1; aid <= MAX_AID; aid++) {
+ bool want = test_bit(aid, expected);
+ bool got = ieee80211_s1g_check_tim(tim, tim_len, aid);
+
+ KUNIT_ASSERT_EQ_MSG(test, got, want,
+ "AID %u mismatch (got=%d want=%d)", aid,
+ got, want);
+ }
+}
+
+static void fill_bitmap(unsigned long *bm, const u16 *list, size_t n)
+{
+ size_t i;
+
+ bitmap_zero(bm, MAX_AID + 1);
+ for (i = 0; i < n; i++)
+ __set_bit(list[i], bm);
+}
+
+static void fill_bitmap_inverse(unsigned long *bm, u16 max_aid,
+ const u16 *except, size_t n_except)
+{
+ bitmap_zero(bm, MAX_AID + 1);
+ for (u16 aid = 1; aid <= max_aid; aid++)
+ __set_bit(aid, bm);
+
+ for (size_t i = 0; i < n_except; i++)
+ if (except[i] <= max_aid)
+ __clear_bit(except[i], bm);
+}
+
+static void s1g_tim_block_test(struct kunit *test)
+{
+ u8 buf[256] = {};
+ struct ieee80211_tim_ie *tim = (void *)buf;
+ u8 *p, tim_len;
+ static const u8 subblocks[] = {
+ 0x42, /* SB m=0: AIDs 1,6 */
+ 0xA0, /* SB m=2: AIDs 21,23 */
+ };
+ u8 blk_bmap = 0x05; /* bits 0 and 2 set */
+ bool inverse = false;
+ static const u16 set_list[] = { 1, 6, 21, 23 };
+ DECLARE_BITMAP(exp, MAX_AID + 1);
+
+ tim_begin(tim, &p);
+ pvb_add_block_bitmap(&p, 0, inverse, blk_bmap, subblocks);
+ tim_len = tim_end(tim, p);
+
+ fill_bitmap(exp, set_list, ARRAY_SIZE(set_list));
+
+ dump_tim_bits(test, tim, tim_len);
+ check_all_aids(test, tim, tim_len, exp);
+}
+
+static void s1g_tim_single_test(struct kunit *test)
+{
+ u8 buf[256] = {};
+ struct ieee80211_tim_ie *tim = (void *)buf;
+ u8 *p, tim_len;
+ bool inverse = false;
+ u8 blk_off = 0;
+ u8 single6 = 0x1f; /* 31 */
+ static const u16 set_list[] = { 31 };
+ DECLARE_BITMAP(exp, MAX_AID + 1);
+
+ tim_begin(tim, &p);
+ pvb_add_single_aid(&p, blk_off, inverse, single6);
+ tim_len = tim_end(tim, p);
+
+ fill_bitmap(exp, set_list, ARRAY_SIZE(set_list));
+
+ dump_tim_bits(test, tim, tim_len);
+ check_all_aids(test, tim, tim_len, exp);
+}
+
+static void s1g_tim_olb_test(struct kunit *test)
+{
+ u8 buf[256] = {};
+ struct ieee80211_tim_ie *tim = (void *)buf;
+ u8 *p, tim_len;
+ bool inverse = false;
+ u8 blk_off = 0;
+ static const u16 set_list[] = { 1, 6, 13, 15, 17, 22, 29, 31, 33,
+ 38, 45, 47, 49, 54, 61, 63, 65, 70 };
+ static const u8 subblocks[] = { 0x42, 0xA0, 0x42, 0xA0, 0x42,
+ 0xA0, 0x42, 0xA0, 0x42 };
+ u8 len = ARRAY_SIZE(subblocks);
+ DECLARE_BITMAP(exp, MAX_AID + 1);
+
+ tim_begin(tim, &p);
+ pvb_add_olb(&p, blk_off, inverse, subblocks, len);
+ tim_len = tim_end(tim, p);
+
+ fill_bitmap(exp, set_list, ARRAY_SIZE(set_list));
+
+ dump_tim_bits(test, tim, tim_len);
+ check_all_aids(test, tim, tim_len, exp);
+}
+
+static void s1g_tim_inverse_block_test(struct kunit *test)
+{
+ u8 buf[256] = {};
+ struct ieee80211_tim_ie *tim = (void *)buf;
+ u8 *p, tim_len;
+ /* Same sub-block content as Figure L-8, but inverse = true */
+ static const u8 subblocks[] = {
+ 0x42, /* SB m=0: AIDs 1,6 */
+ 0xA0, /* SB m=2: AIDs 21,23 */
+ };
+ u8 blk_bmap = 0x05;
+ bool inverse = true;
+ /* All AIDs except 1,6,21,23 are set */
+ static const u16 except[] = { 1, 6, 21, 23 };
+ DECLARE_BITMAP(exp, MAX_AID + 1);
+
+ tim_begin(tim, &p);
+ pvb_add_block_bitmap(&p, 0, inverse, blk_bmap, subblocks);
+ tim_len = tim_end(tim, p);
+
+ fill_bitmap_inverse(exp, 63, except, ARRAY_SIZE(except));
+
+ dump_tim_bits(test, tim, tim_len);
+ check_all_aids(test, tim, tim_len, exp);
+}
+
+static void s1g_tim_inverse_single_test(struct kunit *test)
+{
+ u8 buf[256] = {};
+ struct ieee80211_tim_ie *tim = (void *)buf;
+ u8 *p, tim_len;
+ bool inverse = true;
+ u8 blk_off = 0;
+ u8 single6 = 0x1f; /* 31 */
+ /* All AIDs except 31 are set */
+ static const u16 except[] = { 31 };
+ DECLARE_BITMAP(exp, MAX_AID + 1);
+
+ tim_begin(tim, &p);
+ pvb_add_single_aid(&p, blk_off, inverse, single6);
+ tim_len = tim_end(tim, p);
+
+ fill_bitmap_inverse(exp, 63, except, ARRAY_SIZE(except));
+
+ dump_tim_bits(test, tim, tim_len);
+ check_all_aids(test, tim, tim_len, exp);
+}
+
+static void s1g_tim_inverse_olb_test(struct kunit *test)
+{
+ u8 buf[256] = {};
+ struct ieee80211_tim_ie *tim = (void *)buf;
+ u8 *p, tim_len;
+ bool inverse = true;
+ u8 blk_off = 0, len;
+ /* All AIDs except the list below are set */
+ static const u16 except[] = { 1, 6, 13, 15, 17, 22, 29, 31, 33,
+ 38, 45, 47, 49, 54, 61, 63, 65, 70 };
+ static const u8 subblocks[] = { 0x42, 0xA0, 0x42, 0xA0, 0x42,
+ 0xA0, 0x42, 0xA0, 0x42 };
+ len = ARRAY_SIZE(subblocks);
+ DECLARE_BITMAP(exp, MAX_AID + 1);
+
+ tim_begin(tim, &p);
+ pvb_add_olb(&p, blk_off, inverse, subblocks, len);
+ tim_len = tim_end(tim, p);
+
+ fill_bitmap_inverse(exp, 127, except, ARRAY_SIZE(except));
+
+ dump_tim_bits(test, tim, tim_len);
+ check_all_aids(test, tim, tim_len, exp);
+}
+
+static struct kunit_case s1g_tim_test_cases[] = {
+ KUNIT_CASE(s1g_tim_block_test),
+ KUNIT_CASE(s1g_tim_single_test),
+ KUNIT_CASE(s1g_tim_olb_test),
+ KUNIT_CASE(s1g_tim_inverse_block_test),
+ KUNIT_CASE(s1g_tim_inverse_single_test),
+ KUNIT_CASE(s1g_tim_inverse_olb_test),
+ {}
+};
+
+static struct kunit_suite s1g_tim = {
+ .name = "mac80211-s1g-tim",
+ .test_cases = s1g_tim_test_cases,
+};
+
+kunit_test_suite(s1g_tim);
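
A suite registered this way should be runnable with the standard KUnit tooling once CONFIG_MAC80211_KUNIT_TEST is enabled; something along the lines of the following, where the --kunitconfig path is an assumption about the tree layout (if no .kunitconfig exists there, enable the option by hand instead):

./tools/testing/kunit/kunit.py run --kunitconfig=net/mac80211/tests \
        'mac80211-s1g-tim.*'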
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index 00671ae45b2f..e7b141c55f7a 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -59,6 +59,9 @@ static __le16 ieee80211_duration(struct ieee80211_tx_data *tx,
if (WARN_ON_ONCE(tx->rate.idx < 0))
return 0;
+ if (info->band >= NUM_NL80211_BANDS)
+ return 0;
+
sband = local->hw.wiphy->bands[info->band];
txrate = &sband->bitrates[tx->rate.idx];
@@ -683,7 +686,10 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx)
memset(&txrc, 0, sizeof(txrc));
- sband = tx->local->hw.wiphy->bands[info->band];
+ if (info->band < NUM_NL80211_BANDS)
+ sband = tx->local->hw.wiphy->bands[info->band];
+ else
+ return TX_CONTINUE;
len = min_t(u32, tx->skb->len + FCS_LEN,
tx->local->hw.wiphy->frag_threshold);
@@ -1814,7 +1820,7 @@ static int invoke_tx_handlers_early(struct ieee80211_tx_data *tx)
txh_done:
if (unlikely(res == TX_DROP)) {
- I802_DEBUG_INC(tx->local->tx_handlers_drop);
+ tx->sdata->tx_handlers_drop++;
if (tx->skb)
ieee80211_free_txskb(&tx->local->hw, tx->skb);
else
@@ -1858,7 +1864,7 @@ static int invoke_tx_handlers_late(struct ieee80211_tx_data *tx)
txh_done:
if (unlikely(res == TX_DROP)) {
- I802_DEBUG_INC(tx->local->tx_handlers_drop);
+ tx->sdata->tx_handlers_drop++;
if (tx->skb)
ieee80211_free_txskb(&tx->local->hw, tx->skb);
else
@@ -1942,6 +1948,7 @@ static bool ieee80211_tx(struct ieee80211_sub_if_data *sdata,
if (unlikely(res_prepare == TX_DROP)) {
ieee80211_free_txskb(&local->hw, skb);
+ tx.sdata->tx_handlers_drop++;
return true;
} else if (unlikely(res_prepare == TX_QUEUED)) {
return true;
@@ -3728,8 +3735,10 @@ void __ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata,
r = ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs,
fast_tx->key, &tx);
tx.skb = NULL;
- if (r == TX_DROP)
+ if (r == TX_DROP) {
+ tx.sdata->tx_handlers_drop++;
goto free;
+ }
if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
sdata = container_of(sdata->bss,
@@ -4882,15 +4891,114 @@ void ieee80211_tx_pending(struct tasklet_struct *t)
/* functions for drivers to get certain frames */
+static void ieee80211_beacon_add_tim_pvb(struct ps_data *ps,
+ struct sk_buff *skb,
+ bool mcast_traffic)
+{
+ int i, n1 = 0, n2;
+
+ /*
+ * Find largest even number N1 so that bits numbered 1 through
+ * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits
+ * (N2 + 1) x 8 through 2007 are 0.
+ */
+ for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) {
+ if (ps->tim[i]) {
+ n1 = i & 0xfe;
+ break;
+ }
+ }
+ n2 = n1;
+ for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) {
+ if (ps->tim[i]) {
+ n2 = i;
+ break;
+ }
+ }
+
+ /* Bitmap control */
+ skb_put_u8(skb, n1 | mcast_traffic);
+ /* Part Virt Bitmap */
+ skb_put_data(skb, ps->tim + n1, n2 - n1 + 1);
+}
+
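
A worked example of the N1/N2 trimming performed by the helper above: if only AIDs 200 and 210 are buffered, octets 25 (AIDs 200-207) and 26 (AIDs 208-215) are non-zero, so N1 = 24 (25 rounded down to even), N2 = 26, and three octets of partial virtual bitmap are emitted. A self-contained userspace restatement of the same loops:

/* Userspace restatement of the N1/N2 trimming; 251 = 2008 bits / 8. */
#include <stdint.h>
#include <stdio.h>

#define TIM_LEN 251

int main(void)
{
	uint8_t tim[TIM_LEN] = { 0 };
	int i, n1 = 0, n2;

	tim[200 / 8] |= 1 << (200 % 8);	/* AID 200 -> octet 25 */
	tim[210 / 8] |= 1 << (210 % 8);	/* AID 210 -> octet 26 */

	for (i = 0; i < TIM_LEN; i++) {
		if (tim[i]) {
			n1 = i & 0xfe;	/* largest even number <= i */
			break;
		}
	}
	n2 = n1;
	for (i = TIM_LEN - 1; i >= n1; i--) {
		if (tim[i]) {
			n2 = i;
			break;
		}
	}

	printf("n1=%d n2=%d octets=%d\n", n1, n2, n2 - n1 + 1); /* 24 26 3 */
	return 0;
}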
+/*
+ * mac80211 currently supports encoding using block bitmap mode, non
+ * inversed. The current implementation supports up to 1600 AIDs.
+ *
+ * Block bitmap encoding breaks down the AID bitmap into blocks of 64
+ * AIDs. Each block contains between 0 and 8 subblocks. Each subblock
+ * describes 8 AIDs and the presence of a subblock is determined by
+ * the block bitmap.
+ */
+static void ieee80211_s1g_beacon_add_tim_pvb(struct ps_data *ps,
+ struct sk_buff *skb,
+ bool mcast_traffic)
+{
+ int blk;
+
+ /*
+	 * Emit a bitmap control block with a page slice number of 31 and
+	 * a page index of 0, which indicates, as per IEEE80211-2024
+	 * 9.4.2.5.1, that the entire page (2048 bits) indicated by the
+	 * page index is encoded in the partial virtual bitmap.
+ */
+ skb_put_u8(skb, mcast_traffic | (31 << 1));
+
+	/* Emit an encoded block for each block with non-zero subblocks */
+ for (blk = 0; blk < IEEE80211_MAX_SUPPORTED_S1G_TIM_BLOCKS; blk++) {
+ u8 blk_bmap = 0;
+ int sblk;
+
+ for (sblk = 0; sblk < 8; sblk++) {
+ int sblk_idx = blk * 8 + sblk;
+
+ /*
+			 * If the current subblock is non-zero, mark it as
+			 * present in the block bitmap for the current block.
+ */
+ if (ps->tim[sblk_idx])
+ blk_bmap |= BIT(sblk);
+ }
+
+		/* Skip the block if it contains no non-zero subblocks */
+ if (!blk_bmap)
+ continue;
+
+ /*
+ * Emit a block control byte for the current encoded block
+ * with an encoding mode of block bitmap (0x0), not inverse
+ * (0x0) and the current block offset (5 bits)
+ */
+ skb_put_u8(skb, blk << 3);
+
+ /*
+		 * Emit the block bitmap for the current encoded block,
+		 * indicating which subblocks are present.
+ */
+ skb_put_u8(skb, blk_bmap);
+
+ /* Emit the present subblocks */
+ for (sblk = 0; sblk < 8; sblk++) {
+ int sblk_idx = blk * 8 + sblk;
+
+ if (!(blk_bmap & BIT(sblk)))
+ continue;
+
+ skb_put_u8(skb, ps->tim[sblk_idx]);
+ }
+ }
+}
+
static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
struct ieee80211_link_data *link,
struct ps_data *ps, struct sk_buff *skb,
bool is_template)
{
- u8 *pos, *tim;
- int aid0 = 0;
- int i, have_bits = 0, n1, n2;
+ struct element *tim;
+ bool mcast_traffic = false, have_bits = false;
struct ieee80211_bss_conf *link_conf = link->conf;
+ bool s1g = ieee80211_get_link_sband(link)->band == NL80211_BAND_S1GHZ;
/* Generate bitmap for TIM only if there are any STAs in power save
* mode. */
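
Under the block-bitmap encoding described above, an AID decomposes as block = AID / 64, subblock = (AID / 8) % 8, bit = AID % 8; note that block * 8 + subblock is simply the octet index into ps->tim. An illustrative sketch with an arbitrary example AID:

/* Decomposition of an S1G AID into block / subblock / bit coordinates. */
#include <stdio.h>

int main(void)
{
	unsigned int aid = 150;		/* example AID (assumed) */
	unsigned int blk = aid / 64;	   /* 64 AIDs per block      -> 2 */
	unsigned int sblk = (aid / 8) % 8; /* 8 subblocks per block  -> 2 */
	unsigned int bit = aid % 8;	   /* 8 AIDs per subblock    -> 6 */

	printf("AID %u -> block %u, subblock %u, bit %u (octet %u)\n",
	       aid, blk, sblk, bit, blk * 8 + sblk);
	return 0;
}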
@@ -4898,7 +5006,8 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
/* in the hope that this is faster than
* checking byte-for-byte */
have_bits = !bitmap_empty((unsigned long *)ps->tim,
- IEEE80211_MAX_AID+1);
+ IEEE80211_MAX_AID + 1);
+
if (!is_template) {
if (ps->dtim_count == 0)
ps->dtim_count = link_conf->dtim_period - 1;
@@ -4906,51 +5015,39 @@ static void __ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
ps->dtim_count--;
}
- tim = pos = skb_put(skb, 5);
- *pos++ = WLAN_EID_TIM;
- *pos++ = 3;
- *pos++ = ps->dtim_count;
- *pos++ = link_conf->dtim_period;
+ /* Length is set after parsing the AID bitmap */
+ tim = skb_put(skb, sizeof(struct element));
+ tim->id = WLAN_EID_TIM;
+ skb_put_u8(skb, ps->dtim_count);
+ skb_put_u8(skb, link_conf->dtim_period);
if (ps->dtim_count == 0 && !skb_queue_empty(&ps->bc_buf))
- aid0 = 1;
+ mcast_traffic = true;
- ps->dtim_bc_mc = aid0 == 1;
+ ps->dtim_bc_mc = mcast_traffic;
if (have_bits) {
- /* Find largest even number N1 so that bits numbered 1 through
- * (N1 x 8) - 1 in the bitmap are 0 and number N2 so that bits
- * (N2 + 1) x 8 through 2007 are 0. */
- n1 = 0;
- for (i = 0; i < IEEE80211_MAX_TIM_LEN; i++) {
- if (ps->tim[i]) {
- n1 = i & 0xfe;
- break;
- }
- }
- n2 = n1;
- for (i = IEEE80211_MAX_TIM_LEN - 1; i >= n1; i--) {
- if (ps->tim[i]) {
- n2 = i;
- break;
- }
- }
-
- /* Bitmap control */
- *pos++ = n1 | aid0;
- /* Part Virt Bitmap */
- skb_put_data(skb, ps->tim + n1, n2 - n1 + 1);
-
- tim[1] = n2 - n1 + 4;
+ if (s1g)
+ ieee80211_s1g_beacon_add_tim_pvb(ps, skb,
+ mcast_traffic);
+ else
+ ieee80211_beacon_add_tim_pvb(ps, skb, mcast_traffic);
} else {
- *pos++ = aid0; /* Bitmap control */
-
- if (ieee80211_get_link_sband(link)->band != NL80211_BAND_S1GHZ) {
- tim[1] = 4;
+ /*
+ * If there is no buffered unicast traffic for an S1G
+		 * interface, we can omit the bitmap control. This is in
+		 * contrast to other PHY types, which include the bitmap
+		 * control and PVB even when there is no buffered traffic.
+ */
+ if (!s1g) {
+ /* Bitmap control */
+ skb_put_u8(skb, mcast_traffic);
/* Part Virt Bitmap */
skb_put_u8(skb, 0);
}
}
+
+ tim->datalen = skb_tail_pointer(skb) - tim->data;
}
static int ieee80211_beacon_add_tim(struct ieee80211_sub_if_data *sdata,
@@ -6195,7 +6292,9 @@ void ieee80211_tx_skb_tid(struct ieee80211_sub_if_data *sdata,
enum nl80211_band band;
rcu_read_lock();
- if (!ieee80211_vif_is_mld(&sdata->vif)) {
+ if (sdata->vif.type == NL80211_IFTYPE_NAN) {
+ band = NUM_NL80211_BANDS;
+ } else if (!ieee80211_vif_is_mld(&sdata->vif)) {
WARN_ON(link_id >= 0);
chanctx_conf =
rcu_dereference(sdata->vif.bss_conf.chanctx_conf);
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 32f1bc5908c5..c9931537f9d2 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -1756,7 +1756,6 @@ int ieee80211_reconfig(struct ieee80211_local *local)
bool sched_scan_stopped = false;
bool suspended = local->suspended;
bool in_reconfig = false;
- u32 rts_threshold;
lockdep_assert_wiphy(local->hw.wiphy);
@@ -1832,7 +1831,9 @@ int ieee80211_reconfig(struct ieee80211_local *local)
/* setup RTS threshold */
if (hw->wiphy->n_radio > 0) {
for (i = 0; i < hw->wiphy->n_radio; i++) {
- rts_threshold = hw->wiphy->radio_cfg[i].rts_threshold;
+ u32 rts_threshold =
+ hw->wiphy->radio_cfg[i].rts_threshold;
+
drv_set_rts_threshold(local, i, rts_threshold);
}
} else {
@@ -3198,10 +3199,11 @@ bool ieee80211_chandef_he_6ghz_oper(struct ieee80211_local *local,
return true;
}
-bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper,
+bool ieee80211_chandef_s1g_oper(struct ieee80211_local *local,
+ const struct ieee80211_s1g_oper_ie *oper,
struct cfg80211_chan_def *chandef)
{
- u32 oper_freq;
+ u32 oper_khz, pri_1mhz_khz, pri_2mhz_khz;
if (!oper)
return false;
@@ -3226,12 +3228,36 @@ bool ieee80211_chandef_s1g_oper(const struct ieee80211_s1g_oper_ie *oper,
return false;
}
- oper_freq = ieee80211_channel_to_freq_khz(oper->oper_ch,
- NL80211_BAND_S1GHZ);
- chandef->center_freq1 = KHZ_TO_MHZ(oper_freq);
- chandef->freq1_offset = oper_freq % 1000;
+ chandef->s1g_primary_2mhz = false;
- return true;
+ switch (u8_get_bits(oper->ch_width, S1G_OPER_CH_WIDTH_PRIMARY)) {
+ case IEEE80211_S1G_PRI_CHANWIDTH_1MHZ:
+ pri_1mhz_khz = ieee80211_channel_to_freq_khz(
+ oper->primary_ch, NL80211_BAND_S1GHZ);
+ break;
+ case IEEE80211_S1G_PRI_CHANWIDTH_2MHZ:
+ chandef->s1g_primary_2mhz = true;
+ pri_2mhz_khz = ieee80211_channel_to_freq_khz(
+ oper->primary_ch, NL80211_BAND_S1GHZ);
+
+ if (u8_get_bits(oper->ch_width, S1G_OPER_CH_PRIMARY_LOCATION) ==
+ S1G_2M_PRIMARY_LOCATION_LOWER)
+ pri_1mhz_khz = pri_2mhz_khz - 500;
+ else
+ pri_1mhz_khz = pri_2mhz_khz + 500;
+ break;
+ default:
+ return false;
+ }
+
+ oper_khz = ieee80211_channel_to_freq_khz(oper->oper_ch,
+ NL80211_BAND_S1GHZ);
+ chandef->center_freq1 = KHZ_TO_MHZ(oper_khz);
+ chandef->freq1_offset = oper_khz % 1000;
+ chandef->chan =
+ ieee80211_get_channel_khz(local->hw.wiphy, pri_1mhz_khz);
+
+ return chandef->chan;
}
int ieee80211_put_srates_elem(struct sk_buff *skb,
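
Putting numbers on the new 2 MHz primary handling in ieee80211_chandef_s1g_oper(): if the operation element advertises a 2 MHz primary centered at, say, 903000 kHz, the 1 MHz primary used for the chandef lands at 902500 kHz for S1G_2M_PRIMARY_LOCATION_LOWER and at 903500 kHz otherwise. A minimal sketch with assumed frequencies:

/* Worked numbers for deriving the 1 MHz primary from a 2 MHz primary. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t pri_1mhz_khz(uint32_t pri_2mhz_khz, bool lower)
{
	return lower ? pri_2mhz_khz - 500 : pri_2mhz_khz + 500;
}

int main(void)
{
	printf("%u %u\n", pri_1mhz_khz(903000, true),	/* 902500 */
	       pri_1mhz_khz(903000, false));		/* 903500 */
	return 0;
}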
@@ -4022,16 +4048,13 @@ bool ieee80211_is_radio_idx_in_scan_req(struct wiphy *wiphy,
for (i = 0; i < scan_req->n_channels; i++) {
chan = scan_req->channels[i];
chan_radio_idx = cfg80211_get_radio_idx_by_chan(wiphy, chan);
- /*
- * The chan_radio_idx should be valid since it's taken from a
- * valid scan request.
- * However, if chan_radio_idx is unexpectedly invalid (negative),
- * we take a conservative approach and assume the scan request
- * might use the specified radio_idx. Hence, return true.
- */
- if (WARN_ON(chan_radio_idx < 0))
- return true;
+		/* The radio index either matched successfully or an error
+		 * occurred. For example, if radio-level information is
+		 * missing, both lookups return the same error value. This
+		 * typically implies a single-radio setup, in which case
+		 * the operation should not be allowed.
+		 */
if (chan_radio_idx == radio_idx)
return true;
}
@@ -4514,3 +4537,11 @@ void ieee80211_clear_tpe(struct ieee80211_parsed_tpe *tpe)
sizeof(tpe->psd_reg_client[i].power));
}
}
+
+bool ieee80211_vif_nan_started(struct ieee80211_vif *vif)
+{
+ struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif);
+
+ return vif->type == NL80211_IFTYPE_NAN && sdata->u.nan.started;
+}
+EXPORT_SYMBOL_GPL(ieee80211_vif_nan_started);
diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c
index 685524800d70..b99ba14f39d2 100644
--- a/net/mctp/af_mctp.c
+++ b/net/mctp/af_mctp.c
@@ -256,7 +256,7 @@ static int mctp_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
skb_reserve(skb, hlen);
- /* set type as fist byte in payload */
+ /* set type as first byte in payload */
*(u8 *)skb_put(skb, 1) = addr->smctp_type;
rc = memcpy_from_msg((void *)skb_put(skb, len), msg, len);
diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c
index b08ba959ac4f..31948e18d97d 100644
--- a/net/mptcp/crypto.c
+++ b/net/mptcp/crypto.c
@@ -22,7 +22,6 @@
#include <linux/kernel.h>
#include <crypto/sha2.h>
-#include <linux/unaligned.h>
#include "protocol.h"
@@ -43,39 +42,9 @@ void mptcp_crypto_key_sha(u64 key, u32 *token, u64 *idsn)
void mptcp_crypto_hmac_sha(u64 key1, u64 key2, u8 *msg, int len, void *hmac)
{
- u8 input[SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE];
- u8 key1be[8];
- u8 key2be[8];
- int i;
+ __be64 key[2] = { cpu_to_be64(key1), cpu_to_be64(key2) };
- if (WARN_ON_ONCE(len > SHA256_DIGEST_SIZE))
- len = SHA256_DIGEST_SIZE;
-
- put_unaligned_be64(key1, key1be);
- put_unaligned_be64(key2, key2be);
-
- /* Generate key xored with ipad */
- memset(input, 0x36, SHA256_BLOCK_SIZE);
- for (i = 0; i < 8; i++)
- input[i] ^= key1be[i];
- for (i = 0; i < 8; i++)
- input[i + 8] ^= key2be[i];
-
- memcpy(&input[SHA256_BLOCK_SIZE], msg, len);
-
- /* emit sha256(K1 || msg) on the second input block, so we can
- * reuse 'input' for the last hashing
- */
- sha256(input, SHA256_BLOCK_SIZE + len, &input[SHA256_BLOCK_SIZE]);
-
- /* Prepare second part of hmac */
- memset(input, 0x5C, SHA256_BLOCK_SIZE);
- for (i = 0; i < 8; i++)
- input[i] ^= key1be[i];
- for (i = 0; i < 8; i++)
- input[i + 8] ^= key2be[i];
-
- sha256(input, SHA256_BLOCK_SIZE + SHA256_DIGEST_SIZE, hmac);
+ hmac_sha256_usingrawkey((const u8 *)key, sizeof(key), msg, len, hmac);
}
#if IS_MODULE(CONFIG_MPTCP_KUNIT_TEST)
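
The rewrite collapses the open-coded ipad/opad construction into a single call: the HMAC key is simply the two 64-bit keys concatenated in big-endian order. A userspace sketch of the same computation, using OpenSSL purely for illustration (build with -lcrypto; the kernel path is hmac_sha256_usingrawkey()):

/* HMAC-SHA256 over msg, keyed with be64(key1) || be64(key2). */
#include <stdint.h>
#include <stdio.h>
#include <openssl/evp.h>
#include <openssl/hmac.h>

static void put_be64(uint8_t *p, uint64_t v)
{
	for (int i = 7; i >= 0; i--, v >>= 8)
		p[i] = (uint8_t)v;
}

int main(void)
{
	uint8_t key[16], out[32];
	unsigned int outlen = sizeof(out);
	const uint8_t msg[] = "example";

	put_be64(key, 0x0102030405060708ULL);		/* key1, big-endian */
	put_be64(key + 8, 0x1112131415161718ULL);	/* key2, big-endian */

	HMAC(EVP_sha256(), key, sizeof(key), msg, sizeof(msg) - 1,
	     out, &outlen);

	for (unsigned int i = 0; i < outlen; i++)
		printf("%02x", out[i]);
	printf("\n");
	return 0;
}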
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index fed40dae5583..d96130e49942 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -501,10 +501,15 @@ void mptcp_active_enable(struct sock *sk)
struct mptcp_pernet *pernet = mptcp_get_pernet(sock_net(sk));
if (atomic_read(&pernet->active_disable_times)) {
- struct dst_entry *dst = sk_dst_get(sk);
+ struct net_device *dev;
+ struct dst_entry *dst;
- if (dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ dev = dst ? dst_dev_rcu(dst) : NULL;
+ if (!(dev && (dev->flags & IFF_LOOPBACK)))
atomic_set(&pernet->active_disable_times, 0);
+ rcu_read_unlock();
}
}
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index cf879c188ca2..6003e47c770a 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -85,7 +85,6 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK),
SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK),
SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED),
- SNMP_MIB_SENTINEL
};
/* mptcp_mib_alloc - allocate percpu mib counters
@@ -108,22 +107,23 @@ bool mptcp_mib_alloc(struct net *net)
void mptcp_seq_show(struct seq_file *seq)
{
- unsigned long sum[ARRAY_SIZE(mptcp_snmp_list) - 1];
+ unsigned long sum[ARRAY_SIZE(mptcp_snmp_list)];
+ const int cnt = ARRAY_SIZE(mptcp_snmp_list);
struct net *net = seq->private;
int i;
seq_puts(seq, "MPTcpExt:");
- for (i = 0; mptcp_snmp_list[i].name; i++)
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %s", mptcp_snmp_list[i].name);
seq_puts(seq, "\nMPTcpExt:");
memset(sum, 0, sizeof(sum));
if (net->mib.mptcp_statistics)
- snmp_get_cpu_field_batch(sum, mptcp_snmp_list,
- net->mib.mptcp_statistics);
+ snmp_get_cpu_field_batch_cnt(sum, mptcp_snmp_list, cnt,
+ net->mib.mptcp_statistics);
- for (i = 0; mptcp_snmp_list[i].name; i++)
+ for (i = 0; i < cnt; i++)
seq_printf(seq, " %lu", sum[i]);
seq_putc(seq, '\n');
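The table loses its SNMP_MIB_SENTINEL terminator: the sum array no longer needs a "- 1" correction, loops are bounded by ARRAY_SIZE(), and the batch read takes an explicit count via snmp_get_cpu_field_batch_cnt(). Counted iteration over a static table in stand-alone form (names taken from the entries above, values invented):

/* Iterate a static table by element count instead of walking
 * until a NULL-name sentinel entry.
 */
#include <stdio.h>

#define ARRAY_SIZE(a)	(sizeof(a) / sizeof((a)[0]))

struct mib { const char *name; unsigned long value; };

static const struct mib table[] = {
	{ "DssFallback",           3 },
	{ "SimultConnectFallback", 1 },
	{ "FallbackFailed",        0 },	/* no sentinel entry needed */
};

int main(void)
{
	const int cnt = ARRAY_SIZE(table);

	for (int i = 0; i < cnt; i++)
		printf("%s %lu\n", table[i].name, table[i].value);
	return 0;
}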
diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c
index 0566dd793810..ac974299de71 100644
--- a/net/mptcp/mptcp_diag.c
+++ b/net/mptcp/mptcp_diag.c
@@ -15,9 +15,9 @@
static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *req,
- struct nlattr *bc, bool net_admin)
+ bool net_admin)
{
- if (!inet_diag_bc_sk(bc, sk))
+ if (!inet_diag_bc_sk(cb->data, sk))
return 0;
return inet_sk_diag_fill(sk, inet_csk(sk), skb, cb, req, NLM_F_MULTI,
@@ -76,9 +76,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba
const struct inet_diag_req_v2 *r,
bool net_admin)
{
- struct inet_diag_dump_data *cb_data = cb->data;
struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
- struct nlattr *bc = cb_data->inet_diag_nla_bc;
struct net *net = sock_net(skb->sk);
struct inet_hashinfo *hinfo;
int i;
@@ -121,7 +119,7 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba
if (!refcount_inc_not_zero(&sk->sk_refcnt))
goto next_listen;
- ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin);
+ ret = sk_diag_dump(sk, skb, cb, r, net_admin);
sock_put(sk);
@@ -154,15 +152,10 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
bool net_admin = netlink_net_capable(cb->skb, CAP_NET_ADMIN);
struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx;
struct net *net = sock_net(skb->sk);
- struct inet_diag_dump_data *cb_data;
struct mptcp_sock *msk;
- struct nlattr *bc;
BUILD_BUG_ON(sizeof(cb->ctx) < sizeof(*diag_ctx));
- cb_data = cb->data;
- bc = cb_data->inet_diag_nla_bc;
-
while ((msk = mptcp_token_iter_next(net, &diag_ctx->s_slot,
&diag_ctx->s_num)) != NULL) {
struct inet_sock *inet = (struct inet_sock *)msk;
@@ -181,7 +174,7 @@ static void mptcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
r->id.idiag_dport)
goto next;
- ret = sk_diag_dump(sk, skb, cb, r, bc, net_admin);
+ ret = sk_diag_dump(sk, skb, cb, r, net_admin);
next:
sock_put(sk);
if (ret < 0) {
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 136a380602ca..2ff1b9499568 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -268,6 +268,27 @@ int mptcp_pm_mp_prio_send_ack(struct mptcp_sock *msk,
return -EINVAL;
}
+static unsigned int mptcp_adjust_add_addr_timeout(struct mptcp_sock *msk)
+{
+ const struct net *net = sock_net((struct sock *)msk);
+ unsigned int rto = mptcp_get_add_addr_timeout(net);
+ struct mptcp_subflow_context *subflow;
+ unsigned int max = 0;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+ struct inet_connection_sock *icsk = inet_csk(ssk);
+
+ if (icsk->icsk_rto > max)
+ max = icsk->icsk_rto;
+ }
+
+ if (max && max < rto)
+ rto = max;
+
+ return rto;
+}
+
static void mptcp_pm_add_timer(struct timer_list *timer)
{
struct mptcp_pm_add_entry *entry = timer_container_of(entry, timer,
@@ -292,7 +313,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
goto out;
}
- timeout = mptcp_get_add_addr_timeout(sock_net(sk));
+ timeout = mptcp_adjust_add_addr_timeout(msk);
if (!timeout)
goto out;
@@ -307,7 +328,7 @@ static void mptcp_pm_add_timer(struct timer_list *timer)
if (entry->retrans_times < ADD_ADDR_RETRANS_MAX)
sk_reset_timer(sk, timer,
- jiffies + timeout);
+ jiffies + (timeout << entry->retrans_times));
spin_unlock_bh(&msk->pm.lock);
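Two changes combine in the timer path: mptcp_adjust_add_addr_timeout() clamps the configured ADD_ADDR timeout down to the largest subflow RTO, and the rearm shifts that timeout left by the retransmit count, turning a fixed interval into exponential backoff. Worked numbers, assuming a 1000 ms base:

/* ADD_ADDR retransmit delay per the new rearm:
 * delay = timeout << retrans_times (the kernel works in jiffies).
 */
#include <stdio.h>

int main(void)
{
	unsigned int timeout = 1000;	/* ms, assumed base value */

	for (unsigned int retrans = 0; retrans < 4; retrans++)
		printf("attempt %u: %u ms\n", retrans, timeout << retrans);
	/* prints 1000, 2000, 4000, 8000 */
	return 0;
}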
@@ -348,7 +369,6 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
{
struct mptcp_pm_add_entry *add_entry = NULL;
struct sock *sk = (struct sock *)msk;
- struct net *net = sock_net(sk);
unsigned int timeout;
lockdep_assert_held(&msk->pm.lock);
@@ -374,7 +394,7 @@ bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
reset_timer:
- timeout = mptcp_get_add_addr_timeout(net);
+ timeout = mptcp_adjust_add_addr_timeout(msk);
if (timeout)
sk_reset_timer(sk, &add_entry->add_timer, jiffies + timeout);
@@ -463,23 +483,24 @@ void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
{
struct mptcp_pm_data *pm = &msk->pm;
- unsigned int subflows_max;
+ unsigned int limit_extra_subflows;
int ret = 0;
if (mptcp_pm_is_userspace(msk)) {
if (mptcp_userspace_pm_active(msk)) {
spin_lock_bh(&pm->lock);
- pm->subflows++;
+ pm->extra_subflows++;
spin_unlock_bh(&pm->lock);
return true;
}
return false;
}
- subflows_max = mptcp_pm_get_subflows_max(msk);
+ limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
- pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk, pm->subflows,
- subflows_max, READ_ONCE(pm->accept_subflow));
+ pr_debug("msk=%p subflows=%d max=%d allow=%d\n", msk,
+ pm->extra_subflows, limit_extra_subflows,
+ READ_ONCE(pm->accept_subflow));
/* try to avoid acquiring the lock below */
if (!READ_ONCE(pm->accept_subflow))
@@ -487,8 +508,8 @@ bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
spin_lock_bh(&pm->lock);
if (READ_ONCE(pm->accept_subflow)) {
- ret = pm->subflows < subflows_max;
- if (ret && ++pm->subflows == subflows_max)
+ ret = pm->extra_subflows < limit_extra_subflows;
+ if (ret && ++pm->extra_subflows == limit_extra_subflows)
WRITE_ONCE(pm->accept_subflow, false);
}
spin_unlock_bh(&pm->lock);
@@ -574,7 +595,7 @@ void mptcp_pm_subflow_check_next(struct mptcp_sock *msk,
if (mptcp_pm_is_userspace(msk)) {
if (update_subflows) {
spin_lock_bh(&pm->lock);
- pm->subflows--;
+ pm->extra_subflows--;
spin_unlock_bh(&pm->lock);
}
return;
@@ -617,9 +638,12 @@ void mptcp_pm_add_addr_received(const struct sock *ssk,
} else {
__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
}
- /* id0 should not have a different address */
+ /* - id0 should not have a different address
+ * - special case for C-flag: linked to fill_local_addresses_vec()
+ */
} else if ((addr->id == 0 && !mptcp_pm_is_init_remote_addr(msk, addr)) ||
- (addr->id > 0 && !READ_ONCE(pm->accept_addr))) {
+ (addr->id > 0 && !READ_ONCE(pm->accept_addr) &&
+ !mptcp_pm_add_addr_c_flag_case(msk))) {
mptcp_pm_announce_addr(msk, addr, true);
mptcp_pm_add_addr_send_ack(msk);
} else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
@@ -1005,17 +1029,17 @@ void mptcp_pm_data_reset(struct mptcp_sock *msk)
WRITE_ONCE(pm->pm_type, pm_type);
if (pm_type == MPTCP_PM_TYPE_KERNEL) {
- bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk);
+ bool subflows_allowed = !!mptcp_pm_get_limit_extra_subflows(msk);
/* pm->work_pending must only be set to 'true' when
* pm->pm_type is set to MPTCP_PM_TYPE_KERNEL
*/
WRITE_ONCE(pm->work_pending,
- (!!mptcp_pm_get_local_addr_max(msk) &&
+ (!!mptcp_pm_get_endp_subflow_max(msk) &&
subflows_allowed) ||
- !!mptcp_pm_get_add_addr_signal_max(msk));
+ !!mptcp_pm_get_endp_signal_max(msk));
WRITE_ONCE(pm->accept_addr,
- !!mptcp_pm_get_add_addr_accept_max(msk) &&
+ !!mptcp_pm_get_limit_add_addr_accepted(msk) &&
subflows_allowed);
WRITE_ONCE(pm->accept_subflow, subflows_allowed);
diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c
index 667803d72b64..e0f44dc232aa 100644
--- a/net/mptcp/pm_kernel.c
+++ b/net/mptcp/pm_kernel.c
@@ -17,14 +17,14 @@ static int pm_nl_pernet_id;
struct pm_nl_pernet {
/* protects pernet updates */
spinlock_t lock;
- struct list_head local_addr_list;
- unsigned int addrs;
- unsigned int stale_loss_cnt;
- unsigned int add_addr_signal_max;
- unsigned int add_addr_accept_max;
- unsigned int local_addr_max;
- unsigned int subflows_max;
- unsigned int next_id;
+ struct list_head endp_list;
+ u8 endpoints;
+ u8 endp_signal_max;
+ u8 endp_subflow_max;
+ u8 endp_laminar_max;
+ u8 limit_add_addr_accepted;
+ u8 limit_extra_subflows;
+ u8 next_id;
DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
};
@@ -46,37 +46,45 @@ static struct pm_nl_pernet *genl_info_pm_nl(struct genl_info *info)
return pm_nl_get_pernet(genl_info_net(info));
}
-unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk)
+u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk)
{
const struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
- return READ_ONCE(pernet->add_addr_signal_max);
+ return READ_ONCE(pernet->endp_signal_max);
}
-EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_signal_max);
+EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_signal_max);
-unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk)
+u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
- return READ_ONCE(pernet->add_addr_accept_max);
+ return READ_ONCE(pernet->endp_subflow_max);
}
-EXPORT_SYMBOL_GPL(mptcp_pm_get_add_addr_accept_max);
+EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max);
-unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk)
+u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
- return READ_ONCE(pernet->subflows_max);
+ return READ_ONCE(pernet->endp_laminar_max);
}
-EXPORT_SYMBOL_GPL(mptcp_pm_get_subflows_max);
+EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_laminar_max);
-unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk)
+u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
- return READ_ONCE(pernet->local_addr_max);
+ return READ_ONCE(pernet->limit_add_addr_accepted);
}
-EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max);
+EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_add_addr_accepted);
+
+u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+ return READ_ONCE(pernet->limit_extra_subflows);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_limit_extra_subflows);
static bool lookup_subflow_by_daddr(const struct list_head *list,
const struct mptcp_addr_info *daddr)
@@ -110,7 +118,7 @@ select_local_address(const struct pm_nl_pernet *pernet,
msk_owned_by_me(msk);
rcu_read_lock();
- list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW))
continue;
@@ -141,7 +149,7 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk,
* Note: removal from the local address list during the msk life-cycle
* can lead to additional addresses not being announced.
*/
- list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap))
continue;
@@ -159,80 +167,96 @@ select_signal_address(struct pm_nl_pernet *pernet, const struct mptcp_sock *msk,
return found;
}
-/* Fill all the remote addresses into the array addrs[],
- * and return the array size.
- */
-static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk,
- struct mptcp_addr_info *local,
- bool fullmesh,
- struct mptcp_addr_info *addrs)
+static unsigned int
+fill_remote_addr(struct mptcp_sock *msk, struct mptcp_addr_info *local,
+ struct mptcp_addr_info *addrs)
{
bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
- struct sock *sk = (struct sock *)msk, *ssk;
- struct mptcp_subflow_context *subflow;
struct mptcp_addr_info remote = { 0 };
- unsigned int subflows_max;
- int i = 0;
+ struct sock *sk = (struct sock *)msk;
+
+ if (deny_id0)
+ return 0;
- subflows_max = mptcp_pm_get_subflows_max(msk);
mptcp_remote_address((struct sock_common *)sk, &remote);
- /* Non-fullmesh endpoint, fill in the single entry
- * corresponding to the primary MPC subflow remote address
- */
- if (!fullmesh) {
- if (deny_id0)
- return 0;
+ if (!mptcp_pm_addr_families_match(sk, local, &remote))
+ return 0;
- if (!mptcp_pm_addr_families_match(sk, local, &remote))
- return 0;
+ msk->pm.extra_subflows++;
+ *addrs = remote;
- msk->pm.subflows++;
- addrs[i++] = remote;
- } else {
- DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
+ return 1;
+}
- /* Forbid creation of new subflows matching existing
- * ones, possibly already created by incoming ADD_ADDR
- */
- bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
- mptcp_for_each_subflow(msk, subflow)
- if (READ_ONCE(subflow->local_id) == local->id)
- __set_bit(subflow->remote_id, unavail_id);
-
- mptcp_for_each_subflow(msk, subflow) {
- ssk = mptcp_subflow_tcp_sock(subflow);
- mptcp_remote_address((struct sock_common *)ssk, &addrs[i]);
- addrs[i].id = READ_ONCE(subflow->remote_id);
- if (deny_id0 && !addrs[i].id)
- continue;
+static unsigned int
+fill_remote_addresses_fullmesh(struct mptcp_sock *msk,
+ struct mptcp_addr_info *local,
+ struct mptcp_addr_info *addrs)
+{
+ u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
+ bool deny_id0 = READ_ONCE(msk->pm.remote_deny_join_id0);
+ DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
+ struct sock *sk = (struct sock *)msk, *ssk;
+ struct mptcp_subflow_context *subflow;
+ int i = 0;
- if (test_bit(addrs[i].id, unavail_id))
- continue;
+ /* Forbid creation of new subflows matching existing ones, possibly
+ * already created by incoming ADD_ADDR
+ */
+ bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
+ mptcp_for_each_subflow(msk, subflow)
+ if (READ_ONCE(subflow->local_id) == local->id)
+ __set_bit(subflow->remote_id, unavail_id);
+
+ mptcp_for_each_subflow(msk, subflow) {
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ mptcp_remote_address((struct sock_common *)ssk, &addrs[i]);
+ addrs[i].id = READ_ONCE(subflow->remote_id);
+ if (deny_id0 && !addrs[i].id)
+ continue;
- if (!mptcp_pm_addr_families_match(sk, local, &addrs[i]))
- continue;
+ if (test_bit(addrs[i].id, unavail_id))
+ continue;
- if (msk->pm.subflows < subflows_max) {
- /* forbid creating multiple address towards
- * this id
- */
- __set_bit(addrs[i].id, unavail_id);
- msk->pm.subflows++;
- i++;
- }
- }
+ if (!mptcp_pm_addr_families_match(sk, local, &addrs[i]))
+ continue;
+
+ /* forbid creating multiple address towards this id */
+ __set_bit(addrs[i].id, unavail_id);
+ msk->pm.extra_subflows++;
+ i++;
+
+ if (msk->pm.extra_subflows >= limit_extra_subflows)
+ break;
}
return i;
}
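The factored-out fullmesh helper keeps the bitmap technique of the old code: every remote id that already has a subflow from this local id is marked unavailable before the walk, and each id consumed during the fill is marked as well, so no two subflows are created toward the same id. The same pattern in portable C (ids invented):

/* Duplicate-avoidance bitmap as used by
 * fill_remote_addresses_fullmesh(): pre-mark existing ids, then
 * skip and re-mark while filling.
 */
#include <stdio.h>

#define MAX_ID 255

static unsigned char unavail[(MAX_ID + 1 + 7) / 8];

static void id_set(unsigned int id)  { unavail[id / 8] |= 1u << (id % 8); }
static int  id_test(unsigned int id) { return unavail[id / 8] >> (id % 8) & 1; }

int main(void)
{
	unsigned int existing[] = { 0, 3 };		/* current subflows */
	unsigned int candidates[] = { 3, 5, 5, 7 };	/* ADD_ADDR ids */
	unsigned int filled = 0;

	for (unsigned int i = 0; i < 2; i++)
		id_set(existing[i]);

	for (unsigned int i = 0; i < 4; i++) {
		if (id_test(candidates[i]))
			continue;		/* already used */
		id_set(candidates[i]);		/* forbid a second one */
		filled++;
	}
	printf("created %u new subflows\n", filled);	/* 2: ids 5 and 7 */
	return 0;
}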
+/* Fill all the remote addresses into the array addrs[],
+ * and return the array size.
+ */
+static unsigned int
+fill_remote_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *local,
+ bool fullmesh, struct mptcp_addr_info *addrs)
+{
+ /* Non-fullmesh: fill in the single entry corresponding to the primary
+ * MPC subflow remote address, and return 1, corresponding to 1 entry.
+ */
+ if (!fullmesh)
+ return fill_remote_addr(msk, local, addrs);
+
+ /* Fullmesh endpoint: fill all possible remote addresses */
+ return fill_remote_addresses_fullmesh(msk, local, addrs);
+}
+
static struct mptcp_pm_addr_entry *
__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id)
{
struct mptcp_pm_addr_entry *entry;
- list_for_each_entry_rcu(entry, &pernet->local_addr_list, list,
+ list_for_each_entry_rcu(entry, &pernet->endp_list, list,
lockdep_is_held(&pernet->lock)) {
if (entry->addr.id == id)
return entry;
@@ -245,7 +269,7 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info)
{
struct mptcp_pm_addr_entry *entry;
- list_for_each_entry_rcu(entry, &pernet->local_addr_list, list,
+ list_for_each_entry_rcu(entry, &pernet->endp_list, list,
lockdep_is_held(&pernet->lock)) {
if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port))
return entry;
@@ -253,52 +277,65 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info)
return NULL;
}
-static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
+static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk,
+ const struct mptcp_addr_info *addr)
{
- struct sock *sk = (struct sock *)msk;
- unsigned int add_addr_signal_max;
- bool signal_and_subflow = false;
- unsigned int local_addr_max;
- struct pm_nl_pernet *pernet;
- struct mptcp_pm_local local;
- unsigned int subflows_max;
-
- pernet = pm_nl_get_pernet(sock_net(sk));
+ return msk->mpc_endpoint_id == addr->id ? 0 : addr->id;
+}
- add_addr_signal_max = mptcp_pm_get_add_addr_signal_max(msk);
- local_addr_max = mptcp_pm_get_local_addr_max(msk);
- subflows_max = mptcp_pm_get_subflows_max(msk);
+/* Set mpc_endpoint_id, and send MP_PRIO for ID0 if needed */
+static void mptcp_mpc_endpoint_setup(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct mptcp_pm_addr_entry *entry;
+ struct mptcp_addr_info mpc_addr;
+ struct pm_nl_pernet *pernet;
+ bool backup = false;
/* do lazy endpoint usage accounting for the MPC subflows */
- if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) {
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(msk->first);
- struct mptcp_pm_addr_entry *entry;
- struct mptcp_addr_info mpc_addr;
- bool backup = false;
-
- mptcp_local_address((struct sock_common *)msk->first, &mpc_addr);
- rcu_read_lock();
- entry = __lookup_addr(pernet, &mpc_addr);
- if (entry) {
- __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap);
- msk->mpc_endpoint_id = entry->addr.id;
- backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
- }
- rcu_read_unlock();
+ if (likely(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED)) ||
+ !msk->first)
+ return;
- if (backup)
- mptcp_pm_send_ack(msk, subflow, true, backup);
+ subflow = mptcp_subflow_ctx(msk->first);
+ pernet = pm_nl_get_pernet_from_msk(msk);
- msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED);
+ mptcp_local_address((struct sock_common *)msk->first, &mpc_addr);
+ rcu_read_lock();
+ entry = __lookup_addr(pernet, &mpc_addr);
+ if (entry) {
+ __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap);
+ msk->mpc_endpoint_id = entry->addr.id;
+ backup = !!(entry->flags & MPTCP_PM_ADDR_FLAG_BACKUP);
}
+ rcu_read_unlock();
+
+ /* Send MP_PRIO */
+ if (backup)
+ mptcp_pm_send_ack(msk, subflow, true, backup);
+
+ msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED);
+}
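The setup is now a standalone helper with an early return, so both call sites can invoke it unconditionally: once MPTCP_PM_MPC_ENDPOINT_ACCOUNTED is set, or when there is no first subflow, it is a no-op. The one-shot guard in isolation, as a sketch:

/* One-shot lazy initialization guarded by a status bit, as in
 * mptcp_mpc_endpoint_setup(): the work runs at most once.
 */
#include <stdio.h>

#define ACCOUNTED	(1u << 0)	/* stands in for the PM status bit */

static unsigned int status;

static void endpoint_setup(void)
{
	if (status & ACCOUNTED)
		return;

	printf("one-time endpoint accounting\n");
	status |= ACCOUNTED;
}

int main(void)
{
	endpoint_setup();	/* does the work */
	endpoint_setup();	/* no-op */
	return 0;
}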
+
+static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
+{
+ u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+ u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk);
+ u8 endp_signal_max = mptcp_pm_get_endp_signal_max(msk);
+ struct sock *sk = (struct sock *)msk;
+ bool signal_and_subflow = false;
+ struct mptcp_pm_local local;
+
+ mptcp_mpc_endpoint_setup(msk);
pr_debug("local %d:%d signal %d:%d subflows %d:%d\n",
- msk->pm.local_addr_used, local_addr_max,
- msk->pm.add_addr_signaled, add_addr_signal_max,
- msk->pm.subflows, subflows_max);
+ msk->pm.local_addr_used, endp_subflow_max,
+ msk->pm.add_addr_signaled, endp_signal_max,
+ msk->pm.extra_subflows, limit_extra_subflows);
/* check first for announce */
- if (msk->pm.add_addr_signaled < add_addr_signal_max) {
+ if (msk->pm.add_addr_signaled < endp_signal_max) {
/* due to racing events on both ends we can reach here while
* previous add address is still running: if we invoke now
* mptcp_pm_announce_addr(), that will fail and the
@@ -334,8 +371,8 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
subflow:
/* check if should create a new subflow */
- while (msk->pm.local_addr_used < local_addr_max &&
- msk->pm.subflows < subflows_max) {
+ while (msk->pm.local_addr_used < endp_subflow_max &&
+ msk->pm.extra_subflows < limit_extra_subflows) {
struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
bool fullmesh;
int i, nr;
@@ -377,90 +414,225 @@ static void mptcp_pm_nl_subflow_established(struct mptcp_sock *msk)
mptcp_pm_create_subflow_or_signal_addr(msk);
}
-/* Fill all the local addresses into the array addrs[],
- * and return the array size.
- */
-static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
- struct mptcp_addr_info *remote,
- struct mptcp_pm_local *locals)
+static unsigned int
+fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk,
+ struct mptcp_addr_info *remote,
+ struct mptcp_pm_local *locals,
+ bool c_flag_case)
{
+ u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_addr_entry *entry;
- struct mptcp_addr_info mpc_addr;
- struct pm_nl_pernet *pernet;
- unsigned int subflows_max;
+ struct mptcp_pm_local *local;
int i = 0;
- pernet = pm_nl_get_pernet_from_msk(msk);
- subflows_max = mptcp_pm_get_subflows_max(msk);
-
- mptcp_local_address((struct sock_common *)msk, &mpc_addr);
-
rcu_read_lock();
- list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) {
+ list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
+ bool is_id0;
+
if (!(entry->flags & MPTCP_PM_ADDR_FLAG_FULLMESH))
continue;
if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
continue;
- if (msk->pm.subflows < subflows_max) {
- locals[i].addr = entry->addr;
- locals[i].flags = entry->flags;
- locals[i].ifindex = entry->ifindex;
+ local = &locals[i];
+ local->addr = entry->addr;
+ local->flags = entry->flags;
+ local->ifindex = entry->ifindex;
- /* Special case for ID0: set the correct ID */
- if (mptcp_addresses_equal(&locals[i].addr, &mpc_addr, locals[i].addr.port))
- locals[i].addr.id = 0;
+ is_id0 = local->addr.id == msk->mpc_endpoint_id;
- msk->pm.subflows++;
- i++;
+ if (c_flag_case &&
+ (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) {
+ __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
+
+ if (!is_id0)
+ msk->pm.local_addr_used++;
}
+
+ /* Special case for ID0: set the correct ID */
+ if (is_id0)
+ local->addr.id = 0;
+
+ msk->pm.extra_subflows++;
+ i++;
+
+ if (msk->pm.extra_subflows >= limit_extra_subflows)
+ break;
}
rcu_read_unlock();
- /* If the array is empty, fill in the single
- * 'IPADDRANY' local address
+ return i;
+}
+
+static unsigned int
+fill_local_laminar_endp(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
+ struct mptcp_pm_local *locals)
+{
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+ DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_addr_entry *entry;
+ struct mptcp_pm_local *local;
+ int found = 0;
+
+ /* Forbid creation of new subflows matching existing ones, possibly
+ * already created by 'subflow' endpoints
*/
- if (!i) {
- memset(&locals[i], 0, sizeof(locals[i]));
- locals[i].addr.family =
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- remote->family == AF_INET6 &&
- ipv6_addr_v4mapped(&remote->addr6) ? AF_INET :
-#endif
- remote->family;
+ bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ if ((1 << inet_sk_state_load(ssk)) &
+ (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING |
+ TCPF_CLOSE))
+ continue;
+
+ __set_bit(subflow_get_local_id(subflow), unavail_id);
+ }
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
+ if (!(entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR))
+ continue;
+
+ if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
+ continue;
+
+ if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr),
+ unavail_id))
+ continue;
+
+ local = &locals[0];
+ local->addr = entry->addr;
+ local->flags = entry->flags;
+ local->ifindex = entry->ifindex;
+
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+ __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
+
+ if (local->addr.id != msk->mpc_endpoint_id)
+ msk->pm.local_addr_used++;
+ }
+
+ msk->pm.extra_subflows++;
+ found = 1;
+ break;
+ }
+ rcu_read_unlock();
+
+ return found;
+}
+
+static unsigned int
+fill_local_addresses_vec_c_flag(struct mptcp_sock *msk,
+ struct mptcp_addr_info *remote,
+ struct mptcp_pm_local *locals)
+{
+ u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
+ struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+ u8 endp_subflow_max = mptcp_pm_get_endp_subflow_max(msk);
+ struct sock *sk = (struct sock *)msk;
+ struct mptcp_pm_local *local;
+ int i = 0;
+
+ while (msk->pm.local_addr_used < endp_subflow_max) {
+ local = &locals[i];
- if (!mptcp_pm_addr_families_match(sk, &locals[i].addr, remote))
- return 0;
+ if (!select_local_address(pernet, msk, local))
+ break;
+
+ __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
- msk->pm.subflows++;
+ if (!mptcp_pm_addr_families_match(sk, &local->addr, remote))
+ continue;
+
+ if (local->addr.id == msk->mpc_endpoint_id)
+ continue;
+
+ msk->pm.local_addr_used++;
+ msk->pm.extra_subflows++;
i++;
+
+ if (msk->pm.extra_subflows >= limit_extra_subflows)
+ break;
}
return i;
}
+static unsigned int
+fill_local_address_any(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
+ struct mptcp_pm_local *local)
+{
+ struct sock *sk = (struct sock *)msk;
+
+ memset(local, 0, sizeof(*local));
+ local->addr.family =
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ remote->family == AF_INET6 &&
+ ipv6_addr_v4mapped(&remote->addr6) ? AF_INET :
+#endif
+ remote->family;
+
+ if (!mptcp_pm_addr_families_match(sk, &local->addr, remote))
+ return 0;
+
+ msk->pm.extra_subflows++;
+
+ return 1;
+}
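This helper preserves the old wildcard fallback: announce a single ANY local address whose family follows the peer, demoting a v4-mapped IPv6 peer to AF_INET. The family selection in portable form:

/* Family choice for the wildcard fallback, as in
 * fill_local_address_any(): a v4-mapped IPv6 peer yields AF_INET.
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>

int main(void)
{
	struct in6_addr peer;

	inet_pton(AF_INET6, "::ffff:192.0.2.1", &peer);
	int family = IN6_IS_ADDR_V4MAPPED(&peer) ? AF_INET : AF_INET6;
	printf("local family: %s\n",
	       family == AF_INET ? "AF_INET" : "AF_INET6");
	return 0;
}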
+
+/* Fill all the local addresses into the array addrs[],
+ * and return the array size.
+ */
+static unsigned int
+fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
+ struct mptcp_pm_local *locals)
+{
+ bool c_flag_case = remote->id && mptcp_pm_add_addr_c_flag_case(msk);
+ int i;
+
+ /* If there is at least one MPTCP endpoint with a fullmesh flag */
+ i = fill_local_addresses_vec_fullmesh(msk, remote, locals, c_flag_case);
+ if (i)
+ return i;
+
+ /* If there is at least one MPTCP endpoint with a laminar flag */
+ if (mptcp_pm_get_endp_laminar_max(msk))
+ return fill_local_laminar_endp(msk, remote, locals);
+
+ /* Special case: peer sets the C flag, accept one ADD_ADDR if default
+ * limits are used -- accepting no ADD_ADDR -- and use subflow endpoints
+ */
+ if (c_flag_case)
+ return fill_local_addresses_vec_c_flag(msk, remote, locals);
+
+ /* No special case: fill in the single 'IPADDRANY' local address */
+ return fill_local_address_any(msk, remote, &locals[0]);
+}
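After the refactor the vector fill is a plain dispatcher with a fixed priority order: fullmesh endpoints, then laminar endpoints, then the C-flag special case, then the wildcard fallback. Its shape with the strategies stubbed out and return values invented (note the laminar branch is gated on such endpoints existing, not on its own result):

/* Priority-ordered strategy dispatch mirroring
 * fill_local_addresses_vec().
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned int fill_fullmesh(void)  { return 0; }
static bool have_laminar_endpoints(void) { return false; }
static unsigned int fill_laminar(void)   { return 0; }
static bool c_flag_case(void)            { return true; }
static unsigned int fill_c_flag(void)    { return 2; }
static unsigned int fill_any(void)       { return 1; }

int main(void)
{
	unsigned int n;

	n = fill_fullmesh();
	if (!n) {
		if (have_laminar_endpoints())
			n = fill_laminar();
		else if (c_flag_case())
			n = fill_c_flag();
		else
			n = fill_any();
	}
	printf("filled %u local address(es)\n", n);
	return 0;
}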
+
static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
{
+ u8 limit_add_addr_accepted = mptcp_pm_get_limit_add_addr_accepted(msk);
+ u8 limit_extra_subflows = mptcp_pm_get_limit_extra_subflows(msk);
struct mptcp_pm_local locals[MPTCP_PM_ADDR_MAX];
struct sock *sk = (struct sock *)msk;
- unsigned int add_addr_accept_max;
struct mptcp_addr_info remote;
- unsigned int subflows_max;
bool sf_created = false;
int i, nr;
- add_addr_accept_max = mptcp_pm_get_add_addr_accept_max(msk);
- subflows_max = mptcp_pm_get_subflows_max(msk);
-
pr_debug("accepted %d:%d remote family %d\n",
- msk->pm.add_addr_accepted, add_addr_accept_max,
+ msk->pm.add_addr_accepted, limit_add_addr_accepted,
msk->pm.remote.family);
remote = msk->pm.remote;
mptcp_pm_announce_addr(msk, &remote, true);
mptcp_pm_addr_send_ack(msk);
+ mptcp_mpc_endpoint_setup(msk);
if (lookup_subflow_by_daddr(&msk->conn_list, &remote))
return;
@@ -486,8 +658,8 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
/* add_addr_accepted is not decr for ID 0 */
if (remote.id)
msk->pm.add_addr_accepted++;
- if (msk->pm.add_addr_accepted >= add_addr_accept_max ||
- msk->pm.subflows >= subflows_max)
+ if (msk->pm.add_addr_accepted >= limit_add_addr_accepted ||
+ msk->pm.extra_subflows >= limit_extra_subflows)
WRITE_ONCE(msk->pm.accept_addr, false);
}
}
@@ -495,10 +667,13 @@ static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
void mptcp_pm_nl_rm_addr(struct mptcp_sock *msk, u8 rm_id)
{
if (rm_id && WARN_ON_ONCE(msk->pm.add_addr_accepted == 0)) {
+ u8 limit_add_addr_accepted =
+ mptcp_pm_get_limit_add_addr_accepted(msk);
+
/* Note: if the subflow has been closed before, this
* add_addr_accepted counter will not be decremented.
*/
- if (--msk->pm.add_addr_accepted < mptcp_pm_get_add_addr_accept_max(msk))
+ if (--msk->pm.add_addr_accepted < limit_add_addr_accepted)
WRITE_ONCE(msk->pm.accept_addr, true);
}
}
@@ -523,8 +698,8 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
bool needs_id, bool replace)
{
struct mptcp_pm_addr_entry *cur, *del_entry = NULL;
- unsigned int addr_max;
int ret = -EINVAL;
+ u8 addr_max;
spin_lock_bh(&pernet->lock);
/* to keep the code simple, don't do IDR-like allocation for address ID,
@@ -532,7 +707,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
*/
if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID)
pernet->next_id = 1;
- if (pernet->addrs >= MPTCP_PM_ADDR_MAX) {
+ if (pernet->endpoints >= MPTCP_PM_ADDR_MAX) {
ret = -ERANGE;
goto out;
}
@@ -546,7 +721,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
*/
if (!address_use_port(entry))
entry->addr.port = 0;
- list_for_each_entry(cur, &pernet->local_addr_list, list) {
+ list_for_each_entry(cur, &pernet->endp_list, list) {
if (mptcp_addresses_equal(&cur->addr, &entry->addr,
cur->addr.port || entry->addr.port)) {
/* allow replacing the existing endpoint only if such
@@ -571,7 +746,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
goto out;
}
- pernet->addrs--;
+ pernet->endpoints--;
entry->addr.id = cur->addr.id;
list_del_rcu(&cur->list);
del_entry = cur;
@@ -598,19 +773,23 @@ find_next:
pernet->next_id = entry->addr.id;
if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
- addr_max = pernet->add_addr_signal_max;
- WRITE_ONCE(pernet->add_addr_signal_max, addr_max + 1);
+ addr_max = pernet->endp_signal_max;
+ WRITE_ONCE(pernet->endp_signal_max, addr_max + 1);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
- addr_max = pernet->local_addr_max;
- WRITE_ONCE(pernet->local_addr_max, addr_max + 1);
+ addr_max = pernet->endp_subflow_max;
+ WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1);
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) {
+ addr_max = pernet->endp_laminar_max;
+ WRITE_ONCE(pernet->endp_laminar_max, addr_max + 1);
}
- pernet->addrs++;
+ pernet->endpoints++;
if (!entry->addr.port)
- list_add_tail_rcu(&entry->list, &pernet->local_addr_list);
+ list_add_tail_rcu(&entry->list, &pernet->endp_list);
else
- list_add_rcu(&entry->list, &pernet->local_addr_list);
+ list_add_rcu(&entry->list, &pernet->endp_list);
ret = entry->addr.id;
out:
@@ -845,12 +1024,6 @@ out_free:
return ret;
}
-static u8 mptcp_endp_get_local_id(struct mptcp_sock *msk,
- const struct mptcp_addr_info *addr)
-{
- return msk->mpc_endpoint_id == addr->id ? 0 : addr->id;
-}
-
static bool mptcp_pm_remove_anno_addr(struct mptcp_sock *msk,
const struct mptcp_addr_info *addr,
bool force)
@@ -969,8 +1142,8 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info)
{
struct pm_nl_pernet *pernet = genl_info_pm_nl(info);
struct mptcp_pm_addr_entry addr, *entry;
- unsigned int addr_max;
struct nlattr *attr;
+ u8 addr_max;
int ret;
if (GENL_REQ_ATTR_CHECK(info, MPTCP_PM_ENDPOINT_ADDR))
@@ -997,15 +1170,19 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info)
return -EINVAL;
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL) {
- addr_max = pernet->add_addr_signal_max;
- WRITE_ONCE(pernet->add_addr_signal_max, addr_max - 1);
+ addr_max = pernet->endp_signal_max;
+ WRITE_ONCE(pernet->endp_signal_max, addr_max - 1);
}
if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
- addr_max = pernet->local_addr_max;
- WRITE_ONCE(pernet->local_addr_max, addr_max - 1);
+ addr_max = pernet->endp_subflow_max;
+ WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1);
+ }
+ if (entry->flags & MPTCP_PM_ADDR_FLAG_LAMINAR) {
+ addr_max = pernet->endp_laminar_max;
+ WRITE_ONCE(pernet->endp_laminar_max, addr_max - 1);
}
- pernet->addrs--;
+ pernet->endpoints--;
list_del_rcu(&entry->list);
__clear_bit(entry->addr.id, pernet->id_bitmap);
spin_unlock_bh(&pernet->lock);
@@ -1084,9 +1261,10 @@ static void __flush_addrs(struct list_head *list)
static void __reset_counters(struct pm_nl_pernet *pernet)
{
- WRITE_ONCE(pernet->add_addr_signal_max, 0);
- WRITE_ONCE(pernet->local_addr_max, 0);
- pernet->addrs = 0;
+ WRITE_ONCE(pernet->endp_signal_max, 0);
+ WRITE_ONCE(pernet->endp_subflow_max, 0);
+ WRITE_ONCE(pernet->endp_laminar_max, 0);
+ pernet->endpoints = 0;
}
int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info)
@@ -1095,7 +1273,7 @@ int mptcp_pm_nl_flush_addrs_doit(struct sk_buff *skb, struct genl_info *info)
LIST_HEAD(free_list);
spin_lock_bh(&pernet->lock);
- list_splice_init(&pernet->local_addr_list, &free_list);
+ list_splice_init(&pernet->endp_list, &free_list);
__reset_counters(pernet);
pernet->next_id = 1;
bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
@@ -1181,18 +1359,18 @@ int mptcp_pm_nl_set_limits_doit(struct sk_buff *skb, struct genl_info *info)
int ret;
spin_lock_bh(&pernet->lock);
- rcv_addrs = pernet->add_addr_accept_max;
+ rcv_addrs = pernet->limit_add_addr_accepted;
ret = parse_limit(info, MPTCP_PM_ATTR_RCV_ADD_ADDRS, &rcv_addrs);
if (ret)
goto unlock;
- subflows = pernet->subflows_max;
+ subflows = pernet->limit_extra_subflows;
ret = parse_limit(info, MPTCP_PM_ATTR_SUBFLOWS, &subflows);
if (ret)
goto unlock;
- WRITE_ONCE(pernet->add_addr_accept_max, rcv_addrs);
- WRITE_ONCE(pernet->subflows_max, subflows);
+ WRITE_ONCE(pernet->limit_add_addr_accepted, rcv_addrs);
+ WRITE_ONCE(pernet->limit_extra_subflows, subflows);
unlock:
spin_unlock_bh(&pernet->lock);
@@ -1215,11 +1393,11 @@ int mptcp_pm_nl_get_limits_doit(struct sk_buff *skb, struct genl_info *info)
goto fail;
if (nla_put_u32(msg, MPTCP_PM_ATTR_RCV_ADD_ADDRS,
- READ_ONCE(pernet->add_addr_accept_max)))
+ READ_ONCE(pernet->limit_add_addr_accepted)))
goto fail;
if (nla_put_u32(msg, MPTCP_PM_ATTR_SUBFLOWS,
- READ_ONCE(pernet->subflows_max)))
+ READ_ONCE(pernet->limit_extra_subflows)))
goto fail;
genlmsg_end(msg, reply);
@@ -1328,7 +1506,7 @@ bool mptcp_pm_nl_check_work_pending(struct mptcp_sock *msk)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
- if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) ||
+ if (msk->pm.extra_subflows == mptcp_pm_get_limit_extra_subflows(msk) ||
(find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap,
MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) {
WRITE_ONCE(msk->pm.work_pending, false);
@@ -1360,12 +1538,11 @@ static int __net_init pm_nl_init_net(struct net *net)
{
struct pm_nl_pernet *pernet = pm_nl_get_pernet(net);
- INIT_LIST_HEAD_RCU(&pernet->local_addr_list);
+ INIT_LIST_HEAD_RCU(&pernet->endp_list);
/* Cit. 2 subflows ought to be enough for anybody. */
- pernet->subflows_max = 2;
+ pernet->limit_extra_subflows = 2;
pernet->next_id = 1;
- pernet->stale_loss_cnt = 4;
spin_lock_init(&pernet->lock);
/* No need to initialize other pernet fields, the struct is zeroed at
@@ -1386,7 +1563,7 @@ static void __net_exit pm_nl_exit_net(struct list_head *net_list)
* other modifiers, also netns core already waited for a
* RCU grace period.
*/
- __flush_addrs(&pernet->local_addr_list);
+ __flush_addrs(&pernet->endp_list);
}
}
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index ce7d42d3bd00..d5b383870f79 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -113,7 +113,7 @@ int mptcp_pm_parse_entry(struct nlattr *attr, struct genl_info *info,
return err;
if (tb[MPTCP_PM_ADDR_ATTR_IF_IDX]) {
- u32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
+ s32 val = nla_get_s32(tb[MPTCP_PM_ADDR_ATTR_IF_IDX]);
entry->ifindex = val;
}
@@ -413,8 +413,13 @@ static int mptcp_event_created(struct sk_buff *skb,
if (err)
return err;
- if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, READ_ONCE(msk->pm.server_side)))
- return -EMSGSIZE;
+ if (READ_ONCE(msk->pm.server_side)) {
+ flags |= MPTCP_PM_EV_FLAG_SERVER_SIDE;
+
+ /* Deprecated, and only set when it is the server side */
+ if (nla_put_u8(skb, MPTCP_ATTR_SERVER_SIDE, 1))
+ return -EMSGSIZE;
+ }
if (READ_ONCE(msk->pm.remote_deny_join_id0))
flags |= MPTCP_PM_EV_FLAG_DENY_JOIN_ID0;
diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c
index a715dcbe0146..8cbc1920afb4 100644
--- a/net/mptcp/pm_userspace.c
+++ b/net/mptcp/pm_userspace.c
@@ -419,7 +419,7 @@ int mptcp_pm_nl_subflow_create_doit(struct sk_buff *skb, struct genl_info *info)
if (err)
mptcp_userspace_pm_delete_local_addr(msk, &entry);
else
- msk->pm.subflows++;
+ msk->pm.extra_subflows++;
spin_unlock_bh(&msk->pm.lock);
create_err:
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 5e497a83e967..0292162a14ee 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -12,6 +12,7 @@
#include <linux/sched/signal.h>
#include <linux/atomic.h>
#include <net/aligned_data.h>
+#include <net/rps.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
@@ -137,26 +138,37 @@ struct sock *__mptcp_nmpc_sk(struct mptcp_sock *msk)
static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
{
- sk_drops_add(sk, skb);
+ sk_drops_skbadd(sk, skb);
__kfree_skb(skb);
}
-static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
- struct sk_buff *from)
+static bool __mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+ struct sk_buff *from, bool *fragstolen,
+ int *delta)
{
- bool fragstolen;
- int delta;
+ int limit = READ_ONCE(sk->sk_rcvbuf);
if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) ||
MPTCP_SKB_CB(from)->offset ||
- ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) ||
- !skb_try_coalesce(to, from, &fragstolen, &delta))
+ ((to->len + from->len) > (limit >> 3)) ||
+ !skb_try_coalesce(to, from, fragstolen, delta))
return false;
pr_debug("colesced seq %llx into %llx new len %d new end seq %llx\n",
MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq,
to->len, MPTCP_SKB_CB(from)->end_seq);
MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
+ return true;
+}
+
+static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
+ struct sk_buff *from)
+{
+ bool fragstolen;
+ int delta;
+
+ if (!__mptcp_try_coalesce(sk, to, from, &fragstolen, &delta))
+ return false;
/* note the fwd memory can reach a negative value after accounting
* for the delta, but the later skb free will restore a non
@@ -178,6 +190,35 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
return mptcp_try_coalesce((struct sock *)msk, to, from);
}
+/* "inspired" by tcp_rcvbuf_grow(), main difference:
+ * - mptcp does not maintain a msk-level window clamp
+ * - returns true when the receive buffer is actually updated
+ */
+static bool mptcp_rcvbuf_grow(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ const struct net *net = sock_net(sk);
+ int rcvwin, rcvbuf, cap;
+
+ if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+ (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+ return false;
+
+ rcvwin = msk->rcvq_space.space << 1;
+
+ if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
+ rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
+
+ cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+ rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
+ if (rcvbuf > sk->sk_rcvbuf) {
+ WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+ return true;
+ }
+ return false;
+}
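The helper mirrors tcp_rcvbuf_grow(): double the volume copied in the last measurement window, extend it by any out-of-order extent, convert the window to buffer bytes, clamp to tcp_rmem[2], and only ever grow. Worked numbers; mptcp_space_from_win() is approximated here by a flat 50% overhead factor, which is an invented stand-in:

/* Autotune step in the style of mptcp_rcvbuf_grow(). All sizes
 * and the overhead factor are illustrative assumptions.
 */
#include <stdio.h>

int main(void)
{
	int space = 256 * 1024;		/* copied last window */
	int cap = 6 * 1024 * 1024;	/* tcp_rmem[2], assumed */
	int sk_rcvbuf = 512 * 1024;

	int rcvwin = space << 1;
	int rcvbuf = rcvwin + rcvwin / 2;	/* crude space_from_win() */

	if (rcvbuf > cap)
		rcvbuf = cap;
	if (rcvbuf > sk_rcvbuf)
		printf("grow rcvbuf %d -> %d\n", sk_rcvbuf, rcvbuf);
	else
		printf("keep rcvbuf at %d\n", sk_rcvbuf);
	return 0;
}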
+
/* "inspired" by tcp_data_queue_ofo(), main differences:
* - use mptcp seqs
* - don't cope with sacks
@@ -291,29 +332,16 @@ merge_right:
end:
skb_condense(skb);
skb_set_owner_r(skb, sk);
+ /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
+ if (sk->sk_socket)
+ mptcp_rcvbuf_grow(sk);
}
-static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
- struct sk_buff *skb, unsigned int offset,
- size_t copy_len)
+static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset,
+ int copy_len)
{
- struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
- struct sock *sk = (struct sock *)msk;
- struct sk_buff *tail;
- bool has_rxtstamp;
-
- __skb_unlink(skb, &ssk->sk_receive_queue);
-
- skb_ext_reset(skb);
- skb_orphan(skb);
-
- /* try to fetch required memory from subflow */
- if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
- MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
- goto drop;
- }
-
- has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
+ const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ bool has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
/* the skb map_seq accounts for the skb offset:
* mptcp_subflow_get_mapped_dsn() is based on the current tp->copied_seq
@@ -325,6 +353,24 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp;
MPTCP_SKB_CB(skb)->cant_coalesce = 0;
+ __skb_unlink(skb, &ssk->sk_receive_queue);
+
+ skb_ext_reset(skb);
+ skb_dst_drop(skb);
+}
+
+static bool __mptcp_move_skb(struct sock *sk, struct sk_buff *skb)
+{
+ u64 copy_len = MPTCP_SKB_CB(skb)->end_seq - MPTCP_SKB_CB(skb)->map_seq;
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ struct sk_buff *tail;
+
+ /* try to fetch required memory from subflow */
+ if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
+ MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RCVPRUNED);
+ goto drop;
+ }
+
if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
/* in sequence */
msk->bytes_received += copy_len;
@@ -544,11 +590,10 @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied)
}
}
-static bool mptcp_check_data_fin(struct sock *sk)
+static void mptcp_check_data_fin(struct sock *sk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
u64 rcv_data_fin_seq;
- bool ret = false;
/* Need to ack a DATA_FIN received from a peer while this side
* of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
@@ -587,12 +632,10 @@ static bool mptcp_check_data_fin(struct sock *sk)
break;
}
- ret = true;
if (!__mptcp_check_fallback(msk))
mptcp_send_ack(msk);
mptcp_close_wake_up(sk);
}
- return ret;
}
static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk)
@@ -648,7 +691,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
if (offset < skb->len) {
size_t len = skb->len - offset;
- ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret;
+ mptcp_init_skb(ssk, skb, offset, len);
+ skb_orphan(skb);
+ ret = __mptcp_move_skb(sk, skb) || ret;
seq += len;
if (unlikely(map_remaining < len)) {
@@ -769,12 +814,8 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
moved = __mptcp_move_skbs_from_subflow(msk, ssk);
__mptcp_ofo_queue(msk);
- if (unlikely(ssk->sk_err)) {
- if (!sock_owned_by_user(sk))
- __mptcp_error_report(sk);
- else
- __set_bit(MPTCP_ERROR_REPORT, &msk->cb_flags);
- }
+ if (unlikely(ssk->sk_err))
+ __mptcp_subflow_error_report(sk, ssk);
/* If the moves have caught up with the DATA_FIN sequence number
* it's time to ack the DATA_FIN and change socket state, but
@@ -786,18 +827,10 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
return moved;
}
-static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk)
-{
- if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf))
- WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf);
-}
-
static void __mptcp_data_ready(struct sock *sk, struct sock *ssk)
{
struct mptcp_sock *msk = mptcp_sk(sk);
- __mptcp_rcvbuf_update(sk, ssk);
-
/* Wake-up the reader only for in-sequence data */
if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
sk->sk_data_ready(sk);
@@ -1756,6 +1789,20 @@ static u32 mptcp_send_limit(const struct sock *sk)
return limit - not_sent;
}
+static void mptcp_rps_record_subflows(const struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+
+ if (!rfs_is_needed())
+ return;
+
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+ sock_rps_record_flow(ssk);
+ }
+}
+
static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct mptcp_sock *msk = mptcp_sk(sk);
@@ -1769,6 +1816,8 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
lock_sock(sk);
+ mptcp_rps_record_subflows(msk);
+
if (unlikely(inet_test_bit(DEFER_CONNECT, sk) ||
msg->msg_flags & MSG_FASTOPEN)) {
int copied_syn = 0;
@@ -1929,12 +1978,13 @@ static int __mptcp_recvmsg_mskq(struct sock *sk,
}
if (!(flags & MSG_PEEK)) {
- /* avoid the indirect call, we know the destructor is sock_wfree */
+ /* avoid the indirect call, we know the destructor is sock_rfree */
skb->destructor = NULL;
+ skb->sk = NULL;
atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
sk_mem_uncharge(sk, skb->truesize);
__skb_unlink(skb, &sk->sk_receive_queue);
- __kfree_skb(skb);
+ skb_attempt_defer_free(skb);
msk->bytes_consumed += count;
}
@@ -1999,48 +2049,26 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
goto new_measure;
- if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
- !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
- u64 rcvwin, grow;
- int rcvbuf;
-
- rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
-
- grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
-
- do_div(grow, msk->rcvq_space.space);
- rcvwin += (grow << 1);
-
- rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-
- if (rcvbuf > sk->sk_rcvbuf) {
- u32 window_clamp;
-
- window_clamp = mptcp_win_from_space(sk, rcvbuf);
- WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+ msk->rcvq_space.space = msk->rcvq_space.copied;
+ if (mptcp_rcvbuf_grow(sk)) {
- /* Make subflows follow along. If we do not do this, we
- * get drops at subflow level if skbs can't be moved to
- * the mptcp rx queue fast enough (announced rcv_win can
- * exceed ssk->sk_rcvbuf).
- */
- mptcp_for_each_subflow(msk, subflow) {
- struct sock *ssk;
- bool slow;
+ /* Make subflows follow along. If we do not do this, we
+ * get drops at subflow level if skbs can't be moved to
+ * the mptcp rx queue fast enough (announced rcv_win can
+ * exceed ssk->sk_rcvbuf).
+ */
+ mptcp_for_each_subflow(msk, subflow) {
+ struct sock *ssk;
+ bool slow;
- ssk = mptcp_subflow_tcp_sock(subflow);
- slow = lock_sock_fast(ssk);
- WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
- WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
- if (tcp_can_send_ack(ssk))
- tcp_cleanup_rbuf(ssk, 1);
- unlock_sock_fast(ssk, slow);
- }
+ ssk = mptcp_subflow_tcp_sock(subflow);
+ slow = lock_sock_fast(ssk);
+ tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied;
+ tcp_rcvbuf_grow(ssk);
+ unlock_sock_fast(ssk, slow);
}
}
- msk->rcvq_space.space = msk->rcvq_space.copied;
new_measure:
msk->rcvq_space.copied = 0;
msk->rcvq_space.time = mstamp;
@@ -2069,11 +2097,6 @@ static bool __mptcp_move_skbs(struct sock *sk)
if (list_empty(&msk->conn_list))
return false;
- /* verify we can move any data from the subflow, eventually updating */
- if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
- mptcp_for_each_subflow(msk, subflow)
- __mptcp_rcvbuf_update(sk, subflow->tcp_sock);
-
subflow = list_first_entry(&msk->conn_list,
struct mptcp_subflow_context, node);
for (;;) {
@@ -2147,6 +2170,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
goto out_err;
}
+ mptcp_rps_record_subflows(msk);
+
timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
len = min_t(size_t, len, INT_MAX);
@@ -2189,14 +2214,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
break;
}
- if (sk->sk_shutdown & RCV_SHUTDOWN) {
- /* race breaker: the shutdown could be after the
- * previous receive queue check
- */
- if (__mptcp_move_skbs(sk))
- continue;
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
break;
- }
if (sk->sk_state == TCP_CLOSE) {
copied = -ENOTCONN;
@@ -2603,7 +2622,8 @@ static void __mptcp_retrans(struct sock *sk)
if (mptcp_data_fin_enabled(msk)) {
struct inet_connection_sock *icsk = inet_csk(sk);
- icsk->icsk_retransmits++;
+ WRITE_ONCE(icsk->icsk_retransmits,
+ icsk->icsk_retransmits + 1);
mptcp_set_datafin_timeout(sk);
mptcp_send_ack(msk);
@@ -3936,6 +3956,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock,
mptcp_sock_graft(ssk, newsock);
}
+ mptcp_rps_record_subflows(msk);
+
/* Do late cleanup for the first subflow as necessary. Also
* deal with bad peers not doing a complete shutdown.
*/
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index b15d7fab5c4b..52f9cfa4ce95 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -235,7 +235,7 @@ struct mptcp_pm_data {
u8 add_addr_accepted;
u8 local_addr_used;
u8 pm_type;
- u8 subflows;
+ u8 extra_subflows;
u8 status;
);
@@ -341,8 +341,8 @@ struct mptcp_sock {
struct mptcp_pm_data pm;
struct mptcp_sched_ops *sched;
struct {
- u32 space; /* bytes copied in last measurement window */
- u32 copied; /* bytes copied in this measurement window */
+ int space; /* bytes copied in last measurement window */
+ int copied; /* bytes copied in this measurement window */
u64 time; /* start time of measurement window */
u64 rtt_us; /* last maximum rtt of subflows */
} rcvq_space;
@@ -788,9 +788,7 @@ static inline bool mptcp_epollin_ready(const struct sock *sk)
* as it can always coalesce them
*/
return (data_avail >= sk->sk_rcvlowat) ||
- (mem_cgroup_sockets_enabled && sk->sk_memcg &&
- mem_cgroup_under_socket_pressure(sk->sk_memcg)) ||
- READ_ONCE(tcp_memory_pressure);
+ tcp_under_memory_pressure(sk);
}
int mptcp_set_rcvlowat(struct sock *sk, int val);
@@ -1182,15 +1180,16 @@ void __init mptcp_pm_userspace_register(void);
void __init mptcp_pm_nl_init(void);
void mptcp_pm_worker(struct mptcp_sock *msk);
void __mptcp_pm_kernel_worker(struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_add_addr_signal_max(const struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_add_addr_accept_max(const struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_subflows_max(const struct mptcp_sock *msk);
-unsigned int mptcp_pm_get_local_addr_max(const struct mptcp_sock *msk);
+u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk);
+u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk);
+u8 mptcp_pm_get_endp_laminar_max(const struct mptcp_sock *msk);
+u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk);
+u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk);
/* called under PM lock */
static inline void __mptcp_pm_close_subflow(struct mptcp_sock *msk)
{
- if (--msk->pm.subflows < mptcp_pm_get_subflows_max(msk))
+ if (--msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk))
WRITE_ONCE(msk->pm.accept_subflow, true);
}
@@ -1201,6 +1200,14 @@ static inline void mptcp_pm_close_subflow(struct mptcp_sock *msk)
spin_unlock_bh(&msk->pm.lock);
}
+static inline bool mptcp_pm_add_addr_c_flag_case(struct mptcp_sock *msk)
+{
+ return READ_ONCE(msk->pm.remote_deny_join_id0) &&
+ msk->pm.local_addr_used == 0 &&
+ mptcp_pm_get_limit_add_addr_accepted(msk) == 0 &&
+ msk->pm.extra_subflows < mptcp_pm_get_limit_extra_subflows(msk);
+}
+
void mptcp_sockopt_sync_locked(struct mptcp_sock *msk, struct sock *ssk);
static inline struct mptcp_ext *mptcp_get_ext(const struct sk_buff *skb)
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 2abe6f1e9940..a28a48385885 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -962,7 +962,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
memset(info, 0, sizeof(*info));
- info->mptcpi_subflows = READ_ONCE(msk->pm.subflows);
+ info->mptcpi_extra_subflows = READ_ONCE(msk->pm.extra_subflows);
info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled);
info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted);
info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used);
@@ -972,14 +972,16 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
/* The following limits only make sense for the in-kernel PM */
if (mptcp_pm_is_kernel(msk)) {
- info->mptcpi_subflows_max =
- mptcp_pm_get_subflows_max(msk);
- info->mptcpi_add_addr_signal_max =
- mptcp_pm_get_add_addr_signal_max(msk);
- info->mptcpi_add_addr_accepted_max =
- mptcp_pm_get_add_addr_accept_max(msk);
- info->mptcpi_local_addr_max =
- mptcp_pm_get_local_addr_max(msk);
+ info->mptcpi_limit_extra_subflows =
+ mptcp_pm_get_limit_extra_subflows(msk);
+ info->mptcpi_endp_signal_max =
+ mptcp_pm_get_endp_signal_max(msk);
+ info->mptcpi_limit_add_addr_accepted =
+ mptcp_pm_get_limit_add_addr_accepted(msk);
+ info->mptcpi_endp_subflow_max =
+ mptcp_pm_get_endp_subflow_max(msk);
+ info->mptcpi_endp_laminar_max =
+ mptcp_pm_get_endp_laminar_max(msk);
}
if (__mptcp_check_fallback(msk))
@@ -996,7 +998,7 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
info->mptcpi_bytes_sent = msk->bytes_sent;
info->mptcpi_bytes_received = msk->bytes_received;
info->mptcpi_bytes_retrans = msk->bytes_retrans;
- info->mptcpi_subflows_total = info->mptcpi_subflows +
+ info->mptcpi_subflows_total = info->mptcpi_extra_subflows +
__mptcp_has_initial_subflow(msk);
now = tcp_jiffies32;
info->mptcpi_last_data_sent = jiffies_to_msecs(now - msk->last_data_sent);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index f31a3a79531a..e8325890a322 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -1721,19 +1721,14 @@ static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
/* only the additional subflows created by kworkers have to be modified */
if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
cgroup_id(sock_cgroup_ptr(child_skcd))) {
-#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg = parent->sk_memcg;
-
- mem_cgroup_sk_free(child);
- if (memcg && css_tryget(&memcg->css))
- child->sk_memcg = memcg;
-#endif /* CONFIG_MEMCG */
-
cgroup_sk_free(child_skcd);
*child_skcd = *parent_skcd;
cgroup_sk_clone(child_skcd);
}
#endif /* CONFIG_SOCK_CGROUP_DATA */
+
+ if (mem_cgroup_sockets_enabled)
+ mem_cgroup_sk_inherit(parent, child);
}
static void mptcp_subflow_ops_override(struct sock *ssk)
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 5251524b96af..5e4453e9ef8e 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -63,7 +63,7 @@ struct hbucket {
: jhash_size((htable_bits) - HTABLE_REGION_BITS))
#define ahash_sizeof_regions(htable_bits) \
(ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region))
-#define ahash_region(n, htable_bits) \
+#define ahash_region(n) \
((n) / jhash_size(HTABLE_REGION_BITS))
#define ahash_bucket_start(h, htable_bits) \
((htable_bits) < HTABLE_REGION_BITS ? 0 \
@@ -702,7 +702,7 @@ retry:
#endif
key = HKEY(data, h->initval, htable_bits);
m = __ipset_dereference(hbucket(t, key));
- nr = ahash_region(key, htable_bits);
+ nr = ahash_region(key);
if (!m) {
m = kzalloc(sizeof(*m) +
AHASH_INIT_SIZE * dsize,
@@ -852,7 +852,7 @@ mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
rcu_read_lock_bh();
t = rcu_dereference_bh(h->table);
key = HKEY(value, h->initval, t->htable_bits);
- r = ahash_region(key, t->htable_bits);
+ r = ahash_region(key);
atomic_inc(&t->uref);
elements = t->hregion[r].elements;
maxelem = t->maxelem;
@@ -1050,7 +1050,7 @@ mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
rcu_read_lock_bh();
t = rcu_dereference_bh(h->table);
key = HKEY(value, h->initval, t->htable_bits);
- r = ahash_region(key, t->htable_bits);
+ r = ahash_region(key);
atomic_inc(&t->uref);
rcu_read_unlock_bh();
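The dropped htable_bits parameter was unused: the region index depends only on the hash key, one region per 2^HTABLE_REGION_BITS buckets, so the macro and its three call sites shrink. With HTABLE_REGION_BITS assumed to be 10 for the example:

/* Region selection as in the simplified ahash_region(). The
 * constant's value here is an assumption for illustration.
 */
#include <stdio.h>

#define HTABLE_REGION_BITS	10
#define jhash_size(n)		((unsigned int)1 << (n))
#define ahash_region(n)		((n) / jhash_size(HTABLE_REGION_BITS))

int main(void)
{
	unsigned int keys[] = { 0, 1023, 1024, 5000 };

	for (unsigned int i = 0; i < 4; i++)
		printf("key %u -> region %u\n", keys[i], ahash_region(keys[i]));
	/* 0 -> 0, 1023 -> 0, 1024 -> 1, 5000 -> 4 */
	return 0;
}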
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 965f3c8e5089..37ebb0cb62b8 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -885,7 +885,7 @@ static void ip_vs_conn_expire(struct timer_list *t)
* conntrack cleanup for the net.
*/
smp_rmb();
- if (ipvs->enable)
+ if (READ_ONCE(ipvs->enable))
ip_vs_conn_drop_conntrack(cp);
}
@@ -1439,7 +1439,7 @@ void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs)
cond_resched_rcu();
/* netns clean up started, abort delayed work */
- if (!ipvs->enable)
+ if (!READ_ONCE(ipvs->enable))
break;
}
rcu_read_unlock();
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index c7a8a08b7308..5ea7ab8bf4dc 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1353,9 +1353,6 @@ ip_vs_out_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *stat
if (unlikely(!skb_dst(skb)))
return NF_ACCEPT;
- if (!ipvs->enable)
- return NF_ACCEPT;
-
ip_vs_fill_iph_skb(af, skb, false, &iph);
#ifdef CONFIG_IP_VS_IPV6
if (af == AF_INET6) {
@@ -1940,7 +1937,7 @@ ip_vs_in_hook(void *priv, struct sk_buff *skb, const struct nf_hook_state *state
return NF_ACCEPT;
}
/* ipvs enabled in this netns ? */
- if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ if (unlikely(sysctl_backup_only(ipvs)))
return NF_ACCEPT;
ip_vs_fill_iph_skb(af, skb, false, &iph);
@@ -2108,7 +2105,7 @@ ip_vs_forward_icmp(void *priv, struct sk_buff *skb,
int r;
/* ipvs enabled in this netns ? */
- if (unlikely(sysctl_backup_only(ipvs) || !ipvs->enable))
+ if (unlikely(sysctl_backup_only(ipvs)))
return NF_ACCEPT;
if (state->pf == NFPROTO_IPV4) {
@@ -2295,7 +2292,7 @@ static int __net_init __ip_vs_init(struct net *net)
return -ENOMEM;
/* Hold the beast until a service is registered */
- ipvs->enable = 0;
+ WRITE_ONCE(ipvs->enable, 0);
ipvs->net = net;
/* Counters used for creating unique names */
ipvs->gen = atomic_read(&ipvs_netns_cnt);
@@ -2367,7 +2364,7 @@ static void __net_exit __ip_vs_dev_cleanup_batch(struct list_head *net_list)
ipvs = net_ipvs(net);
ip_vs_unregister_hooks(ipvs, AF_INET);
ip_vs_unregister_hooks(ipvs, AF_INET6);
- ipvs->enable = 0; /* Disable packet reception */
+ WRITE_ONCE(ipvs->enable, 0); /* Disable packet reception */
smp_wmb();
ip_vs_sync_net_cleanup(ipvs);
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 6a6fc4478533..4c8fa22be88a 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -256,7 +256,7 @@ static void est_reload_work_handler(struct work_struct *work)
struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];
/* netns clean up started, abort delayed work */
- if (!ipvs->enable)
+ if (!READ_ONCE(ipvs->enable))
goto unlock;
if (!kd)
continue;
@@ -1483,9 +1483,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
*svc_p = svc;
- if (!ipvs->enable) {
+ if (!READ_ONCE(ipvs->enable)) {
/* Now there is a service - full throttle */
- ipvs->enable = 1;
+ WRITE_ONCE(ipvs->enable, 1);
/* Start estimation for first time */
ip_vs_est_reload_start(ipvs);
diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c
index 15049b826732..93a925f1ed9b 100644
--- a/net/netfilter/ipvs/ip_vs_est.c
+++ b/net/netfilter/ipvs/ip_vs_est.c
@@ -231,7 +231,7 @@ static int ip_vs_estimation_kthread(void *data)
void ip_vs_est_reload_start(struct netns_ipvs *ipvs)
{
/* Ignore reloads before first service is added */
- if (!ipvs->enable)
+ if (!READ_ONCE(ipvs->enable))
return;
ip_vs_est_stopped_recalc(ipvs);
/* Bump the kthread configuration genid */
@@ -306,7 +306,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
int i;
if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads &&
- ipvs->enable && ipvs->est_max_threads)
+ READ_ONCE(ipvs->enable) && ipvs->est_max_threads)
return -EINVAL;
mutex_lock(&ipvs->est_mutex);
@@ -343,7 +343,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs)
}
/* Start kthread tasks only when services are present */
- if (ipvs->enable && !ip_vs_est_stopped(ipvs)) {
+ if (READ_ONCE(ipvs->enable) && !ip_vs_est_stopped(ipvs)) {
ret = ip_vs_est_kthread_start(ipvs, kd);
if (ret < 0)
goto out;
@@ -486,7 +486,7 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats)
struct ip_vs_estimator *est = &stats->est;
int ret;
- if (!ipvs->est_max_threads && ipvs->enable)
+ if (!ipvs->est_max_threads && READ_ONCE(ipvs->enable))
ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
est->ktid = -1;
@@ -663,7 +663,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
/* Wait for cpufreq frequency transition */
wait_event_idle_timeout(wq, kthread_should_stop(),
HZ / 50);
- if (!ipvs->enable || kthread_should_stop())
+ if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
goto stop;
}
@@ -681,7 +681,7 @@ static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int *chain_max)
rcu_read_unlock();
local_bh_enable();
- if (!ipvs->enable || kthread_should_stop())
+ if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
goto stop;
cond_resched();
@@ -757,7 +757,7 @@ static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs)
mutex_lock(&ipvs->est_mutex);
for (id = 1; id < ipvs->est_kt_count; id++) {
/* netns clean up started, abort */
- if (!ipvs->enable)
+ if (!READ_ONCE(ipvs->enable))
goto unlock2;
kd = ipvs->est_kt_arr[id];
if (!kd)
@@ -787,7 +787,7 @@ last_kt:
id = ipvs->est_kt_count;
next_kt:
- if (!ipvs->enable || kthread_should_stop())
+ if (!READ_ONCE(ipvs->enable) || kthread_should_stop())
goto unlock;
id--;
if (id < 0)
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index d8a284999544..206c6700e200 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -53,6 +53,7 @@ enum {
IP_VS_FTP_EPSV,
};
+static bool exiting_module;
/*
* List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
* First port is set to the default port.
@@ -605,7 +606,7 @@ static void __ip_vs_ftp_exit(struct net *net)
{
struct netns_ipvs *ipvs = net_ipvs(net);
- if (!ipvs)
+ if (!ipvs || !exiting_module)
return;
unregister_ip_vs_app(ipvs, &ip_vs_ftp);
@@ -627,6 +628,7 @@ static int __init ip_vs_ftp_init(void)
*/
static void __exit ip_vs_ftp_exit(void)
{
+ exiting_module = true;
unregister_pernet_subsys(&ip_vs_ftp_ops);
/* rcu_barrier() is called by netns */
}
diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c
index af68c64acaab..81baf2082604 100644
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -301,7 +301,7 @@ void nf_conntrack_ecache_work(struct net *net, enum nf_ct_ecache_state state)
net->ct.ecache_dwork_pending = true;
} else if (state == NFCT_ECACHE_DESTROY_SENT) {
if (!hlist_nulls_empty(&cnet->ecache.dying_list))
- mod_delayed_work(system_wq, &cnet->ecache.dwork, 0);
+ mod_delayed_work(system_percpu_wq, &cnet->ecache.dwork, 0);
else
net->ct.ecache_dwork_pending = false;
}
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 50fd6809380f..3a04665adf99 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -60,7 +60,7 @@ MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("List and change connection tracking table");
struct ctnetlink_list_dump_ctx {
- struct nf_conn *last;
+ unsigned long last_id;
unsigned int cpu;
bool done;
};
@@ -1733,16 +1733,6 @@ static int ctnetlink_get_conntrack(struct sk_buff *skb,
return nfnetlink_unicast(skb2, info->net, NETLINK_CB(skb).portid);
}
-static int ctnetlink_done_list(struct netlink_callback *cb)
-{
- struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
-
- if (ctx->last)
- nf_ct_put(ctx->last);
-
- return 0;
-}
-
#ifdef CONFIG_NF_CONNTRACK_EVENTS
static int ctnetlink_dump_one_entry(struct sk_buff *skb,
struct netlink_callback *cb,
@@ -1757,11 +1747,11 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb,
if (l3proto && nf_ct_l3num(ct) != l3proto)
return 0;
- if (ctx->last) {
- if (ct != ctx->last)
+ if (ctx->last_id) {
+ if (ctnetlink_get_id(ct) != ctx->last_id)
return 0;
- ctx->last = NULL;
+ ctx->last_id = 0;
}
/* We can't dump extension info for the unconfirmed
@@ -1775,12 +1765,8 @@ static int ctnetlink_dump_one_entry(struct sk_buff *skb,
cb->nlh->nlmsg_seq,
NFNL_MSG_TYPE(cb->nlh->nlmsg_type),
ct, dying, 0);
- if (res < 0) {
- if (!refcount_inc_not_zero(&ct->ct_general.use))
- return 0;
-
- ctx->last = ct;
- }
+ if (res < 0)
+ ctx->last_id = ctnetlink_get_id(ct);
return res;
}
@@ -1796,10 +1782,10 @@ static int
ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ctnetlink_list_dump_ctx *ctx = (void *)cb->ctx;
- struct nf_conn *last = ctx->last;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
const struct net *net = sock_net(skb->sk);
struct nf_conntrack_net_ecache *ecache_net;
+ unsigned long last_id = ctx->last_id;
struct nf_conntrack_tuple_hash *h;
struct hlist_nulls_node *n;
#endif
@@ -1807,7 +1793,7 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
if (ctx->done)
return 0;
- ctx->last = NULL;
+ ctx->last_id = 0;
#ifdef CONFIG_NF_CONNTRACK_EVENTS
ecache_net = nf_conn_pernet_ecache(net);
@@ -1818,24 +1804,21 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
int res;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (last && last != ct)
+ if (last_id && last_id != ctnetlink_get_id(ct))
continue;
res = ctnetlink_dump_one_entry(skb, cb, ct, true);
if (res < 0) {
spin_unlock_bh(&ecache_net->dying_lock);
- nf_ct_put(last);
return skb->len;
}
- nf_ct_put(last);
- last = NULL;
+ last_id = 0;
}
spin_unlock_bh(&ecache_net->dying_lock);
#endif
ctx->done = true;
- nf_ct_put(last);
return skb->len;
}
@@ -1847,7 +1830,6 @@ static int ctnetlink_get_ct_dying(struct sk_buff *skb,
if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_dying,
- .done = ctnetlink_done_list,
};
return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
@@ -1862,7 +1844,6 @@ static int ctnetlink_get_ct_unconfirmed(struct sk_buff *skb,
if (info->nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
.dump = ctnetlink_dump_unconfirmed,
- .done = ctnetlink_done_list,
};
return netlink_dump_start(info->sk, skb, info->nlh, &c);
}
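The conversion above resumes an interrupted dump by a stable id cookie instead of a pinned nf_conn pointer, which is why the ->done callbacks that dropped the reference can be deleted. A sketch of the resume check, assuming ctnetlink_get_id() returns a stable per-entry identifier:

	/* On re-entry, skip forward until the remembered entry reappears; if
	 * it was freed meanwhile, the dump simply continues with the next
	 * element, and there is no stale pointer to put.
	 */
	if (ctx->last_id) {
		if (ctnetlink_get_id(ct) != ctx->last_id)
			return 0;	/* before the resume point: skip */
		ctx->last_id = 0;	/* resume point found: dump again */
	}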
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 1f14ef0436c6..708b79380f04 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -317,6 +317,9 @@ static int ct_seq_show(struct seq_file *s, void *v)
smp_acquire__after_ctrl_dep();
if (nf_ct_should_gc(ct)) {
+ struct ct_iter_state *st = s->private;
+
+ st->skip_elems--;
nf_ct_kill(ct);
goto release;
}
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index c3c73411c40c..eed434e0a970 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -151,12 +151,12 @@ static void nft_ctx_init(struct nft_ctx *ctx,
bitmap_zero(ctx->reg_inited, NFT_REG32_NUM);
}
-static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
- int msg_type, u32 size, gfp_t gfp)
+static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
+ int msg_type, u32 size)
{
struct nft_trans *trans;
- trans = kzalloc(size, gfp);
+ trans = kzalloc(size, GFP_KERNEL);
if (trans == NULL)
return NULL;
@@ -172,12 +172,6 @@ static struct nft_trans *nft_trans_alloc_gfp(const struct nft_ctx *ctx,
return trans;
}
-static struct nft_trans *nft_trans_alloc(const struct nft_ctx *ctx,
- int msg_type, u32 size)
-{
- return nft_trans_alloc_gfp(ctx, msg_type, size, GFP_KERNEL);
-}
-
static struct nft_trans_binding *nft_trans_get_binding(struct nft_trans *trans)
{
switch (trans->msg_type) {
@@ -442,8 +436,7 @@ static bool nft_trans_collapse_set_elem_allowed(const struct nft_trans_elem *a,
static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
struct nft_trans_elem *tail,
- struct nft_trans_elem *trans,
- gfp_t gfp)
+ struct nft_trans_elem *trans)
{
unsigned int nelems, old_nelems = tail->nelems;
struct nft_trans_elem *new_trans;
@@ -466,9 +459,11 @@ static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
/* krealloc might free tail which invalidates list pointers */
list_del_init(&tail->nft_trans.list);
- new_trans = krealloc(tail, struct_size(tail, elems, nelems), gfp);
+ new_trans = krealloc(tail, struct_size(tail, elems, nelems),
+ GFP_KERNEL);
if (!new_trans) {
- list_add_tail(&tail->nft_trans.list, &nft_net->commit_list);
+ list_add_tail(&tail->nft_trans.list,
+ &nft_net->commit_list);
return false;
}
@@ -484,7 +479,7 @@ static bool nft_trans_collapse_set_elem(struct nftables_pernet *nft_net,
}
static bool nft_trans_try_collapse(struct nftables_pernet *nft_net,
- struct nft_trans *trans, gfp_t gfp)
+ struct nft_trans *trans)
{
struct nft_trans *tail;
@@ -501,7 +496,7 @@ static bool nft_trans_try_collapse(struct nftables_pernet *nft_net,
case NFT_MSG_DELSETELEM:
return nft_trans_collapse_set_elem(nft_net,
nft_trans_container_elem(tail),
- nft_trans_container_elem(trans), gfp);
+ nft_trans_container_elem(trans));
}
return false;
@@ -537,17 +532,14 @@ static void nft_trans_commit_list_add_tail(struct net *net, struct nft_trans *tr
}
}
-static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans,
- gfp_t gfp)
+static void nft_trans_commit_list_add_elem(struct net *net, struct nft_trans *trans)
{
struct nftables_pernet *nft_net = nft_pernet(net);
WARN_ON_ONCE(trans->msg_type != NFT_MSG_NEWSETELEM &&
trans->msg_type != NFT_MSG_DELSETELEM);
- might_alloc(gfp);
-
- if (nft_trans_try_collapse(nft_net, trans, gfp)) {
+ if (nft_trans_try_collapse(nft_net, trans)) {
kfree(trans);
return;
}
@@ -7573,7 +7565,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
}
ue->priv = elem_priv;
- nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
+ nft_trans_commit_list_add_elem(ctx->net, trans);
goto err_elem_free;
}
}
@@ -7597,7 +7589,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
}
nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
- nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
err_set_full:
@@ -7863,7 +7855,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
nft_setelem_data_deactivate(ctx->net, set, elem.priv);
nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
- nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
fail_ops:
@@ -7888,9 +7880,8 @@ static int nft_setelem_flush(const struct nft_ctx *ctx,
if (!nft_set_elem_active(ext, iter->genmask))
return 0;
- trans = nft_trans_alloc_gfp(ctx, NFT_MSG_DELSETELEM,
- struct_size_t(struct nft_trans_elem, elems, 1),
- GFP_ATOMIC);
+ trans = nft_trans_alloc(ctx, NFT_MSG_DELSETELEM,
+ struct_size_t(struct nft_trans_elem, elems, 1));
if (!trans)
return -ENOMEM;
@@ -7901,7 +7892,7 @@ static int nft_setelem_flush(const struct nft_ctx *ctx,
nft_trans_elem_set(trans) = set;
nft_trans_container_elem(trans)->nelems = 1;
nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
- nft_trans_commit_list_add_elem(ctx->net, trans, GFP_ATOMIC);
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
}
@@ -7918,7 +7909,7 @@ static int __nft_set_catchall_flush(const struct nft_ctx *ctx,
nft_setelem_data_deactivate(ctx->net, set, elem_priv);
nft_trans_container_elem(trans)->elems[0].priv = elem_priv;
- nft_trans_commit_list_add_elem(ctx->net, trans, GFP_KERNEL);
+ nft_trans_commit_list_add_elem(ctx->net, trans);
return 0;
}
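With the GFP_ATOMIC call site gone, the gfp_t plumbing collapses into plain GFP_KERNEL; the delicate part that remains is in nft_trans_collapse_set_elem(): krealloc() may move or free its argument, so the node is unlinked before reallocation and re-linked on failure. A condensed sketch of that pattern:

	/* Never krealloc() an object that is still on a list: on success the
	 * old address may be freed, leaving dangling list pointers.
	 */
	list_del_init(&tail->nft_trans.list);
	new_trans = krealloc(tail, struct_size(tail, elems, nelems), GFP_KERNEL);
	if (!new_trans) {
		/* krealloc() failure leaves the old object intact: re-link */
		list_add_tail(&tail->nft_trans.list, &nft_net->commit_list);
		return false;
	}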
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index e598a2a252b0..811d02b4c4f7 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -376,6 +376,7 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh,
const struct nfnetlink_subsystem *ss;
const struct nfnl_callback *nc;
struct netlink_ext_ack extack;
+ struct nlmsghdr *onlh = nlh;
LIST_HEAD(err_list);
u32 status;
int err;
@@ -386,6 +387,7 @@ replay:
status = 0;
replay_abort:
skb = netlink_skb_clone(oskb, GFP_KERNEL);
+ nlh = onlh;
if (!skb)
return netlink_ack(oskb, nlh, -ENOMEM, NULL);
diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
index 225ff293cd50..14dd1c0698c3 100644
--- a/net/netfilter/nft_flow_offload.c
+++ b/net/netfilter/nft_flow_offload.c
@@ -9,7 +9,7 @@
#include <linux/netfilter/nf_conntrack_common.h>
#include <linux/netfilter/nf_tables.h>
#include <net/ip.h>
-#include <net/inet_dscp.h>
+#include <net/flow.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_conntrack_core.h>
@@ -236,7 +236,7 @@ static int nft_flow_route(const struct nft_pktinfo *pkt,
fl.u.ip4.saddr = ct->tuplehash[!dir].tuple.src.u3.ip;
fl.u.ip4.flowi4_oif = nft_in(pkt)->ifindex;
fl.u.ip4.flowi4_iif = this_dst->dev->ifindex;
- fl.u.ip4.flowi4_tos = inet_dscp_to_dsfield(ip4h_dscp(ip_hdr(pkt->skb)));
+ fl.u.ip4.flowi4_dscp = ip4h_dscp(ip_hdr(pkt->skb));
fl.u.ip4.flowi4_mark = pkt->skb->mark;
fl.u.ip4.flowi4_flags = FLOWI_FLAG_ANYSRC;
break;
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 7dfc5343dae4..b0214418f75a 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -40,7 +40,7 @@ static bool nft_payload_rebuild_vlan_hdr(const struct sk_buff *skb, int mac_off,
/* add vlan header into the user buffer in case the tag was removed by offloads */
static bool
-nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u8 offset, u8 len)
+nft_payload_copy_vlan(u32 *d, const struct sk_buff *skb, u16 offset, u8 len)
{
int mac_off = skb_mac_header(skb) - skb->data;
u8 *vlanh, *dst_u8 = (u8 *) d;
@@ -212,7 +212,7 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
[NFTA_PAYLOAD_SREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_DREG] = { .type = NLA_U32 },
[NFTA_PAYLOAD_BASE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
+ [NFTA_PAYLOAD_OFFSET] = { .type = NLA_BE32 },
[NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255),
[NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 },
[NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
@@ -684,7 +684,7 @@ static const struct nft_expr_ops nft_payload_inner_ops = {
static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum)
{
- *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum));
+ csum_replace4(sum, (__force __be32)fsum, (__force __be32)tsum);
if (*sum == 0)
*sum = CSUM_MANGLED_0;
}
@@ -797,7 +797,7 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src,
struct nft_payload_set {
enum nft_payload_bases base:8;
- u8 offset;
+ u16 offset;
u8 len;
u8 sreg;
u8 csum_type;
@@ -812,7 +812,7 @@ struct nft_payload_vlan_hdr {
};
static bool
-nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u8 offset, u8 len,
+nft_payload_set_vlan(const u32 *src, struct sk_buff *skb, u16 offset, u8 len,
int *vlan_hlen)
{
struct nft_payload_vlan_hdr *vlanh;
@@ -940,14 +940,18 @@ static int nft_payload_set_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
+ u32 csum_offset, offset, csum_type = NFT_PAYLOAD_CSUM_NONE;
struct nft_payload_set *priv = nft_expr_priv(expr);
- u32 csum_offset, csum_type = NFT_PAYLOAD_CSUM_NONE;
int err;
priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
- priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset);
+ if (err < 0)
+ return err;
+ priv->offset = offset;
+
if (tb[NFTA_PAYLOAD_CSUM_TYPE])
csum_type = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE]));
if (tb[NFTA_PAYLOAD_CSUM_OFFSET]) {
@@ -1069,7 +1073,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
if (tb[NFTA_PAYLOAD_DREG] == NULL)
return ERR_PTR(-EINVAL);
- err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U8_MAX, &offset);
+ err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset);
if (err < 0)
return ERR_PTR(err);
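Net effect of the nft_payload hunks: the offset fields widen from u8 to u16 and the upper bound moves from the netlink policy into nft_parse_u32_check(), so offsets up to U16_MAX are now accepted. A sketch of the bounded-parse idiom used above:

	u32 offset;
	int err;

	/* Parse the BE32 attribute, rejecting values above U16_MAX, then
	 * narrow into the u16 field knowing the store cannot truncate.
	 */
	err = nft_parse_u32_check(tb[NFTA_PAYLOAD_OFFSET], U16_MAX, &offset);
	if (err < 0)
		return err;
	priv->offset = offset;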
diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c
index 266d0c637225..ba01ce75d6de 100644
--- a/net/netfilter/nft_set_hash.c
+++ b/net/netfilter/nft_set_hash.c
@@ -30,6 +30,7 @@ struct nft_rhash {
struct nft_rhash_elem {
struct nft_elem_priv priv;
struct rhash_head node;
+ struct llist_node walk_node;
u32 wq_gc_seq;
struct nft_set_ext ext;
};
@@ -144,6 +145,7 @@ nft_rhash_update(struct nft_set *set, const u32 *key,
goto err1;
he = nft_elem_priv_cast(elem_priv);
+ init_llist_node(&he->walk_node);
prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
nft_rhash_params);
if (IS_ERR(prev))
@@ -180,6 +182,7 @@ static int nft_rhash_insert(const struct net *net, const struct nft_set *set,
};
struct nft_rhash_elem *prev;
+ init_llist_node(&he->walk_node);
prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node,
nft_rhash_params);
if (IS_ERR(prev))
@@ -261,12 +264,12 @@ static bool nft_rhash_delete(const struct nft_set *set,
return true;
}
-static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
- struct nft_set_iter *iter)
+static void nft_rhash_walk_ro(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
{
struct nft_rhash *priv = nft_set_priv(set);
- struct nft_rhash_elem *he;
struct rhashtable_iter hti;
+ struct nft_rhash_elem *he;
rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti);
@@ -295,6 +298,97 @@ cont:
rhashtable_walk_exit(&hti);
}
+static void nft_rhash_walk_update(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_rhash *priv = nft_set_priv(set);
+ struct nft_rhash_elem *he, *tmp;
+ struct llist_node *first_node;
+ struct rhashtable_iter hti;
+ LLIST_HEAD(walk_list);
+
+ lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+
+ if (set->in_update_walk) {
+ /* This can happen with bogus rulesets during ruleset validation
+ * when a verdict map causes a jump back to the same map.
+ *
+ * Without this extra check the walk_next loop below will see
+ * elems on the callers walk_list and skip (not validate) them.
+ */
+ iter->err = -EMLINK;
+ return;
+ }
+
+ /* walk happens under RCU.
+ *
+ * We create a snapshot list so ->iter callback can sleep.
+ * commit_mutex is held, elements can ...
+ * .. be added in parallel from dataplane (dynset)
+ * .. be marked as dead in parallel from dataplane (dynset).
+ * .. be queued for removal in parallel (gc timeout).
+ * .. not be freed: transaction mutex is held.
+ */
+ rhashtable_walk_enter(&priv->ht, &hti);
+ rhashtable_walk_start(&hti);
+
+ while ((he = rhashtable_walk_next(&hti))) {
+ if (IS_ERR(he)) {
+ if (PTR_ERR(he) != -EAGAIN) {
+ iter->err = PTR_ERR(he);
+ break;
+ }
+
+ continue;
+ }
+
+ /* rhashtable resized during walk, skip */
+ if (llist_on_list(&he->walk_node))
+ continue;
+
+ llist_add(&he->walk_node, &walk_list);
+ }
+ rhashtable_walk_stop(&hti);
+ rhashtable_walk_exit(&hti);
+
+ first_node = __llist_del_all(&walk_list);
+ set->in_update_walk = true;
+ llist_for_each_entry_safe(he, tmp, first_node, walk_node) {
+ if (iter->err == 0) {
+ iter->err = iter->fn(ctx, set, iter, &he->priv);
+ if (iter->err == 0)
+ iter->count++;
+ }
+
+ /* all entries must be cleared again, else next ->walk iteration
+ * will skip entries.
+ */
+ init_llist_node(&he->walk_node);
+ }
+ set->in_update_walk = false;
+}
+
+static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ /* only relevant for netlink dumps which use READ type */
+ WARN_ON_ONCE(iter->skip != 0);
+
+ nft_rhash_walk_update(ctx, set, iter);
+ break;
+ case NFT_ITER_READ:
+ nft_rhash_walk_ro(ctx, set, iter);
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
+}
+
static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
struct nft_set_ext *ext)
{
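The two-phase walk above exists because an rhashtable walk runs in non-sleepable (RCU) context while the ->iter callback may need to sleep; snapshotting entries onto an llist first separates the two. A reduced sketch of the shape, with example_* types standing in for the nft specifics:

	#include <linux/llist.h>
	#include <linux/rhashtable.h>

	struct example_elem {
		struct rhash_head node;
		struct llist_node walk_node;	/* init_llist_node() at insert */
	};

	static void example_walk(struct rhashtable *ht,
				 void (*fn)(struct example_elem *e))
	{
		struct example_elem *e, *tmp;
		struct rhashtable_iter hti;
		struct llist_node *first;
		LLIST_HEAD(snapshot);

		/* Phase 1: non-sleepable rhashtable walk, only collect nodes. */
		rhashtable_walk_enter(ht, &hti);
		rhashtable_walk_start(&hti);
		while ((e = rhashtable_walk_next(&hti))) {
			if (IS_ERR(e)) {
				if (PTR_ERR(e) != -EAGAIN)
					break;	/* real error: stop */
				continue;	/* resize: elements may repeat */
			}
			/* a resize can hand out the same element twice */
			if (!llist_on_list(&e->walk_node))
				llist_add(&e->walk_node, &snapshot);
		}
		rhashtable_walk_stop(&hti);
		rhashtable_walk_exit(&hti);

		/* Phase 2: sleepable context; run the callback, then re-init
		 * each node so the next walk does not see it as queued.
		 */
		first = __llist_del_all(&snapshot);
		llist_for_each_entry_safe(e, tmp, first, walk_node) {
			fn(e);
			init_llist_node(&e->walk_node);
		}
	}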
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 793790d79d13..112fe46788b6 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -397,7 +397,7 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
}
/**
- * pipapo_get() - Get matching element reference given key data
+ * pipapo_get_slow() - Get matching element reference given key data
* @m: storage containing the set elements
* @data: Key data to be matched against existing elements
* @genmask: If set, check that element is active in given genmask
@@ -414,12 +414,12 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
*
* Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
*/
-static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
- const u8 *data, u8 genmask,
- u64 tstamp)
+static struct nft_pipapo_elem *pipapo_get_slow(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
{
+ unsigned long *res_map, *fill_map, *map;
struct nft_pipapo_scratch *scratch;
- unsigned long *res_map, *fill_map;
const struct nft_pipapo_field *f;
bool map_index;
int i;
@@ -429,11 +429,13 @@ static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
scratch = *raw_cpu_ptr(m->scratch);
if (unlikely(!scratch))
goto out;
+ __local_lock_nested_bh(&scratch->bh_lock);
map_index = scratch->map_index;
- res_map = scratch->map + (map_index ? m->bsize_max : 0);
- fill_map = scratch->map + (map_index ? 0 : m->bsize_max);
+ map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+ res_map = map + (map_index ? m->bsize_max : 0);
+ fill_map = map + (map_index ? 0 : m->bsize_max);
pipapo_resmap_init(m, res_map);
@@ -464,6 +466,7 @@ next_match:
last);
if (b < 0) {
scratch->map_index = map_index;
+ __local_unlock_nested_bh(&scratch->bh_lock);
local_bh_enable();
return NULL;
@@ -483,6 +486,7 @@ next_match:
* *next* bitmap (not initial) for the next packet.
*/
scratch->map_index = map_index;
+ __local_unlock_nested_bh(&scratch->bh_lock);
local_bh_enable();
return e;
}
@@ -497,12 +501,47 @@ next_match:
data += NFT_PIPAPO_GROUPS_PADDING(f);
}
+ __local_unlock_nested_bh(&scratch->bh_lock);
out:
local_bh_enable();
return NULL;
}
/**
+ * pipapo_get() - Get matching element reference given key data
+ * @m: Storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: Timestamp to check for expired elements
+ *
+ * This is a dispatcher function, either calling out the generic C
+ * implementation or, if available, the AVX2 one.
+ * This helper is only called from the control plane, with either RCU
+ * read lock or transaction mutex held.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
+ */
+static struct nft_pipapo_elem *pipapo_get(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
+{
+ struct nft_pipapo_elem *e;
+
+ local_bh_disable();
+
+#if defined(CONFIG_X86_64) && !defined(CONFIG_UML)
+ if (boot_cpu_has(X86_FEATURE_AVX2) && irq_fpu_usable()) {
+ e = pipapo_get_avx2(m, data, genmask, tstamp);
+ local_bh_enable();
+ return e;
+ }
+#endif
+ e = pipapo_get_slow(m, data, genmask, tstamp);
+ local_bh_enable();
+ return e;
+}
+
+/**
* nft_pipapo_lookup() - Dataplane frontend for main lookup function
* @net: Network namespace
* @set: nftables API set representation
@@ -510,8 +549,7 @@ out:
*
* This function is called from the data path. It will search for
* an element matching the given key in the current active copy.
- * Unlike other set types, this uses NFT_GENMASK_ANY instead of
- * nft_genmask_cur().
+ * Unlike other set types, this uses 0 instead of nft_genmask_cur().
*
* This is because new (future) elements are not reachable from
* priv->match, they get added to priv->clone instead.
@@ -521,8 +559,8 @@ out:
* inconsistent state: matching old entries get skipped but the
* newly matching entries are unreachable.
*
- * GENMASK will still find the 'now old' entries which ensures consistent
- * priv->match view.
+ * GENMASK_ANY doesn't work for the same reason: old-gen entries get
+ * skipped, new-gen entries are only reachable from priv->clone.
*
* nft_pipapo_commit swaps ->clone and ->match shortly after the
* genbit flip. As ->clone doesn't contain the old entries in the first
@@ -539,7 +577,7 @@ nft_pipapo_lookup(const struct net *net, const struct nft_set *set,
const struct nft_pipapo_elem *e;
m = rcu_dereference(priv->match);
- e = pipapo_get(m, (const u8 *)key, NFT_GENMASK_ANY, get_jiffies_64());
+ e = pipapo_get_slow(m, (const u8 *)key, 0, get_jiffies_64());
return e ? &e->ext : NULL;
}
@@ -1152,22 +1190,17 @@ static void pipapo_map(struct nft_pipapo_match *m,
}
/**
- * pipapo_free_scratch() - Free per-CPU map at original (not aligned) address
+ * pipapo_free_scratch() - Free per-CPU map at original address
* @m: Matching data
* @cpu: CPU number
*/
static void pipapo_free_scratch(const struct nft_pipapo_match *m, unsigned int cpu)
{
struct nft_pipapo_scratch *s;
- void *mem;
s = *per_cpu_ptr(m->scratch, cpu);
- if (!s)
- return;
- mem = s;
- mem -= s->align_off;
- kvfree(mem);
+ kvfree(s);
}
/**
@@ -1184,11 +1217,8 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
for_each_possible_cpu(i) {
struct nft_pipapo_scratch *scratch;
-#ifdef NFT_PIPAPO_ALIGN
- void *scratch_aligned;
- u32 align_off;
-#endif
- scratch = kvzalloc_node(struct_size(scratch, map, bsize_max * 2) +
+
+ scratch = kvzalloc_node(struct_size(scratch, __map, bsize_max * 2) +
NFT_PIPAPO_ALIGN_HEADROOM,
GFP_KERNEL_ACCOUNT, cpu_to_node(i));
if (!scratch) {
@@ -1203,23 +1233,7 @@ static int pipapo_realloc_scratch(struct nft_pipapo_match *clone,
}
pipapo_free_scratch(clone, i);
-
-#ifdef NFT_PIPAPO_ALIGN
- /* Align &scratch->map (not the struct itself): the extra
- * %NFT_PIPAPO_ALIGN_HEADROOM bytes passed to kzalloc_node()
- * above guarantee we can waste up to those bytes in order
- * to align the map field regardless of its offset within
- * the struct.
- */
- BUILD_BUG_ON(offsetof(struct nft_pipapo_scratch, map) > NFT_PIPAPO_ALIGN_HEADROOM);
-
- scratch_aligned = NFT_PIPAPO_LT_ALIGN(&scratch->map);
- scratch_aligned -= offsetof(struct nft_pipapo_scratch, map);
- align_off = scratch_aligned - (void *)scratch;
-
- scratch = scratch_aligned;
- scratch->align_off = align_off;
-#endif
+ local_lock_init(&scratch->bh_lock);
*per_cpu_ptr(clone->scratch, i) = scratch;
}
diff --git a/net/netfilter/nft_set_pipapo.h b/net/netfilter/nft_set_pipapo.h
index 4a2ff85ce1c4..eaab422aa56a 100644
--- a/net/netfilter/nft_set_pipapo.h
+++ b/net/netfilter/nft_set_pipapo.h
@@ -124,14 +124,14 @@ struct nft_pipapo_field {
/**
* struct nft_pipapo_scratch - percpu data used for lookup and matching
+ * @bh_lock: PREEMPT_RT local spinlock
* @map_index: Current working bitmap index, toggled between field matches
- * @align_off: Offset to get the originally allocated address
- * @map: store partial matching results during lookup
+ * @__map: store partial matching results during lookup
*/
struct nft_pipapo_scratch {
+ local_lock_t bh_lock;
u8 map_index;
- u32 align_off;
- unsigned long map[];
+ unsigned long __map[];
};
/**
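The scratch rework trades manual alignment bookkeeping (align_off) for alignment applied at the point of use, and adds a local_lock_t that becomes a real lock on PREEMPT_RT. A sketch of the lookup-side access, assuming NFT_PIPAPO_LT_ALIGN() rounds a pointer up to NFT_PIPAPO_ALIGN (PTR_ALIGN-style) as in the hunks above:

	struct nft_pipapo_scratch *scratch = *raw_cpu_ptr(m->scratch);
	unsigned long *map, *res_map, *fill_map;

	__local_lock_nested_bh(&scratch->bh_lock);
	/* Align the flexible array here instead of at allocation time; the
	 * NFT_PIPAPO_ALIGN_HEADROOM slack in the allocation keeps the
	 * rounded-up pointer within bounds.
	 */
	map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
	res_map  = map + (scratch->map_index ? m->bsize_max : 0);
	fill_map = map + (scratch->map_index ? 0 : m->bsize_max);
	/* ... lookup using res_map/fill_map ... */
	__local_unlock_nested_bh(&scratch->bh_lock);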
diff --git a/net/netfilter/nft_set_pipapo_avx2.c b/net/netfilter/nft_set_pipapo_avx2.c
index c0884fa68c79..7ff90325c97f 100644
--- a/net/netfilter/nft_set_pipapo_avx2.c
+++ b/net/netfilter/nft_set_pipapo_avx2.c
@@ -1099,7 +1099,7 @@ bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
desc->field_count < NFT_PIPAPO_MIN_FIELDS)
return false;
- if (!boot_cpu_has(X86_FEATURE_AVX2) || !boot_cpu_has(X86_FEATURE_AVX))
+ if (!boot_cpu_has(X86_FEATURE_AVX2))
return false;
est->size = pipapo_estimate_size(desc);
@@ -1133,75 +1133,59 @@ static inline void pipapo_resmap_init_avx2(const struct nft_pipapo_match *m, uns
}
/**
- * nft_pipapo_avx2_lookup() - Lookup function for AVX2 implementation
- * @net: Network namespace
- * @set: nftables API set representation
- * @key: nftables API element representation containing key data
+ * pipapo_get_avx2() - Lookup function for AVX2 implementation
+ * @m: Storage containing the set elements
+ * @data: Key data to be matched against existing elements
+ * @genmask: If set, check that element is active in given genmask
+ * @tstamp: Timestamp to check for expired elements
*
* For more details, see DOC: Theory of Operation in nft_set_pipapo.c.
*
* This implementation exploits the repetitive characteristic of the algorithm
* to provide a fast, vectorised version using the AVX2 SIMD instruction set.
*
- * Return: true on match, false otherwise.
+ * The caller must check that the FPU is usable.
+ * This function must be called with BH disabled.
+ *
+ * Return: pointer to &struct nft_pipapo_elem on match, NULL otherwise.
*/
-const struct nft_set_ext *
-nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
- const u32 *key)
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp)
{
- struct nft_pipapo *priv = nft_set_priv(set);
- const struct nft_set_ext *ext = NULL;
struct nft_pipapo_scratch *scratch;
- const struct nft_pipapo_match *m;
const struct nft_pipapo_field *f;
- const u8 *rp = (const u8 *)key;
- unsigned long *res, *fill;
+ unsigned long *res, *fill, *map;
bool map_index;
int i;
- local_bh_disable();
-
- if (unlikely(!irq_fpu_usable())) {
- ext = nft_pipapo_lookup(net, set, key);
+ scratch = *raw_cpu_ptr(m->scratch);
+ if (unlikely(!scratch))
+ return NULL;
- local_bh_enable();
- return ext;
- }
+ __local_lock_nested_bh(&scratch->bh_lock);
+ map_index = scratch->map_index;
+ map = NFT_PIPAPO_LT_ALIGN(&scratch->__map[0]);
+ res = map + (map_index ? m->bsize_max : 0);
+ fill = map + (map_index ? 0 : m->bsize_max);
- m = rcu_dereference(priv->match);
+ pipapo_resmap_init_avx2(m, res);
- /* This also protects access to all data related to scratch maps.
- *
- * Note that we don't need a valid MXCSR state for any of the
+ /* Note that we don't need a valid MXCSR state for any of the
* operations we use here, so pass 0 as mask and spare a LDMXCSR
* instruction.
*/
kernel_fpu_begin_mask(0);
- scratch = *raw_cpu_ptr(m->scratch);
- if (unlikely(!scratch)) {
- kernel_fpu_end();
- local_bh_enable();
- return NULL;
- }
-
- map_index = scratch->map_index;
-
- res = scratch->map + (map_index ? m->bsize_max : 0);
- fill = scratch->map + (map_index ? 0 : m->bsize_max);
-
- pipapo_resmap_init_avx2(m, res);
-
nft_pipapo_avx2_prepare();
-next_match:
nft_pipapo_for_each_field(f, i, m) {
bool last = i == m->field_count - 1, first = !i;
int ret = 0;
#define NFT_SET_PIPAPO_AVX2_LOOKUP(b, n) \
(ret = nft_pipapo_avx2_lookup_##b##b_##n(res, fill, f, \
- ret, rp, \
+ ret, data, \
first, last))
if (likely(f->bb == 8)) {
@@ -1217,7 +1201,7 @@ next_match:
NFT_SET_PIPAPO_AVX2_LOOKUP(8, 16);
} else {
ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
- ret, rp,
+ ret, data,
first, last);
}
} else {
@@ -1233,7 +1217,7 @@ next_match:
NFT_SET_PIPAPO_AVX2_LOOKUP(4, 32);
} else {
ret = nft_pipapo_avx2_lookup_slow(m, res, fill, f,
- ret, rp,
+ ret, data,
first, last);
}
}
@@ -1241,28 +1225,78 @@ next_match:
#undef NFT_SET_PIPAPO_AVX2_LOOKUP
- if (ret < 0)
- goto out;
+next_match:
+ if (ret < 0) {
+ scratch->map_index = map_index;
+ kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return NULL;
+ }
if (last) {
- const struct nft_set_ext *e = &f->mt[ret].e->ext;
+ struct nft_pipapo_elem *e;
- if (unlikely(nft_set_elem_expired(e)))
+ e = f->mt[ret].e;
+ if (unlikely(__nft_set_elem_expired(&e->ext, tstamp) ||
+ !nft_set_elem_active(&e->ext, genmask))) {
+ ret = pipapo_refill(res, f->bsize, f->rules,
+ fill, f->mt, last);
goto next_match;
+ }
- ext = e;
- goto out;
+ scratch->map_index = map_index;
+ kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return e;
}
+ map_index = !map_index;
swap(res, fill);
- rp += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
+ data += NFT_PIPAPO_GROUPS_PADDED_SIZE(f);
}
-out:
- if (i % 2)
- scratch->map_index = !map_index;
kernel_fpu_end();
+ __local_unlock_nested_bh(&scratch->bh_lock);
+ return NULL;
+}
+
+/**
+ * nft_pipapo_avx2_lookup() - Dataplane frontend for AVX2 implementation
+ * @net: Network namespace
+ * @set: nftables API set representation
+ * @key: nftables API element representation containing key data
+ *
+ * This function is called from the data path. It will search for
+ * an element matching the given key in the current active copy using
+ * the AVX2 routines if the FPU is usable or fall back to the generic
+ * implementation of the algorithm otherwise.
+ *
+ * Return: nftables API extension pointer or NULL if no match.
+ */
+const struct nft_set_ext *
+nft_pipapo_avx2_lookup(const struct net *net, const struct nft_set *set,
+ const u32 *key)
+{
+ struct nft_pipapo *priv = nft_set_priv(set);
+ const struct nft_pipapo_match *m;
+ const u8 *rp = (const u8 *)key;
+ const struct nft_pipapo_elem *e;
+
+ local_bh_disable();
+
+ if (unlikely(!irq_fpu_usable())) {
+ const struct nft_set_ext *ext;
+
+ ext = nft_pipapo_lookup(net, set, key);
+
+ local_bh_enable();
+ return ext;
+ }
+
+ m = rcu_dereference(priv->match);
+
+ e = pipapo_get_avx2(m, rp, 0, get_jiffies_64());
local_bh_enable();
- return ext;
+ return e ? &e->ext : NULL;
}
diff --git a/net/netfilter/nft_set_pipapo_avx2.h b/net/netfilter/nft_set_pipapo_avx2.h
index dbb6aaca8a7a..c2999b63da3f 100644
--- a/net/netfilter/nft_set_pipapo_avx2.h
+++ b/net/netfilter/nft_set_pipapo_avx2.h
@@ -5,8 +5,12 @@
#include <asm/fpu/xstate.h>
#define NFT_PIPAPO_ALIGN (XSAVE_YMM_SIZE / BITS_PER_BYTE)
+struct nft_pipapo_match;
bool nft_pipapo_avx2_estimate(const struct nft_set_desc *desc, u32 features,
struct nft_set_estimate *est);
+struct nft_pipapo_elem *pipapo_get_avx2(const struct nft_pipapo_match *m,
+ const u8 *data, u8 genmask,
+ u64 tstamp);
#endif /* defined(CONFIG_X86_64) && !defined(CONFIG_UML) */
#endif /* _NFT_SET_PIPAPO_AVX2_H */
diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c
index b1f04168ec93..ca594161b840 100644
--- a/net/netfilter/nft_set_rbtree.c
+++ b/net/netfilter/nft_set_rbtree.c
@@ -584,15 +584,14 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
return NULL;
}
-static void nft_rbtree_walk(const struct nft_ctx *ctx,
- struct nft_set *set,
- struct nft_set_iter *iter)
+static void nft_rbtree_do_walk(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
{
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe;
struct rb_node *node;
- read_lock_bh(&priv->lock);
for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
rbe = rb_entry(node, struct nft_rbtree_elem, node);
@@ -600,14 +599,34 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
goto cont;
iter->err = iter->fn(ctx, set, iter, &rbe->priv);
- if (iter->err < 0) {
- read_unlock_bh(&priv->lock);
+ if (iter->err < 0)
return;
- }
cont:
iter->count++;
}
- read_unlock_bh(&priv->lock);
+}
+
+static void nft_rbtree_walk(const struct nft_ctx *ctx,
+ struct nft_set *set,
+ struct nft_set_iter *iter)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+
+ switch (iter->type) {
+ case NFT_ITER_UPDATE:
+ lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
+ nft_rbtree_do_walk(ctx, set, iter);
+ break;
+ case NFT_ITER_READ:
+ read_lock_bh(&priv->lock);
+ nft_rbtree_do_walk(ctx, set, iter);
+ read_unlock_bh(&priv->lock);
+ break;
+ default:
+ iter->err = -EINVAL;
+ WARN_ON_ONCE(1);
+ break;
+ }
}
static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index e2f7080dd5d7..2b46c0cd752a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -356,7 +356,7 @@ static void netlink_overrun(struct sock *sk)
sk_error_report(sk);
}
}
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
}
static void netlink_rcv_wake(struct sock *sk)
@@ -2711,7 +2711,7 @@ static int netlink_native_seq_show(struct seq_file *seq, void *v)
sk_wmem_alloc_get(s),
READ_ONCE(nlk->cb_running),
refcount_read(&s->sk_refcnt),
- atomic_read(&s->sk_drops),
+ sk_drops_read(s),
sock_i_ino(s)
);
diff --git a/net/nfc/nci/ntf.c b/net/nfc/nci/ntf.c
index a818eff27e6b..418b84e2b260 100644
--- a/net/nfc/nci/ntf.c
+++ b/net/nfc/nci/ntf.c
@@ -27,11 +27,16 @@
/* Handle NCI Notification packets */
-static void nci_core_reset_ntf_packet(struct nci_dev *ndev,
- const struct sk_buff *skb)
+static int nci_core_reset_ntf_packet(struct nci_dev *ndev,
+ const struct sk_buff *skb)
{
/* Handle NCI 2.x core reset notification */
- const struct nci_core_reset_ntf *ntf = (void *)skb->data;
+ const struct nci_core_reset_ntf *ntf;
+
+ if (skb->len < sizeof(struct nci_core_reset_ntf))
+ return -EINVAL;
+
+ ntf = (struct nci_core_reset_ntf *)skb->data;
ndev->nci_ver = ntf->nci_ver;
pr_debug("nci_ver 0x%x, config_status 0x%x\n",
@@ -42,15 +47,22 @@ static void nci_core_reset_ntf_packet(struct nci_dev *ndev,
__le32_to_cpu(ntf->manufact_specific_info);
nci_req_complete(ndev, NCI_STATUS_OK);
+
+ return 0;
}
-static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev,
- struct sk_buff *skb)
+static int nci_core_conn_credits_ntf_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
{
- struct nci_core_conn_credit_ntf *ntf = (void *) skb->data;
+ struct nci_core_conn_credit_ntf *ntf;
struct nci_conn_info *conn_info;
int i;
+ if (skb->len < sizeof(struct nci_core_conn_credit_ntf))
+ return -EINVAL;
+
+ ntf = (struct nci_core_conn_credit_ntf *)skb->data;
+
pr_debug("num_entries %d\n", ntf->num_entries);
if (ntf->num_entries > NCI_MAX_NUM_CONN)
@@ -68,7 +80,7 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev,
conn_info = nci_get_conn_info_by_conn_id(ndev,
ntf->conn_entries[i].conn_id);
if (!conn_info)
- return;
+ return 0;
atomic_add(ntf->conn_entries[i].credits,
&conn_info->credits_cnt);
@@ -77,12 +89,19 @@ static void nci_core_conn_credits_ntf_packet(struct nci_dev *ndev,
/* trigger the next tx */
if (!skb_queue_empty(&ndev->tx_q))
queue_work(ndev->tx_wq, &ndev->tx_work);
+
+ return 0;
}
-static void nci_core_generic_error_ntf_packet(struct nci_dev *ndev,
- const struct sk_buff *skb)
+static int nci_core_generic_error_ntf_packet(struct nci_dev *ndev,
+ const struct sk_buff *skb)
{
- __u8 status = skb->data[0];
+ __u8 status;
+
+ if (skb->len < 1)
+ return -EINVAL;
+
+ status = skb->data[0];
pr_debug("status 0x%x\n", status);
@@ -91,12 +110,19 @@ static void nci_core_generic_error_ntf_packet(struct nci_dev *ndev,
(the state remains the same) */
nci_req_complete(ndev, status);
}
+
+ return 0;
}
-static void nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev,
- struct sk_buff *skb)
+static int nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev,
+ struct sk_buff *skb)
{
- struct nci_core_intf_error_ntf *ntf = (void *) skb->data;
+ struct nci_core_intf_error_ntf *ntf;
+
+ if (skb->len < sizeof(struct nci_core_intf_error_ntf))
+ return -EINVAL;
+
+ ntf = (struct nci_core_intf_error_ntf *)skb->data;
ntf->conn_id = nci_conn_id(&ntf->conn_id);
@@ -105,6 +131,8 @@ static void nci_core_conn_intf_error_ntf_packet(struct nci_dev *ndev,
/* complete the data exchange transaction, if exists */
if (test_bit(NCI_DATA_EXCHANGE, &ndev->flags))
nci_data_exchange_complete(ndev, NULL, ntf->conn_id, -EIO);
+
+ return 0;
}
static const __u8 *
@@ -329,13 +357,18 @@ void nci_clear_target_list(struct nci_dev *ndev)
ndev->n_targets = 0;
}
-static void nci_rf_discover_ntf_packet(struct nci_dev *ndev,
- const struct sk_buff *skb)
+static int nci_rf_discover_ntf_packet(struct nci_dev *ndev,
+ const struct sk_buff *skb)
{
struct nci_rf_discover_ntf ntf;
- const __u8 *data = skb->data;
+ const __u8 *data;
bool add_target = true;
+ if (skb->len < sizeof(struct nci_rf_discover_ntf))
+ return -EINVAL;
+
+ data = skb->data;
+
ntf.rf_discovery_id = *data++;
ntf.rf_protocol = *data++;
ntf.rf_tech_and_mode = *data++;
@@ -390,6 +423,8 @@ static void nci_rf_discover_ntf_packet(struct nci_dev *ndev,
nfc_targets_found(ndev->nfc_dev, ndev->targets,
ndev->n_targets);
}
+
+ return 0;
}
static int nci_extract_activation_params_iso_dep(struct nci_dev *ndev,
@@ -553,14 +588,19 @@ static int nci_store_ats_nfc_iso_dep(struct nci_dev *ndev,
return NCI_STATUS_OK;
}
-static void nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev,
- const struct sk_buff *skb)
+static int nci_rf_intf_activated_ntf_packet(struct nci_dev *ndev,
+ const struct sk_buff *skb)
{
struct nci_conn_info *conn_info;
struct nci_rf_intf_activated_ntf ntf;
- const __u8 *data = skb->data;
+ const __u8 *data;
int err = NCI_STATUS_OK;
+ if (skb->len < sizeof(struct nci_rf_intf_activated_ntf))
+ return -EINVAL;
+
+ data = skb->data;
+
ntf.rf_discovery_id = *data++;
ntf.rf_interface = *data++;
ntf.rf_protocol = *data++;
@@ -667,7 +707,7 @@ exit:
if (err == NCI_STATUS_OK) {
conn_info = ndev->rf_conn_info;
if (!conn_info)
- return;
+ return 0;
conn_info->max_pkt_payload_len = ntf.max_data_pkt_payload_size;
conn_info->initial_num_credits = ntf.initial_num_credits;
@@ -721,19 +761,26 @@ listen:
pr_err("error when signaling tm activation\n");
}
}
+
+ return 0;
}
-static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev,
- const struct sk_buff *skb)
+static int nci_rf_deactivate_ntf_packet(struct nci_dev *ndev,
+ const struct sk_buff *skb)
{
const struct nci_conn_info *conn_info;
- const struct nci_rf_deactivate_ntf *ntf = (void *)skb->data;
+ const struct nci_rf_deactivate_ntf *ntf;
+
+ if (skb->len < sizeof(struct nci_rf_deactivate_ntf))
+ return -EINVAL;
+
+ ntf = (struct nci_rf_deactivate_ntf *)skb->data;
pr_debug("entry, type 0x%x, reason 0x%x\n", ntf->type, ntf->reason);
conn_info = ndev->rf_conn_info;
if (!conn_info)
- return;
+ return 0;
/* drop tx data queue */
skb_queue_purge(&ndev->tx_q);
@@ -765,14 +812,20 @@ static void nci_rf_deactivate_ntf_packet(struct nci_dev *ndev,
}
nci_req_complete(ndev, NCI_STATUS_OK);
+
+ return 0;
}
-static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev,
- const struct sk_buff *skb)
+static int nci_nfcee_discover_ntf_packet(struct nci_dev *ndev,
+ const struct sk_buff *skb)
{
u8 status = NCI_STATUS_OK;
- const struct nci_nfcee_discover_ntf *nfcee_ntf =
- (struct nci_nfcee_discover_ntf *)skb->data;
+ const struct nci_nfcee_discover_ntf *nfcee_ntf;
+
+ if (skb->len < sizeof(struct nci_nfcee_discover_ntf))
+ return -EINVAL;
+
+ nfcee_ntf = (struct nci_nfcee_discover_ntf *)skb->data;
/* NFCForum NCI 9.2.1 HCI Network Specific Handling
* If the NFCC supports the HCI Network, it SHALL return one,
@@ -783,6 +836,8 @@ static void nci_nfcee_discover_ntf_packet(struct nci_dev *ndev,
ndev->cur_params.id = nfcee_ntf->nfcee_id;
nci_req_complete(ndev, status);
+
+ return 0;
}
void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb)
@@ -809,35 +864,43 @@ void nci_ntf_packet(struct nci_dev *ndev, struct sk_buff *skb)
switch (ntf_opcode) {
case NCI_OP_CORE_RESET_NTF:
- nci_core_reset_ntf_packet(ndev, skb);
+ if (nci_core_reset_ntf_packet(ndev, skb))
+ goto end;
break;
case NCI_OP_CORE_CONN_CREDITS_NTF:
- nci_core_conn_credits_ntf_packet(ndev, skb);
+ if (nci_core_conn_credits_ntf_packet(ndev, skb))
+ goto end;
break;
case NCI_OP_CORE_GENERIC_ERROR_NTF:
- nci_core_generic_error_ntf_packet(ndev, skb);
+ if (nci_core_generic_error_ntf_packet(ndev, skb))
+ goto end;
break;
case NCI_OP_CORE_INTF_ERROR_NTF:
- nci_core_conn_intf_error_ntf_packet(ndev, skb);
+ if (nci_core_conn_intf_error_ntf_packet(ndev, skb))
+ goto end;
break;
case NCI_OP_RF_DISCOVER_NTF:
- nci_rf_discover_ntf_packet(ndev, skb);
+ if (nci_rf_discover_ntf_packet(ndev, skb))
+ goto end;
break;
case NCI_OP_RF_INTF_ACTIVATED_NTF:
- nci_rf_intf_activated_ntf_packet(ndev, skb);
+ if (nci_rf_intf_activated_ntf_packet(ndev, skb))
+ goto end;
break;
case NCI_OP_RF_DEACTIVATE_NTF:
- nci_rf_deactivate_ntf_packet(ndev, skb);
+ if (nci_rf_deactivate_ntf_packet(ndev, skb))
+ goto end;
break;
case NCI_OP_NFCEE_DISCOVER_NTF:
- nci_nfcee_discover_ntf_packet(ndev, skb);
+ if (nci_nfcee_discover_ntf_packet(ndev, skb))
+ goto end;
break;
case NCI_OP_RF_NFCEE_ACTION_NTF:
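Common shape of all the nci ntf fixes above: validate skb->len before overlaying a fixed-size structure on skb->data, and return -EINVAL so the dispatcher can skip further processing of a truncated notification. A condensed sketch (example_ntf_packet is a hypothetical handler):

	static int example_ntf_packet(struct nci_dev *ndev,
				      const struct sk_buff *skb)
	{
		const struct nci_core_reset_ntf *ntf;

		/* Never trust the controller: the payload may be shorter
		 * than the structure we are about to cast it to.
		 */
		if (skb->len < sizeof(*ntf))
			return -EINVAL;

		ntf = (const struct nci_core_reset_ntf *)skb->data;
		/* ... use ntf fields ... */
		return 0;
	}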
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
index 7af0cde8b293..a2af90ee99af 100644
--- a/net/openvswitch/dp_notify.c
+++ b/net/openvswitch/dp_notify.c
@@ -75,7 +75,7 @@ static int dp_device_event(struct notifier_block *unused, unsigned long event,
/* schedule vport destroy, dev_put and genl notification */
ovs_net = net_generic(dev_net(dev), ovs_net_id);
- queue_work(system_wq, &ovs_net->dp_notify_work);
+ queue_work(system_percpu_wq, &ovs_net->dp_notify_work);
}
return NOTIFY_DONE;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index b80bd3a90773..66366982f604 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -129,15 +129,13 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
struct ovs_flow_stats *ovs_stats,
unsigned long *used, __be16 *tcp_flags)
{
- int cpu;
+ unsigned int cpu;
*used = 0;
*tcp_flags = 0;
memset(ovs_stats, 0, sizeof(*ovs_stats));
- /* We open code this to make sure cpu 0 is always considered */
- for (cpu = 0; cpu < nr_cpu_ids;
- cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
+ for_each_cpu(cpu, flow->cpu_used_mask) {
struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
if (stats) {
@@ -158,11 +156,9 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
/* Called with ovs_mutex. */
void ovs_flow_stats_clear(struct sw_flow *flow)
{
- int cpu;
+ unsigned int cpu;
- /* We open code this to make sure cpu 0 is always considered */
- for (cpu = 0; cpu < nr_cpu_ids;
- cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
+ for_each_cpu(cpu, flow->cpu_used_mask) {
struct sw_flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
if (stats) {
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index d108ae0bd0ee..ffc72a741a50 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -107,16 +107,15 @@ int ovs_flow_tbl_count(const struct flow_table *table)
static void flow_free(struct sw_flow *flow)
{
- int cpu;
+ unsigned int cpu;
if (ovs_identifier_is_key(&flow->id))
kfree(flow->id.unmasked_key);
if (flow->sf_acts)
ovs_nla_free_flow_actions((struct sw_flow_actions __force *)
flow->sf_acts);
- /* We open code this to make sure cpu 0 is always considered */
- for (cpu = 0; cpu < nr_cpu_ids;
- cpu = cpumask_next(cpu, flow->cpu_used_mask)) {
+
+ for_each_cpu(cpu, flow->cpu_used_mask) {
if (flow->stats[cpu])
kmem_cache_free(flow_stats_cache,
(struct sw_flow_stats __force *)flow->stats[cpu]);
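The three openvswitch hunks replace the open-coded cpumask scan with for_each_cpu(). The old loop forced CPU 0 into the iteration by starting there; the new form visits exactly the bits set in flow->cpu_used_mask, which relies on the assumption that the flow allocator always marks CPU 0 as used. Sketch:

	unsigned int cpu;

	/* for_each_cpu() visits only set bits; CPU 0 is covered because it
	 * is assumed to be set in cpu_used_mask at allocation time.
	 */
	for_each_cpu(cpu, flow->cpu_used_mask) {
		struct sw_flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);

		if (stats) {
			/* ... aggregate or free per-CPU stats ... */
		}
	}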
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index a7017d7f0927..173e6edda08f 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
static int prb_queue_frozen(struct tpacket_kbdq_core *);
static void prb_open_block(struct tpacket_kbdq_core *,
struct tpacket_block_desc *);
-static void prb_retire_rx_blk_timer_expired(struct timer_list *);
-static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
static void prb_clear_rxhash(struct tpacket_kbdq_core *,
struct tpacket3_hdr *);
@@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
return proto;
}
-static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
-{
- timer_delete_sync(&pkc->retire_blk_timer);
-}
-
static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
struct sk_buff_head *rb_queue)
{
struct tpacket_kbdq_core *pkc;
pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
-
- spin_lock_bh(&rb_queue->lock);
- pkc->delete_blk_timer = 1;
- spin_unlock_bh(&rb_queue->lock);
-
- prb_del_retire_blk_timer(pkc);
-}
-
-static void prb_setup_retire_blk_timer(struct packet_sock *po)
-{
- struct tpacket_kbdq_core *pkc;
-
- pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
- timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
- 0);
- pkc->retire_blk_timer.expires = jiffies;
+ hrtimer_cancel(&pkc->retire_blk_timer);
}
static int prb_calc_retire_blk_tmo(struct packet_sock *po,
@@ -669,57 +648,36 @@ static void init_prb_bdqc(struct packet_sock *po,
p1->knum_blocks = req_u->req3.tp_block_nr;
p1->hdrlen = po->tp_hdrlen;
p1->version = po->tp_version;
- p1->last_kactive_blk_num = 0;
po->stats.stats3.tp_freeze_q_cnt = 0;
if (req_u->req3.tp_retire_blk_tov)
- p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
+ p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
else
- p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
- req_u->req3.tp_block_size);
- p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
+ p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
+ req_u->req3.tp_block_size));
p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
rwlock_init(&p1->blk_fill_in_prog_lock);
p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
prb_init_ft_ops(p1, req_u);
- prb_setup_retire_blk_timer(po);
+ hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+ hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
+ HRTIMER_MODE_REL_SOFT);
prb_open_block(p1, pbd);
}
-/* Do NOT update the last_blk_num first.
- * Assumes sk_buff_head lock is held.
- */
-static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
-{
- mod_timer(&pkc->retire_blk_timer,
- jiffies + pkc->tov_in_jiffies);
- pkc->last_kactive_blk_num = pkc->kactive_blk_num;
-}
-
/*
- * Timer logic:
- * 1) We refresh the timer only when we open a block.
- * By doing this we don't waste cycles refreshing the timer
- * on packet-by-packet basis.
- *
* With a 1MB block-size, on a 1Gbps line, it will take
* i) ~8 ms to fill a block + ii) memcpy etc.
* In this cut we are not accounting for the memcpy time.
*
- * So, if the user sets the 'tmo' to 10ms then the timer
- * will never fire while the block is still getting filled
- * (which is what we want). However, the user could choose
- * to close a block early and that's fine.
- *
- * But when the timer does fire, we check whether or not to refresh it.
* Since the tmo granularity is in msecs, it is not too expensive
* to refresh the timer, let's say every '8' msecs.
* Either the user can set the 'tmo' or we can derive it based on
* a) line-speed and b) block-size.
* prb_calc_retire_blk_tmo() calculates the tmo.
- *
*/
-static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
{
struct packet_sock *po =
timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
@@ -732,9 +690,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
frozen = prb_queue_frozen(pkc);
pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
- if (unlikely(pkc->delete_blk_timer))
- goto out;
-
/* We only need to plug the race when the block is partially filled.
* tpacket_rcv:
* lock(); increment BLOCK_NUM_PKTS; unlock()
@@ -750,46 +705,31 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
write_unlock(&pkc->blk_fill_in_prog_lock);
}
- if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
- if (!frozen) {
- if (!BLOCK_NUM_PKTS(pbd)) {
- /* An empty block. Just refresh the timer. */
- goto refresh_timer;
- }
+ if (!frozen) {
+ if (BLOCK_NUM_PKTS(pbd)) {
+ /* Not an empty block. Need to retire the block. */
prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
- if (!prb_dispatch_next_block(pkc, po))
- goto refresh_timer;
- else
- goto out;
- } else {
- /* Case 1. Queue was frozen because user-space was
- * lagging behind.
+ prb_dispatch_next_block(pkc, po);
+ }
+ } else {
+ /* Case 1. Queue was frozen because user-space was
+ * lagging behind.
+ */
+ if (!prb_curr_blk_in_use(pbd)) {
+ /* Case 2. Queue was frozen, user-space caught up,
+ * now the link went idle && the timer fired.
+ * We don't have a block to close, so we open this
+ * block and restart the timer.
+ * Opening a block thaws the queue and restarts the timer;
+ * thawing/timer-refresh is a side effect.
*/
- if (prb_curr_blk_in_use(pbd)) {
- /*
- * Ok, user-space is still behind.
- * So just refresh the timer.
- */
- goto refresh_timer;
- } else {
- /* Case 2. queue was frozen,user-space caught up,
- * now the link went idle && the timer fired.
- * We don't have a block to close.So we open this
- * block and restart the timer.
- * opening a block thaws the queue,restarts timer
- * Thawing/timer-refresh is a side effect.
- */
- prb_open_block(pkc, pbd);
- goto out;
- }
+ prb_open_block(pkc, pbd);
}
}
-refresh_timer:
- _prb_refresh_rx_retire_blk_timer(pkc);
-
-out:
+ hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
spin_unlock(&po->sk.sk_receive_queue.lock);
+ return HRTIMER_RESTART;
}
static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
@@ -883,11 +823,18 @@ static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
}
/*
- * Side effect of opening a block:
+ * prb_open_block is called by tpacket_rcv or the timer callback.
*
- * 1) prb_queue is thawed.
- * 2) retire_blk_timer is refreshed.
+ * Reasons why the hrtimer is NOT updated in prb_open_block:
+ * 1) It would add complexity to distinguish the two caller scenarios.
+ * 2) hrtimer_cancel and hrtimer_start would need to be called to update the
+ * TMO of an already enqueued hrtimer, leading to complex shutdown logic.
*
+ * One side effect of NOT updating the hrtimer when called by tpacket_rcv is
+ * that a newly opened block triggered by tpacket_rcv may be retired earlier
+ * than expected. On the other hand, if the timeout were updated in
+ * prb_open_block, the frequent packet reception that leads to prb_open_block
+ * being called could cause the hrtimer to be removed and re-enqueued
+ * repeatedly.
*/
static void prb_open_block(struct tpacket_kbdq_core *pkc1,
struct tpacket_block_desc *pbd1)
@@ -921,7 +868,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
prb_thaw_queue(pkc1);
- _prb_refresh_rx_retire_blk_timer(pkc1);
smp_wmb();
}
@@ -2265,7 +2211,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
drop_n_acct:
atomic_inc(&po->tp_drops);
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
drop_reason = SKB_DROP_REASON_PACKET_SOCK_ERROR;
drop_n_restore:
diff --git a/net/packet/diag.c b/net/packet/diag.c
index 6ce1dcc284d9..c8f43e0c1925 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
pdr.pdr_frame_nr = ring->frame_max + 1;
if (ver > TPACKET_V2) {
- pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
+ pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime);
pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
pdr.pdr_features = ring->prb_bdqc.feature_req_word;
} else {
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 1e743d0316fd..b76e645cd78d 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -20,15 +20,10 @@ struct tpacket_kbdq_core {
unsigned int feature_req_word;
unsigned int hdrlen;
unsigned char reset_pending_on_curr_blk;
- unsigned char delete_blk_timer;
unsigned short kactive_blk_num;
unsigned short blk_sizeof_priv;
- /* last_kactive_blk_num:
- * trick to see if user-space has caught up
- * in order to avoid refreshing timer when every single pkt arrives.
- */
- unsigned short last_kactive_blk_num;
+ unsigned short version;
char *pkblk_start;
char *pkblk_end;
@@ -38,6 +33,7 @@ struct tpacket_kbdq_core {
uint64_t knxt_seq_num;
char *prev;
char *nxt_offset;
+
struct sk_buff *skb;
rwlock_t blk_fill_in_prog_lock;
@@ -45,12 +41,10 @@ struct tpacket_kbdq_core {
/* Default is set to 8ms */
#define DEFAULT_PRB_RETIRE_TOV (8)
- unsigned short retire_blk_tov;
- unsigned short version;
- unsigned long tov_in_jiffies;
+ ktime_t interval_ktime;
/* timer to retire an outstanding block */
- struct timer_list retire_blk_timer;
+ struct hrtimer retire_blk_timer;
};
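For reference, a field pair of this shape is typically initialized and armed along the following lines — a sketch reusing the demo_ring type from the earlier example; hrtimer_setup() and the relative mode are assumptions (older code uses hrtimer_init() and assigns ->function by hand):

	#include <linux/hrtimer.h>
	#include <linux/ktime.h>

	static void demo_ring_timer_arm(struct demo_ring *ring, unsigned int tmo_ms)
	{
		/* store the period once as ktime_t, like interval_ktime above */
		ring->interval_ktime = ms_to_ktime(tmo_ms ?: DEFAULT_PRB_RETIRE_TOV);

		hrtimer_setup(&ring->timer, demo_retire_expired,
			      CLOCK_MONOTONIC, HRTIMER_MODE_REL);
		hrtimer_start(&ring->timer, ring->interval_ktime, HRTIMER_MODE_REL);
	}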
struct pgv {
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index a27efa4faa4e..238a9638d2b0 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -22,7 +22,7 @@
#include <net/phonet/pn_dev.h>
/* Transport protocol registration */
-static const struct phonet_protocol *proto_tab[PHONET_NPROTO] __read_mostly;
+static const struct phonet_protocol __rcu *proto_tab[PHONET_NPROTO] __read_mostly;
static const struct phonet_protocol *phonet_proto_get(unsigned int protocol)
{
@@ -482,7 +482,7 @@ void phonet_proto_unregister(unsigned int protocol,
const struct phonet_protocol *pp)
{
mutex_lock(&proto_tab_lock);
- BUG_ON(proto_tab[protocol] != pp);
+ BUG_ON(rcu_access_pointer(proto_tab[protocol]) != pp);
RCU_INIT_POINTER(proto_tab[protocol], NULL);
mutex_unlock(&proto_tab_lock);
synchronize_rcu();
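The __rcu annotation added above is purely for static checking: sparse then insists that every access goes through an RCU accessor. In sketch form (illustrative types):

	#include <linux/rcupdate.h>

	struct demo_proto { int id; };

	static struct demo_proto __rcu *demo_slot;	/* writes guarded by a mutex */

	/* Reader side: dereference only inside rcu_read_lock(). */
	static int demo_reader(void)
	{
		struct demo_proto *p;
		int id = -1;

		rcu_read_lock();
		p = rcu_dereference(demo_slot);
		if (p)
			id = p->id;
		rcu_read_unlock();
		return id;
	}

	/* A pure pointer comparison needs no dereference, so the lock-free
	 * rcu_access_pointer() suffices -- exactly what the BUG_ON above uses.
	 */
	static bool demo_matches(const struct demo_proto *p)
	{
		return rcu_access_pointer(demo_slot) == p;
	}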
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 62527e1ebb88..4db564d9d522 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -376,7 +376,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
case PNS_PEP_CTRL_REQ:
if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
break;
}
__skb_pull(skb, 4);
@@ -397,7 +397,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb)
}
if (pn->rx_credits == 0) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
err = -ENOBUFS;
break;
}
@@ -567,7 +567,7 @@ static int pipe_handler_do_rcv(struct sock *sk, struct sk_buff *skb)
}
if (pn->rx_credits == 0) {
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
err = NET_RX_DROP;
break;
}
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index ea4d5e6533db..db2d552e9b32 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -587,7 +587,7 @@ static int pn_sock_seq_show(struct seq_file *seq, void *v)
from_kuid_munged(seq_user_ns(seq), sk_uid(sk)),
sock_i_ino(sk),
refcount_read(&sk->sk_refcnt), sk,
- atomic_read(&sk->sk_drops));
+ sk_drops_read(sk));
}
seq_pad(seq, '\n');
return 0;
@@ -602,7 +602,7 @@ const struct seq_operations pn_sock_seq_ops = {
#endif
static struct {
- struct sock *sk[256];
+ struct sock __rcu *sk[256];
} pnres;
/*
@@ -654,7 +654,7 @@ int pn_sock_unbind_res(struct sock *sk, u8 res)
return -EPERM;
mutex_lock(&resource_mutex);
- if (pnres.sk[res] == sk) {
+ if (rcu_access_pointer(pnres.sk[res]) == sk) {
RCU_INIT_POINTER(pnres.sk[res], NULL);
ret = 0;
}
@@ -673,7 +673,7 @@ void pn_sock_unbind_all_res(struct sock *sk)
mutex_lock(&resource_mutex);
for (res = 0; res < 256; res++) {
- if (pnres.sk[res] == sk) {
+ if (rcu_access_pointer(pnres.sk[res]) == sk) {
RCU_INIT_POINTER(pnres.sk[res], NULL);
match++;
}
@@ -688,7 +688,7 @@ void pn_sock_unbind_all_res(struct sock *sk)
}
#ifdef CONFIG_PROC_FS
-static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos)
+static struct sock __rcu **pn_res_get_idx(struct seq_file *seq, loff_t pos)
{
struct net *net = seq_file_net(seq);
unsigned int i;
@@ -697,7 +697,7 @@ static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos)
return NULL;
for (i = 0; i < 256; i++) {
- if (pnres.sk[i] == NULL)
+ if (rcu_access_pointer(pnres.sk[i]) == NULL)
continue;
if (!pos)
return pnres.sk + i;
@@ -706,7 +706,7 @@ static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos)
return NULL;
}
-static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk)
+static struct sock __rcu **pn_res_get_next(struct seq_file *seq, struct sock __rcu **sk)
{
struct net *net = seq_file_net(seq);
unsigned int i;
@@ -728,7 +728,7 @@ static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos)
static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct sock **sk;
+ struct sock __rcu **sk;
if (v == SEQ_START_TOKEN)
sk = pn_res_get_idx(seq, 0);
@@ -747,11 +747,12 @@ static void pn_res_seq_stop(struct seq_file *seq, void *v)
static int pn_res_seq_show(struct seq_file *seq, void *v)
{
seq_setwidth(seq, 63);
- if (v == SEQ_START_TOKEN)
+ if (v == SEQ_START_TOKEN) {
seq_puts(seq, "rs uid inode");
- else {
- struct sock **psk = v;
- struct sock *sk = *psk;
+ } else {
+ struct sock __rcu **psk = v;
+ struct sock *sk = rcu_dereference_protected(*psk,
+ lockdep_is_held(&resource_mutex));
seq_printf(seq, "%02X %5u %lu",
(int) (psk - pnres.sk),
diff --git a/net/psp/Kconfig b/net/psp/Kconfig
new file mode 100644
index 000000000000..371e8771f3bd
--- /dev/null
+++ b/net/psp/Kconfig
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# PSP configuration
+#
+config INET_PSP
+ bool "PSP Security Protocol support"
+ depends on INET
+ select SKB_DECRYPTED
+ select SOCK_VALIDATE_XMIT
+ help
+ Enable kernel support for the PSP Security Protocol (PSP).
+ For more information see:
+ https://raw.githubusercontent.com/google/psp/main/doc/PSP_Arch_Spec.pdf
+
+ If unsure, say N.
diff --git a/net/psp/Makefile b/net/psp/Makefile
new file mode 100644
index 000000000000..eb5ff3c5bfb2
--- /dev/null
+++ b/net/psp/Makefile
@@ -0,0 +1,5 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+obj-$(CONFIG_INET_PSP) += psp.o
+
+psp-y := psp_main.o psp_nl.o psp_sock.o psp-nl-gen.o
diff --git a/net/psp/psp-nl-gen.c b/net/psp/psp-nl-gen.c
new file mode 100644
index 000000000000..9fdd6f831803
--- /dev/null
+++ b/net/psp/psp-nl-gen.c
@@ -0,0 +1,119 @@
+// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause)
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/psp.yaml */
+/* YNL-GEN kernel source */
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include "psp-nl-gen.h"
+
+#include <uapi/linux/psp.h>
+
+/* Common nested types */
+const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1] = {
+ [PSP_A_KEYS_KEY] = { .type = NLA_BINARY, },
+ [PSP_A_KEYS_SPI] = { .type = NLA_U32, },
+};
+
+/* PSP_CMD_DEV_GET - do */
+static const struct nla_policy psp_dev_get_nl_policy[PSP_A_DEV_ID + 1] = {
+ [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* PSP_CMD_DEV_SET - do */
+static const struct nla_policy psp_dev_set_nl_policy[PSP_A_DEV_PSP_VERSIONS_ENA + 1] = {
+ [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1),
+ [PSP_A_DEV_PSP_VERSIONS_ENA] = NLA_POLICY_MASK(NLA_U32, 0xf),
+};
+
+/* PSP_CMD_KEY_ROTATE - do */
+static const struct nla_policy psp_key_rotate_nl_policy[PSP_A_DEV_ID + 1] = {
+ [PSP_A_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1),
+};
+
+/* PSP_CMD_RX_ASSOC - do */
+static const struct nla_policy psp_rx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = {
+ [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1),
+ [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3),
+ [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, },
+};
+
+/* PSP_CMD_TX_ASSOC - do */
+static const struct nla_policy psp_tx_assoc_nl_policy[PSP_A_ASSOC_SOCK_FD + 1] = {
+ [PSP_A_ASSOC_DEV_ID] = NLA_POLICY_MIN(NLA_U32, 1),
+ [PSP_A_ASSOC_VERSION] = NLA_POLICY_MAX(NLA_U32, 3),
+ [PSP_A_ASSOC_TX_KEY] = NLA_POLICY_NESTED(psp_keys_nl_policy),
+ [PSP_A_ASSOC_SOCK_FD] = { .type = NLA_U32, },
+};
+
+/* Ops table for psp */
+static const struct genl_split_ops psp_nl_ops[] = {
+ {
+ .cmd = PSP_CMD_DEV_GET,
+ .pre_doit = psp_device_get_locked,
+ .doit = psp_nl_dev_get_doit,
+ .post_doit = psp_device_unlock,
+ .policy = psp_dev_get_nl_policy,
+ .maxattr = PSP_A_DEV_ID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = PSP_CMD_DEV_GET,
+ .dumpit = psp_nl_dev_get_dumpit,
+ .flags = GENL_CMD_CAP_DUMP,
+ },
+ {
+ .cmd = PSP_CMD_DEV_SET,
+ .pre_doit = psp_device_get_locked,
+ .doit = psp_nl_dev_set_doit,
+ .post_doit = psp_device_unlock,
+ .policy = psp_dev_set_nl_policy,
+ .maxattr = PSP_A_DEV_PSP_VERSIONS_ENA,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = PSP_CMD_KEY_ROTATE,
+ .pre_doit = psp_device_get_locked,
+ .doit = psp_nl_key_rotate_doit,
+ .post_doit = psp_device_unlock,
+ .policy = psp_key_rotate_nl_policy,
+ .maxattr = PSP_A_DEV_ID,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = PSP_CMD_RX_ASSOC,
+ .pre_doit = psp_assoc_device_get_locked,
+ .doit = psp_nl_rx_assoc_doit,
+ .post_doit = psp_device_unlock,
+ .policy = psp_rx_assoc_nl_policy,
+ .maxattr = PSP_A_ASSOC_SOCK_FD,
+ .flags = GENL_CMD_CAP_DO,
+ },
+ {
+ .cmd = PSP_CMD_TX_ASSOC,
+ .pre_doit = psp_assoc_device_get_locked,
+ .doit = psp_nl_tx_assoc_doit,
+ .post_doit = psp_device_unlock,
+ .policy = psp_tx_assoc_nl_policy,
+ .maxattr = PSP_A_ASSOC_SOCK_FD,
+ .flags = GENL_CMD_CAP_DO,
+ },
+};
+
+static const struct genl_multicast_group psp_nl_mcgrps[] = {
+ [PSP_NLGRP_MGMT] = { "mgmt", },
+ [PSP_NLGRP_USE] = { "use", },
+};
+
+struct genl_family psp_nl_family __ro_after_init = {
+ .name = PSP_FAMILY_NAME,
+ .version = PSP_FAMILY_VERSION,
+ .netnsok = true,
+ .parallel_ops = true,
+ .module = THIS_MODULE,
+ .split_ops = psp_nl_ops,
+ .n_split_ops = ARRAY_SIZE(psp_nl_ops),
+ .mcgrps = psp_nl_mcgrps,
+ .n_mcgrps = ARRAY_SIZE(psp_nl_mcgrps),
+};
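The pre_doit/post_doit wiring in the ops table above follows the standard genetlink split-ops contract: the core calls pre_doit, the doit handler picks the result out of info->user_ptr[], and post_doit runs even if doit fails. A sketch of the handler side (demo_doit is hypothetical; the real helpers live in psp_nl.c below):

	#include <net/genetlink.h>
	#include "psp.h"

	static int demo_doit(struct sk_buff *skb, struct genl_info *info)
	{
		/* stashed by psp_device_get_locked(), which also took psd->lock */
		struct psp_dev *psd = info->user_ptr[0];

		/* psd->lock is held for the whole handler; psp_device_unlock()
		 * (the post_doit) releases it after we return.
		 */
		return psp_dev_check_access(psd, genl_info_net(info));
	}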
diff --git a/net/psp/psp-nl-gen.h b/net/psp/psp-nl-gen.h
new file mode 100644
index 000000000000..25268ed11fb5
--- /dev/null
+++ b/net/psp/psp-nl-gen.h
@@ -0,0 +1,39 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/psp.yaml */
+/* YNL-GEN kernel header */
+
+#ifndef _LINUX_PSP_GEN_H
+#define _LINUX_PSP_GEN_H
+
+#include <net/netlink.h>
+#include <net/genetlink.h>
+
+#include <uapi/linux/psp.h>
+
+/* Common nested types */
+extern const struct nla_policy psp_keys_nl_policy[PSP_A_KEYS_SPI + 1];
+
+int psp_device_get_locked(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info);
+int psp_assoc_device_get_locked(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info);
+void
+psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb,
+ struct genl_info *info);
+
+int psp_nl_dev_get_doit(struct sk_buff *skb, struct genl_info *info);
+int psp_nl_dev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
+int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info);
+int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info);
+int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info);
+int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info);
+
+enum {
+ PSP_NLGRP_MGMT,
+ PSP_NLGRP_USE,
+};
+
+extern struct genl_family psp_nl_family;
+
+#endif /* _LINUX_PSP_GEN_H */
diff --git a/net/psp/psp.h b/net/psp/psp.h
new file mode 100644
index 000000000000..9f19137593a0
--- /dev/null
+++ b/net/psp/psp.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __PSP_PSP_H
+#define __PSP_PSP_H
+
+#include <linux/list.h>
+#include <linux/lockdep.h>
+#include <linux/mutex.h>
+#include <net/netns/generic.h>
+#include <net/psp.h>
+#include <net/sock.h>
+
+extern struct xarray psp_devs;
+extern struct mutex psp_devs_lock;
+
+void psp_dev_free(struct psp_dev *psd);
+int psp_dev_check_access(struct psp_dev *psd, struct net *net);
+
+void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd);
+
+struct psp_assoc *psp_assoc_create(struct psp_dev *psd);
+struct psp_dev *psp_dev_get_for_sock(struct sock *sk);
+void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas);
+int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas,
+ struct psp_key_parsed *key,
+ struct netlink_ext_ack *extack);
+int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd,
+ u32 version, struct psp_key_parsed *key,
+ struct netlink_ext_ack *extack);
+void psp_assocs_key_rotated(struct psp_dev *psd);
+
+static inline void psp_dev_get(struct psp_dev *psd)
+{
+ refcount_inc(&psd->refcnt);
+}
+
+static inline bool psp_dev_tryget(struct psp_dev *psd)
+{
+ return refcount_inc_not_zero(&psd->refcnt);
+}
+
+static inline void psp_dev_put(struct psp_dev *psd)
+{
+ if (refcount_dec_and_test(&psd->refcnt))
+ psp_dev_free(psd);
+}
+
+static inline bool psp_dev_is_registered(struct psp_dev *psd)
+{
+ lockdep_assert_held(&psd->lock);
+ return !!psd->ops;
+}
+
+#endif /* __PSP_PSP_H */
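Together, the tryget/put helpers above implement the usual lookup-then-reference discipline. A sketch of a lock-free lookup built on them (illustrative only — the in-tree lookups below go through psp_devs_lock or an RCU-protected netdev pointer; RCU readers are safe here because psp_dev_free() uses kfree_rcu):

	#include <linux/xarray.h>

	static struct psp_dev *demo_lookup(u32 id)
	{
		struct psp_dev *psd;

		rcu_read_lock();
		psd = xa_load(&psp_devs, id);
		/* tryget fails once the last reference is gone, so a psd
		 * that is concurrently being freed cannot be resurrected
		 */
		if (psd && !psp_dev_tryget(psd))
			psd = NULL;
		rcu_read_unlock();

		return psd;	/* caller drops with psp_dev_put() */
	}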
diff --git a/net/psp/psp_main.c b/net/psp/psp_main.c
new file mode 100644
index 000000000000..481aaf0fc9fc
--- /dev/null
+++ b/net/psp/psp_main.c
@@ -0,0 +1,322 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/bitfield.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/xarray.h>
+#include <net/net_namespace.h>
+#include <net/psp.h>
+#include <net/udp.h>
+
+#include "psp.h"
+#include "psp-nl-gen.h"
+
+DEFINE_XARRAY_ALLOC1(psp_devs);
+struct mutex psp_devs_lock;
+
+/**
+ * DOC: PSP locking
+ *
+ * psp_devs_lock protects the psp_devs xarray.
+ * Ordering is: take psp_devs_lock first, then the instance lock.
+ * Each instance is protected by RCU, and has a refcount.
+ * When the driver unregisters, the instance gets flushed, but the struct sticks around.
+ */
+
+/**
+ * psp_dev_check_access() - check if user in a given net ns can access PSP dev
+ * @psd: PSP device structure user is trying to access
+ * @net: net namespace user is in
+ *
+ * Return: 0 if PSP device should be visible in @net, errno otherwise.
+ */
+int psp_dev_check_access(struct psp_dev *psd, struct net *net)
+{
+ if (dev_net(psd->main_netdev) == net)
+ return 0;
+ return -ENOENT;
+}
+
+/**
+ * psp_dev_create() - create and register PSP device
+ * @netdev: main netdevice
+ * @psd_ops: driver callbacks
+ * @psd_caps: device capabilities
+ * @priv_ptr: back-pointer to driver private data
+ *
+ * Return: pointer to allocated PSP device, or ERR_PTR.
+ */
+struct psp_dev *
+psp_dev_create(struct net_device *netdev,
+ struct psp_dev_ops *psd_ops, struct psp_dev_caps *psd_caps,
+ void *priv_ptr)
+{
+ struct psp_dev *psd;
+ static u32 last_id;
+ int err;
+
+ if (WARN_ON(!psd_caps->versions ||
+ !psd_ops->set_config ||
+ !psd_ops->key_rotate ||
+ !psd_ops->rx_spi_alloc ||
+ !psd_ops->tx_key_add ||
+ !psd_ops->tx_key_del))
+ return ERR_PTR(-EINVAL);
+
+ psd = kzalloc(sizeof(*psd), GFP_KERNEL);
+ if (!psd)
+ return ERR_PTR(-ENOMEM);
+
+ psd->main_netdev = netdev;
+ psd->ops = psd_ops;
+ psd->caps = psd_caps;
+ psd->drv_priv = priv_ptr;
+
+ mutex_init(&psd->lock);
+ INIT_LIST_HEAD(&psd->active_assocs);
+ INIT_LIST_HEAD(&psd->prev_assocs);
+ INIT_LIST_HEAD(&psd->stale_assocs);
+ refcount_set(&psd->refcnt, 1);
+
+ mutex_lock(&psp_devs_lock);
+ err = xa_alloc_cyclic(&psp_devs, &psd->id, psd, xa_limit_16b,
+ &last_id, GFP_KERNEL);
+ if (err) {
+ mutex_unlock(&psp_devs_lock);
+ kfree(psd);
+ return ERR_PTR(err);
+ }
+ mutex_lock(&psd->lock);
+ mutex_unlock(&psp_devs_lock);
+
+ psp_nl_notify_dev(psd, PSP_CMD_DEV_ADD_NTF);
+
+ rcu_assign_pointer(netdev->psp_dev, psd);
+
+ mutex_unlock(&psd->lock);
+
+ return psd;
+}
+EXPORT_SYMBOL(psp_dev_create);
+
+void psp_dev_free(struct psp_dev *psd)
+{
+ mutex_lock(&psp_devs_lock);
+ xa_erase(&psp_devs, psd->id);
+ mutex_unlock(&psp_devs_lock);
+
+ mutex_destroy(&psd->lock);
+ kfree_rcu(psd, rcu);
+}
+
+/**
+ * psp_dev_unregister() - unregister PSP device
+ * @psd: PSP device structure
+ */
+void psp_dev_unregister(struct psp_dev *psd)
+{
+ struct psp_assoc *pas, *next;
+
+ mutex_lock(&psp_devs_lock);
+ mutex_lock(&psd->lock);
+
+ psp_nl_notify_dev(psd, PSP_CMD_DEV_DEL_NTF);
+
+ /* Defer the xa_erase() until psp_dev_free() to prevent a different
+ * psd from being added to the xarray under this id while references
+ * to this psd are still held.
+ */
+ xa_store(&psp_devs, psd->id, NULL, GFP_KERNEL);
+ mutex_unlock(&psp_devs_lock);
+
+ list_splice_init(&psd->active_assocs, &psd->prev_assocs);
+ list_splice_init(&psd->prev_assocs, &psd->stale_assocs);
+ list_for_each_entry_safe(pas, next, &psd->stale_assocs, assocs_list)
+ psp_dev_tx_key_del(psd, pas);
+
+ rcu_assign_pointer(psd->main_netdev->psp_dev, NULL);
+
+ psd->ops = NULL;
+ psd->drv_priv = NULL;
+
+ mutex_unlock(&psd->lock);
+
+ psp_dev_put(psd);
+}
+EXPORT_SYMBOL(psp_dev_unregister);
+
+unsigned int psp_key_size(u32 version)
+{
+ switch (version) {
+ case PSP_VERSION_HDR0_AES_GCM_128:
+ case PSP_VERSION_HDR0_AES_GMAC_128:
+ return 16;
+ case PSP_VERSION_HDR0_AES_GCM_256:
+ case PSP_VERSION_HDR0_AES_GMAC_256:
+ return 32;
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(psp_key_size);
+
+static void psp_write_headers(struct net *net, struct sk_buff *skb, __be32 spi,
+ u8 ver, unsigned int udp_len, __be16 sport)
+{
+ struct udphdr *uh = udp_hdr(skb);
+ struct psphdr *psph = (struct psphdr *)(uh + 1);
+
+ uh->dest = htons(PSP_DEFAULT_UDP_PORT);
+ uh->source = udp_flow_src_port(net, skb, 0, 0, false);
+ uh->check = 0;
+ uh->len = htons(udp_len);
+
+ psph->nexthdr = IPPROTO_TCP;
+ psph->hdrlen = PSP_HDRLEN_NOOPT;
+ psph->crypt_offset = 0;
+ psph->verfl = FIELD_PREP(PSPHDR_VERFL_VERSION, ver) |
+ FIELD_PREP(PSPHDR_VERFL_ONE, 1);
+ psph->spi = spi;
+ memset(&psph->iv, 0, sizeof(psph->iv));
+}
+
+/* Encapsulate a TCP packet with PSP by adding the UDP+PSP headers and filling
+ * them in.
+ */
+bool psp_dev_encapsulate(struct net *net, struct sk_buff *skb, __be32 spi,
+ u8 ver, __be16 sport)
+{
+ u32 network_len = skb_network_header_len(skb);
+ u32 ethr_len = skb_mac_header_len(skb);
+ u32 bufflen = ethr_len + network_len;
+
+ if (skb_cow_head(skb, PSP_ENCAP_HLEN))
+ return false;
+
+ skb_push(skb, PSP_ENCAP_HLEN);
+ skb->mac_header -= PSP_ENCAP_HLEN;
+ skb->network_header -= PSP_ENCAP_HLEN;
+ skb->transport_header -= PSP_ENCAP_HLEN;
+ memmove(skb->data, skb->data + PSP_ENCAP_HLEN, bufflen);
+
+ if (skb->protocol == htons(ETH_P_IP)) {
+ ip_hdr(skb)->protocol = IPPROTO_UDP;
+ be16_add_cpu(&ip_hdr(skb)->tot_len, PSP_ENCAP_HLEN);
+ ip_hdr(skb)->check = 0;
+ ip_hdr(skb)->check =
+ ip_fast_csum((u8 *)ip_hdr(skb), ip_hdr(skb)->ihl);
+ } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ ipv6_hdr(skb)->nexthdr = IPPROTO_UDP;
+ be16_add_cpu(&ipv6_hdr(skb)->payload_len, PSP_ENCAP_HLEN);
+ } else {
+ return false;
+ }
+
+ skb_set_inner_ipproto(skb, IPPROTO_TCP);
+ skb_set_inner_transport_header(skb, skb_transport_offset(skb) +
+ PSP_ENCAP_HLEN);
+ skb->encapsulation = 1;
+ psp_write_headers(net, skb, spi, ver,
+ skb->len - skb_transport_offset(skb), sport);
+
+ return true;
+}
+EXPORT_SYMBOL(psp_dev_encapsulate);
+
+/* Receive handler for PSP packets.
+ *
+ * Presently it accepts only already-authenticated packets and does not
+ * support optional fields, such as virtualization cookies. The caller should
+ * ensure that skb->data is pointing to the mac header, and that skb->mac_len
+ * is set. This function does not currently adjust skb->csum (CHECKSUM_COMPLETE
+ * is not supported).
+ */
+int psp_dev_rcv(struct sk_buff *skb, u16 dev_id, u8 generation, bool strip_icv)
+{
+ int l2_hlen = 0, l3_hlen, encap;
+ struct psp_skb_ext *pse;
+ struct psphdr *psph;
+ struct ethhdr *eth;
+ struct udphdr *uh;
+ __be16 proto;
+ bool is_udp;
+
+ eth = (struct ethhdr *)skb->data;
+ proto = __vlan_get_protocol(skb, eth->h_proto, &l2_hlen);
+ if (proto == htons(ETH_P_IP))
+ l3_hlen = sizeof(struct iphdr);
+ else if (proto == htons(ETH_P_IPV6))
+ l3_hlen = sizeof(struct ipv6hdr);
+ else
+ return -EINVAL;
+
+ if (unlikely(!pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN)))
+ return -EINVAL;
+
+ if (proto == htons(ETH_P_IP)) {
+ struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen);
+
+ is_udp = iph->protocol == IPPROTO_UDP;
+ l3_hlen = iph->ihl * 4;
+ if (l3_hlen != sizeof(struct iphdr) &&
+ !pskb_may_pull(skb, l2_hlen + l3_hlen + PSP_ENCAP_HLEN))
+ return -EINVAL;
+ } else {
+ struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen);
+
+ is_udp = ipv6h->nexthdr == IPPROTO_UDP;
+ }
+
+ if (unlikely(!is_udp))
+ return -EINVAL;
+
+ uh = (struct udphdr *)(skb->data + l2_hlen + l3_hlen);
+ if (unlikely(uh->dest != htons(PSP_DEFAULT_UDP_PORT)))
+ return -EINVAL;
+
+ pse = skb_ext_add(skb, SKB_EXT_PSP);
+ if (!pse)
+ return -EINVAL;
+
+ psph = (struct psphdr *)(skb->data + l2_hlen + l3_hlen +
+ sizeof(struct udphdr));
+ pse->spi = psph->spi;
+ pse->dev_id = dev_id;
+ pse->generation = generation;
+ pse->version = FIELD_GET(PSPHDR_VERFL_VERSION, psph->verfl);
+
+ encap = PSP_ENCAP_HLEN;
+ encap += strip_icv ? PSP_TRL_SIZE : 0;
+
+ if (proto == htons(ETH_P_IP)) {
+ struct iphdr *iph = (struct iphdr *)(skb->data + l2_hlen);
+
+ iph->protocol = psph->nexthdr;
+ iph->tot_len = htons(ntohs(iph->tot_len) - encap);
+ iph->check = 0;
+ iph->check = ip_fast_csum((u8 *)iph, iph->ihl);
+ } else {
+ struct ipv6hdr *ipv6h = (struct ipv6hdr *)(skb->data + l2_hlen);
+
+ ipv6h->nexthdr = psph->nexthdr;
+ ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) - encap);
+ }
+
+ memmove(skb->data + PSP_ENCAP_HLEN, skb->data, l2_hlen + l3_hlen);
+ skb_pull(skb, PSP_ENCAP_HLEN);
+
+ if (strip_icv)
+ pskb_trim(skb, skb->len - PSP_TRL_SIZE);
+
+ return 0;
+}
+EXPORT_SYMBOL(psp_dev_rcv);
+
+static int __init psp_init(void)
+{
+ mutex_init(&psp_devs_lock);
+
+ return genl_register_family(&psp_nl_family);
+}
+
+subsys_initcall(psp_init);
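From a driver's perspective, the API exported above is consumed roughly as follows (a sketch: demo_* names and the priv plumbing are placeholders, error handling elided). All five callbacks must be populated or psp_dev_create() rejects the registration with a WARN:

	static int demo_probe(struct net_device *netdev, struct psp_dev_ops *demo_ops,
			      struct psp_dev_caps *demo_caps, void *priv)
	{
		struct psp_dev *psd;

		/* demo_ops needs set_config, key_rotate, rx_spi_alloc,
		 * tx_key_add and tx_key_del; demo_caps->versions non-zero
		 */
		psd = psp_dev_create(netdev, demo_ops, demo_caps, priv);
		if (IS_ERR(psd))
			return PTR_ERR(psd);

		/* stash psd in the driver private data ... */
		return 0;
	}

	static void demo_remove(struct psp_dev *psd)
	{
		psp_dev_unregister(psd);	/* flushes assocs, drops the last ref */
	}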
diff --git a/net/psp/psp_nl.c b/net/psp/psp_nl.c
new file mode 100644
index 000000000000..8aaca62744c3
--- /dev/null
+++ b/net/psp/psp_nl.c
@@ -0,0 +1,505 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/skbuff.h>
+#include <linux/xarray.h>
+#include <net/genetlink.h>
+#include <net/psp.h>
+#include <net/sock.h>
+
+#include "psp-nl-gen.h"
+#include "psp.h"
+
+/* Netlink helpers */
+
+static struct sk_buff *psp_nl_reply_new(struct genl_info *info)
+{
+ struct sk_buff *rsp;
+ void *hdr;
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return NULL;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr) {
+ nlmsg_free(rsp);
+ return NULL;
+ }
+
+ return rsp;
+}
+
+static int psp_nl_reply_send(struct sk_buff *rsp, struct genl_info *info)
+{
+ /* Note that this *only* works with a single message per skb! */
+ nlmsg_end(rsp, (struct nlmsghdr *)rsp->data);
+
+ return genlmsg_reply(rsp, info);
+}
+
+/* Device stuff */
+
+static struct psp_dev *
+psp_device_get_and_lock(struct net *net, struct nlattr *dev_id)
+{
+ struct psp_dev *psd;
+ int err;
+
+ mutex_lock(&psp_devs_lock);
+ psd = xa_load(&psp_devs, nla_get_u32(dev_id));
+ if (!psd) {
+ mutex_unlock(&psp_devs_lock);
+ return ERR_PTR(-ENODEV);
+ }
+
+ mutex_lock(&psd->lock);
+ mutex_unlock(&psp_devs_lock);
+
+ err = psp_dev_check_access(psd, net);
+ if (err) {
+ mutex_unlock(&psd->lock);
+ return ERR_PTR(err);
+ }
+
+ return psd;
+}
+
+int psp_device_get_locked(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ if (GENL_REQ_ATTR_CHECK(info, PSP_A_DEV_ID))
+ return -EINVAL;
+
+ info->user_ptr[0] = psp_device_get_and_lock(genl_info_net(info),
+ info->attrs[PSP_A_DEV_ID]);
+ return PTR_ERR_OR_ZERO(info->user_ptr[0]);
+}
+
+void
+psp_device_unlock(const struct genl_split_ops *ops, struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct socket *socket = info->user_ptr[1];
+ struct psp_dev *psd = info->user_ptr[0];
+
+ mutex_unlock(&psd->lock);
+ if (socket)
+ sockfd_put(socket);
+}
+
+static int
+psp_nl_dev_fill(struct psp_dev *psd, struct sk_buff *rsp,
+ const struct genl_info *info)
+{
+ void *hdr;
+
+ hdr = genlmsg_iput(rsp, info);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) ||
+ nla_put_u32(rsp, PSP_A_DEV_IFINDEX, psd->main_netdev->ifindex) ||
+ nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_CAP, psd->caps->versions) ||
+ nla_put_u32(rsp, PSP_A_DEV_PSP_VERSIONS_ENA, psd->config.versions))
+ goto err_cancel_msg;
+
+ genlmsg_end(rsp, hdr);
+ return 0;
+
+err_cancel_msg:
+ genlmsg_cancel(rsp, hdr);
+ return -EMSGSIZE;
+}
+
+void psp_nl_notify_dev(struct psp_dev *psd, u32 cmd)
+{
+ struct genl_info info;
+ struct sk_buff *ntf;
+
+ if (!genl_has_listeners(&psp_nl_family, dev_net(psd->main_netdev),
+ PSP_NLGRP_MGMT))
+ return;
+
+ ntf = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!ntf)
+ return;
+
+ genl_info_init_ntf(&info, &psp_nl_family, cmd);
+ if (psp_nl_dev_fill(psd, ntf, &info)) {
+ nlmsg_free(ntf);
+ return;
+ }
+
+ genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf,
+ 0, PSP_NLGRP_MGMT, GFP_KERNEL);
+}
+
+int psp_nl_dev_get_doit(struct sk_buff *req, struct genl_info *info)
+{
+ struct psp_dev *psd = info->user_ptr[0];
+ struct sk_buff *rsp;
+ int err;
+
+ rsp = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!rsp)
+ return -ENOMEM;
+
+ err = psp_nl_dev_fill(psd, rsp, info);
+ if (err)
+ goto err_free_msg;
+
+ return genlmsg_reply(rsp, info);
+
+err_free_msg:
+ nlmsg_free(rsp);
+ return err;
+}
+
+static int
+psp_nl_dev_get_dumpit_one(struct sk_buff *rsp, struct netlink_callback *cb,
+ struct psp_dev *psd)
+{
+ if (psp_dev_check_access(psd, sock_net(rsp->sk)))
+ return 0;
+
+ return psp_nl_dev_fill(psd, rsp, genl_info_dump(cb));
+}
+
+int psp_nl_dev_get_dumpit(struct sk_buff *rsp, struct netlink_callback *cb)
+{
+ struct psp_dev *psd;
+ int err = 0;
+
+ mutex_lock(&psp_devs_lock);
+ xa_for_each_start(&psp_devs, cb->args[0], psd, cb->args[0]) {
+ mutex_lock(&psd->lock);
+ err = psp_nl_dev_get_dumpit_one(rsp, cb, psd);
+ mutex_unlock(&psd->lock);
+ if (err)
+ break;
+ }
+ mutex_unlock(&psp_devs_lock);
+
+ return err;
+}
+
+int psp_nl_dev_set_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct psp_dev *psd = info->user_ptr[0];
+ struct psp_dev_config new_config;
+ struct sk_buff *rsp;
+ int err;
+
+ memcpy(&new_config, &psd->config, sizeof(new_config));
+
+ if (info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]) {
+ new_config.versions =
+ nla_get_u32(info->attrs[PSP_A_DEV_PSP_VERSIONS_ENA]);
+ if (new_config.versions & ~psd->caps->versions) {
+ NL_SET_ERR_MSG(info->extack, "Requested PSP versions not supported by the device");
+ return -EINVAL;
+ }
+ } else {
+ NL_SET_ERR_MSG(info->extack, "No settings present");
+ return -EINVAL;
+ }
+
+ rsp = psp_nl_reply_new(info);
+ if (!rsp)
+ return -ENOMEM;
+
+ if (memcmp(&new_config, &psd->config, sizeof(new_config))) {
+ err = psd->ops->set_config(psd, &new_config, info->extack);
+ if (err)
+ goto err_free_rsp;
+
+ memcpy(&psd->config, &new_config, sizeof(new_config));
+ }
+
+ psp_nl_notify_dev(psd, PSP_CMD_DEV_CHANGE_NTF);
+
+ return psp_nl_reply_send(rsp, info);
+
+err_free_rsp:
+ nlmsg_free(rsp);
+ return err;
+}
+
+int psp_nl_key_rotate_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct psp_dev *psd = info->user_ptr[0];
+ struct genl_info ntf_info;
+ struct sk_buff *ntf, *rsp;
+ u8 prev_gen;
+ int err;
+
+ rsp = psp_nl_reply_new(info);
+ if (!rsp)
+ return -ENOMEM;
+
+ genl_info_init_ntf(&ntf_info, &psp_nl_family, PSP_CMD_KEY_ROTATE_NTF);
+ ntf = psp_nl_reply_new(&ntf_info);
+ if (!ntf) {
+ err = -ENOMEM;
+ goto err_free_rsp;
+ }
+
+ if (nla_put_u32(rsp, PSP_A_DEV_ID, psd->id) ||
+ nla_put_u32(ntf, PSP_A_DEV_ID, psd->id)) {
+ err = -EMSGSIZE;
+ goto err_free_ntf;
+ }
+
+ /* suggest the next gen number, driver can override */
+ prev_gen = psd->generation;
+ psd->generation = (prev_gen + 1) & PSP_GEN_VALID_MASK;
+
+ err = psd->ops->key_rotate(psd, info->extack);
+ if (err)
+ goto err_free_ntf;
+
+ WARN_ON_ONCE((psd->generation && psd->generation == prev_gen) ||
+ psd->generation & ~PSP_GEN_VALID_MASK);
+
+ psp_assocs_key_rotated(psd);
+
+ nlmsg_end(ntf, (struct nlmsghdr *)ntf->data);
+ genlmsg_multicast_netns(&psp_nl_family, dev_net(psd->main_netdev), ntf,
+ 0, PSP_NLGRP_USE, GFP_KERNEL);
+ return psp_nl_reply_send(rsp, info);
+
+err_free_ntf:
+ nlmsg_free(ntf);
+err_free_rsp:
+ nlmsg_free(rsp);
+ return err;
+}
+
+/* Key etc. */
+
+int psp_assoc_device_get_locked(const struct genl_split_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ struct socket *socket;
+ struct psp_dev *psd;
+ struct nlattr *id;
+ int fd, err;
+
+ if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_SOCK_FD))
+ return -EINVAL;
+
+ fd = nla_get_u32(info->attrs[PSP_A_ASSOC_SOCK_FD]);
+ socket = sockfd_lookup(fd, &err);
+ if (!socket)
+ return err;
+
+ if (!sk_is_tcp(socket->sk)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ info->attrs[PSP_A_ASSOC_SOCK_FD],
+ "Unsupported socket family and type");
+ err = -EOPNOTSUPP;
+ goto err_sock_put;
+ }
+
+ psd = psp_dev_get_for_sock(socket->sk);
+ if (psd) {
+ err = psp_dev_check_access(psd, genl_info_net(info));
+ if (err) {
+ psp_dev_put(psd);
+ psd = NULL;
+ }
+ }
+
+ if (!psd && GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_DEV_ID)) {
+ err = -EINVAL;
+ goto err_sock_put;
+ }
+
+ id = info->attrs[PSP_A_ASSOC_DEV_ID];
+ if (psd) {
+ mutex_lock(&psd->lock);
+ if (id && psd->id != nla_get_u32(id)) {
+ mutex_unlock(&psd->lock);
+ NL_SET_ERR_MSG_ATTR(info->extack, id,
+ "Device id vs socket mismatch");
+ err = -EINVAL;
+ goto err_psd_put;
+ }
+
+ psp_dev_put(psd);
+ } else {
+ psd = psp_device_get_and_lock(genl_info_net(info), id);
+ if (IS_ERR(psd)) {
+ err = PTR_ERR(psd);
+ goto err_sock_put;
+ }
+ }
+
+ info->user_ptr[0] = psd;
+ info->user_ptr[1] = socket;
+
+ return 0;
+
+err_psd_put:
+ psp_dev_put(psd);
+err_sock_put:
+ sockfd_put(socket);
+ return err;
+}
+
+static int
+psp_nl_parse_key(struct genl_info *info, u32 attr, struct psp_key_parsed *key,
+ unsigned int key_sz)
+{
+ struct nlattr *nest = info->attrs[attr];
+ struct nlattr *tb[PSP_A_KEYS_SPI + 1];
+ u32 spi;
+ int err;
+
+ err = nla_parse_nested(tb, ARRAY_SIZE(tb) - 1, nest,
+ psp_keys_nl_policy, info->extack);
+ if (err)
+ return err;
+
+ if (NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_KEY) ||
+ NL_REQ_ATTR_CHECK(info->extack, nest, tb, PSP_A_KEYS_SPI))
+ return -EINVAL;
+
+ if (nla_len(tb[PSP_A_KEYS_KEY]) != key_sz) {
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY],
+ "incorrect key length");
+ return -EINVAL;
+ }
+
+ spi = nla_get_u32(tb[PSP_A_KEYS_SPI]);
+ if (!(spi & PSP_SPI_KEY_ID)) {
+ NL_SET_ERR_MSG_ATTR(info->extack, tb[PSP_A_KEYS_KEY],
+ "invalid SPI: lower 31b must be non-zero");
+ return -EINVAL;
+ }
+
+ key->spi = cpu_to_be32(spi);
+ memcpy(key->key, nla_data(tb[PSP_A_KEYS_KEY]), key_sz);
+
+ return 0;
+}
+
+static int
+psp_nl_put_key(struct sk_buff *skb, u32 attr, u32 version,
+ struct psp_key_parsed *key)
+{
+ int key_sz = psp_key_size(version);
+ void *nest;
+
+ nest = nla_nest_start(skb, attr);
+
+ if (nla_put_u32(skb, PSP_A_KEYS_SPI, be32_to_cpu(key->spi)) ||
+ nla_put(skb, PSP_A_KEYS_KEY, key_sz, key->key)) {
+ nla_nest_cancel(skb, nest);
+ return -EMSGSIZE;
+ }
+
+ nla_nest_end(skb, nest);
+
+ return 0;
+}
+
+int psp_nl_rx_assoc_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct socket *socket = info->user_ptr[1];
+ struct psp_dev *psd = info->user_ptr[0];
+ struct psp_key_parsed key;
+ struct psp_assoc *pas;
+ struct sk_buff *rsp;
+ u32 version;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION))
+ return -EINVAL;
+
+ version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]);
+ if (!(psd->caps->versions & (1 << version))) {
+ NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]);
+ return -EOPNOTSUPP;
+ }
+
+ rsp = psp_nl_reply_new(info);
+ if (!rsp)
+ return -ENOMEM;
+
+ pas = psp_assoc_create(psd);
+ if (!pas) {
+ err = -ENOMEM;
+ goto err_free_rsp;
+ }
+ pas->version = version;
+
+ err = psd->ops->rx_spi_alloc(psd, version, &key, info->extack);
+ if (err)
+ goto err_free_pas;
+
+ if (nla_put_u32(rsp, PSP_A_ASSOC_DEV_ID, psd->id) ||
+ psp_nl_put_key(rsp, PSP_A_ASSOC_RX_KEY, version, &key)) {
+ err = -EMSGSIZE;
+ goto err_free_pas;
+ }
+
+ err = psp_sock_assoc_set_rx(socket->sk, pas, &key, info->extack);
+ if (err) {
+ NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_SOCK_FD]);
+ goto err_free_pas;
+ }
+ psp_assoc_put(pas);
+
+ return psp_nl_reply_send(rsp, info);
+
+err_free_pas:
+ psp_assoc_put(pas);
+err_free_rsp:
+ nlmsg_free(rsp);
+ return err;
+}
+
+int psp_nl_tx_assoc_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct socket *socket = info->user_ptr[1];
+ struct psp_dev *psd = info->user_ptr[0];
+ struct psp_key_parsed key;
+ struct sk_buff *rsp;
+ unsigned int key_sz;
+ u32 version;
+ int err;
+
+ if (GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_VERSION) ||
+ GENL_REQ_ATTR_CHECK(info, PSP_A_ASSOC_TX_KEY))
+ return -EINVAL;
+
+ version = nla_get_u32(info->attrs[PSP_A_ASSOC_VERSION]);
+ if (!(psd->caps->versions & (1 << version))) {
+ NL_SET_BAD_ATTR(info->extack, info->attrs[PSP_A_ASSOC_VERSION]);
+ return -EOPNOTSUPP;
+ }
+
+ key_sz = psp_key_size(version);
+ if (!key_sz)
+ return -EINVAL;
+
+ err = psp_nl_parse_key(info, PSP_A_ASSOC_TX_KEY, &key, key_sz);
+ if (err < 0)
+ return err;
+
+ rsp = psp_nl_reply_new(info);
+ if (!rsp)
+ return -ENOMEM;
+
+ err = psp_sock_assoc_set_tx(socket->sk, psd, version, &key,
+ info->extack);
+ if (err)
+ goto err_free_msg;
+
+ return psp_nl_reply_send(rsp, info);
+
+err_free_msg:
+ nlmsg_free(rsp);
+ return err;
+}
diff --git a/net/psp/psp_sock.c b/net/psp/psp_sock.c
new file mode 100644
index 000000000000..5324a7603bed
--- /dev/null
+++ b/net/psp/psp_sock.c
@@ -0,0 +1,292 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/file.h>
+#include <linux/net.h>
+#include <linux/rcupdate.h>
+#include <linux/tcp.h>
+
+#include <net/ip.h>
+#include <net/psp.h>
+#include "psp.h"
+
+struct psp_dev *psp_dev_get_for_sock(struct sock *sk)
+{
+ struct psp_dev *psd = NULL;
+ struct dst_entry *dst;
+
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ if (dst) {
+ psd = rcu_dereference(dst_dev_rcu(dst)->psp_dev);
+ if (psd && !psp_dev_tryget(psd))
+ psd = NULL;
+ }
+ rcu_read_unlock();
+
+ return psd;
+}
+
+static struct sk_buff *
+psp_validate_xmit(struct sock *sk, struct net_device *dev, struct sk_buff *skb)
+{
+ struct psp_assoc *pas;
+ bool good;
+
+ rcu_read_lock();
+ pas = psp_skb_get_assoc_rcu(skb);
+ good = !pas || rcu_access_pointer(dev->psp_dev) == pas->psd;
+ rcu_read_unlock();
+ if (!good) {
+ sk_skb_reason_drop(sk, skb, SKB_DROP_REASON_PSP_OUTPUT);
+ return NULL;
+ }
+
+ return skb;
+}
+
+struct psp_assoc *psp_assoc_create(struct psp_dev *psd)
+{
+ struct psp_assoc *pas;
+
+ lockdep_assert_held(&psd->lock);
+
+ pas = kzalloc(struct_size(pas, drv_data, psd->caps->assoc_drv_spc),
+ GFP_KERNEL_ACCOUNT);
+ if (!pas)
+ return NULL;
+
+ pas->psd = psd;
+ pas->dev_id = psd->id;
+ pas->generation = psd->generation;
+ psp_dev_get(psd);
+ refcount_set(&pas->refcnt, 1);
+
+ list_add_tail(&pas->assocs_list, &psd->active_assocs);
+
+ return pas;
+}
+
+static struct psp_assoc *psp_assoc_dummy(struct psp_assoc *pas)
+{
+ struct psp_dev *psd = pas->psd;
+ size_t sz;
+
+ lockdep_assert_held(&psd->lock);
+
+ sz = struct_size(pas, drv_data, psd->caps->assoc_drv_spc);
+ return kmemdup(pas, sz, GFP_KERNEL);
+}
+
+static int psp_dev_tx_key_add(struct psp_dev *psd, struct psp_assoc *pas,
+ struct netlink_ext_ack *extack)
+{
+ return psd->ops->tx_key_add(psd, pas, extack);
+}
+
+void psp_dev_tx_key_del(struct psp_dev *psd, struct psp_assoc *pas)
+{
+ if (pas->tx.spi)
+ psd->ops->tx_key_del(psd, pas);
+ list_del(&pas->assocs_list);
+}
+
+static void psp_assoc_free(struct work_struct *work)
+{
+ struct psp_assoc *pas = container_of(work, struct psp_assoc, work);
+ struct psp_dev *psd = pas->psd;
+
+ mutex_lock(&psd->lock);
+ if (psd->ops)
+ psp_dev_tx_key_del(psd, pas);
+ mutex_unlock(&psd->lock);
+ psp_dev_put(psd);
+ kfree(pas);
+}
+
+static void psp_assoc_free_queue(struct rcu_head *head)
+{
+ struct psp_assoc *pas = container_of(head, struct psp_assoc, rcu);
+
+ INIT_WORK(&pas->work, psp_assoc_free);
+ schedule_work(&pas->work);
+}
+
+/**
+ * psp_assoc_put() - release a reference on a PSP association
+ * @pas: association to release
+ */
+void psp_assoc_put(struct psp_assoc *pas)
+{
+ if (pas && refcount_dec_and_test(&pas->refcnt))
+ call_rcu(&pas->rcu, psp_assoc_free_queue);
+}
+
+void psp_sk_assoc_free(struct sock *sk)
+{
+ struct psp_assoc *pas = rcu_dereference_protected(sk->psp_assoc, 1);
+
+ rcu_assign_pointer(sk->psp_assoc, NULL);
+ psp_assoc_put(pas);
+}
+
+int psp_sock_assoc_set_rx(struct sock *sk, struct psp_assoc *pas,
+ struct psp_key_parsed *key,
+ struct netlink_ext_ack *extack)
+{
+ int err;
+
+ memcpy(&pas->rx, key, sizeof(*key));
+
+ lock_sock(sk);
+
+ if (psp_sk_assoc(sk)) {
+ NL_SET_ERR_MSG(extack, "Socket already has PSP state");
+ err = -EBUSY;
+ goto exit_unlock;
+ }
+
+ refcount_inc(&pas->refcnt);
+ rcu_assign_pointer(sk->psp_assoc, pas);
+ err = 0;
+
+exit_unlock:
+ release_sock(sk);
+
+ return err;
+}
+
+static int psp_sock_recv_queue_check(struct sock *sk, struct psp_assoc *pas)
+{
+ struct psp_skb_ext *pse;
+ struct sk_buff *skb;
+
+ skb_rbtree_walk(skb, &tcp_sk(sk)->out_of_order_queue) {
+ pse = skb_ext_find(skb, SKB_EXT_PSP);
+ if (!psp_pse_matches_pas(pse, pas))
+ return -EBUSY;
+ }
+
+ skb_queue_walk(&sk->sk_receive_queue, skb) {
+ pse = skb_ext_find(skb, SKB_EXT_PSP);
+ if (!psp_pse_matches_pas(pse, pas))
+ return -EBUSY;
+ }
+ return 0;
+}
+
+int psp_sock_assoc_set_tx(struct sock *sk, struct psp_dev *psd,
+ u32 version, struct psp_key_parsed *key,
+ struct netlink_ext_ack *extack)
+{
+ struct inet_connection_sock *icsk;
+ struct psp_assoc *pas, *dummy;
+ int err;
+
+ lock_sock(sk);
+
+ pas = psp_sk_assoc(sk);
+ if (!pas) {
+ NL_SET_ERR_MSG(extack, "Socket has no Rx key");
+ err = -EINVAL;
+ goto exit_unlock;
+ }
+ if (pas->psd != psd) {
+ NL_SET_ERR_MSG(extack, "Rx key from different device");
+ err = -EINVAL;
+ goto exit_unlock;
+ }
+ if (pas->version != version) {
+ NL_SET_ERR_MSG(extack,
+ "PSP version mismatch with existing state");
+ err = -EINVAL;
+ goto exit_unlock;
+ }
+ if (pas->tx.spi) {
+ NL_SET_ERR_MSG(extack, "Tx key already set");
+ err = -EBUSY;
+ goto exit_unlock;
+ }
+
+ err = psp_sock_recv_queue_check(sk, pas);
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Socket has incompatible segments already in the recv queue");
+ goto exit_unlock;
+ }
+
+ /* Pass a fake association to drivers to make sure they don't
+ * try to store pointers to it. For re-keying we'll need to
+ * re-allocate the assoc structures.
+ */
+ dummy = psp_assoc_dummy(pas);
+ if (!dummy) {
+ err = -ENOMEM;
+ goto exit_unlock;
+ }
+
+ memcpy(&dummy->tx, key, sizeof(*key));
+ err = psp_dev_tx_key_add(psd, dummy, extack);
+ if (err)
+ goto exit_free_dummy;
+
+ memcpy(pas->drv_data, dummy->drv_data, psd->caps->assoc_drv_spc);
+ memcpy(&pas->tx, key, sizeof(*key));
+
+ WRITE_ONCE(sk->sk_validate_xmit_skb, psp_validate_xmit);
+ tcp_write_collapse_fence(sk);
+ pas->upgrade_seq = tcp_sk(sk)->rcv_nxt;
+
+ icsk = inet_csk(sk);
+ icsk->icsk_ext_hdr_len += psp_sk_overhead(sk);
+ icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
+
+exit_free_dummy:
+ kfree(dummy);
+exit_unlock:
+ release_sock(sk);
+ return err;
+}
+
+void psp_assocs_key_rotated(struct psp_dev *psd)
+{
+ struct psp_assoc *pas, *next;
+
+ /* Mark the stale associations as invalid; they will no longer
+ * be able to Rx any traffic.
+ */
+ list_for_each_entry_safe(pas, next, &psd->prev_assocs, assocs_list)
+ pas->generation |= ~PSP_GEN_VALID_MASK;
+ list_splice_init(&psd->prev_assocs, &psd->stale_assocs);
+ list_splice_init(&psd->active_assocs, &psd->prev_assocs);
+
+ /* TODO: we should inform the sockets that got shut down */
+}
+
+void psp_twsk_init(struct inet_timewait_sock *tw, const struct sock *sk)
+{
+ struct psp_assoc *pas = psp_sk_assoc(sk);
+
+ if (pas)
+ refcount_inc(&pas->refcnt);
+ rcu_assign_pointer(tw->psp_assoc, pas);
+ tw->tw_validate_xmit_skb = psp_validate_xmit;
+}
+
+void psp_twsk_assoc_free(struct inet_timewait_sock *tw)
+{
+ struct psp_assoc *pas = rcu_dereference_protected(tw->psp_assoc, 1);
+
+ rcu_assign_pointer(tw->psp_assoc, NULL);
+ psp_assoc_put(pas);
+}
+
+void psp_reply_set_decrypted(struct sk_buff *skb)
+{
+ struct psp_assoc *pas;
+
+ rcu_read_lock();
+ pas = psp_sk_get_assoc_rcu(skb->sk);
+ if (pas && pas->tx.spi)
+ skb->decrypted = 1;
+ rcu_read_unlock();
+}
+EXPORT_IPV6_MOD_GPL(psp_reply_set_decrypted);
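The put path above chains an RCU grace period into a workqueue item because the final teardown must take psd->lock (a mutex), and call_rcu callbacks run in a context that cannot sleep. The generic shape of the trick, with illustrative types:

	#include <linux/mutex.h>
	#include <linux/rcupdate.h>
	#include <linux/slab.h>
	#include <linux/workqueue.h>

	struct demo_obj {
		struct rcu_head rcu;
		struct work_struct work;
		struct mutex *owner_lock;
	};

	static void demo_free_work(struct work_struct *work)
	{
		struct demo_obj *obj = container_of(work, struct demo_obj, work);

		mutex_lock(obj->owner_lock);	/* sleeping is fine in a worker */
		/* ... undo driver state ... */
		mutex_unlock(obj->owner_lock);
		kfree(obj);
	}

	static void demo_free_queue(struct rcu_head *head)
	{
		struct demo_obj *obj = container_of(head, struct demo_obj, rcu);

		/* softirq context here: defer the sleeping part to a worker */
		INIT_WORK(&obj->work, demo_free_work);
		schedule_work(&obj->work);
	}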
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 086a13170e09..4a7217fbeab6 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -242,7 +242,7 @@ static __poll_t rds_poll(struct file *file, struct socket *sock,
if (rs->rs_snd_bytes < rds_sk_sndbuf(rs))
mask |= (EPOLLOUT | EPOLLWRNORM);
if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
- mask |= POLLERR;
+ mask |= EPOLLERR;
read_unlock_irqrestore(&rs->rs_recv_lock, flags);
/* clear state any time we wake a seen-congested socket */
diff --git a/net/rds/connection.c b/net/rds/connection.c
index d62f486ab29f..68bc88cce84e 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -57,16 +57,17 @@ static struct hlist_head *rds_conn_bucket(const struct in6_addr *laddr,
static u32 rds6_hash_secret __read_mostly;
static u32 rds_hash_secret __read_mostly;
- u32 lhash, fhash, hash;
+ __be32 lhash, fhash;
+ u32 hash;
net_get_random_once(&rds_hash_secret, sizeof(rds_hash_secret));
net_get_random_once(&rds6_hash_secret, sizeof(rds6_hash_secret));
- lhash = (__force u32)laddr->s6_addr32[3];
+ lhash = laddr->s6_addr32[3];
#if IS_ENABLED(CONFIG_IPV6)
- fhash = __ipv6_addr_jhash(faddr, rds6_hash_secret);
+ fhash = (__force __be32)__ipv6_addr_jhash(faddr, rds6_hash_secret);
#else
- fhash = (__force u32)faddr->s6_addr32[3];
+ fhash = faddr->s6_addr32[3];
#endif
hash = __inet_ehashfn(lhash, 0, fhash, 0, rds_hash_secret);
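The retyping above is sparse hygiene: a __be32 must either be converted or explicitly __force-cast before being treated as a host-order u32. A minimal illustration:

	#include <linux/types.h>
	#include <asm/byteorder.h>

	static u32 demo_use_be32(__be32 addr)
	{
		u32 host = ntohl(addr);		/* real byte-order conversion */
		u32 raw  = (__force u32)addr;	/* deliberate bit reinterpretation,
						 * e.g. feeding raw words into a
						 * hash; sparse stays quiet only
						 * with the explicit cast
						 */
		return host ^ raw;
	}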
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
index ea5e9aee4959..5884de8c6f45 100644
--- a/net/rds/ib_mr.h
+++ b/net/rds/ib_mr.h
@@ -108,7 +108,6 @@ struct rds_ib_mr_pool {
};
extern struct workqueue_struct *rds_ib_mr_wq;
-extern bool prefer_frmr;
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
int npages);
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index d1cfceeff133..6585164c7059 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -672,7 +672,8 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
int rds_ib_mr_init(void)
{
- rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd", WQ_MEM_RECLAIM, 0);
+ rds_ib_mr_wq = alloc_workqueue("rds_mr_flushd",
+ WQ_MEM_RECLAIM | WQ_PERCPU, 0);
if (!rds_ib_mr_wq)
return -ENOMEM;
return 0;
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index e53b7f266bd7..4248dfa816eb 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -1034,7 +1034,7 @@ void rds_ib_recv_cqe_handler(struct rds_ib_connection *ic,
rds_ib_stats_inc(s_ib_rx_ring_empty);
if (rds_ib_ring_low(&ic->i_recv_ring)) {
- rds_ib_recv_refill(conn, 0, GFP_NOWAIT | __GFP_NOWARN);
+ rds_ib_recv_refill(conn, 0, GFP_NOWAIT);
rds_ib_stats_inc(s_ib_rx_refill_from_cq);
}
}
diff --git a/net/rds/message.c b/net/rds/message.c
index 7af59d2443e5..199a899a43e9 100644
--- a/net/rds/message.c
+++ b/net/rds/message.c
@@ -44,8 +44,8 @@ static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = {
[RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version),
[RDS_EXTHDR_RDMA] = sizeof(struct rds_ext_header_rdma),
[RDS_EXTHDR_RDMA_DEST] = sizeof(struct rds_ext_header_rdma_dest),
-[RDS_EXTHDR_NPATHS] = sizeof(u16),
-[RDS_EXTHDR_GEN_NUM] = sizeof(u32),
+[RDS_EXTHDR_NPATHS] = sizeof(__be16),
+[RDS_EXTHDR_GEN_NUM] = sizeof(__be32),
};
void rds_message_addref(struct rds_message *rm)
diff --git a/net/rds/rds.h b/net/rds/rds.h
index dc360252c515..5b1c072e2e7f 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -93,7 +93,7 @@ enum {
/* Max number of multipaths per RDS connection. Must be a power of 2 */
#define RDS_MPATH_WORKERS 8
-#define RDS_MPATH_HASH(rs, n) (jhash_1word((rs)->rs_bound_port, \
+#define RDS_MPATH_HASH(rs, n) (jhash_1word(ntohs((rs)->rs_bound_port), \
(rs)->rs_hash_initval) & ((n) - 1))
#define IS_CANONICAL(laddr, faddr) (htonl(laddr) < htonl(faddr))
diff --git a/net/rds/recv.c b/net/rds/recv.c
index 5627f80013f8..66205d6924bf 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -202,8 +202,8 @@ static void rds_recv_hs_exthdrs(struct rds_header *hdr,
unsigned int pos = 0, type, len;
union {
struct rds_ext_header_version version;
- u16 rds_npaths;
- u32 rds_gen_num;
+ __be16 rds_npaths;
+ __be32 rds_gen_num;
} buffer;
u32 new_peer_gen_num = 0;
diff --git a/net/rds/send.c b/net/rds/send.c
index 42d991bc8543..0b3d0ef2f008 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1454,8 +1454,8 @@ rds_send_probe(struct rds_conn_path *cp, __be16 sport,
if (RDS_HS_PROBE(be16_to_cpu(sport), be16_to_cpu(dport)) &&
cp->cp_conn->c_trans->t_mp_capable) {
- u16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
- u32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
+ __be16 npaths = cpu_to_be16(RDS_MPATH_WORKERS);
+ __be32 my_gen_num = cpu_to_be32(cp->cp_conn->c_my_gen_num);
rds_message_add_extension(&rm->m_inc.i_hdr,
RDS_EXTHDR_NPATHS, &npaths,
diff --git a/net/rfkill/input.c b/net/rfkill/input.c
index 598d0a61bda7..53d286b10843 100644
--- a/net/rfkill/input.c
+++ b/net/rfkill/input.c
@@ -159,7 +159,7 @@ static void rfkill_schedule_global_op(enum rfkill_sched_op op)
rfkill_op_pending = true;
if (op == RFKILL_GLOBAL_OP_EPO && !rfkill_is_epo_lock_active()) {
/* bypass the limiter for EPO */
- mod_delayed_work(system_wq, &rfkill_op_work, 0);
+ mod_delayed_work(system_percpu_wq, &rfkill_op_work, 0);
rfkill_last_scheduled = jiffies;
} else
rfkill_schedule_ratelimited();
diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c
index 0377301156b0..2ea71e3831f7 100644
--- a/net/rxrpc/rxperf.c
+++ b/net/rxrpc/rxperf.c
@@ -630,7 +630,7 @@ static int __init rxperf_init(void)
pr_info("Server registering\n");
- rxperf_workqueue = alloc_workqueue("rxperf", 0, 0);
+ rxperf_workqueue = alloc_workqueue("rxperf", WQ_PERCPU, 0);
if (!rxperf_workqueue)
goto error_workqueue;
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 9e468e463467..ff6be5cfe2b0 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -1585,7 +1585,7 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets,
}
_bstats_update(&a->tcfa_bstats, bytes, packets);
- a->tcfa_qstats.drops += drops;
+ atomic_add(drops, &a->tcfa_drops);
if (hw)
_bstats_update(&a->tcfa_bstats_hw, bytes, packets);
}
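Switching the drop counter to an atomic_t lets the datapath update it without taking the action's stats lock; the value is folded back into a gnet_stats_queue only at dump time, as the next hunk shows. In outline (illustrative types):

	#include <linux/atomic.h>

	struct demo_act { atomic_t drops; };

	/* datapath: lockless, any context */
	static void demo_account_drops(struct demo_act *a, u32 drops)
	{
		atomic_add(drops, &a->drops);
	}

	/* dump path: snapshot into the uAPI-facing struct */
	static u32 demo_read_drops(struct demo_act *a)
	{
		return atomic_read(&a->drops);
	}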
@@ -1594,8 +1594,9 @@ EXPORT_SYMBOL(tcf_action_update_stats);
int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
int compat_mode)
{
- int err = 0;
+ struct gnet_stats_queue qstats = {0};
struct gnet_dump d;
+ int err = 0;
if (p == NULL)
goto errout;
@@ -1619,14 +1620,17 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
if (err < 0)
goto errout;
+ qstats.drops = atomic_read(&p->tcfa_drops);
+ qstats.overlimits = atomic_read(&p->tcfa_overlimits);
+
if (gnet_stats_copy_basic(&d, p->cpu_bstats,
&p->tcfa_bstats, false) < 0 ||
gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw,
&p->tcfa_bstats_hw, false) < 0 ||
gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
gnet_stats_copy_queue(&d, p->cpu_qstats,
- &p->tcfa_qstats,
- p->tcfa_qstats.qlen) < 0)
+ &qstats,
+ qstats.qlen) < 0)
goto errout;
if (gnet_stats_finish_copy(&d) < 0)
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index f3abe0545989..8e69a919b4fe 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -72,7 +72,6 @@ static int reset_policy(struct tc_action *a, const struct nlattr *defdata,
d = to_defact(a);
spin_lock_bh(&d->tcf_lock);
goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch);
- memset(d->tcfd_defdata, 0, SIMP_MAX_DATA);
nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA);
spin_unlock_bh(&d->tcf_lock);
if (goto_ch)
diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c
index dc0229693461..a9e0c1326e2a 100644
--- a/net/sched/act_skbmod.c
+++ b/net/sched/act_skbmod.c
@@ -27,19 +27,18 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb,
struct tcf_result *res)
{
struct tcf_skbmod *d = to_skbmod(a);
- int action, max_edit_len, err;
struct tcf_skbmod_params *p;
+ int max_edit_len, err;
u64 flags;
tcf_lastuse_update(&d->tcf_tm);
bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb);
- action = READ_ONCE(d->tcf_action);
- if (unlikely(action == TC_ACT_SHOT))
+ p = rcu_dereference_bh(d->skbmod_p);
+ if (unlikely(p->action == TC_ACT_SHOT))
goto drop;
max_edit_len = skb_mac_header_len(skb);
- p = rcu_dereference_bh(d->skbmod_p);
flags = p->flags;
/* tcf_skbmod_init() guarantees "flags" to be one of the following:
@@ -85,7 +84,7 @@ TC_INDIRECT_SCOPE int tcf_skbmod_act(struct sk_buff *skb,
INET_ECN_set_ce(skb);
out:
- return action;
+ return p->action;
drop:
qstats_overlimit_inc(this_cpu_ptr(d->common.cpu_qstats));
@@ -193,7 +192,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla,
}
p->flags = lflags;
-
+ p->action = parm->action;
if (ovr)
spin_lock_bh(&d->tcf_lock);
/* Protected by tcf_lock if overwriting existing action. */
@@ -248,10 +247,9 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
opt.index = d->tcf_index;
opt.refcnt = refcount_read(&d->tcf_refcnt) - ref;
opt.bindcnt = atomic_read(&d->tcf_bindcnt) - bind;
- spin_lock_bh(&d->tcf_lock);
- opt.action = d->tcf_action;
- p = rcu_dereference_protected(d->skbmod_p,
- lockdep_is_held(&d->tcf_lock));
+ rcu_read_lock();
+ p = rcu_dereference(d->skbmod_p);
+ opt.action = p->action;
opt.flags = p->flags;
if (nla_put(skb, TCA_SKBMOD_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -269,10 +267,10 @@ static int tcf_skbmod_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_64bit(skb, TCA_SKBMOD_TM, sizeof(t), &t, TCA_SKBMOD_PAD))
goto nla_put_failure;
- spin_unlock_bh(&d->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&d->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
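The same conversion recurs in tunnel_key and vlan below: ->action moves into the RCU-managed params struct so that both the datapath and the dump path become plain RCU readers instead of spinlock takers. The generic shape, with illustrative types:

	#include <linux/rcupdate.h>

	struct demo_params {
		int action;
		u64 flags;
		struct rcu_head rcu;
	};

	struct demo_act {
		struct demo_params __rcu *params;
	};

	static int demo_read_action(struct demo_act *a)
	{
		struct demo_params *p;
		int action;

		rcu_read_lock();
		p = rcu_dereference(a->params);
		action = p->action;	/* was: a->tcf_action under tcf_lock */
		rcu_read_unlock();

		return action;
	}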
diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c
index 2cef4b08befb..876b30c5709e 100644
--- a/net/sched/act_tunnel_key.c
+++ b/net/sched/act_tunnel_key.c
@@ -29,13 +29,11 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb,
{
struct tcf_tunnel_key *t = to_tunnel_key(a);
struct tcf_tunnel_key_params *params;
- int action;
params = rcu_dereference_bh(t->params);
tcf_lastuse_update(&t->tcf_tm);
tcf_action_update_bstats(&t->common, skb);
- action = READ_ONCE(t->tcf_action);
switch (params->tcft_action) {
case TCA_TUNNEL_KEY_ACT_RELEASE:
@@ -51,7 +49,7 @@ TC_INDIRECT_SCOPE int tunnel_key_act(struct sk_buff *skb,
break;
}
- return action;
+ return params->action;
}
static const struct nla_policy
@@ -532,6 +530,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla,
params_new->tcft_action = parm->t_action;
params_new->tcft_enc_metadata = metadata;
+ params_new->action = parm->action;
spin_lock_bh(&t->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
params_new = rcu_replace_pointer(t->params, params_new,
@@ -726,10 +725,9 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t tm;
- spin_lock_bh(&t->tcf_lock);
- params = rcu_dereference_protected(t->params,
- lockdep_is_held(&t->tcf_lock));
- opt.action = t->tcf_action;
+ rcu_read_lock();
+ params = rcu_dereference(t->params);
+ opt.action = params->action;
opt.t_action = params->tcft_action;
if (nla_put(skb, TCA_TUNNEL_KEY_PARMS, sizeof(opt), &opt))
@@ -766,12 +764,12 @@ static int tunnel_key_dump(struct sk_buff *skb, struct tc_action *a,
if (nla_put_64bit(skb, TCA_TUNNEL_KEY_TM, sizeof(tm),
&tm, TCA_TUNNEL_KEY_PAD))
goto nla_put_failure;
- spin_unlock_bh(&t->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&t->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 383bf18b6862..a74621797d69 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -25,7 +25,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb,
{
struct tcf_vlan *v = to_vlan(a);
struct tcf_vlan_params *p;
- int action;
int err;
u16 tci;
@@ -38,8 +37,6 @@ TC_INDIRECT_SCOPE int tcf_vlan_act(struct sk_buff *skb,
if (skb_at_tc_ingress(skb))
skb_push_rcsum(skb, skb->mac_len);
- action = READ_ONCE(v->tcf_action);
-
p = rcu_dereference_bh(v->vlan_p);
switch (p->tcfv_action) {
@@ -97,7 +94,7 @@ out:
skb_pull_rcsum(skb, skb->mac_len);
skb_reset_mac_len(skb);
- return action;
+ return p->action;
drop:
tcf_action_inc_drop_qstats(&v->common);
@@ -255,6 +252,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
ETH_ALEN);
}
+ p->action = parm->action;
spin_lock_bh(&v->tcf_lock);
goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
p = rcu_replace_pointer(v->vlan_p, p, lockdep_is_held(&v->tcf_lock));
@@ -297,9 +295,9 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
};
struct tcf_t t;
- spin_lock_bh(&v->tcf_lock);
- opt.action = v->tcf_action;
- p = rcu_dereference_protected(v->vlan_p, lockdep_is_held(&v->tcf_lock));
+ rcu_read_lock();
+ p = rcu_dereference(v->vlan_p);
+ opt.action = p->action;
opt.v_action = p->tcfv_action;
if (nla_put(skb, TCA_VLAN_PARMS, sizeof(opt), &opt))
goto nla_put_failure;
@@ -325,12 +323,12 @@ static int tcf_vlan_dump(struct sk_buff *skb, struct tc_action *a,
tcf_tm_dump(&t, &v->tcf_tm);
if (nla_put_64bit(skb, TCA_VLAN_TM, sizeof(t), &t, TCA_VLAN_PAD))
goto nla_put_failure;
- spin_unlock_bh(&v->tcf_lock);
+ rcu_read_unlock();
return skb->len;
nla_put_failure:
- spin_unlock_bh(&v->tcf_lock);
+ rcu_read_unlock();
nlmsg_trim(skb, b);
return -1;
}
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index d7c767b861a4..1e058b46d3e1 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -431,7 +431,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
- !memcmp(&rtab->data, nla_data(tab), 1024)) {
+ !memcmp(&rtab->data, nla_data(tab), TC_RTAB_SIZE)) {
rtab->refcnt++;
return rtab;
}
@@ -441,7 +441,7 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
if (rtab) {
rtab->rate = *r;
rtab->refcnt = 1;
- memcpy(rtab->data, nla_data(tab), 1024);
+ memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE);
if (r->linklayer == TC_LINKLAYER_UNAWARE)
r->linklayer = __detect_linklayer(r, rtab->data);
rtab->next = qdisc_rtab_list;
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index 24d5a35ce894..e947646a380c 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -7,9 +7,9 @@ menuconfig IP_SCTP
tristate "The SCTP Protocol"
depends on INET
depends on IPV6 || IPV6=n
- select CRYPTO
- select CRYPTO_HMAC
- select CRYPTO_SHA1
+ select CRYPTO_LIB_SHA1
+ select CRYPTO_LIB_SHA256
+ select CRYPTO_LIB_UTILS
select NET_CRC32C
select NET_UDP_TUNNEL
help
@@ -49,46 +49,25 @@ config SCTP_DBG_OBJCNT
'cat /proc/net/sctp/sctp_dbg_objcnt'
If unsure, say N
+
choice
- prompt "Default SCTP cookie HMAC encoding"
- default SCTP_DEFAULT_COOKIE_HMAC_MD5
+ prompt "Default SCTP cookie authentication method"
+ default SCTP_DEFAULT_COOKIE_HMAC_SHA256
help
- This option sets the default sctp cookie hmac algorithm
- when in doubt select 'md5'
+ This option sets the default SCTP cookie authentication method, for
+ when a method hasn't been explicitly selected via the
+ net.sctp.cookie_hmac_alg sysctl.
-config SCTP_DEFAULT_COOKIE_HMAC_MD5
- bool "Enable optional MD5 hmac cookie generation"
- help
- Enable optional MD5 hmac based SCTP cookie generation
- select SCTP_COOKIE_HMAC_MD5
+ If unsure, choose the default (HMAC-SHA256).
-config SCTP_DEFAULT_COOKIE_HMAC_SHA1
- bool "Enable optional SHA1 hmac cookie generation"
- help
- Enable optional SHA1 hmac based SCTP cookie generation
- select SCTP_COOKIE_HMAC_SHA1
+config SCTP_DEFAULT_COOKIE_HMAC_SHA256
+ bool "HMAC-SHA256"
config SCTP_DEFAULT_COOKIE_HMAC_NONE
- bool "Use no hmac alg in SCTP cookie generation"
- help
- Use no hmac algorithm in SCTP cookie generation
+ bool "None"
endchoice
-config SCTP_COOKIE_HMAC_MD5
- bool "Enable optional MD5 hmac cookie generation"
- help
- Enable optional MD5 hmac based SCTP cookie generation
- select CRYPTO_HMAC if SCTP_COOKIE_HMAC_MD5
- select CRYPTO_MD5 if SCTP_COOKIE_HMAC_MD5
-
-config SCTP_COOKIE_HMAC_SHA1
- bool "Enable optional SHA1 hmac cookie generation"
- help
- Enable optional SHA1 hmac based SCTP cookie generation
- select CRYPTO_HMAC if SCTP_COOKIE_HMAC_SHA1
- select CRYPTO_SHA1 if SCTP_COOKIE_HMAC_SHA1
-
config INET_SCTP_DIAG
depends on INET_DIAG
def_tristate INET_DIAG
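
The Kconfig choice above only picks the boot-time default; as the help text
notes, net.sctp.cookie_hmac_alg can still switch cookie authentication at
runtime. A minimal userspace sketch of doing so (assumes procfs is mounted at
/proc; "sha256" and "none" are the values accepted by the sysctl handler
later in this patch):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/net/sctp/cookie_hmac_alg", "w");

		if (!f)
			return 1;
		fputs("sha256", f);	/* or "none" to disable */
		return fclose(f) ? 1 : 0;
	}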
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index c58fffc86a0c..82aad477590e 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -12,36 +12,37 @@
* Vlad Yasevich <vladislav.yasevich@hp.com>
*/
-#include <crypto/hash.h>
+#include <crypto/sha1.h>
+#include <crypto/sha2.h>
#include <linux/slab.h>
#include <linux/types.h>
-#include <linux/scatterlist.h>
#include <net/sctp/sctp.h>
#include <net/sctp/auth.h>
-static struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = {
+static const struct sctp_hmac sctp_hmac_list[SCTP_AUTH_NUM_HMACS] = {
{
/* id 0 is reserved. as all 0 */
.hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_0,
},
{
.hmac_id = SCTP_AUTH_HMAC_ID_SHA1,
- .hmac_name = "hmac(sha1)",
- .hmac_len = SCTP_SHA1_SIG_SIZE,
+ .hmac_len = SHA1_DIGEST_SIZE,
},
{
/* id 2 is reserved as well */
.hmac_id = SCTP_AUTH_HMAC_ID_RESERVED_2,
},
-#if IS_ENABLED(CONFIG_CRYPTO_SHA256)
{
.hmac_id = SCTP_AUTH_HMAC_ID_SHA256,
- .hmac_name = "hmac(sha256)",
- .hmac_len = SCTP_SHA256_SIG_SIZE,
+ .hmac_len = SHA256_DIGEST_SIZE,
}
-#endif
};
+static bool sctp_hmac_supported(__u16 hmac_id)
+{
+ return hmac_id < ARRAY_SIZE(sctp_hmac_list) &&
+ sctp_hmac_list[hmac_id].hmac_len != 0;
+}
void sctp_auth_key_put(struct sctp_auth_bytes *key)
{
@@ -444,76 +445,7 @@ struct sctp_shared_key *sctp_auth_get_shkey(
return NULL;
}
-/*
- * Initialize all the possible digest transforms that we can use. Right
- * now, the supported digests are SHA1 and SHA256. We do this here once
- * because of the restrictiong that transforms may only be allocated in
- * user context. This forces us to pre-allocated all possible transforms
- * at the endpoint init time.
- */
-int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp)
-{
- struct crypto_shash *tfm = NULL;
- __u16 id;
-
- /* If the transforms are already allocated, we are done */
- if (ep->auth_hmacs)
- return 0;
-
- /* Allocated the array of pointers to transorms */
- ep->auth_hmacs = kcalloc(SCTP_AUTH_NUM_HMACS,
- sizeof(struct crypto_shash *),
- gfp);
- if (!ep->auth_hmacs)
- return -ENOMEM;
-
- for (id = 0; id < SCTP_AUTH_NUM_HMACS; id++) {
-
- /* See is we support the id. Supported IDs have name and
- * length fields set, so that we can allocated and use
- * them. We can safely just check for name, for without the
- * name, we can't allocate the TFM.
- */
- if (!sctp_hmac_list[id].hmac_name)
- continue;
-
- /* If this TFM has been allocated, we are all set */
- if (ep->auth_hmacs[id])
- continue;
-
- /* Allocate the ID */
- tfm = crypto_alloc_shash(sctp_hmac_list[id].hmac_name, 0, 0);
- if (IS_ERR(tfm))
- goto out_err;
-
- ep->auth_hmacs[id] = tfm;
- }
-
- return 0;
-
-out_err:
- /* Clean up any successful allocations */
- sctp_auth_destroy_hmacs(ep->auth_hmacs);
- ep->auth_hmacs = NULL;
- return -ENOMEM;
-}
-
-/* Destroy the hmac tfm array */
-void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[])
-{
- int i;
-
- if (!auth_hmacs)
- return;
-
- for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) {
- crypto_free_shash(auth_hmacs[i]);
- }
- kfree(auth_hmacs);
-}
-
-
-struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id)
+const struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id)
{
return &sctp_hmac_list[hmac_id];
}
@@ -521,7 +453,8 @@ struct sctp_hmac *sctp_auth_get_hmac(__u16 hmac_id)
/* Get an hmac description information that we can use to build
* the AUTH chunk
*/
-struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
+const struct sctp_hmac *
+sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
{
struct sctp_hmac_algo_param *hmacs;
__u16 n_elt;
@@ -543,26 +476,10 @@ struct sctp_hmac *sctp_auth_asoc_get_hmac(const struct sctp_association *asoc)
sizeof(struct sctp_paramhdr)) >> 1;
for (i = 0; i < n_elt; i++) {
id = ntohs(hmacs->hmac_ids[i]);
-
- /* Check the id is in the supported range. And
- * see if we support the id. Supported IDs have name and
- * length fields set, so that we can allocate and use
- * them. We can safely just check for name, for without the
- * name, we can't allocate the TFM.
- */
- if (id > SCTP_AUTH_HMAC_ID_MAX ||
- !sctp_hmac_list[id].hmac_name) {
- id = 0;
- continue;
- }
-
- break;
+ if (sctp_hmac_supported(id))
+ return &sctp_hmac_list[id];
}
-
- if (id == 0)
- return NULL;
-
- return &sctp_hmac_list[id];
+ return NULL;
}
static int __sctp_auth_find_hmacid(__be16 *hmacs, int n_elts, __be16 hmac_id)
@@ -606,7 +523,6 @@ int sctp_auth_asoc_verify_hmac_id(const struct sctp_association *asoc,
void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc,
struct sctp_hmac_algo_param *hmacs)
{
- struct sctp_endpoint *ep;
__u16 id;
int i;
int n_params;
@@ -617,16 +533,9 @@ void sctp_auth_asoc_set_default_hmac(struct sctp_association *asoc,
n_params = (ntohs(hmacs->param_hdr.length) -
sizeof(struct sctp_paramhdr)) >> 1;
- ep = asoc->ep;
for (i = 0; i < n_params; i++) {
id = ntohs(hmacs->hmac_ids[i]);
-
- /* Check the id is in the supported range */
- if (id > SCTP_AUTH_HMAC_ID_MAX)
- continue;
-
- /* If this TFM has been allocated, use this id */
- if (ep->auth_hmacs[id]) {
+ if (sctp_hmac_supported(id)) {
asoc->default_hmac_id = id;
break;
}
@@ -709,10 +618,9 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
struct sctp_shared_key *ep_key, gfp_t gfp)
{
struct sctp_auth_bytes *asoc_key;
- struct crypto_shash *tfm;
__u16 key_id, hmac_id;
- unsigned char *end;
int free_key = 0;
+ size_t data_len;
__u8 *digest;
/* Extract the info we need:
@@ -733,19 +641,17 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
free_key = 1;
}
- /* set up scatter list */
- end = skb_tail_pointer(skb);
-
- tfm = asoc->ep->auth_hmacs[hmac_id];
-
+ data_len = skb_tail_pointer(skb) - (unsigned char *)auth;
digest = (u8 *)(&auth->auth_hdr + 1);
- if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len))
- goto free;
-
- crypto_shash_tfm_digest(tfm, (u8 *)auth, end - (unsigned char *)auth,
- digest);
+ if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1) {
+ hmac_sha1_usingrawkey(asoc_key->data, asoc_key->len,
+ (const u8 *)auth, data_len, digest);
+ } else {
+ WARN_ON_ONCE(hmac_id != SCTP_AUTH_HMAC_ID_SHA256);
+ hmac_sha256_usingrawkey(asoc_key->data, asoc_key->len,
+ (const u8 *)auth, data_len, digest);
+ }
-free:
if (free_key)
sctp_auth_key_put(asoc_key);
}
@@ -788,14 +694,11 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep,
for (i = 0; i < hmacs->shmac_num_idents; i++) {
id = hmacs->shmac_idents[i];
- if (id > SCTP_AUTH_HMAC_ID_MAX)
+ if (!sctp_hmac_supported(id))
return -EOPNOTSUPP;
if (SCTP_AUTH_HMAC_ID_SHA1 == id)
has_sha1 = 1;
-
- if (!sctp_hmac_list[id].hmac_name)
- return -EOPNOTSUPP;
}
if (!has_sha1)
@@ -1021,8 +924,6 @@ int sctp_auth_deact_key_id(struct sctp_endpoint *ep,
int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp)
{
- int err = -ENOMEM;
-
/* Allocate space for HMACS and CHUNKS authentication
* variables. There are arrays that we encode directly
* into parameters to make the rest of the operations easier.
@@ -1060,13 +961,6 @@ int sctp_auth_init(struct sctp_endpoint *ep, gfp_t gfp)
ep->auth_chunk_list = auth_chunks;
}
- /* Allocate and initialize transorms arrays for supported
- * HMACs.
- */
- err = sctp_auth_init_hmacs(ep, gfp);
- if (err)
- goto nomem;
-
return 0;
nomem:
@@ -1075,7 +969,7 @@ nomem:
kfree(ep->auth_chunk_list);
ep->auth_hmacs_list = NULL;
ep->auth_chunk_list = NULL;
- return err;
+ return -ENOMEM;
}
void sctp_auth_free(struct sctp_endpoint *ep)
@@ -1084,6 +978,4 @@ void sctp_auth_free(struct sctp_endpoint *ep)
kfree(ep->auth_chunk_list);
ep->auth_hmacs_list = NULL;
ep->auth_chunk_list = NULL;
- sctp_auth_destroy_hmacs(ep->auth_hmacs);
- ep->auth_hmacs = NULL;
}
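
With the crypto_shash machinery gone, auth.c computes SCTP-AUTH digests via
the one-shot library helpers seen in the hunks above. A condensed sketch of
the dispatch, using the same calls as the patch but stripped of the chunk
bookkeeping (kernel context assumed):

	#include <crypto/sha1.h>
	#include <crypto/sha2.h>

	/* key/key_len: association key; data/data_len: bytes to authenticate. */
	static void sketch_auth_digest(__u16 hmac_id, const u8 *key,
				       size_t key_len, const u8 *data,
				       size_t data_len, u8 *digest)
	{
		if (hmac_id == SCTP_AUTH_HMAC_ID_SHA1)
			hmac_sha1_usingrawkey(key, key_len, data, data_len,
					      digest);
		else
			hmac_sha256_usingrawkey(key, key_len, data, data_len,
						digest);
	}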
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index fd4f8243cc35..c655b571ca01 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -184,7 +184,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
* DATA.
*/
if (sctp_auth_send_cid(SCTP_CID_DATA, asoc)) {
- struct sctp_hmac *hmac_desc = sctp_auth_asoc_get_hmac(asoc);
+ const struct sctp_hmac *hmac_desc =
+ sctp_auth_asoc_get_hmac(asoc);
if (hmac_desc)
max_data -= SCTP_PAD4(sizeof(struct sctp_auth_chunk) +
diff --git a/net/sctp/diag.c b/net/sctp/diag.c
index 23359e522273..996c2018f0e6 100644
--- a/net/sctp/diag.c
+++ b/net/sctp/diag.c
@@ -173,7 +173,7 @@ static int inet_sctp_diag_fill(struct sock *sk, struct sctp_association *asoc,
mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
- mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
+ mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
if (nla_put(skb, INET_DIAG_SKMEMINFO, sizeof(mem), &mem) < 0)
goto errout;
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 7e77b450697c..31e989dfe846 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -35,6 +35,15 @@
/* Forward declarations for internal helpers. */
static void sctp_endpoint_bh_rcv(struct work_struct *work);
+static void gen_cookie_auth_key(struct hmac_sha256_key *key)
+{
+ u8 raw_key[SCTP_COOKIE_KEY_SIZE];
+
+ get_random_bytes(raw_key, sizeof(raw_key));
+ hmac_sha256_preparekey(key, raw_key, sizeof(raw_key));
+ memzero_explicit(raw_key, sizeof(raw_key));
+}
+
/*
* Initialize the base fields of the endpoint structure.
*/
@@ -45,10 +54,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
struct net *net = sock_net(sk);
struct sctp_shared_key *null_key;
- ep->digest = kzalloc(SCTP_SIGNATURE_SIZE, gfp);
- if (!ep->digest)
- return NULL;
-
ep->asconf_enable = net->sctp.addip_enable;
ep->auth_enable = net->sctp.auth_enable;
if (ep->auth_enable) {
@@ -90,8 +95,8 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
/* Get the receive buffer policy for this endpoint */
ep->rcvbuf_policy = net->sctp.rcvbuf_policy;
- /* Initialize the secret key used with cookie. */
- get_random_bytes(ep->secret_key, sizeof(ep->secret_key));
+ /* Generate the cookie authentication key. */
+ gen_cookie_auth_key(&ep->cookie_auth_key);
/* SCTP-AUTH extensions*/
INIT_LIST_HEAD(&ep->endpoint_shared_keys);
@@ -118,7 +123,6 @@ static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep,
nomem_shkey:
sctp_auth_free(ep);
nomem:
- kfree(ep->digest);
return NULL;
}
@@ -205,9 +209,6 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep)
return;
}
- /* Free the digest buffer */
- kfree(ep->digest);
-
/* SCTP-AUTH: Free up AUTH related data such as shared keys
* chunks and hmacs arrays that were allocated
*/
@@ -218,7 +219,7 @@ static void sctp_endpoint_destroy(struct sctp_endpoint *ep)
sctp_inq_free(&ep->base.inqueue);
sctp_bind_addr_free(&ep->base.bind_addr);
- memset(ep->secret_key, 0, sizeof(ep->secret_key));
+ memzero_explicit(&ep->cookie_auth_key, sizeof(ep->cookie_auth_key));
sk = ep->base.sk;
/* Remove and free the port */
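
The endpoint now carries a prepared HMAC-SHA256 key instead of raw secret
bytes plus a separately allocated digest buffer. The prepare-once/use-many
pattern from gen_cookie_auth_key() above, in isolation (sketch):

	u8 raw[SCTP_COOKIE_KEY_SIZE];
	struct hmac_sha256_key key;

	get_random_bytes(raw, sizeof(raw));
	hmac_sha256_preparekey(&key, raw, sizeof(raw));
	memzero_explicit(raw, sizeof(raw));	/* raw bytes no longer needed */
	/* later, any number of times: hmac_sha256(&key, data, len, mac); */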
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 74bff317e205..1ed281f3c355 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -52,21 +52,21 @@ static const struct snmp_mib sctp_snmp_list[] = {
SNMP_MIB_ITEM("SctpInPktBacklog", SCTP_MIB_IN_PKT_BACKLOG),
SNMP_MIB_ITEM("SctpInPktDiscards", SCTP_MIB_IN_PKT_DISCARDS),
SNMP_MIB_ITEM("SctpInDataChunkDiscards", SCTP_MIB_IN_DATA_CHUNK_DISCARDS),
- SNMP_MIB_SENTINEL
};
/* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */
static int sctp_snmp_seq_show(struct seq_file *seq, void *v)
{
- unsigned long buff[SCTP_MIB_MAX];
+ unsigned long buff[ARRAY_SIZE(sctp_snmp_list)];
+ const int cnt = ARRAY_SIZE(sctp_snmp_list);
struct net *net = seq->private;
int i;
- memset(buff, 0, sizeof(unsigned long) * SCTP_MIB_MAX);
+ memset(buff, 0, sizeof(buff));
- snmp_get_cpu_field_batch(buff, sctp_snmp_list,
- net->sctp.sctp_statistics);
- for (i = 0; sctp_snmp_list[i].name; i++)
+ snmp_get_cpu_field_batch_cnt(buff, sctp_snmp_list, cnt,
+ net->sctp.sctp_statistics);
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name,
buff[i]);
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index a5ccada55f2b..9dbc24af749b 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -34,6 +34,7 @@
#include <linux/memblock.h>
#include <linux/highmem.h>
#include <linux/slab.h>
+#include <net/flow.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
@@ -437,7 +438,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
fl4->fl4_dport = daddr->v4.sin_port;
fl4->flowi4_proto = IPPROTO_SCTP;
if (asoc) {
- fl4->flowi4_tos = inet_dscp_to_dsfield(dscp);
+ fl4->flowi4_dscp = dscp;
fl4->flowi4_scope = ip_sock_rt_scope(asoc->base.sk);
fl4->flowi4_oif = asoc->base.sk->sk_bound_dev_if;
fl4->fl4_sport = htons(asoc->base.bind_addr.port);
@@ -1334,14 +1335,9 @@ static int __net_init sctp_defaults_init(struct net *net)
/* Whether Cookie Preservative is enabled(1) or not(0) */
net->sctp.cookie_preserve_enable = 1;
- /* Default sctp sockets to use md5 as their hmac alg */
-#if defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5)
- net->sctp.sctp_hmac_alg = "md5";
-#elif defined (CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1)
- net->sctp.sctp_hmac_alg = "sha1";
-#else
- net->sctp.sctp_hmac_alg = NULL;
-#endif
+ /* Whether cookie authentication is enabled(1) or not(0) */
+ net->sctp.cookie_auth_enable =
+ !IS_ENABLED(CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE);
/* Max.Burst - 4 */
net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST;
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 3ead591c72fd..2c0017d058d4 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -30,7 +30,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <crypto/hash.h>
+#include <crypto/utils.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/ip.h>
@@ -1319,7 +1319,7 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc,
__u16 key_id)
{
struct sctp_authhdr auth_hdr;
- struct sctp_hmac *hmac_desc;
+ const struct sctp_hmac *hmac_desc;
struct sctp_chunk *retval;
/* Get the first hmac that the peer told us to use */
@@ -1674,8 +1674,10 @@ static struct sctp_cookie_param *sctp_pack_cookie(
* out on the network.
*/
retval = kzalloc(*cookie_len, GFP_ATOMIC);
- if (!retval)
- goto nodata;
+ if (!retval) {
+ *cookie_len = 0;
+ return NULL;
+ }
cookie = (struct sctp_signed_cookie *) retval->body;
@@ -1706,26 +1708,14 @@ static struct sctp_cookie_param *sctp_pack_cookie(
memcpy((__u8 *)(cookie + 1) +
ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len);
- if (sctp_sk(ep->base.sk)->hmac) {
- struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac;
- int err;
-
- /* Sign the message. */
- err = crypto_shash_setkey(tfm, ep->secret_key,
- sizeof(ep->secret_key)) ?:
- crypto_shash_tfm_digest(tfm, (u8 *)&cookie->c, bodysize,
- cookie->signature);
- if (err)
- goto free_cookie;
+ /* Sign the cookie, if cookie authentication is enabled. */
+ if (sctp_sk(ep->base.sk)->cookie_auth_enable) {
+ static_assert(sizeof(cookie->mac) == SHA256_DIGEST_SIZE);
+ hmac_sha256(&ep->cookie_auth_key, (const u8 *)&cookie->c,
+ bodysize, cookie->mac);
}
return retval;
-
-free_cookie:
- kfree(retval);
-nodata:
- *cookie_len = 0;
- return NULL;
}
/* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */
@@ -1740,7 +1730,6 @@ struct sctp_association *sctp_unpack_cookie(
struct sctp_signed_cookie *cookie;
struct sk_buff *skb = chunk->skb;
struct sctp_cookie *bear_cookie;
- __u8 *digest = ep->digest;
enum sctp_scope scope;
unsigned int len;
ktime_t kt;
@@ -1770,30 +1759,19 @@ struct sctp_association *sctp_unpack_cookie(
cookie = chunk->subh.cookie_hdr;
bear_cookie = &cookie->c;
- if (!sctp_sk(ep->base.sk)->hmac)
- goto no_hmac;
+ /* Verify the cookie's MAC, if cookie authentication is enabled. */
+ if (sctp_sk(ep->base.sk)->cookie_auth_enable) {
+ u8 mac[SHA256_DIGEST_SIZE];
- /* Check the signature. */
- {
- struct crypto_shash *tfm = sctp_sk(ep->base.sk)->hmac;
- int err;
-
- err = crypto_shash_setkey(tfm, ep->secret_key,
- sizeof(ep->secret_key)) ?:
- crypto_shash_tfm_digest(tfm, (u8 *)bear_cookie, bodysize,
- digest);
- if (err) {
- *error = -SCTP_IERROR_NOMEM;
+ hmac_sha256(&ep->cookie_auth_key, (const u8 *)bear_cookie,
+ bodysize, mac);
+ static_assert(sizeof(cookie->mac) == sizeof(mac));
+ if (crypto_memneq(mac, cookie->mac, sizeof(mac))) {
+ *error = -SCTP_IERROR_BAD_SIG;
goto fail;
}
}
- if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) {
- *error = -SCTP_IERROR_BAD_SIG;
- goto fail;
- }
-
-no_hmac:
/* IG Section 2.35.2:
* 3) Compare the port numbers and the verification tag contained
* within the COOKIE ECHO chunk to the actual port numbers and the
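
Taken together, the two hunks above give cookie signing and verification a
symmetric shape, with the verify side using constant-time crypto_memneq().
A compact sketch (field names as in the patch; error handling elided):

	u8 mac[SHA256_DIGEST_SIZE];

	/* Sign (sctp_pack_cookie): */
	hmac_sha256(&ep->cookie_auth_key, (const u8 *)&cookie->c, bodysize,
		    cookie->mac);

	/* Verify (sctp_unpack_cookie): recompute, compare in constant time. */
	hmac_sha256(&ep->cookie_auth_key, (const u8 *)&cookie->c, bodysize, mac);
	if (crypto_memneq(mac, cookie->mac, sizeof(mac)))
		goto bad_sig;	/* reject forged or stale cookies */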
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index a0524ba8d787..4cb8f393434d 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -30,6 +30,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <crypto/utils.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/ip.h>
@@ -4361,7 +4362,7 @@ static enum sctp_ierror sctp_sf_authenticate(
struct sctp_shared_key *sh_key = NULL;
struct sctp_authhdr *auth_hdr;
__u8 *save_digest, *digest;
- struct sctp_hmac *hmac;
+ const struct sctp_hmac *hmac;
unsigned int sig_len;
__u16 key_id;
@@ -4416,7 +4417,7 @@ static enum sctp_ierror sctp_sf_authenticate(
sh_key, GFP_ATOMIC);
/* Discard the packet if the digests do not match */
- if (memcmp(save_digest, digest, sig_len)) {
+ if (crypto_memneq(save_digest, digest, sig_len)) {
kfree(save_digest);
return SCTP_IERROR_BAD_SIG;
}
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 4921416434f9..ed8293a34240 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -37,7 +37,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <crypto/hash.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/wait.h>
@@ -4987,7 +4986,7 @@ static int sctp_init_sock(struct sock *sk)
sp->default_rcv_context = 0;
sp->max_burst = net->sctp.max_burst;
- sp->sctp_hmac_alg = net->sctp.sctp_hmac_alg;
+ sp->cookie_auth_enable = net->sctp.cookie_auth_enable;
/* Initialize default setup parameters. These parameters
* can be modified with the SCTP_INITMSG socket option or
@@ -5079,8 +5078,6 @@ static int sctp_init_sock(struct sock *sk)
if (!sp->ep)
return -ENOMEM;
- sp->hmac = NULL;
-
sk->sk_destruct = sctp_destruct_sock;
SCTP_DBG_OBJCNT_INC(sock);
@@ -5117,18 +5114,8 @@ static void sctp_destroy_sock(struct sock *sk)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
}
-/* Triggered when there are no references on the socket anymore */
-static void sctp_destruct_common(struct sock *sk)
-{
- struct sctp_sock *sp = sctp_sk(sk);
-
- /* Free up the HMAC transform. */
- crypto_free_shash(sp->hmac);
-}
-
static void sctp_destruct_sock(struct sock *sk)
{
- sctp_destruct_common(sk);
inet_sock_destruct(sk);
}
@@ -8530,22 +8517,8 @@ static int sctp_listen_start(struct sock *sk, int backlog)
{
struct sctp_sock *sp = sctp_sk(sk);
struct sctp_endpoint *ep = sp->ep;
- struct crypto_shash *tfm = NULL;
- char alg[32];
int err;
- /* Allocate HMAC for generating cookie. */
- if (!sp->hmac && sp->sctp_hmac_alg) {
- sprintf(alg, "hmac(%s)", sp->sctp_hmac_alg);
- tfm = crypto_alloc_shash(alg, 0, 0);
- if (IS_ERR(tfm)) {
- net_info_ratelimited("failed to load transform for %s: %ld\n",
- sp->sctp_hmac_alg, PTR_ERR(tfm));
- return -ENOSYS;
- }
- sctp_sk(sk)->hmac = tfm;
- }
-
/*
* If a bind() or sctp_bindx() is not called prior to a listen()
* call that allows new associations to be accepted, the system
@@ -9561,7 +9534,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
* copy.
*/
newsp->ep = newep;
- newsp->hmac = NULL;
/* Hook this new socket in to the bind_hash list. */
head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk),
@@ -9581,16 +9553,6 @@ static int sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
if (err)
return err;
- /* New ep's auth_hmacs should be set if old ep's is set, in case
- * that net->sctp.auth_enable has been changed to 0 by users and
- * new ep's auth_hmacs couldn't be set in sctp_endpoint_init().
- */
- if (oldsp->ep->auth_hmacs) {
- err = sctp_auth_init_hmacs(newsp->ep, GFP_KERNEL);
- if (err)
- return err;
- }
-
sctp_auto_asconf_init(newsp);
/* Move any messages in the old socket's receive queue that are for the
@@ -9723,7 +9685,6 @@ struct proto sctp_prot = {
static void sctp_v6_destruct_sock(struct sock *sk)
{
- sctp_destruct_common(sk);
inet6_sock_destruct(sk);
}
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index ee3eac338a9d..15e7db9a3ab2 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -174,7 +174,7 @@ static struct ctl_table sctp_net_table[] = {
},
{
.procname = "cookie_hmac_alg",
- .data = &init_net.sctp.sctp_hmac_alg,
+ .data = &init_net.sctp.cookie_auth_enable,
.maxlen = 8,
.mode = 0644,
.proc_handler = proc_sctp_do_hmac_alg,
@@ -388,10 +388,8 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
struct net *net = container_of(ctl->data, struct net,
- sctp.sctp_hmac_alg);
+ sctp.cookie_auth_enable);
struct ctl_table tbl;
- bool changed = false;
- char *none = "none";
char tmp[8] = {0};
int ret;
@@ -399,35 +397,26 @@ static int proc_sctp_do_hmac_alg(const struct ctl_table *ctl, int write,
if (write) {
tbl.data = tmp;
- tbl.maxlen = sizeof(tmp);
- } else {
- tbl.data = net->sctp.sctp_hmac_alg ? : none;
- tbl.maxlen = strlen(tbl.data);
- }
-
- ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
- if (write && ret == 0) {
-#ifdef CONFIG_CRYPTO_MD5
- if (!strncmp(tmp, "md5", 3)) {
- net->sctp.sctp_hmac_alg = "md5";
- changed = true;
+ tbl.maxlen = sizeof(tmp) - 1;
+ ret = proc_dostring(&tbl, 1, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+ if (!strcmp(tmp, "sha256")) {
+ net->sctp.cookie_auth_enable = 1;
+ return 0;
}
-#endif
-#ifdef CONFIG_CRYPTO_SHA1
- if (!strncmp(tmp, "sha1", 4)) {
- net->sctp.sctp_hmac_alg = "sha1";
- changed = true;
+ if (!strcmp(tmp, "none")) {
+ net->sctp.cookie_auth_enable = 0;
+ return 0;
}
-#endif
- if (!strncmp(tmp, "none", 4)) {
- net->sctp.sctp_hmac_alg = NULL;
- changed = true;
- }
- if (!changed)
- ret = -EINVAL;
+ return -EINVAL;
}
-
- return ret;
+ if (net->sctp.cookie_auth_enable)
+ tbl.data = (char *)"sha256";
+ else
+ tbl.data = (char *)"none";
+ tbl.maxlen = strlen(tbl.data);
+ return proc_dostring(&tbl, 0, buffer, lenp, ppos);
}
static int proc_sctp_do_rto_min(const struct ctl_table *ctl, int write,
diff --git a/net/smc/Kconfig b/net/smc/Kconfig
index ba5e6a2dd2fd..99ecd59d1f4b 100644
--- a/net/smc/Kconfig
+++ b/net/smc/Kconfig
@@ -1,8 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config SMC
tristate "SMC socket protocol family"
- depends on INET && INFINIBAND
- depends on m || ISM != m
+ depends on INET && INFINIBAND && DIBS
help
SMC-R provides a "sockets over RDMA" solution making use of
RDMA over Converged Ethernet (RoCE) technology to upgrade
@@ -20,16 +19,3 @@ config SMC_DIAG
smcss.
if unsure, say Y.
-
-config SMC_LO
- bool "SMC intra-OS shortcut with loopback-ism"
- depends on SMC
- default n
- help
- SMC_LO enables the creation of an Emulated-ISM device named
- loopback-ism in SMC and makes use of it for transferring data
- when communication occurs within the same OS. This helps in
- convenient testing of SMC-D since loopback-ism is independent
- of architecture or hardware.
-
- if unsure, say N.
diff --git a/net/smc/Makefile b/net/smc/Makefile
index 60f1c87d5212..0e754cbc38f9 100644
--- a/net/smc/Makefile
+++ b/net/smc/Makefile
@@ -6,4 +6,3 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
smc-y += smc_tracepoint.o smc_inet.o
smc-$(CONFIG_SYSCTL) += smc_sysctl.o
-smc-$(CONFIG_SMC_LO) += smc_loopback.o
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index e0e48f24cd61..77b99e8ef35a 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -57,7 +57,6 @@
#include "smc_stats.h"
#include "smc_tracepoint.h"
#include "smc_sysctl.h"
-#include "smc_loopback.h"
#include "smc_inet.h"
static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
@@ -1097,8 +1096,7 @@ static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
}
/* Check for VLAN ID and register it on ISM device just for CLC handshake */
-static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
- struct smc_init_info *ini)
+static int smc_connect_ism_vlan_setup(struct smc_init_info *ini)
{
if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
return SMC_CLC_DECL_ISMVLANERR;
@@ -1113,7 +1111,7 @@ static int smc_find_proposal_devices(struct smc_sock *smc,
/* check if there is an ism device available */
if (!(ini->smcd_version & SMC_V1) ||
smc_find_ism_device(smc, ini) ||
- smc_connect_ism_vlan_setup(smc, ini))
+ smc_connect_ism_vlan_setup(ini))
ini->smcd_version &= ~SMC_V1;
/* else ISM V1 is supported for this connection */
@@ -1158,8 +1156,7 @@ static int smc_find_proposal_devices(struct smc_sock *smc,
/* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
* used, the VLAN ID will be registered again during the connection setup.
*/
-static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
- struct smc_init_info *ini)
+static int smc_connect_ism_vlan_cleanup(struct smc_init_info *ini)
{
if (!smcd_indicated(ini->smc_type_v1))
return 0;
@@ -1582,13 +1579,13 @@ static int __smc_connect(struct smc_sock *smc)
goto vlan_cleanup;
SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc);
- smc_connect_ism_vlan_cleanup(smc, ini);
+ smc_connect_ism_vlan_cleanup(ini);
kfree(buf);
kfree(ini);
return 0;
vlan_cleanup:
- smc_connect_ism_vlan_cleanup(smc, ini);
+ smc_connect_ism_vlan_cleanup(ini);
kfree(buf);
fallback:
kfree(ini);
@@ -3537,15 +3534,15 @@ static int __init smc_init(void)
rc = -ENOMEM;
- smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0);
+ smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", WQ_PERCPU, 0);
if (!smc_tcp_ls_wq)
goto out_pnet;
- smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
+ smc_hs_wq = alloc_workqueue("smc_hs_wq", WQ_PERCPU, 0);
if (!smc_hs_wq)
goto out_alloc_tcp_ls_wq;
- smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
+ smc_close_wq = alloc_workqueue("smc_close_wq", WQ_PERCPU, 0);
if (!smc_close_wq)
goto out_alloc_hs_wq;
@@ -3593,16 +3590,10 @@ static int __init smc_init(void)
goto out_sock;
}
- rc = smc_loopback_init();
- if (rc) {
- pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc);
- goto out_ib;
- }
-
rc = tcp_register_ulp(&smc_ulp_ops);
if (rc) {
pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
- goto out_lo;
+ goto out_ib;
}
rc = smc_inet_init();
if (rc) {
@@ -3613,8 +3604,6 @@ static int __init smc_init(void)
return 0;
out_ulp:
tcp_unregister_ulp(&smc_ulp_ops);
-out_lo:
- smc_loopback_exit();
out_ib:
smc_ib_unregister_client();
out_sock:
@@ -3653,7 +3642,6 @@ static void __exit smc_exit(void)
tcp_unregister_ulp(&smc_ulp_ops);
sock_unregister(PF_SMC);
smc_core_exit();
- smc_loopback_exit();
smc_ib_unregister_client();
smc_ism_exit();
destroy_workqueue(smc_close_wq);
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 08be56dfb3f2..157aace169d4 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -509,10 +509,10 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl)
}
/* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */
-static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4,
+static int smc_clc_prfx_set4_rcu(struct net_device *dev, __be32 ipv4,
struct smc_clc_msg_proposal_prefix *prop)
{
- struct in_device *in_dev = __in_dev_get_rcu(dst->dev);
+ struct in_device *in_dev = __in_dev_get_rcu(dev);
const struct in_ifaddr *ifa;
if (!in_dev)
@@ -530,12 +530,12 @@ static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4,
}
/* fill CLC proposal msg with ipv6 prefixes from device */
-static int smc_clc_prfx_set6_rcu(struct dst_entry *dst,
+static int smc_clc_prfx_set6_rcu(struct net_device *dev,
struct smc_clc_msg_proposal_prefix *prop,
struct smc_clc_ipv6_prefix *ipv6_prfx)
{
#if IS_ENABLED(CONFIG_IPV6)
- struct inet6_dev *in6_dev = __in6_dev_get(dst->dev);
+ struct inet6_dev *in6_dev = __in6_dev_get(dev);
struct inet6_ifaddr *ifa;
int cnt = 0;
@@ -564,41 +564,44 @@ static int smc_clc_prfx_set(struct socket *clcsock,
struct smc_clc_msg_proposal_prefix *prop,
struct smc_clc_ipv6_prefix *ipv6_prfx)
{
- struct dst_entry *dst = sk_dst_get(clcsock->sk);
struct sockaddr_storage addrs;
struct sockaddr_in6 *addr6;
struct sockaddr_in *addr;
+ struct net_device *dev;
+ struct dst_entry *dst;
int rc = -ENOENT;
- if (!dst) {
- rc = -ENOTCONN;
- goto out;
- }
- if (!dst->dev) {
- rc = -ENODEV;
- goto out_rel;
- }
/* get address to which the internal TCP socket is bound */
if (kernel_getsockname(clcsock, (struct sockaddr *)&addrs) < 0)
- goto out_rel;
+ goto out;
+
/* analyze IP specific data of net_device belonging to TCP socket */
addr6 = (struct sockaddr_in6 *)&addrs;
+
rcu_read_lock();
+
+ dst = __sk_dst_get(clcsock->sk);
+ dev = dst ? dst_dev_rcu(dst) : NULL;
+ if (!dev) {
+ rc = -ENODEV;
+ goto out_unlock;
+ }
+
if (addrs.ss_family == PF_INET) {
/* IPv4 */
addr = (struct sockaddr_in *)&addrs;
- rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop);
+ rc = smc_clc_prfx_set4_rcu(dev, addr->sin_addr.s_addr, prop);
} else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) {
/* mapped IPv4 address - peer is IPv4 only */
- rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3],
+ rc = smc_clc_prfx_set4_rcu(dev, addr6->sin6_addr.s6_addr32[3],
prop);
} else {
/* IPv6 */
- rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx);
+ rc = smc_clc_prfx_set6_rcu(dev, prop, ipv6_prfx);
}
+
+out_unlock:
rcu_read_unlock();
-out_rel:
- dst_release(dst);
out:
return rc;
}
@@ -654,26 +657,26 @@ static int smc_clc_prfx_match6_rcu(struct net_device *dev,
int smc_clc_prfx_match(struct socket *clcsock,
struct smc_clc_msg_proposal_prefix *prop)
{
- struct dst_entry *dst = sk_dst_get(clcsock->sk);
+ struct net_device *dev;
+ struct dst_entry *dst;
int rc;
- if (!dst) {
- rc = -ENOTCONN;
- goto out;
- }
- if (!dst->dev) {
+ rcu_read_lock();
+
+ dst = __sk_dst_get(clcsock->sk);
+ dev = dst ? dst_dev_rcu(dst) : NULL;
+ if (!dev) {
rc = -ENODEV;
- goto out_rel;
+ goto out;
}
- rcu_read_lock();
+
if (!prop->ipv6_prefixes_cnt)
- rc = smc_clc_prfx_match4_rcu(dst->dev, prop);
+ rc = smc_clc_prfx_match4_rcu(dev, prop);
else
- rc = smc_clc_prfx_match6_rcu(dst->dev, prop);
- rcu_read_unlock();
-out_rel:
- dst_release(dst);
+ rc = smc_clc_prfx_match6_rcu(dev, prop);
out:
+ rcu_read_unlock();
+
return rc;
}
@@ -913,7 +916,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
/* add SMC-D specifics */
if (ini->ism_dev[0]) {
smcd = ini->ism_dev[0];
- smcd->ops->get_local_gid(smcd, &smcd_gid);
+ copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid);
pclc_smcd->ism.gid = htonll(smcd_gid.gid);
pclc_smcd->ism.chid =
htons(smc_ism_get_chid(ini->ism_dev[0]));
@@ -963,7 +966,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
if (ini->ism_offered_cnt) {
for (i = 1; i <= ini->ism_offered_cnt; i++) {
smcd = ini->ism_dev[i];
- smcd->ops->get_local_gid(smcd, &smcd_gid);
+ copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid);
gidchids[entry].chid =
htons(smc_ism_get_chid(ini->ism_dev[i]));
gidchids[entry].gid = htonll(smcd_gid.gid);
@@ -1056,7 +1059,7 @@ smcd_clc_prep_confirm_accept(struct smc_connection *conn,
/* SMC-D specific settings */
memcpy(clc->hdr.eyecatcher, SMCD_EYECATCHER,
sizeof(SMCD_EYECATCHER));
- smcd->ops->get_local_gid(smcd, &smcd_gid);
+ copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid);
clc->hdr.typev1 = SMC_TYPE_D;
clc->d0.gid = htonll(smcd_gid.gid);
clc->d0.token = htonll(conn->rmb_desc->token);
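
All of the smc_clc.c conversions above follow one pattern: replace
sk_dst_get()/dst_release() with __sk_dst_get() plus dst_dev_rcu() inside a
single RCU read-side section, so no dst refcount is taken. The pattern in
isolation (do_work_rcu() is a hypothetical callee):

	rcu_read_lock();
	dst = __sk_dst_get(sk);			/* no refcount taken */
	dev = dst ? dst_dev_rcu(dst) : NULL;	/* valid only inside section */
	rc = dev ? do_work_rcu(dev) : -ENODEV;
	rcu_read_unlock();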
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 262746e304dd..be0c2da83d2b 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -85,7 +85,7 @@ static void smc_lgr_schedule_free_work(struct smc_link_group *lgr)
* otherwise there is a risk of out-of-sync link groups.
*/
if (!lgr->freeing) {
- mod_delayed_work(system_wq, &lgr->free_work,
+ mod_delayed_work(system_percpu_wq, &lgr->free_work,
(!lgr->is_smcd && lgr->role == SMC_CLNT) ?
SMC_LGR_FREE_DELAY_CLNT :
SMC_LGR_FREE_DELAY_SERV);
@@ -555,7 +555,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr,
if (nla_put_u32(skb, SMC_NLA_LGR_D_ID, *((u32 *)&lgr->id)))
goto errattr;
- smcd->ops->get_local_gid(smcd, &smcd_gid);
+ copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid);
if (nla_put_u64_64bit(skb, SMC_NLA_LGR_D_GID,
smcd_gid.gid, SMC_NLA_LGR_D_PAD))
goto errattr;
@@ -896,7 +896,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
rc = SMC_CLC_DECL_MEM;
goto ism_put_vlan;
}
- lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", 0, 0,
+ lgr->tx_wq = alloc_workqueue("smc_tx_wq-%*phN", WQ_PERCPU, 0,
SMC_LGR_ID_SIZE, &lgr->id);
if (!lgr->tx_wq) {
rc = -ENOMEM;
@@ -924,7 +924,7 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
if (ini->is_smcd) {
/* SMC-D specific settings */
smcd = ini->ism_dev[ini->ism_selected];
- get_device(smcd->ops->get_dev(smcd));
+ get_device(&smcd->dibs->dev);
lgr->peer_gid.gid =
ini->ism_peer_gid[ini->ism_selected].gid;
lgr->peer_gid.gid_ext =
@@ -1474,7 +1474,7 @@ static void smc_lgr_free(struct smc_link_group *lgr)
destroy_workqueue(lgr->tx_wq);
if (lgr->is_smcd) {
smc_ism_put_vlan(lgr->smcd, lgr->vlan_id);
- put_device(lgr->smcd->ops->get_dev(lgr->smcd));
+ put_device(&lgr->smcd->dibs->dev);
}
smc_lgr_put(lgr); /* theoretically last lgr_put */
}
@@ -1883,35 +1883,32 @@ static int smc_vlan_by_tcpsk_walk(struct net_device *lower_dev,
/* Determine vlan of internal TCP socket. */
int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
{
- struct dst_entry *dst = sk_dst_get(clcsock->sk);
struct netdev_nested_priv priv;
struct net_device *ndev;
+ struct dst_entry *dst;
int rc = 0;
ini->vlan_id = 0;
- if (!dst) {
- rc = -ENOTCONN;
- goto out;
- }
- if (!dst->dev) {
+
+ rcu_read_lock();
+
+ dst = __sk_dst_get(clcsock->sk);
+ ndev = dst ? dst_dev_rcu(dst) : NULL;
+ if (!ndev) {
rc = -ENODEV;
- goto out_rel;
+ goto out;
}
- ndev = dst->dev;
if (is_vlan_dev(ndev)) {
ini->vlan_id = vlan_dev_vlan_id(ndev);
- goto out_rel;
+ goto out;
}
priv.data = (void *)&ini->vlan_id;
- rtnl_lock();
- netdev_walk_all_lower_dev(ndev, smc_vlan_by_tcpsk_walk, &priv);
- rtnl_unlock();
-
-out_rel:
- dst_release(dst);
+ netdev_walk_all_lower_dev_rcu(ndev, smc_vlan_by_tcpsk_walk, &priv);
out:
+ rcu_read_unlock();
+
return rc;
}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 48a1b1dcb576..a5a78cbff341 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -13,6 +13,7 @@
#define _SMC_CORE_H
#include <linux/atomic.h>
+#include <linux/types.h>
#include <linux/smc.h>
#include <linux/pci.h>
#include <rdma/ib_verbs.h>
@@ -221,6 +222,10 @@ struct smc_buf_desc {
/* virtually contiguous */
};
struct { /* SMC-D */
+ /* SMC-D tx buffer */
+ bool is_attached;
+ /* no need for explicit writes */
+ /* SMC-D rx buffer: */
unsigned short sba_idx;
/* SBA index number */
u64 token;
diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
index 8ed2f6689b01..bf0beaa23bdb 100644
--- a/net/smc/smc_diag.c
+++ b/net/smc/smc_diag.c
@@ -175,7 +175,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
dinfo.linkid = *((u32 *)conn->lgr->id);
dinfo.peer_gid = conn->lgr->peer_gid.gid;
dinfo.peer_gid_ext = conn->lgr->peer_gid.gid_ext;
- smcd->ops->get_local_gid(smcd, &smcd_gid);
+ copy_to_smcdgid(&smcd_gid, &smcd->dibs->gid);
dinfo.my_gid = smcd_gid.gid;
dinfo.my_gid_ext = smcd_gid.gid_ext;
dinfo.token = conn->rmb_desc->token;
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index a42ef3f77b96..0052f02756eb 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -974,13 +974,17 @@ static int smc_ib_add_dev(struct ib_device *ibdev)
smcibdev->pnetid[i]))
smc_pnetid_by_table_ib(smcibdev, i + 1);
smc_copy_netdev_ifindex(smcibdev, i);
- pr_warn_ratelimited("smc: ib device %s port %d has pnetid "
- "%.16s%s\n",
- smcibdev->ibdev->name, i + 1,
- smcibdev->pnetid[i],
- smcibdev->pnetid_by_user[i] ?
- " (user defined)" :
- "");
+ if (smc_pnet_is_pnetid_set(smcibdev->pnetid[i]))
+ pr_warn_ratelimited("smc: ib device %s port %d has pnetid %.16s%s\n",
+ smcibdev->ibdev->name, i + 1,
+ smcibdev->pnetid[i],
+ smcibdev->pnetid_by_user[i] ?
+ " (user defined)" :
+ "");
+ else
+ pr_warn_ratelimited("smc: ib device %s port %d has no pnetid\n",
+ smcibdev->ibdev->name, i + 1);
+
}
schedule_work(&smcibdev->port_event_work);
return 0;
diff --git a/net/smc/smc_ism.c b/net/smc/smc_ism.c
index 84f98e18c7db..7b228ca2f96a 100644
--- a/net/smc/smc_ism.c
+++ b/net/smc/smc_ism.c
@@ -17,7 +17,7 @@
#include "smc_ism.h"
#include "smc_pnet.h"
#include "smc_netlink.h"
-#include "linux/ism.h"
+#include "linux/dibs.h"
struct smcd_dev_list smcd_dev_list = {
.list = LIST_HEAD_INIT(smcd_dev_list.list),
@@ -27,21 +27,24 @@ struct smcd_dev_list smcd_dev_list = {
static bool smc_ism_v2_capable;
static u8 smc_ism_v2_system_eid[SMC_MAX_EID_LEN];
-#if IS_ENABLED(CONFIG_ISM)
-static void smcd_register_dev(struct ism_dev *ism);
-static void smcd_unregister_dev(struct ism_dev *ism);
-static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event);
-static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno,
+static void smcd_register_dev(struct dibs_dev *dibs);
+static void smcd_unregister_dev(struct dibs_dev *dibs);
+static void smcd_handle_event(struct dibs_dev *dibs,
+ const struct dibs_event *event);
+static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno,
u16 dmbemask);
-static struct ism_client smc_ism_client = {
- .name = "SMC-D",
- .add = smcd_register_dev,
- .remove = smcd_unregister_dev,
+static struct dibs_client_ops smc_client_ops = {
+ .add_dev = smcd_register_dev,
+ .del_dev = smcd_unregister_dev,
.handle_event = smcd_handle_event,
.handle_irq = smcd_handle_irq,
};
-#endif
+
+static struct dibs_client smc_dibs_client = {
+ .name = "SMC-D",
+ .ops = &smc_client_ops,
+};
static void smc_ism_create_system_eid(void)
{
@@ -68,8 +71,12 @@ static void smc_ism_create_system_eid(void)
int smc_ism_cantalk(struct smcd_gid *peer_gid, unsigned short vlan_id,
struct smcd_dev *smcd)
{
- return smcd->ops->query_remote_gid(smcd, peer_gid, vlan_id ? 1 : 0,
- vlan_id);
+ struct dibs_dev *dibs = smcd->dibs;
+ uuid_t ism_rgid;
+
+ copy_to_dibsgid(&ism_rgid, peer_gid);
+ return dibs->ops->query_remote_gid(dibs, &ism_rgid, vlan_id ? 1 : 0,
+ vlan_id);
}
void smc_ism_get_system_eid(u8 **eid)
@@ -82,7 +89,7 @@ void smc_ism_get_system_eid(u8 **eid)
u16 smc_ism_get_chid(struct smcd_dev *smcd)
{
- return smcd->ops->get_chid(smcd);
+ return smcd->dibs->ops->get_fabric_id(smcd->dibs);
}
/* HW supports ISM V2 and thus System EID is defined */
@@ -131,7 +138,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid)
if (!vlanid) /* No valid vlan id */
return -EINVAL;
- if (!smcd->ops->add_vlan_id)
+ if (!smcd->dibs->ops->add_vlan_id)
return -EOPNOTSUPP;
/* create new vlan entry, in case we need it */
@@ -154,7 +161,7 @@ int smc_ism_get_vlan(struct smcd_dev *smcd, unsigned short vlanid)
/* no existing entry found.
* add new entry to device; might fail, e.g., if HW limit reached
*/
- if (smcd->ops->add_vlan_id(smcd, vlanid)) {
+ if (smcd->dibs->ops->add_vlan_id(smcd->dibs, vlanid)) {
kfree(new_vlan);
rc = -EIO;
goto out;
@@ -178,7 +185,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid)
if (!vlanid) /* No valid vlan id */
return -EINVAL;
- if (!smcd->ops->del_vlan_id)
+ if (!smcd->dibs->ops->del_vlan_id)
return -EOPNOTSUPP;
spin_lock_irqsave(&smcd->lock, flags);
@@ -196,7 +203,7 @@ int smc_ism_put_vlan(struct smcd_dev *smcd, unsigned short vlanid)
}
/* Found and the last reference just gone */
- if (smcd->ops->del_vlan_id(smcd, vlanid))
+ if (smcd->dibs->ops->del_vlan_id(smcd->dibs, vlanid))
rc = -EIO;
list_del(&vlan->list);
kfree(vlan);
@@ -205,43 +212,43 @@ out:
return rc;
}
-int smc_ism_unregister_dmb(struct smcd_dev *smcd, struct smc_buf_desc *dmb_desc)
+void smc_ism_unregister_dmb(struct smcd_dev *smcd,
+ struct smc_buf_desc *dmb_desc)
{
- struct smcd_dmb dmb;
- int rc = 0;
+ struct dibs_dmb dmb;
if (!dmb_desc->dma_addr)
- return rc;
+ return;
memset(&dmb, 0, sizeof(dmb));
dmb.dmb_tok = dmb_desc->token;
- dmb.sba_idx = dmb_desc->sba_idx;
+ dmb.idx = dmb_desc->sba_idx;
dmb.cpu_addr = dmb_desc->cpu_addr;
dmb.dma_addr = dmb_desc->dma_addr;
dmb.dmb_len = dmb_desc->len;
- rc = smcd->ops->unregister_dmb(smcd, &dmb);
- if (!rc || rc == ISM_ERROR) {
- dmb_desc->cpu_addr = NULL;
- dmb_desc->dma_addr = 0;
- }
- return rc;
+ smcd->dibs->ops->unregister_dmb(smcd->dibs, &dmb);
+
+ return;
}
int smc_ism_register_dmb(struct smc_link_group *lgr, int dmb_len,
struct smc_buf_desc *dmb_desc)
{
- struct smcd_dmb dmb;
+ struct dibs_dev *dibs;
+ struct dibs_dmb dmb;
int rc;
memset(&dmb, 0, sizeof(dmb));
dmb.dmb_len = dmb_len;
- dmb.sba_idx = dmb_desc->sba_idx;
+ dmb.idx = dmb_desc->sba_idx;
dmb.vlan_id = lgr->vlan_id;
- dmb.rgid = lgr->peer_gid.gid;
- rc = lgr->smcd->ops->register_dmb(lgr->smcd, &dmb, lgr->smcd->client);
+ copy_to_dibsgid(&dmb.rgid, &lgr->peer_gid);
+
+ dibs = lgr->smcd->dibs;
+ rc = dibs->ops->register_dmb(dibs, &dmb, &smc_dibs_client);
if (!rc) {
- dmb_desc->sba_idx = dmb.sba_idx;
+ dmb_desc->sba_idx = dmb.idx;
dmb_desc->token = dmb.dmb_tok;
dmb_desc->cpu_addr = dmb.cpu_addr;
dmb_desc->dma_addr = dmb.dma_addr;
@@ -256,38 +263,39 @@ bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd)
* merging sndbuf with peer DMB to avoid
* data copies between them.
*/
- return (smcd->ops->support_dmb_nocopy &&
- smcd->ops->support_dmb_nocopy(smcd));
+ return (smcd->dibs->ops->support_mmapped_rdmb &&
+ smcd->dibs->ops->support_mmapped_rdmb(smcd->dibs));
}
int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token,
struct smc_buf_desc *dmb_desc)
{
- struct smcd_dmb dmb;
+ struct dibs_dmb dmb;
int rc = 0;
- if (!dev->ops->attach_dmb)
+ if (!dev->dibs->ops->attach_dmb)
return -EINVAL;
memset(&dmb, 0, sizeof(dmb));
dmb.dmb_tok = token;
- rc = dev->ops->attach_dmb(dev, &dmb);
+ rc = dev->dibs->ops->attach_dmb(dev->dibs, &dmb);
if (!rc) {
- dmb_desc->sba_idx = dmb.sba_idx;
+ dmb_desc->sba_idx = dmb.idx;
dmb_desc->token = dmb.dmb_tok;
dmb_desc->cpu_addr = dmb.cpu_addr;
dmb_desc->dma_addr = dmb.dma_addr;
dmb_desc->len = dmb.dmb_len;
+ dmb_desc->is_attached = true;
}
return rc;
}
int smc_ism_detach_dmb(struct smcd_dev *dev, u64 token)
{
- if (!dev->ops->detach_dmb)
+ if (!dev->dibs->ops->detach_dmb)
return -EINVAL;
- return dev->ops->detach_dmb(dev, token);
+ return dev->dibs->ops->detach_dmb(dev->dibs, token);
}
static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd,
@@ -297,12 +305,12 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd,
char smc_pnet[SMC_MAX_PNETID_LEN + 1];
struct smc_pci_dev smc_pci_dev;
struct nlattr *port_attrs;
+ struct dibs_dev *dibs;
struct nlattr *attrs;
- struct ism_dev *ism;
int use_cnt = 0;
void *nlh;
- ism = smcd->priv;
+ dibs = smcd->dibs;
nlh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
&smc_gen_nl_family, NLM_F_MULTI,
SMC_NETLINK_GET_DEV_SMCD);
@@ -317,7 +325,7 @@ static int smc_nl_handle_smcd_dev(struct smcd_dev *smcd,
if (nla_put_u8(skb, SMC_NLA_DEV_IS_CRIT, use_cnt > 0))
goto errattr;
memset(&smc_pci_dev, 0, sizeof(smc_pci_dev));
- smc_set_pci_values(to_pci_dev(ism->dev.parent), &smc_pci_dev);
+ smc_set_pci_values(to_pci_dev(dibs->dev.parent), &smc_pci_dev);
if (nla_put_u32(skb, SMC_NLA_DEV_PCI_FID, smc_pci_dev.pci_fid))
goto errattr;
if (nla_put_u16(skb, SMC_NLA_DEV_PCI_CHID, smc_pci_dev.pci_pchid))
@@ -367,7 +375,7 @@ static void smc_nl_prep_smcd_dev(struct smcd_dev_list *dev_list,
list_for_each_entry(smcd, &dev_list->list, list) {
if (num < snum)
goto next;
- if (smc_ism_is_loopback(smcd))
+ if (smc_ism_is_loopback(smcd->dibs))
goto next;
if (smc_nl_handle_smcd_dev(smcd, skb, cb))
goto errout;
@@ -385,11 +393,10 @@ int smcd_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-#if IS_ENABLED(CONFIG_ISM)
struct smc_ism_event_work {
struct work_struct work;
struct smcd_dev *smcd;
- struct ism_event event;
+ struct dibs_event event;
};
#define ISM_EVENT_REQUEST 0x0001
@@ -409,25 +416,27 @@ union smcd_sw_event_info {
static void smcd_handle_sw_event(struct smc_ism_event_work *wrk)
{
- struct smcd_gid peer_gid = { .gid = wrk->event.tok,
- .gid_ext = 0 };
+ struct dibs_dev *dibs = wrk->smcd->dibs;
union smcd_sw_event_info ev_info;
+ struct smcd_gid peer_gid;
+ uuid_t ism_rgid;
- ev_info.info = wrk->event.info;
- switch (wrk->event.code) {
+ copy_to_smcdgid(&peer_gid, &wrk->event.gid);
+ ev_info.info = wrk->event.data;
+ switch (wrk->event.subtype) {
case ISM_EVENT_CODE_SHUTDOWN: /* Peer shut down DMBs */
smc_smcd_terminate(wrk->smcd, &peer_gid, ev_info.vlan_id);
break;
case ISM_EVENT_CODE_TESTLINK: /* Activity timer */
if (ev_info.code == ISM_EVENT_REQUEST &&
- wrk->smcd->ops->signal_event) {
+ dibs->ops->signal_event) {
ev_info.code = ISM_EVENT_RESPONSE;
- wrk->smcd->ops->signal_event(wrk->smcd,
- &peer_gid,
- ISM_EVENT_REQUEST_IR,
- ISM_EVENT_CODE_TESTLINK,
- ev_info.info);
- }
+ copy_to_dibsgid(&ism_rgid, &peer_gid);
+ dibs->ops->signal_event(dibs, &ism_rgid,
+ ISM_EVENT_REQUEST_IR,
+ ISM_EVENT_CODE_TESTLINK,
+ ev_info.info);
+ }
break;
}
}
@@ -437,41 +446,39 @@ static void smc_ism_event_work(struct work_struct *work)
{
struct smc_ism_event_work *wrk =
container_of(work, struct smc_ism_event_work, work);
- struct smcd_gid smcd_gid = { .gid = wrk->event.tok,
- .gid_ext = 0 };
+ struct smcd_gid smcd_gid;
+
+ copy_to_smcdgid(&smcd_gid, &wrk->event.gid);
switch (wrk->event.type) {
- case ISM_EVENT_GID: /* GID event, token is peer GID */
+ case DIBS_DEV_EVENT: /* GID event, token is peer GID */
smc_smcd_terminate(wrk->smcd, &smcd_gid, VLAN_VID_MASK);
break;
- case ISM_EVENT_DMB:
+ case DIBS_BUF_EVENT:
break;
- case ISM_EVENT_SWR: /* Software defined event */
+ case DIBS_SW_EVENT: /* Software defined event */
smcd_handle_sw_event(wrk);
break;
}
kfree(wrk);
}
-static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
- const struct smcd_ops *ops, int max_dmbs)
+static struct smcd_dev *smcd_alloc_dev(const char *name, int max_dmbs)
{
struct smcd_dev *smcd;
- smcd = devm_kzalloc(parent, sizeof(*smcd), GFP_KERNEL);
+ smcd = kzalloc(sizeof(*smcd), GFP_KERNEL);
if (!smcd)
return NULL;
- smcd->conn = devm_kcalloc(parent, max_dmbs,
- sizeof(struct smc_connection *), GFP_KERNEL);
+ smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *),
+ GFP_KERNEL);
if (!smcd->conn)
- return NULL;
+ goto free_smcd;
smcd->event_wq = alloc_ordered_workqueue("ism_evt_wq-%s)",
WQ_MEM_RECLAIM, name);
if (!smcd->event_wq)
- return NULL;
-
- smcd->ops = ops;
+ goto free_conn;
spin_lock_init(&smcd->lock);
spin_lock_init(&smcd->lgr_lock);
@@ -479,28 +486,37 @@ static struct smcd_dev *smcd_alloc_dev(struct device *parent, const char *name,
INIT_LIST_HEAD(&smcd->lgr_list);
init_waitqueue_head(&smcd->lgrs_deleted);
return smcd;
+
+free_conn:
+ kfree(smcd->conn);
+free_smcd:
+ kfree(smcd);
+ return NULL;
}
-static void smcd_register_dev(struct ism_dev *ism)
+static void smcd_register_dev(struct dibs_dev *dibs)
{
- const struct smcd_ops *ops = ism_get_smcd_ops();
struct smcd_dev *smcd, *fentry;
+ int max_dmbs;
- if (!ops)
- return;
+ max_dmbs = dibs->ops->max_dmbs();
- smcd = smcd_alloc_dev(&ism->pdev->dev, dev_name(&ism->pdev->dev), ops,
- ISM_NR_DMBS);
+ smcd = smcd_alloc_dev(dev_name(&dibs->dev), max_dmbs);
if (!smcd)
return;
- smcd->priv = ism;
- smcd->client = &smc_ism_client;
- ism_set_priv(ism, &smc_ism_client, smcd);
- if (smc_pnetid_by_dev_port(&ism->pdev->dev, 0, smcd->pnetid))
+
+ smcd->dibs = dibs;
+ dibs_set_priv(dibs, &smc_dibs_client, smcd);
+
+ if (smc_pnetid_by_dev_port(dibs->dev.parent, 0, smcd->pnetid))
smc_pnetid_by_table_smcd(smcd);
- if (smcd->ops->supports_v2())
+ if (smc_ism_is_loopback(dibs) ||
+ (dibs->ops->add_vlan_id &&
+ !dibs->ops->add_vlan_id(dibs, ISM_RESERVED_VLANID))) {
smc_ism_set_v2_capable();
+ }
+
mutex_lock(&smcd_dev_list.mutex);
/* sort list:
* - devices without pnetid before devices with pnetid;
@@ -509,7 +525,7 @@ static void smcd_register_dev(struct ism_dev *ism)
if (!smcd->pnetid[0]) {
fentry = list_first_entry_or_null(&smcd_dev_list.list,
struct smcd_dev, list);
- if (fentry && smc_ism_is_loopback(fentry))
+ if (fentry && smc_ism_is_loopback(fentry->dibs))
list_add(&smcd->list, &fentry->list);
else
list_add(&smcd->list, &smcd_dev_list.list);
@@ -518,25 +534,32 @@ static void smcd_register_dev(struct ism_dev *ism)
}
mutex_unlock(&smcd_dev_list.mutex);
- pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n",
- dev_name(&ism->dev), smcd->pnetid,
- smcd->pnetid_by_user ? " (user defined)" : "");
-
+ if (smc_pnet_is_pnetid_set(smcd->pnetid))
+ pr_warn_ratelimited("smc: adding smcd device %s with pnetid %.16s%s\n",
+ dev_name(&dibs->dev), smcd->pnetid,
+ smcd->pnetid_by_user ?
+ " (user defined)" :
+ "");
+ else
+ pr_warn_ratelimited("smc: adding smcd device %s without pnetid\n",
+ dev_name(&dibs->dev));
return;
}
-static void smcd_unregister_dev(struct ism_dev *ism)
+static void smcd_unregister_dev(struct dibs_dev *dibs)
{
- struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client);
+ struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client);
pr_warn_ratelimited("smc: removing smcd device %s\n",
- dev_name(&ism->dev));
+ dev_name(&dibs->dev));
smcd->going_away = 1;
smc_smcd_terminate_all(smcd);
mutex_lock(&smcd_dev_list.mutex);
list_del_init(&smcd->list);
mutex_unlock(&smcd_dev_list.mutex);
destroy_workqueue(smcd->event_wq);
+ kfree(smcd->conn);
+ kfree(smcd);
}
/* SMCD Device event handler. Called from ISM device interrupt handler.
@@ -550,9 +573,10 @@ static void smcd_unregister_dev(struct ism_dev *ism)
* Context:
* - Function called in IRQ context from ISM device driver event handler.
*/
-static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event)
+static void smcd_handle_event(struct dibs_dev *dibs,
+ const struct dibs_event *event)
{
- struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client);
+ struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client);
struct smc_ism_event_work *wrk;
if (smcd->going_away)
@@ -574,10 +598,10 @@ static void smcd_handle_event(struct ism_dev *ism, struct ism_event *event)
* Context:
* - Function called in IRQ context from ISM device driver IRQ handler.
*/
-static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno,
+static void smcd_handle_irq(struct dibs_dev *dibs, unsigned int dmbno,
u16 dmbemask)
{
- struct smcd_dev *smcd = ism_get_priv(ism, &smc_ism_client);
+ struct smcd_dev *smcd = dibs_get_priv(dibs, &smc_dibs_client);
struct smc_connection *conn = NULL;
unsigned long flags;
@@ -587,27 +611,26 @@ static void smcd_handle_irq(struct ism_dev *ism, unsigned int dmbno,
tasklet_schedule(&conn->rx_tsklet);
spin_unlock_irqrestore(&smcd->lock, flags);
}
-#endif
int smc_ism_signal_shutdown(struct smc_link_group *lgr)
{
int rc = 0;
-#if IS_ENABLED(CONFIG_ISM)
union smcd_sw_event_info ev_info;
+ uuid_t ism_rgid;
if (lgr->peer_shutdown)
return 0;
- if (!lgr->smcd->ops->signal_event)
+ if (!lgr->smcd->dibs->ops->signal_event)
return 0;
memcpy(ev_info.uid, lgr->id, SMC_LGR_ID_SIZE);
ev_info.vlan_id = lgr->vlan_id;
ev_info.code = ISM_EVENT_REQUEST;
- rc = lgr->smcd->ops->signal_event(lgr->smcd, &lgr->peer_gid,
+ copy_to_dibsgid(&ism_rgid, &lgr->peer_gid);
+ rc = lgr->smcd->dibs->ops->signal_event(lgr->smcd->dibs, &ism_rgid,
ISM_EVENT_REQUEST_IR,
ISM_EVENT_CODE_SHUTDOWN,
ev_info.info);
-#endif
return rc;
}
@@ -618,15 +641,11 @@ int smc_ism_init(void)
smc_ism_v2_capable = false;
smc_ism_create_system_eid();
-#if IS_ENABLED(CONFIG_ISM)
- rc = ism_register_client(&smc_ism_client);
-#endif
+ rc = dibs_register_client(&smc_dibs_client);
return rc;
}
void smc_ism_exit(void)
{
-#if IS_ENABLED(CONFIG_ISM)
- ism_unregister_client(&smc_ism_client);
-#endif
+ dibs_unregister_client(&smc_dibs_client);
}
diff --git a/net/smc/smc_ism.h b/net/smc/smc_ism.h
index 6763133dd8d0..a1575e31df73 100644
--- a/net/smc/smc_ism.h
+++ b/net/smc/smc_ism.h
@@ -12,6 +12,7 @@
#include <linux/uio.h>
#include <linux/types.h>
#include <linux/mutex.h>
+#include <linux/dibs.h>
#include "smc.h"
@@ -47,7 +48,8 @@ int smc_ism_get_vlan(struct smcd_dev *dev, unsigned short vlan_id);
int smc_ism_put_vlan(struct smcd_dev *dev, unsigned short vlan_id);
int smc_ism_register_dmb(struct smc_link_group *lgr, int buf_size,
struct smc_buf_desc *dmb_desc);
-int smc_ism_unregister_dmb(struct smcd_dev *dev, struct smc_buf_desc *dmb_desc);
+void smc_ism_unregister_dmb(struct smcd_dev *dev,
+ struct smc_buf_desc *dmb_desc);
bool smc_ism_support_dmb_nocopy(struct smcd_dev *smcd);
int smc_ism_attach_dmb(struct smcd_dev *dev, u64 token,
struct smc_buf_desc *dmb_desc);
@@ -67,7 +69,9 @@ static inline int smc_ism_write(struct smcd_dev *smcd, u64 dmb_tok,
{
int rc;
- rc = smcd->ops->move_data(smcd, dmb_tok, idx, sf, offset, data, len);
+ rc = smcd->dibs->ops->move_data(smcd->dibs, dmb_tok, idx, sf, offset,
+ data, len);
+
return rc < 0 ? rc : 0;
}
@@ -84,14 +88,36 @@ static inline bool __smc_ism_is_emulated(u16 chid)
static inline bool smc_ism_is_emulated(struct smcd_dev *smcd)
{
- u16 chid = smcd->ops->get_chid(smcd);
+ u16 chid = smcd->dibs->ops->get_fabric_id(smcd->dibs);
return __smc_ism_is_emulated(chid);
}
-static inline bool smc_ism_is_loopback(struct smcd_dev *smcd)
+static inline bool smc_ism_is_loopback(struct dibs_dev *dibs)
{
- return (smcd->ops->get_chid(smcd) == 0xFFFF);
+ return (dibs->ops->get_fabric_id(dibs) == DIBS_LOOPBACK_FABRIC);
+}
+
+static inline void copy_to_smcdgid(struct smcd_gid *sgid, uuid_t *dibs_gid)
+{
+ __be64 temp;
+
+ memcpy(&temp, dibs_gid, sizeof(sgid->gid));
+ sgid->gid = ntohll(temp);
+ memcpy(&temp, (uint8_t *)dibs_gid + sizeof(sgid->gid),
+ sizeof(sgid->gid_ext));
+ sgid->gid_ext = ntohll(temp);
+}
+
+static inline void copy_to_dibsgid(uuid_t *dibs_gid, struct smcd_gid *sgid)
+{
+ __be64 temp;
+
+ temp = htonll(sgid->gid);
+ memcpy(dibs_gid, &temp, sizeof(sgid->gid));
+ temp = htonll(sgid->gid_ext);
+ memcpy((uint8_t *)dibs_gid + sizeof(sgid->gid), &temp,
+ sizeof(sgid->gid_ext));
}
#endif
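
The two helpers above map a 16-byte dibs uuid_t to the legacy smcd_gid pair
of host-endian u64s and back, treating the uuid bytes as big-endian. A
standalone userspace sketch of the same byte mapping (names illustrative;
uses be64toh()/htobe64() from <endian.h>):

	#include <endian.h>
	#include <stdint.h>
	#include <string.h>

	struct sgid { uint64_t gid, gid_ext; };

	static void uuid_to_sgid(const uint8_t uuid[16], struct sgid *s)
	{
		uint64_t be;

		memcpy(&be, uuid, 8);
		s->gid = be64toh(be);
		memcpy(&be, uuid + 8, 8);
		s->gid_ext = be64toh(be);
	}

	static void sgid_to_uuid(const struct sgid *s, uint8_t uuid[16])
	{
		uint64_t be = htobe64(s->gid);

		memcpy(uuid, &be, 8);
		be = htobe64(s->gid_ext);
		memcpy(uuid + 8, &be, 8);
	}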
diff --git a/net/smc/smc_loopback.c b/net/smc/smc_loopback.c
deleted file mode 100644
index 77cc1c6dc3e9..000000000000
--- a/net/smc/smc_loopback.c
+++ /dev/null
@@ -1,425 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Shared Memory Communications Direct over loopback-ism device.
- *
- * Functions for loopback-ism device.
- *
- * Copyright (c) 2024, Alibaba Inc.
- *
- * Author: Wen Gu <guwen@linux.alibaba.com>
- * Tony Lu <tonylu@linux.alibaba.com>
- *
- */
-
-#include <linux/device.h>
-#include <linux/types.h>
-#include <net/smc.h>
-
-#include "smc_cdc.h"
-#include "smc_ism.h"
-#include "smc_loopback.h"
-
-#define SMC_LO_V2_CAPABLE 0x1 /* loopback-ism acts as ISMv2 */
-#define SMC_LO_SUPPORT_NOCOPY 0x1
-#define SMC_DMA_ADDR_INVALID (~(dma_addr_t)0)
-
-static const char smc_lo_dev_name[] = "loopback-ism";
-static struct smc_lo_dev *lo_dev;
-
-static void smc_lo_generate_ids(struct smc_lo_dev *ldev)
-{
- struct smcd_gid *lgid = &ldev->local_gid;
- uuid_t uuid;
-
- uuid_gen(&uuid);
- memcpy(&lgid->gid, &uuid, sizeof(lgid->gid));
- memcpy(&lgid->gid_ext, (u8 *)&uuid + sizeof(lgid->gid),
- sizeof(lgid->gid_ext));
-
- ldev->chid = SMC_LO_RESERVED_CHID;
-}
-
-static int smc_lo_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid,
- u32 vid_valid, u32 vid)
-{
- struct smc_lo_dev *ldev = smcd->priv;
-
- /* rgid should be the same as lgid */
- if (!ldev || rgid->gid != ldev->local_gid.gid ||
- rgid->gid_ext != ldev->local_gid.gid_ext)
- return -ENETUNREACH;
- return 0;
-}
-
-static int smc_lo_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb,
- void *client_priv)
-{
- struct smc_lo_dmb_node *dmb_node, *tmp_node;
- struct smc_lo_dev *ldev = smcd->priv;
- struct folio *folio;
- int sba_idx, rc;
-
- /* check space for new dmb */
- for_each_clear_bit(sba_idx, ldev->sba_idx_mask, SMC_LO_MAX_DMBS) {
- if (!test_and_set_bit(sba_idx, ldev->sba_idx_mask))
- break;
- }
- if (sba_idx == SMC_LO_MAX_DMBS)
- return -ENOSPC;
-
- dmb_node = kzalloc(sizeof(*dmb_node), GFP_KERNEL);
- if (!dmb_node) {
- rc = -ENOMEM;
- goto err_bit;
- }
-
- dmb_node->sba_idx = sba_idx;
- dmb_node->len = dmb->dmb_len;
-
- /* not critical; fail under memory pressure and fallback to TCP */
- folio = folio_alloc(GFP_KERNEL | __GFP_NOWARN | __GFP_NOMEMALLOC |
- __GFP_NORETRY | __GFP_ZERO,
- get_order(dmb_node->len));
- if (!folio) {
- rc = -ENOMEM;
- goto err_node;
- }
- dmb_node->cpu_addr = folio_address(folio);
- dmb_node->dma_addr = SMC_DMA_ADDR_INVALID;
- refcount_set(&dmb_node->refcnt, 1);
-
-again:
- /* add new dmb into hash table */
- get_random_bytes(&dmb_node->token, sizeof(dmb_node->token));
- write_lock_bh(&ldev->dmb_ht_lock);
- hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_node->token) {
- if (tmp_node->token == dmb_node->token) {
- write_unlock_bh(&ldev->dmb_ht_lock);
- goto again;
- }
- }
- hash_add(ldev->dmb_ht, &dmb_node->list, dmb_node->token);
- write_unlock_bh(&ldev->dmb_ht_lock);
- atomic_inc(&ldev->dmb_cnt);
-
- dmb->sba_idx = dmb_node->sba_idx;
- dmb->dmb_tok = dmb_node->token;
- dmb->cpu_addr = dmb_node->cpu_addr;
- dmb->dma_addr = dmb_node->dma_addr;
- dmb->dmb_len = dmb_node->len;
-
- return 0;
-
-err_node:
- kfree(dmb_node);
-err_bit:
- clear_bit(sba_idx, ldev->sba_idx_mask);
- return rc;
-}
-
-static void __smc_lo_unregister_dmb(struct smc_lo_dev *ldev,
- struct smc_lo_dmb_node *dmb_node)
-{
- /* remove dmb from hash table */
- write_lock_bh(&ldev->dmb_ht_lock);
- hash_del(&dmb_node->list);
- write_unlock_bh(&ldev->dmb_ht_lock);
-
- clear_bit(dmb_node->sba_idx, ldev->sba_idx_mask);
- folio_put(virt_to_folio(dmb_node->cpu_addr));
- kfree(dmb_node);
-
- if (atomic_dec_and_test(&ldev->dmb_cnt))
- wake_up(&ldev->ldev_release);
-}
-
-static int smc_lo_unregister_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb)
-{
- struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node;
- struct smc_lo_dev *ldev = smcd->priv;
-
- /* find dmb from hash table */
- read_lock_bh(&ldev->dmb_ht_lock);
- hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) {
- if (tmp_node->token == dmb->dmb_tok) {
- dmb_node = tmp_node;
- break;
- }
- }
- if (!dmb_node) {
- read_unlock_bh(&ldev->dmb_ht_lock);
- return -EINVAL;
- }
- read_unlock_bh(&ldev->dmb_ht_lock);
-
- if (refcount_dec_and_test(&dmb_node->refcnt))
- __smc_lo_unregister_dmb(ldev, dmb_node);
- return 0;
-}
-
-static int smc_lo_support_dmb_nocopy(struct smcd_dev *smcd)
-{
- return SMC_LO_SUPPORT_NOCOPY;
-}
-
-static int smc_lo_attach_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb)
-{
- struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node;
- struct smc_lo_dev *ldev = smcd->priv;
-
- /* find dmb_node according to dmb->dmb_tok */
- read_lock_bh(&ldev->dmb_ht_lock);
- hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb->dmb_tok) {
- if (tmp_node->token == dmb->dmb_tok) {
- dmb_node = tmp_node;
- break;
- }
- }
- if (!dmb_node) {
- read_unlock_bh(&ldev->dmb_ht_lock);
- return -EINVAL;
- }
- read_unlock_bh(&ldev->dmb_ht_lock);
-
- if (!refcount_inc_not_zero(&dmb_node->refcnt))
- /* the dmb is being unregistered, but has
- * not been removed from the hash table.
- */
- return -EINVAL;
-
- /* provide dmb information */
- dmb->sba_idx = dmb_node->sba_idx;
- dmb->dmb_tok = dmb_node->token;
- dmb->cpu_addr = dmb_node->cpu_addr;
- dmb->dma_addr = dmb_node->dma_addr;
- dmb->dmb_len = dmb_node->len;
- return 0;
-}
-
-static int smc_lo_detach_dmb(struct smcd_dev *smcd, u64 token)
-{
- struct smc_lo_dmb_node *dmb_node = NULL, *tmp_node;
- struct smc_lo_dev *ldev = smcd->priv;
-
- /* find dmb_node according to dmb->dmb_tok */
- read_lock_bh(&ldev->dmb_ht_lock);
- hash_for_each_possible(ldev->dmb_ht, tmp_node, list, token) {
- if (tmp_node->token == token) {
- dmb_node = tmp_node;
- break;
- }
- }
- if (!dmb_node) {
- read_unlock_bh(&ldev->dmb_ht_lock);
- return -EINVAL;
- }
- read_unlock_bh(&ldev->dmb_ht_lock);
-
- if (refcount_dec_and_test(&dmb_node->refcnt))
- __smc_lo_unregister_dmb(ldev, dmb_node);
- return 0;
-}
-
-static int smc_lo_move_data(struct smcd_dev *smcd, u64 dmb_tok,
- unsigned int idx, bool sf, unsigned int offset,
- void *data, unsigned int size)
-{
- struct smc_lo_dmb_node *rmb_node = NULL, *tmp_node;
- struct smc_lo_dev *ldev = smcd->priv;
- struct smc_connection *conn;
-
- if (!sf)
- /* since sndbuf is merged with peer DMB, there is
- * no need to copy data from sndbuf to peer DMB.
- */
- return 0;
-
- read_lock_bh(&ldev->dmb_ht_lock);
- hash_for_each_possible(ldev->dmb_ht, tmp_node, list, dmb_tok) {
- if (tmp_node->token == dmb_tok) {
- rmb_node = tmp_node;
- break;
- }
- }
- if (!rmb_node) {
- read_unlock_bh(&ldev->dmb_ht_lock);
- return -EINVAL;
- }
- memcpy((char *)rmb_node->cpu_addr + offset, data, size);
- read_unlock_bh(&ldev->dmb_ht_lock);
-
- conn = smcd->conn[rmb_node->sba_idx];
- if (!conn || conn->killed)
- return -EPIPE;
- tasklet_schedule(&conn->rx_tsklet);
- return 0;
-}
-
-static void smc_lo_get_local_gid(struct smcd_dev *smcd,
- struct smcd_gid *smcd_gid)
-{
- struct smc_lo_dev *ldev = smcd->priv;
-
- smcd_gid->gid = ldev->local_gid.gid;
- smcd_gid->gid_ext = ldev->local_gid.gid_ext;
-}
-
-static u16 smc_lo_get_chid(struct smcd_dev *smcd)
-{
- return ((struct smc_lo_dev *)smcd->priv)->chid;
-}
-
-static struct device *smc_lo_get_dev(struct smcd_dev *smcd)
-{
- return &((struct smc_lo_dev *)smcd->priv)->dev;
-}
-
-static const struct smcd_ops lo_ops = {
- .query_remote_gid = smc_lo_query_rgid,
- .register_dmb = smc_lo_register_dmb,
- .unregister_dmb = smc_lo_unregister_dmb,
- .support_dmb_nocopy = smc_lo_support_dmb_nocopy,
- .attach_dmb = smc_lo_attach_dmb,
- .detach_dmb = smc_lo_detach_dmb,
- .add_vlan_id = NULL,
- .del_vlan_id = NULL,
- .set_vlan_required = NULL,
- .reset_vlan_required = NULL,
- .signal_event = NULL,
- .move_data = smc_lo_move_data,
- .get_local_gid = smc_lo_get_local_gid,
- .get_chid = smc_lo_get_chid,
- .get_dev = smc_lo_get_dev,
-};
-
-static struct smcd_dev *smcd_lo_alloc_dev(const struct smcd_ops *ops,
- int max_dmbs)
-{
- struct smcd_dev *smcd;
-
- smcd = kzalloc(sizeof(*smcd), GFP_KERNEL);
- if (!smcd)
- return NULL;
-
- smcd->conn = kcalloc(max_dmbs, sizeof(struct smc_connection *),
- GFP_KERNEL);
- if (!smcd->conn)
- goto out_smcd;
-
- smcd->ops = ops;
-
- spin_lock_init(&smcd->lock);
- spin_lock_init(&smcd->lgr_lock);
- INIT_LIST_HEAD(&smcd->vlan);
- INIT_LIST_HEAD(&smcd->lgr_list);
- init_waitqueue_head(&smcd->lgrs_deleted);
- return smcd;
-
-out_smcd:
- kfree(smcd);
- return NULL;
-}
-
-static int smcd_lo_register_dev(struct smc_lo_dev *ldev)
-{
- struct smcd_dev *smcd;
-
- smcd = smcd_lo_alloc_dev(&lo_ops, SMC_LO_MAX_DMBS);
- if (!smcd)
- return -ENOMEM;
- ldev->smcd = smcd;
- smcd->priv = ldev;
- smc_ism_set_v2_capable();
- mutex_lock(&smcd_dev_list.mutex);
- list_add(&smcd->list, &smcd_dev_list.list);
- mutex_unlock(&smcd_dev_list.mutex);
- pr_warn_ratelimited("smc: adding smcd device %s\n",
- dev_name(&ldev->dev));
- return 0;
-}
-
-static void smcd_lo_unregister_dev(struct smc_lo_dev *ldev)
-{
- struct smcd_dev *smcd = ldev->smcd;
-
- pr_warn_ratelimited("smc: removing smcd device %s\n",
- dev_name(&ldev->dev));
- smcd->going_away = 1;
- smc_smcd_terminate_all(smcd);
- mutex_lock(&smcd_dev_list.mutex);
- list_del_init(&smcd->list);
- mutex_unlock(&smcd_dev_list.mutex);
- kfree(smcd->conn);
- kfree(smcd);
-}
-
-static int smc_lo_dev_init(struct smc_lo_dev *ldev)
-{
- smc_lo_generate_ids(ldev);
- rwlock_init(&ldev->dmb_ht_lock);
- hash_init(ldev->dmb_ht);
- atomic_set(&ldev->dmb_cnt, 0);
- init_waitqueue_head(&ldev->ldev_release);
-
- return smcd_lo_register_dev(ldev);
-}
-
-static void smc_lo_dev_exit(struct smc_lo_dev *ldev)
-{
- smcd_lo_unregister_dev(ldev);
- if (atomic_read(&ldev->dmb_cnt))
- wait_event(ldev->ldev_release, !atomic_read(&ldev->dmb_cnt));
-}
-
-static void smc_lo_dev_release(struct device *dev)
-{
- struct smc_lo_dev *ldev =
- container_of(dev, struct smc_lo_dev, dev);
-
- kfree(ldev);
-}
-
-static int smc_lo_dev_probe(void)
-{
- struct smc_lo_dev *ldev;
- int ret;
-
- ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
- if (!ldev)
- return -ENOMEM;
-
- ldev->dev.parent = NULL;
- ldev->dev.release = smc_lo_dev_release;
- device_initialize(&ldev->dev);
- dev_set_name(&ldev->dev, smc_lo_dev_name);
-
- ret = smc_lo_dev_init(ldev);
- if (ret)
- goto free_dev;
-
- lo_dev = ldev; /* global loopback device */
- return 0;
-
-free_dev:
- put_device(&ldev->dev);
- return ret;
-}
-
-static void smc_lo_dev_remove(void)
-{
- if (!lo_dev)
- return;
-
- smc_lo_dev_exit(lo_dev);
- put_device(&lo_dev->dev); /* device_initialize in smc_lo_dev_probe */
-}
-
-int smc_loopback_init(void)
-{
- return smc_lo_dev_probe();
-}
-
-void smc_loopback_exit(void)
-{
- smc_lo_dev_remove();
-}
diff --git a/net/smc/smc_loopback.h b/net/smc/smc_loopback.h
deleted file mode 100644
index 04dc6808d2e1..000000000000
--- a/net/smc/smc_loopback.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Shared Memory Communications Direct over loopback-ism device.
- *
- * SMC-D loopback-ism device structure definitions.
- *
- * Copyright (c) 2024, Alibaba Inc.
- *
- * Author: Wen Gu <guwen@linux.alibaba.com>
- * Tony Lu <tonylu@linux.alibaba.com>
- *
- */
-
-#ifndef _SMC_LOOPBACK_H
-#define _SMC_LOOPBACK_H
-
-#include <linux/device.h>
-#include <net/smc.h>
-
-#if IS_ENABLED(CONFIG_SMC_LO)
-#define SMC_LO_MAX_DMBS 5000
-#define SMC_LO_DMBS_HASH_BITS 12
-#define SMC_LO_RESERVED_CHID 0xFFFF
-
-struct smc_lo_dmb_node {
- struct hlist_node list;
- u64 token;
- u32 len;
- u32 sba_idx;
- void *cpu_addr;
- dma_addr_t dma_addr;
- refcount_t refcnt;
-};
-
-struct smc_lo_dev {
- struct smcd_dev *smcd;
- struct device dev;
- u16 chid;
- struct smcd_gid local_gid;
- atomic_t dmb_cnt;
- rwlock_t dmb_ht_lock;
- DECLARE_BITMAP(sba_idx_mask, SMC_LO_MAX_DMBS);
- DECLARE_HASHTABLE(dmb_ht, SMC_LO_DMBS_HASH_BITS);
- wait_queue_head_t ldev_release;
-};
-
-int smc_loopback_init(void);
-void smc_loopback_exit(void);
-#else
-static inline int smc_loopback_init(void)
-{
- return 0;
-}
-
-static inline void smc_loopback_exit(void)
-{
-}
-#endif
-
-#endif /* _SMC_LOOPBACK_H */
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 76ad29e31d60..a3a1e1fde8eb 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -169,7 +169,7 @@ static int smc_pnet_remove_by_pnetid(struct net *net, char *pnet_name)
pr_warn_ratelimited("smc: smcd device %s "
"erased user defined pnetid "
"%.16s\n",
- dev_name(smcd->ops->get_dev(smcd)),
+ dev_name(&smcd->dibs->dev),
smcd->pnetid);
memset(smcd->pnetid, 0, SMC_MAX_PNETID_LEN);
smcd->pnetid_by_user = false;
@@ -332,8 +332,11 @@ static struct smcd_dev *smc_pnet_find_smcd(char *smcd_name)
mutex_lock(&smcd_dev_list.mutex);
list_for_each_entry(smcd_dev, &smcd_dev_list.list, list) {
- if (!strncmp(dev_name(smcd_dev->ops->get_dev(smcd_dev)),
- smcd_name, IB_DEVICE_NAME_MAX - 1))
+ if (!strncmp(dev_name(&smcd_dev->dibs->dev), smcd_name,
+ IB_DEVICE_NAME_MAX - 1) ||
+ (smcd_dev->dibs->dev.parent &&
+ !strncmp(dev_name(smcd_dev->dibs->dev.parent), smcd_name,
+ IB_DEVICE_NAME_MAX - 1)))
goto out;
}
smcd_dev = NULL;
@@ -413,7 +416,6 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name,
bool smcddev_applied = true;
bool ibdev_applied = true;
struct smcd_dev *smcd;
- struct device *dev;
bool new_ibdev;
/* try to apply the pnetid to active devices */
@@ -431,10 +433,8 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name,
if (smcd) {
smcddev_applied = smc_pnet_apply_smcd(smcd, pnet_name);
if (smcddev_applied) {
- dev = smcd->ops->get_dev(smcd);
- pr_warn_ratelimited("smc: smcd device %s "
- "applied user defined pnetid "
- "%.16s\n", dev_name(dev),
+ pr_warn_ratelimited("smc: smcd device %s applied user defined pnetid %.16s\n",
+ dev_name(&smcd->dibs->dev),
smcd->pnetid);
}
}
@@ -450,7 +450,7 @@ static int smc_pnet_add_ib(struct smc_pnettable *pnettable, char *ib_name,
return -ENOMEM;
new_pe->type = SMC_PNET_IB;
memcpy(new_pe->pnet_name, pnet_name, SMC_MAX_PNETID_LEN);
- strncpy(new_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX);
+ strscpy(new_pe->ib_name, ib_name);
new_pe->ib_port = ib_port;
new_ibdev = true;
@@ -1126,37 +1126,38 @@ static void smc_pnet_find_ism_by_pnetid(struct net_device *ndev,
*/
void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini)
{
- struct dst_entry *dst = sk_dst_get(sk);
-
- if (!dst)
- goto out;
- if (!dst->dev)
- goto out_rel;
+ struct net_device *dev;
+ struct dst_entry *dst;
- smc_pnet_find_roce_by_pnetid(dst->dev, ini);
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ dev = dst ? dst_dev_rcu(dst) : NULL;
+ dev_hold(dev);
+ rcu_read_unlock();
-out_rel:
- dst_release(dst);
-out:
- return;
+ if (dev) {
+ smc_pnet_find_roce_by_pnetid(dev, ini);
+ dev_put(dev);
+ }
}
void smc_pnet_find_ism_resource(struct sock *sk, struct smc_init_info *ini)
{
- struct dst_entry *dst = sk_dst_get(sk);
+ struct net_device *dev;
+ struct dst_entry *dst;
ini->ism_dev[0] = NULL;
- if (!dst)
- goto out;
- if (!dst->dev)
- goto out_rel;
- smc_pnet_find_ism_by_pnetid(dst->dev, ini);
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ dev = dst ? dst_dev_rcu(dst) : NULL;
+ dev_hold(dev);
+ rcu_read_unlock();
-out_rel:
- dst_release(dst);
-out:
- return;
+ if (dev) {
+ smc_pnet_find_ism_by_pnetid(dev, ini);
+ dev_put(dev);
+ }
}
/* Lookup and apply a pnet table entry to the given ib device.
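
Both find_*_resource() helpers above now follow the same lock-free shape: resolve the cached route under rcu_read_lock(), map it to a device with the new dst_dev_rcu(), and pin only the netdev with dev_hold() before leaving the critical section. That avoids the dst refcount round-trip that sk_dst_get()/dst_release() implied; the tls_device.c hunk further down uses the identical idiom. Condensed to its essentials (kernel context assumed, not a standalone program):

    /* Sketch: borrow the sock's dst under RCU, keep only the netdev. */
    static struct net_device *sk_dev_get(struct sock *sk)
    {
        struct net_device *dev;
        struct dst_entry *dst;

        rcu_read_lock();
        dst = __sk_dst_get(sk);            /* no dst refcount taken */
        dev = dst ? dst_dev_rcu(dst) : NULL;
        dev_hold(dev);                     /* NULL-safe; pins the device */
        rcu_read_unlock();

        return dev;                        /* caller must dev_put() */
    }
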
@@ -1192,7 +1193,6 @@ int smc_pnetid_by_table_ib(struct smc_ib_device *smcibdev, u8 ib_port)
*/
int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev)
{
- const char *ib_name = dev_name(smcddev->ops->get_dev(smcddev));
struct smc_pnettable *pnettable;
struct smc_pnetentry *tmp_pe;
struct smc_net *sn;
@@ -1205,7 +1205,13 @@ int smc_pnetid_by_table_smcd(struct smcd_dev *smcddev)
mutex_lock(&pnettable->lock);
list_for_each_entry(tmp_pe, &pnettable->pnetlist, list) {
if (tmp_pe->type == SMC_PNET_IB &&
- !strncmp(tmp_pe->ib_name, ib_name, IB_DEVICE_NAME_MAX)) {
+ (!strncmp(tmp_pe->ib_name,
+ dev_name(&smcddev->dibs->dev),
+ sizeof(tmp_pe->ib_name)) ||
+ (smcddev->dibs->dev.parent &&
+ !strncmp(tmp_pe->ib_name,
+ dev_name(smcddev->dibs->dev.parent),
+ sizeof(tmp_pe->ib_name))))) {
smc_pnet_apply_smcd(smcddev, tmp_pe->pnet_name);
rc = 0;
break;
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index 214ac3cbcf9a..3144b4b1fe29 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -426,6 +426,9 @@ static int smcd_tx_rdma_writes(struct smc_connection *conn, size_t len,
int srcchunk, dstchunk;
int rc;
+ if (conn->sndbuf_desc->is_attached)
+ return 0;
+
for (dstchunk = 0; dstchunk < 2; dstchunk++) {
for (srcchunk = 0; srcchunk < 2; srcchunk++) {
void *data = conn->sndbuf_desc->cpu_addr + src_off;
diff --git a/net/socket.c b/net/socket.c
index bac335ecee4c..e8892b218708 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -276,28 +276,41 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *k
static int move_addr_to_user(struct sockaddr_storage *kaddr, int klen,
void __user *uaddr, int __user *ulen)
{
- int err;
int len;
BUG_ON(klen > sizeof(struct sockaddr_storage));
- err = get_user(len, ulen);
- if (err)
- return err;
+
+ if (can_do_masked_user_access())
+ ulen = masked_user_access_begin(ulen);
+ else if (!user_access_begin(ulen, 4))
+ return -EFAULT;
+
+ unsafe_get_user(len, ulen, efault_end);
+
if (len > klen)
len = klen;
- if (len < 0)
- return -EINVAL;
+ /*
+ * "fromlen shall refer to the value before truncation.."
+ * 1003.1g
+ */
+ if (len >= 0)
+ unsafe_put_user(klen, ulen, efault_end);
+
+ user_access_end();
+
if (len) {
+ if (len < 0)
+ return -EINVAL;
if (audit_sockaddr(klen, kaddr))
return -ENOMEM;
if (copy_to_user(uaddr, kaddr, len))
return -EFAULT;
}
- /*
- * "fromlen shall refer to the value before truncation.."
- * 1003.1g
- */
- return __put_user(klen, ulen);
+ return 0;
+
+efault_end:
+ user_access_end();
+ return -EFAULT;
}
static struct kmem_cache *sock_inode_cachep __ro_after_init;
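
The rewritten move_addr_to_user() trades get_user()/__put_user() for the user_access_begin() family: a single user-access window covers both the read of *ulen and the write-back of klen, with one efault_end label for the fault path. The shape of the idiom, reduced to a minimal sketch (kernel context assumed):

    if (can_do_masked_user_access())
        uptr = masked_user_access_begin(uptr);  /* mask, skip access_ok() */
    else if (!user_access_begin(uptr, sizeof(*uptr)))
        return -EFAULT;

    unsafe_get_user(val, uptr, efault_end);     /* jumps to label on fault */
    unsafe_put_user(newval, uptr, efault_end);
    user_access_end();
    return 0;

    efault_end:
    user_access_end();                          /* window must still close */
    return -EFAULT;
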
diff --git a/net/tipc/addr.c b/net/tipc/addr.c
index fd0796269eed..6f5c54cbf8d9 100644
--- a/net/tipc/addr.c
+++ b/net/tipc/addr.c
@@ -79,7 +79,7 @@ void tipc_set_node_addr(struct net *net, u32 addr)
pr_info("Node number set to %u\n", addr);
}
-char *tipc_nodeid2string(char *str, u8 *id)
+int tipc_nodeid2string(char *str, u8 *id)
{
int i;
u8 c;
@@ -109,7 +109,7 @@ char *tipc_nodeid2string(char *str, u8 *id)
if (i == NODE_ID_LEN) {
memcpy(str, id, NODE_ID_LEN);
str[NODE_ID_LEN] = 0;
- return str;
+ return i;
}
/* Translate to hex string */
@@ -120,5 +120,5 @@ char *tipc_nodeid2string(char *str, u8 *id)
for (i = NODE_ID_STR_LEN - 2; str[i] == '0'; i--)
str[i] = 0;
- return str;
+ return i + 1;
}
diff --git a/net/tipc/addr.h b/net/tipc/addr.h
index 93f82398283d..a113cf7e1f89 100644
--- a/net/tipc/addr.h
+++ b/net/tipc/addr.h
@@ -130,6 +130,6 @@ static inline int in_own_node(struct net *net, u32 addr)
bool tipc_in_scope(bool legacy_format, u32 domain, u32 addr);
void tipc_set_node_id(struct net *net, u8 *id);
void tipc_set_node_addr(struct net *net, u32 addr);
-char *tipc_nodeid2string(char *str, u8 *id);
+int tipc_nodeid2string(char *str, u8 *id);
#endif
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 3ee44d731700..931f55f781a1 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -495,11 +495,9 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
/* Set link name for unicast links only */
if (peer_id) {
- tipc_nodeid2string(self_str, tipc_own_id(net));
- if (strlen(self_str) > 16)
+ if (tipc_nodeid2string(self_str, tipc_own_id(net)) > NODE_ID_LEN)
sprintf(self_str, "%x", self);
- tipc_nodeid2string(peer_str, peer_id);
- if (strlen(peer_str) > 16)
+ if (tipc_nodeid2string(peer_str, peer_id) > NODE_ID_LEN)
sprintf(peer_str, "%x", peer);
}
/* Peer i/f name will be completed by reset/activate message */
@@ -570,8 +568,7 @@ bool tipc_link_bc_create(struct net *net, u32 ownnode, u32 peer, u8 *peer_id,
if (peer_id) {
char peer_str[NODE_ID_STR_LEN] = {0,};
- tipc_nodeid2string(peer_str, peer_id);
- if (strlen(peer_str) > 16)
+ if (tipc_nodeid2string(peer_str, peer_id) > NODE_ID_LEN)
sprintf(peer_str, "%x", peer);
/* Broadcast receiver link name: "broadcast-link:<peer>" */
snprintf(l->name, sizeof(l->name), "%s:%s", tipc_bclink_name,
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index e028bf658499..1574a83384f8 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -2366,7 +2366,7 @@ static void tipc_sk_filter_rcv(struct sock *sk, struct sk_buff *skb,
else if (sk_rmem_alloc_get(sk) + skb->truesize >= limit) {
trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL,
"err_overload2!");
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
err = TIPC_ERR_OVERLOAD;
}
@@ -2458,7 +2458,7 @@ static void tipc_sk_enqueue(struct sk_buff_head *inputq, struct sock *sk,
trace_tipc_sk_dump(sk, skb, TIPC_DUMP_ALL, "err_overload!");
/* Overload => reject message back to sender */
onode = tipc_own_addr(sock_net(sk));
- atomic_inc(&sk->sk_drops);
+ sk_drops_inc(sk);
if (tipc_msg_reverse(onode, &skb, TIPC_ERR_OVERLOAD)) {
trace_tipc_sk_rej_msg(sk, skb, TIPC_DUMP_ALL,
"@sk_enqueue!");
@@ -3657,7 +3657,7 @@ int tipc_sk_fill_sock_diag(struct sk_buff *skb, struct netlink_callback *cb,
nla_put_u32(skb, TIPC_NLA_SOCK_STAT_SENDQ,
skb_queue_len(&sk->sk_write_queue)) ||
nla_put_u32(skb, TIPC_NLA_SOCK_STAT_DROP,
- atomic_read(&sk->sk_drops)))
+ sk_drops_read(sk)))
goto stat_msg_cancel;
if (tsk->cong_link_cnt &&
diff --git a/net/tls/tls.h b/net/tls/tls.h
index e4c42731ce39..2f86baeb71fc 100644
--- a/net/tls/tls.h
+++ b/net/tls/tls.h
@@ -128,8 +128,9 @@ struct tls_rec {
char aad_space[TLS_AAD_SPACE_SIZE];
u8 iv_data[TLS_MAX_IV_SIZE];
+
+	/* Must be last -- ends in a flexible-array member. */
struct aead_request aead_req;
- u8 aead_req_ctx[];
};
int __net_init tls_proc_init(struct net *net);
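
struct aead_request already ends in a flexible __ctx[] member, and a struct ending in a flexible array must itself be the last field, so the extra u8 aead_req_ctx[] tail was both redundant and a flexible-array-in-the-middle violation. Dropping it makes aead_req last and keeps the over-allocation scheme intact; the sizing pattern is unchanged (sketch, following what tls_sw's record allocation does):

    /* One allocation covers tls_rec plus the AEAD implementation's
     * per-request context stored in aead_req's trailing __ctx[]. */
    rec = kzalloc(sizeof(*rec) + crypto_aead_reqsize(aead), GFP_KERNEL);
    if (!rec)
        return NULL;
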
diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
index f672a62a9a52..a64ae15b1a60 100644
--- a/net/tls/tls_device.c
+++ b/net/tls/tls_device.c
@@ -123,17 +123,19 @@ static void tls_device_queue_ctx_destruction(struct tls_context *ctx)
/* We assume that the socket is already connected */
static struct net_device *get_netdev_for_sock(struct sock *sk)
{
- struct dst_entry *dst = sk_dst_get(sk);
- struct net_device *netdev = NULL;
+ struct net_device *dev, *lowest_dev = NULL;
+ struct dst_entry *dst;
- if (likely(dst)) {
- netdev = netdev_sk_get_lowest_dev(dst->dev, sk);
- dev_hold(netdev);
+ rcu_read_lock();
+ dst = __sk_dst_get(sk);
+ dev = dst ? dst_dev_rcu(dst) : NULL;
+ if (likely(dev)) {
+ lowest_dev = netdev_sk_get_lowest_dev(dev, sk);
+ dev_hold(lowest_dev);
}
+ rcu_read_unlock();
- dst_release(dst);
-
- return netdev;
+ return lowest_dev;
}
static void destroy_record(struct tls_record_info *record)
@@ -1410,7 +1412,7 @@ int __init tls_device_init(void)
if (!dummy_page)
return -ENOMEM;
- destruct_wq = alloc_workqueue("ktls_device_destruct", 0, 0);
+ destruct_wq = alloc_workqueue("ktls_device_destruct", WQ_PERCPU, 0);
if (!destruct_wq) {
err = -ENOMEM;
goto err_free_dummy;
diff --git a/net/tls/tls_proc.c b/net/tls/tls_proc.c
index 367666aa07b8..4012c4372d4c 100644
--- a/net/tls/tls_proc.c
+++ b/net/tls/tls_proc.c
@@ -27,17 +27,19 @@ static const struct snmp_mib tls_mib_list[] = {
SNMP_MIB_ITEM("TlsTxRekeyOk", LINUX_MIB_TLSTXREKEYOK),
SNMP_MIB_ITEM("TlsTxRekeyError", LINUX_MIB_TLSTXREKEYERROR),
SNMP_MIB_ITEM("TlsRxRekeyReceived", LINUX_MIB_TLSRXREKEYRECEIVED),
- SNMP_MIB_SENTINEL
};
static int tls_statistics_seq_show(struct seq_file *seq, void *v)
{
- unsigned long buf[LINUX_MIB_TLSMAX] = {};
+ unsigned long buf[ARRAY_SIZE(tls_mib_list)];
+ const int cnt = ARRAY_SIZE(tls_mib_list);
struct net *net = seq->private;
int i;
- snmp_get_cpu_field_batch(buf, tls_mib_list, net->mib.tls_statistics);
- for (i = 0; tls_mib_list[i].name; i++)
+ memset(buf, 0, sizeof(buf));
+ snmp_get_cpu_field_batch_cnt(buf, tls_mib_list, cnt,
+ net->mib.tls_statistics);
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-32s\t%lu\n", tls_mib_list[i].name, buf[i]);
return 0;
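
With SNMP_MIB_SENTINEL gone, the list length comes from ARRAY_SIZE() and the counted batch helper instead of scanning for a NULL name, and the on-stack buffer shrinks from LINUX_MIB_TLSMAX to exactly the number of counters. The counted-array pattern in plain C:

    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    struct mib_item { const char *name; long value; };

    static const struct mib_item mibs[] = {
        { "TlsCurrTxSw", 1 },
        { "TlsCurrRxSw", 2 },
        /* no sentinel entry needed */
    };

    int main(void)
    {
        /* The compiler-known element count bounds the loop. */
        for (size_t i = 0; i < ARRAY_SIZE(mibs); i++)
            printf("%-32s\t%ld\n", mibs[i].name, mibs[i].value);
        return 0;
    }
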
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index 01e2b9452c75..684ab03137b6 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -592,7 +592,7 @@ static DECLARE_WORK(unix_gc_work, __unix_gc);
void unix_gc(void)
{
WRITE_ONCE(gc_in_progress, true);
- queue_work(system_unbound_wq, &unix_gc_work);
+ queue_work(system_dfl_wq, &unix_gc_work);
}
#define UNIX_INFLIGHT_TRIGGER_GC 16000
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index bebb355f3ffe..4c2db6cca557 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1029,12 +1029,7 @@ static int vsock_getname(struct socket *sock,
vm_addr = &vsk->local_addr;
}
- /* sys_getsockname() and sys_getpeername() pass us a
- * MAX_SOCK_ADDR-sized buffer and don't set addr_len. Unfortunately
- * that macro is defined in socket.c instead of .h, so we hardcode its
- * value here.
- */
- BUILD_BUG_ON(sizeof(*vm_addr) > 128);
+ BUILD_BUG_ON(sizeof(*vm_addr) > sizeof(struct sockaddr_storage));
memcpy(addr, vm_addr, sizeof(*vm_addr));
err = sizeof(*vm_addr);
@@ -1654,7 +1649,7 @@ static int vsock_connect(struct socket *sock, struct sockaddr *addr,
* reschedule it, then ungrab the socket refcount to
* keep it balanced.
*/
- if (mod_delayed_work(system_wq, &vsk->connect_work,
+ if (mod_delayed_work(system_percpu_wq, &vsk->connect_work,
timeout))
sock_put(sk);
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index b6569b0ca2bb..8c867023a2e5 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -926,7 +926,7 @@ static int __init virtio_vsock_init(void)
{
int ret;
- virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", 0, 0);
+ virtio_vsock_workqueue = alloc_workqueue("virtio_vsock", WQ_PERCPU, 0);
if (!virtio_vsock_workqueue)
return -ENOMEM;
diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
index 6e78927a598e..bc2ff918b315 100644
--- a/net/vmw_vsock/vsock_loopback.c
+++ b/net/vmw_vsock/vsock_loopback.c
@@ -139,7 +139,7 @@ static int __init vsock_loopback_init(void)
struct vsock_loopback *vsock = &the_vsock_loopback;
int ret;
- vsock->workqueue = alloc_workqueue("vsock-loopback", 0, 0);
+ vsock->workqueue = alloc_workqueue("vsock-loopback", WQ_PERCPU, 0);
if (!vsock->workqueue)
return -ENOMEM;
diff --git a/net/wireless/chan.c b/net/wireless/chan.c
index 193734b7f9dc..68221b1ab45e 100644
--- a/net/wireless/chan.c
+++ b/net/wireless/chan.c
@@ -100,6 +100,11 @@ static u32 cfg80211_get_end_freq(const struct cfg80211_chan_def *chandef,
punctured = 0) : (punctured >>= 1))) \
if (!(punctured & 1))
+#define for_each_s1g_subchan(chandef, freq_khz) \
+ for (freq_khz = cfg80211_s1g_get_start_freq_khz(chandef); \
+ freq_khz <= cfg80211_s1g_get_end_freq_khz(chandef); \
+ freq_khz += MHZ_TO_KHZ(1))
+
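
The new iterator visits every 1 MHz S1G subchannel from the chandef's start frequency to its end frequency, both in kHz. Typical use, condensed from cfg80211_s1g_usable() added later in this patch:

    u32 freq_khz;
    const struct ieee80211_channel *chan;

    /* Reject the chandef if any 1 MHz slice is unknown or disabled. */
    for_each_s1g_subchan(chandef, freq_khz) {
        chan = ieee80211_get_channel_khz(wiphy, freq_khz);
        if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED))
            return false;
    }
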
struct cfg80211_per_bw_puncturing_values {
u8 len;
const u16 *valid_values;
@@ -336,8 +341,7 @@ static bool cfg80211_valid_center_freq(u32 center,
bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
{
- u32 control_freq, oper_freq;
- int oper_width, control_width;
+ u32 control_freq, control_freq_khz, start_khz, end_khz;
if (!chandef->chan)
return false;
@@ -363,27 +367,16 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
case NL80211_CHAN_WIDTH_4:
case NL80211_CHAN_WIDTH_8:
case NL80211_CHAN_WIDTH_16:
- if (chandef->chan->band != NL80211_BAND_S1GHZ)
- return false;
-
- control_freq = ieee80211_channel_to_khz(chandef->chan);
- oper_freq = ieee80211_chandef_to_khz(chandef);
- control_width = nl80211_chan_width_to_mhz(
- ieee80211_s1g_channel_width(
- chandef->chan));
- oper_width = cfg80211_chandef_get_width(chandef);
-
- if (oper_width < 0 || control_width < 0)
+ if (!cfg80211_chandef_is_s1g(chandef))
return false;
if (chandef->center_freq2)
return false;
- if (control_freq + MHZ_TO_KHZ(control_width) / 2 >
- oper_freq + MHZ_TO_KHZ(oper_width) / 2)
- return false;
+ control_freq_khz = ieee80211_channel_to_khz(chandef->chan);
+ start_khz = cfg80211_s1g_get_start_freq_khz(chandef);
+ end_khz = cfg80211_s1g_get_end_freq_khz(chandef);
- if (control_freq - MHZ_TO_KHZ(control_width) / 2 <
- oper_freq - MHZ_TO_KHZ(oper_width) / 2)
+ if (control_freq_khz < start_khz || control_freq_khz > end_khz)
return false;
break;
case NL80211_CHAN_WIDTH_80P80:
@@ -461,6 +454,9 @@ bool cfg80211_chandef_valid(const struct cfg80211_chan_def *chandef)
!cfg80211_edmg_chandef_valid(chandef))
return false;
+ if (!cfg80211_chandef_is_s1g(chandef) && chandef->s1g_primary_2mhz)
+ return false;
+
return valid_puncturing_bitmap(chandef);
}
EXPORT_SYMBOL(cfg80211_chandef_valid);
@@ -725,6 +721,10 @@ static int cfg80211_get_chans_dfs_required(struct wiphy *wiphy,
{
struct ieee80211_channel *c;
+ /* DFS is not required for S1G */
+ if (cfg80211_chandef_is_s1g(chandef))
+ return 0;
+
for_each_subchan(chandef, freq, cf) {
c = ieee80211_get_channel_khz(wiphy, freq);
if (!c)
@@ -1130,6 +1130,55 @@ static bool cfg80211_edmg_usable(struct wiphy *wiphy, u8 edmg_channels,
return true;
}
+static bool cfg80211_s1g_usable(struct wiphy *wiphy,
+ const struct cfg80211_chan_def *chandef)
+{
+ u32 freq_khz;
+ const struct ieee80211_channel *chan;
+ u32 pri_khz = ieee80211_channel_to_khz(chandef->chan);
+ u32 end_khz = cfg80211_s1g_get_end_freq_khz(chandef);
+ u32 start_khz = cfg80211_s1g_get_start_freq_khz(chandef);
+ int width_mhz = cfg80211_chandef_get_width(chandef);
+ u32 prohibited_flags = IEEE80211_CHAN_DISABLED;
+
+ if (width_mhz >= 16)
+ prohibited_flags |= IEEE80211_CHAN_NO_16MHZ;
+ if (width_mhz >= 8)
+ prohibited_flags |= IEEE80211_CHAN_NO_8MHZ;
+ if (width_mhz >= 4)
+ prohibited_flags |= IEEE80211_CHAN_NO_4MHZ;
+
+ if (chandef->chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY)
+ return false;
+
+ if (pri_khz < start_khz || pri_khz > end_khz)
+ return false;
+
+ for_each_s1g_subchan(chandef, freq_khz) {
+ chan = ieee80211_get_channel_khz(wiphy, freq_khz);
+ if (!chan || (chan->flags & prohibited_flags))
+ return false;
+ }
+
+ if (chandef->s1g_primary_2mhz) {
+ u32 sib_khz;
+ const struct ieee80211_channel *sibling;
+
+ sibling = cfg80211_s1g_get_primary_sibling(wiphy, chandef);
+ if (!sibling)
+ return false;
+
+ if (sibling->flags & IEEE80211_CHAN_S1G_NO_PRIMARY)
+ return false;
+
+ sib_khz = ieee80211_channel_to_khz(sibling);
+ if (sib_khz < start_khz || sib_khz > end_khz)
+ return false;
+ }
+
+ return true;
+}
+
bool _cfg80211_chandef_usable(struct wiphy *wiphy,
const struct cfg80211_chan_def *chandef,
u32 prohibited_flags,
@@ -1154,6 +1203,9 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy,
ext_nss_cap = __le16_to_cpu(vht_cap->vht_mcs.tx_highest) &
IEEE80211_VHT_EXT_NSS_BW_CAPABLE;
+ if (cfg80211_chandef_is_s1g(chandef))
+ return cfg80211_s1g_usable(wiphy, chandef);
+
if (edmg_cap->channels &&
!cfg80211_edmg_usable(wiphy,
chandef->edmg.channels,
@@ -1165,21 +1217,6 @@ bool _cfg80211_chandef_usable(struct wiphy *wiphy,
control_freq = chandef->chan->center_freq;
switch (chandef->width) {
- case NL80211_CHAN_WIDTH_1:
- width = 1;
- break;
- case NL80211_CHAN_WIDTH_2:
- width = 2;
- break;
- case NL80211_CHAN_WIDTH_4:
- width = 4;
- break;
- case NL80211_CHAN_WIDTH_8:
- width = 8;
- break;
- case NL80211_CHAN_WIDTH_16:
- width = 16;
- break;
case NL80211_CHAN_WIDTH_5:
width = 5;
break;
diff --git a/net/wireless/core.c b/net/wireless/core.c
index a7e2931ffb2e..797f9f2004a6 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -1018,6 +1018,15 @@ int wiphy_register(struct wiphy *wiphy)
rdev->wiphy.features |= NL80211_FEATURE_SCAN_FLUSH;
+ if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW)
+ rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_CTWIN;
+ else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN)
+ rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_CTWINDOW;
+ if (rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS)
+ rdev->wiphy.features |= NL80211_FEATURE_P2P_GO_OPPPS;
+ else if (rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS)
+ rdev->wiphy.bss_param_support |= WIPHY_BSS_PARAM_P2P_OPPPS;
+
rtnl_lock();
wiphy_lock(&rdev->wiphy);
res = device_add(&rdev->wiphy.dev);
diff --git a/net/wireless/ethtool.c b/net/wireless/ethtool.c
index 2613d6ac0fda..46e4317cbd7e 100644
--- a/net/wireless/ethtool.c
+++ b/net/wireless/ethtool.c
@@ -23,7 +23,7 @@ void cfg80211_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
else
strscpy(info->fw_version, "N/A", sizeof(info->fw_version));
- strscpy(info->bus_info, dev_name(wiphy_dev(wdev->wiphy)),
+ strscpy(info->bus_info, dev_name(pdev),
sizeof(info->bus_info));
}
EXPORT_SYMBOL(cfg80211_get_drvinfo);
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 852573423e52..346dfd2bd987 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -312,6 +312,26 @@ static int validate_supported_selectors(const struct nlattr *attr,
return 0;
}
+static int validate_nan_cluster_id(const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const u8 *data = nla_data(attr);
+ unsigned int len = nla_len(attr);
+ static const u8 cluster_id_prefix[4] = {0x50, 0x6f, 0x9a, 0x1};
+
+ if (len != ETH_ALEN) {
+ NL_SET_ERR_MSG_ATTR(extack, attr, "bad cluster id length");
+ return -EINVAL;
+ }
+
+ if (memcmp(data, cluster_id_prefix, sizeof(cluster_id_prefix))) {
+ NL_SET_ERR_MSG_ATTR(extack, attr, "invalid cluster id prefix");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/* policy for the attributes */
static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR];
@@ -411,6 +431,14 @@ static const struct nla_policy nl80211_txattr_policy[NL80211_TXRATE_MAX + 1] = {
[NL80211_TXRATE_HE_LTF] = NLA_POLICY_RANGE(NLA_U8,
NL80211_RATE_INFO_HE_1XLTF,
NL80211_RATE_INFO_HE_4XLTF),
+ [NL80211_TXRATE_EHT] = NLA_POLICY_EXACT_LEN(sizeof(struct nl80211_txrate_eht)),
+ [NL80211_TXRATE_EHT_GI] = NLA_POLICY_RANGE(NLA_U8,
+ NL80211_RATE_INFO_EHT_GI_0_8,
+ NL80211_RATE_INFO_EHT_GI_3_2),
+ [NL80211_TXRATE_EHT_LTF] = NLA_POLICY_RANGE(NLA_U8,
+ NL80211_RATE_INFO_EHT_1XLTF,
+ NL80211_RATE_INFO_EHT_8XLTF),
};
static const struct nla_policy
@@ -492,6 +520,36 @@ nl80211_s1g_short_beacon[NL80211_S1G_SHORT_BEACON_ATTR_MAX + 1] = {
IEEE80211_MAX_DATA_LEN),
};
+static const struct nla_policy
+nl80211_nan_band_conf_policy[NL80211_NAN_BAND_CONF_ATTR_MAX + 1] = {
+ [NL80211_NAN_BAND_CONF_BAND] = NLA_POLICY_MAX(NLA_U8,
+ NUM_NL80211_BANDS - 1),
+ [NL80211_NAN_BAND_CONF_FREQ] = { .type = NLA_U16 },
+ [NL80211_NAN_BAND_CONF_RSSI_CLOSE] = NLA_POLICY_MIN(NLA_S8, -59),
+ [NL80211_NAN_BAND_CONF_RSSI_MIDDLE] = NLA_POLICY_MIN(NLA_S8, -74),
+ [NL80211_NAN_BAND_CONF_WAKE_DW] = NLA_POLICY_MAX(NLA_U8, 5),
+ [NL80211_NAN_BAND_CONF_DISABLE_SCAN] = { .type = NLA_FLAG },
+};
+
+static const struct nla_policy
+nl80211_nan_conf_policy[NL80211_NAN_CONF_ATTR_MAX + 1] = {
+ [NL80211_NAN_CONF_CLUSTER_ID] =
+ NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_nan_cluster_id,
+ ETH_ALEN),
+ [NL80211_NAN_CONF_EXTRA_ATTRS] = { .type = NLA_BINARY,
+ .len = IEEE80211_MAX_DATA_LEN},
+ [NL80211_NAN_CONF_VENDOR_ELEMS] =
+ NLA_POLICY_VALIDATE_FN(NLA_BINARY, validate_ie_attr,
+ IEEE80211_MAX_DATA_LEN),
+ [NL80211_NAN_CONF_BAND_CONFIGS] =
+ NLA_POLICY_NESTED_ARRAY(nl80211_nan_band_conf_policy),
+ [NL80211_NAN_CONF_SCAN_PERIOD] = { .type = NLA_U16 },
+ [NL80211_NAN_CONF_SCAN_DWELL_TIME] = NLA_POLICY_RANGE(NLA_U16, 50, 512),
+ [NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL] =
+ NLA_POLICY_RANGE(NLA_U8, 50, 200),
+ [NL80211_NAN_CONF_NOTIFY_DW] = { .type = NLA_FLAG },
+};
+
static const struct netlink_range_validation nl80211_punct_bitmap_range = {
.min = 0,
.max = 0xffff,
@@ -761,6 +819,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR] = NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN),
[NL80211_ATTR_NAN_MASTER_PREF] = NLA_POLICY_MIN(NLA_U8, 1),
[NL80211_ATTR_BANDS] = { .type = NLA_U32 },
+ [NL80211_ATTR_NAN_CONFIG] = NLA_POLICY_NESTED(nl80211_nan_conf_policy),
[NL80211_ATTR_NAN_FUNC] = { .type = NLA_NESTED },
[NL80211_ATTR_FILS_KEK] = { .type = NLA_BINARY,
.len = FILS_MAX_KEK_LEN },
@@ -871,6 +930,8 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_S1G_LONG_BEACON_PERIOD] = NLA_POLICY_MIN(NLA_U8, 2),
[NL80211_ATTR_S1G_SHORT_BEACON] =
NLA_POLICY_NESTED(nl80211_s1g_short_beacon),
+ [NL80211_ATTR_BSS_PARAM] = { .type = NLA_FLAG },
+ [NL80211_ATTR_S1G_PRIMARY_2MHZ] = { .type = NLA_FLAG },
};
/* policy for the key attributes */
@@ -1219,21 +1280,6 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
if ((chan->flags & IEEE80211_CHAN_NO_HE) &&
nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_HE))
goto nla_put_failure;
- if ((chan->flags & IEEE80211_CHAN_1MHZ) &&
- nla_put_flag(msg, NL80211_FREQUENCY_ATTR_1MHZ))
- goto nla_put_failure;
- if ((chan->flags & IEEE80211_CHAN_2MHZ) &&
- nla_put_flag(msg, NL80211_FREQUENCY_ATTR_2MHZ))
- goto nla_put_failure;
- if ((chan->flags & IEEE80211_CHAN_4MHZ) &&
- nla_put_flag(msg, NL80211_FREQUENCY_ATTR_4MHZ))
- goto nla_put_failure;
- if ((chan->flags & IEEE80211_CHAN_8MHZ) &&
- nla_put_flag(msg, NL80211_FREQUENCY_ATTR_8MHZ))
- goto nla_put_failure;
- if ((chan->flags & IEEE80211_CHAN_16MHZ) &&
- nla_put_flag(msg, NL80211_FREQUENCY_ATTR_16MHZ))
- goto nla_put_failure;
if ((chan->flags & IEEE80211_CHAN_NO_320MHZ) &&
nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_320MHZ))
goto nla_put_failure;
@@ -1259,6 +1305,15 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct wiphy *wiphy,
nla_put_flag(msg,
NL80211_FREQUENCY_ATTR_ALLOW_20MHZ_ACTIVITY))
goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_NO_4MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_4MHZ))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_NO_8MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_8MHZ))
+ goto nla_put_failure;
+ if ((chan->flags & IEEE80211_CHAN_NO_16MHZ) &&
+ nla_put_flag(msg, NL80211_FREQUENCY_ATTR_NO_16MHZ))
+ goto nla_put_failure;
}
if (nla_put_u32(msg, NL80211_FREQUENCY_ATTR_MAX_TX_POWER,
@@ -2545,6 +2600,41 @@ fail:
return -ENOBUFS;
}
+static int nl80211_put_nan_capa(struct wiphy *wiphy, struct sk_buff *msg)
+{
+ struct nlattr *nan_caps;
+
+ nan_caps = nla_nest_start(msg, NL80211_ATTR_NAN_CAPABILITIES);
+ if (!nan_caps)
+ return -ENOBUFS;
+
+ if (wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_CONFIGURABLE_SYNC &&
+ nla_put_flag(msg, NL80211_NAN_CAPA_CONFIGURABLE_SYNC))
+ goto fail;
+
+ if ((wiphy->nan_capa.flags & WIPHY_NAN_FLAGS_USERSPACE_DE) &&
+ nla_put_flag(msg, NL80211_NAN_CAPA_USERSPACE_DE))
+ goto fail;
+
+ if (nla_put_u8(msg, NL80211_NAN_CAPA_OP_MODE,
+ wiphy->nan_capa.op_mode) ||
+ nla_put_u8(msg, NL80211_NAN_CAPA_NUM_ANTENNAS,
+ wiphy->nan_capa.n_antennas) ||
+ nla_put_u16(msg, NL80211_NAN_CAPA_MAX_CHANNEL_SWITCH_TIME,
+ wiphy->nan_capa.max_channel_switch_time) ||
+ nla_put_u8(msg, NL80211_NAN_CAPA_CAPABILITIES,
+ wiphy->nan_capa.dev_capabilities))
+ goto fail;
+
+ nla_nest_end(msg, nan_caps);
+
+ return 0;
+
+fail:
+ nla_nest_cancel(msg, nan_caps);
+ return -ENOBUFS;
+}
+
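
nl80211_put_nan_capa() follows the usual netlink nesting discipline: nla_nest_start(), a series of puts that all bail to one cancel label, then nla_nest_end() to commit. The skeleton of that pattern (ATTR_PARENT/ATTR_CHILD are placeholder names, not attributes from this patch):

    struct nlattr *nest;

    nest = nla_nest_start(msg, ATTR_PARENT);
    if (!nest)
        return -ENOBUFS;

    if (nla_put_u8(msg, ATTR_CHILD, val))
        goto fail;

    nla_nest_end(msg, nest);     /* commit: patches up the nest length */
    return 0;

    fail:
    nla_nest_cancel(msg, nest);  /* roll back the partial nest */
    return -ENOBUFS;
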
struct nl80211_dump_wiphy_state {
s64 filter_wiphy;
long start;
@@ -3019,6 +3109,40 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
rdev->wiphy.ext_features))
goto nla_put_failure;
+ if (rdev->wiphy.bss_param_support) {
+ struct nlattr *nested;
+ u32 parsup = rdev->wiphy.bss_param_support;
+
+ nested = nla_nest_start(msg, NL80211_ATTR_BSS_PARAM);
+ if (!nested)
+ goto nla_put_failure;
+
+ if ((parsup & WIPHY_BSS_PARAM_CTS_PROT) &&
+ nla_put_flag(msg, NL80211_ATTR_BSS_CTS_PROT))
+ goto nla_put_failure;
+ if ((parsup & WIPHY_BSS_PARAM_SHORT_PREAMBLE) &&
+ nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_PREAMBLE))
+ goto nla_put_failure;
+ if ((parsup & WIPHY_BSS_PARAM_SHORT_SLOT_TIME) &&
+ nla_put_flag(msg, NL80211_ATTR_BSS_SHORT_SLOT_TIME))
+ goto nla_put_failure;
+ if ((parsup & WIPHY_BSS_PARAM_BASIC_RATES) &&
+ nla_put_flag(msg, NL80211_ATTR_BSS_BASIC_RATES))
+ goto nla_put_failure;
+ if ((parsup & WIPHY_BSS_PARAM_AP_ISOLATE) &&
+ nla_put_flag(msg, NL80211_ATTR_AP_ISOLATE))
+ goto nla_put_failure;
+ if ((parsup & WIPHY_BSS_PARAM_HT_OPMODE) &&
+ nla_put_flag(msg, NL80211_ATTR_BSS_HT_OPMODE))
+ goto nla_put_failure;
+ if ((parsup & WIPHY_BSS_PARAM_P2P_CTWINDOW) &&
+ nla_put_flag(msg, NL80211_ATTR_P2P_CTWINDOW))
+ goto nla_put_failure;
+ if ((parsup & WIPHY_BSS_PARAM_P2P_OPPPS) &&
+ nla_put_flag(msg, NL80211_ATTR_P2P_OPPPS))
+ goto nla_put_failure;
+ nla_nest_end(msg, nested);
+ }
if (rdev->wiphy.bss_select_support) {
struct nlattr *nested;
u32 bss_select_support = rdev->wiphy.bss_select_support;
@@ -3163,6 +3287,12 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev,
if (nl80211_put_radios(&rdev->wiphy, msg))
goto nla_put_failure;
+ state->split_start++;
+ break;
+ case 18:
+ if (nl80211_put_nan_capa(&rdev->wiphy, msg))
+ goto nla_put_failure;
+
/* done */
state->split_start = 0;
break;
@@ -3406,6 +3536,7 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
chandef->center_freq1 = KHZ_TO_MHZ(control_freq);
chandef->freq1_offset = control_freq % 1000;
chandef->center_freq2 = 0;
+ chandef->s1g_primary_2mhz = false;
if (!chandef->chan) {
NL_SET_ERR_MSG_ATTR(extack, attrs[NL80211_ATTR_WIPHY_FREQ],
@@ -3449,27 +3580,20 @@ static int _nl80211_parse_chandef(struct cfg80211_registered_device *rdev,
return -EINVAL;
}
} else if (attrs[NL80211_ATTR_CHANNEL_WIDTH]) {
- chandef->width =
- nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]);
- if (chandef->chan->band == NL80211_BAND_S1GHZ) {
- /* User input error for channel width doesn't match channel */
- if (chandef->width != ieee80211_s1g_channel_width(chandef->chan)) {
- NL_SET_ERR_MSG_ATTR(extack,
- attrs[NL80211_ATTR_CHANNEL_WIDTH],
- "bad channel width");
- return -EINVAL;
- }
- }
+ chandef->width = nla_get_u32(attrs[NL80211_ATTR_CHANNEL_WIDTH]);
if (attrs[NL80211_ATTR_CENTER_FREQ1]) {
chandef->center_freq1 =
nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ1]);
- chandef->freq1_offset =
- nla_get_u32_default(attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET],
- 0);
+ chandef->freq1_offset = nla_get_u32_default(
+ attrs[NL80211_ATTR_CENTER_FREQ1_OFFSET], 0);
}
+
if (attrs[NL80211_ATTR_CENTER_FREQ2])
chandef->center_freq2 =
nla_get_u32(attrs[NL80211_ATTR_CENTER_FREQ2]);
+
+ chandef->s1g_primary_2mhz = nla_get_flag(
+ attrs[NL80211_ATTR_S1G_PRIMARY_2MHZ]);
}
if (info->attrs[NL80211_ATTR_WIPHY_EDMG_CHANNELS]) {
@@ -5393,6 +5517,164 @@ static bool he_set_mcs_mask(struct genl_info *info,
return true;
}
+static void eht_build_mcs_mask(struct genl_info *info,
+ const struct ieee80211_sta_eht_cap *eht_cap,
+ u8 mcs_nss_len, u16 *mcs_mask)
+{
+ struct net_device *dev = info->user_ptr[1];
+ struct wireless_dev *wdev = dev->ieee80211_ptr;
+ u8 nss, mcs_7 = 0, mcs_9 = 0, mcs_11 = 0, mcs_13 = 0;
+ unsigned int link_id = nl80211_link_id(info->attrs);
+
+ if (mcs_nss_len == 4) {
+ const struct ieee80211_eht_mcs_nss_supp_20mhz_only *mcs =
+ &eht_cap->eht_mcs_nss_supp.only_20mhz;
+
+ mcs_7 = u8_get_bits(mcs->rx_tx_mcs7_max_nss,
+ IEEE80211_EHT_MCS_NSS_TX);
+ mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss,
+ IEEE80211_EHT_MCS_NSS_TX);
+ mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss,
+ IEEE80211_EHT_MCS_NSS_TX);
+ mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss,
+ IEEE80211_EHT_MCS_NSS_TX);
+
+ } else {
+ const struct ieee80211_eht_mcs_nss_supp_bw *mcs;
+ enum nl80211_chan_width width;
+
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_ADHOC:
+ width = wdev->u.ibss.chandef.width;
+ break;
+ case NL80211_IFTYPE_MESH_POINT:
+ width = wdev->u.mesh.chandef.width;
+ break;
+ case NL80211_IFTYPE_OCB:
+ width = wdev->u.ocb.chandef.width;
+ break;
+ default:
+ if (wdev->valid_links)
+ width = wdev->links[link_id].ap.chandef.width;
+ else
+ width = wdev->u.ap.preset_chandef.width;
+ break;
+ }
+
+ switch (width) {
+ case NL80211_CHAN_WIDTH_320:
+ mcs = &eht_cap->eht_mcs_nss_supp.bw._320;
+ break;
+ case NL80211_CHAN_WIDTH_160:
+ mcs = &eht_cap->eht_mcs_nss_supp.bw._160;
+ break;
+ default:
+ mcs = &eht_cap->eht_mcs_nss_supp.bw._80;
+ break;
+ }
+
+ mcs_7 = u8_get_bits(mcs->rx_tx_mcs9_max_nss,
+ IEEE80211_EHT_MCS_NSS_TX);
+ mcs_9 = u8_get_bits(mcs->rx_tx_mcs9_max_nss,
+ IEEE80211_EHT_MCS_NSS_TX);
+ mcs_11 = u8_get_bits(mcs->rx_tx_mcs11_max_nss,
+ IEEE80211_EHT_MCS_NSS_TX);
+ mcs_13 = u8_get_bits(mcs->rx_tx_mcs13_max_nss,
+ IEEE80211_EHT_MCS_NSS_TX);
+ }
+
+ /* Enable MCS 14 for NSS 0 */
+ if (eht_cap->eht_cap_elem.phy_cap_info[6] &
+ IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP)
+ mcs_mask[0] |= 0x4000;
+
+ /* Enable MCS 15 for NSS 0 */
+ mcs_mask[0] |= 0x8000;
+
+ for (nss = 0; nss < NL80211_EHT_NSS_MAX; nss++) {
+ if (!mcs_7)
+ continue;
+ mcs_mask[nss] |= 0x00FF;
+ mcs_7--;
+
+ if (!mcs_9)
+ continue;
+ mcs_mask[nss] |= 0x0300;
+ mcs_9--;
+
+ if (!mcs_11)
+ continue;
+ mcs_mask[nss] |= 0x0C00;
+ mcs_11--;
+
+ if (!mcs_13)
+ continue;
+ mcs_mask[nss] |= 0x3000;
+ mcs_13--;
+ }
+}
+
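
The literals in eht_build_mcs_mask() encode the EHT MCS grouping: in each per-NSS u16, bits 0x00FF cover MCS 0-7, 0x0300 covers MCS 8-9, 0x0C00 covers MCS 10-11, 0x3000 covers MCS 12-13, and bits 0x4000/0x8000 on NSS index 0 stand for the special MCS 14 (6 GHz duplicate) and MCS 15 entries. Spelled out as named constants (illustrative only; the patch keeps the literals):

    /* Per-NSS EHT MCS mask groups, matching the literals above. */
    #define EHT_MCS_0_7_MASK     0x00FF
    #define EHT_MCS_8_9_MASK     0x0300
    #define EHT_MCS_10_11_MASK   0x0C00
    #define EHT_MCS_12_13_MASK   0x3000
    #define EHT_MCS_14_BIT       0x4000   /* 6 GHz duplicate transmission */
    #define EHT_MCS_15_BIT       0x8000   /* always enabled on NSS 0 */
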
+static bool eht_set_mcs_mask(struct genl_info *info, struct wireless_dev *wdev,
+ struct ieee80211_supported_band *sband,
+ struct nl80211_txrate_eht *txrate,
+ u16 mcs[NL80211_EHT_NSS_MAX])
+{
+ const struct ieee80211_sta_he_cap *he_cap;
+ const struct ieee80211_sta_eht_cap *eht_cap;
+ u16 tx_mcs_mask[NL80211_EHT_NSS_MAX] = { 0 };
+ u8 i, mcs_nss_len;
+
+ he_cap = ieee80211_get_he_iftype_cap(sband, wdev->iftype);
+ if (!he_cap)
+ return false;
+
+ eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype);
+ if (!eht_cap)
+ return false;
+
+ /* Checks for MCS 14 */
+ if (txrate->mcs[0] & 0x4000) {
+ if (sband->band != NL80211_BAND_6GHZ)
+ return false;
+
+ if (!(eht_cap->eht_cap_elem.phy_cap_info[6] &
+ IEEE80211_EHT_PHY_CAP6_EHT_DUP_6GHZ_SUPP))
+ return false;
+ }
+
+ mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem,
+ &eht_cap->eht_cap_elem,
+ wdev->iftype ==
+ NL80211_IFTYPE_STATION);
+
+ if (mcs_nss_len == 3) {
+		/* Supported iftypes for setting non-20 MHz-only EHT MCS */
+ switch (wdev->iftype) {
+ case NL80211_IFTYPE_ADHOC:
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_P2P_GO:
+ case NL80211_IFTYPE_MESH_POINT:
+ case NL80211_IFTYPE_OCB:
+ break;
+ default:
+ return false;
+ }
+ }
+
+ /* Build eht_mcs_mask from EHT and HE capabilities */
+ eht_build_mcs_mask(info, eht_cap, mcs_nss_len, tx_mcs_mask);
+
+ memset(mcs, 0, sizeof(u16) * NL80211_EHT_NSS_MAX);
+ for (i = 0; i < NL80211_EHT_NSS_MAX; i++) {
+ if ((tx_mcs_mask[i] & txrate->mcs[i]) == txrate->mcs[i])
+ mcs[i] = txrate->mcs[i];
+ else
+ return false;
+ }
+
+ return true;
+}
+
static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
struct nlattr *attrs[],
enum nl80211_attrs attr,
@@ -5413,6 +5695,8 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
/* Default to all rates enabled */
for (i = 0; i < NUM_NL80211_BANDS; i++) {
const struct ieee80211_sta_he_cap *he_cap;
+ const struct ieee80211_sta_eht_cap *eht_cap;
+ u8 mcs_nss_len;
if (!default_all_enabled)
break;
@@ -5441,6 +5725,21 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
mask->control[i].he_gi = 0xFF;
mask->control[i].he_ltf = 0xFF;
+
+ eht_cap = ieee80211_get_eht_iftype_cap(sband, wdev->iftype);
+ if (!eht_cap)
+ continue;
+
+ mcs_nss_len = ieee80211_eht_mcs_nss_size(&he_cap->he_cap_elem,
+ &eht_cap->eht_cap_elem,
+ wdev->iftype ==
+ NL80211_IFTYPE_STATION);
+
+ eht_build_mcs_mask(info, eht_cap, mcs_nss_len,
+ mask->control[i].eht_mcs);
+
+ mask->control[i].eht_gi = 0xFF;
+ mask->control[i].eht_ltf = 0xFF;
}
/* if no rates are given set it back to the defaults */
@@ -5512,13 +5811,27 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
mask->control[band].he_ltf =
nla_get_u8(tb[NL80211_TXRATE_HE_LTF]);
+ if (tb[NL80211_TXRATE_EHT] &&
+ !eht_set_mcs_mask(info, wdev, sband,
+ nla_data(tb[NL80211_TXRATE_EHT]),
+ mask->control[band].eht_mcs))
+ return -EINVAL;
+
+ if (tb[NL80211_TXRATE_EHT_GI])
+ mask->control[band].eht_gi =
+ nla_get_u8(tb[NL80211_TXRATE_EHT_GI]);
+ if (tb[NL80211_TXRATE_EHT_LTF])
+ mask->control[band].eht_ltf =
+ nla_get_u8(tb[NL80211_TXRATE_EHT_LTF]);
+
if (mask->control[band].legacy == 0) {
- /* don't allow empty legacy rates if HT, VHT or HE
+ /* don't allow empty legacy rates if HT, VHT, HE or EHT
* are not even supported.
*/
if (!(rdev->wiphy.bands[band]->ht_cap.ht_supported ||
rdev->wiphy.bands[band]->vht_cap.vht_supported ||
- ieee80211_get_he_iftype_cap(sband, wdev->iftype)))
+ ieee80211_get_he_iftype_cap(sband, wdev->iftype) ||
+ ieee80211_get_eht_iftype_cap(sband, wdev->iftype)))
return -EINVAL;
for (i = 0; i < IEEE80211_HT_MCS_MASK_LEN; i++)
@@ -5533,6 +5846,10 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info,
if (mask->control[band].he_mcs[i])
goto out;
+ for (i = 0; i < NL80211_EHT_NSS_MAX; i++)
+ if (mask->control[band].eht_mcs[i])
+ goto out;
+
/* legacy and mcs rates may not be both empty */
return -EINVAL;
}
@@ -5546,7 +5863,7 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
enum nl80211_band band,
struct cfg80211_bitrate_mask *beacon_rate)
{
- u32 count_ht, count_vht, count_he, i;
+ u32 count_ht, count_vht, count_he, count_eht, i;
u32 rate = beacon_rate->control[band].legacy;
/* Allow only one rate */
@@ -5592,8 +5909,21 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
return -EINVAL;
}
- if ((count_ht && count_vht && count_he) ||
- (!rate && !count_ht && !count_vht && !count_he))
+ count_eht = 0;
+ for (i = 0; i < NL80211_EHT_NSS_MAX; i++) {
+ if (hweight16(beacon_rate->control[band].eht_mcs[i]) > 1) {
+ return -EINVAL;
+ } else if (beacon_rate->control[band].eht_mcs[i]) {
+ count_eht++;
+ if (count_eht > 1)
+ return -EINVAL;
+ }
+ if (count_eht && rate)
+ return -EINVAL;
+ }
+
+ if ((count_ht && count_vht && count_he && count_eht) ||
+ (!rate && !count_ht && !count_vht && !count_he && !count_eht))
return -EINVAL;
if (rate &&
@@ -5613,6 +5943,11 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev,
NL80211_EXT_FEATURE_BEACON_RATE_HE))
return -EINVAL;
+ if (count_eht &&
+ !wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_BEACON_RATE_EHT))
+ return -EINVAL;
+
return 0;
}
@@ -8830,6 +9165,9 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
struct cfg80211_registered_device *rdev = info->user_ptr[0];
struct net_device *dev = info->user_ptr[1];
struct bss_parameters params;
+ u32 bss_param_support = rdev->wiphy.bss_param_support;
+ u32 changed = 0;
+ bool strict;
memset(&params, 0, sizeof(params));
params.link_id = nl80211_link_id_or_invalid(info->attrs);
@@ -8842,26 +9180,54 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
params.p2p_ctwindow = -1;
params.p2p_opp_ps = -1;
- if (info->attrs[NL80211_ATTR_BSS_CTS_PROT])
+ strict = nla_get_flag(info->attrs[NL80211_ATTR_BSS_PARAM]);
+ if (info->attrs[NL80211_ATTR_BSS_CTS_PROT]) {
+ if (strict && !(bss_param_support & WIPHY_BSS_PARAM_CTS_PROT))
+ return -EINVAL;
params.use_cts_prot =
nla_get_u8(info->attrs[NL80211_ATTR_BSS_CTS_PROT]);
- if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE])
+ changed |= WIPHY_BSS_PARAM_CTS_PROT;
+ }
+ if (info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]) {
+ if (strict &&
+ !(bss_param_support & WIPHY_BSS_PARAM_SHORT_PREAMBLE))
+ return -EINVAL;
params.use_short_preamble =
nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_PREAMBLE]);
- if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME])
+ changed |= WIPHY_BSS_PARAM_SHORT_PREAMBLE;
+ }
+ if (info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]) {
+ if (strict &&
+ !(bss_param_support & WIPHY_BSS_PARAM_SHORT_SLOT_TIME))
+ return -EINVAL;
params.use_short_slot_time =
nla_get_u8(info->attrs[NL80211_ATTR_BSS_SHORT_SLOT_TIME]);
+ changed |= WIPHY_BSS_PARAM_SHORT_SLOT_TIME;
+ }
if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) {
+ if (strict &&
+ !(bss_param_support & WIPHY_BSS_PARAM_BASIC_RATES))
+ return -EINVAL;
params.basic_rates =
nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
params.basic_rates_len =
nla_len(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]);
+ changed |= WIPHY_BSS_PARAM_BASIC_RATES;
+ }
+ if (info->attrs[NL80211_ATTR_AP_ISOLATE]) {
+ if (strict && !(bss_param_support & WIPHY_BSS_PARAM_AP_ISOLATE))
+ return -EINVAL;
+ params.ap_isolate =
+ !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]);
+ changed |= WIPHY_BSS_PARAM_AP_ISOLATE;
}
- if (info->attrs[NL80211_ATTR_AP_ISOLATE])
- params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]);
- if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE])
+ if (info->attrs[NL80211_ATTR_BSS_HT_OPMODE]) {
+ if (strict && !(bss_param_support & WIPHY_BSS_PARAM_HT_OPMODE))
+ return -EINVAL;
params.ht_opmode =
nla_get_u16(info->attrs[NL80211_ATTR_BSS_HT_OPMODE]);
+ changed |= WIPHY_BSS_PARAM_HT_OPMODE;
+ }
if (info->attrs[NL80211_ATTR_P2P_CTWINDOW]) {
if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
@@ -8869,8 +9235,9 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
params.p2p_ctwindow =
nla_get_u8(info->attrs[NL80211_ATTR_P2P_CTWINDOW]);
if (params.p2p_ctwindow != 0 &&
- !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_CTWIN))
+ !(bss_param_support & WIPHY_BSS_PARAM_P2P_CTWINDOW))
return -EINVAL;
+ changed |= WIPHY_BSS_PARAM_P2P_CTWINDOW;
}
if (info->attrs[NL80211_ATTR_P2P_OPPPS]) {
@@ -8879,9 +9246,11 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
return -EINVAL;
tmp = nla_get_u8(info->attrs[NL80211_ATTR_P2P_OPPPS]);
+ if (tmp && !(bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS))
+ return -EINVAL;
params.p2p_opp_ps = tmp;
if (params.p2p_opp_ps &&
- !(rdev->wiphy.features & NL80211_FEATURE_P2P_GO_OPPPS))
+ !(rdev->wiphy.bss_param_support & WIPHY_BSS_PARAM_P2P_OPPPS))
return -EINVAL;
}
@@ -8892,6 +9261,10 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO)
return -EOPNOTSUPP;
+ changed &= rdev->wiphy.bss_param_support;
+ if (!changed)
+ return 0;
+
return rdev_change_bss(rdev, dev, &params);
}
@@ -10071,8 +10444,9 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
goto out_free;
}
- /* ignore disabled channels */
+	/* Ignore disabled channels and S1G no-primary channels */
if (chan->flags & IEEE80211_CHAN_DISABLED ||
+ chan->flags & IEEE80211_CHAN_S1G_NO_PRIMARY ||
!cfg80211_wdev_channel_allowed(wdev, chan))
continue;
@@ -10094,6 +10468,8 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
chan = &wiphy->bands[band]->channels[j];
if (chan->flags & IEEE80211_CHAN_DISABLED ||
+ chan->flags &
+ IEEE80211_CHAN_S1G_NO_PRIMARY ||
!cfg80211_wdev_channel_allowed(wdev, chan))
continue;
@@ -13398,7 +13774,9 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info)
break;
case NL80211_IFTYPE_NAN:
if (!wiphy_ext_feature_isset(wdev->wiphy,
- NL80211_EXT_FEATURE_SECURE_NAN))
+ NL80211_EXT_FEATURE_SECURE_NAN) &&
+ !(wdev->wiphy->nan_capa.flags &
+ WIPHY_NAN_FLAGS_USERSPACE_DE))
return -EOPNOTSUPP;
break;
default:
@@ -13459,7 +13837,9 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info)
break;
case NL80211_IFTYPE_NAN:
if (!wiphy_ext_feature_isset(wdev->wiphy,
- NL80211_EXT_FEATURE_SECURE_NAN))
+ NL80211_EXT_FEATURE_SECURE_NAN) &&
+ !(wdev->wiphy->nan_capa.flags &
+ WIPHY_NAN_FLAGS_USERSPACE_DE))
return -EOPNOTSUPP;
break;
default:
@@ -15106,6 +15486,216 @@ static int nl80211_stop_p2p_device(struct sk_buff *skb, struct genl_info *info)
return 0;
}
+static struct ieee80211_channel *nl80211_get_nan_channel(struct wiphy *wiphy,
+ int freq)
+{
+ struct ieee80211_channel *chan;
+ struct cfg80211_chan_def def;
+
+ /* Check if the frequency is valid for NAN */
+ if (freq != 5220 && freq != 5745 && freq != 2437)
+ return NULL;
+
+ chan = ieee80211_get_channel(wiphy, freq);
+ if (!chan)
+ return NULL;
+
+ cfg80211_chandef_create(&def, chan, NL80211_CHAN_NO_HT);
+
+ /* Check if the channel is allowed */
+ if (cfg80211_reg_can_beacon(wiphy, &def, NL80211_IFTYPE_NAN))
+ return chan;
+
+ return NULL;
+}
+
+static int nl80211_parse_nan_band_config(struct wiphy *wiphy,
+ struct nlattr **tb,
+ struct cfg80211_nan_band_config *cfg,
+ enum nl80211_band band)
+{
+ if (BIT(band) & ~(u32)wiphy->nan_supported_bands)
+ return -EINVAL;
+
+ if (tb[NL80211_NAN_BAND_CONF_FREQ]) {
+ u16 freq = nla_get_u16(tb[NL80211_NAN_BAND_CONF_FREQ]);
+
+ if (band != NL80211_BAND_5GHZ)
+ return -EINVAL;
+
+ cfg->chan = nl80211_get_nan_channel(wiphy, freq);
+ if (!cfg->chan)
+ return -EINVAL;
+ }
+
+ if (tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]) {
+ cfg->rssi_close =
+ nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_CLOSE]);
+ if (!tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE])
+ return -EINVAL;
+ }
+
+ if (tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]) {
+ cfg->rssi_middle =
+ nla_get_s8(tb[NL80211_NAN_BAND_CONF_RSSI_MIDDLE]);
+ if (!cfg->rssi_close || cfg->rssi_middle >= cfg->rssi_close)
+ return -EINVAL;
+ }
+
+ if (tb[NL80211_NAN_BAND_CONF_WAKE_DW]) {
+ cfg->awake_dw_interval =
+ nla_get_u8(tb[NL80211_NAN_BAND_CONF_WAKE_DW]);
+
+ if (band == NL80211_BAND_2GHZ && cfg->awake_dw_interval == 0)
+ return -EINVAL;
+ }
+
+ cfg->disable_scan =
+ nla_get_flag(tb[NL80211_NAN_BAND_CONF_DISABLE_SCAN]);
+ return 0;
+}
+
+static int nl80211_parse_nan_conf(struct wiphy *wiphy,
+ struct genl_info *info,
+ struct cfg80211_nan_conf *conf,
+ u32 *changed_flags)
+{
+ struct nlattr *attrs[NL80211_NAN_CONF_ATTR_MAX + 1];
+ int err, rem;
+ u32 changed = 0;
+ struct nlattr *band_config;
+
+ if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) {
+ conf->master_pref =
+ nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
+
+ changed |= CFG80211_NAN_CONF_CHANGED_PREF;
+ }
+
+ if (info->attrs[NL80211_ATTR_BANDS]) {
+ u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
+
+ if (bands & ~(u32)wiphy->nan_supported_bands)
+ return -EOPNOTSUPP;
+
+ if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
+ return -EINVAL;
+
+ conf->bands = bands;
+ changed |= CFG80211_NAN_CONF_CHANGED_BANDS;
+ }
+
+ conf->band_cfgs[NL80211_BAND_2GHZ].awake_dw_interval = 1;
+ if (conf->bands & BIT(NL80211_BAND_5GHZ) || !conf->bands)
+ conf->band_cfgs[NL80211_BAND_5GHZ].awake_dw_interval = 1;
+
+ /* On 2.4 GHz band use channel 6 */
+ conf->band_cfgs[NL80211_BAND_2GHZ].chan =
+ nl80211_get_nan_channel(wiphy, 2437);
+ if (!conf->band_cfgs[NL80211_BAND_2GHZ].chan)
+ return -EINVAL;
+
+ if (!info->attrs[NL80211_ATTR_NAN_CONFIG])
+ goto out;
+
+ err = nla_parse_nested(attrs, NL80211_NAN_CONF_ATTR_MAX,
+ info->attrs[NL80211_ATTR_NAN_CONFIG], NULL,
+ info->extack);
+ if (err)
+ return err;
+
+ changed |= CFG80211_NAN_CONF_CHANGED_CONFIG;
+ if (attrs[NL80211_NAN_CONF_CLUSTER_ID])
+ conf->cluster_id =
+ nla_data(attrs[NL80211_NAN_CONF_CLUSTER_ID]);
+
+ if (attrs[NL80211_NAN_CONF_EXTRA_ATTRS]) {
+ conf->extra_nan_attrs =
+ nla_data(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]);
+ conf->extra_nan_attrs_len =
+ nla_len(attrs[NL80211_NAN_CONF_EXTRA_ATTRS]);
+ }
+
+ if (attrs[NL80211_NAN_CONF_VENDOR_ELEMS]) {
+ conf->vendor_elems =
+ nla_data(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]);
+ conf->vendor_elems_len =
+ nla_len(attrs[NL80211_NAN_CONF_VENDOR_ELEMS]);
+ }
+
+ if (attrs[NL80211_NAN_CONF_BAND_CONFIGS]) {
+ nla_for_each_nested(band_config,
+ attrs[NL80211_NAN_CONF_BAND_CONFIGS],
+ rem) {
+ enum nl80211_band band;
+ struct cfg80211_nan_band_config *cfg;
+ struct nlattr *tb[NL80211_NAN_BAND_CONF_ATTR_MAX + 1];
+
+ err = nla_parse_nested(tb,
+ NL80211_NAN_BAND_CONF_ATTR_MAX,
+ band_config, NULL,
+ info->extack);
+ if (err)
+ return err;
+
+ if (!tb[NL80211_NAN_BAND_CONF_BAND])
+ return -EINVAL;
+
+ band = nla_get_u8(tb[NL80211_NAN_BAND_CONF_BAND]);
+ if (band >= NUM_NL80211_BANDS)
+ return -EINVAL;
+
+ if (conf->bands && !(conf->bands & BIT(band)))
+ return -EINVAL;
+
+ cfg = &conf->band_cfgs[band];
+
+ err = nl80211_parse_nan_band_config(wiphy, tb, cfg,
+ band);
+ if (err)
+ return err;
+ }
+ }
+
+ if (attrs[NL80211_NAN_CONF_SCAN_PERIOD])
+ conf->scan_period =
+ nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_PERIOD]);
+
+ if (attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME])
+ conf->scan_dwell_time =
+ nla_get_u16(attrs[NL80211_NAN_CONF_SCAN_DWELL_TIME]);
+
+ if (attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL])
+ conf->discovery_beacon_interval =
+ nla_get_u8(attrs[NL80211_NAN_CONF_DISCOVERY_BEACON_INTERVAL]);
+
+ if (attrs[NL80211_NAN_CONF_NOTIFY_DW])
+ conf->enable_dw_notification =
+ nla_get_flag(attrs[NL80211_NAN_CONF_NOTIFY_DW]);
+
+out:
+ if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan &&
+ (!conf->bands || conf->bands & BIT(NL80211_BAND_5GHZ))) {
+ /* No 5 GHz channel was specified; fall back to a default if possible */
+ conf->band_cfgs[NL80211_BAND_5GHZ].chan =
+ nl80211_get_nan_channel(wiphy, 5745);
+ if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan)
+ conf->band_cfgs[NL80211_BAND_5GHZ].chan =
+ nl80211_get_nan_channel(wiphy, 5220);
+
+ /* Return error if user space asked explicitly for 5 GHz */
+ if (!conf->band_cfgs[NL80211_BAND_5GHZ].chan &&
+ conf->bands & BIT(NL80211_BAND_5GHZ)) {
+ NL_SET_ERR_MSG_ATTR(info->extack,
+ info->attrs[NL80211_ATTR_BANDS],
+ "5 GHz band operation is not allowed");
+ return -EINVAL;
+ }
+ }
+
+ if (changed_flags)
+ *changed_flags = changed;
+
+ return 0;
+}
+
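For illustration only (not part of the patch): userspace could exercise the
new nested attributes roughly as below with libnl-3. The genl family lookup,
NL80211_ATTR_IFINDEX and all error handling are omitted, and the exact
attribute layout is an assumption derived from the parsing code above.

	/* Hypothetical sketch: start NAN with master preference 128 and
	 * request 5745 MHz (channel 149) for the 5 GHz band.
	 */
	struct nl_msg *msg = nlmsg_alloc();
	struct nlattr *cfg, *bands, *entry;

	genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, nl80211_id, 0, 0,
		    NL80211_CMD_START_NAN, 0);
	nla_put_u8(msg, NL80211_ATTR_NAN_MASTER_PREF, 128);

	cfg = nla_nest_start(msg, NL80211_ATTR_NAN_CONFIG);
	bands = nla_nest_start(msg, NL80211_NAN_CONF_BAND_CONFIGS);
	entry = nla_nest_start(msg, 1); /* array entry, index is arbitrary */
	nla_put_u8(msg, NL80211_NAN_BAND_CONF_BAND, NL80211_BAND_5GHZ);
	nla_put_u16(msg, NL80211_NAN_BAND_CONF_FREQ, 5745);
	nla_nest_end(msg, entry);
	nla_nest_end(msg, bands);
	nla_nest_end(msg, cfg);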
static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
{
struct cfg80211_registered_device *rdev = info->user_ptr[0];
@@ -15122,23 +15712,13 @@ static int nl80211_start_nan(struct sk_buff *skb, struct genl_info *info)
if (rfkill_blocked(rdev->wiphy.rfkill))
return -ERFKILL;
+ /* Master preference is mandatory for START_NAN */
if (!info->attrs[NL80211_ATTR_NAN_MASTER_PREF])
return -EINVAL;
- conf.master_pref =
- nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
-
- if (info->attrs[NL80211_ATTR_BANDS]) {
- u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
-
- if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
- return -EOPNOTSUPP;
-
- if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
- return -EINVAL;
-
- conf.bands = bands;
- }
+ err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, NULL);
+ if (err)
+ return err;
err = rdev_start_nan(rdev, wdev, &conf);
if (err)
@@ -15494,6 +16074,7 @@ static int nl80211_nan_change_config(struct sk_buff *skb,
struct wireless_dev *wdev = info->user_ptr[1];
struct cfg80211_nan_conf conf = {};
u32 changed = 0;
+ int err;
if (wdev->iftype != NL80211_IFTYPE_NAN)
return -EOPNOTSUPP;
@@ -15501,27 +16082,9 @@ static int nl80211_nan_change_config(struct sk_buff *skb,
if (!wdev_running(wdev))
return -ENOTCONN;
- if (info->attrs[NL80211_ATTR_NAN_MASTER_PREF]) {
- conf.master_pref =
- nla_get_u8(info->attrs[NL80211_ATTR_NAN_MASTER_PREF]);
- if (conf.master_pref <= 1 || conf.master_pref == 255)
- return -EINVAL;
-
- changed |= CFG80211_NAN_CONF_CHANGED_PREF;
- }
-
- if (info->attrs[NL80211_ATTR_BANDS]) {
- u32 bands = nla_get_u32(info->attrs[NL80211_ATTR_BANDS]);
-
- if (bands & ~(u32)wdev->wiphy->nan_supported_bands)
- return -EOPNOTSUPP;
-
- if (bands && !(bands & BIT(NL80211_BAND_2GHZ)))
- return -EINVAL;
-
- conf.bands = bands;
- changed |= CFG80211_NAN_CONF_CHANGED_BANDS;
- }
+ err = nl80211_parse_nan_conf(&rdev->wiphy, info, &conf, &changed);
+ if (err)
+ return err;
if (!changed)
return -EINVAL;
@@ -21244,6 +21807,88 @@ void cfg80211_epcs_changed(struct net_device *netdev, bool enabled)
}
EXPORT_SYMBOL(cfg80211_epcs_changed);
+void cfg80211_next_nan_dw_notif(struct wireless_dev *wdev,
+ struct ieee80211_channel *chan, gfp_t gfp)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+
+ trace_cfg80211_next_nan_dw_notif(wdev, chan);
+
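+ /* DW notifications are unicast to the socket that owns the NAN
+ * interface; without an owner there is nobody to notify.
+ */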
+ if (!wdev->owner_nlportid)
+ return;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0,
+ NL80211_CMD_NAN_NEXT_DW_NOTIFICATION);
+ if (!hdr)
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD) ||
+ nla_put_u32(msg, NL80211_ATTR_WIPHY_FREQ, chan->center_freq))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_unicast(wiphy_net(wiphy), msg, wdev->owner_nlportid);
+
+ return;
+
+ nla_put_failure:
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_next_nan_dw_notif);
+
+void cfg80211_nan_cluster_joined(struct wireless_dev *wdev,
+ const u8 *cluster_id, bool new_cluster,
+ gfp_t gfp)
+{
+ struct wiphy *wiphy = wdev->wiphy;
+ struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy);
+ struct sk_buff *msg;
+ void *hdr;
+
+ trace_cfg80211_nan_cluster_joined(wdev, cluster_id, new_cluster);
+
+ memcpy(wdev->u.nan.cluster_id, cluster_id, ETH_ALEN);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, gfp);
+ if (!msg)
+ return;
+
+ hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_NAN_CLUSTER_JOINED);
+ if (!hdr)
+ goto nla_put_failure;
+
+ if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) ||
+ nla_put_u64_64bit(msg, NL80211_ATTR_WDEV, wdev_id(wdev),
+ NL80211_ATTR_PAD) ||
+ nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, cluster_id) ||
+ (new_cluster && nla_put_flag(msg, NL80211_ATTR_NAN_NEW_CLUSTER)))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
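+ /* Socket-owned interfaces get the event as a unicast; otherwise
+ * fall back to the NAN multicast group.
+ */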
+ if (!wdev->owner_nlportid)
+ genlmsg_multicast_netns(&nl80211_fam, wiphy_net(wiphy),
+ msg, 0, NL80211_MCGRP_NAN, gfp);
+ else
+ genlmsg_unicast(wiphy_net(wiphy), msg,
+ wdev->owner_nlportid);
+ return;
+
+ nla_put_failure:
+ nlmsg_free(msg);
+}
+EXPORT_SYMBOL(cfg80211_nan_cluster_joined);
+
/* initialisation/exit functions */
int __init nl80211_init(void)
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 3b0ac3437f81..73cab51f6379 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -1707,6 +1707,16 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd
if (reg_rule->flags & NL80211_RRF_AUTO_BW)
max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
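+ /* S1G uses its own width ladder (4/8/16 MHz); flag whatever does
+ * not fit and skip the HT/VHT/EHT checks below.
+ */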
+ if (is_s1g) {
+ if (max_bandwidth_khz < MHZ_TO_KHZ(16))
+ bw_flags |= IEEE80211_CHAN_NO_16MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(8))
+ bw_flags |= IEEE80211_CHAN_NO_8MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(4))
+ bw_flags |= IEEE80211_CHAN_NO_4MHZ;
+ return bw_flags;
+ }
+
/* If we get a reg_rule we can assume that at least 5Mhz fit */
if (!cfg80211_does_bw_fit_range(freq_range,
center_freq_khz,
@@ -1717,59 +1727,19 @@ static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd
MHZ_TO_KHZ(20)))
bw_flags |= IEEE80211_CHAN_NO_20MHZ;
- if (is_s1g) {
- /* S1G is strict about non overlapping channels. We can
- * calculate which bandwidth is allowed per channel by finding
- * the largest bandwidth which cleanly divides the freq_range.
- */
- int edge_offset;
- int ch_bw = max_bandwidth_khz;
-
- while (ch_bw) {
- edge_offset = (center_freq_khz - ch_bw / 2) -
- freq_range->start_freq_khz;
- if (edge_offset % ch_bw == 0) {
- switch (KHZ_TO_MHZ(ch_bw)) {
- case 1:
- bw_flags |= IEEE80211_CHAN_1MHZ;
- break;
- case 2:
- bw_flags |= IEEE80211_CHAN_2MHZ;
- break;
- case 4:
- bw_flags |= IEEE80211_CHAN_4MHZ;
- break;
- case 8:
- bw_flags |= IEEE80211_CHAN_8MHZ;
- break;
- case 16:
- bw_flags |= IEEE80211_CHAN_16MHZ;
- break;
- default:
- /* If we got here, no bandwidths fit on
- * this frequency, ie. band edge.
- */
- bw_flags |= IEEE80211_CHAN_DISABLED;
- break;
- }
- break;
- }
- ch_bw /= 2;
- }
- } else {
- if (max_bandwidth_khz < MHZ_TO_KHZ(10))
- bw_flags |= IEEE80211_CHAN_NO_10MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(20))
- bw_flags |= IEEE80211_CHAN_NO_20MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(40))
- bw_flags |= IEEE80211_CHAN_NO_HT40;
- if (max_bandwidth_khz < MHZ_TO_KHZ(80))
- bw_flags |= IEEE80211_CHAN_NO_80MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(160))
- bw_flags |= IEEE80211_CHAN_NO_160MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(320))
- bw_flags |= IEEE80211_CHAN_NO_320MHZ;
- }
+ if (max_bandwidth_khz < MHZ_TO_KHZ(10))
+ bw_flags |= IEEE80211_CHAN_NO_10MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(20))
+ bw_flags |= IEEE80211_CHAN_NO_20MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(40))
+ bw_flags |= IEEE80211_CHAN_NO_HT40;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(80))
+ bw_flags |= IEEE80211_CHAN_NO_80MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(160))
+ bw_flags |= IEEE80211_CHAN_NO_160MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(320))
+ bw_flags |= IEEE80211_CHAN_NO_320MHZ;
+
return bw_flags;
}
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 6c7b7c3828a4..90a9187a6b13 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -1816,6 +1816,9 @@ static void cfg80211_update_hidden_bsses(struct cfg80211_internal_bss *known,
WARN_ON(ies != old_ies);
rcu_assign_pointer(bss->pub.beacon_ies, new_ies);
+
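+ /* Keep the hidden BSS entries as fresh as the BSS they were
+ * updated from so they do not expire early.
+ */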
+ bss->ts = known->ts;
+ bss->pub.ts_boottime = known->pub.ts_boottime;
}
}
@@ -1882,6 +1885,10 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev,
{
lockdep_assert_held(&rdev->bss_lock);
+ /* Update timestamps first so the hidden-BSS update sees fresh values */
+ known->ts = new->ts;
+ known->pub.ts_boottime = new->pub.ts_boottime;
+
/* Update IEs */
if (rcu_access_pointer(new->pub.proberesp_ies)) {
const struct cfg80211_bss_ies *old;
@@ -1945,8 +1952,6 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev,
if (signal_valid)
known->pub.signal = new->pub.signal;
known->pub.capability = new->pub.capability;
- known->ts = new->ts;
- known->pub.ts_boottime = new->pub.ts_boottime;
known->parent_tsf = new->parent_tsf;
known->pub.chains = new->pub.chains;
memcpy(known->pub.chain_signal, new->pub.chain_signal,
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 34c584a215e5..8a4c34112eb5 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -3137,23 +3137,6 @@ DEFINE_EVENT(cfg80211_netdev_mac_evt, cfg80211_notify_new_peer_candidate,
TP_ARGS(netdev, macaddr)
);
-DECLARE_EVENT_CLASS(netdev_evt_only,
- TP_PROTO(struct net_device *netdev),
- TP_ARGS(netdev),
- TP_STRUCT__entry(
- NETDEV_ENTRY
- ),
- TP_fast_assign(
- NETDEV_ASSIGN;
- ),
- TP_printk(NETDEV_PR_FMT , NETDEV_PR_ARG)
-);
-
-DEFINE_EVENT(netdev_evt_only, cfg80211_send_rx_auth,
- TP_PROTO(struct net_device *netdev),
- TP_ARGS(netdev)
-);
-
TRACE_EVENT(cfg80211_send_rx_assoc,
TP_PROTO(struct net_device *netdev,
const struct cfg80211_rx_assoc_resp_data *data),
@@ -3480,21 +3463,6 @@ TRACE_EVENT(cfg80211_reg_can_beacon,
__entry->prohibited_flags, __entry->permitting_flags)
);
-TRACE_EVENT(cfg80211_chandef_dfs_required,
- TP_PROTO(struct wiphy *wiphy, struct cfg80211_chan_def *chandef),
- TP_ARGS(wiphy, chandef),
- TP_STRUCT__entry(
- WIPHY_ENTRY
- CHAN_DEF_ENTRY
- ),
- TP_fast_assign(
- WIPHY_ASSIGN;
- CHAN_DEF_ASSIGN(chandef);
- ),
- TP_printk(WIPHY_PR_FMT ", " CHAN_DEF_PR_FMT,
- WIPHY_PR_ARG, CHAN_DEF_PR_ARG)
-);
-
TRACE_EVENT(cfg80211_ch_switch_notify,
TP_PROTO(struct net_device *netdev,
struct cfg80211_chan_def *chandef,
@@ -3862,30 +3830,6 @@ DEFINE_EVENT(cfg80211_bss_evt, cfg80211_return_bss,
TP_ARGS(pub)
);
-TRACE_EVENT(cfg80211_return_uint,
- TP_PROTO(unsigned int ret),
- TP_ARGS(ret),
- TP_STRUCT__entry(
- __field(unsigned int, ret)
- ),
- TP_fast_assign(
- __entry->ret = ret;
- ),
- TP_printk("ret: %d", __entry->ret)
-);
-
-TRACE_EVENT(cfg80211_return_u32,
- TP_PROTO(u32 ret),
- TP_ARGS(ret),
- TP_STRUCT__entry(
- __field(u32, ret)
- ),
- TP_fast_assign(
- __entry->ret = ret;
- ),
- TP_printk("ret: %u", __entry->ret)
-);
-
TRACE_EVENT(cfg80211_report_wowlan_wakeup,
TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev,
struct cfg80211_wowlan_wakeup *wakeup),
@@ -4222,6 +4166,41 @@ TRACE_EVENT(cfg80211_epcs_changed,
WDEV_PR_ARG, __entry->enabled)
);
+TRACE_EVENT(cfg80211_next_nan_dw_notif,
+ TP_PROTO(struct wireless_dev *wdev,
+ struct ieee80211_channel *chan),
+ TP_ARGS(wdev, chan),
+ TP_STRUCT__entry(
+ WDEV_ENTRY
+ CHAN_ENTRY
+ ),
+ TP_fast_assign(
+ WDEV_ASSIGN;
+ CHAN_ASSIGN(chan);
+ ),
+ TP_printk(WDEV_PR_FMT " " CHAN_PR_FMT,
+ WDEV_PR_ARG, CHAN_PR_ARG)
+);
+
+TRACE_EVENT(cfg80211_nan_cluster_joined,
+ TP_PROTO(struct wireless_dev *wdev,
+ const u8 *cluster_id,
+ bool new_cluster),
+ TP_ARGS(wdev, cluster_id, new_cluster),
+ TP_STRUCT__entry(
+ WDEV_ENTRY
+ MAC_ENTRY(cluster_id)
+ __field(bool, new_cluster)
+ ),
+ TP_fast_assign(
+ WDEV_ASSIGN;
+ MAC_ASSIGN(cluster_id, cluster_id);
+ __entry->new_cluster = new_cluster;
+ ),
+ TP_printk(WDEV_PR_FMT " cluster_id %pMF%s",
+ WDEV_PR_ARG, __entry->cluster_id,
+ __entry->new_cluster ? " [new]" : "")
+);
#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/util.c b/net/wireless/util.c
index 240c68baa3d1..56724b33af04 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -106,33 +106,6 @@ u32 ieee80211_channel_to_freq_khz(int chan, enum nl80211_band band)
}
EXPORT_SYMBOL(ieee80211_channel_to_freq_khz);
-enum nl80211_chan_width
-ieee80211_s1g_channel_width(const struct ieee80211_channel *chan)
-{
- if (WARN_ON(!chan || chan->band != NL80211_BAND_S1GHZ))
- return NL80211_CHAN_WIDTH_20_NOHT;
-
- /*S1G defines a single allowed channel width per channel.
- * Extract that width here.
- */
- if (chan->flags & IEEE80211_CHAN_1MHZ)
- return NL80211_CHAN_WIDTH_1;
- else if (chan->flags & IEEE80211_CHAN_2MHZ)
- return NL80211_CHAN_WIDTH_2;
- else if (chan->flags & IEEE80211_CHAN_4MHZ)
- return NL80211_CHAN_WIDTH_4;
- else if (chan->flags & IEEE80211_CHAN_8MHZ)
- return NL80211_CHAN_WIDTH_8;
- else if (chan->flags & IEEE80211_CHAN_16MHZ)
- return NL80211_CHAN_WIDTH_16;
-
- pr_err("unknown channel width for channel at %dKHz?\n",
- ieee80211_channel_to_khz(chan));
-
- return NL80211_CHAN_WIDTH_1;
-}
-EXPORT_SYMBOL(ieee80211_s1g_channel_width);
-
int ieee80211_freq_khz_to_channel(u32 freq)
{
/* TODO: just handle MHz for now */
@@ -2584,7 +2557,7 @@ int cfg80211_get_radio_idx_by_chan(struct wiphy *wiphy,
}
}
- return -ENOENT;
+ return -EINVAL;
}
EXPORT_SYMBOL(cfg80211_get_radio_idx_by_chan);
@@ -2992,7 +2965,7 @@ bool cfg80211_radio_chandef_valid(const struct wiphy_radio *radio,
u32 freq, width;
freq = ieee80211_chandef_to_khz(chandef);
- width = cfg80211_chandef_get_width(chandef);
+ width = MHZ_TO_KHZ(cfg80211_chandef_get_width(chandef));
if (!ieee80211_radio_freq_range_valid(radio, freq, width))
return false;
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 72e34bd2d925..7b0c68a70888 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -618,11 +618,16 @@ static void xsk_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
}
-static void xsk_set_destructor_arg(struct sk_buff *skb, u64 addr)
+static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
+ u64 addr)
{
BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
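+ /* Stamp the socket-derived fields once here; this init is shared
+ * by the copy and zerocopy TX paths.
+ */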
+ skb->dev = xs->dev;
+ skb->priority = READ_ONCE(xs->sk.sk_priority);
+ skb->mark = READ_ONCE(xs->sk.sk_mark);
XSKCB(skb)->num_descs = 0;
+ skb->destructor = xsk_destruct_skb;
skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
}
@@ -652,6 +657,45 @@ static void xsk_drop_skb(struct sk_buff *skb)
xsk_consume_skb(skb);
}
+static int xsk_skb_metadata(struct sk_buff *skb, void *buffer,
+ struct xdp_desc *desc, struct xsk_buff_pool *pool,
+ u32 hr)
+{
+ struct xsk_tx_metadata *meta = NULL;
+
+ if (unlikely(pool->tx_metadata_len == 0))
+ return -EINVAL;
+
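+ /* The TX metadata block sits immediately before the packet data in
+ * the umem frame.
+ */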
+ meta = buffer - pool->tx_metadata_len;
+ if (unlikely(!xsk_buff_valid_tx_metadata(meta)))
+ return -EINVAL;
+
+ if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
+ if (unlikely(meta->request.csum_start +
+ meta->request.csum_offset +
+ sizeof(__sum16) > desc->len))
+ return -EINVAL;
+
+ skb->csum_start = hr + meta->request.csum_start;
+ skb->csum_offset = meta->request.csum_offset;
+ skb->ip_summed = CHECKSUM_PARTIAL;
+
+ if (unlikely(pool->tx_sw_csum)) {
+ int err;
+
+ err = skb_checksum_help(skb);
+ if (err)
+ return err;
+ }
+ }
+
+ if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
+ skb->skb_mstamp_ns = meta->request.launch_time;
+ xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
+
+ return 0;
+}
+
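For reference (not part of the patch): the metadata consumed above is laid
out by userspace immediately before the packet payload. A minimal sketch of
the producer side, assuming the umem was registered with tx_metadata_len ==
sizeof(struct xsk_tx_metadata); the helper name is hypothetical.

	#include <linux/if_xdp.h>
	#include <string.h>

	/* Request checksum offload for one AF_XDP TX descriptor. */
	static void fill_tx_csum_meta(void *umem_area, struct xdp_desc *desc,
				      __u16 csum_start, __u16 csum_offset)
	{
		/* Metadata lives right before the packet bytes. */
		struct xsk_tx_metadata *meta =
			(struct xsk_tx_metadata *)
				((char *)umem_area + desc->addr) - 1;

		memset(meta, 0, sizeof(*meta));
		meta->flags = XDP_TXMD_FLAGS_CHECKSUM;
		meta->request.csum_start = csum_start;
		meta->request.csum_offset = csum_offset;

		/* Tell the kernel the descriptor carries metadata. */
		desc->options |= XDP_TX_METADATA;
	}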
static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
struct xdp_desc *desc)
{
@@ -664,6 +708,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
int err, i;
u64 addr;
+ addr = desc->addr;
+ buffer = xsk_buff_raw_get_data(pool, addr);
+
if (!skb) {
hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
@@ -673,7 +720,12 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
skb_reserve(skb, hr);
- xsk_set_destructor_arg(skb, desc->addr);
+ xsk_skb_init_misc(skb, xs, desc->addr);
+ if (desc->options & XDP_TX_METADATA) {
+ err = xsk_skb_metadata(skb, buffer, desc, pool, hr);
+ if (unlikely(err))
+ return ERR_PTR(err);
+ }
} else {
xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
if (!xsk_addr)
@@ -687,11 +739,9 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
}
- addr = desc->addr;
len = desc->len;
ts = pool->unaligned ? len : pool->chunk_size;
- buffer = xsk_buff_raw_get_data(pool, addr);
offset = offset_in_page(buffer);
addr = buffer - pool->addrs;
@@ -722,16 +772,15 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
struct xdp_desc *desc)
{
- struct xsk_tx_metadata *meta = NULL;
struct net_device *dev = xs->dev;
struct sk_buff *skb = xs->skb;
- bool first_frag = false;
int err;
if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
skb = xsk_build_skb_zerocopy(xs, desc);
if (IS_ERR(skb)) {
err = PTR_ERR(skb);
+ skb = NULL;
goto free_err;
}
} else {
@@ -742,8 +791,6 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
len = desc->len;
if (!skb) {
- first_frag = true;
-
hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
tr = dev->needed_tailroom;
skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
@@ -757,7 +804,13 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
if (unlikely(err))
goto free_err;
- xsk_set_destructor_arg(skb, desc->addr);
+ xsk_skb_init_misc(skb, xs, desc->addr);
+ if (desc->options & XDP_TX_METADATA) {
+ err = xsk_skb_metadata(skb, buffer, desc,
+ xs->pool, hr);
+ if (unlikely(err))
+ goto free_err;
+ }
} else {
int nr_frags = skb_shinfo(skb)->nr_frags;
struct xsk_addr_node *xsk_addr;
@@ -792,54 +845,14 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
xsk_addr->addr = desc->addr;
list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
}
-
- if (first_frag && desc->options & XDP_TX_METADATA) {
- if (unlikely(xs->pool->tx_metadata_len == 0)) {
- err = -EINVAL;
- goto free_err;
- }
-
- meta = buffer - xs->pool->tx_metadata_len;
- if (unlikely(!xsk_buff_valid_tx_metadata(meta))) {
- err = -EINVAL;
- goto free_err;
- }
-
- if (meta->flags & XDP_TXMD_FLAGS_CHECKSUM) {
- if (unlikely(meta->request.csum_start +
- meta->request.csum_offset +
- sizeof(__sum16) > len)) {
- err = -EINVAL;
- goto free_err;
- }
-
- skb->csum_start = hr + meta->request.csum_start;
- skb->csum_offset = meta->request.csum_offset;
- skb->ip_summed = CHECKSUM_PARTIAL;
-
- if (unlikely(xs->pool->tx_sw_csum)) {
- err = skb_checksum_help(skb);
- if (err)
- goto free_err;
- }
- }
-
- if (meta->flags & XDP_TXMD_FLAGS_LAUNCH_TIME)
- skb->skb_mstamp_ns = meta->request.launch_time;
- }
}
- skb->dev = dev;
- skb->priority = READ_ONCE(xs->sk.sk_priority);
- skb->mark = READ_ONCE(xs->sk.sk_mark);
- skb->destructor = xsk_destruct_skb;
- xsk_tx_metadata_to_compl(meta, &skb_shinfo(skb)->xsk_meta);
xsk_inc_num_desc(skb);
return skb;
free_err:
- if (first_frag && skb)
+ if (skb && !skb_shinfo(skb)->nr_frags)
kfree_skb(skb);
if (err == -EOVERFLOW) {
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index c5035a9bc3bb..62486f866975 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2594,7 +2594,7 @@ xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl,
static dscp_t xfrm_get_dscp(const struct flowi *fl, int family)
{
if (family == AF_INET)
- return inet_dsfield_to_dscp(fl->u.ip4.flowi4_tos);
+ return fl->u.ip4.flowi4_dscp;
return 0;
}
@@ -3462,7 +3462,7 @@ decode_session4(const struct xfrm_flow_keys *flkeys, struct flowi *fl, bool reve
}
fl4->flowi4_proto = flkeys->basic.ip_proto;
- fl4->flowi4_tos = flkeys->ip.tos & ~INET_ECN_MASK;
+ fl4->flowi4_dscp = inet_dsfield_to_dscp(flkeys->ip.tos);
}
#if IS_ENABLED(CONFIG_IPV6)
@@ -3594,7 +3594,7 @@ static bool xfrm_icmp_flow_decode(struct sk_buff *skb, unsigned short family,
fl1->flowi_oif = fl->flowi_oif;
fl1->flowi_mark = fl->flowi_mark;
- fl1->flowi_tos = fl->flowi_tos;
+ fl1->flowi_dscp = fl->flowi_dscp;
nf_nat_decode_session(newskb, fl1, family);
ret = false;
@@ -3881,12 +3881,18 @@ int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
}
skb_dst_force(skb);
- if (!skb_dst(skb)) {
+ dst = skb_dst(skb);
+ if (!dst) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
return 0;
}
- dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE);
+ /* Ignore the return value of skb_dstref_steal(); xfrm_lookup()
+ * takes care of dropping the refcnt if needed.
+ */
+ skb_dstref_steal(skb);
+
+ dst = xfrm_lookup(net, dst, &fl, NULL, XFRM_LOOKUP_QUEUE);
if (IS_ERR(dst)) {
res = 0;
dst = NULL;
diff --git a/net/xfrm/xfrm_proc.c b/net/xfrm/xfrm_proc.c
index 8e07dd614b0b..5e1fd6b1d503 100644
--- a/net/xfrm/xfrm_proc.c
+++ b/net/xfrm/xfrm_proc.c
@@ -45,21 +45,21 @@ static const struct snmp_mib xfrm_mib_list[] = {
SNMP_MIB_ITEM("XfrmInStateDirError", LINUX_MIB_XFRMINSTATEDIRERROR),
SNMP_MIB_ITEM("XfrmInIptfsError", LINUX_MIB_XFRMINIPTFSERROR),
SNMP_MIB_ITEM("XfrmOutNoQueueSpace", LINUX_MIB_XFRMOUTNOQSPACE),
- SNMP_MIB_SENTINEL
};
static int xfrm_statistics_seq_show(struct seq_file *seq, void *v)
{
- unsigned long buff[LINUX_MIB_XFRMMAX];
+ unsigned long buff[ARRAY_SIZE(xfrm_mib_list)];
+ const int cnt = ARRAY_SIZE(xfrm_mib_list);
struct net *net = seq->private;
int i;
- memset(buff, 0, sizeof(unsigned long) * LINUX_MIB_XFRMMAX);
+ memset(buff, 0, sizeof(buff));
xfrm_state_update_stats(net);
- snmp_get_cpu_field_batch(buff, xfrm_mib_list,
- net->mib.xfrm_statistics);
- for (i = 0; xfrm_mib_list[i].name; i++)
+ snmp_get_cpu_field_batch_cnt(buff, xfrm_mib_list, cnt,
+ net->mib.xfrm_statistics);
+ for (i = 0; i < cnt; i++)
seq_printf(seq, "%-24s\t%lu\n", xfrm_mib_list[i].name,
buff[i]);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 684239018bec..010c9e6638c0 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -593,7 +593,7 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props,
if (!p)
return -ENOMEM;
- strcpy(p->alg_name, algo->name);
+ strscpy(p->alg_name, algo->name);
*algpp = p;
return 0;
}
@@ -620,7 +620,7 @@ static int attach_crypt(struct xfrm_state *x, struct nlattr *rta,
if (!p)
return -ENOMEM;
- strcpy(p->alg_name, algo->name);
+ strscpy(p->alg_name, algo->name);
x->ealg = p;
x->geniv = algo->uinfo.encr.geniv;
return 0;
@@ -649,7 +649,7 @@ static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props,
if (!p)
return -ENOMEM;
- strcpy(p->alg_name, algo->name);
+ strscpy(p->alg_name, algo->name);
p->alg_key_len = ualg->alg_key_len;
p->alg_trunc_len = algo->uinfo.auth.icv_truncbits;
memcpy(p->alg_key, ualg->alg_key, (ualg->alg_key_len + 7) / 8);
@@ -684,7 +684,7 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props,
if (!p)
return -ENOMEM;
- strcpy(p->alg_name, algo->name);
+ strscpy(p->alg_name, algo->name);
if (!p->alg_trunc_len)
p->alg_trunc_len = algo->uinfo.auth.icv_truncbits;
@@ -714,7 +714,7 @@ static int attach_aead(struct xfrm_state *x, struct nlattr *rta,
if (!p)
return -ENOMEM;
- strcpy(p->alg_name, algo->name);
+ strscpy(p->alg_name, algo->name);
x->aead = p;
x->geniv = algo->uinfo.aead.geniv;
return 0;