author	Linus Torvalds <torvalds@linux-foundation.org>	2026-02-11 19:31:52 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2026-02-11 19:31:52 -0800
commit	37a93dd5c49b5fda807fd204edf2547c3493319c (patch)
tree	ce1ef5a642b9ea3d7242156438eb96dc5607a752 /net/core
parent	098b6e44cbaa2d526d06af90c862d13fb414a0ec (diff)
parent	83310d613382f74070fc8b402f3f6c2af8439ead (diff)
Merge tag 'net-next-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking updates from Paolo Abeni:
 "Core & protocols:

   - A significant effort all around the stack to guide the compiler to make the right choice when inlining code, to avoid unneeded calls for small helpers and stack canary overhead in the fast path. This generates better and faster code with very small or no text size increases, as in many cases the call generated more code than the actual inlined helper.

   - Extend the AccECN implementation so that it is now functionally complete; also allow user-space to enable it on a per-network-namespace basis.

   - Add support for memory providers with large (above 4K) rx buffers. Paired with hw-gro, larger rx buffer sizes reduce the number of buffers traversing the stack, decreasing single-stream CPU usage by up to ~30%.

   - Do not add the HBH header to Big TCP GSO packets. This simplifies the RX path, the TX path and the NIC drivers, and is possible because user-space taps can now correctly interpret such packets without the HBH hint.

   - Allow IPv6 routes to be configured with a gateway address that is resolved out of a different interface than the one specified, aligning IPv6 to IPv4 behavior.

   - Multi-queue aware sch_cake. This makes it possible to scale the rate shaper of sch_cake across multiple CPUs, while still enforcing a single global rate on the interface.

   - Add support for the nbcon (new buffer console) infrastructure to netconsole, enabling lock-free, priority-based console operations that are safer in crash scenarios.

   - Improve the TCP IPv6 output path to cache the flow information, saving CPU cycles, reducing cache line misses and stack use.

   - Improve the netfilter packet tracker to resolve clashes for most protocols, avoiding unneeded drops on rare occasions.

   - Add IP6IP6 tunneling acceleration to the flowtable infrastructure.

   - Reduce the TCP socket size by one cache line.

   - Notify neighbour changes atomically, avoiding inconsistencies between the notification sequence and the actual state sequence.

   - Add vsock namespace support, allowing complete isolation of vsocks across different network namespaces.

   - Improve xsk generic performance with cache-alignment-oriented optimizations.

   - Support netconsole automatic target recovery, allowing netconsole to reestablish targets when the underlying low-level interface comes back online.

  Driver API:

   - Support for switching the working mode (automatic vs manual) of a DPLL device via netlink.

   - Introduce PHY ports representation to expose multiple front-facing media ports over a single MAC.

   - Introduce "rx-polarity" and "tx-polarity" device tree properties, to generalize polarity inversion requirements for differential signaling.

   - Add a helper to create, prepare and enable managed clocks.

  Device drivers:

   - Add Huawei hinic3 PF ethernet driver.

   - Add DWMAC glue driver for the Motorcomm YT6801 PCIe ethernet controller.

   - Add ethernet driver for MaxLinear MxL862xx switches.

   - Remove the parallel-port Ethernet driver.

   - Convert existing drivers' timestamp configuration reporting to hwtstamp_get and remove the legacy ioctl().

   - Convert existing drivers to .get_rx_ring_count(), simplifying the RX ring count retrieval. Also remove the legacy fallback path.
   - Ethernet high-speed NICs:
      - Broadcom (bnxt, bng):
         - bnxt: add FW interface update to support FEC stats histogram and NVRAM defragmentation
         - bng: add TSO and H/W GRO support
      - nVidia/Mellanox (mlx5):
         - improve latency of channel restart operations, reducing the used H/W resources
         - add TSO support for UDP over GRE over VLAN
         - add flow counters support for hardware steering (HWS) rules
         - use a static memory area to store headers for H/W GRO, leading to 12% RX throughput improvement
      - Intel (100G, ice, idpf):
         - ice: reorganize the layout of Tx and Rx rings for cacheline locality and use the __cacheline_group* macros on the new layouts
         - ice: introduce Synchronous Ethernet (SyncE) support
      - Meta (fbnic):
         - add debugfs for firmware mailbox and tx/rx ring vectors

   - Ethernet virtual:
      - geneve: introduce GRO/GSO support for double UDP encapsulation

   - Ethernet NICs consumer and embedded:
      - Synopsys (stmmac):
         - some code refactoring and cleanups
      - RealTek (r8169):
         - add support for RTL8127ATF (10G Fiber SFP)
         - add DASH and LTR support
      - Airoha:
         - AN8811HB 2.5 Gbps PHY support
      - Freescale (fec):
         - add XDP zero-copy support
      - Thunderbolt:
         - add get link settings support to allow bonding
      - Renesas:
         - add support for the RZ/G3L GBETH SoC

   - Ethernet switches:
      - Maxlinear:
         - support R(G)MII slow rate configuration
         - add support for Intel GSW150
      - Motorcomm (yt921x):
         - add DCB/QoS support
      - TI:
         - icssm-prueth: support bridging (STP/RSTP) via the switchdev framework

   - Ethernet PHYs:
      - Realtek:
         - enable SGMII and 2500Base-X in-band auto-negotiation
         - simplify and reunify the C22/C45 drivers
      - Micrel: convert bindings to DT schema

   - CAN:
      - move skb headroom content into skb extensions, making CAN metadata access more robust
      - CAN drivers:
         - rcar_canfd:
            - add support for FD-only mode
            - add support for the RZ/T2H SoC
         - sja1000: clean up the CAN state handling

   - WiFi:
      - implement EPPKE/802.1X over auth frames support
      - split up drop reasons better, removing the generic RX_DROP
      - additional FTM capabilities: 6 GHz support, supported number of spatial streams and supported number of LTF repetitions
      - better mac80211 iterators to enumerate resources
      - initial UHR (Wi-Fi 8) support for cfg80211/mac80211

   - WiFi drivers:
      - Qualcomm/Atheros:
         - ath11k: support for Channel Frequency Response measurement
         - ath12k: a significant driver refactor to support multi-wiphy devices and pave the way for future device support in the same driver (rather than splitting into ath13k)
         - ath12k: support for the QCC2072 chipset
      - Intel:
         - iwlwifi: partial Neighbor Awareness Networking (NAN) support
         - iwlwifi: initial support for U-NII-9 and IEEE 802.11bn
      - RealTek (rtw89):
         - preparations for RTL8922DE support

   - Bluetooth:
      - implement setsockopt(BT_PHY) to set the connection packet type/PHY
      - set link_policy on incoming ACL connections

   - Bluetooth drivers:
      - btusb: add support for MediaTek7920, Realtek RTL8761BU and 8851BE
      - btqca: add WCN6855 firmware priority selection feature"

* tag 'net-next-7.0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1254 commits)
  bnge/bng_re: Add a new HSI
  net: macb: Fix tx/rx malfunction after phy link down and up
  af_unix: Fix memleak of newsk in unix_stream_connect().
  net: ti: icssg-prueth: Add optional dependency on HSR
  net: dsa: add basic initial driver for MxL862xx switches
  net: mdio: add unlocked mdiodev C45 bus accessors
  net: dsa: add tag format for MxL862xx switches
  dt-bindings: net: dsa: add MaxLinear MxL862xx
  selftests: drivers: net: hw: Modify toeplitz.c to poll for packets
  octeontx2-pf: Unregister devlink on probe failure
  net: renesas: rswitch: fix forwarding offload statemachine
  ionic: Rate limit unknown xcvr type messages
  tcp: inet6_csk_xmit() optimization
  tcp: populate inet->cork.fl.u.ip6 in tcp_v6_syn_recv_sock()
  tcp: populate inet->cork.fl.u.ip6 in tcp_v6_connect()
  ipv6: inet6_csk_xmit() and inet6_csk_update_pmtu() use inet->cork.fl.u.ip6
  ipv6: use inet->cork.fl.u.ip6 and np->final in ip6_datagram_dst_update()
  ipv6: use np->final in inet6_sk_rebuild_header()
  ipv6: add daddr/final storage in struct ipv6_pinfo
  net: stmmac: qcom-ethqos: fix qcom_ethqos_serdes_powerup()
  ...
Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile | 3
-rw-r--r--  net/core/dev.c | 24
-rw-r--r--  net/core/dev.h | 5
-rw-r--r--  net/core/dev_ioctl.c | 60
-rw-r--r--  net/core/devmem.c | 27
-rw-r--r--  net/core/devmem.h | 17
-rw-r--r--  net/core/gro.c | 4
-rw-r--r--  net/core/neighbour.c | 150
-rw-r--r--  net/core/net_namespace.c | 34
-rw-r--r--  net/core/netdev_config.c | 78
-rw-r--r--  net/core/netdev_rx_queue.c | 53
-rw-r--r--  net/core/request_sock.c | 127
-rw-r--r--  net/core/skbuff.c | 166
-rw-r--r--  net/core/sock.c | 16
-rw-r--r--  net/core/sysctl_net_core.c | 11
15 files changed, 424 insertions, 351 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 9ef2099c5426..dc17c5a61e9a 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -3,7 +3,7 @@
# Makefile for the Linux networking core.
#
-obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
+obj-y := sock.o skbuff.o datagram.o stream.o scm.o \
gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \
flow_dissector.o
@@ -19,6 +19,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
obj-y += hotdata.o
+obj-y += netdev_config.o
obj-y += netdev_rx_queue.o
obj-y += netdev_queues.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
diff --git a/net/core/dev.c b/net/core/dev.c
index ccef685023c2..ac6bcb2a0784 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -246,12 +246,11 @@ static inline void backlog_lock_irq_disable(struct softnet_data *sd)
}
static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
- unsigned long *flags)
+ unsigned long flags)
{
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
- spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
- else
- local_irq_restore(*flags);
+ spin_unlock(&sd->input_pkt_queue.lock);
+ local_irq_restore(flags);
}
static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
@@ -3803,7 +3802,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
inner_ip_hdr(skb) : ip_hdr(skb);
if (!(iph->frag_off & htons(IP_DF)))
- features &= ~NETIF_F_TSO_MANGLEID;
+ features &= ~dev->mangleid_features;
}
/* NETIF_F_IPV6_CSUM does not support IPv6 extension headers,
@@ -3814,8 +3813,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
vlan_get_protocol(skb) == htons(ETH_P_IPV6))) &&
skb_transport_header_was_set(skb) &&
- skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
- !ipv6_has_hopopt_jumbo(skb))
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr))
features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4);
return features;
@@ -3918,8 +3916,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb,
if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
- skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
- !ipv6_has_hopopt_jumbo(skb))
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr))
goto sw_checksum;
switch (skb->csum_offset) {
@@ -5260,7 +5257,7 @@ void kick_defer_list_purge(unsigned int cpu)
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
__napi_schedule_irqoff(&sd->backlog);
- backlog_unlock_irq_restore(sd, &flags);
+ backlog_unlock_irq_restore(sd, flags);
} else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
smp_call_function_single_async(cpu, &sd->defer_csd);
@@ -5347,14 +5344,14 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
}
__skb_queue_tail(&sd->input_pkt_queue, skb);
tail = rps_input_queue_tail_incr(sd);
- backlog_unlock_irq_restore(sd, &flags);
+ backlog_unlock_irq_restore(sd, flags);
/* save the tail outside of the critical section */
rps_input_queue_tail_save(qtail, tail);
return NET_RX_SUCCESS;
}
- backlog_unlock_irq_restore(sd, &flags);
+ backlog_unlock_irq_restore(sd, flags);
cpu_backlog_drop:
reason = SKB_DROP_REASON_CPU_BACKLOG;
@@ -11386,6 +11383,9 @@ int register_netdevice(struct net_device *dev)
if (dev->hw_enc_features & NETIF_F_TSO)
dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
+ /* TSO_MANGLEID belongs in mangleid_features by definition */
+ dev->mangleid_features |= NETIF_F_TSO_MANGLEID;
+
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
*/
dev->vlan_features |= NETIF_F_HIGHDMA;
diff --git a/net/core/dev.h b/net/core/dev.h
index da18536cbd35..98793a738f43 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -10,6 +10,7 @@
struct net;
struct netlink_ext_ack;
+struct netdev_queue_config;
struct cpumask;
/* Random bits of netdevice that don't need to be exposed */
@@ -91,6 +92,10 @@ extern struct rw_semaphore dev_addr_sem;
extern struct list_head net_todo_list;
void netdev_run_todo(void);
+int netdev_queue_config_validate(struct net_device *dev, int rxq_idx,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack);
+
/* netdev management, shared between various uAPI entry points */
struct netdev_name_node {
struct hlist_node hlist;
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 53a53357cfef..7a8966544c9d 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -287,7 +287,7 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
int err;
if (!ops->ndo_hwtstamp_get)
- return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */
+ return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
@@ -414,7 +414,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
}
if (!ops->ndo_hwtstamp_set)
- return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */
+ return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
@@ -438,48 +438,23 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
return 0;
}
-static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd,
- struct kernel_hwtstamp_config *kernel_cfg)
-{
- struct ifreq ifrr;
- int err;
-
- if (!kernel_cfg->ifr)
- return -EINVAL;
-
- strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ);
- ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru;
-
- err = dev_eth_ioctl(dev, &ifrr, cmd);
- if (err)
- return err;
-
- kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru;
- kernel_cfg->copied_to_user = true;
-
- return 0;
-}
-
int generic_hwtstamp_get_lower(struct net_device *dev,
struct kernel_hwtstamp_config *kernel_cfg)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_get) {
- int err;
-
- netdev_lock_ops(dev);
- err = dev_get_hwtstamp_phylib(dev, kernel_cfg);
- netdev_unlock_ops(dev);
+ if (!ops->ndo_hwtstamp_get)
+ return -EOPNOTSUPP;
- return err;
- }
+ netdev_lock_ops(dev);
+ err = dev_get_hwtstamp_phylib(dev, kernel_cfg);
+ netdev_unlock_ops(dev);
- /* Legacy path: unconverted lower driver */
- return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg);
+ return err;
}
EXPORT_SYMBOL(generic_hwtstamp_get_lower);
@@ -488,22 +463,19 @@ int generic_hwtstamp_set_lower(struct net_device *dev,
struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_set) {
- int err;
-
- netdev_lock_ops(dev);
- err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
- netdev_unlock_ops(dev);
+ if (!ops->ndo_hwtstamp_set)
+ return -EOPNOTSUPP;
- return err;
- }
+ netdev_lock_ops(dev);
+ err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+ netdev_unlock_ops(dev);
- /* Legacy path: unconverted lower driver */
- return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg);
+ return err;
}
EXPORT_SYMBOL(generic_hwtstamp_set_lower);
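
[Editor's note: with the legacy SIOCGHWTSTAMP/SIOCSHWTSTAMP fallback removed above, lower drivers must provide the dedicated ndo callbacks. Below is a minimal, hypothetical driver-side sketch; the foo_* names and the cached-config approach are illustrative and not part of this patch.]

#include <linux/netdevice.h>
#include <linux/net_tstamp.h>

/* Hypothetical private state: cache the last configuration we programmed. */
struct foo_priv {
	struct kernel_hwtstamp_config hwtstamp_cfg;
};

static int foo_hwtstamp_get(struct net_device *dev,
			    struct kernel_hwtstamp_config *cfg)
{
	struct foo_priv *priv = netdev_priv(dev);

	/* Report whatever was last programmed into the hardware. */
	*cfg = priv->hwtstamp_cfg;
	return 0;
}

static int foo_hwtstamp_set(struct net_device *dev,
			    struct kernel_hwtstamp_config *cfg,
			    struct netlink_ext_ack *extack)
{
	struct foo_priv *priv = netdev_priv(dev);

	if (cfg->tx_type != HWTSTAMP_TX_OFF && cfg->tx_type != HWTSTAMP_TX_ON) {
		NL_SET_ERR_MSG(extack, "unsupported tx_type");
		return -ERANGE;
	}

	/* ... program the timestamping unit here ... */
	priv->hwtstamp_cfg = *cfg;
	return 0;
}

static const struct net_device_ops foo_netdev_ops = {
	.ndo_hwtstamp_get	= foo_hwtstamp_get,
	.ndo_hwtstamp_set	= foo_hwtstamp_set,
	/* remaining ndo_* callbacks omitted */
};

[With the fallback gone, dev_get_hwtstamp()/dev_set_hwtstamp() and the generic_hwtstamp_*_lower() helpers simply return -EOPNOTSUPP for drivers that never implement these callbacks.]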
diff --git a/net/core/devmem.c b/net/core/devmem.c
index ec4217d6c0b4..63f093f7d2b2 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -30,11 +30,6 @@ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
static const struct memory_provider_ops dmabuf_devmem_ops;
-bool net_is_devmem_iov(struct net_iov *niov)
-{
- return niov->type == NET_IOV_DMABUF;
-}
-
static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
struct gen_pool_chunk *chunk,
void *not_used)
@@ -54,6 +49,15 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
}
+static void net_devmem_dmabuf_binding_release(struct percpu_ref *ref)
+{
+ struct net_devmem_dmabuf_binding *binding =
+ container_of(ref, struct net_devmem_dmabuf_binding, ref);
+
+ INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
+ schedule_work(&binding->unbind_w);
+}
+
void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
{
struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w);
@@ -75,6 +79,7 @@ void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
dma_buf_detach(binding->dmabuf, binding->attachment);
dma_buf_put(binding->dmabuf);
xa_destroy(&binding->bound_rxqs);
+ percpu_ref_exit(&binding->ref);
kvfree(binding->tx_vec);
kfree(binding);
}
@@ -143,7 +148,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
__net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
}
- net_devmem_dmabuf_binding_put(binding);
+ percpu_ref_kill(&binding->ref);
}
int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
@@ -209,7 +214,11 @@ net_devmem_bind_dmabuf(struct net_device *dev,
binding->dev = dev;
xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
- refcount_set(&binding->ref, 1);
+ err = percpu_ref_init(&binding->ref,
+ net_devmem_dmabuf_binding_release,
+ 0, GFP_KERNEL);
+ if (err < 0)
+ goto err_free_binding;
mutex_init(&binding->lock);
@@ -220,7 +229,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
if (IS_ERR(binding->attachment)) {
err = PTR_ERR(binding->attachment);
NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
- goto err_free_binding;
+ goto err_exit_ref;
}
binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
@@ -322,6 +331,8 @@ err_unmap:
direction);
err_detach:
dma_buf_detach(dmabuf, binding->attachment);
+err_exit_ref:
+ percpu_ref_exit(&binding->ref);
err_free_binding:
kfree(binding);
err_put_dmabuf:
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 0b43a648cd2e..1c5c18581fcb 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -41,7 +41,7 @@ struct net_devmem_dmabuf_binding {
* retransmits) hold a reference to the binding until the skb holding
* them is freed.
*/
- refcount_t ref;
+ struct percpu_ref ref;
/* The list of bindings currently active. Used for netlink to notify us
* of the user dropping the bind.
@@ -125,17 +125,13 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
static inline bool
net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
{
- return refcount_inc_not_zero(&binding->ref);
+ return percpu_ref_tryget(&binding->ref);
}
static inline void
net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
{
- if (!refcount_dec_and_test(&binding->ref))
- return;
-
- INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
- schedule_work(&binding->unbind_w);
+ percpu_ref_put(&binding->ref);
}
void net_devmem_get_net_iov(struct net_iov *niov);
@@ -145,7 +141,7 @@ struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
void net_devmem_free_dmabuf(struct net_iov *ppiov);
-bool net_is_devmem_iov(struct net_iov *niov);
+
struct net_devmem_dmabuf_binding *
net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id);
struct net_iov *
@@ -218,11 +214,6 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
return 0;
}
-static inline bool net_is_devmem_iov(struct net_iov *niov)
-{
- return false;
-}
-
static inline struct net_devmem_dmabuf_binding *
net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id)
{
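
[Editor's note: for context on the refcount_t -> percpu_ref conversion above, here is a minimal, self-contained sketch of the percpu_ref lifecycle it relies on; the my_binding names are illustrative, not part of the patch.]

#include <linux/percpu-refcount.h>
#include <linux/slab.h>

struct my_binding {
	struct percpu_ref ref;
};

static void my_binding_release(struct percpu_ref *ref)
{
	struct my_binding *b = container_of(ref, struct my_binding, ref);

	/* Runs once the ref has been killed and the last put has landed. */
	percpu_ref_exit(&b->ref);
	kfree(b);
}

static struct my_binding *my_binding_create(void)
{
	struct my_binding *b = kzalloc(sizeof(*b), GFP_KERNEL);

	if (!b)
		return NULL;
	/* Starts in per-CPU (fast) mode, holding one implicit reference. */
	if (percpu_ref_init(&b->ref, my_binding_release, 0, GFP_KERNEL)) {
		kfree(b);
		return NULL;
	}
	return b;
}

/* Hot path: per-CPU increments/decrements, no shared cacheline bouncing. */
static bool my_binding_get(struct my_binding *b)
{
	return percpu_ref_tryget(&b->ref);
}

static void my_binding_put(struct my_binding *b)
{
	percpu_ref_put(&b->ref);
}

/* Teardown: switch to atomic mode and drop the initial reference. */
static void my_binding_unbind(struct my_binding *b)
{
	percpu_ref_kill(&b->ref);
}

[The patch itself defers the final free to a workqueue (INIT_WORK + schedule_work of unbind_w in net_devmem_dmabuf_binding_release), since the release callback may run from the context of the last put.]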
diff --git a/net/core/gro.c b/net/core/gro.c
index 482fa7d7f598..31d21de5b15a 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -115,8 +115,6 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
- (p->protocol == htons(ETH_P_IPV6) &&
- skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
p->encapsulation)
return -E2BIG;
}
@@ -417,7 +415,7 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
{
struct skb_shared_info *pinfo = skb_shinfo(skb);
- BUG_ON(skb->end - skb->tail < grow);
+ DEBUG_NET_WARN_ON_ONCE(skb->end - skb->tail < grow);
memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 96a3b1a93252..e0897eb41c8d 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -51,9 +51,8 @@ do { \
#define PNEIGH_HASHMASK 0xF
static void neigh_timer_handler(struct timer_list *t);
-static void __neigh_notify(struct neighbour *n, int type, int flags,
- u32 pid);
-static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
+static void neigh_notify(struct neighbour *n, int type, int flags, u32 pid);
+static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid);
static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
bool skip_perm);
@@ -117,7 +116,7 @@ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
static void neigh_cleanup_and_release(struct neighbour *neigh)
{
trace_neigh_cleanup_and_release(neigh, 0);
- __neigh_notify(neigh, RTM_DELNEIGH, 0, 0);
+ neigh_notify(neigh, RTM_DELNEIGH, 0, 0);
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
neigh_release(neigh);
}
@@ -1105,6 +1104,7 @@ static void neigh_timer_handler(struct timer_list *t)
{
unsigned long now, next;
struct neighbour *neigh = timer_container_of(neigh, t, timer);
+ bool skip_probe = false;
unsigned int state;
int notify = 0;
@@ -1172,9 +1172,15 @@ static void neigh_timer_handler(struct timer_list *t)
neigh_invalidate(neigh);
}
notify = 1;
- goto out;
+ skip_probe = true;
}
+ if (notify)
+ __neigh_notify(neigh, RTM_NEWNEIGH, 0, 0);
+
+ if (skip_probe)
+ goto out;
+
if (neigh->nud_state & NUD_IN_TIMER) {
if (time_before(next, jiffies + HZ/100))
next = jiffies + HZ/100;
@@ -1189,7 +1195,7 @@ out:
}
if (notify)
- neigh_update_notify(neigh, 0);
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
trace_neigh_timer_handler(neigh, 0);
@@ -1303,6 +1309,47 @@ static void neigh_update_hhs(struct neighbour *neigh)
}
}
+static void neigh_update_process_arp_queue(struct neighbour *neigh)
+ __releases(neigh->lock)
+ __acquires(neigh->lock)
+{
+ struct sk_buff *skb;
+
+ /* Again: avoid deadlock if something went wrong. */
+ while (neigh->nud_state & NUD_VALID &&
+ (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
+ struct dst_entry *dst = skb_dst(skb);
+ struct neighbour *n2, *n1 = neigh;
+
+ write_unlock_bh(&neigh->lock);
+
+ rcu_read_lock();
+
+ /* Why not just use 'neigh' as-is? The problem is that
+ * things such as shaper, eql, and sch_teql can end up
+ * using alternative, different, neigh objects to output
+ * the packet in the output path. So what we need to do
+ * here is re-lookup the top-level neigh in the path so
+ * we can reinject the packet there.
+ */
+ n2 = NULL;
+ if (dst &&
+ READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) {
+ n2 = dst_neigh_lookup_skb(dst, skb);
+ if (n2)
+ n1 = n2;
+ }
+ READ_ONCE(n1->output)(n1, skb);
+ if (n2)
+ neigh_release(n2);
+ rcu_read_unlock();
+
+ write_lock_bh(&neigh->lock);
+ }
+ __skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
+}
+
/* Generic update routine.
-- lladdr is new lladdr or NULL, if it is not supplied.
-- new is new state.
@@ -1329,6 +1376,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
struct netlink_ext_ack *extack)
{
bool gc_update = false, managed_update = false;
+ bool process_arp_queue = false;
int update_isrouter = 0;
struct net_device *dev;
int err, notify = 0;
@@ -1462,53 +1510,30 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
neigh_connect(neigh);
else
neigh_suspect(neigh);
- if (!(old & NUD_VALID)) {
- struct sk_buff *skb;
- /* Again: avoid dead loop if something went wrong */
+ if (!(old & NUD_VALID))
+ process_arp_queue = true;
- while (neigh->nud_state & NUD_VALID &&
- (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
- struct dst_entry *dst = skb_dst(skb);
- struct neighbour *n2, *n1 = neigh;
- write_unlock_bh(&neigh->lock);
-
- rcu_read_lock();
-
- /* Why not just use 'neigh' as-is? The problem is that
- * things such as shaper, eql, and sch_teql can end up
- * using alternative, different, neigh objects to output
- * the packet in the output path. So what we need to do
- * here is re-lookup the top-level neigh in the path so
- * we can reinject the packet there.
- */
- n2 = NULL;
- if (dst &&
- READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) {
- n2 = dst_neigh_lookup_skb(dst, skb);
- if (n2)
- n1 = n2;
- }
- READ_ONCE(n1->output)(n1, skb);
- if (n2)
- neigh_release(n2);
- rcu_read_unlock();
-
- write_lock_bh(&neigh->lock);
- }
- __skb_queue_purge(&neigh->arp_queue);
- neigh->arp_queue_len_bytes = 0;
- }
out:
if (update_isrouter)
neigh_update_is_router(neigh, flags, &notify);
+
+ if (notify)
+ __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid);
+
+ if (process_arp_queue)
+ neigh_update_process_arp_queue(neigh);
+
write_unlock_bh(&neigh->lock);
+
if (((new ^ old) & NUD_PERMANENT) || gc_update)
neigh_update_gc_list(neigh);
if (managed_update)
neigh_update_managed_list(neigh);
+
if (notify)
- neigh_update_notify(neigh, nlmsg_pid);
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
+
trace_neigh_update_done(neigh, err);
return err;
}
@@ -2622,8 +2647,8 @@ out:
return skb->len;
}
-static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
- u32 pid, u32 seq, int type, unsigned int flags)
+static int __neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
+ u32 pid, u32 seq, int type, unsigned int flags)
{
u32 neigh_flags, neigh_flags_ext;
unsigned long now = jiffies;
@@ -2649,23 +2674,19 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key))
goto nla_put_failure;
- read_lock_bh(&neigh->lock);
ndm->ndm_state = neigh->nud_state;
if (neigh->nud_state & NUD_VALID) {
char haddr[MAX_ADDR_LEN];
neigh_ha_snapshot(haddr, neigh, neigh->dev);
- if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
- read_unlock_bh(&neigh->lock);
+ if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0)
goto nla_put_failure;
- }
}
ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1;
- read_unlock_bh(&neigh->lock);
if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) ||
nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
@@ -2684,6 +2705,20 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
+ u32 pid, u32 seq, int type, unsigned int flags)
+ __releases(neigh->lock)
+ __acquires(neigh->lock)
+{
+ int err;
+
+ read_lock_bh(&neigh->lock);
+ err = __neigh_fill_info(skb, neigh, pid, seq, type, flags);
+ read_unlock_bh(&neigh->lock);
+
+ return err;
+}
+
static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
u32 pid, u32 seq, int type, unsigned int flags,
struct neigh_table *tbl)
@@ -2727,12 +2762,6 @@ nla_put_failure:
return -EMSGSIZE;
}
-static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid)
-{
- call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
- __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid);
-}
-
static bool neigh_master_filtered(struct net_device *dev, int master_idx)
{
struct net_device *master;
@@ -3545,7 +3574,7 @@ static void __neigh_notify(struct neighbour *n, int type, int flags,
if (skb == NULL)
goto errout;
- err = neigh_fill_info(skb, n, pid, 0, type, flags);
+ err = __neigh_fill_info(skb, n, pid, 0, type, flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in neigh_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -3560,9 +3589,16 @@ out:
rcu_read_unlock();
}
+static void neigh_notify(struct neighbour *neigh, int type, int flags, u32 pid)
+{
+ read_lock_bh(&neigh->lock);
+ __neigh_notify(neigh, type, flags, pid);
+ read_unlock_bh(&neigh->lock);
+}
+
void neigh_app_ns(struct neighbour *n)
{
- __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0);
+ neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0);
}
EXPORT_SYMBOL(neigh_app_ns);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a6e6a964a287..aef44e617361 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -624,9 +624,10 @@ void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid)
}
EXPORT_SYMBOL_GPL(net_ns_get_ownership);
-static void unhash_nsid(struct net *net, struct net *last)
+static void unhash_nsid(struct net *last)
{
- struct net *tmp;
+ struct net *tmp, *peer;
+
/* This function is only called from cleanup_net() work,
* and this work is the only process, that may delete
* a net from net_namespace_list. So, when the below
@@ -634,22 +635,26 @@ static void unhash_nsid(struct net *net, struct net *last)
* use for_each_net_rcu() or net_rwsem.
*/
for_each_net(tmp) {
- int id;
+ int id = 0;
spin_lock(&tmp->nsid_lock);
- id = __peernet2id(tmp, net);
- if (id >= 0)
- idr_remove(&tmp->netns_ids, id);
- spin_unlock(&tmp->nsid_lock);
- if (id >= 0)
- rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL,
+ while ((peer = idr_get_next(&tmp->netns_ids, &id))) {
+ int curr_id = id;
+
+ id++;
+ if (!peer->is_dying)
+ continue;
+
+ idr_remove(&tmp->netns_ids, curr_id);
+ spin_unlock(&tmp->nsid_lock);
+ rtnl_net_notifyid(tmp, RTM_DELNSID, curr_id, 0, NULL,
GFP_KERNEL);
+ spin_lock(&tmp->nsid_lock);
+ }
+ spin_unlock(&tmp->nsid_lock);
if (tmp == last)
break;
}
- spin_lock(&net->nsid_lock);
- idr_destroy(&net->netns_ids);
- spin_unlock(&net->nsid_lock);
}
static LLIST_HEAD(cleanup_list);
@@ -674,6 +679,7 @@ static void cleanup_net(struct work_struct *work)
llist_for_each_entry(net, net_kill_list, cleanup_list) {
ns_tree_remove(net);
list_del_rcu(&net->list);
+ net->is_dying = true;
}
/* Cache last net. After we unlock rtnl, no one new net
* added to net_namespace_list can assign nsid pointer
@@ -688,8 +694,10 @@ static void cleanup_net(struct work_struct *work)
last = list_last_entry(&net_namespace_list, struct net, list);
up_write(&net_rwsem);
+ unhash_nsid(last);
+
llist_for_each_entry(net, net_kill_list, cleanup_list) {
- unhash_nsid(net, last);
+ idr_destroy(&net->netns_ids);
list_add_tail(&net->exit_list, &net_exit_list);
}
diff --git a/net/core/netdev_config.c b/net/core/netdev_config.c
new file mode 100644
index 000000000000..f14af365d5cd
--- /dev/null
+++ b/net/core/netdev_config.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/netdevice.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+
+#include "dev.h"
+
+static int netdev_nop_validate_qcfg(struct net_device *dev,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static int __netdev_queue_config(struct net_device *dev, int rxq_idx,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack,
+ bool validate)
+{
+ int (*validate_cb)(struct net_device *dev,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack);
+ struct pp_memory_provider_params *mpp;
+ int err;
+
+ validate_cb = netdev_nop_validate_qcfg;
+ if (validate && dev->queue_mgmt_ops->ndo_validate_qcfg)
+ validate_cb = dev->queue_mgmt_ops->ndo_validate_qcfg;
+
+ memset(qcfg, 0, sizeof(*qcfg));
+
+ /* Get defaults from the driver, in case user config not set */
+ if (dev->queue_mgmt_ops->ndo_default_qcfg)
+ dev->queue_mgmt_ops->ndo_default_qcfg(dev, qcfg);
+ err = validate_cb(dev, qcfg, extack);
+ if (err)
+ return err;
+
+ /* Apply MP overrides */
+ mpp = &__netif_get_rx_queue(dev, rxq_idx)->mp_params;
+ if (mpp->rx_page_size)
+ qcfg->rx_page_size = mpp->rx_page_size;
+ err = validate_cb(dev, qcfg, extack);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+/**
+ * netdev_queue_config() - get configuration for a given queue
+ * @dev: net_device instance
+ * @rxq_idx: index of the queue of interest
+ * @qcfg: queue configuration struct (output)
+ *
+ * Render the configuration for a given queue. This helper should be used
+ * by drivers which support queue configuration to retrieve config for
+ * a particular queue.
+ *
+ * @qcfg is an output parameter and is always fully initialized by this
+ * function. Some values may not be set by the user, drivers may either
+ * deal with the "unset" values in @qcfg, or provide the callback
+ * to populate defaults in queue_management_ops.
+ */
+void netdev_queue_config(struct net_device *dev, int rxq_idx,
+ struct netdev_queue_config *qcfg)
+{
+ __netdev_queue_config(dev, rxq_idx, qcfg, NULL, false);
+}
+EXPORT_SYMBOL(netdev_queue_config);
+
+int netdev_queue_config_validate(struct net_device *dev, int rxq_idx,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack)
+{
+ return __netdev_queue_config(dev, rxq_idx, qcfg, extack, true);
+}
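
[Editor's note: a hypothetical driver-side use of the new helper, based only on what this patch exposes (netdev_queue_config() and the rx_page_size field, assumed to be declared in <net/netdev_queues.h> as the new file's includes suggest); the foo_* names and ring layout are illustrative.]

#include <linux/netdevice.h>
#include <net/netdev_queues.h>

struct foo_rx_ring {
	unsigned int page_size;
};

static void foo_setup_rx_ring(struct net_device *dev, int rxq_idx,
			      struct foo_rx_ring *ring)
{
	struct netdev_queue_config qcfg;

	/* Always fully initialized; fields the user did not set are either
	 * left at zero or filled by the driver's ndo_default_qcfg callback.
	 */
	netdev_queue_config(dev, rxq_idx, &qcfg);

	ring->page_size = qcfg.rx_page_size ?: PAGE_SIZE;
}

[The core-side consumers are in netdev_rx_queue.c below, which renders the old and new configuration around every queue restart/reconfiguration.]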
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index c7d9341b7630..668a90658f25 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -7,6 +7,7 @@
#include <net/netdev_rx_queue.h>
#include <net/page_pool/memory_provider.h>
+#include "dev.h"
#include "page_pool_priv.h"
/* See also page_pool_is_unreadable() */
@@ -18,7 +19,10 @@ bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx)
}
EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
-int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
+static int netdev_rx_queue_reconfig(struct net_device *dev,
+ unsigned int rxq_idx,
+ struct netdev_queue_config *qcfg_old,
+ struct netdev_queue_config *qcfg_new)
{
struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx);
const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
@@ -41,7 +45,7 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
goto err_free_new_mem;
}
- err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx);
+ err = qops->ndo_queue_mem_alloc(dev, qcfg_new, new_mem, rxq_idx);
if (err)
goto err_free_old_mem;
@@ -54,7 +58,7 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
if (err)
goto err_free_new_queue_mem;
- err = qops->ndo_queue_start(dev, new_mem, rxq_idx);
+ err = qops->ndo_queue_start(dev, qcfg_new, new_mem, rxq_idx);
if (err)
goto err_start_queue;
} else {
@@ -76,7 +80,7 @@ err_start_queue:
* WARN if we fail to recover the old rx queue, and at least free
* old_mem so we don't also leak that.
*/
- if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) {
+ if (qops->ndo_queue_start(dev, qcfg_old, old_mem, rxq_idx)) {
WARN(1,
"Failed to restart old queue in error path. RX queue %d may be unhealthy.",
rxq_idx);
@@ -94,12 +98,22 @@ err_free_new_mem:
return err;
}
+
+int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
+{
+ struct netdev_queue_config qcfg;
+
+ netdev_queue_config(dev, rxq_idx, &qcfg);
+ return netdev_rx_queue_reconfig(dev, rxq_idx, &qcfg, &qcfg);
+}
EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
const struct pp_memory_provider_params *p,
struct netlink_ext_ack *extack)
{
+ const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
+ struct netdev_queue_config qcfg[2];
struct netdev_rx_queue *rxq;
int ret;
@@ -124,6 +138,10 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached");
return -EEXIST;
}
+ if (p->rx_page_size && !(qops->supported_params & QCFG_RX_PAGE_SIZE)) {
+ NL_SET_ERR_MSG(extack, "device does not support: rx_page_size");
+ return -EOPNOTSUPP;
+ }
rxq = __netif_get_rx_queue(dev, rxq_idx);
if (rxq->mp_params.mp_ops) {
@@ -137,12 +155,20 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
}
#endif
+ netdev_queue_config(dev, rxq_idx, &qcfg[0]);
rxq->mp_params = *p;
- ret = netdev_rx_queue_restart(dev, rxq_idx);
- if (ret) {
- rxq->mp_params.mp_ops = NULL;
- rxq->mp_params.mp_priv = NULL;
- }
+ ret = netdev_queue_config_validate(dev, rxq_idx, &qcfg[1], extack);
+ if (ret)
+ goto err_clear_mp;
+
+ ret = netdev_rx_queue_reconfig(dev, rxq_idx, &qcfg[0], &qcfg[1]);
+ if (ret)
+ goto err_clear_mp;
+
+ return 0;
+
+err_clear_mp:
+ memset(&rxq->mp_params, 0, sizeof(rxq->mp_params));
return ret;
}
@@ -160,6 +186,7 @@ int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
const struct pp_memory_provider_params *old_p)
{
+ struct netdev_queue_config qcfg[2];
struct netdev_rx_queue *rxq;
int err;
@@ -179,9 +206,11 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
rxq->mp_params.mp_priv != old_p->mp_priv))
return;
- rxq->mp_params.mp_ops = NULL;
- rxq->mp_params.mp_priv = NULL;
- err = netdev_rx_queue_restart(dev, ifq_idx);
+ netdev_queue_config(dev, ifq_idx, &qcfg[0]);
+ memset(&rxq->mp_params, 0, sizeof(rxq->mp_params));
+ netdev_queue_config(dev, ifq_idx, &qcfg[1]);
+
+ err = netdev_rx_queue_reconfig(dev, ifq_idx, &qcfg[0], &qcfg[1]);
WARN_ON(err && err != -ENETDOWN);
}
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
deleted file mode 100644
index 897a8f01a67b..000000000000
--- a/net/core/request_sock.c
+++ /dev/null
@@ -1,127 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * NET Generic infrastructure for Network protocols.
- *
- * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * From code originally in include/net/tcp.h
- */
-
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/tcp.h>
-#include <linux/vmalloc.h>
-
-#include <net/request_sock.h>
-
-/*
- * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
- * One SYN_RECV socket costs about 80bytes on a 32bit machine.
- * It would be better to replace it with a global counter for all sockets
- * but then some measure against one socket starving all other sockets
- * would be needed.
- *
- * The minimum value of it is 128. Experiments with real servers show that
- * it is absolutely not enough even at 100conn/sec. 256 cures most
- * of problems.
- * This value is adjusted to 128 for low memory machines,
- * and it will increase in proportion to the memory of machine.
- * Note : Dont forget somaxconn that may limit backlog too.
- */
-
-void reqsk_queue_alloc(struct request_sock_queue *queue)
-{
- queue->fastopenq.rskq_rst_head = NULL;
- queue->fastopenq.rskq_rst_tail = NULL;
- queue->fastopenq.qlen = 0;
-
- queue->rskq_accept_head = NULL;
-}
-
-/*
- * This function is called to set a Fast Open socket's "fastopen_rsk" field
- * to NULL when a TFO socket no longer needs to access the request_sock.
- * This happens only after 3WHS has been either completed or aborted (e.g.,
- * RST is received).
- *
- * Before TFO, a child socket is created only after 3WHS is completed,
- * hence it never needs to access the request_sock. things get a lot more
- * complex with TFO. A child socket, accepted or not, has to access its
- * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
- * until 3WHS is either completed or aborted. Afterwards the req will stay
- * until either the child socket is accepted, or in the rare case when the
- * listener is closed before the child is accepted.
- *
- * In short, a request socket is only freed after BOTH 3WHS has completed
- * (or aborted) and the child socket has been accepted (or listener closed).
- * When a child socket is accepted, its corresponding req->sk is set to
- * NULL since it's no longer needed. More importantly, "req->sk == NULL"
- * will be used by the code below to determine if a child socket has been
- * accepted or not, and the check is protected by the fastopenq->lock
- * described below.
- *
- * Note that fastopen_rsk is only accessed from the child socket's context
- * with its socket lock held. But a request_sock (req) can be accessed by
- * both its child socket through fastopen_rsk, and a listener socket through
- * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
- * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
- * only in the rare case when both the listener and the child locks are held,
- * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
- * The lock also protects other fields such as fastopenq->qlen, which is
- * decremented by this function when fastopen_rsk is no longer needed.
- *
- * Note that another solution was to simply use the existing socket lock
- * from the listener. But first socket lock is difficult to use. It is not
- * a simple spin lock - one must consider sock_owned_by_user() and arrange
- * to use sk_add_backlog() stuff. But what really makes it infeasible is the
- * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
- * acquire a child's lock while holding listener's socket lock.
- *
- * This function also sets "treq->tfo_listener" to false.
- * treq->tfo_listener is used by the listener so it is protected by the
- * fastopenq->lock in this function.
- */
-void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
- bool reset)
-{
- struct sock *lsk = req->rsk_listener;
- struct fastopen_queue *fastopenq;
-
- fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
-
- RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL);
- spin_lock_bh(&fastopenq->lock);
- fastopenq->qlen--;
- tcp_rsk(req)->tfo_listener = false;
- if (req->sk) /* the child socket hasn't been accepted yet */
- goto out;
-
- if (!reset || lsk->sk_state != TCP_LISTEN) {
- /* If the listener has been closed don't bother with the
- * special RST handling below.
- */
- spin_unlock_bh(&fastopenq->lock);
- reqsk_put(req);
- return;
- }
- /* Wait for 60secs before removing a req that has triggered RST.
- * This is a simple defense against TFO spoofing attack - by
- * counting the req against fastopen.max_qlen, and disabling
- * TFO when the qlen exceeds max_qlen.
- *
- * For more details see CoNext'11 "TCP Fast Open" paper.
- */
- req->rsk_timer.expires = jiffies + 60*HZ;
- if (fastopenq->rskq_rst_head == NULL)
- fastopenq->rskq_rst_head = req;
- else
- fastopenq->rskq_rst_tail->dl_next = req;
-
- req->dl_next = NULL;
- fastopenq->rskq_rst_tail = req;
- fastopenq->qlen++;
-out:
- spin_unlock_bh(&fastopenq->lock);
-}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 61746c2b95f6..699c401a5eae 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -78,6 +78,7 @@
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
+#include <net/can.h>
#include <net/page_pool/helpers.h>
#include <net/psp/types.h>
#include <net/dropreason.h>
@@ -280,7 +281,7 @@ EXPORT_SYMBOL(__netdev_alloc_frag_align);
*/
static u32 skbuff_cache_size __read_mostly;
-static struct sk_buff *napi_skb_cache_get(bool alloc)
+static inline struct sk_buff *napi_skb_cache_get(bool alloc)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
@@ -307,6 +308,23 @@ static struct sk_buff *napi_skb_cache_get(bool alloc)
return skb;
}
+/*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise later. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+static inline void skbuff_clear(struct sk_buff *skb)
+{
+ /* Replace memset(skb, 0, offsetof(struct sk_buff, tail))
+ * with two smaller memset(), with a barrier() between them.
+ * This forces the compiler to inline both calls.
+ */
+ BUILD_BUG_ON(offsetof(struct sk_buff, tail) <= 128);
+ memset(skb, 0, 128);
+ barrier();
+ memset((void *)skb + 128, 0, offsetof(struct sk_buff, tail) - 128);
+}
+
/**
* napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
* @skbs: pointer to an at least @n-sized array to fill with skb pointers
@@ -357,7 +375,7 @@ get:
skbs[i] = nc->skb_cache[base + i];
kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size);
- memset(skbs[i], 0, offsetof(struct sk_buff, tail));
+ skbuff_clear(skbs[i]);
}
nc->skb_count -= n;
@@ -424,7 +442,7 @@ struct sk_buff *slab_build_skb(void *data)
if (unlikely(!skb))
return NULL;
- memset(skb, 0, offsetof(struct sk_buff, tail));
+ skbuff_clear(skb);
data = __slab_build_skb(data, &size);
__finalize_skb_around(skb, data, size);
@@ -476,7 +494,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
if (unlikely(!skb))
return NULL;
- memset(skb, 0, offsetof(struct sk_buff, tail));
+ skbuff_clear(skb);
__build_skb_around(skb, data, frag_size);
return skb;
@@ -537,7 +555,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
if (unlikely(!skb))
return NULL;
- memset(skb, 0, offsetof(struct sk_buff, tail));
+ skbuff_clear(skb);
__build_skb_around(skb, data, frag_size);
return skb;
@@ -566,6 +584,16 @@ struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
}
EXPORT_SYMBOL(napi_build_skb);
+static void *kmalloc_pfmemalloc(size_t obj_size, gfp_t flags, int node)
+{
+ if (!gfp_pfmemalloc_allowed(flags))
+ return NULL;
+ if (!obj_size)
+ return kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
+ flags, node);
+ return kmalloc_node_track_caller(obj_size, flags, node);
+}
+
/*
* kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
* the caller if emergency pfmemalloc reserves are being used. If it is and
@@ -574,9 +602,8 @@ EXPORT_SYMBOL(napi_build_skb);
* memory is free
*/
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
- bool *pfmemalloc)
+ struct sk_buff *skb)
{
- bool ret_pfmemalloc = false;
size_t obj_size;
void *obj;
@@ -587,12 +614,12 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
node);
*size = SKB_SMALL_HEAD_CACHE_SIZE;
- if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ if (likely(obj))
goto out;
/* Try again but now we are using pfmemalloc reserves */
- ret_pfmemalloc = true;
- obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
- goto out;
+ if (skb)
+ skb->pfmemalloc = true;
+ return kmalloc_pfmemalloc(0, flags, node);
}
obj_size = kmalloc_size_roundup(obj_size);
@@ -608,17 +635,14 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
obj = kmalloc_node_track_caller(obj_size,
flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
node);
- if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ if (likely(obj))
goto out;
/* Try again but now we are using pfmemalloc reserves */
- ret_pfmemalloc = true;
- obj = kmalloc_node_track_caller(obj_size, flags, node);
-
+ if (skb)
+ skb->pfmemalloc = true;
+ obj = kmalloc_pfmemalloc(obj_size, flags, node);
out:
- if (pfmemalloc)
- *pfmemalloc = ret_pfmemalloc;
-
return obj;
}
@@ -650,7 +674,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
{
struct sk_buff *skb = NULL;
struct kmem_cache *cache;
- bool pfmemalloc;
u8 *data;
if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
@@ -680,37 +703,35 @@ fallback:
if (unlikely(!skb))
return NULL;
}
- prefetchw(skb);
+ skbuff_clear(skb);
/* We do our best to align skb_shared_info on a separate cache
* line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
* aligned memory blocks, unless SLUB/SLAB debug is enabled.
* Both skb->head and skb_shared_info are cache line aligned.
*/
- data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
+ data = kmalloc_reserve(&size, gfp_mask, node, skb);
if (unlikely(!data))
goto nodata;
/* kmalloc_size_roundup() might give us more room than requested.
* Put skb_shared_info exactly at the end of allocated zone,
* to allow max possible filling before reallocation.
*/
- prefetchw(data + SKB_WITH_OVERHEAD(size));
-
- /*
- * Only clear those fields we need to clear, not those that we will
- * actually initialise below. Hence, don't put any more fields after
- * the tail pointer in struct sk_buff!
- */
- memset(skb, 0, offsetof(struct sk_buff, tail));
- __build_skb_around(skb, data, size);
- skb->pfmemalloc = pfmemalloc;
+ __finalize_skb_around(skb, data, size);
if (flags & SKB_ALLOC_FCLONE) {
struct sk_buff_fclones *fclones;
fclones = container_of(skb, struct sk_buff_fclones, skb1);
- skb->fclone = SKB_FCLONE_ORIG;
+ /* skb->fclone is a 2bits field.
+ * Replace expensive RMW (skb->fclone = SKB_FCLONE_ORIG)
+ * with a single OR.
+ */
+ BUILD_BUG_ON(SKB_FCLONE_UNAVAILABLE != 0);
+ DEBUG_NET_WARN_ON_ONCE(skb->fclone != SKB_FCLONE_UNAVAILABLE);
+ skb->fclone |= SKB_FCLONE_ORIG;
+
refcount_set(&fclones->fclone_ref, 1);
}
@@ -1488,9 +1509,20 @@ void napi_skb_free_stolen_head(struct sk_buff *skb)
napi_skb_cache_put(skb);
}
+/**
+ * napi_consume_skb() - consume skb in NAPI context, try to feed skb cache
+ * @skb: buffer to free
+ * @budget: NAPI budget
+ *
+ * Non-zero @budget must come from the @budget argument passed by the core
+ * to a NAPI poll function. Note that core may pass budget of 0 to NAPI poll
+ * for example when polling for netpoll / netconsole.
+ *
+ * Passing @budget of 0 is safe from any context, it turns this function
+ * into dev_consume_skb_any().
+ */
void napi_consume_skb(struct sk_buff *skb, int budget)
{
- /* Zero budget indicate non-NAPI context called us, like netpoll */
if (unlikely(!budget || !skb)) {
dev_consume_skb_any(skb);
return;
@@ -5108,6 +5140,9 @@ static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_INET_PSP)
[SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext),
#endif
+#if IS_ENABLED(CONFIG_CAN)
+ [SKB_EXT_CAN] = SKB_EXT_CHUNKSIZEOF(struct can_skb_ext),
+#endif
};
static __always_inline unsigned int skb_ext_total_length(void)
@@ -5123,7 +5158,7 @@ static __always_inline unsigned int skb_ext_total_length(void)
static void skb_extensions_init(void)
{
- BUILD_BUG_ON(SKB_EXT_NUM >= 8);
+ BUILD_BUG_ON(SKB_EXT_NUM > 8);
#if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL)
BUILD_BUG_ON(skb_ext_total_length() > 255);
#endif
@@ -7392,31 +7427,56 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);
-void get_netmem(netmem_ref netmem)
+void __get_netmem(netmem_ref netmem)
{
- struct net_iov *niov;
+ struct net_iov *niov = netmem_to_net_iov(netmem);
- if (netmem_is_net_iov(netmem)) {
- niov = netmem_to_net_iov(netmem);
- if (net_is_devmem_iov(niov))
- net_devmem_get_net_iov(netmem_to_net_iov(netmem));
- return;
- }
- get_page(netmem_to_page(netmem));
+ if (net_is_devmem_iov(niov))
+ net_devmem_get_net_iov(netmem_to_net_iov(netmem));
}
-EXPORT_SYMBOL(get_netmem);
+EXPORT_SYMBOL(__get_netmem);
-void put_netmem(netmem_ref netmem)
+void __put_netmem(netmem_ref netmem)
{
- struct net_iov *niov;
+ struct net_iov *niov = netmem_to_net_iov(netmem);
- if (netmem_is_net_iov(netmem)) {
- niov = netmem_to_net_iov(netmem);
- if (net_is_devmem_iov(niov))
- net_devmem_put_net_iov(netmem_to_net_iov(netmem));
- return;
+ if (net_is_devmem_iov(niov))
+ net_devmem_put_net_iov(netmem_to_net_iov(netmem));
+}
+EXPORT_SYMBOL(__put_netmem);
+
+struct vlan_type_depth __vlan_get_protocol_offset(const struct sk_buff *skb,
+ __be16 type,
+ int mac_offset)
+{
+ unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH;
+
+ /* if type is 802.1Q/AD then the header should already be
+ * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
+ * ETH_HLEN otherwise
+ */
+ if (vlan_depth) {
+ if (WARN_ON_ONCE(vlan_depth < VLAN_HLEN))
+ return (struct vlan_type_depth) { 0 };
+ vlan_depth -= VLAN_HLEN;
+ } else {
+ vlan_depth = ETH_HLEN;
}
+ do {
+ struct vlan_hdr vhdr, *vh;
+
+ vh = skb_header_pointer(skb, mac_offset + vlan_depth,
+ sizeof(vhdr), &vhdr);
+ if (unlikely(!vh || !--parse_depth))
+ return (struct vlan_type_depth) { 0 };
- put_page(netmem_to_page(netmem));
+ type = vh->h_vlan_encapsulated_proto;
+ vlan_depth += VLAN_HLEN;
+ } while (eth_type_vlan(type));
+
+ return (struct vlan_type_depth) {
+ .type = type,
+ .depth = vlan_depth
+ };
}
-EXPORT_SYMBOL(put_netmem);
+EXPORT_SYMBOL(__vlan_get_protocol_offset);
diff --git a/net/core/sock.c b/net/core/sock.c
index a1c8b47b0d56..693e6d80f501 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -4193,13 +4193,17 @@ int proto_register(struct proto *prot, int alloc_slab)
return -EINVAL;
}
if (alloc_slab) {
- prot->slab = kmem_cache_create_usercopy(prot->name,
- prot->obj_size, 0,
- SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
- prot->slab_flags,
- prot->useroffset, prot->usersize,
- NULL);
+ struct kmem_cache_args args = {
+ .useroffset = prot->useroffset,
+ .usersize = prot->usersize,
+ .freeptr_offset = prot->freeptr_offset,
+ .use_freeptr_offset = !!prot->freeptr_offset,
+ };
+ prot->slab = kmem_cache_create(prot->name, prot->obj_size,
+ &args,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
+ prot->slab_flags);
if (prot->slab == NULL) {
pr_crit("%s: Can't create sock SLAB cache!\n",
prot->name);
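
[Editor's note: for reference, the new kmem_cache_create() calling convention used above, shown in isolation; struct my_obj and its fields are illustrative.]

#include <linux/slab.h>

struct my_obj {
	u32 kernel_only;
	char user_visible[32];	/* region that may be copied to/from userspace */
};

static struct kmem_cache *my_cache;

static int my_cache_init(void)
{
	struct kmem_cache_args args = {
		.useroffset = offsetof(struct my_obj, user_visible),
		.usersize = sizeof_field(struct my_obj, user_visible),
	};

	my_cache = kmem_cache_create("my_obj", sizeof(struct my_obj), &args,
				     SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);
	return my_cache ? 0 : -ENOMEM;
}

[proto_register() additionally passes freeptr_offset/use_freeptr_offset when the protocol provides one, as the hunk above shows.]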
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 05dd55cf8b58..03aea10073f0 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -17,6 +17,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/sched/isolation.h>
+#include <linux/hex.h>
#include <net/ip.h>
#include <net/sock.h>
@@ -325,10 +326,16 @@ static int proc_do_dev_weight(const struct ctl_table *table, int write,
static int proc_do_rss_key(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- struct ctl_table fake_table;
char buf[NETDEV_RSS_KEY_LEN * 3];
+ struct ctl_table fake_table;
+ char *pos = buf;
+
+ for (int i = 0; i < NETDEV_RSS_KEY_LEN; i++) {
+ pos = hex_byte_pack(pos, netdev_rss_key[i]);
+ *pos++ = ':';
+ }
+ *(--pos) = 0;
- snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
fake_table.data = buf;
fake_table.maxlen = sizeof(buf);
return proc_dostring(&fake_table, write, buffer, lenp, ppos);