Diffstat (limited to 'net/core')
-rw-r--r--  net/core/Makefile           |   3
-rw-r--r--  net/core/dev.c              |  24
-rw-r--r--  net/core/dev.h              |   5
-rw-r--r--  net/core/dev_ioctl.c        |  60
-rw-r--r--  net/core/devmem.c           |  27
-rw-r--r--  net/core/devmem.h           |  17
-rw-r--r--  net/core/gro.c              |   4
-rw-r--r--  net/core/neighbour.c        | 150
-rw-r--r--  net/core/net_namespace.c    |  34
-rw-r--r--  net/core/netdev_config.c    |  78
-rw-r--r--  net/core/netdev_rx_queue.c  |  53
-rw-r--r--  net/core/request_sock.c     | 127
-rw-r--r--  net/core/skbuff.c           | 166
-rw-r--r--  net/core/sock.c             |  16
-rw-r--r--  net/core/sysctl_net_core.c  |  11
15 files changed, 424 insertions, 351 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 9ef2099c5426..dc17c5a61e9a 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -3,7 +3,7 @@
# Makefile for the Linux networking core.
#
-obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \
+obj-y := sock.o skbuff.o datagram.o stream.o scm.o \
gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \
flow_dissector.o
@@ -19,6 +19,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o
obj-y += net-sysfs.o
obj-y += hotdata.o
+obj-y += netdev_config.o
obj-y += netdev_rx_queue.o
obj-y += netdev_queues.o
obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o
diff --git a/net/core/dev.c b/net/core/dev.c
index ccef685023c2..ac6bcb2a0784 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -246,12 +246,11 @@ static inline void backlog_lock_irq_disable(struct softnet_data *sd)
}
static inline void backlog_unlock_irq_restore(struct softnet_data *sd,
- unsigned long *flags)
+ unsigned long flags)
{
if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads())
- spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
- else
- local_irq_restore(*flags);
+ spin_unlock(&sd->input_pkt_queue.lock);
+ local_irq_restore(flags);
}
static inline void backlog_unlock_irq_enable(struct softnet_data *sd)
@@ -3803,7 +3802,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
inner_ip_hdr(skb) : ip_hdr(skb);
if (!(iph->frag_off & htons(IP_DF)))
- features &= ~NETIF_F_TSO_MANGLEID;
+ features &= ~dev->mangleid_features;
}
/* NETIF_F_IPV6_CSUM does not support IPv6 extension headers,
@@ -3814,8 +3813,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb,
(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 &&
vlan_get_protocol(skb) == htons(ETH_P_IPV6))) &&
skb_transport_header_was_set(skb) &&
- skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
- !ipv6_has_hopopt_jumbo(skb))
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr))
features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4);
return features;
@@ -3918,8 +3916,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb,
if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) &&
- skb_network_header_len(skb) != sizeof(struct ipv6hdr) &&
- !ipv6_has_hopopt_jumbo(skb))
+ skb_network_header_len(skb) != sizeof(struct ipv6hdr))
goto sw_checksum;
switch (skb->csum_offset) {
@@ -5260,7 +5257,7 @@ void kick_defer_list_purge(unsigned int cpu)
if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
__napi_schedule_irqoff(&sd->backlog);
- backlog_unlock_irq_restore(sd, &flags);
+ backlog_unlock_irq_restore(sd, flags);
} else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) {
smp_call_function_single_async(cpu, &sd->defer_csd);
@@ -5347,14 +5344,14 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
}
__skb_queue_tail(&sd->input_pkt_queue, skb);
tail = rps_input_queue_tail_incr(sd);
- backlog_unlock_irq_restore(sd, &flags);
+ backlog_unlock_irq_restore(sd, flags);
/* save the tail outside of the critical section */
rps_input_queue_tail_save(qtail, tail);
return NET_RX_SUCCESS;
}
- backlog_unlock_irq_restore(sd, &flags);
+ backlog_unlock_irq_restore(sd, flags);
cpu_backlog_drop:
reason = SKB_DROP_REASON_CPU_BACKLOG;
@@ -11386,6 +11383,9 @@ int register_netdevice(struct net_device *dev)
if (dev->hw_enc_features & NETIF_F_TSO)
dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
+ /* TSO_MANGLEID belongs in mangleid_features by definition */
+ dev->mangleid_features |= NETIF_F_TSO_MANGLEID;
+
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
*/
dev->vlan_features |= NETIF_F_HIGHDMA;
diff --git a/net/core/dev.h b/net/core/dev.h
index da18536cbd35..98793a738f43 100644
--- a/net/core/dev.h
+++ b/net/core/dev.h
@@ -10,6 +10,7 @@
struct net;
struct netlink_ext_ack;
+struct netdev_queue_config;
struct cpumask;
/* Random bits of netdevice that don't need to be exposed */
@@ -91,6 +92,10 @@ extern struct rw_semaphore dev_addr_sem;
extern struct list_head net_todo_list;
void netdev_run_todo(void);
+int netdev_queue_config_validate(struct net_device *dev, int rxq_idx,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack);
+
/* netdev management, shared between various uAPI entry points */
struct netdev_name_node {
struct hlist_node hlist;
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 53a53357cfef..7a8966544c9d 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -287,7 +287,7 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr)
int err;
if (!ops->ndo_hwtstamp_get)
- return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */
+ return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
@@ -414,7 +414,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
}
if (!ops->ndo_hwtstamp_set)
- return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */
+ return -EOPNOTSUPP;
if (!netif_device_present(dev))
return -ENODEV;
@@ -438,48 +438,23 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr)
return 0;
}
-static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd,
- struct kernel_hwtstamp_config *kernel_cfg)
-{
- struct ifreq ifrr;
- int err;
-
- if (!kernel_cfg->ifr)
- return -EINVAL;
-
- strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ);
- ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru;
-
- err = dev_eth_ioctl(dev, &ifrr, cmd);
- if (err)
- return err;
-
- kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru;
- kernel_cfg->copied_to_user = true;
-
- return 0;
-}
-
int generic_hwtstamp_get_lower(struct net_device *dev,
struct kernel_hwtstamp_config *kernel_cfg)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_get) {
- int err;
-
- netdev_lock_ops(dev);
- err = dev_get_hwtstamp_phylib(dev, kernel_cfg);
- netdev_unlock_ops(dev);
+ if (!ops->ndo_hwtstamp_get)
+ return -EOPNOTSUPP;
- return err;
- }
+ netdev_lock_ops(dev);
+ err = dev_get_hwtstamp_phylib(dev, kernel_cfg);
+ netdev_unlock_ops(dev);
- /* Legacy path: unconverted lower driver */
- return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg);
+ return err;
}
EXPORT_SYMBOL(generic_hwtstamp_get_lower);
@@ -488,22 +463,19 @@ int generic_hwtstamp_set_lower(struct net_device *dev,
struct netlink_ext_ack *extack)
{
const struct net_device_ops *ops = dev->netdev_ops;
+ int err;
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_set) {
- int err;
-
- netdev_lock_ops(dev);
- err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
- netdev_unlock_ops(dev);
+ if (!ops->ndo_hwtstamp_set)
+ return -EOPNOTSUPP;
- return err;
- }
+ netdev_lock_ops(dev);
+ err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+ netdev_unlock_ops(dev);
- /* Legacy path: unconverted lower driver */
- return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg);
+ return err;
}
EXPORT_SYMBOL(generic_hwtstamp_set_lower);
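With the legacy SIOCGHWTSTAMP/SIOCSHWTSTAMP fallback removed above, a lower driver must provide ndo_hwtstamp_get/ndo_hwtstamp_set or hardware timestamping requests fail with -EOPNOTSUPP. A minimal sketch of a converted driver, assuming a hypothetical "foo" driver that is not part of this patch:

#include <linux/netdevice.h>
#include <linux/net_tstamp.h>

/* Hypothetical driver state, for illustration only. */
struct foo_priv {
	bool hwts_tx_en;
	bool hwts_rx_en;
};

static int foo_hwtstamp_get(struct net_device *dev,
			    struct kernel_hwtstamp_config *cfg)
{
	struct foo_priv *priv = netdev_priv(dev);

	cfg->tx_type = priv->hwts_tx_en ? HWTSTAMP_TX_ON : HWTSTAMP_TX_OFF;
	cfg->rx_filter = priv->hwts_rx_en ? HWTSTAMP_FILTER_ALL :
					    HWTSTAMP_FILTER_NONE;
	return 0;
}

static int foo_hwtstamp_set(struct net_device *dev,
			    struct kernel_hwtstamp_config *cfg,
			    struct netlink_ext_ack *extack)
{
	struct foo_priv *priv = netdev_priv(dev);

	if (cfg->tx_type != HWTSTAMP_TX_OFF && cfg->tx_type != HWTSTAMP_TX_ON)
		return -ERANGE;

	priv->hwts_tx_en = cfg->tx_type == HWTSTAMP_TX_ON;
	priv->hwts_rx_en = cfg->rx_filter != HWTSTAMP_FILTER_NONE;
	return 0;
}

static const struct net_device_ops foo_netdev_ops = {
	/* ... other ndo callbacks ... */
	.ndo_hwtstamp_get	= foo_hwtstamp_get,
	.ndo_hwtstamp_set	= foo_hwtstamp_set,
};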
diff --git a/net/core/devmem.c b/net/core/devmem.c
index ec4217d6c0b4..63f093f7d2b2 100644
--- a/net/core/devmem.c
+++ b/net/core/devmem.c
@@ -30,11 +30,6 @@ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1);
static const struct memory_provider_ops dmabuf_devmem_ops;
-bool net_is_devmem_iov(struct net_iov *niov)
-{
- return niov->type == NET_IOV_DMABUF;
-}
-
static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool,
struct gen_pool_chunk *chunk,
void *not_used)
@@ -54,6 +49,15 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov)
((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT);
}
+static void net_devmem_dmabuf_binding_release(struct percpu_ref *ref)
+{
+ struct net_devmem_dmabuf_binding *binding =
+ container_of(ref, struct net_devmem_dmabuf_binding, ref);
+
+ INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
+ schedule_work(&binding->unbind_w);
+}
+
void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
{
struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w);
@@ -75,6 +79,7 @@ void __net_devmem_dmabuf_binding_free(struct work_struct *wq)
dma_buf_detach(binding->dmabuf, binding->attachment);
dma_buf_put(binding->dmabuf);
xa_destroy(&binding->bound_rxqs);
+ percpu_ref_exit(&binding->ref);
kvfree(binding->tx_vec);
kfree(binding);
}
@@ -143,7 +148,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding)
__net_mp_close_rxq(binding->dev, rxq_idx, &mp_params);
}
- net_devmem_dmabuf_binding_put(binding);
+ percpu_ref_kill(&binding->ref);
}
int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx,
@@ -209,7 +214,11 @@ net_devmem_bind_dmabuf(struct net_device *dev,
binding->dev = dev;
xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC);
- refcount_set(&binding->ref, 1);
+ err = percpu_ref_init(&binding->ref,
+ net_devmem_dmabuf_binding_release,
+ 0, GFP_KERNEL);
+ if (err < 0)
+ goto err_free_binding;
mutex_init(&binding->lock);
@@ -220,7 +229,7 @@ net_devmem_bind_dmabuf(struct net_device *dev,
if (IS_ERR(binding->attachment)) {
err = PTR_ERR(binding->attachment);
NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device");
- goto err_free_binding;
+ goto err_exit_ref;
}
binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment,
@@ -322,6 +331,8 @@ err_unmap:
direction);
err_detach:
dma_buf_detach(dmabuf, binding->attachment);
+err_exit_ref:
+ percpu_ref_exit(&binding->ref);
err_free_binding:
kfree(binding);
err_put_dmabuf:
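The binding's refcount_t is replaced by a percpu_ref above; the lifecycle it now follows is the standard init/tryget/put/kill/exit pattern. A generic sketch of that pattern, using an illustrative "foo" object rather than the patch's binding:

#include <linux/percpu-refcount.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo {
	struct percpu_ref ref;
	struct work_struct free_work;
};

static void foo_free_work(struct work_struct *work)
{
	struct foo *f = container_of(work, struct foo, free_work);

	percpu_ref_exit(&f->ref);	/* pairs with percpu_ref_init() */
	kfree(f);
}

static void foo_ref_release(struct percpu_ref *ref)
{
	struct foo *f = container_of(ref, struct foo, ref);

	/* Runs once the initial ref is killed and every tryget is put;
	 * defer the actual freeing to process context.
	 */
	INIT_WORK(&f->free_work, foo_free_work);
	schedule_work(&f->free_work);
}

static struct foo *foo_create(void)
{
	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return NULL;
	/* Starts with one implicit reference, like refcount_set(&ref, 1). */
	if (percpu_ref_init(&f->ref, foo_ref_release, 0, GFP_KERNEL)) {
		kfree(f);
		return NULL;
	}
	return f;
}

static void foo_destroy(struct foo *f)
{
	/* Readers use percpu_ref_tryget()/percpu_ref_put() on the fast path;
	 * killing drops the initial ref and makes further trygets fail.
	 */
	percpu_ref_kill(&f->ref);
}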
diff --git a/net/core/devmem.h b/net/core/devmem.h
index 0b43a648cd2e..1c5c18581fcb 100644
--- a/net/core/devmem.h
+++ b/net/core/devmem.h
@@ -41,7 +41,7 @@ struct net_devmem_dmabuf_binding {
* retransmits) hold a reference to the binding until the skb holding
* them is freed.
*/
- refcount_t ref;
+ struct percpu_ref ref;
/* The list of bindings currently active. Used for netlink to notify us
* of the user dropping the bind.
@@ -125,17 +125,13 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov)
static inline bool
net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding)
{
- return refcount_inc_not_zero(&binding->ref);
+ return percpu_ref_tryget(&binding->ref);
}
static inline void
net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding)
{
- if (!refcount_dec_and_test(&binding->ref))
- return;
-
- INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free);
- schedule_work(&binding->unbind_w);
+ percpu_ref_put(&binding->ref);
}
void net_devmem_get_net_iov(struct net_iov *niov);
@@ -145,7 +141,7 @@ struct net_iov *
net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding);
void net_devmem_free_dmabuf(struct net_iov *ppiov);
-bool net_is_devmem_iov(struct net_iov *niov);
+
struct net_devmem_dmabuf_binding *
net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id);
struct net_iov *
@@ -218,11 +214,6 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov)
return 0;
}
-static inline bool net_is_devmem_iov(struct net_iov *niov)
-{
- return false;
-}
-
static inline struct net_devmem_dmabuf_binding *
net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id)
{
diff --git a/net/core/gro.c b/net/core/gro.c
index 482fa7d7f598..31d21de5b15a 100644
--- a/net/core/gro.c
+++ b/net/core/gro.c
@@ -115,8 +115,6 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb)
if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) {
if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP ||
- (p->protocol == htons(ETH_P_IPV6) &&
- skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) ||
p->encapsulation)
return -E2BIG;
}
@@ -417,7 +415,7 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
{
struct skb_shared_info *pinfo = skb_shinfo(skb);
- BUG_ON(skb->end - skb->tail < grow);
+ DEBUG_NET_WARN_ON_ONCE(skb->end - skb->tail < grow);
memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 96a3b1a93252..e0897eb41c8d 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -51,9 +51,8 @@ do { \
#define PNEIGH_HASHMASK 0xF
static void neigh_timer_handler(struct timer_list *t);
-static void __neigh_notify(struct neighbour *n, int type, int flags,
- u32 pid);
-static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid);
+static void neigh_notify(struct neighbour *n, int type, int flags, u32 pid);
+static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid);
static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev,
bool skip_perm);
@@ -117,7 +116,7 @@ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb)
static void neigh_cleanup_and_release(struct neighbour *neigh)
{
trace_neigh_cleanup_and_release(neigh, 0);
- __neigh_notify(neigh, RTM_DELNEIGH, 0, 0);
+ neigh_notify(neigh, RTM_DELNEIGH, 0, 0);
call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
neigh_release(neigh);
}
@@ -1105,6 +1104,7 @@ static void neigh_timer_handler(struct timer_list *t)
{
unsigned long now, next;
struct neighbour *neigh = timer_container_of(neigh, t, timer);
+ bool skip_probe = false;
unsigned int state;
int notify = 0;
@@ -1172,9 +1172,15 @@ static void neigh_timer_handler(struct timer_list *t)
neigh_invalidate(neigh);
}
notify = 1;
- goto out;
+ skip_probe = true;
}
+ if (notify)
+ __neigh_notify(neigh, RTM_NEWNEIGH, 0, 0);
+
+ if (skip_probe)
+ goto out;
+
if (neigh->nud_state & NUD_IN_TIMER) {
if (time_before(next, jiffies + HZ/100))
next = jiffies + HZ/100;
@@ -1189,7 +1195,7 @@ out:
}
if (notify)
- neigh_update_notify(neigh, 0);
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
trace_neigh_timer_handler(neigh, 0);
@@ -1303,6 +1309,47 @@ static void neigh_update_hhs(struct neighbour *neigh)
}
}
+static void neigh_update_process_arp_queue(struct neighbour *neigh)
+ __releases(neigh->lock)
+ __acquires(neigh->lock)
+{
+ struct sk_buff *skb;
+
+ /* Again: avoid deadlock if something went wrong. */
+ while (neigh->nud_state & NUD_VALID &&
+ (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
+ struct dst_entry *dst = skb_dst(skb);
+ struct neighbour *n2, *n1 = neigh;
+
+ write_unlock_bh(&neigh->lock);
+
+ rcu_read_lock();
+
+ /* Why not just use 'neigh' as-is? The problem is that
+ * things such as shaper, eql, and sch_teql can end up
+ * using alternative, different, neigh objects to output
+ * the packet in the output path. So what we need to do
+ * here is re-lookup the top-level neigh in the path so
+ * we can reinject the packet there.
+ */
+ n2 = NULL;
+ if (dst &&
+ READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) {
+ n2 = dst_neigh_lookup_skb(dst, skb);
+ if (n2)
+ n1 = n2;
+ }
+ READ_ONCE(n1->output)(n1, skb);
+ if (n2)
+ neigh_release(n2);
+ rcu_read_unlock();
+
+ write_lock_bh(&neigh->lock);
+ }
+ __skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
+}
+
/* Generic update routine.
-- lladdr is new lladdr or NULL, if it is not supplied.
-- new is new state.
@@ -1329,6 +1376,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
struct netlink_ext_ack *extack)
{
bool gc_update = false, managed_update = false;
+ bool process_arp_queue = false;
int update_isrouter = 0;
struct net_device *dev;
int err, notify = 0;
@@ -1462,53 +1510,30 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr,
neigh_connect(neigh);
else
neigh_suspect(neigh);
- if (!(old & NUD_VALID)) {
- struct sk_buff *skb;
- /* Again: avoid dead loop if something went wrong */
+ if (!(old & NUD_VALID))
+ process_arp_queue = true;
- while (neigh->nud_state & NUD_VALID &&
- (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) {
- struct dst_entry *dst = skb_dst(skb);
- struct neighbour *n2, *n1 = neigh;
- write_unlock_bh(&neigh->lock);
-
- rcu_read_lock();
-
- /* Why not just use 'neigh' as-is? The problem is that
- * things such as shaper, eql, and sch_teql can end up
- * using alternative, different, neigh objects to output
- * the packet in the output path. So what we need to do
- * here is re-lookup the top-level neigh in the path so
- * we can reinject the packet there.
- */
- n2 = NULL;
- if (dst &&
- READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) {
- n2 = dst_neigh_lookup_skb(dst, skb);
- if (n2)
- n1 = n2;
- }
- READ_ONCE(n1->output)(n1, skb);
- if (n2)
- neigh_release(n2);
- rcu_read_unlock();
-
- write_lock_bh(&neigh->lock);
- }
- __skb_queue_purge(&neigh->arp_queue);
- neigh->arp_queue_len_bytes = 0;
- }
out:
if (update_isrouter)
neigh_update_is_router(neigh, flags, &notify);
+
+ if (notify)
+ __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid);
+
+ if (process_arp_queue)
+ neigh_update_process_arp_queue(neigh);
+
write_unlock_bh(&neigh->lock);
+
if (((new ^ old) & NUD_PERMANENT) || gc_update)
neigh_update_gc_list(neigh);
if (managed_update)
neigh_update_managed_list(neigh);
+
if (notify)
- neigh_update_notify(neigh, nlmsg_pid);
+ call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
+
trace_neigh_update_done(neigh, err);
return err;
}
@@ -2622,8 +2647,8 @@ out:
return skb->len;
}
-static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
- u32 pid, u32 seq, int type, unsigned int flags)
+static int __neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
+ u32 pid, u32 seq, int type, unsigned int flags)
{
u32 neigh_flags, neigh_flags_ext;
unsigned long now = jiffies;
@@ -2649,23 +2674,19 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key))
goto nla_put_failure;
- read_lock_bh(&neigh->lock);
ndm->ndm_state = neigh->nud_state;
if (neigh->nud_state & NUD_VALID) {
char haddr[MAX_ADDR_LEN];
neigh_ha_snapshot(haddr, neigh, neigh->dev);
- if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) {
- read_unlock_bh(&neigh->lock);
+ if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0)
goto nla_put_failure;
- }
}
ci.ndm_used = jiffies_to_clock_t(now - neigh->used);
ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed);
ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated);
ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1;
- read_unlock_bh(&neigh->lock);
if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) ||
nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
@@ -2684,6 +2705,20 @@ nla_put_failure:
return -EMSGSIZE;
}
+static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh,
+ u32 pid, u32 seq, int type, unsigned int flags)
+ __releases(neigh->lock)
+ __acquires(neigh->lock)
+{
+ int err;
+
+ read_lock_bh(&neigh->lock);
+ err = __neigh_fill_info(skb, neigh, pid, seq, type, flags);
+ read_unlock_bh(&neigh->lock);
+
+ return err;
+}
+
static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
u32 pid, u32 seq, int type, unsigned int flags,
struct neigh_table *tbl)
@@ -2727,12 +2762,6 @@ nla_put_failure:
return -EMSGSIZE;
}
-static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid)
-{
- call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh);
- __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid);
-}
-
static bool neigh_master_filtered(struct net_device *dev, int master_idx)
{
struct net_device *master;
@@ -3545,7 +3574,7 @@ static void __neigh_notify(struct neighbour *n, int type, int flags,
if (skb == NULL)
goto errout;
- err = neigh_fill_info(skb, n, pid, 0, type, flags);
+ err = __neigh_fill_info(skb, n, pid, 0, type, flags);
if (err < 0) {
/* -EMSGSIZE implies BUG in neigh_nlmsg_size() */
WARN_ON(err == -EMSGSIZE);
@@ -3560,9 +3589,16 @@ out:
rcu_read_unlock();
}
+static void neigh_notify(struct neighbour *neigh, int type, int flags, u32 pid)
+{
+ read_lock_bh(&neigh->lock);
+ __neigh_notify(neigh, type, flags, pid);
+ read_unlock_bh(&neigh->lock);
+}
+
void neigh_app_ns(struct neighbour *n)
{
- __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0);
+ neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0);
}
EXPORT_SYMBOL(neigh_app_ns);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index a6e6a964a287..aef44e617361 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -624,9 +624,10 @@ void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid)
}
EXPORT_SYMBOL_GPL(net_ns_get_ownership);
-static void unhash_nsid(struct net *net, struct net *last)
+static void unhash_nsid(struct net *last)
{
- struct net *tmp;
+ struct net *tmp, *peer;
+
/* This function is only called from cleanup_net() work,
* and this work is the only process, that may delete
* a net from net_namespace_list. So, when the below
@@ -634,22 +635,26 @@ static void unhash_nsid(struct net *net, struct net *last)
* use for_each_net_rcu() or net_rwsem.
*/
for_each_net(tmp) {
- int id;
+ int id = 0;
spin_lock(&tmp->nsid_lock);
- id = __peernet2id(tmp, net);
- if (id >= 0)
- idr_remove(&tmp->netns_ids, id);
- spin_unlock(&tmp->nsid_lock);
- if (id >= 0)
- rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL,
+ while ((peer = idr_get_next(&tmp->netns_ids, &id))) {
+ int curr_id = id;
+
+ id++;
+ if (!peer->is_dying)
+ continue;
+
+ idr_remove(&tmp->netns_ids, curr_id);
+ spin_unlock(&tmp->nsid_lock);
+ rtnl_net_notifyid(tmp, RTM_DELNSID, curr_id, 0, NULL,
GFP_KERNEL);
+ spin_lock(&tmp->nsid_lock);
+ }
+ spin_unlock(&tmp->nsid_lock);
if (tmp == last)
break;
}
- spin_lock(&net->nsid_lock);
- idr_destroy(&net->netns_ids);
- spin_unlock(&net->nsid_lock);
}
static LLIST_HEAD(cleanup_list);
@@ -674,6 +679,7 @@ static void cleanup_net(struct work_struct *work)
llist_for_each_entry(net, net_kill_list, cleanup_list) {
ns_tree_remove(net);
list_del_rcu(&net->list);
+ net->is_dying = true;
}
/* Cache last net. After we unlock rtnl, no one new net
* added to net_namespace_list can assign nsid pointer
@@ -688,8 +694,10 @@ static void cleanup_net(struct work_struct *work)
last = list_last_entry(&net_namespace_list, struct net, list);
up_write(&net_rwsem);
+ unhash_nsid(last);
+
llist_for_each_entry(net, net_kill_list, cleanup_list) {
- unhash_nsid(net, last);
+ idr_destroy(&net->netns_ids);
list_add_tail(&net->exit_list, &net_exit_list);
}
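unhash_nsid() now walks each peer's netns_ids IDR with idr_get_next(), dropping the nsid spinlock around the netlink notification. A generic sketch of that walk-with-lock-drop pattern, assuming hypothetical should_remove()/notify_removed() helpers:

#include <linux/idr.h>
#include <linux/spinlock.h>

/* Placeholders standing in for the nsid-specific check and notification. */
bool should_remove(void *entry);
void notify_removed(int id);

static void prune_ids(struct idr *idr, spinlock_t *lock)
{
	void *entry;
	int id = 0;

	spin_lock(lock);
	while ((entry = idr_get_next(idr, &id))) {
		int curr_id = id;

		id++;				/* resume after this slot */
		if (!should_remove(entry))
			continue;

		idr_remove(idr, curr_id);
		spin_unlock(lock);		/* the notification may block */
		notify_removed(curr_id);
		spin_lock(lock);
	}
	spin_unlock(lock);
}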
diff --git a/net/core/netdev_config.c b/net/core/netdev_config.c
new file mode 100644
index 000000000000..f14af365d5cd
--- /dev/null
+++ b/net/core/netdev_config.c
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#include <linux/netdevice.h>
+#include <net/netdev_queues.h>
+#include <net/netdev_rx_queue.h>
+
+#include "dev.h"
+
+static int netdev_nop_validate_qcfg(struct net_device *dev,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack)
+{
+ return 0;
+}
+
+static int __netdev_queue_config(struct net_device *dev, int rxq_idx,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack,
+ bool validate)
+{
+ int (*validate_cb)(struct net_device *dev,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack);
+ struct pp_memory_provider_params *mpp;
+ int err;
+
+ validate_cb = netdev_nop_validate_qcfg;
+ if (validate && dev->queue_mgmt_ops->ndo_validate_qcfg)
+ validate_cb = dev->queue_mgmt_ops->ndo_validate_qcfg;
+
+ memset(qcfg, 0, sizeof(*qcfg));
+
+ /* Get defaults from the driver, in case user config not set */
+ if (dev->queue_mgmt_ops->ndo_default_qcfg)
+ dev->queue_mgmt_ops->ndo_default_qcfg(dev, qcfg);
+ err = validate_cb(dev, qcfg, extack);
+ if (err)
+ return err;
+
+ /* Apply MP overrides */
+ mpp = &__netif_get_rx_queue(dev, rxq_idx)->mp_params;
+ if (mpp->rx_page_size)
+ qcfg->rx_page_size = mpp->rx_page_size;
+ err = validate_cb(dev, qcfg, extack);
+ if (err)
+ return err;
+
+ return 0;
+}
+
+/**
+ * netdev_queue_config() - get configuration for a given queue
+ * @dev: net_device instance
+ * @rxq_idx: index of the queue of interest
+ * @qcfg: queue configuration struct (output)
+ *
+ * Render the configuration for a given queue. This helper should be used
+ * by drivers which support queue configuration to retrieve config for
+ * a particular queue.
+ *
+ * @qcfg is an output parameter and is always fully initialized by this
+ * function. Some values may not be set by the user, drivers may either
+ * deal with the "unset" values in @qcfg, or provide the callback
+ * to populate defaults in queue_management_ops.
+ */
+void netdev_queue_config(struct net_device *dev, int rxq_idx,
+ struct netdev_queue_config *qcfg)
+{
+ __netdev_queue_config(dev, rxq_idx, qcfg, NULL, false);
+}
+EXPORT_SYMBOL(netdev_queue_config);
+
+int netdev_queue_config_validate(struct net_device *dev, int rxq_idx,
+ struct netdev_queue_config *qcfg,
+ struct netlink_ext_ack *extack)
+{
+ return __netdev_queue_config(dev, rxq_idx, qcfg, extack, true);
+}
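A hedged sketch of how a driver might consume the rendered per-queue config; the qcfg-taking ndo signature follows the call sites in netdev_rx_queue.c below, while the "foo" driver, its ring-memory structure, and its default page size are hypothetical:

#include <linux/netdevice.h>
#include <net/netdev_queues.h>

/* Hypothetical per-queue memory descriptor and default. */
struct foo_rx_ring_mem {
	unsigned int page_size;
};

#define FOO_DEFAULT_RX_PAGE_SIZE	PAGE_SIZE

int foo_alloc_ring(struct net_device *dev, int rxq_idx,
		   struct foo_rx_ring_mem *mem);

static int foo_queue_mem_alloc(struct net_device *dev,
			       struct netdev_queue_config *qcfg,
			       void *qmem, int rxq_idx)
{
	struct foo_rx_ring_mem *mem = qmem;

	/* Use the rendered config; fall back when the knob was left unset. */
	mem->page_size = qcfg->rx_page_size ?: FOO_DEFAULT_RX_PAGE_SIZE;

	return foo_alloc_ring(dev, rxq_idx, mem);
}

static const struct netdev_queue_mgmt_ops foo_queue_mgmt_ops = {
	.supported_params	= QCFG_RX_PAGE_SIZE,
	.ndo_queue_mem_alloc	= foo_queue_mem_alloc,
	/* ... ndo_queue_mem_free, ndo_queue_start, ndo_queue_stop ... */
};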
diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c
index c7d9341b7630..668a90658f25 100644
--- a/net/core/netdev_rx_queue.c
+++ b/net/core/netdev_rx_queue.c
@@ -7,6 +7,7 @@
#include <net/netdev_rx_queue.h>
#include <net/page_pool/memory_provider.h>
+#include "dev.h"
#include "page_pool_priv.h"
/* See also page_pool_is_unreadable() */
@@ -18,7 +19,10 @@ bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx)
}
EXPORT_SYMBOL(netif_rxq_has_unreadable_mp);
-int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
+static int netdev_rx_queue_reconfig(struct net_device *dev,
+ unsigned int rxq_idx,
+ struct netdev_queue_config *qcfg_old,
+ struct netdev_queue_config *qcfg_new)
{
struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx);
const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
@@ -41,7 +45,7 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
goto err_free_new_mem;
}
- err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx);
+ err = qops->ndo_queue_mem_alloc(dev, qcfg_new, new_mem, rxq_idx);
if (err)
goto err_free_old_mem;
@@ -54,7 +58,7 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
if (err)
goto err_free_new_queue_mem;
- err = qops->ndo_queue_start(dev, new_mem, rxq_idx);
+ err = qops->ndo_queue_start(dev, qcfg_new, new_mem, rxq_idx);
if (err)
goto err_start_queue;
} else {
@@ -76,7 +80,7 @@ err_start_queue:
* WARN if we fail to recover the old rx queue, and at least free
* old_mem so we don't also leak that.
*/
- if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) {
+ if (qops->ndo_queue_start(dev, qcfg_old, old_mem, rxq_idx)) {
WARN(1,
"Failed to restart old queue in error path. RX queue %d may be unhealthy.",
rxq_idx);
@@ -94,12 +98,22 @@ err_free_new_mem:
return err;
}
+
+int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx)
+{
+ struct netdev_queue_config qcfg;
+
+ netdev_queue_config(dev, rxq_idx, &qcfg);
+ return netdev_rx_queue_reconfig(dev, rxq_idx, &qcfg, &qcfg);
+}
EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL");
int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
const struct pp_memory_provider_params *p,
struct netlink_ext_ack *extack)
{
+ const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops;
+ struct netdev_queue_config qcfg[2];
struct netdev_rx_queue *rxq;
int ret;
@@ -124,6 +138,10 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached");
return -EEXIST;
}
+ if (p->rx_page_size && !(qops->supported_params & QCFG_RX_PAGE_SIZE)) {
+ NL_SET_ERR_MSG(extack, "device does not support: rx_page_size");
+ return -EOPNOTSUPP;
+ }
rxq = __netif_get_rx_queue(dev, rxq_idx);
if (rxq->mp_params.mp_ops) {
@@ -137,12 +155,20 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
}
#endif
+ netdev_queue_config(dev, rxq_idx, &qcfg[0]);
rxq->mp_params = *p;
- ret = netdev_rx_queue_restart(dev, rxq_idx);
- if (ret) {
- rxq->mp_params.mp_ops = NULL;
- rxq->mp_params.mp_priv = NULL;
- }
+ ret = netdev_queue_config_validate(dev, rxq_idx, &qcfg[1], extack);
+ if (ret)
+ goto err_clear_mp;
+
+ ret = netdev_rx_queue_reconfig(dev, rxq_idx, &qcfg[0], &qcfg[1]);
+ if (ret)
+ goto err_clear_mp;
+
+ return 0;
+
+err_clear_mp:
+ memset(&rxq->mp_params, 0, sizeof(rxq->mp_params));
return ret;
}
@@ -160,6 +186,7 @@ int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx,
void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
const struct pp_memory_provider_params *old_p)
{
+ struct netdev_queue_config qcfg[2];
struct netdev_rx_queue *rxq;
int err;
@@ -179,9 +206,11 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx,
rxq->mp_params.mp_priv != old_p->mp_priv))
return;
- rxq->mp_params.mp_ops = NULL;
- rxq->mp_params.mp_priv = NULL;
- err = netdev_rx_queue_restart(dev, ifq_idx);
+ netdev_queue_config(dev, ifq_idx, &qcfg[0]);
+ memset(&rxq->mp_params, 0, sizeof(rxq->mp_params));
+ netdev_queue_config(dev, ifq_idx, &qcfg[1]);
+
+ err = netdev_rx_queue_reconfig(dev, ifq_idx, &qcfg[0], &qcfg[1]);
WARN_ON(err && err != -ENETDOWN);
}
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
deleted file mode 100644
index 897a8f01a67b..000000000000
--- a/net/core/request_sock.c
+++ /dev/null
@@ -1,127 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
-/*
- * NET Generic infrastructure for Network protocols.
- *
- * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br>
- *
- * From code originally in include/net/tcp.h
- */
-
-#include <linux/module.h>
-#include <linux/random.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/tcp.h>
-#include <linux/vmalloc.h>
-
-#include <net/request_sock.h>
-
-/*
- * Maximum number of SYN_RECV sockets in queue per LISTEN socket.
- * One SYN_RECV socket costs about 80bytes on a 32bit machine.
- * It would be better to replace it with a global counter for all sockets
- * but then some measure against one socket starving all other sockets
- * would be needed.
- *
- * The minimum value of it is 128. Experiments with real servers show that
- * it is absolutely not enough even at 100conn/sec. 256 cures most
- * of problems.
- * This value is adjusted to 128 for low memory machines,
- * and it will increase in proportion to the memory of machine.
- * Note : Dont forget somaxconn that may limit backlog too.
- */
-
-void reqsk_queue_alloc(struct request_sock_queue *queue)
-{
- queue->fastopenq.rskq_rst_head = NULL;
- queue->fastopenq.rskq_rst_tail = NULL;
- queue->fastopenq.qlen = 0;
-
- queue->rskq_accept_head = NULL;
-}
-
-/*
- * This function is called to set a Fast Open socket's "fastopen_rsk" field
- * to NULL when a TFO socket no longer needs to access the request_sock.
- * This happens only after 3WHS has been either completed or aborted (e.g.,
- * RST is received).
- *
- * Before TFO, a child socket is created only after 3WHS is completed,
- * hence it never needs to access the request_sock. things get a lot more
- * complex with TFO. A child socket, accepted or not, has to access its
- * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts,
- * until 3WHS is either completed or aborted. Afterwards the req will stay
- * until either the child socket is accepted, or in the rare case when the
- * listener is closed before the child is accepted.
- *
- * In short, a request socket is only freed after BOTH 3WHS has completed
- * (or aborted) and the child socket has been accepted (or listener closed).
- * When a child socket is accepted, its corresponding req->sk is set to
- * NULL since it's no longer needed. More importantly, "req->sk == NULL"
- * will be used by the code below to determine if a child socket has been
- * accepted or not, and the check is protected by the fastopenq->lock
- * described below.
- *
- * Note that fastopen_rsk is only accessed from the child socket's context
- * with its socket lock held. But a request_sock (req) can be accessed by
- * both its child socket through fastopen_rsk, and a listener socket through
- * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin
- * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created.
- * only in the rare case when both the listener and the child locks are held,
- * e.g., in inet_csk_listen_stop() do we not need to acquire the lock.
- * The lock also protects other fields such as fastopenq->qlen, which is
- * decremented by this function when fastopen_rsk is no longer needed.
- *
- * Note that another solution was to simply use the existing socket lock
- * from the listener. But first socket lock is difficult to use. It is not
- * a simple spin lock - one must consider sock_owned_by_user() and arrange
- * to use sk_add_backlog() stuff. But what really makes it infeasible is the
- * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to
- * acquire a child's lock while holding listener's socket lock.
- *
- * This function also sets "treq->tfo_listener" to false.
- * treq->tfo_listener is used by the listener so it is protected by the
- * fastopenq->lock in this function.
- */
-void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
- bool reset)
-{
- struct sock *lsk = req->rsk_listener;
- struct fastopen_queue *fastopenq;
-
- fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq;
-
- RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL);
- spin_lock_bh(&fastopenq->lock);
- fastopenq->qlen--;
- tcp_rsk(req)->tfo_listener = false;
- if (req->sk) /* the child socket hasn't been accepted yet */
- goto out;
-
- if (!reset || lsk->sk_state != TCP_LISTEN) {
- /* If the listener has been closed don't bother with the
- * special RST handling below.
- */
- spin_unlock_bh(&fastopenq->lock);
- reqsk_put(req);
- return;
- }
- /* Wait for 60secs before removing a req that has triggered RST.
- * This is a simple defense against TFO spoofing attack - by
- * counting the req against fastopen.max_qlen, and disabling
- * TFO when the qlen exceeds max_qlen.
- *
- * For more details see CoNext'11 "TCP Fast Open" paper.
- */
- req->rsk_timer.expires = jiffies + 60*HZ;
- if (fastopenq->rskq_rst_head == NULL)
- fastopenq->rskq_rst_head = req;
- else
- fastopenq->rskq_rst_tail->dl_next = req;
-
- req->dl_next = NULL;
- fastopenq->rskq_rst_tail = req;
- fastopenq->qlen++;
-out:
- spin_unlock_bh(&fastopenq->lock);
-}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 61746c2b95f6..699c401a5eae 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -78,6 +78,7 @@
#include <net/mpls.h>
#include <net/mptcp.h>
#include <net/mctp.h>
+#include <net/can.h>
#include <net/page_pool/helpers.h>
#include <net/psp/types.h>
#include <net/dropreason.h>
@@ -280,7 +281,7 @@ EXPORT_SYMBOL(__netdev_alloc_frag_align);
*/
static u32 skbuff_cache_size __read_mostly;
-static struct sk_buff *napi_skb_cache_get(bool alloc)
+static inline struct sk_buff *napi_skb_cache_get(bool alloc)
{
struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
@@ -307,6 +308,23 @@ static struct sk_buff *napi_skb_cache_get(bool alloc)
return skb;
}
+/*
+ * Only clear those fields we need to clear, not those that we will
+ * actually initialise later. Hence, don't put any more fields after
+ * the tail pointer in struct sk_buff!
+ */
+static inline void skbuff_clear(struct sk_buff *skb)
+{
+ /* Replace memset(skb, 0, offsetof(struct sk_buff, tail))
+ * with two smaller memset(), with a barrier() between them.
+ * This forces the compiler to inline both calls.
+ */
+ BUILD_BUG_ON(offsetof(struct sk_buff, tail) <= 128);
+ memset(skb, 0, 128);
+ barrier();
+ memset((void *)skb + 128, 0, offsetof(struct sk_buff, tail) - 128);
+}
+
/**
* napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache
* @skbs: pointer to an at least @n-sized array to fill with skb pointers
@@ -357,7 +375,7 @@ get:
skbs[i] = nc->skb_cache[base + i];
kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size);
- memset(skbs[i], 0, offsetof(struct sk_buff, tail));
+ skbuff_clear(skbs[i]);
}
nc->skb_count -= n;
@@ -424,7 +442,7 @@ struct sk_buff *slab_build_skb(void *data)
if (unlikely(!skb))
return NULL;
- memset(skb, 0, offsetof(struct sk_buff, tail));
+ skbuff_clear(skb);
data = __slab_build_skb(data, &size);
__finalize_skb_around(skb, data, size);
@@ -476,7 +494,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size)
if (unlikely(!skb))
return NULL;
- memset(skb, 0, offsetof(struct sk_buff, tail));
+ skbuff_clear(skb);
__build_skb_around(skb, data, frag_size);
return skb;
@@ -537,7 +555,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size)
if (unlikely(!skb))
return NULL;
- memset(skb, 0, offsetof(struct sk_buff, tail));
+ skbuff_clear(skb);
__build_skb_around(skb, data, frag_size);
return skb;
@@ -566,6 +584,16 @@ struct sk_buff *napi_build_skb(void *data, unsigned int frag_size)
}
EXPORT_SYMBOL(napi_build_skb);
+static void *kmalloc_pfmemalloc(size_t obj_size, gfp_t flags, int node)
+{
+ if (!gfp_pfmemalloc_allowed(flags))
+ return NULL;
+ if (!obj_size)
+ return kmem_cache_alloc_node(net_hotdata.skb_small_head_cache,
+ flags, node);
+ return kmalloc_node_track_caller(obj_size, flags, node);
+}
+
/*
* kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells
* the caller if emergency pfmemalloc reserves are being used. If it is and
@@ -574,9 +602,8 @@ EXPORT_SYMBOL(napi_build_skb);
* memory is free
*/
static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
- bool *pfmemalloc)
+ struct sk_buff *skb)
{
- bool ret_pfmemalloc = false;
size_t obj_size;
void *obj;
@@ -587,12 +614,12 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
node);
*size = SKB_SMALL_HEAD_CACHE_SIZE;
- if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ if (likely(obj))
goto out;
/* Try again but now we are using pfmemalloc reserves */
- ret_pfmemalloc = true;
- obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node);
- goto out;
+ if (skb)
+ skb->pfmemalloc = true;
+ return kmalloc_pfmemalloc(0, flags, node);
}
obj_size = kmalloc_size_roundup(obj_size);
@@ -608,17 +635,14 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
obj = kmalloc_node_track_caller(obj_size,
flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
node);
- if (obj || !(gfp_pfmemalloc_allowed(flags)))
+ if (likely(obj))
goto out;
/* Try again but now we are using pfmemalloc reserves */
- ret_pfmemalloc = true;
- obj = kmalloc_node_track_caller(obj_size, flags, node);
-
+ if (skb)
+ skb->pfmemalloc = true;
+ obj = kmalloc_pfmemalloc(obj_size, flags, node);
out:
- if (pfmemalloc)
- *pfmemalloc = ret_pfmemalloc;
-
return obj;
}
@@ -650,7 +674,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
{
struct sk_buff *skb = NULL;
struct kmem_cache *cache;
- bool pfmemalloc;
u8 *data;
if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX))
@@ -680,37 +703,35 @@ fallback:
if (unlikely(!skb))
return NULL;
}
- prefetchw(skb);
+ skbuff_clear(skb);
/* We do our best to align skb_shared_info on a separate cache
* line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
* aligned memory blocks, unless SLUB/SLAB debug is enabled.
* Both skb->head and skb_shared_info are cache line aligned.
*/
- data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
+ data = kmalloc_reserve(&size, gfp_mask, node, skb);
if (unlikely(!data))
goto nodata;
/* kmalloc_size_roundup() might give us more room than requested.
* Put skb_shared_info exactly at the end of allocated zone,
* to allow max possible filling before reallocation.
*/
- prefetchw(data + SKB_WITH_OVERHEAD(size));
-
- /*
- * Only clear those fields we need to clear, not those that we will
- * actually initialise below. Hence, don't put any more fields after
- * the tail pointer in struct sk_buff!
- */
- memset(skb, 0, offsetof(struct sk_buff, tail));
- __build_skb_around(skb, data, size);
- skb->pfmemalloc = pfmemalloc;
+ __finalize_skb_around(skb, data, size);
if (flags & SKB_ALLOC_FCLONE) {
struct sk_buff_fclones *fclones;
fclones = container_of(skb, struct sk_buff_fclones, skb1);
- skb->fclone = SKB_FCLONE_ORIG;
+ /* skb->fclone is a 2bits field.
+ * Replace expensive RMW (skb->fclone = SKB_FCLONE_ORIG)
+ * with a single OR.
+ */
+ BUILD_BUG_ON(SKB_FCLONE_UNAVAILABLE != 0);
+ DEBUG_NET_WARN_ON_ONCE(skb->fclone != SKB_FCLONE_UNAVAILABLE);
+ skb->fclone |= SKB_FCLONE_ORIG;
+
refcount_set(&fclones->fclone_ref, 1);
}
@@ -1488,9 +1509,20 @@ void napi_skb_free_stolen_head(struct sk_buff *skb)
napi_skb_cache_put(skb);
}
+/**
+ * napi_consume_skb() - consume skb in NAPI context, try to feed skb cache
+ * @skb: buffer to free
+ * @budget: NAPI budget
+ *
+ * Non-zero @budget must come from the @budget argument passed by the core
+ * to a NAPI poll function. Note that core may pass budget of 0 to NAPI poll
+ * for example when polling for netpoll / netconsole.
+ *
+ * Passing @budget of 0 is safe from any context, it turns this function
+ * into dev_consume_skb_any().
+ */
void napi_consume_skb(struct sk_buff *skb, int budget)
{
- /* Zero budget indicate non-NAPI context called us, like netpoll */
if (unlikely(!budget || !skb)) {
dev_consume_skb_any(skb);
return;
@@ -5108,6 +5140,9 @@ static const u8 skb_ext_type_len[] = {
#if IS_ENABLED(CONFIG_INET_PSP)
[SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext),
#endif
+#if IS_ENABLED(CONFIG_CAN)
+ [SKB_EXT_CAN] = SKB_EXT_CHUNKSIZEOF(struct can_skb_ext),
+#endif
};
static __always_inline unsigned int skb_ext_total_length(void)
@@ -5123,7 +5158,7 @@ static __always_inline unsigned int skb_ext_total_length(void)
static void skb_extensions_init(void)
{
- BUILD_BUG_ON(SKB_EXT_NUM >= 8);
+ BUILD_BUG_ON(SKB_EXT_NUM > 8);
#if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL)
BUILD_BUG_ON(skb_ext_total_length() > 255);
#endif
@@ -7392,31 +7427,56 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes,
}
EXPORT_SYMBOL(csum_and_copy_from_iter_full);
-void get_netmem(netmem_ref netmem)
+void __get_netmem(netmem_ref netmem)
{
- struct net_iov *niov;
+ struct net_iov *niov = netmem_to_net_iov(netmem);
- if (netmem_is_net_iov(netmem)) {
- niov = netmem_to_net_iov(netmem);
- if (net_is_devmem_iov(niov))
- net_devmem_get_net_iov(netmem_to_net_iov(netmem));
- return;
- }
- get_page(netmem_to_page(netmem));
+ if (net_is_devmem_iov(niov))
+ net_devmem_get_net_iov(netmem_to_net_iov(netmem));
}
-EXPORT_SYMBOL(get_netmem);
+EXPORT_SYMBOL(__get_netmem);
-void put_netmem(netmem_ref netmem)
+void __put_netmem(netmem_ref netmem)
{
- struct net_iov *niov;
+ struct net_iov *niov = netmem_to_net_iov(netmem);
- if (netmem_is_net_iov(netmem)) {
- niov = netmem_to_net_iov(netmem);
- if (net_is_devmem_iov(niov))
- net_devmem_put_net_iov(netmem_to_net_iov(netmem));
- return;
+ if (net_is_devmem_iov(niov))
+ net_devmem_put_net_iov(netmem_to_net_iov(netmem));
+}
+EXPORT_SYMBOL(__put_netmem);
+
+struct vlan_type_depth __vlan_get_protocol_offset(const struct sk_buff *skb,
+ __be16 type,
+ int mac_offset)
+{
+ unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH;
+
+ /* if type is 802.1Q/AD then the header should already be
+ * present at mac_len - VLAN_HLEN (if mac_len > 0), or at
+ * ETH_HLEN otherwise
+ */
+ if (vlan_depth) {
+ if (WARN_ON_ONCE(vlan_depth < VLAN_HLEN))
+ return (struct vlan_type_depth) { 0 };
+ vlan_depth -= VLAN_HLEN;
+ } else {
+ vlan_depth = ETH_HLEN;
}
+ do {
+ struct vlan_hdr vhdr, *vh;
+
+ vh = skb_header_pointer(skb, mac_offset + vlan_depth,
+ sizeof(vhdr), &vhdr);
+ if (unlikely(!vh || !--parse_depth))
+ return (struct vlan_type_depth) { 0 };
- put_page(netmem_to_page(netmem));
+ type = vh->h_vlan_encapsulated_proto;
+ vlan_depth += VLAN_HLEN;
+ } while (eth_type_vlan(type));
+
+ return (struct vlan_type_depth) {
+ .type = type,
+ .depth = vlan_depth
+ };
}
-EXPORT_SYMBOL(put_netmem);
+EXPORT_SYMBOL(__vlan_get_protocol_offset);
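The new napi_consume_skb() kernel-doc above spells out the budget contract; a short sketch of the intended call site in a driver's poll function, with hypothetical "foo" helpers:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hypothetical ring and driver helpers. */
struct foo_ring {
	struct napi_struct napi;
};

struct sk_buff *foo_reap_completed_tx(struct foo_ring *ring);
int foo_clean_rx(struct foo_ring *ring, int budget);

static int foo_poll(struct napi_struct *napi, int budget)
{
	struct foo_ring *ring = container_of(napi, struct foo_ring, napi);
	struct sk_buff *skb;
	int work_done;

	/* budget may be 0 (e.g. netpoll); napi_consume_skb() then degrades
	 * to dev_consume_skb_any() as documented above.
	 */
	while ((skb = foo_reap_completed_tx(ring)))
		napi_consume_skb(skb, budget);

	work_done = foo_clean_rx(ring, budget);
	if (work_done < budget)
		napi_complete_done(napi, work_done);
	return work_done;
}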
diff --git a/net/core/sock.c b/net/core/sock.c
index a1c8b47b0d56..693e6d80f501 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -4193,13 +4193,17 @@ int proto_register(struct proto *prot, int alloc_slab)
return -EINVAL;
}
if (alloc_slab) {
- prot->slab = kmem_cache_create_usercopy(prot->name,
- prot->obj_size, 0,
- SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
- prot->slab_flags,
- prot->useroffset, prot->usersize,
- NULL);
+ struct kmem_cache_args args = {
+ .useroffset = prot->useroffset,
+ .usersize = prot->usersize,
+ .freeptr_offset = prot->freeptr_offset,
+ .use_freeptr_offset = !!prot->freeptr_offset,
+ };
+ prot->slab = kmem_cache_create(prot->name, prot->obj_size,
+ &args,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
+ prot->slab_flags);
if (prot->slab == NULL) {
pr_crit("%s: Can't create sock SLAB cache!\n",
prot->name);
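proto_register() now builds the slab cache through struct kmem_cache_args. A minimal standalone sketch of that interface, assuming an illustrative cache and object layout rather than a real protocol:

#include <linux/init.h>
#include <linux/slab.h>
#include <linux/stddef.h>

/* Illustrative layout: user_area is the only usercopy-exposed region and
 * freelist_slot is a pointer-sized field the allocator may reuse as its
 * freelist pointer when use_freeptr_offset is set.
 */
struct foo_obj {
	char user_area[64];
	void *freelist_slot;
};

static struct kmem_cache *foo_cachep;

static int __init foo_cache_init(void)
{
	struct kmem_cache_args args = {
		.useroffset		= offsetof(struct foo_obj, user_area),
		.usersize		= sizeof_field(struct foo_obj, user_area),
		.freeptr_offset		= offsetof(struct foo_obj, freelist_slot),
		.use_freeptr_offset	= true,
	};

	foo_cachep = kmem_cache_create("foo_obj", sizeof(struct foo_obj),
				       &args,
				       SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);
	return foo_cachep ? 0 : -ENOMEM;
}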
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 05dd55cf8b58..03aea10073f0 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -17,6 +17,7 @@
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/sched/isolation.h>
+#include <linux/hex.h>
#include <net/ip.h>
#include <net/sock.h>
@@ -325,10 +326,16 @@ static int proc_do_dev_weight(const struct ctl_table *table, int write,
static int proc_do_rss_key(const struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
- struct ctl_table fake_table;
char buf[NETDEV_RSS_KEY_LEN * 3];
+ struct ctl_table fake_table;
+ char *pos = buf;
+
+ for (int i = 0; i < NETDEV_RSS_KEY_LEN; i++) {
+ pos = hex_byte_pack(pos, netdev_rss_key[i]);
+ *pos++ = ':';
+ }
+ *(--pos) = 0;
- snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key);
fake_table.data = buf;
fake_table.maxlen = sizeof(buf);
return proc_dostring(&fake_table, write, buffer, lenp, ppos);
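The RSS key is now rendered with hex_byte_pack() instead of the "%*phC" format specifier; a small sketch of the same loop as a standalone helper, noting the three-bytes-per-key-byte buffer sizing used above (helper name is illustrative):

#include <linux/hex.h>
#include <linux/types.h>

/* Renders @len bytes of @key as "aa:bb:cc:..."; @buf must hold len * 3 bytes,
 * two hex digits plus a separator per byte, with the final separator
 * overwritten by the NUL terminator.
 */
static void foo_format_key(char *buf, const u8 *key, int len)
{
	char *pos = buf;

	for (int i = 0; i < len; i++) {
		pos = hex_byte_pack(pos, key[i]);	/* lowercase hex, returns advanced ptr */
		*pos++ = ':';
	}
	*(--pos) = '\0';
}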