Diffstat (limited to 'net/core')

 net/core/Makefile          |   3
 net/core/dev.c             |  24
 net/core/dev.h             |   5
 net/core/dev_ioctl.c       |  60
 net/core/devmem.c          |  27
 net/core/devmem.h          |  17
 net/core/gro.c             |   4
 net/core/neighbour.c       | 150
 net/core/net_namespace.c   |  34
 net/core/netdev_config.c   |  78
 net/core/netdev_rx_queue.c |  53
 net/core/request_sock.c    | 127
 net/core/skbuff.c          | 166
 net/core/sock.c            |  16
 net/core/sysctl_net_core.c |  11

 15 files changed, 424 insertions(+), 351 deletions(-)
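A note before the diff on the queue-configuration plumbing added below (the new net/core/netdev_config.c plus the netdev_rx_queue.c rework): the core now renders a struct netdev_queue_config from driver-provided defaults and memory-provider overrides, then hands it to ndo_queue_mem_alloc()/ndo_queue_start(). What follows is only a rough driver-side sketch of the two optional callbacks, based on the call sites visible in this diff; the "mynic" driver, its PAGE_SIZE default and the power-of-two check are illustrative assumptions, not part of the patch.

/* Hypothetical driver hooks; only the callback names, their signatures and
 * the rx_page_size field are taken from the patch below, the rest is assumed.
 */
#include <linux/log2.h>
#include <linux/netdevice.h>
#include <net/netdev_queues.h>

static void mynic_default_qcfg(struct net_device *dev,
			       struct netdev_queue_config *qcfg)
{
	/* default used when neither the user nor a memory provider sets one */
	qcfg->rx_page_size = PAGE_SIZE;
}

static int mynic_validate_qcfg(struct net_device *dev,
			       struct netdev_queue_config *qcfg,
			       struct netlink_ext_ack *extack)
{
	if (!is_power_of_2(qcfg->rx_page_size)) {
		NL_SET_ERR_MSG(extack, "rx_page_size must be a power of 2");
		return -EINVAL;
	}
	return 0;
}

/* With this in place, ndo_queue_mem_alloc(dev, qcfg, per_queue_mem, idx)
 * receives the fully rendered config and can size its RX buffers from
 * qcfg->rx_page_size instead of a device-global setting.
 */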
diff --git a/net/core/Makefile b/net/core/Makefile index 9ef2099c5426..dc17c5a61e9a 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -3,7 +3,7 @@ # Makefile for the Linux networking core. # -obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ +obj-y := sock.o skbuff.o datagram.o stream.o scm.o \ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \ flow_dissector.o @@ -19,6 +19,7 @@ obj-$(CONFIG_NETDEV_ADDR_LIST_TEST) += dev_addr_lists_test.o obj-y += net-sysfs.o obj-y += hotdata.o +obj-y += netdev_config.o obj-y += netdev_rx_queue.o obj-y += netdev_queues.o obj-$(CONFIG_PAGE_POOL) += page_pool.o page_pool_user.o diff --git a/net/core/dev.c b/net/core/dev.c index ccef685023c2..ac6bcb2a0784 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -246,12 +246,11 @@ static inline void backlog_lock_irq_disable(struct softnet_data *sd) } static inline void backlog_unlock_irq_restore(struct softnet_data *sd, - unsigned long *flags) + unsigned long flags) { if (IS_ENABLED(CONFIG_RPS) || use_backlog_threads()) - spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags); - else - local_irq_restore(*flags); + spin_unlock(&sd->input_pkt_queue.lock); + local_irq_restore(flags); } static inline void backlog_unlock_irq_enable(struct softnet_data *sd) @@ -3803,7 +3802,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, inner_ip_hdr(skb) : ip_hdr(skb); if (!(iph->frag_off & htons(IP_DF))) - features &= ~NETIF_F_TSO_MANGLEID; + features &= ~dev->mangleid_features; } /* NETIF_F_IPV6_CSUM does not support IPv6 extension headers, @@ -3814,8 +3813,7 @@ static netdev_features_t gso_features_check(const struct sk_buff *skb, (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && vlan_get_protocol(skb) == htons(ETH_P_IPV6))) && skb_transport_header_was_set(skb) && - skb_network_header_len(skb) != sizeof(struct ipv6hdr) && - !ipv6_has_hopopt_jumbo(skb)) + skb_network_header_len(skb) != sizeof(struct ipv6hdr)) features &= ~(NETIF_F_IPV6_CSUM | NETIF_F_TSO6 | NETIF_F_GSO_UDP_L4); return features; @@ -3918,8 +3916,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) { if (vlan_get_protocol(skb) == htons(ETH_P_IPV6) && - skb_network_header_len(skb) != sizeof(struct ipv6hdr) && - !ipv6_has_hopopt_jumbo(skb)) + skb_network_header_len(skb) != sizeof(struct ipv6hdr)) goto sw_checksum; switch (skb->csum_offset) { @@ -5260,7 +5257,7 @@ void kick_defer_list_purge(unsigned int cpu) if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) __napi_schedule_irqoff(&sd->backlog); - backlog_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, flags); } else if (!cmpxchg(&sd->defer_ipi_scheduled, 0, 1)) { smp_call_function_single_async(cpu, &sd->defer_csd); @@ -5347,14 +5344,14 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, } __skb_queue_tail(&sd->input_pkt_queue, skb); tail = rps_input_queue_tail_incr(sd); - backlog_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, flags); /* save the tail outside of the critical section */ rps_input_queue_tail_save(qtail, tail); return NET_RX_SUCCESS; } - backlog_unlock_irq_restore(sd, &flags); + backlog_unlock_irq_restore(sd, flags); cpu_backlog_drop: reason = SKB_DROP_REASON_CPU_BACKLOG; @@ -11386,6 +11383,9 @@ int register_netdevice(struct net_device *dev) if (dev->hw_enc_features & NETIF_F_TSO) dev->hw_enc_features |= NETIF_F_TSO_MANGLEID; + /* TSO_MANGLEID belongs in mangleid_features by definition */ + dev->mangleid_features |= NETIF_F_TSO_MANGLEID; 
+ /* Make NETIF_F_HIGHDMA inheritable to VLAN devices. */ dev->vlan_features |= NETIF_F_HIGHDMA; diff --git a/net/core/dev.h b/net/core/dev.h index da18536cbd35..98793a738f43 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -10,6 +10,7 @@ struct net; struct netlink_ext_ack; +struct netdev_queue_config; struct cpumask; /* Random bits of netdevice that don't need to be exposed */ @@ -91,6 +92,10 @@ extern struct rw_semaphore dev_addr_sem; extern struct list_head net_todo_list; void netdev_run_todo(void); +int netdev_queue_config_validate(struct net_device *dev, int rxq_idx, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack); + /* netdev management, shared between various uAPI entry points */ struct netdev_name_node { struct hlist_node hlist; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 53a53357cfef..7a8966544c9d 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -287,7 +287,7 @@ static int dev_get_hwtstamp(struct net_device *dev, struct ifreq *ifr) int err; if (!ops->ndo_hwtstamp_get) - return dev_eth_ioctl(dev, ifr, SIOCGHWTSTAMP); /* legacy */ + return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; @@ -414,7 +414,7 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) } if (!ops->ndo_hwtstamp_set) - return dev_eth_ioctl(dev, ifr, SIOCSHWTSTAMP); /* legacy */ + return -EOPNOTSUPP; if (!netif_device_present(dev)) return -ENODEV; @@ -438,48 +438,23 @@ static int dev_set_hwtstamp(struct net_device *dev, struct ifreq *ifr) return 0; } -static int generic_hwtstamp_ioctl_lower(struct net_device *dev, int cmd, - struct kernel_hwtstamp_config *kernel_cfg) -{ - struct ifreq ifrr; - int err; - - if (!kernel_cfg->ifr) - return -EINVAL; - - strscpy_pad(ifrr.ifr_name, dev->name, IFNAMSIZ); - ifrr.ifr_ifru = kernel_cfg->ifr->ifr_ifru; - - err = dev_eth_ioctl(dev, &ifrr, cmd); - if (err) - return err; - - kernel_cfg->ifr->ifr_ifru = ifrr.ifr_ifru; - kernel_cfg->copied_to_user = true; - - return 0; -} - int generic_hwtstamp_get_lower(struct net_device *dev, struct kernel_hwtstamp_config *kernel_cfg) { const struct net_device_ops *ops = dev->netdev_ops; + int err; if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_get) { - int err; - - netdev_lock_ops(dev); - err = dev_get_hwtstamp_phylib(dev, kernel_cfg); - netdev_unlock_ops(dev); + if (!ops->ndo_hwtstamp_get) + return -EOPNOTSUPP; - return err; - } + netdev_lock_ops(dev); + err = dev_get_hwtstamp_phylib(dev, kernel_cfg); + netdev_unlock_ops(dev); - /* Legacy path: unconverted lower driver */ - return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg); + return err; } EXPORT_SYMBOL(generic_hwtstamp_get_lower); @@ -488,22 +463,19 @@ int generic_hwtstamp_set_lower(struct net_device *dev, struct netlink_ext_ack *extack) { const struct net_device_ops *ops = dev->netdev_ops; + int err; if (!netif_device_present(dev)) return -ENODEV; - if (ops->ndo_hwtstamp_set) { - int err; - - netdev_lock_ops(dev); - err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); - netdev_unlock_ops(dev); + if (!ops->ndo_hwtstamp_set) + return -EOPNOTSUPP; - return err; - } + netdev_lock_ops(dev); + err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack); + netdev_unlock_ops(dev); - /* Legacy path: unconverted lower driver */ - return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg); + return err; } EXPORT_SYMBOL(generic_hwtstamp_set_lower); diff --git a/net/core/devmem.c b/net/core/devmem.c index ec4217d6c0b4..63f093f7d2b2 100644 --- a/net/core/devmem.c +++ 
b/net/core/devmem.c @@ -30,11 +30,6 @@ static DEFINE_XARRAY_FLAGS(net_devmem_dmabuf_bindings, XA_FLAGS_ALLOC1); static const struct memory_provider_ops dmabuf_devmem_ops; -bool net_is_devmem_iov(struct net_iov *niov) -{ - return niov->type == NET_IOV_DMABUF; -} - static void net_devmem_dmabuf_free_chunk_owner(struct gen_pool *genpool, struct gen_pool_chunk *chunk, void *not_used) @@ -54,6 +49,15 @@ static dma_addr_t net_devmem_get_dma_addr(const struct net_iov *niov) ((dma_addr_t)net_iov_idx(niov) << PAGE_SHIFT); } +static void net_devmem_dmabuf_binding_release(struct percpu_ref *ref) +{ + struct net_devmem_dmabuf_binding *binding = + container_of(ref, struct net_devmem_dmabuf_binding, ref); + + INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); + schedule_work(&binding->unbind_w); +} + void __net_devmem_dmabuf_binding_free(struct work_struct *wq) { struct net_devmem_dmabuf_binding *binding = container_of(wq, typeof(*binding), unbind_w); @@ -75,6 +79,7 @@ void __net_devmem_dmabuf_binding_free(struct work_struct *wq) dma_buf_detach(binding->dmabuf, binding->attachment); dma_buf_put(binding->dmabuf); xa_destroy(&binding->bound_rxqs); + percpu_ref_exit(&binding->ref); kvfree(binding->tx_vec); kfree(binding); } @@ -143,7 +148,7 @@ void net_devmem_unbind_dmabuf(struct net_devmem_dmabuf_binding *binding) __net_mp_close_rxq(binding->dev, rxq_idx, &mp_params); } - net_devmem_dmabuf_binding_put(binding); + percpu_ref_kill(&binding->ref); } int net_devmem_bind_dmabuf_to_queue(struct net_device *dev, u32 rxq_idx, @@ -209,7 +214,11 @@ net_devmem_bind_dmabuf(struct net_device *dev, binding->dev = dev; xa_init_flags(&binding->bound_rxqs, XA_FLAGS_ALLOC); - refcount_set(&binding->ref, 1); + err = percpu_ref_init(&binding->ref, + net_devmem_dmabuf_binding_release, + 0, GFP_KERNEL); + if (err < 0) + goto err_free_binding; mutex_init(&binding->lock); @@ -220,7 +229,7 @@ net_devmem_bind_dmabuf(struct net_device *dev, if (IS_ERR(binding->attachment)) { err = PTR_ERR(binding->attachment); NL_SET_ERR_MSG(extack, "Failed to bind dmabuf to device"); - goto err_free_binding; + goto err_exit_ref; } binding->sgt = dma_buf_map_attachment_unlocked(binding->attachment, @@ -322,6 +331,8 @@ err_unmap: direction); err_detach: dma_buf_detach(dmabuf, binding->attachment); +err_exit_ref: + percpu_ref_exit(&binding->ref); err_free_binding: kfree(binding); err_put_dmabuf: diff --git a/net/core/devmem.h b/net/core/devmem.h index 0b43a648cd2e..1c5c18581fcb 100644 --- a/net/core/devmem.h +++ b/net/core/devmem.h @@ -41,7 +41,7 @@ struct net_devmem_dmabuf_binding { * retransmits) hold a reference to the binding until the skb holding * them is freed. */ - refcount_t ref; + struct percpu_ref ref; /* The list of bindings currently active. Used for netlink to notify us * of the user dropping the bind. 
@@ -125,17 +125,13 @@ static inline unsigned long net_iov_virtual_addr(const struct net_iov *niov) static inline bool net_devmem_dmabuf_binding_get(struct net_devmem_dmabuf_binding *binding) { - return refcount_inc_not_zero(&binding->ref); + return percpu_ref_tryget(&binding->ref); } static inline void net_devmem_dmabuf_binding_put(struct net_devmem_dmabuf_binding *binding) { - if (!refcount_dec_and_test(&binding->ref)) - return; - - INIT_WORK(&binding->unbind_w, __net_devmem_dmabuf_binding_free); - schedule_work(&binding->unbind_w); + percpu_ref_put(&binding->ref); } void net_devmem_get_net_iov(struct net_iov *niov); @@ -145,7 +141,7 @@ struct net_iov * net_devmem_alloc_dmabuf(struct net_devmem_dmabuf_binding *binding); void net_devmem_free_dmabuf(struct net_iov *ppiov); -bool net_is_devmem_iov(struct net_iov *niov); + struct net_devmem_dmabuf_binding * net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id); struct net_iov * @@ -218,11 +214,6 @@ static inline u32 net_devmem_iov_binding_id(const struct net_iov *niov) return 0; } -static inline bool net_is_devmem_iov(struct net_iov *niov) -{ - return false; -} - static inline struct net_devmem_dmabuf_binding * net_devmem_get_binding(struct sock *sk, unsigned int dmabuf_id) { diff --git a/net/core/gro.c b/net/core/gro.c index 482fa7d7f598..31d21de5b15a 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -115,8 +115,6 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) if (unlikely(p->len + len >= GRO_LEGACY_MAX_SIZE)) { if (NAPI_GRO_CB(skb)->proto != IPPROTO_TCP || - (p->protocol == htons(ETH_P_IPV6) && - skb_headroom(p) < sizeof(struct hop_jumbo_hdr)) || p->encapsulation) return -E2BIG; } @@ -417,7 +415,7 @@ static void gro_pull_from_frag0(struct sk_buff *skb, int grow) { struct skb_shared_info *pinfo = skb_shinfo(skb); - BUG_ON(skb->end - skb->tail < grow); + DEBUG_NET_WARN_ON_ONCE(skb->end - skb->tail < grow); memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 96a3b1a93252..e0897eb41c8d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -51,9 +51,8 @@ do { \ #define PNEIGH_HASHMASK 0xF static void neigh_timer_handler(struct timer_list *t); -static void __neigh_notify(struct neighbour *n, int type, int flags, - u32 pid); -static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); +static void neigh_notify(struct neighbour *n, int type, int flags, u32 pid); +static void __neigh_notify(struct neighbour *n, int type, int flags, u32 pid); static void pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev, bool skip_perm); @@ -117,7 +116,7 @@ static int neigh_blackhole(struct neighbour *neigh, struct sk_buff *skb) static void neigh_cleanup_and_release(struct neighbour *neigh) { trace_neigh_cleanup_and_release(neigh, 0); - __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); + neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); } @@ -1105,6 +1104,7 @@ static void neigh_timer_handler(struct timer_list *t) { unsigned long now, next; struct neighbour *neigh = timer_container_of(neigh, t, timer); + bool skip_probe = false; unsigned int state; int notify = 0; @@ -1172,9 +1172,15 @@ static void neigh_timer_handler(struct timer_list *t) neigh_invalidate(neigh); } notify = 1; - goto out; + skip_probe = true; } + if (notify) + __neigh_notify(neigh, RTM_NEWNEIGH, 0, 0); + + if (skip_probe) + goto out; + if (neigh->nud_state & NUD_IN_TIMER) { if (time_before(next, jiffies + 
HZ/100)) next = jiffies + HZ/100; @@ -1189,7 +1195,7 @@ out: } if (notify) - neigh_update_notify(neigh, 0); + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); trace_neigh_timer_handler(neigh, 0); @@ -1303,6 +1309,47 @@ static void neigh_update_hhs(struct neighbour *neigh) } } +static void neigh_update_process_arp_queue(struct neighbour *neigh) + __releases(neigh->lock) + __acquires(neigh->lock) +{ + struct sk_buff *skb; + + /* Again: avoid deadlock if something went wrong. */ + while (neigh->nud_state & NUD_VALID && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + struct dst_entry *dst = skb_dst(skb); + struct neighbour *n2, *n1 = neigh; + + write_unlock_bh(&neigh->lock); + + rcu_read_lock(); + + /* Why not just use 'neigh' as-is? The problem is that + * things such as shaper, eql, and sch_teql can end up + * using alternative, different, neigh objects to output + * the packet in the output path. So what we need to do + * here is re-lookup the top-level neigh in the path so + * we can reinject the packet there. + */ + n2 = NULL; + if (dst && + READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { + n2 = dst_neigh_lookup_skb(dst, skb); + if (n2) + n1 = n2; + } + READ_ONCE(n1->output)(n1, skb); + if (n2) + neigh_release(n2); + rcu_read_unlock(); + + write_lock_bh(&neigh->lock); + } + __skb_queue_purge(&neigh->arp_queue); + neigh->arp_queue_len_bytes = 0; +} + /* Generic update routine. -- lladdr is new lladdr or NULL, if it is not supplied. -- new is new state. @@ -1329,6 +1376,7 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, struct netlink_ext_ack *extack) { bool gc_update = false, managed_update = false; + bool process_arp_queue = false; int update_isrouter = 0; struct net_device *dev; int err, notify = 0; @@ -1462,53 +1510,30 @@ static int __neigh_update(struct neighbour *neigh, const u8 *lladdr, neigh_connect(neigh); else neigh_suspect(neigh); - if (!(old & NUD_VALID)) { - struct sk_buff *skb; - /* Again: avoid dead loop if something went wrong */ + if (!(old & NUD_VALID)) + process_arp_queue = true; - while (neigh->nud_state & NUD_VALID && - (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { - struct dst_entry *dst = skb_dst(skb); - struct neighbour *n2, *n1 = neigh; - write_unlock_bh(&neigh->lock); - - rcu_read_lock(); - - /* Why not just use 'neigh' as-is? The problem is that - * things such as shaper, eql, and sch_teql can end up - * using alternative, different, neigh objects to output - * the packet in the output path. So what we need to do - * here is re-lookup the top-level neigh in the path so - * we can reinject the packet there. 
- */ - n2 = NULL; - if (dst && - READ_ONCE(dst->obsolete) != DST_OBSOLETE_DEAD) { - n2 = dst_neigh_lookup_skb(dst, skb); - if (n2) - n1 = n2; - } - READ_ONCE(n1->output)(n1, skb); - if (n2) - neigh_release(n2); - rcu_read_unlock(); - - write_lock_bh(&neigh->lock); - } - __skb_queue_purge(&neigh->arp_queue); - neigh->arp_queue_len_bytes = 0; - } out: if (update_isrouter) neigh_update_is_router(neigh, flags, ¬ify); + + if (notify) + __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); + + if (process_arp_queue) + neigh_update_process_arp_queue(neigh); + write_unlock_bh(&neigh->lock); + if (((new ^ old) & NUD_PERMANENT) || gc_update) neigh_update_gc_list(neigh); if (managed_update) neigh_update_managed_list(neigh); + if (notify) - neigh_update_notify(neigh, nlmsg_pid); + call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); + trace_neigh_update_done(neigh, err); return err; } @@ -2622,8 +2647,8 @@ out: return skb->len; } -static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, - u32 pid, u32 seq, int type, unsigned int flags) +static int __neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, + u32 pid, u32 seq, int type, unsigned int flags) { u32 neigh_flags, neigh_flags_ext; unsigned long now = jiffies; @@ -2649,23 +2674,19 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, if (nla_put(skb, NDA_DST, neigh->tbl->key_len, neigh->primary_key)) goto nla_put_failure; - read_lock_bh(&neigh->lock); ndm->ndm_state = neigh->nud_state; if (neigh->nud_state & NUD_VALID) { char haddr[MAX_ADDR_LEN]; neigh_ha_snapshot(haddr, neigh, neigh->dev); - if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { - read_unlock_bh(&neigh->lock); + if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) goto nla_put_failure; - } } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); ci.ndm_confirmed = jiffies_to_clock_t(now - neigh->confirmed); ci.ndm_updated = jiffies_to_clock_t(now - neigh->updated); ci.ndm_refcnt = refcount_read(&neigh->refcnt) - 1; - read_unlock_bh(&neigh->lock); if (nla_put_u32(skb, NDA_PROBES, atomic_read(&neigh->probes)) || nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) @@ -2684,6 +2705,20 @@ nla_put_failure: return -EMSGSIZE; } +static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, + u32 pid, u32 seq, int type, unsigned int flags) + __releases(neigh->lock) + __acquires(neigh->lock) +{ + int err; + + read_lock_bh(&neigh->lock); + err = __neigh_fill_info(skb, neigh, pid, seq, type, flags); + read_unlock_bh(&neigh->lock); + + return err; +} + static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, u32 pid, u32 seq, int type, unsigned int flags, struct neigh_table *tbl) @@ -2727,12 +2762,6 @@ nla_put_failure: return -EMSGSIZE; } -static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid) -{ - call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); - __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); -} - static bool neigh_master_filtered(struct net_device *dev, int master_idx) { struct net_device *master; @@ -3545,7 +3574,7 @@ static void __neigh_notify(struct neighbour *n, int type, int flags, if (skb == NULL) goto errout; - err = neigh_fill_info(skb, n, pid, 0, type, flags); + err = __neigh_fill_info(skb, n, pid, 0, type, flags); if (err < 0) { /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -3560,9 +3589,16 @@ out: rcu_read_unlock(); } +static void neigh_notify(struct neighbour *neigh, int type, int flags, u32 pid) +{ + read_lock_bh(&neigh->lock); + 
__neigh_notify(neigh, type, flags, pid); + read_unlock_bh(&neigh->lock); +} + void neigh_app_ns(struct neighbour *n) { - __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); + neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); } EXPORT_SYMBOL(neigh_app_ns); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index a6e6a964a287..aef44e617361 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -624,9 +624,10 @@ void net_ns_get_ownership(const struct net *net, kuid_t *uid, kgid_t *gid) } EXPORT_SYMBOL_GPL(net_ns_get_ownership); -static void unhash_nsid(struct net *net, struct net *last) +static void unhash_nsid(struct net *last) { - struct net *tmp; + struct net *tmp, *peer; + /* This function is only called from cleanup_net() work, * and this work is the only process, that may delete * a net from net_namespace_list. So, when the below @@ -634,22 +635,26 @@ static void unhash_nsid(struct net *net, struct net *last) * use for_each_net_rcu() or net_rwsem. */ for_each_net(tmp) { - int id; + int id = 0; spin_lock(&tmp->nsid_lock); - id = __peernet2id(tmp, net); - if (id >= 0) - idr_remove(&tmp->netns_ids, id); - spin_unlock(&tmp->nsid_lock); - if (id >= 0) - rtnl_net_notifyid(tmp, RTM_DELNSID, id, 0, NULL, + while ((peer = idr_get_next(&tmp->netns_ids, &id))) { + int curr_id = id; + + id++; + if (!peer->is_dying) + continue; + + idr_remove(&tmp->netns_ids, curr_id); + spin_unlock(&tmp->nsid_lock); + rtnl_net_notifyid(tmp, RTM_DELNSID, curr_id, 0, NULL, GFP_KERNEL); + spin_lock(&tmp->nsid_lock); + } + spin_unlock(&tmp->nsid_lock); if (tmp == last) break; } - spin_lock(&net->nsid_lock); - idr_destroy(&net->netns_ids); - spin_unlock(&net->nsid_lock); } static LLIST_HEAD(cleanup_list); @@ -674,6 +679,7 @@ static void cleanup_net(struct work_struct *work) llist_for_each_entry(net, net_kill_list, cleanup_list) { ns_tree_remove(net); list_del_rcu(&net->list); + net->is_dying = true; } /* Cache last net. 
After we unlock rtnl, no one new net * added to net_namespace_list can assign nsid pointer @@ -688,8 +694,10 @@ static void cleanup_net(struct work_struct *work) last = list_last_entry(&net_namespace_list, struct net, list); up_write(&net_rwsem); + unhash_nsid(last); + llist_for_each_entry(net, net_kill_list, cleanup_list) { - unhash_nsid(net, last); + idr_destroy(&net->netns_ids); list_add_tail(&net->exit_list, &net_exit_list); } diff --git a/net/core/netdev_config.c b/net/core/netdev_config.c new file mode 100644 index 000000000000..f14af365d5cd --- /dev/null +++ b/net/core/netdev_config.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include <linux/netdevice.h> +#include <net/netdev_queues.h> +#include <net/netdev_rx_queue.h> + +#include "dev.h" + +static int netdev_nop_validate_qcfg(struct net_device *dev, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static int __netdev_queue_config(struct net_device *dev, int rxq_idx, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack, + bool validate) +{ + int (*validate_cb)(struct net_device *dev, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack); + struct pp_memory_provider_params *mpp; + int err; + + validate_cb = netdev_nop_validate_qcfg; + if (validate && dev->queue_mgmt_ops->ndo_validate_qcfg) + validate_cb = dev->queue_mgmt_ops->ndo_validate_qcfg; + + memset(qcfg, 0, sizeof(*qcfg)); + + /* Get defaults from the driver, in case user config not set */ + if (dev->queue_mgmt_ops->ndo_default_qcfg) + dev->queue_mgmt_ops->ndo_default_qcfg(dev, qcfg); + err = validate_cb(dev, qcfg, extack); + if (err) + return err; + + /* Apply MP overrides */ + mpp = &__netif_get_rx_queue(dev, rxq_idx)->mp_params; + if (mpp->rx_page_size) + qcfg->rx_page_size = mpp->rx_page_size; + err = validate_cb(dev, qcfg, extack); + if (err) + return err; + + return 0; +} + +/** + * netdev_queue_config() - get configuration for a given queue + * @dev: net_device instance + * @rxq_idx: index of the queue of interest + * @qcfg: queue configuration struct (output) + * + * Render the configuration for a given queue. This helper should be used + * by drivers which support queue configuration to retrieve config for + * a particular queue. + * + * @qcfg is an output parameter and is always fully initialized by this + * function. Some values may not be set by the user, drivers may either + * deal with the "unset" values in @qcfg, or provide the callback + * to populate defaults in queue_management_ops. 
+ */ +void netdev_queue_config(struct net_device *dev, int rxq_idx, + struct netdev_queue_config *qcfg) +{ + __netdev_queue_config(dev, rxq_idx, qcfg, NULL, false); +} +EXPORT_SYMBOL(netdev_queue_config); + +int netdev_queue_config_validate(struct net_device *dev, int rxq_idx, + struct netdev_queue_config *qcfg, + struct netlink_ext_ack *extack) +{ + return __netdev_queue_config(dev, rxq_idx, qcfg, extack, true); +} diff --git a/net/core/netdev_rx_queue.c b/net/core/netdev_rx_queue.c index c7d9341b7630..668a90658f25 100644 --- a/net/core/netdev_rx_queue.c +++ b/net/core/netdev_rx_queue.c @@ -7,6 +7,7 @@ #include <net/netdev_rx_queue.h> #include <net/page_pool/memory_provider.h> +#include "dev.h" #include "page_pool_priv.h" /* See also page_pool_is_unreadable() */ @@ -18,7 +19,10 @@ bool netif_rxq_has_unreadable_mp(struct net_device *dev, int idx) } EXPORT_SYMBOL(netif_rxq_has_unreadable_mp); -int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) +static int netdev_rx_queue_reconfig(struct net_device *dev, + unsigned int rxq_idx, + struct netdev_queue_config *qcfg_old, + struct netdev_queue_config *qcfg_new) { struct netdev_rx_queue *rxq = __netif_get_rx_queue(dev, rxq_idx); const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; @@ -41,7 +45,7 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) goto err_free_new_mem; } - err = qops->ndo_queue_mem_alloc(dev, new_mem, rxq_idx); + err = qops->ndo_queue_mem_alloc(dev, qcfg_new, new_mem, rxq_idx); if (err) goto err_free_old_mem; @@ -54,7 +58,7 @@ int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) if (err) goto err_free_new_queue_mem; - err = qops->ndo_queue_start(dev, new_mem, rxq_idx); + err = qops->ndo_queue_start(dev, qcfg_new, new_mem, rxq_idx); if (err) goto err_start_queue; } else { @@ -76,7 +80,7 @@ err_start_queue: * WARN if we fail to recover the old rx queue, and at least free * old_mem so we don't also leak that. */ - if (qops->ndo_queue_start(dev, old_mem, rxq_idx)) { + if (qops->ndo_queue_start(dev, qcfg_old, old_mem, rxq_idx)) { WARN(1, "Failed to restart old queue in error path. 
RX queue %d may be unhealthy.", rxq_idx); @@ -94,12 +98,22 @@ err_free_new_mem: return err; } + +int netdev_rx_queue_restart(struct net_device *dev, unsigned int rxq_idx) +{ + struct netdev_queue_config qcfg; + + netdev_queue_config(dev, rxq_idx, &qcfg); + return netdev_rx_queue_reconfig(dev, rxq_idx, &qcfg, &qcfg); +} EXPORT_SYMBOL_NS_GPL(netdev_rx_queue_restart, "NETDEV_INTERNAL"); int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, const struct pp_memory_provider_params *p, struct netlink_ext_ack *extack) { + const struct netdev_queue_mgmt_ops *qops = dev->queue_mgmt_ops; + struct netdev_queue_config qcfg[2]; struct netdev_rx_queue *rxq; int ret; @@ -124,6 +138,10 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, NL_SET_ERR_MSG(extack, "unable to custom memory provider to device with XDP program attached"); return -EEXIST; } + if (p->rx_page_size && !(qops->supported_params & QCFG_RX_PAGE_SIZE)) { + NL_SET_ERR_MSG(extack, "device does not support: rx_page_size"); + return -EOPNOTSUPP; + } rxq = __netif_get_rx_queue(dev, rxq_idx); if (rxq->mp_params.mp_ops) { @@ -137,12 +155,20 @@ int __net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, } #endif + netdev_queue_config(dev, rxq_idx, &qcfg[0]); rxq->mp_params = *p; - ret = netdev_rx_queue_restart(dev, rxq_idx); - if (ret) { - rxq->mp_params.mp_ops = NULL; - rxq->mp_params.mp_priv = NULL; - } + ret = netdev_queue_config_validate(dev, rxq_idx, &qcfg[1], extack); + if (ret) + goto err_clear_mp; + + ret = netdev_rx_queue_reconfig(dev, rxq_idx, &qcfg[0], &qcfg[1]); + if (ret) + goto err_clear_mp; + + return 0; + +err_clear_mp: + memset(&rxq->mp_params, 0, sizeof(rxq->mp_params)); return ret; } @@ -160,6 +186,7 @@ int net_mp_open_rxq(struct net_device *dev, unsigned int rxq_idx, void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, const struct pp_memory_provider_params *old_p) { + struct netdev_queue_config qcfg[2]; struct netdev_rx_queue *rxq; int err; @@ -179,9 +206,11 @@ void __net_mp_close_rxq(struct net_device *dev, unsigned int ifq_idx, rxq->mp_params.mp_priv != old_p->mp_priv)) return; - rxq->mp_params.mp_ops = NULL; - rxq->mp_params.mp_priv = NULL; - err = netdev_rx_queue_restart(dev, ifq_idx); + netdev_queue_config(dev, ifq_idx, &qcfg[0]); + memset(&rxq->mp_params, 0, sizeof(rxq->mp_params)); + netdev_queue_config(dev, ifq_idx, &qcfg[1]); + + err = netdev_rx_queue_reconfig(dev, ifq_idx, &qcfg[0], &qcfg[1]); WARN_ON(err && err != -ENETDOWN); } diff --git a/net/core/request_sock.c b/net/core/request_sock.c deleted file mode 100644 index 897a8f01a67b..000000000000 --- a/net/core/request_sock.c +++ /dev/null @@ -1,127 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * NET Generic infrastructure for Network protocols. - * - * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> - * - * From code originally in include/net/tcp.h - */ - -#include <linux/module.h> -#include <linux/random.h> -#include <linux/slab.h> -#include <linux/string.h> -#include <linux/tcp.h> -#include <linux/vmalloc.h> - -#include <net/request_sock.h> - -/* - * Maximum number of SYN_RECV sockets in queue per LISTEN socket. - * One SYN_RECV socket costs about 80bytes on a 32bit machine. - * It would be better to replace it with a global counter for all sockets - * but then some measure against one socket starving all other sockets - * would be needed. - * - * The minimum value of it is 128. Experiments with real servers show that - * it is absolutely not enough even at 100conn/sec. 
256 cures most - * of problems. - * This value is adjusted to 128 for low memory machines, - * and it will increase in proportion to the memory of machine. - * Note : Dont forget somaxconn that may limit backlog too. - */ - -void reqsk_queue_alloc(struct request_sock_queue *queue) -{ - queue->fastopenq.rskq_rst_head = NULL; - queue->fastopenq.rskq_rst_tail = NULL; - queue->fastopenq.qlen = 0; - - queue->rskq_accept_head = NULL; -} - -/* - * This function is called to set a Fast Open socket's "fastopen_rsk" field - * to NULL when a TFO socket no longer needs to access the request_sock. - * This happens only after 3WHS has been either completed or aborted (e.g., - * RST is received). - * - * Before TFO, a child socket is created only after 3WHS is completed, - * hence it never needs to access the request_sock. things get a lot more - * complex with TFO. A child socket, accepted or not, has to access its - * request_sock for 3WHS processing, e.g., to retransmit SYN-ACK pkts, - * until 3WHS is either completed or aborted. Afterwards the req will stay - * until either the child socket is accepted, or in the rare case when the - * listener is closed before the child is accepted. - * - * In short, a request socket is only freed after BOTH 3WHS has completed - * (or aborted) and the child socket has been accepted (or listener closed). - * When a child socket is accepted, its corresponding req->sk is set to - * NULL since it's no longer needed. More importantly, "req->sk == NULL" - * will be used by the code below to determine if a child socket has been - * accepted or not, and the check is protected by the fastopenq->lock - * described below. - * - * Note that fastopen_rsk is only accessed from the child socket's context - * with its socket lock held. But a request_sock (req) can be accessed by - * both its child socket through fastopen_rsk, and a listener socket through - * icsk_accept_queue.rskq_accept_head. To protect the access a simple spin - * lock per listener "icsk->icsk_accept_queue.fastopenq->lock" is created. - * only in the rare case when both the listener and the child locks are held, - * e.g., in inet_csk_listen_stop() do we not need to acquire the lock. - * The lock also protects other fields such as fastopenq->qlen, which is - * decremented by this function when fastopen_rsk is no longer needed. - * - * Note that another solution was to simply use the existing socket lock - * from the listener. But first socket lock is difficult to use. It is not - * a simple spin lock - one must consider sock_owned_by_user() and arrange - * to use sk_add_backlog() stuff. But what really makes it infeasible is the - * locking hierarchy violation. E.g., inet_csk_listen_stop() may try to - * acquire a child's lock while holding listener's socket lock. - * - * This function also sets "treq->tfo_listener" to false. - * treq->tfo_listener is used by the listener so it is protected by the - * fastopenq->lock in this function. 
- */ -void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, - bool reset) -{ - struct sock *lsk = req->rsk_listener; - struct fastopen_queue *fastopenq; - - fastopenq = &inet_csk(lsk)->icsk_accept_queue.fastopenq; - - RCU_INIT_POINTER(tcp_sk(sk)->fastopen_rsk, NULL); - spin_lock_bh(&fastopenq->lock); - fastopenq->qlen--; - tcp_rsk(req)->tfo_listener = false; - if (req->sk) /* the child socket hasn't been accepted yet */ - goto out; - - if (!reset || lsk->sk_state != TCP_LISTEN) { - /* If the listener has been closed don't bother with the - * special RST handling below. - */ - spin_unlock_bh(&fastopenq->lock); - reqsk_put(req); - return; - } - /* Wait for 60secs before removing a req that has triggered RST. - * This is a simple defense against TFO spoofing attack - by - * counting the req against fastopen.max_qlen, and disabling - * TFO when the qlen exceeds max_qlen. - * - * For more details see CoNext'11 "TCP Fast Open" paper. - */ - req->rsk_timer.expires = jiffies + 60*HZ; - if (fastopenq->rskq_rst_head == NULL) - fastopenq->rskq_rst_head = req; - else - fastopenq->rskq_rst_tail->dl_next = req; - - req->dl_next = NULL; - fastopenq->rskq_rst_tail = req; - fastopenq->qlen++; -out: - spin_unlock_bh(&fastopenq->lock); -} diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 61746c2b95f6..699c401a5eae 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -78,6 +78,7 @@ #include <net/mpls.h> #include <net/mptcp.h> #include <net/mctp.h> +#include <net/can.h> #include <net/page_pool/helpers.h> #include <net/psp/types.h> #include <net/dropreason.h> @@ -280,7 +281,7 @@ EXPORT_SYMBOL(__netdev_alloc_frag_align); */ static u32 skbuff_cache_size __read_mostly; -static struct sk_buff *napi_skb_cache_get(bool alloc) +static inline struct sk_buff *napi_skb_cache_get(bool alloc) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); struct sk_buff *skb; @@ -307,6 +308,23 @@ static struct sk_buff *napi_skb_cache_get(bool alloc) return skb; } +/* + * Only clear those fields we need to clear, not those that we will + * actually initialise later. Hence, don't put any more fields after + * the tail pointer in struct sk_buff! + */ +static inline void skbuff_clear(struct sk_buff *skb) +{ + /* Replace memset(skb, 0, offsetof(struct sk_buff, tail)) + * with two smaller memset(), with a barrier() between them. + * This forces the compiler to inline both calls. 
+ */ + BUILD_BUG_ON(offsetof(struct sk_buff, tail) <= 128); + memset(skb, 0, 128); + barrier(); + memset((void *)skb + 128, 0, offsetof(struct sk_buff, tail) - 128); +} + /** * napi_skb_cache_get_bulk - obtain a number of zeroed skb heads from the cache * @skbs: pointer to an at least @n-sized array to fill with skb pointers @@ -357,7 +375,7 @@ get: skbs[i] = nc->skb_cache[base + i]; kasan_mempool_unpoison_object(skbs[i], skbuff_cache_size); - memset(skbs[i], 0, offsetof(struct sk_buff, tail)); + skbuff_clear(skbs[i]); } nc->skb_count -= n; @@ -424,7 +442,7 @@ struct sk_buff *slab_build_skb(void *data) if (unlikely(!skb)) return NULL; - memset(skb, 0, offsetof(struct sk_buff, tail)); + skbuff_clear(skb); data = __slab_build_skb(data, &size); __finalize_skb_around(skb, data, size); @@ -476,7 +494,7 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) if (unlikely(!skb)) return NULL; - memset(skb, 0, offsetof(struct sk_buff, tail)); + skbuff_clear(skb); __build_skb_around(skb, data, frag_size); return skb; @@ -537,7 +555,7 @@ static struct sk_buff *__napi_build_skb(void *data, unsigned int frag_size) if (unlikely(!skb)) return NULL; - memset(skb, 0, offsetof(struct sk_buff, tail)); + skbuff_clear(skb); __build_skb_around(skb, data, frag_size); return skb; @@ -566,6 +584,16 @@ struct sk_buff *napi_build_skb(void *data, unsigned int frag_size) } EXPORT_SYMBOL(napi_build_skb); +static void *kmalloc_pfmemalloc(size_t obj_size, gfp_t flags, int node) +{ + if (!gfp_pfmemalloc_allowed(flags)) + return NULL; + if (!obj_size) + return kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, + flags, node); + return kmalloc_node_track_caller(obj_size, flags, node); +} + /* * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells * the caller if emergency pfmemalloc reserves are being used. 
If it is and @@ -574,9 +602,8 @@ EXPORT_SYMBOL(napi_build_skb); * memory is free */ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, - bool *pfmemalloc) + struct sk_buff *skb) { - bool ret_pfmemalloc = false; size_t obj_size; void *obj; @@ -587,12 +614,12 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); *size = SKB_SMALL_HEAD_CACHE_SIZE; - if (obj || !(gfp_pfmemalloc_allowed(flags))) + if (likely(obj)) goto out; /* Try again but now we are using pfmemalloc reserves */ - ret_pfmemalloc = true; - obj = kmem_cache_alloc_node(net_hotdata.skb_small_head_cache, flags, node); - goto out; + if (skb) + skb->pfmemalloc = true; + return kmalloc_pfmemalloc(0, flags, node); } obj_size = kmalloc_size_roundup(obj_size); @@ -608,17 +635,14 @@ static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node, obj = kmalloc_node_track_caller(obj_size, flags | __GFP_NOMEMALLOC | __GFP_NOWARN, node); - if (obj || !(gfp_pfmemalloc_allowed(flags))) + if (likely(obj)) goto out; /* Try again but now we are using pfmemalloc reserves */ - ret_pfmemalloc = true; - obj = kmalloc_node_track_caller(obj_size, flags, node); - + if (skb) + skb->pfmemalloc = true; + obj = kmalloc_pfmemalloc(obj_size, flags, node); out: - if (pfmemalloc) - *pfmemalloc = ret_pfmemalloc; - return obj; } @@ -650,7 +674,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, { struct sk_buff *skb = NULL; struct kmem_cache *cache; - bool pfmemalloc; u8 *data; if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) @@ -680,37 +703,35 @@ fallback: if (unlikely(!skb)) return NULL; } - prefetchw(skb); + skbuff_clear(skb); /* We do our best to align skb_shared_info on a separate cache * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives * aligned memory blocks, unless SLUB/SLAB debug is enabled. * Both skb->head and skb_shared_info are cache line aligned. */ - data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc); + data = kmalloc_reserve(&size, gfp_mask, node, skb); if (unlikely(!data)) goto nodata; /* kmalloc_size_roundup() might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ - prefetchw(data + SKB_WITH_OVERHEAD(size)); - - /* - * Only clear those fields we need to clear, not those that we will - * actually initialise below. Hence, don't put any more fields after - * the tail pointer in struct sk_buff! - */ - memset(skb, 0, offsetof(struct sk_buff, tail)); - __build_skb_around(skb, data, size); - skb->pfmemalloc = pfmemalloc; + __finalize_skb_around(skb, data, size); if (flags & SKB_ALLOC_FCLONE) { struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); - skb->fclone = SKB_FCLONE_ORIG; + /* skb->fclone is a 2bits field. + * Replace expensive RMW (skb->fclone = SKB_FCLONE_ORIG) + * with a single OR. + */ + BUILD_BUG_ON(SKB_FCLONE_UNAVAILABLE != 0); + DEBUG_NET_WARN_ON_ONCE(skb->fclone != SKB_FCLONE_UNAVAILABLE); + skb->fclone |= SKB_FCLONE_ORIG; + refcount_set(&fclones->fclone_ref, 1); } @@ -1488,9 +1509,20 @@ void napi_skb_free_stolen_head(struct sk_buff *skb) napi_skb_cache_put(skb); } +/** + * napi_consume_skb() - consume skb in NAPI context, try to feed skb cache + * @skb: buffer to free + * @budget: NAPI budget + * + * Non-zero @budget must come from the @budget argument passed by the core + * to a NAPI poll function. 
Note that core may pass budget of 0 to NAPI poll + * for example when polling for netpoll / netconsole. + * + * Passing @budget of 0 is safe from any context, it turns this function + * into dev_consume_skb_any(). + */ void napi_consume_skb(struct sk_buff *skb, int budget) { - /* Zero budget indicate non-NAPI context called us, like netpoll */ if (unlikely(!budget || !skb)) { dev_consume_skb_any(skb); return; @@ -5108,6 +5140,9 @@ static const u8 skb_ext_type_len[] = { #if IS_ENABLED(CONFIG_INET_PSP) [SKB_EXT_PSP] = SKB_EXT_CHUNKSIZEOF(struct psp_skb_ext), #endif +#if IS_ENABLED(CONFIG_CAN) + [SKB_EXT_CAN] = SKB_EXT_CHUNKSIZEOF(struct can_skb_ext), +#endif }; static __always_inline unsigned int skb_ext_total_length(void) @@ -5123,7 +5158,7 @@ static __always_inline unsigned int skb_ext_total_length(void) static void skb_extensions_init(void) { - BUILD_BUG_ON(SKB_EXT_NUM >= 8); + BUILD_BUG_ON(SKB_EXT_NUM > 8); #if !IS_ENABLED(CONFIG_KCOV_INSTRUMENT_ALL) BUILD_BUG_ON(skb_ext_total_length() > 255); #endif @@ -7392,31 +7427,56 @@ bool csum_and_copy_from_iter_full(void *addr, size_t bytes, } EXPORT_SYMBOL(csum_and_copy_from_iter_full); -void get_netmem(netmem_ref netmem) +void __get_netmem(netmem_ref netmem) { - struct net_iov *niov; + struct net_iov *niov = netmem_to_net_iov(netmem); - if (netmem_is_net_iov(netmem)) { - niov = netmem_to_net_iov(netmem); - if (net_is_devmem_iov(niov)) - net_devmem_get_net_iov(netmem_to_net_iov(netmem)); - return; - } - get_page(netmem_to_page(netmem)); + if (net_is_devmem_iov(niov)) + net_devmem_get_net_iov(netmem_to_net_iov(netmem)); } -EXPORT_SYMBOL(get_netmem); +EXPORT_SYMBOL(__get_netmem); -void put_netmem(netmem_ref netmem) +void __put_netmem(netmem_ref netmem) { - struct net_iov *niov; + struct net_iov *niov = netmem_to_net_iov(netmem); - if (netmem_is_net_iov(netmem)) { - niov = netmem_to_net_iov(netmem); - if (net_is_devmem_iov(niov)) - net_devmem_put_net_iov(netmem_to_net_iov(netmem)); - return; + if (net_is_devmem_iov(niov)) + net_devmem_put_net_iov(netmem_to_net_iov(netmem)); +} +EXPORT_SYMBOL(__put_netmem); + +struct vlan_type_depth __vlan_get_protocol_offset(const struct sk_buff *skb, + __be16 type, + int mac_offset) +{ + unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; + + /* if type is 802.1Q/AD then the header should already be + * present at mac_len - VLAN_HLEN (if mac_len > 0), or at + * ETH_HLEN otherwise + */ + if (vlan_depth) { + if (WARN_ON_ONCE(vlan_depth < VLAN_HLEN)) + return (struct vlan_type_depth) { 0 }; + vlan_depth -= VLAN_HLEN; + } else { + vlan_depth = ETH_HLEN; } + do { + struct vlan_hdr vhdr, *vh; + + vh = skb_header_pointer(skb, mac_offset + vlan_depth, + sizeof(vhdr), &vhdr); + if (unlikely(!vh || !--parse_depth)) + return (struct vlan_type_depth) { 0 }; - put_page(netmem_to_page(netmem)); + type = vh->h_vlan_encapsulated_proto; + vlan_depth += VLAN_HLEN; + } while (eth_type_vlan(type)); + + return (struct vlan_type_depth) { + .type = type, + .depth = vlan_depth + }; } -EXPORT_SYMBOL(put_netmem); +EXPORT_SYMBOL(__vlan_get_protocol_offset); diff --git a/net/core/sock.c b/net/core/sock.c index a1c8b47b0d56..693e6d80f501 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -4193,13 +4193,17 @@ int proto_register(struct proto *prot, int alloc_slab) return -EINVAL; } if (alloc_slab) { - prot->slab = kmem_cache_create_usercopy(prot->name, - prot->obj_size, 0, - SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | - prot->slab_flags, - prot->useroffset, prot->usersize, - NULL); + struct kmem_cache_args args = { + .useroffset = 
prot->useroffset, + .usersize = prot->usersize, + .freeptr_offset = prot->freeptr_offset, + .use_freeptr_offset = !!prot->freeptr_offset, + }; + prot->slab = kmem_cache_create(prot->name, prot->obj_size, + &args, + SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | + prot->slab_flags); if (prot->slab == NULL) { pr_crit("%s: Can't create sock SLAB cache!\n", prot->name); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 05dd55cf8b58..03aea10073f0 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -17,6 +17,7 @@ #include <linux/init.h> #include <linux/slab.h> #include <linux/sched/isolation.h> +#include <linux/hex.h> #include <net/ip.h> #include <net/sock.h> @@ -325,10 +326,16 @@ static int proc_do_dev_weight(const struct ctl_table *table, int write, static int proc_do_rss_key(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; + struct ctl_table fake_table; + char *pos = buf; + + for (int i = 0; i < NETDEV_RSS_KEY_LEN; i++) { + pos = hex_byte_pack(pos, netdev_rss_key[i]); + *pos++ = ':'; + } + *(--pos) = 0; - snprintf(buf, sizeof(buf), "%*phC", NETDEV_RSS_KEY_LEN, netdev_rss_key); fake_table.data = buf; fake_table.maxlen = sizeof(buf); return proc_dostring(&fake_table, write, buffer, lenp, ppos); |
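The devmem.c/devmem.h hunks above replace the binding's refcount_t with a percpu_ref, so the per-packet get/put on the RX path no longer dirties a shared cacheline; the release callback defers the actual free to a workqueue and teardown goes through percpu_ref_kill(). A minimal sketch of that lifecycle on a hypothetical object (struct foo is illustrative only; the real code operates on net_devmem_dmabuf_binding):

#include <linux/percpu-refcount.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo {
	struct percpu_ref ref;
	struct work_struct free_w;
};

static void foo_free_work(struct work_struct *w)
{
	struct foo *f = container_of(w, struct foo, free_w);

	percpu_ref_exit(&f->ref);	/* pairs with percpu_ref_init() */
	kfree(f);
}

static void foo_release(struct percpu_ref *ref)
{
	struct foo *f = container_of(ref, struct foo, ref);

	/* runs once the count hits zero after percpu_ref_kill();
	 * defer the free to process context, as the binding code does
	 */
	INIT_WORK(&f->free_w, foo_free_work);
	schedule_work(&f->free_w);
}

static struct foo *foo_create(void)
{
	struct foo *f = kzalloc(sizeof(*f), GFP_KERNEL);

	if (!f)
		return NULL;
	if (percpu_ref_init(&f->ref, foo_release, 0, GFP_KERNEL)) {
		kfree(f);
		return NULL;
	}
	return f;	/* holds the initial reference */
}

/* fast path:  if (percpu_ref_tryget(&f->ref)) { ... percpu_ref_put(&f->ref); }
 * teardown:   percpu_ref_kill(&f->ref)  -- drops the initial reference
 */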

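Similarly, proto_register() above switches from kmem_cache_create_usercopy() to the struct kmem_cache_args form of kmem_cache_create(), which is what allows it to pass freeptr_offset alongside the usercopy region. A minimal sketch of that API with a made-up object layout (struct foo_obj and its field names are assumptions, not from the patch):

#include <linux/slab.h>
#include <linux/stddef.h>

struct foo_obj {			/* hypothetical layout */
	unsigned long kernel_only;
	char user_visible[64];		/* region that may be copied to/from userspace */
};

static struct kmem_cache *foo_cache;

static int foo_cache_init(void)
{
	struct kmem_cache_args args = {
		.useroffset = offsetof(struct foo_obj, user_visible),
		.usersize   = sizeof_field(struct foo_obj, user_visible),
		/* .freeptr_offset / .use_freeptr_offset as in proto_register() */
	};

	foo_cache = kmem_cache_create("foo_obj", sizeof(struct foo_obj), &args,
				      SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT);
	return foo_cache ? 0 : -ENOMEM;
}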