Diffstat (limited to 'net/core')
-rw-r--r--	net/core/dev.c            |   2
-rw-r--r--	net/core/dev_ioctl.c      |  22
-rw-r--r--	net/core/filter.c         | 210
-rw-r--r--	net/core/gen_estimator.c  |   2
-rw-r--r--	net/core/net-sysfs.c      |   6
-rw-r--r--	net/core/net_namespace.c  |  60
-rw-r--r--	net/core/skbuff.c         |   2
-rw-r--r--	net/core/sock.c           |  22
8 files changed, 239 insertions, 87 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 93a25d87b86b..8d49b2198d07 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6965,7 +6965,7 @@ static void napi_stop_kthread(struct napi_struct *napi)
 	 * the kthread.
 	 */
 	while (true) {
-		if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state))
+		if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state))
 			break;
 
 		msleep(20);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 9c0ad7f4b5d8..ad54b12d4b4c 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -464,8 +464,15 @@ int generic_hwtstamp_get_lower(struct net_device *dev,
 	if (!netif_device_present(dev))
 		return -ENODEV;
 
-	if (ops->ndo_hwtstamp_get)
-		return dev_get_hwtstamp_phylib(dev, kernel_cfg);
+	if (ops->ndo_hwtstamp_get) {
+		int err;
+
+		netdev_lock_ops(dev);
+		err = dev_get_hwtstamp_phylib(dev, kernel_cfg);
+		netdev_unlock_ops(dev);
+
+		return err;
+	}
 
 	/* Legacy path: unconverted lower driver */
 	return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg);
@@ -481,8 +488,15 @@ int generic_hwtstamp_set_lower(struct net_device *dev,
 	if (!netif_device_present(dev))
 		return -ENODEV;
 
-	if (ops->ndo_hwtstamp_set)
-		return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+	if (ops->ndo_hwtstamp_set) {
+		int err;
+
+		netdev_lock_ops(dev);
+		err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+		netdev_unlock_ops(dev);
+
+		return err;
+	}
 
 	/* Legacy path: unconverted lower driver */
 	return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg);
diff --git a/net/core/filter.c b/net/core/filter.c
index da391e2b0788..2af0a5f1d748 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4153,34 +4153,45 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
 	return 0;
 }
 
-static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
-				   enum xdp_mem_type mem_type, bool release)
+static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
+					       bool tail, bool release)
 {
-	struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp);
+	struct xdp_buff *zc_frag = tail ? xsk_buff_get_tail(xdp) :
+					  xsk_buff_get_head(xdp);
 
 	if (release) {
-		xsk_buff_del_tail(zc_frag);
-		__xdp_return(0, mem_type, false, zc_frag);
+		xsk_buff_del_frag(zc_frag);
 	} else {
-		zc_frag->data_end -= shrink;
+		if (tail)
+			zc_frag->data_end -= shrink;
+		else
+			zc_frag->data += shrink;
 	}
+
+	return zc_frag;
 }
 
 static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
-				int shrink)
+				int shrink, bool tail)
 {
 	enum xdp_mem_type mem_type = xdp->rxq->mem.type;
 	bool release = skb_frag_size(frag) == shrink;
+	netmem_ref netmem = skb_frag_netmem(frag);
+	struct xdp_buff *zc_frag = NULL;
 
 	if (mem_type == MEM_TYPE_XSK_BUFF_POOL) {
-		bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release);
-		goto out;
+		netmem = 0;
+		zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release);
 	}
 
-	if (release)
-		__xdp_return(skb_frag_netmem(frag), mem_type, false, NULL);
+	if (release) {
+		__xdp_return(netmem, mem_type, false, zc_frag);
+	} else {
+		if (!tail)
+			skb_frag_off_add(frag, shrink);
+		skb_frag_size_sub(frag, shrink);
+	}
 
-out:
 	return release;
 }
 
@@ -4198,18 +4209,15 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
 
 		len_free += shrink;
 		offset -= shrink;
-		if (bpf_xdp_shrink_data(xdp, frag, shrink)) {
+		if (bpf_xdp_shrink_data(xdp, frag, shrink, true))
 			n_frags_free++;
-		} else {
-			skb_frag_size_sub(frag, shrink);
-			break;
-		}
 	}
 	sinfo->nr_frags -= n_frags_free;
 	sinfo->xdp_frags_size -= len_free;
 
 	if (unlikely(!sinfo->nr_frags)) {
 		xdp_buff_clear_frags_flag(xdp);
+		xdp_buff_clear_frag_pfmemalloc(xdp);
 		xdp->data_end -= offset;
 	}
 
@@ -7431,6 +7439,8 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
 				      offsetof(struct xdp_sock, FIELD)); \
 	} while (0)
 
+	BTF_TYPE_EMIT(struct bpf_xdp_sock);
+
 	switch (si->off) {
 	case offsetof(struct bpf_xdp_sock, queue_id):
 		BPF_XDP_SOCK_GET(queue_id);
@@ -9284,13 +9294,17 @@ static bool sock_addr_is_valid_access(int off, int size,
 			return false;
 		info->reg_type = PTR_TO_SOCKET;
 		break;
-	default:
-		if (type == BPF_READ) {
-			if (size != size_default)
-				return false;
-		} else {
+	case bpf_ctx_range(struct bpf_sock_addr, user_family):
+	case bpf_ctx_range(struct bpf_sock_addr, family):
+	case bpf_ctx_range(struct bpf_sock_addr, type):
+	case bpf_ctx_range(struct bpf_sock_addr, protocol):
+		if (type != BPF_READ)
 			return false;
-		}
+		if (size != size_default)
+			return false;
+		break;
+	default:
+		return false;
 	}
 
 	return true;
@@ -11990,6 +12004,16 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	return func;
 }
 
+/**
+ * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area.
+ * @skb: socket buffer carrying the metadata
+ * @offset: offset into the metadata area, must be <= skb_metadata_len()
+ */
+void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset)
+{
+	return skb_metadata_end(skb) - skb_metadata_len(skb) + offset;
+}
+
 __bpf_kfunc_start_defs();
 
 __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
 				    struct bpf_dynptr *ptr__uninit)
@@ -12007,6 +12031,42 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
 	return 0;
 }
 
+/**
+ * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area.
+ * @skb_: socket buffer carrying the metadata
+ * @flags: future use, must be zero
+ * @ptr__uninit: dynptr to initialize
+ *
+ * Set up a dynptr for access to the metadata area earlier allocated from the
+ * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to
+ * &__sk_buff->data_meta.
+ *
+ * If passed @skb_ is a clone which shares the data with the original, the
+ * dynptr will be read-only. This limitation may be lifted in the future.
+ *
+ * Return:
+ * * %0        - dynptr ready to use
+ * * %-EINVAL  - invalid flags, dynptr set to null
+ */
+__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags,
+					 struct bpf_dynptr *ptr__uninit)
+{
+	struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+	struct sk_buff *skb = (struct sk_buff *)skb_;
+
+	if (flags) {
+		bpf_dynptr_set_null(ptr);
+		return -EINVAL;
+	}
+
+	bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb));
+
+	if (skb_cloned(skb))
+		bpf_dynptr_set_rdonly(ptr);
+
+	return 0;
+}
+
 __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags,
 				    struct bpf_dynptr *ptr__uninit)
 {
@@ -12160,6 +12220,98 @@ __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
 	return 0;
 }
 
+/**
+ * bpf_xdp_pull_data() - Pull in non-linear xdp data.
+ * @x: &xdp_md associated with the XDP buffer
+ * @len: length of data to be made directly accessible in the linear part
+ *
+ * Pull in data in case the XDP buffer associated with @x is non-linear and
+ * not all @len are in the linear data area.
+ *
+ * Direct packet access allows reading and writing linear XDP data through
+ * packet pointers (i.e., &xdp_md->data + offsets). The amount of data which
+ * ends up in the linear part of the xdp_buff depends on the NIC and its
+ * configuration. When a frag-capable XDP program wants to directly access
+ * headers that may be in the non-linear area, call this kfunc to make sure
+ * the data is available in the linear area. Alternatively, use dynptr or
+ * bpf_xdp_{load,store}_bytes() to access data without pulling.
+ *
+ * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate
+ * headers in the non-linear data area.
+ *
+ * A call to this kfunc may reduce headroom. If there is not enough tailroom
+ * in the linear data area, metadata and data will be shifted down.
+ *
+ * A call to this kfunc is susceptible to change the buffer geometry.
+ * Therefore, at load time, all checks on pointers previously done by the
+ * verifier are invalidated and must be performed again, if the kfunc is used
+ * in combination with direct packet access.
+ *
+ * Return:
+ * * %0         - success
+ * * %-EINVAL   - invalid len
+ */
+__bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len)
+{
+	struct xdp_buff *xdp = (struct xdp_buff *)x;
+	struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+	int i, delta, shift, headroom, tailroom, n_frags_free = 0;
+	void *data_hard_end = xdp_data_hard_end(xdp);
+	int data_len = xdp->data_end - xdp->data;
+	void *start;
+
+	if (len <= data_len)
+		return 0;
+
+	if (unlikely(len > xdp_get_buff_len(xdp)))
+		return -EINVAL;
+
+	start = xdp_data_meta_unsupported(xdp) ? xdp->data : xdp->data_meta;
+
+	headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame);
+	tailroom = data_hard_end - xdp->data_end;
+
+	delta = len - data_len;
+	if (unlikely(delta > tailroom + headroom))
+		return -EINVAL;
+
+	shift = delta - tailroom;
+	if (shift > 0) {
+		memmove(start - shift, start, xdp->data_end - start);
+
+		xdp->data_meta -= shift;
+		xdp->data -= shift;
+		xdp->data_end -= shift;
+	}
+
+	for (i = 0; i < sinfo->nr_frags && delta; i++) {
+		skb_frag_t *frag = &sinfo->frags[i];
+		u32 shrink = min_t(u32, delta, skb_frag_size(frag));
+
+		memcpy(xdp->data_end, skb_frag_address(frag), shrink);
+
+		xdp->data_end += shrink;
+		sinfo->xdp_frags_size -= shrink;
+		delta -= shrink;
+		if (bpf_xdp_shrink_data(xdp, frag, shrink, false))
+			n_frags_free++;
+	}
+
+	if (unlikely(n_frags_free)) {
+		memmove(sinfo->frags, sinfo->frags + n_frags_free,
+			(sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t));
+
+		sinfo->nr_frags -= n_frags_free;
+
+		if (!sinfo->nr_frags) {
+			xdp_buff_clear_frags_flag(xdp);
+			xdp_buff_clear_frag_pfmemalloc(xdp);
+		}
+	}
+
+	return 0;
+}
+
 __bpf_kfunc_end_defs();
 
 int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
@@ -12181,8 +12333,13 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
 BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
 BTF_KFUNCS_END(bpf_kfunc_check_set_skb)
 
+BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta)
+
 BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
 BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
+BTF_ID_FLAGS(func, bpf_xdp_pull_data)
 BTF_KFUNCS_END(bpf_kfunc_check_set_xdp)
 
 BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr)
@@ -12202,6 +12359,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
 	.set = &bpf_kfunc_check_set_skb,
 };
 
+static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = {
+	.owner = THIS_MODULE,
+	.set = &bpf_kfunc_check_set_skb_meta,
+};
+
 static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
 	.owner = THIS_MODULE,
 	.set = &bpf_kfunc_check_set_xdp,
@@ -12237,6 +12399,8 @@ static int __init bpf_kfunc_init(void)
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta);
+	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
 	ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
 					       &bpf_kfunc_set_sock_addr);
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 7d426a8e29f3..f112156db587 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -90,10 +90,12 @@ static void est_timer(struct timer_list *t)
 	rate = (b_packets - est->last_packets) << (10 - est->intvl_log);
 	rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log);
 
+	preempt_disable_nested();
 	write_seqcount_begin(&est->seq);
 	est->avbps += brate;
 	est->avpps += rate;
 	write_seqcount_end(&est->seq);
+	preempt_enable_nested();
 
 	est->last_bytes = b_bytes;
 	est->last_packets = b_packets;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c28cd6665444..3c2dc4c5e683 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1328,7 +1328,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
 		struct netdev_rx_queue *queue = &dev->_rx[i];
 		struct kobject *kobj = &queue->kobj;
 
-		if (!refcount_read(&dev_net(dev)->ns.count))
+		if (!check_net(dev_net(dev)))
 			kobj->uevent_suppress = 1;
 		if (dev->sysfs_rx_queue_group)
 			sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
@@ -2061,7 +2061,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
 	while (--i >= new_num) {
 		struct netdev_queue *queue = dev->_tx + i;
 
-		if (!refcount_read(&dev_net(dev)->ns.count))
+		if (!check_net(dev_net(dev)))
 			queue->kobj.uevent_suppress = 1;
 
 		if (netdev_uses_bql(dev))
@@ -2315,7 +2315,7 @@ void netdev_unregister_kobject(struct net_device *ndev)
 {
 	struct device *dev = &ndev->dev;
 
-	if (!refcount_read(&dev_net(ndev)->ns.count))
+	if (!check_net(dev_net(ndev)))
 		dev_set_uevent_suppress(dev, 1);
 
 	kobject_get(&dev->kobj);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 1b6f3826dd0e..b0e0f22d7b21 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -20,6 +20,7 @@
 #include <linux/sched/task.h>
 #include <linux/uidgid.h>
 #include <linux/proc_fs.h>
+#include <linux/nstree.h>
 
 #include <net/aligned_data.h>
 #include <net/sock.h>
@@ -314,7 +315,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
 {
 	int id;
 
-	if (refcount_read(&net->ns.count) == 0)
+	if (!check_net(net))
 		return NETNSA_NSID_NOT_ASSIGNED;
 
 	spin_lock(&net->nsid_lock);
@@ -397,10 +398,15 @@ static __net_init void preinit_net_sysctl(struct net *net)
 }
 
 /* init code that must occur even if setup_net() is not called. */
-static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns)
+static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns)
 {
+	int ret;
+
+	ret = ns_common_init(net);
+	if (ret)
+		return ret;
+
 	refcount_set(&net->passive, 1);
-	refcount_set(&net->ns.count, 1);
 	ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt");
 	ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt");
 
@@ -420,6 +426,7 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns)
 	INIT_LIST_HEAD(&net->ptype_all);
 	INIT_LIST_HEAD(&net->ptype_specific);
 	preinit_net_sysctl(net);
+	return 0;
 }
 
 /*
@@ -432,7 +439,7 @@ static __net_init int setup_net(struct net *net)
 	LIST_HEAD(net_exit_list);
 	int error = 0;
 
-	net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie);
+	net->net_cookie = ns_tree_gen_id(&net->ns);
 
 	list_for_each_entry(ops, &pernet_list, list) {
 		error = ops_init(ops, net);
@@ -442,6 +449,7 @@ static __net_init int setup_net(struct net *net)
 	down_write(&net_rwsem);
 	list_add_tail_rcu(&net->list, &net_namespace_list);
 	up_write(&net_rwsem);
+	ns_tree_add_raw(net);
 out:
 	return error;
 
@@ -539,7 +547,7 @@ void net_drop_ns(void *p)
 		net_passive_dec(net);
 }
 
-struct net *copy_net_ns(unsigned long flags,
+struct net *copy_net_ns(u64 flags,
 			struct user_namespace *user_ns, struct net *old_net)
 {
 	struct ucounts *ucounts;
@@ -559,7 +567,9 @@ struct net *copy_net_ns(unsigned long flags,
 		goto dec_ucounts;
 	}
 
-	preinit_net(net, user_ns);
+	rv = preinit_net(net, user_ns);
+	if (rv < 0)
+		goto dec_ucounts;
 	net->ucounts = ucounts;
 	get_user_ns(user_ns);
 
@@ -573,6 +583,7 @@ struct net *copy_net_ns(unsigned long flags,
 	if (rv < 0) {
 put_userns:
+		ns_common_free(net);
 #ifdef CONFIG_KEYS
 		key_remove_domain(net->key_domain);
 #endif
@@ -659,8 +670,10 @@ static void cleanup_net(struct work_struct *work)
 	/* Don't let anyone else find us. */
 	down_write(&net_rwsem);
-	llist_for_each_entry(net, net_kill_list, cleanup_list)
+	llist_for_each_entry(net, net_kill_list, cleanup_list) {
+		ns_tree_remove(net);
 		list_del_rcu(&net->list);
+	}
 	/* Cache last net. After we unlock rtnl, no one new net
 	 * added to net_namespace_list can assign nsid pointer
 	 * to a net from net_kill_list (see peernet2id_alloc()).
 	 */
@@ -693,6 +706,7 @@ static void cleanup_net(struct work_struct *work)
 	/* Finally it is safe to free my network namespace structure */
 	list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
 		list_del_init(&net->exit_list);
+		ns_common_free(net);
 		dec_net_namespaces(net->ucounts);
 #ifdef CONFIG_KEYS
 		key_remove_domain(net->key_domain);
@@ -812,31 +826,12 @@ static void net_ns_net_debugfs(struct net *net)
 
 static __net_init int net_ns_net_init(struct net *net)
 {
-#ifdef CONFIG_NET_NS
-	net->ns.ops = &netns_operations;
-#endif
-	net->ns.inum = PROC_NET_INIT_INO;
-	if (net != &init_net) {
-		int ret = ns_alloc_inum(&net->ns);
-		if (ret)
-			return ret;
-	}
 	net_ns_net_debugfs(net);
 	return 0;
 }
 
-static __net_exit void net_ns_net_exit(struct net *net)
-{
-	/*
-	 * Initial network namespace doesn't exit so we don't need any
-	 * special checks here.
-	 */
-	ns_free_inum(&net->ns);
-}
-
 static struct pernet_operations __net_initdata net_ns_ops = {
 	.init = net_ns_net_init,
-	.exit = net_ns_net_exit,
 };
 
 static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
@@ -1282,7 +1277,12 @@ void __init net_ns_init(void)
 #ifdef CONFIG_KEYS
 	init_net.key_domain = &init_net_key_domain;
 #endif
-	preinit_net(&init_net, &init_user_ns);
+	/*
+	 * This currently cannot fail as the initial network namespace
+	 * has a static inode number.
+	 */
+	if (preinit_net(&init_net, &init_user_ns))
+		panic("Could not preinitialize the initial network namespace");
 
 	down_write(&pernet_ops_rwsem);
 	if (setup_net(&init_net))
@@ -1517,11 +1517,6 @@ static struct ns_common *netns_get(struct task_struct *task)
 	return net ? &net->ns : NULL;
 }
 
-static inline struct net *to_net_ns(struct ns_common *ns)
-{
-	return container_of(ns, struct net, ns);
-}
-
 static void netns_put(struct ns_common *ns)
 {
 	put_net(to_net_ns(ns));
@@ -1548,7 +1543,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns)
 
 const struct proc_ns_operations netns_operations = {
 	.name		= "net",
-	.type		= CLONE_NEWNET,
 	.get		= netns_get,
 	.put		= netns_put,
 	.install	= netns_install,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ee0274417948..1c0279b9cb9f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6667,7 +6667,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
 		return NULL;
 
 	while (data_len) {
-		if (nr_frags == MAX_SKB_FRAGS - 1)
+		if (nr_frags == MAX_SKB_FRAGS)
 			goto failure;
 		while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
 			order--;
diff --git a/net/core/sock.c b/net/core/sock.c
index 7c26ec8dce63..158bddd23134 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2780,28 +2780,6 @@ void sock_pfree(struct sk_buff *skb)
 EXPORT_SYMBOL(sock_pfree);
 #endif /* CONFIG_INET */
 
-unsigned long __sock_i_ino(struct sock *sk)
-{
-	unsigned long ino;
-
-	read_lock(&sk->sk_callback_lock);
-	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
-	read_unlock(&sk->sk_callback_lock);
-	return ino;
-}
-EXPORT_SYMBOL(__sock_i_ino);
-
-unsigned long sock_i_ino(struct sock *sk)
-{
-	unsigned long ino;
-
-	local_bh_disable();
-	ino = __sock_i_ino(sk);
-	local_bh_enable();
-	return ino;
-}
-EXPORT_SYMBOL(sock_i_ino);
-
 /*
  * Allocate a skb from the socket's send buffer.
  */
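
Usage sketch (not part of the diff): a tc program reading metadata that an XDP
program reserved earlier with bpf_xdp_adjust_meta(). The extern prototype
mirrors the bpf_dynptr_from_skb_meta() kfunc added in net/core/filter.c above;
struct rx_meta and its fields are hypothetical, defined by whatever the XDP
side chose to write.

#include <linux/bpf.h>
#include <linux/pkt_cls.h>
#include <bpf/bpf_helpers.h>

/* kfunc added in this diff; flags must be zero */
extern int bpf_dynptr_from_skb_meta(struct __sk_buff *skb, __u64 flags,
				    struct bpf_dynptr *ptr) __ksym;

struct rx_meta {			/* hypothetical layout */
	__u32 rx_hash;
	__u32 rx_queue;
};

SEC("tc")
int read_xdp_meta(struct __sk_buff *skb)
{
	struct bpf_dynptr ptr;
	struct rx_meta meta;

	/* On a cloned skb the dynptr comes back read-only. */
	if (bpf_dynptr_from_skb_meta(skb, 0, &ptr))
		return TC_ACT_OK;

	/* Copy out; fails if the metadata area is smaller than meta. */
	if (bpf_dynptr_read(&meta, sizeof(meta), &ptr, 0, 0))
		return TC_ACT_OK;

	bpf_printk("hash %u queue %u", meta.rx_hash, meta.rx_queue);
	return TC_ACT_OK;
}

char LICENSE[] SEC("license") = "GPL";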
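
Similarly, a hypothetical frags-aware XDP sketch for bpf_xdp_pull_data(): pull
the first 128 bytes into the linear area before parsing them with direct
packet access. As the kernel-doc above warns, the call can change the buffer
geometry, so the packet pointers are re-derived and re-checked afterwards.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* kfunc added in this diff */
extern int bpf_xdp_pull_data(struct xdp_md *xdp, __u32 len) __ksym;

SEC("xdp.frags")
int pull_headers(struct xdp_md *xdp)
{
	void *data, *data_end;

	/* No-op if 128 bytes are already linear; -EINVAL if, e.g.,
	 * the whole buffer holds fewer than 128 bytes.
	 */
	if (bpf_xdp_pull_data(xdp, 128))
		return XDP_PASS;

	/* Geometry may have changed: reload and re-validate pointers. */
	data = (void *)(long)xdp->data;
	data_end = (void *)(long)xdp->data_end;
	if (data + 128 > data_end)
		return XDP_PASS;

	/* ... parse headers in [data, data + 128) ... */
	return XDP_PASS;
}

char LICENSE[] SEC("license") = "GPL";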