summaryrefslogtreecommitdiff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r--net/core/dev.c2
-rw-r--r--net/core/dev_ioctl.c22
-rw-r--r--net/core/filter.c210
-rw-r--r--net/core/net-sysfs.c6
-rw-r--r--net/core/net_namespace.c60
-rw-r--r--net/core/skbuff.c2
6 files changed, 237 insertions, 65 deletions
diff --git a/net/core/dev.c b/net/core/dev.c
index 93a25d87b86b..8d49b2198d07 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -6965,7 +6965,7 @@ static void napi_stop_kthread(struct napi_struct *napi)
* the kthread.
*/
while (true) {
- if (!test_bit(NAPIF_STATE_SCHED_THREADED, &napi->state))
+ if (!test_bit(NAPI_STATE_SCHED_THREADED, &napi->state))
break;
msleep(20);
diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c
index 9c0ad7f4b5d8..ad54b12d4b4c 100644
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -464,8 +464,15 @@ int generic_hwtstamp_get_lower(struct net_device *dev,
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_get)
- return dev_get_hwtstamp_phylib(dev, kernel_cfg);
+ if (ops->ndo_hwtstamp_get) {
+ int err;
+
+ netdev_lock_ops(dev);
+ err = dev_get_hwtstamp_phylib(dev, kernel_cfg);
+ netdev_unlock_ops(dev);
+
+ return err;
+ }
/* Legacy path: unconverted lower driver */
return generic_hwtstamp_ioctl_lower(dev, SIOCGHWTSTAMP, kernel_cfg);
@@ -481,8 +488,15 @@ int generic_hwtstamp_set_lower(struct net_device *dev,
if (!netif_device_present(dev))
return -ENODEV;
- if (ops->ndo_hwtstamp_set)
- return dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+ if (ops->ndo_hwtstamp_set) {
+ int err;
+
+ netdev_lock_ops(dev);
+ err = dev_set_hwtstamp_phylib(dev, kernel_cfg, extack);
+ netdev_unlock_ops(dev);
+
+ return err;
+ }
/* Legacy path: unconverted lower driver */
return generic_hwtstamp_ioctl_lower(dev, SIOCSHWTSTAMP, kernel_cfg);
diff --git a/net/core/filter.c b/net/core/filter.c
index da391e2b0788..2af0a5f1d748 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4153,34 +4153,45 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
return 0;
}
-static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
- enum xdp_mem_type mem_type, bool release)
+static struct xdp_buff *bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
+ bool tail, bool release)
{
- struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp);
+ struct xdp_buff *zc_frag = tail ? xsk_buff_get_tail(xdp) :
+ xsk_buff_get_head(xdp);
if (release) {
- xsk_buff_del_tail(zc_frag);
- __xdp_return(0, mem_type, false, zc_frag);
+ xsk_buff_del_frag(zc_frag);
} else {
- zc_frag->data_end -= shrink;
+ if (tail)
+ zc_frag->data_end -= shrink;
+ else
+ zc_frag->data += shrink;
}
+
+ return zc_frag;
}
static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
- int shrink)
+ int shrink, bool tail)
{
enum xdp_mem_type mem_type = xdp->rxq->mem.type;
bool release = skb_frag_size(frag) == shrink;
+ netmem_ref netmem = skb_frag_netmem(frag);
+ struct xdp_buff *zc_frag = NULL;
if (mem_type == MEM_TYPE_XSK_BUFF_POOL) {
- bpf_xdp_shrink_data_zc(xdp, shrink, mem_type, release);
- goto out;
+ netmem = 0;
+ zc_frag = bpf_xdp_shrink_data_zc(xdp, shrink, tail, release);
}
- if (release)
- __xdp_return(skb_frag_netmem(frag), mem_type, false, NULL);
+ if (release) {
+ __xdp_return(netmem, mem_type, false, zc_frag);
+ } else {
+ if (!tail)
+ skb_frag_off_add(frag, shrink);
+ skb_frag_size_sub(frag, shrink);
+ }
-out:
return release;
}
@@ -4198,18 +4209,15 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
len_free += shrink;
offset -= shrink;
- if (bpf_xdp_shrink_data(xdp, frag, shrink)) {
+ if (bpf_xdp_shrink_data(xdp, frag, shrink, true))
n_frags_free++;
- } else {
- skb_frag_size_sub(frag, shrink);
- break;
- }
}
sinfo->nr_frags -= n_frags_free;
sinfo->xdp_frags_size -= len_free;
if (unlikely(!sinfo->nr_frags)) {
xdp_buff_clear_frags_flag(xdp);
+ xdp_buff_clear_frag_pfmemalloc(xdp);
xdp->data_end -= offset;
}
@@ -7431,6 +7439,8 @@ u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
offsetof(struct xdp_sock, FIELD)); \
} while (0)
+ BTF_TYPE_EMIT(struct bpf_xdp_sock);
+
switch (si->off) {
case offsetof(struct bpf_xdp_sock, queue_id):
BPF_XDP_SOCK_GET(queue_id);
@@ -9284,13 +9294,17 @@ static bool sock_addr_is_valid_access(int off, int size,
return false;
info->reg_type = PTR_TO_SOCKET;
break;
- default:
- if (type == BPF_READ) {
- if (size != size_default)
- return false;
- } else {
+ case bpf_ctx_range(struct bpf_sock_addr, user_family):
+ case bpf_ctx_range(struct bpf_sock_addr, family):
+ case bpf_ctx_range(struct bpf_sock_addr, type):
+ case bpf_ctx_range(struct bpf_sock_addr, protocol):
+ if (type != BPF_READ)
return false;
- }
+ if (size != size_default)
+ return false;
+ break;
+ default:
+ return false;
}
return true;
@@ -11990,6 +12004,16 @@ bpf_sk_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return func;
}
+/**
+ * bpf_skb_meta_pointer() - Gets a mutable pointer within the skb metadata area.
+ * @skb: socket buffer carrying the metadata
+ * @offset: offset into the metadata area, must be <= skb_metadata_len()
+ */
+void *bpf_skb_meta_pointer(struct sk_buff *skb, u32 offset)
+{
+ return skb_metadata_end(skb) - skb_metadata_len(skb) + offset;
+}
+
__bpf_kfunc_start_defs();
__bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
struct bpf_dynptr *ptr__uninit)
@@ -12007,6 +12031,42 @@ __bpf_kfunc int bpf_dynptr_from_skb(struct __sk_buff *s, u64 flags,
return 0;
}
+/**
+ * bpf_dynptr_from_skb_meta() - Initialize a dynptr to the skb metadata area.
+ * @skb_: socket buffer carrying the metadata
+ * @flags: future use, must be zero
+ * @ptr__uninit: dynptr to initialize
+ *
+ * Set up a dynptr for access to the metadata area earlier allocated from the
+ * XDP context with bpf_xdp_adjust_meta(). Serves as an alternative to
+ * &__sk_buff->data_meta.
+ *
+ * If passed @skb_ is a clone which shares the data with the original, the
+ * dynptr will be read-only. This limitation may be lifted in the future.
+ *
+ * Return:
+ * * %0 - dynptr ready to use
+ * * %-EINVAL - invalid flags, dynptr set to null
+ */
+__bpf_kfunc int bpf_dynptr_from_skb_meta(struct __sk_buff *skb_, u64 flags,
+ struct bpf_dynptr *ptr__uninit)
+{
+ struct bpf_dynptr_kern *ptr = (struct bpf_dynptr_kern *)ptr__uninit;
+ struct sk_buff *skb = (struct sk_buff *)skb_;
+
+ if (flags) {
+ bpf_dynptr_set_null(ptr);
+ return -EINVAL;
+ }
+
+ bpf_dynptr_init(ptr, skb, BPF_DYNPTR_TYPE_SKB_META, 0, skb_metadata_len(skb));
+
+ if (skb_cloned(skb))
+ bpf_dynptr_set_rdonly(ptr);
+
+ return 0;
+}
+
__bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_md *x, u64 flags,
struct bpf_dynptr *ptr__uninit)
{
@@ -12160,6 +12220,98 @@ __bpf_kfunc int bpf_sock_ops_enable_tx_tstamp(struct bpf_sock_ops_kern *skops,
return 0;
}
+/**
+ * bpf_xdp_pull_data() - Pull in non-linear xdp data.
+ * @x: &xdp_md associated with the XDP buffer
+ * @len: length of data to be made directly accessible in the linear part
+ *
+ * Pull in data in case the XDP buffer associated with @x is non-linear and
+ * not all @len are in the linear data area.
+ *
+ * Direct packet access allows reading and writing linear XDP data through
+ * packet pointers (i.e., &xdp_md->data + offsets). The amount of data which
+ * ends up in the linear part of the xdp_buff depends on the NIC and its
+ * configuration. When a frag-capable XDP program wants to directly access
+ * headers that may be in the non-linear area, call this kfunc to make sure
+ * the data is available in the linear area. Alternatively, use dynptr or
+ * bpf_xdp_{load,store}_bytes() to access data without pulling.
+ *
+ * This kfunc can also be used with bpf_xdp_adjust_head() to decapsulate
+ * headers in the non-linear data area.
+ *
+ * A call to this kfunc may reduce headroom. If there is not enough tailroom
+ * in the linear data area, metadata and data will be shifted down.
+ *
+ * A call to this kfunc is susceptible to change the buffer geometry.
+ * Therefore, at load time, all checks on pointers previously done by the
+ * verifier are invalidated and must be performed again, if the kfunc is used
+ * in combination with direct packet access.
+ *
+ * Return:
+ * * %0 - success
+ * * %-EINVAL - invalid len
+ */
+__bpf_kfunc int bpf_xdp_pull_data(struct xdp_md *x, u32 len)
+{
+ struct xdp_buff *xdp = (struct xdp_buff *)x;
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+ int i, delta, shift, headroom, tailroom, n_frags_free = 0;
+ void *data_hard_end = xdp_data_hard_end(xdp);
+ int data_len = xdp->data_end - xdp->data;
+ void *start;
+
+ if (len <= data_len)
+ return 0;
+
+ if (unlikely(len > xdp_get_buff_len(xdp)))
+ return -EINVAL;
+
+ start = xdp_data_meta_unsupported(xdp) ? xdp->data : xdp->data_meta;
+
+ headroom = start - xdp->data_hard_start - sizeof(struct xdp_frame);
+ tailroom = data_hard_end - xdp->data_end;
+
+ delta = len - data_len;
+ if (unlikely(delta > tailroom + headroom))
+ return -EINVAL;
+
+ shift = delta - tailroom;
+ if (shift > 0) {
+ memmove(start - shift, start, xdp->data_end - start);
+
+ xdp->data_meta -= shift;
+ xdp->data -= shift;
+ xdp->data_end -= shift;
+ }
+
+ for (i = 0; i < sinfo->nr_frags && delta; i++) {
+ skb_frag_t *frag = &sinfo->frags[i];
+ u32 shrink = min_t(u32, delta, skb_frag_size(frag));
+
+ memcpy(xdp->data_end, skb_frag_address(frag), shrink);
+
+ xdp->data_end += shrink;
+ sinfo->xdp_frags_size -= shrink;
+ delta -= shrink;
+ if (bpf_xdp_shrink_data(xdp, frag, shrink, false))
+ n_frags_free++;
+ }
+
+ if (unlikely(n_frags_free)) {
+ memmove(sinfo->frags, sinfo->frags + n_frags_free,
+ (sinfo->nr_frags - n_frags_free) * sizeof(skb_frag_t));
+
+ sinfo->nr_frags -= n_frags_free;
+
+ if (!sinfo->nr_frags) {
+ xdp_buff_clear_frags_flag(xdp);
+ xdp_buff_clear_frag_pfmemalloc(xdp);
+ }
+ }
+
+ return 0;
+}
+
__bpf_kfunc_end_defs();
int bpf_dynptr_from_skb_rdonly(struct __sk_buff *skb, u64 flags,
@@ -12181,8 +12333,13 @@ BTF_KFUNCS_START(bpf_kfunc_check_set_skb)
BTF_ID_FLAGS(func, bpf_dynptr_from_skb, KF_TRUSTED_ARGS)
BTF_KFUNCS_END(bpf_kfunc_check_set_skb)
+BTF_KFUNCS_START(bpf_kfunc_check_set_skb_meta)
+BTF_ID_FLAGS(func, bpf_dynptr_from_skb_meta, KF_TRUSTED_ARGS)
+BTF_KFUNCS_END(bpf_kfunc_check_set_skb_meta)
+
BTF_KFUNCS_START(bpf_kfunc_check_set_xdp)
BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
+BTF_ID_FLAGS(func, bpf_xdp_pull_data)
BTF_KFUNCS_END(bpf_kfunc_check_set_xdp)
BTF_KFUNCS_START(bpf_kfunc_check_set_sock_addr)
@@ -12202,6 +12359,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
.set = &bpf_kfunc_check_set_skb,
};
+static const struct btf_kfunc_id_set bpf_kfunc_set_skb_meta = {
+ .owner = THIS_MODULE,
+ .set = &bpf_kfunc_check_set_skb_meta,
+};
+
static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
.owner = THIS_MODULE,
.set = &bpf_kfunc_check_set_xdp,
@@ -12237,6 +12399,8 @@ static int __init bpf_kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_kfunc_set_skb);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_kfunc_set_skb_meta);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_ACT, &bpf_kfunc_set_skb_meta);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
&bpf_kfunc_set_sock_addr);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index c28cd6665444..3c2dc4c5e683 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1328,7 +1328,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
struct netdev_rx_queue *queue = &dev->_rx[i];
struct kobject *kobj = &queue->kobj;
- if (!refcount_read(&dev_net(dev)->ns.count))
+ if (!check_net(dev_net(dev)))
kobj->uevent_suppress = 1;
if (dev->sysfs_rx_queue_group)
sysfs_remove_group(kobj, dev->sysfs_rx_queue_group);
@@ -2061,7 +2061,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num)
while (--i >= new_num) {
struct netdev_queue *queue = dev->_tx + i;
- if (!refcount_read(&dev_net(dev)->ns.count))
+ if (!check_net(dev_net(dev)))
queue->kobj.uevent_suppress = 1;
if (netdev_uses_bql(dev))
@@ -2315,7 +2315,7 @@ void netdev_unregister_kobject(struct net_device *ndev)
{
struct device *dev = &ndev->dev;
- if (!refcount_read(&dev_net(ndev)->ns.count))
+ if (!check_net(dev_net(ndev)))
dev_set_uevent_suppress(dev, 1);
kobject_get(&dev->kobj);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 1b6f3826dd0e..b0e0f22d7b21 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -20,6 +20,7 @@
#include <linux/sched/task.h>
#include <linux/uidgid.h>
#include <linux/proc_fs.h>
+#include <linux/nstree.h>
#include <net/aligned_data.h>
#include <net/sock.h>
@@ -314,7 +315,7 @@ int peernet2id_alloc(struct net *net, struct net *peer, gfp_t gfp)
{
int id;
- if (refcount_read(&net->ns.count) == 0)
+ if (!check_net(net))
return NETNSA_NSID_NOT_ASSIGNED;
spin_lock(&net->nsid_lock);
@@ -397,10 +398,15 @@ static __net_init void preinit_net_sysctl(struct net *net)
}
/* init code that must occur even if setup_net() is not called. */
-static __net_init void preinit_net(struct net *net, struct user_namespace *user_ns)
+static __net_init int preinit_net(struct net *net, struct user_namespace *user_ns)
{
+ int ret;
+
+ ret = ns_common_init(net);
+ if (ret)
+ return ret;
+
refcount_set(&net->passive, 1);
- refcount_set(&net->ns.count, 1);
ref_tracker_dir_init(&net->refcnt_tracker, 128, "net_refcnt");
ref_tracker_dir_init(&net->notrefcnt_tracker, 128, "net_notrefcnt");
@@ -420,6 +426,7 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_
INIT_LIST_HEAD(&net->ptype_all);
INIT_LIST_HEAD(&net->ptype_specific);
preinit_net_sysctl(net);
+ return 0;
}
/*
@@ -432,7 +439,7 @@ static __net_init int setup_net(struct net *net)
LIST_HEAD(net_exit_list);
int error = 0;
- net->net_cookie = atomic64_inc_return(&net_aligned_data.net_cookie);
+ net->net_cookie = ns_tree_gen_id(&net->ns);
list_for_each_entry(ops, &pernet_list, list) {
error = ops_init(ops, net);
@@ -442,6 +449,7 @@ static __net_init int setup_net(struct net *net)
down_write(&net_rwsem);
list_add_tail_rcu(&net->list, &net_namespace_list);
up_write(&net_rwsem);
+ ns_tree_add_raw(net);
out:
return error;
@@ -539,7 +547,7 @@ void net_drop_ns(void *p)
net_passive_dec(net);
}
-struct net *copy_net_ns(unsigned long flags,
+struct net *copy_net_ns(u64 flags,
struct user_namespace *user_ns, struct net *old_net)
{
struct ucounts *ucounts;
@@ -559,7 +567,9 @@ struct net *copy_net_ns(unsigned long flags,
goto dec_ucounts;
}
- preinit_net(net, user_ns);
+ rv = preinit_net(net, user_ns);
+ if (rv < 0)
+ goto dec_ucounts;
net->ucounts = ucounts;
get_user_ns(user_ns);
@@ -573,6 +583,7 @@ struct net *copy_net_ns(unsigned long flags,
if (rv < 0) {
put_userns:
+ ns_common_free(net);
#ifdef CONFIG_KEYS
key_remove_domain(net->key_domain);
#endif
@@ -659,8 +670,10 @@ static void cleanup_net(struct work_struct *work)
/* Don't let anyone else find us. */
down_write(&net_rwsem);
- llist_for_each_entry(net, net_kill_list, cleanup_list)
+ llist_for_each_entry(net, net_kill_list, cleanup_list) {
+ ns_tree_remove(net);
list_del_rcu(&net->list);
+ }
/* Cache last net. After we unlock rtnl, no one new net
* added to net_namespace_list can assign nsid pointer
* to a net from net_kill_list (see peernet2id_alloc()).
@@ -693,6 +706,7 @@ static void cleanup_net(struct work_struct *work)
/* Finally it is safe to free my network namespace structure */
list_for_each_entry_safe(net, tmp, &net_exit_list, exit_list) {
list_del_init(&net->exit_list);
+ ns_common_free(net);
dec_net_namespaces(net->ucounts);
#ifdef CONFIG_KEYS
key_remove_domain(net->key_domain);
@@ -812,31 +826,12 @@ static void net_ns_net_debugfs(struct net *net)
static __net_init int net_ns_net_init(struct net *net)
{
-#ifdef CONFIG_NET_NS
- net->ns.ops = &netns_operations;
-#endif
- net->ns.inum = PROC_NET_INIT_INO;
- if (net != &init_net) {
- int ret = ns_alloc_inum(&net->ns);
- if (ret)
- return ret;
- }
net_ns_net_debugfs(net);
return 0;
}
-static __net_exit void net_ns_net_exit(struct net *net)
-{
- /*
- * Initial network namespace doesn't exit so we don't need any
- * special checks here.
- */
- ns_free_inum(&net->ns);
-}
-
static struct pernet_operations __net_initdata net_ns_ops = {
.init = net_ns_net_init,
- .exit = net_ns_net_exit,
};
static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = {
@@ -1282,7 +1277,12 @@ void __init net_ns_init(void)
#ifdef CONFIG_KEYS
init_net.key_domain = &init_net_key_domain;
#endif
- preinit_net(&init_net, &init_user_ns);
+ /*
+ * This currently cannot fail as the initial network namespace
+ * has a static inode number.
+ */
+ if (preinit_net(&init_net, &init_user_ns))
+ panic("Could not preinitialize the initial network namespace");
down_write(&pernet_ops_rwsem);
if (setup_net(&init_net))
@@ -1517,11 +1517,6 @@ static struct ns_common *netns_get(struct task_struct *task)
return net ? &net->ns : NULL;
}
-static inline struct net *to_net_ns(struct ns_common *ns)
-{
- return container_of(ns, struct net, ns);
-}
-
static void netns_put(struct ns_common *ns)
{
put_net(to_net_ns(ns));
@@ -1548,7 +1543,6 @@ static struct user_namespace *netns_owner(struct ns_common *ns)
const struct proc_ns_operations netns_operations = {
.name = "net",
- .type = CLONE_NEWNET,
.get = netns_get,
.put = netns_put,
.install = netns_install,
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ee0274417948..1c0279b9cb9f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -6667,7 +6667,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
return NULL;
while (data_len) {
- if (nr_frags == MAX_SKB_FRAGS - 1)
+ if (nr_frags == MAX_SKB_FRAGS)
goto failure;
while (order && PAGE_ALIGN(data_len) < (PAGE_SIZE << order))
order--;