From eec517cdb4810b3843eb7707971de3164088bff1 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:50 +0200 Subject: net: Add IF_OPER_TESTING RFC 2863 defines the operational state testing. Add support for this state, both as a IF_LINK_MODE_ and __LINK_STATE_. Signed-off-by: Andrew Lunn Signed-off-by: David S. Miller --- net/core/dev.c | 5 +++++ net/core/link_watch.c | 12 ++++++++++-- net/core/rtnetlink.c | 9 ++++++++- 3 files changed, 23 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 522288177bbd..fb61522b1ce1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9136,6 +9136,11 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, else netif_dormant_off(dev); + if (rootdev->operstate == IF_OPER_TESTING) + netif_testing_on(dev); + else + netif_testing_off(dev); + if (netif_carrier_ok(rootdev)) netif_carrier_on(dev); else diff --git a/net/core/link_watch.c b/net/core/link_watch.c index f153e0601838..75431ca9300f 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -34,6 +34,9 @@ static DEFINE_SPINLOCK(lweventlist_lock); static unsigned char default_operstate(const struct net_device *dev) { + if (netif_testing(dev)) + return IF_OPER_TESTING; + if (!netif_carrier_ok(dev)) return (dev->ifindex != dev_get_iflink(dev) ? IF_OPER_LOWERLAYERDOWN : IF_OPER_DOWN); @@ -55,11 +58,15 @@ static void rfc2863_policy(struct net_device *dev) write_lock_bh(&dev_base_lock); switch(dev->link_mode) { + case IF_LINK_MODE_TESTING: + if (operstate == IF_OPER_UP) + operstate = IF_OPER_TESTING; + break; + case IF_LINK_MODE_DORMANT: if (operstate == IF_OPER_UP) operstate = IF_OPER_DORMANT; break; - case IF_LINK_MODE_DEFAULT: default: break; @@ -74,7 +81,8 @@ static void rfc2863_policy(struct net_device *dev) void linkwatch_init_dev(struct net_device *dev) { /* Handle pre-registration link state changes */ - if (!netif_carrier_ok(dev) || netif_dormant(dev)) + if (!netif_carrier_ok(dev) || netif_dormant(dev) || + netif_testing(dev)) rfc2863_policy(dev); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 709ebbf8ab5b..d6f4f4a9e8ba 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -829,11 +829,18 @@ static void set_operstate(struct net_device *dev, unsigned char transition) switch (transition) { case IF_OPER_UP: if ((operstate == IF_OPER_DORMANT || + operstate == IF_OPER_TESTING || operstate == IF_OPER_UNKNOWN) && - !netif_dormant(dev)) + !netif_dormant(dev) && !netif_testing(dev)) operstate = IF_OPER_UP; break; + case IF_OPER_TESTING: + if (operstate == IF_OPER_UP || + operstate == IF_OPER_UNKNOWN) + operstate = IF_OPER_TESTING; + break; + case IF_OPER_DORMANT: if (operstate == IF_OPER_UP || operstate == IF_OPER_UNKNOWN) -- cgit v1.2.3 From db30a57779b18b7cef092c21887ed2d23ad2bd35 Mon Sep 17 00:00:00 2001 From: Andrew Lunn Date: Mon, 20 Apr 2020 00:11:51 +0200 Subject: net: Add testing sysfs attribute Similar to speed, duplex and dorment, report the testing status in sysfs. Signed-off-by: Andrew Lunn Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-class-net | 13 +++++++++++++ net/core/net-sysfs.c | 15 ++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index 664a8f6a634f..3b404577f380 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -124,6 +124,19 @@ Description: authentication is performed (e.g: 802.1x). 'link_mode' attribute will also reflect the dormant state. +What: /sys/class/net//testing +Date: April 2002 +KernelVersion: 5.8 +Contact: netdev@vger.kernel.org +Description: + Indicates whether the interface is under test. Possible + values are: + 0: interface is not being tested + 1: interface is being tested + + When an interface is under test, it cannot be expected + to pass packets as normal. + What: /sys/clas/net//duplex Date: October 2009 KernelVersion: 2.6.33 diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 4773ad6ec111..0d9e46de205e 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -243,6 +243,18 @@ static ssize_t duplex_show(struct device *dev, } static DEVICE_ATTR_RO(duplex); +static ssize_t testing_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + if (netif_running(netdev)) + return sprintf(buf, fmt_dec, !!netif_testing(netdev)); + + return -EINVAL; +} +static DEVICE_ATTR_RO(testing); + static ssize_t dormant_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -260,7 +272,7 @@ static const char *const operstates[] = { "notpresent", /* currently unused */ "down", "lowerlayerdown", - "testing", /* currently unused */ + "testing", "dormant", "up" }; @@ -524,6 +536,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_speed.attr, &dev_attr_duplex.attr, &dev_attr_dormant.attr, + &dev_attr_testing.attr, &dev_attr_operstate.attr, &dev_attr_carrier_changes.attr, &dev_attr_ifalias.attr, -- cgit v1.2.3 From 6f8b12d661d09b488b9ac879b8eafbd2cc4a1450 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2020 09:13:27 -0700 Subject: net: napi: add hard irqs deferral feature Back in commit 3b47d30396ba ("net: gro: add a per device gro flush timer") we added the ability to arm one high resolution timer, that we used to keep not-complete packets in GRO engine a bit longer, hoping that further frames might be added to them. Since then, we added the napi_complete_done() interface, and commit 364b6055738b ("net: busy-poll: return busypolling status to drivers") allowed drivers to avoid re-arming NIC interrupts if we made a promise that their NAPI poll() handler would be called in the near future. This infrastructure can be leveraged, thanks to a new device parameter, which allows to arm the napi hrtimer, instead of re-arming the device hard IRQ. We have noticed that on some servers with 32 RX queues or more, the chit-chat between the NIC and the host caused by IRQ delivery and re-arming could hurt throughput by ~20% on 100Gbit NIC. In contrast, hrtimers are using local (percpu) resources and might have lower cost. The new tunable, named napi_defer_hard_irqs, is placed in the same hierarchy than gro_flush_timeout (/sys/class/net/ethX/) By default, both gro_flush_timeout and napi_defer_hard_irqs are zero. This patch does not change the prior behavior of gro_flush_timeout if used alone : NIC hard irqs should be rearmed as before. One concrete usage can be : echo 20000 >/sys/class/net/eth1/gro_flush_timeout echo 10 >/sys/class/net/eth1/napi_defer_hard_irqs If at least one packet is retired, then we will reset napi counter to 10 (napi_defer_hard_irqs), ensuring at least 10 periodic scans of the queue. On busy queues, this should avoid NIC hard IRQ, while before this patch IRQ avoidance was only possible if napi->poll() was exhausting its budget and not call napi_complete_done(). This feature also can be used to work around some non-optimal NIC irq coalescing strategies. Having the ability to insert XX usec delays between each napi->poll() can increase cache efficiency, since we increase batch sizes. It also keeps serving cpus not idle too long, reducing tail latencies. Co-developed-by: Luigi Rizzo Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 2 ++ net/core/dev.c | 29 ++++++++++++++++++----------- net/core/net-sysfs.c | 18 ++++++++++++++++++ 3 files changed, 38 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0750b54b3765..5a8d40f1ffe2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -329,6 +329,7 @@ struct napi_struct { unsigned long state; int weight; + int defer_hard_irqs_count; unsigned long gro_bitmask; int (*poll)(struct napi_struct *, int); #ifdef CONFIG_NETPOLL @@ -1995,6 +1996,7 @@ struct net_device { struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; + int napi_defer_hard_irqs; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; diff --git a/net/core/dev.c b/net/core/dev.c index fb61522b1ce1..67585484ad32 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6227,7 +6227,8 @@ EXPORT_SYMBOL(__napi_schedule_irqoff); bool napi_complete_done(struct napi_struct *n, int work_done) { - unsigned long flags, val, new; + unsigned long flags, val, new, timeout = 0; + bool ret = true; /* * 1) Don't let napi dequeue from the cpu poll list @@ -6239,20 +6240,23 @@ bool napi_complete_done(struct napi_struct *n, int work_done) NAPIF_STATE_IN_BUSY_POLL))) return false; - if (n->gro_bitmask) { - unsigned long timeout = 0; - - if (work_done) + if (work_done) { + if (n->gro_bitmask) timeout = n->dev->gro_flush_timeout; - + n->defer_hard_irqs_count = n->dev->napi_defer_hard_irqs; + } + if (n->defer_hard_irqs_count > 0) { + n->defer_hard_irqs_count--; + timeout = n->dev->gro_flush_timeout; + if (timeout) + ret = false; + } + if (n->gro_bitmask) { /* When the NAPI instance uses a timeout and keeps postponing * it, we need to bound somehow the time packets are kept in * the GRO layer */ napi_gro_flush(n, !!timeout); - if (timeout) - hrtimer_start(&n->timer, ns_to_ktime(timeout), - HRTIMER_MODE_REL_PINNED); } gro_normal_list(n); @@ -6284,7 +6288,10 @@ bool napi_complete_done(struct napi_struct *n, int work_done) return false; } - return true; + if (timeout) + hrtimer_start(&n->timer, ns_to_ktime(timeout), + HRTIMER_MODE_REL_PINNED); + return ret; } EXPORT_SYMBOL(napi_complete_done); @@ -6464,7 +6471,7 @@ static enum hrtimer_restart napi_watchdog(struct hrtimer *timer) /* Note : we use a relaxed variant of napi_schedule_prep() not setting * NAPI_STATE_MISSED, since we do not react to a device IRQ. */ - if (napi->gro_bitmask && !napi_disable_pending(napi) && + if (!napi_disable_pending(napi) && !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) __napi_schedule_irqoff(napi); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 0d9e46de205e..f3b650cd0923 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -382,6 +382,23 @@ static ssize_t gro_flush_timeout_store(struct device *dev, } NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); +static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) +{ + dev->napi_defer_hard_irqs = val; + return 0; +} + +static ssize_t napi_defer_hard_irqs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + return netdev_store(dev, attr, buf, len, change_napi_defer_hard_irqs); +} +NETDEVICE_SHOW_RW(napi_defer_hard_irqs, fmt_dec); + static ssize_t ifalias_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { @@ -545,6 +562,7 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_flags.attr, &dev_attr_tx_queue_len.attr, &dev_attr_gro_flush_timeout.attr, + &dev_attr_napi_defer_hard_irqs.attr, &dev_attr_phys_port_id.attr, &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, -- cgit v1.2.3 From 7e417a66b86c110f4b282945dac82e21e0b08328 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Apr 2020 09:13:28 -0700 Subject: net: napi: use READ_ONCE()/WRITE_ONCE() gro_flush_timeout and napi_defer_hard_irqs can be read from napi_complete_done() while other cpus write the value, whithout explicit synchronization. Use READ_ONCE()/WRITE_ONCE() to annotate the races. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++--- net/core/net-sysfs.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 67585484ad32..afff16849c26 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6242,12 +6242,12 @@ bool napi_complete_done(struct napi_struct *n, int work_done) if (work_done) { if (n->gro_bitmask) - timeout = n->dev->gro_flush_timeout; - n->defer_hard_irqs_count = n->dev->napi_defer_hard_irqs; + timeout = READ_ONCE(n->dev->gro_flush_timeout); + n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs); } if (n->defer_hard_irqs_count > 0) { n->defer_hard_irqs_count--; - timeout = n->dev->gro_flush_timeout; + timeout = READ_ONCE(n->dev->gro_flush_timeout); if (timeout) ret = false; } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index f3b650cd0923..880e89c894f6 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -367,7 +367,7 @@ NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); static int change_gro_flush_timeout(struct net_device *dev, unsigned long val) { - dev->gro_flush_timeout = val; + WRITE_ONCE(dev->gro_flush_timeout, val); return 0; } @@ -384,7 +384,7 @@ NETDEVICE_SHOW_RW(gro_flush_timeout, fmt_ulong); static int change_napi_defer_hard_irqs(struct net_device *dev, unsigned long val) { - dev->napi_defer_hard_irqs = val; + WRITE_ONCE(dev->napi_defer_hard_irqs, val); return 0; } -- cgit v1.2.3 From 0456ea170cd665ddbb9503be92e39f96055dd5fa Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Mon, 20 Apr 2020 10:46:10 -0700 Subject: bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT} Currently the following prog types don't fall back to bpf_base_func_proto() (instead they have cgroup_base_func_proto which has a limited set of helpers from bpf_base_func_proto): * BPF_PROG_TYPE_CGROUP_DEVICE * BPF_PROG_TYPE_CGROUP_SYSCTL * BPF_PROG_TYPE_CGROUP_SOCKOPT I don't see any specific reason why we shouldn't use bpf_base_func_proto(), every other type of program (except bpf-lirc and, understandably, tracing) use it, so let's fall back to bpf_base_func_proto for those prog types as well. This basically boils down to adding access to the following helpers: * BPF_FUNC_get_prandom_u32 * BPF_FUNC_get_smp_processor_id * BPF_FUNC_get_numa_node_id * BPF_FUNC_tail_call * BPF_FUNC_ktime_get_ns * BPF_FUNC_spin_lock (CAP_SYS_ADMIN) * BPF_FUNC_spin_unlock (CAP_SYS_ADMIN) * BPF_FUNC_jiffies64 (CAP_SYS_ADMIN) I've also added bpf_perf_event_output() because it's really handy for logging and debugging. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200420174610.77494-1-sdf@google.com --- include/linux/bpf.h | 1 + kernel/bpf/cgroup.c | 20 +++--------------- net/core/filter.c | 2 +- .../testing/selftests/bpf/verifier/event_output.c | 24 ++++++++++++++++++++++ 4 files changed, 29 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index fd2b2322412d..25da6ff2a880 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1523,6 +1523,7 @@ extern const struct bpf_func_proto bpf_strtoul_proto; extern const struct bpf_func_proto bpf_tcp_sock_proto; extern const struct bpf_func_proto bpf_jiffies64_proto; extern const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto; +extern const struct bpf_func_proto bpf_event_output_data_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..4d748c5785bc 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1060,30 +1060,16 @@ static const struct bpf_func_proto * cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - /* fall through */ + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; default: - return NULL; + return bpf_base_func_proto(func_id); } } diff --git a/net/core/filter.c b/net/core/filter.c index 7d6ceaa54d21..a943df3ad8b0 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4214,7 +4214,7 @@ BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags return bpf_event_output(map, flags, data, size, NULL, 0, NULL); } -static const struct bpf_func_proto bpf_event_output_data_proto = { +const struct bpf_func_proto bpf_event_output_data_proto = { .func = bpf_event_output_data, .gpl_only = true, .ret_type = RET_INTEGER, diff --git a/tools/testing/selftests/bpf/verifier/event_output.c b/tools/testing/selftests/bpf/verifier/event_output.c index 130553e19eca..99f8f582c02b 100644 --- a/tools/testing/selftests/bpf/verifier/event_output.c +++ b/tools/testing/selftests/bpf/verifier/event_output.c @@ -92,3 +92,27 @@ .result = ACCEPT, .retval = 1, }, +{ + "perfevent for cgroup dev", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_DEVICE, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sysctl", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SYSCTL, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, +{ + "perfevent for cgroup sockopt", + .insns = { __PERF_EVENT_INSNS__ }, + .prog_type = BPF_PROG_TYPE_CGROUP_SOCKOPT, + .fixup_map_event_output = { 4 }, + .result = ACCEPT, + .retval = 1, +}, -- cgit v1.2.3 From 6890896bd765b0504761c61901c9804fca23bfb2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 24 Apr 2020 16:59:41 -0700 Subject: bpf: Fix missing bpf_base_func_proto in cgroup_base_func_proto for CGROUP_NET=n linux-next build bot reported compile issue [1] with one of its configs. It looks like when we have CONFIG_NET=n and CONFIG_BPF{,_SYSCALL}=y, we are missing the bpf_base_func_proto definition (from net/core/filter.c) in cgroup_base_func_proto. I'm reshuffling the code a bit to make it work. The common helpers are moved into kernel/bpf/helpers.c and the bpf_base_func_proto is exported from there. Also, bpf_get_raw_cpu_id goes into kernel/bpf/core.c akin to existing bpf_user_rnd_u32. [1] https://lore.kernel.org/linux-next/CAKH8qBsBvKHswiX1nx40LgO+BGeTmb1NX8tiTttt_0uu6T3dCA@mail.gmail.com/T/#mff8b0c083314c68c2e2ef0211cb11bc20dc13c72 Fixes: 0456ea170cd6 ("bpf: Enable more helpers for BPF_PROG_TYPE_CGROUP_{DEVICE,SYSCTL,SOCKOPT}") Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Cc: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200424235941.58382-1-sdf@google.com --- include/linux/bpf.h | 8 ++++++ include/linux/filter.h | 2 -- kernel/bpf/core.c | 5 ++++ kernel/bpf/helpers.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++ net/core/filter.c | 78 +------------------------------------------------- 5 files changed, 87 insertions(+), 79 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 25da6ff2a880..5147e11e53ff 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1215,6 +1215,7 @@ int btf_check_type_match(struct bpf_verifier_env *env, struct bpf_prog *prog, struct bpf_prog *bpf_prog_by_id(u32 id); +const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id); #else /* !CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get(u32 ufd) { @@ -1365,6 +1366,12 @@ static inline struct bpf_prog *bpf_prog_by_id(u32 id) { return ERR_PTR(-ENOTSUPP); } + +static inline const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + return NULL; +} #endif /* CONFIG_BPF_SYSCALL */ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, @@ -1531,6 +1538,7 @@ const struct bpf_func_proto *bpf_tracing_func_proto( /* Shared helpers among cBPF and eBPF. */ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +u64 bpf_get_raw_cpu_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #if defined(CONFIG_NET) bool bpf_sock_common_is_valid_access(int off, int size, diff --git a/include/linux/filter.h b/include/linux/filter.h index 9b5aa5c483cc..af37318bb1c5 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -863,8 +863,6 @@ int bpf_prog_create(struct bpf_prog **pfp, struct sock_fprog_kern *fprog); int bpf_prog_create_from_user(struct bpf_prog **pfp, struct sock_fprog *fprog, bpf_aux_classic_check_t trans, bool save_orig); void bpf_prog_destroy(struct bpf_prog *fp); -const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id); int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk); int sk_attach_bpf(u32 ufd, struct sock *sk); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 916f5132a984..0cc91805069a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2136,6 +2136,11 @@ BPF_CALL_0(bpf_user_rnd_u32) return res; } +BPF_CALL_0(bpf_get_raw_cpu_id) +{ + return raw_smp_processor_id(); +} + /* Weak definitions of helper functions in case we don't have bpf syscall. */ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bafc53ddd350..dbba4f41d508 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -562,3 +562,76 @@ const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto = { .arg3_type = ARG_PTR_TO_UNINIT_MEM, .arg4_type = ARG_CONST_SIZE, }; + +static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { + .func = bpf_get_raw_cpu_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + +BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, + u64, flags, void *, data, u64, size) +{ + if (unlikely(flags & ~(BPF_F_INDEX_MASK))) + return -EINVAL; + + return bpf_event_output(map, flags, data, size, NULL, 0, NULL); +} + +const struct bpf_func_proto bpf_event_output_data_proto = { + .func = bpf_event_output_data, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +const struct bpf_func_proto * +bpf_base_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_raw_smp_processor_id_proto; + case BPF_FUNC_get_numa_node_id: + return &bpf_get_numa_node_id_proto; + case BPF_FUNC_tail_call: + return &bpf_tail_call_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; + default: + break; + } + + if (!capable(CAP_SYS_ADMIN)) + return NULL; + + switch (func_id) { + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); + case BPF_FUNC_jiffies64: + return &bpf_jiffies64_proto; + default: + return NULL; + } +} diff --git a/net/core/filter.c b/net/core/filter.c index a943df3ad8b0..a605626142b6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -256,17 +256,6 @@ BPF_CALL_2(bpf_skb_load_helper_32_no_cache, const struct sk_buff *, skb, offset); } -BPF_CALL_0(bpf_get_raw_cpu_id) -{ - return raw_smp_processor_id(); -} - -static const struct bpf_func_proto bpf_get_raw_smp_processor_id_proto = { - .func = bpf_get_raw_cpu_id, - .gpl_only = false, - .ret_type = RET_INTEGER, -}; - static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, struct bpf_insn *insn_buf) { @@ -4205,26 +4194,6 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_event_output_data, void *, ctx, struct bpf_map *, map, u64, flags, - void *, data, u64, size) -{ - if (unlikely(flags & ~(BPF_F_INDEX_MASK))) - return -EINVAL; - - return bpf_event_output(map, flags, data, size, NULL, 0, NULL); -} - -const struct bpf_func_proto bpf_event_output_data_proto = { - .func = bpf_event_output_data, - .gpl_only = true, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_CONST_MAP_PTR, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE_OR_ZERO, -}; - BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, int, level, int, optname, char *, optval, int, optlen) { @@ -5983,52 +5952,7 @@ bool bpf_helper_changes_pkt_data(void *func) return false; } -const struct bpf_func_proto * -bpf_base_func_proto(enum bpf_func_id func_id) -{ - switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - case BPF_FUNC_map_push_elem: - return &bpf_map_push_elem_proto; - case BPF_FUNC_map_pop_elem: - return &bpf_map_pop_elem_proto; - case BPF_FUNC_map_peek_elem: - return &bpf_map_peek_elem_proto; - case BPF_FUNC_get_prandom_u32: - return &bpf_get_prandom_u32_proto; - case BPF_FUNC_get_smp_processor_id: - return &bpf_get_raw_smp_processor_id_proto; - case BPF_FUNC_get_numa_node_id: - return &bpf_get_numa_node_id_proto; - case BPF_FUNC_tail_call: - return &bpf_tail_call_proto; - case BPF_FUNC_ktime_get_ns: - return &bpf_ktime_get_ns_proto; - default: - break; - } - - if (!capable(CAP_SYS_ADMIN)) - return NULL; - - switch (func_id) { - case BPF_FUNC_spin_lock: - return &bpf_spin_lock_proto; - case BPF_FUNC_spin_unlock: - return &bpf_spin_unlock_proto; - case BPF_FUNC_trace_printk: - return bpf_get_trace_printk_proto(); - case BPF_FUNC_jiffies64: - return &bpf_jiffies64_proto; - default: - return NULL; - } -} +const struct bpf_func_proto bpf_event_output_data_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -- cgit v1.2.3 From 6f3f65d80dac8f2bafce2213005821fccdce194c Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Mon, 20 Apr 2020 11:34:08 -0700 Subject: net: bpf: Allow TC programs to call BPF_FUNC_skb_change_head MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This allows TC eBPF programs to modify and forward (redirect) packets from interfaces without ethernet headers (for example cellular) to interfaces with (for example ethernet/wifi). The lack of this appears to simply be an oversight. Tested: in active use in Android R on 4.14+ devices for ipv6 cellular to wifi tethering offload. Signed-off-by: Lorenzo Colitti Signed-off-by: Maciej Żenczykowski Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index a605626142b6..da3b7a72c37c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6137,6 +6137,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_adjust_room_proto; case BPF_FUNC_skb_change_tail: return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; case BPF_FUNC_skb_get_tunnel_key: return &bpf_skb_get_tunnel_key_proto; case BPF_FUNC_skb_set_tunnel_key: -- cgit v1.2.3 From 32927393dc1ccd60fb2bdc05b9e8e88753761469 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 24 Apr 2020 08:43:38 +0200 Subject: sysctl: pass kernel pointers to ->proc_handler Instead of having all the sysctl handlers deal with user pointers, which is rather hairy in terms of the BPF interaction, copy the input to and from userspace in common code. This also means that the strings are always NUL-terminated by the common code, making the API a little bit safer. As most handler just pass through the data to one of the common handlers a lot of the changes are mechnical. Signed-off-by: Christoph Hellwig Acked-by: Andrey Ignatov Signed-off-by: Al Viro --- arch/arm64/kernel/armv8_deprecated.c | 2 +- arch/arm64/kernel/fpsimd.c | 3 +- arch/mips/lasat/sysctl.c | 13 +- arch/s390/appldata/appldata_base.c | 11 +- arch/s390/kernel/debug.c | 2 +- arch/s390/kernel/topology.c | 2 +- arch/s390/mm/cmm.c | 12 +- arch/x86/kernel/itmt.c | 3 +- drivers/cdrom/cdrom.c | 2 +- drivers/char/random.c | 2 +- drivers/macintosh/mac_hid.c | 3 +- drivers/parport/procfs.c | 39 +++--- fs/dcache.c | 2 +- fs/drop_caches.c | 2 +- fs/file_table.c | 4 +- fs/fscache/main.c | 3 +- fs/inode.c | 2 +- fs/proc/proc_sysctl.c | 47 ++++--- fs/quota/dquot.c | 2 +- fs/xfs/xfs_sysctl.c | 4 +- include/linux/bpf-cgroup.h | 9 +- include/linux/compaction.h | 2 +- include/linux/fs.h | 6 +- include/linux/ftrace.h | 3 +- include/linux/hugetlb.h | 15 +- include/linux/kprobes.h | 2 +- include/linux/latencytop.h | 4 +- include/linux/mm.h | 12 +- include/linux/mmzone.h | 23 ++- include/linux/nmi.h | 15 +- include/linux/perf_event.h | 13 +- include/linux/printk.h | 2 +- include/linux/sched/sysctl.h | 44 ++---- include/linux/security.h | 2 +- include/linux/sysctl.h | 53 +++---- include/linux/timer.h | 3 +- include/linux/vmstat.h | 8 +- include/linux/writeback.h | 28 ++-- ipc/ipc_sysctl.c | 10 +- ipc/mq_sysctl.c | 4 +- kernel/bpf/cgroup.c | 35 ++--- kernel/events/callchain.c | 2 +- kernel/events/core.c | 6 +- kernel/kprobes.c | 2 +- kernel/latencytop.c | 4 +- kernel/pid_namespace.c | 2 +- kernel/printk/printk.c | 2 +- kernel/sched/core.c | 9 +- kernel/sched/fair.c | 3 +- kernel/sched/rt.c | 10 +- kernel/sched/topology.c | 2 +- kernel/seccomp.c | 2 +- kernel/sysctl.c | 239 ++++++++++++-------------------- kernel/time/timer.c | 3 +- kernel/trace/trace.c | 2 +- kernel/umh.c | 2 +- kernel/utsname_sysctl.c | 2 +- kernel/watchdog.c | 12 +- mm/compaction.c | 2 +- mm/hugetlb.c | 9 +- mm/page-writeback.c | 16 +-- mm/page_alloc.c | 30 ++-- mm/util.c | 10 +- mm/vmstat.c | 4 +- net/bridge/br_netfilter_hooks.c | 2 +- net/core/neighbour.c | 28 ++-- net/core/sysctl_net_core.c | 27 ++-- net/decnet/dn_dev.c | 7 +- net/decnet/sysctl_net_decnet.c | 27 ++-- net/ipv4/devinet.c | 9 +- net/ipv4/route.c | 3 +- net/ipv4/sysctl_net_ipv4.c | 38 ++--- net/ipv6/addrconf.c | 33 ++--- net/ipv6/ndisc.c | 3 +- net/ipv6/route.c | 5 +- net/ipv6/sysctl_net_ipv6.c | 3 +- net/mpls/af_mpls.c | 5 +- net/netfilter/ipvs/ip_vs_ctl.c | 6 +- net/netfilter/nf_conntrack_standalone.c | 2 +- net/netfilter/nf_log.c | 2 +- net/phonet/sysctl.c | 3 +- net/rds/tcp.c | 6 +- net/sctp/sysctl.c | 32 ++--- net/sunrpc/sysctl.c | 29 ++-- net/sunrpc/xprtrdma/svc_rdma.c | 7 +- security/apparmor/lsm.c | 2 +- security/min_addr.c | 2 +- security/yama/yama_lsm.c | 2 +- 88 files changed, 458 insertions(+), 653 deletions(-) (limited to 'net/core') diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index c19aa81ddc8c..7364de008bab 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -203,7 +203,7 @@ static void __init register_insn_emulation(struct insn_emulation_ops *ops) } static int emulation_proc_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 94289d126993..35cb5e66c504 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -341,8 +341,7 @@ static unsigned int find_supported_vector_length(unsigned int vl) #ifdef CONFIG_SYSCTL static int sve_proc_do_default_vl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int vl = sve_default_vl; diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index e666fe26c50d..2119541a5b8b 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -95,16 +95,15 @@ int proc_lasat_ip(struct ctl_table *table, int write, len = 0; p = buffer; while (len < *lenp) { - if (get_user(c, p++)) - return -EFAULT; + c = *p; + p++; if (c == 0 || c == '\n') break; len++; } if (len >= sizeof(ipbuf)-1) len = sizeof(ipbuf) - 1; - if (copy_from_user(ipbuf, buffer, len)) - return -EFAULT; + memcpy(ipbuf, buffer, len); ipbuf[len] = 0; *ppos += *lenp; /* Now see if we can convert it to a valid IP */ @@ -122,11 +121,9 @@ int proc_lasat_ip(struct ctl_table *table, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, ipbuf, len)) - return -EFAULT; + memcpy(buffer, ipbuf, len); if (len < *lenp) { - if (put_user('\n', ((char *) buffer) + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; len++; } *lenp = len; diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index aa738cad1338..d74a4c7d5df6 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -51,10 +51,9 @@ static struct platform_device *appldata_pdev; */ static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata"; static int appldata_timer_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int appldata_interval_handler(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table_header *appldata_sysctl_header; static struct ctl_table appldata_table[] = { @@ -217,7 +216,7 @@ static void __appldata_vtimer_setup(int cmd) */ static int appldata_timer_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int timer_active = appldata_timer_active; int rc; @@ -250,7 +249,7 @@ appldata_timer_handler(struct ctl_table *ctl, int write, */ static int appldata_interval_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int interval = appldata_interval; int rc; @@ -280,7 +279,7 @@ appldata_interval_handler(struct ctl_table *ctl, int write, */ static int appldata_generic_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct appldata_ops *ops = NULL, *tmp_ops; struct list_head *lh; diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 6d321f5f101d..636446003a06 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -867,7 +867,7 @@ static int debug_active = 1; * if debug_active is already off */ static int s390dbf_procactive(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!write || debug_stoppable || !debug_active) return proc_dointvec(table, write, buffer, lenp, ppos); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index 5f70cefc13e4..332b542548cd 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -594,7 +594,7 @@ static int __init topology_setup(char *str) early_param("topology", topology_setup); static int topology_ctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int enabled = topology_is_enabled(); int new_mode; diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index ae989b740376..36bce727897b 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -245,7 +245,7 @@ static int cmm_skip_blanks(char *cp, char **endp) } static int cmm_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_pages(); struct ctl_table ctl_entry = { @@ -264,7 +264,7 @@ static int cmm_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { long nr = cmm_get_timed_pages(); @@ -284,7 +284,7 @@ static int cmm_timed_pages_handler(struct ctl_table *ctl, int write, } static int cmm_timeout_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; long nr, seconds; @@ -297,8 +297,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, if (write) { len = min(*lenp, sizeof(buf)); - if (copy_from_user(buf, buffer, len)) - return -EFAULT; + memcpy(buf, buffer, len); buf[len - 1] = '\0'; cmm_skip_blanks(buf, &p); nr = simple_strtoul(p, &p, 0); @@ -311,8 +310,7 @@ static int cmm_timeout_handler(struct ctl_table *ctl, int write, cmm_timeout_pages, cmm_timeout_seconds); if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, buf, len)) - return -EFAULT; + memcpy(buffer, buf, len); *lenp = len; *ppos += len; } diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index 1cb3ca9bba49..1afbdd1dd777 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -39,8 +39,7 @@ static bool __read_mostly sched_itmt_capable; unsigned int __read_mostly sysctl_sched_itmt_enabled; static int sched_itmt_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old_sysctl; int ret; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index faca0f346fff..e3bbe108eb54 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3631,7 +3631,7 @@ static void cdrom_update_settings(void) } static int cdrom_sysctl_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/drivers/char/random.c b/drivers/char/random.c index 0d10e31fd342..1e0db78b83ba 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -2057,7 +2057,7 @@ static char sysctl_bootid[16]; * sysctl system call, as 16 bytes of binary data. */ static int proc_do_uuid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; unsigned char buf[64], tmp_uuid[16], *uuid; diff --git a/drivers/macintosh/mac_hid.c b/drivers/macintosh/mac_hid.c index 7af0c536d568..28b8581b44dd 100644 --- a/drivers/macintosh/mac_hid.c +++ b/drivers/macintosh/mac_hid.c @@ -183,8 +183,7 @@ static void mac_hid_stop_emulation(void) } static int mac_hid_toggle_emumouse(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int old_val = *valp; diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c index 48804049d697..ee7b5daabfd4 100644 --- a/drivers/parport/procfs.c +++ b/drivers/parport/procfs.c @@ -34,7 +34,7 @@ #define PARPORT_MAX_SPINTIME_VALUE 1000 static int do_active_device(struct ctl_table *table, int write, - void __user *result, size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[256]; @@ -65,13 +65,13 @@ static int do_active_device(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #ifdef CONFIG_PARPORT_1284 static int do_autoprobe(struct ctl_table *table, int write, - void __user *result, size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport_device_info *info = table->extra2; const char *str; @@ -108,13 +108,13 @@ static int do_autoprobe(struct ctl_table *table, int write, *ppos += len; - return copy_to_user (result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #endif /* IEEE1284.3 support. */ static int do_hardware_base_addr(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -136,13 +136,12 @@ static int do_hardware_base_addr(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_irq(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -164,13 +163,12 @@ static int do_hardware_irq(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_dma(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[20]; @@ -192,13 +190,12 @@ static int do_hardware_dma(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } static int do_hardware_modes(struct ctl_table *table, int write, - void __user *result, - size_t *lenp, loff_t *ppos) + void *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; char buffer[40]; @@ -231,8 +228,8 @@ static int do_hardware_modes(struct ctl_table *table, int write, *lenp = len; *ppos += len; - - return copy_to_user(result, buffer, len) ? -EFAULT : 0; + memcpy(result, buffer, len); + return 0; } #define PARPORT_PORT_DIR(CHILD) { .procname = NULL, .mode = 0555, .child = CHILD } diff --git a/fs/dcache.c b/fs/dcache.c index b280e07e162b..8dd4d8d7bd0b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -165,7 +165,7 @@ static long get_nr_dentry_negative(void) return sum < 0 ? 0 : sum; } -int proc_nr_dentry(struct ctl_table *table, int write, void __user *buffer, +int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); diff --git a/fs/drop_caches.c b/fs/drop_caches.c index dc1a1d5d825b..f00fcc4a4f72 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -47,7 +47,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } int drop_caches_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/fs/file_table.c b/fs/file_table.c index 30d55c9a1744..3b612535391f 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -80,14 +80,14 @@ EXPORT_SYMBOL_GPL(get_max_files); */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #else int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 59c2494efda3..c1e6cc9091aa 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -51,8 +51,7 @@ static unsigned fscache_op_max_active = 2; static struct ctl_table_header *fscache_sysctl_header; static int fscache_max_active_sysctl(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct workqueue_struct **wqp = table->extra1; unsigned int *datap = table->data; diff --git a/fs/inode.c b/fs/inode.c index 93d9252a00ab..cc6e701b7e5d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -108,7 +108,7 @@ long get_nr_dirty_inodes(void) */ #ifdef CONFIG_SYSCTL int proc_nr_inodes(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index b6f5d459b087..df2143e05c57 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -539,13 +539,13 @@ out: return err; } -static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, +static ssize_t proc_sys_call_handler(struct file *filp, void __user *ubuf, size_t count, loff_t *ppos, int write) { struct inode *inode = file_inode(filp); struct ctl_table_header *head = grab_header(inode); struct ctl_table *table = PROC_I(inode)->sysctl_entry; - void *new_buf = NULL; + void *kbuf; ssize_t error; if (IS_ERR(head)) @@ -564,27 +564,38 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, if (!table->proc_handler) goto out; - error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, &count, - ppos, &new_buf); + if (write) { + kbuf = memdup_user_nul(ubuf, count); + if (IS_ERR(kbuf)) { + error = PTR_ERR(kbuf); + goto out; + } + } else { + error = -ENOMEM; + kbuf = kzalloc(count, GFP_KERNEL); + if (!kbuf) + goto out; + } + + error = BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, &kbuf, &count, + ppos); if (error) - goto out; + goto out_free_buf; /* careful: calling conventions are nasty here */ - if (new_buf) { - mm_segment_t old_fs; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - error = table->proc_handler(table, write, (void __user *)new_buf, - &count, ppos); - set_fs(old_fs); - kfree(new_buf); - } else { - error = table->proc_handler(table, write, buf, &count, ppos); + error = table->proc_handler(table, write, kbuf, &count, ppos); + if (error) + goto out_free_buf; + + if (!write) { + error = -EFAULT; + if (copy_to_user(ubuf, kbuf, count)) + goto out_free_buf; } - if (!error) - error = count; + error = count; +out_free_buf: + kfree(kbuf); out: sysctl_head_finish(head); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b6a4f692d345..7b4bac91146b 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2841,7 +2841,7 @@ const struct quotactl_ops dquot_quotactl_sysfile_ops = { EXPORT_SYMBOL(dquot_quotactl_sysfile_ops); static int do_proc_dqstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int type = (unsigned long *)table->data - dqstats.stat; s64 value = percpu_counter_sum(&dqstats.counter[type]); diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 31b3bdbd2eba..021ef96d0542 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -13,7 +13,7 @@ STATIC int xfs_stats_clear_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { @@ -33,7 +33,7 @@ STATIC int xfs_panic_mask_proc_handler( struct ctl_table *ctl, int write, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos) { diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index c11b413d5b1a..0b41fd5fc96b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -138,8 +138,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type); int __cgroup_bpf_run_filter_setsockopt(struct sock *sock, int *level, @@ -302,12 +301,12 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos, nbuf) \ +#define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ if (cgroup_bpf_enabled) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ - buf, count, pos, nbuf, \ + buf, count, pos, \ BPF_CGROUP_SYSCTL); \ __ret; \ }) @@ -429,7 +428,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos,nbuf) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 4b898cdbdf05..a0eabfbeb0e1 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -86,7 +86,7 @@ static inline unsigned long compact_gap(unsigned int order) #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos); + void *buffer, size_t *length, loff_t *ppos); extern int sysctl_extfrag_threshold; extern int sysctl_compact_unevictable_allowed; diff --git a/include/linux/fs.h b/include/linux/fs.h index 4f6f59b4f22a..9b028d260649 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3536,11 +3536,11 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, struct ctl_table; int proc_nr_files(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_dentry(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int proc_nr_inodes(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); #define __FMODE_EXEC ((__force int) FMODE_EXEC) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db95244a62d4..ddfc377de0d2 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -1005,8 +1005,7 @@ extern void disable_trace_on_warning(void); extern int __disable_trace_on_warning; int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #else /* CONFIG_TRACING */ static inline void disable_trace_on_warning(void) { } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 43a1cef8f0f1..92c21c5ccc58 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -105,14 +105,13 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, void hugepage_put_subpool(struct hugepage_subpool *spool); void reset_vma_resv_huge_pages(struct vm_area_struct *vma); -int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); - -#ifdef CONFIG_NUMA -int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -#endif +int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); +int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 04bdaf01112c..594265bfd390 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -312,7 +312,7 @@ DEFINE_INSN_CACHE_OPS(optinsn); #ifdef CONFIG_SYSCTL extern int sysctl_kprobes_optimization; extern int proc_kprobes_optimization_handler(struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *length, loff_t *ppos); #endif extern void wait_for_kprobe_optimizer(void); diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h index 9022f0c2e2e4..abe3d95f795b 100644 --- a/include/linux/latencytop.h +++ b/include/linux/latencytop.h @@ -38,8 +38,8 @@ account_scheduler_latency(struct task_struct *task, int usecs, int inter) void clear_tsk_latency_tracing(struct task_struct *p); -extern int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #else diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c4e7e76dedd..a7b1ef8ed970 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -201,10 +201,10 @@ extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; -extern int overcommit_ratio_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); -extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *, - size_t *, loff_t *); +int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) @@ -2957,8 +2957,8 @@ extern bool process_shares_mm(struct task_struct *p, struct mm_struct *mm); #ifdef CONFIG_SYSCTL extern int sysctl_drop_caches; -int drop_caches_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); #endif void drop_slab(void); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index b2af594ef0f7..93cf20f41e26 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -910,22 +910,21 @@ static inline int is_highmem(struct zone *zone) /* These two functions are used to setup the per zone pages min values */ struct ctl_table; -int min_free_kbytes_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *, + loff_t *); +int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, + size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, + size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); - -extern int numa_zonelist_order_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *, size_t *, loff_t *); +int numa_zonelist_order_handler(struct ctl_table *, int, + void *, size_t *, loff_t *); extern int percpu_pagelist_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 diff --git a/include/linux/nmi.h b/include/linux/nmi.h index 9003e29cde46..750c7f395ca9 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -202,16 +202,11 @@ static inline void watchdog_update_hrtimer_threshold(u64 period) { } #endif struct ctl_table; -extern int proc_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_nmi_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_soft_watchdog(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_watchdog_thresh(struct ctl_table *, int , - void __user *, size_t *, loff_t *); -extern int proc_watchdog_cpumask(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +int proc_watchdog(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_nmi_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_soft_watchdog(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_watchdog_thresh(struct ctl_table *, int , void *, size_t *, loff_t *); +int proc_watchdog_cpumask(struct ctl_table *, int, void *, size_t *, loff_t *); #ifdef CONFIG_HAVE_ACPI_APEI_NMI #include diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9c3e7619c929..347ea379622a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1280,15 +1280,12 @@ extern int sysctl_perf_cpu_time_max_percent; extern void perf_sample_event_took(u64 sample_len_ns); -extern int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - +int perf_proc_update_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); /* Access to perf_event_open(2) syscall. */ #define PERF_SECURITY_OPEN 0 diff --git a/include/linux/printk.h b/include/linux/printk.h index e061635e0409..fcde0772ec98 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -189,7 +189,7 @@ extern int printk_delay_msec; extern int dmesg_restrict; extern int -devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void __user *buf, +devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, void *buf, size_t *lenp, loff_t *ppos); extern void wake_up_klogd(void); diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index d4f6215ee03f..7b4d3a49b6c5 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -12,9 +12,8 @@ extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; extern unsigned long sysctl_hung_task_check_interval_secs; extern int sysctl_hung_task_warnings; -extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos); +int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #else /* Avoid need for ifdefs elsewhere in the code */ enum { sysctl_hung_task_timeout_secs = 0 }; @@ -43,8 +42,7 @@ extern __read_mostly unsigned int sysctl_sched_migration_cost; extern __read_mostly unsigned int sysctl_sched_nr_migrate; int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos); + void *buffer, size_t *length, loff_t *ppos); #endif /* @@ -72,33 +70,21 @@ extern unsigned int sysctl_sched_autogroup_enabled; extern int sysctl_sched_rr_timeslice; extern int sched_rr_timeslice; -extern int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -extern int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -#ifdef CONFIG_UCLAMP_TASK -extern int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -#endif - -extern int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); - -extern int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern unsigned int sysctl_sched_energy_aware; -extern int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int sched_energy_aware_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); #endif #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/linux/security.h b/include/linux/security.h index a8d9310472df..6aa229b252ce 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -211,7 +211,7 @@ struct request_sock; #ifdef CONFIG_MMU extern int mmap_min_addr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #endif /* security_inode_init_security callback function to write xattrs */ diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 36143ca40b56..f2401e45a3c2 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -44,35 +44,26 @@ struct ctl_dir; extern const int sysctl_vals[]; -typedef int proc_handler (struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -extern int proc_dostring(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_douintvec(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_minmax(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int proc_dointvec_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_doulongvec_minmax(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, - void __user *, size_t *, loff_t *); -extern int proc_do_large_bitmap(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -extern int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +typedef int proc_handler(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); + +int proc_dostring(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_douintvec_minmax(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +int proc_dointvec_jiffies(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void *, size_t *, + loff_t *); +int proc_dointvec_ms_jiffies(struct ctl_table *, int, void *, size_t *, + loff_t *); +int proc_doulongvec_minmax(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, void *, + size_t *, loff_t *); +int proc_do_large_bitmap(struct ctl_table *, int, void *, size_t *, loff_t *); +int proc_do_static_key(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); /* * Register a set of sysctl names by calling register_sysctl_table @@ -246,7 +237,7 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, #endif /* CONFIG_SYSCTL */ -int sysctl_max_threads(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); #endif /* _LINUX_SYSCTL_H */ diff --git a/include/linux/timer.h b/include/linux/timer.h index 0dc19a8c39c9..07910ae5ddd9 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -201,8 +201,7 @@ struct ctl_table; extern unsigned int sysctl_timer_migration; int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); #endif unsigned long __round_jiffies(unsigned long j, int cpu); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 292485f3d24d..cb507151710f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -16,8 +16,8 @@ extern int sysctl_stat_interval; #define DISABLE_NUMA_STAT 0 extern int sysctl_vm_numa_stat; DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); -extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, - int write, void __user *buffer, size_t *length, loff_t *ppos); +int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, + void *buffer, size_t *length, loff_t *ppos); #endif struct reclaim_stat { @@ -274,8 +274,8 @@ void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); struct ctl_table; -int vmstat_refresh(struct ctl_table *, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); +int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, + loff_t *ppos); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a19d845dd7eb..f8a7e1a850fb 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -362,24 +362,18 @@ extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; -extern int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -extern int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); +int dirty_background_ratio_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_background_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_ratio_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_bytes_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); int dirtytime_interval_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos); - -struct ctl_table; -int dirty_writeback_centisecs_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); + void *buffer, size_t *lenp, loff_t *ppos); +int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index affd66537e87..d1b8644bfb88 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -24,7 +24,7 @@ static void *get_ipc(struct ctl_table *table) #ifdef CONFIG_PROC_SYSCTL static int proc_ipc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -35,7 +35,7 @@ static int proc_ipc_dointvec(struct ctl_table *table, int write, } static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; @@ -46,7 +46,7 @@ static int proc_ipc_dointvec_minmax(struct ctl_table *table, int write, } static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ipc_namespace *ns = current->nsproxy->ipc_ns; int err = proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos); @@ -59,7 +59,7 @@ static int proc_ipc_dointvec_minmax_orphans(struct ctl_table *table, int write, } static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; memcpy(&ipc_table, table, sizeof(ipc_table)); @@ -70,7 +70,7 @@ static int proc_ipc_doulongvec_minmax(struct ctl_table *table, int write, } static int proc_ipc_auto_msgmni(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; int dummy = 0; diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 7c00f28923a8..72a92a08c848 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -19,7 +19,7 @@ static void *get_mq(struct ctl_table *table) } static int proc_mq_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); @@ -29,7 +29,7 @@ static int proc_mq_dointvec(struct ctl_table *table, int write, } static int proc_mq_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index cb305e71e7de..977bc69bb1c5 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1137,16 +1137,13 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { * @head: sysctl table header * @table: sysctl table * @write: sysctl is being read (= 0) or written (= 1) - * @buf: pointer to buffer passed by user space + * @buf: pointer to buffer (in and out) * @pcount: value-result argument: value is size of buffer pointed to by @buf, * result is size of @new_buf if program set new value, initial value * otherwise * @ppos: value-result argument: value is position at which read from or write * to sysctl is happening, result is new position if program overrode it, * initial value otherwise - * @new_buf: pointer to pointer to new buffer that will be allocated if program - * overrides new value provided by user space on sysctl write - * NOTE: it's caller responsibility to free *new_buf if it was set * @type: type of program to be executed * * Program is run when sysctl is being accessed, either read or written, and @@ -1157,8 +1154,7 @@ const struct bpf_verifier_ops cg_dev_verifier_ops = { */ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, struct ctl_table *table, int write, - void __user *buf, size_t *pcount, - loff_t *ppos, void **new_buf, + void **buf, size_t *pcount, loff_t *ppos, enum bpf_attach_type type) { struct bpf_sysctl_kern ctx = { @@ -1173,36 +1169,28 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, .new_updated = 0, }; struct cgroup *cgrp; + loff_t pos = 0; int ret; ctx.cur_val = kmalloc_track_caller(ctx.cur_len, GFP_KERNEL); - if (ctx.cur_val) { - mm_segment_t old_fs; - loff_t pos = 0; - - old_fs = get_fs(); - set_fs(KERNEL_DS); - if (table->proc_handler(table, 0, (void __user *)ctx.cur_val, - &ctx.cur_len, &pos)) { - /* Let BPF program decide how to proceed. */ - ctx.cur_len = 0; - } - set_fs(old_fs); - } else { + if (!ctx.cur_val || + table->proc_handler(table, 0, ctx.cur_val, &ctx.cur_len, &pos)) { /* Let BPF program decide how to proceed. */ ctx.cur_len = 0; } - if (write && buf && *pcount) { + if (write && *buf && *pcount) { /* BPF program should be able to override new value with a * buffer bigger than provided by user. */ ctx.new_val = kmalloc_track_caller(PAGE_SIZE, GFP_KERNEL); ctx.new_len = min_t(size_t, PAGE_SIZE, *pcount); - if (!ctx.new_val || - copy_from_user(ctx.new_val, buf, ctx.new_len)) + if (ctx.new_val) { + memcpy(ctx.new_val, *buf, ctx.new_len); + } else { /* Let BPF program decide how to proceed. */ ctx.new_len = 0; + } } rcu_read_lock(); @@ -1213,7 +1201,8 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head, kfree(ctx.cur_val); if (ret == 1 && ctx.new_updated) { - *new_buf = ctx.new_val; + kfree(*buf); + *buf = ctx.new_val; *pcount = ctx.new_len; } else { kfree(ctx.new_val); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c2b41a263166..bdb1533ada81 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -236,7 +236,7 @@ exit_put: * sysctl_perf_event_max_contexts_per_stack. */ int perf_event_max_stack_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *value = table->data; int new_value = *value, ret; diff --git a/kernel/events/core.c b/kernel/events/core.c index bc9b98a9af9a..f86d46f2c4d9 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -437,8 +437,7 @@ static void update_perf_cpu_limits(void) static bool perf_rotate_context(struct perf_cpu_context *cpuctx); int perf_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int perf_cpu = sysctl_perf_cpu_time_max_percent; @@ -462,8 +461,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write, int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2625c241ac00..ffbe03a45c16 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -892,7 +892,7 @@ static void unoptimize_all_kprobes(void) static DEFINE_MUTEX(kprobe_sysctl_mutex); int sysctl_kprobes_optimization; int proc_kprobes_optimization_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, + void *buffer, size_t *length, loff_t *ppos) { int ret; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index 8d1c15832e55..166d7bf49666 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -269,8 +269,8 @@ static int __init init_lstats_procfs(void) return 0; } -int sysctl_latencytop(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_latencytop(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int err; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 01f8ba32cc0c..3ccaba5f15c0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -263,7 +263,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) #ifdef CONFIG_CHECKPOINT_RESTORE static int pid_ns_ctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct pid_namespace *pid_ns = task_active_pid_ns(current); struct ctl_table tmp = *table; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9a9b6156270b..471f649b5868 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -173,7 +173,7 @@ __setup("printk.devkmsg=", control_devkmsg); char devkmsg_log_str[DEVKMSG_STR_MAX_SIZE] = "ratelimit"; int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char old_str[DEVKMSG_STR_MAX_SIZE]; unsigned int old; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a61a3b8eaa9..5c589a2e4d19 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1110,8 +1110,7 @@ static void uclamp_update_root_tg(void) { } #endif int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { bool update_root_tg = false; int old_min, old_max; @@ -2723,7 +2722,7 @@ void set_numabalancing_state(bool enabled) #ifdef CONFIG_PROC_SYSCTL int sysctl_numa_balancing(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; @@ -2797,8 +2796,8 @@ static void __init init_schedstats(void) } #ifdef CONFIG_PROC_SYSCTL -int sysctl_schedstats(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table t; int err; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 02f323b85b6d..b6077fd5b32f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -645,8 +645,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) */ int sched_proc_update_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); unsigned int factor = get_update_sysctl_factor(); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index df11d88c9895..45da29de3ecc 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2714,9 +2714,8 @@ static void sched_rt_do_global(void) def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); } -int sched_rt_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rt_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_period, old_runtime; static DEFINE_MUTEX(mutex); @@ -2754,9 +2753,8 @@ undo: return ret; } -int sched_rr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int sched_rr_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; static DEFINE_MUTEX(mutex); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 8344757bba6e..fa64b2ee9fe6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -209,7 +209,7 @@ bool sched_energy_update; #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret, state; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 55a6184f5990..d653d8426de9 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1776,7 +1776,7 @@ static void audit_actions_logged(u32 actions_logged, u32 old_actions_logged, } static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3fafca3ced98..e961286d0e14 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -208,12 +208,10 @@ static int max_extfrag_threshold = 1000; #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(char *data, int maxlen, int write, - char __user *buffer, - size_t *lenp, loff_t *ppos) + char *buffer, size_t *lenp, loff_t *ppos) { size_t len; - char __user *p; - char c; + char c, *p; if (!data || !maxlen || !*lenp) { *lenp = 0; @@ -238,8 +236,7 @@ static int _proc_do_string(char *data, int maxlen, int write, *ppos += *lenp; p = buffer; while ((p - buffer) < *lenp && len < maxlen - 1) { - if (get_user(c, p++)) - return -EFAULT; + c = *(p++); if (c == 0 || c == '\n') break; data[len++] = c; @@ -261,11 +258,9 @@ static int _proc_do_string(char *data, int maxlen, int write, if (len > *lenp) len = *lenp; if (len) - if (copy_to_user(buffer, data, len)) - return -EFAULT; + memcpy(buffer, data, len); if (len < *lenp) { - if (put_user('\n', buffer + len)) - return -EFAULT; + buffer[len] = '\n'; len++; } *lenp = len; @@ -326,13 +321,13 @@ static bool proc_first_pos_non_zero_ignore(loff_t *ppos, * Returns 0 on success. */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) proc_first_pos_non_zero_ignore(ppos, table); - return _proc_do_string((char *)(table->data), table->maxlen, write, - (char __user *)buffer, lenp, ppos); + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, + ppos); } static size_t proc_skip_spaces(char **buf) @@ -463,11 +458,10 @@ static int proc_get_long(char **buf, size_t *size, * @val: the integer to be converted * @neg: sign of the number, %TRUE for negative * - * In case of success %0 is returned and @buf and @size are updated with - * the amount of bytes written. + * In case of success @buf and @size are updated with the amount of bytes + * written. */ -static int proc_put_long(void __user **buf, size_t *size, unsigned long val, - bool neg) +static void proc_put_long(void **buf, size_t *size, unsigned long val, bool neg) { int len; char tmp[TMPBUFLEN], *p = tmp; @@ -476,24 +470,22 @@ static int proc_put_long(void __user **buf, size_t *size, unsigned long val, len = strlen(tmp); if (len > *size) len = *size; - if (copy_to_user(*buf, tmp, len)) - return -EFAULT; + memcpy(*buf, tmp, len); *size -= len; *buf += len; - return 0; } #undef TMPBUFLEN -static int proc_put_char(void __user **buf, size_t *size, char c) +static void proc_put_char(void **buf, size_t *size, char c) { if (*size) { - char __user **buffer = (char __user **)buf; - if (put_user(c, *buffer)) - return -EFAULT; - (*size)--, (*buffer)++; + char **buffer = (char **)buf; + **buffer = c; + + (*size)--; + (*buffer)++; *buf = *buffer; } - return 0; } static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp, @@ -541,7 +533,7 @@ static int do_proc_douintvec_conv(unsigned long *lvalp, static const char proc_wspace_sep[] = { ' ', '\t', '\n' }; static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -549,7 +541,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, { int *i, vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -569,9 +561,7 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first=0) { @@ -598,24 +588,17 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, break; } if (!first) - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - err = proc_put_long(&buffer, &left, lval, neg); - if (err) - break; + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, lval, neg); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err && left) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -623,7 +606,7 @@ out: } static int do_proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(bool *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) @@ -634,7 +617,7 @@ static int do_proc_dointvec(struct ctl_table *table, int write, static int do_proc_douintvec_w(unsigned int *tbl_data, struct ctl_table *table, - void __user *buffer, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -645,7 +628,7 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, int err = 0; size_t left; bool neg; - char *kbuf = NULL, *p; + char *p = buffer; left = *lenp; @@ -655,10 +638,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return -EINVAL; - left -= proc_skip_spaces(&p); if (!left) { err = -EINVAL; @@ -682,7 +661,6 @@ static int do_proc_douintvec_w(unsigned int *tbl_data, left -= proc_skip_spaces(&p); out_free: - kfree(kbuf); if (err) return -EINVAL; @@ -694,7 +672,7 @@ bail_early: return err; } -static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, +static int do_proc_douintvec_r(unsigned int *tbl_data, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -712,11 +690,11 @@ static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer, goto out; } - err = proc_put_long(&buffer, &left, lval, false); - if (err || !left) + proc_put_long(&buffer, &left, lval, false); + if (!left) goto out; - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); out: *lenp -= left; @@ -726,7 +704,7 @@ out: } static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, - int write, void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, @@ -762,7 +740,7 @@ static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table, } static int do_proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos, + void *buffer, size_t *lenp, loff_t *ppos, int (*conv)(unsigned long *lvalp, unsigned int *valp, int write, void *data), @@ -785,16 +763,15 @@ static int do_proc_douintvec(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL); } #ifdef CONFIG_COMPACTION static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, - int write, void __user *buffer, - size_t *lenp, loff_t *ppos) + int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret, old; @@ -826,8 +803,8 @@ static int proc_dointvec_minmax_warn_RT_change(struct ctl_table *table, * * Returns 0 on success. */ -int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_douintvec(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_douintvec_conv, NULL); @@ -838,7 +815,7 @@ int proc_douintvec(struct ctl_table *table, int write, * This means we can safely use a temporary. */ static int proc_taint(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long tmptaint = get_taint(); @@ -870,7 +847,7 @@ static int proc_taint(struct ctl_table *table, int write, #ifdef CONFIG_PRINTK static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; @@ -936,7 +913,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success or -EINVAL on write when the range check fails. */ int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, @@ -1005,7 +982,7 @@ static int do_proc_douintvec_minmax_conv(unsigned long *lvalp, * Returns 0 on success or -ERANGE on write when the range check fails. */ int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_douintvec_minmax_conv_param param = { .min = (unsigned int *) table->extra1, @@ -1036,7 +1013,7 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, } static int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_douintvec(table, write, buffer, lenp, ppos, do_proc_dopipe_max_size_conv, NULL); @@ -1057,7 +1034,7 @@ static void validate_coredump_safety(void) } static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!error) @@ -1067,7 +1044,7 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, #ifdef CONFIG_COREDUMP static int proc_dostring_coredump(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int error = proc_dostring(table, write, buffer, lenp, ppos); if (!error) @@ -1078,7 +1055,7 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #ifdef CONFIG_MAGIC_SYSRQ static int sysrq_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int tmp, ret; @@ -1096,16 +1073,14 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write, } #endif -static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) +static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, + int write, void *buffer, size_t *lenp, loff_t *ppos, + unsigned long convmul, unsigned long convdiv) { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; size_t left; - char *kbuf = NULL, *p; + char *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -1124,9 +1099,7 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); + p = buffer; } for (; left && vleft--; i++, first = 0) { @@ -1154,26 +1127,18 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int *i = val; } else { val = convdiv * (*i) / convmul; - if (!first) { - err = proc_put_char(&buffer, &left, '\t'); - if (err) - break; - } - err = proc_put_long(&buffer, &left, val, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, '\t'); + proc_put_long(&buffer, &left, val, false); } } if (!write && !first && left && !err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); if (write && !err) left -= proc_skip_spaces(&p); - if (write) { - kfree(kbuf); - if (first) - return err ? : -EINVAL; - } + if (write && first) + return err ? : -EINVAL; *lenp -= left; out: *ppos += *lenp; @@ -1181,10 +1146,8 @@ out: } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos, - unsigned long convmul, - unsigned long convdiv) + void *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, + unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, buffer, lenp, ppos, convmul, convdiv); @@ -1207,7 +1170,7 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } @@ -1230,8 +1193,7 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); @@ -1325,7 +1287,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, * Returns 0 on success. */ int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); @@ -1347,7 +1309,7 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); @@ -1369,15 +1331,15 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_do_cad_pid(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct pid *new_pid; pid_t tmp; @@ -1416,7 +1378,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, * Returns 0 on success. */ int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err = 0; bool first = 1; @@ -1432,7 +1394,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - char *kbuf, *p; + char *p = buffer; size_t skipped = 0; if (left > PAGE_SIZE - 1) { @@ -1441,15 +1403,9 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, skipped = *lenp - left; } - p = kbuf = memdup_user_nul(buffer, left); - if (IS_ERR(kbuf)) - return PTR_ERR(kbuf); - tmp_bitmap = bitmap_zalloc(bitmap_len, GFP_KERNEL); - if (!tmp_bitmap) { - kfree(kbuf); + if (!tmp_bitmap) return -ENOMEM; - } proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; @@ -1513,7 +1469,6 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, first = 0; proc_skip_char(&p, &left, '\n'); } - kfree(kbuf); left += skipped; } else { unsigned long bit_a, bit_b = 0; @@ -1525,27 +1480,17 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, bit_b = find_next_zero_bit(bitmap, bitmap_len, bit_a + 1) - 1; - if (!first) { - err = proc_put_char(&buffer, &left, ','); - if (err) - break; - } - err = proc_put_long(&buffer, &left, bit_a, false); - if (err) - break; + if (!first) + proc_put_char(&buffer, &left, ','); + proc_put_long(&buffer, &left, bit_a, false); if (bit_a != bit_b) { - err = proc_put_char(&buffer, &left, '-'); - if (err) - break; - err = proc_put_long(&buffer, &left, bit_b, false); - if (err) - break; + proc_put_char(&buffer, &left, '-'); + proc_put_long(&buffer, &left, bit_b, false); } first = 0; bit_b++; } - if (!err) - err = proc_put_char(&buffer, &left, '\n'); + proc_put_char(&buffer, &left, '\n'); } if (!err) { @@ -1566,68 +1511,67 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #else /* CONFIG_PROC_SYSCTL */ int proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_douintvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { - return -ENOSYS; + return -ENOSYS; } int proc_do_large_bitmap(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } @@ -1636,8 +1580,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, #if defined(CONFIG_SYSCTL) int proc_do_static_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct static_key *key = (struct static_key *)table->data; static DEFINE_MUTEX(static_key_mutex); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index a5221abb4594..398e6eadb861 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -249,8 +249,7 @@ void timers_update_nohz(void) } int timer_migration_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8d2b98812625..167a74a15b1a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2661,7 +2661,7 @@ static void output_printk(struct trace_event_buffer *fbuffer) } int tracepoint_printk_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int save_tracepoint_printk; diff --git a/kernel/umh.c b/kernel/umh.c index 7f255b5a8845..9788ed481a6a 100644 --- a/kernel/umh.c +++ b/kernel/umh.c @@ -630,7 +630,7 @@ int call_usermodehelper(const char *path, char **argv, char **envp, int wait) EXPORT_SYMBOL(call_usermodehelper); static int proc_cap_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 3732c888a949..4ca61d49885b 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -30,7 +30,7 @@ static void *get_uts(struct ctl_table *table) * to observe. Should this be in kernel/sys.c ???? */ static int proc_do_uts_string(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index b6b1f54a7837..53ff2c81b084 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -661,7 +661,7 @@ static void proc_watchdog_update(void) * proc_soft_watchdog | soft_watchdog_user_enabled | SOFT_WATCHDOG_ENABLED */ static int proc_watchdog_common(int which, struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old, *param = table->data; @@ -688,7 +688,7 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * /proc/sys/kernel/watchdog */ int proc_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(NMI_WATCHDOG_ENABLED|SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -698,7 +698,7 @@ int proc_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/nmi_watchdog */ int proc_nmi_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!nmi_watchdog_available && write) return -ENOTSUPP; @@ -710,7 +710,7 @@ int proc_nmi_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/soft_watchdog */ int proc_soft_watchdog(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { return proc_watchdog_common(SOFT_WATCHDOG_ENABLED, table, write, buffer, lenp, ppos); @@ -720,7 +720,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, * /proc/sys/kernel/watchdog_thresh */ int proc_watchdog_thresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err, old; @@ -743,7 +743,7 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, * been brought online, if desired. */ int proc_watchdog_cpumask(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int err; diff --git a/mm/compaction.c b/mm/compaction.c index 46f0fcc93081..d8cfb7b99a83 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2463,7 +2463,7 @@ int sysctl_compact_memory; * /proc/sys/vm/compact_memory */ int sysctl_compaction_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { if (write) compact_nodes(); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index cd459155d28a..2277c5728b1f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3352,7 +3352,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL static int hugetlb_sysctl_handler_common(bool obey_mempolicy, struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp = h->max_huge_pages; @@ -3375,7 +3375,7 @@ out: } int hugetlb_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(false, table, write, @@ -3384,7 +3384,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, #ifdef CONFIG_NUMA int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { return hugetlb_sysctl_handler_common(true, table, write, buffer, length, ppos); @@ -3392,8 +3392,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, #endif /* CONFIG_NUMA */ int hugetlb_overcommit_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7326b54ab728..d3ee4c4dafac 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -512,8 +512,7 @@ bool node_dirty_ok(struct pglist_data *pgdat) } int dirty_background_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -524,8 +523,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write, } int dirty_background_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -535,9 +533,8 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write, return ret; } -int dirty_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; @@ -551,8 +548,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write, } int dirty_bytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; @@ -1972,7 +1968,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { unsigned int old_interval = dirty_writeback_interval; int ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 62c1550cd43e..0c43e9ae5004 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5546,21 +5546,11 @@ char numa_zonelist_order[] = "Node"; * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, - loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { - char *str; - int ret; - - if (!write) - return proc_dostring(table, write, buffer, length, ppos); - str = memdup_user_nul(buffer, 16); - if (IS_ERR(str)) - return PTR_ERR(str); - - ret = __parse_numa_zonelist_order(str); - kfree(str); - return ret; + if (write) + return __parse_numa_zonelist_order(buffer); + return proc_dostring(table, write, buffer, length, ppos); } @@ -7963,7 +7953,7 @@ core_initcall(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -7979,7 +7969,7 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, } int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8009,7 +7999,7 @@ static void setup_min_unmapped_ratio(void) int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8036,7 +8026,7 @@ static void setup_min_slab_ratio(void) } int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int rc; @@ -8060,7 +8050,7 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { proc_dointvec_minmax(table, write, buffer, length, ppos); setup_per_zone_lowmem_reserve(); @@ -8082,7 +8072,7 @@ static void __zone_pcp_update(struct zone *zone) * pagelist can have before it gets flushed back to buddy allocator. */ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int old_percpu_pagelist_fraction; diff --git a/mm/util.c b/mm/util.c index 988d11e6c17c..8defc8ec141f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -717,9 +717,8 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */ unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */ -int overcommit_ratio_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; @@ -729,9 +728,8 @@ int overcommit_ratio_handler(struct ctl_table *table, int write, return ret; } -int overcommit_kbytes_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret; diff --git a/mm/vmstat.c b/mm/vmstat.c index 96d21a792b57..c03a8c914922 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -76,7 +76,7 @@ static void invalid_numa_statistics(void) static DEFINE_MUTEX(vm_numa_stat_lock); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *length, loff_t *ppos) + void *buffer, size_t *length, loff_t *ppos) { int ret, oldval; @@ -1751,7 +1751,7 @@ static void refresh_vm_stats(struct work_struct *work) } int vmstat_refresh(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { long val; int err; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 59980ecfc962..04c3f9a82650 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -1027,7 +1027,7 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, #ifdef CONFIG_SYSCTL static int brnf_sysctl_call_tables(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 39d37d0ef575..3f2263e79e4b 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -3379,7 +3379,7 @@ EXPORT_SYMBOL(neigh_app_ns); static int unres_qlen_max = INT_MAX / SKB_TRUESIZE(ETH_FRAME_LEN); static int proc_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int size, ret; struct ctl_table tmp = *ctl; @@ -3443,8 +3443,8 @@ static void neigh_proc_update(struct ctl_table *ctl, int write) } static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct ctl_table tmp = *ctl; int ret; @@ -3457,8 +3457,8 @@ static int neigh_proc_dointvec_zero_intmax(struct ctl_table *ctl, int write, return ret; } -int neigh_proc_dointvec(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -3467,8 +3467,7 @@ int neigh_proc_dointvec(struct ctl_table *ctl, int write, } EXPORT_SYMBOL(neigh_proc_dointvec); -int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, +int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); @@ -3479,8 +3478,8 @@ int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_jiffies); static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_dointvec_userhz_jiffies(ctl, write, buffer, lenp, ppos); @@ -3489,8 +3488,7 @@ static int neigh_proc_dointvec_userhz_jiffies(struct ctl_table *ctl, int write, } int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = proc_dointvec_ms_jiffies(ctl, write, buffer, lenp, ppos); @@ -3500,8 +3498,8 @@ int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, EXPORT_SYMBOL(neigh_proc_dointvec_ms_jiffies); static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { int ret = proc_unres_qlen(ctl, write, buffer, lenp, ppos); @@ -3510,8 +3508,8 @@ static int neigh_proc_dointvec_unres_qlen(struct ctl_table *ctl, int write, } static int neigh_proc_base_reachable_time(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, + loff_t *ppos) { struct neigh_parms *p = ctl->extra2; int ret; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 9f9e00ba3ad7..0ddb13a6282b 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -45,7 +45,7 @@ EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); #ifdef CONFIG_RPS static int rps_sock_flow_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int orig_size, size; int ret, i; @@ -115,8 +115,7 @@ static int rps_sock_flow_sysctl(struct ctl_table *table, int write, static DEFINE_MUTEX(flow_limit_update_mutex); static int flow_limit_cpu_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct sd_flow_limit *cur; struct softnet_data *sd; @@ -180,10 +179,7 @@ write_unlock: } if (len < *lenp) kbuf[len++] = '\n'; - if (copy_to_user(buffer, kbuf, len)) { - ret = -EFAULT; - goto done; - } + memcpy(buffer, kbuf, len); *lenp = len; *ppos += len; } @@ -194,8 +190,7 @@ done: } static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { unsigned int old, *ptr; int ret; @@ -217,7 +212,7 @@ static int flow_limit_table_len_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_NET_SCHED static int set_default_qdisc(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char id[IFNAMSIZ]; struct ctl_table tbl = { @@ -236,7 +231,7 @@ static int set_default_qdisc(struct ctl_table *table, int write, #endif static int proc_do_dev_weight(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -251,7 +246,7 @@ static int proc_do_dev_weight(struct ctl_table *table, int write, } static int proc_do_rss_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table fake_table; char buf[NETDEV_RSS_KEY_LEN * 3]; @@ -264,7 +259,7 @@ static int proc_do_rss_key(struct ctl_table *table, int write, #ifdef CONFIG_BPF_JIT static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret, jit_enable = *(int *)table->data; @@ -291,8 +286,7 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, # ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -303,8 +297,7 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, static int proc_dolongvec_minmax_bpf_restricted(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index cca7ae712995..65abcf1b3210 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -160,8 +160,8 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU static int min_priority[1]; static int max_priority[] = { 127 }; /* From DECnet spec */ -static int dn_forwarding_proc(struct ctl_table *, int, - void __user *, size_t *, loff_t *); +static int dn_forwarding_proc(struct ctl_table *, int, void *, size_t *, + loff_t *); static struct dn_dev_sysctl_table { struct ctl_table_header *sysctl_header; struct ctl_table dn_dev_vars[5]; @@ -245,8 +245,7 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) } static int dn_forwarding_proc(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { #ifdef CONFIG_DECNET_ROUTER struct net_device *dev = table->extra1; diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 55bf64a22b59..deae519bdeec 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -134,8 +134,7 @@ static int parse_addr(__le16 *addr, char *str) } static int dn_node_address_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char addr[DN_ASCBUF_LEN]; size_t len; @@ -148,10 +147,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, if (write) { len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1); - - if (copy_from_user(addr, buffer, len)) - return -EFAULT; - + memcpy(addr, buffer, len); addr[len] = 0; strip_it(addr); @@ -173,11 +169,9 @@ static int dn_node_address_handler(struct ctl_table *table, int write, len = strlen(addr); addr[len++] = '\n'; - if (len > *lenp) len = *lenp; - - if (copy_to_user(buffer, addr, len)) - return -EFAULT; - + if (len > *lenp) + len = *lenp; + memcpy(buffer, addr, len); *lenp = len; *ppos += len; @@ -185,8 +179,7 @@ static int dn_node_address_handler(struct ctl_table *table, int write, } static int dn_def_dev_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { size_t len; struct net_device *dev; @@ -201,9 +194,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (*lenp > 16) return -E2BIG; - if (copy_from_user(devname, buffer, *lenp)) - return -EFAULT; - + memcpy(devname, buffer, *lenp); devname[*lenp] = 0; strip_it(devname); @@ -238,9 +229,7 @@ static int dn_def_dev_handler(struct ctl_table *table, int write, if (len > *lenp) len = *lenp; - if (copy_to_user(buffer, devname, len)) - return -EFAULT; - + memcpy(buffer, devname, len); *lenp = len; *ppos += len; diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 30fa42f5997d..a118978d222c 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2361,8 +2361,7 @@ static int devinet_conf_ifindex(struct net *net, struct ipv4_devconf *cnf) } static int devinet_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int old_value = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -2414,8 +2413,7 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, } static int devinet_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -2458,8 +2456,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, } static int ipv4_doint_and_flush(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 788c69d9bfe0..041f4dcac440 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3336,8 +3336,7 @@ static int ip_rt_gc_elasticity __read_mostly = 8; static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = (struct net *)__ctl->extra1; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 81b267e990a1..868e317cc324 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -71,8 +71,7 @@ static void set_local_port_range(struct net *net, int range[2]) /* Validate changes from /proc interface. */ static int ipv4_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.ip_local_ports.range); @@ -107,7 +106,7 @@ static int ipv4_local_port_range(struct ctl_table *table, int write, /* Validate changes from /proc interface. */ static int ipv4_privileged_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_ip_prot_sock); @@ -168,8 +167,7 @@ static void set_ping_group_range(struct ctl_table *table, kgid_t low, kgid_t hig /* Validate changes from /proc interface. */ static int ipv4_ping_group_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct user_namespace *user_ns = current_user_ns(); int ret; @@ -204,8 +202,7 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, } static int ipv4_fwd_update_priority(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; @@ -221,7 +218,7 @@ static int ipv4_fwd_update_priority(struct ctl_table *table, int write, } static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(ctl->data, struct net, ipv4.tcp_congestion_control); @@ -241,9 +238,8 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, } static int proc_tcp_available_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX, }; int ret; @@ -258,9 +254,8 @@ static int proc_tcp_available_congestion_control(struct ctl_table *ctl, } static int proc_allowed_congestion_control(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_CA_BUF_MAX }; int ret; @@ -296,8 +291,7 @@ static int sscanf_key(char *buf, __le32 *key) } static int proc_tcp_fastopen_key(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, ipv4.sysctl_tcp_fastopen); @@ -399,7 +393,7 @@ static void proc_configure_early_demux(int enabled, int protocol) } static int proc_tcp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -415,7 +409,7 @@ static int proc_tcp_early_demux(struct ctl_table *table, int write, } static int proc_udp_early_demux(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -431,8 +425,7 @@ static int proc_udp_early_demux(struct ctl_table *table, int write, } static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, @@ -447,8 +440,7 @@ static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, } static int proc_tcp_available_ulp(struct ctl_table *ctl, - int write, - void __user *buffer, size_t *lenp, + int write, void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table tbl = { .maxlen = TCP_ULP_BUF_MAX, }; @@ -466,7 +458,7 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, #ifdef CONFIG_IP_ROUTE_MULTIPATH static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = container_of(table->data, struct net, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 24e319dfb510..9d0e89bccb90 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6108,9 +6108,8 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL -static -int addrconf_sysctl_forward(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_forward(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6134,9 +6133,8 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_mtu(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct inet6_dev *idev = ctl->extra1; int min_mtu = IPV6_MIN_MTU; @@ -6206,9 +6204,8 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) return 0; } -static -int addrconf_sysctl_disable(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_disable(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; @@ -6232,9 +6229,8 @@ int addrconf_sysctl_disable(struct ctl_table *ctl, int write, return ret; } -static -int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int ret; @@ -6275,7 +6271,7 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, } static int addrconf_sysctl_addr_gen_mode(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int ret = 0; @@ -6337,7 +6333,7 @@ out: } static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, + void *buffer, size_t *lenp, loff_t *ppos) { int err; @@ -6404,8 +6400,7 @@ out: static int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, - int write, - void __user *buffer, + int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -6505,10 +6500,8 @@ int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) return 0; } -static -int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) +static int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 1ecd4e9b0bdf..58f1255295d3 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1835,7 +1835,8 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, } } -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) +int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 310cbddaa533..acdb31e38412 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6088,9 +6088,8 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) #ifdef CONFIG_SYSCTL -static -int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write, + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int delay; diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 63b657aa8d29..fac2135aa47b 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -26,8 +26,7 @@ static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net; int ret; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 4701edffb1f7..a42e4ed5ab0e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1362,8 +1362,7 @@ done: (&((struct mpls_dev *)0)->field) static int mpls_conf_proc(struct ctl_table *ctl, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int oval = *(int *)ctl->data; int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); @@ -2594,7 +2593,7 @@ nolabels: } static int mpls_platform_labels(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = table->data; int platform_labels = net->mpls.platform_labels; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 8d14a1acbc37..412656c34f20 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1736,7 +1736,7 @@ static int three = 3; static int proc_do_defense_mode(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct netns_ipvs *ipvs = table->extra2; int *valp = table->data; @@ -1763,7 +1763,7 @@ proc_do_defense_mode(struct ctl_table *table, int write, static int proc_do_sync_threshold(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val[2]; @@ -1788,7 +1788,7 @@ proc_do_sync_threshold(struct ctl_table *table, int write, static int proc_do_sync_ports(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val = *valp; diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 9b57330c81f8..31b027b12ff3 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -517,7 +517,7 @@ static unsigned int nf_conntrack_htable_size_user __read_mostly; static int nf_conntrack_hash_sysctl(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index bb25d4c794c7..6cb9f9474b05 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -414,7 +414,7 @@ static struct ctl_table nf_log_sysctl_ftable[] = { }; static int nf_log_proc_dostring(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { const struct nf_logger *logger; char buf[NFLOGGER_NAME_LEN]; diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c index 251e750fd9aa..0d0bf41381c2 100644 --- a/net/phonet/sysctl.c +++ b/net/phonet/sysctl.c @@ -49,8 +49,7 @@ void phonet_get_local_port_range(int *min, int *max) } static int proc_local_port_range(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; int range[2] = {local_port_range[0], local_port_range[1]}; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 66121bc6f34e..46782fac4c16 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -62,8 +62,7 @@ static atomic_t rds_tcp_unloading = ATOMIC_INIT(0); static struct kmem_cache *rds_tcp_conn_slab; static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos); + void *buffer, size_t *lenp, loff_t *fpos); static int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF; static int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF; @@ -676,8 +675,7 @@ static void rds_tcp_sysctl_reset(struct net *net) } static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *fpos) + void *buffer, size_t *lenp, loff_t *fpos) { struct net *net = current->nsproxy->net_ns; int err; diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 4740aa70e652..c16c80963e55 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -43,20 +43,15 @@ static unsigned long max_autoclose_max = ? UINT_MAX : MAX_SCHEDULE_TIMEOUT / HZ; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); -static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); +static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void *buffer, + size_t *lenp, loff_t *ppos); static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos); + void *buffer, size_t *lenp, loff_t *ppos); static struct ctl_table sctp_table[] = { { @@ -343,8 +338,7 @@ static struct ctl_table sctp_net_table[] = { }; static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; @@ -389,8 +383,7 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -418,8 +411,7 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int write, } static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; unsigned int min = *(unsigned int *) ctl->extra1; @@ -447,8 +439,7 @@ static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, } static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (write) pr_warn_once("Changing rto_alpha or rto_beta may lead to " @@ -458,8 +449,7 @@ static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write, } static int proc_sctp_do_auth(struct ctl_table *ctl, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; struct ctl_table tbl; diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index d75f17b56f0e..999eee1ed61c 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -60,7 +60,7 @@ rpc_unregister_sysctl(void) } static int proc_do_xprt(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; size_t len; @@ -70,15 +70,15 @@ static int proc_do_xprt(struct ctl_table *table, int write, return 0; } len = svc_print_xprts(tmpbuf, sizeof(tmpbuf)); - return simple_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); + return memory_read_from_buffer(buffer, *lenp, ppos, tmpbuf, len); } static int -proc_dodebug(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +proc_dodebug(struct ctl_table *table, int write, void *buffer, size_t *lenp, + loff_t *ppos) { - char tmpbuf[20], c, *s = NULL; - char __user *p; + char tmpbuf[20], *s = NULL; + char *p; unsigned int value; size_t left, len; @@ -90,18 +90,17 @@ proc_dodebug(struct ctl_table *table, int write, left = *lenp; if (write) { - if (!access_ok(buffer, left)) - return -EFAULT; p = buffer; - while (left && __get_user(c, p) >= 0 && isspace(c)) - left--, p++; + while (left && isspace(*p)) { + left--; + p++; + } if (!left) goto done; if (left > sizeof(tmpbuf) - 1) return -EINVAL; - if (copy_from_user(tmpbuf, p, left)) - return -EFAULT; + memcpy(tmpbuf, p, left); tmpbuf[left] = '\0'; value = simple_strtol(tmpbuf, &s, 0); @@ -121,11 +120,9 @@ proc_dodebug(struct ctl_table *table, int write, len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data); if (len > left) len = left; - if (copy_to_user(buffer, tmpbuf, len)) - return -EFAULT; + memcpy(buffer, tmpbuf, len); if ((left -= len) > 0) { - if (put_user('\n', (char __user *)buffer + len)) - return -EFAULT; + *((char *)buffer + len) = '\n'; left--; } } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 97bca509a391..526da5d4710b 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -80,8 +80,7 @@ atomic_t rdma_stat_sq_prod; * current value. */ static int read_reset_stat(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, - loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { atomic_t *stat = (atomic_t *)table->data; @@ -103,8 +102,8 @@ static int read_reset_stat(struct ctl_table *table, int write, len -= *ppos; if (len > *lenp) len = *lenp; - if (len && copy_to_user(buffer, str_buf, len)) - return -EFAULT; + if (len) + memcpy(buffer, str_buf, len); *lenp = len; *ppos += len; } diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index b621ad74f54a..27e371b44dad 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -1696,7 +1696,7 @@ static int __init alloc_buffers(void) #ifdef CONFIG_SYSCTL static int apparmor_dointvec(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { if (!policy_admin_capable(NULL)) return -EPERM; diff --git a/security/min_addr.c b/security/min_addr.c index 94d2b0cf0e7b..88c9a6a21f47 100644 --- a/security/min_addr.c +++ b/security/min_addr.c @@ -30,7 +30,7 @@ static void update_mmap_min_addr(void) * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly */ int mmap_min_addr_handler(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; diff --git a/security/yama/yama_lsm.c b/security/yama/yama_lsm.c index 94dc346370b1..536c99646f6a 100644 --- a/security/yama/yama_lsm.c +++ b/security/yama/yama_lsm.c @@ -430,7 +430,7 @@ static struct security_hook_list yama_hooks[] __lsm_ro_after_init = { #ifdef CONFIG_SYSCTL static int yama_dointvec_minmax(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table table_copy; -- cgit v1.2.3 From 7d3118016787b5c05da94b3bcdb96c9d6ff82c44 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 25 Apr 2020 12:28:14 +0100 Subject: net: rtnetlink: remove redundant assignment to variable err The variable err is being initializeed with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d6f4f4a9e8ba..2269199c5891 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3997,8 +3997,8 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, struct ndmsg *ndm; struct nlattr *tb[NDA_MAX+1]; struct net_device *dev; - int err = -EINVAL; __u8 *addr; + int err; u16 vid; if (!netlink_capable(skb, CAP_NET_ADMIN)) -- cgit v1.2.3 From 8c498935585680284e5f3e5294d3c901b7c89d57 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 28 Apr 2020 00:01:41 +0200 Subject: docs: networking: convert gen_stats.txt to ReST - add SPDX header; - mark code blocks and literals as such; - mark tables as such; - mark lists as such; - adjust identation, whitespaces and blank lines; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/gen_stats.rst | 129 +++++++++++++++++++++++++++++++++ Documentation/networking/gen_stats.txt | 119 ------------------------------ Documentation/networking/index.rst | 1 + net/core/gen_stats.c | 2 +- 4 files changed, 131 insertions(+), 120 deletions(-) create mode 100644 Documentation/networking/gen_stats.rst delete mode 100644 Documentation/networking/gen_stats.txt (limited to 'net/core') diff --git a/Documentation/networking/gen_stats.rst b/Documentation/networking/gen_stats.rst new file mode 100644 index 000000000000..595a83b9a61b --- /dev/null +++ b/Documentation/networking/gen_stats.rst @@ -0,0 +1,129 @@ +.. SPDX-License-Identifier: GPL-2.0 + +=============================================== +Generic networking statistics for netlink users +=============================================== + +Statistic counters are grouped into structs: + +==================== ===================== ===================== +Struct TLV type Description +==================== ===================== ===================== +gnet_stats_basic TCA_STATS_BASIC Basic statistics +gnet_stats_rate_est TCA_STATS_RATE_EST Rate estimator +gnet_stats_queue TCA_STATS_QUEUE Queue statistics +none TCA_STATS_APP Application specific +==================== ===================== ===================== + + +Collecting: +----------- + +Declare the statistic structs you need:: + + struct mystruct { + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + ... + }; + +Update statistics, in dequeue() methods only, (while owning qdisc->running):: + + mystruct->tstats.packet++; + mystruct->qstats.backlog += skb->pkt_len; + + +Export to userspace (Dump): +--------------------------- + +:: + + my_dumping_routine(struct sk_buff *skb, ...) + { + struct gnet_dump dump; + + if (gnet_stats_start_copy(skb, TCA_STATS2, &mystruct->lock, &dump, + TCA_PAD) < 0) + goto rtattr_failure; + + if (gnet_stats_copy_basic(&dump, &mystruct->bstats) < 0 || + gnet_stats_copy_queue(&dump, &mystruct->qstats) < 0 || + gnet_stats_copy_app(&dump, &xstats, sizeof(xstats)) < 0) + goto rtattr_failure; + + if (gnet_stats_finish_copy(&dump) < 0) + goto rtattr_failure; + ... + } + +TCA_STATS/TCA_XSTATS backward compatibility: +-------------------------------------------- + +Prior users of struct tc_stats and xstats can maintain backward +compatibility by calling the compat wrappers to keep providing the +existing TLV types:: + + my_dumping_routine(struct sk_buff *skb, ...) + { + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, + TCA_XSTATS, &mystruct->lock, &dump, + TCA_PAD) < 0) + goto rtattr_failure; + ... + } + +A struct tc_stats will be filled out during gnet_stats_copy_* calls +and appended to the skb. TCA_XSTATS is provided if gnet_stats_copy_app +was called. + + +Locking: +-------- + +Locks are taken before writing and released once all statistics have +been written. Locks are always released in case of an error. You +are responsible for making sure that the lock is initialized. + + +Rate Estimator: +--------------- + +0) Prepare an estimator attribute. Most likely this would be in user + space. The value of this TLV should contain a tc_estimator structure. + As usual, such a TLV needs to be 32 bit aligned and therefore the + length needs to be appropriately set, etc. The estimator interval + and ewma log need to be converted to the appropriate values. + tc_estimator.c::tc_setup_estimator() is advisable to be used as the + conversion routine. It does a few clever things. It takes a time + interval in microsecs, a time constant also in microsecs and a struct + tc_estimator to be populated. The returned tc_estimator can be + transported to the kernel. Transfer such a structure in a TLV of type + TCA_RATE to your code in the kernel. + +In the kernel when setting up: + +1) make sure you have basic stats and rate stats setup first. +2) make sure you have initialized stats lock that is used to setup such + stats. +3) Now initialize a new estimator:: + + int ret = gen_new_estimator(my_basicstats,my_rate_est_stats, + mystats_lock, attr_with_tcestimator_struct); + + if ret == 0 + success + else + failed + +From now on, every time you dump my_rate_est_stats it will contain +up-to-date info. + +Once you are done, call gen_kill_estimator(my_basicstats, +my_rate_est_stats) Make sure that my_basicstats and my_rate_est_stats +are still valid (i.e still exist) at the time of making this call. + + +Authors: +-------- +- Thomas Graf +- Jamal Hadi Salim diff --git a/Documentation/networking/gen_stats.txt b/Documentation/networking/gen_stats.txt deleted file mode 100644 index 179b18ce45ff..000000000000 --- a/Documentation/networking/gen_stats.txt +++ /dev/null @@ -1,119 +0,0 @@ -Generic networking statistics for netlink users -====================================================================== - -Statistic counters are grouped into structs: - -Struct TLV type Description ----------------------------------------------------------------------- -gnet_stats_basic TCA_STATS_BASIC Basic statistics -gnet_stats_rate_est TCA_STATS_RATE_EST Rate estimator -gnet_stats_queue TCA_STATS_QUEUE Queue statistics -none TCA_STATS_APP Application specific - - -Collecting: ------------ - -Declare the statistic structs you need: -struct mystruct { - struct gnet_stats_basic bstats; - struct gnet_stats_queue qstats; - ... -}; - -Update statistics, in dequeue() methods only, (while owning qdisc->running) -mystruct->tstats.packet++; -mystruct->qstats.backlog += skb->pkt_len; - - -Export to userspace (Dump): ---------------------------- - -my_dumping_routine(struct sk_buff *skb, ...) -{ - struct gnet_dump dump; - - if (gnet_stats_start_copy(skb, TCA_STATS2, &mystruct->lock, &dump, - TCA_PAD) < 0) - goto rtattr_failure; - - if (gnet_stats_copy_basic(&dump, &mystruct->bstats) < 0 || - gnet_stats_copy_queue(&dump, &mystruct->qstats) < 0 || - gnet_stats_copy_app(&dump, &xstats, sizeof(xstats)) < 0) - goto rtattr_failure; - - if (gnet_stats_finish_copy(&dump) < 0) - goto rtattr_failure; - ... -} - -TCA_STATS/TCA_XSTATS backward compatibility: --------------------------------------------- - -Prior users of struct tc_stats and xstats can maintain backward -compatibility by calling the compat wrappers to keep providing the -existing TLV types. - -my_dumping_routine(struct sk_buff *skb, ...) -{ - if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, - TCA_XSTATS, &mystruct->lock, &dump, - TCA_PAD) < 0) - goto rtattr_failure; - ... -} - -A struct tc_stats will be filled out during gnet_stats_copy_* calls -and appended to the skb. TCA_XSTATS is provided if gnet_stats_copy_app -was called. - - -Locking: --------- - -Locks are taken before writing and released once all statistics have -been written. Locks are always released in case of an error. You -are responsible for making sure that the lock is initialized. - - -Rate Estimator: --------------- - -0) Prepare an estimator attribute. Most likely this would be in user - space. The value of this TLV should contain a tc_estimator structure. - As usual, such a TLV needs to be 32 bit aligned and therefore the - length needs to be appropriately set, etc. The estimator interval - and ewma log need to be converted to the appropriate values. - tc_estimator.c::tc_setup_estimator() is advisable to be used as the - conversion routine. It does a few clever things. It takes a time - interval in microsecs, a time constant also in microsecs and a struct - tc_estimator to be populated. The returned tc_estimator can be - transported to the kernel. Transfer such a structure in a TLV of type - TCA_RATE to your code in the kernel. - -In the kernel when setting up: -1) make sure you have basic stats and rate stats setup first. -2) make sure you have initialized stats lock that is used to setup such - stats. -3) Now initialize a new estimator: - - int ret = gen_new_estimator(my_basicstats,my_rate_est_stats, - mystats_lock, attr_with_tcestimator_struct); - - if ret == 0 - success - else - failed - -From now on, every time you dump my_rate_est_stats it will contain -up-to-date info. - -Once you are done, call gen_kill_estimator(my_basicstats, -my_rate_est_stats) Make sure that my_basicstats and my_rate_est_stats -are still valid (i.e still exist) at the time of making this call. - - -Authors: --------- -Thomas Graf -Jamal Hadi Salim diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index 42e556509e22..33afbb67f3fa 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -61,6 +61,7 @@ Contents: framerelay generic-hdlc generic_netlink + gen_stats .. only:: subproject and html diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 1d653fbfcf52..e491b083b348 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -6,7 +6,7 @@ * Jamal Hadi Salim * Alexey Kuznetsov, * - * See Documentation/networking/gen_stats.txt + * See Documentation/networking/gen_stats.rst */ #include -- cgit v1.2.3 From a54776f2c4939bdee084c9ecd00a4a5a25b7c429 Mon Sep 17 00:00:00 2001 From: Yunjian Wang Date: Wed, 29 Apr 2020 18:20:58 +0800 Subject: netpoll: Fix use correct return type for ndo_start_xmit() The method ndo_start_xmit() returns a value of type netdev_tx_t. Fix the ndo function to use the correct type. Signed-off-by: Yunjian Wang Signed-off-by: David S. Miller --- net/core/netpoll.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 849380a622ef..15b366a1a958 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -69,10 +69,11 @@ module_param(carrier_timeout, uint, 0644); #define np_notice(np, fmt, ...) \ pr_notice("%s: " fmt, np->name, ##__VA_ARGS__) -static int netpoll_start_xmit(struct sk_buff *skb, struct net_device *dev, - struct netdev_queue *txq) +static netdev_tx_t netpoll_start_xmit(struct sk_buff *skb, + struct net_device *dev, + struct netdev_queue *txq) { - int status = NETDEV_TX_OK; + netdev_tx_t status = NETDEV_TX_OK; netdev_features_t features; features = netif_skb_features(skb); @@ -307,7 +308,7 @@ static int netpoll_owner_active(struct net_device *dev) void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, struct net_device *dev) { - int status = NETDEV_TX_BUSY; + netdev_tx_t status = NETDEV_TX_BUSY; unsigned long tries; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; -- cgit v1.2.3 From 64d85290d79c0677edb5a8ee2295b36c022fa5df Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 29 Apr 2020 20:11:52 +0200 Subject: bpf: Allow bpf_map_lookup_elem for SOCKMAP and SOCKHASH White-list map lookup for SOCKMAP/SOCKHASH from BPF. Lookup returns a pointer to a full socket and acquires a reference if necessary. To support it we need to extend the verifier to know that: (1) register storing the lookup result holds a pointer to socket, if lookup was done on SOCKMAP/SOCKHASH, and that (2) map lookup on SOCKMAP/SOCKHASH is a reference acquiring operation, which needs a corresponding reference release with bpf_sk_release. On sock_map side, lookup handlers exposed via bpf_map_ops now bump sk_refcnt if socket is reference counted. In turn, bpf_sk_select_reuseport, the only in-kernel user of SOCKMAP/SOCKHASH ops->map_lookup_elem, was updated to release the reference. Sockets fetched from a map can be used in the same way as ones returned by BPF socket lookup helpers, such as bpf_sk_lookup_tcp. In particular, they can be used with bpf_sk_assign to direct packets toward a socket on TC ingress path. Suggested-by: Lorenz Bauer Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200429181154.479310-2-jakub@cloudflare.com --- kernel/bpf/verifier.c | 45 +++++++++++++++++++++++++++++++++++---------- net/core/filter.c | 4 ++++ net/core/sock_map.c | 18 ++++++++++++++++-- 3 files changed, 55 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2b337e32aa94..70ad009577f8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -429,11 +429,30 @@ static bool is_release_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_release; } -static bool is_acquire_function(enum bpf_func_id func_id) +static bool may_be_acquire_function(enum bpf_func_id func_id) { return func_id == BPF_FUNC_sk_lookup_tcp || func_id == BPF_FUNC_sk_lookup_udp || - func_id == BPF_FUNC_skc_lookup_tcp; + func_id == BPF_FUNC_skc_lookup_tcp || + func_id == BPF_FUNC_map_lookup_elem; +} + +static bool is_acquire_function(enum bpf_func_id func_id, + const struct bpf_map *map) +{ + enum bpf_map_type map_type = map ? map->map_type : BPF_MAP_TYPE_UNSPEC; + + if (func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp || + func_id == BPF_FUNC_skc_lookup_tcp) + return true; + + if (func_id == BPF_FUNC_map_lookup_elem && + (map_type == BPF_MAP_TYPE_SOCKMAP || + map_type == BPF_MAP_TYPE_SOCKHASH)) + return true; + + return false; } static bool is_ptr_cast_function(enum bpf_func_id func_id) @@ -3934,7 +3953,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_map_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_map && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_SOCKHASH: @@ -3942,7 +3962,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_sock_hash_update && func_id != BPF_FUNC_map_delete_elem && func_id != BPF_FUNC_msg_redirect_hash && - func_id != BPF_FUNC_sk_select_reuseport) + func_id != BPF_FUNC_sk_select_reuseport && + func_id != BPF_FUNC_map_lookup_elem) goto error; break; case BPF_MAP_TYPE_REUSEPORT_SOCKARRAY: @@ -4112,7 +4133,7 @@ static bool check_refcount_ok(const struct bpf_func_proto *fn, int func_id) /* A reference acquiring function cannot acquire * another refcounted ptr. */ - if (is_acquire_function(func_id) && count) + if (may_be_acquire_function(func_id) && count) return false; /* We only support one arg being unreferenced at the moment, @@ -4623,7 +4644,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (is_ptr_cast_function(func_id)) { /* For release_reference() */ regs[BPF_REG_0].ref_obj_id = meta.ref_obj_id; - } else if (is_acquire_function(func_id)) { + } else if (is_acquire_function(func_id, meta.map_ptr)) { int id = acquire_reference_state(env, insn_idx); if (id < 0) @@ -6532,12 +6553,16 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, if (is_null) { reg->type = SCALAR_VALUE; } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (reg->map_ptr->inner_map_meta) { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = reg->map_ptr->inner_map_meta; - } else if (reg->map_ptr->map_type == - BPF_MAP_TYPE_XSKMAP) { + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; } else { reg->type = PTR_TO_MAP_VALUE; } diff --git a/net/core/filter.c b/net/core/filter.c index da3b7a72c37c..70b32723e6be 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -8712,6 +8712,10 @@ BPF_CALL_4(sk_select_reuseport, struct sk_reuseport_kern *, reuse_kern, reuse = rcu_dereference(selected_sk->sk_reuseport_cb); if (!reuse) { + /* Lookup in sock_map can return TCP ESTABLISHED sockets. */ + if (sk_is_refcounted(selected_sk)) + sock_put(selected_sk); + /* reuseport_array has only sk with non NULL sk_reuseport_cb. * The only (!reuse) case here is - the sk has already been * unhashed (e.g. by close()), so treat it as -ENOENT. diff --git a/net/core/sock_map.c b/net/core/sock_map.c index b08dfae10f88..00a26cf2cfe9 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -343,7 +343,14 @@ static struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) static void *sock_map_lookup(struct bpf_map *map, void *key) { - return __sock_map_lookup_elem(map, *(u32 *)key); + struct sock *sk; + + sk = __sock_map_lookup_elem(map, *(u32 *)key); + if (!sk || !sk_fullsock(sk)) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; } static void *sock_map_lookup_sys(struct bpf_map *map, void *key) @@ -1051,7 +1058,14 @@ static void *sock_hash_lookup_sys(struct bpf_map *map, void *key) static void *sock_hash_lookup(struct bpf_map *map, void *key) { - return __sock_hash_lookup_elem(map, key); + struct sock *sk; + + sk = __sock_hash_lookup_elem(map, key); + if (!sk || !sk_fullsock(sk)) + return NULL; + if (sk_is_refcounted(sk) && !refcount_inc_not_zero(&sk->sk_refcnt)) + return NULL; + return sk; } static void sock_hash_release_progs(struct bpf_map *map) -- cgit v1.2.3 From c1e4535f24bcfeef55a7ed409a5f50548e284426 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 30 Apr 2020 18:04:13 +0200 Subject: docs: networking: convert pktgen.txt to ReST - add SPDX header; - adjust title markup; - use bold markups on a few places; - mark code blocks and literals as such; - adjust identation, whitespaces and blank lines where needed; - add to networking/index.rst. Signed-off-by: Mauro Carvalho Chehab Signed-off-by: David S. Miller --- Documentation/networking/index.rst | 1 + Documentation/networking/pktgen.rst | 412 ++++++++++++++++++++++++++++++++++++ Documentation/networking/pktgen.txt | 400 ---------------------------------- net/Kconfig | 2 +- net/core/pktgen.c | 2 +- samples/pktgen/README.rst | 2 +- 6 files changed, 416 insertions(+), 403 deletions(-) create mode 100644 Documentation/networking/pktgen.rst delete mode 100644 Documentation/networking/pktgen.txt (limited to 'net/core') diff --git a/Documentation/networking/index.rst b/Documentation/networking/index.rst index e460026331c6..696181a96e3c 100644 --- a/Documentation/networking/index.rst +++ b/Documentation/networking/index.rst @@ -91,6 +91,7 @@ Contents: operstates packet_mmap phonet + pktgen .. only:: subproject and html diff --git a/Documentation/networking/pktgen.rst b/Documentation/networking/pktgen.rst new file mode 100644 index 000000000000..7afa1c9f1183 --- /dev/null +++ b/Documentation/networking/pktgen.rst @@ -0,0 +1,412 @@ +.. SPDX-License-Identifier: GPL-2.0 + +==================================== +HOWTO for the linux packet generator +==================================== + +Enable CONFIG_NET_PKTGEN to compile and build pktgen either in-kernel +or as a module. A module is preferred; modprobe pktgen if needed. Once +running, pktgen creates a thread for each CPU with affinity to that CPU. +Monitoring and controlling is done via /proc. It is easiest to select a +suitable sample script and configure that. + +On a dual CPU:: + + ps aux | grep pkt + root 129 0.3 0.0 0 0 ? SW 2003 523:20 [kpktgend_0] + root 130 0.3 0.0 0 0 ? SW 2003 509:50 [kpktgend_1] + + +For monitoring and control pktgen creates:: + + /proc/net/pktgen/pgctrl + /proc/net/pktgen/kpktgend_X + /proc/net/pktgen/ethX + + +Tuning NIC for max performance +============================== + +The default NIC settings are (likely) not tuned for pktgen's artificial +overload type of benchmarking, as this could hurt the normal use-case. + +Specifically increasing the TX ring buffer in the NIC:: + + # ethtool -G ethX tx 1024 + +A larger TX ring can improve pktgen's performance, while it can hurt +in the general case, 1) because the TX ring buffer might get larger +than the CPU's L1/L2 cache, 2) because it allows more queueing in the +NIC HW layer (which is bad for bufferbloat). + +One should hesitate to conclude that packets/descriptors in the HW +TX ring cause delay. Drivers usually delay cleaning up the +ring-buffers for various performance reasons, and packets stalling +the TX ring might just be waiting for cleanup. + +This cleanup issue is specifically the case for the driver ixgbe +(Intel 82599 chip). This driver (ixgbe) combines TX+RX ring cleanups, +and the cleanup interval is affected by the ethtool --coalesce setting +of parameter "rx-usecs". + +For ixgbe use e.g. "30" resulting in approx 33K interrupts/sec (1/30*10^6):: + + # ethtool -C ethX rx-usecs 30 + + +Kernel threads +============== +Pktgen creates a thread for each CPU with affinity to that CPU. +Which is controlled through procfile /proc/net/pktgen/kpktgend_X. + +Example: /proc/net/pktgen/kpktgend_0:: + + Running: + Stopped: eth4@0 + Result: OK: add_device=eth4@0 + +Most important are the devices assigned to the thread. + +The two basic thread commands are: + + * add_device DEVICE@NAME -- adds a single device + * rem_device_all -- remove all associated devices + +When adding a device to a thread, a corresponding procfile is created +which is used for configuring this device. Thus, device names need to +be unique. + +To support adding the same device to multiple threads, which is useful +with multi queue NICs, the device naming scheme is extended with "@": +device@something + +The part after "@" can be anything, but it is custom to use the thread +number. + +Viewing devices +=============== + +The Params section holds configured information. The Current section +holds running statistics. The Result is printed after a run or after +interruption. Example:: + + /proc/net/pktgen/eth4@0 + + Params: count 100000 min_pkt_size: 60 max_pkt_size: 60 + frags: 0 delay: 0 clone_skb: 64 ifname: eth4@0 + flows: 0 flowlen: 0 + queue_map_min: 0 queue_map_max: 0 + dst_min: 192.168.81.2 dst_max: + src_min: src_max: + src_mac: 90:e2:ba:0a:56:b4 dst_mac: 00:1b:21:3c:9d:f8 + udp_src_min: 9 udp_src_max: 109 udp_dst_min: 9 udp_dst_max: 9 + src_mac_count: 0 dst_mac_count: 0 + Flags: UDPSRC_RND NO_TIMESTAMP QUEUE_MAP_CPU + Current: + pkts-sofar: 100000 errors: 0 + started: 623913381008us stopped: 623913396439us idle: 25us + seq_num: 100001 cur_dst_mac_offset: 0 cur_src_mac_offset: 0 + cur_saddr: 192.168.8.3 cur_daddr: 192.168.81.2 + cur_udp_dst: 9 cur_udp_src: 42 + cur_queue_map: 0 + flows: 0 + Result: OK: 15430(c15405+d25) usec, 100000 (60byte,0frags) + 6480562pps 3110Mb/sec (3110669760bps) errors: 0 + + +Configuring devices +=================== +This is done via the /proc interface, and most easily done via pgset +as defined in the sample scripts. +You need to specify PGDEV environment variable to use functions from sample +scripts, i.e.:: + + export PGDEV=/proc/net/pktgen/eth4@0 + source samples/pktgen/functions.sh + +Examples:: + + pg_ctrl start starts injection. + pg_ctrl stop aborts injection. Also, ^C aborts generator. + + pgset "clone_skb 1" sets the number of copies of the same packet + pgset "clone_skb 0" use single SKB for all transmits + pgset "burst 8" uses xmit_more API to queue 8 copies of the same + packet and update HW tx queue tail pointer once. + "burst 1" is the default + pgset "pkt_size 9014" sets packet size to 9014 + pgset "frags 5" packet will consist of 5 fragments + pgset "count 200000" sets number of packets to send, set to zero + for continuous sends until explicitly stopped. + + pgset "delay 5000" adds delay to hard_start_xmit(). nanoseconds + + pgset "dst 10.0.0.1" sets IP destination address + (BEWARE! This generator is very aggressive!) + + pgset "dst_min 10.0.0.1" Same as dst + pgset "dst_max 10.0.0.254" Set the maximum destination IP. + pgset "src_min 10.0.0.1" Set the minimum (or only) source IP. + pgset "src_max 10.0.0.254" Set the maximum source IP. + pgset "dst6 fec0::1" IPV6 destination address + pgset "src6 fec0::2" IPV6 source address + pgset "dstmac 00:00:00:00:00:00" sets MAC destination address + pgset "srcmac 00:00:00:00:00:00" sets MAC source address + + pgset "queue_map_min 0" Sets the min value of tx queue interval + pgset "queue_map_max 7" Sets the max value of tx queue interval, for multiqueue devices + To select queue 1 of a given device, + use queue_map_min=1 and queue_map_max=1 + + pgset "src_mac_count 1" Sets the number of MACs we'll range through. + The 'minimum' MAC is what you set with srcmac. + + pgset "dst_mac_count 1" Sets the number of MACs we'll range through. + The 'minimum' MAC is what you set with dstmac. + + pgset "flag [name]" Set a flag to determine behaviour. Current flags + are: IPSRC_RND # IP source is random (between min/max) + IPDST_RND # IP destination is random + UDPSRC_RND, UDPDST_RND, + MACSRC_RND, MACDST_RND + TXSIZE_RND, IPV6, + MPLS_RND, VID_RND, SVID_RND + FLOW_SEQ, + QUEUE_MAP_RND # queue map random + QUEUE_MAP_CPU # queue map mirrors smp_processor_id() + UDPCSUM, + IPSEC # IPsec encapsulation (needs CONFIG_XFRM) + NODE_ALLOC # node specific memory allocation + NO_TIMESTAMP # disable timestamping + pgset 'flag ![name]' Clear a flag to determine behaviour. + Note that you might need to use single quote in + interactive mode, so that your shell wouldn't expand + the specified flag as a history command. + + pgset "spi [SPI_VALUE]" Set specific SA used to transform packet. + + pgset "udp_src_min 9" set UDP source port min, If < udp_src_max, then + cycle through the port range. + + pgset "udp_src_max 9" set UDP source port max. + pgset "udp_dst_min 9" set UDP destination port min, If < udp_dst_max, then + cycle through the port range. + pgset "udp_dst_max 9" set UDP destination port max. + + pgset "mpls 0001000a,0002000a,0000000a" set MPLS labels (in this example + outer label=16,middle label=32, + inner label=0 (IPv4 NULL)) Note that + there must be no spaces between the + arguments. Leading zeros are required. + Do not set the bottom of stack bit, + that's done automatically. If you do + set the bottom of stack bit, that + indicates that you want to randomly + generate that address and the flag + MPLS_RND will be turned on. You + can have any mix of random and fixed + labels in the label stack. + + pgset "mpls 0" turn off mpls (or any invalid argument works too!) + + pgset "vlan_id 77" set VLAN ID 0-4095 + pgset "vlan_p 3" set priority bit 0-7 (default 0) + pgset "vlan_cfi 0" set canonical format identifier 0-1 (default 0) + + pgset "svlan_id 22" set SVLAN ID 0-4095 + pgset "svlan_p 3" set priority bit 0-7 (default 0) + pgset "svlan_cfi 0" set canonical format identifier 0-1 (default 0) + + pgset "vlan_id 9999" > 4095 remove vlan and svlan tags + pgset "svlan 9999" > 4095 remove svlan tag + + + pgset "tos XX" set former IPv4 TOS field (e.g. "tos 28" for AF11 no ECN, default 00) + pgset "traffic_class XX" set former IPv6 TRAFFIC CLASS (e.g. "traffic_class B8" for EF no ECN, default 00) + + pgset "rate 300M" set rate to 300 Mb/s + pgset "ratep 1000000" set rate to 1Mpps + + pgset "xmit_mode netif_receive" RX inject into stack netif_receive_skb() + Works with "burst" but not with "clone_skb". + Default xmit_mode is "start_xmit". + +Sample scripts +============== + +A collection of tutorial scripts and helpers for pktgen is in the +samples/pktgen directory. The helper parameters.sh file support easy +and consistent parameter parsing across the sample scripts. + +Usage example and help:: + + ./pktgen_sample01_simple.sh -i eth4 -m 00:1B:21:3C:9D:F8 -d 192.168.8.2 + +Usage::: + + ./pktgen_sample01_simple.sh [-vx] -i ethX + + -i : ($DEV) output interface/device (required) + -s : ($PKT_SIZE) packet size + -d : ($DEST_IP) destination IP + -m : ($DST_MAC) destination MAC-addr + -t : ($THREADS) threads to start + -c : ($SKB_CLONE) SKB clones send before alloc new SKB + -b : ($BURST) HW level bursting of SKBs + -v : ($VERBOSE) verbose + -x : ($DEBUG) debug + +The global variables being set are also listed. E.g. the required +interface/device parameter "-i" sets variable $DEV. Copy the +pktgen_sampleXX scripts and modify them to fit your own needs. + +The old scripts:: + + pktgen.conf-1-2 # 1 CPU 2 dev + pktgen.conf-1-1-rdos # 1 CPU 1 dev w. route DoS + pktgen.conf-1-1-ip6 # 1 CPU 1 dev ipv6 + pktgen.conf-1-1-ip6-rdos # 1 CPU 1 dev ipv6 w. route DoS + pktgen.conf-1-1-flows # 1 CPU 1 dev multiple flows. + + +Interrupt affinity +=================== +Note that when adding devices to a specific CPU it is a good idea to +also assign /proc/irq/XX/smp_affinity so that the TX interrupts are bound +to the same CPU. This reduces cache bouncing when freeing skbs. + +Plus using the device flag QUEUE_MAP_CPU, which maps the SKBs TX queue +to the running threads CPU (directly from smp_processor_id()). + +Enable IPsec +============ +Default IPsec transformation with ESP encapsulation plus transport mode +can be enabled by simply setting:: + + pgset "flag IPSEC" + pgset "flows 1" + +To avoid breaking existing testbed scripts for using AH type and tunnel mode, +you can use "pgset spi SPI_VALUE" to specify which transformation mode +to employ. + + +Current commands and configuration options +========================================== + +**Pgcontrol commands**:: + + start + stop + reset + +**Thread commands**:: + + add_device + rem_device_all + + +**Device commands**:: + + count + clone_skb + burst + debug + + frags + delay + + src_mac_count + dst_mac_count + + pkt_size + min_pkt_size + max_pkt_size + + queue_map_min + queue_map_max + skb_priority + + tos (ipv4) + traffic_class (ipv6) + + mpls + + udp_src_min + udp_src_max + + udp_dst_min + udp_dst_max + + node + + flag + IPSRC_RND + IPDST_RND + UDPSRC_RND + UDPDST_RND + MACSRC_RND + MACDST_RND + TXSIZE_RND + IPV6 + MPLS_RND + VID_RND + SVID_RND + FLOW_SEQ + QUEUE_MAP_RND + QUEUE_MAP_CPU + UDPCSUM + IPSEC + NODE_ALLOC + NO_TIMESTAMP + + spi (ipsec) + + dst_min + dst_max + + src_min + src_max + + dst_mac + src_mac + + clear_counters + + src6 + dst6 + dst6_max + dst6_min + + flows + flowlen + + rate + ratep + + xmit_mode + + vlan_cfi + vlan_id + vlan_p + + svlan_cfi + svlan_id + svlan_p + + +References: + +- ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/ +- tp://robur.slu.se/pub/Linux/net-development/pktgen-testing/examples/ + +Paper from Linux-Kongress in Erlangen 2004. +- ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/pktgen_paper.pdf + +Thanks to: + +Grant Grundler for testing on IA-64 and parisc, Harald Welte, Lennert Buytenhek +Stephen Hemminger, Andi Kleen, Dave Miller and many others. + + +Good luck with the linux net-development. diff --git a/Documentation/networking/pktgen.txt b/Documentation/networking/pktgen.txt deleted file mode 100644 index d2fd78f85aa4..000000000000 --- a/Documentation/networking/pktgen.txt +++ /dev/null @@ -1,400 +0,0 @@ - - - HOWTO for the linux packet generator - ------------------------------------ - -Enable CONFIG_NET_PKTGEN to compile and build pktgen either in-kernel -or as a module. A module is preferred; modprobe pktgen if needed. Once -running, pktgen creates a thread for each CPU with affinity to that CPU. -Monitoring and controlling is done via /proc. It is easiest to select a -suitable sample script and configure that. - -On a dual CPU: - -ps aux | grep pkt -root 129 0.3 0.0 0 0 ? SW 2003 523:20 [kpktgend_0] -root 130 0.3 0.0 0 0 ? SW 2003 509:50 [kpktgend_1] - - -For monitoring and control pktgen creates: - /proc/net/pktgen/pgctrl - /proc/net/pktgen/kpktgend_X - /proc/net/pktgen/ethX - - -Tuning NIC for max performance -============================== - -The default NIC settings are (likely) not tuned for pktgen's artificial -overload type of benchmarking, as this could hurt the normal use-case. - -Specifically increasing the TX ring buffer in the NIC: - # ethtool -G ethX tx 1024 - -A larger TX ring can improve pktgen's performance, while it can hurt -in the general case, 1) because the TX ring buffer might get larger -than the CPU's L1/L2 cache, 2) because it allows more queueing in the -NIC HW layer (which is bad for bufferbloat). - -One should hesitate to conclude that packets/descriptors in the HW -TX ring cause delay. Drivers usually delay cleaning up the -ring-buffers for various performance reasons, and packets stalling -the TX ring might just be waiting for cleanup. - -This cleanup issue is specifically the case for the driver ixgbe -(Intel 82599 chip). This driver (ixgbe) combines TX+RX ring cleanups, -and the cleanup interval is affected by the ethtool --coalesce setting -of parameter "rx-usecs". - -For ixgbe use e.g. "30" resulting in approx 33K interrupts/sec (1/30*10^6): - # ethtool -C ethX rx-usecs 30 - - -Kernel threads -============== -Pktgen creates a thread for each CPU with affinity to that CPU. -Which is controlled through procfile /proc/net/pktgen/kpktgend_X. - -Example: /proc/net/pktgen/kpktgend_0 - - Running: - Stopped: eth4@0 - Result: OK: add_device=eth4@0 - -Most important are the devices assigned to the thread. - -The two basic thread commands are: - * add_device DEVICE@NAME -- adds a single device - * rem_device_all -- remove all associated devices - -When adding a device to a thread, a corresponding procfile is created -which is used for configuring this device. Thus, device names need to -be unique. - -To support adding the same device to multiple threads, which is useful -with multi queue NICs, the device naming scheme is extended with "@": - device@something - -The part after "@" can be anything, but it is custom to use the thread -number. - -Viewing devices -=============== - -The Params section holds configured information. The Current section -holds running statistics. The Result is printed after a run or after -interruption. Example: - -/proc/net/pktgen/eth4@0 - - Params: count 100000 min_pkt_size: 60 max_pkt_size: 60 - frags: 0 delay: 0 clone_skb: 64 ifname: eth4@0 - flows: 0 flowlen: 0 - queue_map_min: 0 queue_map_max: 0 - dst_min: 192.168.81.2 dst_max: - src_min: src_max: - src_mac: 90:e2:ba:0a:56:b4 dst_mac: 00:1b:21:3c:9d:f8 - udp_src_min: 9 udp_src_max: 109 udp_dst_min: 9 udp_dst_max: 9 - src_mac_count: 0 dst_mac_count: 0 - Flags: UDPSRC_RND NO_TIMESTAMP QUEUE_MAP_CPU - Current: - pkts-sofar: 100000 errors: 0 - started: 623913381008us stopped: 623913396439us idle: 25us - seq_num: 100001 cur_dst_mac_offset: 0 cur_src_mac_offset: 0 - cur_saddr: 192.168.8.3 cur_daddr: 192.168.81.2 - cur_udp_dst: 9 cur_udp_src: 42 - cur_queue_map: 0 - flows: 0 - Result: OK: 15430(c15405+d25) usec, 100000 (60byte,0frags) - 6480562pps 3110Mb/sec (3110669760bps) errors: 0 - - -Configuring devices -=================== -This is done via the /proc interface, and most easily done via pgset -as defined in the sample scripts. -You need to specify PGDEV environment variable to use functions from sample -scripts, i.e.: -export PGDEV=/proc/net/pktgen/eth4@0 -source samples/pktgen/functions.sh - -Examples: - - pg_ctrl start starts injection. - pg_ctrl stop aborts injection. Also, ^C aborts generator. - - pgset "clone_skb 1" sets the number of copies of the same packet - pgset "clone_skb 0" use single SKB for all transmits - pgset "burst 8" uses xmit_more API to queue 8 copies of the same - packet and update HW tx queue tail pointer once. - "burst 1" is the default - pgset "pkt_size 9014" sets packet size to 9014 - pgset "frags 5" packet will consist of 5 fragments - pgset "count 200000" sets number of packets to send, set to zero - for continuous sends until explicitly stopped. - - pgset "delay 5000" adds delay to hard_start_xmit(). nanoseconds - - pgset "dst 10.0.0.1" sets IP destination address - (BEWARE! This generator is very aggressive!) - - pgset "dst_min 10.0.0.1" Same as dst - pgset "dst_max 10.0.0.254" Set the maximum destination IP. - pgset "src_min 10.0.0.1" Set the minimum (or only) source IP. - pgset "src_max 10.0.0.254" Set the maximum source IP. - pgset "dst6 fec0::1" IPV6 destination address - pgset "src6 fec0::2" IPV6 source address - pgset "dstmac 00:00:00:00:00:00" sets MAC destination address - pgset "srcmac 00:00:00:00:00:00" sets MAC source address - - pgset "queue_map_min 0" Sets the min value of tx queue interval - pgset "queue_map_max 7" Sets the max value of tx queue interval, for multiqueue devices - To select queue 1 of a given device, - use queue_map_min=1 and queue_map_max=1 - - pgset "src_mac_count 1" Sets the number of MACs we'll range through. - The 'minimum' MAC is what you set with srcmac. - - pgset "dst_mac_count 1" Sets the number of MACs we'll range through. - The 'minimum' MAC is what you set with dstmac. - - pgset "flag [name]" Set a flag to determine behaviour. Current flags - are: IPSRC_RND # IP source is random (between min/max) - IPDST_RND # IP destination is random - UDPSRC_RND, UDPDST_RND, - MACSRC_RND, MACDST_RND - TXSIZE_RND, IPV6, - MPLS_RND, VID_RND, SVID_RND - FLOW_SEQ, - QUEUE_MAP_RND # queue map random - QUEUE_MAP_CPU # queue map mirrors smp_processor_id() - UDPCSUM, - IPSEC # IPsec encapsulation (needs CONFIG_XFRM) - NODE_ALLOC # node specific memory allocation - NO_TIMESTAMP # disable timestamping - pgset 'flag ![name]' Clear a flag to determine behaviour. - Note that you might need to use single quote in - interactive mode, so that your shell wouldn't expand - the specified flag as a history command. - - pgset "spi [SPI_VALUE]" Set specific SA used to transform packet. - - pgset "udp_src_min 9" set UDP source port min, If < udp_src_max, then - cycle through the port range. - - pgset "udp_src_max 9" set UDP source port max. - pgset "udp_dst_min 9" set UDP destination port min, If < udp_dst_max, then - cycle through the port range. - pgset "udp_dst_max 9" set UDP destination port max. - - pgset "mpls 0001000a,0002000a,0000000a" set MPLS labels (in this example - outer label=16,middle label=32, - inner label=0 (IPv4 NULL)) Note that - there must be no spaces between the - arguments. Leading zeros are required. - Do not set the bottom of stack bit, - that's done automatically. If you do - set the bottom of stack bit, that - indicates that you want to randomly - generate that address and the flag - MPLS_RND will be turned on. You - can have any mix of random and fixed - labels in the label stack. - - pgset "mpls 0" turn off mpls (or any invalid argument works too!) - - pgset "vlan_id 77" set VLAN ID 0-4095 - pgset "vlan_p 3" set priority bit 0-7 (default 0) - pgset "vlan_cfi 0" set canonical format identifier 0-1 (default 0) - - pgset "svlan_id 22" set SVLAN ID 0-4095 - pgset "svlan_p 3" set priority bit 0-7 (default 0) - pgset "svlan_cfi 0" set canonical format identifier 0-1 (default 0) - - pgset "vlan_id 9999" > 4095 remove vlan and svlan tags - pgset "svlan 9999" > 4095 remove svlan tag - - - pgset "tos XX" set former IPv4 TOS field (e.g. "tos 28" for AF11 no ECN, default 00) - pgset "traffic_class XX" set former IPv6 TRAFFIC CLASS (e.g. "traffic_class B8" for EF no ECN, default 00) - - pgset "rate 300M" set rate to 300 Mb/s - pgset "ratep 1000000" set rate to 1Mpps - - pgset "xmit_mode netif_receive" RX inject into stack netif_receive_skb() - Works with "burst" but not with "clone_skb". - Default xmit_mode is "start_xmit". - -Sample scripts -============== - -A collection of tutorial scripts and helpers for pktgen is in the -samples/pktgen directory. The helper parameters.sh file support easy -and consistent parameter parsing across the sample scripts. - -Usage example and help: - ./pktgen_sample01_simple.sh -i eth4 -m 00:1B:21:3C:9D:F8 -d 192.168.8.2 - -Usage: ./pktgen_sample01_simple.sh [-vx] -i ethX - -i : ($DEV) output interface/device (required) - -s : ($PKT_SIZE) packet size - -d : ($DEST_IP) destination IP - -m : ($DST_MAC) destination MAC-addr - -t : ($THREADS) threads to start - -c : ($SKB_CLONE) SKB clones send before alloc new SKB - -b : ($BURST) HW level bursting of SKBs - -v : ($VERBOSE) verbose - -x : ($DEBUG) debug - -The global variables being set are also listed. E.g. the required -interface/device parameter "-i" sets variable $DEV. Copy the -pktgen_sampleXX scripts and modify them to fit your own needs. - -The old scripts: - -pktgen.conf-1-2 # 1 CPU 2 dev -pktgen.conf-1-1-rdos # 1 CPU 1 dev w. route DoS -pktgen.conf-1-1-ip6 # 1 CPU 1 dev ipv6 -pktgen.conf-1-1-ip6-rdos # 1 CPU 1 dev ipv6 w. route DoS -pktgen.conf-1-1-flows # 1 CPU 1 dev multiple flows. - - -Interrupt affinity -=================== -Note that when adding devices to a specific CPU it is a good idea to -also assign /proc/irq/XX/smp_affinity so that the TX interrupts are bound -to the same CPU. This reduces cache bouncing when freeing skbs. - -Plus using the device flag QUEUE_MAP_CPU, which maps the SKBs TX queue -to the running threads CPU (directly from smp_processor_id()). - -Enable IPsec -============ -Default IPsec transformation with ESP encapsulation plus transport mode -can be enabled by simply setting: - -pgset "flag IPSEC" -pgset "flows 1" - -To avoid breaking existing testbed scripts for using AH type and tunnel mode, -you can use "pgset spi SPI_VALUE" to specify which transformation mode -to employ. - - -Current commands and configuration options -========================================== - -** Pgcontrol commands: - -start -stop -reset - -** Thread commands: - -add_device -rem_device_all - - -** Device commands: - -count -clone_skb -burst -debug - -frags -delay - -src_mac_count -dst_mac_count - -pkt_size -min_pkt_size -max_pkt_size - -queue_map_min -queue_map_max -skb_priority - -tos (ipv4) -traffic_class (ipv6) - -mpls - -udp_src_min -udp_src_max - -udp_dst_min -udp_dst_max - -node - -flag - IPSRC_RND - IPDST_RND - UDPSRC_RND - UDPDST_RND - MACSRC_RND - MACDST_RND - TXSIZE_RND - IPV6 - MPLS_RND - VID_RND - SVID_RND - FLOW_SEQ - QUEUE_MAP_RND - QUEUE_MAP_CPU - UDPCSUM - IPSEC - NODE_ALLOC - NO_TIMESTAMP - -spi (ipsec) - -dst_min -dst_max - -src_min -src_max - -dst_mac -src_mac - -clear_counters - -src6 -dst6 -dst6_max -dst6_min - -flows -flowlen - -rate -ratep - -xmit_mode - -vlan_cfi -vlan_id -vlan_p - -svlan_cfi -svlan_id -svlan_p - - -References: -ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/ -ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/examples/ - -Paper from Linux-Kongress in Erlangen 2004. -ftp://robur.slu.se/pub/Linux/net-development/pktgen-testing/pktgen_paper.pdf - -Thanks to: -Grant Grundler for testing on IA-64 and parisc, Harald Welte, Lennert Buytenhek -Stephen Hemminger, Andi Kleen, Dave Miller and many others. - - -Good luck with the linux net-development. diff --git a/net/Kconfig b/net/Kconfig index 8b1f85820a6b..c5ba2d180c43 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -344,7 +344,7 @@ config NET_PKTGEN what was just said, you don't need it: say N. Documentation on how to use the packet generator can be found - at . + at . To compile this code as a module, choose M here: the module will be called pktgen. diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 08e2811b5274..b53b6d38c4df 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -56,7 +56,7 @@ * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br) * * 021124 Finished major redesign and rewrite for new functionality. - * See Documentation/networking/pktgen.txt for how to use this. + * See Documentation/networking/pktgen.rst for how to use this. * * The new operation: * For each CPU one thread/process is created at start. This process checks diff --git a/samples/pktgen/README.rst b/samples/pktgen/README.rst index 3f6483e8b2df..f9c53ca5cf93 100644 --- a/samples/pktgen/README.rst +++ b/samples/pktgen/README.rst @@ -3,7 +3,7 @@ Sample and benchmark scripts for pktgen (packet generator) This directory contains some pktgen sample and benchmark scripts, that can easily be copied and adjusted for your own use-case. -General doc is located in kernel: Documentation/networking/pktgen.txt +General doc is located in kernel: Documentation/networking/pktgen.rst Helper include files ==================== -- cgit v1.2.3 From cff9f12b18915d957a2130885a00f8ab15cff7e4 Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Thu, 30 Apr 2020 22:21:31 +0300 Subject: net/core: Introduce netdev_get_xmit_slave Add new ndo to get the xmit slave of master device. The reference counters are not incremented so the caller must be careful with locks. User can ask to get the xmit slave assume all the slaves can transmit by set all_slaves arg to true. Signed-off-by: Maor Gottlieb Reviewed-by: Jiri Pirko Reviewed-by: David Ahern Acked-by: David S. Miller Signed-off-by: Saeed Mahameed --- include/linux/netdevice.h | 12 ++++++++++++ net/core/dev.c | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) (limited to 'net/core') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 130a668049ab..26bc0f11b7ad 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1146,6 +1146,12 @@ struct netdev_net_notifier { * int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); * Called to release previously enslaved netdev. * + * struct net_device *(*ndo_get_xmit_slave)(struct net_device *dev, + * struct sk_buff *skb, + * bool all_slaves); + * Get the xmit slave of master device. If all_slaves is true, function + * assume all the slaves can transmit. + * * Feature/offload setting functions. * netdev_features_t (*ndo_fix_features)(struct net_device *dev, * netdev_features_t features); @@ -1389,6 +1395,9 @@ struct net_device_ops { struct netlink_ext_ack *extack); int (*ndo_del_slave)(struct net_device *dev, struct net_device *slave_dev); + struct net_device* (*ndo_get_xmit_slave)(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves); netdev_features_t (*ndo_fix_features)(struct net_device *dev, netdev_features_t features); int (*ndo_set_features)(struct net_device *dev, @@ -2731,6 +2740,9 @@ void netdev_freemem(struct net_device *dev); void synchronize_net(void); int init_dummy_netdev(struct net_device *dev); +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves); struct net_device *dev_get_by_index(struct net *net, int ifindex); struct net_device *__dev_get_by_index(struct net *net, int ifindex); struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex); diff --git a/net/core/dev.c b/net/core/dev.c index 9c9e763bfe0e..e6c10980abfd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -7785,6 +7785,28 @@ void netdev_bonding_info_change(struct net_device *dev, } EXPORT_SYMBOL(netdev_bonding_info_change); +/** + * netdev_get_xmit_slave - Get the xmit slave of master device + * @skb: The packet + * @all_slaves: assume all the slaves are active + * + * The reference counters are not incremented so the caller must be + * careful with locks. The caller must hold RCU lock. + * %NULL is returned if no slave is found. + */ + +struct net_device *netdev_get_xmit_slave(struct net_device *dev, + struct sk_buff *skb, + bool all_slaves) +{ + const struct net_device_ops *ops = dev->netdev_ops; + + if (!ops->ndo_get_xmit_slave) + return NULL; + return ops->ndo_get_xmit_slave(dev, skb, all_slaves); +} +EXPORT_SYMBOL(netdev_get_xmit_slave); + static void netdev_adjacent_add_links(struct net_device *dev) { struct netdev_adjacent *iter; -- cgit v1.2.3 From beecf11bc2188067824591612151c4dc6ec383c7 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Thu, 30 Apr 2020 16:31:52 -0700 Subject: bpf: Bpf_{g,s}etsockopt for struct bpf_sock_addr Currently, bpf_getsockopt and bpf_setsockopt helpers operate on the 'struct bpf_sock_ops' context in BPF_PROG_TYPE_SOCK_OPS program. Let's generalize them and make them available for 'struct bpf_sock_addr'. That way, in the future, we can allow those helpers in more places. As an example, let's expose those 'struct bpf_sock_addr' based helpers to BPF_CGROUP_INET{4,6}_CONNECT hooks. That way we can override CC before the connection is made. v3: * Expose custom helpers for bpf_sock_addr context instead of doing generic bpf_sock argument (as suggested by Daniel). Even with try_socket_lock that doesn't sleep we have a problem where context sk is already locked and socket lock is non-nestable. v2: * s/BPF_PROG_TYPE_CGROUP_SOCKOPT/BPF_PROG_TYPE_SOCK_OPS/ Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200430233152.199403-1-sdf@google.com --- include/uapi/linux/bpf.h | 14 ++- net/core/filter.c | 118 +++++++++++++++++----- tools/include/uapi/linux/bpf.h | 14 ++- tools/testing/selftests/bpf/config | 1 + tools/testing/selftests/bpf/progs/connect4_prog.c | 46 +++++++++ 5 files changed, 166 insertions(+), 27 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 705e4822f997..b3643e27e264 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1587,7 +1587,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,6 +1595,11 @@ union bpf_attr { * must be specified, see **setsockopt(2)** for more information. * The option value of length *optlen* is pointed by *optval*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: * @@ -1789,7 +1794,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1798,6 +1803,11 @@ union bpf_attr { * The retrieved value is stored in the structure pointed by * *opval* and of length *optlen*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **getsockopt()**. * It supports the following *level*\ s: * diff --git a/net/core/filter.c b/net/core/filter.c index 70b32723e6be..dfaf5df13722 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4194,16 +4194,19 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +#define SOCKOPT_CC_REINIT (1 << 0) + +static int _bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen, u32 flags) { - struct sock *sk = bpf_sock->sk; int ret = 0; int val; if (!sk_fullsock(sk)) return -EINVAL; + sock_owned_by_me(sk); + if (level == SOL_SOCKET) { if (optlen != sizeof(int)) return -EINVAL; @@ -4298,7 +4301,7 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { char name[TCP_CA_NAME_MAX]; - bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN; + bool reinit = flags & SOCKOPT_CC_REINIT; strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); @@ -4345,24 +4348,14 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, return ret; } -static const struct bpf_func_proto bpf_setsockopt_proto = { - .func = bpf_setsockopt, - .gpl_only = false, - .ret_type = RET_INTEGER, - .arg1_type = ARG_PTR_TO_CTX, - .arg2_type = ARG_ANYTHING, - .arg3_type = ARG_ANYTHING, - .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, -}; - -BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, - int, level, int, optname, char *, optval, int, optlen) +static int _bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) { - struct sock *sk = bpf_sock->sk; - if (!sk_fullsock(sk)) goto err_clear; + + sock_owned_by_me(sk); + #ifdef CONFIG_INET if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { struct inet_connection_sock *icsk; @@ -4428,8 +4421,71 @@ err_clear: return -EINVAL; } -static const struct bpf_func_proto bpf_getsockopt_proto = { - .func = bpf_getsockopt, +BPF_CALL_5(bpf_sock_addr_setsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + return _bpf_setsockopt(ctx->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_addr_setsockopt_proto = { + .func = bpf_sock_addr_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_addr_getsockopt, struct bpf_sock_addr_kern *, ctx, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(ctx->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_addr_getsockopt_proto = { + .func = bpf_sock_addr_getsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_UNINIT_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + u32 flags = 0; + if (bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN) + flags |= SOCKOPT_CC_REINIT; + return _bpf_setsockopt(bpf_sock->sk, level, optname, optval, optlen, + flags); +} + +static const struct bpf_func_proto bpf_sock_ops_setsockopt_proto = { + .func = bpf_sock_ops_setsockopt, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE, +}; + +BPF_CALL_5(bpf_sock_ops_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, + int, level, int, optname, char *, optval, int, optlen) +{ + return _bpf_getsockopt(bpf_sock->sk, level, optname, optval, optlen); +} + +static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = { + .func = bpf_sock_ops_getsockopt, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -6043,6 +6099,22 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_setsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_setsockopt_proto; + default: + return NULL; + } + case BPF_FUNC_getsockopt: + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_sock_addr_getsockopt_proto; + default: + return NULL; + } default: return bpf_base_func_proto(func_id); } @@ -6261,9 +6333,9 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: - return &bpf_setsockopt_proto; + return &bpf_sock_ops_setsockopt_proto; case BPF_FUNC_getsockopt: - return &bpf_getsockopt_proto; + return &bpf_sock_ops_getsockopt_proto; case BPF_FUNC_sock_ops_cb_flags_set: return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 705e4822f997..b3643e27e264 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1587,7 +1587,7 @@ union bpf_attr { * Return * 0 * - * int bpf_setsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **setsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1595,6 +1595,11 @@ union bpf_attr { * must be specified, see **setsockopt(2)** for more information. * The option value of length *optlen* is pointed by *optval*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: * @@ -1789,7 +1794,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_getsockopt(struct bpf_sock_ops *bpf_socket, int level, int optname, void *optval, int optlen) + * int bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) * Description * Emulate a call to **getsockopt()** on the socket associated to * *bpf_socket*, which must be a full socket. The *level* at @@ -1798,6 +1803,11 @@ union bpf_attr { * The retrieved value is stored in the structure pointed by * *opval* and of length *optlen*. * + * *bpf_socket* should be one of the following: + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** + * and **BPF_CGROUP_INET6_CONNECT**. + * * This helper actually implements a subset of **getsockopt()**. * It supports the following *level*\ s: * diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index 60e3ae5d4e48..6e5b94c036ca 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -37,3 +37,4 @@ CONFIG_IPV6_SIT=m CONFIG_BPF_JIT=y CONFIG_BPF_LSM=y CONFIG_SECURITY=y +CONFIG_TCP_CONG_DCTCP=y diff --git a/tools/testing/selftests/bpf/progs/connect4_prog.c b/tools/testing/selftests/bpf/progs/connect4_prog.c index ad3c498a8150..972918cd2d7f 100644 --- a/tools/testing/selftests/bpf/progs/connect4_prog.c +++ b/tools/testing/selftests/bpf/progs/connect4_prog.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include @@ -16,6 +17,10 @@ #define DST_REWRITE_IP4 0x7f000001U #define DST_REWRITE_PORT4 4444 +#ifndef TCP_CA_NAME_MAX +#define TCP_CA_NAME_MAX 16 +#endif + int _version SEC("version") = 1; __attribute__ ((noinline)) @@ -33,6 +38,43 @@ int do_bind(struct bpf_sock_addr *ctx) return 1; } +static __inline int verify_cc(struct bpf_sock_addr *ctx, + char expected[TCP_CA_NAME_MAX]) +{ + char buf[TCP_CA_NAME_MAX]; + int i; + + if (bpf_getsockopt(ctx, SOL_TCP, TCP_CONGESTION, &buf, sizeof(buf))) + return 1; + + for (i = 0; i < TCP_CA_NAME_MAX; i++) { + if (buf[i] != expected[i]) + return 1; + if (buf[i] == 0) + break; + } + + return 0; +} + +static __inline int set_cc(struct bpf_sock_addr *ctx) +{ + char dctcp[TCP_CA_NAME_MAX] = "dctcp"; + char cubic[TCP_CA_NAME_MAX] = "cubic"; + + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &dctcp, sizeof(dctcp))) + return 1; + if (verify_cc(ctx, dctcp)) + return 1; + + if (bpf_setsockopt(ctx, SOL_TCP, TCP_CONGESTION, &cubic, sizeof(cubic))) + return 1; + if (verify_cc(ctx, cubic)) + return 1; + + return 0; +} + SEC("cgroup/connect4") int connect_v4_prog(struct bpf_sock_addr *ctx) { @@ -66,6 +108,10 @@ int connect_v4_prog(struct bpf_sock_addr *ctx) bpf_sk_release(sk); + /* Rewrite congestion control. */ + if (ctx->type == SOCK_STREAM && set_cc(ctx)) + return 0; + /* Rewrite destination. */ ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4); ctx->user_port = bpf_htons(DST_REWRITE_PORT4); -- cgit v1.2.3 From 41a46913bee76d8493681442907ccd989ced2633 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Mon, 27 Apr 2020 18:37:43 +0200 Subject: net: fix skb_panic to output real address In skb_panic() the real pointer values are really needed to diagnose issues, e.g. data and head are related (to calculate headroom). The hashed versions of the addresses doesn't make much sense here. The patch use the printk specifier %px to print the actual address. The printk documentation on %px: https://www.kernel.org/doc/html/latest/core-api/printk-formats.html#unmodified-addresses Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 7e29590482ce..1bf0c3d278e7 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -102,7 +102,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags); static void skb_panic(struct sk_buff *skb, unsigned int sz, void *addr, const char msg[]) { - pr_emerg("%s: text:%p len:%d put:%d head:%p data:%p tail:%#lx end:%#lx dev:%s\n", + pr_emerg("%s: text:%px len:%d put:%d head:%px data:%px tail:%#lx end:%#lx dev:%s\n", msg, addr, skb->len, sz, skb->head, skb->data, (unsigned long)skb->tail, (unsigned long)skb->end, skb->dev ? skb->dev->name : ""); -- cgit v1.2.3 From dd86fec7e06ab792fe470c66a67ff42bf5d72b91 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 May 2020 09:40:40 -0700 Subject: devlink: factor out building a snapshot notification We'll need to send snapshot info back on the socket which requested a snapshot to be created. Factor out constructing a snapshot description from the broadcast notification code. v3: new patch Signed-off-by: Jakub Kicinski Reviewed-by: Jiri Pirko Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- net/core/devlink.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index 80f97722f31f..2b7c60c18b99 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3716,24 +3716,26 @@ nla_put_failure: return err; } -static void devlink_nl_region_notify(struct devlink_region *region, - struct devlink_snapshot *snapshot, - enum devlink_command cmd) +static struct sk_buff * +devlink_nl_region_notify_build(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd, u32 portid, u32 seq) { struct devlink *devlink = region->devlink; struct sk_buff *msg; void *hdr; int err; - WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) - return; + return ERR_PTR(-ENOMEM); - hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd); - if (!hdr) + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, 0, cmd); + if (!hdr) { + err = -EMSGSIZE; goto out_free_msg; + } err = devlink_nl_put_handle(msg, devlink); if (err) @@ -3757,15 +3759,30 @@ static void devlink_nl_region_notify(struct devlink_region *region, } genlmsg_end(msg, hdr); - genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), - msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); - - return; + return msg; out_cancel_msg: genlmsg_cancel(msg, hdr); out_free_msg: nlmsg_free(msg); + return ERR_PTR(err); +} + +static void devlink_nl_region_notify(struct devlink_region *region, + struct devlink_snapshot *snapshot, + enum devlink_command cmd) +{ + struct devlink *devlink = region->devlink; + struct sk_buff *msg; + + WARN_ON(cmd != DEVLINK_CMD_REGION_NEW && cmd != DEVLINK_CMD_REGION_DEL); + + msg = devlink_nl_region_notify_build(region, snapshot, cmd, 0, 0); + if (IS_ERR(msg)) + return; + + genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink), + msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } /** -- cgit v1.2.3 From 043b3e22768d5d909cb1474fc21ae2fbaf026c0c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 May 2020 09:40:41 -0700 Subject: devlink: let kernel allocate region snapshot id Currently users have to choose a free snapshot id before calling DEVLINK_CMD_REGION_NEW. This is potentially racy and inconvenient. Make the DEVLINK_ATTR_REGION_SNAPSHOT_ID optional and try to allocate id automatically. Send a message back to the caller with the snapshot info. Example use: $ devlink region new netdevsim/netdevsim1/dummy netdevsim/netdevsim1/dummy: snapshot 1 $ id=$(devlink -j region new netdevsim/netdevsim1/dummy | \ jq '.[][][][]') $ devlink region dump netdevsim/netdevsim1/dummy snapshot $id [...] $ devlink region del netdevsim/netdevsim1/dummy snapshot $id v4: - inline the notification code v3: - send the notification only once snapshot creation completed. v2: - don't wrap the line containing extack; - add a few sentences to the docs. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- .../networking/devlink/devlink-region.rst | 7 ++- net/core/devlink.c | 57 +++++++++++++++++----- .../selftests/drivers/net/netdevsim/devlink.sh | 13 +++++ 3 files changed, 62 insertions(+), 15 deletions(-) (limited to 'net/core') diff --git a/Documentation/networking/devlink/devlink-region.rst b/Documentation/networking/devlink/devlink-region.rst index 04e04d1ff627..daf35427fce1 100644 --- a/Documentation/networking/devlink/devlink-region.rst +++ b/Documentation/networking/devlink/devlink-region.rst @@ -23,7 +23,9 @@ states, but see also :doc:`devlink-health` Regions may optionally support capturing a snapshot on demand via the ``DEVLINK_CMD_REGION_NEW`` netlink message. A driver wishing to allow requested snapshots must implement the ``.snapshot`` callback for the region -in its ``devlink_region_ops`` structure. +in its ``devlink_region_ops`` structure. If snapshot id is not set in +the ``DEVLINK_CMD_REGION_NEW`` request kernel will allocate one and send +the snapshot information to user space. example usage ------------- @@ -45,7 +47,8 @@ example usage $ devlink region del pci/0000:00:05.0/cr-space snapshot 1 # Request an immediate snapshot, if supported by the region - $ devlink region new pci/0000:00:05.0/cr-space snapshot 5 + $ devlink region new pci/0000:00:05.0/cr-space + pci/0000:00:05.0/cr-space: snapshot 5 # Dump a snapshot: $ devlink region dump pci/0000:00:05.0/fw-health snapshot 1 diff --git a/net/core/devlink.c b/net/core/devlink.c index 2b7c60c18b99..43a9d5be73ca 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4086,6 +4086,8 @@ static int devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink = info->user_ptr[0]; + struct devlink_snapshot *snapshot; + struct nlattr *snapshot_id_attr; struct devlink_region *region; const char *region_name; u32 snapshot_id; @@ -4097,11 +4099,6 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - if (!info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { - NL_SET_ERR_MSG_MOD(info->extack, "No snapshot id provided"); - return -EINVAL; - } - region_name = nla_data(info->attrs[DEVLINK_ATTR_REGION_NAME]); region = devlink_region_get_by_name(devlink, region_name); if (!region) { @@ -4119,16 +4116,25 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) return -ENOSPC; } - snapshot_id = nla_get_u32(info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]); + snapshot_id_attr = info->attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; + if (snapshot_id_attr) { + snapshot_id = nla_get_u32(snapshot_id_attr); - if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { - NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); - return -EEXIST; - } + if (devlink_region_snapshot_get_by_id(region, snapshot_id)) { + NL_SET_ERR_MSG_MOD(info->extack, "The requested snapshot id is already in use"); + return -EEXIST; + } - err = __devlink_snapshot_id_insert(devlink, snapshot_id); - if (err) - return err; + err = __devlink_snapshot_id_insert(devlink, snapshot_id); + if (err) + return err; + } else { + err = __devlink_region_snapshot_id_get(devlink, &snapshot_id); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "Failed to allocate a new snapshot id"); + return err; + } + } err = region->ops->snapshot(devlink, info->extack, &data); if (err) @@ -4138,6 +4144,27 @@ devlink_nl_cmd_region_new(struct sk_buff *skb, struct genl_info *info) if (err) goto err_snapshot_create; + if (!snapshot_id_attr) { + struct sk_buff *msg; + + snapshot = devlink_region_snapshot_get_by_id(region, + snapshot_id); + if (WARN_ON(!snapshot)) + return -EINVAL; + + msg = devlink_nl_region_notify_build(region, snapshot, + DEVLINK_CMD_REGION_NEW, + info->snd_portid, + info->snd_seq); + err = PTR_ERR_OR_ZERO(msg); + if (err) + goto err_notify; + + err = genlmsg_reply(msg, info); + if (err) + goto err_notify; + } + return 0; err_snapshot_create: @@ -4145,6 +4172,10 @@ err_snapshot_create: err_snapshot_capture: __devlink_snapshot_id_decrement(devlink, snapshot_id); return err; + +err_notify: + devlink_region_snapshot_del(region, snapshot); + return err; } static int devlink_nl_cmd_region_read_chunk_fill(struct sk_buff *msg, diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index 9f9741444549..ad539eccddcb 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -151,6 +151,19 @@ regions_test() check_region_snapshot_count dummy post-second-delete 2 + sid=$(devlink -j region new $DL_HANDLE/dummy | jq '.[][][][]') + check_err $? "Failed to create a new snapshot with id allocated by the kernel" + + check_region_snapshot_count dummy post-first-request 3 + + devlink region dump $DL_HANDLE/dummy snapshot $sid >> /dev/null + check_err $? "Failed to dump a snapshot with id allocated by the kernel" + + devlink region del $DL_HANDLE/dummy snapshot $sid + check_err $? "Failed to delete snapshot with id allocated by the kernel" + + check_region_snapshot_count dummy post-first-request 2 + log_test "regions test" } -- cgit v1.2.3 From 1a33e10e4a95cb109ff1145098175df3113313ef Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sat, 2 May 2020 22:22:19 -0700 Subject: net: partially revert dynamic lockdep key changes This patch reverts the folowing commits: commit 064ff66e2bef84f1153087612032b5b9eab005bd "bonding: add missing netdev_update_lockdep_key()" commit 53d374979ef147ab51f5d632dfe20b14aebeccd0 "net: avoid updating qdisc_xmit_lock_key in netdev_update_lockdep_key()" commit 1f26c0d3d24125992ab0026b0dab16c08df947c7 "net: fix kernel-doc warning in " commit ab92d68fc22f9afab480153bd82a20f6e2533769 "net: core: add generic lockdep keys" but keeps the addr_list_lock_key because we still lock addr_list_lock nestedly on stack devices, unlikely xmit_lock this is safe because we don't take addr_list_lock on any fast path. Reported-and-tested-by: syzbot+aaa6fa4949cc5d9b7b25@syzkaller.appspotmail.com Cc: Dmitry Vyukov Cc: Taehee Yoo Signed-off-by: Cong Wang Acked-by: Taehee Yoo Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 1 + drivers/net/ethernet/netronome/nfp/nfp_net_repr.c | 16 ++++ drivers/net/hamradio/bpqether.c | 20 +++++ drivers/net/hyperv/netvsc_drv.c | 2 + drivers/net/ipvlan/ipvlan_main.c | 2 + drivers/net/macsec.c | 2 + drivers/net/macvlan.c | 2 + drivers/net/ppp/ppp_generic.c | 2 + drivers/net/team/team.c | 1 + drivers/net/vrf.c | 1 + drivers/net/wireless/intersil/hostap/hostap_hw.c | 22 ++++++ include/linux/netdevice.h | 27 +++++-- net/8021q/vlan_dev.c | 21 ++++++ net/batman-adv/soft-interface.c | 30 ++++++++ net/bluetooth/6lowpan.c | 8 ++ net/core/dev.c | 90 ++++++++++++++++++----- net/dsa/slave.c | 12 +++ net/ieee802154/6lowpan/core.c | 8 ++ net/l2tp/l2tp_eth.c | 1 + net/netrom/af_netrom.c | 21 ++++++ net/rose/af_rose.c | 21 ++++++ net/sched/sch_generic.c | 17 +++-- 22 files changed, 294 insertions(+), 33 deletions(-) (limited to 'net/core') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2e70e43c5df5..d01871321d22 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4898,6 +4898,7 @@ static int bond_init(struct net_device *bond_dev) spin_lock_init(&bond->stats_lock); lockdep_register_key(&bond->stats_lock_key); lockdep_set_class(&bond->stats_lock, &bond->stats_lock_key); + netdev_lockdep_set_classes(bond_dev); list_add_tail(&bond->bond_list, &bn->dev_list); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c index 79d72c88bbef..b3cabc274121 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_repr.c @@ -299,6 +299,20 @@ static void nfp_repr_clean(struct nfp_repr *repr) nfp_port_free(repr->port); } +static struct lock_class_key nfp_repr_netdev_xmit_lock_key; + +static void nfp_repr_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &nfp_repr_netdev_xmit_lock_key); +} + +static void nfp_repr_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, nfp_repr_set_lockdep_class_one, NULL); +} + int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, u32 cmsg_port_id, struct nfp_port *port, struct net_device *pf_netdev) @@ -308,6 +322,8 @@ int nfp_repr_init(struct nfp_app *app, struct net_device *netdev, u32 repr_cap = nn->tlv_caps.repr_cap; int err; + nfp_repr_set_lockdep_class(netdev); + repr->port = port; repr->dst = metadata_dst_alloc(0, METADATA_HW_PORT_MUX, GFP_KERNEL); if (!repr->dst) diff --git a/drivers/net/hamradio/bpqether.c b/drivers/net/hamradio/bpqether.c index fbea6f232819..206688154fdf 100644 --- a/drivers/net/hamradio/bpqether.c +++ b/drivers/net/hamradio/bpqether.c @@ -107,6 +107,25 @@ struct bpqdev { static LIST_HEAD(bpq_devices); +/* + * bpqether network devices are paired with ethernet devices below them, so + * form a special "super class" of normal ethernet devices; split their locks + * off into a separate class since they always nest. + */ +static struct lock_class_key bpq_netdev_xmit_lock_key; + +static void bpq_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &bpq_netdev_xmit_lock_key); +} + +static void bpq_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, bpq_set_lockdep_class_one, NULL); +} + /* ------------------------------------------------------------------------ */ @@ -477,6 +496,7 @@ static int bpq_new_device(struct net_device *edev) err = register_netdevice(ndev); if (err) goto error; + bpq_set_lockdep_class(ndev); /* List protected by RTNL */ list_add_rcu(&bpq->bpq_list, &bpq_devices); diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index d8e86bdbfba1..c0b647a4c893 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -2456,6 +2456,8 @@ static int netvsc_probe(struct hv_device *dev, NETIF_F_HW_VLAN_CTAG_RX; net->vlan_features = net->features; + netdev_lockdep_set_classes(net); + /* MTU range: 68 - 1500 or 65521 */ net->min_mtu = NETVSC_MTU_MIN; if (nvdev->nvsp_version >= NVSP_PROTOCOL_VERSION_2) diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index f195f278a83a..15e87c097b0b 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -131,6 +131,8 @@ static int ipvlan_init(struct net_device *dev) dev->gso_max_segs = phy_dev->gso_max_segs; dev->hard_header_len = phy_dev->hard_header_len; + netdev_lockdep_set_classes(dev); + ipvlan->pcpu_stats = netdev_alloc_pcpu_stats(struct ipvl_pcpu_stats); if (!ipvlan->pcpu_stats) return -ENOMEM; diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c index 758baf7cb8a1..ea3f25cc79ef 100644 --- a/drivers/net/macsec.c +++ b/drivers/net/macsec.c @@ -4047,6 +4047,8 @@ static int macsec_newlink(struct net *net, struct net_device *dev, if (err < 0) return err; + netdev_lockdep_set_classes(dev); + err = netdev_upper_dev_link(real_dev, dev, extack); if (err < 0) goto unregister; diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index d45600e0a38c..34eb073cdd74 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -890,6 +890,8 @@ static int macvlan_init(struct net_device *dev) dev->gso_max_segs = lowerdev->gso_max_segs; dev->hard_header_len = lowerdev->hard_header_len; + netdev_lockdep_set_classes(dev); + vlan->pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->pcpu_stats) return -ENOMEM; diff --git a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c index 22cc2cb9d878..7d005896a0f9 100644 --- a/drivers/net/ppp/ppp_generic.c +++ b/drivers/net/ppp/ppp_generic.c @@ -1410,6 +1410,8 @@ static int ppp_dev_init(struct net_device *dev) { struct ppp *ppp; + netdev_lockdep_set_classes(dev); + ppp = netdev_priv(dev); /* Let the netdevice take a reference on the ppp file. This ensures * that ppp_destroy_interface() won't run before the device gets diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index 04845a4017f9..8c1e02752ff6 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1647,6 +1647,7 @@ static int team_init(struct net_device *dev) lockdep_register_key(&team->team_lock_key); __mutex_init(&team->lock, "team->team_lock_key", &team->team_lock_key); + netdev_lockdep_set_classes(dev); return 0; diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 56f8aab46f89..43928a1c2f2a 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -867,6 +867,7 @@ static int vrf_dev_init(struct net_device *dev) /* similarly, oper state is irrelevant; set to up to avoid confusion */ dev->operstate = IF_OPER_UP; + netdev_lockdep_set_classes(dev); return 0; out_rth: diff --git a/drivers/net/wireless/intersil/hostap/hostap_hw.c b/drivers/net/wireless/intersil/hostap/hostap_hw.c index 58212c532c90..aadf3dec5bf3 100644 --- a/drivers/net/wireless/intersil/hostap/hostap_hw.c +++ b/drivers/net/wireless/intersil/hostap/hostap_hw.c @@ -3041,6 +3041,27 @@ static void prism2_clear_set_tim_queue(local_info_t *local) } } + +/* + * HostAP uses two layers of net devices, where the inner + * layer gets called all the time from the outer layer. + * This is a natural nesting, which needs a split lock type. + */ +static struct lock_class_key hostap_netdev_xmit_lock_key; + +static void prism2_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &hostap_netdev_xmit_lock_key); +} + +static void prism2_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, prism2_set_lockdep_class_one, NULL); +} + static struct net_device * prism2_init_local_data(struct prism2_helper_functions *funcs, int card_idx, struct device *sdev) @@ -3199,6 +3220,7 @@ while (0) if (ret >= 0) ret = register_netdevice(dev); + prism2_set_lockdep_class(dev); rtnl_unlock(); if (ret < 0) { printk(KERN_WARNING "%s: register netdevice failed!\n", diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5a8d40f1ffe2..7725efd6e48a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1805,13 +1805,11 @@ enum netdev_priv_flags { * @phydev: Physical device may attach itself * for hardware timestamping * @sfp_bus: attached &struct sfp_bus structure. - * @qdisc_tx_busylock_key: lockdep class annotating Qdisc->busylock - * spinlock - * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount - * @qdisc_xmit_lock_key: lockdep class annotating - * netdev_queue->_xmit_lock spinlock + * * @addr_list_lock_key: lockdep class annotating * net_device->addr_list_lock spinlock + * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock + * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount * * @proto_down: protocol port state information can be sent to the * switch driver and used to set the phys state of the @@ -2112,10 +2110,9 @@ struct net_device { #endif struct phy_device *phydev; struct sfp_bus *sfp_bus; - struct lock_class_key qdisc_tx_busylock_key; - struct lock_class_key qdisc_running_key; - struct lock_class_key qdisc_xmit_lock_key; struct lock_class_key addr_list_lock_key; + struct lock_class_key *qdisc_tx_busylock; + struct lock_class_key *qdisc_running_key; bool proto_down; unsigned wol_enabled:1; @@ -2200,6 +2197,20 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, f(dev, &dev->_tx[i], arg); } +#define netdev_lockdep_set_classes(dev) \ +{ \ + static struct lock_class_key qdisc_tx_busylock_key; \ + static struct lock_class_key qdisc_running_key; \ + static struct lock_class_key qdisc_xmit_lock_key; \ + unsigned int i; \ + \ + (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ + (dev)->qdisc_running_key = &qdisc_running_key; \ + for (i = 0; i < (dev)->num_tx_queues; i++) \ + lockdep_set_class(&(dev)->_tx[i]._xmit_lock, \ + &qdisc_xmit_lock_key); \ +} + u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev); struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 990b9fde28c6..319220b2341d 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -489,6 +489,25 @@ static void vlan_dev_set_rx_mode(struct net_device *vlan_dev) dev_uc_sync(vlan_dev_priv(vlan_dev)->real_dev, vlan_dev); } +/* + * vlan network devices have devices nesting below it, and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key vlan_netdev_xmit_lock_key; + +static void vlan_dev_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *unused) +{ + lockdep_set_class(&txq->_xmit_lock, &vlan_netdev_xmit_lock_key); +} + +static void vlan_dev_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, vlan_dev_set_lockdep_one, NULL); +} + static const struct header_ops vlan_header_ops = { .create = vlan_dev_hard_header, .parse = eth_header_parse, @@ -579,6 +598,8 @@ static int vlan_dev_init(struct net_device *dev) SET_NETDEV_DEVTYPE(dev, &vlan_type); + vlan_dev_set_lockdep_class(dev); + vlan->vlan_pcpu_stats = netdev_alloc_pcpu_stats(struct vlan_pcpu_stats); if (!vlan->vlan_pcpu_stats) return -ENOMEM; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 5f05a728f347..822af540b854 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -739,6 +739,34 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto, return 0; } +/* batman-adv network devices have devices nesting below it and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key batadv_netdev_xmit_lock_key; + +/** + * batadv_set_lockdep_class_one() - Set lockdep class for a single tx queue + * @dev: device which owns the tx queue + * @txq: tx queue to modify + * @_unused: always NULL + */ +static void batadv_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &batadv_netdev_xmit_lock_key); +} + +/** + * batadv_set_lockdep_class() - Set txq and addr_list lockdep class + * @dev: network device to modify + */ +static void batadv_set_lockdep_class(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, batadv_set_lockdep_class_one, NULL); +} + /** * batadv_softif_init_late() - late stage initialization of soft interface * @dev: registered network device to modify @@ -752,6 +780,8 @@ static int batadv_softif_init_late(struct net_device *dev) int ret; size_t cnt_len = sizeof(u64) * BATADV_CNT_NUM; + batadv_set_lockdep_class(dev); + bat_priv = netdev_priv(dev); bat_priv->soft_iface = dev; diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index 4febc82a7c76..bb55d92691b0 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -571,7 +571,15 @@ static netdev_tx_t bt_xmit(struct sk_buff *skb, struct net_device *netdev) return err < 0 ? NET_XMIT_DROP : err; } +static int bt_dev_init(struct net_device *dev) +{ + netdev_lockdep_set_classes(dev); + + return 0; +} + static const struct net_device_ops netdev_ops = { + .ndo_init = bt_dev_init, .ndo_start_xmit = bt_xmit, }; diff --git a/net/core/dev.c b/net/core/dev.c index afff16849c26..f8d83922a6af 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -398,6 +398,74 @@ static RAW_NOTIFIER_HEAD(netdev_chain); DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data); EXPORT_PER_CPU_SYMBOL(softnet_data); +#ifdef CONFIG_LOCKDEP +/* + * register_netdevice() inits txq->_xmit_lock and sets lockdep class + * according to dev->type + */ +static const unsigned short netdev_lock_type[] = { + ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25, + ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET, + ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM, + ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP, + ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD, + ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25, + ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP, + ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD, + ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI, + ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE, + ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET, + ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL, + ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM, + ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE, + ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE}; + +static const char *const netdev_lock_name[] = { + "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25", + "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET", + "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM", + "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP", + "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD", + "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25", + "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP", + "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD", + "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI", + "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE", + "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET", + "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL", + "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM", + "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE", + "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"}; + +static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)]; + +static inline unsigned short netdev_lock_pos(unsigned short dev_type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++) + if (netdev_lock_type[i] == dev_type) + return i; + /* the last key is used by default */ + return ARRAY_SIZE(netdev_lock_type) - 1; +} + +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ + int i; + + i = netdev_lock_pos(dev_type); + lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i], + netdev_lock_name[i]); +} +#else +static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock, + unsigned short dev_type) +{ +} +#endif + /******************************************************************************* * * Protocol management and registration routines @@ -9208,7 +9276,7 @@ static void netdev_init_one_queue(struct net_device *dev, { /* Initialize queue lock */ spin_lock_init(&queue->_xmit_lock); - lockdep_set_class(&queue->_xmit_lock, &dev->qdisc_xmit_lock_key); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); queue->xmit_lock_owner = -1; netdev_queue_numa_node_write(queue, NUMA_NO_NODE); queue->dev = dev; @@ -9255,22 +9323,6 @@ void netif_tx_stop_all_queues(struct net_device *dev) } EXPORT_SYMBOL(netif_tx_stop_all_queues); -static void netdev_register_lockdep_key(struct net_device *dev) -{ - lockdep_register_key(&dev->qdisc_tx_busylock_key); - lockdep_register_key(&dev->qdisc_running_key); - lockdep_register_key(&dev->qdisc_xmit_lock_key); - lockdep_register_key(&dev->addr_list_lock_key); -} - -static void netdev_unregister_lockdep_key(struct net_device *dev) -{ - lockdep_unregister_key(&dev->qdisc_tx_busylock_key); - lockdep_unregister_key(&dev->qdisc_running_key); - lockdep_unregister_key(&dev->qdisc_xmit_lock_key); - lockdep_unregister_key(&dev->addr_list_lock_key); -} - void netdev_update_lockdep_key(struct net_device *dev) { lockdep_unregister_key(&dev->addr_list_lock_key); @@ -9837,7 +9889,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - netdev_register_lockdep_key(dev); + lockdep_register_key(&dev->addr_list_lock_key); dev->gso_max_size = GSO_MAX_SIZE; dev->gso_max_segs = GSO_MAX_SEGS; @@ -9926,7 +9978,7 @@ void free_netdev(struct net_device *dev) free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; - netdev_unregister_lockdep_key(dev); + lockdep_unregister_key(&dev->addr_list_lock_key); /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { diff --git a/net/dsa/slave.c b/net/dsa/slave.c index ba8bf90dc0cc..fa2634043751 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -1671,6 +1671,15 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) return ret; } +static struct lock_class_key dsa_slave_netdev_xmit_lock_key; +static void dsa_slave_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, + &dsa_slave_netdev_xmit_lock_key); +} + int dsa_slave_suspend(struct net_device *slave_dev) { struct dsa_port *dp = dsa_slave_to_port(slave_dev); @@ -1754,6 +1763,9 @@ int dsa_slave_create(struct dsa_port *port) slave_dev->max_mtu = ETH_MAX_MTU; SET_NETDEV_DEVTYPE(slave_dev, &dsa_type); + netdev_for_each_tx_queue(slave_dev, dsa_slave_set_lockdep_class_one, + NULL); + SET_NETDEV_DEV(slave_dev, port->ds->dev); slave_dev->dev.of_node = port->dn; slave_dev->vlan_features = master->vlan_features; diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c index c0b107cdd715..3297e7fa9945 100644 --- a/net/ieee802154/6lowpan/core.c +++ b/net/ieee802154/6lowpan/core.c @@ -58,6 +58,13 @@ static const struct header_ops lowpan_header_ops = { .create = lowpan_header_create, }; +static int lowpan_dev_init(struct net_device *ldev) +{ + netdev_lockdep_set_classes(ldev); + + return 0; +} + static int lowpan_open(struct net_device *dev) { if (!open_count) @@ -89,6 +96,7 @@ static int lowpan_get_iflink(const struct net_device *dev) } static const struct net_device_ops lowpan_netdev_ops = { + .ndo_init = lowpan_dev_init, .ndo_start_xmit = lowpan_xmit, .ndo_open = lowpan_open, .ndo_stop = lowpan_stop, diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index d3b520b9b2c9..fd5ac2788e45 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -56,6 +56,7 @@ static int l2tp_eth_dev_init(struct net_device *dev) { eth_hw_addr_random(dev); eth_broadcast_addr(dev->broadcast); + netdev_lockdep_set_classes(dev); return 0; } diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 7b1a74f74aad..eccc7d366e17 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -63,6 +63,26 @@ static DEFINE_SPINLOCK(nr_list_lock); static const struct proto_ops nr_proto_ops; +/* + * NETROM network devices are virtual network devices encapsulating NETROM + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key nr_netdev_xmit_lock_key; + +static void nr_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &nr_netdev_xmit_lock_key); +} + +static void nr_set_lockdep_key(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, nr_set_lockdep_one, NULL); +} + /* * Socket removal during an interrupt is now safe. */ @@ -1394,6 +1414,7 @@ static int __init nr_proto_init(void) free_netdev(dev); goto fail; } + nr_set_lockdep_key(dev); dev_nr[i] = dev; } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 1e8eeb044b07..e7a872207b46 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -64,6 +64,26 @@ static const struct proto_ops rose_proto_ops; ax25_address rose_callsign; +/* + * ROSE network devices are virtual network devices encapsulating ROSE + * frames into AX.25 which will be sent through an AX.25 device, so form a + * special "super class" of normal net devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key rose_netdev_xmit_lock_key; + +static void rose_set_lockdep_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &rose_netdev_xmit_lock_key); +} + +static void rose_set_lockdep_key(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, rose_set_lockdep_one, NULL); +} + /* * Convert a ROSE address into text. */ @@ -1511,6 +1531,7 @@ static int __init rose_proto_init(void) free_netdev(dev); goto fail; } + rose_set_lockdep_key(dev); dev_rose[i] = dev; } diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index ad24fa1a51e6..ebc55d884247 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -794,6 +794,9 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { }; EXPORT_SYMBOL(pfifo_fast_ops); +static struct lock_class_key qdisc_tx_busylock; +static struct lock_class_key qdisc_running_key; + struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, const struct Qdisc_ops *ops, struct netlink_ext_ack *extack) @@ -846,9 +849,17 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, } spin_lock_init(&sch->busylock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + /* seqlock has the same scope of busylock, for NOLOCK qdisc */ spin_lock_init(&sch->seqlock); + lockdep_set_class(&sch->busylock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + seqcount_init(&sch->running); + lockdep_set_class(&sch->running, + dev->qdisc_running_key ?: &qdisc_running_key); sch->ops = ops; sch->flags = ops->static_flags; @@ -859,12 +870,6 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, dev_hold(dev); refcount_set(&sch->refcnt, 1); - if (sch != &noop_qdisc) { - lockdep_set_class(&sch->busylock, &dev->qdisc_tx_busylock_key); - lockdep_set_class(&sch->seqlock, &dev->qdisc_tx_busylock_key); - lockdep_set_class(&sch->running, &dev->qdisc_running_key); - } - return sch; errout1: kfree(p); -- cgit v1.2.3 From 307f660d056b5eb8f5bb2328fac3915ab75b5007 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:18 -0700 Subject: netpoll: remove dev argument from netpoll_send_skb_on_dev() netpoll_send_skb_on_dev() can get the device pointer directly from np->dev Rename it to __netpoll_send_skb() Following patch will move netpoll_send_skb() out-of-line. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netpoll.h | 5 ++--- net/core/netpoll.c | 10 ++++++---- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 676f1ff161a9..00e0bae3d402 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,13 +63,12 @@ int netpoll_setup(struct netpoll *np); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, - struct net_device *dev); +void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; local_irq_save(flags); - netpoll_send_skb_on_dev(np, skb, np->dev); + __netpoll_send_skb(np, skb); local_irq_restore(flags); } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 15b366a1a958..c5059b7ffc94 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -305,17 +305,19 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, - struct net_device *dev) +void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; + struct net_device *dev; unsigned long tries; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo; lockdep_assert_irqs_disabled(); - npinfo = rcu_dereference_bh(np->dev->npinfo); + dev = np->dev; + npinfo = rcu_dereference_bh(dev->npinfo); + if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); return; @@ -358,7 +360,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, schedule_delayed_work(&npinfo->tx_work,0); } } -EXPORT_SYMBOL(netpoll_send_skb_on_dev); +EXPORT_SYMBOL(__netpoll_send_skb); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { -- cgit v1.2.3 From fb1eee476b0d3be3e58dac1a3a96f726c6278bed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:19 -0700 Subject: netpoll: move netpoll_send_skb() out of line There is no need to inline this helper, as we intend to add more code in this function. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netpoll.h | 9 +-------- net/core/netpoll.c | 13 +++++++++++-- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index 00e0bae3d402..e466ddffef61 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,14 +63,7 @@ int netpoll_setup(struct netpoll *np); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); -static inline void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) -{ - unsigned long flags; - local_irq_save(flags); - __netpoll_send_skb(np, skb); - local_irq_restore(flags); -} +void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); #ifdef CONFIG_NETPOLL static inline void *netpoll_poll_lock(struct napi_struct *napi) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index c5059b7ffc94..34cd34f24423 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -305,7 +305,7 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; struct net_device *dev; @@ -360,7 +360,16 @@ void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) schedule_delayed_work(&npinfo->tx_work,0); } } -EXPORT_SYMBOL(__netpoll_send_skb); + +void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +{ + unsigned long flags; + + local_irq_save(flags); + __netpoll_send_skb(np, skb); + local_irq_restore(flags); +} +EXPORT_SYMBOL(netpoll_send_skb); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { -- cgit v1.2.3 From 1ddabdfaf70c202b88925edd74c66f4707dbd92e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:20 -0700 Subject: netpoll: netpoll_send_skb() returns transmit status Some callers want to know if the packet has been sent or dropped, to inform upper stacks. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netpoll.h | 2 +- net/core/netpoll.c | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/include/linux/netpoll.h b/include/linux/netpoll.h index e466ddffef61..f47af135bd56 100644 --- a/include/linux/netpoll.h +++ b/include/linux/netpoll.h @@ -63,7 +63,7 @@ int netpoll_setup(struct netpoll *np); void __netpoll_cleanup(struct netpoll *np); void __netpoll_free(struct netpoll *np); void netpoll_cleanup(struct netpoll *np); -void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); +netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb); #ifdef CONFIG_NETPOLL static inline void *netpoll_poll_lock(struct napi_struct *napi) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 34cd34f24423..40d2753aa47d 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -305,7 +305,7 @@ static int netpoll_owner_active(struct net_device *dev) } /* call with IRQ disabled */ -static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +static netdev_tx_t __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { netdev_tx_t status = NETDEV_TX_BUSY; struct net_device *dev; @@ -320,7 +320,7 @@ static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) if (!npinfo || !netif_running(dev) || !netif_device_present(dev)) { dev_kfree_skb_irq(skb); - return; + return NET_XMIT_DROP; } /* don't get messages out of order, and no recursion */ @@ -359,15 +359,18 @@ static void __netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) skb_queue_tail(&npinfo->txq, skb); schedule_delayed_work(&npinfo->tx_work,0); } + return NETDEV_TX_OK; } -void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) { unsigned long flags; + netdev_tx_t ret; local_irq_save(flags); - __netpoll_send_skb(np, skb); + ret = __netpoll_send_skb(np, skb); local_irq_restore(flags); + return ret; } EXPORT_SYMBOL(netpoll_send_skb); -- cgit v1.2.3 From f78ed2204db9fc35b545d693865bddbe0149aa1f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 09:32:21 -0700 Subject: netpoll: accept NULL np argument in netpoll_send_skb() netpoll_send_skb() callers seem to leak skb if the np pointer is NULL. While this should not happen, we can make the code more robust. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- drivers/net/macvlan.c | 5 ++--- include/linux/if_team.h | 5 +---- include/net/bonding.h | 5 +---- net/8021q/vlan_dev.c | 5 ++--- net/bridge/br_private.h | 5 +---- net/core/netpoll.c | 11 ++++++++--- net/dsa/slave.c | 5 ++--- 7 files changed, 17 insertions(+), 24 deletions(-) (limited to 'net/core') diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 34eb073cdd74..9a419d5102ce 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -542,12 +542,11 @@ xmit_world: static inline netdev_tx_t macvlan_netpoll_send_skb(struct macvlan_dev *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER - if (vlan->netpoll) - netpoll_send_skb(vlan->netpoll, skb); + return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static netdev_tx_t macvlan_start_xmit(struct sk_buff *skb, diff --git a/include/linux/if_team.h b/include/linux/if_team.h index ec7e4bd07f82..537dc2b8c879 100644 --- a/include/linux/if_team.h +++ b/include/linux/if_team.h @@ -102,10 +102,7 @@ static inline bool team_port_dev_txable(const struct net_device *port_dev) static inline void team_netpoll_send_skb(struct team_port *port, struct sk_buff *skb) { - struct netpoll *np = port->np; - - if (np) - netpoll_send_skb(np, skb); + netpoll_send_skb(port->np, skb); } #else static inline void team_netpoll_send_skb(struct team_port *port, diff --git a/include/net/bonding.h b/include/net/bonding.h index 0b696da5c115..f211983cd52a 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -507,10 +507,7 @@ static inline unsigned long slave_last_rx(struct bonding *bond, static inline void bond_netpoll_send_skb(const struct slave *slave, struct sk_buff *skb) { - struct netpoll *np = slave->np; - - if (np) - netpoll_send_skb(np, skb); + netpoll_send_skb(slave->np, skb); } #else static inline void bond_netpoll_send_skb(const struct slave *slave, diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 319220b2341d..f00bb57f0f60 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -88,12 +88,11 @@ static int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, static inline netdev_tx_t vlan_netpoll_send_skb(struct vlan_dev_priv *vlan, struct sk_buff *skb) { #ifdef CONFIG_NET_POLL_CONTROLLER - if (vlan->netpoll) - netpoll_send_skb(vlan->netpoll, skb); + return netpoll_send_skb(vlan->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb, diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 78d3a951180d..4dc21e8f7e33 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -598,10 +598,7 @@ netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev); static inline void br_netpoll_send_skb(const struct net_bridge_port *p, struct sk_buff *skb) { - struct netpoll *np = p->np; - - if (np) - netpoll_send_skb(np, skb); + netpoll_send_skb(p->np, skb); } int br_netpoll_enable(struct net_bridge_port *p); diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 40d2753aa47d..093e90e52bc2 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -367,9 +367,14 @@ netdev_tx_t netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) unsigned long flags; netdev_tx_t ret; - local_irq_save(flags); - ret = __netpoll_send_skb(np, skb); - local_irq_restore(flags); + if (unlikely(!np)) { + dev_kfree_skb_irq(skb); + ret = NET_XMIT_DROP; + } else { + local_irq_save(flags); + ret = __netpoll_send_skb(np, skb); + local_irq_restore(flags); + } return ret; } EXPORT_SYMBOL(netpoll_send_skb); diff --git a/net/dsa/slave.c b/net/dsa/slave.c index dfb4282fc339..61b0de52040a 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -445,12 +445,11 @@ static inline netdev_tx_t dsa_slave_netpoll_send_skb(struct net_device *dev, #ifdef CONFIG_NET_POLL_CONTROLLER struct dsa_slave_priv *p = netdev_priv(dev); - if (p->netpoll) - netpoll_send_skb(p->netpoll, skb); + return netpoll_send_skb(p->netpoll, skb); #else BUG(); -#endif return NETDEV_TX_OK; +#endif } static void dsa_skb_tx_timestamp(struct dsa_slave_priv *p, -- cgit v1.2.3 From 790709f249728640faa4eff38286a9feb34fed81 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 10:05:39 -0700 Subject: net: relax SO_TXTIME CAP_NET_ADMIN check Now sch_fq has horizon feature, we want to allow QUIC/UDP applications to use EDT model so that pacing can be offloaded to the kernel (sch_fq) or the NIC. Signed-off-by: Eric Dumazet Cc: Willem de Bruijn Acked-by: Willem de Bruijn Acked-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- net/core/sock.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index b714162213ae..fd85e651ce28 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1152,23 +1152,31 @@ set_rcvbuf: break; case SO_TXTIME: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { - ret = -EPERM; - } else if (optlen != sizeof(struct sock_txtime)) { + if (optlen != sizeof(struct sock_txtime)) { ret = -EINVAL; + break; } else if (copy_from_user(&sk_txtime, optval, sizeof(struct sock_txtime))) { ret = -EFAULT; + break; } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { ret = -EINVAL; - } else { - sock_valbool_flag(sk, SOCK_TXTIME, true); - sk->sk_clockid = sk_txtime.clockid; - sk->sk_txtime_deadline_mode = - !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); - sk->sk_txtime_report_errors = - !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); + break; + } + /* CLOCK_MONOTONIC is only used by sch_fq, and this packet + * scheduler has enough safe guards. + */ + if (sk_txtime.clockid != CLOCK_MONOTONIC && + !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + ret = -EPERM; + break; } + sock_valbool_flag(sk, SOCK_TXTIME, true); + sk->sk_clockid = sk_txtime.clockid; + sk->sk_txtime_deadline_mode = + !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); + sk->sk_txtime_report_errors = + !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); break; case SO_BINDTOIFINDEX: -- cgit v1.2.3 From cb0721c7e200750907bb8ef59b12646a5cb2dadf Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:10 -0700 Subject: net: Refactor arguments of inet{,6}_bind The intent is to add an additional bind parameter in the next commit. Instead of adding another argument, let's convert all existing flag arguments into an extendable bit field. No functional changes. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200508174611.228805-4-sdf@google.com --- include/net/inet_common.h | 6 +++++- include/net/ipv6_stubs.h | 2 +- net/core/filter.c | 6 ++++-- net/ipv4/af_inet.c | 10 +++++----- net/ipv6/af_inet6.c | 10 +++++----- 5 files changed, 20 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index ae2ba897675c..c38f4f7d660a 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -35,8 +35,12 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +/* Don't allocate port at this moment, defer to connect. */ +#define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) +/* Grab and release socket lock. */ +#define BIND_WITH_LOCK (1 << 1) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); + u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/include/net/ipv6_stubs.h b/include/net/ipv6_stubs.h index a5f7c12c326a..6e622dd3122e 100644 --- a/include/net/ipv6_stubs.h +++ b/include/net/ipv6_stubs.h @@ -63,7 +63,7 @@ extern const struct ipv6_stub *ipv6_stub __read_mostly; /* A stub used by bpf helpers. Similarly ugly as ipv6_stub */ struct ipv6_bpf_stub { int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock); + u32 flags); struct sock *(*udp6_lib_lookup)(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, diff --git a/net/core/filter.c b/net/core/filter.c index dfaf5df13722..fa9ddab5dd1f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4538,7 +4538,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, return err; if (((struct sockaddr_in *)addr)->sin_port != htons(0)) return err; - return __inet_bind(sk, addr, addr_len, true, false); + return __inet_bind(sk, addr, addr_len, + BIND_FORCE_ADDRESS_NO_PORT); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) @@ -4548,7 +4549,8 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, + BIND_FORCE_ADDRESS_NO_PORT); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6177c4ba0037..68e74b1b0f26 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -450,12 +450,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - return __inet_bind(sk, uaddr, addr_len, false, true); + return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); } EXPORT_SYMBOL(inet_bind); int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) + u32 flags) { struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct inet_sock *inet = inet_sk(sk); @@ -506,7 +506,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ - if (with_lock) + if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ @@ -520,7 +520,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. */ if (snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) { + (flags & BIND_FORCE_ADDRESS_NO_PORT))) { if (sk->sk_prot->get_port(sk, snum)) { inet->inet_saddr = inet->inet_rcv_saddr = 0; err = -EADDRINUSE; @@ -543,7 +543,7 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, sk_dst_reset(sk); err = 0; out_release_sock: - if (with_lock) + if (flags & BIND_WITH_LOCK) release_sock(sk); out: return err; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 345baa0a754f..552c2592b81c 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -273,7 +273,7 @@ out_rcu_unlock: } static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, - bool force_bind_address_no_port, bool with_lock) + u32 flags) { struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct inet_sock *inet = inet_sk(sk); @@ -297,7 +297,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; - if (with_lock) + if (flags & BIND_WITH_LOCK) lock_sock(sk); /* Check these errors (active socket, double bind). */ @@ -400,7 +400,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, /* Make sure we are allowed to bind here. */ if (snum || !(inet->bind_address_no_port || - force_bind_address_no_port)) { + (flags & BIND_FORCE_ADDRESS_NO_PORT))) { if (sk->sk_prot->get_port(sk, snum)) { sk->sk_ipv6only = saved_ipv6only; inet_reset_saddr(sk); @@ -423,7 +423,7 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, inet->inet_dport = 0; inet->inet_daddr = 0; out: - if (with_lock) + if (flags & BIND_WITH_LOCK) release_sock(sk); return err; out_unlock: @@ -451,7 +451,7 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (err) return err; - return __inet6_bind(sk, uaddr, addr_len, false, true); + return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); } EXPORT_SYMBOL(inet6_bind); -- cgit v1.2.3 From 8086fbaf49345f988deec539ec8e182b02914401 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 8 May 2020 10:46:11 -0700 Subject: bpf: Allow any port in bpf_bind helper We want to have a tighter control on what ports we bind to in the BPF_CGROUP_INET{4,6}_CONNECT hooks even if it means connect() becomes slightly more expensive. The expensive part comes from the fact that we now need to call inet_csk_get_port() that verifies that the port is not used and allocates an entry in the hash table for it. Since we can't rely on "snum || !bind_address_no_port" to prevent us from calling POST_BIND hook anymore, let's add another bind flag to indicate that the call site is BPF program. v5: * fix wrong AF_INET (should be AF_INET6) in the bpf program for v6 v3: * More bpf_bind documentation refinements (Martin KaFai Lau) * Add UDP tests as well (Martin KaFai Lau) * Don't start the thread, just do socket+bind+listen (Martin KaFai Lau) v2: * Update documentation (Andrey Ignatov) * Pass BIND_FORCE_ADDRESS_NO_PORT conditionally (Andrey Ignatov) Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Andrey Ignatov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20200508174611.228805-5-sdf@google.com --- include/net/inet_common.h | 2 + include/uapi/linux/bpf.h | 9 +- net/core/filter.c | 18 ++-- net/ipv4/af_inet.c | 10 +- net/ipv6/af_inet6.c | 12 ++- tools/include/uapi/linux/bpf.h | 9 +- .../selftests/bpf/prog_tests/connect_force_port.c | 115 +++++++++++++++++++++ .../selftests/bpf/progs/connect_force_port4.c | 28 +++++ .../selftests/bpf/progs/connect_force_port6.c | 28 +++++ 9 files changed, 203 insertions(+), 28 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/connect_force_port.c create mode 100644 tools/testing/selftests/bpf/progs/connect_force_port4.c create mode 100644 tools/testing/selftests/bpf/progs/connect_force_port6.c (limited to 'net/core') diff --git a/include/net/inet_common.h b/include/net/inet_common.h index c38f4f7d660a..cb2818862919 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -39,6 +39,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ #define BIND_WITH_LOCK (1 << 1) +/* Called from BPF program. */ +#define BIND_FROM_BPF (1 << 2) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b3643e27e264..6e5e7caa3739 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1994,10 +1994,11 @@ union bpf_attr { * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. * diff --git a/net/core/filter.c b/net/core/filter.c index fa9ddab5dd1f..da0634979f53 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4525,32 +4525,28 @@ BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, { #ifdef CONFIG_INET struct sock *sk = ctx->sk; + u32 flags = BIND_FROM_BPF; int err; - /* Binding to port can be expensive so it's prohibited in the helper. - * Only binding to IP is supported. - */ err = -EINVAL; if (addr_len < offsetofend(struct sockaddr, sa_family)) return err; if (addr->sa_family == AF_INET) { if (addr_len < sizeof(struct sockaddr_in)) return err; - if (((struct sockaddr_in *)addr)->sin_port != htons(0)) - return err; - return __inet_bind(sk, addr, addr_len, - BIND_FORCE_ADDRESS_NO_PORT); + if (((struct sockaddr_in *)addr)->sin_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; + return __inet_bind(sk, addr, addr_len, flags); #if IS_ENABLED(CONFIG_IPV6) } else if (addr->sa_family == AF_INET6) { if (addr_len < SIN6_LEN_RFC2133) return err; - if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) - return err; + if (((struct sockaddr_in6 *)addr)->sin6_port == htons(0)) + flags |= BIND_FORCE_ADDRESS_NO_PORT; /* ipv6_bpf_stub cannot be NULL, since it's called from * bpf_cgroup_inet6_connect hook and ipv6 is already loaded */ - return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, - BIND_FORCE_ADDRESS_NO_PORT); + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, flags); #endif /* CONFIG_IPV6 */ } #endif /* CONFIG_INET */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 68e74b1b0f26..fcf0d12a407a 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -526,10 +526,12 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EADDRINUSE; goto out_release_sock; } - err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); - if (err) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - goto out_release_sock; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 552c2592b81c..771a462a8322 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -407,11 +407,13 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, err = -EADDRINUSE; goto out; } - err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); - if (err) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - goto out; + if (!(flags & BIND_FROM_BPF)) { + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); + if (err) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + goto out; + } } } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b3643e27e264..6e5e7caa3739 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1994,10 +1994,11 @@ union bpf_attr { * * This helper works for IPv4 and IPv6, TCP and UDP sockets. The * domain (*addr*\ **->sa_family**) must be **AF_INET** (or - * **AF_INET6**). Looking for a free port to bind to can be - * expensive, therefore binding to port is not permitted by the - * helper: *addr*\ **->sin_port** (or **sin6_port**, respectively) - * must be set to zero. + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. * Return * 0 on success, or a negative error in case of failure. * diff --git a/tools/testing/selftests/bpf/prog_tests/connect_force_port.c b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c new file mode 100644 index 000000000000..47fbb20cb6a6 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/connect_force_port.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "cgroup_helpers.h" +#include "network_helpers.h" + +static int verify_port(int family, int fd, int expected) +{ + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + __u16 port; + + if (getsockname(fd, (struct sockaddr *)&addr, &len)) { + log_err("Failed to get server addr"); + return -1; + } + + if (family == AF_INET) + port = ((struct sockaddr_in *)&addr)->sin_port; + else + port = ((struct sockaddr_in6 *)&addr)->sin6_port; + + if (ntohs(port) != expected) { + log_err("Unexpected port %d, expected %d", ntohs(port), + expected); + return -1; + } + + return 0; +} + +static int run_test(int cgroup_fd, int server_fd, int family, int type) +{ + struct bpf_prog_load_attr attr = { + .prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + }; + struct bpf_object *obj; + int expected_port; + int prog_fd; + int err; + int fd; + + if (family == AF_INET) { + attr.file = "./connect_force_port4.o"; + attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT; + expected_port = 22222; + } else { + attr.file = "./connect_force_port6.o"; + attr.expected_attach_type = BPF_CGROUP_INET6_CONNECT; + expected_port = 22223; + } + + err = bpf_prog_load_xattr(&attr, &obj, &prog_fd); + if (err) { + log_err("Failed to load BPF object"); + return -1; + } + + err = bpf_prog_attach(prog_fd, cgroup_fd, attr.expected_attach_type, + 0); + if (err) { + log_err("Failed to attach BPF program"); + goto close_bpf_object; + } + + fd = connect_to_fd(family, type, server_fd); + if (fd < 0) { + err = -1; + goto close_bpf_object; + } + + err = verify_port(family, fd, expected_port); + + close(fd); + +close_bpf_object: + bpf_object__close(obj); + return err; +} + +void test_connect_force_port(void) +{ + int server_fd, cgroup_fd; + + cgroup_fd = test__join_cgroup("/connect_force_port"); + if (CHECK_FAIL(cgroup_fd < 0)) + return; + + server_fd = start_server(AF_INET, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server(AF_INET6, SOCK_STREAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_STREAM)); + close(server_fd); + + server_fd = start_server(AF_INET, SOCK_DGRAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET, SOCK_DGRAM)); + close(server_fd); + + server_fd = start_server(AF_INET6, SOCK_DGRAM); + if (CHECK_FAIL(server_fd < 0)) + goto close_cgroup_fd; + CHECK_FAIL(run_test(cgroup_fd, server_fd, AF_INET6, SOCK_DGRAM)); + close(server_fd); + +close_cgroup_fd: + close(cgroup_fd); +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port4.c b/tools/testing/selftests/bpf/progs/connect_force_port4.c new file mode 100644 index 000000000000..1b8eb34b2db0 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port4.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include + +#include +#include + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +SEC("cgroup/connect4") +int _connect4(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in sa = {}; + + sa.sin_family = AF_INET; + sa.sin_port = bpf_htons(22222); + sa.sin_addr.s_addr = bpf_htonl(0x7f000001); /* 127.0.0.1 */ + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; +} diff --git a/tools/testing/selftests/bpf/progs/connect_force_port6.c b/tools/testing/selftests/bpf/progs/connect_force_port6.c new file mode 100644 index 000000000000..ae6f7d750b4c --- /dev/null +++ b/tools/testing/selftests/bpf/progs/connect_force_port6.c @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include +#include +#include +#include + +#include +#include + +char _license[] SEC("license") = "GPL"; +int _version SEC("version") = 1; + +SEC("cgroup/connect6") +int _connect6(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in6 sa = {}; + + sa.sin6_family = AF_INET6; + sa.sin6_port = bpf_htons(22223); + sa.sin6_addr.s6_addr32[3] = bpf_htonl(1); /* ::1 */ + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + + return 1; +} -- cgit v1.2.3 From cf86a086a18095e33e0637cb78cda1fcf5280852 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 May 2020 18:58:10 -0700 Subject: net/dst: use a smaller percpu_counter batch for dst entries accounting percpu_counter_add() uses a default batch size which is quite big on platforms with 256 cpus. (2*256 -> 512) This means dst_entries_get_fast() can be off by +/- 2*(nr_cpus^2) (131072 on servers with 256 cpus) Reduce the batch size to something more reasonable, and add logic to ip6_dst_gc() to call dst_entries_get_slow() before calling the _very_ expensive fib6_run_gc() function. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dst_ops.h | 4 +++- net/core/dst.c | 8 ++++---- net/ipv6/route.c | 3 +++ 3 files changed, 10 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h index 443863c7b8da..88ff7bb2bb9b 100644 --- a/include/net/dst_ops.h +++ b/include/net/dst_ops.h @@ -53,9 +53,11 @@ static inline int dst_entries_get_slow(struct dst_ops *dst) return percpu_counter_sum_positive(&dst->pcpuc_entries); } +#define DST_PERCPU_COUNTER_BATCH 32 static inline void dst_entries_add(struct dst_ops *dst, int val) { - percpu_counter_add(&dst->pcpuc_entries, val); + percpu_counter_add_batch(&dst->pcpuc_entries, val, + DST_PERCPU_COUNTER_BATCH); } static inline int dst_entries_init(struct dst_ops *dst) diff --git a/net/core/dst.c b/net/core/dst.c index 193af526e908..d6b6ced0d451 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -81,11 +81,11 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev, { struct dst_entry *dst; - if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { + if (ops->gc && + !(flags & DST_NOCOUNT) && + dst_entries_get_fast(ops) > ops->gc_thresh) { if (ops->gc(ops)) { - printk_ratelimited(KERN_NOTICE "Route cache is full: " - "consider increasing sysctl " - "net.ipv[4|6].route.max_size.\n"); + pr_notice_ratelimited("Route cache is full: consider increasing sysctl net.ipv6.route.max_size.\n"); return NULL; } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 1ff142393c76..a9072dba00f4 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3195,6 +3195,9 @@ static int ip6_dst_gc(struct dst_ops *ops) int entries; entries = dst_entries_get_fast(ops); + if (entries > rt_max_size) + entries = dst_entries_get_slow(ops); + if (time_after(rt_last_gc + rt_min_interval, jiffies) && entries <= rt_max_size) goto out; -- cgit v1.2.3 From 0462b6bdb6445b887b8896f28be92e0d94c92e7b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 May 2020 13:59:11 +0200 Subject: net: add a CMSG_USER_DATA macro Add a variant of CMSG_DATA that operates on user pointer to avoid sparse warnings about casting to/from user pointers. Also fix up CMSG_DATA to rely on the gcc extension that allows void pointer arithmetics to cut down on the amount of casts. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/socket.h | 5 ++++- net/core/scm.c | 4 ++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/include/linux/socket.h b/include/linux/socket.h index 54338fac45cb..4cc64d611cf4 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -94,7 +94,10 @@ struct cmsghdr { #define CMSG_ALIGN(len) ( ((len)+sizeof(long)-1) & ~(sizeof(long)-1) ) -#define CMSG_DATA(cmsg) ((void *)((char *)(cmsg) + sizeof(struct cmsghdr))) +#define CMSG_DATA(cmsg) \ + ((void *)(cmsg) + sizeof(struct cmsghdr)) +#define CMSG_USER_DATA(cmsg) \ + ((void __user *)(cmsg) + sizeof(struct cmsghdr)) #define CMSG_SPACE(len) (sizeof(struct cmsghdr) + CMSG_ALIGN(len)) #define CMSG_LEN(len) (sizeof(struct cmsghdr) + (len)) diff --git a/net/core/scm.c b/net/core/scm.c index dc6fed1f221c..abfdc85a64c1 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -236,7 +236,7 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) err = -EFAULT; if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) goto out; - if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) + if (copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) goto out; cmlen = CMSG_SPACE(len); if (msg->msg_controllen < cmlen) @@ -300,7 +300,7 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) if (fdnum < fdmax) fdmax = fdnum; - for (i=0, cmfptr=(__force int __user *)CMSG_DATA(cm); i Date: Mon, 11 May 2020 13:59:12 +0200 Subject: net/scm: cleanup scm_detach_fds Factor out two helpes to keep the code tidy. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/core/scm.c | 94 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 51 insertions(+), 43 deletions(-) (limited to 'net/core') diff --git a/net/core/scm.c b/net/core/scm.c index abfdc85a64c1..168b006a52ff 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -277,78 +277,86 @@ void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_inter } EXPORT_SYMBOL(put_cmsg_scm_timestamping); +static int __scm_install_fd(struct file *file, int __user *ufd, int o_flags) +{ + struct socket *sock; + int new_fd; + int error; + + error = security_file_receive(file); + if (error) + return error; + + new_fd = get_unused_fd_flags(o_flags); + if (new_fd < 0) + return new_fd; + + error = put_user(new_fd, ufd); + if (error) { + put_unused_fd(new_fd); + return error; + } + + /* Bump the usage count and install the file. */ + sock = sock_from_file(file, &error); + if (sock) { + sock_update_netprioidx(&sock->sk->sk_cgrp_data); + sock_update_classid(&sock->sk->sk_cgrp_data); + } + fd_install(new_fd, get_file(file)); + return error; +} + +static int scm_max_fds(struct msghdr *msg) +{ + if (msg->msg_controllen <= sizeof(struct cmsghdr)) + return 0; + return (msg->msg_controllen - sizeof(struct cmsghdr)) / sizeof(int); +} + void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) { struct cmsghdr __user *cm = (__force struct cmsghdr __user*)msg->msg_control; - - int fdmax = 0; - int fdnum = scm->fp->count; - struct file **fp = scm->fp->fp; - int __user *cmfptr; + int o_flags = (msg->msg_flags & MSG_CMSG_CLOEXEC) ? O_CLOEXEC : 0; + int fdmax = min_t(int, scm_max_fds(msg), scm->fp->count); + int __user *cmsg_data = CMSG_USER_DATA(cm); int err = 0, i; - if (MSG_CMSG_COMPAT & msg->msg_flags) { + if (msg->msg_flags & MSG_CMSG_COMPAT) { scm_detach_fds_compat(msg, scm); return; } - if (msg->msg_controllen > sizeof(struct cmsghdr)) - fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr)) - / sizeof(int)); - - if (fdnum < fdmax) - fdmax = fdnum; - - for (i=0, cmfptr =(int __user *)CMSG_USER_DATA(cm); ifp->fp[i], cmsg_data + i, o_flags); if (err) break; - err = get_unused_fd_flags(MSG_CMSG_CLOEXEC & msg->msg_flags - ? O_CLOEXEC : 0); - if (err < 0) - break; - new_fd = err; - err = put_user(new_fd, cmfptr); - if (err) { - put_unused_fd(new_fd); - break; - } - /* Bump the usage count and install the file. */ - sock = sock_from_file(fp[i], &err); - if (sock) { - sock_update_netprioidx(&sock->sk->sk_cgrp_data); - sock_update_classid(&sock->sk->sk_cgrp_data); - } - fd_install(new_fd, get_file(fp[i])); } - if (i > 0) - { - int cmlen = CMSG_LEN(i*sizeof(int)); + if (i > 0) { + int cmlen = CMSG_LEN(i * sizeof(int)); + err = put_user(SOL_SOCKET, &cm->cmsg_level); if (!err) err = put_user(SCM_RIGHTS, &cm->cmsg_type); if (!err) err = put_user(cmlen, &cm->cmsg_len); if (!err) { - cmlen = CMSG_SPACE(i*sizeof(int)); + cmlen = CMSG_SPACE(i * sizeof(int)); if (msg->msg_controllen < cmlen) cmlen = msg->msg_controllen; msg->msg_control += cmlen; msg->msg_controllen -= cmlen; } } - if (i < fdnum || (fdnum && fdmax <= 0)) + + if (i < scm->fp->count || (scm->fp->count && fdmax <= 0)) msg->msg_flags |= MSG_CTRUNC; /* - * All of the files that fit in the message have had their - * usage counts incremented, so we just free the list. + * All of the files that fit in the message have had their usage counts + * incremented, so we just free the list. */ __scm_destroy(scm); } -- cgit v1.2.3 From 1f466e1f15cf1dac7c86798d694649fc42cd868a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 May 2020 13:59:13 +0200 Subject: net: cleanly handle kernel vs user buffers for ->msg_control The msg_control field in struct msghdr can either contain a user pointer when used with the recvmsg system call, or a kernel pointer when used with sendmsg. To complicate things further kernel_recvmsg can stuff a kernel pointer in and then use set_fs to make the uaccess helpers accept it. Replace it with a union of a kernel pointer msg_control field, and a user pointer msg_control_user one, and allow kernel_recvmsg operate on a proper kernel pointer using a bitfield to override the normal choice of a user pointer for recvmsg. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/socket.h | 12 +++++++++++- net/compat.c | 5 +++-- net/core/scm.c | 49 ++++++++++++++++++++++++++++--------------------- net/ipv4/ip_sockglue.c | 3 ++- net/socket.c | 22 ++++++---------------- 5 files changed, 50 insertions(+), 41 deletions(-) (limited to 'net/core') diff --git a/include/linux/socket.h b/include/linux/socket.h index 4cc64d611cf4..04d2bc97f497 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -50,7 +50,17 @@ struct msghdr { void *msg_name; /* ptr to socket address structure */ int msg_namelen; /* size of socket address structure */ struct iov_iter msg_iter; /* data */ - void *msg_control; /* ancillary data */ + + /* + * Ancillary data. msg_control_user is the user buffer used for the + * recv* side when msg_control_is_user is set, msg_control is the kernel + * buffer used for all other cases. + */ + union { + void *msg_control; + void __user *msg_control_user; + }; + bool msg_control_is_user : 1; __kernel_size_t msg_controllen; /* ancillary data buffer length */ unsigned int msg_flags; /* flags on received message */ struct kiocb *msg_iocb; /* ptr to iocb for async requests */ diff --git a/net/compat.c b/net/compat.c index 4bed96e84d9a..69fc6d1e4e6e 100644 --- a/net/compat.c +++ b/net/compat.c @@ -56,7 +56,8 @@ int __get_compat_msghdr(struct msghdr *kmsg, if (kmsg->msg_namelen > sizeof(struct sockaddr_storage)) kmsg->msg_namelen = sizeof(struct sockaddr_storage); - kmsg->msg_control = compat_ptr(msg.msg_control); + kmsg->msg_control_is_user = true; + kmsg->msg_control_user = compat_ptr(msg.msg_control); kmsg->msg_controllen = msg.msg_controllen; if (save_addr) @@ -121,7 +122,7 @@ int get_compat_msghdr(struct msghdr *kmsg, ((ucmlen) >= sizeof(struct compat_cmsghdr) && \ (ucmlen) <= (unsigned long) \ ((mhdr)->msg_controllen - \ - ((char *)(ucmsg) - (char *)(mhdr)->msg_control))) + ((char __user *)(ucmsg) - (char __user *)(mhdr)->msg_control_user))) static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *msg, struct compat_cmsghdr __user *cmsg, int cmsg_len) diff --git a/net/core/scm.c b/net/core/scm.c index 168b006a52ff..a75cd637a71f 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -212,16 +212,12 @@ EXPORT_SYMBOL(__scm_send); int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) { - struct cmsghdr __user *cm - = (__force struct cmsghdr __user *)msg->msg_control; - struct cmsghdr cmhdr; int cmlen = CMSG_LEN(len); - int err; - if (MSG_CMSG_COMPAT & msg->msg_flags) + if (msg->msg_flags & MSG_CMSG_COMPAT) return put_cmsg_compat(msg, level, type, len, data); - if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { + if (!msg->msg_control || msg->msg_controllen < sizeof(struct cmsghdr)) { msg->msg_flags |= MSG_CTRUNC; return 0; /* XXX: return error? check spec. */ } @@ -229,23 +225,30 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_flags |= MSG_CTRUNC; cmlen = msg->msg_controllen; } - cmhdr.cmsg_level = level; - cmhdr.cmsg_type = type; - cmhdr.cmsg_len = cmlen; - - err = -EFAULT; - if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) - goto out; - if (copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) - goto out; - cmlen = CMSG_SPACE(len); - if (msg->msg_controllen < cmlen) - cmlen = msg->msg_controllen; + + if (msg->msg_control_is_user) { + struct cmsghdr __user *cm = msg->msg_control_user; + struct cmsghdr cmhdr; + + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr) || + copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) + return -EFAULT; + } else { + struct cmsghdr *cm = msg->msg_control; + + cm->cmsg_level = level; + cm->cmsg_type = type; + cm->cmsg_len = cmlen; + memcpy(CMSG_DATA(cm), data, cmlen - sizeof(*cm)); + } + + cmlen = min(CMSG_SPACE(len), msg->msg_controllen); msg->msg_control += cmlen; msg->msg_controllen -= cmlen; - err = 0; -out: - return err; + return 0; } EXPORT_SYMBOL(put_cmsg); @@ -328,6 +331,10 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) return; } + /* no use for FD passing from kernel space callers */ + if (WARN_ON_ONCE(!msg->msg_control_is_user)) + return; + for (i = 0; i < fdmax; i++) { err = __scm_install_fd(scm->fp->fp[i], cmsg_data + i, o_flags); if (err) diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index aa3fd61818c4..8206047d70b6 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1492,7 +1492,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; - msg.msg_control = (__force void *) optval; + msg.msg_control_is_user = true; + msg.msg_control_user = optval; msg.msg_controllen = len; msg.msg_flags = flags; diff --git a/net/socket.c b/net/socket.c index 2dd739fba866..1c9a7260a41d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -924,14 +924,9 @@ EXPORT_SYMBOL(sock_recvmsg); int kernel_recvmsg(struct socket *sock, struct msghdr *msg, struct kvec *vec, size_t num, size_t size, int flags) { - mm_segment_t oldfs = get_fs(); - int result; - + msg->msg_control_is_user = false; iov_iter_kvec(&msg->msg_iter, READ, vec, num, size); - set_fs(KERNEL_DS); - result = sock_recvmsg(sock, msg, flags); - set_fs(oldfs); - return result; + return sock_recvmsg(sock, msg, flags); } EXPORT_SYMBOL(kernel_recvmsg); @@ -2239,7 +2234,8 @@ int __copy_msghdr_from_user(struct msghdr *kmsg, if (copy_from_user(&msg, umsg, sizeof(*umsg))) return -EFAULT; - kmsg->msg_control = (void __force *)msg.msg_control; + kmsg->msg_control_is_user = true; + kmsg->msg_control_user = msg.msg_control; kmsg->msg_controllen = msg.msg_controllen; kmsg->msg_flags = msg.msg_flags; @@ -2331,16 +2327,10 @@ static int ____sys_sendmsg(struct socket *sock, struct msghdr *msg_sys, goto out; } err = -EFAULT; - /* - * Careful! Before this, msg_sys->msg_control contains a user pointer. - * Afterwards, it will be a kernel pointer. Thus the compiler-assisted - * checking falls down on this. - */ - if (copy_from_user(ctl_buf, - (void __user __force *)msg_sys->msg_control, - ctl_len)) + if (copy_from_user(ctl_buf, msg_sys->msg_control_user, ctl_len)) goto out_freectl; msg_sys->msg_control = ctl_buf; + msg_sys->msg_control_is_user = false; } msg_sys->msg_flags = flags; -- cgit v1.2.3 From 6e8a4f9dda3823274fa8a4c1aa5e6a93f9775749 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 13 May 2020 13:07:59 +0200 Subject: net: ignore sock_from_file errors in __scm_install_fd The code had historically been ignoring these errors, and my recent refactoring changed that, which broke ssh in some setups. Fixes: 2618d530dd8b ("net/scm: cleanup scm_detach_fds") Reported-by: Ido Schimmel Signed-off-by: Christoph Hellwig Tested-by: Ido Schimmel Tested-by: Ioana Ciornei Signed-off-by: David S. Miller --- net/core/scm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/scm.c b/net/core/scm.c index a75cd637a71f..875df1c2989d 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -307,7 +307,7 @@ static int __scm_install_fd(struct file *file, int __user *ufd, int o_flags) sock_update_classid(&sock->sk->sk_cgrp_data); } fd_install(new_fd, get_file(file)); - return error; + return 0; } static int scm_max_fds(struct msghdr *msg) -- cgit v1.2.3 From 5a46b062e28f57bffde767437fad3ab1d0cee2c7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 13 May 2020 10:28:22 -0700 Subject: devlink: refactor end checks in devlink_nl_cmd_region_read_dumpit Clean up after recent fixes, move address calculations around and change the variable init, so that we can have just one start_offset == end_offset check. Make the check a little stricter to preserve the -EINVAL error if requested start offset is larger than the region itself. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/devlink.c | 41 +++++++++------------- .../selftests/drivers/net/netdevsim/devlink.sh | 15 ++++++++ 2 files changed, 31 insertions(+), 25 deletions(-) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index 20f935fa29f5..7b76e5fffc10 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4215,7 +4215,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, struct nlattr **attrs, u64 start_offset, u64 end_offset, - bool dump, u64 *new_offset) { struct devlink_snapshot *snapshot; @@ -4230,9 +4229,6 @@ static int devlink_nl_region_read_snapshot_fill(struct sk_buff *skb, if (!snapshot) return -EINVAL; - if (end_offset > region->size || dump) - end_offset = region->size; - while (curr_offset < end_offset) { u32 data_size; u8 *data; @@ -4260,13 +4256,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); - u64 ret_offset, start_offset, end_offset = 0; + u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->attrs; struct devlink_region *region; struct nlattr *chunks_attr; const char *region_name; struct devlink *devlink; - bool dump = true; void *hdr; int err; @@ -4294,8 +4289,21 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } + if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && + attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { + if (!start_offset) + start_offset = + nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + + end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); + end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); + } + + if (end_offset > region->size) + end_offset = region->size; + /* return 0 if there is no further data to read */ - if (start_offset >= region->size) { + if (start_offset == end_offset) { err = 0; goto out_unlock; } @@ -4322,27 +4330,10 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; } - if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && - attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]) { - if (!start_offset) - start_offset = - nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - - end_offset = nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR]); - end_offset += nla_get_u64(attrs[DEVLINK_ATTR_REGION_CHUNK_LEN]); - dump = false; - - if (start_offset == end_offset) { - err = 0; - goto nla_put_failure; - } - } - err = devlink_nl_region_read_snapshot_fill(skb, devlink, region, attrs, start_offset, - end_offset, dump, - &ret_offset); + end_offset, &ret_offset); if (err && err != -EMSGSIZE) goto nla_put_failure; diff --git a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh index ad539eccddcb..de4b32fc4223 100755 --- a/tools/testing/selftests/drivers/net/netdevsim/devlink.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/devlink.sh @@ -146,6 +146,21 @@ regions_test() check_region_snapshot_count dummy post-first-request 3 + devlink region dump $DL_HANDLE/dummy snapshot 25 >> /dev/null + check_err $? "Failed to dump snapshot with id 25" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 0 len 1 >> /dev/null + check_err $? "Failed to read snapshot with id 25 (1 byte)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len 128 >> /dev/null + check_err $? "Failed to read snapshot with id 25 (128 bytes)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr 128 len $((1<<32)) >> /dev/null + check_err $? "Failed to read snapshot with id 25 (oversized)" + + devlink region read $DL_HANDLE/dummy snapshot 25 addr $((1<<32)) len 128 >> /dev/null 2>&1 + check_fail $? "Bad read of snapshot with id 25 did not fail" + devlink region del $DL_HANDLE/dummy snapshot 25 check_err $? "Failed to delete snapshot with id 25" -- cgit v1.2.3 From 7aebfa1b3885b5aa29fcb4a596d0485ac463bbe8 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Wed, 13 May 2020 18:50:27 -0700 Subject: bpf: Support narrow loads from bpf_sock_addr.user_port bpf_sock_addr.user_port supports only 4-byte load and it leads to ugly code in BPF programs, like: volatile __u32 user_port = ctx->user_port; __u16 port = bpf_ntohs(user_port); Since otherwise clang may optimize the load to be 2-byte and it's rejected by verifier. Add support for 1- and 2-byte loads same way as it's supported for other fields in bpf_sock_addr like user_ip4, msg_src_ip4, etc. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/c1e983f4c17573032601d0b2b1f9d1274f24bc16.1589420814.git.rdna@fb.com --- include/uapi/linux/bpf.h | 2 +- net/core/filter.c | 15 +++++++-------- tools/include/uapi/linux/bpf.h | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bfb31c1be219..85cfdffde182 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3728,7 +3728,7 @@ struct bpf_sock_addr { __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ - __u32 user_port; /* Allows 4-byte read and write. + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ diff --git a/net/core/filter.c b/net/core/filter.c index da0634979f53..1fe8c0c2d408 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7029,6 +7029,7 @@ static bool sock_addr_is_valid_access(int off, int size, case bpf_ctx_range(struct bpf_sock_addr, msg_src_ip4): case bpf_ctx_range_till(struct bpf_sock_addr, msg_src_ip6[0], msg_src_ip6[3]): + case bpf_ctx_range(struct bpf_sock_addr, user_port): if (type == BPF_READ) { bpf_ctx_record_field_size(info, size_default); @@ -7059,10 +7060,6 @@ static bool sock_addr_is_valid_access(int off, int size, return false; } break; - case bpf_ctx_range(struct bpf_sock_addr, user_port): - if (size != size_default) - return false; - break; case offsetof(struct bpf_sock_addr, sk): if (type != BPF_READ) return false; @@ -7958,8 +7955,8 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { + int off, port_size = sizeof_field(struct sockaddr_in6, sin6_port); struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct bpf_sock_addr, user_family): @@ -7994,9 +7991,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, offsetof(struct sockaddr_in6, sin6_port)); BUILD_BUG_ON(sizeof_field(struct sockaddr_in, sin_port) != sizeof_field(struct sockaddr_in6, sin6_port)); - SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, - struct sockaddr_in6, uaddr, - sin6_port, tmp_reg); + /* Account for sin6_port being smaller than user_port. */ + port_size = min(port_size, BPF_LDST_BYTES(si)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_port, bytes_to_bpf_size(port_size), 0, tmp_reg); break; case offsetof(struct bpf_sock_addr, family): diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index bfb31c1be219..85cfdffde182 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3728,7 +3728,7 @@ struct bpf_sock_addr { __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. * Stored in network byte order. */ - __u32 user_port; /* Allows 4-byte read and write. + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. * Stored in network byte order */ __u32 family; /* Allows 4-byte read, but no write */ -- cgit v1.2.3 From d56c2f95adb3d401bf982b6cf8fc4bb6d2f7acdd Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:45 -0700 Subject: bpf: Allow sk lookup helpers in cgroup skb Currently sk lookup helpers are allowed in tc, xdp, sk skb, and cgroup sock_addr programs. But they would be useful in cgroup skb as well so that for example cgroup skb ingress program can lookup a peer socket a packet comes from on same host and make a decision whether to allow or deny this packet based on the properties of that socket, e.g. cgroup that peer socket belongs to. Allow the following sk lookup helpers in cgroup skb: * bpf_sk_lookup_tcp; * bpf_sk_lookup_udp; * bpf_sk_release; * bpf_skc_lookup_tcp. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/f8c7ee280f1582b586629436d777b6db00597d63.1589486450.git.rdna@fb.com --- net/core/filter.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 1fe8c0c2d408..9c3eada5c86c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6159,6 +6159,14 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_cgroup_id_proto; #endif #ifdef CONFIG_INET + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; + case BPF_FUNC_skc_lookup_tcp: + return &bpf_skc_lookup_tcp_proto; case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; case BPF_FUNC_get_listener_sock: -- cgit v1.2.3 From 06d3e4c9f11afc849dc201ecf9ef7a43eeb1dddd Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:46 -0700 Subject: bpf: Allow skb_ancestor_cgroup_id helper in cgroup skb cgroup skb programs already can use bpf_skb_cgroup_id. Allow bpf_skb_ancestor_cgroup_id as well so that container policies can be implemented for a container that can have sub-cgroups dynamically created, but policies should still be implemented based on cgroup id of container itself not on an id of a sub-cgroup. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/8874194d6041eba190356453ea9f6071edf5f658.1589486450.git.rdna@fb.com --- net/core/filter.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 9c3eada5c86c..a47dc5b9dad4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6157,6 +6157,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_SOCK_CGROUP_DATA case BPF_FUNC_skb_cgroup_id: return &bpf_skb_cgroup_id_proto; + case BPF_FUNC_skb_ancestor_cgroup_id: + return &bpf_skb_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: -- cgit v1.2.3 From f307fa2cb4c935f7f1ff0aeb880c7b44fb9a642b Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Thu, 14 May 2020 13:03:47 -0700 Subject: bpf: Introduce bpf_sk_{, ancestor_}cgroup_id helpers With having ability to lookup sockets in cgroup skb programs it becomes useful to access cgroup id of retrieved sockets so that policies can be implemented based on origin cgroup of such socket. For example, a container running in a cgroup can have cgroup skb ingress program that can lookup peer socket that is sending packets to a process inside the container and decide whether those packets should be allowed or denied based on cgroup id of the peer. More specifically such ingress program can implement intra-host policy "allow incoming packets only from this same container and not from any other container on same host" w/o relying on source IP addresses since quite often it can be the case that containers share same IP address on the host. Introduce two new helpers for this use-case: bpf_sk_cgroup_id() and bpf_sk_ancestor_cgroup_id(). These helpers are similar to existing bpf_skb_{,ancestor_}cgroup_id helpers with the only difference that sk is used to get cgroup id instead of skb, and share code with them. See documentation in UAPI for more details. Signed-off-by: Andrey Ignatov Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/f5884981249ce911f63e9b57ecd5d7d19154ff39.1589486450.git.rdna@fb.com --- include/uapi/linux/bpf.h | 36 ++++++++++++++++++++++++- net/core/filter.c | 60 +++++++++++++++++++++++++++++++++++------- tools/include/uapi/linux/bpf.h | 36 ++++++++++++++++++++++++- 3 files changed, 121 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 85cfdffde182..146c742f1d49 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3121,6 +3121,38 @@ union bpf_attr { * 0 on success, or a negative error in case of failure: * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3250,7 +3282,9 @@ union bpf_attr { FN(sk_assign), \ FN(ktime_get_boot_ns), \ FN(seq_printf), \ - FN(seq_write), + FN(seq_write), \ + FN(sk_cgroup_id), \ + FN(sk_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index a47dc5b9dad4..5815902bb617 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4003,16 +4003,22 @@ static const struct bpf_func_proto bpf_skb_under_cgroup_proto = { }; #ifdef CONFIG_SOCK_CGROUP_DATA +static inline u64 __bpf_sk_cgroup_id(struct sock *sk) +{ + struct cgroup *cgrp; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + return cgroup_id(cgrp); +} + BPF_CALL_1(bpf_skb_cgroup_id, const struct sk_buff *, skb) { struct sock *sk = skb_to_full_sk(skb); - struct cgroup *cgrp; if (!sk || !sk_fullsock(sk)) return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - return cgroup_id(cgrp); + return __bpf_sk_cgroup_id(sk); } static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { @@ -4022,16 +4028,12 @@ static const struct bpf_func_proto bpf_skb_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, - ancestor_level) +static inline u64 __bpf_sk_ancestor_cgroup_id(struct sock *sk, + int ancestor_level) { - struct sock *sk = skb_to_full_sk(skb); struct cgroup *ancestor; struct cgroup *cgrp; - if (!sk || !sk_fullsock(sk)) - return 0; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); ancestor = cgroup_ancestor(cgrp, ancestor_level); if (!ancestor) @@ -4040,6 +4042,17 @@ BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, return cgroup_id(ancestor); } +BPF_CALL_2(bpf_skb_ancestor_cgroup_id, const struct sk_buff *, skb, int, + ancestor_level) +{ + struct sock *sk = skb_to_full_sk(skb); + + if (!sk || !sk_fullsock(sk)) + return 0; + + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .func = bpf_skb_ancestor_cgroup_id, .gpl_only = false, @@ -4047,6 +4060,31 @@ static const struct bpf_func_proto bpf_skb_ancestor_cgroup_id_proto = { .arg1_type = ARG_PTR_TO_CTX, .arg2_type = ARG_ANYTHING, }; + +BPF_CALL_1(bpf_sk_cgroup_id, struct sock *, sk) +{ + return __bpf_sk_cgroup_id(sk); +} + +static const struct bpf_func_proto bpf_sk_cgroup_id_proto = { + .func = bpf_sk_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, +}; + +BPF_CALL_2(bpf_sk_ancestor_cgroup_id, struct sock *, sk, int, ancestor_level) +{ + return __bpf_sk_ancestor_cgroup_id(sk, ancestor_level); +} + +static const struct bpf_func_proto bpf_sk_ancestor_cgroup_id_proto = { + .func = bpf_sk_ancestor_cgroup_id, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, + .arg2_type = ARG_ANYTHING, +}; #endif static unsigned long bpf_xdp_copy(void *dst_buff, const void *src_buff, @@ -6159,6 +6197,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_cgroup_id_proto; case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; + case BPF_FUNC_sk_cgroup_id: + return &bpf_sk_cgroup_id_proto; + case BPF_FUNC_sk_ancestor_cgroup_id: + return &bpf_sk_ancestor_cgroup_id_proto; #endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 85cfdffde182..146c742f1d49 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3121,6 +3121,38 @@ union bpf_attr { * 0 on success, or a negative error in case of failure: * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(struct bpf_sock *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a full socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3250,7 +3282,9 @@ union bpf_attr { FN(sk_assign), \ FN(ktime_get_boot_ns), \ FN(seq_printf), \ - FN(seq_write), + FN(seq_write), \ + FN(sk_cgroup_id), \ + FN(sk_ancestor_cgroup_id), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.2.3 From a075767bbdc659066b89be282c8377fa880e9dc4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:28 +0200 Subject: net: XDP-generic determining XDP frame size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The SKB "head" pointer points to the data area that contains skb_shared_info, that can be found via skb_end_pointer(). Given xdp->data_hard_start have been established (basically pointing to skb->head), frame size is between skb_end_pointer() and data_hard_start, plus the size reserved to skb_shared_info. Change the bpf_xdp_adjust_tail offset adjust of skb->len, to be a positive offset number on grow, and negative number on shrink. As this seems more natural when reading the code. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945336804.97035.7164852191163722056.stgit@firesoul --- net/core/dev.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4c91de39890a..f937a3ff668d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4617,6 +4617,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp->data_meta = xdp->data; xdp->data_end = xdp->data + hlen; xdp->data_hard_start = skb->data - skb_headroom(skb); + + /* SKB "head" area always have tailroom for skb_shared_info */ + xdp->frame_sz = (void *)skb_end_pointer(skb) - xdp->data_hard_start; + xdp->frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; @@ -4640,14 +4645,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, skb_reset_network_header(skb); } - /* check if bpf_xdp_adjust_tail was used. it can only "shrink" - * pckt. - */ - off = orig_data_end - xdp->data_end; + /* check if bpf_xdp_adjust_tail was used */ + off = xdp->data_end - orig_data_end; if (off != 0) { skb_set_tail_pointer(skb, xdp->data_end - xdp->data); - skb->len -= off; - + skb->len += off; /* positive on grow, negative on shrink */ } /* check if XDP changed eth hdr such SKB needs update */ -- cgit v1.2.3 From 34cc0b338a61de3eee3a2bfcaf4f9d6e9fae091a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:49:33 +0200 Subject: xdp: Xdp_frame add member frame_sz and handle in convert_to_xdp_frame MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use hole in struct xdp_frame, when adding member frame_sz, which keeps same sizeof struct (32 bytes) Drivers ixgbe and sfc had bug cases where the necessary/expected tailroom was not reserved. This can lead to some hard to catch memory corruption issues. Having the drivers frame_sz this can be detected when packet length/end via xdp->data_end exceed the xdp_data_hard_end pointer, which accounts for the reserved the tailroom. When detecting this driver issue, simply fail the conversion with NULL, which results in feedback to driver (failing xdp_do_redirect()) causing driver to drop packet. Given the lack of consistent XDP stats, this can be hard to troubleshoot. And given this is a driver bug, we want to generate some more noise in form of a WARN stack dump (to ID the driver code that inlined convert_to_xdp_frame). Inlining the WARN macro is problematic, because it adds an asm instruction (on Intel CPUs ud2) what influence instruction cache prefetching. Thus, introduce xdp_warn and macro XDP_WARN, to avoid this and at the same time make identifying the function and line of this inlined function easier. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945337313.97035.10015729316710496600.stgit@firesoul --- include/net/xdp.h | 14 +++++++++++++- net/core/xdp.c | 8 ++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/include/net/xdp.h b/include/net/xdp.h index a764af4ae0ea..3094fccf5a88 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -89,7 +89,8 @@ struct xdp_frame { void *data; u16 len; u16 headroom; - u16 metasize; + u32 metasize:8; + u32 frame_sz:24; /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, * while mem info is valid on remote CPU. */ @@ -104,6 +105,10 @@ static inline void xdp_scrub_frame(struct xdp_frame *frame) frame->dev_rx = NULL; } +/* Avoids inlining WARN macro in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line); +#define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) + struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); /* Convert xdp_buff to xdp_frame */ @@ -124,6 +129,12 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) if (unlikely((headroom - metasize) < sizeof(*xdp_frame))) return NULL; + /* Catch if driver didn't reserve tailroom for skb_shared_info */ + if (unlikely(xdp->data_end > xdp_data_hard_end(xdp))) { + XDP_WARN("Driver BUG: missing reserved tailroom"); + return NULL; + } + /* Store info in top of packet */ xdp_frame = xdp->data_hard_start; @@ -131,6 +142,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) xdp_frame->len = xdp->data_end - xdp->data; xdp_frame->headroom = headroom - sizeof(*xdp_frame); xdp_frame->metasize = metasize; + xdp_frame->frame_sz = xdp->frame_sz; /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ xdp_frame->mem = xdp->rxq->mem; diff --git a/net/core/xdp.c b/net/core/xdp.c index 4c7ea85486af..490b8f5fa8ee 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -496,3 +497,10 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); + +/* Used by XDP_WARN macro, to avoid inlining WARN() in fast-path */ +void xdp_warn(const char *msg, const char *func, const int line) +{ + WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); +}; +EXPORT_SYMBOL_GPL(xdp_warn); -- cgit v1.2.3 From c8741e2bfe872425ea6f10bb6f7dc1d67bc60c3a Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:25 +0200 Subject: xdp: Allow bpf_xdp_adjust_tail() to grow packet size Finally, after all drivers have a frame size, allow BPF-helper bpf_xdp_adjust_tail() to grow or extend packet size at frame tail. Remember that helper/macro xdp_data_hard_end have reserved some tailroom. Thus, this helper makes sure that the BPF-prog don't have access to this tailroom area. V2: Remove one chicken check and use WARN_ONCE for other Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/158945348530.97035.12577148209134239291.stgit@firesoul --- include/uapi/linux/bpf.h | 4 ++-- net/core/filter.c | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 32cbf36c7729..b9b8a0f63b91 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2015,8 +2015,8 @@ union bpf_attr { * int bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) * Description * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is - * only possible to shrink the packet as of this writing, - * therefore *delta* must be a negative integer. + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers diff --git a/net/core/filter.c b/net/core/filter.c index 5815902bb617..e7b033dad44e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3411,12 +3411,19 @@ static const struct bpf_func_proto bpf_xdp_adjust_head_proto = { BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) { + void *data_hard_end = xdp_data_hard_end(xdp); /* use xdp->frame_sz */ void *data_end = xdp->data_end + offset; - /* only shrinking is allowed for now. */ - if (unlikely(offset >= 0)) + /* Notice that xdp_data_hard_end have reserved some tailroom */ + if (unlikely(data_end > data_hard_end)) return -EINVAL; + /* ALL drivers MUST init xdp->frame_sz, chicken check below */ + if (unlikely(xdp->frame_sz > PAGE_SIZE)) { + WARN_ONCE(1, "Too BIG xdp->frame_sz = %d\n", xdp->frame_sz); + return -EINVAL; + } + if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; -- cgit v1.2.3 From ddb47d518ca10948d1f64a983cb9274720f691cd Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Thu, 14 May 2020 12:51:30 +0200 Subject: xdp: Clear grow memory in bpf_xdp_adjust_tail() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clearing memory of tail when grow happens, because it is too easy to write a XDP_PASS program that extend the tail, which expose this memory to users that can run tcpdump. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/158945349039.97035.5262100484553494.stgit@firesoul --- net/core/filter.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index e7b033dad44e..a85eb538d4d6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3427,6 +3427,10 @@ BPF_CALL_2(bpf_xdp_adjust_tail, struct xdp_buff *, xdp, int, offset) if (unlikely(data_end < xdp->data + ETH_HLEN)) return -EINVAL; + /* Clear memory area on grow, can contain uninit kernel memory */ + if (offset > 0) + memset(xdp->data_end, 0, offset); + xdp->data_end = data_end; return 0; -- cgit v1.2.3 From 2c78ee898d8f10ae6fb2fa23a3fbaec96b1b7366 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 13 May 2020 16:03:54 -0700 Subject: bpf: Implement CAP_BPF Implement permissions as stated in uapi/linux/capability.h In order to do that the verifier allow_ptr_leaks flag is split into four flags and they are set as: env->allow_ptr_leaks = bpf_allow_ptr_leaks(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); env->bpf_capable = bpf_capable(); The first three currently equivalent to perfmon_capable(), since leaking kernel pointers and reading kernel memory via side channel attacks is roughly equivalent to reading kernel memory with cap_perfmon. 'bpf_capable' enables bounded loops, precision tracking, bpf to bpf calls and other verifier features. 'allow_ptr_leaks' enable ptr leaks, ptr conversions, subtraction of pointers. 'bypass_spec_v1' disables speculative analysis in the verifier, run time mitigations in bpf array, and enables indirect variable access in bpf programs. 'bypass_spec_v4' disables emission of sanitation code by the verifier. That means that the networking BPF program loaded with CAP_BPF + CAP_NET_ADMIN will have speculative checks done by the verifier and other spectre mitigation applied. Such networking BPF program will not be able to leak kernel pointers and will not be able to access arbitrary kernel memory. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200513230355.7858-3-alexei.starovoitov@gmail.com --- drivers/media/rc/bpf-lirc.c | 2 +- include/linux/bpf.h | 18 ++++++++- include/linux/bpf_verifier.h | 3 ++ kernel/bpf/arraymap.c | 10 ++--- kernel/bpf/bpf_struct_ops.c | 2 +- kernel/bpf/core.c | 2 +- kernel/bpf/cpumap.c | 2 +- kernel/bpf/hashtab.c | 4 +- kernel/bpf/helpers.c | 4 +- kernel/bpf/lpm_trie.c | 2 +- kernel/bpf/map_in_map.c | 2 +- kernel/bpf/queue_stack_maps.c | 2 +- kernel/bpf/reuseport_array.c | 2 +- kernel/bpf/stackmap.c | 2 +- kernel/bpf/syscall.c | 89 +++++++++++++++++++++++++++++++++---------- kernel/bpf/verifier.c | 37 +++++++++--------- kernel/trace/bpf_trace.c | 3 ++ net/core/bpf_sk_storage.c | 4 +- net/core/filter.c | 4 +- 19 files changed, 134 insertions(+), 60 deletions(-) (limited to 'net/core') diff --git a/drivers/media/rc/bpf-lirc.c b/drivers/media/rc/bpf-lirc.c index 069c42f22a8c..5bb144435c16 100644 --- a/drivers/media/rc/bpf-lirc.c +++ b/drivers/media/rc/bpf-lirc.c @@ -110,7 +110,7 @@ lirc_mode2_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) + if (perfmon_capable()) return bpf_get_trace_printk_proto(); /* fall through */ default: diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c45d198ac38c..efe8836b5c48 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -19,6 +19,7 @@ #include #include #include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -119,7 +120,7 @@ struct bpf_map { struct bpf_map_memory memory; char name[BPF_OBJ_NAME_LEN]; u32 btf_vmlinux_value_type_id; - bool unpriv_array; + bool bypass_spec_v1; bool frozen; /* write-once; write-protected by freeze_mutex */ /* 22 bytes hole */ @@ -1095,6 +1096,21 @@ struct bpf_map *bpf_map_get_curr_or_next(u32 *id); extern int sysctl_unprivileged_bpf_disabled; +static inline bool bpf_allow_ptr_leaks(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v1(void) +{ + return perfmon_capable(); +} + +static inline bool bpf_bypass_spec_v4(void) +{ + return perfmon_capable(); +} + int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 6abd5a778fcd..ea833087e853 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -375,6 +375,9 @@ struct bpf_verifier_env { u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool bpf_capable; + bool bypass_spec_v1; + bool bypass_spec_v4; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ const struct bpf_line_info *prev_linfo; diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 95d77770353c..1d5bb0d983b2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -77,7 +77,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int ret, numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); + bool bypass_spec_v1 = bpf_bypass_spec_v1(); u64 cost, array_size, mask64; struct bpf_map_memory mem; struct bpf_array *array; @@ -95,7 +95,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) mask64 -= 1; index_mask = mask64; - if (unpriv) { + if (!bypass_spec_v1) { /* round up array size to nearest power of 2, * since cpu will speculate within index_mask limits */ @@ -149,7 +149,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); } array->index_mask = index_mask; - array->map.unpriv_array = unpriv; + array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); @@ -219,7 +219,7 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { @@ -1053,7 +1053,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - if (map->unpriv_array) { + if (!map->bypass_spec_v1) { *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); } else { diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 26cb51f2db72..c6b0decaa46a 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -557,7 +557,7 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) struct bpf_map *map; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); st_ops = bpf_struct_ops_find_value(attr->btf_vmlinux_value_type_id); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 6aa11de67315..c40ff4cf9880 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { if (!bpf_prog_kallsyms_candidate(fp) || - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return; bpf_prog_ksym_set_addr(fp); diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a71790dab12d..8b85bfddfac7 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -85,7 +85,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u64 cost; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d541c8486c95..b4b288a3c3c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -359,9 +359,9 @@ static int htab_map_alloc_check(union bpf_attr *attr) BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) != offsetof(struct htab_elem, hash_node.pprev)); - if (lru && !capable(CAP_SYS_ADMIN)) + if (lru && !bpf_capable()) /* LRU implementation is much complicated than other - * maps. Hence, limit to CAP_SYS_ADMIN for now. + * maps. Hence, limit to CAP_BPF. */ return -EPERM; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 5c0290e0696e..886949fdcece 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -633,7 +633,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) break; } - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return NULL; switch (func_id) { @@ -642,6 +642,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: + if (!perfmon_capable()) + return NULL; return bpf_get_trace_printk_proto(); case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 65c236cf341e..c8cc4e4cf98d 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) u64 cost = sizeof(*trie), cost_per_node; int ret; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); /* check sanity of attributes */ diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index b3c48d1533cb..17738c93bec8 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -60,7 +60,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) /* Misc members not needed in bpf_map_meta_equal() check. */ inner_map_meta->ops = inner_map->ops; if (inner_map->ops == &array_map_ops) { - inner_map_meta->unpriv_array = inner_map->unpriv_array; + inner_map_meta->bypass_spec_v1 = inner_map->bypass_spec_v1; container_of(inner_map_meta, struct bpf_array, map)->index_mask = container_of(inner_map, struct bpf_array, map)->index_mask; } diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 30e1373fd437..05c8e043b9d2 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) /* Called from syscall */ static int queue_stack_map_alloc_check(union bpf_attr *attr) { - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; /* check sanity of attributes */ diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index 01badd3eda7a..21cde24386db 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) struct bpf_map_memory mem; u64 array_size; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); array_size = sizeof(*array); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index db76339fe358..7b8381ce40a0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -93,7 +93,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) u64 cost, n_buckets; int err; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); if (attr->map_flags & ~STACK_CREATE_FLAG_MASK) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index de2a75500233..79bcd8d056d2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1534,7 +1534,7 @@ static int map_freeze(const union bpf_attr *attr) err = -EBUSY; goto err_put; } - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { err = -EPERM; goto err_put; } @@ -2009,6 +2009,55 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } +static bool is_net_admin_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + case BPF_PROG_TYPE_XDP: + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_LWT_SEG6LOCAL: + case BPF_PROG_TYPE_SK_SKB: + case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_LIRC_MODE2: + case BPF_PROG_TYPE_FLOW_DISSECTOR: + case BPF_PROG_TYPE_CGROUP_DEVICE: + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + case BPF_PROG_TYPE_CGROUP_SOCKOPT: + case BPF_PROG_TYPE_CGROUP_SYSCTL: + case BPF_PROG_TYPE_SOCK_OPS: + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + case BPF_PROG_TYPE_CGROUP_SKB: + /* always unpriv */ + case BPF_PROG_TYPE_SK_REUSEPORT: + /* equivalent to SOCKET_FILTER. need CAP_BPF only */ + default: + return false; + } +} + +static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + case BPF_PROG_TYPE_TRACING: + case BPF_PROG_TYPE_LSM: + case BPF_PROG_TYPE_STRUCT_OPS: /* has access to struct sock */ + case BPF_PROG_TYPE_EXT: /* extends any prog */ + return true; + default: + return false; + } +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD attach_prog_fd @@ -2031,7 +2080,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && (attr->prog_flags & BPF_F_ANY_ALIGNMENT) && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) return -EPERM; /* copy eBPF program license from user space */ @@ -2044,11 +2093,16 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) is_gpl = license_is_gpl_compatible(license); if (attr->insn_cnt == 0 || - attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) + attr->insn_cnt > (bpf_capable() ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS)) return -E2BIG; if (type != BPF_PROG_TYPE_SOCKET_FILTER && type != BPF_PROG_TYPE_CGROUP_SKB && - !capable(CAP_SYS_ADMIN)) + !bpf_capable()) + return -EPERM; + + if (is_net_admin_prog_type(type) && !capable(CAP_NET_ADMIN)) + return -EPERM; + if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; bpf_prog_load_fixup_attach_type(attr); @@ -2682,6 +2736,11 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, case BPF_PROG_TYPE_CGROUP_SOCKOPT: return attach_type == prog->expected_attach_type ? 0 : -EINVAL; case BPF_PROG_TYPE_CGROUP_SKB: + if (!capable(CAP_NET_ADMIN)) + /* cg-skb progs can be loaded by unpriv user. + * check permissions at attach time. + */ + return -EPERM; return prog->enforce_expected_attach_type && prog->expected_attach_type != attach_type ? -EINVAL : 0; @@ -2747,9 +2806,6 @@ static int bpf_prog_attach(const union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_ATTACH)) return -EINVAL; @@ -2804,9 +2860,6 @@ static int bpf_prog_detach(const union bpf_attr *attr) { enum bpf_prog_type ptype; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_PROG_DETACH)) return -EINVAL; @@ -2819,6 +2872,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: return lirc_prog_detach(attr); case BPF_PROG_TYPE_FLOW_DISSECTOR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; return skb_flow_dissector_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2882,8 +2937,6 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -3184,7 +3237,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; - if (!capable(CAP_SYS_ADMIN)) { + if (!bpf_capable()) { info.jited_prog_len = 0; info.xlated_prog_len = 0; info.nr_jited_ksyms = 0; @@ -3543,7 +3596,7 @@ static int bpf_btf_load(const union bpf_attr *attr) if (CHECK_ATTR(BPF_BTF_LOAD)) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; return btf_new_fd(attr); @@ -3766,9 +3819,6 @@ static int link_create(union bpf_attr *attr) struct bpf_prog *prog; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_CREATE)) return -EINVAL; @@ -3817,9 +3867,6 @@ static int link_update(union bpf_attr *attr) u32 flags; int ret; - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - if (CHECK_ATTR(BPF_LINK_UPDATE)) return -EINVAL; @@ -3988,7 +4035,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz union bpf_attr attr; int err; - if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN)) + if (sysctl_unprivileged_bpf_disabled && !bpf_capable()) return -EPERM; err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a3f2af756fd6..180933f6fba9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1295,7 +1295,7 @@ static void __mark_reg_unknown(const struct bpf_verifier_env *env, reg->type = SCALAR_VALUE; reg->var_off = tnum_unknown; reg->frameno = 0; - reg->precise = env->subprog_cnt > 1 || !env->allow_ptr_leaks; + reg->precise = env->subprog_cnt > 1 || !env->bpf_capable; __mark_reg_unbounded(reg); } @@ -1427,8 +1427,9 @@ static int check_subprogs(struct bpf_verifier_env *env) continue; if (insn[i].src_reg != BPF_PSEUDO_CALL) continue; - if (!env->allow_ptr_leaks) { - verbose(env, "function calls to other bpf functions are allowed for root only\n"); + if (!env->bpf_capable) { + verbose(env, + "function calls to other bpf functions are allowed for CAP_BPF and CAP_SYS_ADMIN\n"); return -EPERM; } ret = add_subprog(env, i + insn[i].imm + 1); @@ -1962,8 +1963,7 @@ static int __mark_chain_precision(struct bpf_verifier_env *env, int regno, bool new_marks = false; int i, err; - if (!env->allow_ptr_leaks) - /* backtracking is root only for now */ + if (!env->bpf_capable) return 0; func = st->frame[st->curframe]; @@ -2211,7 +2211,7 @@ static int check_stack_write(struct bpf_verifier_env *env, reg = &cur->regs[value_regno]; if (reg && size == BPF_REG_SIZE && register_is_const(reg) && - !register_is_null(reg) && env->allow_ptr_leaks) { + !register_is_null(reg) && env->bpf_capable) { if (dst_reg != BPF_REG_FP) { /* The backtracking logic can only recognize explicit * stack slot address like [fp - 8]. Other spill of @@ -2237,7 +2237,7 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EINVAL; } - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v4) { bool sanitize = false; if (state->stack[spi].slot_type[0] == STACK_SPILL && @@ -3432,7 +3432,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, * Spectre masking for stack ALU. * See also retrieve_ptr_limit(). */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); @@ -4435,10 +4435,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (!BPF_MAP_PTR(aux->map_ptr_state)) bpf_map_ptr_store(aux, meta->map_ptr, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); else if (BPF_MAP_PTR(aux->map_ptr_state) != meta->map_ptr) bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, - meta->map_ptr->unpriv_array); + !meta->map_ptr->bypass_spec_v1); return 0; } @@ -4807,7 +4807,7 @@ static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, const struct bpf_insn *insn) { - return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; + return env->bypass_spec_v1 || BPF_SRC(insn->code) == BPF_K; } static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, @@ -5117,7 +5117,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, /* For unprivileged we require that resulting offset must be in bounds * in order to be able to sanitize access later on. */ - if (!env->allow_ptr_leaks) { + if (!env->bypass_spec_v1) { if (dst_reg->type == PTR_TO_MAP_VALUE && check_map_access(env, dst, dst_reg->off, 1, false)) { verbose(env, "R%d pointer arithmetic of map value goes out of range, " @@ -7244,7 +7244,7 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, insn_stack[env->cfg.cur_stack++] = w; return 1; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { - if (loop_ok && env->allow_ptr_leaks) + if (loop_ok && env->bpf_capable) return 0; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); @@ -8353,7 +8353,7 @@ next: if (env->max_states_per_insn < states_cnt) env->max_states_per_insn = states_cnt; - if (!env->allow_ptr_leaks && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) + if (!env->bpf_capable && states_cnt > BPF_COMPLEXITY_LIMIT_STATES) return push_jmp_history(env, cur); if (!add_new_state) @@ -10014,7 +10014,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code = BPF_JMP | BPF_TAIL_CALL; aux = &env->insn_aux_data[i + delta]; - if (env->allow_ptr_leaks && !expect_blinding && + if (env->bpf_capable && !expect_blinding && prog->jit_requested && !bpf_map_key_poisoned(aux) && !bpf_map_ptr_poisoned(aux) && @@ -10758,7 +10758,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->insn_aux_data[i].orig_idx = i; env->prog = *prog; env->ops = bpf_verifier_ops[env->prog->type]; - is_priv = capable(CAP_SYS_ADMIN); + is_priv = bpf_capable(); if (!btf_vmlinux && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) { mutex_lock(&bpf_verifier_lock); @@ -10799,7 +10799,10 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (attr->prog_flags & BPF_F_ANY_ALIGNMENT) env->strict_alignment = false; - env->allow_ptr_leaks = is_priv; + env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->bypass_spec_v1 = bpf_bypass_spec_v1(); + env->bypass_spec_v4 = bpf_bypass_spec_v4(); + env->bpf_capable = bpf_capable(); if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d961428fb5b6..9a84d7fb4869 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -315,6 +315,9 @@ static const struct bpf_func_proto bpf_probe_write_user_proto = { static const struct bpf_func_proto *bpf_get_probe_write_proto(void) { + if (!capable(CAP_SYS_ADMIN)) + return NULL; + pr_warn_ratelimited("%s[%d] is installing a program with bpf_probe_write_user helper that may corrupt user memory!", current->comm, task_pid_nr(current)); diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 756b63b6f7b3..d2c4d16dadba 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -625,7 +625,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr) !attr->btf_key_type_id || !attr->btf_value_type_id) return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return -EPERM; if (attr->value_size > MAX_VALUE_SIZE) @@ -978,7 +978,7 @@ bpf_sk_storage_diag_alloc(const struct nlattr *nla_stgs) /* bpf_sk_storage_map is currently limited to CAP_SYS_ADMIN as * the map_alloc_check() side also does. */ - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return ERR_PTR(-EPERM); nla_for_each_nested(nla, nla_stgs, rem) { diff --git a/net/core/filter.c b/net/core/filter.c index a85eb538d4d6..f8a3c7e9d027 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6687,7 +6687,7 @@ static bool cg_skb_is_valid_access(int off, int size, return false; case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; } @@ -6699,7 +6699,7 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): break; case bpf_ctx_range(struct __sk_buff, tstamp): - if (!capable(CAP_SYS_ADMIN)) + if (!bpf_capable()) return false; break; default: -- cgit v1.2.3 From 2e186a2cf8c785f38ef3237e83f8921f82f6e2b7 Mon Sep 17 00:00:00 2001 From: Tobias Waldekranz Date: Fri, 15 May 2020 11:52:52 +0200 Subject: net: core: recursively find netdev by device node The assumption that a device node is associated either with the netdev's device, or the parent of that device, does not hold for all drivers. E.g. Freescale's DPAA has two layers of platform devices above the netdev. Instead, recursively walk up the tree from the netdev, allowing any parent to match against the sought after node. Signed-off-by: Tobias Waldekranz Reviewed-by: Florian Fainelli Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 880e89c894f6..e353b822bb15 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1805,12 +1805,12 @@ static struct class net_class __ro_after_init = { #ifdef CONFIG_OF_NET static int of_dev_node_match(struct device *dev, const void *data) { - int ret = 0; - - if (dev->parent) - ret = dev->parent->of_node == data; + for (; dev; dev = dev->parent) { + if (dev->of_node == data) + return 1; + } - return ret == 0 ? dev->of_node == data : ret; + return 0; } /* -- cgit v1.2.3 From 9efd6a3cecdde984d67e63d17fe6af53c7c50968 Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 13 May 2020 15:58:43 +0200 Subject: netns: enable to inherit devconf from current netns The goal is to be able to inherit the initial devconf parameters from the current netns, ie the netns where this new netns has been created. This is useful in a containers environment where /proc/sys is read only. For example, if a pod is created with specifics devconf parameters and has the capability to create netns, the user expects to get the same parameters than his 'init_net', which is not the real init_net in this case. Signed-off-by: Nicolas Dichtel Signed-off-by: David S. Miller --- Documentation/admin-guide/sysctl/net.rst | 4 +++- net/core/sysctl_net_core.c | 4 +++- net/ipv4/devinet.c | 23 ++++++++++++++++++----- net/ipv6/addrconf.c | 23 ++++++++++++++++++++--- 4 files changed, 44 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/Documentation/admin-guide/sysctl/net.rst b/Documentation/admin-guide/sysctl/net.rst index 2ad1b77a7182..42cd04bca548 100644 --- a/Documentation/admin-guide/sysctl/net.rst +++ b/Documentation/admin-guide/sysctl/net.rst @@ -339,7 +339,9 @@ settings from init_net and for IPv6 we reset all settings to default. If set to 1, both IPv4 and IPv6 settings are forced to inherit from current ones in init_net. If set to 2, both IPv4 and IPv6 settings are -forced to reset to their default values. +forced to reset to their default values. If set to 3, both IPv4 and IPv6 +settings are forced to inherit from current ones in the netns where this +new netns has been created. Default : 0 (for compatibility reasons) diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 0ddb13a6282b..b109cc8a6dd8 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -23,6 +23,7 @@ #include static int two __maybe_unused = 2; +static int three = 3; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -39,6 +40,7 @@ EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net); * IPv6: reset all settings to default * 1 - Both inherit all current settings from init_net * 2 - Both reset all settings to default + * 3 - Both inherit all settings from current netns */ int sysctl_devconf_inherit_init_net __read_mostly; EXPORT_SYMBOL(sysctl_devconf_inherit_init_net); @@ -553,7 +555,7 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &two, + .extra2 = &three, }, { .procname = "high_order_alloc_disable", diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index fc94f82f82c7..f048d0a188b7 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -2666,11 +2666,24 @@ static __net_init int devinet_init_net(struct net *net) tbl[0].extra2 = net; #endif - if ((!IS_ENABLED(CONFIG_SYSCTL) || - sysctl_devconf_inherit_init_net != 2) && - !net_eq(net, &init_net)) { - memcpy(all, init_net.ipv4.devconf_all, sizeof(ipv4_devconf)); - memcpy(dflt, init_net.ipv4.devconf_dflt, sizeof(ipv4_devconf_dflt)); + if (!net_eq(net, &init_net)) { + if (IS_ENABLED(CONFIG_SYSCTL) && + sysctl_devconf_inherit_init_net == 3) { + /* copy from the current netns */ + memcpy(all, current->nsproxy->net_ns->ipv4.devconf_all, + sizeof(ipv4_devconf)); + memcpy(dflt, + current->nsproxy->net_ns->ipv4.devconf_dflt, + sizeof(ipv4_devconf_dflt)); + } else if (!IS_ENABLED(CONFIG_SYSCTL) || + sysctl_devconf_inherit_init_net != 2) { + /* inherit == 0 or 1: copy from init_net */ + memcpy(all, init_net.ipv4.devconf_all, + sizeof(ipv4_devconf)); + memcpy(dflt, init_net.ipv4.devconf_dflt, + sizeof(ipv4_devconf_dflt)); + } + /* else inherit == 2: use compiled values */ } #ifdef CONFIG_SYSCTL diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index fd885f06c4ed..ab7e839753ae 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -6991,9 +6991,26 @@ static int __net_init addrconf_init_net(struct net *net) goto err_alloc_dflt; if (IS_ENABLED(CONFIG_SYSCTL) && - sysctl_devconf_inherit_init_net == 1 && !net_eq(net, &init_net)) { - memcpy(all, init_net.ipv6.devconf_all, sizeof(ipv6_devconf)); - memcpy(dflt, init_net.ipv6.devconf_dflt, sizeof(ipv6_devconf_dflt)); + !net_eq(net, &init_net)) { + switch (sysctl_devconf_inherit_init_net) { + case 1: /* copy from init_net */ + memcpy(all, init_net.ipv6.devconf_all, + sizeof(ipv6_devconf)); + memcpy(dflt, init_net.ipv6.devconf_dflt, + sizeof(ipv6_devconf_dflt)); + break; + case 3: /* copy from the current netns */ + memcpy(all, current->nsproxy->net_ns->ipv6.devconf_all, + sizeof(ipv6_devconf)); + memcpy(dflt, + current->nsproxy->net_ns->ipv6.devconf_dflt, + sizeof(ipv6_devconf_dflt)); + break; + case 0: + case 2: + /* use compiled values */ + break; + } } /* these will be inherited by all namespaces */ -- cgit v1.2.3 From 4930f4831b1547b52c5968e9307fe3d840d7fba0 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 16 May 2020 10:46:23 +0200 Subject: net: allow __skb_ext_alloc to sleep mptcp calls this from the transmit side, from process context. Allow a sleeping allocation instead of unconditional GFP_ATOMIC. Acked-by: Paolo Abeni Signed-off-by: Florian Westphal Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 8 +++++--- net/mptcp/protocol.c | 4 +++- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 3000c526f552..531843952809 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -4165,7 +4165,7 @@ struct skb_ext { char data[] __aligned(8); }; -struct skb_ext *__skb_ext_alloc(void); +struct skb_ext *__skb_ext_alloc(gfp_t flags); void *__skb_ext_set(struct sk_buff *skb, enum skb_ext_id id, struct skb_ext *ext); void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1bf0c3d278e7..35a133c6d13b 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -6087,13 +6087,15 @@ static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id) /** * __skb_ext_alloc - allocate a new skb extensions storage * + * @flags: See kmalloc(). + * * Returns the newly allocated pointer. The pointer can later attached to a * skb via __skb_ext_set(). * Note: caller must handle the skb_ext as an opaque data. */ -struct skb_ext *__skb_ext_alloc(void) +struct skb_ext *__skb_ext_alloc(gfp_t flags) { - struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, GFP_ATOMIC); + struct skb_ext *new = kmem_cache_alloc(skbuff_ext_cache, flags); if (new) { memset(new->offset, 0, sizeof(new->offset)); @@ -6188,7 +6190,7 @@ void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id) } else { newoff = SKB_EXT_CHUNKSIZEOF(*new); - new = __skb_ext_alloc(); + new = __skb_ext_alloc(GFP_ATOMIC); if (!new) return NULL; } diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index bc950cf818f7..e3a628bea2b8 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -367,8 +367,10 @@ static void mptcp_stop_timer(struct sock *sk) static bool mptcp_ext_cache_refill(struct mptcp_sock *msk) { + const struct sock *sk = (const struct sock *)msk; + if (!msk->cached_ext) - msk->cached_ext = __skb_ext_alloc(); + msk->cached_ext = __skb_ext_alloc(sk->sk_allocation); return !!msk->cached_ext; } -- cgit v1.2.3 From 1b66d253610c7f8f257103808a9460223a087469 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 19 May 2020 00:45:45 +0200 Subject: bpf: Add get{peer, sock}name attach types for sock_addr As stated in 983695fa6765 ("bpf: fix unconnected udp hooks"), the objective for the existing cgroup connect/sendmsg/recvmsg/bind BPF hooks is to be transparent to applications. In Cilium we make use of these hooks [0] in order to enable E-W load balancing for existing Kubernetes service types for all Cilium managed nodes in the cluster. Those backends can be local or remote. The main advantage of this approach is that it operates as close as possible to the socket, and therefore allows to avoid packet-based NAT given in connect/sendmsg/recvmsg hooks we only need to xlate sock addresses. This also allows to expose NodePort services on loopback addresses in the host namespace, for example. As another advantage, this also efficiently blocks bind requests for applications in the host namespace for exposed ports. However, one missing item is that we also need to perform reverse xlation for inet{,6}_getname() hooks such that we can return the service IP/port tuple back to the application instead of the remote peer address. The vast majority of applications does not bother about getpeername(), but in a few occasions we've seen breakage when validating the peer's address since it returns unexpectedly the backend tuple instead of the service one. Therefore, this trivial patch allows to customise and adds a getpeername() as well as getsockname() BPF cgroup hook for both IPv4 and IPv6 in order to address this situation. Simple example: # ./cilium/cilium service list ID Frontend Service Type Backend 1 1.2.3.4:80 ClusterIP 1 => 10.0.0.10:80 Before; curl's verbose output example, no getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (10.0.0.10) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] After; with getpeername() reverse xlation: # curl --verbose 1.2.3.4 * Rebuilt URL to: 1.2.3.4/ * Trying 1.2.3.4... * TCP_NODELAY set * Connected to 1.2.3.4 (1.2.3.4) port 80 (#0) > GET / HTTP/1.1 > Host: 1.2.3.4 > User-Agent: curl/7.58.0 > Accept: */* [...] Originally, I had both under a BPF_CGROUP_INET{4,6}_GETNAME type and exposed peer to the context similar as in inet{,6}_getname() fashion, but API-wise this is suboptimal as it always enforces programs having to test for ctx->peer which can easily be missed, hence BPF_CGROUP_INET{4,6}_GET{PEER,SOCK}NAME split. Similarly, the checked return code is on tnum_range(1, 1), but if a use case comes up in future, it can easily be changed to return an error code instead. Helper and ctx member access is the same as with connect/sendmsg/etc hooks. [0] https://github.com/cilium/cilium/blob/master/bpf/bpf_sock.c Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/61a479d759b2482ae3efb45546490bacd796a220.1589841594.git.daniel@iogearbox.net --- include/linux/bpf-cgroup.h | 1 + include/uapi/linux/bpf.h | 4 ++++ kernel/bpf/syscall.c | 12 ++++++++++++ kernel/bpf/verifier.c | 6 +++++- net/core/filter.c | 4 ++++ net/ipv4/af_inet.c | 8 ++++++-- net/ipv6/af_inet6.c | 9 ++++++--- tools/include/uapi/linux/bpf.h | 4 ++++ 8 files changed, 42 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 272626cc3fc9..c66c545e161a 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -396,6 +396,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, } #define cgroup_bpf_enabled (0) +#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9b8a0f63b91..97e1fd19ff58 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 57dfc98289d5..431241c74614 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1978,6 +1978,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2767,6 +2771,10 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: @@ -2912,6 +2920,10 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET6_POST_BIND: case BPF_CGROUP_INET4_CONNECT: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9c7d67d65d8c..2ed8351f47a4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7094,7 +7094,11 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || - env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG) + env->prog->expected_attach_type == BPF_CGROUP_UDP6_RECVMSG || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETPEERNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || + env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); break; case BPF_PROG_TYPE_CGROUP_SKB: diff --git a/net/core/filter.c b/net/core/filter.c index 822d662f97ef..bd2853d23b50 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7049,6 +7049,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_BIND: case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET4_GETPEERNAME: + case BPF_CGROUP_INET4_GETSOCKNAME: case BPF_CGROUP_UDP4_SENDMSG: case BPF_CGROUP_UDP4_RECVMSG: break; @@ -7060,6 +7062,8 @@ static bool sock_addr_is_valid_access(int off, int size, switch (prog->expected_attach_type) { case BPF_CGROUP_INET6_BIND: case BPF_CGROUP_INET6_CONNECT: + case BPF_CGROUP_INET6_GETPEERNAME: + case BPF_CGROUP_INET6_GETSOCKNAME: case BPF_CGROUP_UDP6_SENDMSG: case BPF_CGROUP_UDP6_RECVMSG: break; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index fcf0d12a407a..8f5c8c9409d3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -755,12 +755,11 @@ do_err: } EXPORT_SYMBOL(inet_accept); - /* * This does both peername and sockname. */ int inet_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sock *sk = sock->sk; struct inet_sock *inet = inet_sk(sk); @@ -781,6 +780,11 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? BPF_CGROUP_INET4_GETPEERNAME : + BPF_CGROUP_INET4_GETSOCKNAME, + NULL); memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 771a462a8322..3b6fcc0c321a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -504,9 +504,8 @@ EXPORT_SYMBOL_GPL(inet6_destroy_sock); /* * This does both peername and sockname. */ - int inet6_getname(struct socket *sock, struct sockaddr *uaddr, - int peer) + int peer) { struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; @@ -531,9 +530,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; - sin->sin6_port = inet->inet_sport; } + if (cgroup_bpf_enabled) + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + peer ? BPF_CGROUP_INET6_GETPEERNAME : + BPF_CGROUP_INET6_GETSOCKNAME, + NULL); sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 146c742f1d49..1cddc398404a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -220,6 +220,10 @@ enum bpf_attach_type { BPF_MODIFY_RETURN, BPF_LSM_MAC, BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 4f65e2f483b6f764c15094d14dd53dda048a4048 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 May 2020 15:50:12 -0700 Subject: net: unexport skb_gro_receive() skb_gro_receive() used to be used by SCTP, it is no longer the case. skb_gro_receive_list() is in the same category : never used from modules. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 35a133c6d13b..b8afefe6f6b6 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3727,7 +3727,6 @@ int skb_gro_receive_list(struct sk_buff *p, struct sk_buff *skb) return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive_list); /** * skb_segment - Perform protocol segmentation on skb. @@ -4191,7 +4190,6 @@ done: NAPI_GRO_CB(skb)->same_flow = 1; return 0; } -EXPORT_SYMBOL_GPL(skb_gro_receive); #ifdef CONFIG_SKB_EXTENSIONS #define SKB_EXT_ALIGN_VALUE 8 -- cgit v1.2.3 From 2b43470add8c8ff1e1ee28dffc5c5df97e955d09 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:20:53 +0200 Subject: xsk: Introduce AF_XDP buffer allocation API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to simplify AF_XDP zero-copy enablement for NIC driver developers, a new AF_XDP buffer allocation API is added. The implementation is based on a single core (single producer/consumer) buffer pool for the AF_XDP UMEM. A buffer is allocated using the xsk_buff_alloc() function, and returned using xsk_buff_free(). If a buffer is disassociated with the pool, e.g. when a buffer is passed to an AF_XDP socket, a buffer is said to be released. Currently, the release function is only used by the AF_XDP internals and not visible to the driver. Drivers using this API should register the XDP memory model with the new MEM_TYPE_XSK_BUFF_POOL type. The API is defined in net/xdp_sock_drv.h. The buffer type is struct xdp_buff, and follows the lifetime of regular xdp_buffs, i.e. the lifetime of an xdp_buff is restricted to a NAPI context. In other words, the API is not replacing xdp_frames. In addition to introducing the API and implementations, the AF_XDP core is migrated to use the new APIs. rfc->v1: Fixed build errors/warnings for m68k and riscv. (kbuild test robot) Added headroom/chunk size getter. (Maxim/Björn) v1->v2: Swapped SoBs. (Maxim) v2->v3: Initialize struct xdp_buff member frame_sz. (Björn) Add API to query the DMA address of a frame. (Maxim) Do DMA sync for CPU till the end of the frame to handle possible growth (frame_sz). (Maxim) Signed-off-by: Björn Töpel Signed-off-by: Maxim Mikityanskiy Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-6-bjorn.topel@gmail.com --- include/net/xdp.h | 4 +- include/net/xdp_sock.h | 2 + include/net/xdp_sock_drv.h | 164 ++++++++++++++++ include/net/xsk_buff_pool.h | 56 ++++++ include/trace/events/xdp.h | 3 +- net/core/xdp.c | 14 +- net/xdp/Makefile | 1 + net/xdp/xdp_umem.c | 19 +- net/xdp/xsk.c | 147 +++++--------- net/xdp/xsk_buff_pool.c | 467 ++++++++++++++++++++++++++++++++++++++++++++ net/xdp/xsk_diag.c | 2 +- net/xdp/xsk_queue.h | 59 ++++-- 12 files changed, 819 insertions(+), 119 deletions(-) create mode 100644 include/net/xsk_buff_pool.h create mode 100644 net/xdp/xsk_buff_pool.c (limited to 'net/core') diff --git a/include/net/xdp.h b/include/net/xdp.h index 3094fccf5a88..f432134c7c00 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -40,6 +40,7 @@ enum xdp_mem_type { MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, MEM_TYPE_ZERO_COPY, + MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; @@ -119,7 +120,8 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; - if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) + if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || + xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Assure headroom is available for storing info */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index fb7fe3060175..6e7265f63c04 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -31,11 +31,13 @@ struct xdp_umem_fq_reuse { struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; + struct xsk_buff_pool *pool; struct xdp_umem_page *pages; u64 chunk_mask; u64 size; u32 headroom; u32 chunk_size_nohr; + u32 chunk_size; struct user_struct *user; refcount_t users; struct work_struct work; diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index d67f2361937a..7752c8663d1b 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -7,6 +7,7 @@ #define _LINUX_XDP_SOCK_DRV_H #include +#include #ifdef CONFIG_XDP_SOCKETS @@ -101,6 +102,94 @@ static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) return umem->chunk_size_nohr; } +static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +{ + return XDP_PACKET_HEADROOM + umem->headroom; +} + +static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +{ + return umem->chunk_size; +} + +static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +{ + return xsk_umem_get_chunk_size(umem) - xsk_umem_get_headroom(umem); +} + +static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, + struct xdp_rxq_info *rxq) +{ + xp_set_rxq_info(umem->pool, rxq); +} + +static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, + unsigned long attrs) +{ + xp_dma_unmap(umem->pool, attrs); +} + +static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, + unsigned long attrs) +{ + return xp_dma_map(umem->pool, dev, attrs, umem->pgs, umem->npgs); +} + +static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + return xp_get_dma(xskb); +} + +static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + return xp_get_frame_dma(xskb); +} + +static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +{ + return xp_alloc(umem->pool); +} + +static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +{ + return xp_can_alloc(umem->pool, count); +} + +static inline void xsk_buff_free(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + xp_free(xskb); +} + +static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +{ + return xp_raw_get_dma(umem->pool, addr); +} + +static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +{ + return xp_raw_get_data(umem->pool, addr); +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + + xp_dma_sync_for_cpu(xskb); +} + +static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, + dma_addr_t dma, + size_t size) +{ + xp_dma_sync_for_device(umem->pool, dma, size); +} + #else static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) @@ -212,6 +301,81 @@ static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) return 0; } +static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) +{ + return 0; +} + +static inline u32 xsk_umem_get_chunk_size(struct xdp_umem *umem) +{ + return 0; +} + +static inline u32 xsk_umem_get_rx_frame_size(struct xdp_umem *umem) +{ + return 0; +} + +static inline void xsk_buff_set_rxq_info(struct xdp_umem *umem, + struct xdp_rxq_info *rxq) +{ +} + +static inline void xsk_buff_dma_unmap(struct xdp_umem *umem, + unsigned long attrs) +{ +} + +static inline int xsk_buff_dma_map(struct xdp_umem *umem, struct device *dev, + unsigned long attrs) +{ + return 0; +} + +static inline dma_addr_t xsk_buff_xdp_get_dma(struct xdp_buff *xdp) +{ + return 0; +} + +static inline dma_addr_t xsk_buff_xdp_get_frame_dma(struct xdp_buff *xdp) +{ + return 0; +} + +static inline struct xdp_buff *xsk_buff_alloc(struct xdp_umem *umem) +{ + return NULL; +} + +static inline bool xsk_buff_can_alloc(struct xdp_umem *umem, u32 count) +{ + return false; +} + +static inline void xsk_buff_free(struct xdp_buff *xdp) +{ +} + +static inline dma_addr_t xsk_buff_raw_get_dma(struct xdp_umem *umem, u64 addr) +{ + return 0; +} + +static inline void *xsk_buff_raw_get_data(struct xdp_umem *umem, u64 addr) +{ + return NULL; +} + +static inline void xsk_buff_dma_sync_for_cpu(struct xdp_buff *xdp) +{ +} + +static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, + dma_addr_t dma, + size_t size) +{ +} + #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_DRV_H */ diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h new file mode 100644 index 000000000000..9f221b36e405 --- /dev/null +++ b/include/net/xsk_buff_pool.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2020 Intel Corporation. */ + +#ifndef XSK_BUFF_POOL_H_ +#define XSK_BUFF_POOL_H_ + +#include +#include +#include + +struct xsk_buff_pool; +struct xdp_rxq_info; +struct xsk_queue; +struct xdp_desc; +struct device; +struct page; + +struct xdp_buff_xsk { + struct xdp_buff xdp; + dma_addr_t dma; + dma_addr_t frame_dma; + struct xsk_buff_pool *pool; + bool unaligned; + u64 orig_addr; + struct list_head free_list_node; +}; + +/* AF_XDP core. */ +struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, + u32 chunk_size, u32 headroom, u64 size, + bool unaligned); +void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq); +void xp_destroy(struct xsk_buff_pool *pool); +void xp_release(struct xdp_buff_xsk *xskb); +u64 xp_get_handle(struct xdp_buff_xsk *xskb); +bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc); + +/* AF_XDP, and XDP core. */ +void xp_free(struct xdp_buff_xsk *xskb); + +/* AF_XDP ZC drivers, via xdp_sock_buff.h */ +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq); +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages); +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs); +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool); +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count); +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr); +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr); +dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb); +dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb); +void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb); +void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size); + +#endif /* XSK_BUFF_POOL_H_ */ diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index b95d65e8c628..48547a12fa27 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -287,7 +287,8 @@ TRACE_EVENT(xdp_devmap_xmit, FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ - FN(ZERO_COPY) + FN(ZERO_COPY) \ + FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ TRACE_DEFINE_ENUM(MEM_TYPE_##x); diff --git a/net/core/xdp.c b/net/core/xdp.c index 490b8f5fa8ee..f0ce8b195193 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -17,6 +17,7 @@ #include #include /* struct xdp_mem_allocator */ #include +#include #define REG_STATE_NEW 0x0 #define REG_STATE_REGISTERED 0x1 @@ -361,7 +362,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * of xdp_frames/pages in those cases. */ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - unsigned long handle) + unsigned long handle, struct xdp_buff *xdp) { struct xdp_mem_allocator *xa; struct page *page; @@ -390,6 +391,11 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); xa->zc_alloc->free(xa->zc_alloc, handle); rcu_read_unlock(); + break; + case MEM_TYPE_XSK_BUFF_POOL: + /* NB! Only valid from an xdp_buff! */ + xsk_buff_free(xdp); + break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ break; @@ -398,19 +404,19 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, 0); + __xdp_return(xdpf->data, &xdpf->mem, false, 0, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, 0); + __xdp_return(xdpf->data, &xdpf->mem, true, 0, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle); + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 90b5460d6166..30cdc4315f42 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o xskmap.o +obj-$(CONFIG_XDP_SOCKETS) += xsk_buff_pool.o obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 37ace3bc0d48..7f04688045d5 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -245,7 +245,7 @@ static void xdp_umem_release(struct xdp_umem *umem) } xsk_reuseq_destroy(umem); - + xp_destroy(umem->pool); xdp_umem_unmap_pages(umem); xdp_umem_unpin_pages(umem); @@ -390,6 +390,7 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) umem->size = size; umem->headroom = headroom; umem->chunk_size_nohr = chunk_size - headroom; + umem->chunk_size = chunk_size; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; umem->user = NULL; @@ -415,11 +416,21 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) } err = xdp_umem_map_pages(umem); - if (!err) - return 0; + if (err) + goto out_pages; - kvfree(umem->pages); + umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size, + headroom, size, unaligned_chunks); + if (!umem->pool) { + err = -ENOMEM; + goto out_unmap; + } + return 0; +out_unmap: + xdp_umem_unmap_pages(umem); +out_pages: + kvfree(umem->pages); out_pin: xdp_umem_unpin_pages(umem); out_account: diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 8bda654e82ec..6933f0d494ba 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -117,76 +117,67 @@ bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) } EXPORT_SYMBOL(xsk_umem_uses_need_wakeup); -/* If a buffer crosses a page boundary, we need to do 2 memcpy's, one for - * each page. This is only required in copy mode. - */ -static void __xsk_rcv_memcpy(struct xdp_umem *umem, u64 addr, void *from_buf, - u32 len, u32 metalen) +static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) { - void *to_buf = xdp_umem_get_data(umem, addr); - - addr = xsk_umem_add_offset_to_addr(addr); - if (xskq_cons_crosses_non_contig_pg(umem, addr, len + metalen)) { - void *next_pg_addr = umem->pages[(addr >> PAGE_SHIFT) + 1].addr; - u64 page_start = addr & ~(PAGE_SIZE - 1); - u64 first_len = PAGE_SIZE - (addr - page_start); - - memcpy(to_buf, from_buf, first_len); - memcpy(next_pg_addr, from_buf + first_len, - len + metalen - first_len); + struct xdp_buff_xsk *xskb = container_of(xdp, struct xdp_buff_xsk, xdp); + u64 addr; + int err; - return; + addr = xp_get_handle(xskb); + err = xskq_prod_reserve_desc(xs->rx, addr, len); + if (err) { + xs->rx_dropped++; + return err; } - memcpy(to_buf, from_buf, len + metalen); + xp_release(xskb); + return 0; } -static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static void xsk_copy_xdp(struct xdp_buff *to, struct xdp_buff *from, u32 len) { - u64 offset = xs->umem->headroom; - u64 addr, memcpy_addr; - void *from_buf; + void *from_buf, *to_buf; u32 metalen; - int err; - - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { - xs->rx_dropped++; - return -ENOSPC; - } - if (unlikely(xdp_data_meta_unsupported(xdp))) { - from_buf = xdp->data; + if (unlikely(xdp_data_meta_unsupported(from))) { + from_buf = from->data; + to_buf = to->data; metalen = 0; } else { - from_buf = xdp->data_meta; - metalen = xdp->data - xdp->data_meta; + from_buf = from->data_meta; + metalen = from->data - from->data_meta; + to_buf = to->data - metalen; } - memcpy_addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - __xsk_rcv_memcpy(xs->umem, memcpy_addr, from_buf, len, metalen); - - offset += metalen; - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (!err) { - xskq_cons_release(xs->umem->fq); - xdp_return_buff(xdp); - return 0; - } - - xs->rx_dropped++; - return err; + memcpy(to_buf, from_buf, len + metalen); } -static int __xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) +static int __xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len, + bool explicit_free) { - int err = xskq_prod_reserve_desc(xs->rx, xdp->handle, len); + struct xdp_buff *xsk_xdp; + int err; - if (err) + if (len > xsk_umem_get_rx_frame_size(xs->umem)) { + xs->rx_dropped++; + return -ENOSPC; + } + + xsk_xdp = xsk_buff_alloc(xs->umem); + if (!xsk_xdp) { xs->rx_dropped++; + return -ENOSPC; + } - return err; + xsk_copy_xdp(xsk_xdp, xdp, len); + err = __xsk_rcv_zc(xs, xsk_xdp, len); + if (err) { + xsk_buff_free(xsk_xdp); + return err; + } + if (explicit_free) + xdp_return_buff(xdp); + return 0; } static bool xsk_is_bound(struct xdp_sock *xs) @@ -199,7 +190,8 @@ static bool xsk_is_bound(struct xdp_sock *xs) return false; } -static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) +static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, + bool explicit_free) { u32 len; @@ -211,8 +203,10 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) len = xdp->data_end - xdp->data; - return (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY) ? - __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len); + return xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || + xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? + __xsk_rcv_zc(xs, xdp, len) : + __xsk_rcv(xs, xdp, len, explicit_free); } static void xsk_flush(struct xdp_sock *xs) @@ -224,46 +218,11 @@ static void xsk_flush(struct xdp_sock *xs) int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { - u32 metalen = xdp->data - xdp->data_meta; - u32 len = xdp->data_end - xdp->data; - u64 offset = xs->umem->headroom; - void *buffer; - u64 addr; int err; spin_lock_bh(&xs->rx_lock); - - if (xs->dev != xdp->rxq->dev || xs->queue_id != xdp->rxq->queue_index) { - err = -EINVAL; - goto out_unlock; - } - - if (!xskq_cons_peek_addr(xs->umem->fq, &addr, xs->umem) || - len > xs->umem->chunk_size_nohr - XDP_PACKET_HEADROOM) { - err = -ENOSPC; - goto out_drop; - } - - addr = xsk_umem_adjust_offset(xs->umem, addr, offset); - buffer = xdp_umem_get_data(xs->umem, addr); - memcpy(buffer, xdp->data_meta, len + metalen); - - addr = xsk_umem_adjust_offset(xs->umem, addr, metalen); - err = xskq_prod_reserve_desc(xs->rx, addr, len); - if (err) - goto out_drop; - - xskq_cons_release(xs->umem->fq); - xskq_prod_submit(xs->rx); - - spin_unlock_bh(&xs->rx_lock); - - xs->sk.sk_data_ready(&xs->sk); - return 0; - -out_drop: - xs->rx_dropped++; -out_unlock: + err = xsk_rcv(xs, xdp, false); + xsk_flush(xs); spin_unlock_bh(&xs->rx_lock); return err; } @@ -273,7 +232,7 @@ int __xsk_map_redirect(struct xdp_sock *xs, struct xdp_buff *xdp) struct list_head *flush_list = this_cpu_ptr(&xskmap_flush_list); int err; - err = xsk_rcv(xs, xdp); + err = xsk_rcv(xs, xdp, true); if (err) return err; @@ -404,7 +363,7 @@ static int xsk_generic_xmit(struct sock *sk) skb_put(skb, len); addr = desc.addr; - buffer = xdp_umem_get_data(xs->umem, addr); + buffer = xsk_buff_raw_get_data(xs->umem, addr); err = skb_store_bits(skb, 0, buffer, len); /* This is the backpressure mechanism for the Tx path. * Reserve space in the completion queue and only proceed @@ -860,6 +819,8 @@ static int xsk_setsockopt(struct socket *sock, int level, int optname, q = (optname == XDP_UMEM_FILL_RING) ? &xs->umem->fq : &xs->umem->cq; err = xsk_init_queue(entries, q, true); + if (optname == XDP_UMEM_FILL_RING) + xp_set_fq(xs->umem->pool, *q); mutex_unlock(&xs->mutex); return err; } diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c new file mode 100644 index 000000000000..e214a5795a62 --- /dev/null +++ b/net/xdp/xsk_buff_pool.c @@ -0,0 +1,467 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "xsk_queue.h" + +struct xsk_buff_pool { + struct xsk_queue *fq; + struct list_head free_list; + dma_addr_t *dma_pages; + struct xdp_buff_xsk *heads; + u64 chunk_mask; + u64 addrs_cnt; + u32 free_list_cnt; + u32 dma_pages_cnt; + u32 heads_cnt; + u32 free_heads_cnt; + u32 headroom; + u32 chunk_size; + u32 frame_len; + bool cheap_dma; + bool unaligned; + void *addrs; + struct device *dev; + struct xdp_buff_xsk *free_heads[]; +}; + +static void xp_addr_unmap(struct xsk_buff_pool *pool) +{ + vunmap(pool->addrs); +} + +static int xp_addr_map(struct xsk_buff_pool *pool, + struct page **pages, u32 nr_pages) +{ + pool->addrs = vmap(pages, nr_pages, VM_MAP, PAGE_KERNEL); + if (!pool->addrs) + return -ENOMEM; + return 0; +} + +void xp_destroy(struct xsk_buff_pool *pool) +{ + if (!pool) + return; + + xp_addr_unmap(pool); + kvfree(pool->heads); + kvfree(pool); +} + +struct xsk_buff_pool *xp_create(struct page **pages, u32 nr_pages, u32 chunks, + u32 chunk_size, u32 headroom, u64 size, + bool unaligned) +{ + struct xsk_buff_pool *pool; + struct xdp_buff_xsk *xskb; + int err; + u32 i; + + pool = kvzalloc(struct_size(pool, free_heads, chunks), GFP_KERNEL); + if (!pool) + goto out; + + pool->heads = kvcalloc(chunks, sizeof(*pool->heads), GFP_KERNEL); + if (!pool->heads) + goto out; + + pool->chunk_mask = ~((u64)chunk_size - 1); + pool->addrs_cnt = size; + pool->heads_cnt = chunks; + pool->free_heads_cnt = chunks; + pool->headroom = headroom; + pool->chunk_size = chunk_size; + pool->cheap_dma = true; + pool->unaligned = unaligned; + pool->frame_len = chunk_size - headroom - XDP_PACKET_HEADROOM; + INIT_LIST_HEAD(&pool->free_list); + + for (i = 0; i < pool->free_heads_cnt; i++) { + xskb = &pool->heads[i]; + xskb->pool = pool; + xskb->xdp.frame_sz = chunk_size - headroom; + pool->free_heads[i] = xskb; + } + + err = xp_addr_map(pool, pages, nr_pages); + if (!err) + return pool; + +out: + xp_destroy(pool); + return NULL; +} + +void xp_set_fq(struct xsk_buff_pool *pool, struct xsk_queue *fq) +{ + pool->fq = fq; +} + +void xp_set_rxq_info(struct xsk_buff_pool *pool, struct xdp_rxq_info *rxq) +{ + u32 i; + + for (i = 0; i < pool->heads_cnt; i++) + pool->heads[i].xdp.rxq = rxq; +} +EXPORT_SYMBOL(xp_set_rxq_info); + +void xp_dma_unmap(struct xsk_buff_pool *pool, unsigned long attrs) +{ + dma_addr_t *dma; + u32 i; + + if (pool->dma_pages_cnt == 0) + return; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = &pool->dma_pages[i]; + if (*dma) { + dma_unmap_page_attrs(pool->dev, *dma, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + *dma = 0; + } + } + + kvfree(pool->dma_pages); + pool->dma_pages_cnt = 0; + pool->dev = NULL; +} +EXPORT_SYMBOL(xp_dma_unmap); + +static void xp_check_dma_contiguity(struct xsk_buff_pool *pool) +{ + u32 i; + + for (i = 0; i < pool->dma_pages_cnt - 1; i++) { + if (pool->dma_pages[i] + PAGE_SIZE == pool->dma_pages[i + 1]) + pool->dma_pages[i] |= XSK_NEXT_PG_CONTIG_MASK; + else + pool->dma_pages[i] &= ~XSK_NEXT_PG_CONTIG_MASK; + } +} + +static bool __maybe_unused xp_check_swiotlb_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_SWIOTLB) + phys_addr_t paddr; + u32 i; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + paddr = dma_to_phys(pool->dev, pool->dma_pages[i]); + if (is_swiotlb_buffer(paddr)) + return false; + } +#endif + return true; +} + +static bool xp_check_cheap_dma(struct xsk_buff_pool *pool) +{ +#if defined(CONFIG_HAS_DMA) + const struct dma_map_ops *ops = get_dma_ops(pool->dev); + + if (ops) { + return !ops->sync_single_for_cpu && + !ops->sync_single_for_device; + } + + if (!dma_is_direct(ops)) + return false; + + if (!xp_check_swiotlb_dma(pool)) + return false; + + if (!dev_is_dma_coherent(pool->dev)) { +#if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) || \ + defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) + return false; +#endif + } +#endif + return true; +} + +int xp_dma_map(struct xsk_buff_pool *pool, struct device *dev, + unsigned long attrs, struct page **pages, u32 nr_pages) +{ + dma_addr_t dma; + u32 i; + + pool->dma_pages = kvcalloc(nr_pages, sizeof(*pool->dma_pages), + GFP_KERNEL); + if (!pool->dma_pages) + return -ENOMEM; + + pool->dev = dev; + pool->dma_pages_cnt = nr_pages; + + for (i = 0; i < pool->dma_pages_cnt; i++) { + dma = dma_map_page_attrs(dev, pages[i], 0, PAGE_SIZE, + DMA_BIDIRECTIONAL, attrs); + if (dma_mapping_error(dev, dma)) { + xp_dma_unmap(pool, attrs); + return -ENOMEM; + } + pool->dma_pages[i] = dma; + } + + if (pool->unaligned) + xp_check_dma_contiguity(pool); + + pool->dev = dev; + pool->cheap_dma = xp_check_cheap_dma(pool); + return 0; +} +EXPORT_SYMBOL(xp_dma_map); + +static bool xp_desc_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr, u32 len) +{ + bool cross_pg = (addr & (PAGE_SIZE - 1)) + len > PAGE_SIZE; + + if (pool->dma_pages_cnt && cross_pg) { + return !(pool->dma_pages[addr >> PAGE_SHIFT] & + XSK_NEXT_PG_CONTIG_MASK); + } + return false; +} + +static bool xp_addr_crosses_non_contig_pg(struct xsk_buff_pool *pool, + u64 addr) +{ + return xp_desc_crosses_non_contig_pg(pool, addr, pool->chunk_size); +} + +void xp_release(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_heads[xskb->pool->free_heads_cnt++] = xskb; +} + +static u64 xp_aligned_extract_addr(struct xsk_buff_pool *pool, u64 addr) +{ + return addr & pool->chunk_mask; +} + +static u64 xp_unaligned_extract_addr(u64 addr) +{ + return addr & XSK_UNALIGNED_BUF_ADDR_MASK; +} + +static u64 xp_unaligned_extract_offset(u64 addr) +{ + return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; +} + +static u64 xp_unaligned_add_offset_to_addr(u64 addr) +{ + return xp_unaligned_extract_addr(addr) + + xp_unaligned_extract_offset(addr); +} + +static bool xp_check_unaligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_unaligned_extract_addr(*addr); + if (*addr >= pool->addrs_cnt || + *addr + pool->chunk_size > pool->addrs_cnt || + xp_addr_crosses_non_contig_pg(pool, *addr)) + return false; + return true; +} + +static bool xp_check_aligned(struct xsk_buff_pool *pool, u64 *addr) +{ + *addr = xp_aligned_extract_addr(pool, *addr); + return *addr < pool->addrs_cnt; +} + +static struct xdp_buff_xsk *__xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + u64 addr; + bool ok; + + if (pool->free_heads_cnt == 0) + return NULL; + + xskb = pool->free_heads[--pool->free_heads_cnt]; + + for (;;) { + if (!xskq_cons_peek_addr_unchecked(pool->fq, &addr)) { + xp_release(xskb); + return NULL; + } + + ok = pool->unaligned ? xp_check_unaligned(pool, &addr) : + xp_check_aligned(pool, &addr); + if (!ok) { + pool->fq->invalid_descs++; + xskq_cons_release(pool->fq); + continue; + } + break; + } + xskq_cons_release(pool->fq); + + xskb->orig_addr = addr; + xskb->xdp.data_hard_start = pool->addrs + addr + pool->headroom; + if (pool->dma_pages_cnt) { + xskb->frame_dma = (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); + xskb->dma = xskb->frame_dma + pool->headroom + + XDP_PACKET_HEADROOM; + } + return xskb; +} + +struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) +{ + struct xdp_buff_xsk *xskb; + + if (!pool->free_list_cnt) { + xskb = __xp_alloc(pool); + if (!xskb) + return NULL; + } else { + pool->free_list_cnt--; + xskb = list_first_entry(&pool->free_list, struct xdp_buff_xsk, + free_list_node); + list_del(&xskb->free_list_node); + } + + xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; + xskb->xdp.data_meta = xskb->xdp.data; + + if (!pool->cheap_dma) { + dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, + pool->frame_len, + DMA_BIDIRECTIONAL); + } + return &xskb->xdp; +} +EXPORT_SYMBOL(xp_alloc); + +bool xp_can_alloc(struct xsk_buff_pool *pool, u32 count) +{ + if (pool->free_list_cnt >= count) + return true; + return xskq_cons_has_entries(pool->fq, count - pool->free_list_cnt); +} +EXPORT_SYMBOL(xp_can_alloc); + +void xp_free(struct xdp_buff_xsk *xskb) +{ + xskb->pool->free_list_cnt++; + list_add(&xskb->free_list_node, &xskb->pool->free_list); +} +EXPORT_SYMBOL(xp_free); + +static bool xp_aligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 chunk, chunk_end; + + chunk = xp_aligned_extract_addr(pool, desc->addr); + chunk_end = xp_aligned_extract_addr(pool, desc->addr + desc->len); + if (chunk != chunk_end) + return false; + + if (chunk >= pool->addrs_cnt) + return false; + + if (desc->options) + return false; + return true; +} + +static bool xp_unaligned_validate_desc(struct xsk_buff_pool *pool, + struct xdp_desc *desc) +{ + u64 addr, base_addr; + + base_addr = xp_unaligned_extract_addr(desc->addr); + addr = xp_unaligned_add_offset_to_addr(desc->addr); + + if (desc->len > pool->chunk_size) + return false; + + if (base_addr >= pool->addrs_cnt || addr >= pool->addrs_cnt || + xp_desc_crosses_non_contig_pg(pool, addr, desc->len)) + return false; + + if (desc->options) + return false; + return true; +} + +bool xp_validate_desc(struct xsk_buff_pool *pool, struct xdp_desc *desc) +{ + return pool->unaligned ? xp_unaligned_validate_desc(pool, desc) : + xp_aligned_validate_desc(pool, desc); +} + +u64 xp_get_handle(struct xdp_buff_xsk *xskb) +{ + u64 offset = xskb->xdp.data - xskb->xdp.data_hard_start; + + offset += xskb->pool->headroom; + if (!xskb->pool->unaligned) + return xskb->orig_addr + offset; + return xskb->orig_addr + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); +} + +void *xp_raw_get_data(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; + return pool->addrs + addr; +} +EXPORT_SYMBOL(xp_raw_get_data); + +dma_addr_t xp_raw_get_dma(struct xsk_buff_pool *pool, u64 addr) +{ + addr = pool->unaligned ? xp_unaligned_add_offset_to_addr(addr) : addr; + return (pool->dma_pages[addr >> PAGE_SHIFT] & + ~XSK_NEXT_PG_CONTIG_MASK) + + (addr & ~PAGE_MASK); +} +EXPORT_SYMBOL(xp_raw_get_dma); + +dma_addr_t xp_get_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->dma; +} +EXPORT_SYMBOL(xp_get_dma); + +dma_addr_t xp_get_frame_dma(struct xdp_buff_xsk *xskb) +{ + return xskb->frame_dma; +} +EXPORT_SYMBOL(xp_get_frame_dma); + +void xp_dma_sync_for_cpu(struct xdp_buff_xsk *xskb) +{ + if (xskb->pool->cheap_dma) + return; + + dma_sync_single_range_for_cpu(xskb->pool->dev, xskb->dma, 0, + xskb->pool->frame_len, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_cpu); + +void xp_dma_sync_for_device(struct xsk_buff_pool *pool, dma_addr_t dma, + size_t size) +{ + if (pool->cheap_dma) + return; + + dma_sync_single_range_for_device(pool->dev, dma, 0, + size, DMA_BIDIRECTIONAL); +} +EXPORT_SYMBOL(xp_dma_sync_for_device); diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c index f59791ba43a0..0163b26aaf63 100644 --- a/net/xdp/xsk_diag.c +++ b/net/xdp/xsk_diag.c @@ -56,7 +56,7 @@ static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) du.id = umem->id; du.size = umem->size; du.num_pages = umem->npgs; - du.chunk_size = umem->chunk_size_nohr + umem->headroom; + du.chunk_size = umem->chunk_size; du.headroom = umem->headroom; du.ifindex = umem->dev ? umem->dev->ifindex : 0; du.queue_id = umem->queue_id; diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index a322a7dac58c..9151aef7dbca 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "xsk.h" @@ -172,31 +173,45 @@ out: return false; } -static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, - struct xdp_desc *d, - struct xdp_umem *umem) +static inline bool xskq_cons_read_addr_aligned(struct xsk_queue *q, u64 *addr) { - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (!xskq_cons_is_valid_unaligned(q, d->addr, d->len, umem)) - return false; + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - if (d->len > umem->chunk_size_nohr || d->options) { - q->invalid_descs++; - return false; - } + while (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; + + *addr = ring->desc[idx]; + if (xskq_cons_is_valid_addr(q, *addr)) + return true; + q->cached_cons++; + } + + return false; +} + +static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) +{ + struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; + + if (q->cached_cons != q->cached_prod) { + u32 idx = q->cached_cons & q->ring_mask; + + *addr = ring->desc[idx]; return true; } - if (!xskq_cons_is_valid_addr(q, d->addr)) - return false; + return false; +} - if (((d->addr + d->len) & q->chunk_mask) != (d->addr & q->chunk_mask) || - d->options) { +static inline bool xskq_cons_is_valid_desc(struct xsk_queue *q, + struct xdp_desc *d, + struct xdp_umem *umem) +{ + if (!xp_validate_desc(umem->pool, d)) { q->invalid_descs++; return false; } - return true; } @@ -260,6 +275,20 @@ static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, return xskq_cons_read_addr(q, addr, umem); } +static inline bool xskq_cons_peek_addr_aligned(struct xsk_queue *q, u64 *addr) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_addr_aligned(q, addr); +} + +static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) +{ + if (q->cached_prod == q->cached_cons) + xskq_cons_get_entries(q); + return xskq_cons_read_addr_unchecked(q, addr); +} + static inline bool xskq_cons_peek_desc(struct xsk_queue *q, struct xdp_desc *desc, struct xdp_umem *umem) -- cgit v1.2.3 From 0807892ecb35734b7ce6f7c29b078f1b60151c94 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:21:00 +0200 Subject: xsk: Remove MEM_TYPE_ZERO_COPY and corresponding code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users of MEM_TYPE_ZERO_COPY. Remove all corresponding code, including the "handle" member of struct xdp_buff. rfc->v1: Fixed spelling in commit message. (Björn) Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-13-bjorn.topel@gmail.com --- drivers/net/hyperv/netvsc_bpf.c | 1 - include/net/xdp.h | 9 +-- include/net/xdp_sock.h | 45 ------------ include/net/xdp_sock_drv.h | 149 ---------------------------------------- include/trace/events/xdp.h | 1 - net/core/xdp.c | 42 ++--------- net/xdp/xdp_umem.c | 56 +-------------- net/xdp/xsk.c | 48 +------------ net/xdp/xsk_buff_pool.c | 7 ++ net/xdp/xsk_queue.c | 62 ----------------- net/xdp/xsk_queue.h | 105 ---------------------------- 11 files changed, 15 insertions(+), 510 deletions(-) (limited to 'net/core') diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 1e0c024b0a93..8e4141552423 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -50,7 +50,6 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan, xdp->data_end = xdp->data + len; xdp->rxq = &nvchan->xdp_rxq; xdp->frame_sz = PAGE_SIZE; - xdp->handle = 0; memcpy(xdp->data, data, len); diff --git a/include/net/xdp.h b/include/net/xdp.h index f432134c7c00..90f11760bd12 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -39,7 +39,6 @@ enum xdp_mem_type { MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */ MEM_TYPE_PAGE_ORDER0, /* Orig XDP full page model */ MEM_TYPE_PAGE_POOL, - MEM_TYPE_ZERO_COPY, MEM_TYPE_XSK_BUFF_POOL, MEM_TYPE_MAX, }; @@ -55,10 +54,6 @@ struct xdp_mem_info { struct page_pool; -struct zero_copy_allocator { - void (*free)(struct zero_copy_allocator *zca, unsigned long handle); -}; - struct xdp_rxq_info { struct net_device *dev; u32 queue_index; @@ -71,7 +66,6 @@ struct xdp_buff { void *data_end; void *data_meta; void *data_hard_start; - unsigned long handle; struct xdp_rxq_info *rxq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; @@ -120,8 +114,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp) int metasize; int headroom; - if (xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || - xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) + if (xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) return xdp_convert_zc_to_xdp_frame(xdp); /* Assure headroom is available for storing info */ diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 6e7265f63c04..96bfc5f5f24e 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -17,26 +17,12 @@ struct net_device; struct xsk_queue; struct xdp_buff; -struct xdp_umem_page { - void *addr; - dma_addr_t dma; -}; - -struct xdp_umem_fq_reuse { - u32 nentries; - u32 length; - u64 handles[]; -}; - struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; struct xsk_buff_pool *pool; - struct xdp_umem_page *pages; - u64 chunk_mask; u64 size; u32 headroom; - u32 chunk_size_nohr; u32 chunk_size; struct user_struct *user; refcount_t users; @@ -48,7 +34,6 @@ struct xdp_umem { u8 flags; int id; struct net_device *dev; - struct xdp_umem_fq_reuse *fq_reuse; bool zc; spinlock_t xsk_tx_list_lock; struct list_head xsk_tx_list; @@ -109,21 +94,6 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, return xs; } -static inline u64 xsk_umem_extract_addr(u64 addr) -{ - return addr & XSK_UNALIGNED_BUF_ADDR_MASK; -} - -static inline u64 xsk_umem_extract_offset(u64 addr) -{ - return addr >> XSK_UNALIGNED_BUF_OFFSET_SHIFT; -} - -static inline u64 xsk_umem_add_offset_to_addr(u64 addr) -{ - return xsk_umem_extract_addr(addr) + xsk_umem_extract_offset(addr); -} - #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) @@ -146,21 +116,6 @@ static inline struct xdp_sock *__xsk_map_lookup_elem(struct bpf_map *map, return NULL; } -static inline u64 xsk_umem_extract_addr(u64 addr) -{ - return 0; -} - -static inline u64 xsk_umem_extract_offset(u64 addr) -{ - return 0; -} - -static inline u64 xsk_umem_add_offset_to_addr(u64 addr) -{ - return 0; -} - #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h index 7752c8663d1b..ccf848f7efa4 100644 --- a/include/net/xdp_sock_drv.h +++ b/include/net/xdp_sock_drv.h @@ -11,16 +11,9 @@ #ifdef CONFIG_XDP_SOCKETS -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt); -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr); -void xsk_umem_release_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, struct xdp_desc *desc); void xsk_umem_consume_tx_done(struct xdp_umem *umem); -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq); -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id); void xsk_set_rx_need_wakeup(struct xdp_umem *umem); void xsk_set_tx_need_wakeup(struct xdp_umem *umem); @@ -28,80 +21,6 @@ void xsk_clear_rx_need_wakeup(struct xdp_umem *umem); void xsk_clear_tx_need_wakeup(struct xdp_umem *umem); bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem); -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - unsigned long page_addr; - - addr = xsk_umem_add_offset_to_addr(addr); - page_addr = (unsigned long)umem->pages[addr >> PAGE_SHIFT].addr; - - return (char *)(page_addr & PAGE_MASK) + (addr & ~PAGE_MASK); -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - addr = xsk_umem_add_offset_to_addr(addr); - - return umem->pages[addr >> PAGE_SHIFT].dma + (addr & ~PAGE_MASK); -} - -/* Reuse-queue aware version of FILL queue helpers */ -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (rq->length >= cnt) - return true; - - return xsk_umem_has_addrs(umem, cnt - rq->length); -} - -static inline bool xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - return xsk_umem_peek_addr(umem, addr); - - *addr = rq->handles[rq->length - 1]; - return addr; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - if (!rq->length) - xsk_umem_release_addr(umem); - else - rq->length--; -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ - struct xdp_umem_fq_reuse *rq = umem->fq_reuse; - - rq->handles[rq->length++] = addr; -} - -/* Handle the offset appropriately depending on aligned or unaligned mode. - * For unaligned mode, we store the offset in the upper 16-bits of the address. - * For aligned mode, we simply add the offset to the address. - */ -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 address, - u64 offset) -{ - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) - return address + (offset << XSK_UNALIGNED_BUF_OFFSET_SHIFT); - else - return address + offset; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return umem->chunk_size_nohr; -} - static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) { return XDP_PACKET_HEADROOM + umem->headroom; @@ -192,20 +111,6 @@ static inline void xsk_buff_raw_dma_sync_for_device(struct xdp_umem *umem, #else -static inline bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr(struct xdp_umem *umem) -{ -} - static inline void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries) { } @@ -220,55 +125,12 @@ static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) { } -static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) -{ - return NULL; -} - -static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( - struct xdp_umem *umem, struct xdp_umem_fq_reuse *newq) -{ - return NULL; -} - -static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) -{ -} - static inline struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev, u16 queue_id) { return NULL; } -static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) -{ - return NULL; -} - -static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) -{ - return 0; -} - -static inline bool xsk_umem_has_addrs_rq(struct xdp_umem *umem, u32 cnt) -{ - return false; -} - -static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) -{ - return NULL; -} - -static inline void xsk_umem_release_addr_rq(struct xdp_umem *umem) -{ -} - -static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) -{ -} - static inline void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { } @@ -290,17 +152,6 @@ static inline bool xsk_umem_uses_need_wakeup(struct xdp_umem *umem) return false; } -static inline u64 xsk_umem_adjust_offset(struct xdp_umem *umem, u64 handle, - u64 offset) -{ - return 0; -} - -static inline u32 xsk_umem_xdp_frame_sz(struct xdp_umem *umem) -{ - return 0; -} - static inline u32 xsk_umem_get_headroom(struct xdp_umem *umem) { return 0; diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h index 48547a12fa27..b73d3e141323 100644 --- a/include/trace/events/xdp.h +++ b/include/trace/events/xdp.h @@ -287,7 +287,6 @@ TRACE_EVENT(xdp_devmap_xmit, FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ - FN(ZERO_COPY) \ FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ diff --git a/net/core/xdp.c b/net/core/xdp.c index f0ce8b195193..a8c2f243367d 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -110,27 +110,6 @@ static void mem_allocator_disconnect(void *allocator) mutex_unlock(&mem_id_lock); } -static void mem_id_disconnect(int id) -{ - struct xdp_mem_allocator *xa; - - mutex_lock(&mem_id_lock); - - xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params); - if (!xa) { - mutex_unlock(&mem_id_lock); - WARN(1, "Request remove non-existing id(%d), driver bug?", id); - return; - } - - trace_mem_disconnect(xa); - - if (!rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params)) - call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free); - - mutex_unlock(&mem_id_lock); -} - void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) { struct xdp_mem_allocator *xa; @@ -144,9 +123,6 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq) if (id == 0) return; - if (xdp_rxq->mem.type == MEM_TYPE_ZERO_COPY) - return mem_id_disconnect(id); - if (xdp_rxq->mem.type == MEM_TYPE_PAGE_POOL) { rcu_read_lock(); xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params); @@ -302,7 +278,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq, xdp_rxq->mem.type = type; if (!allocator) { - if (type == MEM_TYPE_PAGE_POOL || type == MEM_TYPE_ZERO_COPY) + if (type == MEM_TYPE_PAGE_POOL) return -EINVAL; /* Setup time check page_pool req */ return 0; } @@ -362,7 +338,7 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * of xdp_frames/pages in those cases. */ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - unsigned long handle, struct xdp_buff *xdp) + struct xdp_buff *xdp) { struct xdp_mem_allocator *xa; struct page *page; @@ -384,14 +360,6 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; - case MEM_TYPE_ZERO_COPY: - /* NB! Only valid from an xdp_buff! */ - rcu_read_lock(); - /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - xa->zc_alloc->free(xa->zc_alloc, handle); - rcu_read_unlock(); - break; case MEM_TYPE_XSK_BUFF_POOL: /* NB! Only valid from an xdp_buff! */ xsk_buff_free(xdp); @@ -404,19 +372,19 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, 0, NULL); + __xdp_return(xdpf->data, &xdpf->mem, false, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, 0, NULL); + __xdp_return(xdpf->data, &xdpf->mem, true, NULL); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp->handle, xdp); + __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); } EXPORT_SYMBOL_GPL(xdp_return_buff); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 7f04688045d5..19e59d1a5e9f 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -179,37 +179,6 @@ void xdp_umem_clear_dev(struct xdp_umem *umem) umem->zc = false; } -static void xdp_umem_unmap_pages(struct xdp_umem *umem) -{ - unsigned int i; - - for (i = 0; i < umem->npgs; i++) - if (PageHighMem(umem->pgs[i])) - vunmap(umem->pages[i].addr); -} - -static int xdp_umem_map_pages(struct xdp_umem *umem) -{ - unsigned int i; - void *addr; - - for (i = 0; i < umem->npgs; i++) { - if (PageHighMem(umem->pgs[i])) - addr = vmap(&umem->pgs[i], 1, VM_MAP, PAGE_KERNEL); - else - addr = page_address(umem->pgs[i]); - - if (!addr) { - xdp_umem_unmap_pages(umem); - return -ENOMEM; - } - - umem->pages[i].addr = addr; - } - - return 0; -} - static void xdp_umem_unpin_pages(struct xdp_umem *umem) { unpin_user_pages_dirty_lock(umem->pgs, umem->npgs, true); @@ -244,14 +213,9 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } - xsk_reuseq_destroy(umem); xp_destroy(umem->pool); - xdp_umem_unmap_pages(umem); xdp_umem_unpin_pages(umem); - kvfree(umem->pages); - umem->pages = NULL; - xdp_umem_unaccount_pages(umem); kfree(umem); } @@ -385,11 +349,8 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (headroom >= chunk_size - XDP_PACKET_HEADROOM) return -EINVAL; - umem->chunk_mask = unaligned_chunks ? XSK_UNALIGNED_BUF_ADDR_MASK - : ~((u64)chunk_size - 1); umem->size = size; umem->headroom = headroom; - umem->chunk_size_nohr = chunk_size - headroom; umem->chunk_size = chunk_size; umem->npgs = size / PAGE_SIZE; umem->pgs = NULL; @@ -408,29 +369,14 @@ static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr) if (err) goto out_account; - umem->pages = kvcalloc(umem->npgs, sizeof(*umem->pages), - GFP_KERNEL_ACCOUNT); - if (!umem->pages) { - err = -ENOMEM; - goto out_pin; - } - - err = xdp_umem_map_pages(umem); - if (err) - goto out_pages; - umem->pool = xp_create(umem->pgs, umem->npgs, chunks, chunk_size, headroom, size, unaligned_chunks); if (!umem->pool) { err = -ENOMEM; - goto out_unmap; + goto out_pin; } return 0; -out_unmap: - xdp_umem_unmap_pages(umem); -out_pages: - kvfree(umem->pages); out_pin: xdp_umem_unpin_pages(umem); out_account: diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 6933f0d494ba..3f2ab732ab8b 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -39,24 +39,6 @@ bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) READ_ONCE(xs->umem->fq); } -bool xsk_umem_has_addrs(struct xdp_umem *umem, u32 cnt) -{ - return xskq_cons_has_entries(umem->fq, cnt); -} -EXPORT_SYMBOL(xsk_umem_has_addrs); - -bool xsk_umem_peek_addr(struct xdp_umem *umem, u64 *addr) -{ - return xskq_cons_peek_addr(umem->fq, addr, umem); -} -EXPORT_SYMBOL(xsk_umem_peek_addr); - -void xsk_umem_release_addr(struct xdp_umem *umem) -{ - xskq_cons_release(umem->fq); -} -EXPORT_SYMBOL(xsk_umem_release_addr); - void xsk_set_rx_need_wakeup(struct xdp_umem *umem) { if (umem->need_wakeup & XDP_WAKEUP_RX) @@ -203,8 +185,7 @@ static int xsk_rcv(struct xdp_sock *xs, struct xdp_buff *xdp, len = xdp->data_end - xdp->data; - return xdp->rxq->mem.type == MEM_TYPE_ZERO_COPY || - xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? + return xdp->rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL ? __xsk_rcv_zc(xs, xdp, len) : __xsk_rcv(xs, xdp, len, explicit_free); } @@ -588,24 +569,6 @@ static struct socket *xsk_lookup_xsk_from_fd(int fd) return sock; } -/* Check if umem pages are contiguous. - * If zero-copy mode, use the DMA address to do the page contiguity check - * For all other modes we use addr (kernel virtual address) - * Store the result in the low bits of addr. - */ -static void xsk_check_page_contiguity(struct xdp_umem *umem, u32 flags) -{ - struct xdp_umem_page *pgs = umem->pages; - int i, is_contig; - - for (i = 0; i < umem->npgs - 1; i++) { - is_contig = (flags & XDP_ZEROCOPY) ? - (pgs[i].dma + PAGE_SIZE == pgs[i + 1].dma) : - (pgs[i].addr + PAGE_SIZE == pgs[i + 1].addr); - pgs[i].addr += is_contig << XSK_NEXT_PG_CONTIG_SHIFT; - } -} - static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) { struct sockaddr_xdp *sxdp = (struct sockaddr_xdp *)addr; @@ -688,23 +651,14 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } else { /* This xsk has its own umem. */ - xskq_set_umem(xs->umem->fq, xs->umem->size, - xs->umem->chunk_mask); - xskq_set_umem(xs->umem->cq, xs->umem->size, - xs->umem->chunk_mask); - err = xdp_umem_assign_dev(xs->umem, dev, qid, flags); if (err) goto out_unlock; - - xsk_check_page_contiguity(xs->umem, flags); } xs->dev = dev; xs->zc = xs->umem->zc; xs->queue_id = qid; - xskq_set_umem(xs->rx, xs->umem->size, xs->umem->chunk_mask); - xskq_set_umem(xs->tx, xs->umem->size, xs->umem->chunk_mask); xdp_add_sk_umem(xs->umem, xs); out_unlock: diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index e214a5795a62..89dae78865e7 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -8,6 +8,13 @@ #include "xsk_queue.h" +/* Masks for xdp_umem_page flags. + * The low 12-bits of the addr will be 0 since this is the page address, so we + * can use them for flags. + */ +#define XSK_NEXT_PG_CONTIG_SHIFT 0 +#define XSK_NEXT_PG_CONTIG_MASK BIT_ULL(XSK_NEXT_PG_CONTIG_SHIFT) + struct xsk_buff_pool { struct xsk_queue *fq; struct list_head free_list; diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 554b1ebb4d02..6cf9586e5027 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -10,15 +10,6 @@ #include "xsk_queue.h" -void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask) -{ - if (!q) - return; - - q->umem_size = umem_size; - q->chunk_mask = chunk_mask; -} - static size_t xskq_get_ring_size(struct xsk_queue *q, bool umem_queue) { struct xdp_umem_ring *umem_ring; @@ -64,56 +55,3 @@ void xskq_destroy(struct xsk_queue *q) page_frag_free(q->ring); kfree(q); } - -struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) -{ - struct xdp_umem_fq_reuse *newq; - - /* Check for overflow */ - if (nentries > (u32)roundup_pow_of_two(nentries)) - return NULL; - nentries = roundup_pow_of_two(nentries); - - newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL); - if (!newq) - return NULL; - memset(newq, 0, offsetof(typeof(*newq), handles)); - - newq->nentries = nentries; - return newq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_prepare); - -struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, - struct xdp_umem_fq_reuse *newq) -{ - struct xdp_umem_fq_reuse *oldq = umem->fq_reuse; - - if (!oldq) { - umem->fq_reuse = newq; - return NULL; - } - - if (newq->nentries < oldq->length) - return newq; - - memcpy(newq->handles, oldq->handles, - array_size(oldq->length, sizeof(u64))); - newq->length = oldq->length; - - umem->fq_reuse = newq; - return oldq; -} -EXPORT_SYMBOL_GPL(xsk_reuseq_swap); - -void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) -{ - kvfree(rq); -} -EXPORT_SYMBOL_GPL(xsk_reuseq_free); - -void xsk_reuseq_destroy(struct xdp_umem *umem) -{ - xsk_reuseq_free(umem->fq_reuse); - umem->fq_reuse = NULL; -} diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index 9151aef7dbca..16bf15864788 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -32,8 +32,6 @@ struct xdp_umem_ring { }; struct xsk_queue { - u64 chunk_mask; - u64 umem_size; u32 ring_mask; u32 nentries; u32 cached_prod; @@ -106,90 +104,6 @@ struct xsk_queue { /* Functions that read and validate content from consumer rings. */ -static inline bool xskq_cons_crosses_non_contig_pg(struct xdp_umem *umem, - u64 addr, - u64 length) -{ - bool cross_pg = (addr & (PAGE_SIZE - 1)) + length > PAGE_SIZE; - bool next_pg_contig = - (unsigned long)umem->pages[(addr >> PAGE_SHIFT)].addr & - XSK_NEXT_PG_CONTIG_MASK; - - return cross_pg && !next_pg_contig; -} - -static inline bool xskq_cons_is_valid_unaligned(struct xsk_queue *q, - u64 addr, - u64 length, - struct xdp_umem *umem) -{ - u64 base_addr = xsk_umem_extract_addr(addr); - - addr = xsk_umem_add_offset_to_addr(addr); - if (base_addr >= q->umem_size || addr >= q->umem_size || - xskq_cons_crosses_non_contig_pg(umem, addr, length)) { - q->invalid_descs++; - return false; - } - - return true; -} - -static inline bool xskq_cons_is_valid_addr(struct xsk_queue *q, u64 addr) -{ - if (addr >= q->umem_size) { - q->invalid_descs++; - return false; - } - - return true; -} - -static inline bool xskq_cons_read_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - while (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; - - *addr = ring->desc[idx] & q->chunk_mask; - - if (umem->flags & XDP_UMEM_UNALIGNED_CHUNK_FLAG) { - if (xskq_cons_is_valid_unaligned(q, *addr, - umem->chunk_size_nohr, - umem)) - return true; - goto out; - } - - if (xskq_cons_is_valid_addr(q, *addr)) - return true; - -out: - q->cached_cons++; - } - - return false; -} - -static inline bool xskq_cons_read_addr_aligned(struct xsk_queue *q, u64 *addr) -{ - struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - - while (q->cached_cons != q->cached_prod) { - u32 idx = q->cached_cons & q->ring_mask; - - *addr = ring->desc[idx]; - if (xskq_cons_is_valid_addr(q, *addr)) - return true; - - q->cached_cons++; - } - - return false; -} - static inline bool xskq_cons_read_addr_unchecked(struct xsk_queue *q, u64 *addr) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; @@ -267,21 +181,6 @@ static inline bool xskq_cons_has_entries(struct xsk_queue *q, u32 cnt) return entries >= cnt; } -static inline bool xskq_cons_peek_addr(struct xsk_queue *q, u64 *addr, - struct xdp_umem *umem) -{ - if (q->cached_prod == q->cached_cons) - xskq_cons_get_entries(q); - return xskq_cons_read_addr(q, addr, umem); -} - -static inline bool xskq_cons_peek_addr_aligned(struct xsk_queue *q, u64 *addr) -{ - if (q->cached_prod == q->cached_cons) - xskq_cons_get_entries(q); - return xskq_cons_read_addr_aligned(q, addr); -} - static inline bool xskq_cons_peek_addr_unchecked(struct xsk_queue *q, u64 *addr) { if (q->cached_prod == q->cached_cons) @@ -410,11 +309,7 @@ static inline u64 xskq_nb_invalid_descs(struct xsk_queue *q) return q ? q->invalid_descs : 0; } -void xskq_set_umem(struct xsk_queue *q, u64 umem_size, u64 chunk_mask); struct xsk_queue *xskq_create(u32 nentries, bool umem_queue); void xskq_destroy(struct xsk_queue *q_ops); -/* Executed by the core when the entire UMEM gets freed */ -void xsk_reuseq_destroy(struct xdp_umem *umem); - #endif /* _LINUX_XSK_QUEUE_H */ -- cgit v1.2.3 From 82c41671ca4f597b6ff05bd5d118161deec26e07 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Wed, 20 May 2020 21:21:01 +0200 Subject: xdp: Simplify xdp_return_{frame, frame_rx_napi, buff} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The xdp_return_{frame,frame_rx_napi,buff} function are never used, except in xdp_convert_zc_to_xdp_frame(), by the MEM_TYPE_XSK_BUFF_POOL memory type. To simplify and reduce code, change so that xdp_convert_zc_to_xdp_frame() calls xsk_buff_free() directly since the type is know, and remove MEM_TYPE_XSK_BUFF_POOL from the switch statement in __xdp_return() function. Suggested-by: Maxim Mikityanskiy Signed-off-by: Björn Töpel Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200520192103.355233-14-bjorn.topel@gmail.com --- net/core/xdp.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/xdp.c b/net/core/xdp.c index a8c2f243367d..90f44f382115 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -335,10 +335,11 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); * scenarios (e.g. queue full), it is possible to return the xdp_frame * while still leveraging this protection. The @napi_direct boolean * is used for those calls sites. Thus, allowing for faster recycling - * of xdp_frames/pages in those cases. + * of xdp_frames/pages in those cases. This path is never used by the + * MEM_TYPE_XSK_BUFF_POOL memory type, so it's explicitly not part of + * the switch-statement. */ -static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, - struct xdp_buff *xdp) +static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct) { struct xdp_mem_allocator *xa; struct page *page; @@ -360,33 +361,29 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, page = virt_to_page(data); /* Assumes order0 page*/ put_page(page); break; - case MEM_TYPE_XSK_BUFF_POOL: - /* NB! Only valid from an xdp_buff! */ - xsk_buff_free(xdp); - break; default: /* Not possible, checked in xdp_rxq_info_reg_mem_model() */ + WARN(1, "Incorrect XDP memory type (%d) usage", mem->type); break; } } void xdp_return_frame(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, false, NULL); + __xdp_return(xdpf->data, &xdpf->mem, false); } EXPORT_SYMBOL_GPL(xdp_return_frame); void xdp_return_frame_rx_napi(struct xdp_frame *xdpf) { - __xdp_return(xdpf->data, &xdpf->mem, true, NULL); + __xdp_return(xdpf->data, &xdpf->mem, true); } EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi); void xdp_return_buff(struct xdp_buff *xdp) { - __xdp_return(xdp->data, &xdp->rxq->mem, true, xdp); + __xdp_return(xdp->data, &xdp->rxq->mem, true); } -EXPORT_SYMBOL_GPL(xdp_return_buff); /* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) @@ -467,7 +464,7 @@ struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp) xdpf->metasize = metasize; xdpf->mem.type = MEM_TYPE_PAGE_ORDER0; - xdp_return_buff(xdp); + xsk_buff_free(xdp); return xdpf; } EXPORT_SYMBOL_GPL(xdp_convert_zc_to_xdp_frame); -- cgit v1.2.3 From 1274e1cc42264d4e629841e4f182795cb0becfd2 Mon Sep 17 00:00:00 2001 From: Roopa Prabhu Date: Thu, 21 May 2020 22:26:14 -0700 Subject: vxlan: ecmp support for mac fdb entries Todays vxlan mac fdb entries can point to multiple remote ips (rdsts) with the sole purpose of replicating broadcast-multicast and unknown unicast packets to those remote ips. E-VPN multihoming [1,2,3] requires bridged vxlan traffic to be load balanced to remote switches (vteps) belonging to the same multi-homed ethernet segment (E-VPN multihoming is analogous to multi-homed LAG implementations, but with the inter-switch peerlink replaced with a vxlan tunnel). In other words it needs support for mac ecmp. Furthermore, for faster convergence, E-VPN multihoming needs the ability to update fdb ecmp nexthops independent of the fdb entries. New route nexthop API is perfect for this usecase. This patch extends the vxlan fdb code to take a nexthop id pointing to an ecmp nexthop group. Changes include: - New NDA_NH_ID attribute for fdbs - Use the newly added fdb nexthop groups - makes vxlan rdsts and nexthop handling code mutually exclusive - since this is a new use-case and the requirement is for ecmp nexthop groups, the fdb add and update path checks that the nexthop is really an ecmp nexthop group. This check can be relaxed in the future, if we want to introduce replication fdb nexthop groups and allow its use in lieu of current rdst lists. - fdb update requests with nexthop id's only allowed for existing fdb's that have nexthop id's - learning will not override an existing fdb entry with nexthop group - I have wrapped the switchdev offload code around the presence of rdst [1] E-VPN RFC https://tools.ietf.org/html/rfc7432 [2] E-VPN with vxlan https://tools.ietf.org/html/rfc8365 [3] http://vger.kernel.org/lpc_net2018_talks/scaling_bridge_fdb_database_slidesV3.pdf Includes a null check fix in vxlan_xmit from Nikolay v2 - Fixed build issue: Reported-by: kbuild test robot Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- drivers/net/vxlan.c | 306 +++++++++++++++++++++++++++++++++-------- include/net/vxlan.h | 25 ++++ include/uapi/linux/neighbour.h | 1 + net/core/neighbour.c | 2 + 4 files changed, 275 insertions(+), 59 deletions(-) (limited to 'net/core') diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index a5b415fed11e..754e00240eea 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -26,6 +26,7 @@ #include #include #include +#include #if IS_ENABLED(CONFIG_IPV6) #include @@ -78,6 +79,8 @@ struct vxlan_fdb { u16 state; /* see ndm_state */ __be32 vni; u16 flags; /* see ndm_flags and below */ + struct list_head nh_list; + struct nexthop __rcu *nh; }; #define NTF_VXLAN_ADDED_BY_USER 0x100 @@ -174,11 +177,15 @@ static inline struct hlist_head *vs_head(struct net *net, __be16 port) */ static inline struct vxlan_rdst *first_remote_rcu(struct vxlan_fdb *fdb) { + if (rcu_access_pointer(fdb->nh)) + return NULL; return list_entry_rcu(fdb->remotes.next, struct vxlan_rdst, list); } static inline struct vxlan_rdst *first_remote_rtnl(struct vxlan_fdb *fdb) { + if (rcu_access_pointer(fdb->nh)) + return NULL; return list_first_entry(&fdb->remotes, struct vxlan_rdst, list); } @@ -251,9 +258,10 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, { unsigned long now = jiffies; struct nda_cacheinfo ci; + bool send_ip, send_eth; struct nlmsghdr *nlh; + struct nexthop *nh; struct ndmsg *ndm; - bool send_ip, send_eth; nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags); if (nlh == NULL) @@ -264,16 +272,21 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, send_eth = send_ip = true; + nh = rcu_dereference_rtnl(fdb->nh); if (type == RTM_GETNEIGH) { - send_ip = !vxlan_addr_any(&rdst->remote_ip); + if (rdst) { + send_ip = !vxlan_addr_any(&rdst->remote_ip); + ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET; + } else if (nh) { + ndm->ndm_family = nexthop_get_family(nh); + } send_eth = !is_zero_ether_addr(fdb->eth_addr); - ndm->ndm_family = send_ip ? rdst->remote_ip.sa.sa_family : AF_INET; } else ndm->ndm_family = AF_BRIDGE; ndm->ndm_state = fdb->state; ndm->ndm_ifindex = vxlan->dev->ifindex; ndm->ndm_flags = fdb->flags; - if (rdst->offloaded) + if (rdst && rdst->offloaded) ndm->ndm_flags |= NTF_OFFLOADED; ndm->ndm_type = RTN_UNICAST; @@ -284,23 +297,30 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr)) goto nla_put_failure; + if (nh) { + if (nla_put_u32(skb, NDA_NH_ID, nh->id)) + goto nla_put_failure; + } else if (rdst) { + if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, + &rdst->remote_ip)) + goto nla_put_failure; + + if (rdst->remote_port && + rdst->remote_port != vxlan->cfg.dst_port && + nla_put_be16(skb, NDA_PORT, rdst->remote_port)) + goto nla_put_failure; + if (rdst->remote_vni != vxlan->default_dst.remote_vni && + nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) + goto nla_put_failure; + if (rdst->remote_ifindex && + nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) + goto nla_put_failure; + } - if (send_ip && vxlan_nla_put_addr(skb, NDA_DST, &rdst->remote_ip)) - goto nla_put_failure; - - if (rdst->remote_port && rdst->remote_port != vxlan->cfg.dst_port && - nla_put_be16(skb, NDA_PORT, rdst->remote_port)) - goto nla_put_failure; - if (rdst->remote_vni != vxlan->default_dst.remote_vni && - nla_put_u32(skb, NDA_VNI, be32_to_cpu(rdst->remote_vni))) - goto nla_put_failure; if ((vxlan->cfg.flags & VXLAN_F_COLLECT_METADATA) && fdb->vni && nla_put_u32(skb, NDA_SRC_VNI, be32_to_cpu(fdb->vni))) goto nla_put_failure; - if (rdst->remote_ifindex && - nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex)) - goto nla_put_failure; ci.ndm_used = jiffies_to_clock_t(now - fdb->used); ci.ndm_confirmed = 0; @@ -401,7 +421,7 @@ static int vxlan_fdb_notify(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, { int err; - if (swdev_notify) { + if (swdev_notify && rd) { switch (type) { case RTM_NEWNEIGH: err = vxlan_fdb_switchdev_call_notifiers(vxlan, fdb, rd, @@ -805,6 +825,8 @@ static struct vxlan_fdb *vxlan_fdb_alloc(const u8 *mac, __u16 state, f->flags = ndm_flags; f->updated = f->used = jiffies; f->vni = src_vni; + f->nh = NULL; + INIT_LIST_HEAD(&f->nh_list); INIT_LIST_HEAD(&f->remotes); memcpy(f->eth_addr, mac, ETH_ALEN); @@ -819,11 +841,78 @@ static void vxlan_fdb_insert(struct vxlan_dev *vxlan, const u8 *mac, vxlan_fdb_head(vxlan, mac, src_vni)); } +static int vxlan_fdb_nh_update(struct vxlan_dev *vxlan, struct vxlan_fdb *fdb, + u32 nhid, struct netlink_ext_ack *extack) +{ + struct nexthop *old_nh = rtnl_dereference(fdb->nh); + struct nh_group *nhg; + struct nexthop *nh; + int err = -EINVAL; + + if (old_nh && old_nh->id == nhid) + return 0; + + nh = nexthop_find_by_id(vxlan->net, nhid); + if (!nh) { + NL_SET_ERR_MSG(extack, "Nexthop id does not exist"); + goto err_inval; + } + + if (nh) { + if (!nexthop_get(nh)) { + NL_SET_ERR_MSG(extack, "Nexthop has been deleted"); + nh = NULL; + goto err_inval; + } + if (!nh->is_fdb_nh) { + NL_SET_ERR_MSG(extack, "Nexthop is not a fdb nexthop"); + goto err_inval; + } + + if (!nh->is_group || !nh->nh_grp->mpath) { + NL_SET_ERR_MSG(extack, "Nexthop is not a multipath group"); + goto err_inval; + } + + /* check nexthop group family */ + nhg = rtnl_dereference(nh->nh_grp); + switch (vxlan->default_dst.remote_ip.sa.sa_family) { + case AF_INET: + if (!nhg->has_v4) { + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); + goto err_inval; + } + break; + case AF_INET6: + if (nhg->has_v4) { + err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "Nexthop group family not supported"); + goto err_inval; + } + } + } + + if (old_nh) { + list_del_rcu(&fdb->nh_list); + nexthop_put(old_nh); + } + rcu_assign_pointer(fdb->nh, nh); + list_add_tail_rcu(&fdb->nh_list, &nh->fdb_list); + return 1; + +err_inval: + if (nh) + nexthop_put(nh); + return err; +} + static int vxlan_fdb_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __be16 port, __be32 src_vni, __be32 vni, __u32 ifindex, __u16 ndm_flags, - struct vxlan_fdb **fdb) + u32 nhid, struct vxlan_fdb **fdb, + struct netlink_ext_ack *extack) { struct vxlan_rdst *rd = NULL; struct vxlan_fdb *f; @@ -838,20 +927,33 @@ static int vxlan_fdb_create(struct vxlan_dev *vxlan, if (!f) return -ENOMEM; - rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); - if (rc < 0) { - kfree(f); - return rc; - } + if (nhid) + rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); + else + rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); + if (rc < 0) + goto errout; *fdb = f; return 0; + +errout: + kfree(f); + return rc; } static void __vxlan_fdb_free(struct vxlan_fdb *f) { struct vxlan_rdst *rd, *nd; + struct nexthop *nh; + + nh = rcu_dereference_raw(f->nh); + if (nh) { + rcu_assign_pointer(f->nh, NULL); + list_del_rcu(&f->nh_list); + nexthop_put(nh); + } list_for_each_entry_safe(rd, nd, &f->remotes, list) { dst_cache_destroy(&rd->dst_cache); @@ -875,10 +977,15 @@ static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, netdev_dbg(vxlan->dev, "delete %pM\n", f->eth_addr); --vxlan->addrcnt; - if (do_notify) - list_for_each_entry(rd, &f->remotes, list) - vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, + if (do_notify) { + if (rcu_access_pointer(f->nh)) + vxlan_fdb_notify(vxlan, f, NULL, RTM_DELNEIGH, swdev_notify, NULL); + else + list_for_each_entry(rd, &f->remotes, list) + vxlan_fdb_notify(vxlan, f, rd, RTM_DELNEIGH, + swdev_notify, NULL); + } hlist_del_rcu(&f->hlist); call_rcu(&f->rcu, vxlan_fdb_free); @@ -897,7 +1004,7 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, __u16 state, __u16 flags, __be16 port, __be32 vni, __u32 ifindex, __u16 ndm_flags, - struct vxlan_fdb *f, + struct vxlan_fdb *f, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -908,6 +1015,18 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, int rc = 0; int err; + if (nhid && !rcu_access_pointer(f->nh)) { + NL_SET_ERR_MSG(extack, + "Cannot replace an existing non nexthop fdb with a nexthop"); + return -EOPNOTSUPP; + } + + if (nhid && (flags & NLM_F_APPEND)) { + NL_SET_ERR_MSG(extack, + "Cannot append to a nexthop fdb"); + return -EOPNOTSUPP; + } + /* Do not allow an externally learned entry to take over an entry added * by the user. */ @@ -929,10 +1048,17 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, /* Only change unicasts */ if (!(is_multicast_ether_addr(f->eth_addr) || is_zero_ether_addr(f->eth_addr))) { - rc = vxlan_fdb_replace(f, ip, port, vni, - ifindex, &oldrd); + if (nhid) { + rc = vxlan_fdb_nh_update(vxlan, f, nhid, extack); + if (rc < 0) + return rc; + } else { + rc = vxlan_fdb_replace(f, ip, port, vni, + ifindex, &oldrd); + } notify |= rc; } else { + NL_SET_ERR_MSG(extack, "Cannot replace non-unicast fdb entries"); return -EOPNOTSUPP; } } @@ -962,6 +1088,8 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, return 0; err_notify: + if (nhid) + return err; if ((flags & NLM_F_REPLACE) && rc) *rd = oldrd; else if ((flags & NLM_F_APPEND) && rc) { @@ -975,7 +1103,7 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, - __u32 ifindex, __u16 ndm_flags, + __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -990,7 +1118,7 @@ static int vxlan_fdb_update_create(struct vxlan_dev *vxlan, netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni, - vni, ifindex, fdb_flags, &f); + vni, ifindex, fdb_flags, nhid, &f, extack); if (rc < 0) return rc; @@ -1012,7 +1140,7 @@ static int vxlan_fdb_update(struct vxlan_dev *vxlan, const u8 *mac, union vxlan_addr *ip, __u16 state, __u16 flags, __be16 port, __be32 src_vni, __be32 vni, - __u32 ifindex, __u16 ndm_flags, + __u32 ifindex, __u16 ndm_flags, u32 nhid, bool swdev_notify, struct netlink_ext_ack *extack) { @@ -1028,14 +1156,15 @@ static int vxlan_fdb_update(struct vxlan_dev *vxlan, return vxlan_fdb_update_existing(vxlan, ip, state, flags, port, vni, ifindex, ndm_flags, f, - swdev_notify, extack); + nhid, swdev_notify, extack); } else { if (!(flags & NLM_F_CREATE)) return -ENOENT; return vxlan_fdb_update_create(vxlan, mac, ip, state, flags, port, src_vni, vni, ifindex, - ndm_flags, swdev_notify, extack); + ndm_flags, nhid, swdev_notify, + extack); } } @@ -1049,7 +1178,7 @@ static void vxlan_fdb_dst_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, union vxlan_addr *ip, __be16 *port, __be32 *src_vni, - __be32 *vni, u32 *ifindex) + __be32 *vni, u32 *ifindex, u32 *nhid) { struct net *net = dev_net(vxlan->dev); int err; @@ -1109,6 +1238,11 @@ static int vxlan_fdb_parse(struct nlattr *tb[], struct vxlan_dev *vxlan, *ifindex = 0; } + if (tb[NDA_NH_ID]) + *nhid = nla_get_u32(tb[NDA_NH_ID]); + else + *nhid = 0; + return 0; } @@ -1123,7 +1257,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], union vxlan_addr ip; __be16 port; __be32 src_vni, vni; - u32 ifindex; + u32 ifindex, nhid; u32 hash_index; int err; @@ -1133,10 +1267,11 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], return -EINVAL; } - if (tb[NDA_DST] == NULL) + if (!tb || (!tb[NDA_DST] && !tb[NDA_NH_ID])) return -EINVAL; - err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, + &nhid); if (err) return err; @@ -1148,7 +1283,7 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, port, src_vni, vni, ifindex, ndm->ndm_flags | NTF_VXLAN_ADDED_BY_USER, - true, extack); + nhid, true, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; @@ -1159,8 +1294,8 @@ static int __vxlan_fdb_delete(struct vxlan_dev *vxlan, __be16 port, __be32 src_vni, __be32 vni, u32 ifindex, bool swdev_notify) { - struct vxlan_fdb *f; struct vxlan_rdst *rd = NULL; + struct vxlan_fdb *f; int err = -ENOENT; f = vxlan_find_mac(vxlan, addr, src_vni); @@ -1195,12 +1330,13 @@ static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[], struct vxlan_dev *vxlan = netdev_priv(dev); union vxlan_addr ip; __be32 src_vni, vni; - __be16 port; - u32 ifindex; + u32 ifindex, nhid; u32 hash_index; + __be16 port; int err; - err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex); + err = vxlan_fdb_parse(tb, vxlan, &ip, &port, &src_vni, &vni, &ifindex, + &nhid); if (err) return err; @@ -1228,6 +1364,17 @@ static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) { struct vxlan_rdst *rd; + if (rcu_access_pointer(f->nh)) { + err = vxlan_fdb_info(skb, vxlan, f, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH, + NLM_F_MULTI, NULL); + if (err < 0) + goto out; + continue; + } + list_for_each_entry_rcu(rd, &f->remotes, list) { if (*idx < cb->args[2]) goto skip; @@ -1311,6 +1458,10 @@ static bool vxlan_snoop(struct net_device *dev, if (f->state & (NUD_PERMANENT | NUD_NOARP)) return true; + /* Don't override an fdb with nexthop with a learnt entry */ + if (rcu_access_pointer(f->nh)) + return true; + if (net_ratelimit()) netdev_info(dev, "%pM migrated from %pIS to %pIS\n", @@ -1333,7 +1484,7 @@ static bool vxlan_snoop(struct net_device *dev, vxlan->cfg.dst_port, vni, vxlan->default_dst.remote_vni, - ifindex, NTF_SELF, true, NULL); + ifindex, NTF_SELF, 0, true, NULL); spin_unlock(&vxlan->hash_lock[hash_index]); } @@ -2616,6 +2767,38 @@ tx_error: kfree_skb(skb); } +static void vxlan_xmit_nh(struct sk_buff *skb, struct net_device *dev, + struct vxlan_fdb *f, __be32 vni, bool did_rsc) +{ + struct vxlan_rdst nh_rdst; + struct nexthop *nh; + bool do_xmit; + u32 hash; + + memset(&nh_rdst, 0, sizeof(struct vxlan_rdst)); + hash = skb_get_hash(skb); + + rcu_read_lock(); + nh = rcu_dereference(f->nh); + if (!nh) { + rcu_read_unlock(); + goto drop; + } + do_xmit = vxlan_fdb_nh_path_select(nh, hash, &nh_rdst); + rcu_read_unlock(); + + if (likely(do_xmit)) + vxlan_xmit_one(skb, dev, vni, &nh_rdst, did_rsc); + else + goto drop; + + return; + +drop: + dev->stats.tx_dropped++; + dev_kfree_skb(skb); +} + /* Transmit local packets over Vxlan * * Outer IP header inherits ECN and DF from inner header. @@ -2692,22 +2875,27 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev) } } - list_for_each_entry_rcu(rdst, &f->remotes, list) { - struct sk_buff *skb1; + if (rcu_access_pointer(f->nh)) { + vxlan_xmit_nh(skb, dev, f, + (vni ? : vxlan->default_dst.remote_vni), did_rsc); + } else { + list_for_each_entry_rcu(rdst, &f->remotes, list) { + struct sk_buff *skb1; - if (!fdst) { - fdst = rdst; - continue; + if (!fdst) { + fdst = rdst; + continue; + } + skb1 = skb_clone(skb, GFP_ATOMIC); + if (skb1) + vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); } - skb1 = skb_clone(skb, GFP_ATOMIC); - if (skb1) - vxlan_xmit_one(skb1, dev, vni, rdst, did_rsc); + if (fdst) + vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); + else + kfree_skb(skb); } - if (fdst) - vxlan_xmit_one(skb, dev, vni, fdst, did_rsc); - else - kfree_skb(skb); return NETDEV_TX_OK; } @@ -3615,7 +3803,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, dst->remote_vni, dst->remote_vni, dst->remote_ifindex, - NTF_SELF, &f); + NTF_SELF, 0, &f, extack); if (err) return err; } @@ -4013,7 +4201,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], vxlan->cfg.dst_port, conf.vni, conf.vni, conf.remote_ifindex, - NTF_SELF, true, extack); + NTF_SELF, 0, true, extack); if (err) { spin_unlock_bh(&vxlan->hash_lock[hash_index]); netdev_adjacent_change_abort(dst->remote_dev, @@ -4335,7 +4523,7 @@ vxlan_fdb_external_learn_add(struct net_device *dev, fdb_info->remote_vni, fdb_info->remote_ifindex, NTF_USE | NTF_SELF | NTF_EXT_LEARNED, - false, extack); + 0, false, extack); spin_unlock_bh(&vxlan->hash_lock[hash_index]); return err; diff --git a/include/net/vxlan.h b/include/net/vxlan.h index 373aadcfea21..3a41627cbdfe 100644 --- a/include/net/vxlan.h +++ b/include/net/vxlan.h @@ -7,6 +7,7 @@ #include #include #include +#include #define IANA_VXLAN_UDP_PORT 4789 @@ -487,4 +488,28 @@ static inline void vxlan_flag_attr_error(int attrtype, #undef VXLAN_FLAG } +static inline bool vxlan_fdb_nh_path_select(struct nexthop *nh, + int hash, + struct vxlan_rdst *rdst) +{ + struct fib_nh_common *nhc; + + nhc = nexthop_path_fdb_result(nh, hash); + if (unlikely(!nhc)) + return false; + + switch (nhc->nhc_gw_family) { + case AF_INET: + rdst->remote_ip.sin.sin_addr.s_addr = nhc->nhc_gw.ipv4; + rdst->remote_ip.sa.sa_family = AF_INET; + break; + case AF_INET6: + rdst->remote_ip.sin6.sin6_addr = nhc->nhc_gw.ipv6; + rdst->remote_ip.sa.sa_family = AF_INET6; + break; + } + + return true; +} + #endif diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index cd144e3099a3..eefcda8ca44e 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -29,6 +29,7 @@ enum { NDA_LINK_NETNSID, NDA_SRC_VNI, NDA_PROTOCOL, /* Originator of entry */ + NDA_NH_ID, __NDA_MAX }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b607ea602774..37e4dba62460 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1771,6 +1771,7 @@ static struct neigh_table *neigh_find_table(int family) } const struct nla_policy nda_policy[NDA_MAX+1] = { + [NDA_UNSPEC] = { .strict_start_type = NDA_NH_ID }, [NDA_DST] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_LLADDR] = { .type = NLA_BINARY, .len = MAX_ADDR_LEN }, [NDA_CACHEINFO] = { .len = sizeof(struct nda_cacheinfo) }, @@ -1781,6 +1782,7 @@ const struct nla_policy nda_policy[NDA_MAX+1] = { [NDA_IFINDEX] = { .type = NLA_U32 }, [NDA_MASTER] = { .type = NLA_U32 }, [NDA_PROTOCOL] = { .type = NLA_U8 }, + [NDA_NH_ID] = { .type = NLA_U32 }, }; static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, -- cgit v1.2.3 From 060b6381efe58478e1d7dfff7a1e76a73a6377db Mon Sep 17 00:00:00 2001 From: Edward Cree Date: Wed, 20 May 2020 19:18:10 +0100 Subject: net: flow_offload: simplify hw stats check handling Make FLOW_ACTION_HW_STATS_DONT_CARE be all bits, rather than none, so that drivers and __flow_action_hw_stats_check can use simple bitwise checks. Pre-fill all actions with DONT_CARE in flow_rule_alloc(), rather than relying on implicit semantics of zero from kzalloc, so that callers which don't configure action stats themselves (i.e. netfilter) get the correct behaviour by default. Only the kernel's internal API semantics change; the TC uAPI is unaffected. v4: move DONT_CARE setting to flow_rule_alloc() for robustness and simplicity. v3: set DONT_CARE in nft and ct offload. v2: rebased on net-next, removed RFC tags. Signed-off-by: Edward Cree Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 8 ++++---- include/net/flow_offload.h | 11 +++++++---- net/core/flow_offload.c | 6 ++++++ 3 files changed, 17 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c index b286fe158820..51e1b3930c56 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c @@ -30,14 +30,14 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp, return -EOPNOTSUPP; act = flow_action_first_entry_get(flow_action); - if (act->hw_stats == FLOW_ACTION_HW_STATS_ANY || - act->hw_stats == FLOW_ACTION_HW_STATS_IMMEDIATE) { + if (act->hw_stats & FLOW_ACTION_HW_STATS_DISABLED) { + /* Nothing to do */ + } else if (act->hw_stats & FLOW_ACTION_HW_STATS_IMMEDIATE) { /* Count action is inserted first */ err = mlxsw_sp_acl_rulei_act_count(mlxsw_sp, rulei, extack); if (err) return err; - } else if (act->hw_stats != FLOW_ACTION_HW_STATS_DISABLED && - act->hw_stats != FLOW_ACTION_HW_STATS_DONT_CARE) { + } else { NL_SET_ERR_MSG_MOD(extack, "Unsupported action HW stats type"); return -EOPNOTSUPP; } diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 4001ffb04f0d..95d633785ef9 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -168,10 +168,11 @@ enum flow_action_hw_stats_bit { FLOW_ACTION_HW_STATS_IMMEDIATE_BIT, FLOW_ACTION_HW_STATS_DELAYED_BIT, FLOW_ACTION_HW_STATS_DISABLED_BIT, + + FLOW_ACTION_HW_STATS_NUM_BITS }; enum flow_action_hw_stats { - FLOW_ACTION_HW_STATS_DONT_CARE = 0, FLOW_ACTION_HW_STATS_IMMEDIATE = BIT(FLOW_ACTION_HW_STATS_IMMEDIATE_BIT), FLOW_ACTION_HW_STATS_DELAYED = BIT(FLOW_ACTION_HW_STATS_DELAYED_BIT), @@ -179,6 +180,7 @@ enum flow_action_hw_stats { FLOW_ACTION_HW_STATS_DELAYED, FLOW_ACTION_HW_STATS_DISABLED = BIT(FLOW_ACTION_HW_STATS_DISABLED_BIT), + FLOW_ACTION_HW_STATS_DONT_CARE = BIT(FLOW_ACTION_HW_STATS_NUM_BITS) - 1, }; typedef void (*action_destr)(void *priv); @@ -340,11 +342,12 @@ __flow_action_hw_stats_check(const struct flow_action *action, return false; action_entry = flow_action_first_entry_get(action); - if (action_entry->hw_stats == FLOW_ACTION_HW_STATS_DONT_CARE) - return true; + + /* Zero is not a legal value for hw_stats, catch anyone passing it */ + WARN_ON_ONCE(!action_entry->hw_stats); if (!check_allow_bit && - action_entry->hw_stats != FLOW_ACTION_HW_STATS_ANY) { + ~action_entry->hw_stats & FLOW_ACTION_HW_STATS_ANY) { NL_SET_ERR_MSG_MOD(extack, "Driver supports only default HW stats type \"any\""); return false; } else if (check_allow_bit && diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index e951b743bed3..e64941c526b1 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -8,6 +8,7 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) { struct flow_rule *rule; + int i; rule = kzalloc(struct_size(rule, action.entries, num_actions), GFP_KERNEL); @@ -15,6 +16,11 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions) return NULL; rule->action.num_entries = num_actions; + /* Pre-fill each action hw_stats with DONT_CARE. + * Caller can override this if it wants stats for a given action. + */ + for (i = 0; i < num_actions; i++) + rule->action.entries[i].hw_stats = FLOW_ACTION_HW_STATS_DONT_CARE; return rule; } -- cgit v1.2.3 From 58cff782cc55eb755826c649976aea9f5f8b3086 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Tue, 26 May 2020 14:29:00 +0200 Subject: flow_dissector: Parse multiple MPLS Label Stack Entries The current MPLS dissector only parses the first MPLS Label Stack Entry (second LSE can be parsed too, but only to set a key_id). This patch adds the possibility to parse several LSEs by making __skb_flow_dissect_mpls() return FLOW_DISSECT_RET_PROTO_AGAIN as long as the Bottom Of Stack bit hasn't been seen, up to a maximum of FLOW_DIS_MPLS_MAX entries. FLOW_DIS_MPLS_MAX is arbitrarily set to 7. This should be enough for many practical purposes, without wasting too much space. To record the parsed values, flow_dissector_key_mpls is modified to store an array of stack entries, instead of just the values of the first one. A bit field, "used_lses", is also added to keep track of the LSEs that have been set. The objective is to avoid defining a new FLOW_DISSECTOR_KEY_MPLS_XX for each level of the MPLS stack. TC flower is adapted for the new struct flow_dissector_key_mpls layout. Matching on several MPLS Label Stack Entries will be added in the next patch. The NFP and MLX5 drivers are also adapted: nfp_flower_compile_mac() and mlx5's parse_tunnel() now verify that the rule only uses the first LSE and fail if it doesn't. Finally, the behaviour of the FLOW_DISSECTOR_KEY_MPLS_ENTROPY key is slightly modified. Instead of recording the first Entropy Label, it now records the last one. This shouldn't have any consequences since there doesn't seem to have any user of FLOW_DISSECTOR_KEY_MPLS_ENTROPY in the tree. We'd probably better do a hash of all parsed MPLS labels instead (excluding reserved labels) anyway. That'd give better entropy and would probably also simplify the code. But that's not the purpose of this patch, so I'm keeping that as a future possible improvement. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller --- .../mellanox/mlx5/core/en/tc_tun_mplsoudp.c | 27 +++++++---- drivers/net/ethernet/netronome/nfp/flower/match.c | 42 ++++++++++++----- include/net/flow_dissector.h | 14 +++++- net/core/flow_dissector.c | 49 +++++++++++++------- net/sched/cls_flower.c | 52 +++++++++++++++------- 5 files changed, 132 insertions(+), 52 deletions(-) (limited to 'net/core') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c index 98ee62e427d2..b4a3c96d34fd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_mplsoudp.c @@ -101,25 +101,36 @@ static int parse_tunnel(struct mlx5e_priv *priv, flow_rule_match_mpls(rule, &match); + /* Only support matching the first LSE */ + if (match.mask->used_lses != 1) + return -EOPNOTSUPP; + MLX5_SET(fte_match_set_misc2, misc2_c, - outer_first_mpls_over_udp.mpls_label, match.mask->mpls_label); + outer_first_mpls_over_udp.mpls_label, + match.mask->ls[0].mpls_label); MLX5_SET(fte_match_set_misc2, misc2_v, - outer_first_mpls_over_udp.mpls_label, match.key->mpls_label); + outer_first_mpls_over_udp.mpls_label, + match.key->ls[0].mpls_label); MLX5_SET(fte_match_set_misc2, misc2_c, - outer_first_mpls_over_udp.mpls_exp, match.mask->mpls_tc); + outer_first_mpls_over_udp.mpls_exp, + match.mask->ls[0].mpls_tc); MLX5_SET(fte_match_set_misc2, misc2_v, - outer_first_mpls_over_udp.mpls_exp, match.key->mpls_tc); + outer_first_mpls_over_udp.mpls_exp, match.key->ls[0].mpls_tc); MLX5_SET(fte_match_set_misc2, misc2_c, - outer_first_mpls_over_udp.mpls_s_bos, match.mask->mpls_bos); + outer_first_mpls_over_udp.mpls_s_bos, + match.mask->ls[0].mpls_bos); MLX5_SET(fte_match_set_misc2, misc2_v, - outer_first_mpls_over_udp.mpls_s_bos, match.key->mpls_bos); + outer_first_mpls_over_udp.mpls_s_bos, + match.key->ls[0].mpls_bos); MLX5_SET(fte_match_set_misc2, misc2_c, - outer_first_mpls_over_udp.mpls_ttl, match.mask->mpls_ttl); + outer_first_mpls_over_udp.mpls_ttl, + match.mask->ls[0].mpls_ttl); MLX5_SET(fte_match_set_misc2, misc2_v, - outer_first_mpls_over_udp.mpls_ttl, match.key->mpls_ttl); + outer_first_mpls_over_udp.mpls_ttl, + match.key->ls[0].mpls_ttl); spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_2; return 0; diff --git a/drivers/net/ethernet/netronome/nfp/flower/match.c b/drivers/net/ethernet/netronome/nfp/flower/match.c index 546bc01d507d..f7f01e2e3dce 100644 --- a/drivers/net/ethernet/netronome/nfp/flower/match.c +++ b/drivers/net/ethernet/netronome/nfp/flower/match.c @@ -74,9 +74,10 @@ nfp_flower_compile_port(struct nfp_flower_in_port *frame, u32 cmsg_port, return 0; } -static void +static int nfp_flower_compile_mac(struct nfp_flower_mac_mpls *ext, - struct nfp_flower_mac_mpls *msk, struct flow_rule *rule) + struct nfp_flower_mac_mpls *msk, struct flow_rule *rule, + struct netlink_ext_ack *extack) { memset(ext, 0, sizeof(struct nfp_flower_mac_mpls)); memset(msk, 0, sizeof(struct nfp_flower_mac_mpls)); @@ -97,14 +98,28 @@ nfp_flower_compile_mac(struct nfp_flower_mac_mpls *ext, u32 t_mpls; flow_rule_match_mpls(rule, &match); - t_mpls = FIELD_PREP(NFP_FLOWER_MASK_MPLS_LB, match.key->mpls_label) | - FIELD_PREP(NFP_FLOWER_MASK_MPLS_TC, match.key->mpls_tc) | - FIELD_PREP(NFP_FLOWER_MASK_MPLS_BOS, match.key->mpls_bos) | + + /* Only support matching the first LSE */ + if (match.mask->used_lses != 1) { + NL_SET_ERR_MSG_MOD(extack, + "unsupported offload: invalid LSE depth for MPLS match offload"); + return -EOPNOTSUPP; + } + + t_mpls = FIELD_PREP(NFP_FLOWER_MASK_MPLS_LB, + match.key->ls[0].mpls_label) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_TC, + match.key->ls[0].mpls_tc) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_BOS, + match.key->ls[0].mpls_bos) | NFP_FLOWER_MASK_MPLS_Q; ext->mpls_lse = cpu_to_be32(t_mpls); - t_mpls = FIELD_PREP(NFP_FLOWER_MASK_MPLS_LB, match.mask->mpls_label) | - FIELD_PREP(NFP_FLOWER_MASK_MPLS_TC, match.mask->mpls_tc) | - FIELD_PREP(NFP_FLOWER_MASK_MPLS_BOS, match.mask->mpls_bos) | + t_mpls = FIELD_PREP(NFP_FLOWER_MASK_MPLS_LB, + match.mask->ls[0].mpls_label) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_TC, + match.mask->ls[0].mpls_tc) | + FIELD_PREP(NFP_FLOWER_MASK_MPLS_BOS, + match.mask->ls[0].mpls_bos) | NFP_FLOWER_MASK_MPLS_Q; msk->mpls_lse = cpu_to_be32(t_mpls); } else if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_BASIC)) { @@ -121,6 +136,8 @@ nfp_flower_compile_mac(struct nfp_flower_mac_mpls *ext, msk->mpls_lse = cpu_to_be32(NFP_FLOWER_MASK_MPLS_Q); } } + + return 0; } static void @@ -461,9 +478,12 @@ int nfp_flower_compile_flow_match(struct nfp_app *app, msk += sizeof(struct nfp_flower_in_port); if (NFP_FLOWER_LAYER_MAC & key_ls->key_layer) { - nfp_flower_compile_mac((struct nfp_flower_mac_mpls *)ext, - (struct nfp_flower_mac_mpls *)msk, - rule); + err = nfp_flower_compile_mac((struct nfp_flower_mac_mpls *)ext, + (struct nfp_flower_mac_mpls *)msk, + rule, extack); + if (err) + return err; + ext += sizeof(struct nfp_flower_mac_mpls); msk += sizeof(struct nfp_flower_mac_mpls); } diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 628383915827..4fb1a69c6ecf 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -59,13 +59,25 @@ struct flow_dissector_key_vlan { __be16 vlan_tpid; }; -struct flow_dissector_key_mpls { +struct flow_dissector_mpls_lse { u32 mpls_ttl:8, mpls_bos:1, mpls_tc:3, mpls_label:20; }; +#define FLOW_DIS_MPLS_MAX 7 +struct flow_dissector_key_mpls { + struct flow_dissector_mpls_lse ls[FLOW_DIS_MPLS_MAX]; /* Label Stack */ + u8 used_lses; /* One bit set for each Label Stack Entry in use */ +}; + +static inline void dissector_set_mpls_lse(struct flow_dissector_key_mpls *mpls, + int lse_index) +{ + mpls->used_lses |= 1 << lse_index; +} + #define FLOW_DIS_TUN_OPTS_MAX 255 /** * struct flow_dissector_key_enc_opts: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5dceed467f64..0aeb33572feb 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -480,47 +480,59 @@ EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, int hlen) + void *target_container, void *data, int nhoff, int hlen, + int lse_index, bool *entropy_label) { - struct flow_dissector_key_keyid *key_keyid; - struct mpls_label *hdr, _hdr[2]; - u32 entry, label; + struct mpls_label *hdr, _hdr; + u32 entry, label, bos; if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) return FLOW_DISSECT_RET_OUT_GOOD; + if (lse_index >= FLOW_DIS_MPLS_MAX) + return FLOW_DISSECT_RET_OUT_GOOD; + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); if (!hdr) return FLOW_DISSECT_RET_OUT_BAD; - entry = ntohl(hdr[0].entry); + entry = ntohl(hdr->entry); label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + bos = (entry & MPLS_LS_S_MASK) >> MPLS_LS_S_SHIFT; if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { struct flow_dissector_key_mpls *key_mpls; + struct flow_dissector_mpls_lse *lse; key_mpls = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS, target_container); - key_mpls->mpls_label = label; - key_mpls->mpls_ttl = (entry & MPLS_LS_TTL_MASK) - >> MPLS_LS_TTL_SHIFT; - key_mpls->mpls_tc = (entry & MPLS_LS_TC_MASK) - >> MPLS_LS_TC_SHIFT; - key_mpls->mpls_bos = (entry & MPLS_LS_S_MASK) - >> MPLS_LS_S_SHIFT; + lse = &key_mpls->ls[lse_index]; + + lse->mpls_ttl = (entry & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT; + lse->mpls_bos = bos; + lse->mpls_tc = (entry & MPLS_LS_TC_MASK) >> MPLS_LS_TC_SHIFT; + lse->mpls_label = label; + dissector_set_mpls_lse(key_mpls, lse_index); } - if (label == MPLS_LABEL_ENTROPY) { + if (*entropy_label && + dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { + struct flow_dissector_key_keyid *key_keyid; + key_keyid = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_MPLS_ENTROPY, target_container); - key_keyid->keyid = hdr[1].entry & htonl(MPLS_LS_LABEL_MASK); + key_keyid->keyid = cpu_to_be32(label); } - return FLOW_DISSECT_RET_OUT_GOOD; + + *entropy_label = label == MPLS_LABEL_ENTROPY; + + return bos ? FLOW_DISSECT_RET_OUT_GOOD : FLOW_DISSECT_RET_PROTO_AGAIN; } static enum flow_dissect_ret @@ -979,6 +991,8 @@ bool __skb_flow_dissect(const struct net *net, struct bpf_prog *attached = NULL; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; + bool mpls_el = false; + int mpls_lse = 0; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -1278,7 +1292,10 @@ proto_again: case htons(ETH_P_MPLS_MC): fdret = __skb_flow_dissect_mpls(skb, flow_dissector, target_container, data, - nhoff, hlen); + nhoff, hlen, mpls_lse, + &mpls_el); + nhoff += sizeof(struct mpls_label); + mpls_lse++; break; case htons(ETH_P_FCOE): if ((hlen - nhoff) < FCOE_HEADER_LEN) { diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 0c574700da75..f524afe0b7f5 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -781,9 +781,17 @@ static int fl_set_key_mpls(struct nlattr **tb, struct flow_dissector_key_mpls *key_mask, struct netlink_ext_ack *extack) { + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_val; + + lse_val = &key_val->ls[0]; + lse_mask = &key_mask->ls[0]; + if (tb[TCA_FLOWER_KEY_MPLS_TTL]) { - key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); - key_mask->mpls_ttl = MPLS_TTL_MASK; + lse_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); + lse_mask->mpls_ttl = MPLS_TTL_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_BOS]) { u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]); @@ -794,8 +802,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Bottom Of Stack (BOS) must be 0 or 1"); return -EINVAL; } - key_val->mpls_bos = bos; - key_mask->mpls_bos = MPLS_BOS_MASK; + lse_val->mpls_bos = bos; + lse_mask->mpls_bos = MPLS_BOS_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_TC]) { u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]); @@ -806,8 +816,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Traffic Class (TC) must be between 0 and 7"); return -EINVAL; } - key_val->mpls_tc = tc; - key_mask->mpls_tc = MPLS_TC_MASK; + lse_val->mpls_tc = tc; + lse_mask->mpls_tc = MPLS_TC_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) { u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]); @@ -818,8 +830,10 @@ static int fl_set_key_mpls(struct nlattr **tb, "Label must be between 0 and 1048575"); return -EINVAL; } - key_val->mpls_label = label; - key_mask->mpls_label = MPLS_LABEL_MASK; + lse_val->mpls_label = label; + lse_mask->mpls_label = MPLS_LABEL_MASK; + dissector_set_mpls_lse(key_val, 0); + dissector_set_mpls_lse(key_mask, 0); } return 0; } @@ -2222,31 +2236,37 @@ static int fl_dump_key_mpls(struct sk_buff *skb, struct flow_dissector_key_mpls *mpls_key, struct flow_dissector_key_mpls *mpls_mask) { + struct flow_dissector_mpls_lse *lse_mask; + struct flow_dissector_mpls_lse *lse_key; int err; if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask))) return 0; - if (mpls_mask->mpls_ttl) { + + lse_mask = &mpls_mask->ls[0]; + lse_key = &mpls_key->ls[0]; + + if (lse_mask->mpls_ttl) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL, - mpls_key->mpls_ttl); + lse_key->mpls_ttl); if (err) return err; } - if (mpls_mask->mpls_tc) { + if (lse_mask->mpls_tc) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TC, - mpls_key->mpls_tc); + lse_key->mpls_tc); if (err) return err; } - if (mpls_mask->mpls_label) { + if (lse_mask->mpls_label) { err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_LABEL, - mpls_key->mpls_label); + lse_key->mpls_label); if (err) return err; } - if (mpls_mask->mpls_bos) { + if (lse_mask->mpls_bos) { err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_BOS, - mpls_key->mpls_bos); + lse_key->mpls_bos); if (err) return err; } -- cgit v1.2.3 From b58f0e8f38c0a44afa59601a115bd231f23471e1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:09 +0200 Subject: net: add sock_set_reuseaddr Add a helper to directly set the SO_REUSEADDR sockopt from kernel space without going through a fake uaccess. For this the iscsi target now has to formally depend on inet to avoid a mostly theoretical compile failure. For actual operation it already did depend on having ipv4 or ipv6 support. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/infiniband/sw/siw/siw_cm.c | 18 +++++------------- drivers/nvme/target/tcp.c | 8 +------- drivers/target/iscsi/Kconfig | 2 +- drivers/target/iscsi/iscsi_target_login.c | 9 +-------- fs/dlm/lowcomms.c | 6 +----- include/net/sock.h | 2 ++ net/core/sock.c | 8 ++++++++ 7 files changed, 19 insertions(+), 34 deletions(-) (limited to 'net/core') diff --git a/drivers/infiniband/sw/siw/siw_cm.c b/drivers/infiniband/sw/siw/siw_cm.c index 559e5fd3bad8..d1860f3e8740 100644 --- a/drivers/infiniband/sw/siw/siw_cm.c +++ b/drivers/infiniband/sw/siw/siw_cm.c @@ -1312,17 +1312,14 @@ static void siw_cm_llp_state_change(struct sock *sk) static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr, struct sockaddr *raddr) { - int rv, flags = 0, s_val = 1; + int rv, flags = 0; size_t size = laddr->sa_family == AF_INET ? sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6); /* * Make address available again asap. */ - rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, - sizeof(s_val)); - if (rv < 0) - return rv; + sock_set_reuseaddr(s->sk); rv = s->ops->bind(s, laddr, size); if (rv < 0) @@ -1781,7 +1778,7 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) struct siw_cep *cep = NULL; struct siw_device *sdev = to_siw_dev(id->device); int addr_family = id->local_addr.ss_family; - int rv = 0, s_val; + int rv = 0; if (addr_family != AF_INET && addr_family != AF_INET6) return -EAFNOSUPPORT; @@ -1793,13 +1790,8 @@ int siw_create_listen(struct iw_cm_id *id, int backlog) /* * Allow binding local port when still in TIME_WAIT from last close. */ - s_val = 1; - rv = kernel_setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *)&s_val, - sizeof(s_val)); - if (rv) { - siw_dbg(id->device, "setsockopt error: %d\n", rv); - goto error; - } + sock_set_reuseaddr(s->sk); + if (addr_family == AF_INET) { struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr); diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index f0da04e960f4..40757a63f455 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1632,6 +1632,7 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) port->sock->sk->sk_user_data = port; port->data_ready = port->sock->sk->sk_data_ready; port->sock->sk->sk_data_ready = nvmet_tcp_listen_data_ready; + sock_set_reuseaddr(port->sock->sk); opt = 1; ret = kernel_setsockopt(port->sock, IPPROTO_TCP, @@ -1641,13 +1642,6 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) goto err_sock; } - ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&opt, sizeof(opt)); - if (ret) { - pr_err("failed to set SO_REUSEADDR sock opt %d\n", ret); - goto err_sock; - } - if (so_priority > 0) { ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY, (char *)&so_priority, sizeof(so_priority)); diff --git a/drivers/target/iscsi/Kconfig b/drivers/target/iscsi/Kconfig index 1f93ea381353..922484ea4e30 100644 --- a/drivers/target/iscsi/Kconfig +++ b/drivers/target/iscsi/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config ISCSI_TARGET tristate "Linux-iSCSI.org iSCSI Target Mode Stack" - depends on NET + depends on INET select CRYPTO select CRYPTO_CRC32C select CRYPTO_CRC32C_INTEL if X86 diff --git a/drivers/target/iscsi/iscsi_target_login.c b/drivers/target/iscsi/iscsi_target_login.c index 731ee67fe914..91acb3f07b4c 100644 --- a/drivers/target/iscsi/iscsi_target_login.c +++ b/drivers/target/iscsi/iscsi_target_login.c @@ -909,14 +909,7 @@ int iscsit_setup_np( } } - /* FIXME: Someone please explain why this is endian-safe */ - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - pr_err("kernel_setsockopt() for SO_REUSEADDR" - " failed\n"); - goto fail; - } + sock_set_reuseaddr(sock->sk); ret = kernel_setsockopt(sock, IPPROTO_IP, IP_FREEBIND, (char *)&opt, sizeof(opt)); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index f13dad0fd9ef..88f2574ca63a 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1127,12 +1127,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con, kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char *)&one, sizeof(one)); - result = kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, - (char *)&one, sizeof(one)); + sock_set_reuseaddr(sock->sk); - if (result < 0) { - log_print("Failed to set SO_REUSEADDR on socket: %d", result); - } write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_user_data = con; save_listen_callbacks(sock); diff --git a/include/net/sock.h b/include/net/sock.h index 3e8c6d4b4b59..2ec085044790 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2688,4 +2688,6 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); +void sock_set_reuseaddr(struct sock *sk); + #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index fd85e651ce28..18eb84fdf5fb 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -712,6 +712,14 @@ bool sk_mc_loop(struct sock *sk) } EXPORT_SYMBOL(sk_mc_loop); +void sock_set_reuseaddr(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuse = SK_CAN_REUSE; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseaddr); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. -- cgit v1.2.3 From c433594c07457d2b2e41a87014bfad9bec279abf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:10 +0200 Subject: net: add sock_no_linger Add a helper to directly set the SO_LINGER sockopt from kernel space with onoff set to true and a linger time of 0 without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 9 +-------- drivers/nvme/target/tcp.c | 6 +----- include/net/sock.h | 1 + net/core/sock.c | 9 +++++++++ net/rds/tcp.h | 1 - net/rds/tcp_connect.c | 2 +- net/rds/tcp_listen.c | 13 +------------ net/sunrpc/svcsock.c | 12 ++---------- 8 files changed, 16 insertions(+), 37 deletions(-) (limited to 'net/core') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index c15a92163c1f..e72d87482eb7 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1313,7 +1313,6 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, { struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; - struct linger sol = { .l_onoff = 1, .l_linger = 0 }; int ret, opt, rcv_pdu_size; queue->ctrl = ctrl; @@ -1361,13 +1360,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. */ - ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, - (char *)&sol, sizeof(sol)); - if (ret) { - dev_err(nctrl->device, - "failed to set SO_LINGER sock opt %d\n", ret); - goto err_sock; - } + sock_no_linger(queue->sock->sk); if (so_priority > 0) { ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY, diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 40757a63f455..e0801494b097 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1429,7 +1429,6 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) { struct socket *sock = queue->sock; struct inet_sock *inet = inet_sk(sock->sk); - struct linger sol = { .l_onoff = 1, .l_linger = 0 }; int ret; ret = kernel_getsockname(sock, @@ -1447,10 +1446,7 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) * close. This is done to prevent stale data from being sent should * the network connection be restored before TCP times out. */ - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&sol, sizeof(sol)); - if (ret) - return ret; + sock_no_linger(sock->sk); if (so_priority > 0) { ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY, diff --git a/include/net/sock.h b/include/net/sock.h index 2ec085044790..6ed00bf009bb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2688,6 +2688,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); +void sock_no_linger(struct sock *sk); void sock_set_reuseaddr(struct sock *sk); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 18eb84fdf5fb..f0f09524911c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -720,6 +720,15 @@ void sock_set_reuseaddr(struct sock *sk) } EXPORT_SYMBOL(sock_set_reuseaddr); +void sock_no_linger(struct sock *sk) +{ + lock_sock(sk); + sk->sk_lingertime = 0; + sock_set_flag(sk, SOCK_LINGER); + release_sock(sk); +} +EXPORT_SYMBOL(sock_no_linger); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 3c69361d21c7..d640e210b97b 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -73,7 +73,6 @@ void rds_tcp_listen_data_ready(struct sock *sk); int rds_tcp_accept_one(struct socket *sock); int rds_tcp_keepalive(struct socket *sock); void *rds_tcp_listen_sock_def_readable(struct net *net); -void rds_tcp_set_linger(struct socket *sock); /* tcp_recv.c */ int rds_tcp_recv_init(void); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index 008f50fb25dd..4e64598176b0 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -207,7 +207,7 @@ void rds_tcp_conn_path_shutdown(struct rds_conn_path *cp) if (sock) { if (rds_destroy_pending(cp->cp_conn)) - rds_tcp_set_linger(sock); + sock_no_linger(sock->sk); sock->ops->shutdown(sock, RCV_SHUTDOWN | SEND_SHUTDOWN); lock_sock(sock->sk); rds_tcp_restore_callbacks(sock, tc); /* tc->tc_sock = NULL */ diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 810a3a49e947..bbb31b9c0b39 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -111,17 +111,6 @@ struct rds_tcp_connection *rds_tcp_accept_one_path(struct rds_connection *conn) return NULL; } -void rds_tcp_set_linger(struct socket *sock) -{ - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; - - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); -} - int rds_tcp_accept_one(struct socket *sock) { struct socket *new_sock = NULL; @@ -241,7 +230,7 @@ rst_nsk: * be pending on it. By setting linger, we achieve the side-effect * of avoiding TIME_WAIT state on new_sock. */ - rds_tcp_set_linger(new_sock); + sock_no_linger(new_sock->sk); kernel_sock_shutdown(new_sock, SHUT_RDWR); ret = 0; out: diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 023514e392b3..6773dacc64d8 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -323,17 +323,9 @@ static int svc_tcp_has_wspace(struct svc_xprt *xprt) static void svc_tcp_kill_temp_xprt(struct svc_xprt *xprt) { - struct svc_sock *svsk; - struct socket *sock; - struct linger no_linger = { - .l_onoff = 1, - .l_linger = 0, - }; + struct svc_sock *svsk = container_of(xprt, struct svc_sock, sk_xprt); - svsk = container_of(xprt, struct svc_sock, sk_xprt); - sock = svsk->sk_sock; - kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER, - (char *)&no_linger, sizeof(no_linger)); + sock_no_linger(svsk->sk_sock->sk); } /* -- cgit v1.2.3 From 6e43496745e75ac49d644df984d2f4ee5b5b6b4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:11 +0200 Subject: net: add sock_set_priority Add a helper to directly set the SO_PRIORITY sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Acked-by: Sagi Grimberg Signed-off-by: David S. Miller --- drivers/nvme/host/tcp.c | 12 ++---------- drivers/nvme/target/tcp.c | 18 ++++-------------- include/net/sock.h | 1 + net/core/sock.c | 8 ++++++++ 4 files changed, 15 insertions(+), 24 deletions(-) (limited to 'net/core') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index e72d87482eb7..a307972d33a0 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1362,16 +1362,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, */ sock_no_linger(queue->sock->sk); - if (so_priority > 0) { - ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) { - dev_err(ctrl->ctrl.device, - "failed to set SO_PRIORITY sock opt, ret %d\n", - ret); - goto err_sock; - } - } + if (so_priority > 0) + sock_set_priority(queue->sock->sk, so_priority); /* Set socket type of service */ if (nctrl->opts->tos >= 0) { diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index e0801494b097..f3088156d01d 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1448,12 +1448,8 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp_queue *queue) */ sock_no_linger(sock->sk); - if (so_priority > 0) { - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) - return ret; - } + if (so_priority > 0) + sock_set_priority(sock->sk, so_priority); /* Set socket type of service */ if (inet->rcv_tos > 0) { @@ -1638,14 +1634,8 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport) goto err_sock; } - if (so_priority > 0) { - ret = kernel_setsockopt(port->sock, SOL_SOCKET, SO_PRIORITY, - (char *)&so_priority, sizeof(so_priority)); - if (ret) { - pr_err("failed to set SO_PRIORITY sock opt %d\n", ret); - goto err_sock; - } - } + if (so_priority > 0) + sock_set_priority(port->sock->sk, so_priority); ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr, sizeof(port->addr)); diff --git a/include/net/sock.h b/include/net/sock.h index 6ed00bf009bb..a3a43141a4be 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2689,6 +2689,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); void sock_no_linger(struct sock *sk); +void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index f0f09524911c..ceda1a9248b3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -729,6 +729,14 @@ void sock_no_linger(struct sock *sk) } EXPORT_SYMBOL(sock_no_linger); +void sock_set_priority(struct sock *sk, u32 priority) +{ + lock_sock(sk); + sk->sk_priority = priority; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_priority); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. -- cgit v1.2.3 From 76ee0785f42afbc0418072b7179d95f450d3c9a8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:12 +0200 Subject: net: add sock_set_sndtimeo Add a helper to directly set the SO_SNDTIMEO_NEW sockopt from kernel space without going through a fake uaccess. The interface is simplified to only pass the seconds value, as that is the only thing needed at the moment. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 8 ++------ include/net/sock.h | 1 + net/core/sock.c | 11 +++++++++++ 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 88f2574ca63a..b79711d0aac7 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -918,7 +918,6 @@ static void sctp_connect_to_sock(struct connection *con) int result; int addr_len; struct socket *sock; - struct __kernel_sock_timeval tv = { .tv_sec = 5, .tv_usec = 0 }; if (con->nodeid == 0) { log_print("attempt to connect sock 0 foiled"); @@ -970,13 +969,10 @@ static void sctp_connect_to_sock(struct connection *con) * since O_NONBLOCK argument in connect() function does not work here, * then, we should restore the default value of this attribute. */ - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, - sizeof(tv)); + sock_set_sndtimeo(sock->sk, 5); result = sock->ops->connect(sock, (struct sockaddr *)&daddr, addr_len, 0); - memset(&tv, 0, sizeof(tv)); - kernel_setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO_NEW, (char *)&tv, - sizeof(tv)); + sock_set_sndtimeo(sock->sk, 0); if (result == -EINPROGRESS) result = 0; diff --git a/include/net/sock.h b/include/net/sock.h index a3a43141a4be..9a7b9e98685a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2691,5 +2691,6 @@ void sock_def_readable(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); +void sock_set_sndtimeo(struct sock *sk, s64 secs); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index ceda1a9248b3..d3b1d61e4f76 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -737,6 +737,17 @@ void sock_set_priority(struct sock *sk, u32 priority) } EXPORT_SYMBOL(sock_set_priority); +void sock_set_sndtimeo(struct sock *sk, s64 secs) +{ + lock_sock(sk); + if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) + sk->sk_sndtimeo = secs * HZ; + else + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_sndtimeo); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. -- cgit v1.2.3 From 7594888c782e735f8a7b110094307a4dbe7b3f03 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:13 +0200 Subject: net: add sock_bindtoindex Add a helper to directly set the SO_BINDTOIFINDEX sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 21 +++++++++++++++------ net/ipv4/udp_tunnel.c | 4 +--- net/ipv6/ip6_udp_tunnel.c | 4 +--- 4 files changed, 18 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/include/net/sock.h b/include/net/sock.h index 9a7b9e98685a..cdec7bc055d5 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2688,6 +2688,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); +int sock_bindtoindex(struct sock *sk, int ifindex); void sock_no_linger(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index d3b1d61e4f76..23f80880fbb2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -566,7 +566,7 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) } EXPORT_SYMBOL(sk_dst_check); -static int sock_setbindtodevice_locked(struct sock *sk, int ifindex) +static int sock_bindtoindex_locked(struct sock *sk, int ifindex) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -594,6 +594,18 @@ out: return ret; } +int sock_bindtoindex(struct sock *sk, int ifindex) +{ + int ret; + + lock_sock(sk); + ret = sock_bindtoindex_locked(sk, ifindex); + release_sock(sk); + + return ret; +} +EXPORT_SYMBOL(sock_bindtoindex); + static int sock_setbindtodevice(struct sock *sk, char __user *optval, int optlen) { @@ -634,10 +646,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, goto out; } - lock_sock(sk); - ret = sock_setbindtodevice_locked(sk, index); - release_sock(sk); - + return sock_bindtoindex(sk, index); out: #endif @@ -1216,7 +1225,7 @@ set_rcvbuf: break; case SO_BINDTOIFINDEX: - ret = sock_setbindtodevice_locked(sk, val); + ret = sock_bindtoindex_locked(sk, val); break; default: diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 150e6f0fdbf5..2158e8bddf41 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -22,9 +22,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->bind_ifindex) { - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, - (void *)&cfg->bind_ifindex, - sizeof(cfg->bind_ifindex)); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); if (err < 0) goto error; } diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 58956a6b66a2..6523609516d2 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -33,9 +33,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; } if (cfg->bind_ifindex) { - err = kernel_setsockopt(sock, SOL_SOCKET, SO_BINDTOIFINDEX, - (void *)&cfg->bind_ifindex, - sizeof(cfg->bind_ifindex)); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); if (err < 0) goto error; } -- cgit v1.2.3 From 783da70e83967efeacf3c02c9dcfdc2b17bd62eb Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:14 +0200 Subject: net: add sock_enable_timestamps Add a helper to directly enable timestamps instead of setting the SO_TIMESTAMP* sockopts from kernel space and going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 47 +++++++++++++++++++++++++++++------------------ net/rxrpc/local_object.c | 8 +------- 3 files changed, 31 insertions(+), 25 deletions(-) (limited to 'net/core') diff --git a/include/net/sock.h b/include/net/sock.h index cdec7bc055d5..99ef43508d2b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2689,6 +2689,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex); +void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 23f80880fbb2..e4a4dd2b3d8b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -757,6 +757,28 @@ void sock_set_sndtimeo(struct sock *sk, s64 secs) } EXPORT_SYMBOL(sock_set_sndtimeo); +static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) +{ + if (val) { + sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); + sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns); + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk, SOCK_TIMESTAMP); + } else { + sock_reset_flag(sk, SOCK_RCVTSTAMP); + sock_reset_flag(sk, SOCK_RCVTSTAMPNS); + sock_reset_flag(sk, SOCK_TSTAMP_NEW); + } +} + +void sock_enable_timestamps(struct sock *sk) +{ + lock_sock(sk); + __sock_set_timestamps(sk, true, false, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_enable_timestamps); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -948,28 +970,17 @@ set_rcvbuf: break; case SO_TIMESTAMP_OLD: + __sock_set_timestamps(sk, valbool, false, false); + break; case SO_TIMESTAMP_NEW: + __sock_set_timestamps(sk, valbool, true, false); + break; case SO_TIMESTAMPNS_OLD: + __sock_set_timestamps(sk, valbool, false, true); + break; case SO_TIMESTAMPNS_NEW: - if (valbool) { - if (optname == SO_TIMESTAMP_NEW || optname == SO_TIMESTAMPNS_NEW) - sock_set_flag(sk, SOCK_TSTAMP_NEW); - else - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - - if (optname == SO_TIMESTAMP_OLD || optname == SO_TIMESTAMP_NEW) - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - else - sock_set_flag(sk, SOCK_RCVTSTAMPNS); - sock_set_flag(sk, SOCK_RCVTSTAMP); - sock_enable_timestamp(sk, SOCK_TIMESTAMP); - } else { - sock_reset_flag(sk, SOCK_RCVTSTAMP); - sock_reset_flag(sk, SOCK_RCVTSTAMPNS); - sock_reset_flag(sk, SOCK_TSTAMP_NEW); - } + __sock_set_timestamps(sk, valbool, true, true); break; - case SO_TIMESTAMPING_NEW: sock_set_flag(sk, SOCK_TSTAMP_NEW); /* fall through */ diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 01135e54d95d..5ea2bd01fdd5 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -189,13 +189,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) } /* We want receive timestamps. */ - opt = 1; - ret = kernel_setsockopt(local->socket, SOL_SOCKET, SO_TIMESTAMPNS_OLD, - (char *)&opt, sizeof(opt)); - if (ret < 0) { - _debug("setsockopt failed"); - goto error; - } + sock_enable_timestamps(local->socket->sk); break; default: -- cgit v1.2.3 From ce3d9544cecacd40389c399d2b7ca31acc533b70 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:15 +0200 Subject: net: add sock_set_keepalive Add a helper to directly set the SO_KEEPALIVE sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 6 +----- include/net/sock.h | 1 + net/core/sock.c | 10 ++++++++++ net/rds/tcp_listen.c | 6 +----- net/sunrpc/xprtsock.c | 4 +--- 5 files changed, 14 insertions(+), 13 deletions(-) (limited to 'net/core') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index b79711d0aac7..b6e6dba28154 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1142,11 +1142,7 @@ static struct socket *tcp_create_listen_sock(struct connection *con, con->sock = NULL; goto create_out; } - result = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&one, sizeof(one)); - if (result < 0) { - log_print("Set keepalive failed: %d", result); - } + sock_set_keepalive(sock->sk); result = sock->ops->listen(sock, 5); if (result < 0) { diff --git a/include/net/sock.h b/include/net/sock.h index 99ef43508d2b..dc08c176238f 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2691,6 +2691,7 @@ void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); +void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_reuseaddr(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); diff --git a/net/core/sock.c b/net/core/sock.c index e4a4dd2b3d8b..728f5fb156a0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -779,6 +779,16 @@ void sock_enable_timestamps(struct sock *sk) } EXPORT_SYMBOL(sock_enable_timestamps); +void sock_set_keepalive(struct sock *sk) +{ + lock_sock(sk); + if (sk->sk_prot->keepalive) + sk->sk_prot->keepalive(sk, true); + sock_valbool_flag(sk, SOCK_KEEPOPEN, true); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_keepalive); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index bbb31b9c0b39..d8bd13276959 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -43,13 +43,9 @@ int rds_tcp_keepalive(struct socket *sock) /* values below based on xs_udp_default_timeout */ int keepidle = 5; /* send a probe 'keepidle' secs after last data */ int keepcnt = 5; /* number of unack'ed probes before declaring dead */ - int keepalive = 1; int ret = 0; - ret = kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&keepalive, sizeof(keepalive)); - if (ret < 0) - goto bail; + sock_set_keepalive(sock->sk); ret = kernel_setsockopt(sock, IPPROTO_TCP, TCP_KEEPCNT, (char *)&keepcnt, sizeof(keepcnt)); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 845d0be805ec..30082cd03996 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2110,7 +2110,6 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); unsigned int keepidle; unsigned int keepcnt; - unsigned int opt_on = 1; unsigned int timeo; spin_lock(&xprt->transport_lock); @@ -2122,8 +2121,7 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, spin_unlock(&xprt->transport_lock); /* TCP Keepalive options */ - kernel_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE, - (char *)&opt_on, sizeof(opt_on)); + sock_set_keepalive(sock->sk); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPIDLE, (char *)&keepidle, sizeof(keepidle)); kernel_setsockopt(sock, SOL_TCP, TCP_KEEPINTVL, -- cgit v1.2.3 From 26cfabf9cdd273650126d84a48a7f8dedbcded48 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:16 +0200 Subject: net: add sock_set_rcvbuf Add a helper to directly set the SO_RCVBUFFORCE sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 7 +------ include/net/sock.h | 1 + net/core/sock.c | 59 +++++++++++++++++++++++++++++------------------------- 3 files changed, 34 insertions(+), 33 deletions(-) (limited to 'net/core') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index b6e6dba28154..2822a430a2b4 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1180,7 +1180,6 @@ static int sctp_listen_for_all(void) struct socket *sock = NULL; int result = -EINVAL; struct connection *con = nodeid2con(0, GFP_NOFS); - int bufsize = NEEDED_RMEM; int one = 1; if (!con) @@ -1195,11 +1194,7 @@ static int sctp_listen_for_all(void) goto out; } - result = kernel_setsockopt(sock, SOL_SOCKET, SO_RCVBUFFORCE, - (char *)&bufsize, sizeof(bufsize)); - if (result) - log_print("Error increasing buffer space on socket %d", result); - + sock_set_rcvbuf(sock->sk, NEEDED_RMEM); result = kernel_setsockopt(sock, SOL_SCTP, SCTP_NODELAY, (char *)&one, sizeof(one)); if (result < 0) diff --git a/include/net/sock.h b/include/net/sock.h index dc08c176238f..c997289aabbf 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2693,6 +2693,7 @@ void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); +void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_reuseaddr(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); diff --git a/net/core/sock.c b/net/core/sock.c index 728f5fb156a0..3c6ebf952e9a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -789,6 +789,35 @@ void sock_set_keepalive(struct sock *sk) } EXPORT_SYMBOL(sock_set_keepalive); +static void __sock_set_rcvbuf(struct sock *sk, int val) +{ + /* Ensure val * 2 fits into an int, to prevent max_t() from treating it + * as a negative value. + */ + val = min_t(int, val, INT_MAX / 2); + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + + /* We double it on the way in to account for "struct sk_buff" etc. + * overhead. Applications assume that the SO_RCVBUF setting they make + * will allow that much actual data to be received on that socket. + * + * Applications are unaware that "struct sk_buff" and other overheads + * allocate from the receive buffer during socket buffer allocation. + * + * And after considering the possible alternatives, returning the value + * we actually used in getsockopt is the most desirable behavior. + */ + WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); +} + +void sock_set_rcvbuf(struct sock *sk, int val) +{ + lock_sock(sk); + __sock_set_rcvbuf(sk, val); + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_rcvbuf); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. @@ -885,30 +914,7 @@ set_sndbuf: * play 'guess the biggest size' games. RCVBUF/SNDBUF * are treated in BSD as hints */ - val = min_t(u32, val, sysctl_rmem_max); -set_rcvbuf: - /* Ensure val * 2 fits into an int, to prevent max_t() - * from treating it as a negative value. - */ - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - /* - * We double it on the way in to account for - * "struct sk_buff" etc. overhead. Applications - * assume that the SO_RCVBUF setting they make will - * allow that much actual data to be received on that - * socket. - * - * Applications are unaware that "struct sk_buff" and - * other overheads allocate from the receive buffer - * during socket buffer allocation. - * - * And after considering the possible alternatives, - * returning the value we actually used in getsockopt - * is the most desirable behavior. - */ - WRITE_ONCE(sk->sk_rcvbuf, - max_t(int, val * 2, SOCK_MIN_RCVBUF)); + __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max)); break; case SO_RCVBUFFORCE: @@ -920,9 +926,8 @@ set_rcvbuf: /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ - if (val < 0) - val = 0; - goto set_rcvbuf; + __sock_set_rcvbuf(sk, max(val, 0)); + break; case SO_KEEPALIVE: if (sk->sk_prot->keepalive) -- cgit v1.2.3 From fe31a326a4aadb4a3ba2b21deacc380d06802737 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 28 May 2020 07:12:17 +0200 Subject: net: add sock_set_reuseport Add a helper to directly set the SO_REUSEPORT sockopt from kernel space without going through a fake uaccess. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/net/sock.h | 1 + net/core/sock.c | 8 ++++++++ net/sunrpc/xprtsock.c | 17 +---------------- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/include/net/sock.h b/include/net/sock.h index c997289aabbf..d994daa418ec 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2695,6 +2695,7 @@ void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_reuseaddr(struct sock *sk); +void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 3c6ebf952e9a..2ca3425b519c 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -729,6 +729,14 @@ void sock_set_reuseaddr(struct sock *sk) } EXPORT_SYMBOL(sock_set_reuseaddr); +void sock_set_reuseport(struct sock *sk) +{ + lock_sock(sk); + sk->sk_reuseport = true; + release_sock(sk); +} +EXPORT_SYMBOL(sock_set_reuseport); + void sock_no_linger(struct sock *sk) { lock_sock(sk); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 30082cd03996..399848c2bcb2 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1594,21 +1594,6 @@ static int xs_get_random_port(void) return rand + min; } -/** - * xs_set_reuseaddr_port - set the socket's port and address reuse options - * @sock: socket - * - * Note that this function has to be called on all sockets that share the - * same port, and it must be called before binding. - */ -static void xs_sock_set_reuseport(struct socket *sock) -{ - int opt = 1; - - kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, - (char *)&opt, sizeof(opt)); -} - static unsigned short xs_sock_getport(struct socket *sock) { struct sockaddr_storage buf; @@ -1801,7 +1786,7 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, xs_reclassify_socket(family, sock); if (reuseport) - xs_sock_set_reuseport(sock); + sock_set_reuseport(sock->sk); err = xs_bind(transport, sock); if (err) { -- cgit v1.2.3 From c0425a4249e9d313eec5f81c0bde8a286ebf9a63 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 May 2020 14:09:42 +0200 Subject: net: add a new bind_add method The SCTP protocol allows to bind multiple address to a socket. That feature is currently only exposed as a socket option. Add a bind_add method struct proto that allows to bind additional addresses, and switch the dlm code to use the method instead of going through the socket option from kernel space. Signed-off-by: Christoph Hellwig Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- fs/dlm/lowcomms.c | 9 +++------ include/net/sock.h | 6 +++++- net/core/sock.c | 8 ++++++++ net/sctp/socket.c | 14 ++++++++++++++ 4 files changed, 30 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9f1c3cdc9d65..3543a8fec907 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -882,6 +882,7 @@ static void writequeue_entry_complete(struct writequeue_entry *e, int completed) static int sctp_bind_addrs(struct connection *con, uint16_t port) { struct sockaddr_storage localaddr; + struct sockaddr *addr = (struct sockaddr *)&localaddr; int i, addr_len, result = 0; for (i = 0; i < dlm_local_count; i++) { @@ -889,13 +890,9 @@ static int sctp_bind_addrs(struct connection *con, uint16_t port) make_sockaddr(&localaddr, port, &addr_len); if (!i) - result = kernel_bind(con->sock, - (struct sockaddr *)&localaddr, - addr_len); + result = kernel_bind(con->sock, addr, addr_len); else - result = kernel_setsockopt(con->sock, SOL_SCTP, - SCTP_SOCKOPT_BINDX_ADD, - (char *)&localaddr, addr_len); + result = sock_bind_add(con->sock->sk, addr, addr_len); if (result < 0) { log_print("Can't bind to %d addr number %d, %d.\n", diff --git a/include/net/sock.h b/include/net/sock.h index d994daa418ec..6e9f713a7860 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1156,7 +1156,9 @@ struct proto { int (*sendpage)(struct sock *sk, struct page *page, int offset, size_t size, int flags); int (*bind)(struct sock *sk, - struct sockaddr *uaddr, int addr_len); + struct sockaddr *addr, int addr_len); + int (*bind_add)(struct sock *sk, + struct sockaddr *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); @@ -2698,4 +2700,6 @@ void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); +int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); + #endif /* _SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 2ca3425b519c..61ec573221a6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3712,3 +3712,11 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) } EXPORT_SYMBOL(sk_busy_loop_end); #endif /* CONFIG_NET_RX_BUSY_POLL */ + +int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) +{ + if (!sk->sk_prot->bind_add) + return -EOPNOTSUPP; + return sk->sk_prot->bind_add(sk, addr, addr_len); +} +EXPORT_SYMBOL(sock_bind_add); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6e745ac3c4a5..d57e1a002ffc 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1052,6 +1052,18 @@ static int sctp_setsockopt_bindx(struct sock *sk, return err; } +static int sctp_bind_add(struct sock *sk, struct sockaddr *addrs, + int addrlen) +{ + int err; + + lock_sock(sk); + err = sctp_setsockopt_bindx_kernel(sk, addrs, addrlen, + SCTP_BINDX_ADD_ADDR); + release_sock(sk); + return err; +} + static int sctp_connect_new_asoc(struct sctp_endpoint *ep, const union sctp_addr *daddr, const struct sctp_initmsg *init, @@ -9620,6 +9632,7 @@ struct proto sctp_prot = { .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, + .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, @@ -9662,6 +9675,7 @@ struct proto sctpv6_prot = { .sendmsg = sctp_sendmsg, .recvmsg = sctp_recvmsg, .bind = sctp_bind, + .bind_add = sctp_bind_add, .backlog_rcv = sctp_backlog_rcv, .hash = sctp_hash, .unhash = sctp_unhash, -- cgit v1.2.3 From 1fac52da5942c58dd3e337fd7c5a550925ca752e Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:35 +0200 Subject: net: flow_offload: consolidate indirect flow_block infrastructure Tunnel devices provide no dev->netdev_ops->ndo_setup_tc(...) interface. The tunnel device and route control plane does not provide an obvious way to relate tunnel and physical devices. This patch allows drivers to register a tunnel device offload handler for the tc and netfilter frontends through flow_indr_dev_register() and flow_indr_dev_unregister(). The frontend calls flow_indr_dev_setup_offload() that iterates over the list of drivers that are offering tunnel device hardware offload support and it sets up the flow block for this tunnel device. If the driver module is removed, the indirect flow_block ends up with a stale callback reference. The module removal path triggers the dev_shutdown() path to remove the qdisc and the flow_blocks for the physical devices. However, this is not useful for tunnel devices, where relation between the physical and the tunnel device is not explicit. This patch introduces a cleanup callback that is invoked when the driver module is removed to clean up the tunnel device flow_block. This patch defines struct flow_block_indr and it uses it from flow_block_cb to store the information that front-end requires to perform the flow_block_cb cleanup on module removal. Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/flow_offload.h | 19 ++++++ net/core/flow_offload.c | 157 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+) (limited to 'net/core') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 95d633785ef9..5493282348fa 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -443,6 +443,16 @@ enum tc_setup_type; typedef int flow_setup_cb_t(enum tc_setup_type type, void *type_data, void *cb_priv); +struct flow_block_cb; + +struct flow_block_indr { + struct list_head list; + struct net_device *dev; + enum flow_block_binder_type binder_type; + void *data; + void (*cleanup)(struct flow_block_cb *block_cb); +}; + struct flow_block_cb { struct list_head driver_list; struct list_head list; @@ -450,6 +460,7 @@ struct flow_block_cb { void *cb_ident; void *cb_priv; void (*release)(void *cb_priv); + struct flow_block_indr indr; unsigned int refcnt; }; @@ -523,6 +534,14 @@ static inline void flow_block_init(struct flow_block *flow_block) typedef int flow_indr_block_bind_cb_t(struct net_device *dev, void *cb_priv, enum tc_setup_type type, void *type_data); +int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv); +void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, + flow_setup_cb_t *setup_cb); +int flow_indr_dev_setup_offload(struct net_device *dev, + enum tc_setup_type type, void *data, + struct flow_block_offload *bo, + void (*cleanup)(struct flow_block_cb *block_cb)); + typedef void flow_indr_block_cmd_t(struct net_device *dev, flow_indr_block_bind_cb_t *cb, void *cb_priv, enum flow_block_command command); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index e64941c526b1..8cd7da2586ae 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -317,6 +317,163 @@ int flow_block_cb_setup_simple(struct flow_block_offload *f, } EXPORT_SYMBOL(flow_block_cb_setup_simple); +static DEFINE_MUTEX(flow_indr_block_lock); +static LIST_HEAD(flow_block_indr_list); +static LIST_HEAD(flow_block_indr_dev_list); + +struct flow_indr_dev { + struct list_head list; + flow_indr_block_bind_cb_t *cb; + void *cb_priv; + refcount_t refcnt; + struct rcu_head rcu; +}; + +static struct flow_indr_dev *flow_indr_dev_alloc(flow_indr_block_bind_cb_t *cb, + void *cb_priv) +{ + struct flow_indr_dev *indr_dev; + + indr_dev = kmalloc(sizeof(*indr_dev), GFP_KERNEL); + if (!indr_dev) + return NULL; + + indr_dev->cb = cb; + indr_dev->cb_priv = cb_priv; + refcount_set(&indr_dev->refcnt, 1); + + return indr_dev; +} + +int flow_indr_dev_register(flow_indr_block_bind_cb_t *cb, void *cb_priv) +{ + struct flow_indr_dev *indr_dev; + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(indr_dev, &flow_block_indr_dev_list, list) { + if (indr_dev->cb == cb && + indr_dev->cb_priv == cb_priv) { + refcount_inc(&indr_dev->refcnt); + mutex_unlock(&flow_indr_block_lock); + return 0; + } + } + + indr_dev = flow_indr_dev_alloc(cb, cb_priv); + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return -ENOMEM; + } + + list_add(&indr_dev->list, &flow_block_indr_dev_list); + mutex_unlock(&flow_indr_block_lock); + + return 0; +} +EXPORT_SYMBOL(flow_indr_dev_register); + +static void __flow_block_indr_cleanup(flow_setup_cb_t *setup_cb, void *cb_priv, + struct list_head *cleanup_list) +{ + struct flow_block_cb *this, *next; + + list_for_each_entry_safe(this, next, &flow_block_indr_list, indr.list) { + if (this->cb == setup_cb && + this->cb_priv == cb_priv) { + list_move(&this->indr.list, cleanup_list); + return; + } + } +} + +static void flow_block_indr_notify(struct list_head *cleanup_list) +{ + struct flow_block_cb *this, *next; + + list_for_each_entry_safe(this, next, cleanup_list, indr.list) { + list_del(&this->indr.list); + this->indr.cleanup(this); + } +} + +void flow_indr_dev_unregister(flow_indr_block_bind_cb_t *cb, void *cb_priv, + flow_setup_cb_t *setup_cb) +{ + struct flow_indr_dev *this, *next, *indr_dev = NULL; + LIST_HEAD(cleanup_list); + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry_safe(this, next, &flow_block_indr_dev_list, list) { + if (this->cb == cb && + this->cb_priv == cb_priv && + refcount_dec_and_test(&this->refcnt)) { + indr_dev = this; + list_del(&indr_dev->list); + break; + } + } + + if (!indr_dev) { + mutex_unlock(&flow_indr_block_lock); + return; + } + + __flow_block_indr_cleanup(setup_cb, cb_priv, &cleanup_list); + mutex_unlock(&flow_indr_block_lock); + + flow_block_indr_notify(&cleanup_list); + kfree(indr_dev); +} +EXPORT_SYMBOL(flow_indr_dev_unregister); + +static void flow_block_indr_init(struct flow_block_cb *flow_block, + struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + flow_block->indr.binder_type = bo->binder_type; + flow_block->indr.data = data; + flow_block->indr.dev = dev; + flow_block->indr.cleanup = cleanup; +} + +static void __flow_block_indr_binding(struct flow_block_offload *bo, + struct net_device *dev, void *data, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct flow_block_cb *block_cb; + + list_for_each_entry(block_cb, &bo->cb_list, list) { + switch (bo->command) { + case FLOW_BLOCK_BIND: + flow_block_indr_init(block_cb, bo, dev, data, cleanup); + list_add(&block_cb->indr.list, &flow_block_indr_list); + break; + case FLOW_BLOCK_UNBIND: + list_del(&block_cb->indr.list); + break; + } + } +} + +int flow_indr_dev_setup_offload(struct net_device *dev, + enum tc_setup_type type, void *data, + struct flow_block_offload *bo, + void (*cleanup)(struct flow_block_cb *block_cb)) +{ + struct flow_indr_dev *this; + + mutex_lock(&flow_indr_block_lock); + list_for_each_entry(this, &flow_block_indr_dev_list, list) + this->cb(dev, this->cb_priv, type, bo); + + __flow_block_indr_binding(bo, dev, data, cleanup); + mutex_unlock(&flow_indr_block_lock); + + return list_empty(&bo->cb_list) ? -EOPNOTSUPP : 0; +} +EXPORT_SYMBOL(flow_indr_dev_setup_offload); + static LIST_HEAD(block_cb_list); static struct rhashtable indr_setup_block_ht; -- cgit v1.2.3 From 709ffbe19b777e8fc952e2fdcfd8e6f50c8ef08c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 29 May 2020 02:25:41 +0200 Subject: net: remove indirect block netdev event registration Drivers do not register to netdev events to set up indirect blocks anymore. Remove __flow_indr_block_cb_register() and __flow_indr_block_cb_unregister(). The frontends set up the callbacks through flow_indr_dev_setup_block() Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/flow_offload.h | 9 -- net/core/flow_offload.c | 238 ---------------------------------- net/netfilter/nf_flow_table_offload.c | 66 ---------- net/netfilter/nf_tables_offload.c | 53 +------- net/sched/cls_api.c | 79 ----------- 5 files changed, 1 insertion(+), 444 deletions(-) (limited to 'net/core') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 5493282348fa..69e13c8b6b3a 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -546,15 +546,6 @@ typedef void flow_indr_block_cmd_t(struct net_device *dev, flow_indr_block_bind_cb_t *cb, void *cb_priv, enum flow_block_command command); -struct flow_indr_block_entry { - flow_indr_block_cmd_t *cb; - struct list_head list; -}; - -void flow_indr_add_block_cb(struct flow_indr_block_entry *entry); - -void flow_indr_del_block_cb(struct flow_indr_block_entry *entry); - int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, flow_indr_block_bind_cb_t *cb, void *cb_ident); diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 8cd7da2586ae..0cfc35e6be28 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -473,241 +473,3 @@ int flow_indr_dev_setup_offload(struct net_device *dev, return list_empty(&bo->cb_list) ? -EOPNOTSUPP : 0; } EXPORT_SYMBOL(flow_indr_dev_setup_offload); - -static LIST_HEAD(block_cb_list); - -static struct rhashtable indr_setup_block_ht; - -struct flow_indr_block_cb { - struct list_head list; - void *cb_priv; - flow_indr_block_bind_cb_t *cb; - void *cb_ident; -}; - -struct flow_indr_block_dev { - struct rhash_head ht_node; - struct net_device *dev; - unsigned int refcnt; - struct list_head cb_list; -}; - -static const struct rhashtable_params flow_indr_setup_block_ht_params = { - .key_offset = offsetof(struct flow_indr_block_dev, dev), - .head_offset = offsetof(struct flow_indr_block_dev, ht_node), - .key_len = sizeof(struct net_device *), -}; - -static struct flow_indr_block_dev * -flow_indr_block_dev_lookup(struct net_device *dev) -{ - return rhashtable_lookup_fast(&indr_setup_block_ht, &dev, - flow_indr_setup_block_ht_params); -} - -static struct flow_indr_block_dev * -flow_indr_block_dev_get(struct net_device *dev) -{ - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (indr_dev) - goto inc_ref; - - indr_dev = kzalloc(sizeof(*indr_dev), GFP_KERNEL); - if (!indr_dev) - return NULL; - - INIT_LIST_HEAD(&indr_dev->cb_list); - indr_dev->dev = dev; - if (rhashtable_insert_fast(&indr_setup_block_ht, &indr_dev->ht_node, - flow_indr_setup_block_ht_params)) { - kfree(indr_dev); - return NULL; - } - -inc_ref: - indr_dev->refcnt++; - return indr_dev; -} - -static void flow_indr_block_dev_put(struct flow_indr_block_dev *indr_dev) -{ - if (--indr_dev->refcnt) - return; - - rhashtable_remove_fast(&indr_setup_block_ht, &indr_dev->ht_node, - flow_indr_setup_block_ht_params); - kfree(indr_dev); -} - -static struct flow_indr_block_cb * -flow_indr_block_cb_lookup(struct flow_indr_block_dev *indr_dev, - flow_indr_block_bind_cb_t *cb, void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - if (indr_block_cb->cb == cb && - indr_block_cb->cb_ident == cb_ident) - return indr_block_cb; - return NULL; -} - -static struct flow_indr_block_cb * -flow_indr_block_cb_add(struct flow_indr_block_dev *indr_dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - - indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (indr_block_cb) - return ERR_PTR(-EEXIST); - - indr_block_cb = kzalloc(sizeof(*indr_block_cb), GFP_KERNEL); - if (!indr_block_cb) - return ERR_PTR(-ENOMEM); - - indr_block_cb->cb_priv = cb_priv; - indr_block_cb->cb = cb; - indr_block_cb->cb_ident = cb_ident; - list_add(&indr_block_cb->list, &indr_dev->cb_list); - - return indr_block_cb; -} - -static void flow_indr_block_cb_del(struct flow_indr_block_cb *indr_block_cb) -{ - list_del(&indr_block_cb->list); - kfree(indr_block_cb); -} - -static DEFINE_MUTEX(flow_indr_block_cb_lock); - -static void flow_block_cmd(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command command) -{ - struct flow_indr_block_entry *entry; - - mutex_lock(&flow_indr_block_cb_lock); - list_for_each_entry(entry, &block_cb_list, list) { - entry->cb(dev, cb, cb_priv, command); - } - mutex_unlock(&flow_indr_block_cb_lock); -} - -int __flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - int err; - - indr_dev = flow_indr_block_dev_get(dev); - if (!indr_dev) - return -ENOMEM; - - indr_block_cb = flow_indr_block_cb_add(indr_dev, cb_priv, cb, cb_ident); - err = PTR_ERR_OR_ZERO(indr_block_cb); - if (err) - goto err_dev_put; - - flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_BIND); - - return 0; - -err_dev_put: - flow_indr_block_dev_put(indr_dev); - return err; -} -EXPORT_SYMBOL_GPL(__flow_indr_block_cb_register); - -int flow_indr_block_cb_register(struct net_device *dev, void *cb_priv, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - int err; - - rtnl_lock(); - err = __flow_indr_block_cb_register(dev, cb_priv, cb, cb_ident); - rtnl_unlock(); - - return err; -} -EXPORT_SYMBOL_GPL(flow_indr_block_cb_register); - -void __flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (!indr_dev) - return; - - indr_block_cb = flow_indr_block_cb_lookup(indr_dev, cb, cb_ident); - if (!indr_block_cb) - return; - - flow_block_cmd(dev, indr_block_cb->cb, indr_block_cb->cb_priv, - FLOW_BLOCK_UNBIND); - - flow_indr_block_cb_del(indr_block_cb); - flow_indr_block_dev_put(indr_dev); -} -EXPORT_SYMBOL_GPL(__flow_indr_block_cb_unregister); - -void flow_indr_block_cb_unregister(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_ident) -{ - rtnl_lock(); - __flow_indr_block_cb_unregister(dev, cb, cb_ident); - rtnl_unlock(); -} -EXPORT_SYMBOL_GPL(flow_indr_block_cb_unregister); - -void flow_indr_block_call(struct net_device *dev, - struct flow_block_offload *bo, - enum flow_block_command command, - enum tc_setup_type type) -{ - struct flow_indr_block_cb *indr_block_cb; - struct flow_indr_block_dev *indr_dev; - - indr_dev = flow_indr_block_dev_lookup(dev); - if (!indr_dev) - return; - - list_for_each_entry(indr_block_cb, &indr_dev->cb_list, list) - indr_block_cb->cb(dev, indr_block_cb->cb_priv, type, bo); -} -EXPORT_SYMBOL_GPL(flow_indr_block_call); - -void flow_indr_add_block_cb(struct flow_indr_block_entry *entry) -{ - mutex_lock(&flow_indr_block_cb_lock); - list_add_tail(&entry->list, &block_cb_list); - mutex_unlock(&flow_indr_block_cb_lock); -} -EXPORT_SYMBOL_GPL(flow_indr_add_block_cb); - -void flow_indr_del_block_cb(struct flow_indr_block_entry *entry) -{ - mutex_lock(&flow_indr_block_cb_lock); - list_del(&entry->list); - mutex_unlock(&flow_indr_block_cb_lock); -} -EXPORT_SYMBOL_GPL(flow_indr_del_block_cb); - -static int __init init_flow_indr_rhashtable(void) -{ - return rhashtable_init(&indr_setup_block_ht, - &flow_indr_setup_block_ht_params); -} -subsys_initcall(init_flow_indr_rhashtable); diff --git a/net/netfilter/nf_flow_table_offload.c b/net/netfilter/nf_flow_table_offload.c index 01cfa02c43bd..62651e6683f6 100644 --- a/net/netfilter/nf_flow_table_offload.c +++ b/net/netfilter/nf_flow_table_offload.c @@ -1008,69 +1008,6 @@ int nf_flow_table_offload_setup(struct nf_flowtable *flowtable, } EXPORT_SYMBOL_GPL(nf_flow_table_offload_setup); -static void nf_flow_table_indr_block_ing_cmd(struct net_device *dev, - struct nf_flowtable *flowtable, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct netlink_ext_ack extack = {}; - struct flow_block_offload bo; - - if (!flowtable) - return; - - nf_flow_table_block_offload_init(&bo, dev_net(dev), cmd, flowtable, - &extack); - - cb(dev, cb_priv, TC_SETUP_FT, &bo); - - nf_flow_table_block_setup(flowtable, &bo, cmd); -} - -static void nf_flow_table_indr_block_cb_cmd(struct nf_flowtable *flowtable, - struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - if (!(flowtable->flags & NF_FLOWTABLE_HW_OFFLOAD)) - return; - - nf_flow_table_indr_block_ing_cmd(dev, flowtable, cb, cb_priv, cmd); -} - -static void nf_flow_table_indr_block_cb(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct net *net = dev_net(dev); - struct nft_flowtable *nft_ft; - struct nft_table *table; - struct nft_hook *hook; - - mutex_lock(&net->nft.commit_mutex); - list_for_each_entry(table, &net->nft.tables, list) { - list_for_each_entry(nft_ft, &table->flowtables, list) { - list_for_each_entry(hook, &nft_ft->hook_list, list) { - if (hook->ops.dev != dev) - continue; - - nf_flow_table_indr_block_cb_cmd(&nft_ft->data, - dev, cb, - cb_priv, cmd); - } - } - } - mutex_unlock(&net->nft.commit_mutex); -} - -static struct flow_indr_block_entry block_ing_entry = { - .cb = nf_flow_table_indr_block_cb, - .list = LIST_HEAD_INIT(block_ing_entry.list), -}; - int nf_flow_table_offload_init(void) { nf_flow_offload_wq = alloc_workqueue("nf_flow_table_offload", @@ -1078,13 +1015,10 @@ int nf_flow_table_offload_init(void) if (!nf_flow_offload_wq) return -ENOMEM; - flow_indr_add_block_cb(&block_ing_entry); - return 0; } void nf_flow_table_offload_exit(void) { - flow_indr_del_block_cb(&block_ing_entry); destroy_workqueue(nf_flow_offload_wq); } diff --git a/net/netfilter/nf_tables_offload.c b/net/netfilter/nf_tables_offload.c index 1960f11477e8..185fc82c99aa 100644 --- a/net/netfilter/nf_tables_offload.c +++ b/net/netfilter/nf_tables_offload.c @@ -285,25 +285,6 @@ static int nft_block_offload_cmd(struct nft_base_chain *chain, return nft_block_setup(chain, &bo, cmd); } -static void nft_indr_block_ing_cmd(struct net_device *dev, - struct nft_base_chain *chain, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command cmd) -{ - struct netlink_ext_ack extack = {}; - struct flow_block_offload bo; - - if (!chain) - return; - - nft_flow_block_offload_init(&bo, dev_net(dev), cmd, chain, &extack); - - cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); - - nft_block_setup(chain, &bo, cmd); -} - static void nft_indr_block_cleanup(struct flow_block_cb *block_cb) { struct nft_base_chain *basechain = block_cb->indr.data; @@ -575,24 +556,6 @@ static struct nft_chain *__nft_offload_get_chain(struct net_device *dev) return NULL; } -static void nft_indr_block_cb(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command cmd) -{ - struct net *net = dev_net(dev); - struct nft_chain *chain; - - mutex_lock(&net->nft.commit_mutex); - chain = __nft_offload_get_chain(dev); - if (chain && chain->flags & NFT_CHAIN_HW_OFFLOAD) { - struct nft_base_chain *basechain; - - basechain = nft_base_chain(chain); - nft_indr_block_ing_cmd(dev, basechain, cb, cb_priv, cmd); - } - mutex_unlock(&net->nft.commit_mutex); -} - static int nft_offload_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -614,30 +577,16 @@ static int nft_offload_netdev_event(struct notifier_block *this, return NOTIFY_DONE; } -static struct flow_indr_block_entry block_ing_entry = { - .cb = nft_indr_block_cb, - .list = LIST_HEAD_INIT(block_ing_entry.list), -}; - static struct notifier_block nft_offload_netdev_notifier = { .notifier_call = nft_offload_netdev_event, }; int nft_offload_init(void) { - int err; - - err = register_netdevice_notifier(&nft_offload_netdev_notifier); - if (err < 0) - return err; - - flow_indr_add_block_cb(&block_ing_entry); - - return 0; + return register_netdevice_notifier(&nft_offload_netdev_notifier); } void nft_offload_exit(void) { - flow_indr_del_block_cb(&block_ing_entry); unregister_netdevice_notifier(&nft_offload_netdev_notifier); } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 760e51d852f5..a00a203b2ef5 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -621,78 +621,6 @@ static void tcf_chain_flush(struct tcf_chain *chain, bool rtnl_held) static int tcf_block_setup(struct tcf_block *block, struct flow_block_offload *bo); -static void tc_indr_block_cmd(struct net_device *dev, struct tcf_block *block, - flow_indr_block_bind_cb_t *cb, void *cb_priv, - enum flow_block_command command, bool ingress) -{ - struct flow_block_offload bo = { - .command = command, - .binder_type = ingress ? - FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS : - FLOW_BLOCK_BINDER_TYPE_CLSACT_EGRESS, - .net = dev_net(dev), - .block_shared = tcf_block_non_null_shared(block), - }; - INIT_LIST_HEAD(&bo.cb_list); - - if (!block) - return; - - bo.block = &block->flow_block; - - down_write(&block->cb_lock); - cb(dev, cb_priv, TC_SETUP_BLOCK, &bo); - - tcf_block_setup(block, &bo); - up_write(&block->cb_lock); -} - -static struct tcf_block *tc_dev_block(struct net_device *dev, bool ingress) -{ - const struct Qdisc_class_ops *cops; - const struct Qdisc_ops *ops; - struct Qdisc *qdisc; - - if (!dev_ingress_queue(dev)) - return NULL; - - qdisc = dev_ingress_queue(dev)->qdisc_sleeping; - if (!qdisc) - return NULL; - - ops = qdisc->ops; - if (!ops) - return NULL; - - if (!ingress && !strcmp("ingress", ops->id)) - return NULL; - - cops = ops->cl_ops; - if (!cops) - return NULL; - - if (!cops->tcf_block) - return NULL; - - return cops->tcf_block(qdisc, - ingress ? TC_H_MIN_INGRESS : TC_H_MIN_EGRESS, - NULL); -} - -static void tc_indr_block_get_and_cmd(struct net_device *dev, - flow_indr_block_bind_cb_t *cb, - void *cb_priv, - enum flow_block_command command) -{ - struct tcf_block *block; - - block = tc_dev_block(dev, true); - tc_indr_block_cmd(dev, block, cb, cb_priv, command, true); - - block = tc_dev_block(dev, false); - tc_indr_block_cmd(dev, block, cb, cb_priv, command, false); -} - static void tcf_block_offload_init(struct flow_block_offload *bo, struct net_device *dev, enum flow_block_command command, @@ -3836,11 +3764,6 @@ static struct pernet_operations tcf_net_ops = { .size = sizeof(struct tcf_net), }; -static struct flow_indr_block_entry block_entry = { - .cb = tc_indr_block_get_and_cmd, - .list = LIST_HEAD_INIT(block_entry.list), -}; - static int __init tc_filter_init(void) { int err; @@ -3853,8 +3776,6 @@ static int __init tc_filter_init(void) if (err) goto err_register_pernet_subsys; - flow_indr_add_block_cb(&block_entry); - rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_new_tfilter, NULL, RTNL_FLAG_DOIT_UNLOCKED); rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_del_tfilter, NULL, -- cgit v1.2.3 From 678eb199cc9df3bf1cb12fb2da22768b8d1b6bf3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:36 +0300 Subject: devlink: Create dedicated trap group for layer 3 exceptions Packets that hit exceptions during layer 3 forwarding must be trapped to the CPU for the control plane to function properly. Create a dedicated group for them, so that user space could choose to assign a different policer for them. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 7 +++++-- include/net/devlink.h | 3 +++ net/core/devlink.c | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index fe089acb7783..4ca241e70064 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -277,8 +277,11 @@ narrow. The description of these groups must be added to the following table: - Contains packet traps for packets that were dropped by the device during layer 2 forwarding (i.e., bridge) * - ``l3_drops`` - - Contains packet traps for packets that were dropped by the device or hit - an exception (e.g., TTL error) during layer 3 forwarding + - Contains packet traps for packets that were dropped by the device during + layer 3 forwarding + * - ``l3_exceptions`` + - Contains packet traps for packets that hit an exception (e.g., TTL + error) during layer 3 forwarding * - ``buffer_drops`` - Contains packet traps for packets that were dropped by the device due to an enqueue decision diff --git a/include/net/devlink.h b/include/net/devlink.h index 8ffc1b5cd89b..851388c9d795 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -657,6 +657,7 @@ enum devlink_trap_generic_id { enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_L2_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_L3_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_L3_EXCEPTIONS, DEVLINK_TRAP_GROUP_GENERIC_ID_BUFFER_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_TUNNEL_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_DROPS, @@ -730,6 +731,8 @@ enum devlink_trap_group_generic_id { "l2_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L3_DROPS \ "l3_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_L3_EXCEPTIONS \ + "l3_exceptions" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_BUFFER_DROPS \ "buffer_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_TUNNEL_DROPS \ diff --git a/net/core/devlink.c b/net/core/devlink.c index 7b76e5fffc10..d9fff7083f02 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8505,6 +8505,7 @@ static const struct devlink_trap devlink_trap_generic[] = { static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(L2_DROPS), DEVLINK_TRAP_GROUP(L3_DROPS), + DEVLINK_TRAP_GROUP(L3_EXCEPTIONS), DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), DEVLINK_TRAP_GROUP(ACL_DROPS), -- cgit v1.2.3 From 9eefeabed6f831018c15bd7e17d34967ee34d9dd Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:39 +0300 Subject: devlink: Add 'mirror' trap action The action is used by control traps such as IGMP query. The packet is flooded by the device, but also trapped to the CPU in order for the software bridge to mark the receiving port as a multicast router port. Such packets are marked with 'skb->offload_fwd_mark = 1' in order to prevent the software bridge from flooding them again. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 2 ++ include/uapi/linux/devlink.h | 3 +++ net/core/devlink.c | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 4ca241e70064..5b97327caefc 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -108,6 +108,8 @@ The ``devlink-trap`` mechanism supports the following packet trap actions: * ``trap``: The sole copy of the packet is sent to the CPU. * ``drop``: The packet is dropped by the underlying device and a copy is not sent to the CPU. + * ``mirror``: The packet is forwarded by the underlying device and a copy is + sent to the CPU. Generic Packet Traps ==================== diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1ae90e06c06d..16305932a950 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -233,10 +233,13 @@ enum { * @DEVLINK_TRAP_ACTION_DROP: Packet is dropped by the device and a copy is not * sent to the CPU. * @DEVLINK_TRAP_ACTION_TRAP: The sole copy of the packet is sent to the CPU. + * @DEVLINK_TRAP_ACTION_MIRROR: Packet is forwarded by the device and a copy is + * sent to the CPU. */ enum devlink_trap_action { DEVLINK_TRAP_ACTION_DROP, DEVLINK_TRAP_ACTION_TRAP, + DEVLINK_TRAP_ACTION_MIRROR, }; /** diff --git a/net/core/devlink.c b/net/core/devlink.c index d9fff7083f02..d6298917b077 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -5869,7 +5869,8 @@ devlink_trap_action_get_from_info(struct genl_info *info, val = nla_get_u8(info->attrs[DEVLINK_ATTR_TRAP_ACTION]); switch (val) { case DEVLINK_TRAP_ACTION_DROP: /* fall-through */ - case DEVLINK_TRAP_ACTION_TRAP: + case DEVLINK_TRAP_ACTION_TRAP: /* fall-through */ + case DEVLINK_TRAP_ACTION_MIRROR: *p_trap_action = val; break; default: -- cgit v1.2.3 From 30a4e9a29ab9aadfe6c5386ae4aa396b1d2556c2 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:40 +0300 Subject: devlink: Add 'control' trap type This type is used for traps that trap control packets such as ARP request and IGMP query to the CPU. Do not report such packets to the kernel's drop monitor as they were not dropped by the device no encountered an exception during forwarding. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 8 +++++++- include/uapi/linux/devlink.h | 6 ++++++ net/core/devlink.c | 7 +++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 5b97327caefc..6c293cfa23ee 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -55,7 +55,7 @@ The following diagram provides a general overview of ``devlink-trap``:: | | +-------^--------+ | - | + | Non-control traps | +----+----+ | | Kernel's Rx path @@ -97,6 +97,12 @@ The ``devlink-trap`` mechanism supports the following packet trap types: processed by ``devlink`` and injected to the kernel's Rx path. Changing the action of such traps is not allowed, as it can easily break the control plane. + * ``control``: Trapped packets were trapped by the device because these are + control packets required for the correct functioning of the control plane. + For example, ARP request and IGMP query packets. Packets are injected to + the kernel's Rx path, but not reported to the kernel's drop monitor. + Changing the action of such traps is not allowed, as it can easily break + the control plane. .. _Trap-Actions: diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 16305932a950..08563e6a424d 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -253,10 +253,16 @@ enum devlink_trap_action { * control plane for resolution. Trapped packets * are processed by devlink and injected to * the kernel's Rx path. + * @DEVLINK_TRAP_TYPE_CONTROL: Packet was trapped because it is required for + * the correct functioning of the control plane. + * For example, an ARP request packet. Trapped + * packets are injected to the kernel's Rx path, + * but not reported to drop monitor. */ enum devlink_trap_type { DEVLINK_TRAP_TYPE_DROP, DEVLINK_TRAP_TYPE_EXCEPTION, + DEVLINK_TRAP_TYPE_CONTROL, }; enum { diff --git a/net/core/devlink.c b/net/core/devlink.c index d6298917b077..47c28e0f848f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8847,6 +8847,13 @@ void devlink_trap_report(struct devlink *devlink, struct sk_buff *skb, devlink_trap_stats_update(trap_item->stats, skb->len); devlink_trap_stats_update(trap_item->group_item->stats, skb->len); + /* Control packets were not dropped by the device or encountered an + * exception during forwarding and therefore should not be reported to + * the kernel's drop monitor. + */ + if (trap_item->trap->type == DEVLINK_TRAP_TYPE_CONTROL) + return; + devlink_trap_report_metadata_fill(&hw_metadata, trap_item, in_devlink_port, fa_cookie); net_dm_hw_report(skb, &hw_metadata); -- cgit v1.2.3 From 515eac677fe119433c2a466443bef95c10c550cc Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:41 +0300 Subject: devlink: Add layer 2 control packet traps Add layer 2 control packet traps such as STP and IGMP query, so that capable device drivers could register them with devlink. Add documentation for every added packet trap and packet trap group. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 45 +++++++++++++++++++++ include/net/devlink.h | 48 +++++++++++++++++++++++ net/core/devlink.c | 16 ++++++++ 3 files changed, 109 insertions(+) (limited to 'net/core') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 6c293cfa23ee..e9fc3c9d7d7a 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -252,6 +252,42 @@ be added to the following table: * - ``egress_flow_action_drop`` - ``drop`` - Traps packets dropped during processing of egress flow action drop + * - ``stp`` + - ``control`` + - Traps STP packets + * - ``lacp`` + - ``control`` + - Traps LACP packets + * - ``lldp`` + - ``control`` + - Traps LLDP packets + * - ``igmp_query`` + - ``control`` + - Traps IGMP Membership Query packets + * - ``igmp_v1_report`` + - ``control`` + - Traps IGMP Version 1 Membership Report packets + * - ``igmp_v2_report`` + - ``control`` + - Traps IGMP Version 2 Membership Report packets + * - ``igmp_v3_report`` + - ``control`` + - Traps IGMP Version 3 Membership Report packets + * - ``igmp_v2_leave`` + - ``control`` + - Traps IGMP Version 2 Leave Group packets + * - ``mld_query`` + - ``control`` + - Traps MLD Multicast Listener Query packets + * - ``mld_v1_report`` + - ``control`` + - Traps MLD Version 1 Multicast Listener Report packets + * - ``mld_v2_report`` + - ``control`` + - Traps MLD Version 2 Multicast Listener Report packets + * - ``mld_v1_done`` + - ``control`` + - Traps MLD Version 1 Multicast Listener Done packets Driver-specific Packet Traps ============================ @@ -299,6 +335,15 @@ narrow. The description of these groups must be added to the following table: * - ``acl_drops`` - Contains packet traps for packets that were dropped by the device during ACL processing + * - ``stp`` + - Contains packet traps for STP packets + * - ``lacp`` + - Contains packet traps for LACP packets + * - ``lldp`` + - Contains packet traps for LLDP packets + * - ``mc_snooping`` + - Contains packet traps for IGMP and MLD packets required for multicast + snooping Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index 851388c9d795..c0061542ad65 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -645,6 +645,18 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_OVERLAY_SMAC_MC, DEVLINK_TRAP_GENERIC_ID_INGRESS_FLOW_ACTION_DROP, DEVLINK_TRAP_GENERIC_ID_EGRESS_FLOW_ACTION_DROP, + DEVLINK_TRAP_GENERIC_ID_STP, + DEVLINK_TRAP_GENERIC_ID_LACP, + DEVLINK_TRAP_GENERIC_ID_LLDP, + DEVLINK_TRAP_GENERIC_ID_IGMP_QUERY, + DEVLINK_TRAP_GENERIC_ID_IGMP_V1_REPORT, + DEVLINK_TRAP_GENERIC_ID_IGMP_V2_REPORT, + DEVLINK_TRAP_GENERIC_ID_IGMP_V3_REPORT, + DEVLINK_TRAP_GENERIC_ID_IGMP_V2_LEAVE, + DEVLINK_TRAP_GENERIC_ID_MLD_QUERY, + DEVLINK_TRAP_GENERIC_ID_MLD_V1_REPORT, + DEVLINK_TRAP_GENERIC_ID_MLD_V2_REPORT, + DEVLINK_TRAP_GENERIC_ID_MLD_V1_DONE, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -661,6 +673,10 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_BUFFER_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_TUNNEL_DROPS, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_STP, + DEVLINK_TRAP_GROUP_GENERIC_ID_LACP, + DEVLINK_TRAP_GROUP_GENERIC_ID_LLDP, + DEVLINK_TRAP_GROUP_GENERIC_ID_MC_SNOOPING, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -726,6 +742,30 @@ enum devlink_trap_group_generic_id { "ingress_flow_action_drop" #define DEVLINK_TRAP_GENERIC_NAME_EGRESS_FLOW_ACTION_DROP \ "egress_flow_action_drop" +#define DEVLINK_TRAP_GENERIC_NAME_STP \ + "stp" +#define DEVLINK_TRAP_GENERIC_NAME_LACP \ + "lacp" +#define DEVLINK_TRAP_GENERIC_NAME_LLDP \ + "lldp" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_QUERY \ + "igmp_query" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V1_REPORT \ + "igmp_v1_report" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V2_REPORT \ + "igmp_v2_report" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V3_REPORT \ + "igmp_v3_report" +#define DEVLINK_TRAP_GENERIC_NAME_IGMP_V2_LEAVE \ + "igmp_v2_leave" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_QUERY \ + "mld_query" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_V1_REPORT \ + "mld_v1_report" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_V2_REPORT \ + "mld_v2_report" +#define DEVLINK_TRAP_GENERIC_NAME_MLD_V1_DONE \ + "mld_v1_done" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -739,6 +779,14 @@ enum devlink_trap_group_generic_id { "tunnel_drops" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_ACL_DROPS \ "acl_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_STP \ + "stp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_LACP \ + "lacp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_LLDP \ + "lldp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_MC_SNOOPING \ + "mc_snooping" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index 47c28e0f848f..c91ef1b5f738 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8495,6 +8495,18 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(OVERLAY_SMAC_MC, DROP), DEVLINK_TRAP(INGRESS_FLOW_ACTION_DROP, DROP), DEVLINK_TRAP(EGRESS_FLOW_ACTION_DROP, DROP), + DEVLINK_TRAP(STP, CONTROL), + DEVLINK_TRAP(LACP, CONTROL), + DEVLINK_TRAP(LLDP, CONTROL), + DEVLINK_TRAP(IGMP_QUERY, CONTROL), + DEVLINK_TRAP(IGMP_V1_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V3_REPORT, CONTROL), + DEVLINK_TRAP(IGMP_V2_LEAVE, CONTROL), + DEVLINK_TRAP(MLD_QUERY, CONTROL), + DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), + DEVLINK_TRAP(MLD_V1_DONE, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8510,6 +8522,10 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(BUFFER_DROPS), DEVLINK_TRAP_GROUP(TUNNEL_DROPS), DEVLINK_TRAP_GROUP(ACL_DROPS), + DEVLINK_TRAP_GROUP(STP), + DEVLINK_TRAP_GROUP(LACP), + DEVLINK_TRAP_GROUP(LLDP), + DEVLINK_TRAP_GROUP(MC_SNOOPING), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3 From d77cfd162a346259222d0207a95bf1a0cc0c2520 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:42 +0300 Subject: devlink: Add layer 3 control packet traps Add layer 3 control packet traps such as ARP and DHCP, so that capable device drivers could register them with devlink. Add documentation for every added packet trap and packet trap group. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 143 ++++++++++++++++++++++ include/net/devlink.h | 126 +++++++++++++++++++ net/core/devlink.c | 42 +++++++ 3 files changed, 311 insertions(+) (limited to 'net/core') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index e9fc3c9d7d7a..621b634b16be 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -288,6 +288,115 @@ be added to the following table: * - ``mld_v1_done`` - ``control`` - Traps MLD Version 1 Multicast Listener Done packets + * - ``ipv4_dhcp`` + - ``control`` + - Traps IPv4 DHCP packets + * - ``ipv6_dhcp`` + - ``control`` + - Traps IPv6 DHCP packets + * - ``arp_request`` + - ``control`` + - Traps ARP request packets + * - ``arp_response`` + - ``control`` + - Traps ARP response packets + * - ``arp_overlay`` + - ``control`` + - Traps NVE-decapsulated ARP packets that reached the overlay network. + This is required, for example, when the address that needs to be + resolved is a local address + * - ``ipv6_neigh_solicit`` + - ``control`` + - Traps IPv6 Neighbour Solicitation packets + * - ``ipv6_neigh_advert`` + - ``control`` + - Traps IPv6 Neighbour Advertisement packets + * - ``ipv4_bfd`` + - ``control`` + - Traps IPv4 BFD packets + * - ``ipv6_bfd`` + - ``control`` + - Traps IPv6 BFD packets + * - ``ipv4_ospf`` + - ``control`` + - Traps IPv4 OSPF packets + * - ``ipv6_ospf`` + - ``control`` + - Traps IPv6 OSPF packets + * - ``ipv4_bgp`` + - ``control`` + - Traps IPv4 BGP packets + * - ``ipv6_bgp`` + - ``control`` + - Traps IPv6 BGP packets + * - ``ipv4_vrrp`` + - ``control`` + - Traps IPv4 VRRP packets + * - ``ipv6_vrrp`` + - ``control`` + - Traps IPv6 VRRP packets + * - ``ipv4_pim`` + - ``control`` + - Traps IPv4 PIM packets + * - ``ipv6_pim`` + - ``control`` + - Traps IPv6 PIM packets + * - ``uc_loopback`` + - ``control`` + - Traps unicast packets that need to be routed through the same layer 3 + interface from which they were received. Such packets are routed by the + kernel, but also cause it to potentially generate ICMP redirect packets + * - ``local_route`` + - ``control`` + - Traps unicast packets that hit a local route and need to be locally + delivered + * - ``external_route`` + - ``control`` + - Traps packets that should be routed through an external interface (e.g., + management interface) that does not belong to the same device (e.g., + switch ASIC) as the ingress interface + * - ``ipv6_uc_dip_link_local_scope`` + - ``control`` + - Traps unicast IPv6 packets that need to be routed and have a destination + IP address with a link-local scope (i.e., fe80::/10). The trap allows + device drivers to avoid programming link-local routes, but still receive + packets for local delivery + * - ``ipv6_dip_all_nodes`` + - ``control`` + - Traps IPv6 packets that their destination IP address is the "All Nodes + Address" (i.e., ff02::1) + * - ``ipv6_dip_all_routers`` + - ``control`` + - Traps IPv6 packets that their destination IP address is the "All Routers + Address" (i.e., ff02::2) + * - ``ipv6_router_solicit`` + - ``control`` + - Traps IPv6 Router Solicitation packets + * - ``ipv6_router_advert`` + - ``control`` + - Traps IPv6 Router Advertisement packets + * - ``ipv6_redirect`` + - ``control`` + - Traps IPv6 Redirect Message packets + * - ``ipv4_router_alert`` + - ``control`` + - Traps IPv4 packets that need to be routed and include the Router Alert + option. Such packets need to be locally delivered to raw sockets that + have the IP_ROUTER_ALERT socket option set + * - ``ipv6_router_alert`` + - ``control`` + - Traps IPv6 packets that need to be routed and include the Router Alert + option in their Hop-by-Hop extension header. Such packets need to be + locally delivered to raw sockets that have the IPV6_ROUTER_ALERT socket + option set + * - ``ptp_event`` + - ``control`` + - Traps PTP time-critical event messages (Sync, Delay_req, Pdelay_Req and + Pdelay_Resp) + * - ``ptp_general`` + - ``control`` + - Traps PTP general messages (Announce, Follow_Up, Delay_Resp, + Pdelay_Resp_Follow_Up, management and signaling) Driver-specific Packet Traps ============================ @@ -344,6 +453,40 @@ narrow. The description of these groups must be added to the following table: * - ``mc_snooping`` - Contains packet traps for IGMP and MLD packets required for multicast snooping + * - ``dhcp`` + - Contains packet traps for DHCP packets + * - ``neigh_discovery`` + - Contains packet traps for neighbour discovery packets (e.g., ARP, IPv6 + ND) + * - ``bfd`` + - Contains packet traps for BFD packets + * - ``ospf`` + - Contains packet traps for OSPF packets + * - ``bgp`` + - Contains packet traps for BGP packets + * - ``vrrp`` + - Contains packet traps for VRRP packets + * - ``pim`` + - Contains packet traps for PIM packets + * - ``uc_loopback`` + - Contains a packet trap for unicast loopback packets (i.e., + ``uc_loopback``). This trap is singled-out because in cases such as + one-armed router it will be constantly triggered. To limit the impact on + the CPU usage, a packet trap policer with a low rate can be bound to the + group without affecting other traps + * - ``local_delivery`` + - Contains packet traps for packets that should be locally delivered after + routing, but do not match more specific packet traps (e.g., + ``ipv4_bgp``) + * - ``ipv6`` + - Contains packet traps for various IPv6 control packets (e.g., Router + Advertisements) + * - ``ptp_event`` + - Contains packet traps for PTP time-critical event messages (Sync, + Delay_req, Pdelay_Req and Pdelay_Resp) + * - ``ptp_general`` + - Contains packet traps for PTP general messages (Announce, Follow_Up, + Delay_Resp, Pdelay_Resp_Follow_Up, management and signaling) Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index c0061542ad65..05a45dea976b 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -657,6 +657,36 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_MLD_V1_REPORT, DEVLINK_TRAP_GENERIC_ID_MLD_V2_REPORT, DEVLINK_TRAP_GENERIC_ID_MLD_V1_DONE, + DEVLINK_TRAP_GENERIC_ID_IPV4_DHCP, + DEVLINK_TRAP_GENERIC_ID_IPV6_DHCP, + DEVLINK_TRAP_GENERIC_ID_ARP_REQUEST, + DEVLINK_TRAP_GENERIC_ID_ARP_RESPONSE, + DEVLINK_TRAP_GENERIC_ID_ARP_OVERLAY, + DEVLINK_TRAP_GENERIC_ID_IPV6_NEIGH_SOLICIT, + DEVLINK_TRAP_GENERIC_ID_IPV6_NEIGH_ADVERT, + DEVLINK_TRAP_GENERIC_ID_IPV4_BFD, + DEVLINK_TRAP_GENERIC_ID_IPV6_BFD, + DEVLINK_TRAP_GENERIC_ID_IPV4_OSPF, + DEVLINK_TRAP_GENERIC_ID_IPV6_OSPF, + DEVLINK_TRAP_GENERIC_ID_IPV4_BGP, + DEVLINK_TRAP_GENERIC_ID_IPV6_BGP, + DEVLINK_TRAP_GENERIC_ID_IPV4_VRRP, + DEVLINK_TRAP_GENERIC_ID_IPV6_VRRP, + DEVLINK_TRAP_GENERIC_ID_IPV4_PIM, + DEVLINK_TRAP_GENERIC_ID_IPV6_PIM, + DEVLINK_TRAP_GENERIC_ID_UC_LB, + DEVLINK_TRAP_GENERIC_ID_LOCAL_ROUTE, + DEVLINK_TRAP_GENERIC_ID_EXTERNAL_ROUTE, + DEVLINK_TRAP_GENERIC_ID_IPV6_UC_DIP_LINK_LOCAL_SCOPE, + DEVLINK_TRAP_GENERIC_ID_IPV6_DIP_ALL_NODES, + DEVLINK_TRAP_GENERIC_ID_IPV6_DIP_ALL_ROUTERS, + DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_SOLICIT, + DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_ADVERT, + DEVLINK_TRAP_GENERIC_ID_IPV6_REDIRECT, + DEVLINK_TRAP_GENERIC_ID_IPV4_ROUTER_ALERT, + DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_ALERT, + DEVLINK_TRAP_GENERIC_ID_PTP_EVENT, + DEVLINK_TRAP_GENERIC_ID_PTP_GENERAL, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -677,6 +707,18 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_LACP, DEVLINK_TRAP_GROUP_GENERIC_ID_LLDP, DEVLINK_TRAP_GROUP_GENERIC_ID_MC_SNOOPING, + DEVLINK_TRAP_GROUP_GENERIC_ID_DHCP, + DEVLINK_TRAP_GROUP_GENERIC_ID_NEIGH_DISCOVERY, + DEVLINK_TRAP_GROUP_GENERIC_ID_BFD, + DEVLINK_TRAP_GROUP_GENERIC_ID_OSPF, + DEVLINK_TRAP_GROUP_GENERIC_ID_BGP, + DEVLINK_TRAP_GROUP_GENERIC_ID_VRRP, + DEVLINK_TRAP_GROUP_GENERIC_ID_PIM, + DEVLINK_TRAP_GROUP_GENERIC_ID_UC_LB, + DEVLINK_TRAP_GROUP_GENERIC_ID_LOCAL_DELIVERY, + DEVLINK_TRAP_GROUP_GENERIC_ID_IPV6, + DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_EVENT, + DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_GENERAL, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -766,6 +808,66 @@ enum devlink_trap_group_generic_id { "mld_v2_report" #define DEVLINK_TRAP_GENERIC_NAME_MLD_V1_DONE \ "mld_v1_done" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_DHCP \ + "ipv4_dhcp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_DHCP \ + "ipv6_dhcp" +#define DEVLINK_TRAP_GENERIC_NAME_ARP_REQUEST \ + "arp_request" +#define DEVLINK_TRAP_GENERIC_NAME_ARP_RESPONSE \ + "arp_response" +#define DEVLINK_TRAP_GENERIC_NAME_ARP_OVERLAY \ + "arp_overlay" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_NEIGH_SOLICIT \ + "ipv6_neigh_solicit" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_NEIGH_ADVERT \ + "ipv6_neigh_advert" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_BFD \ + "ipv4_bfd" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_BFD \ + "ipv6_bfd" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_OSPF \ + "ipv4_ospf" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_OSPF \ + "ipv6_ospf" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_BGP \ + "ipv4_bgp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_BGP \ + "ipv6_bgp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_VRRP \ + "ipv4_vrrp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_VRRP \ + "ipv6_vrrp" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_PIM \ + "ipv4_pim" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_PIM \ + "ipv6_pim" +#define DEVLINK_TRAP_GENERIC_NAME_UC_LB \ + "uc_loopback" +#define DEVLINK_TRAP_GENERIC_NAME_LOCAL_ROUTE \ + "local_route" +#define DEVLINK_TRAP_GENERIC_NAME_EXTERNAL_ROUTE \ + "external_route" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_UC_DIP_LINK_LOCAL_SCOPE \ + "ipv6_uc_dip_link_local_scope" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_DIP_ALL_NODES \ + "ipv6_dip_all_nodes" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_DIP_ALL_ROUTERS \ + "ipv6_dip_all_routers" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_ROUTER_SOLICIT \ + "ipv6_router_solicit" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_ROUTER_ADVERT \ + "ipv6_router_advert" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_REDIRECT \ + "ipv6_redirect" +#define DEVLINK_TRAP_GENERIC_NAME_IPV4_ROUTER_ALERT \ + "ipv4_router_alert" +#define DEVLINK_TRAP_GENERIC_NAME_IPV6_ROUTER_ALERT \ + "ipv6_router_alert" +#define DEVLINK_TRAP_GENERIC_NAME_PTP_EVENT \ + "ptp_event" +#define DEVLINK_TRAP_GENERIC_NAME_PTP_GENERAL \ + "ptp_general" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -787,6 +889,30 @@ enum devlink_trap_group_generic_id { "lldp" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_MC_SNOOPING \ "mc_snooping" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_DHCP \ + "dhcp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_NEIGH_DISCOVERY \ + "neigh_discovery" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_BFD \ + "bfd" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_OSPF \ + "ospf" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_BGP \ + "bgp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_VRRP \ + "vrrp" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_PIM \ + "pim" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_UC_LB \ + "uc_loopback" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_LOCAL_DELIVERY \ + "local_delivery" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_IPV6 \ + "ipv6" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_PTP_EVENT \ + "ptp_event" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_PTP_GENERAL \ + "ptp_general" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index c91ef1b5f738..f32854c3d0e7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8507,6 +8507,36 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(MLD_V1_REPORT, CONTROL), DEVLINK_TRAP(MLD_V2_REPORT, CONTROL), DEVLINK_TRAP(MLD_V1_DONE, CONTROL), + DEVLINK_TRAP(IPV4_DHCP, CONTROL), + DEVLINK_TRAP(IPV6_DHCP, CONTROL), + DEVLINK_TRAP(ARP_REQUEST, CONTROL), + DEVLINK_TRAP(ARP_RESPONSE, CONTROL), + DEVLINK_TRAP(ARP_OVERLAY, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_NEIGH_ADVERT, CONTROL), + DEVLINK_TRAP(IPV4_BFD, CONTROL), + DEVLINK_TRAP(IPV6_BFD, CONTROL), + DEVLINK_TRAP(IPV4_OSPF, CONTROL), + DEVLINK_TRAP(IPV6_OSPF, CONTROL), + DEVLINK_TRAP(IPV4_BGP, CONTROL), + DEVLINK_TRAP(IPV6_BGP, CONTROL), + DEVLINK_TRAP(IPV4_VRRP, CONTROL), + DEVLINK_TRAP(IPV6_VRRP, CONTROL), + DEVLINK_TRAP(IPV4_PIM, CONTROL), + DEVLINK_TRAP(IPV6_PIM, CONTROL), + DEVLINK_TRAP(UC_LB, CONTROL), + DEVLINK_TRAP(LOCAL_ROUTE, CONTROL), + DEVLINK_TRAP(EXTERNAL_ROUTE, CONTROL), + DEVLINK_TRAP(IPV6_UC_DIP_LINK_LOCAL_SCOPE, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_NODES, CONTROL), + DEVLINK_TRAP(IPV6_DIP_ALL_ROUTERS, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_SOLICIT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ADVERT, CONTROL), + DEVLINK_TRAP(IPV6_REDIRECT, CONTROL), + DEVLINK_TRAP(IPV4_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), + DEVLINK_TRAP(PTP_EVENT, CONTROL), + DEVLINK_TRAP(PTP_GENERAL, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8526,6 +8556,18 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(LACP), DEVLINK_TRAP_GROUP(LLDP), DEVLINK_TRAP_GROUP(MC_SNOOPING), + DEVLINK_TRAP_GROUP(DHCP), + DEVLINK_TRAP_GROUP(NEIGH_DISCOVERY), + DEVLINK_TRAP_GROUP(BFD), + DEVLINK_TRAP_GROUP(OSPF), + DEVLINK_TRAP_GROUP(BGP), + DEVLINK_TRAP_GROUP(VRRP), + DEVLINK_TRAP_GROUP(PIM), + DEVLINK_TRAP_GROUP(UC_LB), + DEVLINK_TRAP_GROUP(LOCAL_DELIVERY), + DEVLINK_TRAP_GROUP(IPV6), + DEVLINK_TRAP_GROUP(PTP_EVENT), + DEVLINK_TRAP_GROUP(PTP_GENERAL), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3 From 5eb18a2b6c11bf165271644ef1ab812b10659c8f Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Fri, 29 May 2020 21:36:43 +0300 Subject: devlink: Add ACL control packet traps Add packet traps for packets that are sampled / trapped by ACLs, so that capable drivers could register them with devlink. Add documentation for every added packet trap and packet trap group. Signed-off-by: Ido Schimmel Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- Documentation/networking/devlink/devlink-trap.rst | 14 ++++++++++++++ include/net/devlink.h | 12 ++++++++++++ net/core/devlink.c | 4 ++++ 3 files changed, 30 insertions(+) (limited to 'net/core') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 621b634b16be..1e3f3ffee248 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -397,6 +397,14 @@ be added to the following table: - ``control`` - Traps PTP general messages (Announce, Follow_Up, Delay_Resp, Pdelay_Resp_Follow_Up, management and signaling) + * - ``flow_action_sample`` + - ``control`` + - Traps packets sampled during processing of flow action sample (e.g., via + tc's sample action) + * - ``flow_action_trap`` + - ``control`` + - Traps packets logged during processing of flow action trap (e.g., via + tc's trap action) Driver-specific Packet Traps ============================ @@ -487,6 +495,12 @@ narrow. The description of these groups must be added to the following table: * - ``ptp_general`` - Contains packet traps for PTP general messages (Announce, Follow_Up, Delay_Resp, Pdelay_Resp_Follow_Up, management and signaling) + * - ``acl_sample`` + - Contains packet traps for packets that were sampled by the device during + ACL processing + * - ``acl_trap`` + - Contains packet traps for packets that were trapped (logged) by the + device during ACL processing Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index 05a45dea976b..1df6dfec26c2 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -687,6 +687,8 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_IPV6_ROUTER_ALERT, DEVLINK_TRAP_GENERIC_ID_PTP_EVENT, DEVLINK_TRAP_GENERIC_ID_PTP_GENERAL, + DEVLINK_TRAP_GENERIC_ID_FLOW_ACTION_SAMPLE, + DEVLINK_TRAP_GENERIC_ID_FLOW_ACTION_TRAP, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -719,6 +721,8 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_IPV6, DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_EVENT, DEVLINK_TRAP_GROUP_GENERIC_ID_PTP_GENERAL, + DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_SAMPLE, + DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_TRAP, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -868,6 +872,10 @@ enum devlink_trap_group_generic_id { "ptp_event" #define DEVLINK_TRAP_GENERIC_NAME_PTP_GENERAL \ "ptp_general" +#define DEVLINK_TRAP_GENERIC_NAME_FLOW_ACTION_SAMPLE \ + "flow_action_sample" +#define DEVLINK_TRAP_GENERIC_NAME_FLOW_ACTION_TRAP \ + "flow_action_trap" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -913,6 +921,10 @@ enum devlink_trap_group_generic_id { "ptp_event" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_PTP_GENERAL \ "ptp_general" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_ACL_SAMPLE \ + "acl_sample" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_ACL_TRAP \ + "acl_trap" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index f32854c3d0e7..2cafbc808b09 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8537,6 +8537,8 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(IPV6_ROUTER_ALERT, CONTROL), DEVLINK_TRAP(PTP_EVENT, CONTROL), DEVLINK_TRAP(PTP_GENERAL, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_SAMPLE, CONTROL), + DEVLINK_TRAP(FLOW_ACTION_TRAP, CONTROL), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -8568,6 +8570,8 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(IPV6), DEVLINK_TRAP_GROUP(PTP_EVENT), DEVLINK_TRAP_GROUP(PTP_GENERAL), + DEVLINK_TRAP_GROUP(ACL_SAMPLE), + DEVLINK_TRAP_GROUP(ACL_TRAP), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3 From abe3cac8706bffeda3ebc06e4a9fa6e9cadacf26 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:50:33 -0700 Subject: bpf, sk_msg: Add some generic helpers that may be useful from sk_msg Add these generic helpers that may be useful to use from sk_msg programs. The helpers do not depend on ctx so we can simply add them here, BPF_FUNC_perf_event_output BPF_FUNC_get_current_uid_gid BPF_FUNC_get_current_pid_tgid BPF_FUNC_get_current_cgroup_id BPF_FUNC_get_current_ancestor_cgroup_id BPF_FUNC_get_cgroup_classid Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033903373.12355.15489763099696629346.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- net/core/filter.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index bd2853d23b50..c3b496a19748 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6443,6 +6443,22 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_push_data_proto; case BPF_FUNC_msg_pop_data: return &bpf_msg_pop_data_proto; + case BPF_FUNC_perf_event_output: + return &bpf_event_output_data_proto; + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_get_current_pid_tgid: + return &bpf_get_current_pid_tgid_proto; +#ifdef CONFIG_CGROUPS + case BPF_FUNC_get_current_cgroup_id: + return &bpf_get_current_cgroup_id_proto; + case BPF_FUNC_get_current_ancestor_cgroup_id: + return &bpf_get_current_ancestor_cgroup_id_proto; +#endif +#ifdef CONFIG_CGROUP_NET_CLASSID + case BPF_FUNC_get_cgroup_classid: + return &bpf_get_cgroup_classid_curr_proto; +#endif default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 13d70f5a5ecff367db2fb18ed4ebe433eab8a74c Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Sun, 24 May 2020 09:51:15 -0700 Subject: bpf, sk_msg: Add get socket storage helpers Add helpers to use local socket storage. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/159033907577.12355.14740125020572756560.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 15 +++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ 3 files changed, 19 insertions(+) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 97e1fd19ff58..54b93f8b49b8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3645,6 +3645,8 @@ struct sk_msg_md { __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ }; struct sk_reuseport_md { diff --git a/net/core/filter.c b/net/core/filter.c index c3b496a19748..a6fc23447f12 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6449,6 +6449,10 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_pid_tgid: return &bpf_get_current_pid_tgid_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_proto; #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: return &bpf_get_current_cgroup_id_proto; @@ -7273,6 +7277,11 @@ static bool sk_msg_is_valid_access(int off, int size, if (size != sizeof(__u64)) return false; break; + case offsetof(struct sk_msg_md, sk): + if (size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCKET; + break; case bpf_ctx_range(struct sk_msg_md, family): case bpf_ctx_range(struct sk_msg_md, remote_ip4): case bpf_ctx_range(struct sk_msg_md, local_ip4): @@ -8609,6 +8618,12 @@ static u32 sk_msg_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct sk_msg_sg, size)); break; + + case offsetof(struct sk_msg_md, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_msg, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_msg, sk)); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 97e1fd19ff58..54b93f8b49b8 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3645,6 +3645,8 @@ struct sk_msg_md { __u32 remote_port; /* Stored in network byte order */ __u32 local_port; /* stored in host byte order */ __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ }; struct sk_reuseport_md { -- cgit v1.2.3 From c3c16f2ea6d20159903cf93afbb1155f3d8348d5 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Tue, 26 May 2020 17:34:36 -0700 Subject: bpf: Add rx_queue_mapping to bpf_sock Add "rx_queue_mapping" to bpf_sock. This gives read access for the existing field (sk_rx_queue_mapping) of struct sock from bpf_sock. Semantics for the bpf_sock rx_queue_mapping access are similar to sk_rx_queue_get(), i.e the value NO_QUEUE_MAPPING is not allowed and -1 is returned in that case. This is useful for transmit queue selection based on the received queue index which is cached in the socket in the receive path. v3: Addressed review comments to add usecase in patch description, and fixed default value for rx_queue_mapping. v2: fixed build error for CONFIG_XPS wrapping, reported by kbuild test robot Signed-off-by: Amritha Nambiar Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 974ca6e948e3..630432c5c292 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3612,6 +3612,7 @@ struct bpf_sock { __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; + __s32 rx_queue_mapping; }; struct bpf_tcp_sock { diff --git a/net/core/filter.c b/net/core/filter.c index a6fc23447f12..0008b029d644 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6849,6 +6849,7 @@ bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, case offsetof(struct bpf_sock, protocol): case offsetof(struct bpf_sock, dst_port): case offsetof(struct bpf_sock, src_port): + case offsetof(struct bpf_sock, rx_queue_mapping): case bpf_ctx_range(struct bpf_sock, src_ip4): case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): case bpf_ctx_range(struct bpf_sock, dst_ip4): @@ -7897,6 +7898,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, skc_state), target_size)); break; + case offsetof(struct bpf_sock, rx_queue_mapping): +#ifdef CONFIG_XPS + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock, sk_rx_queue_mapping), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock, sk_rx_queue_mapping, + sizeof_field(struct sock, + sk_rx_queue_mapping), + target_size)); + *insn++ = BPF_JMP_IMM(BPF_JNE, si->dst_reg, NO_QUEUE_MAPPING, + 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); +#else + *insn++ = BPF_MOV64_IMM(si->dst_reg, -1); + *target_size = 2; +#endif + break; } return insn - insn_buf; -- cgit v1.2.3 From fbee97feed9b3e4acdf9590e1f6b4a2eefecfffe Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:13 -0600 Subject: bpf: Add support to attach bpf program to a devmap entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add BPF_XDP_DEVMAP attach type for use with programs associated with a DEVMAP entry. Allow DEVMAPs to associate a program with a device entry by adding a bpf_prog.fd to 'struct bpf_devmap_val'. Values read show the program id, so the fd and id are a union. bpf programs can get access to the struct via vmlinux.h. The program associated with the fd must have type XDP with expected attach type BPF_XDP_DEVMAP. When a program is associated with a device index, the program is run on an XDP_REDIRECT and before the buffer is added to the per-cpu queue. At this point rxq data is still valid; the next patch adds tx device information allowing the prorgam to see both ingress and egress device indices. XDP generic is skb based and XDP programs do not work with skb's. Block the use case by walking maps used by a program that is to be attached via xdpgeneric and fail if any of them are DEVMAP / DEVMAP_HASH with Block attach of BPF_XDP_DEVMAP programs to devices. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-3-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 5 +++ include/uapi/linux/bpf.h | 1 + kernel/bpf/devmap.c | 88 ++++++++++++++++++++++++++++++++++++++++-- net/core/dev.c | 18 +++++++++ tools/include/uapi/linux/bpf.h | 1 + 5 files changed, 109 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e5884f7f801c..e042311f991f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1250,6 +1250,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, struct net_device *dev_rx); int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb, struct bpf_prog *xdp_prog); +bool dev_map_can_have_prog(struct bpf_map *map); struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key); void __cpu_map_flush(void); @@ -1363,6 +1364,10 @@ static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map { return NULL; } +static inline bool dev_map_can_have_prog(struct bpf_map *map) +{ + return false; +} static inline void __dev_flush(void) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 630432c5c292..f1e364d69007 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index a1459de0914e..0089d56617ec 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -63,12 +63,17 @@ struct xdp_dev_bulk_queue { /* DEVMAP values */ struct bpf_devmap_val { u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + u32 id; /* prog id on map read */ + } bpf_prog; }; struct bpf_dtab_netdev { struct net_device *dev; /* must be first member, due to tracepoint */ struct hlist_node index_hlist; struct bpf_dtab *dtab; + struct bpf_prog *xdp_prog; struct rcu_head rcu; unsigned int idx; struct bpf_devmap_val val; @@ -111,12 +116,18 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { + u32 valsize = attr->value_size; u64 cost = 0; int err; - /* check sanity of attributes */ + /* check sanity of attributes. 2 value sizes supported: + * 4 bytes: ifindex + * 8 bytes: ifindex + prog fd + */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK) + (valsize != offsetofend(struct bpf_devmap_val, ifindex) && + valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) || + attr->map_flags & ~DEV_CREATE_FLAG_MASK) return -EINVAL; /* Lookup returns a pointer straight to dev->ifindex, so make sure the @@ -223,6 +234,8 @@ static void dev_map_free(struct bpf_map *map) hlist_for_each_entry_safe(dev, next, head, index_hlist) { hlist_del_rcu(&dev->index_hlist); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -237,6 +250,8 @@ static void dev_map_free(struct bpf_map *map) if (!dev) continue; + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -323,6 +338,16 @@ static int dev_map_hash_get_next_key(struct bpf_map *map, void *key, return -ENOENT; } +bool dev_map_can_have_prog(struct bpf_map *map) +{ + if ((map->map_type == BPF_MAP_TYPE_DEVMAP || + map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) && + map->value_size != offsetofend(struct bpf_devmap_val, ifindex)) + return true; + + return false; +} + static int bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags) { struct net_device *dev = bq->dev; @@ -447,6 +472,30 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, return bq_enqueue(dev, xdpf, dev_rx); } +static struct xdp_buff *dev_map_run_prog(struct net_device *dev, + struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + u32 act; + + act = bpf_prog_run_xdp(xdp_prog, xdp); + switch (act) { + case XDP_PASS: + return xdp; + case XDP_DROP: + break; + default: + bpf_warn_invalid_xdp_action(act); + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(dev, xdp_prog, act); + break; + } + + xdp_return_buff(xdp); + return NULL; +} + int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp, struct net_device *dev_rx) { @@ -458,6 +507,11 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp, { struct net_device *dev = dst->dev; + if (dst->xdp_prog) { + xdp = dev_map_run_prog(dev, xdp, dst->xdp_prog); + if (!xdp) + return 0; + } return __xdp_enqueue(dev, xdp, dev_rx); } @@ -494,6 +548,8 @@ static void __dev_map_entry_free(struct rcu_head *rcu) struct bpf_dtab_netdev *dev; dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + if (dev->xdp_prog) + bpf_prog_put(dev->xdp_prog); dev_put(dev->dev); kfree(dev); } @@ -547,6 +603,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_devmap_val *val, unsigned int idx) { + struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, @@ -558,11 +615,31 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, if (!dev->dev) goto err_out; + if (val->bpf_prog.fd >= 0) { + prog = bpf_prog_get_type_dev(val->bpf_prog.fd, + BPF_PROG_TYPE_XDP, false); + if (IS_ERR(prog)) + goto err_put_dev; + if (prog->expected_attach_type != BPF_XDP_DEVMAP) + goto err_put_prog; + } + dev->idx = idx; dev->dtab = dtab; + if (prog) { + dev->xdp_prog = prog; + dev->val.bpf_prog.id = prog->aux->id; + } else { + dev->xdp_prog = NULL; + dev->val.bpf_prog.id = 0; + } dev->val.ifindex = val->ifindex; return dev; +err_put_prog: + bpf_prog_put(prog); +err_put_dev: + dev_put(dev->dev); err_out: kfree(dev); return ERR_PTR(-EINVAL); @@ -572,8 +649,8 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 i = *(u32 *)key; if (unlikely(map_flags > BPF_EXIST)) @@ -588,6 +665,9 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map, if (!val.ifindex) { dev = NULL; + /* can not specify fd if ifindex is 0 */ + if (val.bpf_prog.fd != -1) + return -EINVAL; } else { dev = __dev_map_alloc_node(net, dtab, &val, i); if (IS_ERR(dev)) @@ -616,8 +696,8 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_devmap_val val = { .bpf_prog.fd = -1 }; struct bpf_dtab_netdev *dev, *old_dev; - struct bpf_devmap_val val = { }; u32 idx = *(u32 *)key; unsigned long flags; int err = -EEXIST; diff --git a/net/core/dev.c b/net/core/dev.c index ae37586f6ee8..10684833f864 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5420,6 +5420,18 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) struct bpf_prog *new = xdp->prog; int ret = 0; + if (new) { + u32 i; + + /* generic XDP does not work with DEVMAPs that can + * have a bpf_prog installed on an entry + */ + for (i = 0; i < new->aux->used_map_cnt; i++) { + if (dev_map_can_have_prog(new->aux->used_maps[i])) + return -EINVAL; + } + } + switch (xdp->command) { case XDP_SETUP_PROG: rcu_assign_pointer(dev->xdp_prog, new); @@ -8835,6 +8847,12 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return -EINVAL; } + if (prog->expected_attach_type == BPF_XDP_DEVMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); + bpf_prog_put(prog); + return -EINVAL; + } + /* prog->aux->id may be 0 for orphaned device-bound progs */ if (prog->aux->id && prog->aux->id == prog_id) { bpf_prog_put(prog); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 974ca6e948e3..65d7717bce2f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -225,6 +225,7 @@ enum bpf_attach_type { BPF_CGROUP_INET6_GETPEERNAME, BPF_CGROUP_INET4_GETSOCKNAME, BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, __MAX_BPF_ATTACH_TYPE }; -- cgit v1.2.3 From 64b59025c15b244c0954cf52b24fbabfcf5ed8f6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 May 2020 16:07:14 -0600 Subject: xdp: Add xdp_txq_info to xdp_buff MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add xdp_txq_info as the Tx counterpart to xdp_rxq_info. At the moment only the device is added. Other fields (queue_index) can be added as use cases arise. >From a UAPI perspective, add egress_ifindex to xdp context for bpf programs to see the Tx device. Update the verifier to only allow accesses to egress_ifindex by XDP programs with BPF_XDP_DEVMAP expected attach type. Signed-off-by: David Ahern Signed-off-by: Alexei Starovoitov Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20200529220716.75383-4-dsahern@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 5 +++++ include/uapi/linux/bpf.h | 2 ++ kernel/bpf/devmap.c | 3 +++ net/core/filter.c | 17 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 2 ++ 5 files changed, 29 insertions(+) (limited to 'net/core') diff --git a/include/net/xdp.h b/include/net/xdp.h index 90f11760bd12..d54022959491 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -61,12 +61,17 @@ struct xdp_rxq_info { struct xdp_mem_info mem; } ____cacheline_aligned; /* perf critical, avoid false-sharing */ +struct xdp_txq_info { + struct net_device *dev; +}; + struct xdp_buff { void *data; void *data_end; void *data_meta; void *data_hard_start; struct xdp_rxq_info *rxq; + struct xdp_txq_info *txq; u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f1e364d69007..f862a58fb567 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3707,6 +3707,8 @@ struct xdp_md { /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ }; enum sk_action { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 0089d56617ec..c04fb1c72f5e 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -476,8 +476,11 @@ static struct xdp_buff *dev_map_run_prog(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { + struct xdp_txq_info txq = { .dev = dev }; u32 act; + xdp->txq = &txq; + act = bpf_prog_run_xdp(xdp_prog, xdp); switch (act) { case XDP_PASS: diff --git a/net/core/filter.c b/net/core/filter.c index 0008b029d644..85ff827aab73 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -7015,6 +7015,13 @@ static bool xdp_is_valid_access(int off, int size, const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { + if (prog->expected_attach_type != BPF_XDP_DEVMAP) { + switch (off) { + case offsetof(struct xdp_md, egress_ifindex): + return false; + } + } + if (type == BPF_WRITE) { if (bpf_prog_is_dev_bound(prog->aux)) { switch (off) { @@ -7985,6 +7992,16 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, offsetof(struct xdp_rxq_info, queue_index)); break; + case offsetof(struct xdp_md, egress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, txq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, txq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_txq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_txq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; } return insn - insn_buf; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 65d7717bce2f..f74bc4a2385e 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3706,6 +3706,8 @@ struct xdp_md { /* Below access go through struct xdp_rxq_info */ __u32 ingress_ifindex; /* rxq->dev->ifindex */ __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ }; enum sk_action { -- cgit v1.2.3 From ca2f5f21dbbd5e3a00cd3e97f728aa2ca0b2e011 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 16:06:41 -0700 Subject: bpf: Refactor sockmap redirect code so its easy to reuse We will need this block of code called from tls context shortly lets refactor the redirect logic so its easy to use. This also cleans up the switch stmt so we have fewer fallthrough cases. No logic changes are intended. Fixes: d829e9c4112b5 ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Sitnicki Acked-by: Song Liu Link: https://lore.kernel.org/bpf/159079360110.5745.7024009076049029819.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- net/core/skmsg.c | 55 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 34 insertions(+), 21 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index c479372f2cd2..9d72f71e9b47 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -682,13 +682,43 @@ static struct sk_psock *sk_psock_from_strp(struct strparser *strp) return container_of(parser, struct sk_psock, parser); } -static void sk_psock_verdict_apply(struct sk_psock *psock, - struct sk_buff *skb, int verdict) +static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) { struct sk_psock *psock_other; struct sock *sk_other; bool ingress; + sk_other = tcp_skb_bpf_redirect_fetch(skb); + if (unlikely(!sk_other)) { + kfree_skb(skb); + return; + } + psock_other = sk_psock(sk_other); + if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || + !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + kfree_skb(skb); + return; + } + + ingress = tcp_skb_bpf_ingress(skb); + if ((!ingress && sock_writeable(sk_other)) || + (ingress && + atomic_read(&sk_other->sk_rmem_alloc) <= + sk_other->sk_rcvbuf)) { + if (!ingress) + skb_set_owner_w(skb, sk_other); + skb_queue_tail(&psock_other->ingress_skb, skb); + schedule_work(&psock_other->work); + } else { + kfree_skb(skb); + } +} + +static void sk_psock_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + struct sock *sk_other; + switch (verdict) { case __SK_PASS: sk_other = psock->sk; @@ -707,25 +737,8 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, } goto out_free; case __SK_REDIRECT: - sk_other = tcp_skb_bpf_redirect_fetch(skb); - if (unlikely(!sk_other)) - goto out_free; - psock_other = sk_psock(sk_other); - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) - goto out_free; - ingress = tcp_skb_bpf_ingress(skb); - if ((!ingress && sock_writeable(sk_other)) || - (ingress && - atomic_read(&sk_other->sk_rmem_alloc) <= - sk_other->sk_rcvbuf)) { - if (!ingress) - skb_set_owner_w(skb, sk_other); - skb_queue_tail(&psock_other->ingress_skb, skb); - schedule_work(&psock_other->work); - break; - } - /* fall-through */ + sk_psock_skb_redirect(psock, skb); + break; case __SK_DROP: /* fall-through */ default: -- cgit v1.2.3 From e91de6afa81c10e9f855c5695eb9a53168d96b73 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 29 May 2020 16:06:59 -0700 Subject: bpf: Fix running sk_skb program types with ktls KTLS uses a stream parser to collect TLS messages and send them to the upper layer tls receive handler. This ensures the tls receiver has a full TLS header to parse when it is run. However, when a socket has BPF_SK_SKB_STREAM_VERDICT program attached before KTLS is enabled we end up with two stream parsers running on the same socket. The result is both try to run on the same socket. First the KTLS stream parser runs and calls read_sock() which will tcp_read_sock which in turn calls tcp_rcv_skb(). This dequeues the skb from the sk_receive_queue. When this is done KTLS code then data_ready() callback which because we stacked KTLS on top of the bpf stream verdict program has been replaced with sk_psock_start_strp(). This will in turn kick the stream parser again and eventually do the same thing KTLS did above calling into tcp_rcv_skb() and dequeuing a skb from the sk_receive_queue. At this point the data stream is broke. Part of the stream was handled by the KTLS side some other bytes may have been handled by the BPF side. Generally this results in either missing data or more likely a "Bad Message" complaint from the kTLS receive handler as the BPF program steals some bytes meant to be in a TLS header and/or the TLS header length is no longer correct. We've already broke the idealized model where we can stack ULPs in any order with generic callbacks on the TX side to handle this. So in this patch we do the same thing but for RX side. We add a sk_psock_strp_enabled() helper so TLS can learn a BPF verdict program is running and add a tls_sw_has_ctx_rx() helper so BPF side can learn there is a TLS ULP on the socket. Then on BPF side we omit calling our stream parser to avoid breaking the data stream for the KTLS receiver. Then on the KTLS side we call BPF_SK_SKB_STREAM_VERDICT once the KTLS receiver is done with the packet but before it posts the msg to userspace. This gives us symmetry between the TX and RX halfs and IMO makes it usable again. On the TX side we process packets in this order BPF -> TLS -> TCP and on the receive side in the reverse order TCP -> TLS -> BPF. Discovered while testing OpenSSL 3.0 Alpha2.0 release. Fixes: d829e9c4112b5 ("tls: convert to generic sk_msg interface") Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/159079361946.5745.605854335665044485.stgit@john-Precision-5820-Tower Signed-off-by: Alexei Starovoitov --- include/linux/skmsg.h | 8 ++++++++ include/net/tls.h | 9 +++++++++ net/core/skmsg.c | 43 ++++++++++++++++++++++++++++++++++++++++--- net/tls/tls_sw.c | 20 ++++++++++++++++++-- 4 files changed, 75 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index ad31c9fb7158..08674cd14d5a 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -437,4 +437,12 @@ static inline void psock_progs_drop(struct sk_psock_progs *progs) psock_set_prog(&progs->skb_verdict, NULL); } +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb); + +static inline bool sk_psock_strp_enabled(struct sk_psock *psock) +{ + if (!psock) + return false; + return psock->parser.enabled; +} #endif /* _LINUX_SKMSG_H */ diff --git a/include/net/tls.h b/include/net/tls.h index 3e7b44cae0d9..3212d3c214a9 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -571,6 +571,15 @@ static inline bool tls_sw_has_ctx_tx(const struct sock *sk) return !!tls_sw_ctx_tx(ctx); } +static inline bool tls_sw_has_ctx_rx(const struct sock *sk) +{ + struct tls_context *ctx = tls_get_ctx(sk); + + if (!ctx) + return false; + return !!tls_sw_ctx_rx(ctx); +} + void tls_sw_write_space(struct sock *sk, struct tls_context *ctx); void tls_device_write_space(struct sock *sk, struct tls_context *ctx); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9d72f71e9b47..351afbf6bfba 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -7,6 +7,7 @@ #include #include +#include static bool sk_msg_try_coalesce_ok(struct sk_msg *msg, int elem_first_coalesce) { @@ -714,6 +715,38 @@ static void sk_psock_skb_redirect(struct sk_psock *psock, struct sk_buff *skb) } } +static void sk_psock_tls_verdict_apply(struct sk_psock *psock, + struct sk_buff *skb, int verdict) +{ + switch (verdict) { + case __SK_REDIRECT: + sk_psock_skb_redirect(psock, skb); + break; + case __SK_PASS: + case __SK_DROP: + default: + break; + } +} + +int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) +{ + struct bpf_prog *prog; + int ret = __SK_PASS; + + rcu_read_lock(); + prog = READ_ONCE(psock->progs.skb_verdict); + if (likely(prog)) { + tcp_skb_bpf_redirect_clear(skb); + ret = sk_psock_bpf_run(psock, prog, skb); + ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + } + rcu_read_unlock(); + sk_psock_tls_verdict_apply(psock, skb, ret); + return ret; +} +EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); + static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { @@ -792,9 +825,13 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_lock(); psock = sk_psock(sk); if (likely(psock)) { - write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->parser.strp); - write_unlock_bh(&sk->sk_callback_lock); + if (tls_sw_has_ctx_rx(sk)) { + psock->parser.saved_data_ready(sk); + } else { + write_lock_bh(&sk->sk_callback_lock); + strp_data_ready(&psock->parser.strp); + write_unlock_bh(&sk->sk_callback_lock); + } } rcu_read_unlock(); } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 8c2763eb6aae..24f64bc0de18 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1742,6 +1742,7 @@ int tls_sw_recvmsg(struct sock *sk, long timeo; bool is_kvec = iov_iter_is_kvec(&msg->msg_iter); bool is_peek = flags & MSG_PEEK; + bool bpf_strp_enabled; int num_async = 0; int pending; @@ -1752,6 +1753,7 @@ int tls_sw_recvmsg(struct sock *sk, psock = sk_psock_get(sk); lock_sock(sk); + bpf_strp_enabled = sk_psock_strp_enabled(psock); /* Process pending decrypted records. It must be non-zero-copy */ err = process_rx_list(ctx, msg, &control, &cmsg, 0, len, false, @@ -1805,11 +1807,12 @@ int tls_sw_recvmsg(struct sock *sk, if (to_decrypt <= len && !is_kvec && !is_peek && ctx->control == TLS_RECORD_TYPE_DATA && - prot->version != TLS_1_3_VERSION) + prot->version != TLS_1_3_VERSION && + !bpf_strp_enabled) zc = true; /* Do not use async mode if record is non-data */ - if (ctx->control == TLS_RECORD_TYPE_DATA) + if (ctx->control == TLS_RECORD_TYPE_DATA && !bpf_strp_enabled) async_capable = ctx->async_capable; else async_capable = false; @@ -1859,6 +1862,19 @@ int tls_sw_recvmsg(struct sock *sk, goto pick_next_record; if (!zc) { + if (bpf_strp_enabled) { + err = sk_psock_tls_strp_read(psock, skb); + if (err != __SK_PASS) { + rxm->offset = rxm->offset + rxm->full_len; + rxm->full_len = 0; + if (err == __SK_DROP) + consume_skb(skb); + ctx->recv_pkt = NULL; + __strp_unpause(&ctx->strp); + continue; + } + } + if (rxm->full_len > len) { retain_skb = true; chunk = len; -- cgit v1.2.3 From 8ea204c2b658eaef55b4716fde469fb66c589a3d Mon Sep 17 00:00:00 2001 From: Ferenc Fejes Date: Sat, 30 May 2020 23:09:00 +0200 Subject: net: Make locking in sock_bindtoindex optional The sock_bindtoindex intended for kernel wide usage however it will lock the socket regardless of the context. This modification relax this behavior optionally: locking the socket will be optional by calling the sock_bindtoindex with lock_sk = true. The modification applied to all users of the sock_bindtoindex. Signed-off-by: Ferenc Fejes Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/bee6355da40d9e991b2f2d12b67d55ebb5f5b207.1590871065.git.fejes@inf.elte.hu --- include/net/sock.h | 2 +- net/core/sock.c | 10 ++++++---- net/ipv4/udp_tunnel.c | 2 +- net/ipv6/ip6_udp_tunnel.c | 2 +- 4 files changed, 9 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/include/net/sock.h b/include/net/sock.h index 6e9f713a7860..c53cc42b5ab9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2690,7 +2690,7 @@ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) void sock_def_readable(struct sock *sk); -int sock_bindtoindex(struct sock *sk, int ifindex); +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); diff --git a/net/core/sock.c b/net/core/sock.c index 61ec573221a6..6c4acf1f0220 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -594,13 +594,15 @@ out: return ret; } -int sock_bindtoindex(struct sock *sk, int ifindex) +int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) { int ret; - lock_sock(sk); + if (lock_sk) + lock_sock(sk); ret = sock_bindtoindex_locked(sk, ifindex); - release_sock(sk); + if (lock_sk) + release_sock(sk); return ret; } @@ -646,7 +648,7 @@ static int sock_setbindtodevice(struct sock *sk, char __user *optval, goto out; } - return sock_bindtoindex(sk, index); + return sock_bindtoindex(sk, index, true); out: #endif diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c index 2158e8bddf41..3eecba0874aa 100644 --- a/net/ipv4/udp_tunnel.c +++ b/net/ipv4/udp_tunnel.c @@ -22,7 +22,7 @@ int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg, goto error; if (cfg->bind_ifindex) { - err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index 2e0ad1bc84a8..cdc4d4ee2420 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -30,7 +30,7 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg *cfg, goto error; } if (cfg->bind_ifindex) { - err = sock_bindtoindex(sock->sk, cfg->bind_ifindex); + err = sock_bindtoindex(sock->sk, cfg->bind_ifindex, true); if (err < 0) goto error; } -- cgit v1.2.3 From 70c58997c1e864c96dfdf072572047303db8f42a Mon Sep 17 00:00:00 2001 From: Ferenc Fejes Date: Sat, 30 May 2020 23:09:01 +0200 Subject: bpf: Allow SO_BINDTODEVICE opt in bpf_setsockopt Extending the supported sockopts in bpf_setsockopt with SO_BINDTODEVICE. We call sock_bindtoindex with parameter lock_sk = false in this context because we already owning the socket. Signed-off-by: Ferenc Fejes Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/4149e304867b8d5a606a305bc59e29b063e51f49.1590871065.git.fejes@inf.elte.hu --- net/core/filter.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 85ff827aab73..ae82bcb03124 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4248,6 +4248,9 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { static int _bpf_setsockopt(struct sock *sk, int level, int optname, char *optval, int optlen, u32 flags) { + char devname[IFNAMSIZ]; + struct net *net; + int ifindex; int ret = 0; int val; @@ -4257,7 +4260,7 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sock_owned_by_me(sk); if (level == SOL_SOCKET) { - if (optlen != sizeof(int)) + if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) return -EINVAL; val = *((int *)optval); @@ -4298,6 +4301,29 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk_dst_reset(sk); } break; + case SO_BINDTODEVICE: + ret = -ENOPROTOOPT; +#ifdef CONFIG_NETDEVICES + optlen = min_t(long, optlen, IFNAMSIZ - 1); + strncpy(devname, optval, optlen); + devname[optlen] = 0; + + ifindex = 0; + if (devname[0] != '\0') { + struct net_device *dev; + + ret = -ENODEV; + + net = sock_net(sk); + dev = dev_get_by_name(net, devname); + if (!dev) + break; + ifindex = dev->ifindex; + dev_put(dev); + } + ret = sock_bindtoindex(sk, ifindex, false); +#endif + break; default: ret = -EINVAL; } -- cgit v1.2.3 From 171526f6fee84de0c39e2b7aa7e666ba0bbfd173 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:35 +0200 Subject: flow_dissector: Pull locking up from prog attach callback Split out the part of attach callback that happens with attach/detach lock acquired. This structures the prog attach callback in a way that opens up doors for moving the locking out of flow_dissector and into generic callbacks for attaching/detaching progs to netns in subsequent patches. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20200531082846.2117903-2-jakub@cloudflare.com --- net/core/flow_dissector.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0aeb33572feb..b64a44a083fd 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -109,15 +109,10 @@ int skb_flow_dissector_prog_query(const union bpf_attr *attr, return 0; } -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) +static int flow_dissector_bpf_prog_attach(struct net *net, + struct bpf_prog *prog) { struct bpf_prog *attached; - struct net *net; - int ret = 0; - - net = current->nsproxy->net_ns; - mutex_lock(&flow_dissector_mutex); if (net == &init_net) { /* BPF flow dissector in the root namespace overrides @@ -130,33 +125,38 @@ int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->flow_dissector_prog)) { - ret = -EEXIST; - goto out; - } + if (rcu_access_pointer(ns->flow_dissector_prog)) + return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ - if (rcu_access_pointer(init_net.flow_dissector_prog)) { - ret = -EEXIST; - goto out; - } + if (rcu_access_pointer(init_net.flow_dissector_prog)) + return -EEXIST; } attached = rcu_dereference_protected(net->flow_dissector_prog, lockdep_is_held(&flow_dissector_mutex)); - if (attached == prog) { + if (attached == prog) /* The same program cannot be attached twice */ - ret = -EINVAL; - goto out; - } + return -EINVAL; + rcu_assign_pointer(net->flow_dissector_prog, prog); if (attached) bpf_prog_put(attached); -out: + return 0; +} + +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + int ret; + + mutex_lock(&flow_dissector_mutex); + ret = flow_dissector_bpf_prog_attach(current->nsproxy->net_ns, prog); mutex_unlock(&flow_dissector_mutex); + return ret; } -- cgit v1.2.3 From a3fd7ceee05431d2c51ed86c6cae015d236a51f0 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:36 +0200 Subject: net: Introduce netns_bpf for BPF programs attached to netns In order to: (1) attach more than one BPF program type to netns, or (2) support attaching BPF programs to netns with bpf_link, or (3) support multi-prog attach points for netns we will need to keep more state per netns than a single pointer like we have now for BPF flow dissector program. Prepare for the above by extracting netns_bpf that is part of struct net, for storing all state related to BPF programs attached to netns. Turn flow dissector callbacks for querying/attaching/detaching a program into generic ones that operate on netns_bpf. Next patch will move the generic callbacks into their own module. This is similar to how it is organized for cgroup with cgroup_bpf. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Cc: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20200531082846.2117903-3-jakub@cloudflare.com --- include/linux/bpf-netns.h | 56 +++++++++++++++++++++++ include/linux/skbuff.h | 26 ----------- include/net/net_namespace.h | 4 +- include/net/netns/bpf.h | 17 +++++++ kernel/bpf/syscall.c | 7 +-- net/core/flow_dissector.c | 105 +++++++++++++++++++++++++++++--------------- 6 files changed, 149 insertions(+), 66 deletions(-) create mode 100644 include/linux/bpf-netns.h create mode 100644 include/net/netns/bpf.h (limited to 'net/core') diff --git a/include/linux/bpf-netns.h b/include/linux/bpf-netns.h new file mode 100644 index 000000000000..f3aec3d79824 --- /dev/null +++ b/include/linux/bpf-netns.h @@ -0,0 +1,56 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BPF_NETNS_H +#define _BPF_NETNS_H + +#include +#include + +enum netns_bpf_attach_type { + NETNS_BPF_INVALID = -1, + NETNS_BPF_FLOW_DISSECTOR = 0, + MAX_NETNS_BPF_ATTACH_TYPE +}; + +static inline enum netns_bpf_attach_type +to_netns_bpf_attach_type(enum bpf_attach_type attach_type) +{ + switch (attach_type) { + case BPF_FLOW_DISSECTOR: + return NETNS_BPF_FLOW_DISSECTOR; + default: + return NETNS_BPF_INVALID; + } +} + +/* Protects updates to netns_bpf */ +extern struct mutex netns_bpf_mutex; + +union bpf_attr; +struct bpf_prog; + +#ifdef CONFIG_NET +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr); +int netns_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog); +int netns_bpf_prog_detach(const union bpf_attr *attr); +#else +static inline int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + return -EOPNOTSUPP; +} + +static inline int netns_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + return -EOPNOTSUPP; +} + +static inline int netns_bpf_prog_detach(const union bpf_attr *attr) +{ + return -EOPNOTSUPP; +} +#endif + +#endif /* _BPF_NETNS_H */ diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 531843952809..a0d5c2760103 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1283,32 +1283,6 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); -#ifdef CONFIG_NET -int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr); -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog); - -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); -#else -static inline int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - return -EOPNOTSUPP; -} - -static inline int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) -{ - return -EOPNOTSUPP; -} - -static inline int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) -{ - return -EOPNOTSUPP; -} -#endif - struct bpf_flow_dissector; bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, __be16 proto, int nhoff, int hlen, unsigned int flags); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 8e001e049497..2ee5901bec7a 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -162,7 +163,8 @@ struct net { #endif struct net_generic __rcu *gen; - struct bpf_prog __rcu *flow_dissector_prog; + /* Used to store attached BPF programs */ + struct netns_bpf bpf; /* Note : following structs are cache line aligned */ #ifdef CONFIG_XFRM diff --git a/include/net/netns/bpf.h b/include/net/netns/bpf.h new file mode 100644 index 000000000000..a858d1c5b166 --- /dev/null +++ b/include/net/netns/bpf.h @@ -0,0 +1,17 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * BPF programs attached to network namespace + */ + +#ifndef __NETNS_BPF_H__ +#define __NETNS_BPF_H__ + +#include + +struct bpf_prog; + +struct netns_bpf { + struct bpf_prog __rcu *progs[MAX_NETNS_BPF_ATTACH_TYPE]; +}; + +#endif /* __NETNS_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e83b0818b529..c77ab9c76f7b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,6 +27,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -2868,7 +2869,7 @@ static int bpf_prog_attach(const union bpf_attr *attr) ret = lirc_prog_attach(attr, prog); break; case BPF_PROG_TYPE_FLOW_DISSECTOR: - ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + ret = netns_bpf_prog_attach(attr, prog); break; case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: @@ -2908,7 +2909,7 @@ static int bpf_prog_detach(const union bpf_attr *attr) case BPF_PROG_TYPE_FLOW_DISSECTOR: if (!capable(CAP_NET_ADMIN)) return -EPERM; - return skb_flow_dissector_bpf_prog_detach(attr); + return netns_bpf_prog_detach(attr); case BPF_PROG_TYPE_CGROUP_DEVICE: case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: @@ -2961,7 +2962,7 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_LIRC_MODE2: return lirc_prog_query(attr, uattr); case BPF_FLOW_DISSECTOR: - return skb_flow_dissector_prog_query(attr, uattr); + return netns_bpf_prog_query(attr, uattr); default: return -EINVAL; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index b64a44a083fd..6c1b8e43d611 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -31,8 +31,10 @@ #include #include #endif +#include -static DEFINE_MUTEX(flow_dissector_mutex); +/* Protects updates to netns_bpf */ +DEFINE_MUTEX(netns_bpf_mutex); static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -70,23 +72,28 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -int skb_flow_dissector_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) { __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); u32 prog_id, prog_cnt = 0, flags = 0; + enum netns_bpf_attach_type type; struct bpf_prog *attached; struct net *net; if (attr->query.query_flags) return -EINVAL; + type = to_netns_bpf_attach_type(attr->query.attach_type); + if (type < 0) + return -EINVAL; + net = get_net_ns_by_fd(attr->query.target_fd); if (IS_ERR(net)) return PTR_ERR(net); rcu_read_lock(); - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(net->bpf.progs[type]); if (attached) { prog_cnt = 1; prog_id = attached->aux->id; @@ -112,6 +119,7 @@ int skb_flow_dissector_prog_query(const union bpf_attr *attr, static int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog *attached; if (net == &init_net) { @@ -125,74 +133,97 @@ static int flow_dissector_bpf_prog_attach(struct net *net, for_each_net(ns) { if (ns == &init_net) continue; - if (rcu_access_pointer(ns->flow_dissector_prog)) + if (rcu_access_pointer(ns->bpf.progs[type])) return -EEXIST; } } else { /* Make sure root flow dissector is not attached * when attaching to the non-root namespace. */ - if (rcu_access_pointer(init_net.flow_dissector_prog)) + if (rcu_access_pointer(init_net.bpf.progs[type])) return -EEXIST; } - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); if (attached == prog) /* The same program cannot be attached twice */ return -EINVAL; - rcu_assign_pointer(net->flow_dissector_prog, prog); + rcu_assign_pointer(net->bpf.progs[type], prog); if (attached) bpf_prog_put(attached); return 0; } -int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, - struct bpf_prog *prog) +int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) { + enum netns_bpf_attach_type type; + struct net *net; int ret; - mutex_lock(&flow_dissector_mutex); - ret = flow_dissector_bpf_prog_attach(current->nsproxy->net_ns, prog); - mutex_unlock(&flow_dissector_mutex); + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + net = current->nsproxy->net_ns; + mutex_lock(&netns_bpf_mutex); + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + ret = flow_dissector_bpf_prog_attach(net, prog); + break; + default: + ret = -EINVAL; + break; + } + mutex_unlock(&netns_bpf_mutex); return ret; } -static int flow_dissector_bpf_prog_detach(struct net *net) +/* Must be called with netns_bpf_mutex held. */ +static int __netns_bpf_prog_detach(struct net *net, + enum netns_bpf_attach_type type) { struct bpf_prog *attached; - mutex_lock(&flow_dissector_mutex); - attached = rcu_dereference_protected(net->flow_dissector_prog, - lockdep_is_held(&flow_dissector_mutex)); - if (!attached) { - mutex_unlock(&flow_dissector_mutex); + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (!attached) return -ENOENT; - } - RCU_INIT_POINTER(net->flow_dissector_prog, NULL); + RCU_INIT_POINTER(net->bpf.progs[type], NULL); bpf_prog_put(attached); - mutex_unlock(&flow_dissector_mutex); return 0; } -int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) +int netns_bpf_prog_detach(const union bpf_attr *attr) { - return flow_dissector_bpf_prog_detach(current->nsproxy->net_ns); + enum netns_bpf_attach_type type; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + mutex_unlock(&netns_bpf_mutex); + + return ret; } -static void __net_exit flow_dissector_pernet_pre_exit(struct net *net) +static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) { - /* We're not racing with attach/detach because there are no - * references to netns left when pre_exit gets called. - */ - if (rcu_access_pointer(net->flow_dissector_prog)) - flow_dissector_bpf_prog_detach(net); + enum netns_bpf_attach_type type; + + mutex_lock(&netns_bpf_mutex); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + __netns_bpf_prog_detach(net, type); + mutex_unlock(&netns_bpf_mutex); } -static struct pernet_operations flow_dissector_pernet_ops __net_initdata = { - .pre_exit = flow_dissector_pernet_pre_exit, +static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .pre_exit = netns_bpf_pernet_pre_exit, }; /** @@ -1044,11 +1075,13 @@ bool __skb_flow_dissect(const struct net *net, WARN_ON_ONCE(!net); if (net) { + enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; + rcu_read_lock(); - attached = rcu_dereference(init_net.flow_dissector_prog); + attached = rcu_dereference(init_net.bpf.progs[type]); if (!attached) - attached = rcu_dereference(net->flow_dissector_prog); + attached = rcu_dereference(net->bpf.progs[type]); if (attached) { struct bpf_flow_keys flow_keys; @@ -1870,6 +1903,6 @@ static int __init init_default_flow_dissectors(void) flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); - return register_pernet_subsys(&flow_dissector_pernet_ops); + return register_pernet_subsys(&netns_bpf_pernet_ops); } core_initcall(init_default_flow_dissectors); -- cgit v1.2.3 From b27f7bb590ba835b32ef122389db158e44cfda1e Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 31 May 2020 10:28:37 +0200 Subject: flow_dissector: Move out netns_bpf prog callbacks Move functions to manage BPF programs attached to netns that are not specific to flow dissector to a dedicated module named bpf/net_namespace.c. The set of functions will grow with the addition of bpf_link support for netns attached programs. This patch prepares ground by creating a place for it. This is a code move with no functional changes intended. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200531082846.2117903-4-jakub@cloudflare.com --- include/net/flow_dissector.h | 6 ++ kernel/bpf/Makefile | 1 + kernel/bpf/net_namespace.c | 133 +++++++++++++++++++++++++++++++++++++++++++ net/core/flow_dissector.c | 125 ++-------------------------------------- 4 files changed, 144 insertions(+), 121 deletions(-) create mode 100644 kernel/bpf/net_namespace.c (limited to 'net/core') diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index 4fb1a69c6ecf..a7eba43fe4e4 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -8,6 +8,8 @@ #include #include +struct bpf_prog; +struct net; struct sk_buff; /** @@ -369,4 +371,8 @@ flow_dissector_init_keys(struct flow_dissector_key_control *key_control, memset(key_basic, 0, sizeof(*key_basic)); } +#ifdef CONFIG_BPF_SYSCALL +int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog); +#endif /* CONFIG_BPF_SYSCALL */ + #endif diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 8fca02f64811..1131a921e1a6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -13,6 +13,7 @@ ifeq ($(CONFIG_NET),y) obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o +obj-$(CONFIG_BPF_SYSCALL) += net_namespace.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c new file mode 100644 index 000000000000..b37d81450c3a --- /dev/null +++ b/kernel/bpf/net_namespace.c @@ -0,0 +1,133 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +/* + * Functions to manage BPF programs attached to netns + */ + +/* Protects updates to netns_bpf */ +DEFINE_MUTEX(netns_bpf_mutex); + +int netns_bpf_prog_query(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); + u32 prog_id, prog_cnt = 0, flags = 0; + enum netns_bpf_attach_type type; + struct bpf_prog *attached; + struct net *net; + + if (attr->query.query_flags) + return -EINVAL; + + type = to_netns_bpf_attach_type(attr->query.attach_type); + if (type < 0) + return -EINVAL; + + net = get_net_ns_by_fd(attr->query.target_fd); + if (IS_ERR(net)) + return PTR_ERR(net); + + rcu_read_lock(); + attached = rcu_dereference(net->bpf.progs[type]); + if (attached) { + prog_cnt = 1; + prog_id = attached->aux->id; + } + rcu_read_unlock(); + + put_net(net); + + if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) + return -EFAULT; + if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) + return -EFAULT; + + if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) + return 0; + + if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) + return -EFAULT; + + return 0; +} + +int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + enum netns_bpf_attach_type type; + struct net *net; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + net = current->nsproxy->net_ns; + mutex_lock(&netns_bpf_mutex); + switch (type) { + case NETNS_BPF_FLOW_DISSECTOR: + ret = flow_dissector_bpf_prog_attach(net, prog); + break; + default: + ret = -EINVAL; + break; + } + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +/* Must be called with netns_bpf_mutex held. */ +static int __netns_bpf_prog_detach(struct net *net, + enum netns_bpf_attach_type type) +{ + struct bpf_prog *attached; + + attached = rcu_dereference_protected(net->bpf.progs[type], + lockdep_is_held(&netns_bpf_mutex)); + if (!attached) + return -ENOENT; + RCU_INIT_POINTER(net->bpf.progs[type], NULL); + bpf_prog_put(attached); + return 0; +} + +int netns_bpf_prog_detach(const union bpf_attr *attr) +{ + enum netns_bpf_attach_type type; + int ret; + + type = to_netns_bpf_attach_type(attr->attach_type); + if (type < 0) + return -EINVAL; + + mutex_lock(&netns_bpf_mutex); + ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); + mutex_unlock(&netns_bpf_mutex); + + return ret; +} + +static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) +{ + enum netns_bpf_attach_type type; + + mutex_lock(&netns_bpf_mutex); + for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) + __netns_bpf_prog_detach(net, type); + mutex_unlock(&netns_bpf_mutex); +} + +static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { + .pre_exit = netns_bpf_pernet_pre_exit, +}; + +static int __init netns_bpf_init(void) +{ + return register_pernet_subsys(&netns_bpf_pernet_ops); +} + +subsys_initcall(netns_bpf_init); diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 6c1b8e43d611..d02df0b6d0d9 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -33,9 +33,6 @@ #endif #include -/* Protects updates to netns_bpf */ -DEFINE_MUTEX(netns_bpf_mutex); - static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { @@ -72,52 +69,8 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); -int netns_bpf_prog_query(const union bpf_attr *attr, - union bpf_attr __user *uattr) -{ - __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids); - u32 prog_id, prog_cnt = 0, flags = 0; - enum netns_bpf_attach_type type; - struct bpf_prog *attached; - struct net *net; - - if (attr->query.query_flags) - return -EINVAL; - - type = to_netns_bpf_attach_type(attr->query.attach_type); - if (type < 0) - return -EINVAL; - - net = get_net_ns_by_fd(attr->query.target_fd); - if (IS_ERR(net)) - return PTR_ERR(net); - - rcu_read_lock(); - attached = rcu_dereference(net->bpf.progs[type]); - if (attached) { - prog_cnt = 1; - prog_id = attached->aux->id; - } - rcu_read_unlock(); - - put_net(net); - - if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags))) - return -EFAULT; - if (copy_to_user(&uattr->query.prog_cnt, &prog_cnt, sizeof(prog_cnt))) - return -EFAULT; - - if (!attr->query.prog_cnt || !prog_ids || !prog_cnt) - return 0; - - if (copy_to_user(prog_ids, &prog_id, sizeof(u32))) - return -EFAULT; - - return 0; -} - -static int flow_dissector_bpf_prog_attach(struct net *net, - struct bpf_prog *prog) +#ifdef CONFIG_BPF_SYSCALL +int flow_dissector_bpf_prog_attach(struct net *net, struct bpf_prog *prog) { enum netns_bpf_attach_type type = NETNS_BPF_FLOW_DISSECTOR; struct bpf_prog *attached; @@ -155,76 +108,7 @@ static int flow_dissector_bpf_prog_attach(struct net *net, bpf_prog_put(attached); return 0; } - -int netns_bpf_prog_attach(const union bpf_attr *attr, struct bpf_prog *prog) -{ - enum netns_bpf_attach_type type; - struct net *net; - int ret; - - type = to_netns_bpf_attach_type(attr->attach_type); - if (type < 0) - return -EINVAL; - - net = current->nsproxy->net_ns; - mutex_lock(&netns_bpf_mutex); - switch (type) { - case NETNS_BPF_FLOW_DISSECTOR: - ret = flow_dissector_bpf_prog_attach(net, prog); - break; - default: - ret = -EINVAL; - break; - } - mutex_unlock(&netns_bpf_mutex); - - return ret; -} - -/* Must be called with netns_bpf_mutex held. */ -static int __netns_bpf_prog_detach(struct net *net, - enum netns_bpf_attach_type type) -{ - struct bpf_prog *attached; - - attached = rcu_dereference_protected(net->bpf.progs[type], - lockdep_is_held(&netns_bpf_mutex)); - if (!attached) - return -ENOENT; - RCU_INIT_POINTER(net->bpf.progs[type], NULL); - bpf_prog_put(attached); - return 0; -} - -int netns_bpf_prog_detach(const union bpf_attr *attr) -{ - enum netns_bpf_attach_type type; - int ret; - - type = to_netns_bpf_attach_type(attr->attach_type); - if (type < 0) - return -EINVAL; - - mutex_lock(&netns_bpf_mutex); - ret = __netns_bpf_prog_detach(current->nsproxy->net_ns, type); - mutex_unlock(&netns_bpf_mutex); - - return ret; -} - -static void __net_exit netns_bpf_pernet_pre_exit(struct net *net) -{ - enum netns_bpf_attach_type type; - - mutex_lock(&netns_bpf_mutex); - for (type = 0; type < MAX_NETNS_BPF_ATTACH_TYPE; type++) - __netns_bpf_prog_detach(net, type); - mutex_unlock(&netns_bpf_mutex); -} - -static struct pernet_operations netns_bpf_pernet_ops __net_initdata = { - .pre_exit = netns_bpf_pernet_pre_exit, -}; +#endif /* CONFIG_BPF_SYSCALL */ /** * __skb_flow_get_ports - extract the upper layer ports and return them @@ -1902,7 +1786,6 @@ static int __init init_default_flow_dissectors(void) skb_flow_dissector_init(&flow_keys_basic_dissector, flow_keys_basic_dissector_keys, ARRAY_SIZE(flow_keys_basic_dissector_keys)); - - return register_pernet_subsys(&netns_bpf_pernet_ops); + return 0; } core_initcall(init_default_flow_dissectors); -- cgit v1.2.3 From 836e66c218f355ec01ba57671c85abf32961dcea Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:32 +0200 Subject: bpf: Fix up bpf_skb_adjust_room helper's skb csum setting Lorenz recently reported: In our TC classifier cls_redirect [0], we use the following sequence of helper calls to decapsulate a GUE (basically IP + UDP + custom header) encapsulated packet: bpf_skb_adjust_room(skb, -encap_len, BPF_ADJ_ROOM_MAC, BPF_F_ADJ_ROOM_FIXED_GSO) bpf_redirect(skb->ifindex, BPF_F_INGRESS) It seems like some checksums of the inner headers are not validated in this case. For example, a TCP SYN packet with invalid TCP checksum is still accepted by the network stack and elicits a SYN ACK. [...] That is, we receive the following packet from the driver: | ETH | IP | UDP | GUE | IP | TCP | skb->ip_summed == CHECKSUM_UNNECESSARY ip_summed is CHECKSUM_UNNECESSARY because our NICs do rx checksum offloading. On this packet we run skb_adjust_room_mac(-encap_len), and get the following: | ETH | IP | TCP | skb->ip_summed == CHECKSUM_UNNECESSARY Note that ip_summed is still CHECKSUM_UNNECESSARY. After bpf_redirect()'ing into the ingress, we end up in tcp_v4_rcv(). There, skb_checksum_init() is turned into a no-op due to CHECKSUM_UNNECESSARY. The bpf_skb_adjust_room() helper is not aware of protocol specifics. Internally, it handles the CHECKSUM_COMPLETE case via skb_postpull_rcsum(), but that does not cover CHECKSUM_UNNECESSARY. In this case skb->csum_level of the original skb prior to bpf_skb_adjust_room() call was 0, that is, covering UDP. Right now there is no way to adjust the skb->csum_level. NICs that have checksum offload disabled (CHECKSUM_NONE) or that support CHECKSUM_COMPLETE are not affected. Use a safe default for CHECKSUM_UNNECESSARY by resetting to CHECKSUM_NONE and add a flag to the helper called BPF_F_ADJ_ROOM_NO_CSUM_RESET that allows users from opting out. Opting out is useful for the case where we don't remove/add full protocol headers, or for the case where a user wants to adjust the csum level manually e.g. through bpf_csum_level() helper that is added in subsequent patch. The bpf_skb_proto_{4_to_6,6_to_4}() for NAT64/46 translation from the BPF bpf_skb_change_proto() helper uses bpf_skb_net_hdr_{push,pop}() pair internally as well but doesn't change layers, only transitions between v4 to v6 and vice versa, therefore no adoption is required there. [0] https://lore.kernel.org/bpf/20200424185556.7358-1-lmb@cloudflare.com/ Fixes: 2be7e212d541 ("bpf: add bpf_skb_adjust_room helper") Reported-by: Lorenz Bauer Reported-by: Alan Maguire Signed-off-by: Daniel Borkmann Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Link: https://lore.kernel.org/bpf/CACAyw9-uU_52esMd1JjuA80fRPHJv5vsSg8GnfW3t_qDU4aVKQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/11a90472e7cce83e76ddbfce81fdfce7bfc68808.1591108731.git.daniel@iogearbox.net --- include/linux/skbuff.h | 8 ++++++++ include/uapi/linux/bpf.h | 8 ++++++++ net/core/filter.c | 8 ++++++-- tools/include/uapi/linux/bpf.h | 8 ++++++++ 4 files changed, 30 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a0d5c2760103..0c0377fc00c2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3919,6 +3919,14 @@ static inline void __skb_incr_checksum_unnecessary(struct sk_buff *skb) } } +static inline void __skb_reset_checksum_unnecessary(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_UNNECESSARY) { + skb->ip_summed = CHECKSUM_NONE; + skb->csum_level = 0; + } +} + /* Check if we need to perform checksum complete validation. * * Returns true if checksum complete is needed, false otherwise diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { diff --git a/net/core/filter.c b/net/core/filter.c index ae82bcb03124..278dcc0af961 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3113,7 +3113,8 @@ static int bpf_skb_net_shrink(struct sk_buff *skb, u32 off, u32 len_diff, { int ret; - if (flags & ~BPF_F_ADJ_ROOM_FIXED_GSO) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_FIXED_GSO | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (skb_is_gso(skb) && !skb_is_gso_tcp(skb)) { @@ -3163,7 +3164,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, u32 off; int ret; - if (unlikely(flags & ~BPF_F_ADJ_ROOM_MASK)) + if (unlikely(flags & ~(BPF_F_ADJ_ROOM_MASK | + BPF_F_ADJ_ROOM_NO_CSUM_RESET))) return -EINVAL; if (unlikely(len_diff_abs > 0xfffU)) return -EFAULT; @@ -3191,6 +3193,8 @@ BPF_CALL_4(bpf_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, ret = shrink ? bpf_skb_net_shrink(skb, off, len_diff_abs, flags) : bpf_skb_net_grow(skb, off, len_diff_abs, flags); + if (!ret && !(flags & BPF_F_ADJ_ROOM_NO_CSUM_RESET)) + __skb_reset_checksum_unnecessary(skb); bpf_compute_data_pointers(skb); return ret; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index b9ed9f14f2a2..3ba2bbbed80c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1635,6 +1635,13 @@ union bpf_attr { * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer @@ -3433,6 +3440,7 @@ enum { BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), }; enum { -- cgit v1.2.3 From 7cdec54f9713256bb170873a1fc5c75c9127c9d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 2 Jun 2020 16:58:33 +0200 Subject: bpf: Add csum_level helper for fixing up csum levels Add a bpf_csum_level() helper which BPF programs can use in combination with bpf_skb_adjust_room() when they pass in BPF_F_ADJ_ROOM_NO_CSUM_RESET flag to the latter to avoid falling back to CHECKSUM_NONE. The bpf_csum_level() allows to adjust CHECKSUM_UNNECESSARY skb->csum_levels via BPF_CSUM_LEVEL_{INC,DEC} which calls __skb_{incr,decr}_checksum_unnecessary() on the skb. The helper also allows a BPF_CSUM_LEVEL_RESET which sets the skb's csum to CHECKSUM_NONE as well as a BPF_CSUM_LEVEL_QUERY to just return the current level. Without this helper, there is no way to otherwise adjust the skb->csum_level. I did not add an extra dummy flags as there is plenty of free bitspace in level argument itself iff ever needed in future. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Reviewed-by: Alan Maguire Acked-by: Lorenz Bauer Link: https://lore.kernel.org/bpf/279ae3717cb3d03c0ffeb511493c93c450a01e1a.1591108731.git.daniel@iogearbox.net --- include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++++++++++++++++++++- net/core/filter.c | 38 +++++++++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 43 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 122 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3ba2bbbed80c..c65b374a5090 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3220,6 +3220,38 @@ union bpf_attr { * calculation. * Return * Requested value, or 0, if flags are not recognized. + * + * int bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3356,7 +3388,8 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3433,6 +3466,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. */ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), diff --git a/net/core/filter.c b/net/core/filter.c index 278dcc0af961..d01a244b5087 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2015,6 +2015,40 @@ static const struct bpf_func_proto bpf_csum_update_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_2(bpf_csum_level, struct sk_buff *, skb, u64, level) +{ + /* The interface is to be used in combination with bpf_skb_adjust_room() + * for encap/decap of packet headers when BPF_F_ADJ_ROOM_NO_CSUM_RESET + * is passed as flags, for example. + */ + switch (level) { + case BPF_CSUM_LEVEL_INC: + __skb_incr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_DEC: + __skb_decr_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_RESET: + __skb_reset_checksum_unnecessary(skb); + break; + case BPF_CSUM_LEVEL_QUERY: + return skb->ip_summed == CHECKSUM_UNNECESSARY ? + skb->csum_level : -EACCES; + default: + return -EINVAL; + } + + return 0; +} + +static const struct bpf_func_proto bpf_csum_level_proto = { + .func = bpf_csum_level, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) { return dev_forward_skb(dev, skb); @@ -6280,6 +6314,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_csum_diff_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: @@ -6613,6 +6649,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skb_store_bytes_proto; case BPF_FUNC_csum_update: return &bpf_csum_update_proto; + case BPF_FUNC_csum_level: + return &bpf_csum_level_proto; case BPF_FUNC_l3_csum_replace: return &bpf_l3_csum_replace_proto; case BPF_FUNC_l4_csum_replace: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ba2bbbed80c..c65b374a5090 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3220,6 +3220,38 @@ union bpf_attr { * calculation. * Return * Requested value, or 0, if flags are not recognized. + * + * int bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3356,7 +3388,8 @@ union bpf_attr { FN(ringbuf_reserve), \ FN(ringbuf_submit), \ FN(ringbuf_discard), \ - FN(ringbuf_query), + FN(ringbuf_query), \ + FN(csum_level), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3433,6 +3466,14 @@ enum { BPF_F_CURRENT_NETNS = (-1L), }; +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + /* BPF_FUNC_skb_adjust_room flags. */ enum { BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), -- cgit v1.2.3