From 814abfabef3ceed390c10d06a0cc69a86454b6cf Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:27:07 -0700 Subject: xdp: add bpf_redirect helper function This adds support for a bpf_redirect helper function to the XDP infrastructure. For now this only supports redirecting to the egress path of a port. In order to support drivers handling an xdp_buff natively, this patch uses a new ndo operation, ndo_xdp_xmit(), that pushes an xdp_buff to the specified device. If the program specifies either (a) an unknown device or (b) a device that does not support the operation, a BPF warning is thrown and the XDP_ABORTED error code is returned. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e99e3e6f8b37..4dbb7a3f4677 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -717,6 +717,7 @@ enum xdp_action { XDP_DROP, XDP_PASS, XDP_TX, + XDP_REDIRECT, }; /* user accessible metadata for XDP packet hook -- cgit v1.2.3 From 546ac1ffb70d25b56c1126940e5ec639c4dd7413 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:28:56 -0700 Subject: bpf: add devmap, a map for storing net device references Device map (devmap) is a BPF map, primarily useful for networking applications, that uses a key to look up a reference to a netdevice. The map provides a clean way for BPF programs to build virtual port to physical port maps. Additionally, it provides a scoping function for the redirect action itself, allowing multiple optimizations. Future patches will leverage the map to provide batching at the XDP layer. Another optimization/feature, that is not yet implemented, would be to support multiple netdevices per key for efficient multicast and broadcast. Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. 
Miller --- include/linux/bpf_types.h | 3 + include/uapi/linux/bpf.h | 1 + kernel/bpf/Makefile | 3 + kernel/bpf/devmap.c | 264 ++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 8 + tools/testing/selftests/bpf/test_maps.c | 15 ++ 6 files changed, 294 insertions(+) create mode 100644 kernel/bpf/devmap.c (limited to 'include/uapi/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 3d137c33d664..b1e1035ca24b 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -35,3 +35,6 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) +#ifdef CONFIG_NET +BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +#endif diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 4dbb7a3f4677..ecbb0e7e15bc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -104,6 +104,7 @@ enum bpf_map_type { BPF_MAP_TYPE_LPM_TRIE, BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH_OF_MAPS, + BPF_MAP_TYPE_DEVMAP, }; enum bpf_prog_type { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1e5e658f2db..48e92705be59 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -2,6 +2,9 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o +ifeq ($(CONFIG_NET),y) +obj-$(CONFIG_BPF_SYSCALL) += devmap.o +endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c new file mode 100644 index 000000000000..1a878356bd37 --- /dev/null +++ b/kernel/bpf/devmap.c @@ -0,0 +1,264 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* Devmaps primary use is as a backend map for XDP BPF helper call + * bpf_redirect_map(). Because XDP is mostly concerned with performance we + * spent some effort to ensure the datapath with redirect maps does not use + * any locking. This is a quick note on the details. + * + * We have three possible paths to get into the devmap control plane bpf + * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall + * will invoke an update, delete, or lookup operation. To ensure updates and + * deletes appear atomic from the datapath side xchg() is used to modify the + * netdev_map array. Then because the datapath does a lookup into the netdev_map + * array (read-only) from an RCU critical section we use call_rcu() to wait for + * an rcu grace period before free'ing the old data structures. This ensures the + * datapath always has a valid copy. However, the datapath does a "flush" + * operation that pushes any pending packets in the driver outside the RCU + * critical section. Each bpf_dtab_netdev tracks these pending operations using + * an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed + * until all bits are cleared indicating outstanding flush operations have + * completed. 
+ * + * BPF syscalls may race with BPF program calls on any of the update, delete + * or lookup operations. As noted above the xchg() operation also keep the + * netdev_map consistent in this case. From the devmap side BPF programs + * calling into these operations are the same as multiple user space threads + * making system calls. + */ +#include +#include +#include +#include +#include "percpu_freelist.h" +#include "bpf_lru_list.h" +#include "map_in_map.h" + +struct bpf_dtab_netdev { + struct net_device *dev; + int key; + struct rcu_head rcu; + struct bpf_dtab *dtab; +}; + +struct bpf_dtab { + struct bpf_map map; + struct bpf_dtab_netdev **netdev_map; +}; + +static struct bpf_map *dev_map_alloc(union bpf_attr *attr) +{ + struct bpf_dtab *dtab; + u64 cost; + int err; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || attr->map_flags) + return ERR_PTR(-EINVAL); + + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + if (attr->value_size > KMALLOC_MAX_SIZE) + return ERR_PTR(-E2BIG); + + dtab = kzalloc(sizeof(*dtab), GFP_USER); + if (!dtab) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + dtab->map.map_type = attr->map_type; + dtab->map.key_size = attr->key_size; + dtab->map.value_size = attr->value_size; + dtab->map.max_entries = attr->max_entries; + dtab->map.map_flags = attr->map_flags; + + err = -ENOMEM; + + /* make sure page count doesn't overflow */ + cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_dtab; + + dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(dtab->map.pages); + if (err) + goto free_dtab; + + dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * + sizeof(struct bpf_dtab_netdev *)); + if (!dtab->netdev_map) + goto free_dtab; + + return &dtab->map; + +free_dtab: + kfree(dtab); + return ERR_PTR(err); +} + +static void dev_map_free(struct bpf_map *map) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + int i; + + /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. Wait for outstanding critical sections in + * these programs to complete. The rcu critical section only guarantees + * no further reads against netdev_map. It does __not__ ensure pending + * flush operations (if any) are complete. + */ + synchronize_rcu(); + + for (i = 0; i < dtab->map.max_entries; i++) { + struct bpf_dtab_netdev *dev; + + dev = dtab->netdev_map[i]; + if (!dev) + continue; + + dev_put(dev->dev); + kfree(dev); + } + + /* At this point bpf program is detached and all pending operations + * _must_ be complete + */ + bpf_map_area_free(dtab->netdev_map); + kfree(dtab); +} + +static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + u32 index = key ? *(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (index >= dtab->map.max_entries) { + *next = 0; + return 0; + } + + if (index == dtab->map.max_entries - 1) + return -ENOENT; + + *next = index + 1; + return 0; +} + +/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or + * update happens in parallel here a dev_put wont happen until after reading the + * ifindex. 
+ */ +static void *dev_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev; + u32 i = *(u32 *)key; + + if (i >= map->max_entries) + return NULL; + + dev = READ_ONCE(dtab->netdev_map[i]); + return dev ? &dev->dev->ifindex : NULL; +} + +static void __dev_map_entry_free(struct rcu_head *rcu) +{ + struct bpf_dtab_netdev *old_dev; + + old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu); + dev_put(old_dev->dev); + kfree(old_dev); +} + +static int dev_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *old_dev; + int k = *(u32 *)key; + + if (k >= map->max_entries) + return -EINVAL; + + /* Use synchronize_rcu() here to ensure any rcu critical sections + * have completed, but this does not guarantee a flush has happened + * yet. Because driver side rcu_read_lock/unlock only protects the + * running XDP program. However, for pending flush operations the + * dev and ctx are stored in another per cpu map. And additionally, + * the driver tear down ensures all soft irqs are complete before + * removing the net device in the case of dev_put equals zero. + */ + old_dev = xchg(&dtab->netdev_map[k], NULL); + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + return 0; +} + +static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct net *net = current->nsproxy->net_ns; + struct bpf_dtab_netdev *dev, *old_dev; + u32 i = *(u32 *)key; + u32 ifindex = *(u32 *)value; + + if (unlikely(map_flags > BPF_EXIST)) + return -EINVAL; + + if (unlikely(i >= dtab->map.max_entries)) + return -E2BIG; + + if (unlikely(map_flags == BPF_NOEXIST)) + return -EEXIST; + + if (!ifindex) { + dev = NULL; + } else { + dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN); + if (!dev) + return -ENOMEM; + + dev->dev = dev_get_by_index(net, ifindex); + if (!dev->dev) { + kfree(dev); + return -EINVAL; + } + + dev->key = i; + dev->dtab = dtab; + } + + /* Use call_rcu() here to ensure rcu critical sections have completed + * Remembering the driver side flush operation will happen before the + * net device is removed. + */ + old_dev = xchg(&dtab->netdev_map[i], dev); + if (old_dev) + call_rcu(&old_dev->rcu, __dev_map_entry_free); + + return 0; +} + +const struct bpf_map_ops dev_map_ops = { + .map_alloc = dev_map_alloc, + .map_free = dev_map_free, + .map_get_next_key = dev_map_get_next_key, + .map_lookup_elem = dev_map_lookup_elem, + .map_update_elem = dev_map_update_elem, + .map_delete_elem = dev_map_delete_elem, +}; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6a86723c5b64..4016774d5cca 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1276,6 +1276,14 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + /* devmap returns a pointer to a live net_device ifindex that we cannot + * allow to be modified from bpf side. So do not allow lookup elements + * for now. 
+ */ + case BPF_MAP_TYPE_DEVMAP: + if (func_id == BPF_FUNC_map_lookup_elem) + goto error; + break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 79601c81e169..36d6ac3f0c1c 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -438,6 +438,21 @@ static void test_arraymap_percpu_many_keys(void) close(fd); } +static void test_devmap(int task, void *data) +{ + int next_key, fd; + __u32 key, value; + + fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(key), sizeof(value), + 2, 0); + if (fd < 0) { + printf("Failed to create arraymap '%s'!\n", strerror(errno)); + exit(1); + } + + close(fd); +} + #define MAP_SIZE (32 * 1024) static void test_map_large(void) -- cgit v1.2.3 From 97f91a7cf04ff605845c20948b8a80e54cbd3376 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2017 09:29:18 -0700 Subject: bpf: add bpf_redirect_map helper routine BPF programs can use the devmap with a bpf_redirect_map() helper routine to forward packets to netdevice in map. Signed-off-by: John Fastabend Signed-off-by: Jesper Dangaard Brouer Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 3 +++ include/uapi/linux/bpf.h | 8 +++++++- kernel/bpf/devmap.c | 12 +++++++++++ kernel/bpf/verifier.c | 4 ++++ net/core/filter.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 78 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b69e7a5869ff..d0d3281ac678 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -379,4 +379,7 @@ extern const struct bpf_func_proto bpf_get_stackid_proto; void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +/* Map specifics */ +struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); + #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ecbb0e7e15bc..1106a8c4cd36 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -348,6 +348,11 @@ union bpf_attr { * @flags: bit 0 - if set, redirect to ingress instead of egress * other bits - reserved * Return: TC_ACT_REDIRECT + * int bpf_redirect_map(key, map, flags) + * redirect to endpoint in map + * @key: index in map to lookup + * @map: fd of map to do lookup in + * @flags: -- * * u32 bpf_get_route_realm(skb) * retrieve a dst's tclassid @@ -592,7 +597,8 @@ union bpf_attr { FN(get_socket_uid), \ FN(set_hash), \ FN(setsockopt), \ - FN(skb_adjust_room), + FN(skb_adjust_room), \ + FN(redirect_map), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 1a878356bd37..36dc13deb2e1 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -159,6 +159,18 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key) return 0; } +struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map); + struct bpf_dtab_netdev *dev; + + if (key >= map->max_entries) + return NULL; + + dev = READ_ONCE(dtab->netdev_map[key]); + return dev ? 
dev->dev : NULL; +} + /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or * update happens in parallel here a dev_put wont happen until after reading the * ifindex. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4016774d5cca..df05d65f0c87 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1312,6 +1312,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY) goto error; break; + case BPF_FUNC_redirect_map: + if (map->map_type != BPF_MAP_TYPE_DEVMAP) + goto error; + break; default: break; } diff --git a/net/core/filter.c b/net/core/filter.c index e30d38b27f21..e93a558324b5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1779,6 +1779,7 @@ static const struct bpf_func_proto bpf_clone_redirect_proto = { struct redirect_info { u32 ifindex; u32 flags; + struct bpf_map *map; }; static DEFINE_PER_CPU(struct redirect_info, redirect_info); @@ -1792,6 +1793,7 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags) ri->ifindex = ifindex; ri->flags = flags; + ri->map = NULL; return TC_ACT_REDIRECT; } @@ -1819,6 +1821,29 @@ static const struct bpf_func_proto bpf_redirect_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + + if (unlikely(flags)) + return XDP_ABORTED; + + ri->ifindex = ifindex; + ri->flags = flags; + ri->map = map; + + return XDP_REDIRECT; +} + +static const struct bpf_func_proto bpf_redirect_map_proto = { + .func = bpf_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -2423,14 +2448,39 @@ static int __bpf_tx_xdp(struct net_device *dev, struct xdp_buff *xdp) return -EOPNOTSUPP; } +int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct bpf_map *map = ri->map; + struct net_device *fwd; + int err = -EINVAL; + + ri->ifindex = 0; + ri->map = NULL; + + fwd = __dev_map_lookup_elem(map, ri->ifindex); + if (!fwd) + goto out; + + trace_xdp_redirect(dev, fwd, xdp_prog, XDP_REDIRECT); + err = __bpf_tx_xdp(fwd, xdp); +out: + return err; +} + int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); struct net_device *fwd; + if (ri->map) + return xdp_do_redirect_map(dev, xdp, xdp_prog); + fwd = dev_get_by_index_rcu(dev_net(dev), ri->ifindex); ri->ifindex = 0; + ri->map = NULL; if (unlikely(!fwd)) { bpf_warn_invalid_xdp_redirect(ri->ifindex); return -EINVAL; @@ -3089,6 +3139,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_adjust_head_proto; case BPF_FUNC_redirect: return &bpf_xdp_redirect_proto; + case BPF_FUNC_redirect_map: + return &bpf_redirect_map_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 727f8914477e4642c7d1ff381667cdc4178b40c6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Jul 2017 10:39:26 +0100 Subject: rxrpc: Expose UAPI definitions to userspace Move UAPI definitions from the internal header and place them in a UAPI header file so that userspace can make use of them. 
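For a sense of what this enables, here is a minimal, hypothetical userspace sketch that creates and binds a client AF_RXRPC socket using only the relocated definitions. The fallback AF_RXRPC value, the zeroed wildcard address and the client-style srx_service = 0 are assumptions based on the AF_RXRPC documentation; older libcs (or the u16-typed fields in this version of the header) may still need local workarounds.

#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/rxrpc.h>

#ifndef AF_RXRPC
#define AF_RXRPC 33			/* from linux/socket.h, if libc lacks it */
#endif

static int rxrpc_client_socket(void)
{
	struct sockaddr_rxrpc srx;
	int fd;

	fd = socket(AF_RXRPC, SOCK_DGRAM, PF_INET);
	if (fd < 0)
		return -1;

	memset(&srx, 0, sizeof(srx));
	srx.srx_family = AF_RXRPC;
	srx.srx_service = 0;			/* client socket: no local service */
	srx.transport_type = SOCK_DGRAM;
	srx.transport_len = sizeof(srx.transport.sin);
	srx.transport.sin.sin_family = AF_INET;
	srx.transport.sin.sin_port = 0;		/* let the kernel pick a UDP port */
	srx.transport.sin.sin_addr.s_addr = htonl(INADDR_ANY);

	if (bind(fd, (struct sockaddr *)&srx, sizeof(srx)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}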
Signed-off-by: David Howells --- include/linux/rxrpc.h | 79 --------------------------------------------- include/uapi/linux/rxrpc.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 79 deletions(-) delete mode 100644 include/linux/rxrpc.h create mode 100644 include/uapi/linux/rxrpc.h (limited to 'include/uapi/linux') diff --git a/include/linux/rxrpc.h b/include/linux/rxrpc.h deleted file mode 100644 index 7343f71783dc..000000000000 --- a/include/linux/rxrpc.h +++ /dev/null @@ -1,79 +0,0 @@ -/* AF_RXRPC parameters - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _LINUX_RXRPC_H -#define _LINUX_RXRPC_H - -#include -#include - -/* - * RxRPC socket address - */ -struct sockaddr_rxrpc { - sa_family_t srx_family; /* address family */ - u16 srx_service; /* service desired */ - u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ - u16 transport_len; /* length of transport address */ - union { - sa_family_t family; /* transport address family */ - struct sockaddr_in sin; /* IPv4 transport address */ - struct sockaddr_in6 sin6; /* IPv6 transport address */ - } transport; -}; - -/* - * RxRPC socket options - */ -#define RXRPC_SECURITY_KEY 1 /* [clnt] set client security key */ -#define RXRPC_SECURITY_KEYRING 2 /* [srvr] set ring of server security keys */ -#define RXRPC_EXCLUSIVE_CONNECTION 3 /* Deprecated; use RXRPC_EXCLUSIVE_CALL instead */ -#define RXRPC_MIN_SECURITY_LEVEL 4 /* minimum security level */ -#define RXRPC_UPGRADEABLE_SERVICE 5 /* Upgrade service[0] -> service[1] */ -#define RXRPC_SUPPORTED_CMSG 6 /* Get highest supported control message type */ - -/* - * RxRPC control messages - * - If neither abort or accept are specified, the message is a data message. 
- * - terminal messages mean that a user call ID tag can be recycled - * - s/r/- indicate whether these are applicable to sendmsg() and/or recvmsg() - */ -enum rxrpc_cmsg_type { - RXRPC_USER_CALL_ID = 1, /* sr: user call ID specifier */ - RXRPC_ABORT = 2, /* sr: abort request / notification [terminal] */ - RXRPC_ACK = 3, /* -r: [Service] RPC op final ACK received [terminal] */ - RXRPC_NET_ERROR = 5, /* -r: network error received [terminal] */ - RXRPC_BUSY = 6, /* -r: server busy received [terminal] */ - RXRPC_LOCAL_ERROR = 7, /* -r: local error generated [terminal] */ - RXRPC_NEW_CALL = 8, /* -r: [Service] new incoming call notification */ - RXRPC_ACCEPT = 9, /* s-: [Service] accept request */ - RXRPC_EXCLUSIVE_CALL = 10, /* s-: Call should be on exclusive connection */ - RXRPC_UPGRADE_SERVICE = 11, /* s-: Request service upgrade for client call */ - RXRPC_TX_LENGTH = 12, /* s-: Total length of Tx data */ - RXRPC__SUPPORTED -}; - -/* - * RxRPC security levels - */ -#define RXRPC_SECURITY_PLAIN 0 /* plain secure-checksummed packets only */ -#define RXRPC_SECURITY_AUTH 1 /* authenticated packets */ -#define RXRPC_SECURITY_ENCRYPT 2 /* encrypted packets */ - -/* - * RxRPC security indices - */ -#define RXRPC_SECURITY_NONE 0 /* no security protocol */ -#define RXRPC_SECURITY_RXKAD 2 /* kaserver or kerberos 4 */ -#define RXRPC_SECURITY_RXGK 4 /* gssapi-based */ -#define RXRPC_SECURITY_RXK5 5 /* kerberos 5 */ - -#endif /* _LINUX_RXRPC_H */ diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h new file mode 100644 index 000000000000..08e2fb9c70ae --- /dev/null +++ b/include/uapi/linux/rxrpc.h @@ -0,0 +1,80 @@ +/* Types and definitions for AF_RXRPC. + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_RXRPC_H +#define _UAPI_LINUX_RXRPC_H + +#include +#include +#include + +/* + * RxRPC socket address + */ +struct sockaddr_rxrpc { + sa_family_t srx_family; /* address family */ + u16 srx_service; /* service desired */ + u16 transport_type; /* type of transport socket (SOCK_DGRAM) */ + u16 transport_len; /* length of transport address */ + union { + sa_family_t family; /* transport address family */ + struct sockaddr_in sin; /* IPv4 transport address */ + struct sockaddr_in6 sin6; /* IPv6 transport address */ + } transport; +}; + +/* + * RxRPC socket options + */ +#define RXRPC_SECURITY_KEY 1 /* [clnt] set client security key */ +#define RXRPC_SECURITY_KEYRING 2 /* [srvr] set ring of server security keys */ +#define RXRPC_EXCLUSIVE_CONNECTION 3 /* Deprecated; use RXRPC_EXCLUSIVE_CALL instead */ +#define RXRPC_MIN_SECURITY_LEVEL 4 /* minimum security level */ +#define RXRPC_UPGRADEABLE_SERVICE 5 /* Upgrade service[0] -> service[1] */ +#define RXRPC_SUPPORTED_CMSG 6 /* Get highest supported control message type */ + +/* + * RxRPC control messages + * - If neither abort or accept are specified, the message is a data message. 
+ * - terminal messages mean that a user call ID tag can be recycled + * - s/r/- indicate whether these are applicable to sendmsg() and/or recvmsg() + */ +enum rxrpc_cmsg_type { + RXRPC_USER_CALL_ID = 1, /* sr: user call ID specifier */ + RXRPC_ABORT = 2, /* sr: abort request / notification [terminal] */ + RXRPC_ACK = 3, /* -r: [Service] RPC op final ACK received [terminal] */ + RXRPC_NET_ERROR = 5, /* -r: network error received [terminal] */ + RXRPC_BUSY = 6, /* -r: server busy received [terminal] */ + RXRPC_LOCAL_ERROR = 7, /* -r: local error generated [terminal] */ + RXRPC_NEW_CALL = 8, /* -r: [Service] new incoming call notification */ + RXRPC_ACCEPT = 9, /* s-: [Service] accept request */ + RXRPC_EXCLUSIVE_CALL = 10, /* s-: Call should be on exclusive connection */ + RXRPC_UPGRADE_SERVICE = 11, /* s-: Request service upgrade for client call */ + RXRPC_TX_LENGTH = 12, /* s-: Total length of Tx data */ + RXRPC__SUPPORTED +}; + +/* + * RxRPC security levels + */ +#define RXRPC_SECURITY_PLAIN 0 /* plain secure-checksummed packets only */ +#define RXRPC_SECURITY_AUTH 1 /* authenticated packets */ +#define RXRPC_SECURITY_ENCRYPT 2 /* encrypted packets */ + +/* + * RxRPC security indices + */ +#define RXRPC_SECURITY_NONE 0 /* no security protocol */ +#define RXRPC_SECURITY_RXKAD 2 /* kaserver or kerberos 4 */ +#define RXRPC_SECURITY_RXGK 4 /* gssapi-based */ +#define RXRPC_SECURITY_RXK5 5 /* kerberos 5 */ + +#endif /* _UAPI_LINUX_RXRPC_H */ -- cgit v1.2.3 From ddc6c70f07bb1f6dd39a2c6c430f7b4fa95199c8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 21 Jul 2017 10:07:10 +0100 Subject: rxrpc: Move the packet.h include file into net/rxrpc/ Move the protocol description header file into net/rxrpc/ and rename it to protocol.h. It's no longer necessary to expose it as packets are no longer exposed to kernel services (such as AFS) that use the facility. The abort codes are transferred to the UAPI header instead as we pass these back to userspace and also to kernel services. Signed-off-by: David Howells --- fs/afs/misc.c | 1 - fs/afs/rxrpc.c | 1 - include/rxrpc/packet.h | 235 --------------------------------------------- include/uapi/linux/rxrpc.h | 44 +++++++++ net/rxrpc/ar-internal.h | 2 +- net/rxrpc/protocol.h | 190 ++++++++++++++++++++++++++++++++++++ 6 files changed, 235 insertions(+), 238 deletions(-) delete mode 100644 include/rxrpc/packet.h create mode 100644 net/rxrpc/protocol.h (limited to 'include/uapi/linux') diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 100b207efc9e..c05f1f1c0d41 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -12,7 +12,6 @@ #include #include #include -#include #include "internal.h" #include "afs_fs.h" diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 02781e78ffb6..10743043d431 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -14,7 +14,6 @@ #include #include -#include #include "internal.h" #include "afs_cm.h" diff --git a/include/rxrpc/packet.h b/include/rxrpc/packet.h deleted file mode 100644 index a2dcfb850b9f..000000000000 --- a/include/rxrpc/packet.h +++ /dev/null @@ -1,235 +0,0 @@ -/* packet.h: Rx packet layout and definitions - * - * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. 
- */ - -#ifndef _LINUX_RXRPC_PACKET_H -#define _LINUX_RXRPC_PACKET_H - -typedef u32 rxrpc_seq_t; /* Rx message sequence number */ -typedef u32 rxrpc_serial_t; /* Rx message serial number */ -typedef __be32 rxrpc_seq_net_t; /* on-the-wire Rx message sequence number */ -typedef __be32 rxrpc_serial_net_t; /* on-the-wire Rx message serial number */ - -/*****************************************************************************/ -/* - * on-the-wire Rx packet header - * - all multibyte fields should be in network byte order - */ -struct rxrpc_wire_header { - __be32 epoch; /* client boot timestamp */ -#define RXRPC_RANDOM_EPOCH 0x80000000 /* Random if set, date-based if not */ - - __be32 cid; /* connection and channel ID */ -#define RXRPC_MAXCALLS 4 /* max active calls per conn */ -#define RXRPC_CHANNELMASK (RXRPC_MAXCALLS-1) /* mask for channel ID */ -#define RXRPC_CIDMASK (~RXRPC_CHANNELMASK) /* mask for connection ID */ -#define RXRPC_CIDSHIFT ilog2(RXRPC_MAXCALLS) /* shift for connection ID */ -#define RXRPC_CID_INC (1 << RXRPC_CIDSHIFT) /* connection ID increment */ - - __be32 callNumber; /* call ID (0 for connection-level packets) */ - __be32 seq; /* sequence number of pkt in call stream */ - __be32 serial; /* serial number of pkt sent to network */ - - uint8_t type; /* packet type */ -#define RXRPC_PACKET_TYPE_DATA 1 /* data */ -#define RXRPC_PACKET_TYPE_ACK 2 /* ACK */ -#define RXRPC_PACKET_TYPE_BUSY 3 /* call reject */ -#define RXRPC_PACKET_TYPE_ABORT 4 /* call/connection abort */ -#define RXRPC_PACKET_TYPE_ACKALL 5 /* ACK all outstanding packets on call */ -#define RXRPC_PACKET_TYPE_CHALLENGE 6 /* connection security challenge (SRVR->CLNT) */ -#define RXRPC_PACKET_TYPE_RESPONSE 7 /* connection secutity response (CLNT->SRVR) */ -#define RXRPC_PACKET_TYPE_DEBUG 8 /* debug info request */ -#define RXRPC_PACKET_TYPE_VERSION 13 /* version string request */ -#define RXRPC_N_PACKET_TYPES 14 /* number of packet types (incl type 0) */ - - uint8_t flags; /* packet flags */ -#define RXRPC_CLIENT_INITIATED 0x01 /* signifies a packet generated by a client */ -#define RXRPC_REQUEST_ACK 0x02 /* request an unconditional ACK of this packet */ -#define RXRPC_LAST_PACKET 0x04 /* the last packet from this side for this call */ -#define RXRPC_MORE_PACKETS 0x08 /* more packets to come */ -#define RXRPC_JUMBO_PACKET 0x20 /* [DATA] this is a jumbo packet */ -#define RXRPC_SLOW_START_OK 0x20 /* [ACK] slow start supported */ - - uint8_t userStatus; /* app-layer defined status */ -#define RXRPC_USERSTATUS_SERVICE_UPGRADE 0x01 /* AuriStor service upgrade request */ - - uint8_t securityIndex; /* security protocol ID */ - union { - __be16 _rsvd; /* reserved */ - __be16 cksum; /* kerberos security checksum */ - }; - __be16 serviceId; /* service ID */ - -} __packed; - -#define RXRPC_SUPPORTED_PACKET_TYPES ( \ - (1 << RXRPC_PACKET_TYPE_DATA) | \ - (1 << RXRPC_PACKET_TYPE_ACK) | \ - (1 << RXRPC_PACKET_TYPE_BUSY) | \ - (1 << RXRPC_PACKET_TYPE_ABORT) | \ - (1 << RXRPC_PACKET_TYPE_ACKALL) | \ - (1 << RXRPC_PACKET_TYPE_CHALLENGE) | \ - (1 << RXRPC_PACKET_TYPE_RESPONSE) | \ - /*(1 << RXRPC_PACKET_TYPE_DEBUG) | */ \ - (1 << RXRPC_PACKET_TYPE_VERSION)) - -/*****************************************************************************/ -/* - * jumbo packet secondary header - * - can be mapped to read header by: - * - new_serial = serial + 1 - * - new_seq = seq + 1 - * - new_flags = j_flags - * - new__rsvd = j__rsvd - * - duplicating all other fields - */ -struct rxrpc_jumbo_header { - uint8_t flags; /* packet flags (as per 
rxrpc_header) */ - uint8_t pad; - union { - __be16 _rsvd; /* reserved */ - __be16 cksum; /* kerberos security checksum */ - }; -}; - -#define RXRPC_JUMBO_DATALEN 1412 /* non-terminal jumbo packet data length */ -#define RXRPC_JUMBO_SUBPKTLEN (RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header)) - -/*****************************************************************************/ -/* - * on-the-wire Rx ACK packet data payload - * - all multibyte fields should be in network byte order - */ -struct rxrpc_ackpacket { - __be16 bufferSpace; /* number of packet buffers available */ - __be16 maxSkew; /* diff between serno being ACK'd and highest serial no - * received */ - __be32 firstPacket; /* sequence no of first ACK'd packet in attached list */ - __be32 previousPacket; /* sequence no of previous packet received */ - __be32 serial; /* serial no of packet that prompted this ACK */ - - uint8_t reason; /* reason for ACK */ -#define RXRPC_ACK_REQUESTED 1 /* ACK was requested on packet */ -#define RXRPC_ACK_DUPLICATE 2 /* duplicate packet received */ -#define RXRPC_ACK_OUT_OF_SEQUENCE 3 /* out of sequence packet received */ -#define RXRPC_ACK_EXCEEDS_WINDOW 4 /* packet received beyond end of ACK window */ -#define RXRPC_ACK_NOSPACE 5 /* packet discarded due to lack of buffer space */ -#define RXRPC_ACK_PING 6 /* keep alive ACK */ -#define RXRPC_ACK_PING_RESPONSE 7 /* response to RXRPC_ACK_PING */ -#define RXRPC_ACK_DELAY 8 /* nothing happened since received packet */ -#define RXRPC_ACK_IDLE 9 /* ACK due to fully received ACK window */ -#define RXRPC_ACK__INVALID 10 /* Representation of invalid ACK reason */ - - uint8_t nAcks; /* number of ACKs */ -#define RXRPC_MAXACKS 255 - - uint8_t acks[0]; /* list of ACK/NAKs */ -#define RXRPC_ACK_TYPE_NACK 0 -#define RXRPC_ACK_TYPE_ACK 1 - -} __packed; - -/* Some ACKs refer to specific packets and some are general and can be updated. 
*/ -#define RXRPC_ACK_UPDATEABLE ((1 << RXRPC_ACK_REQUESTED) | \ - (1 << RXRPC_ACK_PING_RESPONSE) | \ - (1 << RXRPC_ACK_DELAY) | \ - (1 << RXRPC_ACK_IDLE)) - - -/* - * ACK packets can have a further piece of information tagged on the end - */ -struct rxrpc_ackinfo { - __be32 rxMTU; /* maximum Rx MTU size (bytes) [AFS 3.3] */ - __be32 maxMTU; /* maximum interface MTU size (bytes) [AFS 3.3] */ - __be32 rwind; /* Rx window size (packets) [AFS 3.4] */ - __be32 jumbo_max; /* max packets to stick into a jumbo packet [AFS 3.5] */ -}; - -/*****************************************************************************/ -/* - * Kerberos security type-2 challenge packet - */ -struct rxkad_challenge { - __be32 version; /* version of this challenge type */ - __be32 nonce; /* encrypted random number */ - __be32 min_level; /* minimum security level */ - __be32 __padding; /* padding to 8-byte boundary */ -} __packed; - -/*****************************************************************************/ -/* - * Kerberos security type-2 response packet - */ -struct rxkad_response { - __be32 version; /* version of this response type */ - __be32 __pad; - - /* encrypted bit of the response */ - struct { - __be32 epoch; /* current epoch */ - __be32 cid; /* parent connection ID */ - __be32 checksum; /* checksum */ - __be32 securityIndex; /* security type */ - __be32 call_id[4]; /* encrypted call IDs */ - __be32 inc_nonce; /* challenge nonce + 1 */ - __be32 level; /* desired level */ - } encrypted; - - __be32 kvno; /* Kerberos key version number */ - __be32 ticket_len; /* Kerberos ticket length */ -} __packed; - -/*****************************************************************************/ -/* - * RxRPC-level abort codes - */ -#define RX_CALL_DEAD -1 /* call/conn has been inactive and is shut down */ -#define RX_INVALID_OPERATION -2 /* invalid operation requested / attempted */ -#define RX_CALL_TIMEOUT -3 /* call timeout exceeded */ -#define RX_EOF -4 /* unexpected end of data on read op */ -#define RX_PROTOCOL_ERROR -5 /* low-level protocol error */ -#define RX_USER_ABORT -6 /* generic user abort */ -#define RX_ADDRINUSE -7 /* UDP port in use */ -#define RX_DEBUGI_BADTYPE -8 /* bad debugging packet type */ - -/* - * (un)marshalling abort codes (rxgen) - */ -#define RXGEN_CC_MARSHAL -450 -#define RXGEN_CC_UNMARSHAL -451 -#define RXGEN_SS_MARSHAL -452 -#define RXGEN_SS_UNMARSHAL -453 -#define RXGEN_DECODE -454 -#define RXGEN_OPCODE -455 -#define RXGEN_SS_XDRFREE -456 -#define RXGEN_CC_XDRFREE -457 - -/* - * Rx kerberos security abort codes - * - unfortunately we have no generalised security abort codes to say things - * like "unsupported security", so we have to use these instead and hope the - * other side understands - */ -#define RXKADINCONSISTENCY 19270400 /* security module structure inconsistent */ -#define RXKADPACKETSHORT 19270401 /* packet too short for security challenge */ -#define RXKADLEVELFAIL 19270402 /* security level negotiation failed */ -#define RXKADTICKETLEN 19270403 /* ticket length too short or too long */ -#define RXKADOUTOFSEQUENCE 19270404 /* packet had bad sequence number */ -#define RXKADNOAUTH 19270405 /* caller not authorised */ -#define RXKADBADKEY 19270406 /* illegal key: bad parity or weak */ -#define RXKADBADTICKET 19270407 /* security object was passed a bad ticket */ -#define RXKADUNKNOWNKEY 19270408 /* ticket contained unknown key version number */ -#define RXKADEXPIRED 19270409 /* authentication expired */ -#define RXKADSEALEDINCON 19270410 /* sealed data inconsistent */ -#define 
RXKADDATALEN 19270411 /* user data too long */ -#define RXKADILLEGALLEVEL 19270412 /* caller not authorised to use encrypted conns */ - -#endif /* _LINUX_RXRPC_PACKET_H */ diff --git a/include/uapi/linux/rxrpc.h b/include/uapi/linux/rxrpc.h index 08e2fb9c70ae..9656aad8f8f7 100644 --- a/include/uapi/linux/rxrpc.h +++ b/include/uapi/linux/rxrpc.h @@ -77,4 +77,48 @@ enum rxrpc_cmsg_type { #define RXRPC_SECURITY_RXGK 4 /* gssapi-based */ #define RXRPC_SECURITY_RXK5 5 /* kerberos 5 */ +/* + * RxRPC-level abort codes + */ +#define RX_CALL_DEAD -1 /* call/conn has been inactive and is shut down */ +#define RX_INVALID_OPERATION -2 /* invalid operation requested / attempted */ +#define RX_CALL_TIMEOUT -3 /* call timeout exceeded */ +#define RX_EOF -4 /* unexpected end of data on read op */ +#define RX_PROTOCOL_ERROR -5 /* low-level protocol error */ +#define RX_USER_ABORT -6 /* generic user abort */ +#define RX_ADDRINUSE -7 /* UDP port in use */ +#define RX_DEBUGI_BADTYPE -8 /* bad debugging packet type */ + +/* + * (un)marshalling abort codes (rxgen) + */ +#define RXGEN_CC_MARSHAL -450 +#define RXGEN_CC_UNMARSHAL -451 +#define RXGEN_SS_MARSHAL -452 +#define RXGEN_SS_UNMARSHAL -453 +#define RXGEN_DECODE -454 +#define RXGEN_OPCODE -455 +#define RXGEN_SS_XDRFREE -456 +#define RXGEN_CC_XDRFREE -457 + +/* + * Rx kerberos security abort codes + * - unfortunately we have no generalised security abort codes to say things + * like "unsupported security", so we have to use these instead and hope the + * other side understands + */ +#define RXKADINCONSISTENCY 19270400 /* security module structure inconsistent */ +#define RXKADPACKETSHORT 19270401 /* packet too short for security challenge */ +#define RXKADLEVELFAIL 19270402 /* security level negotiation failed */ +#define RXKADTICKETLEN 19270403 /* ticket length too short or too long */ +#define RXKADOUTOFSEQUENCE 19270404 /* packet had bad sequence number */ +#define RXKADNOAUTH 19270405 /* caller not authorised */ +#define RXKADBADKEY 19270406 /* illegal key: bad parity or weak */ +#define RXKADBADTICKET 19270407 /* security object was passed a bad ticket */ +#define RXKADUNKNOWNKEY 19270408 /* ticket contained unknown key version number */ +#define RXKADEXPIRED 19270409 /* authentication expired */ +#define RXKADSEALEDINCON 19270410 /* sealed data inconsistent */ +#define RXKADDATALEN 19270411 /* user data too long */ +#define RXKADILLEGALLEVEL 19270412 /* caller not authorised to use encrypted conns */ + #endif /* _UAPI_LINUX_RXRPC_H */ diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 69b97339ff9d..8c0db9b3e4ab 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -15,7 +15,7 @@ #include #include #include -#include +#include "protocol.h" #if 0 #define CHECK_SLAB_OKAY(X) \ diff --git a/net/rxrpc/protocol.h b/net/rxrpc/protocol.h new file mode 100644 index 000000000000..4bddcf3face3 --- /dev/null +++ b/net/rxrpc/protocol.h @@ -0,0 +1,190 @@ +/* packet.h: Rx packet layout and definitions + * + * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _LINUX_RXRPC_PACKET_H +#define _LINUX_RXRPC_PACKET_H + +typedef u32 rxrpc_seq_t; /* Rx message sequence number */ +typedef u32 rxrpc_serial_t; /* Rx message serial number */ +typedef __be32 rxrpc_seq_net_t; /* on-the-wire Rx message sequence number */ +typedef __be32 rxrpc_serial_net_t; /* on-the-wire Rx message serial number */ + +/*****************************************************************************/ +/* + * on-the-wire Rx packet header + * - all multibyte fields should be in network byte order + */ +struct rxrpc_wire_header { + __be32 epoch; /* client boot timestamp */ +#define RXRPC_RANDOM_EPOCH 0x80000000 /* Random if set, date-based if not */ + + __be32 cid; /* connection and channel ID */ +#define RXRPC_MAXCALLS 4 /* max active calls per conn */ +#define RXRPC_CHANNELMASK (RXRPC_MAXCALLS-1) /* mask for channel ID */ +#define RXRPC_CIDMASK (~RXRPC_CHANNELMASK) /* mask for connection ID */ +#define RXRPC_CIDSHIFT ilog2(RXRPC_MAXCALLS) /* shift for connection ID */ +#define RXRPC_CID_INC (1 << RXRPC_CIDSHIFT) /* connection ID increment */ + + __be32 callNumber; /* call ID (0 for connection-level packets) */ + __be32 seq; /* sequence number of pkt in call stream */ + __be32 serial; /* serial number of pkt sent to network */ + + uint8_t type; /* packet type */ +#define RXRPC_PACKET_TYPE_DATA 1 /* data */ +#define RXRPC_PACKET_TYPE_ACK 2 /* ACK */ +#define RXRPC_PACKET_TYPE_BUSY 3 /* call reject */ +#define RXRPC_PACKET_TYPE_ABORT 4 /* call/connection abort */ +#define RXRPC_PACKET_TYPE_ACKALL 5 /* ACK all outstanding packets on call */ +#define RXRPC_PACKET_TYPE_CHALLENGE 6 /* connection security challenge (SRVR->CLNT) */ +#define RXRPC_PACKET_TYPE_RESPONSE 7 /* connection secutity response (CLNT->SRVR) */ +#define RXRPC_PACKET_TYPE_DEBUG 8 /* debug info request */ +#define RXRPC_PACKET_TYPE_VERSION 13 /* version string request */ +#define RXRPC_N_PACKET_TYPES 14 /* number of packet types (incl type 0) */ + + uint8_t flags; /* packet flags */ +#define RXRPC_CLIENT_INITIATED 0x01 /* signifies a packet generated by a client */ +#define RXRPC_REQUEST_ACK 0x02 /* request an unconditional ACK of this packet */ +#define RXRPC_LAST_PACKET 0x04 /* the last packet from this side for this call */ +#define RXRPC_MORE_PACKETS 0x08 /* more packets to come */ +#define RXRPC_JUMBO_PACKET 0x20 /* [DATA] this is a jumbo packet */ +#define RXRPC_SLOW_START_OK 0x20 /* [ACK] slow start supported */ + + uint8_t userStatus; /* app-layer defined status */ +#define RXRPC_USERSTATUS_SERVICE_UPGRADE 0x01 /* AuriStor service upgrade request */ + + uint8_t securityIndex; /* security protocol ID */ + union { + __be16 _rsvd; /* reserved */ + __be16 cksum; /* kerberos security checksum */ + }; + __be16 serviceId; /* service ID */ + +} __packed; + +#define RXRPC_SUPPORTED_PACKET_TYPES ( \ + (1 << RXRPC_PACKET_TYPE_DATA) | \ + (1 << RXRPC_PACKET_TYPE_ACK) | \ + (1 << RXRPC_PACKET_TYPE_BUSY) | \ + (1 << RXRPC_PACKET_TYPE_ABORT) | \ + (1 << RXRPC_PACKET_TYPE_ACKALL) | \ + (1 << RXRPC_PACKET_TYPE_CHALLENGE) | \ + (1 << RXRPC_PACKET_TYPE_RESPONSE) | \ + /*(1 << RXRPC_PACKET_TYPE_DEBUG) | */ \ + (1 << RXRPC_PACKET_TYPE_VERSION)) + +/*****************************************************************************/ +/* + * jumbo packet secondary header + * - can be mapped to read header by: + * - new_serial = serial + 1 + * - new_seq = seq + 1 + * - new_flags = j_flags + * - new__rsvd = j__rsvd + * - duplicating all other fields + */ +struct rxrpc_jumbo_header { + uint8_t flags; /* packet flags (as per 
rxrpc_header) */ + uint8_t pad; + union { + __be16 _rsvd; /* reserved */ + __be16 cksum; /* kerberos security checksum */ + }; +}; + +#define RXRPC_JUMBO_DATALEN 1412 /* non-terminal jumbo packet data length */ +#define RXRPC_JUMBO_SUBPKTLEN (RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header)) + +/*****************************************************************************/ +/* + * on-the-wire Rx ACK packet data payload + * - all multibyte fields should be in network byte order + */ +struct rxrpc_ackpacket { + __be16 bufferSpace; /* number of packet buffers available */ + __be16 maxSkew; /* diff between serno being ACK'd and highest serial no + * received */ + __be32 firstPacket; /* sequence no of first ACK'd packet in attached list */ + __be32 previousPacket; /* sequence no of previous packet received */ + __be32 serial; /* serial no of packet that prompted this ACK */ + + uint8_t reason; /* reason for ACK */ +#define RXRPC_ACK_REQUESTED 1 /* ACK was requested on packet */ +#define RXRPC_ACK_DUPLICATE 2 /* duplicate packet received */ +#define RXRPC_ACK_OUT_OF_SEQUENCE 3 /* out of sequence packet received */ +#define RXRPC_ACK_EXCEEDS_WINDOW 4 /* packet received beyond end of ACK window */ +#define RXRPC_ACK_NOSPACE 5 /* packet discarded due to lack of buffer space */ +#define RXRPC_ACK_PING 6 /* keep alive ACK */ +#define RXRPC_ACK_PING_RESPONSE 7 /* response to RXRPC_ACK_PING */ +#define RXRPC_ACK_DELAY 8 /* nothing happened since received packet */ +#define RXRPC_ACK_IDLE 9 /* ACK due to fully received ACK window */ +#define RXRPC_ACK__INVALID 10 /* Representation of invalid ACK reason */ + + uint8_t nAcks; /* number of ACKs */ +#define RXRPC_MAXACKS 255 + + uint8_t acks[0]; /* list of ACK/NAKs */ +#define RXRPC_ACK_TYPE_NACK 0 +#define RXRPC_ACK_TYPE_ACK 1 + +} __packed; + +/* Some ACKs refer to specific packets and some are general and can be updated. 
*/ +#define RXRPC_ACK_UPDATEABLE ((1 << RXRPC_ACK_REQUESTED) | \ + (1 << RXRPC_ACK_PING_RESPONSE) | \ + (1 << RXRPC_ACK_DELAY) | \ + (1 << RXRPC_ACK_IDLE)) + + +/* + * ACK packets can have a further piece of information tagged on the end + */ +struct rxrpc_ackinfo { + __be32 rxMTU; /* maximum Rx MTU size (bytes) [AFS 3.3] */ + __be32 maxMTU; /* maximum interface MTU size (bytes) [AFS 3.3] */ + __be32 rwind; /* Rx window size (packets) [AFS 3.4] */ + __be32 jumbo_max; /* max packets to stick into a jumbo packet [AFS 3.5] */ +}; + +/*****************************************************************************/ +/* + * Kerberos security type-2 challenge packet + */ +struct rxkad_challenge { + __be32 version; /* version of this challenge type */ + __be32 nonce; /* encrypted random number */ + __be32 min_level; /* minimum security level */ + __be32 __padding; /* padding to 8-byte boundary */ +} __packed; + +/*****************************************************************************/ +/* + * Kerberos security type-2 response packet + */ +struct rxkad_response { + __be32 version; /* version of this response type */ + __be32 __pad; + + /* encrypted bit of the response */ + struct { + __be32 epoch; /* current epoch */ + __be32 cid; /* parent connection ID */ + __be32 checksum; /* checksum */ + __be32 securityIndex; /* security type */ + __be32 call_id[4]; /* encrypted call IDs */ + __be32 inc_nonce; /* challenge nonce + 1 */ + __be32 level; /* desired level */ + } encrypted; + + __be32 kvno; /* Kerberos key version number */ + __be32 ticket_len; /* Kerberos ticket length */ +} __packed; + +#endif /* _LINUX_RXRPC_PACKET_H */ -- cgit v1.2.3 From 784b4e612d42a2b7578d7fab2ed78940e10536bc Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Wed, 19 Jul 2017 16:32:23 +0200 Subject: netfilter: nf_tables: Attach process info to NFT_MSG_NEWGEN notifications This is helpful for 'nft monitor' to track which process caused a given change to the ruleset. 
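As a rough, hypothetical illustration of the consumer side (not part of the patch), an 'nft monitor'-style listener built on libmnl could pick the new attributes out of an NFT_MSG_NEWGEN notification along these lines; the surrounding socket/dump scaffolding is omitted and the print format is invented.

#include <stdio.h>
#include <arpa/inet.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>

static int gen_attr_cb(const struct nlattr *attr, void *data)
{
	const struct nlattr **tb = data;

	if (mnl_attr_type_valid(attr, NFTA_GEN_MAX) < 0)
		return MNL_CB_OK;		/* skip unknown attributes */

	tb[mnl_attr_get_type(attr)] = attr;
	return MNL_CB_OK;
}

static void print_newgen(const struct nlmsghdr *nlh)
{
	const struct nlattr *tb[NFTA_GEN_MAX + 1] = {};

	mnl_attr_parse(nlh, sizeof(struct nfgenmsg), gen_attr_cb, tb);

	if (tb[NFTA_GEN_ID])
		printf("generation %u",
		       ntohl(mnl_attr_get_u32(tb[NFTA_GEN_ID])));
	/* the kernel stores both values with nla_put_be32/nla_put_string */
	if (tb[NFTA_GEN_PROC_NAME] && tb[NFTA_GEN_PROC_PID])
		printf(" by %s[%u]",
		       mnl_attr_get_str(tb[NFTA_GEN_PROC_NAME]),
		       ntohl(mnl_attr_get_u32(tb[NFTA_GEN_PROC_PID])));
	printf("\n");
}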
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 683f6f88fcac..6f0a950e21c3 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1221,6 +1221,8 @@ enum nft_objref_attributes { enum nft_gen_attributes { NFTA_GEN_UNSPEC, NFTA_GEN_ID, + NFTA_GEN_PROC_PID, + NFTA_GEN_PROC_NAME, __NFTA_GEN_MAX }; #define NFTA_GEN_MAX (__NFTA_GEN_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 7fbf0070aba1..b77ad0813564 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4657,6 +4657,7 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; + char buf[TASK_COMM_LEN]; int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0); @@ -4668,7 +4669,9 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, nfmsg->version = NFNETLINK_V0; nfmsg->res_id = htons(net->nft.base_seq & 0xffff); - if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq))) + if (nla_put_be32(skb, NFTA_GEN_ID, htonl(net->nft.base_seq)) || + nla_put_be32(skb, NFTA_GEN_PROC_PID, htonl(task_pid_nr(current))) || + nla_put_string(skb, NFTA_GEN_PROC_NAME, get_task_comm(buf, current))) goto nla_put_failure; nlmsg_end(skb, nlh); -- cgit v1.2.3 From 1a5f3da20bd966220931239fbd31e6ac6ff42251 Mon Sep 17 00:00:00 2001 From: Vidya Sagar Ravipati Date: Thu, 27 Jul 2017 16:47:26 -0700 Subject: net: ethtool: add support for forward error correction modes Forward Error Correction (FEC) modes, i.e. Base-R and Reed-Solomon modes, are introduced in 25G/40G/100G standards for providing good BER at high speeds. Various networking devices which support 25G/40G/100G provide the ability to manage supported FEC modes, and the lack of FEC encoding control and reporting today is a source of interoperability issues for many vendors. FEC capability as well as a specific FEC mode, i.e. Base-R or RS modes, can be requested or advertised through bits D44:47 of the base link codeword. This patch set intends to provide an option under ethtool to manage and report FEC encoding settings for networking devices as per IEEE 802.3 bj, bm and by specs. set-fec/show-fec option(s) are designed to provide control and report the FEC encoding on the link. SET FEC option: root@tor: ethtool --set-fec swp1 encoding [off | RS | BaseR | auto] Encoding: Types of encoding Off : Turning off any encoding RS : enforcing RS-FEC encoding on supported speeds BaseR : enforcing Base R encoding on supported speeds Auto : IEEE defaults for the speed/medium combination Here are a few examples of what we would expect if encoding=auto: - if autoneg is on, we are expecting FEC to be negotiated as on or off as long as the protocol supports it - if the hardware is capable of detecting the FEC encoding on its receiver, it will reconfigure its encoder to match - in the absence of the above, the configuration would be set to IEEE defaults. From our understanding, this is essentially what most hardware/driver combinations are doing today in the absence of a way for users to control the behavior. 
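Before the show side of the interface, here is a hedged sketch of how a NIC driver might wire up the two new ethtool_ops callbacks this series adds; the foo_* names, the private state and the supported-mode mask are all invented for illustration, and a real driver would reprogram its PCS/PMA rather than just record the choice.

#include <linux/ethtool.h>
#include <linux/netdevice.h>

struct foo_priv {
	bool rs_fec_on;			/* invented device state */
};

static int foo_get_fecparam(struct net_device *dev,
			    struct ethtool_fecparam *fp)
{
	struct foo_priv *priv = netdev_priv(dev);

	/* every mode this (imaginary) hardware can be asked for ... */
	fp->fec = ETHTOOL_FEC_OFF | ETHTOOL_FEC_RS | ETHTOOL_FEC_AUTO;
	/* ... and the one currently in effect on the link */
	fp->active_fec = priv->rs_fec_on ? ETHTOOL_FEC_RS : ETHTOOL_FEC_OFF;
	return 0;
}

static int foo_set_fecparam(struct net_device *dev,
			    struct ethtool_fecparam *fp)
{
	struct foo_priv *priv = netdev_priv(dev);

	if (fp->fec & ~(ETHTOOL_FEC_OFF | ETHTOOL_FEC_RS | ETHTOOL_FEC_AUTO))
		return -EOPNOTSUPP;

	priv->rs_fec_on = !!(fp->fec & ETHTOOL_FEC_RS);
	/* a real driver would apply the new encoding to the hardware here */
	return 0;
}

static const struct ethtool_ops foo_ethtool_ops = {
	.get_fecparam	= foo_get_fecparam,
	.set_fecparam	= foo_set_fecparam,
};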
SHOW FEC option: root@tor: ethtool --show-fec swp1 FEC parameters for swp1: Active FEC encodings: RS Configured FEC encodings: RS | BaseR ETHTOOL DEVNAME output modification: ethtool devname output: root@tor:~# ethtool swp1 Settings for swp1: root@hpe-7712-03:~# ethtool swp18 Settings for swp18: Supported ports: [ FIBRE ] Supported link modes: 40000baseCR4/Full 40000baseSR4/Full 40000baseLR4/Full 100000baseSR4/Full 100000baseCR4/Full 100000baseLR4_ER4/Full Supported pause frame use: No Supports auto-negotiation: Yes Supported FEC modes: [RS | BaseR | None | Not reported] Advertised link modes: Not reported Advertised pause frame use: No Advertised auto-negotiation: No Advertised FEC modes: [RS | BaseR | None | Not reported] <<<< One or more FEC modes Speed: 100000Mb/s Duplex: Full Port: FIBRE PHYAD: 106 Transceiver: internal Auto-negotiation: off Link detected: yes This patch includes following changes a) New ETHTOOL_SFECPARAM/SFECPARAM API, handled by the new get_fecparam/set_fecparam callbacks, provides support for configuration of forward error correction modes. b) Link mode bits for FEC modes i.e. None (No FEC mode), RS, BaseR/FC are defined so that users can configure these fec modes for supported and advertising fields as part of link autonegotiation. Signed-off-by: Vidya Sagar Ravipati Signed-off-by: Dustin Byford Signed-off-by: Roopa Prabhu Signed-off-by: David S. Miller --- include/linux/ethtool.h | 4 ++++ include/uapi/linux/ethtool.h | 48 +++++++++++++++++++++++++++++++++++++++++++- net/core/ethtool.c | 34 +++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 83cc9863444b..afdbb701fdb4 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -374,5 +374,9 @@ struct ethtool_ops { struct ethtool_link_ksettings *); int (*set_link_ksettings)(struct net_device *, const struct ethtool_link_ksettings *); + int (*get_fecparam)(struct net_device *, + struct ethtool_fecparam *); + int (*set_fecparam)(struct net_device *, + struct ethtool_fecparam *); }; #endif /* _LINUX_ETHTOOL_H */ diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 7d4a594d5d58..9c041dae8e2c 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1238,6 +1238,47 @@ struct ethtool_per_queue_op { char data[]; }; +/** + * struct ethtool_fecparam - Ethernet forward error correction(fec) parameters + * @cmd: Command number = %ETHTOOL_GFECPARAM or %ETHTOOL_SFECPARAM + * @active_fec: FEC mode which is active on porte + * @fec: Bitmask of supported/configured FEC modes + * @rsvd: Reserved for future extensions. i.e FEC bypass feature. + * + * Drivers should reject a non-zero setting of @autoneg when + * autoneogotiation is disabled (or not supported) for the link. 
+ * + */ +struct ethtool_fecparam { + __u32 cmd; + /* bitmask of FEC modes */ + __u32 active_fec; + __u32 fec; + __u32 reserved; +}; + +/** + * enum ethtool_fec_config_bits - flags definition of ethtool_fec_configuration + * @ETHTOOL_FEC_NONE: FEC mode configuration is not supported + * @ETHTOOL_FEC_AUTO: Default/Best FEC mode provided by driver + * @ETHTOOL_FEC_OFF: No FEC Mode + * @ETHTOOL_FEC_RS: Reed-Solomon Forward Error Detection mode + * @ETHTOOL_FEC_BASER: Base-R/Reed-Solomon Forward Error Detection mode + */ +enum ethtool_fec_config_bits { + ETHTOOL_FEC_NONE_BIT, + ETHTOOL_FEC_AUTO_BIT, + ETHTOOL_FEC_OFF_BIT, + ETHTOOL_FEC_RS_BIT, + ETHTOOL_FEC_BASER_BIT, +}; + +#define ETHTOOL_FEC_NONE (1 << ETHTOOL_FEC_NONE_BIT) +#define ETHTOOL_FEC_AUTO (1 << ETHTOOL_FEC_AUTO_BIT) +#define ETHTOOL_FEC_OFF (1 << ETHTOOL_FEC_OFF_BIT) +#define ETHTOOL_FEC_RS (1 << ETHTOOL_FEC_RS_BIT) +#define ETHTOOL_FEC_BASER (1 << ETHTOOL_FEC_BASER_BIT) + /* CMDs currently supported */ #define ETHTOOL_GSET 0x00000001 /* DEPRECATED, Get settings. * Please use ETHTOOL_GLINKSETTINGS @@ -1330,6 +1371,8 @@ struct ethtool_per_queue_op { #define ETHTOOL_SLINKSETTINGS 0x0000004d /* Set ethtool_link_settings */ #define ETHTOOL_PHY_GTUNABLE 0x0000004e /* Get PHY tunable configuration */ #define ETHTOOL_PHY_STUNABLE 0x0000004f /* Set PHY tunable configuration */ +#define ETHTOOL_GFECPARAM 0x00000050 /* Get FEC settings */ +#define ETHTOOL_SFECPARAM 0x00000051 /* Set FEC settings */ /* compatibility with older code */ #define SPARC_ETH_GSET ETHTOOL_GSET @@ -1387,6 +1430,9 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_2500baseT_Full_BIT = 47, ETHTOOL_LINK_MODE_5000baseT_Full_BIT = 48, + ETHTOOL_LINK_MODE_FEC_NONE_BIT = 49, + ETHTOOL_LINK_MODE_FEC_RS_BIT = 50, + ETHTOOL_LINK_MODE_FEC_BASER_BIT = 51, /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_* @@ -1395,7 +1441,7 @@ enum ethtool_link_mode_bit_indices { */ __ETHTOOL_LINK_MODE_LAST - = ETHTOOL_LINK_MODE_5000baseT_Full_BIT, + = ETHTOOL_LINK_MODE_FEC_BASER_BIT, }; #define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \ diff --git a/net/core/ethtool.c b/net/core/ethtool.c index b987bc475fc8..6a582ae4c5d9 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -2512,6 +2512,33 @@ static int set_phy_tunable(struct net_device *dev, void __user *useraddr) return ret; } +static int ethtool_get_fecparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_fecparam fecparam = { ETHTOOL_GFECPARAM }; + + if (!dev->ethtool_ops->get_fecparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_fecparam(dev, &fecparam); + + if (copy_to_user(useraddr, &fecparam, sizeof(fecparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_fecparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_fecparam fecparam; + + if (!dev->ethtool_ops->set_fecparam) + return -EOPNOTSUPP; + + if (copy_from_user(&fecparam, useraddr, sizeof(fecparam))) + return -EFAULT; + + return dev->ethtool_ops->set_fecparam(dev, &fecparam); +} + /* The main entry point in this file. 
Called from net/core/dev_ioctl.c */ int dev_ethtool(struct net *net, struct ifreq *ifr) @@ -2570,6 +2597,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GTUNABLE: case ETHTOOL_PHY_GTUNABLE: case ETHTOOL_GLINKSETTINGS: + case ETHTOOL_GFECPARAM: break; default: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -2779,6 +2807,12 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_PHY_STUNABLE: rc = set_phy_tunable(dev, useraddr); break; + case ETHTOOL_GFECPARAM: + rc = ethtool_get_fecparam(dev, useraddr); + break; + case ETHTOOL_SFECPARAM: + rc = ethtool_set_fecparam(dev, useraddr); + break; default: rc = -EOPNOTSUPP; } -- cgit v1.2.3 From 64c83d837329531252a1a0f0dfdd4fd607e1d8e9 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 30 Jul 2017 13:24:49 -0400 Subject: net netlink: Add new type NLA_BITFIELD32 Generic bitflags attribute content sent to the kernel by the user. With this netlink attr type the user can either set or unset a flag in the kernel. The value is a bitmap that defines the bit values being set. The selector is a bitmask that defines which value bit is to be considered. A check is made to ensure that a kernel subsystem only accepts bitflags the kernel already knows about, i.e. if the user tries to set a bit flag that is not understood then it _will be rejected_. In the most basic form, the user specifies the attribute policy as: [ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags }, where myvalidflags is the bit mask of the flags the kernel understands. If the user _does not_ provide myvalidflags then the attribute will also be rejected. Examples: value = 0x0, and selector = 0x1 implies we are selecting bit 1 and we want to set its value to 0. value = 0x2, and selector = 0x2 implies we are selecting bit 2 and we want to set its value to 1. Suggested-by: Jiri Pirko Signed-off-by: Jamal Hadi Salim Signed-off-by: David S.
Miller --- include/net/netlink.h | 16 ++++++++++++++++ include/uapi/linux/netlink.h | 17 +++++++++++++++++ lib/nlattr.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/netlink.h b/include/net/netlink.h index ef8e6c3a80a6..82dd298b40c7 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -178,6 +178,7 @@ enum { NLA_S16, NLA_S32, NLA_S64, + NLA_BITFIELD32, __NLA_TYPE_MAX, }; @@ -206,6 +207,7 @@ enum { * NLA_MSECS Leaving the length field zero will verify the * given type fits, using it verifies minimum length * just like "All other" + * NLA_BITFIELD32 A 32-bit bitmap/bitselector attribute * All other Minimum length of attribute payload * * Example: @@ -213,11 +215,13 @@ enum { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, * [ATTR_BAZ] = { .len = sizeof(struct mystruct) }, + * [ATTR_GOO] = { .type = NLA_BITFIELD32, .validation_data = &myvalidflags }, * }; */ struct nla_policy { u16 type; u16 len; + void *validation_data; }; /** @@ -1202,6 +1206,18 @@ static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla) return tmp; } +/** + * nla_get_bitfield32 - return payload of 32 bitfield attribute + * @nla: nla_bitfield32 attribute + */ +static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla) +{ + struct nla_bitfield32 tmp; + + nla_memcpy(&tmp, nla, sizeof(tmp)); + return tmp; +} + /** * nla_memdup - duplicate attribute memory (kmemdup) * @src: netlink attribute to duplicate from diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index f86127a46cfc..f4fc9c9e123d 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -226,5 +226,22 @@ struct nlattr { #define NLA_ALIGN(len) (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1)) #define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr))) +/* Generic 32 bitflags attribute content sent to the kernel. + * + * The value is a bitmap that defines the values being set + * The selector is a bitmask that defines which value is legit + * + * Examples: + * value = 0x0, and selector = 0x1 + * implies we are selecting bit 1 and we want to set its value to 0. + * + * value = 0x2, and selector = 0x2 + * implies we are selecting bit 2 and we want to set its value to 1. 
+ * + */ +struct nla_bitfield32 { + __u32 value; + __u32 selector; +}; #endif /* _UAPI__LINUX_NETLINK_H */ diff --git a/lib/nlattr.c b/lib/nlattr.c index fb52435be42d..ee79b7a3c6b0 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -27,6 +27,30 @@ static const u8 nla_attr_minlen[NLA_TYPE_MAX+1] = { [NLA_S64] = sizeof(s64), }; +static int validate_nla_bitfield32(const struct nlattr *nla, + u32 *valid_flags_allowed) +{ + const struct nla_bitfield32 *bf = nla_data(nla); + u32 *valid_flags_mask = valid_flags_allowed; + + if (!valid_flags_allowed) + return -EINVAL; + + /*disallow invalid bit selector */ + if (bf->selector & ~*valid_flags_mask) + return -EINVAL; + + /*disallow invalid bit values */ + if (bf->value & ~*valid_flags_mask) + return -EINVAL; + + /*disallow valid bit values that are not selected*/ + if (bf->value & ~bf->selector) + return -EINVAL; + + return 0; +} + static int validate_nla(const struct nlattr *nla, int maxtype, const struct nla_policy *policy) { @@ -46,6 +70,12 @@ static int validate_nla(const struct nlattr *nla, int maxtype, return -ERANGE; break; + case NLA_BITFIELD32: + if (attrlen != sizeof(struct nla_bitfield32)) + return -ERANGE; + + return validate_nla_bitfield32(nla, pt->validation_data); + case NLA_NUL_STRING: if (pt->len) minlen = min_t(int, attrlen, pt->len + 1); -- cgit v1.2.3 From 90825b23a887f06f6c05bdde77b200c5fe9b6217 Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 30 Jul 2017 13:24:51 -0400 Subject: net sched actions: dump more than TCA_ACT_MAX_PRIO actions per batch When you dump hundreds of thousands of actions, getting only 32 per dump batch even when the socket buffer and memory allocations allow is inefficient. With this change, the user will get as many actions as can possibly fit within the given constraints available to the kernel. The top level action TLV space is extended. An attribute TCA_ROOT_FLAGS is used to carry flags; flag TCA_FLAG_LARGE_DUMP_ON is set by the user indicating the user is capable of processing these large dumps. Older user space which doesn't set this flag doesn't get the larger (than 32) batches. The kernel uses the TCA_ROOT_COUNT attribute to tell the user how many actions are put in a single batch. As such, the user space app knows how long to iterate (independent of the type of action being dumped) instead of a hardcoded maximum of 32, thus maintaining backward compat. Some results dumping 1.5M actions below; first an unpatched tc which doesn't understand these features:

prompt$ time -p tc actions ls action gact | grep index | wc -l
1500000
real 1388.43
user 2.07
sys 1386.79

Now let's see a patched tc which sets the correct flags when requesting a dump:

prompt$ time -p updatedtc actions ls action gact | grep index | wc -l
1500000
real 178.13
user 2.02
sys 176.96

That is about an 8x performance improvement for a tc app which sets its receive buffer to about 32K. Signed-off-by: Jamal Hadi Salim Reviewed-by: Jiri Pirko Signed-off-by: David S.
Miller --- include/uapi/linux/rtnetlink.h | 22 +++++++++++++++++-- net/sched/act_api.c | 50 +++++++++++++++++++++++++++++++++--------- 2 files changed, 60 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index d148505010a7..bfa80a6164d9 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -683,10 +683,28 @@ struct tcamsg { unsigned char tca__pad1; unsigned short tca__pad2; }; + +enum { + TCA_ROOT_UNSPEC, + TCA_ROOT_TAB, +#define TCA_ACT_TAB TCA_ROOT_TAB +#define TCAA_MAX TCA_ROOT_TAB + TCA_ROOT_FLAGS, + TCA_ROOT_COUNT, + __TCA_ROOT_MAX, +#define TCA_ROOT_MAX (__TCA_ROOT_MAX - 1) +}; + #define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg)))) #define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) -#define TCA_ACT_TAB 1 /* attr type must be >=1 */ -#define TCAA_MAX 1 +/* tcamsg flags stored in attribute TCA_ROOT_FLAGS + * + * TCA_FLAG_LARGE_DUMP_ON user->kernel to request for larger than TCA_ACT_MAX_PRIO + * actions in a dump. All dump responses will contain the number of actions + * being dumped stored in for user app's consumption in TCA_ROOT_COUNT + * + */ +#define TCA_FLAG_LARGE_DUMP_ON (1 << 0) /* New extended info filters for IFLA_EXT_MASK */ #define RTEXT_FILTER_VF (1 << 0) diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 848370e2fcca..d53653a73c4f 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -110,6 +110,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, struct netlink_callback *cb) { int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; + u32 act_flags = cb->args[2]; struct nlattr *nest; spin_lock_bh(&hinfo->lock); @@ -138,14 +139,18 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, } nla_nest_end(skb, nest); n_i++; - if (n_i >= TCA_ACT_MAX_PRIO) + if (!(act_flags & TCA_FLAG_LARGE_DUMP_ON) && + n_i >= TCA_ACT_MAX_PRIO) goto done; } } done: spin_unlock_bh(&hinfo->lock); - if (n_i) + if (n_i) { cb->args[0] += n_i; + if (act_flags & TCA_FLAG_LARGE_DUMP_ON) + cb->args[1] = n_i; + } return n_i; nla_put_failure: @@ -1068,11 +1073,17 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, return tcf_add_notify(net, n, &actions, portid); } +static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; +static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { + [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32, + .validation_data = &tcaa_root_flags_allowed }, +}; + static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct nlattr *tca[TCAA_MAX + 1]; + struct nlattr *tca[TCA_ROOT_MAX + 1]; u32 portid = skb ? 
NETLINK_CB(skb).portid : 0; int ret = 0, ovr = 0; @@ -1080,7 +1091,7 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCAA_MAX, NULL, + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ROOT_MAX, NULL, extack); if (ret < 0) return ret; @@ -1121,16 +1132,12 @@ replay: return ret; } -static struct nlattr *find_dump_kind(const struct nlmsghdr *n) +static struct nlattr *find_dump_kind(struct nlattr **nla) { struct nlattr *tb1, *tb2[TCA_ACT_MAX + 1]; struct nlattr *tb[TCA_ACT_MAX_PRIO + 1]; - struct nlattr *nla[TCAA_MAX + 1]; struct nlattr *kind; - if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX, - NULL, NULL) < 0) - return NULL; tb1 = nla[TCA_ACT_TAB]; if (tb1 == NULL) return NULL; @@ -1157,8 +1164,18 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) struct tc_action_ops *a_o; int ret = 0; struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); - struct nlattr *kind = find_dump_kind(cb->nlh); + struct nlattr *tb[TCA_ROOT_MAX + 1]; + struct nlattr *count_attr = NULL; + struct nlattr *kind = NULL; + struct nla_bitfield32 bf; + u32 act_count = 0; + + ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX, + tcaa_policy, NULL); + if (ret < 0) + return ret; + kind = find_dump_kind(tb); if (kind == NULL) { pr_info("tc_dump_action: action bad kind\n"); return 0; @@ -1168,14 +1185,24 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (a_o == NULL) return 0; + cb->args[2] = 0; + if (tb[TCA_ROOT_FLAGS]) { + bf = nla_get_bitfield32(tb[TCA_ROOT_FLAGS]); + cb->args[2] = bf.value; + } + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) goto out_module_put; + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; + count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32)); + if (!count_attr) + goto out_module_put; nest = nla_nest_start(skb, TCA_ACT_TAB); if (nest == NULL) @@ -1188,6 +1215,9 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (ret > 0) { nla_nest_end(skb, nest); ret = skb->len; + act_count = cb->args[1]; + memcpy(nla_data(count_attr), &act_count, sizeof(u32)); + cb->args[1] = 0; } else nlmsg_trim(skb, b); -- cgit v1.2.3 From e62e484df04964ac947c679ef4f00c54ae5395aa Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sun, 30 Jul 2017 13:24:52 -0400 Subject: net sched actions: add time filter for action dumping This patch adds support for filtering based on time since last used. When we are dumping a large number of actions it is useful to have the option of filtering based on when the action was last used to reduce the amount of data crossing to user space. With this patch the user space app sets the TCA_ROOT_TIME_DELTA attribute with the value in milliseconds with "time of interest since now". The kernel converts this to jiffies and does the filtering comparison matching entries that have seen activity since then and returns them to user space. Old kernels and old tc continue to work in legacy mode since they dont specify this attribute. Some example (we have 400 actions bound to 400 filters); at installation time. 
Using updated when tc setting the time of interest to 120 seconds earlier (we see 400 actions): prompt$ hackedtc actions ls action gact since 120000| grep index | wc -l 400 go get some coffee and wait for > 120 seconds and try again: prompt$ hackedtc actions ls action gact since 120000 | grep index | wc -l 0 Lets see a filter bound to one of these actions: .... filter pref 10 u32 filter pref 10 u32 fh 800: ht divisor 1 filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10 (rule hit 2 success 1) match 7f000002/ffffffff at 12 (success 1 ) action order 1: gact action pass random type none pass val 0 index 23 ref 2 bind 1 installed 1145 sec used 802 sec Action statistics: Sent 84 bytes 1 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 .... that coffee took long, no? It was good. Now lets ping -c 1 127.0.0.2, then run the actions again: prompt$ hackedtc actions ls action gact since 120 | grep index | wc -l 1 More details please: prompt$ hackedtc -s actions ls action gact since 120000 action order 0: gact action pass random type none pass val 0 index 23 ref 2 bind 1 installed 1270 sec used 30 sec Action statistics: Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 And the filter? filter pref 10 u32 filter pref 10 u32 fh 800: ht divisor 1 filter pref 10 u32 fh 800::800 order 2048 key ht 800 bkt 0 flowid 1:10 (rule hit 4 success 2) match 7f000002/ffffffff at 12 (success 2 ) action order 1: gact action pass random type none pass val 0 index 23 ref 2 bind 1 installed 1324 sec used 84 sec Action statistics: Sent 168 bytes 2 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 Signed-off-by: Jamal Hadi Salim Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 1 + net/sched/act_api.c | 21 ++++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index bfa80a6164d9..dab7dad9e01a 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -691,6 +691,7 @@ enum { #define TCAA_MAX TCA_ROOT_TAB TCA_ROOT_FLAGS, TCA_ROOT_COUNT, + TCA_ROOT_TIME_DELTA, /* in msecs */ __TCA_ROOT_MAX, #define TCA_ROOT_MAX (__TCA_ROOT_MAX - 1) }; diff --git a/net/sched/act_api.c b/net/sched/act_api.c index d53653a73c4f..f19b118df414 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -111,6 +111,7 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, { int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; u32 act_flags = cb->args[2]; + unsigned long jiffy_since = cb->args[3]; struct nlattr *nest; spin_lock_bh(&hinfo->lock); @@ -128,6 +129,11 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, if (index < s_i) continue; + if (jiffy_since && + time_after(jiffy_since, + (unsigned long)p->tcfa_tm.lastuse)) + continue; + nest = nla_nest_start(skb, n_i); if (nest == NULL) goto nla_put_failure; @@ -145,9 +151,11 @@ static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb, } } done: + if (index >= 0) + cb->args[0] = index + 1; + spin_unlock_bh(&hinfo->lock); if (n_i) { - cb->args[0] += n_i; if (act_flags & TCA_FLAG_LARGE_DUMP_ON) cb->args[1] = n_i; } @@ -1077,6 +1085,7 @@ static u32 tcaa_root_flags_allowed = TCA_FLAG_LARGE_DUMP_ON; static const struct nla_policy tcaa_policy[TCA_ROOT_MAX + 1] = { [TCA_ROOT_FLAGS] = { .type = NLA_BITFIELD32, .validation_data = &tcaa_root_flags_allowed }, + 
[TCA_ROOT_TIME_DELTA] = { .type = NLA_U32 }, }; static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, @@ -1166,8 +1175,10 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) struct tcamsg *t = (struct tcamsg *) nlmsg_data(cb->nlh); struct nlattr *tb[TCA_ROOT_MAX + 1]; struct nlattr *count_attr = NULL; + unsigned long jiffy_since = 0; struct nlattr *kind = NULL; struct nla_bitfield32 bf; + u32 msecs_since = 0; u32 act_count = 0; ret = nlmsg_parse(cb->nlh, sizeof(struct tcamsg), tb, TCA_ROOT_MAX, @@ -1191,15 +1202,23 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) cb->args[2] = bf.value; } + if (tb[TCA_ROOT_TIME_DELTA]) { + msecs_since = nla_get_u32(tb[TCA_ROOT_TIME_DELTA]); + } + nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, cb->nlh->nlmsg_type, sizeof(*t), 0); if (!nlh) goto out_module_put; + if (msecs_since) + jiffy_since = jiffies - msecs_to_jiffies(msecs_since); + t = nlmsg_data(nlh); t->tca_family = AF_UNSPEC; t->tca__pad1 = 0; t->tca__pad2 = 0; + cb->args[3] = jiffy_since; count_attr = nla_reserve(skb, TCA_ROOT_COUNT, sizeof(u32)); if (!count_attr) goto out_module_put; -- cgit v1.2.3 From e46abbcc05aa8a16b0e7f5c94e86d11af9aa2770 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Jul 2017 16:56:41 +0200 Subject: netfilter: nf_tables: Allow table names of up to 255 chars Allocate all table names dynamically to allow for arbitrary lengths but introduce NFT_NAME_MAXLEN as an upper sanity boundary. It's value was chosen to allow using a domain name as per RFC 1035. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 3 +- net/netfilter/nf_tables_api.c | 49 +++++++++++++++++++++++--------- net/netfilter/nf_tables_trace.c | 2 +- 4 files changed, 40 insertions(+), 16 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index bd5be0d691d5..05ecf78ec078 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -957,7 +957,7 @@ struct nft_table { u32 use; u16 flags:14, genmask:2; - char name[NFT_TABLE_MAXNAMELEN]; + char *name; }; enum nft_af_flags { diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 6f0a950e21c3..0b94e572ef16 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1,7 +1,8 @@ #ifndef _LINUX_NF_TABLES_H #define _LINUX_NF_TABLES_H -#define NFT_TABLE_MAXNAMELEN 32 +#define NFT_NAME_MAXLEN 256 +#define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_CHAIN_MAXNAMELEN 32 #define NFT_SET_MAXNAMELEN 32 #define NFT_OBJ_MAXNAMELEN 32 diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index b77ad0813564..c2e392d5e512 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -726,7 +726,10 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, if (table == NULL) goto err2; - nla_strlcpy(table->name, name, NFT_TABLE_MAXNAMELEN); + table->name = nla_strdup(name, GFP_KERNEL); + if (table->name == NULL) + goto err3; + INIT_LIST_HEAD(&table->chains); INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); @@ -735,10 +738,12 @@ static int nf_tables_newtable(struct net *net, struct sock *nlsk, nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla); err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE); if (err < 0) - goto err3; + 
goto err4; list_add_tail_rcu(&table->list, &afi->tables); return 0; +err4: + kfree(table->name); err3: kfree(table); err2: @@ -865,6 +870,7 @@ static void nf_tables_table_destroy(struct nft_ctx *ctx) { BUG_ON(ctx->table->use > 0); + kfree(ctx->table->name); kfree(ctx->table); module_put(ctx->afi->owner); } @@ -1972,7 +1978,7 @@ err: } struct nft_rule_dump_ctx { - char table[NFT_TABLE_MAXNAMELEN]; + char *table; char chain[NFT_CHAIN_MAXNAMELEN]; }; @@ -1997,7 +2003,7 @@ static int nf_tables_dump_rules(struct sk_buff *skb, continue; list_for_each_entry_rcu(table, &afi->tables, list) { - if (ctx && ctx->table[0] && + if (ctx && ctx->table && strcmp(ctx->table, table->name) != 0) continue; @@ -2037,7 +2043,12 @@ done: static int nf_tables_dump_rules_done(struct netlink_callback *cb) { - kfree(cb->data); + struct nft_rule_dump_ctx *ctx = cb->data; + + if (ctx) { + kfree(ctx->table); + kfree(ctx); + } return 0; } @@ -2069,9 +2080,14 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, if (!ctx) return -ENOMEM; - if (nla[NFTA_RULE_TABLE]) - nla_strlcpy(ctx->table, nla[NFTA_RULE_TABLE], - sizeof(ctx->table)); + if (nla[NFTA_RULE_TABLE]) { + ctx->table = nla_strdup(nla[NFTA_RULE_TABLE], + GFP_KERNEL); + if (!ctx->table) { + kfree(ctx); + return -ENOMEM; + } + } if (nla[NFTA_RULE_CHAIN]) nla_strlcpy(ctx->chain, nla[NFTA_RULE_CHAIN], sizeof(ctx->chain)); @@ -4410,7 +4426,7 @@ nla_put_failure: } struct nft_obj_filter { - char table[NFT_OBJ_MAXNAMELEN]; + char *table; u32 type; }; @@ -4475,7 +4491,10 @@ done: static int nf_tables_dump_obj_done(struct netlink_callback *cb) { - kfree(cb->data); + struct nft_obj_filter *filter = cb->data; + + kfree(filter->table); + kfree(filter); return 0; } @@ -4489,9 +4508,13 @@ nft_obj_filter_alloc(const struct nlattr * const nla[]) if (!filter) return ERR_PTR(-ENOMEM); - if (nla[NFTA_OBJ_TABLE]) - nla_strlcpy(filter->table, nla[NFTA_OBJ_TABLE], - NFT_TABLE_MAXNAMELEN); + if (nla[NFTA_OBJ_TABLE]) { + filter->table = nla_strdup(nla[NFTA_OBJ_TABLE], GFP_KERNEL); + if (!filter->table) { + kfree(filter); + return ERR_PTR(-ENOMEM); + } + } if (nla[NFTA_OBJ_TYPE]) filter->type = ntohl(nla_get_be32(nla[NFTA_OBJ_TYPE])); diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 0c3a0049e4aa..62787d985e9d 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -175,7 +175,7 @@ void nft_trace_notify(struct nft_traceinfo *info) return; size = nlmsg_total_size(sizeof(struct nfgenmsg)) + - nla_total_size(NFT_TABLE_MAXNAMELEN) + + nla_total_size(strlen(info->chain->table->name)) + nla_total_size(NFT_CHAIN_MAXNAMELEN) + nla_total_size_64bit(sizeof(__be64)) + /* rule handle */ nla_total_size(sizeof(__be32)) + /* trace type */ -- cgit v1.2.3 From b7263e071aba736cea9e71cdf2e76dfa7aebd039 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Jul 2017 16:56:42 +0200 Subject: netfilter: nf_tables: Allow chain name of up to 255 chars Same conversion as for table names, use NFT_NAME_MAXLEN as upper boundary as well. 
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- include/uapi/linux/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 34 ++++++++++++++++++++++++-------- net/netfilter/nf_tables_trace.c | 27 +++++++++++++++++++++++-- 4 files changed, 54 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 05ecf78ec078..be1610162ee0 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -859,7 +859,7 @@ struct nft_chain { u16 level; u8 flags:6, genmask:2; - char name[NFT_CHAIN_MAXNAMELEN]; + char *name; }; enum nft_chain_type { @@ -1272,7 +1272,7 @@ struct nft_trans_set { struct nft_trans_chain { bool update; - char name[NFT_CHAIN_MAXNAMELEN]; + char *name; struct nft_stats __percpu *stats; u8 policy; }; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 0b94e572ef16..d9c03a8608ee 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -3,7 +3,7 @@ #define NFT_NAME_MAXLEN 256 #define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN -#define NFT_CHAIN_MAXNAMELEN 32 +#define NFT_CHAIN_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_SET_MAXNAMELEN 32 #define NFT_OBJ_MAXNAMELEN 32 #define NFT_USERDATA_MAXLEN 256 diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index c2e392d5e512..747499039709 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1250,8 +1250,10 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) static_branch_dec(&nft_counters_enabled); if (basechain->ops[0].dev != NULL) dev_put(basechain->ops[0].dev); + kfree(chain->name); kfree(basechain); } else { + kfree(chain->name); kfree(chain); } } @@ -1476,8 +1478,13 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, nft_trans_chain_policy(trans) = -1; if (nla[NFTA_CHAIN_HANDLE] && name) { - nla_strlcpy(nft_trans_chain_name(trans), name, - NFT_CHAIN_MAXNAMELEN); + nft_trans_chain_name(trans) = + nla_strdup(name, GFP_KERNEL); + if (!nft_trans_chain_name(trans)) { + kfree(trans); + free_percpu(stats); + return -ENOMEM; + } } list_add_tail(&trans->list, &net->nft.commit_list); return 0; @@ -1544,7 +1551,11 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, INIT_LIST_HEAD(&chain->rules); chain->handle = nf_tables_alloc_handle(table); chain->table = table; - nla_strlcpy(chain->name, name, NFT_CHAIN_MAXNAMELEN); + chain->name = nla_strdup(name, GFP_KERNEL); + if (!chain->name) { + err = -ENOMEM; + goto err1; + } err = nf_tables_register_hooks(net, table, chain, afi->nops); if (err < 0) @@ -1979,7 +1990,7 @@ err: struct nft_rule_dump_ctx { char *table; - char chain[NFT_CHAIN_MAXNAMELEN]; + char *chain; }; static int nf_tables_dump_rules(struct sk_buff *skb, @@ -2047,6 +2058,7 @@ static int nf_tables_dump_rules_done(struct netlink_callback *cb) if (ctx) { kfree(ctx->table); + kfree(ctx->chain); kfree(ctx); } return 0; @@ -2088,9 +2100,15 @@ static int nf_tables_getrule(struct net *net, struct sock *nlsk, return -ENOMEM; } } - if (nla[NFTA_RULE_CHAIN]) - nla_strlcpy(ctx->chain, nla[NFTA_RULE_CHAIN], - sizeof(ctx->chain)); + if (nla[NFTA_RULE_CHAIN]) { + ctx->chain = nla_strdup(nla[NFTA_RULE_CHAIN], + GFP_KERNEL); + if (!ctx->chain) { + kfree(ctx->table); + kfree(ctx); + return -ENOMEM; + } + } c.data = ctx; } @@ -4863,7 +4881,7 @@ static void nft_chain_commit_update(struct 
nft_trans *trans) { struct nft_base_chain *basechain; - if (nft_trans_chain_name(trans)[0]) + if (nft_trans_chain_name(trans)) strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans)); if (!nft_is_base_chain(trans->ctx.chain)) diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 62787d985e9d..e1dc527a493b 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -162,6 +162,27 @@ static int nf_trace_fill_rule_info(struct sk_buff *nlskb, NFTA_TRACE_PAD); } +static bool nft_trace_have_verdict_chain(struct nft_traceinfo *info) +{ + switch (info->type) { + case NFT_TRACETYPE_RETURN: + case NFT_TRACETYPE_RULE: + break; + default: + return false; + } + + switch (info->verdict->code) { + case NFT_JUMP: + case NFT_GOTO: + break; + default: + return false; + } + + return true; +} + void nft_trace_notify(struct nft_traceinfo *info) { const struct nft_pktinfo *pkt = info->pkt; @@ -176,12 +197,11 @@ void nft_trace_notify(struct nft_traceinfo *info) size = nlmsg_total_size(sizeof(struct nfgenmsg)) + nla_total_size(strlen(info->chain->table->name)) + - nla_total_size(NFT_CHAIN_MAXNAMELEN) + + nla_total_size(strlen(info->chain->name)) + nla_total_size_64bit(sizeof(__be64)) + /* rule handle */ nla_total_size(sizeof(__be32)) + /* trace type */ nla_total_size(0) + /* VERDICT, nested */ nla_total_size(sizeof(u32)) + /* verdict code */ - nla_total_size(NFT_CHAIN_MAXNAMELEN) + /* jump target */ nla_total_size(sizeof(u32)) + /* id */ nla_total_size(NFT_TRACETYPE_LL_HSIZE) + nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) + @@ -194,6 +214,9 @@ void nft_trace_notify(struct nft_traceinfo *info) nla_total_size(sizeof(u32)) + /* nfproto */ nla_total_size(sizeof(u32)); /* policy */ + if (nft_trace_have_verdict_chain(info)) + size += nla_total_size(strlen(info->verdict->chain->name)); /* jump target */ + skb = nlmsg_new(size, GFP_ATOMIC); if (!skb) return; -- cgit v1.2.3 From 387454901bd62022ac1b04e15bd8d4fcc60bbed4 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Jul 2017 16:56:43 +0200 Subject: netfilter: nf_tables: Allow set names of up to 255 chars Same conversion as for table names, use NFT_NAME_MAXLEN as upper boundary as well. 
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 18 ++++++++++++++---- 3 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index be1610162ee0..66ba62fa7d90 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -396,7 +396,7 @@ void nft_unregister_set(struct nft_set_type *type); struct nft_set { struct list_head list; struct list_head bindings; - char name[NFT_SET_MAXNAMELEN]; + char *name; u32 ktype; u32 dtype; u32 objtype; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index d9c03a8608ee..b5e73e80b7b6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -4,7 +4,7 @@ #define NFT_NAME_MAXLEN 256 #define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_CHAIN_MAXNAMELEN NFT_NAME_MAXLEN -#define NFT_SET_MAXNAMELEN 32 +#define NFT_SET_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_OBJ_MAXNAMELEN 32 #define NFT_USERDATA_MAXLEN 256 diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 747499039709..e6a07f27b1a3 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2650,7 +2650,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, unsigned long *inuse; unsigned int n = 0, min = 0; - p = strnchr(name, NFT_SET_MAXNAMELEN, '%'); + p = strchr(name, '%'); if (p != NULL) { if (p[1] != 'd' || strchr(p + 2, '%')) return -EINVAL; @@ -2681,7 +2681,10 @@ cont: free_page((unsigned long)inuse); } - snprintf(set->name, sizeof(set->name), name, min + n); + set->name = kasprintf(GFP_KERNEL, name, min + n); + if (!set->name) + return -ENOMEM; + list_for_each_entry(i, &ctx->table->sets, list) { if (!nft_is_active_next(ctx->net, i)) continue; @@ -2958,7 +2961,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, struct nft_table *table; struct nft_set *set; struct nft_ctx ctx; - char name[NFT_SET_MAXNAMELEN]; + char *name; unsigned int size; bool create; u64 timeout; @@ -3104,8 +3107,14 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, goto err1; } - nla_strlcpy(name, nla[NFTA_SET_NAME], sizeof(set->name)); + name = nla_strdup(nla[NFTA_SET_NAME], GFP_KERNEL); + if (!name) { + err = -ENOMEM; + goto err2; + } + err = nf_tables_set_alloc_name(&ctx, set, name); + kfree(name); if (err < 0) goto err2; @@ -3155,6 +3164,7 @@ static void nft_set_destroy(struct nft_set *set) { set->ops->destroy(set); module_put(set->ops->type->owner); + kfree(set->name); kvfree(set); } -- cgit v1.2.3 From 615095752100748e221028fc96163c2b78185ae4 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Thu, 27 Jul 2017 16:56:44 +0200 Subject: netfilter: nf_tables: Allow object names of up to 255 chars Same conversion as for table names, use NFT_NAME_MAXLEN as upper boundary as well. 
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 11 +++++++++-- 3 files changed, 11 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 66ba62fa7d90..f9795fe394f3 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1016,7 +1016,7 @@ int nft_verdict_dump(struct sk_buff *skb, int type, */ struct nft_object { struct list_head list; - char name[NFT_OBJ_MAXNAMELEN]; + char *name; struct nft_table *table; u32 genmask:2, use:30; diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b5e73e80b7b6..be25cf69295b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -5,7 +5,7 @@ #define NFT_TABLE_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_CHAIN_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_SET_MAXNAMELEN NFT_NAME_MAXLEN -#define NFT_OBJ_MAXNAMELEN 32 +#define NFT_OBJ_MAXNAMELEN NFT_NAME_MAXLEN #define NFT_USERDATA_MAXLEN 256 /** diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index e6a07f27b1a3..149785ff1c7b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4402,15 +4402,21 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, goto err1; } obj->table = table; - nla_strlcpy(obj->name, nla[NFTA_OBJ_NAME], NFT_OBJ_MAXNAMELEN); + obj->name = nla_strdup(nla[NFTA_OBJ_NAME], GFP_KERNEL); + if (!obj->name) { + err = -ENOMEM; + goto err2; + } err = nft_trans_obj_add(&ctx, NFT_MSG_NEWOBJ, obj); if (err < 0) - goto err2; + goto err3; list_add_tail_rcu(&obj->list, &table->objects); table->use++; return 0; +err3: + kfree(obj->name); err2: if (obj->type->destroy) obj->type->destroy(obj); @@ -4626,6 +4632,7 @@ static void nft_obj_destroy(struct nft_object *obj) obj->type->destroy(obj); module_put(obj->type->owner); + kfree(obj->name); kfree(obj); } -- cgit v1.2.3 From 3282e65558b3651e230ee985c174c35cb2fedaf1 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sun, 30 Jul 2017 03:57:23 +0200 Subject: tcp: remove unused mib counters was used by tcp prequeue and header prediction. TCPFORWARDRETRANS use was removed in january. Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- include/uapi/linux/snmp.h | 9 --------- net/ipv4/proc.c | 9 --------- 2 files changed, 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index d85693295798..b3f346fb9fe3 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -184,14 +184,7 @@ enum LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */ LINUX_MIB_LISTENOVERFLOWS, /* ListenOverflows */ LINUX_MIB_LISTENDROPS, /* ListenDrops */ - LINUX_MIB_TCPPREQUEUED, /* TCPPrequeued */ - LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, /* TCPDirectCopyFromBacklog */ - LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, /* TCPDirectCopyFromPrequeue */ - LINUX_MIB_TCPPREQUEUEDROPPED, /* TCPPrequeueDropped */ - LINUX_MIB_TCPHPHITS, /* TCPHPHits */ - LINUX_MIB_TCPHPHITSTOUSER, /* TCPHPHitsToUser */ LINUX_MIB_TCPPUREACKS, /* TCPPureAcks */ - LINUX_MIB_TCPHPACKS, /* TCPHPAcks */ LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */ LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */ LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */ @@ -208,14 +201,12 @@ enum LINUX_MIB_TCPSACKFAILURES, /* TCPSackFailures */ LINUX_MIB_TCPLOSSFAILURES, /* TCPLossFailures */ LINUX_MIB_TCPFASTRETRANS, /* TCPFastRetrans */ - LINUX_MIB_TCPFORWARDRETRANS, /* TCPForwardRetrans */ LINUX_MIB_TCPSLOWSTARTRETRANS, /* TCPSlowStartRetrans */ LINUX_MIB_TCPTIMEOUTS, /* TCPTimeouts */ LINUX_MIB_TCPLOSSPROBES, /* TCPLossProbes */ LINUX_MIB_TCPLOSSPROBERECOVERY, /* TCPLossProbeRecovery */ LINUX_MIB_TCPRENORECOVERYFAIL, /* TCPRenoRecoveryFail */ LINUX_MIB_TCPSACKRECOVERYFAIL, /* TCPSackRecoveryFail */ - LINUX_MIB_TCPSCHEDULERFAILED, /* TCPSchedulerFailed */ LINUX_MIB_TCPRCVCOLLAPSED, /* TCPRcvCollapsed */ LINUX_MIB_TCPDSACKOLDSENT, /* TCPDSACKOldSent */ LINUX_MIB_TCPDSACKOFOSENT, /* TCPDSACKOfoSent */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 43eb6567b3a0..b6d3fe03feb3 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -206,14 +206,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS), SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), - SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED), - SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG), - SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE), - SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED), - SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), - SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER), SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), - SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), @@ -230,14 +223,12 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES), SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS), - SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), SNMP_MIB_ITEM("TCPLossProbes", LINUX_MIB_TCPLOSSPROBES), SNMP_MIB_ITEM("TCPLossProbeRecovery", LINUX_MIB_TCPLOSSPROBERECOVERY), SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), SNMP_MIB_ITEM("TCPSackRecoveryFail", 
LINUX_MIB_TCPSACKRECOVERYFAIL), - SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), -- cgit v1.2.3 From bb7c19f96012720b895111300b9d9f3f858c3a69 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 28 Jul 2017 10:28:21 -0700 Subject: tcp: add related fields into SCM_TIMESTAMPING_OPT_STATS Add the following stats into SCM_TIMESTAMPING_OPT_STATS control msg: TCP_NLA_PACING_RATE TCP_NLA_DELIVERY_RATE TCP_NLA_SND_CWND TCP_NLA_REORDERING TCP_NLA_MIN_RTT TCP_NLA_RECUR_RETRANS TCP_NLA_DELIVERY_RATE_APP_LMT Signed-off-by: Wei Wang Acked-by: Yuchung Cheng Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- include/uapi/linux/tcp.h | 8 ++++++++ net/ipv4/tcp.c | 20 +++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index a5507c977497..030e594bab45 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -231,6 +231,14 @@ enum { TCP_NLA_SNDBUF_LIMITED, /* Time (usec) limited by send buffer */ TCP_NLA_DATA_SEGS_OUT, /* Data pkts sent including retransmission */ TCP_NLA_TOTAL_RETRANS, /* Data pkts retransmitted */ + TCP_NLA_PACING_RATE, /* Pacing rate in bytes per second */ + TCP_NLA_DELIVERY_RATE, /* Delivery rate in bytes per second */ + TCP_NLA_SND_CWND, /* Sending congestion window */ + TCP_NLA_REORDERING, /* Reordering metric */ + TCP_NLA_MIN_RTT, /* minimum RTT */ + TCP_NLA_RECUR_RETRANS, /* Recurring retransmits for the current pkt */ + TCP_NLA_DELIVERY_RATE_APP_LMT, /* delivery rate application limited ? */ + }; /* for TCP_MD5SIG socket option */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index acee7acdcba6..5326b50a3450 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2840,8 +2840,12 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *stats; struct tcp_info info; + u64 rate64; + u32 rate; - stats = alloc_skb(5 * nla_total_size_64bit(sizeof(u64)), GFP_ATOMIC); + stats = alloc_skb(7 * nla_total_size_64bit(sizeof(u64)) + + 3 * nla_total_size(sizeof(u32)) + + 2 * nla_total_size(sizeof(u8)), GFP_ATOMIC); if (!stats) return NULL; @@ -2856,6 +2860,20 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk) tp->data_segs_out, TCP_NLA_PAD); nla_put_u64_64bit(stats, TCP_NLA_TOTAL_RETRANS, tp->total_retrans, TCP_NLA_PAD); + + rate = READ_ONCE(sk->sk_pacing_rate); + rate64 = rate != ~0U ? rate : ~0ULL; + nla_put_u64_64bit(stats, TCP_NLA_PACING_RATE, rate64, TCP_NLA_PAD); + + rate64 = tcp_compute_delivery_rate(tp); + nla_put_u64_64bit(stats, TCP_NLA_DELIVERY_RATE, rate64, TCP_NLA_PAD); + + nla_put_u32(stats, TCP_NLA_SND_CWND, tp->snd_cwnd); + nla_put_u32(stats, TCP_NLA_REORDERING, tp->reordering); + nla_put_u32(stats, TCP_NLA_MIN_RTT, tcp_min_rtt(tp)); + + nla_put_u8(stats, TCP_NLA_RECUR_RETRANS, inet_csk(sk)->icsk_retransmits); + nla_put_u8(stats, TCP_NLA_DELIVERY_RATE_APP_LMT, !!tp->rate_app_limited); return stats; } -- cgit v1.2.3 From 61e4d01e16acddadb9723143637a20417fa67ac9 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:20 +0200 Subject: ipv6: fib: Add offload indication to routes Allow user space applications to see which routes are offloaded and which aren't by setting the RTNH_F_OFFLOAD flag when dumping them. 
To be consistent with IPv4, offload indication is provided on a per-nexthop basis. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/ipv6_route.h | 1 + net/ipv6/route.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index d496c02e14bc..33e2a5732bd1 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -35,6 +35,7 @@ #define RTF_PREF(pref) ((pref) << 27) #define RTF_PREF_MASK 0x18000000 +#define RTF_OFFLOAD 0x20000000 /* offloaded route */ #define RTF_PCPU 0x40000000 /* read-only: can not be set by user */ #define RTF_LOCAL 0x80000000 diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 4d30c96a819d..aba07fce67fb 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1820,6 +1820,11 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } + if (cfg->fc_flags & RTF_OFFLOAD) { + NL_SET_ERR_MSG(extack, "Userspace can not set RTF_OFFLOAD"); + goto out; + } + if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; @@ -3327,6 +3332,9 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, goto nla_put_failure; } + if (rt->rt6i_flags & RTF_OFFLOAD) + *flags |= RTNH_F_OFFLOAD; + /* not needed for multipath encoding b/c it has a rtnexthop struct */ if (!skip_oif && rt->dst.dev && nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) -- cgit v1.2.3 From 52267790ef52d7513879238ca9fac22c1733e0e3 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 3 Aug 2017 16:29:39 -0400 Subject: sock: add MSG_ZEROCOPY The kernel supports zerocopy sendmsg in virtio and tap. Expand the infrastructure to support other socket types. Introduce a completion notification channel over the socket error queue. Notifications are returned with ee_origin SO_EE_ORIGIN_ZEROCOPY. ee_errno is 0 to avoid blocking the send/recv path on receiving notifications. Add reference counting, to support the skb split, merge, resize and clone operations possible with SOCK_STREAM and other socket types. The patch does not yet modify any datapaths. Signed-off-by: Willem de Bruijn Signed-off-by: David S. 
Miller --- include/linux/skbuff.h | 60 +++++++++++++++++++ include/linux/socket.h | 1 + include/net/sock.h | 2 + include/uapi/linux/errqueue.h | 3 + net/core/datagram.c | 55 ++++++++++------- net/core/skbuff.c | 133 ++++++++++++++++++++++++++++++++++++++++++ net/core/sock.c | 2 + 7 files changed, 235 insertions(+), 21 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2f64e2bbb592..59cff7aa494e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -429,6 +429,7 @@ enum { SKBTX_SCHED_TSTAMP = 1 << 6, }; +#define SKBTX_ZEROCOPY_FRAG (SKBTX_DEV_ZEROCOPY | SKBTX_SHARED_FRAG) #define SKBTX_ANY_SW_TSTAMP (SKBTX_SW_TSTAMP | \ SKBTX_SCHED_TSTAMP) #define SKBTX_ANY_TSTAMP (SKBTX_HW_TSTAMP | SKBTX_ANY_SW_TSTAMP) @@ -445,8 +446,28 @@ struct ubuf_info { void (*callback)(struct ubuf_info *, bool zerocopy_success); void *ctx; unsigned long desc; + u16 zerocopy:1; + atomic_t refcnt; }; +#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg)) + +struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size); + +static inline void sock_zerocopy_get(struct ubuf_info *uarg) +{ + atomic_inc(&uarg->refcnt); +} + +void sock_zerocopy_put(struct ubuf_info *uarg); +void sock_zerocopy_put_abort(struct ubuf_info *uarg); + +void sock_zerocopy_callback(struct ubuf_info *uarg, bool success); + +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, + struct msghdr *msg, int len, + struct ubuf_info *uarg); + /* This data is invariant across clones and lives at * the end of the header data, ie. at skb->end. */ @@ -1214,6 +1235,45 @@ static inline struct skb_shared_hwtstamps *skb_hwtstamps(struct sk_buff *skb) return &skb_shinfo(skb)->hwtstamps; } +static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb) +{ + bool is_zcopy = skb && skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY; + + return is_zcopy ? 
skb_uarg(skb) : NULL; +} + +static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg) +{ + if (skb && uarg && !skb_zcopy(skb)) { + sock_zerocopy_get(uarg); + skb_shinfo(skb)->destructor_arg = uarg; + skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG; + } +} + +/* Release a reference on a zerocopy structure */ +static inline void skb_zcopy_clear(struct sk_buff *skb, bool zerocopy) +{ + struct ubuf_info *uarg = skb_zcopy(skb); + + if (uarg) { + uarg->zerocopy = uarg->zerocopy && zerocopy; + sock_zerocopy_put(uarg); + skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; + } +} + +/* Abort a zerocopy operation and revert zckey on error in send syscall */ +static inline void skb_zcopy_abort(struct sk_buff *skb) +{ + struct ubuf_info *uarg = skb_zcopy(skb); + + if (uarg) { + sock_zerocopy_put_abort(uarg); + skb_shinfo(skb)->tx_flags &= ~SKBTX_ZEROCOPY_FRAG; + } +} + /** * skb_queue_empty - check if a queue is empty * @list: queue head diff --git a/include/linux/socket.h b/include/linux/socket.h index 8b13db5163cc..8ad963cdc88c 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -287,6 +287,7 @@ struct ucred { #define MSG_BATCH 0x40000 /* sendmmsg(): more messages coming */ #define MSG_EOF MSG_FIN +#define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through diff --git a/include/net/sock.h b/include/net/sock.h index 0f778d3c4300..fe1a0bc25cd3 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -294,6 +294,7 @@ struct sock_common { * @sk_stamp: time stamp of last packet received * @sk_tsflags: SO_TIMESTAMPING socket options * @sk_tskey: counter to disambiguate concurrent tstamp requests + * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data * @sk_frag: cached page frag @@ -462,6 +463,7 @@ struct sock { u16 sk_tsflags; u8 sk_shutdown; u32 sk_tskey; + atomic_t sk_zckey; struct socket *sk_socket; void *sk_user_data; #ifdef CONFIG_SECURITY diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index 07bdce1f444a..78fdf52d6b2f 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -18,10 +18,13 @@ struct sock_extended_err { #define SO_EE_ORIGIN_ICMP 2 #define SO_EE_ORIGIN_ICMP6 3 #define SO_EE_ORIGIN_TXSTATUS 4 +#define SO_EE_ORIGIN_ZEROCOPY 5 #define SO_EE_ORIGIN_TIMESTAMPING SO_EE_ORIGIN_TXSTATUS #define SO_EE_OFFENDER(ee) ((struct sockaddr*)((ee)+1)) +#define SO_EE_CODE_ZEROCOPY_COPIED 1 + /** * struct scm_timestamping - timestamps exposed through cmsg * diff --git a/net/core/datagram.c b/net/core/datagram.c index ee5647bd91b3..2f3277945d35 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -573,27 +573,12 @@ fault: } EXPORT_SYMBOL(skb_copy_datagram_from_iter); -/** - * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter - * @skb: buffer to copy - * @from: the source to copy from - * - * The function will first copy up to headlen, and then pin the userspace - * pages and build frags through them. - * - * Returns 0, -EFAULT or -EMSGSIZE. 
- */ -int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) +int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length) { - int len = iov_iter_count(from); - int copy = min_t(int, skb_headlen(skb), len); - int frag = 0; + int frag = skb_shinfo(skb)->nr_frags; - /* copy up to skb headlen */ - if (skb_copy_datagram_from_iter(skb, 0, from, copy)) - return -EFAULT; - - while (iov_iter_count(from)) { + while (length && iov_iter_count(from)) { struct page *pages[MAX_SKB_FRAGS]; size_t start; ssize_t copied; @@ -603,18 +588,24 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) if (frag == MAX_SKB_FRAGS) return -EMSGSIZE; - copied = iov_iter_get_pages(from, pages, ~0U, + copied = iov_iter_get_pages(from, pages, length, MAX_SKB_FRAGS - frag, &start); if (copied < 0) return -EFAULT; iov_iter_advance(from, copied); + length -= copied; truesize = PAGE_ALIGN(copied + start); skb->data_len += copied; skb->len += copied; skb->truesize += truesize; - refcount_add(truesize, &skb->sk->sk_wmem_alloc); + if (sk && sk->sk_type == SOCK_STREAM) { + sk->sk_wmem_queued += truesize; + sk_mem_charge(sk, truesize); + } else { + refcount_add(truesize, &skb->sk->sk_wmem_alloc); + } while (copied) { int size = min_t(int, copied, PAGE_SIZE - start); skb_fill_page_desc(skb, frag++, pages[n], start, size); @@ -625,6 +616,28 @@ int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) } return 0; } +EXPORT_SYMBOL(__zerocopy_sg_from_iter); + +/** + * zerocopy_sg_from_iter - Build a zerocopy datagram from an iov_iter + * @skb: buffer to copy + * @from: the source to copy from + * + * The function will first copy up to headlen, and then pin the userspace + * pages and build frags through them. + * + * Returns 0, -EFAULT or -EMSGSIZE. 
+ */ +int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *from) +{ + int copy = min_t(int, skb_headlen(skb), iov_iter_count(from)); + + /* copy up to skb headlen */ + if (skb_copy_datagram_from_iter(skb, 0, from, copy)) + return -EFAULT; + + return __zerocopy_sg_from_iter(NULL, skb, from, ~0U); +} EXPORT_SYMBOL(zerocopy_sg_from_iter); static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index a95877a8ac8b..0603e44950da 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -915,6 +915,139 @@ struct sk_buff *skb_morph(struct sk_buff *dst, struct sk_buff *src) } EXPORT_SYMBOL_GPL(skb_morph); +struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size) +{ + struct ubuf_info *uarg; + struct sk_buff *skb; + + WARN_ON_ONCE(!in_task()); + + skb = sock_omalloc(sk, 0, GFP_KERNEL); + if (!skb) + return NULL; + + BUILD_BUG_ON(sizeof(*uarg) > sizeof(skb->cb)); + uarg = (void *)skb->cb; + + uarg->callback = sock_zerocopy_callback; + uarg->desc = atomic_inc_return(&sk->sk_zckey) - 1; + uarg->zerocopy = 1; + atomic_set(&uarg->refcnt, 0); + sock_hold(sk); + + return uarg; +} +EXPORT_SYMBOL_GPL(sock_zerocopy_alloc); + +static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) +{ + return container_of((void *)uarg, struct sk_buff, cb); +} + +void sock_zerocopy_callback(struct ubuf_info *uarg, bool success) +{ + struct sk_buff *skb = skb_from_uarg(uarg); + struct sock_exterr_skb *serr; + struct sock *sk = skb->sk; + u16 id = uarg->desc; + + if (sock_flag(sk, SOCK_DEAD)) + goto release; + + serr = SKB_EXT_ERR(skb); + memset(serr, 0, sizeof(*serr)); + serr->ee.ee_errno = 0; + serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY; + serr->ee.ee_data = id; + if (!success) + serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED; + + skb_queue_tail(&sk->sk_error_queue, skb); + skb = NULL; + + sk->sk_error_report(sk); + +release: + consume_skb(skb); + sock_put(sk); +} +EXPORT_SYMBOL_GPL(sock_zerocopy_callback); + +void sock_zerocopy_put(struct ubuf_info *uarg) +{ + if (uarg && atomic_dec_and_test(&uarg->refcnt)) { + if (uarg->callback) + uarg->callback(uarg, uarg->zerocopy); + else + consume_skb(skb_from_uarg(uarg)); + } +} +EXPORT_SYMBOL_GPL(sock_zerocopy_put); + +void sock_zerocopy_put_abort(struct ubuf_info *uarg) +{ + if (uarg) { + struct sock *sk = skb_from_uarg(uarg)->sk; + + atomic_dec(&sk->sk_zckey); + + /* sock_zerocopy_put expects a ref. Most sockets take one per + * skb, which is zero on abort. tcp_sendmsg holds one extra, to + * avoid an skb send inside the main loop triggering uarg free. + */ + if (sk->sk_type != SOCK_STREAM) + atomic_inc(&uarg->refcnt); + + sock_zerocopy_put(uarg); + } +} +EXPORT_SYMBOL_GPL(sock_zerocopy_put_abort); + +extern int __zerocopy_sg_from_iter(struct sock *sk, struct sk_buff *skb, + struct iov_iter *from, size_t length); + +int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb, + struct msghdr *msg, int len, + struct ubuf_info *uarg) +{ + struct iov_iter orig_iter = msg->msg_iter; + int err, orig_len = skb->len; + + err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len); + if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) { + /* Streams do not free skb on error. Reset to prev state. 
*/ + msg->msg_iter = orig_iter; + ___pskb_trim(skb, orig_len); + return err; + } + + skb_zcopy_set(skb, uarg); + return skb->len - orig_len; +} +EXPORT_SYMBOL_GPL(skb_zerocopy_iter_stream); + +/* unused only until next patch in the series; will remove attribute */ +static int __attribute__((unused)) + skb_zerocopy_clone(struct sk_buff *nskb, struct sk_buff *orig, + gfp_t gfp_mask) +{ + if (skb_zcopy(orig)) { + if (skb_zcopy(nskb)) { + /* !gfp_mask callers are verified to !skb_zcopy(nskb) */ + if (!gfp_mask) { + WARN_ON_ONCE(1); + return -ENOMEM; + } + if (skb_uarg(nskb) == skb_uarg(orig)) + return 0; + if (skb_copy_ubufs(nskb, GFP_ATOMIC)) + return -EIO; + } + skb_zcopy_set(nskb, skb_uarg(orig)); + } + return 0; +} + /** * skb_copy_ubufs - copy userspace skb frags buffers to kernel * @skb: the skb to modify diff --git a/net/core/sock.c b/net/core/sock.c index 1261880bdcc8..e8b696858cad 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1670,6 +1670,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_drops, 0); newsk->sk_send_head = NULL; newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + atomic_set(&newsk->sk_zckey, 0); sock_reset_flag(newsk, SOCK_DONE); @@ -2722,6 +2723,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_stamp = SK_DEFAULT_STAMP; + atomic_set(&sk->sk_zckey, 0); #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; -- cgit v1.2.3 From 56ce097c1caede1f9c191a7c9699b950e7c36ad9 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 4 Aug 2017 08:24:05 -0700 Subject: net: comment fixes against BPF devmap helper calls Update BPF comments to accurately reflect XDP usage. Fixes: 97f91a7cf04ff ("bpf: add bpf_redirect_map helper routine") Reported-by: Alexei Starovoitov Signed-off-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1106a8c4cd36..1d06be1569b1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -345,14 +345,20 @@ union bpf_attr { * int bpf_redirect(ifindex, flags) * redirect to another netdev * @ifindex: ifindex of the net device - * @flags: bit 0 - if set, redirect to ingress instead of egress - * other bits - reserved - * Return: TC_ACT_REDIRECT - * int bpf_redirect_map(key, map, flags) + * @flags: + * cls_bpf: + * bit 0 - if set, redirect to ingress instead of egress + * other bits - reserved + * xdp_bpf: + * all bits - reserved + * Return: cls_bpf: TC_ACT_REDIRECT on success or TC_ACT_SHOT on error + * xdp_bfp: XDP_REDIRECT on success or XDP_ABORT on error + * int bpf_redirect_map(map, key, flags) * redirect to endpoint in map + * @map: pointer to dev map * @key: index in map to lookup - * @map: fd of map to do lookup in * @flags: -- + * Return: XDP_REDIRECT on success or XDP_ABORT on error * * u32 bpf_get_route_realm(skb) * retrieve a dst's tclassid -- cgit v1.2.3 From d1df6fd8a1d22d37cffa0075ab8ad423ce656777 Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Sat, 5 Aug 2017 12:38:26 +0200 Subject: ipv6: sr: define core operations for seg6local lightweight tunnel This patch implements a new type of lightweight tunnel named seg6local. A seg6local lwt is defined by a type of action and a set of parameters. 
The action represents the operation to perform on the packets matching the lwt's route, and is not necessarily an encapsulation. The set of parameters are arguments for the processing function. Each action is defined in a struct seg6_action_desc within seg6_action_table[]. This structure contains the action, mandatory attributes, the processing function, and a static headroom size required by the action. The mandatory attributes are encoded as a bitmask field. The static headroom is set to a non-zero value when the processing function always add a constant number of bytes to the skb (e.g. the header size for encapsulations). To facilitate rtnetlink-related operations such as parsing, fill_encap, and cmp_encap, each type of action parameter is associated to three function pointers, in seg6_action_params[]. All actions defined in seg6_local.h are detailed in [1]. [1] https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-01 Signed-off-by: David Lebrun Signed-off-by: David S. Miller --- include/linux/seg6_local.h | 6 + include/net/seg6.h | 2 + include/uapi/linux/lwtunnel.h | 1 + include/uapi/linux/seg6_local.h | 68 +++++++++ net/core/lwtunnel.c | 2 + net/ipv6/Kconfig | 3 +- net/ipv6/Makefile | 2 +- net/ipv6/seg6.c | 5 + net/ipv6/seg6_local.c | 320 ++++++++++++++++++++++++++++++++++++++++ 9 files changed, 407 insertions(+), 2 deletions(-) create mode 100644 include/linux/seg6_local.h create mode 100644 include/uapi/linux/seg6_local.h create mode 100644 net/ipv6/seg6_local.c (limited to 'include/uapi/linux') diff --git a/include/linux/seg6_local.h b/include/linux/seg6_local.h new file mode 100644 index 000000000000..ee63e76fe0c7 --- /dev/null +++ b/include/linux/seg6_local.h @@ -0,0 +1,6 @@ +#ifndef _LINUX_SEG6_LOCAL_H +#define _LINUX_SEG6_LOCAL_H + +#include + +#endif diff --git a/include/net/seg6.h b/include/net/seg6.h index a32abb040e1d..5379f550f521 100644 --- a/include/net/seg6.h +++ b/include/net/seg6.h @@ -56,6 +56,8 @@ extern int seg6_init(void); extern void seg6_exit(void); extern int seg6_iptunnel_init(void); extern void seg6_iptunnel_exit(void); +extern int seg6_local_init(void); +extern void seg6_local_exit(void); extern bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len); extern int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh); diff --git a/include/uapi/linux/lwtunnel.h b/include/uapi/linux/lwtunnel.h index 92724cba1eba..7fdd19ca7511 100644 --- a/include/uapi/linux/lwtunnel.h +++ b/include/uapi/linux/lwtunnel.h @@ -11,6 +11,7 @@ enum lwtunnel_encap_types { LWTUNNEL_ENCAP_IP6, LWTUNNEL_ENCAP_SEG6, LWTUNNEL_ENCAP_BPF, + LWTUNNEL_ENCAP_SEG6_LOCAL, __LWTUNNEL_ENCAP_MAX, }; diff --git a/include/uapi/linux/seg6_local.h b/include/uapi/linux/seg6_local.h new file mode 100644 index 000000000000..ef2d8c3e76c1 --- /dev/null +++ b/include/uapi/linux/seg6_local.h @@ -0,0 +1,68 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _UAPI_LINUX_SEG6_LOCAL_H +#define _UAPI_LINUX_SEG6_LOCAL_H + +#include + +enum { + SEG6_LOCAL_UNSPEC, + SEG6_LOCAL_ACTION, + SEG6_LOCAL_SRH, + SEG6_LOCAL_TABLE, + SEG6_LOCAL_NH4, + SEG6_LOCAL_NH6, + SEG6_LOCAL_IIF, + SEG6_LOCAL_OIF, + __SEG6_LOCAL_MAX, +}; +#define SEG6_LOCAL_MAX (__SEG6_LOCAL_MAX - 1) + +enum { + SEG6_LOCAL_ACTION_UNSPEC = 0, + /* node segment */ + SEG6_LOCAL_ACTION_END = 1, + /* adjacency segment (IPv6 cross-connect) */ + SEG6_LOCAL_ACTION_END_X = 2, + /* lookup of next seg NH in table */ + SEG6_LOCAL_ACTION_END_T = 3, + /* decap and L2 cross-connect */ + SEG6_LOCAL_ACTION_END_DX2 = 4, + /* decap and IPv6 cross-connect */ + SEG6_LOCAL_ACTION_END_DX6 = 5, + /* decap and IPv4 cross-connect */ + SEG6_LOCAL_ACTION_END_DX4 = 6, + /* decap and lookup of DA in v6 table */ + SEG6_LOCAL_ACTION_END_DT6 = 7, + /* decap and lookup of DA in v4 table */ + SEG6_LOCAL_ACTION_END_DT4 = 8, + /* binding segment with insertion */ + SEG6_LOCAL_ACTION_END_B6 = 9, + /* binding segment with encapsulation */ + SEG6_LOCAL_ACTION_END_B6_ENCAP = 10, + /* binding segment with MPLS encap */ + SEG6_LOCAL_ACTION_END_BM = 11, + /* lookup last seg in table */ + SEG6_LOCAL_ACTION_END_S = 12, + /* forward to SR-unaware VNF with static proxy */ + SEG6_LOCAL_ACTION_END_AS = 13, + /* forward to SR-unaware VNF with masquerading */ + SEG6_LOCAL_ACTION_END_AM = 14, + + __SEG6_LOCAL_ACTION_MAX, +}; + +#define SEG6_LOCAL_ACTION_MAX (__SEG6_LOCAL_ACTION_MAX - 1) + +#endif diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 435f35f9a61c..0b171756453c 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -44,6 +44,8 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "SEG6"; case LWTUNNEL_ENCAP_BPF: return "BPF"; + case LWTUNNEL_ENCAP_SEG6_LOCAL: + return "SEG6LOCAL"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: case LWTUNNEL_ENCAP_NONE: diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index 50181a96923e..0d722396dce6 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -311,7 +311,8 @@ config IPV6_SEG6_LWTUNNEL ---help--- Support for encapsulation of packets within an outer IPv6 header and a Segment Routing Header using the lightweight - tunnels mechanism. + tunnels mechanism. Also enable support for advanced local + processing of SRv6 packets based on their active segment. If unsure, say N. 
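To illustrate the action table described in the commit message above (an editorial sketch, not part of this patch): each seg6_action_desc entry names the action, the bitmask of mandatory attributes and the processing function, and parse_nla_action() rejects a configuration that omits a mandatory attribute or selects an action with no handler. A hypothetical follow-up entry that requires the SEG6_LOCAL_TABLE attribute could be declared along these lines; input_end_t() is a placeholder name, not code from this series:

static int input_end_t(struct sk_buff *skb, struct seg6_local_lwt *slwt)
{
	/* sketch: consume the active segment, then do the next-hop
	 * lookup in the routing table given by slwt->table
	 */
	return 0;
}

static struct seg6_action_desc seg6_action_table[] = {
	{
		.action = SEG6_LOCAL_ACTION_END,
		.attrs  = 0,
	},
	{
		/* hypothetical entry: End.T needs a table to look up into */
		.action = SEG6_LOCAL_ACTION_END_T,
		.attrs  = (1 << SEG6_LOCAL_TABLE),
		.input  = input_end_t,
	},
};
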
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index f8b24c2e0d77..10e342363793 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -23,7 +23,7 @@ ipv6-$(CONFIG_IPV6_MULTIPLE_TABLES) += fib6_rules.o ipv6-$(CONFIG_PROC_FS) += proc.o ipv6-$(CONFIG_SYN_COOKIES) += syncookies.o ipv6-$(CONFIG_NETLABEL) += calipso.o -ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o +ipv6-$(CONFIG_IPV6_SEG6_LWTUNNEL) += seg6_iptunnel.o seg6_local.o ipv6-$(CONFIG_IPV6_SEG6_HMAC) += seg6_hmac.o ipv6-objs += $(ipv6-y) diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 81c2339b3285..c81407770956 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -456,6 +456,10 @@ int __init seg6_init(void) err = seg6_iptunnel_init(); if (err) goto out_unregister_pernet; + + err = seg6_local_init(); + if (err) + goto out_unregister_pernet; #endif #ifdef CONFIG_IPV6_SEG6_HMAC @@ -471,6 +475,7 @@ out: #ifdef CONFIG_IPV6_SEG6_HMAC out_unregister_iptun: #ifdef CONFIG_IPV6_SEG6_LWTUNNEL + seg6_local_exit(); seg6_iptunnel_exit(); #endif #endif diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c new file mode 100644 index 000000000000..53615d7e0723 --- /dev/null +++ b/net/ipv6/seg6_local.c @@ -0,0 +1,320 @@ +/* + * SR-IPv6 implementation + * + * Author: + * David Lebrun + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_IPV6_SEG6_HMAC +#include +#endif + +struct seg6_local_lwt; + +struct seg6_action_desc { + int action; + unsigned long attrs; + int (*input)(struct sk_buff *skb, struct seg6_local_lwt *slwt); + int static_headroom; +}; + +struct seg6_local_lwt { + int action; + struct ipv6_sr_hdr *srh; + int table; + struct in_addr nh4; + struct in6_addr nh6; + int iif; + int oif; + + int headroom; + struct seg6_action_desc *desc; +}; + +static struct seg6_local_lwt *seg6_local_lwtunnel(struct lwtunnel_state *lwt) +{ + return (struct seg6_local_lwt *)lwt->data; +} + +static struct seg6_action_desc seg6_action_table[] = { + { + .action = SEG6_LOCAL_ACTION_END, + .attrs = 0, + }, +}; + +static struct seg6_action_desc *__get_action_desc(int action) +{ + struct seg6_action_desc *desc; + int i, count; + + count = sizeof(seg6_action_table) / sizeof(struct seg6_action_desc); + for (i = 0; i < count; i++) { + desc = &seg6_action_table[i]; + if (desc->action == action) + return desc; + } + + return NULL; +} + +static int seg6_local_input(struct sk_buff *skb) +{ + struct dst_entry *orig_dst = skb_dst(skb); + struct seg6_action_desc *desc; + struct seg6_local_lwt *slwt; + + slwt = seg6_local_lwtunnel(orig_dst->lwtstate); + desc = slwt->desc; + + return desc->input(skb, slwt); +} + +static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { + [SEG6_LOCAL_ACTION] = { .type = NLA_U32 }, + [SEG6_LOCAL_SRH] = { .type = NLA_BINARY }, + [SEG6_LOCAL_TABLE] = { .type = NLA_U32 }, + [SEG6_LOCAL_NH4] = { .type = NLA_BINARY, + .len = sizeof(struct in_addr) }, + [SEG6_LOCAL_NH6] = { .type = NLA_BINARY, + .len = sizeof(struct in6_addr) }, + [SEG6_LOCAL_IIF] = { .type = NLA_U32 }, + [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, +}; + +struct seg6_action_param { + int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt); + int (*put)(struct 
sk_buff *skb, struct seg6_local_lwt *slwt); + int (*cmp)(struct seg6_local_lwt *a, struct seg6_local_lwt *b); +}; + +static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = { + [SEG6_LOCAL_SRH] = { .parse = NULL, + .put = NULL, + .cmp = NULL }, + + [SEG6_LOCAL_TABLE] = { .parse = NULL, + .put = NULL, + .cmp = NULL }, + + [SEG6_LOCAL_NH4] = { .parse = NULL, + .put = NULL, + .cmp = NULL }, + + [SEG6_LOCAL_NH6] = { .parse = NULL, + .put = NULL, + .cmp = NULL }, + + [SEG6_LOCAL_IIF] = { .parse = NULL, + .put = NULL, + .cmp = NULL }, + + [SEG6_LOCAL_OIF] = { .parse = NULL, + .put = NULL, + .cmp = NULL }, +}; + +static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) +{ + struct seg6_action_param *param; + struct seg6_action_desc *desc; + int i, err; + + desc = __get_action_desc(slwt->action); + if (!desc) + return -EINVAL; + + if (!desc->input) + return -EOPNOTSUPP; + + slwt->desc = desc; + slwt->headroom += desc->static_headroom; + + for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { + if (desc->attrs & (1 << i)) { + if (!attrs[i]) + return -EINVAL; + + param = &seg6_action_params[i]; + + err = param->parse(attrs, slwt); + if (err < 0) + return err; + } + } + + return 0; +} + +static int seg6_local_build_state(struct nlattr *nla, unsigned int family, + const void *cfg, struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[SEG6_LOCAL_MAX + 1]; + struct lwtunnel_state *newts; + struct seg6_local_lwt *slwt; + int err; + + err = nla_parse_nested(tb, SEG6_LOCAL_MAX, nla, seg6_local_policy, + extack); + + if (err < 0) + return err; + + if (!tb[SEG6_LOCAL_ACTION]) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*slwt)); + if (!newts) + return -ENOMEM; + + slwt = seg6_local_lwtunnel(newts); + slwt->action = nla_get_u32(tb[SEG6_LOCAL_ACTION]); + + err = parse_nla_action(tb, slwt); + if (err < 0) + goto out_free; + + newts->type = LWTUNNEL_ENCAP_SEG6_LOCAL; + newts->flags = LWTUNNEL_STATE_INPUT_REDIRECT; + newts->headroom = slwt->headroom; + + *ts = newts; + + return 0; + +out_free: + kfree(slwt->srh); + kfree(newts); + return err; +} + +static void seg6_local_destroy_state(struct lwtunnel_state *lwt) +{ + struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); + + kfree(slwt->srh); +} + +static int seg6_local_fill_encap(struct sk_buff *skb, + struct lwtunnel_state *lwt) +{ + struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); + struct seg6_action_param *param; + int i, err; + + if (nla_put_u32(skb, SEG6_LOCAL_ACTION, slwt->action)) + return -EMSGSIZE; + + for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { + if (slwt->desc->attrs & (1 << i)) { + param = &seg6_action_params[i]; + err = param->put(skb, slwt); + if (err < 0) + return err; + } + } + + return 0; +} + +static int seg6_local_get_encap_size(struct lwtunnel_state *lwt) +{ + struct seg6_local_lwt *slwt = seg6_local_lwtunnel(lwt); + unsigned long attrs; + int nlsize; + + nlsize = nla_total_size(4); /* action */ + + attrs = slwt->desc->attrs; + + if (attrs & (1 << SEG6_LOCAL_SRH)) + nlsize += nla_total_size((slwt->srh->hdrlen + 1) << 3); + + if (attrs & (1 << SEG6_LOCAL_TABLE)) + nlsize += nla_total_size(4); + + if (attrs & (1 << SEG6_LOCAL_NH4)) + nlsize += nla_total_size(4); + + if (attrs & (1 << SEG6_LOCAL_NH6)) + nlsize += nla_total_size(16); + + if (attrs & (1 << SEG6_LOCAL_IIF)) + nlsize += nla_total_size(4); + + if (attrs & (1 << SEG6_LOCAL_OIF)) + nlsize += nla_total_size(4); + + return nlsize; +} + +static int seg6_local_cmp_encap(struct lwtunnel_state *a, + struct 
lwtunnel_state *b) +{ + struct seg6_local_lwt *slwt_a, *slwt_b; + struct seg6_action_param *param; + int i; + + slwt_a = seg6_local_lwtunnel(a); + slwt_b = seg6_local_lwtunnel(b); + + if (slwt_a->action != slwt_b->action) + return 1; + + if (slwt_a->desc->attrs != slwt_b->desc->attrs) + return 1; + + for (i = 0; i < SEG6_LOCAL_MAX + 1; i++) { + if (slwt_a->desc->attrs & (1 << i)) { + param = &seg6_action_params[i]; + if (param->cmp(slwt_a, slwt_b)) + return 1; + } + } + + return 0; +} + +static const struct lwtunnel_encap_ops seg6_local_ops = { + .build_state = seg6_local_build_state, + .destroy_state = seg6_local_destroy_state, + .input = seg6_local_input, + .fill_encap = seg6_local_fill_encap, + .get_encap_size = seg6_local_get_encap_size, + .cmp_encap = seg6_local_cmp_encap, + .owner = THIS_MODULE, +}; + +int __init seg6_local_init(void) +{ + return lwtunnel_encap_add_ops(&seg6_local_ops, + LWTUNNEL_ENCAP_SEG6_LOCAL); +} + +void seg6_local_exit(void) +{ + lwtunnel_encap_del_ops(&seg6_local_ops, LWTUNNEL_ENCAP_SEG6_LOCAL); +} -- cgit v1.2.3 From 92b31a9af73b3a3fc801899335d6c47966351830 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 10 Aug 2017 01:39:55 +0200 Subject: bpf: add BPF_J{LT,LE,SLT,SLE} instructions Currently, eBPF only understands BPF_JGT (>), BPF_JGE (>=), BPF_JSGT (s>), BPF_JSGE (s>=) instructions, this means that particularly *JLT/*JLE counterparts involving immediates need to be rewritten from e.g. X < [IMM] by swapping arguments into [IMM] > X, meaning the immediate first is required to be loaded into a register Y := [IMM], such that then we can compare with Y > X. Note that the destination operand is always required to be a register. This has the downside of having unnecessarily increased register pressure, meaning complex program would need to spill other registers temporarily to stack in order to obtain an unused register for the [IMM]. Loading to registers will thus also affect state pruning since we need to account for that register use and potentially those registers that had to be spilled/filled again. As a consequence slightly more stack space might have been used due to spilling, and BPF programs are a bit longer due to extra code involving the register load and potentially required spill/fills. Thus, add BPF_JLT (<), BPF_JLE (<=), BPF_JSLT (s<), BPF_JSLE (s<=) counterparts to the eBPF instruction set. Modifying LLVM to remove the NegateCC() workaround in a PoC patch at [1] and allowing it to also emit the new instructions resulted in cilium's BPF programs that are injected into the fast-path to have a reduced program length in the range of 2-3% (e.g. accumulated main and tail call sections from one of the object file reduced from 4864 to 4729 insns), reduced complexity in the range of 10-30% (e.g. accumulated sections reduced in one of the cases from 116432 to 88428 insns), and reduced stack usage in the range of 1-5% (e.g. accumulated sections from one of the object files reduced from 824 to 784b). The modification for LLVM will be incorporated in a backwards compatible way. Plan is for LLVM to have i) a target specific option to offer a possibility to explicitly enable the extension by the user (as we have with -m target specific extensions today for various CPU insns), and ii) have the kernel checked for presence of the extensions and enable them transparently when the user is selecting more aggressive options such as -march=native in a bpf target context. (Other frontends generating BPF byte code, e.g. 
ply can probe the kernel directly for its code generation.) [1] https://github.com/borkmann/llvm/tree/bpf-insns Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- Documentation/networking/filter.txt | 4 + include/uapi/linux/bpf.h | 5 + kernel/bpf/core.c | 60 ++++++ lib/test_bpf.c | 364 ++++++++++++++++++++++++++++++++++++ net/core/filter.c | 21 ++- tools/include/uapi/linux/bpf.h | 5 + 6 files changed, 455 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index d0fdba7d66e2..6a0df8df6c43 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -906,6 +906,10 @@ If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of: BPF_JSGE 0x70 /* eBPF only: signed '>=' */ BPF_CALL 0x80 /* eBPF only: function call */ BPF_EXIT 0x90 /* eBPF only: function return */ + BPF_JLT 0xa0 /* eBPF only: unsigned '<' */ + BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */ + BPF_JSLT 0xc0 /* eBPF only: signed '<' */ + BPF_JSLE 0xd0 /* eBPF only: signed '<=' */ So BPF_ADD | BPF_X | BPF_ALU means 32-bit addition in both classic BPF and eBPF. There are only two registers in classic BPF, so it means A += X. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1d06be1569b1..91da8371a2d0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -30,9 +30,14 @@ #define BPF_FROM_LE BPF_TO_LE #define BPF_FROM_BE BPF_TO_BE +/* jmp encodings */ #define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ #define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ #define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ad5f55922a13..c69e7f5bfde7 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -595,9 +595,13 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, case BPF_JMP | BPF_JEQ | BPF_K: case BPF_JMP | BPF_JNE | BPF_K: case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JLT | BPF_K: case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JLE | BPF_K: case BPF_JMP | BPF_JSGT | BPF_K: + case BPF_JMP | BPF_JSLT | BPF_K: case BPF_JMP | BPF_JSGE | BPF_K: + case BPF_JMP | BPF_JSLE | BPF_K: case BPF_JMP | BPF_JSET | BPF_K: /* Accommodate for extra offset in case of a backjump. 
*/ off = from->off; @@ -833,12 +837,20 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, + [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, + [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, + [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, + [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, + [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, + [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, + [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, + [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, /* Program return */ @@ -1073,6 +1085,18 @@ out: CONT_JMP; } CONT; + JMP_JLT_X: + if (DST < SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JLT_K: + if (DST < IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JGE_X: if (DST >= SRC) { insn += insn->off; @@ -1085,6 +1109,18 @@ out: CONT_JMP; } CONT; + JMP_JLE_X: + if (DST <= SRC) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JLE_K: + if (DST <= IMM) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSGT_X: if (((s64) DST) > ((s64) SRC)) { insn += insn->off; @@ -1097,6 +1133,18 @@ out: CONT_JMP; } CONT; + JMP_JSLT_X: + if (((s64) DST) < ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSLT_K: + if (((s64) DST) < ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSGE_X: if (((s64) DST) >= ((s64) SRC)) { insn += insn->off; @@ -1109,6 +1157,18 @@ out: CONT_JMP; } CONT; + JMP_JSLE_X: + if (((s64) DST) <= ((s64) SRC)) { + insn += insn->off; + CONT_JMP; + } + CONT; + JMP_JSLE_K: + if (((s64) DST) <= ((s64) IMM)) { + insn += insn->off; + CONT_JMP; + } + CONT; JMP_JSET_X: if (DST & SRC) { insn += insn->off; diff --git a/lib/test_bpf.c b/lib/test_bpf.c index d9d5a410955c..aa8812ae6776 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -951,6 +951,32 @@ static struct bpf_test tests[] = { { 4, 4, 4, 3, 3 }, { { 2, 0 }, { 3, 1 }, { 4, MAX_K } }, }, + { + "JGE (jt 0), test 1", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_LEN, 0), + BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 2), + BPF_JUMP(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 1), + BPF_STMT(BPF_RET | BPF_K, 1), + BPF_STMT(BPF_RET | BPF_K, MAX_K) + }, + CLASSIC, + { 4, 4, 4, 3, 3 }, + { { 2, 0 }, { 3, 1 }, { 4, 1 } }, + }, + { + "JGE (jt 0), test 2", + .u.insns = { + BPF_STMT(BPF_LDX | BPF_LEN, 0), + BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 2), + BPF_JUMP(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 1), + BPF_STMT(BPF_RET | BPF_K, 1), + BPF_STMT(BPF_RET | BPF_K, MAX_K) + }, + CLASSIC, + { 4, 4, 5, 3, 3 }, + { { 4, 1 }, { 5, 1 }, { 6, MAX_K } }, + }, { "JGE", .u.insns = { @@ -4492,6 +4518,35 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLT | BPF_K */ + { + "JMP_JSLT_K: Signed jump: if (-2 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 0xfffffffffffffffeLL), + BPF_JMP_IMM(BPF_JSLT, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLT_K: Signed jump: if (-1 < -1) return 0", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_LD_IMM64(R1, 0xffffffffffffffffLL), + 
BPF_JMP_IMM(BPF_JSLT, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSGT | BPF_K */ { "JMP_JSGT_K: Signed jump: if (-1 > -2) return 1", @@ -4521,6 +4576,73 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLE | BPF_K */ + { + "JMP_JSLE_K: Signed jump: if (-2 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 0xfffffffffffffffeLL), + BPF_JMP_IMM(BPF_JSLE, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_K: Signed jump: if (-1 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 0xffffffffffffffffLL), + BPF_JMP_IMM(BPF_JSLE, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_K: Signed jump: value walk 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 6), + BPF_ALU64_IMM(BPF_SUB, R1, 1), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 4), + BPF_ALU64_IMM(BPF_SUB, R1, 1), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 2), + BPF_ALU64_IMM(BPF_SUB, R1, 1), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 1), + BPF_EXIT_INSN(), /* bad exit */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* good exit */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_K: Signed jump: value walk 2", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 4), + BPF_ALU64_IMM(BPF_SUB, R1, 2), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 2), + BPF_ALU64_IMM(BPF_SUB, R1, 2), + BPF_JMP_IMM(BPF_JSLE, R1, 0, 1), + BPF_EXIT_INSN(), /* bad exit */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* good exit */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSGE | BPF_K */ { "JMP_JSGE_K: Signed jump: if (-1 >= -2) return 1", @@ -4617,6 +4739,35 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLT | BPF_K */ + { + "JMP_JLT_K: if (2 < 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 2), + BPF_JMP_IMM(BPF_JLT, R1, 3, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JGT_K: Unsigned jump: if (1 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 1), + BPF_JMP_IMM(BPF_JLT, R1, -1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGE | BPF_K */ { "JMP_JGE_K: if (3 >= 2) return 1", @@ -4632,6 +4783,21 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLE | BPF_K */ + { + "JMP_JLE_K: if (2 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 2), + BPF_JMP_IMM(BPF_JLE, R1, 3, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGT | BPF_K jump backwards */ { "JMP_JGT_K: if (3 > 2) return 1 (jump backwards)", @@ -4662,6 +4828,36 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLT | BPF_K jump backwards */ + { + "JMP_JGT_K: if (2 < 3) return 1 (jump backwards)", + .u.insns_int = { + BPF_JMP_IMM(BPF_JA, 0, 0, 2), /* goto start */ + BPF_ALU32_IMM(BPF_MOV, R0, 1), /* out: */ + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 0), /* 
start: */ + BPF_LD_IMM64(R1, 2), /* note: this takes 2 insns */ + BPF_JMP_IMM(BPF_JLT, R1, 3, -6), /* goto out */ + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JLE_K: if (3 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_JMP_IMM(BPF_JLE, R1, 3, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JNE | BPF_K */ { "JMP_JNE_K: if (3 != 2) return 1", @@ -4752,6 +4948,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLT | BPF_X */ + { + "JMP_JSLT_X: Signed jump: if (-2 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -2), + BPF_JMP_REG(BPF_JSLT, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLT_X: Signed jump: if (-1 < -1) return 0", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -1), + BPF_JMP_REG(BPF_JSLT, R1, R2, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JSGE | BPF_X */ { "JMP_JSGE_X: Signed jump: if (-1 >= -2) return 1", @@ -4783,6 +5010,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JSLE | BPF_X */ + { + "JMP_JSLE_X: Signed jump: if (-2 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -2), + BPF_JMP_REG(BPF_JSLE, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JSLE_X: Signed jump: if (-1 <= -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, -1), + BPF_JMP_REG(BPF_JSLE, R1, R2, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGT | BPF_X */ { "JMP_JGT_X: if (3 > 2) return 1", @@ -4814,6 +5072,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLT | BPF_X */ + { + "JMP_JLT_X: if (2 < 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLT, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JLT_X: Unsigned jump: if (1 < -1) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, -1), + BPF_LD_IMM64(R2, 1), + BPF_JMP_REG(BPF_JLT, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JGE | BPF_X */ { "JMP_JGE_X: if (3 >= 2) return 1", @@ -4845,6 +5134,37 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + /* BPF_JMP | BPF_JLE | BPF_X */ + { + "JMP_JLE_X: if (2 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, + { + "JMP_JLE_X: if (3 <= 3) return 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 3), + BPF_JMP_REG(BPF_JLE, R1, R2, 1), + BPF_EXIT_INSN(), + BPF_ALU32_IMM(BPF_MOV, R0, 1), + 
BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, { /* Mainly testing JIT + imm64 here. */ "JMP_JGE_X: ldimm64 test 1", @@ -4890,6 +5210,50 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + { + "JMP_JLE_X: ldimm64 test 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 2), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xeeeeeeeeU } }, + }, + { + "JMP_JLE_X: ldimm64 test 2", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 0), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xffffffffU } }, + }, + { + "JMP_JLE_X: ldimm64 test 3", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JLE, R2, R1, 4), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JNE | BPF_X */ { "JMP_JNE_X: if (3 != 2) return 1", diff --git a/net/core/filter.c b/net/core/filter.c index 78d00933dbe7..5afe3ac191ec 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -514,14 +514,27 @@ do_pass: break; } - /* Convert JEQ into JNE when 'jump_true' is next insn. */ - if (fp->jt == 0 && BPF_OP(fp->code) == BPF_JEQ) { - insn->code = BPF_JMP | BPF_JNE | bpf_src; + /* Convert some jumps when 'jump_true' is next insn. */ + if (fp->jt == 0) { + switch (BPF_OP(fp->code)) { + case BPF_JEQ: + insn->code = BPF_JMP | BPF_JNE | bpf_src; + break; + case BPF_JGT: + insn->code = BPF_JMP | BPF_JLE | bpf_src; + break; + case BPF_JGE: + insn->code = BPF_JMP | BPF_JLT | bpf_src; + break; + default: + goto jmp_rest; + } + target = i + fp->jf + 1; BPF_EMIT_JMP; break; } - +jmp_rest: /* Other jumps are mapped into two insns: Jxx and JA. */ target = i + fp->jt + 1; insn->code = BPF_JMP | BPF_OP(fp->code) | bpf_src; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 8d9bfcca3fe4..bf3b2e230455 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -30,9 +30,14 @@ #define BPF_FROM_LE BPF_TO_LE #define BPF_FROM_BE BPF_TO_BE +/* jmp encodings */ #define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ #define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ #define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ -- cgit v1.2.3 From 077fbac405bfc6d41419ad6c1725804ad4e9887c Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 11 Aug 2017 02:11:33 +0900 Subject: net: xfrm: support setting an output mark. On systems that use mark-based routing it may be necessary for routing lookups to use marks in order for packets to be routed correctly. An example of such a system is Android, which uses socket marks to route packets via different networks. Currently, routing lookups in tunnel mode always use a mark of zero, making routing incorrect on such systems. This patch adds a new output_mark element to the xfrm state and a corresponding XFRMA_OUTPUT_MARK netlink attribute. The output mark differs from the existing xfrm mark in two ways: 1. 
The xfrm mark is used to match xfrm policies and states, while the xfrm output mark is used to set the mark (and influence the routing) of the packets emitted by those states. 2. The existing mark is constrained to be a subset of the bits of the originating socket or transformed packet, but the output mark is arbitrary and depends only on the state. The use of a separate mark provides additional flexibility. For example: - A packet subject to two transforms (e.g., transport mode inside tunnel mode) can have two different output marks applied to it, one for the transport mode SA and one for the tunnel mode SA. - On a system where socket marks determine routing, the packets emitted by an IPsec tunnel can be routed based on a mark that is determined by the tunnel, not by the marks of the unencrypted packets. - Support for setting the output marks can be introduced without breaking any existing setups that employ both mark-based routing and xfrm tunnel mode. Simply changing the code to use the xfrm mark for routing output packets could xfrm mark could change behaviour in a way that breaks these setups. If the output mark is unspecified or set to zero, the mark is not set or changed. Tested: make allyesconfig; make -j64 Tested: https://android-review.googlesource.com/452776 Signed-off-by: Lorenzo Colitti Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 9 ++++++--- include/uapi/linux/xfrm.h | 1 + net/ipv4/xfrm4_policy.c | 14 +++++++++----- net/ipv6/xfrm6_policy.c | 9 ++++++--- net/xfrm/xfrm_device.c | 3 ++- net/xfrm/xfrm_output.c | 3 +++ net/xfrm/xfrm_policy.c | 17 +++++++++-------- net/xfrm/xfrm_user.c | 11 +++++++++++ 8 files changed, 47 insertions(+), 20 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 18d7de34a5c3..9c7b70cce6d6 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -165,6 +165,7 @@ struct xfrm_state { int header_len; int trailer_len; u32 extra_flags; + u32 output_mark; } props; struct xfrm_lifetime_cfg lft; @@ -298,10 +299,12 @@ struct xfrm_policy_afinfo { struct dst_entry *(*dst_lookup)(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr); + const xfrm_address_t *daddr, + u32 mark); int (*get_saddr)(struct net *net, int oif, xfrm_address_t *saddr, - xfrm_address_t *daddr); + xfrm_address_t *daddr, + u32 mark); void (*decode_session)(struct sk_buff *skb, struct flowi *fl, int reverse); @@ -1640,7 +1643,7 @@ static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, - int family); + int family, u32 mark); struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp); diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 2b384ff09fa0..5fe7370a2bef 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -304,6 +304,7 @@ enum xfrm_attr_type_t { XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ XFRMA_PAD, XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ + XFRMA_OUTPUT_MARK, /* __u32 */ __XFRMA_MAX #define XFRMA_MAX (__XFRMA_MAX - 1) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 4aefb149fe0a..d7bf0b041885 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -20,7 +20,8 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const 
xfrm_address_t *daddr, + u32 mark) { struct rtable *rt; @@ -28,6 +29,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif); + fl4->flowi4_mark = mark; if (saddr) fl4->saddr = saddr->a4; @@ -42,20 +44,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const xfrm_address_t *daddr, + u32 mark) { struct flowi4 fl4; - return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr); + return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark); } static int xfrm4_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr) + xfrm_address_t *saddr, xfrm_address_t *daddr, + u32 mark) { struct dst_entry *dst; struct flowi4 fl4; - dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr); + dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index f44b25a48478..11d1314ab6c5 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -27,7 +27,8 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const xfrm_address_t *daddr, + u32 mark) { struct flowi6 fl6; struct dst_entry *dst; @@ -36,6 +37,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif); fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; + fl6.flowi6_mark = mark; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); @@ -52,12 +54,13 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, } static int xfrm6_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr) + xfrm_address_t *saddr, xfrm_address_t *daddr, + u32 mark) { struct dst_entry *dst; struct net_device *dev; - dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr); + dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 1904127f5fb8..acf00104ef31 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -79,7 +79,8 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, daddr = &x->props.saddr; } - dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, x->props.family); + dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, + x->props.family, x->props.output_mark); if (IS_ERR(dst)) return 0; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 8c0b6722aaa8..31a2e6d34dba 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -66,6 +66,9 @@ static int xfrm_output_one(struct sk_buff *skb, int err) goto error_nolock; } + if (x->props.output_mark) + skb->mark = x->props.output_mark; + err = x->outer_mode->output(x, skb); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 06c3bf7ab86b..1de52f36caf5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -122,7 +122,7 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int 
oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, - int family) + int family, u32 mark) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; @@ -131,7 +131,7 @@ struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); - dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr); + dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark); rcu_read_unlock(); @@ -143,7 +143,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, - int family) + int family, u32 mark) { struct net *net = xs_net(x); xfrm_address_t *saddr = &x->props.saddr; @@ -159,7 +159,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, daddr = x->coaddr; } - dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family); + dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark); if (!IS_ERR(dst)) { if (prev_saddr != saddr) @@ -1340,14 +1340,14 @@ int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) static int xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local, - xfrm_address_t *remote, unsigned short family) + xfrm_address_t *remote, unsigned short family, u32 mark) { int err; const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; - err = afinfo->get_saddr(net, oif, local, remote); + err = afinfo->get_saddr(net, oif, local, remote, mark); rcu_read_unlock(); return err; } @@ -1378,7 +1378,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, if (xfrm_addr_any(local, tmpl->encap_family)) { error = xfrm_get_saddr(net, fl->flowi_oif, &tmp, remote, - tmpl->encap_family); + tmpl->encap_family, 0); if (error) goto fail; local = &tmp; @@ -1598,7 +1598,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { family = xfrm[i]->props.family; dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, - &saddr, &daddr, family); + &saddr, &daddr, family, + xfrm[i]->props.output_mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index ffe8d5ef09eb..cc3268d814b4 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -584,6 +584,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_mark_get(attrs, &x->mark); + if (attrs[XFRMA_OUTPUT_MARK]) + x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]); + err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); if (err) goto error; @@ -899,6 +902,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x, goto out; if (x->security) ret = copy_sec_ctx(x->security, skb); + if (x->props.output_mark) { + ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark); + if (ret) + goto out; + } out: return ret; } @@ -2454,6 +2462,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, + [XFRMA_OUTPUT_MARK] = { .len = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@ -2673,6 +2682,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(x->props.extra_flags)); if (x->xso.dev) l += nla_total_size(sizeof(x->xso)); + if (x->props.output_mark) + l += 
nla_total_size(sizeof(x->props.output_mark)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); -- cgit v1.2.3 From fe4007999599c02598c17b643e8de43e487d48e8 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 15 Aug 2017 09:09:49 +0200 Subject: ipv6: fib: Provide offload indication using nexthop flags IPv6 routes currently lack nexthop flags as in IPv4. This has several implications. In the forwarding path, it requires us to check the carrier state of the nexthop device and potentially ignore a linkdown route, instead of checking for RTNH_F_LINKDOWN. It also requires capable drivers to use the user facing IPv6-specific route flags to provide offload indication, instead of using the nexthop flags as in IPv4. Add nexthop flags to IPv6 routes in the 40 bytes hole and use it to provide offload indication instead of the RTF_OFFLOAD flag, which is removed while it's still not part of any official kernel release. In the near future we would like to use the field for the RTNH_F_{LINKDOWN,DEAD} flags, but this change is more involved and might not be ready in time for the current cycle. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Acked-by: David Ahern Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 8 ++++---- include/net/ip6_fib.h | 2 ++ include/uapi/linux/ipv6_route.h | 1 - net/ipv6/route.c | 7 +------ 4 files changed, 7 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 16676fffbf70..4895d5b8942b 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -2397,7 +2397,7 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL) { list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6, - list)->rt->rt6i_flags |= RTF_OFFLOAD; + list)->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD; return; } @@ -2407,9 +2407,9 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry *fib_entry) nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6); if (nh && nh->offloaded) - mlxsw_sp_rt6->rt->rt6i_flags |= RTF_OFFLOAD; + mlxsw_sp_rt6->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD; else - mlxsw_sp_rt6->rt->rt6i_flags &= ~RTF_OFFLOAD; + mlxsw_sp_rt6->rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD; } } @@ -2424,7 +2424,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct mlxsw_sp_fib_entry *fib_entry) list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) { struct rt6_info *rt = mlxsw_sp_rt6->rt; - rt->rt6i_flags &= ~RTF_OFFLOAD; + rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD; } } diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index 1d790ea40ea7..71c1646298ae 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -120,6 +120,8 @@ struct rt6_info { atomic_t rt6i_ref; + unsigned int rt6i_nh_flags; + /* These are in a separate cache line. 
*/ struct rt6key rt6i_dst ____cacheline_aligned_in_smp; u32 rt6i_flags; diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index 33e2a5732bd1..d496c02e14bc 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -35,7 +35,6 @@ #define RTF_PREF(pref) ((pref) << 27) #define RTF_PREF_MASK 0x18000000 -#define RTF_OFFLOAD 0x20000000 /* offloaded route */ #define RTF_PCPU 0x40000000 /* read-only: can not be set by user */ #define RTF_LOCAL 0x80000000 diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 035762fed07d..6793135d49db 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1820,11 +1820,6 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg, goto out; } - if (cfg->fc_flags & RTF_OFFLOAD) { - NL_SET_ERR_MSG(extack, "Userspace can not set RTF_OFFLOAD"); - goto out; - } - if (cfg->fc_dst_len > 128) { NL_SET_ERR_MSG(extack, "Invalid prefix length"); goto out; @@ -3335,7 +3330,7 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt, goto nla_put_failure; } - if (rt->rt6i_flags & RTF_OFFLOAD) + if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD) *flags |= RTNH_F_OFFLOAD; /* not needed for multipath encoding b/c it has a rtnexthop struct */ -- cgit v1.2.3 From b005fd189cec9407b700599e1e80e0552446ee79 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:31:58 -0700 Subject: bpf: introduce new program type for skbs on sockets A class of programs, run from strparser and soon from a new map type called sock map, are used with skb as the context but on established sockets. By creating a specific program type for these we can use bpf helpers that expect full sockets and get the verifier to ensure these helpers are not used out of context. The new type is BPF_PROG_TYPE_SK_SKB. This patch introduces the infrastructure and type. Signed-off-by: John Fastabend Signed-off-by: David S. 
Miller --- include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + net/core/filter.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index b1e1035ca24b..4b72db30dacf 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -11,6 +11,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops) BPF_PROG_TYPE(BPF_PROG_TYPE_SOCK_OPS, sock_ops_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SK_SKB, sk_skb_prog_ops) #endif #ifdef CONFIG_BPF_EVENTS BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 91da8371a2d0..2e796e384aeb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -127,6 +127,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_OUT, BPF_PROG_TYPE_LWT_XMIT, BPF_PROG_TYPE_SOCK_OPS, + BPF_PROG_TYPE_SK_SKB, }; enum bpf_attach_type { diff --git a/net/core/filter.c b/net/core/filter.c index e0688a855c47..46321033ae0e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3234,6 +3234,20 @@ static const struct bpf_func_proto * } } +static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; + default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto * lwt_xmit_func_proto(enum bpf_func_id func_id) { @@ -3525,6 +3539,22 @@ static bool sock_ops_is_valid_access(int off, int size, return __is_valid_sock_ops_access(off, size); } +static bool sk_skb_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + } + + return bpf_skb_is_valid_access(off, size, type, info); +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -3994,6 +4024,12 @@ const struct bpf_verifier_ops sock_ops_prog_ops = { .convert_ctx_access = sock_ops_convert_ctx_access, }; +const struct bpf_verifier_ops sk_skb_prog_ops = { + .get_func_proto = sk_skb_func_proto, + .is_valid_access = sk_skb_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; -- cgit v1.2.3 From 174a79ff9515f400b9a6115643dafd62a635b7e6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:32:47 -0700 Subject: bpf: sockmap with sk redirect support Recently we added a new map type called dev map used to forward XDP packets between ports (6093ec2dc313). This patches introduces a similar notion for sockets. A sockmap allows users to add participating sockets to a map. When sockets are added to the map enough context is stored with the map entry to use the entry with a new helper bpf_sk_redirect_map(map, key, flags) This helper (analogous to bpf_redirect_map in XDP) is given the map and an entry in the map. When called from a sockmap program, discussed below, the skb will be sent on the socket using skb_send_sock(). 
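A minimal sketch of the kind of verdict program that uses the helper just described might look as follows (illustrative only, not part of the patch; the map name, section name and fixed key are placeholders, and the usual SEC()/bpf_map_def declarations from samples/bpf are assumed):

struct bpf_map_def SEC("maps") sock_map = {
	.type        = BPF_MAP_TYPE_SOCKMAP,
	.key_size    = sizeof(int),
	.value_size  = sizeof(int),
	.max_entries = 20,
};

SEC("sk_skb_verdict")
int bpf_prog_verdict(struct __sk_buff *skb)
{
	/* always redirect to the socket stored at index 0; a real program
	 * would derive the key from the packet contents
	 */
	return bpf_sk_redirect_map(&sock_map, 0, 0);
}

Any other return value (e.g. SK_DROP) makes the sockmap code free the skb instead of forwarding it.
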
With the above we need a bpf program to call the helper from that will then implement the send logic. The initial site implemented in this series is the recv_sock hook. For this to work we implemented a map attach command to add attributes to a map. In sockmap we add two programs a parse program and a verdict program. The parse program uses strparser to build messages and pass them to the verdict program. The parse programs use the normal strparser semantics. The verdict program is of type SK_SKB. The verdict program returns a verdict SK_DROP, or SK_REDIRECT for now. Additional actions may be added later. When SK_REDIRECT is returned, expected when bpf program uses bpf_sk_redirect_map(), the sockmap logic will consult per cpu variables set by the helper routine and pull the sock entry out of the sock map. This pattern follows the existing redirect logic in cls and xdp programs. This gives the flow, recv_sock -> str_parser (parse_prog) -> verdict_prog -> skb_send_sock \ -> kfree_skb As an example use case a message based load balancer may use specific logic in the verdict program to select the sock to send on. Sample programs are provided in future patches that hopefully illustrate the user interfaces. Also selftests are in follow-on patches. Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/linux/bpf.h | 7 +- include/linux/bpf_types.h | 1 + include/linux/filter.h | 2 + include/uapi/linux/bpf.h | 33 +- kernel/bpf/Makefile | 2 +- kernel/bpf/sockmap.c | 792 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 51 ++- kernel/bpf/verifier.c | 14 + net/core/filter.c | 43 +++ 9 files changed, 940 insertions(+), 5 deletions(-) create mode 100644 kernel/bpf/sockmap.c (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d6e1de8ce0fc..a4145e9c74b5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -16,6 +16,7 @@ #include struct perf_event; +struct bpf_prog; struct bpf_map; /* map is generic key/value storage optionally accesible by eBPF programs */ @@ -37,6 +38,8 @@ struct bpf_map_ops { void (*map_fd_put_ptr)(void *ptr); u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); + int (*map_attach)(struct bpf_map *map, + struct bpf_prog *p1, struct bpf_prog *p2); }; struct bpf_map { @@ -138,8 +141,6 @@ enum bpf_reg_type { PTR_TO_PACKET_END, /* skb->data + headlen */ }; -struct bpf_prog; - /* The information passed from prog-specific *_is_valid_access * back to the verifier. */ @@ -312,6 +313,7 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); /* Map specifics */ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); +struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); @@ -391,6 +393,7 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_skb_vlan_push_proto; extern const struct bpf_func_proto bpf_skb_vlan_pop_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; +extern const struct bpf_func_proto bpf_sock_map_update_proto; /* Shared helpers among cBPF and eBPF. 
*/ void bpf_user_rnd_init_once(void); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 4b72db30dacf..fa805074d168 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -38,4 +38,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) #endif diff --git a/include/linux/filter.h b/include/linux/filter.h index d19ed3c15e1e..7015116331af 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -727,6 +727,8 @@ void xdp_do_flush_map(void); void bpf_warn_invalid_xdp_action(u32 act); void bpf_warn_invalid_xdp_redirect(u32 ifindex); +struct sock *do_sk_redirect_map(void); + #ifdef CONFIG_BPF_JIT extern int bpf_jit_enable; extern int bpf_jit_harden; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e796e384aeb..7f774769e3f5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -110,6 +110,7 @@ enum bpf_map_type { BPF_MAP_TYPE_ARRAY_OF_MAPS, BPF_MAP_TYPE_HASH_OF_MAPS, BPF_MAP_TYPE_DEVMAP, + BPF_MAP_TYPE_SOCKMAP, }; enum bpf_prog_type { @@ -135,11 +136,15 @@ enum bpf_attach_type { BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, BPF_CGROUP_SOCK_OPS, + BPF_CGROUP_SMAP_INGRESS, __MAX_BPF_ATTACH_TYPE }; #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE +/* If BPF_SOCKMAP_STRPARSER is used sockmap will use strparser on receive */ +#define BPF_SOCKMAP_STRPARSER (1U << 0) + /* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command * to the given target_fd cgroup the descendent cgroup will be able to * override effective bpf program that was inherited from this cgroup @@ -211,6 +216,7 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; + __u32 attach_bpf_fd2; }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ @@ -557,6 +563,23 @@ union bpf_attr { * @mode: operation mode (enum bpf_adj_room_mode) * @flags: reserved for future use * Return: 0 on success or negative error code + * + * int bpf_sk_redirect_map(map, key, flags) + * Redirect skb to a sock in map using key as a lookup key for the + * sock in map. 
+ * @map: pointer to sockmap + * @key: key to lookup sock in map + * @flags: reserved for future use + * Return: SK_REDIRECT + * + * int bpf_sock_map_update(skops, map, key, flags, map_flags) + * @skops: pointer to bpf_sock_ops + * @map: pointer to sockmap to update + * @key: key to insert/update sock in map + * @flags: same flags as map update elem + * @map_flags: sock map specific flags + * bit 1: Enable strparser + * other bits: reserved */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -610,7 +633,9 @@ union bpf_attr { FN(set_hash), \ FN(setsockopt), \ FN(skb_adjust_room), \ - FN(redirect_map), + FN(redirect_map), \ + FN(sk_redirect_map), \ + FN(sock_map_update), \ /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -747,6 +772,12 @@ struct xdp_md { __u32 data_end; }; +enum sk_action { + SK_ABORTED = 0, + SK_DROP, + SK_REDIRECT, +}; + #define BPF_TAG_SIZE 8 struct bpf_prog_info { diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 2f0bcda40e90..aa24287db888 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_NET),y) -obj-$(CONFIG_BPF_SYSCALL) += devmap.o +obj-$(CONFIG_BPF_SYSCALL) += devmap.o sockmap.o endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c new file mode 100644 index 000000000000..792f0addfafa --- /dev/null +++ b/kernel/bpf/sockmap.c @@ -0,0 +1,792 @@ +/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +/* A BPF sock_map is used to store sock objects. This is primarly used + * for doing socket redirect with BPF helper routines. + * + * A sock map may have two BPF programs attached to it, a program used + * to parse packets and a program to provide a verdict and redirect + * decision on the packet. If no BPF parse program is provided it is + * assumed that every skb is a "message" (skb->len). Otherwise the + * parse program is attached to strparser and used to build messages + * that may span multiple skbs. The verdict program will either select + * a socket to send/receive the skb on or provide the drop code indicating + * the skb should be dropped. More actions may be added later as needed. + * The default program will drop packets. + * + * For reference this program is similar to devmap used in XDP context + * reviewing these together may be useful. For an example please review + * ./samples/bpf/sockmap/. 
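As an illustrative aside (not part of the patch, and untested): the parse/verdict/sockops pieces described above might look roughly like the following on the BPF side, assuming a samples/bpf-style loader. The SEC() macro, the bpf_map_def layout, the helper prototypes and the section names are declared inline here as assumptions, only to keep the sketch self-contained.

/* sketch: sockmap with parse, verdict and sock_ops programs */
#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

struct bpf_map_def {
	unsigned int type;
	unsigned int key_size;
	unsigned int value_size;
	unsigned int max_entries;
	unsigned int map_flags;
};

struct bpf_map_def SEC("maps") sock_map = {
	.type		= BPF_MAP_TYPE_SOCKMAP,
	.key_size	= sizeof(int),
	.value_size	= sizeof(int),
	.max_entries	= 20,
};

static int (*bpf_sk_redirect_map)(void *map, int key, int flags) =
	(void *) BPF_FUNC_sk_redirect_map;
static int (*bpf_sock_map_update)(void *skops, void *map, void *key,
				  unsigned long long flags,
				  unsigned long long map_flags) =
	(void *) BPF_FUNC_sock_map_update;

SEC("sk_skb_parse")
int prog_parse(struct __sk_buff *skb)
{
	/* treat every skb as one complete message (default strparser use) */
	return skb->len;
}

SEC("sk_skb_verdict")
int prog_verdict(struct __sk_buff *skb)
{
	/* a message based load balancer would compute the key here */
	return bpf_sk_redirect_map(&sock_map, 0, 0);
}

SEC("sockops")
int prog_sockops(struct bpf_sock_ops *skops)
{
	int key = 0;

	/* bind the current socket into slot 0, with strparser enabled */
	bpf_sock_map_update(skops, &sock_map, &key, BPF_ANY,
			    BPF_SOCKMAP_STRPARSER);
	return 0;
}

char _license[] SEC("license") = "GPL";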
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct bpf_stab { + struct bpf_map map; + struct sock **sock_map; + struct bpf_prog *bpf_parse; + struct bpf_prog *bpf_verdict; + refcount_t refcnt; +}; + +enum smap_psock_state { + SMAP_TX_RUNNING, +}; + +struct smap_psock { + struct rcu_head rcu; + + /* datapath variables */ + struct sk_buff_head rxqueue; + bool strp_enabled; + + /* datapath error path cache across tx work invocations */ + int save_rem; + int save_off; + struct sk_buff *save_skb; + + struct strparser strp; + struct bpf_prog *bpf_parse; + struct bpf_prog *bpf_verdict; + struct bpf_stab *stab; + + /* Back reference used when sock callback trigger sockmap operations */ + int key; + struct sock *sock; + unsigned long state; + + struct work_struct tx_work; + struct work_struct gc_work; + + void (*save_data_ready)(struct sock *sk); + void (*save_write_space)(struct sock *sk); + void (*save_state_change)(struct sock *sk); +}; + +static inline struct smap_psock *smap_psock_sk(const struct sock *sk) +{ + return (struct smap_psock *)rcu_dereference_sk_user_data(sk); +} + +static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) +{ + struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); + int rc; + + if (unlikely(!prog)) + return SK_DROP; + + skb_orphan(skb); + skb->sk = psock->sock; + bpf_compute_data_end(skb); + rc = (*prog->bpf_func)(skb, prog->insnsi); + skb->sk = NULL; + + return rc; +} + +static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) +{ + struct sock *sock; + int rc; + + /* Because we use per cpu values to feed input from sock redirect + * in BPF program to do_sk_redirect_map() call we need to ensure we + * are not preempted. RCU read lock is not sufficient in this case + * with CONFIG_PREEMPT_RCU enabled so we must be explicit here. + */ + preempt_disable(); + rc = smap_verdict_func(psock, skb); + switch (rc) { + case SK_REDIRECT: + sock = do_sk_redirect_map(); + preempt_enable(); + if (likely(sock)) { + struct smap_psock *peer = smap_psock_sk(sock); + + if (likely(peer && + test_bit(SMAP_TX_RUNNING, &peer->state) && + sk_stream_memory_free(peer->sock))) { + peer->sock->sk_wmem_queued += skb->truesize; + sk_mem_charge(peer->sock, skb->truesize); + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; + } + } + /* Fall through and free skb otherwise */ + case SK_DROP: + default: + preempt_enable(); + kfree_skb(skb); + } +} + +static void smap_report_sk_error(struct smap_psock *psock, int err) +{ + struct sock *sk = psock->sock; + + sk->sk_err = err; + sk->sk_error_report(sk); +} + +static void smap_release_sock(struct sock *sock); + +/* Called with lock_sock(sk) held */ +static void smap_state_change(struct sock *sk) +{ + struct smap_psock *psock; + struct sock *osk; + + rcu_read_lock(); + + /* Allowing transitions into an established syn_recv states allows + * for early binding sockets to a smap object before the connection + * is established. + */ + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + break; + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + case TCP_LAST_ACK: + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + case TCP_LISTEN: + break; + case TCP_CLOSE: + /* Only release if the map entry is in fact the sock in + * question. There is a case where the operator deletes + * the sock from the map, but the TCP sock is closed before + * the psock is detached. Use cmpxchg to verify correct + * sock is removed. 
+ */ + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + break; + osk = cmpxchg(&psock->stab->sock_map[psock->key], sk, NULL); + if (osk == sk) + smap_release_sock(sk); + break; + default: + smap_report_sk_error(psock, EPIPE); + break; + } + rcu_read_unlock(); +} + +static void smap_read_sock_strparser(struct strparser *strp, + struct sk_buff *skb) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = container_of(strp, struct smap_psock, strp); + smap_do_verdict(psock, skb); + rcu_read_unlock(); +} + +/* Called with lock held on socket */ +static void smap_data_ready(struct sock *sk) +{ + struct smap_psock *psock; + + write_lock_bh(&sk->sk_callback_lock); + psock = smap_psock_sk(sk); + if (likely(psock)) + strp_data_ready(&psock->strp); + write_unlock_bh(&sk->sk_callback_lock); +} + +static void smap_tx_work(struct work_struct *w) +{ + struct smap_psock *psock; + struct sk_buff *skb; + int rem, off, n; + + psock = container_of(w, struct smap_psock, tx_work); + + /* lock sock to avoid losing sk_socket at some point during loop */ + lock_sock(psock->sock); + if (psock->save_skb) { + skb = psock->save_skb; + rem = psock->save_rem; + off = psock->save_off; + psock->save_skb = NULL; + goto start; + } + + while ((skb = skb_dequeue(&psock->rxqueue))) { + rem = skb->len; + off = 0; +start: + do { + if (likely(psock->sock->sk_socket)) + n = skb_send_sock_locked(psock->sock, + skb, off, rem); + else + n = -EINVAL; + if (n <= 0) { + if (n == -EAGAIN) { + /* Retry when space is available */ + psock->save_skb = skb; + psock->save_rem = rem; + psock->save_off = off; + goto out; + } + /* Hard errors break pipe and stop xmit */ + smap_report_sk_error(psock, n ? -n : EPIPE); + clear_bit(SMAP_TX_RUNNING, &psock->state); + sk_mem_uncharge(psock->sock, skb->truesize); + psock->sock->sk_wmem_queued -= skb->truesize; + kfree_skb(skb); + goto out; + } + rem -= n; + off += n; + } while (rem); + sk_mem_uncharge(psock->sock, skb->truesize); + psock->sock->sk_wmem_queued -= skb->truesize; + kfree_skb(skb); + } +out: + release_sock(psock->sock); +} + +static void smap_write_space(struct sock *sk) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state))) + schedule_work(&psock->tx_work); + rcu_read_unlock(); +} + +static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) +{ + write_lock_bh(&sk->sk_callback_lock); + if (!psock->strp_enabled) + goto out; + sk->sk_data_ready = psock->save_data_ready; + sk->sk_write_space = psock->save_write_space; + sk->sk_state_change = psock->save_state_change; + psock->save_data_ready = NULL; + psock->save_write_space = NULL; + psock->save_state_change = NULL; + strp_stop(&psock->strp); + psock->strp_enabled = false; +out: + write_unlock_bh(&sk->sk_callback_lock); +} + +static void smap_destroy_psock(struct rcu_head *rcu) +{ + struct smap_psock *psock = container_of(rcu, + struct smap_psock, rcu); + + /* Now that a grace period has passed there is no longer + * any reference to this sock in the sockmap so we can + * destroy the psock, strparser, and bpf programs. 
But, + * because we use workqueue sync operations we can not + * do it in rcu context + */ + schedule_work(&psock->gc_work); +} + +static void smap_release_sock(struct sock *sock) +{ + struct smap_psock *psock = smap_psock_sk(sock); + + smap_stop_sock(psock, sock); + clear_bit(SMAP_TX_RUNNING, &psock->state); + rcu_assign_sk_user_data(sock, NULL); + call_rcu_sched(&psock->rcu, smap_destroy_psock); +} + +static int smap_parse_func_strparser(struct strparser *strp, + struct sk_buff *skb) +{ + struct smap_psock *psock; + struct bpf_prog *prog; + int rc; + + rcu_read_lock(); + psock = container_of(strp, struct smap_psock, strp); + prog = READ_ONCE(psock->bpf_parse); + + if (unlikely(!prog)) { + rcu_read_unlock(); + return skb->len; + } + + /* Attach socket for bpf program to use if needed we can do this + * because strparser clones the skb before handing it to a upper + * layer, meaning skb_orphan has been called. We NULL sk on the + * way out to ensure we don't trigger a BUG_ON in skb/sk operations + * later and because we are not charging the memory of this skb to + * any socket yet. + */ + skb->sk = psock->sock; + bpf_compute_data_end(skb); + rc = (*prog->bpf_func)(skb, prog->insnsi); + skb->sk = NULL; + rcu_read_unlock(); + return rc; +} + + +static int smap_read_sock_done(struct strparser *strp, int err) +{ + return err; +} + +static int smap_init_sock(struct smap_psock *psock, + struct sock *sk) +{ + struct strp_callbacks cb; + + memset(&cb, 0, sizeof(cb)); + cb.rcv_msg = smap_read_sock_strparser; + cb.parse_msg = smap_parse_func_strparser; + cb.read_sock_done = smap_read_sock_done; + return strp_init(&psock->strp, sk, &cb); +} + +static void smap_init_progs(struct smap_psock *psock, + struct bpf_stab *stab, + struct bpf_prog *verdict, + struct bpf_prog *parse) +{ + struct bpf_prog *orig_parse, *orig_verdict; + + orig_parse = xchg(&psock->bpf_parse, parse); + orig_verdict = xchg(&psock->bpf_verdict, verdict); + + if (orig_verdict) + bpf_prog_put(orig_verdict); + if (orig_parse) + bpf_prog_put(orig_parse); +} + +static void smap_start_sock(struct smap_psock *psock, struct sock *sk) +{ + if (sk->sk_data_ready == smap_data_ready) + return; + psock->save_data_ready = sk->sk_data_ready; + psock->save_write_space = sk->sk_write_space; + psock->save_state_change = sk->sk_state_change; + sk->sk_data_ready = smap_data_ready; + sk->sk_write_space = smap_write_space; + sk->sk_state_change = smap_state_change; + psock->strp_enabled = true; +} + +static void sock_map_remove_complete(struct bpf_stab *stab) +{ + bpf_map_area_free(stab->sock_map); + kfree(stab); +} + +static void smap_gc_work(struct work_struct *w) +{ + struct smap_psock *psock; + + psock = container_of(w, struct smap_psock, gc_work); + + /* no callback lock needed because we already detached sockmap ops */ + if (psock->strp_enabled) + strp_done(&psock->strp); + + cancel_work_sync(&psock->tx_work); + __skb_queue_purge(&psock->rxqueue); + + /* At this point all strparser and xmit work must be complete */ + if (psock->bpf_parse) + bpf_prog_put(psock->bpf_parse); + if (psock->bpf_verdict) + bpf_prog_put(psock->bpf_verdict); + + if (refcount_dec_and_test(&psock->stab->refcnt)) + sock_map_remove_complete(psock->stab); + + sock_put(psock->sock); + kfree(psock); +} + +static struct smap_psock *smap_init_psock(struct sock *sock, + struct bpf_stab *stab) +{ + struct smap_psock *psock; + + psock = kzalloc(sizeof(struct smap_psock), GFP_ATOMIC | __GFP_NOWARN); + if (!psock) + return ERR_PTR(-ENOMEM); + + psock->sock = sock; + 
skb_queue_head_init(&psock->rxqueue); + INIT_WORK(&psock->tx_work, smap_tx_work); + INIT_WORK(&psock->gc_work, smap_gc_work); + + rcu_assign_sk_user_data(sock, psock); + sock_hold(sock); + return psock; +} + +static struct bpf_map *sock_map_alloc(union bpf_attr *attr) +{ + struct bpf_stab *stab; + int err = -EINVAL; + u64 cost; + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + attr->value_size != 4 || attr->map_flags) + return ERR_PTR(-EINVAL); + + if (attr->value_size > KMALLOC_MAX_SIZE) + return ERR_PTR(-E2BIG); + + stab = kzalloc(sizeof(*stab), GFP_USER); + if (!stab) + return ERR_PTR(-ENOMEM); + + /* mandatory map attributes */ + stab->map.map_type = attr->map_type; + stab->map.key_size = attr->key_size; + stab->map.value_size = attr->value_size; + stab->map.max_entries = attr->max_entries; + stab->map.map_flags = attr->map_flags; + + /* make sure page count doesn't overflow */ + cost = (u64) stab->map.max_entries * sizeof(struct sock *); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_stab; + + stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(stab->map.pages); + if (err) + goto free_stab; + + stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * + sizeof(struct sock *)); + if (!stab->sock_map) + goto free_stab; + + refcount_set(&stab->refcnt, 1); + return &stab->map; +free_stab: + kfree(stab); + return ERR_PTR(err); +} + +static void sock_map_free(struct bpf_map *map) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + int i; + + synchronize_rcu(); + + /* At this point no update, lookup or delete operations can happen. + * However, be aware we can still get a socket state event updates, + * and data ready callabacks that reference the psock from sk_user_data + * Also psock worker threads are still in-flight. So smap_release_sock + * will only free the psock after cancel_sync on the worker threads + * and a grace period expire to ensure psock is really safe to remove. + */ + rcu_read_lock(); + for (i = 0; i < stab->map.max_entries; i++) { + struct sock *sock; + + sock = xchg(&stab->sock_map[i], NULL); + if (!sock) + continue; + + smap_release_sock(sock); + } + rcu_read_unlock(); + + if (stab->bpf_verdict) + bpf_prog_put(stab->bpf_verdict); + if (stab->bpf_parse) + bpf_prog_put(stab->bpf_parse); + + if (refcount_dec_and_test(&stab->refcnt)) + sock_map_remove_complete(stab); +} + +static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + u32 i = key ? 
*(u32 *)key : U32_MAX; + u32 *next = (u32 *)next_key; + + if (i >= stab->map.max_entries) { + *next = 0; + return 0; + } + + if (i == stab->map.max_entries - 1) + return -ENOENT; + + *next = i + 1; + return 0; +} + +struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + + if (key >= map->max_entries) + return NULL; + + return READ_ONCE(stab->sock_map[key]); +} + +static int sock_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + int k = *(u32 *)key; + struct sock *sock; + + if (k >= map->max_entries) + return -EINVAL; + + sock = xchg(&stab->sock_map[k], NULL); + if (!sock) + return -EINVAL; + + smap_release_sock(sock); + return 0; +} + +/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are + * done inside rcu critical sections. This ensures on updates that the psock + * will not be released via smap_release_sock() until concurrent updates/deletes + * complete. All operations operate on sock_map using cmpxchg and xchg + * operations to ensure we do not get stale references. Any reads into the + * map must be done with READ_ONCE() because of this. + * + * A psock is destroyed via call_rcu and after any worker threads are cancelled + * and syncd so we are certain all references from the update/lookup/delete + * operations as well as references in the data path are no longer in use. + * + * A psock object holds a refcnt on the sockmap it is attached to and this is + * not decremented until after a RCU grace period and garbage collection occurs. + * This ensures the map is not free'd until psocks linked to it are removed. The + * map link is used when the independent sock events trigger map deletion. + * + * Psocks may only participate in one sockmap at a time. Users that try to + * join a single sock to multiple maps will get an error. + * + * Last, but not least, it is possible the socket is closed while running + * an update on an existing psock. This will release the psock, but again + * not until the update has completed due to rcu grace period rules. + */ +static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, + struct bpf_map *map, + void *key, u64 flags, u64 map_flags) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_prog *verdict, *parse; + struct smap_psock *psock = NULL; + struct sock *old_sock, *sock; + u32 i = *(u32 *)key; + bool update = false; + int err = 0; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + if (unlikely(i >= stab->map.max_entries)) + return -E2BIG; + + if (unlikely(map_flags > BPF_SOCKMAP_STRPARSER)) + return -EINVAL; + + verdict = parse = NULL; + sock = READ_ONCE(stab->sock_map[i]); + + if (flags == BPF_EXIST || flags == BPF_ANY) { + if (!sock && flags == BPF_EXIST) { + return -ENOENT; + } else if (sock && sock != skops->sk) { + return -EINVAL; + } else if (sock) { + psock = smap_psock_sk(sock); + if (unlikely(!psock)) + return -EBUSY; + update = true; + } + } else if (sock && BPF_NOEXIST) { + return -EEXIST; + } + + /* reserve BPF programs early so can abort easily on failures */ + if (map_flags & BPF_SOCKMAP_STRPARSER) { + verdict = READ_ONCE(stab->bpf_verdict); + parse = READ_ONCE(stab->bpf_parse); + + if (!verdict || !parse) + return -ENOENT; + + /* bpf prog refcnt may be zero if a concurrent attach operation + * removes the program after the above READ_ONCE() but before + * we increment the refcnt. 
If this is the case abort with an + * error. + */ + verdict = bpf_prog_inc_not_zero(stab->bpf_verdict); + if (IS_ERR(verdict)) + return PTR_ERR(verdict); + + parse = bpf_prog_inc_not_zero(stab->bpf_parse); + if (IS_ERR(parse)) { + bpf_prog_put(verdict); + return PTR_ERR(parse); + } + } + + if (!psock) { + sock = skops->sk; + if (rcu_dereference_sk_user_data(sock)) + return -EEXIST; + psock = smap_init_psock(sock, stab); + if (IS_ERR(psock)) { + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); + return PTR_ERR(psock); + } + psock->key = i; + psock->stab = stab; + refcount_inc(&stab->refcnt); + set_bit(SMAP_TX_RUNNING, &psock->state); + } + + if (map_flags & BPF_SOCKMAP_STRPARSER) { + write_lock_bh(&sock->sk_callback_lock); + if (psock->strp_enabled) + goto start_done; + err = smap_init_sock(psock, sock); + if (err) + goto out; + smap_init_progs(psock, stab, verdict, parse); + smap_start_sock(psock, sock); +start_done: + write_unlock_bh(&sock->sk_callback_lock); + } else if (update) { + smap_stop_sock(psock, sock); + } + + if (!update) { + old_sock = xchg(&stab->sock_map[i], skops->sk); + if (old_sock) + smap_release_sock(old_sock); + } + + return 0; +out: + write_unlock_bh(&sock->sk_callback_lock); + if (!update) + smap_release_sock(sock); + return err; +} + +static int sock_map_attach_prog(struct bpf_map *map, + struct bpf_prog *parse, + struct bpf_prog *verdict) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_prog *_parse, *_verdict; + + _parse = xchg(&stab->bpf_parse, parse); + _verdict = xchg(&stab->bpf_verdict, verdict); + + if (_parse) + bpf_prog_put(_parse); + if (_verdict) + bpf_prog_put(_verdict); + + return 0; +} + +static void *sock_map_lookup(struct bpf_map *map, void *key) +{ + return NULL; +} + +static int sock_map_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_sock_ops_kern skops; + u32 fd = *(u32 *)value; + struct socket *socket; + int err; + + socket = sockfd_lookup(fd, &err); + if (!socket) + return err; + + skops.sk = socket->sk; + if (!skops.sk) { + fput(socket->file); + return -EINVAL; + } + + err = sock_map_ctx_update_elem(&skops, map, key, + flags, BPF_SOCKMAP_STRPARSER); + fput(socket->file); + return err; +} + +const struct bpf_map_ops sock_map_ops = { + .map_alloc = sock_map_alloc, + .map_free = sock_map_free, + .map_lookup_elem = sock_map_lookup, + .map_get_next_key = sock_map_get_next_key, + .map_update_elem = sock_map_update_elem, + .map_delete_elem = sock_map_delete_elem, + .map_attach = sock_map_attach_prog, +}; + +BPF_CALL_5(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, void *, key, u64, flags, u64, map_flags) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return sock_map_ctx_update_elem(bpf_sock, map, key, flags, map_flags); +} + +const struct bpf_func_proto bpf_sock_map_update_proto = { + .func = bpf_sock_map_update, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_PTR_TO_MAP_KEY, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 17e29f596de1..d2f2bdf71ffa 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1087,7 +1087,50 @@ static int bpf_obj_get(const union bpf_attr *attr) #ifdef CONFIG_CGROUP_BPF -#define BPF_PROG_ATTACH_LAST_FIELD attach_flags +#define BPF_PROG_ATTACH_LAST_FIELD attach_bpf_fd2 + +static int 
sockmap_get_from_fd(const union bpf_attr *attr, int ptype) +{ + struct bpf_prog *prog1, *prog2; + int ufd = attr->target_fd; + struct bpf_map *map; + struct fd f; + int err; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (!map->ops->map_attach) { + fdput(f); + return -EOPNOTSUPP; + } + + prog1 = bpf_prog_get_type(attr->attach_bpf_fd, ptype); + if (IS_ERR(prog1)) { + fdput(f); + return PTR_ERR(prog1); + } + + prog2 = bpf_prog_get_type(attr->attach_bpf_fd2, ptype); + if (IS_ERR(prog2)) { + fdput(f); + bpf_prog_put(prog1); + return PTR_ERR(prog2); + } + + err = map->ops->map_attach(map, prog1, prog2); + if (err) { + fdput(f); + bpf_prog_put(prog1); + bpf_prog_put(prog2); + return PTR_ERR(map); + } + + fdput(f); + return err; +} static int bpf_prog_attach(const union bpf_attr *attr) { @@ -1116,10 +1159,16 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; + case BPF_CGROUP_SMAP_INGRESS: + ptype = BPF_PROG_TYPE_SK_SKB; + break; default: return -EINVAL; } + if (attr->attach_type == BPF_CGROUP_SMAP_INGRESS) + return sockmap_get_from_fd(attr, ptype); + prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7dd96d064be1..a71bc0996572 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1522,6 +1522,12 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) goto error; + case BPF_MAP_TYPE_SOCKMAP: + if (func_id != BPF_FUNC_sk_redirect_map && + func_id != BPF_FUNC_sock_map_update && + func_id != BPF_FUNC_map_delete_elem) + goto error; + break; default: break; } @@ -1550,6 +1556,14 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) if (map->map_type != BPF_MAP_TYPE_DEVMAP) goto error; break; + case BPF_FUNC_sk_redirect_map: + if (map->map_type != BPF_MAP_TYPE_SOCKMAP) + goto error; + break; + case BPF_FUNC_sock_map_update: + if (map->map_type != BPF_MAP_TYPE_SOCKMAP) + goto error; + break; default: break; } diff --git a/net/core/filter.c b/net/core/filter.c index 46321033ae0e..8e136578488c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1858,6 +1858,45 @@ static const struct bpf_func_proto bpf_redirect_map_proto = { .arg3_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_sk_redirect_map, struct bpf_map *, map, u32, key, u64, flags) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + + if (unlikely(flags)) + return SK_ABORTED; + + ri->ifindex = key; + ri->flags = flags; + ri->map = map; + + return SK_REDIRECT; +} + +struct sock *do_sk_redirect_map(void) +{ + struct redirect_info *ri = this_cpu_ptr(&redirect_info); + struct sock *sk = NULL; + + if (ri->map) { + sk = __sock_map_lookup_elem(ri->map, ri->ifindex); + + ri->ifindex = 0; + ri->map = NULL; + /* we do not clear flags for future lookup */ + } + + return sk; +} + +static const struct bpf_func_proto bpf_sk_redirect_map_proto = { + .func = bpf_sk_redirect_map, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -3229,6 +3268,8 @@ static const struct bpf_func_proto * switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_setsockopt_proto; + case BPF_FUNC_sock_map_update: + return 
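As an illustrative aside (not part of the patch): the new attach path might be driven from user space with the raw bpf(2) syscall roughly as below. This is a hedged, untested sketch; prog_parse_fd and prog_verdict_fd are assumed to be fds of already loaded SK_SKB programs, sock_fd an established TCP socket, most error handling is elided, and it assumes installed uapi headers that already carry attach_bpf_fd2 and BPF_CGROUP_SMAP_INGRESS.

/* sketch: create a sockmap, attach parse/verdict programs, insert a socket */
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

int sockmap_setup(int prog_parse_fd, int prog_verdict_fd, int sock_fd)
{
	union bpf_attr attr;
	int map_fd, key = 0;

	/* sockmap: 4 byte key (slot index), 4 byte value (socket fd) */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_SOCKMAP;
	attr.key_size = 4;
	attr.value_size = 4;
	attr.max_entries = 20;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return -1;

	/* per sockmap_get_from_fd() above: attach_bpf_fd carries the parse
	 * program, attach_bpf_fd2 the verdict program
	 */
	memset(&attr, 0, sizeof(attr));
	attr.target_fd = map_fd;
	attr.attach_bpf_fd = prog_parse_fd;
	attr.attach_bpf_fd2 = prog_verdict_fd;
	attr.attach_type = BPF_CGROUP_SMAP_INGRESS;
	if (sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)))
		return -1;

	/* the update resolves the fd to a struct sock and starts strparser */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (unsigned long long)(unsigned long)&key;
	attr.value = (unsigned long long)(unsigned long)&sock_fd;
	attr.flags = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
		return -1;

	return map_fd;
}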
&bpf_sock_map_update_proto; default: return bpf_base_func_proto(func_id); } @@ -3243,6 +3284,8 @@ static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: return &bpf_get_socket_uid_proto; + case BPF_FUNC_sk_redirect_map: + return &bpf_sk_redirect_map_proto; default: return bpf_base_func_proto(func_id); } -- cgit v1.2.3 From 8a31db5615667956c513d205cfb06885c3ec6d0b Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 15 Aug 2017 22:33:09 -0700 Subject: bpf: add access to sock fields and pkt data from sk_skb programs Signed-off-by: John Fastabend Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 9 +++ kernel/bpf/verifier.c | 1 + net/core/filter.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 7f774769e3f5..5ecbe812a2cc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -712,6 +712,15 @@ struct __sk_buff { __u32 data; __u32 data_end; __u32 napi_id; + + /* accessed by BPF_PROG_TYPE_sk_skb types */ + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ }; struct bpf_tunnel_key { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a71bc0996572..958ba84a9995 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -886,6 +886,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_SCHED_ACT: case BPF_PROG_TYPE_XDP: case BPF_PROG_TYPE_LWT_XMIT: + case BPF_PROG_TYPE_SK_SKB: if (meta) return meta->pkt_access; diff --git a/net/core/filter.c b/net/core/filter.c index 8e136578488c..e9f8dcef6c57 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3278,8 +3278,16 @@ static const struct bpf_func_proto * static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) { switch (func_id) { + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_skb_pull_data: + return &bpf_skb_pull_data_proto; + case BPF_FUNC_skb_change_tail: + return &bpf_skb_change_tail_proto; + case BPF_FUNC_skb_change_head: + return &bpf_skb_change_head_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; case BPF_FUNC_get_socket_uid: @@ -3343,6 +3351,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (off + size > offsetofend(struct __sk_buff, cb[4])) return false; break; + case bpf_ctx_range_till(struct __sk_buff, remote_ip6[0], remote_ip6[3]): + case bpf_ctx_range_till(struct __sk_buff, local_ip6[0], local_ip6[3]): + case bpf_ctx_range_till(struct __sk_buff, remote_ip4, remote_ip4): + case bpf_ctx_range_till(struct __sk_buff, local_ip4, local_ip4): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): if (size != size_default) @@ -3371,6 +3383,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_end): + case bpf_ctx_range_till(struct __sk_buff, family, 
local_port): return false; } @@ -3392,6 +3405,7 @@ static bool lwt_is_valid_access(int off, int size, { switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -3505,6 +3519,8 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; + case bpf_ctx_range_till(struct __sk_buff, family, local_port): + return false; } return bpf_skb_is_valid_access(off, size, type, info); @@ -3582,11 +3598,63 @@ static bool sock_ops_is_valid_access(int off, int size, return __is_valid_sock_ops_access(off, size); } +static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + struct bpf_insn *insn = insn_buf; + + if (!direct_write) + return 0; + + /* if (!skb->cloned) + * goto start; + * + * (Fast-path, otherwise approximation that we might be + * a clone, do the rest in helper.) + */ + *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_6, BPF_REG_1, CLONED_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, BPF_REG_6, CLONED_MASK); + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_6, 0, 7); + + /* ret = bpf_skb_pull_data(skb, 0); */ + *insn++ = BPF_MOV64_REG(BPF_REG_6, BPF_REG_1); + *insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_2, BPF_REG_2); + *insn++ = BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, + BPF_FUNC_skb_pull_data); + /* if (!ret) + * goto restore; + * return SK_DROP; + */ + *insn++ = BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2); + *insn++ = BPF_ALU32_IMM(BPF_MOV, BPF_REG_0, SK_DROP); + *insn++ = BPF_EXIT_INSN(); + + /* restore: */ + *insn++ = BPF_MOV64_REG(BPF_REG_1, BPF_REG_6); + /* start: */ + *insn++ = prog->insnsi[0]; + + return insn - insn_buf; +} + static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + if (type == BPF_WRITE) { + switch (off) { + case bpf_ctx_range(struct __sk_buff, mark): + case bpf_ctx_range(struct __sk_buff, tc_index): + case bpf_ctx_range(struct __sk_buff, priority): + break; + default: + return false; + } + } + switch (off) { + case bpf_ctx_range(struct __sk_buff, tc_classid): + return false; case bpf_ctx_range(struct __sk_buff, data): info->reg_type = PTR_TO_PACKET; break; @@ -3783,6 +3851,106 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; + case offsetof(struct __sk_buff, family): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_family) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_family, + 2, target_size)); + break; + case offsetof(struct __sk_buff, remote_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_daddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_daddr, + 4, target_size)); + break; + case offsetof(struct __sk_buff, local_ip4): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr) != 4); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_rcv_saddr, + 4, target_size)); + break; + 
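As an illustrative aside (not part of the patch, and untested): an SK_SKB program can now read the socket tuple directly from its context. A hedged sketch of a verdict program keying its redirect decision off the new fields; per the header comments above, remote_ip4/remote_port are stored in network byte order while local_port is host byte order. Boilerplate is declared inline as an assumption and section naming is loader-specific.

#include <linux/bpf.h>

#define SEC(name) __attribute__((section(name), used))

struct bpf_map_def {
	unsigned int type, key_size, value_size, max_entries, map_flags;
};

struct bpf_map_def SEC("maps") sock_map = {
	.type		= BPF_MAP_TYPE_SOCKMAP,
	.key_size	= sizeof(int),
	.value_size	= sizeof(int),
	.max_entries	= 2,
};

static int (*bpf_sk_redirect_map)(void *map, int key, int flags) =
	(void *) BPF_FUNC_sk_redirect_map;

SEC("sk_skb_verdict")
int verdict_by_tuple(struct __sk_buff *skb)
{
	/* local_port is host byte order; remote_ip4 is network byte order,
	 * but taking its low bit still gives a deterministic two-way spread
	 */
	if (skb->family != 2 /* AF_INET */ || skb->local_port != 80)
		return SK_DROP;

	return bpf_sk_redirect_map(&sock_map, skb->remote_ip4 & 1, 0);
}

char _license[] SEC("license") = "GPL";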
case offsetof(struct __sk_buff, remote_ip6[0]) ... + offsetof(struct __sk_buff, remote_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, remote_ip6[0]); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_daddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + case offsetof(struct __sk_buff, local_ip6[0]) ... + offsetof(struct __sk_buff, local_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) != 4); + + off = si->off; + off -= offsetof(struct __sk_buff, local_ip6[0]); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]) + + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct __sk_buff, remote_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_dport) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_dport, + 2, target_size)); +#ifndef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_LSH, si->dst_reg, 16); +#endif + break; + + case offsetof(struct __sk_buff, local_port): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_num) != 2); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, + bpf_target_off(struct sock_common, + skc_num, 2, target_size)); + break; } return insn - insn_buf; @@ -4071,6 +4239,7 @@ const struct bpf_verifier_ops sk_skb_prog_ops = { .get_func_proto = sk_skb_func_proto, .is_valid_access = sk_skb_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, + .gen_prologue = sk_skb_prologue, }; int sk_detach_filter(struct sock *sk) -- cgit v1.2.3 From 0888e372c37fa31882c8ed89fb2f8188b08b6718 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Thu, 17 Aug 2017 00:35:11 +0000 Subject: net: inet: diag: expose sockets cgroup classid This is useful for directly looking up a task based on class id rather than having to scan through all open file descriptors. Signed-off-by: Sasha Levin Signed-off-by: David S. 
Miller --- include/uapi/linux/inet_diag.h | 1 + net/ipv4/inet_diag.c | 11 +++++++++++ 2 files changed, 12 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index bbe201047df6..678496897a68 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -142,6 +142,7 @@ enum { INET_DIAG_PAD, INET_DIAG_MARK, INET_DIAG_BBRINFO, + INET_DIAG_CLASS_ID, __INET_DIAG_MAX, }; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 3828b3a805cd..67325d5832d7 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -274,6 +274,17 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, goto errout; } + if (ext & (1 << (INET_DIAG_CLASS_ID - 1))) { + u32 classid = 0; + +#ifdef CONFIG_SOCK_CGROUP_DATA + classid = sock_cgroup_classid(&sk->sk_cgrp_data); +#endif + + if (nla_put_u32(skb, INET_DIAG_CLASS_ID, classid)) + goto errout; + } + out: nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From 99d1712bc41c7c9a5a473c104a4ad15427757b22 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Aug 2017 15:15:29 +0200 Subject: netfilter: exthdr: tcp option set support This allows setting 2 and 4 byte quantities in the tcp option space. The main purpose is to allow a native replacement for xt_TCPMSS to work around pmtu blackholes. Writes to kind and len are not allowed at the moment; it does not seem useful to do this as it causes corruption of the tcp option space. We can always lift this restriction later if a use-case appears. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 +- net/netfilter/nft_exthdr.c | 164 ++++++++++++++++++++++++++++++- 2 files changed, 165 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index be25cf69295b..40fd199f7531 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -732,7 +732,8 @@ enum nft_exthdr_op { * @NFTA_EXTHDR_OFFSET: extension header offset (NLA_U32) * @NFTA_EXTHDR_LEN: extension header length (NLA_U32) * @NFTA_EXTHDR_FLAGS: extension header flags (NLA_U32) - * @NFTA_EXTHDR_OP: option match type (NLA_U8) + * @NFTA_EXTHDR_OP: option match type (NLA_U32) + * @NFTA_EXTHDR_SREG: option match type (NLA_U32) */ enum nft_exthdr_attributes { NFTA_EXTHDR_UNSPEC, @@ -742,6 +743,7 @@ enum nft_exthdr_attributes { NFTA_EXTHDR_LEN, NFTA_EXTHDR_FLAGS, NFTA_EXTHDR_OP, + NFTA_EXTHDR_SREG, __NFTA_EXTHDR_MAX }; #define NFTA_EXTHDR_MAX (__NFTA_EXTHDR_MAX - 1) diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index e3a6eebe7e0c..f5a0bf5e3bdd 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -8,6 +8,7 @@ * Development of this code funded by Astaro AG (http://www.astaro.com/) */ +#include #include #include #include @@ -23,6 +24,7 @@ struct nft_exthdr { u8 len; u8 op; enum nft_registers dreg:8; + enum nft_registers sreg:8; u8 flags; }; @@ -124,6 +126,88 @@ err: regs->verdict.code = NFT_BREAK; } +static void nft_exthdr_tcp_set_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + u8 buff[sizeof(struct tcphdr) + MAX_TCP_OPTION_SPACE]; + struct nft_exthdr *priv = nft_expr_priv(expr); + unsigned int i, optl, tcphdr_len, offset; + struct tcphdr *tcph; + u8 *opt; + u32 src; + + tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, &tcphdr_len); + if (!tcph) + return;
+ + opt = (u8 *)tcph; + for (i = sizeof(*tcph); i < tcphdr_len - 1; i += optl) { + union { + u8 octet; + __be16 v16; + __be32 v32; + } old, new; + + optl = optlen(opt, i); + + if (priv->type != opt[i]) + continue; + + if (i + optl > tcphdr_len || priv->len + priv->offset > optl) + return; + + if (!skb_make_writable(pkt->skb, pkt->xt.thoff + i + priv->len)) + return; + + tcph = nft_tcp_header_pointer(pkt, sizeof(buff), buff, + &tcphdr_len); + if (!tcph) + return; + + src = regs->data[priv->sreg]; + offset = i + priv->offset; + + switch (priv->len) { + case 2: + old.v16 = get_unaligned((u16 *)(opt + offset)); + new.v16 = src; + + switch (priv->type) { + case TCPOPT_MSS: + /* increase can cause connection to stall */ + if (ntohs(old.v16) <= ntohs(new.v16)) + return; + break; + } + + if (old.v16 == new.v16) + return; + + put_unaligned(new.v16, (u16*)(opt + offset)); + inet_proto_csum_replace2(&tcph->check, pkt->skb, + old.v16, new.v16, false); + break; + case 4: + new.v32 = src; + old.v32 = get_unaligned((u32 *)(opt + offset)); + + if (old.v32 == new.v32) + return; + + put_unaligned(new.v32, (u32*)(opt + offset)); + inet_proto_csum_replace4(&tcph->check, pkt->skb, + old.v32, new.v32, false); + break; + default: + WARN_ON_ONCE(1); + break; + } + + return; + } +} + static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { [NFTA_EXTHDR_DREG] = { .type = NLA_U32 }, [NFTA_EXTHDR_TYPE] = { .type = NLA_U8 }, @@ -180,6 +264,55 @@ static int nft_exthdr_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, priv->len); } +static int nft_exthdr_tcp_set_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_exthdr *priv = nft_expr_priv(expr); + u32 offset, len, flags = 0, op = NFT_EXTHDR_OP_IPV6; + int err; + + if (!tb[NFTA_EXTHDR_SREG] || + !tb[NFTA_EXTHDR_TYPE] || + !tb[NFTA_EXTHDR_OFFSET] || + !tb[NFTA_EXTHDR_LEN]) + return -EINVAL; + + if (tb[NFTA_EXTHDR_DREG] || tb[NFTA_EXTHDR_FLAGS]) + return -EINVAL; + + err = nft_parse_u32_check(tb[NFTA_EXTHDR_OFFSET], U8_MAX, &offset); + if (err < 0) + return err; + + err = nft_parse_u32_check(tb[NFTA_EXTHDR_LEN], U8_MAX, &len); + if (err < 0) + return err; + + if (offset < 2) + return -EOPNOTSUPP; + + switch (len) { + case 2: break; + case 4: break; + default: + return -EOPNOTSUPP; + } + + err = nft_parse_u32_check(tb[NFTA_EXTHDR_OP], U8_MAX, &op); + if (err < 0) + return err; + + priv->type = nla_get_u8(tb[NFTA_EXTHDR_TYPE]); + priv->offset = offset; + priv->len = len; + priv->sreg = nft_parse_register(tb[NFTA_EXTHDR_SREG]); + priv->flags = flags; + priv->op = op; + + return nft_validate_register_load(priv->sreg, priv->len); +} + static int nft_exthdr_dump_common(struct sk_buff *skb, const struct nft_exthdr *priv) { if (nla_put_u8(skb, NFTA_EXTHDR_TYPE, priv->type)) @@ -208,6 +341,16 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) return nft_exthdr_dump_common(skb, priv); } +static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_exthdr *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_EXTHDR_SREG, priv->sreg)) + return -1; + + return nft_exthdr_dump_common(skb, priv); +} + static struct nft_expr_type nft_exthdr_type; static const struct nft_expr_ops nft_exthdr_ipv6_ops = { .type = &nft_exthdr_type, @@ -225,6 +368,14 @@ static const struct nft_expr_ops nft_exthdr_tcp_ops = { .dump = nft_exthdr_dump, }; +static const struct nft_expr_ops nft_exthdr_tcp_set_ops = { + .type = &nft_exthdr_type, + .size = 
NFT_EXPR_SIZE(sizeof(struct nft_exthdr)), + .eval = nft_exthdr_tcp_set_eval, + .init = nft_exthdr_tcp_set_init, + .dump = nft_exthdr_dump_set, +}; + static const struct nft_expr_ops * nft_exthdr_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -234,12 +385,21 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, if (!tb[NFTA_EXTHDR_OP]) return &nft_exthdr_ipv6_ops; + if (tb[NFTA_EXTHDR_SREG] && tb[NFTA_EXTHDR_DREG]) + return ERR_PTR(-EOPNOTSUPP); + op = ntohl(nla_get_u32(tb[NFTA_EXTHDR_OP])); switch (op) { case NFT_EXTHDR_OP_TCPOPT: - return &nft_exthdr_tcp_ops; + if (tb[NFTA_EXTHDR_SREG]) + return &nft_exthdr_tcp_set_ops; + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_tcp_ops; + break; case NFT_EXTHDR_OP_IPV6: - return &nft_exthdr_ipv6_ops; + if (tb[NFTA_EXTHDR_DREG]) + return &nft_exthdr_ipv6_ops; + break; } return ERR_PTR(-EOPNOTSUPP); -- cgit v1.2.3 From 6b5dc98e8fac041a3decfc3186e08c1c570ea691 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 8 Aug 2017 15:48:04 +0200 Subject: netfilter: rt: add support to fetch path mss to be used in combination with tcp option set support to mimic iptables TCPMSS --clamp-mss-to-pmtu. v2: Eric Dumazet points out dst must be initialized. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 + net/netfilter/nft_rt.c | 66 ++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 40fd199f7531..b49da72efa68 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -811,11 +811,13 @@ enum nft_meta_keys { * @NFT_RT_CLASSID: realm value of packet's route (skb->dst->tclassid) * @NFT_RT_NEXTHOP4: routing nexthop for IPv4 * @NFT_RT_NEXTHOP6: routing nexthop for IPv6 + * @NFT_RT_TCPMSS: fetch current path tcp mss */ enum nft_rt_keys { NFT_RT_CLASSID, NFT_RT_NEXTHOP4, NFT_RT_NEXTHOP6, + NFT_RT_TCPMSS, }; /** diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index c7383d8f88d0..e142e65d3176 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -23,6 +23,42 @@ struct nft_rt { enum nft_registers dreg:8; }; +static u16 get_tcpmss(const struct nft_pktinfo *pkt, const struct dst_entry *skbdst) +{ + u32 minlen = sizeof(struct ipv6hdr), mtu = dst_mtu(skbdst); + const struct sk_buff *skb = pkt->skb; + const struct nf_afinfo *ai; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + + switch (nft_pf(pkt)) { + case NFPROTO_IPV4: + fl.u.ip4.daddr = ip_hdr(skb)->saddr; + minlen = sizeof(struct iphdr); + break; + case NFPROTO_IPV6: + fl.u.ip6.daddr = ipv6_hdr(skb)->saddr; + break; + } + + ai = nf_get_afinfo(nft_pf(pkt)); + if (ai) { + struct dst_entry *dst = NULL; + + ai->route(nft_net(pkt), &dst, &fl, false); + if (dst) { + mtu = min(mtu, dst_mtu(dst)); + dst_release(dst); + } + } + + if (mtu <= minlen || mtu > 0xffff) + return TCP_MSS_DEFAULT; + + return mtu - minlen; +} + static void nft_rt_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -57,6 +93,9 @@ static void nft_rt_get_eval(const struct nft_expr *expr, &ipv6_hdr(skb)->daddr), sizeof(struct in6_addr)); break; + case NFT_RT_TCPMSS: + nft_reg_store16(dest, get_tcpmss(pkt, dst)); + break; default: WARN_ON(1); goto err; @@ -94,6 +133,9 @@ static int nft_rt_get_init(const struct nft_ctx *ctx, case NFT_RT_NEXTHOP6: len = sizeof(struct in6_addr); break; + case NFT_RT_TCPMSS: 
+ len = sizeof(u16); + break; default: return -EOPNOTSUPP; } @@ -118,6 +160,29 @@ nla_put_failure: return -1; } +static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data) +{ + const struct nft_rt *priv = nft_expr_priv(expr); + unsigned int hooks; + + switch (priv->key) { + case NFT_RT_NEXTHOP4: + case NFT_RT_NEXTHOP6: + case NFT_RT_CLASSID: + return 0; + case NFT_RT_TCPMSS: + hooks = (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING); + break; + default: + return -EINVAL; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +} + static struct nft_expr_type nft_rt_type; static const struct nft_expr_ops nft_rt_get_ops = { .type = &nft_rt_type, @@ -125,6 +190,7 @@ static const struct nft_expr_ops nft_rt_get_ops = { .eval = nft_rt_get_eval, .init = nft_rt_get_init, .dump = nft_rt_get_dump, + .validate = nft_rt_validate, }; static struct nft_expr_type nft_rt_type __read_mostly = { -- cgit v1.2.3 From 96eabe7a40aa17e613cf3db2c742ee8b1fc764d0 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Fri, 18 Aug 2017 11:28:00 -0700 Subject: bpf: Allow selecting numa node during map creation The current map creation API does not allow to provide the numa-node preference. The memory usually comes from where the map-creation-process is running. The performance is not ideal if the bpf_prog is known to always run in a numa node different from the map-creation-process. One of the use case is sharding on CPU to different LRU maps (i.e. an array of LRU maps). Here is the test result of map_perf_test on the INNER_LRU_HASH_PREALLOC test if we force the lru map used by CPU0 to be allocated from a remote numa node: [ The machine has 20 cores. CPU0-9 at node 0. CPU10-19 at node 1 ] ># taskset -c 10 ./map_perf_test 512 8 1260000 8000000 5:inner_lru_hash_map_perf pre-alloc 1628380 events per sec 4:inner_lru_hash_map_perf pre-alloc 1626396 events per sec 3:inner_lru_hash_map_perf pre-alloc 1626144 events per sec 6:inner_lru_hash_map_perf pre-alloc 1621657 events per sec 2:inner_lru_hash_map_perf pre-alloc 1621534 events per sec 1:inner_lru_hash_map_perf pre-alloc 1620292 events per sec 7:inner_lru_hash_map_perf pre-alloc 1613305 events per sec 0:inner_lru_hash_map_perf pre-alloc 1239150 events per sec #<<< After specifying numa node: ># taskset -c 10 ./map_perf_test 512 8 1260000 8000000 5:inner_lru_hash_map_perf pre-alloc 1629627 events per sec 3:inner_lru_hash_map_perf pre-alloc 1628057 events per sec 1:inner_lru_hash_map_perf pre-alloc 1623054 events per sec 6:inner_lru_hash_map_perf pre-alloc 1616033 events per sec 2:inner_lru_hash_map_perf pre-alloc 1614630 events per sec 4:inner_lru_hash_map_perf pre-alloc 1612651 events per sec 7:inner_lru_hash_map_perf pre-alloc 1609337 events per sec 0:inner_lru_hash_map_perf pre-alloc 1619340 events per sec #<<< This patch adds one field, numa_node, to the bpf_attr. Since numa node 0 is a valid node, a new flag BPF_F_NUMA_NODE is also added. The numa_node field is honored if and only if the BPF_F_NUMA_NODE flag is set. Numa node selection is not supported for percpu map. This patch does not change all the kmalloc. F.e. 'htab = kzalloc()' is not changed since the object is small enough to stay in the cache. Signed-off-by: Martin KaFai Lau Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 10 +++++++++- include/uapi/linux/bpf.h | 10 +++++++++- kernel/bpf/arraymap.c | 7 +++++-- kernel/bpf/devmap.c | 9 ++++++--- kernel/bpf/hashtab.c | 19 +++++++++++++++---- kernel/bpf/lpm_trie.c | 9 +++++++-- kernel/bpf/sockmap.c | 10 +++++++--- kernel/bpf/stackmap.c | 8 +++++--- kernel/bpf/syscall.c | 14 ++++++++++---- 9 files changed, 73 insertions(+), 23 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1cc6c5ff61ec..55b88e329804 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -51,6 +51,7 @@ struct bpf_map { u32 map_flags; u32 pages; u32 id; + int numa_node; struct user_struct *user; const struct bpf_map_ops *ops; struct work_struct work; @@ -264,7 +265,7 @@ struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); -void *bpf_map_area_alloc(size_t size); +void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); extern int sysctl_unprivileged_bpf_disabled; @@ -316,6 +317,13 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); void __dev_map_insert_ctx(struct bpf_map *map, u32 index); void __dev_map_flush(struct bpf_map *map); +/* Return map's numa specified by userspace */ +static inline int bpf_map_attr_numa_node(const union bpf_attr *attr) +{ + return (attr->map_flags & BPF_F_NUMA_NODE) ? + attr->numa_node : NUMA_NO_NODE; +} + #else static inline struct bpf_prog *bpf_prog_get(u32 ufd) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5ecbe812a2cc..843818dff96d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_attach_type { #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +/* flags for BPF_MAP_CREATE command */ #define BPF_F_NO_PREALLOC (1U << 0) /* Instead of having one common LRU list in the * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list @@ -173,6 +174,8 @@ enum bpf_attach_type { * across different LRU lists. */ #define BPF_F_NO_COMMON_LRU (1U << 1) +/* Specify numa node during map creation */ +#define BPF_F_NUMA_NODE (1U << 2) union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ @@ -180,8 +183,13 @@ union bpf_attr { __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ - __u32 map_flags; /* prealloc or not */ + __u32 map_flags; /* BPF_MAP_CREATE related + * flags defined above. + */ __u32 inner_map_fd; /* fd pointing to the inner map */ + __u32 numa_node; /* numa node (effective only if + * BPF_F_NUMA_NODE is set). 
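As an illustrative aside (not part of the patch): numa_node is only honored when BPF_F_NUMA_NODE is set in map_flags, as the comment above states. A minimal, untested user space sketch; the map type, sizes and node number are arbitrary and error handling is elided.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* create an LRU hash whose elements should be allocated on NUMA node 1 */
int create_lru_on_node1(void)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type	 = BPF_MAP_TYPE_LRU_HASH;
	attr.key_size	 = 4;
	attr.value_size	 = 8;
	attr.max_entries = 10000;
	attr.map_flags	 = BPF_F_NUMA_NODE;	/* without this flag numa_node is ignored */
	attr.numa_node	 = 1;

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}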
+ */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index d771a3872500..96e9c5c1dfc9 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -49,13 +49,15 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + int numa_node = bpf_map_attr_numa_node(attr); struct bpf_array *array; u64 array_size; u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0 || attr->map_flags) + attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE || + (percpu && numa_node != NUMA_NO_NODE)) return ERR_PTR(-EINVAL); if (attr->value_size > KMALLOC_MAX_SIZE) @@ -77,7 +79,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); /* allocate all map elements and zero-initialize them */ - array = bpf_map_area_alloc(array_size); + array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); @@ -87,6 +89,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; array->map.map_flags = attr->map_flags; + array->map.numa_node = numa_node; array->elem_size = elem_size; if (!percpu) diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 18a72a8add43..67f4f00ce33a 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -80,7 +80,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags) + attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); dtab = kzalloc(sizeof(*dtab), GFP_USER); @@ -93,6 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) dtab->map.value_size = attr->value_size; dtab->map.max_entries = attr->max_entries; dtab->map.map_flags = attr->map_flags; + dtab->map.numa_node = bpf_map_attr_numa_node(attr); err = -ENOMEM; @@ -119,7 +120,8 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) goto free_dtab; dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries * - sizeof(struct bpf_dtab_netdev *)); + sizeof(struct bpf_dtab_netdev *), + dtab->map.numa_node); if (!dtab->netdev_map) goto free_dtab; @@ -344,7 +346,8 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value, if (!ifindex) { dev = NULL; } else { - dev = kmalloc(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN); + dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, + map->numa_node); if (!dev) return -ENOMEM; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4fb463172aa8..47ae748c3a49 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -18,6 +18,9 @@ #include "bpf_lru_list.h" #include "map_in_map.h" +#define HTAB_CREATE_FLAG_MASK \ + (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE) + struct bucket { struct hlist_nulls_head head; raw_spinlock_t lock; @@ -138,7 +141,8 @@ static int prealloc_init(struct bpf_htab *htab) if (!htab_is_percpu(htab) && !htab_is_lru(htab)) num_entries += num_possible_cpus(); - htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries); + htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries, + htab->map.numa_node); if (!htab->elems) return -ENOMEM; @@ -233,6 +237,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr 
*attr) */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + int numa_node = bpf_map_attr_numa_node(attr); struct bpf_htab *htab; int err, i; u64 cost; @@ -248,7 +253,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ return ERR_PTR(-EPERM); - if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU)) + if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) /* reserved bits should not be used */ return ERR_PTR(-EINVAL); @@ -258,6 +263,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (lru && !prealloc) return ERR_PTR(-ENOTSUPP); + if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) + return ERR_PTR(-EINVAL); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -268,6 +276,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; htab->map.map_flags = attr->map_flags; + htab->map.numa_node = numa_node; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set @@ -346,7 +355,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) err = -ENOMEM; htab->buckets = bpf_map_area_alloc(htab->n_buckets * - sizeof(struct bucket)); + sizeof(struct bucket), + htab->map.numa_node); if (!htab->buckets) goto free_htab; @@ -689,7 +699,8 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, atomic_dec(&htab->count); return ERR_PTR(-E2BIG); } - l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); if (!l_new) return ERR_PTR(-ENOMEM); } diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index b09185f0f17d..1b767844a76f 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -244,7 +244,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, if (value) size += trie->map.value_size; - node = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN); + node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN, + trie->map.numa_node); if (!node) return NULL; @@ -405,6 +406,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key) #define LPM_KEY_SIZE_MAX LPM_KEY_SIZE(LPM_DATA_SIZE_MAX) #define LPM_KEY_SIZE_MIN LPM_KEY_SIZE(LPM_DATA_SIZE_MIN) +#define LPM_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE) + static struct bpf_map *trie_alloc(union bpf_attr *attr) { struct lpm_trie *trie; @@ -416,7 +419,8 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || - attr->map_flags != BPF_F_NO_PREALLOC || + !(attr->map_flags & BPF_F_NO_PREALLOC) || + attr->map_flags & ~LPM_CREATE_FLAG_MASK || attr->key_size < LPM_KEY_SIZE_MIN || attr->key_size > LPM_KEY_SIZE_MAX || attr->value_size < LPM_VAL_SIZE_MIN || @@ -433,6 +437,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) trie->map.value_size = attr->value_size; trie->map.max_entries = attr->max_entries; trie->map.map_flags = attr->map_flags; + trie->map.numa_node = bpf_map_attr_numa_node(attr); trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 39de541fbcdc..78b2bb9370ac 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -443,7 +443,9 @@ static struct smap_psock *smap_init_psock(struct sock *sock, { struct smap_psock *psock; - psock = 
kzalloc(sizeof(struct smap_psock), GFP_ATOMIC | __GFP_NOWARN); + psock = kzalloc_node(sizeof(struct smap_psock), + GFP_ATOMIC | __GFP_NOWARN, + stab->map.numa_node); if (!psock) return ERR_PTR(-ENOMEM); @@ -465,7 +467,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size != 4 || attr->map_flags) + attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); if (attr->value_size > KMALLOC_MAX_SIZE) @@ -481,6 +483,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) stab->map.value_size = attr->value_size; stab->map.max_entries = attr->max_entries; stab->map.map_flags = attr->map_flags; + stab->map.numa_node = bpf_map_attr_numa_node(attr); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); @@ -495,7 +498,8 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) goto free_stab; stab->sock_map = bpf_map_area_alloc(stab->map.max_entries * - sizeof(struct sock *)); + sizeof(struct sock *), + stab->map.numa_node); if (!stab->sock_map) goto free_stab; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 31147d730abf..135be433e9a0 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -31,7 +31,8 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; int err; - smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries); + smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries, + smap->map.numa_node); if (!smap->elems) return -ENOMEM; @@ -59,7 +60,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (attr->map_flags) + if (attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); /* check sanity of attributes */ @@ -75,7 +76,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) return ERR_PTR(-E2BIG); - smap = bpf_map_area_alloc(cost); + smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); if (!smap) return ERR_PTR(-ENOMEM); @@ -91,6 +92,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + smap->map.numa_node = bpf_map_attr_numa_node(attr); err = bpf_map_precharge_memlock(smap->map.pages); if (err) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b8cb1b3c9bfb..9378f3ba2cbf 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -105,7 +105,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -void *bpf_map_area_alloc(size_t size) +void *bpf_map_area_alloc(size_t size, int numa_node) { /* We definitely need __GFP_NORETRY, so OOM killer doesn't * trigger under memory pressure as we really just want to @@ -115,12 +115,13 @@ void *bpf_map_area_alloc(size_t size) void *area; if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - area = kmalloc(size, GFP_USER | flags); + area = kmalloc_node(size, GFP_USER | flags, numa_node); if (area != NULL) return area; } - return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL); + return __vmalloc_node_flags_caller(size, numa_node, GFP_KERNEL | flags, + __builtin_return_address(0)); } void bpf_map_area_free(void *area) @@ -309,10 +310,11 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, 
CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd +#define BPF_MAP_CREATE_LAST_FIELD numa_node /* called via syscall */ static int map_create(union bpf_attr *attr) { + int numa_node = bpf_map_attr_numa_node(attr); struct bpf_map *map; int err; @@ -320,6 +322,10 @@ static int map_create(union bpf_attr *attr) if (err) return -EINVAL; + if (numa_node != NUMA_NO_NODE && + (numa_node >= nr_node_ids || !node_online(numa_node))) + return -EINVAL; + /* find map type and init map: hashtable vs rbtree vs bloom vs ... */ map = find_and_alloc_map(attr); if (IS_ERR(map)) -- cgit v1.2.3 From 84e54fe0a5eaed696dee4019c396f8396f5a908b Mon Sep 17 00:00:00 2001 From: William Tu Date: Tue, 22 Aug 2017 09:40:28 -0700 Subject: gre: introduce native tunnel support for ERSPAN The patch adds ERSPAN type II tunnel support. The implementation is based on the draft at [1]. One of the purposes is for Linux box to be able to receive ERSPAN monitoring traffic sent from the Cisco switch, by creating a ERSPAN tunnel device. In addition, the patch also adds ERSPAN TX, so Linux virtual switch can redirect monitored traffic to the ERSPAN tunnel device. The traffic will be encapsulated into ERSPAN and sent out. The implementation reuses tunnel key as ERSPAN session ID, and field 'erspan' as ERSPAN Index fields: ./ip link add dev ers11 type erspan seq key 100 erspan 123 \ local 172.16.1.200 remote 172.16.1.100 To use the above device as ERSPAN receiver, configure Nexus 5000 switch as below: monitor session 100 type erspan-source erspan-id 123 vrf default destination ip 172.16.1.200 source interface Ethernet1/11 both source interface Ethernet1/12 both no shut monitor erspan origin ip-address 172.16.1.100 global [1] https://tools.ietf.org/html/draft-foschiano-erspan-01 [2] iproute2 patch: http://marc.info/?l=linux-netdev&m=150306086924951&w=2 [3] test script: http://marc.info/?l=linux-netdev&m=150231021807304&w=2 Signed-off-by: William Tu Signed-off-by: Meenakshi Vohra Cc: Alexey Kuznetsov Cc: Hideaki YOSHIFUJI Signed-off-by: David S. Miller --- include/net/erspan.h | 61 ++++++++++ include/net/ip_tunnels.h | 3 + include/uapi/linux/if_ether.h | 1 + include/uapi/linux/if_tunnel.h | 1 + net/ipv4/ip_gre.c | 269 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 335 insertions(+) create mode 100644 include/net/erspan.h (limited to 'include/uapi/linux') diff --git a/include/net/erspan.h b/include/net/erspan.h new file mode 100644 index 000000000000..ca94fc86865e --- /dev/null +++ b/include/net/erspan.h @@ -0,0 +1,61 @@ +#ifndef __LINUX_ERSPAN_H +#define __LINUX_ERSPAN_H + +/* + * GRE header for ERSPAN encapsulation (8 octets [34:41]) -- 8 bytes + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * |0|0|0|1|0|00000|000000000|00000| Protocol Type for ERSPAN | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Sequence Number (increments per packet per session) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Note that in the above GRE header [RFC1701] out of the C, R, K, S, + * s, Recur, Flags, Version fields only S (bit 03) is set to 1. The + * other fields are set to zero, so only a sequence number follows. 
+ * + * ERSPAN Type II header (8 octets [42:49]) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Ver | VLAN | COS | En|T| Session ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Reserved | Index | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * GRE proto ERSPAN type II = 0x88BE, type III = 0x22EB + */ + +#define ERSPAN_VERSION 0x1 + +#define VER_MASK 0xf000 +#define VLAN_MASK 0x0fff +#define COS_MASK 0xe000 +#define EN_MASK 0x1800 +#define T_MASK 0x0400 +#define ID_MASK 0x03ff +#define INDEX_MASK 0xfffff + +enum erspan_encap_type { + ERSPAN_ENCAP_NOVLAN = 0x0, /* originally without VLAN tag */ + ERSPAN_ENCAP_ISL = 0x1, /* originally ISL encapsulated */ + ERSPAN_ENCAP_8021Q = 0x2, /* originally 802.1Q encapsulated */ + ERSPAN_ENCAP_INFRAME = 0x3, /* VLAN tag perserved in frame */ +}; + +struct erspan_metadata { + __be32 index; /* type II */ +}; + +struct erspanhdr { + __be16 ver_vlan; +#define VER_OFFSET 12 + __be16 session_id; +#define COS_OFFSET 13 +#define EN_OFFSET 11 +#define T_OFFSET 10 + struct erspan_metadata md; +}; + +#endif diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 520809912f03..625c29329372 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -115,6 +115,9 @@ struct ip_tunnel { u32 o_seqno; /* The last output seqno */ int tun_hlen; /* Precalculated header length */ + /* This field used only by ERSPAN */ + u32 index; /* ERSPAN type II index */ + struct dst_cache dst_cache; struct ip_tunnel_parm parms; diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 5bc9bfd816b7..efeb1190c2ca 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -66,6 +66,7 @@ #define ETH_P_ATALK 0x809B /* Appletalk DDP */ #define ETH_P_AARP 0x80F3 /* Appletalk AARP */ #define ETH_P_8021Q 0x8100 /* 802.1Q VLAN Extended Header */ +#define ETH_P_ERSPAN 0x88BE /* ERSPAN type II */ #define ETH_P_IPX 0x8137 /* IPX over DIX */ #define ETH_P_IPV6 0x86DD /* IPv6 over bluebook */ #define ETH_P_PAUSE 0x8808 /* IEEE Pause frames. 
See 802.3 31B */ diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 6792d1967d31..2e520883c054 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -134,6 +134,7 @@ enum { IFLA_GRE_COLLECT_METADATA, IFLA_GRE_IGNORE_DF, IFLA_GRE_FWMARK, + IFLA_GRE_ERSPAN_INDEX, __IFLA_GRE_MAX, }; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 7a7829e839c2..6e8a62289e03 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -48,6 +48,7 @@ #include #include #include +#include /* Problems & solutions @@ -115,6 +116,7 @@ static int ipgre_tunnel_init(struct net_device *dev); static unsigned int ipgre_net_id __read_mostly; static unsigned int gre_tap_net_id __read_mostly; +static unsigned int erspan_net_id __read_mostly; static void ipgre_err(struct sk_buff *skb, u32 info, const struct tnl_ptk_info *tpi) @@ -246,6 +248,56 @@ static void gre_err(struct sk_buff *skb, u32 info) ipgre_err(skb, info, &tpi); } +static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi, + int gre_hdr_len) +{ + struct net *net = dev_net(skb->dev); + struct metadata_dst *tun_dst = NULL; + struct ip_tunnel_net *itn; + struct ip_tunnel *tunnel; + struct erspanhdr *ershdr; + const struct iphdr *iph; + __be32 session_id; + __be32 index; + int len; + + itn = net_generic(net, erspan_net_id); + iph = ip_hdr(skb); + len = gre_hdr_len + sizeof(*ershdr); + + if (unlikely(!pskb_may_pull(skb, len))) + return -ENOMEM; + + iph = ip_hdr(skb); + ershdr = (struct erspanhdr *)(skb->data + gre_hdr_len); + + /* The original GRE header does not have key field, + * Use ERSPAN 10-bit session ID as key. + */ + session_id = cpu_to_be32(ntohs(ershdr->session_id)); + tpi->key = session_id; + index = ershdr->md.index; + tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, + tpi->flags | TUNNEL_KEY, + iph->saddr, iph->daddr, tpi->key); + + if (tunnel) { + if (__iptunnel_pull_header(skb, + gre_hdr_len + sizeof(*ershdr), + htons(ETH_P_TEB), + false, false) < 0) + goto drop; + + tunnel->index = ntohl(index); + skb_reset_mac_header(skb); + ip_tunnel_rcv(tunnel, skb, tpi, tun_dst, log_ecn_error); + return PACKET_RCVD; + } +drop: + kfree_skb(skb); + return PACKET_RCVD; +} + static int __ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct ip_tunnel_net *itn, int hdr_len, bool raw_proto) { @@ -328,6 +380,11 @@ static int gre_rcv(struct sk_buff *skb) if (hdr_len < 0) goto drop; + if (unlikely(tpi.proto == htons(ETH_P_ERSPAN))) { + if (erspan_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) + return 0; + } + if (ipgre_rcv(skb, &tpi, hdr_len) == PACKET_RCVD) return 0; @@ -503,6 +560,81 @@ free_skb: return NETDEV_TX_OK; } +static inline u8 tos_to_cos(u8 tos) +{ + u8 dscp, cos; + + dscp = tos >> 2; + cos = dscp >> 3; + return cos; +} + +static void erspan_build_header(struct sk_buff *skb, + __be32 id, u32 index, bool truncate) +{ + struct iphdr *iphdr = ip_hdr(skb); + struct ethhdr *eth = eth_hdr(skb); + enum erspan_encap_type enc_type; + struct erspanhdr *ershdr; + struct qtag_prefix { + __be16 eth_type; + __be16 tci; + } *qp; + u16 vlan_tci = 0; + + enc_type = ERSPAN_ENCAP_NOVLAN; + + /* If mirrored packet has vlan tag, extract tci and + * perserve vlan header in the mirrored frame. 
+ */ + if (eth->h_proto == htons(ETH_P_8021Q)) { + qp = (struct qtag_prefix *)(skb->data + 2 * ETH_ALEN); + vlan_tci = ntohs(qp->tci); + enc_type = ERSPAN_ENCAP_INFRAME; + } + + skb_push(skb, sizeof(*ershdr)); + ershdr = (struct erspanhdr *)skb->data; + memset(ershdr, 0, sizeof(*ershdr)); + + ershdr->ver_vlan = htons((vlan_tci & VLAN_MASK) | + (ERSPAN_VERSION << VER_OFFSET)); + ershdr->session_id = htons((u16)(ntohl(id) & ID_MASK) | + ((tos_to_cos(iphdr->tos) << COS_OFFSET) & COS_MASK) | + (enc_type << EN_OFFSET & EN_MASK) | + ((truncate << T_OFFSET) & T_MASK)); + ershdr->md.index = htonl(index & INDEX_MASK); +} + +static netdev_tx_t erspan_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + bool truncate = false; + + if (gre_handle_offloads(skb, false)) + goto free_skb; + + if (skb_cow_head(skb, dev->needed_headroom)) + goto free_skb; + + if (skb->len > dev->mtu) { + pskb_trim(skb, dev->mtu); + truncate = true; + } + + /* Push ERSPAN header */ + erspan_build_header(skb, tunnel->parms.o_key, tunnel->index, truncate); + tunnel->parms.o_flags &= ~TUNNEL_KEY; + __gre_xmit(skb, dev, &tunnel->parms.iph, htons(ETH_P_ERSPAN)); + return NETDEV_TX_OK; + +free_skb: + kfree_skb(skb); + dev->stats.tx_dropped++; + return NETDEV_TX_OK; +} + static netdev_tx_t gre_tap_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -828,6 +960,39 @@ out: return ipgre_tunnel_validate(tb, data, extack); } +static int erspan_validate(struct nlattr *tb[], struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + __be16 flags = 0; + int ret; + + if (!data) + return 0; + + ret = ipgre_tap_validate(tb, data, extack); + if (ret) + return ret; + + /* ERSPAN should only have GRE sequence and key flag */ + flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]); + flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]); + if (flags != (GRE_SEQ | GRE_KEY)) + return -EINVAL; + + /* ERSPAN Session ID only has 10-bit. Since we reuse + * 32-bit key field as ID, check it's range. 
+ */ + if (data[IFLA_GRE_IKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_IKEY])) & ~ID_MASK)) + return -EINVAL; + + if (data[IFLA_GRE_OKEY] && + (ntohl(nla_get_be32(data[IFLA_GRE_OKEY])) & ~ID_MASK)) + return -EINVAL; + + return 0; +} + static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], @@ -892,6 +1057,13 @@ static int ipgre_netlink_parms(struct net_device *dev, if (data[IFLA_GRE_FWMARK]) *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + if (data[IFLA_GRE_ERSPAN_INDEX]) { + t->index = nla_get_u32(data[IFLA_GRE_ERSPAN_INDEX]); + + if (t->index & ~INDEX_MASK) + return -EINVAL; + } + return 0; } @@ -949,6 +1121,36 @@ static const struct net_device_ops gre_tap_netdev_ops = { .ndo_fill_metadata_dst = gre_fill_metadata_dst, }; +static int erspan_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = netdev_priv(dev); + int t_hlen; + + tunnel->tun_hlen = 8; + tunnel->parms.iph.protocol = IPPROTO_GRE; + t_hlen = tunnel->hlen + sizeof(struct iphdr) + sizeof(struct erspanhdr); + + dev->needed_headroom = LL_MAX_HEADER + t_hlen + 4; + dev->mtu = ETH_DATA_LEN - t_hlen - 4; + dev->features |= GRE_FEATURES; + dev->hw_features |= GRE_FEATURES; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + + return ip_tunnel_init(dev); +} + +static const struct net_device_ops erspan_netdev_ops = { + .ndo_init = erspan_tunnel_init, + .ndo_uninit = ip_tunnel_uninit, + .ndo_start_xmit = erspan_xmit, + .ndo_set_mac_address = eth_mac_addr, + .ndo_validate_addr = eth_validate_addr, + .ndo_change_mtu = ip_tunnel_change_mtu, + .ndo_get_stats64 = ip_tunnel_get_stats64, + .ndo_get_iflink = ip_tunnel_get_iflink, + .ndo_fill_metadata_dst = gre_fill_metadata_dst, +}; + static void ipgre_tap_setup(struct net_device *dev) { ether_setup(dev); @@ -1041,6 +1243,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(1) + /* IFLA_GRE_FWMARK */ nla_total_size(4) + + /* IFLA_GRE_ERSPAN_INDEX */ + nla_total_size(4) + 0; } @@ -1083,12 +1287,25 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; } + if (t->index) + if (nla_put_u32(skb, IFLA_GRE_ERSPAN_INDEX, t->index)) + goto nla_put_failure; + return 0; nla_put_failure: return -EMSGSIZE; } +static void erspan_setup(struct net_device *dev) +{ + ether_setup(dev); + dev->netdev_ops = &erspan_netdev_ops; + dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + ip_tunnel_setup(dev, erspan_net_id); +} + static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_LINK] = { .type = NLA_U32 }, [IFLA_GRE_IFLAGS] = { .type = NLA_U16 }, @@ -1107,6 +1324,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, + [IFLA_GRE_ERSPAN_INDEX] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { @@ -1139,6 +1357,21 @@ static struct rtnl_link_ops ipgre_tap_ops __read_mostly = { .get_link_net = ip_tunnel_get_link_net, }; +static struct rtnl_link_ops erspan_link_ops __read_mostly = { + .kind = "erspan", + .maxtype = IFLA_GRE_MAX, + .policy = ipgre_policy, + .priv_size = sizeof(struct ip_tunnel), + .setup = erspan_setup, + .validate = erspan_validate, + .newlink = ipgre_newlink, + .changelink = ipgre_changelink, + .dellink = ip_tunnel_dellink, + .get_size = ipgre_get_size, + .fill_info = ipgre_fill_info, + .get_link_net = ip_tunnel_get_link_net, +}; + struct 
net_device *gretap_fb_dev_create(struct net *net, const char *name, u8 name_assign_type) { @@ -1202,6 +1435,26 @@ static struct pernet_operations ipgre_tap_net_ops = { .size = sizeof(struct ip_tunnel_net), }; +static int __net_init erspan_init_net(struct net *net) +{ + return ip_tunnel_init_net(net, erspan_net_id, + &erspan_link_ops, "erspan0"); +} + +static void __net_exit erspan_exit_net(struct net *net) +{ + struct ip_tunnel_net *itn = net_generic(net, erspan_net_id); + + ip_tunnel_delete_net(itn, &erspan_link_ops); +} + +static struct pernet_operations erspan_net_ops = { + .init = erspan_init_net, + .exit = erspan_exit_net, + .id = &erspan_net_id, + .size = sizeof(struct ip_tunnel_net), +}; + static int __init ipgre_init(void) { int err; @@ -1216,6 +1469,10 @@ static int __init ipgre_init(void) if (err < 0) goto pnet_tap_faied; + err = register_pernet_device(&erspan_net_ops); + if (err < 0) + goto pnet_erspan_failed; + err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { pr_info("%s: can't add protocol\n", __func__); @@ -1230,13 +1487,21 @@ static int __init ipgre_init(void) if (err < 0) goto tap_ops_failed; + err = rtnl_link_register(&erspan_link_ops); + if (err < 0) + goto erspan_link_failed; + return 0; +erspan_link_failed: + rtnl_link_unregister(&ipgre_tap_ops); tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: + unregister_pernet_device(&erspan_net_ops); +pnet_erspan_failed: unregister_pernet_device(&ipgre_tap_net_ops); pnet_tap_faied: unregister_pernet_device(&ipgre_net_ops); @@ -1247,9 +1512,11 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); + rtnl_link_unregister(&erspan_link_ops); gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); unregister_pernet_device(&ipgre_tap_net_ops); unregister_pernet_device(&ipgre_net_ops); + unregister_pernet_device(&erspan_net_ops); } module_init(ipgre_init); @@ -1257,5 +1524,7 @@ module_exit(ipgre_fini); MODULE_LICENSE("GPL"); MODULE_ALIAS_RTNL_LINK("gre"); MODULE_ALIAS_RTNL_LINK("gretap"); +MODULE_ALIAS_RTNL_LINK("erspan"); MODULE_ALIAS_NETDEV("gre0"); MODULE_ALIAS_NETDEV("gretap0"); +MODULE_ALIAS_NETDEV("erspan0"); -- cgit v1.2.3 From 1177009131bee310421f5c04c43d3777cbacbdc8 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 24 Aug 2017 08:39:59 +0200 Subject: devlink: Add Ethernet header for dpipe This will be used by the IPv4 host table which will be introduced in the following patches. This header is global and can be reused by many drivers. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/devlink.h | 1 + include/uapi/linux/devlink.h | 8 ++++++++ net/core/devlink.c | 17 +++++++++++++++++ 3 files changed, 26 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index ed7687bbf5d0..ddb8b5227580 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -328,6 +328,7 @@ int devlink_dpipe_action_put(struct sk_buff *skb, struct devlink_dpipe_action *action); int devlink_dpipe_match_put(struct sk_buff *skb, struct devlink_dpipe_match *match); +extern struct devlink_dpipe_header devlink_dpipe_header_ethernet; #else diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index b0e807ac53bb..a855f8dcc8ee 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -226,4 +226,12 @@ enum devlink_dpipe_action_type { DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY, }; +enum devlink_dpipe_field_ethernet_id { + DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC, +}; + +enum devlink_dpipe_header_id { + DEVLINK_DPIPE_HEADER_ETHERNET, +}; + #endif /* _UAPI_LINUX_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index a0adfc31a3fe..4f6f3d601785 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -29,6 +29,23 @@ #define CREATE_TRACE_POINTS #include +static struct devlink_dpipe_field devlink_dpipe_fields_ethernet[] = { + { + .name = "destination_mac", + .id = DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC, + .bitwidth = 48, + }, +}; + +struct devlink_dpipe_header devlink_dpipe_header_ethernet = { + .name = "ethernet", + .id = DEVLINK_DPIPE_HEADER_ETHERNET, + .fields = devlink_dpipe_fields_ethernet, + .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ethernet), + .global = true, +}; +EXPORT_SYMBOL(devlink_dpipe_header_ethernet); + EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); static LIST_HEAD(devlink_list); -- cgit v1.2.3 From 3fb886ecea93605a8ea14e258ff3158b8966781e Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 24 Aug 2017 08:40:00 +0200 Subject: devlink: Add IPv4 header for dpipe This will be used by the IPv4 host table which will be introduced in the following patches. This header is global and can be reused by many drivers. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/devlink.h | 1 + include/uapi/linux/devlink.h | 5 +++++ net/core/devlink.c | 17 +++++++++++++++++ 3 files changed, 23 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index ddb8b5227580..8ff8a6f77f29 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -329,6 +329,7 @@ int devlink_dpipe_action_put(struct sk_buff *skb, int devlink_dpipe_match_put(struct sk_buff *skb, struct devlink_dpipe_match *match); extern struct devlink_dpipe_header devlink_dpipe_header_ethernet; +extern struct devlink_dpipe_header devlink_dpipe_header_ipv4; #else diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index a855f8dcc8ee..6c172548589d 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -230,8 +230,13 @@ enum devlink_dpipe_field_ethernet_id { DEVLINK_DPIPE_FIELD_ETHERNET_DST_MAC, }; +enum devlink_dpipe_field_ipv4_id { + DEVLINK_DPIPE_FIELD_IPV4_DST_IP, +}; + enum devlink_dpipe_header_id { DEVLINK_DPIPE_HEADER_ETHERNET, + DEVLINK_DPIPE_HEADER_IPV4, }; #endif /* _UAPI_LINUX_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 4f6f3d601785..fcf5dd662882 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -46,6 +46,23 @@ struct devlink_dpipe_header devlink_dpipe_header_ethernet = { }; EXPORT_SYMBOL(devlink_dpipe_header_ethernet); +static struct devlink_dpipe_field devlink_dpipe_fields_ipv4[] = { + { + .name = "destination ip", + .id = DEVLINK_DPIPE_FIELD_IPV4_DST_IP, + .bitwidth = 32, + }, +}; + +struct devlink_dpipe_header devlink_dpipe_header_ipv4 = { + .name = "ipv4", + .id = DEVLINK_DPIPE_HEADER_IPV4, + .fields = devlink_dpipe_fields_ipv4, + .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv4), + .global = true, +}; +EXPORT_SYMBOL(devlink_dpipe_header_ipv4); + EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); static LIST_HEAD(devlink_list); -- cgit v1.2.3 From 38ee7f2d47565689f35662d488d25e7afc43477d Mon Sep 17 00:00:00 2001 From: David Lebrun Date: Fri, 25 Aug 2017 09:56:45 +0200 Subject: ipv6: sr: add support for encapsulation of L2 frames This patch implements the L2 frame encapsulation mechanism, referred to as T.Encaps.L2 in the SRv6 specifications [1]. A new type of SRv6 tunnel mode is added (SEG6_IPTUN_MODE_L2ENCAP). It only accepts packets with an existing MAC header (i.e., it will not work for locally generated packets). The resulting packet looks like IPv6 -> SRH -> Ethernet -> original L3 payload. The next header field of the SRH is set to NEXTHDR_NONE. [1] https://tools.ietf.org/html/draft-filsfils-spring-srv6-network-programming-01 Signed-off-by: David Lebrun Signed-off-by: David S. 
Miller --- include/uapi/linux/seg6_iptunnel.h | 18 ++++++++++++++---- net/ipv6/seg6_iptunnel.c | 25 +++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/seg6_iptunnel.h b/include/uapi/linux/seg6_iptunnel.h index b6e5a0a1afd7..b23df9f58354 100644 --- a/include/uapi/linux/seg6_iptunnel.h +++ b/include/uapi/linux/seg6_iptunnel.h @@ -33,16 +33,26 @@ struct seg6_iptunnel_encap { enum { SEG6_IPTUN_MODE_INLINE, SEG6_IPTUN_MODE_ENCAP, + SEG6_IPTUN_MODE_L2ENCAP, }; #ifdef __KERNEL__ static inline size_t seg6_lwt_headroom(struct seg6_iptunnel_encap *tuninfo) { - int encap = (tuninfo->mode == SEG6_IPTUN_MODE_ENCAP); - - return ((tuninfo->srh->hdrlen + 1) << 3) + - (encap * sizeof(struct ipv6hdr)); + int head = 0; + + switch (tuninfo->mode) { + case SEG6_IPTUN_MODE_INLINE: + break; + case SEG6_IPTUN_MODE_ENCAP: + head = sizeof(struct ipv6hdr); + break; + case SEG6_IPTUN_MODE_L2ENCAP: + return 0; + } + + return ((tuninfo->srh->hdrlen + 1) << 3) + head; } #endif diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 5bec7817a7b9..bd6cc688bd19 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -238,6 +238,22 @@ static int seg6_do_srh(struct sk_buff *skb) if (err) return err; + skb->protocol = htons(ETH_P_IPV6); + break; + case SEG6_IPTUN_MODE_L2ENCAP: + if (!skb_mac_header_was_set(skb)) + return -EINVAL; + + if (pskb_expand_head(skb, skb->mac_len, 0, GFP_ATOMIC) < 0) + return -ENOMEM; + + skb_mac_header_rebuild(skb); + skb_push(skb, skb->mac_len); + + err = seg6_do_srh_encap(skb, tinfo->srh, NEXTHDR_NONE); + if (err) + return err; + skb->protocol = htons(ETH_P_IPV6); break; } @@ -386,6 +402,8 @@ static int seg6_build_state(struct nlattr *nla, break; case SEG6_IPTUN_MODE_ENCAP: break; + case SEG6_IPTUN_MODE_L2ENCAP: + break; default: return -EINVAL; } @@ -409,8 +427,11 @@ static int seg6_build_state(struct nlattr *nla, memcpy(&slwt->tuninfo, tuninfo, tuninfo_len); newts->type = LWTUNNEL_ENCAP_SEG6; - newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT | - LWTUNNEL_STATE_INPUT_REDIRECT; + newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; + + if (tuninfo->mode != SEG6_IPTUN_MODE_L2ENCAP) + newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; + newts->headroom = seg6_lwt_headroom(tuninfo); *ts = newts; -- cgit v1.2.3 From 464bc0fd6273d518aee79fbd37211dd9bc35d863 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 28 Aug 2017 07:10:04 -0700 Subject: bpf: convert sockmap field attach_bpf_fd2 to type In the initial sockmap API we provided strparser and verdict programs using a single attach command by extending the attach API with a the attach_bpf_fd2 field. However, if we add other programs in the future we will be adding a field for every new possible type, attach_bpf_fd(3,4,..). This seems a bit clumsy for an API. So lets push the programs using two new type fields. BPF_SK_SKB_STREAM_PARSER BPF_SK_SKB_STREAM_VERDICT This has the advantage of having a readable name and can easily be extended in the future. Updates to samples and sockmap included here also generalize tests slightly to support upcoming patch for multiple map support. Signed-off-by: John Fastabend Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Suggested-by: Alexei Starovoitov Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 10 +- include/uapi/linux/bpf.h | 9 +- kernel/bpf/sockmap.c | 25 ++-- kernel/bpf/syscall.c | 38 ++---- samples/sockmap/sockmap_kern.c | 6 +- samples/sockmap/sockmap_user.c | 12 +- tools/include/uapi/linux/bpf.h | 9 +- tools/lib/bpf/bpf.c | 14 +-- tools/lib/bpf/bpf.h | 4 - tools/testing/selftests/bpf/bpf_helpers.h | 3 +- tools/testing/selftests/bpf/sockmap_parse_prog.c | 2 +- tools/testing/selftests/bpf/sockmap_verdict_prog.c | 2 +- tools/testing/selftests/bpf/test_maps.c | 133 +++++++++------------ 13 files changed, 116 insertions(+), 151 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 830f472d8df5..c2cb1b5c094e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -39,8 +39,6 @@ struct bpf_map_ops { void (*map_fd_put_ptr)(void *ptr); u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); u32 (*map_fd_sys_lookup_elem)(void *ptr); - int (*map_attach)(struct bpf_map *map, - struct bpf_prog *p1, struct bpf_prog *p2); }; struct bpf_map { @@ -387,11 +385,19 @@ static inline void __dev_map_flush(struct bpf_map *map) #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); +int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else static inline struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) { return NULL; } + +static inline int sock_map_attach_prog(struct bpf_map *map, + struct bpf_prog *prog, + u32 type) +{ + return -EOPNOTSUPP; +} #endif /* verifier prototypes for helper functions called from eBPF programs */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 843818dff96d..97227be3690c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -136,7 +136,8 @@ enum bpf_attach_type { BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, BPF_CGROUP_SOCK_OPS, - BPF_CGROUP_SMAP_INGRESS, + BPF_SK_SKB_STREAM_PARSER, + BPF_SK_SKB_STREAM_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -224,7 +225,6 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; - __u32 attach_bpf_fd2; }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ @@ -580,14 +580,11 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_REDIRECT * - * int bpf_sock_map_update(skops, map, key, flags, map_flags) + * int bpf_sock_map_update(skops, map, key, flags) * @skops: pointer to bpf_sock_ops * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem - * @map_flags: sock map specific flags - * bit 1: Enable strparser - * other bits: reserved */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 617c239590c2..cf570d108fd5 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -723,20 +723,24 @@ out: return err; } -static int sock_map_attach_prog(struct bpf_map *map, - struct bpf_prog *parse, - struct bpf_prog *verdict) +int sock_map_attach_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - struct bpf_prog *_parse, *_verdict; + struct bpf_prog *orig; - _parse = xchg(&stab->bpf_parse, parse); - _verdict = xchg(&stab->bpf_verdict, verdict); + switch (type) { + case BPF_SK_SKB_STREAM_PARSER: + orig = xchg(&stab->bpf_parse, prog); + break; + case BPF_SK_SKB_STREAM_VERDICT: + orig = xchg(&stab->bpf_verdict, 
prog); + break; + default: + return -EOPNOTSUPP; + } - if (_parse) - bpf_prog_put(_parse); - if (_verdict) - bpf_prog_put(_verdict); + if (orig) + bpf_prog_put(orig); return 0; } @@ -777,7 +781,6 @@ const struct bpf_map_ops sock_map_ops = { .map_get_next_key = sock_map_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, - .map_attach = sock_map_attach_prog, }; BPF_CALL_5(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9378f3ba2cbf..021a05d9d800 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1093,12 +1093,12 @@ static int bpf_obj_get(const union bpf_attr *attr) #ifdef CONFIG_CGROUP_BPF -#define BPF_PROG_ATTACH_LAST_FIELD attach_bpf_fd2 +#define BPF_PROG_ATTACH_LAST_FIELD attach_flags -static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype) +static int sockmap_get_from_fd(const union bpf_attr *attr) { - struct bpf_prog *prog1, *prog2; int ufd = attr->target_fd; + struct bpf_prog *prog; struct bpf_map *map; struct fd f; int err; @@ -1108,29 +1108,16 @@ static int sockmap_get_from_fd(const union bpf_attr *attr, int ptype) if (IS_ERR(map)) return PTR_ERR(map); - if (!map->ops->map_attach) { - fdput(f); - return -EOPNOTSUPP; - } - - prog1 = bpf_prog_get_type(attr->attach_bpf_fd, ptype); - if (IS_ERR(prog1)) { + prog = bpf_prog_get_type(attr->attach_bpf_fd, BPF_PROG_TYPE_SK_SKB); + if (IS_ERR(prog)) { fdput(f); - return PTR_ERR(prog1); - } - - prog2 = bpf_prog_get_type(attr->attach_bpf_fd2, ptype); - if (IS_ERR(prog2)) { - fdput(f); - bpf_prog_put(prog1); - return PTR_ERR(prog2); + return PTR_ERR(prog); } - err = map->ops->map_attach(map, prog1, prog2); + err = sock_map_attach_prog(map, prog, attr->attach_type); if (err) { fdput(f); - bpf_prog_put(prog1); - bpf_prog_put(prog2); + bpf_prog_put(prog); return err; } @@ -1165,16 +1152,13 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; - case BPF_CGROUP_SMAP_INGRESS: - ptype = BPF_PROG_TYPE_SK_SKB; - break; + case BPF_SK_SKB_STREAM_PARSER: + case BPF_SK_SKB_STREAM_VERDICT: + return sockmap_get_from_fd(attr); default: return -EINVAL; } - if (attr->attach_type == BPF_CGROUP_SMAP_INGRESS) - return sockmap_get_from_fd(attr, ptype); - prog = bpf_prog_get_type(attr->attach_bpf_fd, ptype); if (IS_ERR(prog)) return PTR_ERR(prog); diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c index 6ff986f7059b..f9b38ef82dc2 100644 --- a/samples/sockmap/sockmap_kern.c +++ b/samples/sockmap/sockmap_kern.c @@ -82,8 +82,7 @@ int bpf_sockmap(struct bpf_sock_ops *skops) if (lport == 10000) { ret = 1; err = bpf_sock_map_update(skops, &sock_map, &ret, - BPF_NOEXIST, - BPF_SOCKMAP_STRPARSER); + BPF_NOEXIST); bpf_printk("passive(%i -> %i) map ctx update err: %d\n", lport, bpf_ntohl(rport), err); } @@ -95,8 +94,7 @@ int bpf_sockmap(struct bpf_sock_ops *skops) if (bpf_ntohl(rport) == 10001) { ret = 10; err = bpf_sock_map_update(skops, &sock_map, &ret, - BPF_NOEXIST, - BPF_SOCKMAP_STRPARSER); + BPF_NOEXIST); bpf_printk("active(%i -> %i) map ctx update err: %d\n", lport, bpf_ntohl(rport), err); } diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index fb78f5abefb4..7cc9d228216f 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -256,8 +256,16 @@ int main(int argc, char **argv) } /* Attach programs to sockmap */ - err = __bpf_prog_attach(prog_fd[0], prog_fd[1], map_fd[0], - 
BPF_CGROUP_SMAP_INGRESS, 0); + err = bpf_prog_attach(prog_fd[0], map_fd[0], + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n", + err, strerror(errno)); + return err; + } + + err = bpf_prog_attach(prog_fd[1], map_fd[0], + BPF_SK_SKB_STREAM_VERDICT, 0); if (err) { fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n", err, strerror(errno)); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f8f6377fd541..09ac590eefb1 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -136,7 +136,8 @@ enum bpf_attach_type { BPF_CGROUP_INET_EGRESS, BPF_CGROUP_INET_SOCK_CREATE, BPF_CGROUP_SOCK_OPS, - BPF_CGROUP_SMAP_INGRESS, + BPF_SK_SKB_STREAM_PARSER, + BPF_SK_SKB_STREAM_VERDICT, __MAX_BPF_ATTACH_TYPE }; @@ -227,7 +228,6 @@ union bpf_attr { __u32 attach_bpf_fd; /* eBPF program to attach */ __u32 attach_type; __u32 attach_flags; - __u32 attach_bpf_fd2; }; struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ @@ -572,14 +572,11 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_REDIRECT * - * int bpf_sock_map_update(skops, map, key, flags, map_flags) + * int bpf_sock_map_update(skops, map, key, flags) * @skops: pointer to bpf_sock_ops * @map: pointer to sockmap to update * @key: key to insert/update sock in map * @flags: same flags as map update elem - * @map_flags: sock map specific flags - * bit 1: Enable strparser - * other bits: reserved */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index a0717610b116..1d6907d379c9 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -235,28 +235,20 @@ int bpf_obj_get(const char *pathname) return sys_bpf(BPF_OBJ_GET, &attr, sizeof(attr)); } -int __bpf_prog_attach(int prog_fd1, int prog_fd2, int target_fd, - enum bpf_attach_type type, - unsigned int flags) +int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, + unsigned int flags) { union bpf_attr attr; bzero(&attr, sizeof(attr)); attr.target_fd = target_fd; - attr.attach_bpf_fd = prog_fd1; - attr.attach_bpf_fd2 = prog_fd2; + attr.attach_bpf_fd = prog_fd; attr.attach_type = type; attr.attach_flags = flags; return sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); } -int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, - unsigned int flags) -{ - return __bpf_prog_attach(prog_fd, 0, target_fd, type, flags); -} - int bpf_prog_detach(int target_fd, enum bpf_attach_type type) { union bpf_attr attr; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 90e9d4e85d08..b8ea5843c39e 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -56,10 +56,6 @@ int bpf_obj_pin(int fd, const char *pathname); int bpf_obj_get(const char *pathname); int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, unsigned int flags); -int __bpf_prog_attach(int prog1, int prog2, - int attachable_fd, - enum bpf_attach_type type, - unsigned int flags); int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, void *data_out, __u32 *size_out, __u32 *retval, diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index 98f3be26d390..36fb9161b34a 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -68,8 +68,7 @@ static int (*bpf_setsockopt)(void *ctx, int level, int optname, void *optval, 
static int (*bpf_sk_redirect_map)(void *map, int key, int flags) = (void *) BPF_FUNC_sk_redirect_map; static int (*bpf_sock_map_update)(void *map, void *key, void *value, - unsigned long long flags, - unsigned long long map_lags) = + unsigned long long flags) = (void *) BPF_FUNC_sock_map_update; diff --git a/tools/testing/selftests/bpf/sockmap_parse_prog.c b/tools/testing/selftests/bpf/sockmap_parse_prog.c index 8b5453158399..710f43f42dc4 100644 --- a/tools/testing/selftests/bpf/sockmap_parse_prog.c +++ b/tools/testing/selftests/bpf/sockmap_parse_prog.c @@ -30,7 +30,7 @@ int bpf_prog1(struct __sk_buff *skb) */ d[0] = 1; - bpf_printk("data[0] = (%u): local_port %i remote %i\n", + bpf_printk("parse: data[0] = (%u): local_port %i remote %i\n", d[0], lport, bpf_ntohl(rport)); return skb->len; } diff --git a/tools/testing/selftests/bpf/sockmap_verdict_prog.c b/tools/testing/selftests/bpf/sockmap_verdict_prog.c index d5f9447b3808..0573c1db2519 100644 --- a/tools/testing/selftests/bpf/sockmap_verdict_prog.c +++ b/tools/testing/selftests/bpf/sockmap_verdict_prog.c @@ -40,7 +40,7 @@ int bpf_prog2(struct __sk_buff *skb) d[6] = 0xe; d[7] = 0xf; - bpf_printk("data[0] = (%u): local_port %i remote %i\n", + bpf_printk("verdict: data[0] = (%u): local_port %i remote %i redirect 5\n", d[0], lport, bpf_ntohl(rport)); return bpf_sk_redirect_map(&sock_map, 5, 0); } diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 40b2d1faf02b..6df6e6257424 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -547,20 +547,26 @@ static void test_sockmap(int task, void *data) goto out_sockmap; } - /* Nothing attached so these should fail */ + /* Test update without programs */ for (i = 0; i < 6; i++) { err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); - if (!err) { - printf("Failed invalid update sockmap '%i:%i'\n", + if (err) { + printf("Failed noprog update sockmap '%i:%i'\n", i, sfd[i]); goto out_sockmap; } } /* Test attaching bad fds */ - err = __bpf_prog_attach(-1, -2, fd, BPF_CGROUP_SMAP_INGRESS, 0); + err = bpf_prog_attach(-1, fd, BPF_SK_SKB_STREAM_PARSER, 0); if (!err) { - printf("Failed invalid prog attach\n"); + printf("Failed invalid parser prog attach\n"); + goto out_sockmap; + } + + err = bpf_prog_attach(-1, fd, BPF_SK_SKB_STREAM_VERDICT, 0); + if (!err) { + printf("Failed invalid verdict prog attach\n"); goto out_sockmap; } @@ -591,14 +597,21 @@ static void test_sockmap(int task, void *data) goto out_sockmap; } - err = __bpf_prog_attach(parse_prog, verdict_prog, map_fd, - BPF_CGROUP_SMAP_INGRESS, 0); + err = bpf_prog_attach(parse_prog, map_fd, + BPF_SK_SKB_STREAM_PARSER, 0); + if (err) { + printf("Failed bpf prog attach\n"); + goto out_sockmap; + } + + err = bpf_prog_attach(verdict_prog, map_fd, + BPF_SK_SKB_STREAM_VERDICT, 0); if (err) { printf("Failed bpf prog attach\n"); goto out_sockmap; } - /* Test map update elem */ + /* Test map update elem afterwards fd lives in fd and map_fd */ for (i = 0; i < 6; i++) { err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_ANY); if (err) { @@ -649,96 +662,68 @@ static void test_sockmap(int task, void *data) goto out_sockmap; } - /* Delete the reset of the elems include some NULL elems */ - for (i = 0; i < 6; i++) { - err = bpf_map_delete_elem(map_fd, &i); - if (err && (i == 0 || i == 1 || i >= 4)) { - printf("Failed delete sockmap %i '%i:%i'\n", - err, i, sfd[i]); - goto out_sockmap; - } else if (!err && (i == 2 || i == 3)) { - printf("Failed null delete sockmap %i 
'%i:%i'\n", - err, i, sfd[i]); - goto out_sockmap; - } - } - - /* Test having multiple SMAPs open and active on same fds */ - err = __bpf_prog_attach(parse_prog, verdict_prog, fd, - BPF_CGROUP_SMAP_INGRESS, 0); - if (err) { - printf("Failed fd bpf prog attach\n"); - goto out_sockmap; - } - - for (i = 0; i < 6; i++) { - err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); - if (err) { - printf("Failed fd update sockmap %i '%i:%i'\n", - err, i, sfd[i]); - goto out_sockmap; - } - } - - /* Test duplicate socket add of NOEXIST, ANY and EXIST */ - i = 0; + /* Push fd into same slot */ + i = 2; err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST); if (!err) { - printf("Failed BPF_NOEXIST create\n"); + printf("Failed allowed sockmap dup slot BPF_NOEXIST\n"); goto out_sockmap; } err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); if (err) { - printf("Failed sockmap update BPF_ANY\n"); + printf("Failed sockmap update new slot BPF_ANY\n"); goto out_sockmap; } err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST); if (err) { - printf("Failed sockmap update BPF_EXIST\n"); + printf("Failed sockmap update new slot BPF_EXIST\n"); goto out_sockmap; } - /* The above were pushing fd into same slot try different slot now */ - i = 2; - err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST); - if (!err) { - printf("Failed BPF_NOEXIST create\n"); - goto out_sockmap; + /* Delete the elems without programs */ + for (i = 0; i < 6; i++) { + err = bpf_map_delete_elem(fd, &i); + if (err) { + printf("Failed delete sockmap %i '%i:%i'\n", + err, i, sfd[i]); + } } - err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); + /* Test having multiple maps open and set with programs on same fds */ + err = bpf_prog_attach(parse_prog, fd, + BPF_SK_SKB_STREAM_PARSER, 0); if (err) { - printf("Failed sockmap update BPF_ANY\n"); + printf("Failed fd bpf parse prog attach\n"); goto out_sockmap; } - - err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST); + err = bpf_prog_attach(verdict_prog, fd, + BPF_SK_SKB_STREAM_VERDICT, 0); if (err) { - printf("Failed sockmap update BPF_EXIST\n"); + printf("Failed fd bpf verdict prog attach\n"); goto out_sockmap; } - /* Try pushing fd into different map, this is not allowed at the - * moment. Which programs would we use? 
- */ - err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_NOEXIST); - if (!err) { - printf("Failed BPF_NOEXIST create\n"); - goto out_sockmap; - } - - err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_ANY); - if (!err) { - printf("Failed sockmap update BPF_ANY\n"); - goto out_sockmap; - } - - err = bpf_map_update_elem(map_fd, &i, &sfd[i], BPF_EXIST); - if (!err) { - printf("Failed sockmap update BPF_EXIST\n"); - goto out_sockmap; + for (i = 4; i < 6; i++) { + err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_ANY); + if (!err) { + printf("Failed allowed duplicate programs in update ANY sockmap %i '%i:%i'\n", + err, i, sfd[i]); + goto out_sockmap; + } + err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_NOEXIST); + if (!err) { + printf("Failed allowed duplicate program in update NOEXIST sockmap %i '%i:%i'\n", + err, i, sfd[i]); + goto out_sockmap; + } + err = bpf_map_update_elem(fd, &i, &sfd[i], BPF_EXIST); + if (!err) { + printf("Failed allowed duplicate program in update EXIST sockmap %i '%i:%i'\n", + err, i, sfd[i]); + goto out_sockmap; + } } /* Test map close sockets */ -- cgit v1.2.3 From 2f857d04601a1bb56958b95a9f180bce0e91e5e6 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 28 Aug 2017 07:10:25 -0700 Subject: bpf: sockmap, remove STRPARSER map_flags and add multi-map support The addition of map_flags BPF_SOCKMAP_STRPARSER flags was to handle a specific use case where we want to have BPF parse program disabled on an entry in a sockmap. However, Alexei found the API a bit cumbersome and I agreed. Lets remove the STRPARSER flag and support the use case by allowing socks to be in multiple maps. This allows users to create two maps one with programs attached and one without. When socks are added to maps they now inherit any programs attached to the map. This is a nice generalization and IMO improves the API. The API rules are less ambiguous and do not need a flag: - When a sock is added to a sockmap we have two cases, i. The sock map does not have any attached programs so we can add sock to map without inheriting bpf programs. The sock may exist in 0 or more other maps. ii. The sock map has an attached BPF program. To avoid duplicate bpf programs we only add the sock entry if it does not have an existing strparser/verdict attached, returning -EBUSY if a program is already attached. Otherwise attach the program and inherit strparser/verdict programs from the sock map. This allows for socks to be in a multiple maps for redirects and inherit a BPF program from a single map. Also this patch simplifies the logic around BPF_{EXIST|NOEXIST|ANY} flags. In the original patch I tried to be extra clever and only update map entries when necessary. Now I've decided the complexity is not worth it. If users constantly update an entry with the same sock for no reason (i.e. update an entry without actually changing any parameters on map or sock) we still do an alloc/release. Using this and allowing multiple entries of a sock to exist in a map the logic becomes much simpler. Note: Now that multiple maps are supported the "maps" pointer called when a socket is closed becomes a list of maps to remove the sock from. To keep the map up to date when a sock is added to the sockmap we must add the map/elem in the list. Likewise when it is removed we must remove it from the list. This results in searching the per psock list on delete operation. On TCP_CLOSE events we walk the list and remove the psock from all map/entry locations. 
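As a rough illustration of the two-map pattern described above, the following userspace sketch (not part of this patch) creates one sockmap with parser/verdict programs attached and a second plain sockmap used only as a redirect target; parse_fd, verdict_fd and sock_fd are placeholders for program and socket descriptors obtained elsewhere, and the include path for the libbpf helpers is an assumption.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <linux/bpf.h>
#include "bpf.h"	/* tools/lib/bpf helpers: bpf_create_map() etc. */

/* Hypothetical helper, not taken from the patch: wire one socket into two
 * sockmaps, only one of which has SK_SKB programs attached.
 */
static int setup_two_sockmaps(int parse_fd, int verdict_fd, int sock_fd)
{
	int smap_fd, plain_fd, key = 0, err;

	smap_fd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(int),
				 sizeof(int), 2, 0);
	plain_fd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP, sizeof(int),
				  sizeof(int), 2, 0);
	if (smap_fd < 0 || plain_fd < 0)
		return -1;

	/* Programs attach to the map; sockets added later inherit them. */
	err = bpf_prog_attach(parse_fd, smap_fd, BPF_SK_SKB_STREAM_PARSER, 0);
	if (!err)
		err = bpf_prog_attach(verdict_fd, smap_fd,
				      BPF_SK_SKB_STREAM_VERDICT, 0);
	if (err)
		return err;

	/* The same socket may sit in both maps: it inherits the programs
	 * from smap_fd, while plain_fd only serves bpf_sk_redirect_map()
	 * lookups. Adding it to a second map that also has programs
	 * attached would fail with -EBUSY.
	 */
	err = bpf_map_update_elem(smap_fd, &key, &sock_fd, BPF_ANY);
	if (!err)
		err = bpf_map_update_elem(plain_fd, &key, &sock_fd, BPF_ANY);
	if (err)
		fprintf(stderr, "sockmap setup failed: %s\n", strerror(errno));
	return err;
}
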
I don't see any perf implications in this because at most I have a psock in two maps. If a psock were to be in many maps its possibly this might be noticeable on delete but I can't think of a reason to dup a psock in many maps. The sk_callback_lock is used to protect read/writes to the list. This was convenient because in all locations we were taking the lock anyways just after working on the list. Also the lock is per sock so in normal cases we shouldn't see any contention. Suggested-by: Alexei Starovoitov Fixes: 174a79ff9515 ("bpf: sockmap with sk redirect support") Signed-off-by: John Fastabend Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 - kernel/bpf/sockmap.c | 269 +++++++++++++++++++++++++++++------------------ 2 files changed, 165 insertions(+), 107 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 97227be3690c..08c206a863e1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -143,9 +143,6 @@ enum bpf_attach_type { #define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE -/* If BPF_SOCKMAP_STRPARSER is used sockmap will use strparser on receive */ -#define BPF_SOCKMAP_STRPARSER (1U << 0) - /* If BPF_F_ALLOW_OVERRIDE flag is used in BPF_PROG_ATTACH command * to the given target_fd cgroup the descendent cgroup will be able to * override effective bpf program that was inherited from this cgroup diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index cf570d108fd5..a6882e54930b 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -13,15 +13,16 @@ /* A BPF sock_map is used to store sock objects. This is primarly used * for doing socket redirect with BPF helper routines. * - * A sock map may have two BPF programs attached to it, a program used - * to parse packets and a program to provide a verdict and redirect - * decision on the packet. If no BPF parse program is provided it is - * assumed that every skb is a "message" (skb->len). Otherwise the - * parse program is attached to strparser and used to build messages - * that may span multiple skbs. The verdict program will either select - * a socket to send/receive the skb on or provide the drop code indicating - * the skb should be dropped. More actions may be added later as needed. - * The default program will drop packets. + * A sock map may have BPF programs attached to it, currently a program + * used to parse packets and a program to provide a verdict and redirect + * decision on the packet are supported. Any programs attached to a sock + * map are inherited by sock objects when they are added to the map. If + * no BPF programs are attached the sock object may only be used for sock + * redirect. + * + * A sock object may be in multiple maps, but can only inherit a single + * parse or verdict program. If adding a sock object to a map would result + * in having multiple parsing programs the update will return an EBUSY error. * * For reference this program is similar to devmap used in XDP context * reviewing these together may be useful. 
For an example please review @@ -44,15 +45,21 @@ struct bpf_stab { struct sock **sock_map; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; - refcount_t refcnt; }; enum smap_psock_state { SMAP_TX_RUNNING, }; +struct smap_psock_map_entry { + struct list_head list; + struct sock **entry; +}; + struct smap_psock { struct rcu_head rcu; + /* refcnt is used inside sk_callback_lock */ + u32 refcnt; /* datapath variables */ struct sk_buff_head rxqueue; @@ -66,10 +73,9 @@ struct smap_psock { struct strparser strp; struct bpf_prog *bpf_parse; struct bpf_prog *bpf_verdict; - struct bpf_stab *stab; + struct list_head maps; /* Back reference used when sock callback trigger sockmap operations */ - int key; struct sock *sock; unsigned long state; @@ -83,7 +89,7 @@ struct smap_psock { static inline struct smap_psock *smap_psock_sk(const struct sock *sk) { - return (struct smap_psock *)rcu_dereference_sk_user_data(sk); + return rcu_dereference_sk_user_data(sk); } static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) @@ -149,11 +155,12 @@ static void smap_report_sk_error(struct smap_psock *psock, int err) sk->sk_error_report(sk); } -static void smap_release_sock(struct sock *sock); +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); /* Called with lock_sock(sk) held */ static void smap_state_change(struct sock *sk) { + struct smap_psock_map_entry *e, *tmp; struct smap_psock *psock; struct sock *osk; @@ -184,9 +191,15 @@ static void smap_state_change(struct sock *sk) psock = smap_psock_sk(sk); if (unlikely(!psock)) break; - osk = cmpxchg(&psock->stab->sock_map[psock->key], sk, NULL); - if (osk == sk) - smap_release_sock(sk); + write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } + write_unlock_bh(&sk->sk_callback_lock); break; default: psock = smap_psock_sk(sk); @@ -289,9 +302,8 @@ static void smap_write_space(struct sock *sk) static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) { - write_lock_bh(&sk->sk_callback_lock); if (!psock->strp_enabled) - goto out; + return; sk->sk_data_ready = psock->save_data_ready; sk->sk_write_space = psock->save_write_space; sk->sk_state_change = psock->save_state_change; @@ -300,8 +312,6 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) psock->save_state_change = NULL; strp_stop(&psock->strp); psock->strp_enabled = false; -out: - write_unlock_bh(&sk->sk_callback_lock); } static void smap_destroy_psock(struct rcu_head *rcu) @@ -318,9 +328,11 @@ static void smap_destroy_psock(struct rcu_head *rcu) schedule_work(&psock->gc_work); } -static void smap_release_sock(struct sock *sock) +static void smap_release_sock(struct smap_psock *psock, struct sock *sock) { - struct smap_psock *psock = smap_psock_sk(sock); + psock->refcnt--; + if (psock->refcnt) + return; smap_stop_sock(psock, sock); clear_bit(SMAP_TX_RUNNING, &psock->state); @@ -414,6 +426,7 @@ static void sock_map_remove_complete(struct bpf_stab *stab) static void smap_gc_work(struct work_struct *w) { + struct smap_psock_map_entry *e, *tmp; struct smap_psock *psock; psock = container_of(w, struct smap_psock, gc_work); @@ -431,8 +444,10 @@ static void smap_gc_work(struct work_struct *w) if (psock->bpf_verdict) bpf_prog_put(psock->bpf_verdict); - if (refcount_dec_and_test(&psock->stab->refcnt)) - sock_map_remove_complete(psock->stab); + list_for_each_entry_safe(e, tmp, 
&psock->maps, list) { + list_del(&e->list); + kfree(e); + } sock_put(psock->sock); kfree(psock); @@ -453,6 +468,8 @@ static struct smap_psock *smap_init_psock(struct sock *sock, skb_queue_head_init(&psock->rxqueue); INIT_WORK(&psock->tx_work, smap_tx_work); INIT_WORK(&psock->gc_work, smap_gc_work); + INIT_LIST_HEAD(&psock->maps); + psock->refcnt = 1; rcu_assign_sk_user_data(sock, psock); sock_hold(sock); @@ -503,13 +520,24 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (!stab->sock_map) goto free_stab; - refcount_set(&stab->refcnt, 1); return &stab->map; free_stab: kfree(stab); return ERR_PTR(err); } +static void smap_list_remove(struct smap_psock *psock, struct sock **entry) +{ + struct smap_psock_map_entry *e, *tmp; + + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + if (e->entry == entry) { + list_del(&e->list); + break; + } + } +} + static void sock_map_free(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); @@ -526,13 +554,18 @@ static void sock_map_free(struct bpf_map *map) */ rcu_read_lock(); for (i = 0; i < stab->map.max_entries; i++) { + struct smap_psock *psock; struct sock *sock; sock = xchg(&stab->sock_map[i], NULL); if (!sock) continue; - smap_release_sock(sock); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + smap_list_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, sock); + write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); @@ -541,8 +574,7 @@ static void sock_map_free(struct bpf_map *map) if (stab->bpf_parse) bpf_prog_put(stab->bpf_parse); - if (refcount_dec_and_test(&stab->refcnt)) - sock_map_remove_complete(stab); + sock_map_remove_complete(stab); } static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key) @@ -576,6 +608,7 @@ struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key) static int sock_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct smap_psock *psock; int k = *(u32 *)key; struct sock *sock; @@ -586,7 +619,17 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) if (!sock) return -EINVAL; - smap_release_sock(sock); + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + if (!psock) + goto out; + + if (psock->bpf_parse) + smap_stop_sock(psock, sock); + smap_list_remove(psock, &stab->sock_map[k]); + smap_release_sock(psock, sock); +out: + write_unlock_bh(&sock->sk_callback_lock); return 0; } @@ -601,29 +644,34 @@ static int sock_map_delete_elem(struct bpf_map *map, void *key) * and syncd so we are certain all references from the update/lookup/delete * operations as well as references in the data path are no longer in use. * - * A psock object holds a refcnt on the sockmap it is attached to and this is - * not decremented until after a RCU grace period and garbage collection occurs. - * This ensures the map is not free'd until psocks linked to it are removed. The - * map link is used when the independent sock events trigger map deletion. + * Psocks may exist in multiple maps, but only a single set of parse/verdict + * programs may be inherited from the maps it belongs to. A reference count + * is kept with the total number of references to the psock from all maps. The + * psock will not be released until this reaches zero. The psock and sock + * user data data use the sk_callback_lock to protect critical data structures + * from concurrent access. 
This allows us to avoid two updates from modifying + * the user data in sock and the lock is required anyways for modifying + * callbacks, we simply increase its scope slightly. * - * Psocks may only participate in one sockmap at a time. Users that try to - * join a single sock to multiple maps will get an error. - * - * Last, but not least, it is possible the socket is closed while running - * an update on an existing psock. This will release the psock, but again - * not until the update has completed due to rcu grace period rules. + * Rules to follow, + * - psock must always be read inside RCU critical section + * - sk_user_data must only be modified inside sk_callback_lock and read + * inside RCU critical section. + * - psock->maps list must only be read & modified inside sk_callback_lock + * - sock_map must use READ_ONCE and (cmp)xchg operations + * - BPF verdict/parse programs must use READ_ONCE and xchg operations */ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, struct bpf_map *map, - void *key, u64 flags, u64 map_flags) + void *key, u64 flags) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct smap_psock_map_entry *e = NULL; struct bpf_prog *verdict, *parse; - struct smap_psock *psock = NULL; - struct sock *old_sock, *sock; + struct sock *osock, *sock; + struct smap_psock *psock; u32 i = *(u32 *)key; - bool update = false; - int err = 0; + int err; if (unlikely(flags > BPF_EXIST)) return -EINVAL; @@ -631,35 +679,22 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, if (unlikely(i >= stab->map.max_entries)) return -E2BIG; - if (unlikely(map_flags > BPF_SOCKMAP_STRPARSER)) - return -EINVAL; - - verdict = parse = NULL; sock = READ_ONCE(stab->sock_map[i]); - - if (flags == BPF_EXIST || flags == BPF_ANY) { - if (!sock && flags == BPF_EXIST) { - return -ENOENT; - } else if (sock && sock != skops->sk) { - return -EINVAL; - } else if (sock) { - psock = smap_psock_sk(sock); - if (unlikely(!psock)) - return -EBUSY; - update = true; - } - } else if (sock && BPF_NOEXIST) { + if (flags == BPF_EXIST && !sock) + return -ENOENT; + else if (flags == BPF_NOEXIST && sock) return -EEXIST; - } - /* reserve BPF programs early so can abort easily on failures */ - if (map_flags & BPF_SOCKMAP_STRPARSER) { - verdict = READ_ONCE(stab->bpf_verdict); - parse = READ_ONCE(stab->bpf_parse); + sock = skops->sk; - if (!verdict || !parse) - return -ENOENT; + /* 1. If sock map has BPF programs those will be inherited by the + * sock being added. If the sock is already attached to BPF programs + * this results in an error. + */ + verdict = READ_ONCE(stab->bpf_verdict); + parse = READ_ONCE(stab->bpf_parse); + if (parse && verdict) { /* bpf prog refcnt may be zero if a concurrent attach operation * removes the program after the above READ_ONCE() but before * we increment the refcnt. If this is the case abort with an @@ -676,50 +711,78 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } } - if (!psock) { - sock = skops->sk; - if (rcu_dereference_sk_user_data(sock)) - return -EEXIST; + write_lock_bh(&sock->sk_callback_lock); + psock = smap_psock_sk(sock); + + /* 2. Do not allow inheriting programs if psock exists and has + * already inherited programs. This would create confusion on + * which parser/verdict program is running. If no psock exists + * create one. Inside sk_callback_lock to ensure concurrent create + * doesn't update user data. 
+ */ + if (psock) { + if (READ_ONCE(psock->bpf_parse) && parse) { + err = -EBUSY; + goto out_progs; + } + psock->refcnt++; + } else { psock = smap_init_psock(sock, stab); if (IS_ERR(psock)) { - if (verdict) - bpf_prog_put(verdict); - if (parse) - bpf_prog_put(parse); - return PTR_ERR(psock); + err = PTR_ERR(psock); + goto out_progs; } - psock->key = i; - psock->stab = stab; - refcount_inc(&stab->refcnt); + set_bit(SMAP_TX_RUNNING, &psock->state); } - if (map_flags & BPF_SOCKMAP_STRPARSER) { - write_lock_bh(&sock->sk_callback_lock); - if (psock->strp_enabled) - goto start_done; + e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN); + if (!e) { + err = -ENOMEM; + goto out_progs; + } + e->entry = &stab->sock_map[i]; + + /* 3. At this point we have a reference to a valid psock that is + * running. Attach any BPF programs needed. + */ + if (parse && verdict && !psock->strp_enabled) { err = smap_init_sock(psock, sock); if (err) - goto out; + goto out_free; smap_init_progs(psock, stab, verdict, parse); smap_start_sock(psock, sock); -start_done: - write_unlock_bh(&sock->sk_callback_lock); - } else if (update) { - smap_stop_sock(psock, sock); } - if (!update) { - old_sock = xchg(&stab->sock_map[i], skops->sk); - if (old_sock) - smap_release_sock(old_sock); - } + /* 4. Place psock in sockmap for use and stop any programs on + * the old sock assuming its not the same sock we are replacing + * it with. Because we can only have a single set of programs if + * old_sock has a strp we can stop it. + */ + list_add_tail(&e->list, &psock->maps); + write_unlock_bh(&sock->sk_callback_lock); + osock = xchg(&stab->sock_map[i], sock); + if (osock) { + struct smap_psock *opsock = smap_psock_sk(osock); + + write_lock_bh(&osock->sk_callback_lock); + if (osock != sock && parse) + smap_stop_sock(opsock, osock); + smap_list_remove(opsock, &stab->sock_map[i]); + smap_release_sock(opsock, osock); + write_unlock_bh(&osock->sk_callback_lock); + } return 0; -out: +out_free: + smap_release_sock(psock, sock); +out_progs: + if (verdict) + bpf_prog_put(verdict); + if (parse) + bpf_prog_put(parse); write_unlock_bh(&sock->sk_callback_lock); - if (!update) - smap_release_sock(sock); + kfree(e); return err; } @@ -768,8 +831,7 @@ static int sock_map_update_elem(struct bpf_map *map, return -EINVAL; } - err = sock_map_ctx_update_elem(&skops, map, key, - flags, BPF_SOCKMAP_STRPARSER); + err = sock_map_ctx_update_elem(&skops, map, key, flags); fput(socket->file); return err; } @@ -783,11 +845,11 @@ const struct bpf_map_ops sock_map_ops = { .map_delete_elem = sock_map_delete_elem, }; -BPF_CALL_5(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, - struct bpf_map *, map, void *, key, u64, flags, u64, map_flags) +BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, + struct bpf_map *, map, void *, key, u64, flags) { WARN_ON_ONCE(!rcu_read_lock_held()); - return sock_map_ctx_update_elem(bpf_sock, map, key, flags, map_flags); + return sock_map_ctx_update_elem(bpf_sock, map, key, flags); } const struct bpf_func_proto bpf_sock_map_update_proto = { @@ -799,5 +861,4 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_PTR_TO_MAP_KEY, .arg4_type = ARG_ANYTHING, - .arg5_type = ARG_ANYTHING, }; -- cgit v1.2.3 From 2804fd3af6ba5ae5737705b27146455eabe2e2f8 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Mon, 28 Aug 2017 15:03:13 -0400 Subject: if_ether: add forces ife lfb type This patch adds the forces IFE lfb type according to IEEE registered ethertypes. 
See http://standards-oui.ieee.org/ethertype/eth.txt for more information. Since there exists the IFE subsystem it can be used there. This patch also use the correct word "ForCES" instead of "FoRCES" which is a spelling error inside the IEEE ethertype specification. Signed-off-by: Alexander Aring Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index efeb1190c2ca..f68f6bf4a253 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -104,6 +104,7 @@ #define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_QINQ3 0x9300 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ +#define ETH_P_IFE 0xED3E /* ForCES inter-FE LFB type */ #define ETH_P_AF_IUCV 0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_802_3_MIN 0x0600 /* If the value in the ethernet type is less than this value -- cgit v1.2.3 From 155e6f649757c902901e599c268f8b575ddac1f8 Mon Sep 17 00:00:00 2001 From: Jiri Benc Date: Mon, 28 Aug 2017 21:43:21 +0200 Subject: ether: add NSH ethertype The NSH draft says: An IEEE EtherType, 0x894F, has been allocated for NSH. Signed-off-by: Jiri Benc Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index f68f6bf4a253..61f7ccce5b69 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -99,6 +99,7 @@ #define ETH_P_FIP 0x8914 /* FCoE Initialization Protocol */ #define ETH_P_80221 0x8917 /* IEEE 802.21 Media Independent Handover Protocol */ #define ETH_P_HSR 0x892F /* IEC 62439-3 HSRv1 */ +#define ETH_P_NSH 0x894F /* Network Service Header */ #define ETH_P_LOOPBACK 0x9000 /* Ethernet loopback packet, per IEEE 802.3 */ #define ETH_P_QINQ1 0x9100 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ #define ETH_P_QINQ2 0x9200 /* deprecated QinQ VLAN [ NOT AN OFFICIALLY REGISTERED ID ] */ -- cgit v1.2.3 From 31770e34e43d6c8dee129bfee77e56c34e61f0e5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 30 Aug 2017 19:24:58 +0200 Subject: tcp: Revert "tcp: remove header prediction" This reverts commit 45f119bf936b1f9f546a0b139c5b56f9bb2bdc78. Eric Dumazet says: We found at Google a significant regression caused by 45f119bf936b1f9f546a0b139c5b56f9bb2bdc78 tcp: remove header prediction In typical RPC (TCP_RR), when a TCP socket receives data, we now call tcp_ack() while we used to not call it. This touches enough cache lines to cause a slowdown. so problem does not seem to be HP removal itself but the tcp_ack() call. Therefore, it might be possible to remove HP after all, provided one finds a way to elide tcp_ack for most cases. Reported-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: David S. 
Miller --- include/linux/tcp.h | 6 ++ include/net/tcp.h | 23 ++++++ include/uapi/linux/snmp.h | 2 + net/ipv4/proc.c | 2 + net/ipv4/tcp.c | 4 +- net/ipv4/tcp_input.c | 188 ++++++++++++++++++++++++++++++++++++++++++++-- net/ipv4/tcp_minisocks.c | 2 + net/ipv4/tcp_output.c | 2 + 8 files changed, 223 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 267164a1d559..4aa40ef02d32 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -147,6 +147,12 @@ struct tcp_sock { u16 tcp_header_len; /* Bytes of tcp header to send */ u16 gso_segs; /* Max number of segs per GSO packet */ +/* + * Header prediction flags + * 0x5?10 << 16 + snd_wnd in net byte order + */ + __be32 pred_flags; + /* * RFC793 variables by their proper names. This means you can * read the code and the spec side by side (and laugh ...) diff --git a/include/net/tcp.h b/include/net/tcp.h index c546d13ffbca..9c3db054e47f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -634,6 +634,29 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp) return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } +static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) +{ + tp->pred_flags = htonl((tp->tcp_header_len << 26) | + ntohl(TCP_FLAG_ACK) | + snd_wnd); +} + +static inline void tcp_fast_path_on(struct tcp_sock *tp) +{ + __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); +} + +static inline void tcp_fast_path_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && + tp->rcv_wnd && + atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && + !tp->urg_data) + tcp_fast_path_on(tp); +} + /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(struct sock *sk) { diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index b3f346fb9fe3..758f12b58541 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -184,7 +184,9 @@ enum LINUX_MIB_DELAYEDACKLOST, /* DelayedACKLost */ LINUX_MIB_LISTENOVERFLOWS, /* ListenOverflows */ LINUX_MIB_LISTENDROPS, /* ListenDrops */ + LINUX_MIB_TCPHPHITS, /* TCPHPHits */ LINUX_MIB_TCPPUREACKS, /* TCPPureAcks */ + LINUX_MIB_TCPHPACKS, /* TCPHPAcks */ LINUX_MIB_TCPRENORECOVERY, /* TCPRenoRecovery */ LINUX_MIB_TCPSACKRECOVERY, /* TCPSackRecovery */ LINUX_MIB_TCPSACKRENEGING, /* TCPSACKReneging */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index b6d3fe03feb3..127153f1ed8a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -206,7 +206,9 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS), SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), + SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), + SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 566083ee2654..21ca2df274c5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1963,8 +1963,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, tcp_rcv_space_adjust(sk); skip_copy: - if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) + if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { tp->urg_data = 0; + 
tcp_fast_path_check(sk); + } if (used + offset < skb->len) continue; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a0e436366d31..c5d7656beeee 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -103,6 +103,7 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2; #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_LOST_RETRANS 0x80 /* This ACK marks some retransmission lost */ +#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ORIG_SACK_ACKED 0x200 /* Never retransmitted data are (s)acked */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained D-SACK info */ @@ -3371,6 +3372,12 @@ static int tcp_ack_update_window(struct sock *sk, const struct sk_buff *skb, u32 if (tp->snd_wnd != nwin) { tp->snd_wnd = nwin; + /* Note, it is the only place, where + * fast path is recovered for sending TCP. + */ + tp->pred_flags = 0; + tcp_fast_path_check(sk); + if (tcp_send_head(sk)) tcp_slow_start_after_idle_check(sk); @@ -3592,7 +3599,19 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (flag & FLAG_UPDATE_TS_RECENT) tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); - { + if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) { + /* Window is constant, pure forward advance. + * No more checks are required. + * Note, we use the fact that SND.UNA>=SND.WL2. + */ + tcp_update_wl(tp, ack_seq); + tcp_snd_una_update(tp, ack); + flag |= FLAG_WIN_UPDATE; + + tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); + + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); + } else { u32 ack_ev_flags = CA_ACK_SLOWPATH; if (ack_seq != TCP_SKB_CB(skb)->end_seq) @@ -4407,6 +4426,8 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) if (TCP_SKB_CB(skb)->has_rxtstamp) TCP_SKB_CB(skb)->swtstamp = skb->tstamp; + /* Disable header prediction. */ + tp->pred_flags = 0; inet_csk_schedule_ack(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPOFOQUEUE); @@ -4647,6 +4668,8 @@ queue_and_out: if (tp->rx_opt.num_sacks) tcp_sack_remove(tp); + tcp_fast_path_check(sk); + if (eaten > 0) kfree_skb_partial(skb, fragstolen); if (!sock_flag(sk, SOCK_DEAD)) @@ -4972,6 +4995,7 @@ static int tcp_prune_queue(struct sock *sk) NET_INC_STATS(sock_net(sk), LINUX_MIB_RCVPRUNED); /* Massive buffer overcommit. */ + tp->pred_flags = 0; return -1; } @@ -5143,6 +5167,9 @@ static void tcp_check_urg(struct sock *sk, const struct tcphdr *th) tp->urg_data = TCP_URG_NOTYET; tp->urg_seq = ptr; + + /* Disable header prediction. */ + tp->pred_flags = 0; } /* This is the 'fast' part of urgent handling. */ @@ -5301,6 +5328,26 @@ discard: /* * TCP receive function for the ESTABLISHED state. + * + * It is split into a fast path and a slow path. The fast path is + * disabled when: + * - A zero window was announced from us - zero window probing + * is only handled properly in the slow path. + * - Out of order segments arrived. + * - Urgent data is expected. + * - There is no buffer space left + * - Unexpected TCP flags/window values/header lengths are received + * (detected by checking the TCP header against pred_flags) + * - Data is sent in both directions. Fast path only supports pure senders + * or pure receivers (this means either the sequence number or the ack + * value must stay constant) + * - Unexpected TCP option. 
+ * + * When these conditions are not satisfied it drops into a standard + * receive procedure patterned after RFC793 to handle all cases. + * The first three cases are guaranteed by proper pred_flags setting, + * the rest is checked inline. Fast processing is turned on in + * tcp_data_queue when everything is OK. */ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct tcphdr *th) @@ -5311,19 +5358,144 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tcp_mstamp_refresh(tp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); + /* + * Header prediction. + * The code loosely follows the one in the famous + * "30 instruction TCP receive" Van Jacobson mail. + * + * Van's trick is to deposit buffers into socket queue + * on a device interrupt, to call tcp_recv function + * on the receive process context and checksum and copy + * the buffer to user space. smart... + * + * Our current scheme is not silly either but we take the + * extra cost of the net_bh soft interrupt processing... + * We do checksum and copy also but from device to kernel. + */ tp->rx_opt.saw_tstamp = 0; + /* pred_flags is 0xS?10 << 16 + snd_wnd + * if header_prediction is to be made + * 'S' will always be tp->tcp_header_len >> 2 + * '?' will be 0 for the fast path, otherwise pred_flags is 0 to + * turn it off (when there are holes in the receive + * space for instance) + * PSH flag is ignored. + */ + + if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && + TCP_SKB_CB(skb)->seq == tp->rcv_nxt && + !after(TCP_SKB_CB(skb)->ack_seq, tp->snd_nxt)) { + int tcp_header_len = tp->tcp_header_len; + + /* Timestamp header prediction: tcp_header_len + * is automatically equal to th->doff*4 due to pred_flags + * match. + */ + + /* Check timestamp */ + if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { + /* No? Slow path! */ + if (!tcp_parse_aligned_timestamp(tp, th)) + goto slow_path; + + /* If PAWS failed, check it more carefully in slow path */ + if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) + goto slow_path; + + /* DO NOT update ts_recent here, if checksum fails + * and timestamp was corrupted part, it will result + * in a hung connection since we will drop all + * future packets due to the PAWS test. + */ + } + + if (len <= tcp_header_len) { + /* Bulk data transfer: sender */ + if (len == tcp_header_len) { + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + /* We know that such packets are checksummed + * on entry. + */ + tcp_ack(sk, skb, 0); + __kfree_skb(skb); + tcp_data_snd_check(sk); + return; + } else { /* Header too small */ + TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS); + goto discard; + } + } else { + int eaten = 0; + bool fragstolen = false; + + if (tcp_checksum_complete(skb)) + goto csum_error; + + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; + + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. 
+ * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + tcp_rcv_rtt_measure_ts(sk, skb); + + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPHITS); + + /* Bulk data transfer: receiver */ + eaten = tcp_queue_rcv(sk, skb, tcp_header_len, + &fragstolen); + + tcp_event_data_recv(sk, skb); + + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { + /* Well, only one small jumplet in fast path... */ + tcp_ack(sk, skb, FLAG_DATA); + tcp_data_snd_check(sk); + if (!inet_csk_ack_scheduled(sk)) + goto no_ack; + } + + __tcp_ack_snd_check(sk, 0); +no_ack: + if (eaten) + kfree_skb_partial(skb, fragstolen); + sk->sk_data_ready(sk); + return; + } + } + +slow_path: if (len < (th->doff << 2) || tcp_checksum_complete(skb)) goto csum_error; if (!th->ack && !th->rst && !th->syn) goto discard; + /* + * Standard slow path. + */ + if (!tcp_validate_incoming(sk, skb, th, 1)) return; - if (tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT) < 0) +step5: + if (tcp_ack(sk, skb, FLAG_SLOWPATH | FLAG_UPDATE_TS_RECENT) < 0) goto discard; tcp_rcv_rtt_measure_ts(sk, skb); @@ -5376,6 +5548,11 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) if (sock_flag(sk, SOCK_KEEPOPEN)) inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tp)); + + if (!tp->rx_opt.snd_wscale) + __tcp_fast_path_on(tp, tp->snd_wnd); + else + tp->pred_flags = 0; } static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, @@ -5504,7 +5681,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_ecn_rcv_synack(tp, th); tcp_init_wl(tp, TCP_SKB_CB(skb)->seq); - tcp_ack(sk, skb, 0); + tcp_ack(sk, skb, FLAG_SLOWPATH); /* Ok.. it's good. Set up sequence numbers and * move to established. @@ -5740,8 +5917,8 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; /* step 5: check the ACK field */ - - acceptable = tcp_ack(sk, skb, FLAG_UPDATE_TS_RECENT | + acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH | + FLAG_UPDATE_TS_RECENT | FLAG_NO_CHALLENGE_ACK) > 0; if (!acceptable) { @@ -5809,6 +5986,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) tp->lsndtime = tcp_jiffies32; tcp_initialize_rcv_mss(sk); + tcp_fast_path_on(tp); break; case TCP_FIN_WAIT1: { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1537b87c657f..188a6f31356d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -436,6 +436,8 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, struct tcp_sock *newtp = tcp_sk(newsk); /* Now setup tcp_sock */ + newtp->pred_flags = 0; + newtp->rcv_wup = newtp->copied_seq = newtp->rcv_nxt = treq->rcv_isn + 1; newtp->segs_in = 1; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3e0d19631534..5b6690d05abb 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -295,7 +295,9 @@ static u16 tcp_select_window(struct sock *sk) /* RFC1323 scaling applied */ new_win >>= tp->rx_opt.rcv_wscale; + /* If we advertise zero window, disable fast path. */ if (new_win == 0) { + tp->pred_flags = 0; if (old_win) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTOZEROWINDOWADV); -- cgit v1.2.3 From 7373ae7e8f0bf2c0718422481da986db5058b005 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Tue, 29 Aug 2017 22:44:16 -0600 Subject: net: ether: Add support for multiplexing and aggregation type Define the Qualcomm multiplexing and aggregation (MAP) ether type 0x00F9. 
This is needed for receiving data in the MAP protocol like RMNET. This is not an officially registered ID. Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- include/uapi/linux/if_ether.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h index 61f7ccce5b69..9037065e23d0 100644 --- a/include/uapi/linux/if_ether.h +++ b/include/uapi/linux/if_ether.h @@ -140,6 +140,9 @@ #define ETH_P_IEEE802154 0x00F6 /* IEEE802.15.4 frame */ #define ETH_P_CAIF 0x00F7 /* ST-Ericsson CAIF protocol */ #define ETH_P_XDSA 0x00F8 /* Multiplexed DSA protocol */ +#define ETH_P_MAP 0x00F9 /* Qualcomm multiplexing and + * aggregation protocol + */ /* * This is an Ethernet frame header. -- cgit v1.2.3 From cdf4969c42a6c1a376dd03a9e846cf638d3cd4b1 Mon Sep 17 00:00:00 2001 From: Subash Abhinov Kasiviswanathan Date: Tue, 29 Aug 2017 22:44:17 -0600 Subject: net: arp: Add support for raw IP device Define the raw IP type. This is needed for raw IP net devices like rmnet. Signed-off-by: Subash Abhinov Kasiviswanathan Signed-off-by: David S. Miller --- include/uapi/linux/if_arp.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h index cf73510b9238..a2a635620600 100644 --- a/include/uapi/linux/if_arp.h +++ b/include/uapi/linux/if_arp.h @@ -59,6 +59,7 @@ #define ARPHRD_LAPB 516 /* LAPB */ #define ARPHRD_DDCMP 517 /* Digital's DDCMP protocol */ #define ARPHRD_RAWHDLC 518 /* Raw HDLC */ +#define ARPHRD_RAWIP 519 /* Raw IP */ #define ARPHRD_TUNNEL 768 /* IPIP tunnel */ #define ARPHRD_TUNNEL6 769 /* IP6IP6 tunnel */ -- cgit v1.2.3 From 1797f5b3cf0b3a73c42b89f7a8fd897417373730 Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Thu, 31 Aug 2017 17:59:12 +0200 Subject: devlink: Add IPv6 header for dpipe This will be used by the IPv6 host table which will be introduced in the following patches. The fields in the header are added per-use. This header is global and can be reused by many drivers. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/net/devlink.h | 1 + include/uapi/linux/devlink.h | 5 +++++ net/core/devlink.c | 17 +++++++++++++++++ 3 files changed, 23 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index aaf7178127a2..b9654e133599 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -330,6 +330,7 @@ int devlink_dpipe_match_put(struct sk_buff *skb, struct devlink_dpipe_match *match); extern struct devlink_dpipe_header devlink_dpipe_header_ethernet; extern struct devlink_dpipe_header devlink_dpipe_header_ipv4; +extern struct devlink_dpipe_header devlink_dpipe_header_ipv6; #else diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 6c172548589d..0cbca96c66b9 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -234,9 +234,14 @@ enum devlink_dpipe_field_ipv4_id { DEVLINK_DPIPE_FIELD_IPV4_DST_IP, }; +enum devlink_dpipe_field_ipv6_id { + DEVLINK_DPIPE_FIELD_IPV6_DST_IP, +}; + enum devlink_dpipe_header_id { DEVLINK_DPIPE_HEADER_ETHERNET, DEVLINK_DPIPE_HEADER_IPV4, + DEVLINK_DPIPE_HEADER_IPV6, }; #endif /* _UAPI_LINUX_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index cbc4b0461b0f..7d430c1d9c3e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -63,6 +63,23 @@ struct devlink_dpipe_header devlink_dpipe_header_ipv4 = { }; EXPORT_SYMBOL(devlink_dpipe_header_ipv4); +static struct devlink_dpipe_field devlink_dpipe_fields_ipv6[] = { + { + .name = "destination ip", + .id = DEVLINK_DPIPE_FIELD_IPV6_DST_IP, + .bitwidth = 128, + }, +}; + +struct devlink_dpipe_header devlink_dpipe_header_ipv6 = { + .name = "ipv6", + .id = DEVLINK_DPIPE_HEADER_IPV6, + .fields = devlink_dpipe_fields_ipv6, + .fields_count = ARRAY_SIZE(devlink_dpipe_fields_ipv6), + .global = true, +}; +EXPORT_SYMBOL(devlink_dpipe_header_ipv6); + EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); static LIST_HEAD(devlink_list); -- cgit v1.2.3 From 482dca939fb7ee35ba20b944b4c2476133dbf0df Mon Sep 17 00:00:00 2001 From: David Ahern Date: Thu, 31 Aug 2017 15:05:44 -0700 Subject: bpf: Add mark and priority to sock options that can be set Add socket mark and priority to fields that can be set by ebpf program when a socket is created. Signed-off-by: David Ahern Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/uapi/linux/bpf.h | 2 ++ net/core/filter.c | 26 ++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 08c206a863e1..ba848b761cfb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -758,6 +758,8 @@ struct bpf_sock { __u32 family; __u32 type; __u32 protocol; + __u32 mark; + __u32 priority; }; #define XDP_PACKET_HEADROOM 256 diff --git a/net/core/filter.c b/net/core/filter.c index c6a37fe0285b..f51b9690adf3 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3455,6 +3455,10 @@ static bool sock_filter_is_valid_access(int off, int size, switch (off) { case offsetof(struct bpf_sock, bound_dev_if): break; + case offsetof(struct bpf_sock, mark): + break; + case offsetof(struct bpf_sock, priority): + break; default: return false; } @@ -3958,6 +3962,28 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, offsetof(struct sock, sk_bound_dev_if)); break; + case offsetof(struct bpf_sock, mark): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_mark) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_mark)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_mark)); + break; + + case offsetof(struct bpf_sock, priority): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_priority) != 4); + + if (type == BPF_WRITE) + *insn++ = BPF_STX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_priority)); + else + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sock, sk_priority)); + break; + case offsetof(struct bpf_sock, family): BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); -- cgit v1.2.3 From c03fa9bcacd9ac04595cc13f34f3445f0a5ecf13 Mon Sep 17 00:00:00 2001 From: Ivan Delalande Date: Thu, 31 Aug 2017 09:59:39 -0700 Subject: tcp_diag: report TCP MD5 signing keys and addresses Report TCP MD5 (RFC2385) signing keys, addresses and address prefixes to processes with CAP_NET_ADMIN requesting INET_DIAG_INFO. Currently it is not possible to retrieve these from the kernel once they have been configured on sockets. Signed-off-by: Ivan Delalande Acked-by: Eric Dumazet Signed-off-by: David S. 
Miller --- include/uapi/linux/inet_diag.h | 1 + include/uapi/linux/tcp.h | 9 ++++ net/ipv4/tcp_diag.c | 109 ++++++++++++++++++++++++++++++++++++++--- 3 files changed, 113 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h index 678496897a68..f52ff62bfabe 100644 --- a/include/uapi/linux/inet_diag.h +++ b/include/uapi/linux/inet_diag.h @@ -143,6 +143,7 @@ enum { INET_DIAG_MARK, INET_DIAG_BBRINFO, INET_DIAG_CLASS_ID, + INET_DIAG_MD5SIG, __INET_DIAG_MAX, }; diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h index 030e594bab45..15c25eccab2b 100644 --- a/include/uapi/linux/tcp.h +++ b/include/uapi/linux/tcp.h @@ -256,4 +256,13 @@ struct tcp_md5sig { __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; /* key (binary) */ }; +/* INET_DIAG_MD5SIG */ +struct tcp_diag_md5sig { + __u8 tcpm_family; + __u8 tcpm_prefixlen; + __u16 tcpm_keylen; + __be32 tcpm_addr[4]; + __u8 tcpm_key[TCP_MD5SIG_MAXKEYLEN]; +}; + #endif /* _UAPI_LINUX_TCP_H */ diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index a748c74aa8b7..abbf0edcf6c2 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -16,6 +16,7 @@ #include +#include #include static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, @@ -36,6 +37,100 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r, tcp_get_info(sk, info); } +#ifdef CONFIG_TCP_MD5SIG +static void tcp_diag_md5sig_fill(struct tcp_diag_md5sig *info, + const struct tcp_md5sig_key *key) +{ + info->tcpm_family = key->family; + info->tcpm_prefixlen = key->prefixlen; + info->tcpm_keylen = key->keylen; + memcpy(info->tcpm_key, key->key, key->keylen); + + if (key->family == AF_INET) + info->tcpm_addr[0] = key->addr.a4.s_addr; + #if IS_ENABLED(CONFIG_IPV6) + else if (key->family == AF_INET6) + memcpy(&info->tcpm_addr, &key->addr.a6, + sizeof(info->tcpm_addr)); + #endif +} + +static int tcp_diag_put_md5sig(struct sk_buff *skb, + const struct tcp_md5sig_info *md5sig) +{ + const struct tcp_md5sig_key *key; + struct tcp_diag_md5sig *info; + struct nlattr *attr; + int md5sig_count = 0; + + hlist_for_each_entry_rcu(key, &md5sig->head, node) + md5sig_count++; + if (md5sig_count == 0) + return 0; + + attr = nla_reserve(skb, INET_DIAG_MD5SIG, + md5sig_count * sizeof(struct tcp_diag_md5sig)); + if (!attr) + return -EMSGSIZE; + + info = nla_data(attr); + memset(info, 0, md5sig_count * sizeof(struct tcp_diag_md5sig)); + hlist_for_each_entry_rcu(key, &md5sig->head, node) { + tcp_diag_md5sig_fill(info++, key); + if (--md5sig_count == 0) + break; + } + + return 0; +} +#endif + +static int tcp_diag_get_aux(struct sock *sk, bool net_admin, + struct sk_buff *skb) +{ +#ifdef CONFIG_TCP_MD5SIG + if (net_admin) { + struct tcp_md5sig_info *md5sig; + int err = 0; + + rcu_read_lock(); + md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); + if (md5sig) + err = tcp_diag_put_md5sig(skb, md5sig); + rcu_read_unlock(); + if (err < 0) + return err; + } +#endif + + return 0; +} + +static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) +{ + size_t size = 0; + +#ifdef CONFIG_TCP_MD5SIG + if (net_admin && sk_fullsock(sk)) { + const struct tcp_md5sig_info *md5sig; + const struct tcp_md5sig_key *key; + size_t md5sig_count = 0; + + rcu_read_lock(); + md5sig = rcu_dereference(tcp_sk(sk)->md5sig_info); + if (md5sig) { + hlist_for_each_entry_rcu(key, &md5sig->head, node) + md5sig_count++; + } + rcu_read_unlock(); + size += nla_total_size(md5sig_count * + sizeof(struct tcp_diag_md5sig)); + } +#endif 
+ + return size; +} + static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r, struct nlattr *bc) { @@ -68,13 +163,15 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, #endif static const struct inet_diag_handler tcp_diag_handler = { - .dump = tcp_diag_dump, - .dump_one = tcp_diag_dump_one, - .idiag_get_info = tcp_diag_get_info, - .idiag_type = IPPROTO_TCP, - .idiag_info_size = sizeof(struct tcp_info), + .dump = tcp_diag_dump, + .dump_one = tcp_diag_dump_one, + .idiag_get_info = tcp_diag_get_info, + .idiag_get_aux = tcp_diag_get_aux, + .idiag_get_aux_size = tcp_diag_get_aux_size, + .idiag_type = IPPROTO_TCP, + .idiag_info_size = sizeof(struct tcp_info), #ifdef CONFIG_INET_DIAG_DESTROY - .destroy = tcp_diag_destroy, + .destroy = tcp_diag_destroy, #endif }; -- cgit v1.2.3 From bea74641e3786d51dcf1175527cc1781420961c9 Mon Sep 17 00:00:00 2001 From: Vishwanath Pai Date: Fri, 18 Aug 2017 20:58:59 +0200 Subject: netfilter: xt_hashlimit: add rate match mode This patch adds a new feature to hashlimit that allows matching on the current packet/byte rate without rate limiting. This can be enabled with a new flag --hashlimit-rate-match. The match returns true if the current rate of packets is above/below the user specified value. The main difference between the existing algorithm and the new one is that the existing algorithm rate-limits the flow whereas the new algorithm does not. Instead it *classifies* the flow based on whether it is above or below a certain rate. I will demonstrate this with an example below. Let us assume this rule: iptables -A INPUT -m hashlimit --hashlimit-above 10/s -j new_chain If the packet rate is 15/s, the existing algorithm would ACCEPT 10 packets every second and send 5 packets to "new_chain". But with the new algorithm, as long as the rate of 15/s is sustained, all packets will continue to match and every packet is sent to new_chain. This new functionality will let us classify different flows based on their current rate, so that further decisions can be made on them based on what the current rate is. This is how the new algorithm works: We divide time into intervals of 1 (sec/min/hour) as specified by the user. We keep track of the number of packets/bytes processed in the current interval. After each interval we reset the counter to 0. When we receive a packet for match, we look at the packet rate during the current interval and the previous interval to make a decision: if [ prev_rate < user and cur_rate < user ] return Below else return Above Where cur_rate is the number of packets/bytes seen in the current interval, prev is the number of packets/bytes seen in the previous interval and 'user' is the rate specified by the user. We also provide flexibility to the user for choosing the time interval using the option --hashilmit-interval. For example the user can keep a low rate like x/hour but still keep the interval as small as 1 second. To preserve backwards compatibility we have to add this feature in a new revision, so I've created revision 3 for hashlimit. The two new options we add are: --hashlimit-rate-match --hashlimit-rate-interval I have updated the help text to add these new options. Also added a few tests for the new options. 
Suggested-by: Igor Lubashev Reviewed-by: Josh Hunt Signed-off-by: Vishwanath Pai Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/xt_hashlimit.h | 3 +- include/uapi/linux/netfilter/xt_hashlimit.h | 36 +++- net/netfilter/xt_hashlimit.c | 277 +++++++++++++++++++++++++--- 3 files changed, 285 insertions(+), 31 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netfilter/xt_hashlimit.h b/include/linux/netfilter/xt_hashlimit.h index 074790c0cf74..0fc458bde80b 100644 --- a/include/linux/netfilter/xt_hashlimit.h +++ b/include/linux/netfilter/xt_hashlimit.h @@ -5,5 +5,6 @@ #define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \ XT_HASHLIMIT_HASH_SIP | XT_HASHLIMIT_HASH_SPT | \ - XT_HASHLIMIT_INVERT | XT_HASHLIMIT_BYTES) + XT_HASHLIMIT_INVERT | XT_HASHLIMIT_BYTES |\ + XT_HASHLIMIT_RATE_MATCH) #endif /*_XT_HASHLIMIT_H*/ diff --git a/include/uapi/linux/netfilter/xt_hashlimit.h b/include/uapi/linux/netfilter/xt_hashlimit.h index 79da349f1060..aa98573248b1 100644 --- a/include/uapi/linux/netfilter/xt_hashlimit.h +++ b/include/uapi/linux/netfilter/xt_hashlimit.h @@ -19,12 +19,13 @@ struct xt_hashlimit_htable; enum { - XT_HASHLIMIT_HASH_DIP = 1 << 0, - XT_HASHLIMIT_HASH_DPT = 1 << 1, - XT_HASHLIMIT_HASH_SIP = 1 << 2, - XT_HASHLIMIT_HASH_SPT = 1 << 3, - XT_HASHLIMIT_INVERT = 1 << 4, - XT_HASHLIMIT_BYTES = 1 << 5, + XT_HASHLIMIT_HASH_DIP = 1 << 0, + XT_HASHLIMIT_HASH_DPT = 1 << 1, + XT_HASHLIMIT_HASH_SIP = 1 << 2, + XT_HASHLIMIT_HASH_SPT = 1 << 3, + XT_HASHLIMIT_INVERT = 1 << 4, + XT_HASHLIMIT_BYTES = 1 << 5, + XT_HASHLIMIT_RATE_MATCH = 1 << 6, }; struct hashlimit_cfg { @@ -79,6 +80,21 @@ struct hashlimit_cfg2 { __u8 srcmask, dstmask; }; +struct hashlimit_cfg3 { + __u64 avg; /* Average secs between packets * scale */ + __u64 burst; /* Period multiplier for upper limit. */ + __u32 mode; /* bitmask of XT_HASHLIMIT_HASH_* */ + + /* user specified */ + __u32 size; /* how many buckets */ + __u32 max; /* max number of entries */ + __u32 gc_interval; /* gc interval */ + __u32 expire; /* when do entries expire? 
*/ + + __u32 interval; + __u8 srcmask, dstmask; +}; + struct xt_hashlimit_mtinfo1 { char name[IFNAMSIZ]; struct hashlimit_cfg1 cfg; @@ -95,4 +111,12 @@ struct xt_hashlimit_mtinfo2 { struct xt_hashlimit_htable *hinfo __attribute__((aligned(8))); }; +struct xt_hashlimit_mtinfo3 { + char name[NAME_MAX]; + struct hashlimit_cfg3 cfg; + + /* Used internally by the kernel */ + struct xt_hashlimit_htable *hinfo __attribute__((aligned(8))); +}; + #endif /* _UAPI_XT_HASHLIMIT_H */ diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index ffdb611e54a2..10d48234f5f4 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -56,6 +56,7 @@ static inline struct hashlimit_net *hashlimit_pernet(struct net *net) } /* need to declare this at the top */ +static const struct file_operations dl_file_ops_v2; static const struct file_operations dl_file_ops_v1; static const struct file_operations dl_file_ops; @@ -87,8 +88,19 @@ struct dsthash_ent { unsigned long expires; /* precalculated expiry time */ struct { unsigned long prev; /* last modification */ - u_int64_t credit; - u_int64_t credit_cap, cost; + union { + struct { + u_int64_t credit; + u_int64_t credit_cap; + u_int64_t cost; + }; + struct { + u_int32_t interval, prev_window; + u_int64_t current_rate; + u_int64_t rate; + int64_t burst; + }; + }; } rateinfo; struct rcu_head rcu; }; @@ -99,7 +111,7 @@ struct xt_hashlimit_htable { u_int8_t family; bool rnd_initialized; - struct hashlimit_cfg2 cfg; /* config */ + struct hashlimit_cfg3 cfg; /* config */ /* used internally */ spinlock_t lock; /* lock for list_head */ @@ -116,10 +128,10 @@ struct xt_hashlimit_htable { }; static int -cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision) +cfg_copy(struct hashlimit_cfg3 *to, const void *from, int revision) { if (revision == 1) { - struct hashlimit_cfg1 *cfg = from; + struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from; to->mode = cfg->mode; to->avg = cfg->avg; @@ -131,7 +143,19 @@ cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision) to->srcmask = cfg->srcmask; to->dstmask = cfg->dstmask; } else if (revision == 2) { - memcpy(to, from, sizeof(struct hashlimit_cfg2)); + struct hashlimit_cfg2 *cfg = (struct hashlimit_cfg2 *)from; + + to->mode = cfg->mode; + to->avg = cfg->avg; + to->burst = cfg->burst; + to->size = cfg->size; + to->max = cfg->max; + to->gc_interval = cfg->gc_interval; + to->expire = cfg->expire; + to->srcmask = cfg->srcmask; + to->dstmask = cfg->dstmask; + } else if (revision == 3) { + memcpy(to, from, sizeof(struct hashlimit_cfg3)); } else { return -EINVAL; } @@ -240,13 +264,14 @@ dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent) } static void htable_gc(struct work_struct *work); -static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg, +static int htable_create(struct net *net, struct hashlimit_cfg3 *cfg, const char *name, u_int8_t family, struct xt_hashlimit_htable **out_hinfo, int revision) { struct hashlimit_net *hashlimit_net = hashlimit_pernet(net); struct xt_hashlimit_htable *hinfo; + const struct file_operations *fops; unsigned int size, i; int ret; @@ -268,7 +293,7 @@ static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg, *out_hinfo = hinfo; /* copy match config into hashtable config */ - ret = cfg_copy(&hinfo->cfg, (void *)cfg, 2); + ret = cfg_copy(&hinfo->cfg, (void *)cfg, 3); if (ret) return ret; @@ -293,11 +318,21 @@ static int htable_create(struct net *net, struct hashlimit_cfg2 *cfg, } spin_lock_init(&hinfo->lock); + switch 
(revision) { + case 1: + fops = &dl_file_ops_v1; + break; + case 2: + fops = &dl_file_ops_v2; + break; + default: + fops = &dl_file_ops; + } + hinfo->pde = proc_create_data(name, 0, (family == NFPROTO_IPV4) ? hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit, - (revision == 1) ? &dl_file_ops_v1 : &dl_file_ops, - hinfo); + fops, hinfo); if (hinfo->pde == NULL) { kfree(hinfo->name); vfree(hinfo); @@ -482,6 +517,25 @@ static u32 user2credits_byte(u32 user) return (u32) (us >> 32); } +static u64 user2rate(u64 user) +{ + if (user != 0) { + return div64_u64(XT_HASHLIMIT_SCALE_v2, user); + } else { + pr_warn("invalid rate from userspace: %llu\n", user); + return 0; + } +} + +static u64 user2rate_bytes(u64 user) +{ + u64 r; + + r = user ? 0xFFFFFFFFULL / user : 0xFFFFFFFFULL; + r = (r - 1) << 4; + return r; +} + static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, u32 mode, int revision) { @@ -491,6 +545,21 @@ static void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now, if (delta == 0) return; + if (revision >= 3 && mode & XT_HASHLIMIT_RATE_MATCH) { + u64 interval = dh->rateinfo.interval * HZ; + + if (delta < interval) + return; + + dh->rateinfo.prev = now; + dh->rateinfo.prev_window = + ((dh->rateinfo.current_rate * interval) > + (delta * dh->rateinfo.rate)); + dh->rateinfo.current_rate = 0; + + return; + } + dh->rateinfo.prev = now; if (mode & XT_HASHLIMIT_BYTES) { @@ -515,7 +584,23 @@ static void rateinfo_init(struct dsthash_ent *dh, struct xt_hashlimit_htable *hinfo, int revision) { dh->rateinfo.prev = jiffies; - if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { + if (revision >= 3 && hinfo->cfg.mode & XT_HASHLIMIT_RATE_MATCH) { + dh->rateinfo.prev_window = 0; + dh->rateinfo.current_rate = 0; + if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { + dh->rateinfo.rate = user2rate_bytes(hinfo->cfg.avg); + if (hinfo->cfg.burst) + dh->rateinfo.burst = + hinfo->cfg.burst * dh->rateinfo.rate; + else + dh->rateinfo.burst = dh->rateinfo.rate; + } else { + dh->rateinfo.rate = user2rate(hinfo->cfg.avg); + dh->rateinfo.burst = + hinfo->cfg.burst + dh->rateinfo.rate; + } + dh->rateinfo.interval = hinfo->cfg.interval; + } else if (hinfo->cfg.mode & XT_HASHLIMIT_BYTES) { dh->rateinfo.credit = CREDITS_PER_JIFFY_BYTES * HZ; dh->rateinfo.cost = user2credits_byte(hinfo->cfg.avg); dh->rateinfo.credit_cap = hinfo->cfg.burst; @@ -648,7 +733,7 @@ static u32 hashlimit_byte_cost(unsigned int len, struct dsthash_ent *dh) static bool hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par, struct xt_hashlimit_htable *hinfo, - const struct hashlimit_cfg2 *cfg, int revision) + const struct hashlimit_cfg3 *cfg, int revision) { unsigned long now = jiffies; struct dsthash_ent *dh; @@ -680,6 +765,20 @@ hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par, rateinfo_recalc(dh, now, hinfo->cfg.mode, revision); } + if (cfg->mode & XT_HASHLIMIT_RATE_MATCH) { + cost = (cfg->mode & XT_HASHLIMIT_BYTES) ? 
skb->len : 1; + dh->rateinfo.current_rate += cost; + + if (!dh->rateinfo.prev_window && + (dh->rateinfo.current_rate <= dh->rateinfo.burst)) { + spin_unlock(&dh->lock); + rcu_read_unlock_bh(); + return !(cfg->mode & XT_HASHLIMIT_INVERT); + } else { + goto overlimit; + } + } + if (cfg->mode & XT_HASHLIMIT_BYTES) cost = hashlimit_byte_cost(skb->len, dh); else @@ -693,6 +792,7 @@ hashlimit_mt_common(const struct sk_buff *skb, struct xt_action_param *par, return !(cfg->mode & XT_HASHLIMIT_INVERT); } +overlimit: spin_unlock(&dh->lock); local_bh_enable(); /* default match is underlimit - so over the limit, we need to invert */ @@ -708,7 +808,7 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_hashlimit_mtinfo1 *info = par->matchinfo; struct xt_hashlimit_htable *hinfo = info->hinfo; - struct hashlimit_cfg2 cfg = {}; + struct hashlimit_cfg3 cfg = {}; int ret; ret = cfg_copy(&cfg, (void *)&info->cfg, 1); @@ -720,17 +820,33 @@ hashlimit_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) } static bool -hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +hashlimit_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; struct xt_hashlimit_htable *hinfo = info->hinfo; + struct hashlimit_cfg3 cfg = {}; + int ret; + + ret = cfg_copy(&cfg, (void *)&info->cfg, 2); + + if (ret) + return ret; + + return hashlimit_mt_common(skb, par, hinfo, &cfg, 2); +} + +static bool +hashlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_hashlimit_mtinfo3 *info = par->matchinfo; + struct xt_hashlimit_htable *hinfo = info->hinfo; - return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 2); + return hashlimit_mt_common(skb, par, hinfo, &info->cfg, 3); } static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, struct xt_hashlimit_htable **hinfo, - struct hashlimit_cfg2 *cfg, + struct hashlimit_cfg3 *cfg, const char *name, int revision) { struct net *net = par->net; @@ -753,7 +869,17 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, } /* Check for overflow. 
*/ - if (cfg->mode & XT_HASHLIMIT_BYTES) { + if (revision >= 3 && cfg->mode & XT_HASHLIMIT_RATE_MATCH) { + if (cfg->avg == 0) { + pr_info("hashlimit invalid rate\n"); + return -ERANGE; + } + + if (cfg->interval == 0) { + pr_info("hashlimit invalid interval\n"); + return -EINVAL; + } + } else if (cfg->mode & XT_HASHLIMIT_BYTES) { if (user2credits_byte(cfg->avg) == 0) { pr_info("overflow, rate too high: %llu\n", cfg->avg); return -EINVAL; @@ -784,7 +910,7 @@ static int hashlimit_mt_check_common(const struct xt_mtchk_param *par, static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) { struct xt_hashlimit_mtinfo1 *info = par->matchinfo; - struct hashlimit_cfg2 cfg = {}; + struct hashlimit_cfg3 cfg = {}; int ret; if (info->name[sizeof(info->name) - 1] != '\0') @@ -799,15 +925,40 @@ static int hashlimit_mt_check_v1(const struct xt_mtchk_param *par) &cfg, info->name, 1); } -static int hashlimit_mt_check(const struct xt_mtchk_param *par) +static int hashlimit_mt_check_v2(const struct xt_mtchk_param *par) { struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + struct hashlimit_cfg3 cfg = {}; + int ret; + + if (info->name[sizeof(info->name) - 1] != '\0') + return -EINVAL; + + ret = cfg_copy(&cfg, (void *)&info->cfg, 2); + + if (ret) + return ret; + + return hashlimit_mt_check_common(par, &info->hinfo, + &cfg, info->name, 2); +} + +static int hashlimit_mt_check(const struct xt_mtchk_param *par) +{ + struct xt_hashlimit_mtinfo3 *info = par->matchinfo; if (info->name[sizeof(info->name) - 1] != '\0') return -EINVAL; return hashlimit_mt_check_common(par, &info->hinfo, &info->cfg, - info->name, 2); + info->name, 3); +} + +static void hashlimit_mt_destroy_v2(const struct xt_mtdtor_param *par) +{ + const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + + htable_put(info->hinfo); } static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par) @@ -819,7 +970,7 @@ static void hashlimit_mt_destroy_v1(const struct xt_mtdtor_param *par) static void hashlimit_mt_destroy(const struct xt_mtdtor_param *par) { - const struct xt_hashlimit_mtinfo2 *info = par->matchinfo; + const struct xt_hashlimit_mtinfo3 *info = par->matchinfo; htable_put(info->hinfo); } @@ -840,9 +991,20 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .name = "hashlimit", .revision = 2, .family = NFPROTO_IPV4, - .match = hashlimit_mt, + .match = hashlimit_mt_v2, .matchsize = sizeof(struct xt_hashlimit_mtinfo2), .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo), + .checkentry = hashlimit_mt_check_v2, + .destroy = hashlimit_mt_destroy_v2, + .me = THIS_MODULE, + }, + { + .name = "hashlimit", + .revision = 3, + .family = NFPROTO_IPV4, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo3), + .usersize = offsetof(struct xt_hashlimit_mtinfo3, hinfo), .checkentry = hashlimit_mt_check, .destroy = hashlimit_mt_destroy, .me = THIS_MODULE, @@ -863,9 +1025,20 @@ static struct xt_match hashlimit_mt_reg[] __read_mostly = { .name = "hashlimit", .revision = 2, .family = NFPROTO_IPV6, - .match = hashlimit_mt, + .match = hashlimit_mt_v2, .matchsize = sizeof(struct xt_hashlimit_mtinfo2), .usersize = offsetof(struct xt_hashlimit_mtinfo2, hinfo), + .checkentry = hashlimit_mt_check_v2, + .destroy = hashlimit_mt_destroy_v2, + .me = THIS_MODULE, + }, + { + .name = "hashlimit", + .revision = 3, + .family = NFPROTO_IPV6, + .match = hashlimit_mt, + .matchsize = sizeof(struct xt_hashlimit_mtinfo3), + .usersize = offsetof(struct xt_hashlimit_mtinfo3, hinfo), .checkentry = hashlimit_mt_check, .destroy = 
hashlimit_mt_destroy, .me = THIS_MODULE, @@ -947,6 +1120,21 @@ static void dl_seq_print(struct dsthash_ent *ent, u_int8_t family, } } +static int dl_seq_real_show_v2(struct dsthash_ent *ent, u_int8_t family, + struct seq_file *s) +{ + const struct xt_hashlimit_htable *ht = s->private; + + spin_lock(&ent->lock); + /* recalculate to show accurate numbers */ + rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2); + + dl_seq_print(ent, family, s); + + spin_unlock(&ent->lock); + return seq_has_overflowed(s); +} + static int dl_seq_real_show_v1(struct dsthash_ent *ent, u_int8_t family, struct seq_file *s) { @@ -969,7 +1157,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, spin_lock(&ent->lock); /* recalculate to show accurate numbers */ - rateinfo_recalc(ent, jiffies, ht->cfg.mode, 2); + rateinfo_recalc(ent, jiffies, ht->cfg.mode, 3); dl_seq_print(ent, family, s); @@ -977,6 +1165,20 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, return seq_has_overflowed(s); } +static int dl_seq_show_v2(struct seq_file *s, void *v) +{ + struct xt_hashlimit_htable *htable = s->private; + unsigned int *bucket = (unsigned int *)v; + struct dsthash_ent *ent; + + if (!hlist_empty(&htable->hash[*bucket])) { + hlist_for_each_entry(ent, &htable->hash[*bucket], node) + if (dl_seq_real_show_v2(ent, htable->family, s)) + return -1; + } + return 0; +} + static int dl_seq_show_v1(struct seq_file *s, void *v) { struct xt_hashlimit_htable *htable = s->private; @@ -1012,6 +1214,13 @@ static const struct seq_operations dl_seq_ops_v1 = { .show = dl_seq_show_v1 }; +static const struct seq_operations dl_seq_ops_v2 = { + .start = dl_seq_start, + .next = dl_seq_next, + .stop = dl_seq_stop, + .show = dl_seq_show_v2 +}; + static const struct seq_operations dl_seq_ops = { .start = dl_seq_start, .next = dl_seq_next, @@ -1019,6 +1228,18 @@ static const struct seq_operations dl_seq_ops = { .show = dl_seq_show }; +static int dl_proc_open_v2(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &dl_seq_ops_v2); + + if (!ret) { + struct seq_file *sf = file->private_data; + + sf->private = PDE_DATA(inode); + } + return ret; +} + static int dl_proc_open_v1(struct inode *inode, struct file *file) { int ret = seq_open(file, &dl_seq_ops_v1); @@ -1042,6 +1263,14 @@ static int dl_proc_open(struct inode *inode, struct file *file) return ret; } +static const struct file_operations dl_file_ops_v2 = { + .owner = THIS_MODULE, + .open = dl_proc_open_v2, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + static const struct file_operations dl_file_ops_v1 = { .owner = THIS_MODULE, .open = dl_proc_open_v1, -- cgit v1.2.3 From a691205571723cb0544110ca91653ac4b0eb5b17 Mon Sep 17 00:00:00 2001 From: "Pablo M. Bermudo Garay" Date: Wed, 23 Aug 2017 22:41:25 +0200 Subject: netfilter: nft_limit: add stateful object type Register a new limit stateful object type into the stateful object infrastructure. Signed-off-by: Pablo M. 
Bermudo Garay Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 3 +- net/netfilter/nft_limit.c | 122 ++++++++++++++++++++++++++++++- 2 files changed, 123 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index b49da72efa68..871afa4871bf 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1282,7 +1282,8 @@ enum nft_ct_helper_attributes { #define NFT_OBJECT_COUNTER 1 #define NFT_OBJECT_QUOTA 2 #define NFT_OBJECT_CT_HELPER 3 -#define __NFT_OBJECT_MAX 4 +#define NFT_OBJECT_LIMIT 4 +#define __NFT_OBJECT_MAX 5 #define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) /** diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index aae2d1ec27f3..a9fc298ef4c3 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -229,14 +229,133 @@ static struct nft_expr_type nft_limit_type __read_mostly = { .owner = THIS_MODULE, }; +static void nft_limit_obj_pkts_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_limit_pkts *priv = nft_obj_data(obj); + + if (nft_limit_eval(&priv->limit, priv->cost)) + regs->verdict.code = NFT_BREAK; +} + +static int nft_limit_obj_pkts_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_limit_pkts *priv = nft_obj_data(obj); + int err; + + err = nft_limit_init(&priv->limit, tb); + if (err < 0) + return err; + + priv->cost = div64_u64(priv->limit.nsecs, priv->limit.rate); + return 0; +} + +static int nft_limit_obj_pkts_dump(struct sk_buff *skb, + struct nft_object *obj, + bool reset) +{ + const struct nft_limit_pkts *priv = nft_obj_data(obj); + + return nft_limit_dump(skb, &priv->limit, NFT_LIMIT_PKTS); +} + +static struct nft_object_type nft_limit_obj_type; +static const struct nft_object_ops nft_limit_obj_pkts_ops = { + .type = &nft_limit_obj_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_limit_pkts)), + .init = nft_limit_obj_pkts_init, + .eval = nft_limit_obj_pkts_eval, + .dump = nft_limit_obj_pkts_dump, +}; + +static void nft_limit_obj_bytes_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_limit *priv = nft_obj_data(obj); + u64 cost = div64_u64(priv->nsecs * pkt->skb->len, priv->rate); + + if (nft_limit_eval(priv, cost)) + regs->verdict.code = NFT_BREAK; +} + +static int nft_limit_obj_bytes_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_limit *priv = nft_obj_data(obj); + + return nft_limit_init(priv, tb); +} + +static int nft_limit_obj_bytes_dump(struct sk_buff *skb, + struct nft_object *obj, + bool reset) +{ + const struct nft_limit *priv = nft_obj_data(obj); + + return nft_limit_dump(skb, priv, NFT_LIMIT_PKT_BYTES); +} + +static struct nft_object_type nft_limit_obj_type; +static const struct nft_object_ops nft_limit_obj_bytes_ops = { + .type = &nft_limit_obj_type, + .size = sizeof(struct nft_limit), + .init = nft_limit_obj_bytes_init, + .eval = nft_limit_obj_bytes_eval, + .dump = nft_limit_obj_bytes_dump, +}; + +static const struct nft_object_ops * +nft_limit_obj_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + if (!tb[NFTA_LIMIT_TYPE]) + return &nft_limit_obj_pkts_ops; + + switch (ntohl(nla_get_be32(tb[NFTA_LIMIT_TYPE]))) { + case NFT_LIMIT_PKTS: + return &nft_limit_obj_pkts_ops; + case NFT_LIMIT_PKT_BYTES: + 
return &nft_limit_obj_bytes_ops; + } + + return ERR_PTR(-EOPNOTSUPP); +} + +static struct nft_object_type nft_limit_obj_type __read_mostly = { + .select_ops = nft_limit_obj_select_ops, + .type = NFT_OBJECT_LIMIT, + .maxattr = NFTA_LIMIT_MAX, + .policy = nft_limit_policy, + .owner = THIS_MODULE, +}; + static int __init nft_limit_module_init(void) { - return nft_register_expr(&nft_limit_type); + int err; + + err = nft_register_obj(&nft_limit_obj_type); + if (err < 0) + return err; + + err = nft_register_expr(&nft_limit_type); + if (err < 0) + goto err1; + + return 0; +err1: + nft_unregister_obj(&nft_limit_obj_type); + return err; } static void __exit nft_limit_module_exit(void) { nft_unregister_expr(&nft_limit_type); + nft_unregister_obj(&nft_limit_obj_type); } module_init(nft_limit_module_init); @@ -245,3 +364,4 @@ module_exit(nft_limit_module_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_ALIAS_NFT_EXPR("limit"); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_LIMIT); -- cgit v1.2.3 From 2335ba704f32b855651d0cd15dd9b271ec565fb6 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Sun, 3 Sep 2017 23:55:59 +0200 Subject: netlink: add NLM_F_NONREC flag for deletion requests At the last NFWS in Faro, Portugal, we discussed that netlink lacks the semantics to request non-recursive deletions, i.e. do not delete an object if child objects still hang from the parent object that the user requests to be deleted. We need this new flag to solve a problem for the iptables-compat backward compatibility utility, which runs iptables commands using the existing nf_tables netlink interface. Specifically, custom chains in iptables cannot be deleted while they still contain rules, whereas nf_tables allows removing any chain regardless of its contents. To sort out this asymmetry, iptables-compat userspace sets this new NLM_F_NONREC flag to obtain the same semantics that iptables provides. This new flag should only be used for deletion requests. Note that this flag value overlaps with the existing: * NLM_F_ROOT for get requests. * NLM_F_REPLACE for new requests. However, those flags should never be used in deletion requests. Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netlink.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index f4fc9c9e123d..e8af60a7c56d 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -69,6 +69,9 @@ struct nlmsghdr { #define NLM_F_CREATE 0x400 /* Create, if it does not exist */ #define NLM_F_APPEND 0x800 /* Add to end of list */ +/* Modifiers to DELETE request */ +#define NLM_F_NONREC 0x100 /* Do not delete recursively */ + /* Flags for ACK message */ #define NLM_F_CAPPED 0x100 /* request was capped */ #define NLM_F_ACK_TLVS 0x200 /* extended ACK TVLs were included */ -- cgit v1.2.3
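
The xt_hashlimit revision handling earlier in this series follows a common xtables pattern: each older userspace layout (mtinfo1, mtinfo2) is widened into the newest in-kernel config (hashlimit_cfg3) by cfg_copy(), so that hashlimit_mt_check_common() and the runtime code only ever deal with one layout. The standalone sketch below illustrates that idea with made-up struct names and fields; it is not the kernel code.

/* Hypothetical sketch of the "copy old revision into newest cfg" pattern
 * used by xt_hashlimit; struct layouts and names are illustrative only.
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct cfg_v2 {			/* older userspace layout, no rate-match fields */
	uint64_t avg;
	uint64_t burst;
	uint32_t mode;
};

struct cfg_v3 {			/* newest in-kernel layout */
	uint64_t avg;
	uint64_t burst;
	uint32_t mode;
	uint32_t interval;	/* only meaningful for revision >= 3 */
};

/* Like cfg_copy(): widen an old layout into the newest one. */
static int cfg_copy(struct cfg_v3 *to, const void *from, int revision)
{
	if (revision == 2) {
		const struct cfg_v2 *v2 = from;

		memset(to, 0, sizeof(*to));
		to->avg   = v2->avg;
		to->burst = v2->burst;
		to->mode  = v2->mode;
		return 0;
	}
	if (revision == 3) {
		memcpy(to, from, sizeof(*to));
		return 0;
	}
	return -EINVAL;
}

/* Shared validation only ever sees the newest layout. */
static int check_common(const struct cfg_v3 *cfg, int revision)
{
	if (revision >= 3 && cfg->interval == 0) {
		fprintf(stderr, "invalid interval\n");
		return -EINVAL;
	}
	return 0;
}

int main(void)
{
	struct cfg_v2 old = { .avg = 1000, .burst = 5, .mode = 0 };
	struct cfg_v3 cfg;

	if (cfg_copy(&cfg, &old, 2))
		return 1;
	return check_common(&cfg, 2) ? 1 : 0;
}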
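
The limit stateful object registered above enforces the same nanosecond token bucket as the existing limit expression: each packet consumes a fixed cost (nsecs/rate), byte limits scale that cost by the packet length as in nft_limit_obj_bytes_eval(), and exceeding the budget sets NFT_BREAK. A rough userspace model of that accounting, with illustrative names and without the kernel's locking, could look like this:

/* Illustrative userspace model of the nanosecond token bucket that
 * nft_limit evaluates; not the kernel implementation.
 */
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

struct limit_bucket {
	uint64_t rate;		/* units (packets or bytes) per 'nsecs' window */
	uint64_t nsecs;		/* window length in nanoseconds */
	uint64_t tokens;	/* remaining budget, in nanoseconds */
	uint64_t tokens_max;	/* burst ceiling */
	uint64_t last;		/* timestamp of last evaluation */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Returns true when the limit is exceeded, which is where the object
 * evaluation above would set regs->verdict.code = NFT_BREAK.
 */
static bool limit_over(struct limit_bucket *b, uint64_t cost)
{
	uint64_t now = now_ns();
	uint64_t tokens = b->tokens + (now - b->last);

	if (tokens > b->tokens_max)
		tokens = b->tokens_max;
	b->last = now;

	if (tokens >= cost) {
		b->tokens = tokens - cost;	/* within rate */
		return false;
	}
	b->tokens = tokens;			/* over rate */
	return true;
}

int main(void)
{
	/* Assumption: 10 packets per second with a burst of 10. */
	struct limit_bucket b = { .rate = 10, .nsecs = 1000000000ULL };
	uint64_t cost = b.nsecs / b.rate;	/* cost per packet */
	int dropped = 0;

	b.tokens_max = cost * 10;
	b.tokens = b.tokens_max;
	b.last = now_ns();

	for (int i = 0; i < 20; i++)
		if (limit_over(&b, cost))
			dropped++;
	return dropped > 0 ? 0 : 1;	/* back-to-back burst should exceed the limit */
}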
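
To illustrate how a deletion request would use the new flag, the sketch below builds an NFT_MSG_DELCHAIN message with libmnl and sets NLM_F_NONREC so the kernel refuses to delete a chain that still contains rules. The table and chain names are examples only, and the nf_tables batch framing (NFNL_MSG_BATCH_BEGIN/END) plus the socket I/O that a real client such as iptables-compat needs are omitted for brevity.

/* Sketch: build an nf_tables "delete chain" request that must fail if the
 * chain is not empty, using the NLM_F_NONREC flag added above.
 * Batch framing and error handling are omitted; names are examples.
 */
#include <libmnl/libmnl.h>
#include <linux/netfilter.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
#include <stdint.h>
#include <time.h>

static struct nlmsghdr *build_delchain(char *buf, uint32_t seq)
{
	struct nlmsghdr *nlh;
	struct nfgenmsg *nfg;

	nlh = mnl_nlmsg_put_header(buf);
	nlh->nlmsg_type  = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_DELCHAIN;
	/* NLM_F_NONREC: refuse the deletion if the chain still has rules,
	 * giving iptables-compat the same semantics as iptables -X.
	 */
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_NONREC;
	nlh->nlmsg_seq   = seq;

	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
	nfg->nfgen_family = NFPROTO_IPV4;
	nfg->version      = NFNETLINK_V0;
	nfg->res_id       = 0;

	mnl_attr_put_strz(nlh, NFTA_CHAIN_TABLE, "filter");
	mnl_attr_put_strz(nlh, NFTA_CHAIN_NAME, "user-chain");
	return nlh;
}

int main(void)
{
	char buf[MNL_SOCKET_BUFFER_SIZE];

	build_delchain(buf, (uint32_t)time(NULL));
	/* The message would normally be wrapped in an nf_tables batch and
	 * sent over a NETLINK_NETFILTER mnl_socket; omitted here.
	 */
	return 0;
}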