From 17abc9ec68b73ddeb262a507a62421016b9c54d5 Mon Sep 17 00:00:00 2001 From: Tomasz Duszynski Date: Fri, 14 Dec 2018 19:28:01 +0100 Subject: iio: add IIO_MASSCONCENTRATION channel type Measuring particulate matter in ug / m3 (micro-grams per cubic meter) is de facto standard. Existing air quality sensors usually follow this convention and are capable of returning measurements using this unit. IIO currently does not offer suitable channel type for this type of measurements hence this patch adds this. In addition, extra modifiers are introduced used for distinguishing between fine pm1, pm2p5 and coarse pm4, pm10 particle measurements, i.e IIO_MOD_PM1, IIO_MOD_PM25 and IIO_MOD_PM4, IIO_MOD_PM10. pmX consists of particles with aerodynamic diameter less or equal to X micrometers. Signed-off-by: Tomasz Duszynski Acked-by: Matt Ranostay Signed-off-by: Jonathan Cameron --- include/uapi/linux/iio/types.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index 92baabc103ac..c59adac24b1c 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -46,6 +46,7 @@ enum iio_chan_type { IIO_GRAVITY, IIO_POSITIONRELATIVE, IIO_PHASE, + IIO_MASSCONCENTRATION, }; enum iio_modifier { @@ -87,6 +88,10 @@ enum iio_modifier { IIO_MOD_VOC, IIO_MOD_LIGHT_UV, IIO_MOD_LIGHT_DUV, + IIO_MOD_PM1, + IIO_MOD_PM2P5, + IIO_MOD_PM4, + IIO_MOD_PM10, }; enum iio_event_type { -- cgit v1.2.3 From b170f7d48443d1ea3e4ffbf409025b5e5b1146fe Mon Sep 17 00:00:00 2001 From: Andreas Brauchli Date: Thu, 13 Dec 2018 15:43:22 +0100 Subject: iio: Add modifiers for ethanol and H2 gases Add ethanol and H2 gas modifiers: * IIO_MOD_ETHANOL * IIO_MOD_H2 Signed-off-by: Andreas Brauchli Acked-by: Matt Ranostay Signed-off-by: Jonathan Cameron --- Documentation/ABI/testing/sysfs-bus-iio | 4 ++++ include/uapi/linux/iio/types.h | 2 ++ tools/iio/iio_event_monitor.c | 4 ++++ 3 files changed, 10 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 67fd88bf7910..864f8efd12e5 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -1554,6 +1554,10 @@ What: /sys/bus/iio/devices/iio:deviceX/in_concentration_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentration_co2_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_co2_raw +What: /sys/bus/iio/devices/iio:deviceX/in_concentration_ethanol_raw +What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_ethanol_raw +What: /sys/bus/iio/devices/iio:deviceX/in_concentration_h2_raw +What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_h2_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentration_voc_raw What: /sys/bus/iio/devices/iio:deviceX/in_concentrationX_voc_raw KernelVersion: 4.3 diff --git a/include/uapi/linux/iio/types.h b/include/uapi/linux/iio/types.h index c59adac24b1c..fdd81affca4b 100644 --- a/include/uapi/linux/iio/types.h +++ b/include/uapi/linux/iio/types.h @@ -92,6 +92,8 @@ enum iio_modifier { IIO_MOD_PM2P5, IIO_MOD_PM4, IIO_MOD_PM10, + IIO_MOD_ETHANOL, + IIO_MOD_H2, }; enum iio_event_type { diff --git a/tools/iio/iio_event_monitor.c b/tools/iio/iio_event_monitor.c index f6b8003fbe3c..7bf9bde28bcc 100644 --- a/tools/iio/iio_event_monitor.c +++ b/tools/iio/iio_event_monitor.c @@ -115,6 +115,8 @@ static const char * const iio_modifier_names[] = { [IIO_MOD_I] = "i", [IIO_MOD_Q] = "q", [IIO_MOD_CO2] = "co2", + [IIO_MOD_ETHANOL] = "ethanol", + [IIO_MOD_H2] = "h2", [IIO_MOD_VOC] = "voc", [IIO_MOD_PM1] = "pm1", [IIO_MOD_PM2P5] = "pm2p5", @@ -205,6 +207,8 @@ static bool event_is_known(struct iio_event_data *event) case IIO_MOD_I: case IIO_MOD_Q: case IIO_MOD_CO2: + case IIO_MOD_ETHANOL: + case IIO_MOD_H2: case IIO_MOD_VOC: case IIO_MOD_PM1: case IIO_MOD_PM2P5: -- cgit v1.2.3 From 7a79d717e0817610932ce3b7b6033ea06ee1d577 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Mon, 31 Dec 2018 23:59:59 +0100 Subject: batman-adv: Update copyright years for 2019 Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 2 +- include/uapi/linux/batman_adv.h | 2 +- net/batman-adv/Kconfig | 2 +- net/batman-adv/Makefile | 2 +- net/batman-adv/bat_algo.c | 2 +- net/batman-adv/bat_algo.h | 2 +- net/batman-adv/bat_iv_ogm.c | 2 +- net/batman-adv/bat_iv_ogm.h | 2 +- net/batman-adv/bat_v.c | 2 +- net/batman-adv/bat_v.h | 2 +- net/batman-adv/bat_v_elp.c | 2 +- net/batman-adv/bat_v_elp.h | 2 +- net/batman-adv/bat_v_ogm.c | 2 +- net/batman-adv/bat_v_ogm.h | 2 +- net/batman-adv/bitarray.c | 2 +- net/batman-adv/bitarray.h | 2 +- net/batman-adv/bridge_loop_avoidance.c | 2 +- net/batman-adv/bridge_loop_avoidance.h | 2 +- net/batman-adv/debugfs.c | 2 +- net/batman-adv/debugfs.h | 2 +- net/batman-adv/distributed-arp-table.c | 2 +- net/batman-adv/distributed-arp-table.h | 2 +- net/batman-adv/fragmentation.c | 2 +- net/batman-adv/fragmentation.h | 2 +- net/batman-adv/gateway_client.c | 2 +- net/batman-adv/gateway_client.h | 2 +- net/batman-adv/gateway_common.c | 2 +- net/batman-adv/gateway_common.h | 2 +- net/batman-adv/hard-interface.c | 2 +- net/batman-adv/hard-interface.h | 2 +- net/batman-adv/hash.c | 2 +- net/batman-adv/hash.h | 2 +- net/batman-adv/icmp_socket.c | 2 +- net/batman-adv/icmp_socket.h | 2 +- net/batman-adv/log.c | 2 +- net/batman-adv/log.h | 2 +- net/batman-adv/main.c | 2 +- net/batman-adv/main.h | 2 +- net/batman-adv/multicast.c | 2 +- net/batman-adv/multicast.h | 2 +- net/batman-adv/netlink.c | 2 +- net/batman-adv/netlink.h | 2 +- net/batman-adv/network-coding.c | 2 +- net/batman-adv/network-coding.h | 2 +- net/batman-adv/originator.c | 2 +- net/batman-adv/originator.h | 2 +- net/batman-adv/routing.c | 2 +- net/batman-adv/routing.h | 2 +- net/batman-adv/send.c | 2 +- net/batman-adv/send.h | 2 +- net/batman-adv/soft-interface.c | 2 +- net/batman-adv/soft-interface.h | 2 +- net/batman-adv/sysfs.c | 2 +- net/batman-adv/sysfs.h | 2 +- net/batman-adv/tp_meter.c | 2 +- net/batman-adv/tp_meter.h | 2 +- net/batman-adv/trace.c | 2 +- net/batman-adv/trace.h | 2 +- net/batman-adv/translation-table.c | 2 +- net/batman-adv/translation-table.h | 2 +- net/batman-adv/tvlv.c | 2 +- net/batman-adv/tvlv.h | 2 +- net/batman-adv/types.h | 2 +- 63 files changed, 63 insertions(+), 63 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 894d8d2f713d..7eb2936a8e22 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */ -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 324a0e1143e7..a28e76a7e0a2 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: MIT */ -/* Copyright (C) 2016-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig index c386e6981416..a31db5e9ac8e 100644 --- a/net/batman-adv/Kconfig +++ b/net/batman-adv/Kconfig @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile index 9b58160fe485..a887ecc3efa1 100644 --- a/net/batman-adv/Makefile +++ b/net/batman-adv/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -# Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +# Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: # # Marek Lindner, Simon Wunderlich # diff --git a/net/batman-adv/bat_algo.c b/net/batman-adv/bat_algo.c index ea309ad06175..7b7e15641fef 100644 --- a/net/batman-adv/bat_algo.c +++ b/net/batman-adv/bat_algo.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h index 534b790c3753..25e7bb51928c 100644 --- a/net/batman-adv/bat_algo.h +++ b/net/batman-adv/bat_algo.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing * diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index f97e566f0402..de61091af666 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_iv_ogm.h b/net/batman-adv/bat_iv_ogm.h index 3dc6a7a43eb7..785f6666273c 100644 --- a/net/batman-adv/bat_iv_ogm.h +++ b/net/batman-adv/bat_iv_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c index 90e33f84d37a..445594ed58af 100644 --- a/net/batman-adv/bat_v.c +++ b/net/batman-adv/bat_v.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v.h b/net/batman-adv/bat_v.h index ec4a2a569750..465a4fc23354 100644 --- a/net/batman-adv/bat_v.h +++ b/net/batman-adv/bat_v.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Linus Lüssing * diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index e8090f099eb8..7b80f6f8d4dc 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h index e8c7b7fd290d..75f189ee4a1c 100644 --- a/net/batman-adv/bat_v_elp.h +++ b/net/batman-adv/bat_v_elp.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: * * Linus Lüssing, Marek Lindner * diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c index 2948b41b06d4..c9698ad41854 100644 --- a/net/batman-adv/bat_v_ogm.c +++ b/net/batman-adv/bat_v_ogm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h index e5be14c908c6..f67cf7ee06b2 100644 --- a/net/batman-adv/bat_v_ogm.h +++ b/net/batman-adv/bat_v_ogm.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c index a296a4d851f5..63e134e763e3 100644 --- a/net/batman-adv/bitarray.c +++ b/net/batman-adv/bitarray.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h index 48f683289531..f3a05ad9afad 100644 --- a/net/batman-adv/bitarray.h +++ b/net/batman-adv/bitarray.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index 5fdde2947802..ef39aabdb694 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index 71f95a3e4d3f..31771c751efb 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Simon Wunderlich * diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c index d4a7702e48d8..3b9d1ad2f467 100644 --- a/net/batman-adv/debugfs.c +++ b/net/batman-adv/debugfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h index 8de018e5c577..c0b8694041ec 100644 --- a/net/batman-adv/debugfs.h +++ b/net/batman-adv/debugfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index c7f3e2231233..899ab051ffce 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h index e58d7a9bfc8d..68c0ff321acd 100644 --- a/net/batman-adv/distributed-arp-table.h +++ b/net/batman-adv/distributed-arp-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2011-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2011-2019 B.A.T.M.A.N. contributors: * * Antonio Quartulli * diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c index 5b71a289d04f..b506d15b8230 100644 --- a/net/batman-adv/fragmentation.c +++ b/net/batman-adv/fragmentation.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: * * Martin Hundebøll * diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h index 944512e07782..abdac26579bf 100644 --- a/net/batman-adv/fragmentation.h +++ b/net/batman-adv/fragmentation.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2013-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2013-2019 B.A.T.M.A.N. contributors: * * Martin Hundebøll * diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 9d8e5eda2314..0a6b9b30eabd 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h index f0b86fcb2493..b5732c8be81a 100644 --- a/net/batman-adv/gateway_client.h +++ b/net/batman-adv/gateway_client.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 936c107f3199..53d03adc9bfc 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 80afb2793687..89bae100a0b0 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index 508f4416dfc9..28c1fb8d1af0 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h index d1c0f6189301..48de28c83401 100644 --- a/net/batman-adv/hard-interface.h +++ b/net/batman-adv/hard-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c index 9194f4d891b1..56a08ce193d5 100644 --- a/net/batman-adv/hash.c +++ b/net/batman-adv/hash.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h index 0e36fa1c7c3e..37507b6d4006 100644 --- a/net/batman-adv/hash.h +++ b/net/batman-adv/hash.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2006-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2006-2019 B.A.T.M.A.N. contributors: * * Simon Wunderlich, Marek Lindner * diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c index d70f363c52ae..d34bb79e0405 100644 --- a/net/batman-adv/icmp_socket.c +++ b/net/batman-adv/icmp_socket.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h index 958be22beda9..5f8926522ff0 100644 --- a/net/batman-adv/icmp_socket.h +++ b/net/batman-adv/icmp_socket.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/log.c b/net/batman-adv/log.c index 02e55b78132f..f97849a8380b 100644 --- a/net/batman-adv/log.c +++ b/net/batman-adv/log.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 35f4f397ed57..660e9bcc85a2 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index d1ed839fd32b..75750870cf04 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index f3ef2c725a41..3ed669d7dc6b 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c index 69244e4598f5..5de6a37525f5 100644 --- a/net/batman-adv/multicast.c +++ b/net/batman-adv/multicast.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2014-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2019 B.A.T.M.A.N. contributors: * * Linus Lüssing * diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h index 3b04ab13f0eb..466013fe88af 100644 --- a/net/batman-adv/multicast.h +++ b/net/batman-adv/multicast.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2014-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2014-2019 B.A.T.M.A.N. contributors: * * Linus Lüssing * diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 2dc3304cee54..5fe833cc9507 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2016-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/netlink.h b/net/batman-adv/netlink.h index 571d9a5ae7aa..216484b8b82d 100644 --- a/net/batman-adv/netlink.h +++ b/net/batman-adv/netlink.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2016-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2016-2019 B.A.T.M.A.N. contributors: * * Matthias Schiffer * diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c index 34caf129a9bf..278762bd94c6 100644 --- a/net/batman-adv/network-coding.c +++ b/net/batman-adv/network-coding.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h index 65c346812bc1..96ef0a511fc7 100644 --- a/net/batman-adv/network-coding.h +++ b/net/batman-adv/network-coding.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: * * Martin Hundebøll, Jeppe Ledet-Pedersen * diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c index 56a981af5c92..e5cdf89ef63c 100644 --- a/net/batman-adv/originator.c +++ b/net/batman-adv/originator.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2009-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2009-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h index a8b4c7b667ec..dca1e4a34ec6 100644 --- a/net/batman-adv/originator.h +++ b/net/batman-adv/originator.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c index 469fa97d1add..cae0e5dd0768 100644 --- a/net/batman-adv/routing.c +++ b/net/batman-adv/routing.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h index db54c2d9b8bf..0102d69d345c 100644 --- a/net/batman-adv/routing.h +++ b/net/batman-adv/routing.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 4a35f5c2f52b..66a8b3e44501 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index 64cce07b8fe6..1f6132922e60 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index 943071008670..b14fb3462af7 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h index daf87f07fadd..538bb661878c 100644 --- a/net/batman-adv/soft-interface.h +++ b/net/batman-adv/soft-interface.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index 09427fc6494a..e1b816262c53 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h index c1e3fb69952d..705ffbe763f4 100644 --- a/net/batman-adv/sysfs.h +++ b/net/batman-adv/sysfs.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Marek Lindner * diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index 11520de96ccb..500109bbd551 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli * diff --git a/net/batman-adv/tp_meter.h b/net/batman-adv/tp_meter.h index 68e600974759..6b4d0f733896 100644 --- a/net/batman-adv/tp_meter.h +++ b/net/batman-adv/tp_meter.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2012-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2012-2019 B.A.T.M.A.N. contributors: * * Edo Monticelli, Antonio Quartulli * diff --git a/net/batman-adv/trace.c b/net/batman-adv/trace.c index 8e1024217cff..f77c917ed20d 100644 --- a/net/batman-adv/trace.c +++ b/net/batman-adv/trace.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Sven Eckelmann * diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index 104784be94d7..5e5579051400 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2010-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2010-2019 B.A.T.M.A.N. contributors: * * Sven Eckelmann * diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 8dcd4968cde7..f73d79139ae7 100644 --- a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h index 01b6c8eafaf9..61bca75e5911 100644 --- a/net/batman-adv/translation-table.h +++ b/net/batman-adv/translation-table.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich, Antonio Quartulli * diff --git a/net/batman-adv/tvlv.c b/net/batman-adv/tvlv.c index 40e69c9346d2..7e947b01919d 100644 --- a/net/batman-adv/tvlv.c +++ b/net/batman-adv/tvlv.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/tvlv.h b/net/batman-adv/tvlv.h index ef5867f49824..c0f033b1acb8 100644 --- a/net/batman-adv/tvlv.h +++ b/net/batman-adv/tvlv.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index cbe17da36fcb..a21b34ed6548 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1,5 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 */ -/* Copyright (C) 2007-2018 B.A.T.M.A.N. contributors: +/* Copyright (C) 2007-2019 B.A.T.M.A.N. contributors: * * Marek Lindner, Simon Wunderlich * -- cgit v1.2.3 From c2eb8effb265ac5cdd960d8e61ecb931e9c767cd Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Wed, 24 Oct 2018 06:50:34 -0400 Subject: media: videodev2.h: add v4l2_timeval_to_ns inline function We want to be able to uniquely identify buffers for stateless codecs. The internal timestamp (a u64) as stored internally in the kernel is a suitable candidate for that, but in struct v4l2_buffer it is represented as a struct timeval. Add a v4l2_timeval_to_ns() function that converts the struct timeval into a u64 in the same way that the kernel does. This makes it possible to use this u64 elsewhere as a unique identifier of the buffer. Since timestamps are also copied from the output buffer to the corresponding capture buffer(s) by M2M devices, the u64 can be used to refer to both output and capture buffers. The plan is that in the future we redesign struct v4l2_buffer and use u64 for the timestamp instead of a struct timeval (which has lots of problems with 32 vs 64 bit and y2038 layout changes), and then there is no more need to use this function. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index b5671ce2724f..d6eed479c3a6 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -973,6 +973,18 @@ struct v4l2_buffer { }; }; +/** + * v4l2_timeval_to_ns - Convert timeval to nanoseconds + * @ts: pointer to the timeval variable to be converted + * + * Returns the scalar nanosecond representation of the timeval + * parameter. + */ +static inline __u64 v4l2_timeval_to_ns(const struct timeval *tv) +{ + return (__u64)tv->tv_sec * 1000000000ULL + tv->tv_usec * 1000; +} + /* Flags for 'flags' field */ /* Buffer is mapped (flag) */ #define V4L2_BUF_FLAG_MAPPED 0x00000001 -- cgit v1.2.3 From 4b837c6d7ee771f68a30f362c9f68171a95be222 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Mon, 14 Jan 2019 09:01:54 -0500 Subject: media: v4l: uAPI: V4L2_BUF_TYPE_META_OUTPUT is an output buffer type V4L2_BUF_TYPE_META_OUTPUT was added by commit 72148d1a57e7 ("media: v4l: Add support for V4L2_BUF_TYPE_META_OUTPUT") but the patch missed adding the type to the macro telling whether a given type is an output type or not. Do that now. Getting this wrong leads to handling the buffer as a capture buffer in a lot of places. Fixes: 72148d1a57e7 ("media: v4l: Add support for V4L2_BUF_TYPE_META_OUTPUT") Signed-off-by: Sakari Ailus Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index d6eed479c3a6..c0c36c165bf4 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -161,7 +161,8 @@ enum v4l2_buf_type { || (type) == V4L2_BUF_TYPE_VIDEO_OUTPUT_OVERLAY \ || (type) == V4L2_BUF_TYPE_VBI_OUTPUT \ || (type) == V4L2_BUF_TYPE_SLICED_VBI_OUTPUT \ - || (type) == V4L2_BUF_TYPE_SDR_OUTPUT) + || (type) == V4L2_BUF_TYPE_SDR_OUTPUT \ + || (type) == V4L2_BUF_TYPE_META_OUTPUT) enum v4l2_tuner_type { V4L2_TUNER_RADIO = 1, -- cgit v1.2.3 From 50656bad786d001b294764e9f047c5d5b3e4db75 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Thu, 10 Jan 2019 11:56:09 -0500 Subject: media: v4l2-ctrl: Add control to enable h.264 constrained intra prediction Allow to enable h.264 constrained intra prediction (macroblocks using intra prediction modes are not allowed to use residual data and decoded samples of neighboring macroblocks coded using inter prediction modes). This control directly corresponds to the constrained_intra_pred_flag field in the h.264 picture parameter set. Signed-off-by: Philipp Zabel Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/extended-controls.rst | 4 ++++ drivers/media/v4l2-core/v4l2-ctrls.c | 2 ++ include/uapi/linux/v4l2-controls.h | 1 + 3 files changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst index af4273aa5e85..235d0c293983 100644 --- a/Documentation/media/uapi/v4l/extended-controls.rst +++ b/Documentation/media/uapi/v4l/extended-controls.rst @@ -1154,6 +1154,10 @@ enum v4l2_mpeg_video_h264_entropy_mode - ``V4L2_CID_MPEG_VIDEO_H264_8X8_TRANSFORM (boolean)`` Enable 8X8 transform for H264. Applicable to the H264 encoder. +``V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION (boolean)`` + Enable constrained intra prediction for H264. Applicable to the H264 + encoder. + ``V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB (integer)`` Cyclic intra macroblock refresh. This is the number of continuous macroblocks refreshed every frame. Each frame a successive set of diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 8c47d8f00429..50b56185c02e 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -825,6 +825,8 @@ const char *v4l2_ctrl_get_name(u32 id) case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER:return "H264 Number of HC Layers"; case V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP: return "H264 Set QP Value for HC Layers"; + case V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION: + return "H264 Constrained Intra Pred"; case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP: return "MPEG4 I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP: return "MPEG4 P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP: return "MPEG4 B-Frame QP Value"; diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index 3dcfc6148f99..fd65c710b144 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -533,6 +533,7 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type { }; #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER (V4L2_CID_MPEG_BASE+381) #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP (V4L2_CID_MPEG_BASE+382) +#define V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION (V4L2_CID_MPEG_BASE+383) #define V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP (V4L2_CID_MPEG_BASE+400) #define V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP (V4L2_CID_MPEG_BASE+401) #define V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP (V4L2_CID_MPEG_BASE+402) -- cgit v1.2.3 From d034696cbe5a6e00f76ca4b7869c6cdef66aebd5 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Thu, 10 Jan 2019 11:56:10 -0500 Subject: media: v4l2-ctrl: Add control for h.264 chroma qp offset Allow to add fixed quantization parameter offset between luma and chroma quantization parameters. This control directly corresponds to the chroma_qp_index_offset field of the h.264 picture parameter set. Signed-off-by: Philipp Zabel Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/extended-controls.rst | 5 +++++ drivers/media/v4l2-core/v4l2-ctrls.c | 1 + include/uapi/linux/v4l2-controls.h | 1 + 3 files changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/media/uapi/v4l/extended-controls.rst b/Documentation/media/uapi/v4l/extended-controls.rst index 235d0c293983..00934efdc9e4 100644 --- a/Documentation/media/uapi/v4l/extended-controls.rst +++ b/Documentation/media/uapi/v4l/extended-controls.rst @@ -1158,6 +1158,11 @@ enum v4l2_mpeg_video_h264_entropy_mode - Enable constrained intra prediction for H264. Applicable to the H264 encoder. +``V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET (integer)`` + Specify the offset that should be added to the luma quantization + parameter to determine the chroma quantization parameter. Applicable + to the H264 encoder. + ``V4L2_CID_MPEG_VIDEO_CYCLIC_INTRA_REFRESH_MB (integer)`` Cyclic intra macroblock refresh. This is the number of continuous macroblocks refreshed every frame. Each frame a successive set of diff --git a/drivers/media/v4l2-core/v4l2-ctrls.c b/drivers/media/v4l2-core/v4l2-ctrls.c index 50b56185c02e..99308dac2daa 100644 --- a/drivers/media/v4l2-core/v4l2-ctrls.c +++ b/drivers/media/v4l2-core/v4l2-ctrls.c @@ -827,6 +827,7 @@ const char *v4l2_ctrl_get_name(u32 id) return "H264 Set QP Value for HC Layers"; case V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION: return "H264 Constrained Intra Pred"; + case V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET: return "H264 Chroma QP Index Offset"; case V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP: return "MPEG4 I-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP: return "MPEG4 P-Frame QP Value"; case V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP: return "MPEG4 B-Frame QP Value"; diff --git a/include/uapi/linux/v4l2-controls.h b/include/uapi/linux/v4l2-controls.h index fd65c710b144..06479f2fb3ae 100644 --- a/include/uapi/linux/v4l2-controls.h +++ b/include/uapi/linux/v4l2-controls.h @@ -534,6 +534,7 @@ enum v4l2_mpeg_video_h264_hierarchical_coding_type { #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER (V4L2_CID_MPEG_BASE+381) #define V4L2_CID_MPEG_VIDEO_H264_HIERARCHICAL_CODING_LAYER_QP (V4L2_CID_MPEG_BASE+382) #define V4L2_CID_MPEG_VIDEO_H264_CONSTRAINED_INTRA_PREDICTION (V4L2_CID_MPEG_BASE+383) +#define V4L2_CID_MPEG_VIDEO_H264_CHROMA_QP_INDEX_OFFSET (V4L2_CID_MPEG_BASE+384) #define V4L2_CID_MPEG_VIDEO_MPEG4_I_FRAME_QP (V4L2_CID_MPEG_BASE+400) #define V4L2_CID_MPEG_VIDEO_MPEG4_P_FRAME_QP (V4L2_CID_MPEG_BASE+401) #define V4L2_CID_MPEG_VIDEO_MPEG4_B_FRAME_QP (V4L2_CID_MPEG_BASE+402) -- cgit v1.2.3 From 1c3721b1f22286033abeda30b7e12439b083ed0f Mon Sep 17 00:00:00 2001 From: Steve Longerbeam Date: Wed, 9 Jan 2019 13:30:04 -0500 Subject: media: videodev2.h: Add more field helper macros Adds two helper macros: V4L2_FIELD_IS_SEQUENTIAL: returns true if the given field type is 'sequential', that is a full frame is transmitted, or exists in memory, as all top field lines followed by all bottom field lines, or vice-versa. V4L2_FIELD_IS_INTERLACED: returns true if the given field type is 'interlaced', that is a full frame is transmitted, or exists in memory, as top field lines interlaced with bottom field lines. Signed-off-by: Steve Longerbeam Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/uapi/linux/videodev2.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index c0c36c165bf4..9a920f071ff9 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -130,6 +130,13 @@ enum v4l2_field { ((field) == V4L2_FIELD_BOTTOM ||\ (field) == V4L2_FIELD_TOP ||\ (field) == V4L2_FIELD_ALTERNATE) +#define V4L2_FIELD_IS_INTERLACED(field) \ + ((field) == V4L2_FIELD_INTERLACED ||\ + (field) == V4L2_FIELD_INTERLACED_TB ||\ + (field) == V4L2_FIELD_INTERLACED_BT) +#define V4L2_FIELD_IS_SEQUENTIAL(field) \ + ((field) == V4L2_FIELD_SEQ_TB ||\ + (field) == V4L2_FIELD_SEQ_BT) enum v4l2_buf_type { V4L2_BUF_TYPE_VIDEO_CAPTURE = 1, -- cgit v1.2.3 From 75dd48e2e420a3cbbe56dd7adfcc6f142c948272 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Mon, 14 Jan 2019 18:41:35 +0100 Subject: netfilter: nf_tables: Support RULE_ID reference in new rule To allow for a batch to contain rules in arbitrary ordering, introduce NFTA_RULE_POSITION_ID attribute which works just like NFTA_RULE_POSITION but contains the ID of another rule within the same batch. This helps iptables-nft-restore handling dumps with mixed insert/append commands correctly. Note that NFTA_RULE_POSITION takes precedence over NFTA_RULE_POSITION_ID, so if the former is present, the latter is ignored. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 9 +++++++++ 2 files changed, 11 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 7de4f1bdaf06..99ca95b830b6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -219,6 +219,7 @@ enum nft_chain_attributes { * @NFTA_RULE_POSITION: numeric handle of the previous rule (NLA_U64) * @NFTA_RULE_USERDATA: user data (NLA_BINARY, NFT_USERDATA_MAXLEN) * @NFTA_RULE_ID: uniquely identifies a rule in a transaction (NLA_U32) + * @NFTA_RULE_POSITION_ID: transaction unique identifier of the previous rule (NLA_U32) */ enum nft_rule_attributes { NFTA_RULE_UNSPEC, @@ -231,6 +232,7 @@ enum nft_rule_attributes { NFTA_RULE_USERDATA, NFTA_RULE_PAD, NFTA_RULE_ID, + NFTA_RULE_POSITION_ID, __NFTA_RULE_MAX }; #define NFTA_RULE_MAX (__NFTA_RULE_MAX - 1) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 621ff834d3a4..d88c86c5b433 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2610,6 +2610,9 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) return 0; } +static struct nft_rule *nft_rule_lookup_byid(const struct net *net, + const struct nlattr *nla); + #define NFT_RULE_MAXEXPRS 128 static int nf_tables_newrule(struct net *net, struct sock *nlsk, @@ -2679,6 +2682,12 @@ static int nf_tables_newrule(struct net *net, struct sock *nlsk, NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION]); return PTR_ERR(old_rule); } + } else if (nla[NFTA_RULE_POSITION_ID]) { + old_rule = nft_rule_lookup_byid(net, nla[NFTA_RULE_POSITION_ID]); + if (IS_ERR(old_rule)) { + NL_SET_BAD_ATTR(extack, nla[NFTA_RULE_POSITION_ID]); + return PTR_ERR(old_rule); + } } } -- cgit v1.2.3 From 0fb4d21956f4a9af225594a46857ccf29bd747bc Mon Sep 17 00:00:00 2001 From: wenxu Date: Wed, 16 Jan 2019 07:53:51 +0800 Subject: netfilter: nft_meta: Add NFT_META_I/OIFKIND meta type In the ip_rcv the skb goes through the PREROUTING hook first, then kicks in vrf device and go through the same hook again. When conntrack dnat works with vrf, there will be some conflict with rules because the packet goes through the hook twice with different nf status. ip link add user1 type vrf table 1 ip link add user2 type vrf table 2 ip l set dev tun1 master user1 ip l set dev tun2 master user2 nft add table firewall nft add chain firewall zones { type filter hook prerouting priority - 300 \; } nft add rule firewall zones counter ct zone set iif map { "tun1" : 1, "tun2" : 2 } nft add chain firewall rule-1000-ingress nft add rule firewall rule-1000-ingress ct zone 1 tcp dport 22 ct state new counter accept nft add rule firewall rule-1000-ingress counter drop nft add chain firewall rule-1000-egress nft add rule firewall rule-1000-egress tcp dport 22 ct state new counter drop nft add rule firewall rule-1000-egress counter accept nft add chain firewall rules-all { type filter hook prerouting priority - 150 \; } nft add rule firewall rules-all ip daddr vmap { "2.2.2.11" : jump rule-1000-ingress } nft add rule firewall rules-all ct zone vmap { 1 : jump rule-1000-egress } nft add rule firewall dnat-all ct zone vmap { 1 : jump dnat-1000 } nft add rule firewall dnat-1000 ip daddr 2.2.2.11 counter dnat to 10.0.0.7 For a package with ip daddr 2.2.2.11 and tcp dport 22, first time accept in the rule-1000-ingress and dnat to 10.0.0.7. Then second time the packet goto the wrong chain rule-1000-egress which leads the packet drop With this patch, userspace can add the 'don't re-do entire ruleset for vrf' policy itself via: nft add rule firewall rules-all meta iifkind "vrf" counter accept Signed-off-by: wenxu Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 ++++ net/netfilter/nft_meta.c | 12 ++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 99ca95b830b6..0ba8f48bdf0b 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -791,6 +791,8 @@ enum nft_exthdr_attributes { * @NFT_META_CGROUP: socket control group (skb->sk->sk_classid) * @NFT_META_PRANDOM: a 32bit pseudo-random number * @NFT_META_SECPATH: boolean, secpath_exists (!!skb->sp) + * @NFT_META_IIFKIND: packet input interface kind name (dev->rtnl_link_ops->kind) + * @NFT_META_OIFKIND: packet output interface kind name (dev->rtnl_link_ops->kind) */ enum nft_meta_keys { NFT_META_LEN, @@ -819,6 +821,8 @@ enum nft_meta_keys { NFT_META_CGROUP, NFT_META_PRANDOM, NFT_META_SECPATH, + NFT_META_IIFKIND, + NFT_META_OIFKIND, }; /** diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 6df486c5ebd3..987d2d6ce624 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -244,6 +244,16 @@ void nft_meta_get_eval(const struct nft_expr *expr, strncpy((char *)dest, p->br->dev->name, IFNAMSIZ); return; #endif + case NFT_META_IIFKIND: + if (in == NULL || in->rtnl_link_ops == NULL) + goto err; + strncpy((char *)dest, in->rtnl_link_ops->kind, IFNAMSIZ); + break; + case NFT_META_OIFKIND: + if (out == NULL || out->rtnl_link_ops == NULL) + goto err; + strncpy((char *)dest, out->rtnl_link_ops->kind, IFNAMSIZ); + break; default: WARN_ON(1); goto err; @@ -340,6 +350,8 @@ static int nft_meta_get_init(const struct nft_ctx *ctx, break; case NFT_META_IIFNAME: case NFT_META_OIFNAME: + case NFT_META_IIFKIND: + case NFT_META_OIFKIND: len = IFNAMSIZ; break; case NFT_META_PRANDOM: -- cgit v1.2.3 From 0123a75e1d57c3df31e536868339c98c02c14917 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Fri, 18 Jan 2019 14:36:29 +0100 Subject: Revert "netfilter: nft_hash: add map lookups for hashing operations" A better way to implement this from userspace has been found without specific code in the kernel side, revert this. Fixes: b9ccc07e3f31 ("netfilter: nft_hash: add map lookups for hashing operations") Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 4 +- net/netfilter/nft_hash.c | 121 ------------------------------- 2 files changed, 2 insertions(+), 123 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 0ba8f48bdf0b..030302893d96 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -877,8 +877,8 @@ enum nft_hash_attributes { NFTA_HASH_SEED, NFTA_HASH_OFFSET, NFTA_HASH_TYPE, - NFTA_HASH_SET_NAME, - NFTA_HASH_SET_ID, + NFTA_HASH_SET_NAME, /* deprecated */ + NFTA_HASH_SET_ID, /* deprecated */ __NFTA_HASH_MAX, }; #define NFTA_HASH_MAX (__NFTA_HASH_MAX - 1) diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index c2d237144f74..ea658e6c53e3 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -25,7 +25,6 @@ struct nft_jhash { u32 modulus; u32 seed; u32 offset; - struct nft_set *map; }; static void nft_jhash_eval(const struct nft_expr *expr, @@ -42,33 +41,10 @@ static void nft_jhash_eval(const struct nft_expr *expr, regs->data[priv->dreg] = h + priv->offset; } -static void nft_jhash_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_jhash *priv = nft_expr_priv(expr); - const void *data = ®s->data[priv->sreg]; - const struct nft_set *map = priv->map; - const struct nft_set_ext *ext; - u32 result; - bool found; - - result = reciprocal_scale(jhash(data, priv->len, priv->seed), - priv->modulus) + priv->offset; - - found = map->ops->lookup(nft_net(pkt), map, &result, &ext); - if (!found) - return; - - nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext), map->dlen); -} - struct nft_symhash { enum nft_registers dreg:8; u32 modulus; u32 offset; - struct nft_set *map; }; static void nft_symhash_eval(const struct nft_expr *expr, @@ -84,28 +60,6 @@ static void nft_symhash_eval(const struct nft_expr *expr, regs->data[priv->dreg] = h + priv->offset; } -static void nft_symhash_map_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) -{ - struct nft_symhash *priv = nft_expr_priv(expr); - struct sk_buff *skb = pkt->skb; - const struct nft_set *map = priv->map; - const struct nft_set_ext *ext; - u32 result; - bool found; - - result = reciprocal_scale(__skb_get_hash_symmetric(skb), - priv->modulus) + priv->offset; - - found = map->ops->lookup(nft_net(pkt), map, &result, &ext); - if (!found) - return; - - nft_data_copy(®s->data[priv->dreg], - nft_set_ext_data(ext), map->dlen); -} - static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_SREG] = { .type = NLA_U32 }, [NFTA_HASH_DREG] = { .type = NLA_U32 }, @@ -114,9 +68,6 @@ static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_SEED] = { .type = NLA_U32 }, [NFTA_HASH_OFFSET] = { .type = NLA_U32 }, [NFTA_HASH_TYPE] = { .type = NLA_U32 }, - [NFTA_HASH_SET_NAME] = { .type = NLA_STRING, - .len = NFT_SET_MAXNAMELEN - 1 }, - [NFTA_HASH_SET_ID] = { .type = NLA_U32 }, }; static int nft_jhash_init(const struct nft_ctx *ctx, @@ -166,20 +117,6 @@ static int nft_jhash_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_jhash_map_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_jhash *priv = nft_expr_priv(expr); - u8 genmask = nft_genmask_next(ctx->net); - - nft_jhash_init(ctx, expr, tb); - priv->map = nft_set_lookup_global(ctx->net, ctx->table, - tb[NFTA_HASH_SET_NAME], - tb[NFTA_HASH_SET_ID], genmask); - return PTR_ERR_OR_ZERO(priv->map); -} - static int nft_symhash_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]) @@ -206,20 +143,6 @@ static int nft_symhash_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_symhash_map_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) -{ - struct nft_jhash *priv = nft_expr_priv(expr); - u8 genmask = nft_genmask_next(ctx->net); - - nft_symhash_init(ctx, expr, tb); - priv->map = nft_set_lookup_global(ctx->net, ctx->table, - tb[NFTA_HASH_SET_NAME], - tb[NFTA_HASH_SET_ID], genmask); - return PTR_ERR_OR_ZERO(priv->map); -} - static int nft_jhash_dump(struct sk_buff *skb, const struct nft_expr *expr) { @@ -247,18 +170,6 @@ nla_put_failure: return -1; } -static int nft_jhash_map_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_jhash *priv = nft_expr_priv(expr); - - if (nft_jhash_dump(skb, expr) || - nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name)) - return -1; - - return 0; -} - static int nft_symhash_dump(struct sk_buff *skb, const struct nft_expr *expr) { @@ -279,18 +190,6 @@ nla_put_failure: return -1; } -static int nft_symhash_map_dump(struct sk_buff *skb, - const struct nft_expr *expr) -{ - const struct nft_symhash *priv = nft_expr_priv(expr); - - if (nft_symhash_dump(skb, expr) || - nla_put_string(skb, NFTA_HASH_SET_NAME, priv->map->name)) - return -1; - - return 0; -} - static struct nft_expr_type nft_hash_type; static const struct nft_expr_ops nft_jhash_ops = { .type = &nft_hash_type, @@ -300,14 +199,6 @@ static const struct nft_expr_ops nft_jhash_ops = { .dump = nft_jhash_dump, }; -static const struct nft_expr_ops nft_jhash_map_ops = { - .type = &nft_hash_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_jhash)), - .eval = nft_jhash_map_eval, - .init = nft_jhash_map_init, - .dump = nft_jhash_map_dump, -}; - static const struct nft_expr_ops nft_symhash_ops = { .type = &nft_hash_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)), @@ -316,14 +207,6 @@ static const struct nft_expr_ops nft_symhash_ops = { .dump = nft_symhash_dump, }; -static const struct nft_expr_ops nft_symhash_map_ops = { - .type = &nft_hash_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)), - .eval = nft_symhash_map_eval, - .init = nft_symhash_map_init, - .dump = nft_symhash_map_dump, -}; - static const struct nft_expr_ops * nft_hash_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) @@ -336,12 +219,8 @@ nft_hash_select_ops(const struct nft_ctx *ctx, type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE])); switch (type) { case NFT_HASH_SYM: - if (tb[NFTA_HASH_SET_NAME]) - return &nft_symhash_map_ops; return &nft_symhash_ops; case NFT_HASH_JENKINS: - if (tb[NFTA_HASH_SET_NAME]) - return &nft_jhash_map_ops; return &nft_jhash_ops; default: break; -- cgit v1.2.3 From f88c19aab5f34835f1ba467c5b508ec4f782f07f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 17 Jan 2019 12:44:25 -0800 Subject: net_sched: add hit counter for matchall Although matchall always matches packets, however, it still relies on a protocol match first. So it is still useful to have such a counter for matchall. Of course, unlike u32, every time we hit a matchall filter, it is always a success, so we don't have to distinguish them. Sample output: filter protocol 802.1Q pref 100 matchall chain 0 filter protocol 802.1Q pref 100 matchall chain 0 handle 0x1 not_in_hw (rule hit 10) action order 1: vlan pop continue index 1 ref 1 bind 1 installed 40 sec used 1 sec Action statistics: Sent 836 bytes 10 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 Reported-by: Martin Olsson Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 6 ++++++ net/sched/cls_matchall.c | 24 ++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 95d0db2a8350..32a3416b51c3 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -527,11 +527,17 @@ enum { /* Match-all classifier */ +struct tc_matchall_pcnt { + __u64 rhit; +}; + enum { TCA_MATCHALL_UNSPEC, TCA_MATCHALL_CLASSID, TCA_MATCHALL_ACT, TCA_MATCHALL_FLAGS, + TCA_MATCHALL_PCNT, + TCA_MATCHALL_PAD, __TCA_MATCHALL_MAX, }; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 0e408ee9dcec..a1b803fd372e 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -22,6 +23,7 @@ struct cls_mall_head { u32 handle; u32 flags; unsigned int in_hw_count; + struct tc_matchall_pcnt __percpu *pf; struct rcu_work rwork; }; @@ -34,6 +36,7 @@ static int mall_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; *res = head->res; + __this_cpu_inc(head->pf->rhit); return tcf_exts_exec(skb, &head->exts, res); } @@ -46,6 +49,7 @@ static void __mall_destroy(struct cls_mall_head *head) { tcf_exts_destroy(&head->exts); tcf_exts_put_net(&head->exts); + free_percpu(head->pf); kfree(head); } @@ -192,6 +196,11 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, handle = 1; new->handle = handle; new->flags = flags; + new->pf = alloc_percpu(struct tc_matchall_pcnt); + if (!new->pf) { + err = -ENOMEM; + goto err_alloc_percpu; + } err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], ovr, extack); @@ -214,6 +223,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, err_replace_hw_filter: err_set_parms: + free_percpu(new->pf); +err_alloc_percpu: tcf_exts_destroy(&new->exts); err_exts_init: kfree(new); @@ -270,8 +281,10 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, tc_setup_cb_t *cb, static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { + struct tc_matchall_pcnt gpf = {}; struct cls_mall_head *head = fh; struct nlattr *nest; + int cpu; if (!head) return skb->len; @@ -289,6 +302,17 @@ static int mall_dump(struct net *net, struct tcf_proto *tp, void *fh, if (head->flags && nla_put_u32(skb, TCA_MATCHALL_FLAGS, head->flags)) goto nla_put_failure; + for_each_possible_cpu(cpu) { + struct tc_matchall_pcnt *pf = per_cpu_ptr(head->pf, cpu); + + gpf.rhit += pf->rhit; + } + + if (nla_put_64bit(skb, TCA_MATCHALL_PCNT, + sizeof(struct tc_matchall_pcnt), + &gpf, TCA_MATCHALL_PAD)) + goto nla_put_failure; + if (tcf_exts_dump(skb, &head->exts)) goto nla_put_failure; -- cgit v1.2.3 From cb5ccfbe73b389470e1dc11061bb185ef4bc9aec Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 17 Jan 2019 23:59:10 +0200 Subject: devlink: Add health buffer support Devlink health buffer is a mechanism to pass descriptors between drivers and devlink. The API allows the driver to add objects, object pair, value array (nested attributes), value and name. Driver can use this API to fill the buffers in a format which can be translated by the devlink to the netlink message. In order to fulfill it, an internal buffer descriptor is defined. This will hold the data and metadata per each attribute and by used to pass actual commands to the netlink. This mechanism will be later used in devlink health for dump and diagnose data store by the drivers. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: David S. Miller --- include/net/devlink.h | 76 +++++++ include/uapi/linux/devlink.h | 8 + net/core/devlink.c | 501 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 585 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 67f4293bc970..77c77319290a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -423,6 +423,8 @@ struct devlink_region; typedef void devlink_snapshot_data_dest_t(const void *data); +struct devlink_health_buffer; + struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, @@ -584,6 +586,22 @@ int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, u8 *data, u32 snapshot_id, devlink_snapshot_data_dest_t *data_destructor); +int devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer, + int attrtype); +void devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer); +void devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer); +int devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer, + char *name); +int devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer, + u8 value); +int devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer, + u32 value); +int devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer, + u64 value); +int devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer, + char *name); +int devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, + void *data, int len); #else static inline struct devlink *devlink_alloc(const struct devlink_ops *ops, @@ -844,6 +862,64 @@ devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, return 0; } +static inline int +devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer, + int attrtype) +{ + return 0; +} + +static inline void +devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer) +{ +} + +static inline void +devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer) +{ +} + +static inline int +devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer, + char *name) +{ + return 0; +} + +static inline int +devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer, + u8 value) +{ + return 0; +} + +static inline int +devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer, + u32 value) +{ + return 0; +} + +static inline int +devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer, + u64 value) +{ + return 0; +} + +static inline int +devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer, + char *name) +{ + return 0; +} + +static inline int +devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, + void *data, int len) +{ + return 0; +} #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 6e52d3660654..cff0e0cb5ac2 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -285,6 +285,14 @@ enum devlink_attr { DEVLINK_ATTR_REGION_CHUNK_ADDR, /* u64 */ DEVLINK_ATTR_REGION_CHUNK_LEN, /* u64 */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT, /* nested */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR, /* nested */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME, /* string */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE, /* nested */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY, /* nested */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, /* u8 */ + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, /* dynamic */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index abb0da9d7b4b..8984501edade 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3597,6 +3597,507 @@ out: return 0; } +#define DEVLINK_HEALTH_BUFFER_SIZE (4096 - GENL_HDRLEN) +#define DEVLINK_HEALTH_BUFFER_DATA_SIZE (DEVLINK_HEALTH_BUFFER_SIZE / 2) +#define DEVLINK_HEALTH_SIZE_TO_BUFFERS(size) DIV_ROUND_UP(size, DEVLINK_HEALTH_BUFFER_DATA_SIZE) +#define DEVLINK_HEALTH_BUFFER_MAX_CHUNK 1024 + +struct devlink_health_buffer { + void *data; + u64 offset; + u64 bytes_left; + u64 bytes_left_metadata; + u64 max_nested_depth; + u64 curr_nest; +}; + +struct devlink_health_buffer_desc { + int attrtype; + u16 len; + u8 nla_type; + u8 nest_end; + int value[0]; +}; + +static void +devlink_health_buffers_reset(struct devlink_health_buffer **buffers_list, + u64 num_of_buffers) +{ + u64 i; + + for (i = 0; i < num_of_buffers; i++) { + memset(buffers_list[i]->data, 0, DEVLINK_HEALTH_BUFFER_SIZE); + buffers_list[i]->offset = 0; + buffers_list[i]->bytes_left = DEVLINK_HEALTH_BUFFER_DATA_SIZE; + buffers_list[i]->bytes_left_metadata = + DEVLINK_HEALTH_BUFFER_DATA_SIZE; + buffers_list[i]->max_nested_depth = 0; + buffers_list[i]->curr_nest = 0; + } +} + +static void +devlink_health_buffers_destroy(struct devlink_health_buffer **buffers_list, + u64 size); + +static struct devlink_health_buffer ** +devlink_health_buffers_create(u64 size) +{ + struct devlink_health_buffer **buffers_list; + u64 num_of_buffers = DEVLINK_HEALTH_SIZE_TO_BUFFERS(size); + u64 i; + + buffers_list = kcalloc(num_of_buffers, + sizeof(struct devlink_health_buffer *), + GFP_KERNEL); + if (!buffers_list) + return NULL; + + for (i = 0; i < num_of_buffers; i++) { + struct devlink_health_buffer *buffer; + void *data; + + buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); + data = kzalloc(DEVLINK_HEALTH_BUFFER_SIZE, GFP_KERNEL); + if (!buffer || !data) { + kfree(buffer); + kfree(data); + goto buffers_cleanup; + } + buffers_list[i] = buffer; + buffer->data = data; + } + devlink_health_buffers_reset(buffers_list, num_of_buffers); + + return buffers_list; + +buffers_cleanup: + devlink_health_buffers_destroy(buffers_list, --i); + kfree(buffers_list); + return NULL; +} + +static void +devlink_health_buffers_destroy(struct devlink_health_buffer **buffers_list, + u64 num_of_buffers) +{ + u64 i; + + for (i = 0; i < num_of_buffers; i++) { + kfree(buffers_list[i]->data); + kfree(buffers_list[i]); + } +} + +void +devlink_health_buffer_offset_inc(struct devlink_health_buffer *buffer, + int len) +{ + buffer->offset += len; +} + +/* In order to store a nest, need two descriptors, for start and end */ +#define DEVLINK_HEALTH_BUFFER_NEST_SIZE (sizeof(struct devlink_health_buffer_desc) * 2) + +int devlink_health_buffer_verify_len(struct devlink_health_buffer *buffer, + int len, int metadata_len) +{ + if (len > DEVLINK_HEALTH_BUFFER_DATA_SIZE) + return -EINVAL; + + if (buffer->bytes_left < len || + buffer->bytes_left_metadata < metadata_len) + return -ENOMEM; + + return 0; +} + +static struct devlink_health_buffer_desc * +devlink_health_buffer_get_desc_from_offset(struct devlink_health_buffer *buffer) +{ + return buffer->data + buffer->offset; +} + +int +devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer, + int attrtype) +{ + struct devlink_health_buffer_desc *desc; + int err; + + err = devlink_health_buffer_verify_len(buffer, 0, + DEVLINK_HEALTH_BUFFER_NEST_SIZE); + if (err) + return err; + + if (attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT && + attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR && + attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE && + attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY) + return -EINVAL; + + desc = devlink_health_buffer_get_desc_from_offset(buffer); + + desc->attrtype = attrtype; + buffer->bytes_left_metadata -= DEVLINK_HEALTH_BUFFER_NEST_SIZE; + devlink_health_buffer_offset_inc(buffer, sizeof(*desc)); + + buffer->curr_nest++; + buffer->max_nested_depth = max(buffer->max_nested_depth, + buffer->curr_nest); + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_start); + +enum devlink_health_buffer_nest_end_cancel { + DEVLINK_HEALTH_BUFFER_NEST_END = 1, + DEVLINK_HEALTH_BUFFER_NEST_CANCEL, +}; + +static void +devlink_health_buffer_nest_end_cancel(struct devlink_health_buffer *buffer, + enum devlink_health_buffer_nest_end_cancel nest) +{ + struct devlink_health_buffer_desc *desc; + + WARN_ON(!buffer->curr_nest); + buffer->curr_nest--; + + desc = devlink_health_buffer_get_desc_from_offset(buffer); + desc->nest_end = nest; + devlink_health_buffer_offset_inc(buffer, sizeof(*desc)); +} + +void devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer) +{ + devlink_health_buffer_nest_end_cancel(buffer, + DEVLINK_HEALTH_BUFFER_NEST_END); +} +EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_end); + +void devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer) +{ + devlink_health_buffer_nest_end_cancel(buffer, + DEVLINK_HEALTH_BUFFER_NEST_CANCEL); +} +EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_cancel); + +int +devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer, + char *name) +{ + struct devlink_health_buffer_desc *desc; + int err; + + err = devlink_health_buffer_verify_len(buffer, strlen(name) + 1, + sizeof(*desc)); + if (err) + return err; + + desc = devlink_health_buffer_get_desc_from_offset(buffer); + desc->attrtype = DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME; + desc->nla_type = NLA_NUL_STRING; + desc->len = strlen(name) + 1; + memcpy(&desc->value, name, desc->len); + devlink_health_buffer_offset_inc(buffer, sizeof(*desc) + desc->len); + + buffer->bytes_left_metadata -= sizeof(*desc); + buffer->bytes_left -= (strlen(name) + 1); + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_health_buffer_put_object_name); + +static int +devlink_health_buffer_put_value(struct devlink_health_buffer *buffer, + u8 nla_type, void *value, int len) +{ + struct devlink_health_buffer_desc *desc; + int err; + + err = devlink_health_buffer_verify_len(buffer, len, sizeof(*desc)); + if (err) + return err; + + desc = devlink_health_buffer_get_desc_from_offset(buffer); + desc->attrtype = DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA; + desc->nla_type = nla_type; + desc->len = len; + memcpy(&desc->value, value, len); + devlink_health_buffer_offset_inc(buffer, sizeof(*desc) + desc->len); + + buffer->bytes_left_metadata -= sizeof(*desc); + buffer->bytes_left -= len; + + return 0; +} + +int +devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer, + u8 value) +{ + int err; + + err = devlink_health_buffer_put_value(buffer, NLA_U8, &value, + sizeof(value)); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u8); + +int +devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer, + u32 value) +{ + int err; + + err = devlink_health_buffer_put_value(buffer, NLA_U32, &value, + sizeof(value)); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u32); + +int +devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer, + u64 value) +{ + int err; + + err = devlink_health_buffer_put_value(buffer, NLA_U64, &value, + sizeof(value)); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u64); + +int +devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer, + char *name) +{ + int err; + + if (strlen(name) + 1 > DEVLINK_HEALTH_BUFFER_MAX_CHUNK) + return -EINVAL; + + err = devlink_health_buffer_put_value(buffer, NLA_NUL_STRING, name, + strlen(name) + 1); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_string); + +int +devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, + void *data, int len) +{ + int err; + + if (len > DEVLINK_HEALTH_BUFFER_MAX_CHUNK) + return -EINVAL; + + err = devlink_health_buffer_put_value(buffer, NLA_BINARY, data, len); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_data); + +static int +devlink_health_buffer_fill_data(struct sk_buff *skb, + struct devlink_health_buffer_desc *desc) +{ + int err = -EINVAL; + + switch (desc->nla_type) { + case NLA_U8: + err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, + *(u8 *)desc->value); + break; + case NLA_U32: + err = nla_put_u32(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, + *(u32 *)desc->value); + break; + case NLA_U64: + err = nla_put_u64_64bit(skb, + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, + *(u64 *)desc->value, DEVLINK_ATTR_PAD); + break; + case NLA_NUL_STRING: + err = nla_put_string(skb, + DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, + (char *)&desc->value); + break; + case NLA_BINARY: + err = nla_put(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, + desc->len, (void *)&desc->value); + break; + } + + return err; +} + +static int +devlink_health_buffer_fill_type(struct sk_buff *skb, + struct devlink_health_buffer_desc *desc) +{ + int err = -EINVAL; + + switch (desc->nla_type) { + case NLA_U8: + err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, + NLA_U8); + break; + case NLA_U32: + err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, + NLA_U32); + break; + case NLA_U64: + err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, + NLA_U64); + break; + case NLA_NUL_STRING: + err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, + NLA_NUL_STRING); + break; + case NLA_BINARY: + err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, + NLA_BINARY); + break; + } + + return err; +} + +static inline struct devlink_health_buffer_desc * +devlink_health_buffer_get_next_desc(struct devlink_health_buffer_desc *desc) +{ + return (void *)&desc->value + desc->len; +} + +static int +devlink_health_buffer_prepare_skb(struct sk_buff *skb, + struct devlink_health_buffer *buffer) +{ + struct devlink_health_buffer_desc *last_desc, *desc; + struct nlattr **buffer_nlattr; + int err; + int i = 0; + + buffer_nlattr = kcalloc(buffer->max_nested_depth, + sizeof(*buffer_nlattr), GFP_KERNEL); + if (!buffer_nlattr) + return -EINVAL; + + last_desc = devlink_health_buffer_get_desc_from_offset(buffer); + desc = buffer->data; + while (desc != last_desc) { + switch (desc->attrtype) { + case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT: + case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR: + case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE: + case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY: + buffer_nlattr[i] = nla_nest_start(skb, desc->attrtype); + if (!buffer_nlattr[i]) + goto nla_put_failure; + i++; + break; + case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA: + err = devlink_health_buffer_fill_data(skb, desc); + if (err) + goto nla_put_failure; + err = devlink_health_buffer_fill_type(skb, desc); + if (err) + goto nla_put_failure; + break; + case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME: + err = nla_put_string(skb, desc->attrtype, + (char *)&desc->value); + if (err) + goto nla_put_failure; + break; + default: + WARN_ON(!desc->nest_end); + WARN_ON(i <= 0); + if (desc->nest_end == DEVLINK_HEALTH_BUFFER_NEST_END) + nla_nest_end(skb, buffer_nlattr[--i]); + else + nla_nest_cancel(skb, buffer_nlattr[--i]); + break; + } + desc = devlink_health_buffer_get_next_desc(desc); + } + + return 0; + +nla_put_failure: + kfree(buffer_nlattr); + return err; +} + +static int +devlink_health_buffer_snd(struct genl_info *info, + enum devlink_command cmd, int flags, + struct devlink_health_buffer **buffers_array, + u64 num_of_buffers) +{ + struct sk_buff *skb; + struct nlmsghdr *nlh; + void *hdr; + int err; + u64 i; + + for (i = 0; i < num_of_buffers; i++) { + /* Skip buffer if driver did not fill it up with any data */ + if (!buffers_array[i]->offset) + continue; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) + goto nla_put_failure; + + err = devlink_health_buffer_prepare_skb(skb, buffers_array[i]); + if (err) + goto nla_put_failure; + + genlmsg_end(skb, hdr); + err = genlmsg_reply(skb, info); + if (err) + return err; + } + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + err = genlmsg_reply(skb, info); + if (err) + return err; + + return 0; + +nla_put_failure: + err = -EIO; + nlmsg_free(skb); + return err; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, -- cgit v1.2.3 From ff253fedab961b22117a73ab808fcfa9e6852b50 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 17 Jan 2019 23:59:13 +0200 Subject: devlink: Add health get command Add devlink health get command to provide reporter/s data for user space. Add the ability to get data per reporter or dump data from all available reporters. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 12 ++++ net/core/devlink.c | 152 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index cff0e0cb5ac2..c05470578b99 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -89,6 +89,8 @@ enum devlink_command { DEVLINK_CMD_REGION_DEL, DEVLINK_CMD_REGION_READ, + DEVLINK_CMD_HEALTH_REPORTER_GET, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -293,6 +295,16 @@ enum devlink_attr { DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, /* u8 */ DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, /* dynamic */ + DEVLINK_ATTR_HEALTH_REPORTER, /* nested */ + DEVLINK_ATTR_HEALTH_REPORTER_NAME, /* string */ + DEVLINK_ATTR_HEALTH_REPORTER_STATE, /* u8 */ + DEVLINK_ATTR_HEALTH_REPORTER_ERR, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_RECOVER, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL, /* u8 */ + DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, /* u8 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 943d3e7dea6a..2ba9275449c2 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4317,6 +4317,149 @@ int devlink_health_report(struct devlink_health_reporter *reporter, } EXPORT_SYMBOL_GPL(devlink_health_report); +static struct devlink_health_reporter * +devlink_health_reporter_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + char *reporter_name; + + if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) + return NULL; + + reporter_name = + nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); + return devlink_health_reporter_find_by_name(devlink, reporter_name); +} + +static int +devlink_nl_health_reporter_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_health_reporter *reporter, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + struct nlattr *reporter_attr; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + + reporter_attr = nla_nest_start(msg, DEVLINK_ATTR_HEALTH_REPORTER); + if (!reporter_attr) + goto genlmsg_cancel; + if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, + reporter->ops->name)) + goto reporter_nest_cancel; + if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, + reporter->health_state)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR, + reporter->error_count, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER, + reporter->recovery_count, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, + reporter->graceful_period, + DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, + reporter->auto_recover)) + goto reporter_nest_cancel; + if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL, + reporter->dump_avail)) + goto reporter_nest_cancel; + if (reporter->dump_avail && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, + jiffies_to_msecs(reporter->dump_ts), + DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + + nla_nest_end(msg, reporter_attr); + genlmsg_end(msg, hdr); + return 0; + +reporter_nest_cancel: + nla_nest_end(msg, reporter_attr); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + struct sk_buff *msg; + int err; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_health_reporter_fill(msg, devlink, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + info->snd_portid, info->snd_seq, + 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int +devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_health_reporter *reporter; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(reporter, &devlink->reporter_list, + list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_health_reporter_fill(msg, devlink, + reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -4342,6 +4485,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -4562,6 +4706,14 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, + .doit = devlink_nl_cmd_health_reporter_get_doit, + .dumpit = devlink_nl_cmd_health_reporter_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From 6f9d56132eb6d2603d4273cfc65bed914ec47acb Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 17 Jan 2019 23:59:14 +0200 Subject: devlink: Add health set command Add devlink health set command, in order to set configuration parameters for a specific reporter. Supported parameters are: - graceful_period: Time interval between auto recoveries (in msec) - auto_recover: Determines if the devlink shall execute recover upon receiving error for the reporter Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index c05470578b99..49ad5a76b121 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -90,6 +90,7 @@ enum devlink_command { DEVLINK_CMD_REGION_READ, DEVLINK_CMD_HEALTH_REPORTER_GET, + DEVLINK_CMD_HEALTH_REPORTER_SET, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 2ba9275449c2..a34414bf1c27 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4460,6 +4460,33 @@ out: return msg->len; } +static int +devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->recover && + (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) + return -EINVAL; + + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) + reporter->graceful_period = + nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); + + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) + reporter->auto_recover = + nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); + + return 0; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -4486,6 +4513,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -4714,6 +4743,13 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_SET, + .doit = devlink_nl_cmd_health_reporter_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From fcd852c69d776c0f46c8f79e8e431e5cc6ddc7b7 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 17 Jan 2019 23:59:15 +0200 Subject: devlink: Add health recover command Add devlink health recover command to the uapi, in order to allow the user to execute a recover operation over a specific reporter. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 49ad5a76b121..1c186fd77343 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -91,6 +91,7 @@ enum devlink_command { DEVLINK_CMD_HEALTH_REPORTER_GET, DEVLINK_CMD_HEALTH_REPORTER_SET, + DEVLINK_CMD_HEALTH_REPORTER_RECOVER, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index a34414bf1c27..b224d0d31c0c 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4487,6 +4487,19 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, return 0; } +static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + return devlink_health_reporter_recover(reporter, NULL); +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -4750,6 +4763,13 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER, + .doit = devlink_nl_cmd_health_reporter_recover_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From 8a66704a13d9713593342e29b4f0c19762f5746b Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 17 Jan 2019 23:59:16 +0200 Subject: devlink: Add health diagnose command Add devlink health diagnose command, in order to run a diagnose operation over a specific reporter. It is expected from driver's callback for diagnose command to fill it via the buffer descriptors API. Devlink will parse it and convert it to netlink nla API in order to pass it to the user. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1c186fd77343..51b4d7612cf8 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -92,6 +92,7 @@ enum devlink_command { DEVLINK_CMD_HEALTH_REPORTER_GET, DEVLINK_CMD_HEALTH_REPORTER_SET, DEVLINK_CMD_HEALTH_REPORTER_RECOVER, + DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index b224d0d31c0c..57252ca31e1e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4500,6 +4500,50 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, return devlink_health_reporter_recover(reporter, NULL); } +static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + u64 num_of_buffers; + int err; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->diagnose) + return -EOPNOTSUPP; + + num_of_buffers = + DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->diagnose_size); + + mutex_lock(&reporter->diagnose_lock); + devlink_health_buffers_reset(reporter->diagnose_buffers_array, + num_of_buffers); + + err = reporter->ops->diagnose(reporter, + reporter->diagnose_buffers_array, + DEVLINK_HEALTH_BUFFER_SIZE, + num_of_buffers); + if (err) + goto out; + + err = devlink_health_buffer_snd(info, + DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, + 0, reporter->diagnose_buffers_array, + num_of_buffers); + if (err) + goto out; + + mutex_unlock(&reporter->diagnose_lock); + return 0; + +out: + mutex_unlock(&reporter->diagnose_lock); + return err; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -4770,6 +4814,13 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, + .doit = devlink_nl_cmd_health_reporter_diagnose_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From 12bd0dcefe88782ac1c9fff632958dd1b71d27e5 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 17 Jan 2019 23:59:17 +0200 Subject: devlink: Add health dump {get,clear} commands Add devlink health dump commands, in order to run an dump operation over a specific reporter. The supported operations are dump_get in order to get last saved dump (if not exist, dump now) and dump_clear to clear last saved dump. It is expected from driver's callback for diagnose command to fill it via the buffer descriptors API. Devlink will parse it and convert it to netlink nla API in order to pass it to the user. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 75 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 51b4d7612cf8..6b26bb2ce4dc 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -93,6 +93,8 @@ enum devlink_command { DEVLINK_CMD_HEALTH_REPORTER_SET, DEVLINK_CMD_HEALTH_REPORTER_RECOVER, DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, + DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, + DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 57252ca31e1e..60248a53c0ad 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4544,6 +4544,65 @@ out: return err; } +static void +devlink_health_dump_clear(struct devlink_health_reporter *reporter) +{ + reporter->dump_avail = false; + reporter->dump_ts = 0; + devlink_health_buffers_reset(reporter->dump_buffers_array, + DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size)); +} + +static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + u64 num_of_buffers; + int err; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->dump) + return -EOPNOTSUPP; + + num_of_buffers = + DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size); + + mutex_lock(&reporter->dump_lock); + err = devlink_health_do_dump(reporter, NULL); + if (err) + goto out; + + err = devlink_health_buffer_snd(info, + DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, + 0, reporter->dump_buffers_array, + num_of_buffers); + +out: + mutex_unlock(&reporter->dump_lock); + return err; +} + +static int +devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + mutex_lock(&reporter->dump_lock); + devlink_health_dump_clear(reporter); + mutex_unlock(&reporter->dump_lock); + return 0; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -4821,6 +4880,22 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, + .doit = devlink_nl_cmd_health_reporter_dump_get_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, + }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, + .doit = devlink_nl_cmd_health_reporter_dump_clear_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From 36647055b37ec78e9068f470f14e7cd75c001c22 Mon Sep 17 00:00:00 2001 From: Toke Høiland-Jørgensen Date: Tue, 18 Dec 2018 17:02:07 -0800 Subject: cfg80211: Add airtime statistics and settings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds TX airtime statistics to the cfg80211 station dump (to go along with the RX info already present), and adds a new parameter to set the airtime weight of each station. The latter allows userspace to implement policies for different stations by varying their weights. Signed-off-by: Toke Høiland-Jørgensen [rmanohar@codeaurora.org: fixed checkpatch warnings] Signed-off-by: Rajkumar Manoharan [move airtime weight != 0 check into policy] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 11 ++++++++++- include/uapi/linux/nl80211.h | 15 +++++++++++++++ net/wireless/nl80211.c | 24 ++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index e0c41eb1c860..1691f52fcc80 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1003,6 +1003,7 @@ enum station_parameters_apply_mask { * @support_p2p_ps: information if station supports P2P PS mechanism * @he_capa: HE capabilities of station * @he_capa_len: the length of the HE capabilities + * @airtime_weight: airtime scheduler weight for this station */ struct station_parameters { const u8 *supported_rates; @@ -1032,6 +1033,7 @@ struct station_parameters { int support_p2p_ps; const struct ieee80211_he_cap_elem *he_capa; u8 he_capa_len; + u16 airtime_weight; }; /** @@ -1300,6 +1302,8 @@ struct cfg80211_tid_stats { * from this peer * @connected_to_gate: true if mesh STA has a path to mesh gate * @rx_duration: aggregate PPDU duration(usecs) for all the frames from a peer + * @tx_duration: aggregate PPDU duration(usecs) for all the frames to a peer + * @airtime_weight: current airtime scheduling weight * @pertid: per-TID statistics, see &struct cfg80211_tid_stats, using the last * (IEEE80211_NUM_TIDS) index for MSDUs not encapsulated in QoS-MPDUs. * Note that this doesn't use the @filled bit, but is used if non-NULL. @@ -1350,8 +1354,9 @@ struct station_info { u32 expected_throughput; - u64 rx_beacon; + u64 tx_duration; u64 rx_duration; + u64 rx_beacon; u8 rx_beacon_signal_avg; u8 connected_to_gate; @@ -1359,6 +1364,8 @@ struct station_info { s8 ack_signal; s8 avg_ack_signal; + u16 airtime_weight; + u32 rx_mpdu_count; u32 fcs_err_count; }; @@ -2391,6 +2398,8 @@ enum wiphy_params_flags { WIPHY_PARAM_TXQ_QUANTUM = 1 << 8, }; +#define IEEE80211_DEFAULT_AIRTIME_WEIGHT 256 + /** * struct cfg80211_pmksa - PMK Security Association * diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 31ae5c7f10e3..ebe79e12c82e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2299,6 +2299,9 @@ enum nl80211_commands { * This is also used for capability advertisement in the wiphy information, * with the appropriate sub-attributes. * + * @NL80211_ATTR_AIRTIME_WEIGHT: Station's weight when scheduled by the airtime + * scheduler. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2748,6 +2751,8 @@ enum nl80211_attrs { NL80211_ATTR_PEER_MEASUREMENTS, + NL80211_ATTR_AIRTIME_WEIGHT, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3125,6 +3130,9 @@ enum nl80211_sta_bss_param { * might not be fully accurate. * @NL80211_STA_INFO_CONNECTED_TO_GATE: set to true if STA has a path to a * mesh gate (u8, 0 or 1) + * @NL80211_STA_INFO_TX_DURATION: aggregate PPDU duration for all frames + * sent to the station (u64, usec) + * @NL80211_STA_INFO_AIRTIME_WEIGHT: current airtime weight for station (u16) * @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute */ @@ -3168,6 +3176,8 @@ enum nl80211_sta_info { NL80211_STA_INFO_RX_MPDUS, NL80211_STA_INFO_FCS_ERROR_COUNT, NL80211_STA_INFO_CONNECTED_TO_GATE, + NL80211_STA_INFO_TX_DURATION, + NL80211_STA_INFO_AIRTIME_WEIGHT, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, @@ -5316,6 +5326,10 @@ enum nl80211_feature_flags { * if this flag is not set. Ignoring this can leak clear text packets and/or * freeze the connection. * + * @NL80211_EXT_FEATURE_AIRTIME_FAIRNESS: Driver supports getting airtime + * fairness for transmitted packets and has enabled airtime fairness + * scheduling. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5355,6 +5369,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT, NL80211_EXT_FEATURE_CAN_REPLACE_PTK0, NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER, + NL80211_EXT_FEATURE_AIRTIME_FAIRNESS, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5e49492d5911..a89688929b16 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -557,6 +557,7 @@ const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_PEER_MEASUREMENTS] = NLA_POLICY_NESTED(NL80211_PMSR_FTM_REQ_ATTR_MAX, nl80211_pmsr_attr_policy), + [NL80211_ATTR_AIRTIME_WEIGHT] = NLA_POLICY_MIN(NLA_U16, 1), }; /* policy for the key attributes */ @@ -4851,6 +4852,11 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO(PLID, plid, u16); PUT_SINFO(PLINK_STATE, plink_state, u8); PUT_SINFO_U64(RX_DURATION, rx_duration); + PUT_SINFO_U64(TX_DURATION, tx_duration); + + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) + PUT_SINFO(AIRTIME_WEIGHT, airtime_weight, u16); switch (rdev->wiphy.signal_type) { case CFG80211_SIGNAL_TYPE_MBM: @@ -5470,6 +5476,15 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) nla_get_u8(info->attrs[NL80211_ATTR_OPMODE_NOTIF]); } + if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]) + params.airtime_weight = + nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]); + + if (params.airtime_weight && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) + return -EOPNOTSUPP; + /* Include parameters for TDLS peer (will check later) */ err = nl80211_set_station_tdls(info, ¶ms); if (err) @@ -5598,6 +5613,15 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) params.plink_action = nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); + if (info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]) + params.airtime_weight = + nla_get_u16(info->attrs[NL80211_ATTR_AIRTIME_WEIGHT]); + + if (params.airtime_weight && + !wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) + return -EOPNOTSUPP; + err = nl80211_parse_sta_channel_info(info, ¶ms); if (err) return err; -- cgit v1.2.3 From cc24163690997c685641d84e77ff6f1c592b06fe Mon Sep 17 00:00:00 2001 From: Julan Hsu Date: Tue, 15 Jan 2019 15:28:42 -0800 Subject: nl80211/mac80211: mesh: add hop count to mpath info Expose hop count to destination information in mpath info Signed-off-by: Julan Hsu Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 4 ++++ include/uapi/linux/nl80211.h | 2 ++ net/mac80211/cfg.c | 4 +++- net/mac80211/mesh_hwmp.c | 5 +++++ net/wireless/nl80211.c | 5 ++++- 5 files changed, 18 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1691f52fcc80..37816786d3e1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1429,6 +1429,7 @@ enum monitor_flags { * @MPATH_INFO_DISCOVERY_TIMEOUT: @discovery_timeout filled * @MPATH_INFO_DISCOVERY_RETRIES: @discovery_retries filled * @MPATH_INFO_FLAGS: @flags filled + * @MPATH_INFO_HOP_COUNT: @hop_count filled */ enum mpath_info_flags { MPATH_INFO_FRAME_QLEN = BIT(0), @@ -1438,6 +1439,7 @@ enum mpath_info_flags { MPATH_INFO_DISCOVERY_TIMEOUT = BIT(4), MPATH_INFO_DISCOVERY_RETRIES = BIT(5), MPATH_INFO_FLAGS = BIT(6), + MPATH_INFO_HOP_COUNT = BIT(7) }; /** @@ -1457,6 +1459,7 @@ enum mpath_info_flags { * This number should increase every time the list of mesh paths * changes, i.e. when a station is added or removed, so that * userspace can tell whether it got a consistent snapshot. + * @hop_count: hops to destination */ struct mpath_info { u32 filled; @@ -1467,6 +1470,7 @@ struct mpath_info { u32 discovery_timeout; u8 discovery_retries; u8 flags; + u8 hop_count; int generation; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index ebe79e12c82e..213a1d7c1063 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3287,6 +3287,7 @@ enum nl80211_mpath_flags { * &enum nl80211_mpath_flags; * @NL80211_MPATH_INFO_DISCOVERY_TIMEOUT: total path discovery timeout, in msec * @NL80211_MPATH_INFO_DISCOVERY_RETRIES: mesh path discovery retries + * @NL80211_MPATH_INFO_HOP_COUNT: hop count to destination * @NL80211_MPATH_INFO_MAX: highest mesh path information attribute number * currently defind * @__NL80211_MPATH_INFO_AFTER_LAST: internal use @@ -3300,6 +3301,7 @@ enum nl80211_mpath_info { NL80211_MPATH_INFO_FLAGS, NL80211_MPATH_INFO_DISCOVERY_TIMEOUT, NL80211_MPATH_INFO_DISCOVERY_RETRIES, + NL80211_MPATH_INFO_HOP_COUNT, /* keep last */ __NL80211_MPATH_INFO_AFTER_LAST, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 83ee573b1804..52cbaaf5caea 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1745,7 +1745,8 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, MPATH_INFO_EXPTIME | MPATH_INFO_DISCOVERY_TIMEOUT | MPATH_INFO_DISCOVERY_RETRIES | - MPATH_INFO_FLAGS; + MPATH_INFO_FLAGS | + MPATH_INFO_HOP_COUNT; pinfo->frame_qlen = mpath->frame_queue.qlen; pinfo->sn = mpath->sn; @@ -1765,6 +1766,7 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, pinfo->flags |= NL80211_MPATH_FLAG_FIXED; if (mpath->flags & MESH_PATH_RESOLVED) pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED; + pinfo->hop_count = mpath->hop_count; } static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev, diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 6950cd0bf594..6d1190b3332f 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -386,6 +386,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, unsigned long orig_lifetime, exp_time; u32 last_hop_metric, new_metric; bool process = true; + u8 hopcount; rcu_read_lock(); sta = sta_info_get(sdata, mgmt->sa); @@ -404,6 +405,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, orig_sn = PREQ_IE_ORIG_SN(hwmp_ie); orig_lifetime = PREQ_IE_LIFETIME(hwmp_ie); orig_metric = PREQ_IE_METRIC(hwmp_ie); + hopcount = PREQ_IE_HOPCOUNT(hwmp_ie) + 1; break; case MPATH_PREP: /* Originator here refers to the MP that was the target in the @@ -415,6 +417,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, orig_sn = PREP_IE_TARGET_SN(hwmp_ie); orig_lifetime = PREP_IE_LIFETIME(hwmp_ie); orig_metric = PREP_IE_METRIC(hwmp_ie); + hopcount = PREP_IE_HOPCOUNT(hwmp_ie) + 1; break; default: rcu_read_unlock(); @@ -482,6 +485,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, mpath->sn = orig_sn; mpath->exp_time = time_after(mpath->exp_time, exp_time) ? mpath->exp_time : exp_time; + mpath->hop_count = hopcount; mesh_path_activate(mpath); spin_unlock_bh(&mpath->state_lock); ewma_mesh_fail_avg_init(&sta->mesh->fail_avg); @@ -523,6 +527,7 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, mpath->metric = last_hop_metric; mpath->exp_time = time_after(mpath->exp_time, exp_time) ? mpath->exp_time : exp_time; + mpath->hop_count = 1; mesh_path_activate(mpath); spin_unlock_bh(&mpath->state_lock); ewma_mesh_fail_avg_init(&sta->mesh->fail_avg); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a89688929b16..159125e16c79 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5827,7 +5827,10 @@ static int nl80211_send_mpath(struct sk_buff *msg, u32 portid, u32 seq, pinfo->discovery_timeout)) || ((pinfo->filled & MPATH_INFO_DISCOVERY_RETRIES) && nla_put_u8(msg, NL80211_MPATH_INFO_DISCOVERY_RETRIES, - pinfo->discovery_retries))) + pinfo->discovery_retries)) || + ((pinfo->filled & MPATH_INFO_HOP_COUNT) && + nla_put_u8(msg, NL80211_MPATH_INFO_HOP_COUNT, + pinfo->hop_count))) goto nla_put_failure; nla_nest_end(msg, pinfoattr); -- cgit v1.2.3 From 540bbcb930ed2fc9d6a57e0babea00027a7ecc67 Mon Sep 17 00:00:00 2001 From: Julan Hsu Date: Tue, 15 Jan 2019 15:28:43 -0800 Subject: nl80211/mac80211: mesh: add mesh path change count to mpath info Expose path change count to destination in mpath info Signed-off-by: Julan Hsu Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 ++++- include/uapi/linux/nl80211.h | 4 +++- net/mac80211/cfg.c | 4 +++- net/mac80211/mesh.h | 2 ++ net/mac80211/mesh_hwmp.c | 4 ++++ net/wireless/nl80211.c | 5 ++++- 6 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 37816786d3e1..9c1d7979c200 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1439,7 +1439,8 @@ enum mpath_info_flags { MPATH_INFO_DISCOVERY_TIMEOUT = BIT(4), MPATH_INFO_DISCOVERY_RETRIES = BIT(5), MPATH_INFO_FLAGS = BIT(6), - MPATH_INFO_HOP_COUNT = BIT(7) + MPATH_INFO_HOP_COUNT = BIT(7), + MPATH_INFO_PATH_CHANGE = BIT(8), }; /** @@ -1460,6 +1461,7 @@ enum mpath_info_flags { * changes, i.e. when a station is added or removed, so that * userspace can tell whether it got a consistent snapshot. * @hop_count: hops to destination + * @path_change_count: total number of path changes to destination */ struct mpath_info { u32 filled; @@ -1471,6 +1473,7 @@ struct mpath_info { u8 discovery_retries; u8 flags; u8 hop_count; + u32 path_change_count; int generation; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 213a1d7c1063..426db4d8f71c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3288,8 +3288,9 @@ enum nl80211_mpath_flags { * @NL80211_MPATH_INFO_DISCOVERY_TIMEOUT: total path discovery timeout, in msec * @NL80211_MPATH_INFO_DISCOVERY_RETRIES: mesh path discovery retries * @NL80211_MPATH_INFO_HOP_COUNT: hop count to destination + * @NL80211_MPATH_INFO_PATH_CHANGE: total number of path changes to destination * @NL80211_MPATH_INFO_MAX: highest mesh path information attribute number - * currently defind + * currently defined * @__NL80211_MPATH_INFO_AFTER_LAST: internal use */ enum nl80211_mpath_info { @@ -3302,6 +3303,7 @@ enum nl80211_mpath_info { NL80211_MPATH_INFO_DISCOVERY_TIMEOUT, NL80211_MPATH_INFO_DISCOVERY_RETRIES, NL80211_MPATH_INFO_HOP_COUNT, + NL80211_MPATH_INFO_PATH_CHANGE, /* keep last */ __NL80211_MPATH_INFO_AFTER_LAST, diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 52cbaaf5caea..e5e0f100389c 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1746,7 +1746,8 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, MPATH_INFO_DISCOVERY_TIMEOUT | MPATH_INFO_DISCOVERY_RETRIES | MPATH_INFO_FLAGS | - MPATH_INFO_HOP_COUNT; + MPATH_INFO_HOP_COUNT | + MPATH_INFO_PATH_CHANGE; pinfo->frame_qlen = mpath->frame_queue.qlen; pinfo->sn = mpath->sn; @@ -1767,6 +1768,7 @@ static void mpath_set_pinfo(struct mesh_path *mpath, u8 *next_hop, if (mpath->flags & MESH_PATH_RESOLVED) pinfo->flags |= NL80211_MPATH_FLAG_RESOLVED; pinfo->hop_count = mpath->hop_count; + pinfo->path_change_count = mpath->path_change_count; } static int ieee80211_get_mpath(struct wiphy *wiphy, struct net_device *dev, diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h index cad6592c52a1..8b26858ab4d5 100644 --- a/net/mac80211/mesh.h +++ b/net/mac80211/mesh.h @@ -94,6 +94,7 @@ enum mesh_deferred_task_flags { * @last_preq_to_root: Timestamp of last PREQ sent to root * @is_root: the destination station of this path is a root node * @is_gate: the destination station of this path is a mesh gate + * @path_change_count: the number of path changes to destination * * * The dst address is unique in the mesh path table. Since the mesh_path is @@ -124,6 +125,7 @@ struct mesh_path { unsigned long last_preq_to_root; bool is_root; bool is_gate; + u32 path_change_count; }; /** diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 6d1190b3332f..a0aebf44493f 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -479,6 +479,8 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, } if (fresh_info) { + if (rcu_access_pointer(mpath->next_hop) != sta) + mpath->path_change_count++; mesh_path_assign_nexthop(mpath, sta); mpath->flags |= MESH_PATH_SN_VALID; mpath->metric = new_metric; @@ -523,6 +525,8 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, } if (fresh_info) { + if (rcu_access_pointer(mpath->next_hop) != sta) + mpath->path_change_count++; mesh_path_assign_nexthop(mpath, sta); mpath->metric = last_hop_metric; mpath->exp_time = time_after(mpath->exp_time, exp_time) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 159125e16c79..e5f9c9ceb6c9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -5830,7 +5830,10 @@ static int nl80211_send_mpath(struct sk_buff *msg, u32 portid, u32 seq, pinfo->discovery_retries)) || ((pinfo->filled & MPATH_INFO_HOP_COUNT) && nla_put_u8(msg, NL80211_MPATH_INFO_HOP_COUNT, - pinfo->hop_count))) + pinfo->hop_count)) || + ((pinfo->filled & MPATH_INFO_PATH_CHANGE) && + nla_put_u32(msg, NL80211_MPATH_INFO_PATH_CHANGE, + pinfo->path_change_count))) goto nla_put_failure; nla_nest_end(msg, pinfoattr); -- cgit v1.2.3 From 5954894ba3723995fbeab77bef62bb4878a654bb Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 17 Jan 2019 17:14:01 -0800 Subject: net_sched: add performance counters for basic filter Similar to u32 filter, it is useful to know how many times we reach each basic filter and how many times we pass the ematch attached to it. Sample output: filter protocol arp pref 49152 basic chain 0 filter protocol arp pref 49152 basic chain 0 handle 0x1 (rule hit 3 success 3) action order 1: gact action pass random type none pass val 0 index 1 ref 1 bind 1 installed 81 sec used 4 sec Action statistics: Sent 126 bytes 3 pkt (dropped 0, overlimits 0 requeues 0) backlog 0b 0p requeues 0 Cc: Jamal Hadi Salim Cc: Jiri Pirko Signed-off-by: Cong Wang Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 7 +++++++ net/sched/cls_basic.c | 25 +++++++++++++++++++++++++ 2 files changed, 32 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 32a3416b51c3..02ac251be8c4 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -333,12 +333,19 @@ enum { /* Basic filter */ +struct tc_basic_pcnt { + __u64 rcnt; + __u64 rhit; +}; + enum { TCA_BASIC_UNSPEC, TCA_BASIC_CLASSID, TCA_BASIC_EMATCHES, TCA_BASIC_ACT, TCA_BASIC_POLICE, + TCA_BASIC_PCNT, + TCA_BASIC_PAD, __TCA_BASIC_MAX }; diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 6a5dce8baf19..4a57fec6f306 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -35,6 +36,7 @@ struct basic_filter { struct tcf_result res; struct tcf_proto *tp; struct list_head link; + struct tc_basic_pcnt __percpu *pf; struct rcu_work rwork; }; @@ -46,8 +48,10 @@ static int basic_classify(struct sk_buff *skb, const struct tcf_proto *tp, struct basic_filter *f; list_for_each_entry_rcu(f, &head->flist, link) { + __this_cpu_inc(f->pf->rcnt); if (!tcf_em_tree_match(skb, &f->ematches, NULL)) continue; + __this_cpu_inc(f->pf->rhit); *res = f->res; r = tcf_exts_exec(skb, &f->exts, res); if (r < 0) @@ -89,6 +93,7 @@ static void __basic_delete_filter(struct basic_filter *f) tcf_exts_destroy(&f->exts); tcf_em_tree_destroy(&f->ematches); tcf_exts_put_net(&f->exts); + free_percpu(f->pf); kfree(f); } @@ -208,6 +213,11 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, if (err) goto errout; fnew->handle = handle; + fnew->pf = alloc_percpu(struct tc_basic_pcnt); + if (!fnew->pf) { + err = -ENOMEM; + goto errout; + } err = basic_set_parms(net, tp, fnew, base, tb, tca[TCA_RATE], ovr, extack); @@ -231,6 +241,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, return 0; errout: + free_percpu(fnew->pf); tcf_exts_destroy(&fnew->exts); kfree(fnew); return err; @@ -265,8 +276,10 @@ static void basic_bind_class(void *fh, u32 classid, unsigned long cl) static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, struct sk_buff *skb, struct tcmsg *t) { + struct tc_basic_pcnt gpf = {}; struct basic_filter *f = fh; struct nlattr *nest; + int cpu; if (f == NULL) return skb->len; @@ -281,6 +294,18 @@ static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, nla_put_u32(skb, TCA_BASIC_CLASSID, f->res.classid)) goto nla_put_failure; + for_each_possible_cpu(cpu) { + struct tc_basic_pcnt *pf = per_cpu_ptr(f->pf, cpu); + + gpf.rcnt += pf->rcnt; + gpf.rhit += pf->rhit; + } + + if (nla_put_64bit(skb, TCA_BASIC_PCNT, + sizeof(struct tc_basic_pcnt), + &gpf, TCA_BASIC_PAD)) + goto nla_put_failure; + if (tcf_exts_dump(skb, &f->exts) < 0 || tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) goto nla_put_failure; -- cgit v1.2.3 From ad07c8ceb6631a83b62d405a61448bba92adac68 Mon Sep 17 00:00:00 2001 From: Andrew Murray Date: Thu, 10 Jan 2019 13:53:34 +0000 Subject: perf/core: Remove unused perf_flags Now that perf_flags is not used we remove it. Signed-off-by: Andrew Murray Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Ivan Kokshaysky Cc: Linus Torvalds Cc: Mark Rutland Cc: Matt Turner Cc: Michael Ellerman Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Richard Henderson Cc: Russell King Cc: Sascha Hauer Cc: Shawn Guo Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linuxppc-dev@lists.ozlabs.org Cc: robin.murphy@arm.com Cc: suzuki.poulose@arm.com Link: https://lkml.kernel.org/r/1547128414-50693-13-git-send-email-andrew.murray@arm.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 2 -- tools/include/uapi/linux/perf_event.h | 2 -- 2 files changed, 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 9de8780ac8d9..ea19b5d491bf 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -445,8 +445,6 @@ struct perf_event_query_bpf { __u32 ids[0]; }; -#define perf_flags(attr) (*(&(attr)->read_format + 1)) - /* * Ioctls that can be done on a perf event fd: */ diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index 9de8780ac8d9..ea19b5d491bf 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -445,8 +445,6 @@ struct perf_event_query_bpf { __u32 ids[0]; }; -#define perf_flags(attr) (*(&(attr)->read_format + 1)) - /* * Ioctls that can be done on a perf event fd: */ -- cgit v1.2.3 From 76193a94522f1d4edf2447a536f3f796ce56343b Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 17 Jan 2019 08:15:13 -0800 Subject: perf, bpf: Introduce PERF_RECORD_KSYMBOL For better performance analysis of dynamically JITed and loaded kernel functions, such as BPF programs, this patch introduces PERF_RECORD_KSYMBOL, a new perf_event_type that exposes kernel symbol register/unregister information to user space. The following data structure is used for PERF_RECORD_KSYMBOL. /* * struct { * struct perf_event_header header; * u64 addr; * u32 len; * u16 ksym_type; * u16 flags; * char name[]; * struct sample_id sample_id; * }; */ Signed-off-by: Song Liu Reviewed-by: Arnaldo Carvalho de Melo Tested-by: Arnaldo Carvalho de Melo Acked-by: Peter Zijlstra Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/20190117161521.1341602-2-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 8 ++++ include/uapi/linux/perf_event.h | 26 ++++++++++- kernel/events/core.c | 98 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 130 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4eb88065a9b5..136fe0495374 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1122,6 +1122,10 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, } extern void perf_event_mmap(struct vm_area_struct *vma); + +extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, + bool unregister, const char *sym); + extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); @@ -1342,6 +1346,10 @@ static inline int perf_unregister_guest_info_callbacks (struct perf_guest_info_callbacks *callbacks) { return 0; } static inline void perf_event_mmap(struct vm_area_struct *vma) { } + +typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); +static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, + bool unregister, const char *sym) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index ea19b5d491bf..1dee5c8f166b 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -372,7 +372,8 @@ struct perf_event_attr { context_switch : 1, /* context switch data */ write_backward : 1, /* Write ring buffer from end to beginning */ namespaces : 1, /* include namespaces data */ - __reserved_1 : 35; + ksymbol : 1, /* include ksymbol events */ + __reserved_1 : 34; union { __u32 wakeup_events; /* wakeup every n events */ @@ -963,9 +964,32 @@ enum perf_event_type { */ PERF_RECORD_NAMESPACES = 16, + /* + * Record ksymbol register/unregister events: + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u32 len; + * u16 ksym_type; + * u16 flags; + * char name[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_KSYMBOL = 17, + PERF_RECORD_MAX, /* non-ABI */ }; +enum perf_record_ksymbol_type { + PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, + PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ +}; + +#define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) + #define PERF_MAX_STACK_DEPTH 127 #define PERF_MAX_CONTEXTS_PER_STACK 8 diff --git a/kernel/events/core.c b/kernel/events/core.c index bc525cd1615c..e04ab5f325cf 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -385,6 +385,7 @@ static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; +static atomic_t nr_ksymbol_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4235,7 +4236,7 @@ static bool is_sb_event(struct perf_event *event) if (attr->mmap || attr->mmap_data || attr->mmap2 || attr->comm || attr->comm_exec || - attr->task || + attr->task || attr->ksymbol || attr->context_switch) return true; return false; @@ -4305,6 +4306,8 @@ static void unaccount_event(struct perf_event *event) dec = true; if (has_branch_stack(event)) dec = true; + if (event->attr.ksymbol) + atomic_dec(&nr_ksymbol_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -7653,6 +7656,97 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +/* + * ksymbol register/unregister tracking + */ + +struct perf_ksymbol_event { + const char *name; + int name_len; + struct { + struct perf_event_header header; + u64 addr; + u32 len; + u16 ksym_type; + u16 flags; + } event_id; +}; + +static int perf_event_ksymbol_match(struct perf_event *event) +{ + return event->attr.ksymbol; +} + +static void perf_event_ksymbol_output(struct perf_event *event, void *data) +{ + struct perf_ksymbol_event *ksymbol_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_ksymbol_match(event)) + return; + + perf_event_header__init_id(&ksymbol_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + ksymbol_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, ksymbol_event->event_id); + __output_copy(&handle, ksymbol_event->name, ksymbol_event->name_len); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, + const char *sym) +{ + struct perf_ksymbol_event ksymbol_event; + char name[KSYM_NAME_LEN]; + u16 flags = 0; + int name_len; + + if (!atomic_read(&nr_ksymbol_events)) + return; + + if (ksym_type >= PERF_RECORD_KSYMBOL_TYPE_MAX || + ksym_type == PERF_RECORD_KSYMBOL_TYPE_UNKNOWN) + goto err; + + strlcpy(name, sym, KSYM_NAME_LEN); + name_len = strlen(name) + 1; + while (!IS_ALIGNED(name_len, sizeof(u64))) + name[name_len++] = '\0'; + BUILD_BUG_ON(KSYM_NAME_LEN % sizeof(u64)); + + if (unregister) + flags |= PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER; + + ksymbol_event = (struct perf_ksymbol_event){ + .name = name, + .name_len = name_len, + .event_id = { + .header = { + .type = PERF_RECORD_KSYMBOL, + .size = sizeof(ksymbol_event.event_id) + + name_len, + }, + .addr = addr, + .len = len, + .ksym_type = ksym_type, + .flags = flags, + }, + }; + + perf_iterate_sb(perf_event_ksymbol_output, &ksymbol_event, NULL); + return; +err: + WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -9912,6 +10006,8 @@ static void account_event(struct perf_event *event) inc = true; if (is_cgroup_event(event)) inc = true; + if (event->attr.ksymbol) + atomic_inc(&nr_ksymbol_events); if (inc) { /* -- cgit v1.2.3 From 6ee52e2a3fe4ea35520720736e6791df1fb67106 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 17 Jan 2019 08:15:15 -0800 Subject: perf, bpf: Introduce PERF_RECORD_BPF_EVENT For better performance analysis of BPF programs, this patch introduces PERF_RECORD_BPF_EVENT, a new perf_event_type that exposes BPF program load/unload information to user space. Each BPF program may contain up to BPF_MAX_SUBPROGS (256) sub programs. The following example shows kernel symbols for a BPF program with 7 sub programs: ffffffffa0257cf9 t bpf_prog_b07ccb89267cf242_F ffffffffa02592e1 t bpf_prog_2dcecc18072623fc_F ffffffffa025b0e9 t bpf_prog_bb7a405ebaec5d5c_F ffffffffa025dd2c t bpf_prog_a7540d4a39ec1fc7_F ffffffffa025fcca t bpf_prog_05762d4ade0e3737_F ffffffffa026108f t bpf_prog_db4bd11e35df90d4_F ffffffffa0263f00 t bpf_prog_89d64e4abf0f0126_F ffffffffa0257cf9 t bpf_prog_ae31629322c4b018__dummy_tracepoi When a bpf program is loaded, PERF_RECORD_KSYMBOL is generated for each of these sub programs. Therefore, PERF_RECORD_BPF_EVENT is not needed for simple profiling. For annotation, user space need to listen to PERF_RECORD_BPF_EVENT and gather more information about these (sub) programs via sys_bpf. Signed-off-by: Song Liu Reviewed-by: Arnaldo Carvalho de Melo Acked-by: Alexei Starovoitov Acked-by: Peter Zijlstra (Intel) Tested-by: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: Peter Zijlstra Cc: kernel-team@fb.com Cc: netdev@vger.kernel.org Link: http://lkml.kernel.org/r/20190117161521.1341602-4-songliubraving@fb.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/filter.h | 7 +++ include/linux/perf_event.h | 6 +++ include/uapi/linux/perf_event.h | 29 +++++++++- kernel/bpf/core.c | 2 +- kernel/bpf/syscall.c | 2 + kernel/events/core.c | 115 ++++++++++++++++++++++++++++++++++++++++ 6 files changed, 159 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/filter.h b/include/linux/filter.h index ad106d845b22..d531d4250bff 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -951,6 +951,7 @@ bpf_address_lookup(unsigned long addr, unsigned long *size, void bpf_prog_kallsyms_add(struct bpf_prog *fp); void bpf_prog_kallsyms_del(struct bpf_prog *fp); +void bpf_get_prog_name(const struct bpf_prog *prog, char *sym); #else /* CONFIG_BPF_JIT */ @@ -1006,6 +1007,12 @@ static inline void bpf_prog_kallsyms_add(struct bpf_prog *fp) static inline void bpf_prog_kallsyms_del(struct bpf_prog *fp) { } + +static inline void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +{ + sym[0] = '\0'; +} + #endif /* CONFIG_BPF_JIT */ void bpf_prog_kallsyms_del_subprogs(struct bpf_prog *fp); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 136fe0495374..a79e59fc3b7d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1125,6 +1125,9 @@ extern void perf_event_mmap(struct vm_area_struct *vma); extern void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym); +extern void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); @@ -1350,6 +1353,9 @@ static inline void perf_event_mmap(struct vm_area_struct *vma) { } typedef int (perf_ksymbol_get_name_f)(char *name, int name_len, void *data); static inline void perf_event_ksymbol(u16 ksym_type, u64 addr, u32 len, bool unregister, const char *sym) { } +static inline void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } static inline void perf_event_namespaces(struct task_struct *tsk) { } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1dee5c8f166b..7198ddd0c6b1 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -373,7 +373,8 @@ struct perf_event_attr { write_backward : 1, /* Write ring buffer from end to beginning */ namespaces : 1, /* include namespaces data */ ksymbol : 1, /* include ksymbol events */ - __reserved_1 : 34; + bpf_event : 1, /* include bpf events */ + __reserved_1 : 33; union { __u32 wakeup_events; /* wakeup every n events */ @@ -979,6 +980,25 @@ enum perf_event_type { */ PERF_RECORD_KSYMBOL = 17, + /* + * Record bpf events: + * enum perf_bpf_event_type { + * PERF_BPF_EVENT_UNKNOWN = 0, + * PERF_BPF_EVENT_PROG_LOAD = 1, + * PERF_BPF_EVENT_PROG_UNLOAD = 2, + * }; + * + * struct { + * struct perf_event_header header; + * u16 type; + * u16 flags; + * u32 id; + * u8 tag[BPF_TAG_SIZE]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_BPF_EVENT = 18, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -990,6 +1010,13 @@ enum perf_record_ksymbol_type { #define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) +enum perf_bpf_event_type { + PERF_BPF_EVENT_UNKNOWN = 0, + PERF_BPF_EVENT_PROG_LOAD = 1, + PERF_BPF_EVENT_PROG_UNLOAD = 2, + PERF_BPF_EVENT_MAX, /* non-ABI */ +}; + #define PERF_MAX_STACK_DEPTH 127 #define PERF_MAX_CONTEXTS_PER_STACK 8 diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f908b9356025..19c49313c709 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -495,7 +495,7 @@ bpf_get_prog_addr_region(const struct bpf_prog *prog, *symbol_end = addr + hdr->pages * PAGE_SIZE; } -static void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) +void bpf_get_prog_name(const struct bpf_prog *prog, char *sym) { const char *end = sym + KSYM_NAME_LEN; const struct btf_type *type; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b155cd17c1bd..30ebd085790b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1211,6 +1211,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); bpf_prog_kallsyms_del_all(prog); @@ -1554,6 +1555,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) } bpf_prog_kallsyms_add(prog); + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_LOAD, 0); return err; free_used_maps: diff --git a/kernel/events/core.c b/kernel/events/core.c index e04ab5f325cf..236bb8ddb7bc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -386,6 +386,7 @@ static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; static atomic_t nr_ksymbol_events __read_mostly; +static atomic_t nr_bpf_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4308,6 +4309,8 @@ static void unaccount_event(struct perf_event *event) dec = true; if (event->attr.ksymbol) atomic_dec(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_dec(&nr_bpf_events); if (dec) { if (!atomic_add_unless(&perf_sched_count, -1, 1)) @@ -7747,6 +7750,116 @@ err: WARN_ONCE(1, "%s: Invalid KSYMBOL type 0x%x\n", __func__, ksym_type); } +/* + * bpf program load/unload tracking + */ + +struct perf_bpf_event { + struct bpf_prog *prog; + struct { + struct perf_event_header header; + u16 type; + u16 flags; + u32 id; + u8 tag[BPF_TAG_SIZE]; + } event_id; +}; + +static int perf_event_bpf_match(struct perf_event *event) +{ + return event->attr.bpf_event; +} + +static void perf_event_bpf_output(struct perf_event *event, void *data) +{ + struct perf_bpf_event *bpf_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_bpf_match(event)) + return; + + perf_event_header__init_id(&bpf_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + bpf_event->event_id.header.size); + if (ret) + return; + + perf_output_put(&handle, bpf_event->event_id); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_event_bpf_emit_ksymbols(struct bpf_prog *prog, + enum perf_bpf_event_type type) +{ + bool unregister = type == PERF_BPF_EVENT_PROG_UNLOAD; + char sym[KSYM_NAME_LEN]; + int i; + + if (prog->aux->func_cnt == 0) { + bpf_get_prog_name(prog, sym); + perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)prog->bpf_func, + prog->jited_len, unregister, sym); + } else { + for (i = 0; i < prog->aux->func_cnt; i++) { + struct bpf_prog *subprog = prog->aux->func[i]; + + bpf_get_prog_name(subprog, sym); + perf_event_ksymbol( + PERF_RECORD_KSYMBOL_TYPE_BPF, + (u64)(unsigned long)subprog->bpf_func, + subprog->jited_len, unregister, sym); + } + } +} + +void perf_event_bpf_event(struct bpf_prog *prog, + enum perf_bpf_event_type type, + u16 flags) +{ + struct perf_bpf_event bpf_event; + + if (type <= PERF_BPF_EVENT_UNKNOWN || + type >= PERF_BPF_EVENT_MAX) + return; + + switch (type) { + case PERF_BPF_EVENT_PROG_LOAD: + case PERF_BPF_EVENT_PROG_UNLOAD: + if (atomic_read(&nr_ksymbol_events)) + perf_event_bpf_emit_ksymbols(prog, type); + break; + default: + break; + } + + if (!atomic_read(&nr_bpf_events)) + return; + + bpf_event = (struct perf_bpf_event){ + .prog = prog, + .event_id = { + .header = { + .type = PERF_RECORD_BPF_EVENT, + .size = sizeof(bpf_event.event_id), + }, + .type = type, + .flags = flags, + .id = prog->aux->id, + }, + }; + + BUILD_BUG_ON(BPF_TAG_SIZE % sizeof(u64)); + + memcpy(bpf_event.event_id.tag, prog->tag, BPF_TAG_SIZE); + perf_iterate_sb(perf_event_bpf_output, &bpf_event, NULL); +} + void perf_event_itrace_started(struct perf_event *event) { event->attach_state |= PERF_ATTACH_ITRACE; @@ -10008,6 +10121,8 @@ static void account_event(struct perf_event *event) inc = true; if (event->attr.ksymbol) atomic_inc(&nr_ksymbol_events); + if (event->attr.bpf_event) + atomic_inc(&nr_bpf_events); if (inc) { /* -- cgit v1.2.3 From aefcb7460e0b5f35f72601b7a98eec5ca1639cf2 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Tue, 15 Jan 2019 15:18:56 +1100 Subject: m68k/mac: Fix PRAM accessors PMU-based m68k Macs pre-date PowerMac-style NVRAM. Use the appropriate PMU commands. Also implement the missing XPRAM accessors for VIA-based Macs. Acked-by: Geert Uytterhoeven Tested-by: Stan Johnson Signed-off-by: Finn Thain Signed-off-by: Greg Kroah-Hartman --- arch/m68k/mac/misc.c | 43 +++++++++++++++++++++++++++++++++---------- include/uapi/linux/pmu.h | 2 ++ 2 files changed, 35 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index af000a015f68..d016ca2e0d10 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -66,23 +66,22 @@ static unsigned char pmu_pram_read_byte(int offset) { struct adb_request req; - if (pmu_request(&req, NULL, 3, PMU_READ_NVRAM, - (offset >> 8) & 0xFF, offset & 0xFF) < 0) + if (pmu_request(&req, NULL, 3, PMU_READ_XPRAM, + offset & 0xFF, 1) < 0) return 0; - while (!req.complete) - pmu_poll(); - return req.reply[3]; + pmu_wait_complete(&req); + + return req.reply[0]; } static void pmu_pram_write_byte(unsigned char data, int offset) { struct adb_request req; - if (pmu_request(&req, NULL, 4, PMU_WRITE_NVRAM, - (offset >> 8) & 0xFF, offset & 0xFF, data) < 0) + if (pmu_request(&req, NULL, 4, PMU_WRITE_XPRAM, + offset & 0xFF, 1, data) < 0) return; - while (!req.complete) - pmu_poll(); + pmu_wait_complete(&req); } #endif /* CONFIG_ADB_PMU */ @@ -151,6 +150,16 @@ static void via_rtc_send(__u8 data) #define RTC_REG_SECONDS_3 3 #define RTC_REG_WRITE_PROTECT 13 +/* + * Inside Mac has no information about two-byte RTC commands but + * the MAME/MESS source code has the essentials. + */ + +#define RTC_REG_XPRAM 14 +#define RTC_CMD_XPRAM_READ (RTC_CMD_READ(RTC_REG_XPRAM) << 8) +#define RTC_CMD_XPRAM_WRITE (RTC_CMD_WRITE(RTC_REG_XPRAM) << 8) +#define RTC_CMD_XPRAM_ARG(a) (((a & 0xE0) << 3) | ((a & 0x1F) << 2)) + /* * Execute a VIA PRAM/RTC command. For read commands * data should point to a one-byte buffer for the @@ -198,11 +207,25 @@ static void via_rtc_command(int command, __u8 *data) static unsigned char via_pram_read_byte(int offset) { - return 0; + unsigned char temp; + + via_rtc_command(RTC_CMD_XPRAM_READ | RTC_CMD_XPRAM_ARG(offset), &temp); + + return temp; } static void via_pram_write_byte(unsigned char data, int offset) { + unsigned char temp; + + temp = 0x55; + via_rtc_command(RTC_CMD_WRITE(RTC_REG_WRITE_PROTECT), &temp); + + temp = data; + via_rtc_command(RTC_CMD_XPRAM_WRITE | RTC_CMD_XPRAM_ARG(offset), &temp); + + temp = 0x55 | RTC_FLG_WRITE_PROTECT; + via_rtc_command(RTC_CMD_WRITE(RTC_REG_WRITE_PROTECT), &temp); } /* diff --git a/include/uapi/linux/pmu.h b/include/uapi/linux/pmu.h index 97256f90e6df..f2fc1bd80017 100644 --- a/include/uapi/linux/pmu.h +++ b/include/uapi/linux/pmu.h @@ -19,7 +19,9 @@ #define PMU_POWER_CTRL 0x11 /* control power of some devices */ #define PMU_ADB_CMD 0x20 /* send ADB packet */ #define PMU_ADB_POLL_OFF 0x21 /* disable ADB auto-poll */ +#define PMU_WRITE_XPRAM 0x32 /* write eXtended Parameter RAM */ #define PMU_WRITE_NVRAM 0x33 /* write non-volatile RAM */ +#define PMU_READ_XPRAM 0x3a /* read eXtended Parameter RAM */ #define PMU_READ_NVRAM 0x3b /* read non-volatile RAM */ #define PMU_SET_RTC 0x30 /* set real-time clock */ #define PMU_READ_RTC 0x38 /* read real-time clock */ -- cgit v1.2.3 From ec74136ded792deed80780a2f8baf3521eeb72f9 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 14 Jan 2019 09:10:21 -0800 Subject: binder: create node flag to request sender's security context To allow servers to verify client identity, allow a node flag to be set that causes the sender's security context to be delivered with the transaction. The BR_TRANSACTION command is extended in BR_TRANSACTION_SEC_CTX to contain a pointer to the security context string. Signed-off-by: Todd Kjos Reviewed-by: Joel Fernandes (Google) Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 106 ++++++++++++++++++++++++++++-------- include/uapi/linux/android/binder.h | 19 +++++++ 2 files changed, 102 insertions(+), 23 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/android/binder.c b/drivers/android/binder.c index cdfc87629efb..5f6ef5e63b91 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -329,6 +329,8 @@ struct binder_error { * (invariant after initialized) * @min_priority: minimum scheduling priority * (invariant after initialized) + * @txn_security_ctx: require sender's security context + * (invariant after initialized) * @async_todo: list of async work items * (protected by @proc->inner_lock) * @@ -365,6 +367,7 @@ struct binder_node { * invariant after initialization */ u8 accept_fds:1; + u8 txn_security_ctx:1; u8 min_priority; }; bool has_async_transaction; @@ -615,6 +618,7 @@ struct binder_transaction { long saved_priority; kuid_t sender_euid; struct list_head fd_fixups; + binder_uintptr_t security_ctx; /** * @lock: protects @from, @to_proc, and @to_thread * @@ -1152,6 +1156,7 @@ static struct binder_node *binder_init_node_ilocked( node->work.type = BINDER_WORK_NODE; node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); + node->txn_security_ctx = !!(flags & FLAT_BINDER_FLAG_TXN_SECURITY_CTX); spin_lock_init(&node->lock); INIT_LIST_HEAD(&node->work.entry); INIT_LIST_HEAD(&node->async_todo); @@ -2778,6 +2783,8 @@ static void binder_transaction(struct binder_proc *proc, binder_size_t last_fixup_min_off = 0; struct binder_context *context = proc->context; int t_debug_id = atomic_inc_return(&binder_last_id); + char *secctx = NULL; + u32 secctx_sz = 0; e = binder_transaction_log_add(&binder_transaction_log); e->debug_id = t_debug_id; @@ -3020,6 +3027,20 @@ static void binder_transaction(struct binder_proc *proc, t->flags = tr->flags; t->priority = task_nice(current); + if (target_node && target_node->txn_security_ctx) { + u32 secid; + + security_task_getsecid(proc->tsk, &secid); + ret = security_secid_to_secctx(secid, &secctx, &secctx_sz); + if (ret) { + return_error = BR_FAILED_REPLY; + return_error_param = ret; + return_error_line = __LINE__; + goto err_get_secctx_failed; + } + extra_buffers_size += ALIGN(secctx_sz, sizeof(u64)); + } + trace_binder_transaction(reply, t, target_node); t->buffer = binder_alloc_new_buf(&target_proc->alloc, tr->data_size, @@ -3036,6 +3057,19 @@ static void binder_transaction(struct binder_proc *proc, t->buffer = NULL; goto err_binder_alloc_buf_failed; } + if (secctx) { + size_t buf_offset = ALIGN(tr->data_size, sizeof(void *)) + + ALIGN(tr->offsets_size, sizeof(void *)) + + ALIGN(extra_buffers_size, sizeof(void *)) - + ALIGN(secctx_sz, sizeof(u64)); + char *kptr = t->buffer->data + buf_offset; + + t->security_ctx = (uintptr_t)kptr + + binder_alloc_get_user_buffer_offset(&target_proc->alloc); + memcpy(kptr, secctx, secctx_sz); + security_release_secctx(secctx, secctx_sz); + secctx = NULL; + } t->buffer->debug_id = t->debug_id; t->buffer->transaction = t; t->buffer->target_node = target_node; @@ -3305,6 +3339,9 @@ err_copy_data_failed: t->buffer->transaction = NULL; binder_alloc_free_buf(&target_proc->alloc, t->buffer); err_binder_alloc_buf_failed: + if (secctx) + security_release_secctx(secctx, secctx_sz); +err_get_secctx_failed: kfree(tcomplete); binder_stats_deleted(BINDER_STAT_TRANSACTION_COMPLETE); err_alloc_tcomplete_failed: @@ -4036,11 +4073,13 @@ retry: while (1) { uint32_t cmd; - struct binder_transaction_data tr; + struct binder_transaction_data_secctx tr; + struct binder_transaction_data *trd = &tr.transaction_data; struct binder_work *w = NULL; struct list_head *list = NULL; struct binder_transaction *t = NULL; struct binder_thread *t_from; + size_t trsize = sizeof(*trd); binder_inner_proc_lock(proc); if (!binder_worklist_empty_ilocked(&thread->todo)) @@ -4240,8 +4279,8 @@ retry: if (t->buffer->target_node) { struct binder_node *target_node = t->buffer->target_node; - tr.target.ptr = target_node->ptr; - tr.cookie = target_node->cookie; + trd->target.ptr = target_node->ptr; + trd->cookie = target_node->cookie; t->saved_priority = task_nice(current); if (t->priority < target_node->min_priority && !(t->flags & TF_ONE_WAY)) @@ -4251,22 +4290,23 @@ retry: binder_set_nice(target_node->min_priority); cmd = BR_TRANSACTION; } else { - tr.target.ptr = 0; - tr.cookie = 0; + trd->target.ptr = 0; + trd->cookie = 0; cmd = BR_REPLY; } - tr.code = t->code; - tr.flags = t->flags; - tr.sender_euid = from_kuid(current_user_ns(), t->sender_euid); + trd->code = t->code; + trd->flags = t->flags; + trd->sender_euid = from_kuid(current_user_ns(), t->sender_euid); t_from = binder_get_txn_from(t); if (t_from) { struct task_struct *sender = t_from->proc->tsk; - tr.sender_pid = task_tgid_nr_ns(sender, - task_active_pid_ns(current)); + trd->sender_pid = + task_tgid_nr_ns(sender, + task_active_pid_ns(current)); } else { - tr.sender_pid = 0; + trd->sender_pid = 0; } ret = binder_apply_fd_fixups(t); @@ -4297,15 +4337,20 @@ retry: } continue; } - tr.data_size = t->buffer->data_size; - tr.offsets_size = t->buffer->offsets_size; - tr.data.ptr.buffer = (binder_uintptr_t) + trd->data_size = t->buffer->data_size; + trd->offsets_size = t->buffer->offsets_size; + trd->data.ptr.buffer = (binder_uintptr_t) ((uintptr_t)t->buffer->data + binder_alloc_get_user_buffer_offset(&proc->alloc)); - tr.data.ptr.offsets = tr.data.ptr.buffer + + trd->data.ptr.offsets = trd->data.ptr.buffer + ALIGN(t->buffer->data_size, sizeof(void *)); + tr.secctx = t->security_ctx; + if (t->security_ctx) { + cmd = BR_TRANSACTION_SEC_CTX; + trsize = sizeof(tr); + } if (put_user(cmd, (uint32_t __user *)ptr)) { if (t_from) binder_thread_dec_tmpref(t_from); @@ -4316,7 +4361,7 @@ retry: return -EFAULT; } ptr += sizeof(uint32_t); - if (copy_to_user(ptr, &tr, sizeof(tr))) { + if (copy_to_user(ptr, &tr, trsize)) { if (t_from) binder_thread_dec_tmpref(t_from); @@ -4325,7 +4370,7 @@ retry: return -EFAULT; } - ptr += sizeof(tr); + ptr += trsize; trace_binder_transaction_received(t); binder_stat_br(proc, thread, cmd); @@ -4333,16 +4378,18 @@ retry: "%d:%d %s %d %d:%d, cmd %d size %zd-%zd ptr %016llx-%016llx\n", proc->pid, thread->pid, (cmd == BR_TRANSACTION) ? "BR_TRANSACTION" : - "BR_REPLY", + (cmd == BR_TRANSACTION_SEC_CTX) ? + "BR_TRANSACTION_SEC_CTX" : "BR_REPLY", t->debug_id, t_from ? t_from->proc->pid : 0, t_from ? t_from->pid : 0, cmd, t->buffer->data_size, t->buffer->offsets_size, - (u64)tr.data.ptr.buffer, (u64)tr.data.ptr.offsets); + (u64)trd->data.ptr.buffer, + (u64)trd->data.ptr.offsets); if (t_from) binder_thread_dec_tmpref(t_from); t->buffer->allow_user_free = 1; - if (cmd == BR_TRANSACTION && !(t->flags & TF_ONE_WAY)) { + if (cmd != BR_REPLY && !(t->flags & TF_ONE_WAY)) { binder_inner_proc_lock(thread->proc); t->to_parent = thread->transaction_stack; t->to_thread = thread; @@ -4690,7 +4737,8 @@ out: return ret; } -static int binder_ioctl_set_ctx_mgr(struct file *filp) +static int binder_ioctl_set_ctx_mgr(struct file *filp, + struct flat_binder_object *fbo) { int ret = 0; struct binder_proc *proc = filp->private_data; @@ -4719,7 +4767,7 @@ static int binder_ioctl_set_ctx_mgr(struct file *filp) } else { context->binder_context_mgr_uid = curr_euid; } - new_node = binder_new_node(proc, NULL); + new_node = binder_new_node(proc, fbo); if (!new_node) { ret = -ENOMEM; goto out; @@ -4842,8 +4890,20 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) binder_inner_proc_unlock(proc); break; } + case BINDER_SET_CONTEXT_MGR_EXT: { + struct flat_binder_object fbo; + + if (copy_from_user(&fbo, ubuf, sizeof(fbo))) { + ret = -EINVAL; + goto err; + } + ret = binder_ioctl_set_ctx_mgr(filp, &fbo); + if (ret) + goto err; + break; + } case BINDER_SET_CONTEXT_MGR: - ret = binder_ioctl_set_ctx_mgr(filp); + ret = binder_ioctl_set_ctx_mgr(filp, NULL); if (ret) goto err; break; diff --git a/include/uapi/linux/android/binder.h b/include/uapi/linux/android/binder.h index b9ba520f7e4b..2832134e5397 100644 --- a/include/uapi/linux/android/binder.h +++ b/include/uapi/linux/android/binder.h @@ -41,6 +41,14 @@ enum { enum { FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff, FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100, + + /** + * @FLAT_BINDER_FLAG_TXN_SECURITY_CTX: request security contexts + * + * Only when set, causes senders to include their security + * context + */ + FLAT_BINDER_FLAG_TXN_SECURITY_CTX = 0x1000, }; #ifdef BINDER_IPC_32BIT @@ -218,6 +226,7 @@ struct binder_node_info_for_ref { #define BINDER_VERSION _IOWR('b', 9, struct binder_version) #define BINDER_GET_NODE_DEBUG_INFO _IOWR('b', 11, struct binder_node_debug_info) #define BINDER_GET_NODE_INFO_FOR_REF _IOWR('b', 12, struct binder_node_info_for_ref) +#define BINDER_SET_CONTEXT_MGR_EXT _IOW('b', 13, struct flat_binder_object) /* * NOTE: Two special error codes you should check for when calling @@ -276,6 +285,11 @@ struct binder_transaction_data { } data; }; +struct binder_transaction_data_secctx { + struct binder_transaction_data transaction_data; + binder_uintptr_t secctx; +}; + struct binder_transaction_data_sg { struct binder_transaction_data transaction_data; binder_size_t buffers_size; @@ -311,6 +325,11 @@ enum binder_driver_return_protocol { BR_OK = _IO('r', 1), /* No parameters! */ + BR_TRANSACTION_SEC_CTX = _IOR('r', 2, + struct binder_transaction_data_secctx), + /* + * binder_transaction_data_secctx: the received command. + */ BR_TRANSACTION = _IOR('r', 2, struct binder_transaction_data), BR_REPLY = _IOR('r', 3, struct binder_transaction_data), /* -- cgit v1.2.3 From a258aeacd7f0dc10bb45caa7e92a3ea3ca1a76e9 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 18 Jan 2019 14:30:23 +0200 Subject: bonding: add support for xstats and export 3ad stats This patch adds support for extended statistics (xstats) call to the bonding. The first user would be the 3ad code which counts the following events: - LACPDU Rx/Tx - LACPDU unknown type Rx - LACPDU illegal Rx - Marker Rx/Tx - Marker response Rx/Tx - Marker unknown type Rx All of these are exported via netlink as separate attributes to be easily extensible as we plan to add more in the future. Similar to how the bridge and other xstats exports, the structure inside is: [ IFLA_STATS_LINK_XSTATS ] -> [ LINK_XSTATS_TYPE_BOND ] -> [ BOND_XSTATS_3AD ] -> [ 3ad stats attributes ] With this structure it's easy to add more stat types later. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- drivers/net/bonding/bond_3ad.c | 83 ++++++++++++++++++++++++++++++++++++++ drivers/net/bonding/bond_netlink.c | 71 ++++++++++++++++++++++++++++++++ include/net/bond_3ad.h | 3 ++ include/uapi/linux/if_bonding.h | 24 +++++++++++ include/uapi/linux/if_link.h | 1 + 5 files changed, 182 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index d1d8cb6b8cdc..d30c21b34858 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -31,6 +31,7 @@ #include #include #include +#include /* General definitions */ #define AD_SHORT_TIMEOUT 1 @@ -2696,3 +2697,85 @@ void bond_3ad_update_lacp_rate(struct bonding *bond) } spin_unlock_bh(&bond->mode_lock); } + +void bond_3ad_stats_add(struct slave *slave, struct bond_3ad_stats *stats) +{ + struct bond_3ad_stats *rstats = &SLAVE_AD_INFO(slave)->stats; + u64 stat; + + atomic64_add(atomic64_read(&rstats->lacpdu_rx), &stats->lacpdu_rx); + atomic64_add(atomic64_read(&rstats->lacpdu_tx), &stats->lacpdu_tx); + + stat = atomic64_read(&rstats->lacpdu_unknown_rx); + atomic64_add(stat, &stats->lacpdu_unknown_rx); + stat = atomic64_read(&rstats->lacpdu_illegal_rx); + atomic64_add(stat, &stats->lacpdu_illegal_rx); + + atomic64_add(atomic64_read(&rstats->marker_rx), &stats->marker_rx); + atomic64_add(atomic64_read(&rstats->marker_tx), &stats->marker_tx); + + stat = atomic64_read(&rstats->marker_resp_rx); + atomic64_add(stat, &stats->marker_resp_rx); + stat = atomic64_read(&rstats->marker_resp_tx); + atomic64_add(stat, &stats->marker_resp_tx); + stat = atomic64_read(&rstats->marker_unknown_rx); + atomic64_add(stat, &stats->marker_unknown_rx); +} + +size_t bond_3ad_stats_size(void) +{ + return nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_RX */ + nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_TX */ + nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_UNKNOWN_RX */ + nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_LACPDU_ILLEGAL_RX */ + nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RX */ + nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_TX */ + nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_RX */ + nla_total_size_64bit(sizeof(u64)) + /* BOND_3AD_STAT_MARKER_RESP_TX */ + nla_total_size_64bit(sizeof(u64)); /* BOND_3AD_STAT_MARKER_UNKNOWN_RX */ +} + +int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats) +{ + u64 val; + + val = atomic64_read(&stats->lacpdu_rx); + if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_RX, val, + BOND_3AD_STAT_PAD)) + return -EMSGSIZE; + val = atomic64_read(&stats->lacpdu_tx); + if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_TX, val, + BOND_3AD_STAT_PAD)) + return -EMSGSIZE; + val = atomic64_read(&stats->lacpdu_unknown_rx); + if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_UNKNOWN_RX, val, + BOND_3AD_STAT_PAD)) + return -EMSGSIZE; + val = atomic64_read(&stats->lacpdu_illegal_rx); + if (nla_put_u64_64bit(skb, BOND_3AD_STAT_LACPDU_ILLEGAL_RX, val, + BOND_3AD_STAT_PAD)) + return -EMSGSIZE; + + val = atomic64_read(&stats->marker_rx); + if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RX, val, + BOND_3AD_STAT_PAD)) + return -EMSGSIZE; + val = atomic64_read(&stats->marker_tx); + if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_TX, val, + BOND_3AD_STAT_PAD)) + return -EMSGSIZE; + val = atomic64_read(&stats->marker_resp_rx); + if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_RX, val, + BOND_3AD_STAT_PAD)) + return -EMSGSIZE; + val = atomic64_read(&stats->marker_resp_tx); + if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_RESP_TX, val, + BOND_3AD_STAT_PAD)) + return -EMSGSIZE; + val = atomic64_read(&stats->marker_unknown_rx); + if (nla_put_u64_64bit(skb, BOND_3AD_STAT_MARKER_UNKNOWN_RX, val, + BOND_3AD_STAT_PAD)) + return -EMSGSIZE; + + return 0; +} diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c index 6b9ad8673218..d1338fbe1830 100644 --- a/drivers/net/bonding/bond_netlink.c +++ b/drivers/net/bonding/bond_netlink.c @@ -675,6 +675,75 @@ nla_put_failure: return -EMSGSIZE; } +static size_t bond_get_linkxstats_size(const struct net_device *dev, int attr) +{ + switch (attr) { + case IFLA_STATS_LINK_XSTATS: + case IFLA_STATS_LINK_XSTATS_SLAVE: + break; + default: + return 0; + } + + return bond_3ad_stats_size() + nla_total_size(0); +} + +static int bond_fill_linkxstats(struct sk_buff *skb, + const struct net_device *dev, + int *prividx, int attr) +{ + struct nlattr *nla __maybe_unused; + struct slave *slave = NULL; + struct nlattr *nest, *nest2; + struct bonding *bond; + + switch (attr) { + case IFLA_STATS_LINK_XSTATS: + bond = netdev_priv(dev); + break; + case IFLA_STATS_LINK_XSTATS_SLAVE: + slave = bond_slave_get_rtnl(dev); + if (!slave) + return 0; + bond = slave->bond; + break; + default: + return -EINVAL; + } + + nest = nla_nest_start(skb, LINK_XSTATS_TYPE_BOND); + if (!nest) + return -EMSGSIZE; + if (BOND_MODE(bond) == BOND_MODE_8023AD) { + struct bond_3ad_stats stats; + struct list_head *iter; + + memset(&stats, 0, sizeof(stats)); + if (slave) { + bond_3ad_stats_add(slave, &stats); + } else { + bond_for_each_slave(bond, slave, iter) + bond_3ad_stats_add(slave, &stats); + } + + nest2 = nla_nest_start(skb, BOND_XSTATS_3AD); + if (!nest2) { + nla_nest_end(skb, nest); + return -EMSGSIZE; + } + + if (bond_3ad_stats_fill(skb, &stats)) { + nla_nest_cancel(skb, nest2); + nla_nest_end(skb, nest); + return -EMSGSIZE; + } + nla_nest_end(skb, nest2); + } + nla_nest_end(skb, nest); + + return 0; +} + struct rtnl_link_ops bond_link_ops __read_mostly = { .kind = "bond", .priv_size = sizeof(struct bonding), @@ -689,6 +758,8 @@ struct rtnl_link_ops bond_link_ops __read_mostly = { .get_num_tx_queues = bond_get_num_tx_queues, .get_num_rx_queues = bond_get_num_tx_queues, /* Use the same number as for TX queues */ + .fill_linkxstats = bond_fill_linkxstats, + .get_linkxstats_size = bond_get_linkxstats_size, .slave_maxtype = IFLA_BOND_SLAVE_MAX, .slave_policy = bond_slave_policy, .slave_changelink = bond_slave_changelink, diff --git a/include/net/bond_3ad.h b/include/net/bond_3ad.h index 30e60dba1b2d..25aaf49d19be 100644 --- a/include/net/bond_3ad.h +++ b/include/net/bond_3ad.h @@ -321,5 +321,8 @@ int bond_3ad_lacpdu_recv(const struct sk_buff *skb, struct bonding *bond, int bond_3ad_set_carrier(struct bonding *bond); void bond_3ad_update_lacp_rate(struct bonding *bond); void bond_3ad_update_ad_actor_settings(struct bonding *bond); +void bond_3ad_stats_add(struct slave *slave, struct bond_3ad_stats *stats); +int bond_3ad_stats_fill(struct sk_buff *skb, struct bond_3ad_stats *stats); +size_t bond_3ad_stats_size(void); #endif /* _NET_BOND_3AD_H */ diff --git a/include/uapi/linux/if_bonding.h b/include/uapi/linux/if_bonding.h index 61a1bf6e865e..790585f0e61b 100644 --- a/include/uapi/linux/if_bonding.h +++ b/include/uapi/linux/if_bonding.h @@ -117,6 +117,30 @@ struct ad_info { __u8 partner_system[ETH_ALEN]; }; +/* Embedded inside LINK_XSTATS_TYPE_BOND */ +enum { + BOND_XSTATS_UNSPEC, + BOND_XSTATS_3AD, + __BOND_XSTATS_MAX +}; +#define BOND_XSTATS_MAX (__BOND_XSTATS_MAX - 1) + +/* Embedded inside BOND_XSTATS_3AD */ +enum { + BOND_3AD_STAT_LACPDU_RX, + BOND_3AD_STAT_LACPDU_TX, + BOND_3AD_STAT_LACPDU_UNKNOWN_RX, + BOND_3AD_STAT_LACPDU_ILLEGAL_RX, + BOND_3AD_STAT_MARKER_RX, + BOND_3AD_STAT_MARKER_TX, + BOND_3AD_STAT_MARKER_RESP_RX, + BOND_3AD_STAT_MARKER_RESP_TX, + BOND_3AD_STAT_MARKER_UNKNOWN_RX, + BOND_3AD_STAT_PAD, + __BOND_3AD_STAT_MAX +}; +#define BOND_3AD_STAT_MAX (__BOND_3AD_STAT_MAX - 1) + #endif /* _LINUX_IF_BONDING_H */ /* diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index d6533828123a..5b225ff63b48 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -925,6 +925,7 @@ enum { enum { LINK_XSTATS_TYPE_UNSPEC, LINK_XSTATS_TYPE_BRIDGE, + LINK_XSTATS_TYPE_BOND, __LINK_XSTATS_TYPE_MAX }; #define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) -- cgit v1.2.3 From 4effd28c1245303dce7fd290c501ac2c11052114 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Mon, 21 Jan 2019 07:26:27 +0100 Subject: bridge: join all-snoopers multicast address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Next to snooping IGMP/MLD queries RFC4541, section 2.1.1.a) recommends to snoop multicast router advertisements to detect multicast routers. Multicast router advertisements are sent to an "all-snoopers" multicast address. To be able to receive them reliably, we need to join this group. Otherwise other snooping switches might refrain from forwarding these advertisements to us. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- include/uapi/linux/in.h | 9 +++--- net/bridge/br_multicast.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++- net/ipv6/mcast.c | 2 ++ 3 files changed, 78 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/in.h b/include/uapi/linux/in.h index a55cb8b10165..e7ad9d350a28 100644 --- a/include/uapi/linux/in.h +++ b/include/uapi/linux/in.h @@ -292,10 +292,11 @@ struct sockaddr_in { #define IN_LOOPBACK(a) ((((long int) (a)) & 0xff000000) == 0x7f000000) /* Defines for Multicast INADDR */ -#define INADDR_UNSPEC_GROUP 0xe0000000U /* 224.0.0.0 */ -#define INADDR_ALLHOSTS_GROUP 0xe0000001U /* 224.0.0.1 */ -#define INADDR_ALLRTRS_GROUP 0xe0000002U /* 224.0.0.2 */ -#define INADDR_MAX_LOCAL_GROUP 0xe00000ffU /* 224.0.0.255 */ +#define INADDR_UNSPEC_GROUP 0xe0000000U /* 224.0.0.0 */ +#define INADDR_ALLHOSTS_GROUP 0xe0000001U /* 224.0.0.1 */ +#define INADDR_ALLRTRS_GROUP 0xe0000002U /* 224.0.0.2 */ +#define INADDR_ALLSNOOPERS_GROUP 0xe000006aU /* 224.0.0.106 */ +#define INADDR_MAX_LOCAL_GROUP 0xe00000ffU /* 224.0.0.255 */ #endif /* contains the htonl type stuff.. */ diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 156c4905639e..2366f4a2780e 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1780,6 +1780,68 @@ void br_multicast_init(struct net_bridge *br) INIT_HLIST_HEAD(&br->mdb_list); } +static void br_ip4_multicast_join_snoopers(struct net_bridge *br) +{ + struct in_device *in_dev = in_dev_get(br->dev); + + if (!in_dev) + return; + + ip_mc_inc_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP)); + in_dev_put(in_dev); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_join_snoopers(struct net_bridge *br) +{ + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a)); + ipv6_dev_mc_inc(br->dev, &addr); +} +#else +static inline void br_ip6_multicast_join_snoopers(struct net_bridge *br) +{ +} +#endif + +static void br_multicast_join_snoopers(struct net_bridge *br) +{ + br_ip4_multicast_join_snoopers(br); + br_ip6_multicast_join_snoopers(br); +} + +static void br_ip4_multicast_leave_snoopers(struct net_bridge *br) +{ + struct in_device *in_dev = in_dev_get(br->dev); + + if (WARN_ON(!in_dev)) + return; + + ip_mc_dec_group(in_dev, htonl(INADDR_ALLSNOOPERS_GROUP)); + in_dev_put(in_dev); +} + +#if IS_ENABLED(CONFIG_IPV6) +static void br_ip6_multicast_leave_snoopers(struct net_bridge *br) +{ + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xff020000), 0, 0, htonl(0x6a)); + ipv6_dev_mc_dec(br->dev, &addr); +} +#else +static inline void br_ip6_multicast_leave_snoopers(struct net_bridge *br) +{ +} +#endif + +static void br_multicast_leave_snoopers(struct net_bridge *br) +{ + br_ip4_multicast_leave_snoopers(br); + br_ip6_multicast_leave_snoopers(br); +} + static void __br_multicast_open(struct net_bridge *br, struct bridge_mcast_own_query *query) { @@ -1793,6 +1855,9 @@ static void __br_multicast_open(struct net_bridge *br, void br_multicast_open(struct net_bridge *br) { + if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) + br_multicast_join_snoopers(br); + __br_multicast_open(br, &br->ip4_own_query); #if IS_ENABLED(CONFIG_IPV6) __br_multicast_open(br, &br->ip6_own_query); @@ -1808,6 +1873,9 @@ void br_multicast_stop(struct net_bridge *br) del_timer_sync(&br->ip6_other_query.timer); del_timer_sync(&br->ip6_own_query.timer); #endif + + if (br_opt_get(br, BROPT_MULTICAST_ENABLED)) + br_multicast_leave_snoopers(br); } void br_multicast_dev_del(struct net_bridge *br) @@ -1943,8 +2011,10 @@ int br_multicast_toggle(struct net_bridge *br, unsigned long val) br_mc_disabled_update(br->dev, val); br_opt_toggle(br, BROPT_MULTICAST_ENABLED, !!val); - if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) + if (!br_opt_get(br, BROPT_MULTICAST_ENABLED)) { + br_multicast_leave_snoopers(br); goto unlock; + } if (!netif_running(br->dev)) goto unlock; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 21f6deb2aec9..42f3f5cd349f 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -940,6 +940,7 @@ int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr) { return __ipv6_dev_mc_inc(dev, addr, MCAST_EXCLUDE); } +EXPORT_SYMBOL(ipv6_dev_mc_inc); /* * device multicast group del @@ -987,6 +988,7 @@ int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr) return err; } +EXPORT_SYMBOL(ipv6_dev_mc_dec); /* * check if the interface/address pair is valid -- cgit v1.2.3 From 4b3087c7e37f9e499127201849e33960dc81da11 Mon Sep 17 00:00:00 2001 From: Linus Lüssing Date: Mon, 21 Jan 2019 07:26:28 +0100 Subject: bridge: Snoop Multicast Router Advertisements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When multiple multicast routers are present in a broadcast domain then only one of them will be detectable via IGMP/MLD query snooping. The multicast router with the lowest IP address will become the selected and active querier while all other multicast routers will then refrain from sending queries. To detect such rather silent multicast routers, too, RFC4286 ("Multicast Router Discovery") provides a standardized protocol to detect multicast routers for multicast snooping switches. This patch implements the necessary MRD Advertisement message parsing and after successful processing adds such routers to the internal multicast router list. Signed-off-by: Linus Lüssing Signed-off-by: David S. Miller --- include/linux/in.h | 5 +++++ include/net/addrconf.h | 15 +++++++++++++ include/uapi/linux/icmpv6.h | 2 ++ include/uapi/linux/igmp.h | 1 + net/bridge/br_multicast.c | 55 +++++++++++++++++++++++++++++++++++++++++++++ net/ipv6/mcast_snoop.c | 5 ++++- 6 files changed, 82 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/in.h b/include/linux/in.h index 31b493734763..435e7f2a513a 100644 --- a/include/linux/in.h +++ b/include/linux/in.h @@ -60,6 +60,11 @@ static inline bool ipv4_is_lbcast(__be32 addr) return addr == htonl(INADDR_BROADCAST); } +static inline bool ipv4_is_all_snoopers(__be32 addr) +{ + return addr == htonl(INADDR_ALLSNOOPERS_GROUP); +} + static inline bool ipv4_is_zeronet(__be32 addr) { return (addr & htonl(0xff000000)) == htonl(0x00000000); diff --git a/include/net/addrconf.h b/include/net/addrconf.h index daf11dcb0f70..20d523ee2fec 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -229,6 +229,7 @@ void ipv6_mc_unmap(struct inet6_dev *idev); void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); +int ipv6_mc_check_icmpv6(struct sk_buff *skb); int ipv6_mc_check_mld(struct sk_buff *skb); void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp); @@ -499,6 +500,20 @@ static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr) #endif } +static inline bool ipv6_addr_is_all_snoopers(const struct in6_addr *addr) +{ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 + __be64 *p = (__be64 *)addr; + + return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | + (p[1] ^ cpu_to_be64(0x6a))) == 0UL; +#else + return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | + addr->s6_addr32[1] | addr->s6_addr32[2] | + (addr->s6_addr32[3] ^ htonl(0x0000006a))) == 0; +#endif +} + #ifdef CONFIG_PROC_FS int if6_proc_init(void); void if6_proc_exit(void); diff --git a/include/uapi/linux/icmpv6.h b/include/uapi/linux/icmpv6.h index caf8dc019250..325395f56bfa 100644 --- a/include/uapi/linux/icmpv6.h +++ b/include/uapi/linux/icmpv6.h @@ -108,6 +108,8 @@ struct icmp6hdr { #define ICMPV6_MOBILE_PREFIX_SOL 146 #define ICMPV6_MOBILE_PREFIX_ADV 147 +#define ICMPV6_MRDISC_ADV 151 + /* * Codes for Destination Unreachable */ diff --git a/include/uapi/linux/igmp.h b/include/uapi/linux/igmp.h index 7e44ac02ca18..90c28bc466c6 100644 --- a/include/uapi/linux/igmp.h +++ b/include/uapi/linux/igmp.h @@ -93,6 +93,7 @@ struct igmpv3_query { #define IGMP_MTRACE_RESP 0x1e #define IGMP_MTRACE 0x1f +#define IGMP_MRDISC_ADV 0x30 /* From RFC4286 */ /* * Use the BSD names for these for compatibility diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 2366f4a2780e..2c46c7aca571 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -29,10 +30,12 @@ #include #include #if IS_ENABLED(CONFIG_IPV6) +#include #include #include #include #include +#include #endif #include "br_private.h" @@ -1583,6 +1586,19 @@ static void br_multicast_pim(struct net_bridge *br, br_multicast_mark_router(br, port); } +static int br_ip4_multicast_mrd_rcv(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + if (ip_hdr(skb)->protocol != IPPROTO_IGMP || + igmp_hdr(skb)->type != IGMP_MRDISC_ADV) + return -ENOMSG; + + br_multicast_mark_router(br, port); + + return 0; +} + static int br_multicast_ipv4_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, @@ -1600,7 +1616,15 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, } else if (pim_ipv4_all_pim_routers(ip_hdr(skb)->daddr)) { if (ip_hdr(skb)->protocol == IPPROTO_PIM) br_multicast_pim(br, port, skb); + } else if (ipv4_is_all_snoopers(ip_hdr(skb)->daddr)) { + err = br_ip4_multicast_mrd_rcv(br, port, skb); + + if (err < 0 && err != -ENOMSG) { + br_multicast_err_count(br, port, skb->protocol); + return err; + } } + return 0; } else if (err < 0) { br_multicast_err_count(br, port, skb->protocol); @@ -1635,6 +1659,27 @@ static int br_multicast_ipv4_rcv(struct net_bridge *br, } #if IS_ENABLED(CONFIG_IPV6) +static int br_ip6_multicast_mrd_rcv(struct net_bridge *br, + struct net_bridge_port *port, + struct sk_buff *skb) +{ + int ret; + + if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) + return -ENOMSG; + + ret = ipv6_mc_check_icmpv6(skb); + if (ret < 0) + return ret; + + if (icmp6_hdr(skb)->icmp6_type != ICMPV6_MRDISC_ADV) + return -ENOMSG; + + br_multicast_mark_router(br, port); + + return 0; +} + static int br_multicast_ipv6_rcv(struct net_bridge *br, struct net_bridge_port *port, struct sk_buff *skb, @@ -1649,6 +1694,16 @@ static int br_multicast_ipv6_rcv(struct net_bridge *br, if (err == -ENOMSG) { if (!ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr)) BR_INPUT_SKB_CB(skb)->mrouters_only = 1; + + if (ipv6_addr_is_all_snoopers(&ipv6_hdr(skb)->daddr)) { + err = br_ip6_multicast_mrd_rcv(br, port, skb); + + if (err < 0 && err != -ENOMSG) { + br_multicast_err_count(br, port, skb->protocol); + return err; + } + } + return 0; } else if (err < 0) { br_multicast_err_count(br, port, skb->protocol); diff --git a/net/ipv6/mcast_snoop.c b/net/ipv6/mcast_snoop.c index a72ddfc40eb3..55e2ac179f28 100644 --- a/net/ipv6/mcast_snoop.c +++ b/net/ipv6/mcast_snoop.c @@ -41,6 +41,8 @@ static int ipv6_mc_check_ip6hdr(struct sk_buff *skb) if (skb->len < len || len <= offset) return -EINVAL; + skb_set_transport_header(skb, offset); + return 0; } @@ -142,7 +144,7 @@ static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb) return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo); } -static int ipv6_mc_check_icmpv6(struct sk_buff *skb) +int ipv6_mc_check_icmpv6(struct sk_buff *skb) { unsigned int len = skb_transport_offset(skb) + sizeof(struct icmp6hdr); unsigned int transport_len = ipv6_transport_len(skb); @@ -161,6 +163,7 @@ static int ipv6_mc_check_icmpv6(struct sk_buff *skb) return 0; } +EXPORT_SYMBOL(ipv6_mc_check_icmpv6); /** * ipv6_mc_check_mld - checks whether this is a sane MLD packet -- cgit v1.2.3 From d9ff286a0f59fa7843549e49bd240393dd7d8b87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 23 Jan 2019 09:22:27 -0800 Subject: bpf: allow BPF programs access skb_shared_info->gso_segs field This adds the ability to read gso_segs from a BPF program. v3: Use BPF_REG_AX instead of BPF_REG_TMP for the temporary register, as suggested by Martin. v2: refined Eddie Hao patch to address Alexei feedback. Signed-off-by: Eric Dumazet Cc: Eddie Hao Cc: Martin KaFai Lau Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 21 +++++++++++++++++ tools/include/uapi/linux/bpf.h | 1 + tools/testing/selftests/bpf/test_verifier.c | 36 +++++++++++++++++++++++++++++ 4 files changed, 59 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 91c43884f295..2940a9854f6d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2540,6 +2540,7 @@ struct __sk_buff { __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); __u64 tstamp; __u32 wire_len; + __u32 gso_segs; }; struct bpf_tunnel_key { diff --git a/net/core/filter.c b/net/core/filter.c index 2b3b436ef545..8e587dd1da20 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6700,6 +6700,27 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, target_size)); break; + case offsetof(struct __sk_buff, gso_segs): + /* si->dst_reg = skb_shinfo(SKB); */ +#ifdef NET_SKBUFF_DATA_USES_OFFSET + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, head), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, head)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, end)); + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); +#else + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, end), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, end)); +#endif + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct skb_shared_info, gso_segs), + si->dst_reg, si->dst_reg, + bpf_target_off(struct skb_shared_info, + gso_segs, 2, + target_size)); + break; case offsetof(struct __sk_buff, wire_len): BUILD_BUG_ON(FIELD_SIZEOF(struct qdisc_skb_cb, pkt_len) != 4); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 91c43884f295..2940a9854f6d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2540,6 +2540,7 @@ struct __sk_buff { __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); __u64 tstamp; __u32 wire_len; + __u32 gso_segs; }; struct bpf_tunnel_key { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 4f67afeec8d9..e4fef6ca8071 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -5663,6 +5663,42 @@ static struct bpf_test tests[] = { .result = ACCEPT, .prog_type = BPF_PROG_TYPE_CGROUP_SKB, }, + { + "read gso_segs from CGROUP_SKB", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, gso_segs)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + }, + { + "write gso_segs from CGROUP_SKB", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_W, BPF_REG_1, BPF_REG_0, + offsetof(struct __sk_buff, gso_segs)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = REJECT, + .result_unpriv = REJECT, + .errstr = "invalid bpf_context access off=164 size=4", + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + }, + { + "read gso_segs from CLS", + .insns = { + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, gso_segs)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .prog_type = BPF_PROG_TYPE_SCHED_CLS, + }, { "multiple registers share map_lookup_elem result", .insns = { -- cgit v1.2.3 From a36b38aa2af61146ea80980a01cf6e952ab021c1 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Thu, 24 Jan 2019 19:59:39 +0100 Subject: xsk: add sock_diag interface for AF_XDP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds the sock_diag interface for querying sockets from user space. Tools like iproute2 ss(8) can use this interface to list open AF_XDP sockets. The user-space ABI is defined in linux/xdp_diag.h and includes netlink request and response structs. The request can query sockets and the response contains socket information about the rings, umems, inode and more. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann --- include/uapi/linux/xdp_diag.h | 72 ++++++++++++++++ net/xdp/Kconfig | 8 ++ net/xdp/Makefile | 1 + net/xdp/xsk.c | 6 +- net/xdp/xsk.h | 12 +++ net/xdp/xsk_diag.c | 191 ++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 285 insertions(+), 5 deletions(-) create mode 100644 include/uapi/linux/xdp_diag.h create mode 100644 net/xdp/xsk.h create mode 100644 net/xdp/xsk_diag.c (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/xdp_diag.h b/include/uapi/linux/xdp_diag.h new file mode 100644 index 000000000000..78b2591a7782 --- /dev/null +++ b/include/uapi/linux/xdp_diag.h @@ -0,0 +1,72 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * xdp_diag: interface for query/monitor XDP sockets + * Copyright(c) 2019 Intel Corporation. + */ + +#ifndef _LINUX_XDP_DIAG_H +#define _LINUX_XDP_DIAG_H + +#include + +struct xdp_diag_req { + __u8 sdiag_family; + __u8 sdiag_protocol; + __u16 pad; + __u32 xdiag_ino; + __u32 xdiag_show; + __u32 xdiag_cookie[2]; +}; + +struct xdp_diag_msg { + __u8 xdiag_family; + __u8 xdiag_type; + __u16 pad; + __u32 xdiag_ino; + __u32 xdiag_cookie[2]; +}; + +#define XDP_SHOW_INFO (1 << 0) /* Basic information */ +#define XDP_SHOW_RING_CFG (1 << 1) +#define XDP_SHOW_UMEM (1 << 2) +#define XDP_SHOW_MEMINFO (1 << 3) + +enum { + XDP_DIAG_NONE, + XDP_DIAG_INFO, + XDP_DIAG_UID, + XDP_DIAG_RX_RING, + XDP_DIAG_TX_RING, + XDP_DIAG_UMEM, + XDP_DIAG_UMEM_FILL_RING, + XDP_DIAG_UMEM_COMPLETION_RING, + XDP_DIAG_MEMINFO, + __XDP_DIAG_MAX, +}; + +#define XDP_DIAG_MAX (__XDP_DIAG_MAX - 1) + +struct xdp_diag_info { + __u32 ifindex; + __u32 queue_id; +}; + +struct xdp_diag_ring { + __u32 entries; /*num descs */ +}; + +#define XDP_DU_F_ZEROCOPY (1 << 0) + +struct xdp_diag_umem { + __u64 size; + __u32 id; + __u32 num_pages; + __u32 chunk_size; + __u32 headroom; + __u32 ifindex; + __u32 queue_id; + __u32 flags; + __u32 refs; +}; + +#endif /* _LINUX_XDP_DIAG_H */ diff --git a/net/xdp/Kconfig b/net/xdp/Kconfig index 90e4a7152854..0255b33cff4b 100644 --- a/net/xdp/Kconfig +++ b/net/xdp/Kconfig @@ -5,3 +5,11 @@ config XDP_SOCKETS help XDP sockets allows a channel between XDP programs and userspace applications. + +config XDP_SOCKETS_DIAG + tristate "XDP sockets: monitoring interface" + depends on XDP_SOCKETS + default n + help + Support for PF_XDP sockets monitoring interface used by the ss tool. + If unsure, say Y. diff --git a/net/xdp/Makefile b/net/xdp/Makefile index 04f073146256..59dbfdf93dca 100644 --- a/net/xdp/Makefile +++ b/net/xdp/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_XDP_SOCKETS) += xsk.o xdp_umem.o xsk_queue.o +obj-$(CONFIG_XDP_SOCKETS_DIAG) += xsk_diag.o diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 80ca48cefc42..949d3bbccb2f 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -27,14 +27,10 @@ #include "xsk_queue.h" #include "xdp_umem.h" +#include "xsk.h" #define TX_BATCH_SIZE 16 -static struct xdp_sock *xdp_sk(struct sock *sk) -{ - return (struct xdp_sock *)sk; -} - bool xsk_is_setup_for_bpf_map(struct xdp_sock *xs) { return READ_ONCE(xs->rx) && READ_ONCE(xs->umem) && diff --git a/net/xdp/xsk.h b/net/xdp/xsk.h new file mode 100644 index 000000000000..ba8120610426 --- /dev/null +++ b/net/xdp/xsk.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2019 Intel Corporation. */ + +#ifndef XSK_H_ +#define XSK_H_ + +static inline struct xdp_sock *xdp_sk(struct sock *sk) +{ + return (struct xdp_sock *)sk; +} + +#endif /* XSK_H_ */ diff --git a/net/xdp/xsk_diag.c b/net/xdp/xsk_diag.c new file mode 100644 index 000000000000..661d007c3b28 --- /dev/null +++ b/net/xdp/xsk_diag.c @@ -0,0 +1,191 @@ +// SPDX-License-Identifier: GPL-2.0 +/* XDP sockets monitoring support + * + * Copyright(c) 2019 Intel Corporation. + * + * Author: Björn Töpel + */ + +#include +#include +#include +#include + +#include "xsk_queue.h" +#include "xsk.h" + +static int xsk_diag_put_info(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xdp_diag_info di = {}; + + di.ifindex = xs->dev ? xs->dev->ifindex : 0; + di.queue_id = xs->queue_id; + return nla_put(nlskb, XDP_DIAG_INFO, sizeof(di), &di); +} + +static int xsk_diag_put_ring(const struct xsk_queue *queue, int nl_type, + struct sk_buff *nlskb) +{ + struct xdp_diag_ring dr = {}; + + dr.entries = queue->nentries; + return nla_put(nlskb, nl_type, sizeof(dr), &dr); +} + +static int xsk_diag_put_rings_cfg(const struct xdp_sock *xs, + struct sk_buff *nlskb) +{ + int err = 0; + + if (xs->rx) + err = xsk_diag_put_ring(xs->rx, XDP_DIAG_RX_RING, nlskb); + if (!err && xs->tx) + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_TX_RING, nlskb); + return err; +} + +static int xsk_diag_put_umem(const struct xdp_sock *xs, struct sk_buff *nlskb) +{ + struct xdp_umem *umem = xs->umem; + struct xdp_diag_umem du = {}; + int err; + + if (!umem) + return 0; + + du.id = umem->id; + du.size = umem->size; + du.num_pages = umem->npgs; + du.chunk_size = (__u32)(~umem->chunk_mask + 1); + du.headroom = umem->headroom; + du.ifindex = umem->dev ? umem->dev->ifindex : 0; + du.queue_id = umem->queue_id; + du.flags = 0; + if (umem->zc) + du.flags |= XDP_DU_F_ZEROCOPY; + du.refs = refcount_read(&umem->users); + + err = nla_put(nlskb, XDP_DIAG_UMEM, sizeof(du), &du); + + if (!err && umem->fq) + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_UMEM_FILL_RING, nlskb); + if (!err && umem->cq) { + err = xsk_diag_put_ring(xs->tx, XDP_DIAG_UMEM_COMPLETION_RING, + nlskb); + } + return err; +} + +static int xsk_diag_fill(struct sock *sk, struct sk_buff *nlskb, + struct xdp_diag_req *req, + struct user_namespace *user_ns, + u32 portid, u32 seq, u32 flags, int sk_ino) +{ + struct xdp_sock *xs = xdp_sk(sk); + struct xdp_diag_msg *msg; + struct nlmsghdr *nlh; + + nlh = nlmsg_put(nlskb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*msg), + flags); + if (!nlh) + return -EMSGSIZE; + + msg = nlmsg_data(nlh); + memset(msg, 0, sizeof(*msg)); + msg->xdiag_family = AF_XDP; + msg->xdiag_type = sk->sk_type; + msg->xdiag_ino = sk_ino; + sock_diag_save_cookie(sk, msg->xdiag_cookie); + + if ((req->xdiag_show & XDP_SHOW_INFO) && xsk_diag_put_info(xs, nlskb)) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_INFO) && + nla_put_u32(nlskb, XDP_DIAG_UID, + from_kuid_munged(user_ns, sock_i_uid(sk)))) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_RING_CFG) && + xsk_diag_put_rings_cfg(xs, nlskb)) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_UMEM) && + xsk_diag_put_umem(xs, nlskb)) + goto out_nlmsg_trim; + + if ((req->xdiag_show & XDP_SHOW_MEMINFO) && + sock_diag_put_meminfo(sk, nlskb, XDP_DIAG_MEMINFO)) + goto out_nlmsg_trim; + + nlmsg_end(nlskb, nlh); + return 0; + +out_nlmsg_trim: + nlmsg_cancel(nlskb, nlh); + return -EMSGSIZE; +} + +static int xsk_diag_dump(struct sk_buff *nlskb, struct netlink_callback *cb) +{ + struct xdp_diag_req *req = nlmsg_data(cb->nlh); + struct net *net = sock_net(nlskb->sk); + int num = 0, s_num = cb->args[0]; + struct sock *sk; + + mutex_lock(&net->xdp.lock); + + sk_for_each(sk, &net->xdp.list) { + if (!net_eq(sock_net(sk), net)) + continue; + if (num++ < s_num) + continue; + + if (xsk_diag_fill(sk, nlskb, req, + sk_user_ns(NETLINK_CB(cb->skb).sk), + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + sock_i_ino(sk)) < 0) { + num--; + break; + } + } + + mutex_unlock(&net->xdp.lock); + cb->args[0] = num; + return nlskb->len; +} + +static int xsk_diag_handler_dump(struct sk_buff *nlskb, struct nlmsghdr *hdr) +{ + struct netlink_dump_control c = { .dump = xsk_diag_dump }; + int hdrlen = sizeof(struct xdp_diag_req); + struct net *net = sock_net(nlskb->sk); + + if (nlmsg_len(hdr) < hdrlen) + return -EINVAL; + + if (!(hdr->nlmsg_flags & NLM_F_DUMP)) + return -EOPNOTSUPP; + + return netlink_dump_start(net->diag_nlsk, nlskb, hdr, &c); +} + +static const struct sock_diag_handler xsk_diag_handler = { + .family = AF_XDP, + .dump = xsk_diag_handler_dump, +}; + +static int __init xsk_diag_init(void) +{ + return sock_diag_register(&xsk_diag_handler); +} + +static void __exit xsk_diag_exit(void) +{ + sock_diag_unregister(&xsk_diag_handler); +} + +module_init(xsk_diag_init); +module_exit(xsk_diag_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, AF_XDP); -- cgit v1.2.3 From ab4dfa20534e32e48de6b761b42d943518fb26f7 Mon Sep 17 00:00:00 2001 From: Veerendranath Jakkam Date: Wed, 19 Dec 2018 22:52:25 +0530 Subject: cfg80211: Allow drivers to advertise supported AKM suites There was no such capability advertisement from the driver and thus the current user space has to assume the driver to support all the AKMs. While that may be the case with some drivers (e.g., mac80211-based ones), there are cfg80211-based drivers that implement SME and have constraints on which AKMs can be supported (e.g., such drivers may need an update to support SAE AKM using NL80211_CMD_EXTERNAL_AUTH). Allow such drivers to advertise the exact set of supported AKMs so that user space tools can determine what network profile options should be allowed to be configured. Signed-off-by: Veerendranath Jakkam [pmsr data might be big, start a new netlink message section] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 +++++ include/uapi/linux/nl80211.h | 6 ++++++ net/wireless/nl80211.c | 9 +++++++++ 3 files changed, 20 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9c1d7979c200..b61ac6e9de08 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4128,6 +4128,8 @@ struct cfg80211_pmsr_capabilities { * @signal_type: signal type reported in &struct cfg80211_bss. * @cipher_suites: supported cipher suites * @n_cipher_suites: number of supported cipher suites + * @akm_suites: supported AKM suites + * @n_akm_suites: number of supported AKM suites * @retry_short: Retry limit for short frames (dot11ShortRetryLimit) * @retry_long: Retry limit for long frames (dot11LongRetryLimit) * @frag_threshold: Fragmentation threshold (dot11FragmentationThreshold); @@ -4326,6 +4328,9 @@ struct wiphy { int n_cipher_suites; const u32 *cipher_suites; + int n_akm_suites; + const u32 *akm_suites; + u8 retry_short; u8 retry_long; u32 frag_threshold; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 426db4d8f71c..5f9d5cd458a1 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1565,6 +1565,12 @@ enum nl80211_commands { * (a u32 with flags from &enum nl80211_wpa_versions). * @NL80211_ATTR_AKM_SUITES: Used with CONNECT, ASSOCIATE, and NEW_BEACON to * indicate which key management algorithm(s) to use (an array of u32). + * This attribute is also sent in response to @NL80211_CMD_GET_WIPHY, + * indicating the supported AKM suites, intended for specific drivers which + * implement SME and have constraints on which AKMs are supported and also + * the cases where an AKM support is offloaded to the driver/firmware. + * If there is no such notification from the driver, user space should + * assume the driver supports all the AKM suites. * * @NL80211_ATTR_REQ_IE: (Re)association request information elements as * sent out by the card, for ROAM and successful CONNECT events. diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index e5f9c9ceb6c9..eb4437fa0539 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -2279,6 +2279,15 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, if (nl80211_send_pmsr_capa(rdev, msg)) goto nla_put_failure; + state->split_start++; + break; + case 15: + if (rdev->wiphy.akm_suites && + nla_put(msg, NL80211_ATTR_AKM_SUITES, + sizeof(u32) * rdev->wiphy.n_akm_suites, + rdev->wiphy.akm_suites)) + goto nla_put_failure; + /* done */ state->split_start = 0; break; -- cgit v1.2.3 From 30e5c2c6bf285d93dee4c45f23da95d7d50b125a Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 25 Jan 2019 10:53:23 -0800 Subject: net: Revert devlink health changes. This reverts the devlink health changes from 9/17/2019, Jiri wants things to be designed differently and it was agreed that the easiest way to do this is start from the beginning again. Commits reverted: cb5ccfbe73b389470e1dc11061bb185ef4bc9aec 880ee82f0313453ec5a6cb122866ac057263066b c7af343b4e33578b7de91786a3f639c8cfa0d97b ff253fedab961b22117a73ab808fcfa9e6852b50 6f9d56132eb6d2603d4273cfc65bed914ec47acb fcd852c69d776c0f46c8f79e8e431e5cc6ddc7b7 8a66704a13d9713593342e29b4f0c19762f5746b 12bd0dcefe88782ac1c9fff632958dd1b71d27e5 aba25279c10094c5c97d09c3491ca86d00b4ad5e ce019faa70f81555fa17ebc1d5a03651f2e7e15a b8c45a033acc607201588f7665ba84207e5149e0 And the follow-on build fix: o33a0efa4baecd689da9474ce0e8b673eb6931c60 Signed-off-by: David S. Miller --- Documentation/networking/devlink-health.txt | 86 -- drivers/net/ethernet/mellanox/mlx5/core/Makefile | 2 +- drivers/net/ethernet/mellanox/mlx5/core/en.h | 18 +- .../net/ethernet/mellanox/mlx5/core/en/reporter.h | 15 - .../ethernet/mellanox/mlx5/core/en/reporter_tx.c | 356 ------- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 186 +++- drivers/net/ethernet/mellanox/mlx5/core/en_tx.c | 2 +- include/net/devlink.h | 144 --- include/trace/events/devlink.h | 62 -- include/uapi/linux/devlink.h | 25 - net/core/devlink.c | 1058 -------------------- 11 files changed, 169 insertions(+), 1785 deletions(-) delete mode 100644 Documentation/networking/devlink-health.txt delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h delete mode 100644 drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c (limited to 'include/uapi/linux') diff --git a/Documentation/networking/devlink-health.txt b/Documentation/networking/devlink-health.txt deleted file mode 100644 index 1db3fbea0831..000000000000 --- a/Documentation/networking/devlink-health.txt +++ /dev/null @@ -1,86 +0,0 @@ -The health mechanism is targeted for Real Time Alerting, in order to know when -something bad had happened to a PCI device -- Provide alert debug information -- Self healing -- If problem needs vendor support, provide a way to gather all needed debugging - information. - -The main idea is to unify and centralize driver health reports in the -generic devlink instance and allow the user to set different -attributes of the health reporting and recovery procedures. - -The devlink health reporter: -Device driver creates a "health reporter" per each error/health type. -Error/Health type can be a known/generic (eg pci error, fw error, rx/tx error) -or unknown (driver specific). -For each registered health reporter a driver can issue error/health reports -asynchronously. All health reports handling is done by devlink. -Device driver can provide specific callbacks for each "health reporter", e.g. - - Recovery procedures - - Diagnostics and object dump procedures - - OOB initial parameters -Different parts of the driver can register different types of health reporters -with different handlers. - -Once an error is reported, devlink health will do the following actions: - * A log is being send to the kernel trace events buffer - * Health status and statistics are being updated for the reporter instance - * Object dump is being taken and saved at the reporter instance (as long as - there is no other dump which is already stored) - * Auto recovery attempt is being done. Depends on: - - Auto-recovery configuration - - Grace period vs. time passed since last recover - -The user interface: -User can access/change each reporter's parameters and driver specific callbacks -via devlink, e.g per error type (per health reporter) - - Configure reporter's generic parameters (like: disable/enable auto recovery) - - Invoke recovery procedure - - Run diagnostics - - Object dump - -The devlink health interface (via netlink): -DEVLINK_CMD_HEALTH_REPORTER_GET - Retrieves status and configuration info per DEV and reporter. -DEVLINK_CMD_HEALTH_REPORTER_SET - Allows reporter-related configuration setting. -DEVLINK_CMD_HEALTH_REPORTER_RECOVER - Triggers a reporter's recovery procedure. -DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE - Retrieves diagnostics data from a reporter on a device. -DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET - Retrieves the last stored dump. Devlink health - saves a single dump. If an dump is not already stored by the devlink - for this reporter, devlink generates a new dump. - dump output is defined by the reporter. -DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR - Clears the last saved dump file for the specified reporter. - - - netlink - +--------------------------+ - | | - | + | - | | | - +--------------------------+ - |request for ops - |(diagnose, - mlx5_core devlink |recover, - |dump) -+--------+ +--------------------------+ -| | | reporter| | -| | | +---------v----------+ | -| | ops execution | | | | -| <----------------------------------+ | | -| | | | | | -| | | + ^------------------+ | -| | | | request for ops | -| | | | (recover, dump) | -| | | | | -| | | +-+------------------+ | -| | health report | | health handler | | -| +-------------------------------> | | -| | | +--------------------+ | -| | health reporter create | | -| +----------------------------> | -+--------+ +--------------------------+ diff --git a/drivers/net/ethernet/mellanox/mlx5/core/Makefile b/drivers/net/ethernet/mellanox/mlx5/core/Makefile index 6bb2a860b15b..9de9abacf7f6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/Makefile +++ b/drivers/net/ethernet/mellanox/mlx5/core/Makefile @@ -22,7 +22,7 @@ mlx5_core-y := main.o cmd.o debugfs.o fw.o eq.o uar.o pagealloc.o \ # mlx5_core-$(CONFIG_MLX5_CORE_EN) += en_main.o en_common.o en_fs.o en_ethtool.o \ en_tx.o en_rx.o en_dim.o en_txrx.o en/xdp.o en_stats.o \ - en_selftest.o en/port.o en/monitor_stats.o en/reporter_tx.o + en_selftest.o en/port.o en/monitor_stats.o # # Netdev extra diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 27e276c9bf84..8fa8fdd30b85 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -388,7 +388,10 @@ struct mlx5e_txqsq { struct mlx5e_channel *channel; int txq_ix; u32 rate_limit; - struct work_struct recover_work; + struct mlx5e_txqsq_recover { + struct work_struct recover_work; + u64 last_recover; + } recover; } ____cacheline_aligned_in_smp; struct mlx5e_dma_info { @@ -679,13 +682,6 @@ struct mlx5e_rss_params { u8 hfunc; }; -struct mlx5e_modify_sq_param { - int curr_state; - int next_state; - int rl_update; - int rl_index; -}; - struct mlx5e_priv { /* priv data path fields - start */ struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC]; @@ -741,7 +737,6 @@ struct mlx5e_priv { #ifdef CONFIG_MLX5_EN_TLS struct mlx5e_tls *tls; #endif - struct devlink_health_reporter *tx_reporter; }; struct mlx5e_profile { @@ -871,11 +866,6 @@ void mlx5e_set_rq_type(struct mlx5_core_dev *mdev, struct mlx5e_params *params); void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev, struct mlx5e_params *params); -int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, - struct mlx5e_modify_sq_param *p); -void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq); -void mlx5e_tx_disable_queue(struct netdev_queue *txq); - static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev) { return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) && diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h deleted file mode 100644 index 2335c5b48820..000000000000 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ -/* Copyright (c) 2018 Mellanox Technologies. */ - -#ifndef __MLX5E_EN_REPORTER_H -#define __MLX5E_EN_REPORTER_H - -#include -#include "en.h" - -int mlx5e_tx_reporter_create(struct mlx5e_priv *priv); -void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv); -void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq); -void mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq); - -#endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c deleted file mode 100644 index d9675afbb924..000000000000 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/reporter_tx.c +++ /dev/null @@ -1,356 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */ -/* Copyright (c) 2018 Mellanox Technologies. */ - -#include -#include "reporter.h" -#include "lib/eq.h" - -#define MLX5E_TX_REPORTER_PER_SQ_MAX_LEN 256 - -struct mlx5e_tx_err_ctx { - int (*recover)(struct mlx5e_txqsq *sq); - struct mlx5e_txqsq *sq; -}; - -static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) -{ - unsigned long exp_time = jiffies + msecs_to_jiffies(2000); - - while (time_before(jiffies, exp_time)) { - if (sq->cc == sq->pc) - return 0; - - msleep(20); - } - - netdev_err(sq->channel->netdev, - "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", - sq->sqn, sq->cc, sq->pc); - - return -ETIMEDOUT; -} - -static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) -{ - WARN_ONCE(sq->cc != sq->pc, - "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", - sq->sqn, sq->cc, sq->pc); - sq->cc = 0; - sq->dma_fifo_cc = 0; - sq->pc = 0; -} - -static int mlx5e_sq_to_ready(struct mlx5e_txqsq *sq, int curr_state) -{ - struct mlx5_core_dev *mdev = sq->channel->mdev; - struct net_device *dev = sq->channel->netdev; - struct mlx5e_modify_sq_param msp = {0}; - int err; - - msp.curr_state = curr_state; - msp.next_state = MLX5_SQC_STATE_RST; - - err = mlx5e_modify_sq(mdev, sq->sqn, &msp); - if (err) { - netdev_err(dev, "Failed to move sq 0x%x to reset\n", sq->sqn); - return err; - } - - memset(&msp, 0, sizeof(msp)); - msp.curr_state = MLX5_SQC_STATE_RST; - msp.next_state = MLX5_SQC_STATE_RDY; - - err = mlx5e_modify_sq(mdev, sq->sqn, &msp); - if (err) { - netdev_err(dev, "Failed to move sq 0x%x to ready\n", sq->sqn); - return err; - } - - return 0; -} - -static int mlx5e_tx_reporter_err_cqe_recover(struct mlx5e_txqsq *sq) -{ - struct mlx5_core_dev *mdev = sq->channel->mdev; - struct net_device *dev = sq->channel->netdev; - u8 state; - int err; - - if (!test_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) - return 0; - - err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); - if (err) { - netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", - sq->sqn, err); - return err; - } - - if (state != MLX5_RQC_STATE_ERR) { - netdev_err(dev, "SQ 0x%x not in ERROR state\n", sq->sqn); - return -EINVAL; - } - - mlx5e_tx_disable_queue(sq->txq); - - err = mlx5e_wait_for_sq_flush(sq); - if (err) - return err; - - /* At this point, no new packets will arrive from the stack as TXQ is - * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all - * pending WQEs. SQ can safely reset the SQ. - */ - - err = mlx5e_sq_to_ready(sq, state); - if (err) - return err; - - mlx5e_reset_txqsq_cc_pc(sq); - sq->stats->recover++; - mlx5e_activate_txqsq(sq); - - return 0; -} - -void mlx5e_tx_reporter_err_cqe(struct mlx5e_txqsq *sq) -{ - char err_str[MLX5E_TX_REPORTER_PER_SQ_MAX_LEN]; - struct mlx5e_tx_err_ctx err_ctx = {0}; - - err_ctx.sq = sq; - err_ctx.recover = mlx5e_tx_reporter_err_cqe_recover; - sprintf(err_str, "ERR CQE on SQ: 0x%x", sq->sqn); - - devlink_health_report(sq->channel->priv->tx_reporter, err_str, - &err_ctx); -} - -static int mlx5e_tx_reporter_timeout_recover(struct mlx5e_txqsq *sq) -{ - struct mlx5_eq_comp *eq = sq->cq.mcq.eq; - u32 eqe_count; - - netdev_err(sq->channel->netdev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n", - eq->core.eqn, eq->core.cons_index, eq->core.irqn); - - eqe_count = mlx5_eq_poll_irq_disabled(eq); - if (!eqe_count) { - clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); - return 1; - } - - netdev_err(sq->channel->netdev, "Recover %d eqes on EQ 0x%x\n", - eqe_count, eq->core.eqn); - sq->channel->stats->eq_rearm++; - return 0; -} - -void mlx5e_tx_reporter_timeout(struct mlx5e_txqsq *sq) -{ - struct mlx5e_tx_err_ctx err_ctx; - char err_str[MLX5E_TX_REPORTER_PER_SQ_MAX_LEN]; - - err_ctx.sq = sq; - err_ctx.recover = mlx5e_tx_reporter_timeout_recover; - sprintf(err_str, - "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n", - sq->channel->ix, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, - jiffies_to_usecs(jiffies - sq->txq->trans_start)); - devlink_health_report(sq->channel->priv->tx_reporter, err_str, - &err_ctx); -} - -/* state lock cannot be grabbed within this function. - * It can cause a dead lock or a read-after-free. - */ -int mlx5e_tx_reporter_recover_from_ctx(struct mlx5e_tx_err_ctx *err_ctx) -{ - return err_ctx->recover(err_ctx->sq); -} - -static int mlx5e_tx_reporter_recover_all(struct mlx5e_priv *priv) -{ - int err; - - mutex_lock(&priv->state_lock); - mlx5e_close_locked(priv->netdev); - err = mlx5e_open_locked(priv->netdev); - mutex_unlock(&priv->state_lock); - - return err; -} - -static int mlx5e_tx_reporter_recover(struct devlink_health_reporter *reporter, - void *context) -{ - struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); - struct mlx5e_tx_err_ctx *err_ctx = context; - - return err_ctx ? mlx5e_tx_reporter_recover_from_ctx(err_ctx) : - mlx5e_tx_reporter_recover_all(priv); -} - -static int -mlx5e_tx_reporter_build_diagnose_output(struct devlink_health_buffer *buffer, - u32 sqn, u8 state, u8 stopped) -{ - int err, i; - int nest = 0; - char name[20]; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR); - if (err) - goto buffer_error; - nest++; - - sprintf(name, "SQ 0x%x", sqn); - err = devlink_health_buffer_put_object_name(buffer, name); - if (err) - goto buffer_error; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_put_object_name(buffer, "HW state"); - if (err) - goto buffer_error; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_put_value_u8(buffer, state); - if (err) - goto buffer_error; - - devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE */ - nest--; - - devlink_health_buffer_nest_end(buffer); /* DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR */ - nest--; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_put_object_name(buffer, "stopped"); - if (err) - goto buffer_error; - - err = devlink_health_buffer_nest_start(buffer, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE); - if (err) - goto buffer_error; - nest++; - - err = devlink_health_buffer_put_value_u8(buffer, stopped); - if (err) - goto buffer_error; - - for (i = 0; i < nest; i++) - devlink_health_buffer_nest_end(buffer); - - return 0; - -buffer_error: - for (i = 0; i < nest; i++) - devlink_health_buffer_nest_cancel(buffer); - return err; -} - -static int mlx5e_tx_reporter_diagnose(struct devlink_health_reporter *reporter, - struct devlink_health_buffer **buffers_array, - unsigned int buffer_size, - unsigned int num_buffers) -{ - struct mlx5e_priv *priv = devlink_health_reporter_priv(reporter); - unsigned int buff = 0; - int i = 0, err = 0; - - if (buffer_size < MLX5E_TX_REPORTER_PER_SQ_MAX_LEN) - return -ENOMEM; - - mutex_lock(&priv->state_lock); - - if (!test_bit(MLX5E_STATE_OPENED, &priv->state)) { - mutex_unlock(&priv->state_lock); - return 0; - } - - while (i < priv->channels.num * priv->channels.params.num_tc) { - struct mlx5e_txqsq *sq = priv->txq2sq[i]; - u8 state; - - err = mlx5_core_query_sq_state(priv->mdev, sq->sqn, &state); - if (err) - break; - - err = mlx5e_tx_reporter_build_diagnose_output(buffers_array[buff], - sq->sqn, state, - netif_xmit_stopped(sq->txq)); - if (err) { - if (++buff == num_buffers) - break; - } else { - i++; - } - } - - mutex_unlock(&priv->state_lock); - return err; -} - -static const struct devlink_health_reporter_ops mlx5_tx_reporter_ops = { - .name = "TX", - .recover = mlx5e_tx_reporter_recover, - .diagnose_size = MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC * - MLX5E_TX_REPORTER_PER_SQ_MAX_LEN, - .diagnose = mlx5e_tx_reporter_diagnose, - .dump_size = 0, - .dump = NULL, -}; - -#define MLX5_REPORTER_TX_GRACEFUL_PERIOD 500 -int mlx5e_tx_reporter_create(struct mlx5e_priv *priv) -{ - struct mlx5_core_dev *mdev = priv->mdev; - struct devlink *devlink = priv_to_devlink(mdev); - - priv->tx_reporter = - devlink_health_reporter_create(devlink, &mlx5_tx_reporter_ops, - MLX5_REPORTER_TX_GRACEFUL_PERIOD, - true, priv); - return PTR_ERR_OR_ZERO(priv->tx_reporter); -} - -void mlx5e_tx_reporter_destroy(struct mlx5e_priv *priv) -{ - devlink_health_reporter_destroy(priv->tx_reporter); -} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index dee0c8f3d4e9..8cfd2ec7c0a2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -51,7 +51,6 @@ #include "en/xdp.h" #include "lib/eq.h" #include "en/monitor_stats.h" -#include "en/reporter.h" struct mlx5e_rq_param { u32 rqc[MLX5_ST_SZ_DW(rqc)]; @@ -1161,7 +1160,7 @@ static int mlx5e_alloc_txqsq_db(struct mlx5e_txqsq *sq, int numa) return 0; } -static void mlx5e_tx_err_cqe_work(struct work_struct *recover_work); +static void mlx5e_sq_recover(struct work_struct *work); static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, int txq_ix, struct mlx5e_params *params, @@ -1183,7 +1182,7 @@ static int mlx5e_alloc_txqsq(struct mlx5e_channel *c, sq->uar_map = mdev->mlx5e_res.bfreg.map; sq->min_inline_mode = params->tx_min_inline_mode; sq->stats = &c->priv->channel_stats[c->ix].sq[tc]; - INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work); + INIT_WORK(&sq->recover.recover_work, mlx5e_sq_recover); if (MLX5_IPSEC_DEV(c->priv->mdev)) set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state); if (mlx5_accel_is_tls_device(c->priv->mdev)) @@ -1271,8 +1270,15 @@ static int mlx5e_create_sq(struct mlx5_core_dev *mdev, return err; } -int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, - struct mlx5e_modify_sq_param *p) +struct mlx5e_modify_sq_param { + int curr_state; + int next_state; + bool rl_update; + int rl_index; +}; + +static int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn, + struct mlx5e_modify_sq_param *p) { void *in; void *sqc; @@ -1370,7 +1376,17 @@ err_free_txqsq: return err; } -void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq) +static void mlx5e_reset_txqsq_cc_pc(struct mlx5e_txqsq *sq) +{ + WARN_ONCE(sq->cc != sq->pc, + "SQ 0x%x: cc (0x%x) != pc (0x%x)\n", + sq->sqn, sq->cc, sq->pc); + sq->cc = 0; + sq->dma_fifo_cc = 0; + sq->pc = 0; +} + +static void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq) { sq->txq = netdev_get_tx_queue(sq->channel->netdev, sq->txq_ix); clear_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state); @@ -1379,7 +1395,7 @@ void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq) netif_tx_start_queue(sq->txq); } -void mlx5e_tx_disable_queue(struct netdev_queue *txq) +static inline void netif_tx_disable_queue(struct netdev_queue *txq) { __netif_tx_lock_bh(txq); netif_tx_stop_queue(txq); @@ -1395,7 +1411,7 @@ static void mlx5e_deactivate_txqsq(struct mlx5e_txqsq *sq) /* prevent netif_tx_wake_queue */ napi_synchronize(&c->napi); - mlx5e_tx_disable_queue(sq->txq); + netif_tx_disable_queue(sq->txq); /* last doorbell out, godspeed .. */ if (mlx5e_wqc_has_room_for(wq, sq->cc, sq->pc, 1)) { @@ -1415,7 +1431,6 @@ static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq) struct mlx5_rate_limit rl = {0}; cancel_work_sync(&sq->dim.work); - cancel_work_sync(&sq->recover_work); mlx5e_destroy_sq(mdev, sq->sqn); if (sq->rate_limit) { rl.rate = sq->rate_limit; @@ -1425,15 +1440,105 @@ static void mlx5e_close_txqsq(struct mlx5e_txqsq *sq) mlx5e_free_txqsq(sq); } -static void mlx5e_tx_err_cqe_work(struct work_struct *recover_work) +static int mlx5e_wait_for_sq_flush(struct mlx5e_txqsq *sq) +{ + unsigned long exp_time = jiffies + msecs_to_jiffies(2000); + + while (time_before(jiffies, exp_time)) { + if (sq->cc == sq->pc) + return 0; + + msleep(20); + } + + netdev_err(sq->channel->netdev, + "Wait for SQ 0x%x flush timeout (sq cc = 0x%x, sq pc = 0x%x)\n", + sq->sqn, sq->cc, sq->pc); + + return -ETIMEDOUT; +} + +static int mlx5e_sq_to_ready(struct mlx5e_txqsq *sq, int curr_state) { - struct mlx5e_txqsq *sq = container_of(recover_work, struct mlx5e_txqsq, - recover_work); + struct mlx5_core_dev *mdev = sq->channel->mdev; + struct net_device *dev = sq->channel->netdev; + struct mlx5e_modify_sq_param msp = {0}; + int err; - if (!sq->channel->priv->tx_reporter) + msp.curr_state = curr_state; + msp.next_state = MLX5_SQC_STATE_RST; + + err = mlx5e_modify_sq(mdev, sq->sqn, &msp); + if (err) { + netdev_err(dev, "Failed to move sq 0x%x to reset\n", sq->sqn); + return err; + } + + memset(&msp, 0, sizeof(msp)); + msp.curr_state = MLX5_SQC_STATE_RST; + msp.next_state = MLX5_SQC_STATE_RDY; + + err = mlx5e_modify_sq(mdev, sq->sqn, &msp); + if (err) { + netdev_err(dev, "Failed to move sq 0x%x to ready\n", sq->sqn); + return err; + } + + return 0; +} + +static void mlx5e_sq_recover(struct work_struct *work) +{ + struct mlx5e_txqsq_recover *recover = + container_of(work, struct mlx5e_txqsq_recover, + recover_work); + struct mlx5e_txqsq *sq = container_of(recover, struct mlx5e_txqsq, + recover); + struct mlx5_core_dev *mdev = sq->channel->mdev; + struct net_device *dev = sq->channel->netdev; + u8 state; + int err; + + err = mlx5_core_query_sq_state(mdev, sq->sqn, &state); + if (err) { + netdev_err(dev, "Failed to query SQ 0x%x state. err = %d\n", + sq->sqn, err); + return; + } + + if (state != MLX5_RQC_STATE_ERR) { + netdev_err(dev, "SQ 0x%x not in ERROR state\n", sq->sqn); + return; + } + + netif_tx_disable_queue(sq->txq); + + if (mlx5e_wait_for_sq_flush(sq)) return; - mlx5e_tx_reporter_err_cqe(sq); + /* If the interval between two consecutive recovers per SQ is too + * short, don't recover to avoid infinite loop of ERR_CQE -> recover. + * If we reached this state, there is probably a bug that needs to be + * fixed. let's keep the queue close and let tx timeout cleanup. + */ + if (jiffies_to_msecs(jiffies - recover->last_recover) < + MLX5E_SQ_RECOVER_MIN_INTERVAL) { + netdev_err(dev, "Recover SQ 0x%x canceled, too many error CQEs\n", + sq->sqn); + return; + } + + /* At this point, no new packets will arrive from the stack as TXQ is + * marked with QUEUE_STATE_DRV_XOFF. In addition, NAPI cleared all + * pending WQEs. SQ can safely reset the SQ. + */ + if (mlx5e_sq_to_ready(sq, state)) + return; + + mlx5e_reset_txqsq_cc_pc(sq); + sq->stats->recover++; + recover->last_recover = jiffies; + mlx5e_activate_txqsq(sq); } static int mlx5e_open_icosq(struct mlx5e_channel *c, @@ -3102,7 +3207,6 @@ static void mlx5e_cleanup_nic_tx(struct mlx5e_priv *priv) { int tc; - mlx5e_tx_reporter_destroy(priv); for (tc = 0; tc < priv->profile->max_tc; tc++) mlx5e_destroy_tis(priv->mdev, priv->tisn[tc]); } @@ -4074,14 +4178,31 @@ netdev_features_t mlx5e_features_check(struct sk_buff *skb, return features; } +static bool mlx5e_tx_timeout_eq_recover(struct net_device *dev, + struct mlx5e_txqsq *sq) +{ + struct mlx5_eq_comp *eq = sq->cq.mcq.eq; + u32 eqe_count; + + netdev_err(dev, "EQ 0x%x: Cons = 0x%x, irqn = 0x%x\n", + eq->core.eqn, eq->core.cons_index, eq->core.irqn); + + eqe_count = mlx5_eq_poll_irq_disabled(eq); + if (!eqe_count) + return false; + + netdev_err(dev, "Recover %d eqes on EQ 0x%x\n", eqe_count, eq->core.eqn); + sq->channel->stats->eq_rearm++; + return true; +} + static void mlx5e_tx_timeout_work(struct work_struct *work) { struct mlx5e_priv *priv = container_of(work, struct mlx5e_priv, tx_timeout_work); - int i; - - if (!priv->tx_reporter) - return; + struct net_device *dev = priv->netdev; + bool reopen_channels = false; + int i, err; rtnl_lock(); mutex_lock(&priv->state_lock); @@ -4090,16 +4211,36 @@ static void mlx5e_tx_timeout_work(struct work_struct *work) goto unlock; for (i = 0; i < priv->channels.num * priv->channels.params.num_tc; i++) { - struct netdev_queue *dev_queue = - netdev_get_tx_queue(priv->netdev, i); + struct netdev_queue *dev_queue = netdev_get_tx_queue(dev, i); struct mlx5e_txqsq *sq = priv->txq2sq[i]; if (!netif_xmit_stopped(dev_queue)) continue; - mlx5e_tx_reporter_timeout(sq); + netdev_err(dev, + "TX timeout on queue: %d, SQ: 0x%x, CQ: 0x%x, SQ Cons: 0x%x SQ Prod: 0x%x, usecs since last trans: %u\n", + i, sq->sqn, sq->cq.mcq.cqn, sq->cc, sq->pc, + jiffies_to_usecs(jiffies - dev_queue->trans_start)); + + /* If we recover a lost interrupt, most likely TX timeout will + * be resolved, skip reopening channels + */ + if (!mlx5e_tx_timeout_eq_recover(dev, sq)) { + clear_bit(MLX5E_SQ_STATE_ENABLED, &sq->state); + reopen_channels = true; + } } + if (!reopen_channels) + goto unlock; + + mlx5e_close_locked(dev); + err = mlx5e_open_locked(dev); + if (err) + netdev_err(priv->netdev, + "mlx5e_open_locked failed recovering from a tx_timeout, err(%d).\n", + err); + unlock: mutex_unlock(&priv->state_lock); rtnl_unlock(); @@ -4767,7 +4908,6 @@ static int mlx5e_init_nic_tx(struct mlx5e_priv *priv) #ifdef CONFIG_MLX5_CORE_EN_DCB mlx5e_dcbnl_initialize(priv); #endif - mlx5e_tx_reporter_create(priv); return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c index a8e052a5ce36..598ad7e4d5c9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tx.c @@ -514,7 +514,7 @@ bool mlx5e_poll_tx_cq(struct mlx5e_cq *cq, int napi_budget) mlx5e_dump_error_cqe(sq, (struct mlx5_err_cqe *)cqe); queue_work(cq->channel->priv->wq, - &sq->recover_work); + &sq->recover.recover_work); } stats->cqe_err++; } diff --git a/include/net/devlink.h b/include/net/devlink.h index a81a1b7a67d7..67f4293bc970 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -30,7 +30,6 @@ struct devlink { struct list_head param_list; struct list_head region_list; u32 snapshot_id; - struct list_head reporter_list; struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; @@ -424,36 +423,6 @@ struct devlink_region; typedef void devlink_snapshot_data_dest_t(const void *data); -struct devlink_health_buffer; -struct devlink_health_reporter; - -/** - * struct devlink_health_reporter_ops - Reporter operations - * @name: reporter name - * dump_size: dump buffer size allocated by the devlink - * diagnose_size: diagnose buffer size allocated by the devlink - * recover: callback to recover from reported error - * if priv_ctx is NULL, run a full recover - * dump: callback to dump an object - * if priv_ctx is NULL, run a full dump - * diagnose: callback to diagnose the current status - */ - -struct devlink_health_reporter_ops { - char *name; - unsigned int dump_size; - unsigned int diagnose_size; - int (*recover)(struct devlink_health_reporter *reporter, - void *priv_ctx); - int (*dump)(struct devlink_health_reporter *reporter, - struct devlink_health_buffer **buffers_array, - unsigned int buffer_size, unsigned int num_buffers, - void *priv_ctx); - int (*diagnose)(struct devlink_health_reporter *reporter, - struct devlink_health_buffer **buffers_array, - unsigned int buffer_size, unsigned int num_buffers); -}; - struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, @@ -615,34 +584,6 @@ int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, u8 *data, u32 snapshot_id, devlink_snapshot_data_dest_t *data_destructor); -int devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer, - int attrtype); -void devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer); -void devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer); -int devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer, - char *name); -int devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer, - u8 value); -int devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer, - u32 value); -int devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer, - u64 value); -int devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer, - char *name); -int devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, - void *data, int len); -struct devlink_health_reporter * -devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, bool auto_recover, - void *priv); -void -devlink_health_reporter_destroy(struct devlink_health_reporter *reporter); - -void * -devlink_health_reporter_priv(struct devlink_health_reporter *reporter); -int devlink_health_report(struct devlink_health_reporter *reporter, - const char *msg, void *priv_ctx); #else static inline struct devlink *devlink_alloc(const struct devlink_ops *ops, @@ -903,91 +844,6 @@ devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, return 0; } -static inline int -devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer, - int attrtype) -{ - return 0; -} - -static inline void -devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer) -{ -} - -static inline void -devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer) -{ -} - -static inline int -devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer, - char *name) -{ - return 0; -} - -static inline int -devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer, - u8 value) -{ - return 0; -} - -static inline int -devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer, - u32 value) -{ - return 0; -} - -static inline int -devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer, - u64 value) -{ - return 0; -} - -static inline int -devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer, - char *name) -{ - return 0; -} - -static inline int -devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, - void *data, int len) -{ - return 0; -} - -static inline struct devlink_health_reporter * -devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, bool auto_recover, - void *priv) -{ - return NULL; -} - -static inline void -devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ -} - -static inline void * -devlink_health_reporter_priv(struct devlink_health_reporter *reporter) -{ - return NULL; -} - -static inline int -devlink_health_report(struct devlink_health_reporter *reporter, - const char *msg, void *priv_ctx) -{ - return 0; -} #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/trace/events/devlink.h b/include/trace/events/devlink.h index 7e39d2fc7c75..44acfbca1266 100644 --- a/include/trace/events/devlink.h +++ b/include/trace/events/devlink.h @@ -46,65 +46,6 @@ TRACE_EVENT(devlink_hwmsg, (int) __entry->len, __get_dynamic_array(buf), __entry->len) ); -TRACE_EVENT(devlink_health_report, - TP_PROTO(const struct devlink *devlink, const char *reporter_name, - const char *msg), - - TP_ARGS(devlink, reporter_name, msg), - - TP_STRUCT__entry( - __string(bus_name, devlink->dev->bus->name) - __string(dev_name, dev_name(devlink->dev)) - __string(driver_name, devlink->dev->driver->name) - __string(reporter_name, msg) - __string(msg, msg) - ), - - TP_fast_assign( - __assign_str(bus_name, devlink->dev->bus->name); - __assign_str(dev_name, dev_name(devlink->dev)); - __assign_str(driver_name, devlink->dev->driver->name); - __assign_str(reporter_name, reporter_name); - __assign_str(msg, msg); - ), - - TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: %s", - __get_str(bus_name), __get_str(dev_name), - __get_str(driver_name), __get_str(reporter_name), - __get_str(msg)) -); - -TRACE_EVENT(devlink_health_recover_aborted, - TP_PROTO(const struct devlink *devlink, const char *reporter_name, - bool health_state, u64 time_since_last_recover), - - TP_ARGS(devlink, reporter_name, health_state, time_since_last_recover), - - TP_STRUCT__entry( - __string(bus_name, devlink->dev->bus->name) - __string(dev_name, dev_name(devlink->dev)) - __string(driver_name, devlink->dev->driver->name) - __string(reporter_name, reporter_name) - __field(bool, health_state) - __field(u64, time_since_last_recover) - ), - - TP_fast_assign( - __assign_str(bus_name, devlink->dev->bus->name); - __assign_str(dev_name, dev_name(devlink->dev)); - __assign_str(driver_name, devlink->dev->driver->name); - __assign_str(reporter_name, reporter_name); - __entry->health_state = health_state; - __entry->time_since_last_recover = time_since_last_recover; - ), - - TP_printk("bus_name=%s dev_name=%s driver_name=%s reporter_name=%s: health_state=%d time_since_last_recover = %llu recover aborted", - __get_str(bus_name), __get_str(dev_name), - __get_str(driver_name), __get_str(reporter_name), - __entry->health_state, - __entry->time_since_last_recover) -); - #endif /* _TRACE_DEVLINK_H */ /* This part must be outside protection */ @@ -123,9 +64,6 @@ static inline void trace_devlink_hwmsg(const struct devlink *devlink, { } -static inline void trace_devlink_health(const char *msg) -{ -} #endif /* _TRACE_DEVLINK_H */ #endif diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 6b26bb2ce4dc..6e52d3660654 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -89,13 +89,6 @@ enum devlink_command { DEVLINK_CMD_REGION_DEL, DEVLINK_CMD_REGION_READ, - DEVLINK_CMD_HEALTH_REPORTER_GET, - DEVLINK_CMD_HEALTH_REPORTER_SET, - DEVLINK_CMD_HEALTH_REPORTER_RECOVER, - DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, - DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, - /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -292,24 +285,6 @@ enum devlink_attr { DEVLINK_ATTR_REGION_CHUNK_ADDR, /* u64 */ DEVLINK_ATTR_REGION_CHUNK_LEN, /* u64 */ - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT, /* nested */ - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR, /* nested */ - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME, /* string */ - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE, /* nested */ - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY, /* nested */ - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, /* u8 */ - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, /* dynamic */ - - DEVLINK_ATTR_HEALTH_REPORTER, /* nested */ - DEVLINK_ATTR_HEALTH_REPORTER_NAME, /* string */ - DEVLINK_ATTR_HEALTH_REPORTER_STATE, /* u8 */ - DEVLINK_ATTR_HEALTH_REPORTER_ERR, /* u64 */ - DEVLINK_ATTR_HEALTH_REPORTER_RECOVER, /* u64 */ - DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL, /* u8 */ - DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, /* u64 */ - DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, /* u64 */ - DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, /* u8 */ - /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 24f266468ca5..abb0da9d7b4b 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3597,1015 +3597,6 @@ out: return 0; } -#define DEVLINK_HEALTH_BUFFER_SIZE (4096 - GENL_HDRLEN) -#define DEVLINK_HEALTH_BUFFER_DATA_SIZE (DEVLINK_HEALTH_BUFFER_SIZE / 2) -#define DEVLINK_HEALTH_SIZE_TO_BUFFERS(size) DIV_ROUND_UP_ULL(size, DEVLINK_HEALTH_BUFFER_DATA_SIZE) -#define DEVLINK_HEALTH_BUFFER_MAX_CHUNK 1024 - -struct devlink_health_buffer { - void *data; - u64 offset; - u64 bytes_left; - u64 bytes_left_metadata; - u64 max_nested_depth; - u64 curr_nest; -}; - -struct devlink_health_buffer_desc { - int attrtype; - u16 len; - u8 nla_type; - u8 nest_end; - int value[0]; -}; - -static void -devlink_health_buffers_reset(struct devlink_health_buffer **buffers_list, - u64 num_of_buffers) -{ - u64 i; - - for (i = 0; i < num_of_buffers; i++) { - memset(buffers_list[i]->data, 0, DEVLINK_HEALTH_BUFFER_SIZE); - buffers_list[i]->offset = 0; - buffers_list[i]->bytes_left = DEVLINK_HEALTH_BUFFER_DATA_SIZE; - buffers_list[i]->bytes_left_metadata = - DEVLINK_HEALTH_BUFFER_DATA_SIZE; - buffers_list[i]->max_nested_depth = 0; - buffers_list[i]->curr_nest = 0; - } -} - -static void -devlink_health_buffers_destroy(struct devlink_health_buffer **buffers_list, - u64 size); - -static struct devlink_health_buffer ** -devlink_health_buffers_create(u64 size) -{ - struct devlink_health_buffer **buffers_list; - u64 num_of_buffers = DEVLINK_HEALTH_SIZE_TO_BUFFERS(size); - u64 i; - - buffers_list = kcalloc(num_of_buffers, - sizeof(struct devlink_health_buffer *), - GFP_KERNEL); - if (!buffers_list) - return NULL; - - for (i = 0; i < num_of_buffers; i++) { - struct devlink_health_buffer *buffer; - void *data; - - buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); - data = kzalloc(DEVLINK_HEALTH_BUFFER_SIZE, GFP_KERNEL); - if (!buffer || !data) { - kfree(buffer); - kfree(data); - goto buffers_cleanup; - } - buffers_list[i] = buffer; - buffer->data = data; - } - devlink_health_buffers_reset(buffers_list, num_of_buffers); - - return buffers_list; - -buffers_cleanup: - devlink_health_buffers_destroy(buffers_list, --i); - kfree(buffers_list); - return NULL; -} - -static void -devlink_health_buffers_destroy(struct devlink_health_buffer **buffers_list, - u64 num_of_buffers) -{ - u64 i; - - for (i = 0; i < num_of_buffers; i++) { - kfree(buffers_list[i]->data); - kfree(buffers_list[i]); - } -} - -void -devlink_health_buffer_offset_inc(struct devlink_health_buffer *buffer, - int len) -{ - buffer->offset += len; -} - -/* In order to store a nest, need two descriptors, for start and end */ -#define DEVLINK_HEALTH_BUFFER_NEST_SIZE (sizeof(struct devlink_health_buffer_desc) * 2) - -int devlink_health_buffer_verify_len(struct devlink_health_buffer *buffer, - int len, int metadata_len) -{ - if (len > DEVLINK_HEALTH_BUFFER_DATA_SIZE) - return -EINVAL; - - if (buffer->bytes_left < len || - buffer->bytes_left_metadata < metadata_len) - return -ENOMEM; - - return 0; -} - -static struct devlink_health_buffer_desc * -devlink_health_buffer_get_desc_from_offset(struct devlink_health_buffer *buffer) -{ - return buffer->data + buffer->offset; -} - -int -devlink_health_buffer_nest_start(struct devlink_health_buffer *buffer, - int attrtype) -{ - struct devlink_health_buffer_desc *desc; - int err; - - err = devlink_health_buffer_verify_len(buffer, 0, - DEVLINK_HEALTH_BUFFER_NEST_SIZE); - if (err) - return err; - - if (attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT && - attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR && - attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE && - attrtype != DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY) - return -EINVAL; - - desc = devlink_health_buffer_get_desc_from_offset(buffer); - - desc->attrtype = attrtype; - buffer->bytes_left_metadata -= DEVLINK_HEALTH_BUFFER_NEST_SIZE; - devlink_health_buffer_offset_inc(buffer, sizeof(*desc)); - - buffer->curr_nest++; - buffer->max_nested_depth = max(buffer->max_nested_depth, - buffer->curr_nest); - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_start); - -enum devlink_health_buffer_nest_end_cancel { - DEVLINK_HEALTH_BUFFER_NEST_END = 1, - DEVLINK_HEALTH_BUFFER_NEST_CANCEL, -}; - -static void -devlink_health_buffer_nest_end_cancel(struct devlink_health_buffer *buffer, - enum devlink_health_buffer_nest_end_cancel nest) -{ - struct devlink_health_buffer_desc *desc; - - WARN_ON(!buffer->curr_nest); - buffer->curr_nest--; - - desc = devlink_health_buffer_get_desc_from_offset(buffer); - desc->nest_end = nest; - devlink_health_buffer_offset_inc(buffer, sizeof(*desc)); -} - -void devlink_health_buffer_nest_end(struct devlink_health_buffer *buffer) -{ - devlink_health_buffer_nest_end_cancel(buffer, - DEVLINK_HEALTH_BUFFER_NEST_END); -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_end); - -void devlink_health_buffer_nest_cancel(struct devlink_health_buffer *buffer) -{ - devlink_health_buffer_nest_end_cancel(buffer, - DEVLINK_HEALTH_BUFFER_NEST_CANCEL); -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_nest_cancel); - -int -devlink_health_buffer_put_object_name(struct devlink_health_buffer *buffer, - char *name) -{ - struct devlink_health_buffer_desc *desc; - int err; - - err = devlink_health_buffer_verify_len(buffer, strlen(name) + 1, - sizeof(*desc)); - if (err) - return err; - - desc = devlink_health_buffer_get_desc_from_offset(buffer); - desc->attrtype = DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME; - desc->nla_type = NLA_NUL_STRING; - desc->len = strlen(name) + 1; - memcpy(&desc->value, name, desc->len); - devlink_health_buffer_offset_inc(buffer, sizeof(*desc) + desc->len); - - buffer->bytes_left_metadata -= sizeof(*desc); - buffer->bytes_left -= (strlen(name) + 1); - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_object_name); - -static int -devlink_health_buffer_put_value(struct devlink_health_buffer *buffer, - u8 nla_type, void *value, int len) -{ - struct devlink_health_buffer_desc *desc; - int err; - - err = devlink_health_buffer_verify_len(buffer, len, sizeof(*desc)); - if (err) - return err; - - desc = devlink_health_buffer_get_desc_from_offset(buffer); - desc->attrtype = DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA; - desc->nla_type = nla_type; - desc->len = len; - memcpy(&desc->value, value, len); - devlink_health_buffer_offset_inc(buffer, sizeof(*desc) + desc->len); - - buffer->bytes_left_metadata -= sizeof(*desc); - buffer->bytes_left -= len; - - return 0; -} - -int -devlink_health_buffer_put_value_u8(struct devlink_health_buffer *buffer, - u8 value) -{ - int err; - - err = devlink_health_buffer_put_value(buffer, NLA_U8, &value, - sizeof(value)); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u8); - -int -devlink_health_buffer_put_value_u32(struct devlink_health_buffer *buffer, - u32 value) -{ - int err; - - err = devlink_health_buffer_put_value(buffer, NLA_U32, &value, - sizeof(value)); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u32); - -int -devlink_health_buffer_put_value_u64(struct devlink_health_buffer *buffer, - u64 value) -{ - int err; - - err = devlink_health_buffer_put_value(buffer, NLA_U64, &value, - sizeof(value)); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_u64); - -int -devlink_health_buffer_put_value_string(struct devlink_health_buffer *buffer, - char *name) -{ - int err; - - if (strlen(name) + 1 > DEVLINK_HEALTH_BUFFER_MAX_CHUNK) - return -EINVAL; - - err = devlink_health_buffer_put_value(buffer, NLA_NUL_STRING, name, - strlen(name) + 1); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_string); - -int -devlink_health_buffer_put_value_data(struct devlink_health_buffer *buffer, - void *data, int len) -{ - int err; - - if (len > DEVLINK_HEALTH_BUFFER_MAX_CHUNK) - return -EINVAL; - - err = devlink_health_buffer_put_value(buffer, NLA_BINARY, data, len); - if (err) - return err; - - return 0; -} -EXPORT_SYMBOL_GPL(devlink_health_buffer_put_value_data); - -static int -devlink_health_buffer_fill_data(struct sk_buff *skb, - struct devlink_health_buffer_desc *desc) -{ - int err = -EINVAL; - - switch (desc->nla_type) { - case NLA_U8: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - *(u8 *)desc->value); - break; - case NLA_U32: - err = nla_put_u32(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - *(u32 *)desc->value); - break; - case NLA_U64: - err = nla_put_u64_64bit(skb, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - *(u64 *)desc->value, DEVLINK_ATTR_PAD); - break; - case NLA_NUL_STRING: - err = nla_put_string(skb, - DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - (char *)&desc->value); - break; - case NLA_BINARY: - err = nla_put(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA, - desc->len, (void *)&desc->value); - break; - } - - return err; -} - -static int -devlink_health_buffer_fill_type(struct sk_buff *skb, - struct devlink_health_buffer_desc *desc) -{ - int err = -EINVAL; - - switch (desc->nla_type) { - case NLA_U8: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_U8); - break; - case NLA_U32: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_U32); - break; - case NLA_U64: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_U64); - break; - case NLA_NUL_STRING: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_NUL_STRING); - break; - case NLA_BINARY: - err = nla_put_u8(skb, DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_TYPE, - NLA_BINARY); - break; - } - - return err; -} - -static inline struct devlink_health_buffer_desc * -devlink_health_buffer_get_next_desc(struct devlink_health_buffer_desc *desc) -{ - return (void *)&desc->value + desc->len; -} - -static int -devlink_health_buffer_prepare_skb(struct sk_buff *skb, - struct devlink_health_buffer *buffer) -{ - struct devlink_health_buffer_desc *last_desc, *desc; - struct nlattr **buffer_nlattr; - int err; - int i = 0; - - buffer_nlattr = kcalloc(buffer->max_nested_depth, - sizeof(*buffer_nlattr), GFP_KERNEL); - if (!buffer_nlattr) - return -EINVAL; - - last_desc = devlink_health_buffer_get_desc_from_offset(buffer); - desc = buffer->data; - while (desc != last_desc) { - switch (desc->attrtype) { - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT: - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_PAIR: - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE: - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_ARRAY: - buffer_nlattr[i] = nla_nest_start(skb, desc->attrtype); - if (!buffer_nlattr[i]) - goto nla_put_failure; - i++; - break; - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_VALUE_DATA: - err = devlink_health_buffer_fill_data(skb, desc); - if (err) - goto nla_put_failure; - err = devlink_health_buffer_fill_type(skb, desc); - if (err) - goto nla_put_failure; - break; - case DEVLINK_ATTR_HEALTH_BUFFER_OBJECT_NAME: - err = nla_put_string(skb, desc->attrtype, - (char *)&desc->value); - if (err) - goto nla_put_failure; - break; - default: - WARN_ON(!desc->nest_end); - WARN_ON(i <= 0); - if (desc->nest_end == DEVLINK_HEALTH_BUFFER_NEST_END) - nla_nest_end(skb, buffer_nlattr[--i]); - else - nla_nest_cancel(skb, buffer_nlattr[--i]); - break; - } - desc = devlink_health_buffer_get_next_desc(desc); - } - - return 0; - -nla_put_failure: - kfree(buffer_nlattr); - return err; -} - -static int -devlink_health_buffer_snd(struct genl_info *info, - enum devlink_command cmd, int flags, - struct devlink_health_buffer **buffers_array, - u64 num_of_buffers) -{ - struct sk_buff *skb; - struct nlmsghdr *nlh; - void *hdr; - int err; - u64 i; - - for (i = 0; i < num_of_buffers; i++) { - /* Skip buffer if driver did not fill it up with any data */ - if (!buffers_array[i]->offset) - continue; - - skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!skb) - return -ENOMEM; - - hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, - &devlink_nl_family, NLM_F_MULTI, cmd); - if (!hdr) - goto nla_put_failure; - - err = devlink_health_buffer_prepare_skb(skb, buffers_array[i]); - if (err) - goto nla_put_failure; - - genlmsg_end(skb, hdr); - err = genlmsg_reply(skb, info); - if (err) - return err; - } - - skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!skb) - return -ENOMEM; - nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, - NLMSG_DONE, 0, flags | NLM_F_MULTI); - if (!nlh) - goto nla_put_failure; - - err = genlmsg_reply(skb, info); - if (err) - return err; - - return 0; - -nla_put_failure: - err = -EIO; - nlmsg_free(skb); - return err; -} - -struct devlink_health_reporter { - struct list_head list; - struct devlink_health_buffer **dump_buffers_array; - struct mutex dump_lock; /* lock parallel read/write from dump buffers */ - struct devlink_health_buffer **diagnose_buffers_array; - struct mutex diagnose_lock; /* lock parallel read/write from diagnose buffers */ - void *priv; - const struct devlink_health_reporter_ops *ops; - struct devlink *devlink; - u64 graceful_period; - bool auto_recover; - u8 health_state; - u8 dump_avail; - u64 dump_ts; - u64 error_count; - u64 recovery_count; - u64 last_recovery_ts; -}; - -enum devlink_health_reporter_state { - DEVLINK_HEALTH_REPORTER_STATE_HEALTHY, - DEVLINK_HEALTH_REPORTER_STATE_ERROR, -}; - -void * -devlink_health_reporter_priv(struct devlink_health_reporter *reporter) -{ - return reporter->priv; -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_priv); - -static struct devlink_health_reporter * -devlink_health_reporter_find_by_name(struct devlink *devlink, - const char *reporter_name) -{ - struct devlink_health_reporter *reporter; - - list_for_each_entry(reporter, &devlink->reporter_list, list) - if (!strcmp(reporter->ops->name, reporter_name)) - return reporter; - return NULL; -} - -/** - * devlink_health_reporter_create - create devlink health reporter - * - * @devlink: devlink - * @ops: ops - * @graceful_period: to avoid recovery loops, in msecs - * @auto_recover: auto recover when error occurs - * @priv: priv - */ -struct devlink_health_reporter * -devlink_health_reporter_create(struct devlink *devlink, - const struct devlink_health_reporter_ops *ops, - u64 graceful_period, bool auto_recover, - void *priv) -{ - struct devlink_health_reporter *reporter; - - mutex_lock(&devlink->lock); - if (devlink_health_reporter_find_by_name(devlink, ops->name)) { - reporter = ERR_PTR(-EEXIST); - goto unlock; - } - - if (WARN_ON(ops->dump && !ops->dump_size) || - WARN_ON(ops->diagnose && !ops->diagnose_size) || - WARN_ON(auto_recover && !ops->recover) || - WARN_ON(graceful_period && !ops->recover)) { - reporter = ERR_PTR(-EINVAL); - goto unlock; - } - - reporter = kzalloc(sizeof(*reporter), GFP_KERNEL); - if (!reporter) { - reporter = ERR_PTR(-ENOMEM); - goto unlock; - } - - if (ops->dump) { - reporter->dump_buffers_array = - devlink_health_buffers_create(ops->dump_size); - if (!reporter->dump_buffers_array) { - kfree(reporter); - reporter = ERR_PTR(-ENOMEM); - goto unlock; - } - } - - if (ops->diagnose) { - reporter->diagnose_buffers_array = - devlink_health_buffers_create(ops->diagnose_size); - if (!reporter->diagnose_buffers_array) { - devlink_health_buffers_destroy(reporter->dump_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(ops->dump_size)); - kfree(reporter); - reporter = ERR_PTR(-ENOMEM); - goto unlock; - } - } - - list_add_tail(&reporter->list, &devlink->reporter_list); - mutex_init(&reporter->dump_lock); - mutex_init(&reporter->diagnose_lock); - - reporter->priv = priv; - reporter->ops = ops; - reporter->devlink = devlink; - reporter->graceful_period = graceful_period; - reporter->auto_recover = auto_recover; -unlock: - mutex_unlock(&devlink->lock); - return reporter; -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_create); - -/** - * devlink_health_reporter_destroy - destroy devlink health reporter - * - * @reporter: devlink health reporter to destroy - */ -void -devlink_health_reporter_destroy(struct devlink_health_reporter *reporter) -{ - mutex_lock(&reporter->devlink->lock); - list_del(&reporter->list); - devlink_health_buffers_destroy(reporter->dump_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size)); - devlink_health_buffers_destroy(reporter->diagnose_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->diagnose_size)); - kfree(reporter); - mutex_unlock(&reporter->devlink->lock); -} -EXPORT_SYMBOL_GPL(devlink_health_reporter_destroy); - -static int -devlink_health_reporter_recover(struct devlink_health_reporter *reporter, - void *priv_ctx) -{ - int err; - - if (!reporter->ops->recover) - return -EOPNOTSUPP; - - err = reporter->ops->recover(reporter, priv_ctx); - if (err) - return err; - - reporter->recovery_count++; - reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_HEALTHY; - reporter->last_recovery_ts = jiffies; - - return 0; -} - -static int devlink_health_do_dump(struct devlink_health_reporter *reporter, - void *priv_ctx) -{ - int err; - - if (!reporter->ops->dump) - return 0; - - if (reporter->dump_avail) - return 0; - - devlink_health_buffers_reset(reporter->dump_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size)); - err = reporter->ops->dump(reporter, reporter->dump_buffers_array, - DEVLINK_HEALTH_BUFFER_SIZE, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size), - priv_ctx); - if (!err) { - reporter->dump_avail = true; - reporter->dump_ts = jiffies; - } - - return err; -} - -int devlink_health_report(struct devlink_health_reporter *reporter, - const char *msg, void *priv_ctx) -{ - struct devlink *devlink = reporter->devlink; - int err = 0; - - /* write a log message of the current error */ - WARN_ON(!msg); - trace_devlink_health_report(devlink, reporter->ops->name, msg); - reporter->error_count++; - - /* abort if the previous error wasn't recovered */ - if (reporter->auto_recover && - (reporter->health_state != DEVLINK_HEALTH_REPORTER_STATE_HEALTHY || - jiffies - reporter->last_recovery_ts < - msecs_to_jiffies(reporter->graceful_period))) { - trace_devlink_health_recover_aborted(devlink, - reporter->ops->name, - reporter->health_state, - jiffies - - reporter->last_recovery_ts); - return -ECANCELED; - } - - reporter->health_state = DEVLINK_HEALTH_REPORTER_STATE_ERROR; - - mutex_lock(&reporter->dump_lock); - /* store current dump of current error, for later analysis */ - devlink_health_do_dump(reporter, priv_ctx); - mutex_unlock(&reporter->dump_lock); - - if (reporter->auto_recover) - err = devlink_health_reporter_recover(reporter, priv_ctx); - - return err; -} -EXPORT_SYMBOL_GPL(devlink_health_report); - -static struct devlink_health_reporter * -devlink_health_reporter_get_from_info(struct devlink *devlink, - struct genl_info *info) -{ - char *reporter_name; - - if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) - return NULL; - - reporter_name = - nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); - return devlink_health_reporter_find_by_name(devlink, reporter_name); -} - -static int -devlink_nl_health_reporter_fill(struct sk_buff *msg, - struct devlink *devlink, - struct devlink_health_reporter *reporter, - enum devlink_command cmd, u32 portid, - u32 seq, int flags) -{ - struct nlattr *reporter_attr; - void *hdr; - - hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); - if (!hdr) - return -EMSGSIZE; - - if (devlink_nl_put_handle(msg, devlink)) - goto genlmsg_cancel; - - reporter_attr = nla_nest_start(msg, DEVLINK_ATTR_HEALTH_REPORTER); - if (!reporter_attr) - goto genlmsg_cancel; - if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, - reporter->ops->name)) - goto reporter_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, - reporter->health_state)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR, - reporter->error_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER, - reporter->recovery_count, DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, - reporter->graceful_period, - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, - reporter->auto_recover)) - goto reporter_nest_cancel; - if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_AVAIL, - reporter->dump_avail)) - goto reporter_nest_cancel; - if (reporter->dump_avail && - nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, - jiffies_to_msecs(reporter->dump_ts), - DEVLINK_ATTR_PAD)) - goto reporter_nest_cancel; - - nla_nest_end(msg, reporter_attr); - genlmsg_end(msg, hdr); - return 0; - -reporter_nest_cancel: - nla_nest_end(msg, reporter_attr); -genlmsg_cancel: - genlmsg_cancel(msg, hdr); - return -EMSGSIZE; -} - -static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - struct sk_buff *msg; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) - return -ENOMEM; - - err = devlink_nl_health_reporter_fill(msg, devlink, reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - info->snd_portid, info->snd_seq, - 0); - if (err) { - nlmsg_free(msg); - return err; - } - - return genlmsg_reply(msg, info); -} - -static int -devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, - struct netlink_callback *cb) -{ - struct devlink_health_reporter *reporter; - struct devlink *devlink; - int start = cb->args[0]; - int idx = 0; - int err; - - mutex_lock(&devlink_mutex); - list_for_each_entry(devlink, &devlink_list, list) { - if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) - continue; - mutex_lock(&devlink->lock); - list_for_each_entry(reporter, &devlink->reporter_list, - list) { - if (idx < start) { - idx++; - continue; - } - err = devlink_nl_health_reporter_fill(msg, devlink, - reporter, - DEVLINK_CMD_HEALTH_REPORTER_GET, - NETLINK_CB(cb->skb).portid, - cb->nlh->nlmsg_seq, - NLM_F_MULTI); - if (err) { - mutex_unlock(&devlink->lock); - goto out; - } - idx++; - } - mutex_unlock(&devlink->lock); - } -out: - mutex_unlock(&devlink_mutex); - - cb->args[0] = idx; - return msg->len; -} - -static int -devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->recover && - (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || - info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) - return -EINVAL; - - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) - reporter->graceful_period = - nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); - - if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) - reporter->auto_recover = - nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); - - return 0; -} - -static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - return devlink_health_reporter_recover(reporter, NULL); -} - -static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - u64 num_of_buffers; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->diagnose) - return -EOPNOTSUPP; - - num_of_buffers = - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->diagnose_size); - - mutex_lock(&reporter->diagnose_lock); - devlink_health_buffers_reset(reporter->diagnose_buffers_array, - num_of_buffers); - - err = reporter->ops->diagnose(reporter, - reporter->diagnose_buffers_array, - DEVLINK_HEALTH_BUFFER_SIZE, - num_of_buffers); - if (err) - goto out; - - err = devlink_health_buffer_snd(info, - DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, - 0, reporter->diagnose_buffers_array, - num_of_buffers); - if (err) - goto out; - - mutex_unlock(&reporter->diagnose_lock); - return 0; - -out: - mutex_unlock(&reporter->diagnose_lock); - return err; -} - -static void -devlink_health_dump_clear(struct devlink_health_reporter *reporter) -{ - reporter->dump_avail = false; - reporter->dump_ts = 0; - devlink_health_buffers_reset(reporter->dump_buffers_array, - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size)); -} - -static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - u64 num_of_buffers; - int err; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - if (!reporter->ops->dump) - return -EOPNOTSUPP; - - num_of_buffers = - DEVLINK_HEALTH_SIZE_TO_BUFFERS(reporter->ops->dump_size); - - mutex_lock(&reporter->dump_lock); - err = devlink_health_do_dump(reporter, NULL); - if (err) - goto out; - - err = devlink_health_buffer_snd(info, - DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - 0, reporter->dump_buffers_array, - num_of_buffers); - -out: - mutex_unlock(&reporter->dump_lock); - return err; -} - -static int -devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, - struct genl_info *info) -{ - struct devlink *devlink = info->user_ptr[0]; - struct devlink_health_reporter *reporter; - - reporter = devlink_health_reporter_get_from_info(devlink, info); - if (!reporter) - return -EINVAL; - - mutex_lock(&reporter->dump_lock); - devlink_health_dump_clear(reporter); - mutex_unlock(&reporter->dump_lock); - return 0; -} - static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -4631,9 +3622,6 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, - [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, - [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, - [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -4854,51 +3842,6 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, - .doit = devlink_nl_cmd_health_reporter_get_doit, - .dumpit = devlink_nl_cmd_health_reporter_get_dumpit, - .policy = devlink_nl_policy, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, - /* can be retrieved by unprivileged users */ - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_SET, - .doit = devlink_nl_cmd_health_reporter_set_doit, - .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER, - .doit = devlink_nl_cmd_health_reporter_recover_doit, - .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, - .doit = devlink_nl_cmd_health_reporter_diagnose_doit, - .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, - .doit = devlink_nl_cmd_health_reporter_dump_get_doit, - .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, - }, - { - .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, - .doit = devlink_nl_cmd_health_reporter_dump_clear_doit, - .policy = devlink_nl_policy, - .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NO_LOCK, - }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -4939,7 +3882,6 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->resource_list); INIT_LIST_HEAD(&devlink->param_list); INIT_LIST_HEAD(&devlink->region_list); - INIT_LIST_HEAD(&devlink->reporter_list); mutex_init(&devlink->lock); return devlink; } -- cgit v1.2.3 From fe4943702c850fa07f963eaa6f1530d9d4c2da78 Mon Sep 17 00:00:00 2001 From: Srinivas Dasari Date: Wed, 23 Jan 2019 18:06:56 +0530 Subject: cfg80211: Authentication offload to user space in AP mode commit 40cbfa90218b ("cfg80211/nl80211: Optional authentication offload to userspace")' introduced authentication offload to user space by the host drivers in station mode. This commit extends the same for the AP mode too. Extend NL80211_ATTR_EXTERNAL_AUTH_SUPPORT to also claim the support of external authentication from the user space in AP mode. A new flag parameter is introduced in cfg80211_ap_settings to intend the same while "start ap". Host driver to use NL80211_CMD_FRAME interface to transmit and receive the authentication frames to / from the user space. Host driver to indicate the flag NL80211_RXMGMT_FLAG_EXTERNAL_AUTH while sending the authentication frame to the user space. This intends to the user space that the driver wishes it to process the authentication frame for certain protocols, though it had initially advertised the support for SME functionality. User space shall accordingly do the authentication and indicate its final status through the command NL80211_CMD_EXTERNAL_AUTH. Allow the command even if userspace doesn't include the attribute NL80211_ATTR_SSID for AP interface. Host driver shall continue with the association sequence and indicate the STA connection status through cfg80211_new_sta. To facilitate the host drivers in AP mode for matching the pmkid by the stations during the association, NL80211_CMD_EXTERNAL_AUTH is also enhanced to include the pmkid to drivers after the authentication. This pmkid can also be used in the STA mode to include in the association request. Also modify nl80211_external_auth to not mandate SSID in AP mode. Signed-off-by: Srinivas Dasari [remove useless nla_get_flag() usage] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 15 +++++++++++++++ include/uapi/linux/nl80211.h | 13 +++++++++---- net/wireless/nl80211.c | 25 ++++++++++++++++++------- 3 files changed, 42 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index b61ac6e9de08..7033c90850b0 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -835,6 +835,17 @@ struct cfg80211_bitrate_mask { } control[NUM_NL80211_BANDS]; }; +/** + * enum cfg80211_ap_settings_flags - AP settings flags + * + * Used by cfg80211_ap_settings + * + * @AP_SETTINGS_EXTERNAL_AUTH_SUPPORT: AP supports external authentication + */ +enum cfg80211_ap_settings_flags { + AP_SETTINGS_EXTERNAL_AUTH_SUPPORT = BIT(0), +}; + /** * struct cfg80211_ap_settings - AP configuration * @@ -865,6 +876,7 @@ struct cfg80211_bitrate_mask { * @he_cap: HE capabilities (or %NULL if HE isn't enabled) * @ht_required: stations must support HT * @vht_required: stations must support VHT + * @flags: flags, as defined in enum cfg80211_ap_settings_flags */ struct cfg80211_ap_settings { struct cfg80211_chan_def chandef; @@ -890,6 +902,7 @@ struct cfg80211_ap_settings { const struct ieee80211_vht_cap *vht_cap; const struct ieee80211_he_cap_elem *he_cap; bool ht_required, vht_required; + u32 flags; }; /** @@ -2831,6 +2844,7 @@ struct cfg80211_pmk_conf { * use %WLAN_STATUS_UNSPECIFIED_FAILURE if user space cannot give you * the real status code for failures. Used only for the authentication * response command interface (user space to driver). + * @pmkid: The identifier to refer a PMKSA. */ struct cfg80211_external_auth_params { enum nl80211_external_auth_action action; @@ -2838,6 +2852,7 @@ struct cfg80211_external_auth_params { struct cfg80211_ssid ssid; unsigned int key_mgmt_suite; u16 status; + const u8 *pmkid; }; /** diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5f9d5cd458a1..8b0fdb9e133b 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2266,10 +2266,10 @@ enum nl80211_commands { * &enum nl80211_external_auth_action value). This is used with the * %NL80211_CMD_EXTERNAL_AUTH request event. * @NL80211_ATTR_EXTERNAL_AUTH_SUPPORT: Flag attribute indicating that the user - * space supports external authentication. This attribute shall be used - * only with %NL80211_CMD_CONNECT request. The driver may offload - * authentication processing to user space if this capability is indicated - * in NL80211_CMD_CONNECT requests from the user space. + * space supports external authentication. This attribute shall be used + * with %NL80211_CMD_CONNECT and %NL80211_CMD_START_AP request. The driver + * may offload authentication processing to user space if this capability + * is indicated in the respective requests from the user space. * * @NL80211_ATTR_NSS: Station's New/updated RX_NSS value notified using this * u8 attribute. This is used with %NL80211_CMD_STA_OPMODE_CHANGED. @@ -5631,9 +5631,14 @@ enum nl80211_crit_proto_id { * Used by cfg80211_rx_mgmt() * * @NL80211_RXMGMT_FLAG_ANSWERED: frame was answered by device/driver. + * @NL80211_RXMGMT_FLAG_EXTERNAL_AUTH: Host driver intends to offload + * the authentication. Exclusively defined for host drivers that + * advertises the SME functionality but would like the userspace + * to handle certain authentication algorithms (e.g. SAE). */ enum nl80211_rxmgmt_flags { NL80211_RXMGMT_FLAG_ANSWERED = 1 << 0, + NL80211_RXMGMT_FLAG_EXTERNAL_AUTH = 1 << 1, }; /* diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index eb4437fa0539..dc96077afe5e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4550,6 +4550,9 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) nl80211_calculate_ap_params(¶ms); + if (info->attrs[NL80211_ATTR_EXTERNAL_AUTH_SUPPORT]) + params.flags |= AP_SETTINGS_EXTERNAL_AUTH_SUPPORT; + wdev_lock(wdev); err = rdev_start_ap(rdev, dev, ¶ms); if (!err) { @@ -13086,7 +13089,9 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->external_auth) return -EOPNOTSUPP; - if (!info->attrs[NL80211_ATTR_SSID]) + if (!info->attrs[NL80211_ATTR_SSID] && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EINVAL; if (!info->attrs[NL80211_ATTR_BSSID]) @@ -13097,18 +13102,24 @@ static int nl80211_external_auth(struct sk_buff *skb, struct genl_info *info) memset(¶ms, 0, sizeof(params)); - params.ssid.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); - if (params.ssid.ssid_len == 0 || - params.ssid.ssid_len > IEEE80211_MAX_SSID_LEN) - return -EINVAL; - memcpy(params.ssid.ssid, nla_data(info->attrs[NL80211_ATTR_SSID]), - params.ssid.ssid_len); + if (info->attrs[NL80211_ATTR_SSID]) { + params.ssid.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + if (params.ssid.ssid_len == 0 || + params.ssid.ssid_len > IEEE80211_MAX_SSID_LEN) + return -EINVAL; + memcpy(params.ssid.ssid, + nla_data(info->attrs[NL80211_ATTR_SSID]), + params.ssid.ssid_len); + } memcpy(params.bssid, nla_data(info->attrs[NL80211_ATTR_BSSID]), ETH_ALEN); params.status = nla_get_u16(info->attrs[NL80211_ATTR_STATUS_CODE]); + if (info->attrs[NL80211_ATTR_PMKID]) + params.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]); + return rdev_external_auth(rdev, dev, ¶ms); } -- cgit v1.2.3 From 6c900360e7c0df6a4846ac97d7b548d72cd801b0 Mon Sep 17 00:00:00 2001 From: Liangwei Dong Date: Fri, 18 Jan 2019 16:54:38 +0530 Subject: nl80211: Allow set/del pmksa operations for AP Host drivers may offload authentication to the user space through the commit ("cfg80211: Authentication offload to user space in AP mode"). This interface can be used to implement SAE by having the userspace do authentication/PMKID key derivation and driver handle the association. A step ahead, this interface can get further optimized if the PMKID is passed to the host driver and also have it respond to the association request by the STA on a valid PMKID. This commit enables the userspace to pass the PMKID to the host drivers through the set/del pmksa operations in AP mode. Set/Del pmksa is now restricted to STA/P2P client mode only and thus the drivers might not expect them in any other(AP) mode. This commit also introduces a feature flag NL80211_EXT_FEATURE_AP_PMKSA_CACHING (johannes: renamed) to maintain the backward compatibility of such an expectation by the host drivers. These operations are allowed in AP mode only when the drivers advertize the capability through this flag. Signed-off-by: Liangwei Dong Signed-off-by: Srinivas Dasari [rename flag to NL80211_EXT_FEATURE_AP_PMKSA_CACHING] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 8b0fdb9e133b..dd4f86ee286e 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5340,6 +5340,9 @@ enum nl80211_feature_flags { * fairness for transmitted packets and has enabled airtime fairness * scheduling. * + * @NL80211_EXT_FEATURE_AP_PMKSA_CACHING: Driver/device supports PMKSA caching + * (set/del PMKSA operations) in AP mode. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5380,6 +5383,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_CAN_REPLACE_PTK0, NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER, NL80211_EXT_FEATURE_AIRTIME_FAIRNESS, + NL80211_EXT_FEATURE_AP_PMKSA_CACHING, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index dc96077afe5e..af89e5c9fd0a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9899,7 +9899,10 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) } if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT && + !(dev->ieee80211_ptr->iftype == NL80211_IFTYPE_AP && + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_AP_PMKSA_CACHING))) return -EOPNOTSUPP; switch (info->genlhdr->cmd) { -- cgit v1.2.3 From d405c7407a5468d4fc11724d76063e0647d80106 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Sat, 26 Jan 2019 12:25:59 -0500 Subject: bpf: allocate 0x06 to new eBPF instruction class JMP32 The new eBPF instruction class JMP32 uses the reserved class number 0x6. Kernel BPF ISA documentation updated accordingly. Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Signed-off-by: Alexei Starovoitov --- Documentation/networking/filter.txt | 15 ++++++++------- include/uapi/linux/bpf.h | 1 + tools/include/uapi/linux/bpf.h | 1 + 3 files changed, 10 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/filter.txt b/Documentation/networking/filter.txt index 2196b824e96c..01603bc2eff1 100644 --- a/Documentation/networking/filter.txt +++ b/Documentation/networking/filter.txt @@ -865,7 +865,7 @@ Three LSB bits store instruction class which is one of: BPF_STX 0x03 BPF_STX 0x03 BPF_ALU 0x04 BPF_ALU 0x04 BPF_JMP 0x05 BPF_JMP 0x05 - BPF_RET 0x06 [ class 6 unused, for future if needed ] + BPF_RET 0x06 BPF_JMP32 0x06 BPF_MISC 0x07 BPF_ALU64 0x07 When BPF_CLASS(code) == BPF_ALU or BPF_JMP, 4th bit encodes source operand ... @@ -902,9 +902,9 @@ If BPF_CLASS(code) == BPF_ALU or BPF_ALU64 [ in eBPF ], BPF_OP(code) is one of: BPF_ARSH 0xc0 /* eBPF only: sign extending shift right */ BPF_END 0xd0 /* eBPF only: endianness conversion */ -If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of: +If BPF_CLASS(code) == BPF_JMP or BPF_JMP32 [ in eBPF ], BPF_OP(code) is one of: - BPF_JA 0x00 + BPF_JA 0x00 /* BPF_JMP only */ BPF_JEQ 0x10 BPF_JGT 0x20 BPF_JGE 0x30 @@ -912,8 +912,8 @@ If BPF_CLASS(code) == BPF_JMP, BPF_OP(code) is one of: BPF_JNE 0x50 /* eBPF only: jump != */ BPF_JSGT 0x60 /* eBPF only: signed '>' */ BPF_JSGE 0x70 /* eBPF only: signed '>=' */ - BPF_CALL 0x80 /* eBPF only: function call */ - BPF_EXIT 0x90 /* eBPF only: function return */ + BPF_CALL 0x80 /* eBPF BPF_JMP only: function call */ + BPF_EXIT 0x90 /* eBPF BPF_JMP only: function return */ BPF_JLT 0xa0 /* eBPF only: unsigned '<' */ BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */ BPF_JSLT 0xc0 /* eBPF only: signed '<' */ @@ -936,8 +936,9 @@ Classic BPF wastes the whole BPF_RET class to represent a single 'ret' operation. Classic BPF_RET | BPF_K means copy imm32 into return register and perform function exit. eBPF is modeled to match CPU, so BPF_JMP | BPF_EXIT in eBPF means function exit only. The eBPF program needs to store return -value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is currently -unused and reserved for future use. +value into register R0 before doing a BPF_EXIT. Class 6 in eBPF is used as +BPF_JMP32 to mean exactly the same operations as BPF_JMP, but with 32-bit wide +operands for the comparisons instead. For load and store instructions the 8-bit 'code' field is divided as: diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2940a9854f6d..60b99b730a41 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -14,6 +14,7 @@ /* Extended instruction set based on top of classic BPF */ /* instruction classes */ +#define BPF_JMP32 0x06 /* jmp mode in word width */ #define BPF_ALU64 0x07 /* alu mode in double word width */ /* ld/ldx fields */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 2940a9854f6d..60b99b730a41 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -14,6 +14,7 @@ /* Extended instruction set based on top of classic BPF */ /* instruction classes */ +#define BPF_JMP32 0x06 /* jmp mode in word width */ #define BPF_ALU64 0x07 /* alu mode in double word width */ /* ld/ldx fields */ -- cgit v1.2.3 From 71368af9027f18fe5d1c6f372cfdff7e4bde8b48 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 16 Jan 2019 17:01:36 -0500 Subject: x86/speculation: Add PR_SPEC_DISABLE_NOEXEC With the default SPEC_STORE_BYPASS_SECCOMP/SPEC_STORE_BYPASS_PRCTL mode, the TIF_SSBD bit will be inherited when a new task is fork'ed or cloned. It will also remain when a new program is execve'ed. Only certain class of applications (like Java) that can run on behalf of multiple users on a single thread will require disabling speculative store bypass for security purposes. Those applications will call prctl(2) at startup time to disable SSB. They won't rely on the fact the SSB might have been disabled. Other applications that don't need SSBD will just move on without checking if SSBD has been turned on or not. The fact that the TIF_SSBD is inherited across execve(2) boundary will cause performance of applications that don't need SSBD but their predecessors have SSBD on to be unwittingly impacted especially if they write to memory a lot. To remedy this problem, a new PR_SPEC_DISABLE_NOEXEC argument for the PR_SET_SPECULATION_CTRL option of prctl(2) is added to allow applications to specify that the SSBD feature bit on the task structure should be cleared whenever a new program is being execve'ed. Suggested-by: Thomas Gleixner Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Cc: Jonathan Corbet Cc: linux-doc@vger.kernel.org Cc: "H. Peter Anvin" Cc: Andi Kleen Cc: David Woodhouse Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Tim Chen Cc: KarimAllah Ahmed Cc: Peter Zijlstra Cc: Konrad Rzeszutek Wilk Link: https://lkml.kernel.org/r/1547676096-3281-1-git-send-email-longman@redhat.com --- Documentation/userspace-api/spec_ctrl.rst | 27 +++++++++++++++------------ arch/x86/kernel/cpu/bugs.c | 12 ++++++++++++ arch/x86/kernel/process.c | 12 ++++++++++++ include/linux/sched.h | 5 +++++ include/uapi/linux/prctl.h | 1 + tools/include/uapi/linux/prctl.h | 1 + 6 files changed, 46 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/userspace-api/spec_ctrl.rst b/Documentation/userspace-api/spec_ctrl.rst index c4dbe6f7cdae..1129c7550a48 100644 --- a/Documentation/userspace-api/spec_ctrl.rst +++ b/Documentation/userspace-api/spec_ctrl.rst @@ -28,18 +28,20 @@ PR_GET_SPECULATION_CTRL returns the state of the speculation misfeature which is selected with arg2 of prctl(2). The return value uses bits 0-3 with the following meaning: -==== ===================== =================================================== -Bit Define Description -==== ===================== =================================================== -0 PR_SPEC_PRCTL Mitigation can be controlled per task by - PR_SET_SPECULATION_CTRL. -1 PR_SPEC_ENABLE The speculation feature is enabled, mitigation is - disabled. -2 PR_SPEC_DISABLE The speculation feature is disabled, mitigation is - enabled. -3 PR_SPEC_FORCE_DISABLE Same as PR_SPEC_DISABLE, but cannot be undone. A - subsequent prctl(..., PR_SPEC_ENABLE) will fail. -==== ===================== =================================================== +==== ====================== ================================================== +Bit Define Description +==== ====================== ================================================== +0 PR_SPEC_PRCTL Mitigation can be controlled per task by + PR_SET_SPECULATION_CTRL. +1 PR_SPEC_ENABLE The speculation feature is enabled, mitigation is + disabled. +2 PR_SPEC_DISABLE The speculation feature is disabled, mitigation is + enabled. +3 PR_SPEC_FORCE_DISABLE Same as PR_SPEC_DISABLE, but cannot be undone. A + subsequent prctl(..., PR_SPEC_ENABLE) will fail. +4 PR_SPEC_DISABLE_NOEXEC Same as PR_SPEC_DISABLE, but the state will be + cleared on :manpage:`execve(2)`. +==== ====================== ================================================== If all bits are 0 the CPU is not affected by the speculation misfeature. @@ -92,6 +94,7 @@ Speculation misfeature controls * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_ENABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE, 0, 0); * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_FORCE_DISABLE, 0, 0); + * prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS, PR_SPEC_DISABLE_NOEXEC, 0, 0); - PR_SPEC_INDIR_BRANCH: Indirect Branch Speculation in User Processes (Mitigate Spectre V2 style attacks against user processes) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1de0f4170178..2faeaf46347a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -798,15 +798,25 @@ static int ssb_prctl_set(struct task_struct *task, unsigned long ctrl) if (task_spec_ssb_force_disable(task)) return -EPERM; task_clear_spec_ssb_disable(task); + task_clear_spec_ssb_noexec(task); task_update_spec_tif(task); break; case PR_SPEC_DISABLE: task_set_spec_ssb_disable(task); + task_clear_spec_ssb_noexec(task); task_update_spec_tif(task); break; case PR_SPEC_FORCE_DISABLE: task_set_spec_ssb_disable(task); task_set_spec_ssb_force_disable(task); + task_clear_spec_ssb_noexec(task); + task_update_spec_tif(task); + break; + case PR_SPEC_DISABLE_NOEXEC: + if (task_spec_ssb_force_disable(task)) + return -EPERM; + task_set_spec_ssb_disable(task); + task_set_spec_ssb_noexec(task); task_update_spec_tif(task); break; default: @@ -885,6 +895,8 @@ static int ssb_prctl_get(struct task_struct *task) case SPEC_STORE_BYPASS_PRCTL: if (task_spec_ssb_force_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_FORCE_DISABLE; + if (task_spec_ssb_noexec(task)) + return PR_SPEC_PRCTL | PR_SPEC_DISABLE_NOEXEC; if (task_spec_ssb_disable(task)) return PR_SPEC_PRCTL | PR_SPEC_DISABLE; return PR_SPEC_PRCTL | PR_SPEC_ENABLE; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 90ae0ca51083..58ac7be52c7a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -255,6 +255,18 @@ void arch_setup_new_exec(void) /* If cpuid was previously disabled for this task, re-enable it. */ if (test_thread_flag(TIF_NOCPUID)) enable_cpuid(); + + /* + * Don't inherit TIF_SSBD across exec boundary when + * PR_SPEC_DISABLE_NOEXEC is used. + */ + if (test_thread_flag(TIF_SSBD) && + task_spec_ssb_noexec(current)) { + clear_thread_flag(TIF_SSBD); + task_clear_spec_ssb_disable(current); + task_clear_spec_ssb_noexec(current); + speculation_ctrl_update(task_thread_info(current)->flags); + } } static inline void switch_to_bitmap(struct thread_struct *prev, diff --git a/include/linux/sched.h b/include/linux/sched.h index d2f90fa92468..fc836dc71bba 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1459,6 +1459,7 @@ static inline bool is_percpu_thread(void) #define PFA_SPEC_SSB_FORCE_DISABLE 4 /* Speculative Store Bypass force disabled*/ #define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */ #define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */ +#define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */ #define TASK_PFA_TEST(name, func) \ static inline bool task_##func(struct task_struct *p) \ @@ -1487,6 +1488,10 @@ TASK_PFA_TEST(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_SET(SPEC_SSB_DISABLE, spec_ssb_disable) TASK_PFA_CLEAR(SPEC_SSB_DISABLE, spec_ssb_disable) +TASK_PFA_TEST(SPEC_SSB_NOEXEC, spec_ssb_noexec) +TASK_PFA_SET(SPEC_SSB_NOEXEC, spec_ssb_noexec) +TASK_PFA_CLEAR(SPEC_SSB_NOEXEC, spec_ssb_noexec) + TASK_PFA_TEST(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) TASK_PFA_SET(SPEC_SSB_FORCE_DISABLE, spec_ssb_force_disable) diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index b4875a93363a..094bb03b9cc2 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -219,6 +219,7 @@ struct prctl_mm_map { # define PR_SPEC_ENABLE (1UL << 1) # define PR_SPEC_DISABLE (1UL << 2) # define PR_SPEC_FORCE_DISABLE (1UL << 3) +# define PR_SPEC_DISABLE_NOEXEC (1UL << 4) /* Reset arm64 pointer authentication keys */ #define PR_PAC_RESET_KEYS 54 diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index b4875a93363a..094bb03b9cc2 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -219,6 +219,7 @@ struct prctl_mm_map { # define PR_SPEC_ENABLE (1UL << 1) # define PR_SPEC_DISABLE (1UL << 2) # define PR_SPEC_FORCE_DISABLE (1UL << 3) +# define PR_SPEC_DISABLE_NOEXEC (1UL << 4) /* Reset arm64 pointer authentication keys */ #define PR_PAC_RESET_KEYS 54 -- cgit v1.2.3 From 1194c4133195dfcb6c5fc0935d54bbed872a5285 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Tue, 29 Jan 2019 00:56:17 +0000 Subject: nfit: Add Hyper-V NVDIMM DSM command set to white list Add the Hyper-V _DSM command set to the white list of NVDIMM command sets. This command set is documented at http://www.uefi.org/RFIC_LIST (see "Virtual NVDIMM 0x1901"). Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Signed-off-by: Dan Williams --- drivers/acpi/nfit/core.c | 17 ++++++++++++++--- drivers/acpi/nfit/nfit.h | 6 +++++- include/uapi/linux/ndctl.h | 1 + 3 files changed, 20 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/acpi/nfit/core.c b/drivers/acpi/nfit/core.c index 1598e3a121a6..4a7e8b1fa43b 100644 --- a/drivers/acpi/nfit/core.c +++ b/drivers/acpi/nfit/core.c @@ -1846,9 +1846,17 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, dev_set_drvdata(&adev_dimm->dev, nfit_mem); /* - * Until standardization materializes we need to consider 4 - * different command sets. Note, that checking for function0 (bit0) - * tells us if any commands are reachable through this GUID. + * There are 4 "legacy" NVDIMM command sets + * (NVDIMM_FAMILY_{INTEL,MSFT,HPE1,HPE2}) that were created before + * an EFI working group was established to constrain this + * proliferation. The nfit driver probes for the supported command + * set by GUID. Note, if you're a platform developer looking to add + * a new command set to this probe, consider using an existing set, + * or otherwise seek approval to publish the command set at + * http://www.uefi.org/RFIC_LIST. + * + * Note, that checking for function0 (bit0) tells us if any commands + * are reachable through this GUID. */ for (i = 0; i <= NVDIMM_FAMILY_MAX; i++) if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) @@ -1871,6 +1879,8 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc, dsm_mask &= ~(1 << 8); } else if (nfit_mem->family == NVDIMM_FAMILY_MSFT) { dsm_mask = 0xffffffff; + } else if (nfit_mem->family == NVDIMM_FAMILY_HYPERV) { + dsm_mask = 0x1f; } else { dev_dbg(dev, "unknown dimm command family\n"); nfit_mem->family = -1; @@ -3714,6 +3724,7 @@ static __init int nfit_init(void) guid_parse(UUID_NFIT_DIMM_N_HPE1, &nfit_uuid[NFIT_DEV_DIMM_N_HPE1]); guid_parse(UUID_NFIT_DIMM_N_HPE2, &nfit_uuid[NFIT_DEV_DIMM_N_HPE2]); guid_parse(UUID_NFIT_DIMM_N_MSFT, &nfit_uuid[NFIT_DEV_DIMM_N_MSFT]); + guid_parse(UUID_NFIT_DIMM_N_HYPERV, &nfit_uuid[NFIT_DEV_DIMM_N_HYPERV]); nfit_wq = create_singlethread_workqueue("nfit"); if (!nfit_wq) diff --git a/drivers/acpi/nfit/nfit.h b/drivers/acpi/nfit/nfit.h index 33691aecfcee..4de167b4f76f 100644 --- a/drivers/acpi/nfit/nfit.h +++ b/drivers/acpi/nfit/nfit.h @@ -34,11 +34,14 @@ /* https://msdn.microsoft.com/library/windows/hardware/mt604741 */ #define UUID_NFIT_DIMM_N_MSFT "1ee68b36-d4bd-4a1a-9a16-4f8e53d46e05" +/* http://www.uefi.org/RFIC_LIST (see "Virtual NVDIMM 0x1901") */ +#define UUID_NFIT_DIMM_N_HYPERV "5746c5f2-a9a2-4264-ad0e-e4ddc9e09e80" + #define ACPI_NFIT_MEM_FAILED_MASK (ACPI_NFIT_MEM_SAVE_FAILED \ | ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \ | ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED) -#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_MSFT +#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_HYPERV #define NVDIMM_STANDARD_CMDMASK \ (1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \ @@ -94,6 +97,7 @@ enum nfit_uuids { NFIT_DEV_DIMM_N_HPE1 = NVDIMM_FAMILY_HPE1, NFIT_DEV_DIMM_N_HPE2 = NVDIMM_FAMILY_HPE2, NFIT_DEV_DIMM_N_MSFT = NVDIMM_FAMILY_MSFT, + NFIT_DEV_DIMM_N_HYPERV = NVDIMM_FAMILY_HYPERV, NFIT_SPA_VOLATILE, NFIT_SPA_PM, NFIT_SPA_DCR, diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index f57c9e434d2d..de5d90212409 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -243,6 +243,7 @@ struct nd_cmd_pkg { #define NVDIMM_FAMILY_HPE1 1 #define NVDIMM_FAMILY_HPE2 2 #define NVDIMM_FAMILY_MSFT 3 +#define NVDIMM_FAMILY_HYPERV 4 #define ND_IOCTL_CALL _IOWR(ND_IOCTL, ND_CMD_CALL,\ struct nd_cmd_pkg) -- cgit v1.2.3 From f4601dee25d5fe8010023552b10879f3d62e45ce Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Mon, 28 Jan 2019 18:00:21 +0530 Subject: devlink: Add port param get command Add port param get command which gets data per parameter. It also has option to dump the parameters data per port. v7->v8: Append "Acked-by: Jiri Pirko " Cc: Jiri Pirko Signed-off-by: Vasundhara Volam Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 2 + net/core/devlink.c | 102 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 97 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 6e52d3660654..448973beac9d 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -89,6 +89,8 @@ enum devlink_command { DEVLINK_CMD_REGION_DEL, DEVLINK_CMD_REGION_READ, + DEVLINK_CMD_PORT_PARAM_GET, /* can dump */ + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 diff --git a/net/core/devlink.c b/net/core/devlink.c index 371481ca2afd..66313dcdbdca 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2843,6 +2843,7 @@ nla_put_failure: } static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, + unsigned int port_index, struct devlink_param_item *param_item, enum devlink_command cmd, u32 portid, u32 seq, int flags) @@ -2880,6 +2881,11 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, if (devlink_nl_put_handle(msg, devlink)) goto genlmsg_cancel; + + if (cmd == DEVLINK_CMD_PORT_PARAM_GET) + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index)) + goto genlmsg_cancel; + param_attr = nla_nest_start(msg, DEVLINK_ATTR_PARAM); if (!param_attr) goto genlmsg_cancel; @@ -2933,7 +2939,7 @@ static void devlink_param_notify(struct devlink *devlink, msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - err = devlink_nl_param_fill(msg, devlink, param_item, cmd, 0, 0, 0); + err = devlink_nl_param_fill(msg, devlink, 0, param_item, cmd, 0, 0, 0); if (err) { nlmsg_free(msg); return; @@ -2962,7 +2968,7 @@ static int devlink_nl_cmd_param_get_dumpit(struct sk_buff *msg, idx++; continue; } - err = devlink_nl_param_fill(msg, devlink, param_item, + err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, @@ -3051,7 +3057,7 @@ devlink_param_value_get_from_info(const struct devlink_param *param, } static struct devlink_param_item * -devlink_param_get_from_info(struct devlink *devlink, +devlink_param_get_from_info(struct list_head *param_list, struct genl_info *info) { char *param_name; @@ -3060,7 +3066,7 @@ devlink_param_get_from_info(struct devlink *devlink, return NULL; param_name = nla_data(info->attrs[DEVLINK_ATTR_PARAM_NAME]); - return devlink_param_find_by_name(&devlink->param_list, param_name); + return devlink_param_find_by_name(param_list, param_name); } static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, @@ -3071,7 +3077,7 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, struct sk_buff *msg; int err; - param_item = devlink_param_get_from_info(devlink, info); + param_item = devlink_param_get_from_info(&devlink->param_list, info); if (!param_item) return -EINVAL; @@ -3079,7 +3085,7 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, if (!msg) return -ENOMEM; - err = devlink_nl_param_fill(msg, devlink, param_item, + err = devlink_nl_param_fill(msg, devlink, 0, param_item, DEVLINK_CMD_PARAM_GET, info->snd_portid, info->snd_seq, 0); if (err) { @@ -3102,7 +3108,7 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, union devlink_param_value value; int err = 0; - param_item = devlink_param_get_from_info(devlink, info); + param_item = devlink_param_get_from_info(&devlink->param_list, info); if (!param_item) return -EINVAL; param = param_item->param; @@ -3183,6 +3189,80 @@ static void devlink_param_unregister_one(struct devlink *devlink, kfree(param_item); } +static int devlink_nl_cmd_port_param_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_param_item *param_item; + struct devlink_port *devlink_port; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(devlink_port, &devlink->port_list, list) { + list_for_each_entry(param_item, + &devlink_port->param_list, list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_param_fill(msg, + devlink_port->devlink, + devlink_port->index, param_item, + DEVLINK_CMD_PORT_PARAM_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + +static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_param_item *param_item; + struct sk_buff *msg; + int err; + + param_item = devlink_param_get_from_info(&devlink_port->param_list, + info); + if (!param_item) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_param_fill(msg, devlink_port->devlink, + devlink_port->index, param_item, + DEVLINK_CMD_PORT_PARAM_GET, + info->snd_portid, info->snd_seq, 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, struct devlink *devlink, struct devlink_snapshot *snapshot) @@ -3820,6 +3900,14 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_PORT_PARAM_GET, + .doit = devlink_nl_cmd_port_param_get_doit, + .dumpit = devlink_nl_cmd_port_param_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + /* can be retrieved by unprivileged users */ + }, { .cmd = DEVLINK_CMD_REGION_GET, .doit = devlink_nl_cmd_region_get_doit, -- cgit v1.2.3 From 9c54873b4e2ee22507627b1adac9e3a8407741bd Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Mon, 28 Jan 2019 18:00:22 +0530 Subject: devlink: Add port param set command Add port param set command to set the value for a parameter. Value can be set to any of the supported configuration modes. v7->v8: Append "Acked-by: Jiri Pirko " Cc: Jiri Pirko Signed-off-by: Vasundhara Volam Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 37 ++++++++++++++++++++++++++++++++----- 2 files changed, 33 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 448973beac9d..3658fb20b190 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -90,6 +90,7 @@ enum devlink_command { DEVLINK_CMD_REGION_READ, DEVLINK_CMD_PORT_PARAM_GET, /* can dump */ + DEVLINK_CMD_PORT_PARAM_SET, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 66313dcdbdca..113ad9f529e1 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3096,10 +3096,11 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } -static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, - struct genl_info *info) +static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, + struct list_head *param_list, + struct genl_info *info, + enum devlink_command cmd) { - struct devlink *devlink = info->user_ptr[0]; enum devlink_param_type param_type; struct devlink_param_gset_ctx ctx; enum devlink_param_cmode cmode; @@ -3108,7 +3109,7 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, union devlink_param_value value; int err = 0; - param_item = devlink_param_get_from_info(&devlink->param_list, info); + param_item = devlink_param_get_from_info(param_list, info); if (!param_item) return -EINVAL; param = param_item->param; @@ -3148,10 +3149,19 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, return err; } - devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); + devlink_param_notify(devlink, param_item, cmd); return 0; } +static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + + return __devlink_nl_cmd_param_set_doit(devlink, &devlink->param_list, + info, DEVLINK_CMD_PARAM_NEW); +} + static int devlink_param_register_one(struct devlink *devlink, struct list_head *param_list, const struct devlink_param *param) @@ -3263,6 +3273,16 @@ static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, return genlmsg_reply(msg, info); } +static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink_port *devlink_port = info->user_ptr[0]; + + return __devlink_nl_cmd_param_set_doit(devlink_port->devlink, + &devlink_port->param_list, + info, 0); +} + static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, struct devlink *devlink, struct devlink_snapshot *snapshot) @@ -3908,6 +3928,13 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, /* can be retrieved by unprivileged users */ }, + { + .cmd = DEVLINK_CMD_PORT_PARAM_SET, + .doit = devlink_nl_cmd_port_param_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_PORT, + }, { .cmd = DEVLINK_CMD_REGION_GET, .doit = devlink_nl_cmd_region_get_doit, -- cgit v1.2.3 From c1e5786d6771c67fe044c3bcaa23e631e0503261 Mon Sep 17 00:00:00 2001 From: Vasundhara Volam Date: Mon, 28 Jan 2019 18:00:25 +0530 Subject: devlink: Add devlink notifications support for port params Add notification call for devlink port param set, register and unregister functions. Add devlink_port_param_value_changed() function to enable the driver notify devlink on value change. Driver should use this function after value was changed on any configuration mode part to driverinit. v7->v8: Order devlink_port_param_value_changed() definitions followed by devlink_param_value_changed() Cc: Jiri Pirko Signed-off-by: Vasundhara Volam Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 8 ++++ include/uapi/linux/devlink.h | 2 + net/core/devlink.c | 111 ++++++++++++++++++++++++++++++++----------- 3 files changed, 94 insertions(+), 27 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index ae2ccf297946..ceb5e89d74d6 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -586,6 +586,8 @@ int devlink_port_param_driverinit_value_set(struct devlink_port *devlink_port, u32 param_id, union devlink_param_value init_val); void devlink_param_value_changed(struct devlink *devlink, u32 param_id); +void devlink_port_param_value_changed(struct devlink_port *devlink_port, + u32 param_id); void devlink_param_value_str_fill(union devlink_param_value *dst_val, const char *src); struct devlink_region *devlink_region_create(struct devlink *devlink, @@ -855,6 +857,12 @@ devlink_param_value_changed(struct devlink *devlink, u32 param_id) { } +static inline void +devlink_port_param_value_changed(struct devlink_port *devlink_port, + u32 param_id) +{ +} + static inline void devlink_param_value_str_fill(union devlink_param_value *dst_val, const char *src) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 3658fb20b190..61b4447a6c5b 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -91,6 +91,8 @@ enum devlink_command { DEVLINK_CMD_PORT_PARAM_GET, /* can dump */ DEVLINK_CMD_PORT_PARAM_SET, + DEVLINK_CMD_PORT_PARAM_NEW, + DEVLINK_CMD_PORT_PARAM_DEL, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 55456cc36833..451ab4725340 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2882,7 +2882,9 @@ static int devlink_nl_param_fill(struct sk_buff *msg, struct devlink *devlink, if (devlink_nl_put_handle(msg, devlink)) goto genlmsg_cancel; - if (cmd == DEVLINK_CMD_PORT_PARAM_GET) + if (cmd == DEVLINK_CMD_PORT_PARAM_GET || + cmd == DEVLINK_CMD_PORT_PARAM_NEW || + cmd == DEVLINK_CMD_PORT_PARAM_DEL) if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, port_index)) goto genlmsg_cancel; @@ -2928,18 +2930,22 @@ genlmsg_cancel: } static void devlink_param_notify(struct devlink *devlink, + unsigned int port_index, struct devlink_param_item *param_item, enum devlink_command cmd) { struct sk_buff *msg; int err; - WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL); + WARN_ON(cmd != DEVLINK_CMD_PARAM_NEW && cmd != DEVLINK_CMD_PARAM_DEL && + cmd != DEVLINK_CMD_PORT_PARAM_NEW && + cmd != DEVLINK_CMD_PORT_PARAM_DEL); msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return; - err = devlink_nl_param_fill(msg, devlink, 0, param_item, cmd, 0, 0, 0); + err = devlink_nl_param_fill(msg, devlink, port_index, param_item, cmd, + 0, 0, 0); if (err) { nlmsg_free(msg); return; @@ -3097,6 +3103,7 @@ static int devlink_nl_cmd_param_get_doit(struct sk_buff *skb, } static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, + unsigned int port_index, struct list_head *param_list, struct genl_info *info, enum devlink_command cmd) @@ -3149,7 +3156,7 @@ static int __devlink_nl_cmd_param_set_doit(struct devlink *devlink, return err; } - devlink_param_notify(devlink, param_item, cmd); + devlink_param_notify(devlink, port_index, param_item, cmd); return 0; } @@ -3158,13 +3165,15 @@ static int devlink_nl_cmd_param_set_doit(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; - return __devlink_nl_cmd_param_set_doit(devlink, &devlink->param_list, + return __devlink_nl_cmd_param_set_doit(devlink, 0, &devlink->param_list, info, DEVLINK_CMD_PARAM_NEW); } static int devlink_param_register_one(struct devlink *devlink, + unsigned int port_index, struct list_head *param_list, - const struct devlink_param *param) + const struct devlink_param *param, + enum devlink_command cmd) { struct devlink_param_item *param_item; @@ -3182,19 +3191,21 @@ static int devlink_param_register_one(struct devlink *devlink, param_item->param = param; list_add_tail(¶m_item->list, param_list); - devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); + devlink_param_notify(devlink, port_index, param_item, cmd); return 0; } static void devlink_param_unregister_one(struct devlink *devlink, + unsigned int port_index, struct list_head *param_list, - const struct devlink_param *param) + const struct devlink_param *param, + enum devlink_command cmd) { struct devlink_param_item *param_item; param_item = devlink_param_find_by_name(param_list, param->name); WARN_ON(!param_item); - devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_DEL); + devlink_param_notify(devlink, port_index, param_item, cmd); list_del(¶m_item->list); kfree(param_item); } @@ -3279,8 +3290,9 @@ static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, struct devlink_port *devlink_port = info->user_ptr[0]; return __devlink_nl_cmd_param_set_doit(devlink_port->devlink, - &devlink_port->param_list, - info, 0); + devlink_port->index, + &devlink_port->param_list, info, + DEVLINK_CMD_PORT_PARAM_NEW); } static int devlink_nl_region_snapshot_id_put(struct sk_buff *msg, @@ -4598,9 +4610,12 @@ static int devlink_param_verify(const struct devlink_param *param) } static int __devlink_params_register(struct devlink *devlink, + unsigned int port_index, struct list_head *param_list, const struct devlink_param *params, - size_t params_count) + size_t params_count, + enum devlink_command reg_cmd, + enum devlink_command unreg_cmd) { const struct devlink_param *param = params; int i; @@ -4612,7 +4627,8 @@ static int __devlink_params_register(struct devlink *devlink, if (err) goto rollback; - err = devlink_param_register_one(devlink, param_list, param); + err = devlink_param_register_one(devlink, port_index, + param_list, param, reg_cmd); if (err) goto rollback; } @@ -4624,23 +4640,27 @@ rollback: if (!i) goto unlock; for (param--; i > 0; i--, param--) - devlink_param_unregister_one(devlink, param_list, param); + devlink_param_unregister_one(devlink, port_index, param_list, + param, unreg_cmd); unlock: mutex_unlock(&devlink->lock); return err; } static void __devlink_params_unregister(struct devlink *devlink, + unsigned int port_index, struct list_head *param_list, const struct devlink_param *params, - size_t params_count) + size_t params_count, + enum devlink_command cmd) { const struct devlink_param *param = params; int i; mutex_lock(&devlink->lock); for (i = 0; i < params_count; i++, param++) - devlink_param_unregister_one(devlink, param_list, param); + devlink_param_unregister_one(devlink, 0, param_list, param, + cmd); mutex_unlock(&devlink->lock); } @@ -4657,8 +4677,10 @@ int devlink_params_register(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { - return __devlink_params_register(devlink, &devlink->param_list, params, - params_count); + return __devlink_params_register(devlink, 0, &devlink->param_list, + params, params_count, + DEVLINK_CMD_PARAM_NEW, + DEVLINK_CMD_PARAM_DEL); } EXPORT_SYMBOL_GPL(devlink_params_register); @@ -4672,8 +4694,9 @@ void devlink_params_unregister(struct devlink *devlink, const struct devlink_param *params, size_t params_count) { - return __devlink_params_unregister(devlink, &devlink->param_list, - params, params_count); + return __devlink_params_unregister(devlink, 0, &devlink->param_list, + params, params_count, + DEVLINK_CMD_PARAM_DEL); } EXPORT_SYMBOL_GPL(devlink_params_unregister); @@ -4691,8 +4714,11 @@ int devlink_port_params_register(struct devlink_port *devlink_port, size_t params_count) { return __devlink_params_register(devlink_port->devlink, + devlink_port->index, &devlink_port->param_list, params, - params_count); + params_count, + DEVLINK_CMD_PORT_PARAM_NEW, + DEVLINK_CMD_PORT_PARAM_DEL); } EXPORT_SYMBOL_GPL(devlink_port_params_register); @@ -4709,8 +4735,10 @@ void devlink_port_params_unregister(struct devlink_port *devlink_port, size_t params_count) { return __devlink_params_unregister(devlink_port->devlink, + devlink_port->index, &devlink_port->param_list, - params, params_count); + params, params_count, + DEVLINK_CMD_PORT_PARAM_DEL); } EXPORT_SYMBOL_GPL(devlink_port_params_unregister); @@ -4739,6 +4767,7 @@ __devlink_param_driverinit_value_get(struct list_head *param_list, u32 param_id, static int __devlink_param_driverinit_value_set(struct devlink *devlink, + unsigned int port_index, struct list_head *param_list, u32 param_id, union devlink_param_value init_val, enum devlink_command cmd) @@ -4759,7 +4788,7 @@ __devlink_param_driverinit_value_set(struct devlink *devlink, param_item->driverinit_value = init_val; param_item->driverinit_value_valid = true; - devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); + devlink_param_notify(devlink, port_index, param_item, cmd); return 0; } @@ -4800,7 +4829,7 @@ EXPORT_SYMBOL_GPL(devlink_param_driverinit_value_get); int devlink_param_driverinit_value_set(struct devlink *devlink, u32 param_id, union devlink_param_value init_val) { - return __devlink_param_driverinit_value_set(devlink, + return __devlink_param_driverinit_value_set(devlink, 0, &devlink->param_list, param_id, init_val, DEVLINK_CMD_PARAM_NEW); @@ -4849,8 +4878,10 @@ int devlink_port_param_driverinit_value_set(struct devlink_port *devlink_port, union devlink_param_value init_val) { return __devlink_param_driverinit_value_set(devlink_port->devlink, + devlink_port->index, &devlink_port->param_list, - param_id, init_val, 0); + param_id, init_val, + DEVLINK_CMD_PORT_PARAM_NEW); } EXPORT_SYMBOL_GPL(devlink_port_param_driverinit_value_set); @@ -4865,7 +4896,6 @@ EXPORT_SYMBOL_GPL(devlink_port_param_driverinit_value_set); * This function should be used by the driver to notify devlink on value * change, excluding driverinit configuration mode. * For driverinit configuration mode driver should use the function - * devlink_param_driverinit_value_set() instead. */ void devlink_param_value_changed(struct devlink *devlink, u32 param_id) { @@ -4874,10 +4904,37 @@ void devlink_param_value_changed(struct devlink *devlink, u32 param_id) param_item = devlink_param_find_by_id(&devlink->param_list, param_id); WARN_ON(!param_item); - devlink_param_notify(devlink, param_item, DEVLINK_CMD_PARAM_NEW); + devlink_param_notify(devlink, 0, param_item, DEVLINK_CMD_PARAM_NEW); } EXPORT_SYMBOL_GPL(devlink_param_value_changed); +/** + * devlink_port_param_value_changed - notify devlink on a parameter's value + * change. Should be called by the driver + * right after the change. + * + * @devlink_port: devlink_port + * @param_id: parameter ID + * + * This function should be used by the driver to notify devlink on value + * change, excluding driverinit configuration mode. + * For driverinit configuration mode driver should use the function + * devlink_port_param_driverinit_value_set() instead. + */ +void devlink_port_param_value_changed(struct devlink_port *devlink_port, + u32 param_id) +{ + struct devlink_param_item *param_item; + + param_item = devlink_param_find_by_id(&devlink_port->param_list, + param_id); + WARN_ON(!param_item); + + devlink_param_notify(devlink_port->devlink, devlink_port->index, + param_item, DEVLINK_CMD_PORT_PARAM_NEW); +} +EXPORT_SYMBOL_GPL(devlink_port_param_value_changed); + /** * devlink_param_value_str_fill - Safely fill-up the string preventing * from overflow of the preallocated buffer -- cgit v1.2.3 From 2d908b38d40921a03225d42fd6e48eb51bffd606 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 23 Jan 2019 11:28:19 +0100 Subject: serial: Add Tegra Combined UART driver The Tegra Combined UART (TCU) is a mailbox-based mechanism that allows multiplexing multiple "virtual UARTs" into a single hardware serial port. The TCU is the primary serial port on Tegra194 devices. Add a TCU driver utilizing the mailbox framework, as the used mailboxes are part of Tegra HSP blocks that are already controlled by the Tegra HSP mailbox driver. Based on work by Mikko Perttunen . Signed-off-by: Thierry Reding Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/Kconfig | 22 +++ drivers/tty/serial/Makefile | 1 + drivers/tty/serial/tegra-tcu.c | 298 +++++++++++++++++++++++++++++++++++++++ include/uapi/linux/serial_core.h | 3 + 4 files changed, 324 insertions(+) create mode 100644 drivers/tty/serial/tegra-tcu.c (limited to 'include/uapi/linux') diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 089a6f285d5e..72966bc0ac76 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -335,6 +335,28 @@ config SERIAL_TEGRA are enabled). This driver uses the APB DMA to achieve higher baudrate and better performance. +config SERIAL_TEGRA_TCU + tristate "NVIDIA Tegra Combined UART" + depends on ARCH_TEGRA && TEGRA_HSP_MBOX + select SERIAL_CORE + help + Support for the mailbox-based TCU (Tegra Combined UART) serial port. + TCU is a virtual serial port that allows multiplexing multiple data + streams into a single hardware serial port. + +config SERIAL_TEGRA_TCU_CONSOLE + bool "Support for console on a Tegra TCU serial port" + depends on SERIAL_TEGRA_TCU=y + select SERIAL_CORE_CONSOLE + default y + ---help--- + If you say Y here, it will be possible to use a the Tegra TCU as the + system console (the system console is the device which receives all + kernel messages and warnings and which allows logins in single user + mode). + + If unsure, say Y. + config SERIAL_MAX3100 tristate "MAX3100 support" depends on SPI diff --git a/drivers/tty/serial/Makefile b/drivers/tty/serial/Makefile index 1511e8a9f856..40b702aaa85e 100644 --- a/drivers/tty/serial/Makefile +++ b/drivers/tty/serial/Makefile @@ -77,6 +77,7 @@ obj-$(CONFIG_SERIAL_LANTIQ) += lantiq.o obj-$(CONFIG_SERIAL_XILINX_PS_UART) += xilinx_uartps.o obj-$(CONFIG_SERIAL_SIRFSOC) += sirfsoc_uart.o obj-$(CONFIG_SERIAL_TEGRA) += serial-tegra.o +obj-$(CONFIG_SERIAL_TEGRA_TCU) += tegra-tcu.o obj-$(CONFIG_SERIAL_AR933X) += ar933x_uart.o obj-$(CONFIG_SERIAL_EFM32_UART) += efm32-uart.o obj-$(CONFIG_SERIAL_ARC) += arc_uart.o diff --git a/drivers/tty/serial/tegra-tcu.c b/drivers/tty/serial/tegra-tcu.c new file mode 100644 index 000000000000..aaf8748a6147 --- /dev/null +++ b/drivers/tty/serial/tegra-tcu.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TCU_MBOX_BYTE(i, x) ((x) << (i * 8)) +#define TCU_MBOX_BYTE_V(x, i) (((x) >> (i * 8)) & 0xff) +#define TCU_MBOX_NUM_BYTES(x) ((x) << 24) +#define TCU_MBOX_NUM_BYTES_V(x) (((x) >> 24) & 0x3) + +struct tegra_tcu { + struct uart_driver driver; +#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE) + struct console console; +#endif + struct uart_port port; + + struct mbox_client tx_client, rx_client; + struct mbox_chan *tx, *rx; +}; + +static unsigned int tegra_tcu_uart_tx_empty(struct uart_port *port) +{ + return TIOCSER_TEMT; +} + +static void tegra_tcu_uart_set_mctrl(struct uart_port *port, unsigned int mctrl) +{ +} + +static unsigned int tegra_tcu_uart_get_mctrl(struct uart_port *port) +{ + return 0; +} + +static void tegra_tcu_uart_stop_tx(struct uart_port *port) +{ +} + +static void tegra_tcu_write_one(struct tegra_tcu *tcu, u32 value, + unsigned int count) +{ + void *msg; + + value |= TCU_MBOX_NUM_BYTES(count); + msg = (void *)(unsigned long)value; + mbox_send_message(tcu->tx, msg); + mbox_flush(tcu->tx, 1000); +} + +static void tegra_tcu_write(struct tegra_tcu *tcu, const char *s, + unsigned int count) +{ + unsigned int written = 0, i = 0; + bool insert_nl = false; + u32 value = 0; + + while (i < count) { + if (insert_nl) { + value |= TCU_MBOX_BYTE(written++, '\n'); + insert_nl = false; + i++; + } else if (s[i] == '\n') { + value |= TCU_MBOX_BYTE(written++, '\r'); + insert_nl = true; + } else { + value |= TCU_MBOX_BYTE(written++, s[i++]); + } + + if (written == 3) { + tegra_tcu_write_one(tcu, value, 3); + value = written = 0; + } + } + + if (written) + tegra_tcu_write_one(tcu, value, written); +} + +static void tegra_tcu_uart_start_tx(struct uart_port *port) +{ + struct tegra_tcu *tcu = port->private_data; + struct circ_buf *xmit = &port->state->xmit; + unsigned long count; + + for (;;) { + count = CIRC_CNT_TO_END(xmit->head, xmit->tail, UART_XMIT_SIZE); + if (!count) + break; + + tegra_tcu_write(tcu, &xmit->buf[xmit->tail], count); + xmit->tail = (xmit->tail + count) & (UART_XMIT_SIZE - 1); + } + + uart_write_wakeup(port); +} + +static void tegra_tcu_uart_stop_rx(struct uart_port *port) +{ +} + +static void tegra_tcu_uart_break_ctl(struct uart_port *port, int ctl) +{ +} + +static int tegra_tcu_uart_startup(struct uart_port *port) +{ + return 0; +} + +static void tegra_tcu_uart_shutdown(struct uart_port *port) +{ +} + +static void tegra_tcu_uart_set_termios(struct uart_port *port, + struct ktermios *new, + struct ktermios *old) +{ +} + +static const struct uart_ops tegra_tcu_uart_ops = { + .tx_empty = tegra_tcu_uart_tx_empty, + .set_mctrl = tegra_tcu_uart_set_mctrl, + .get_mctrl = tegra_tcu_uart_get_mctrl, + .stop_tx = tegra_tcu_uart_stop_tx, + .start_tx = tegra_tcu_uart_start_tx, + .stop_rx = tegra_tcu_uart_stop_rx, + .break_ctl = tegra_tcu_uart_break_ctl, + .startup = tegra_tcu_uart_startup, + .shutdown = tegra_tcu_uart_shutdown, + .set_termios = tegra_tcu_uart_set_termios, +}; + +#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE) +static void tegra_tcu_console_write(struct console *cons, const char *s, + unsigned int count) +{ + struct tegra_tcu *tcu = container_of(cons, struct tegra_tcu, console); + + tegra_tcu_write(tcu, s, count); +} + +static int tegra_tcu_console_setup(struct console *cons, char *options) +{ + return 0; +} +#endif + +static void tegra_tcu_receive(struct mbox_client *cl, void *msg) +{ + struct tegra_tcu *tcu = container_of(cl, struct tegra_tcu, rx_client); + struct tty_port *port = &tcu->port.state->port; + u32 value = (u32)(unsigned long)msg; + unsigned int num_bytes, i; + + num_bytes = TCU_MBOX_NUM_BYTES_V(value); + + for (i = 0; i < num_bytes; i++) + tty_insert_flip_char(port, TCU_MBOX_BYTE_V(value, i), + TTY_NORMAL); + + tty_flip_buffer_push(port); +} + +static int tegra_tcu_probe(struct platform_device *pdev) +{ + struct uart_port *port; + struct tegra_tcu *tcu; + int err; + + tcu = devm_kzalloc(&pdev->dev, sizeof(*tcu), GFP_KERNEL); + if (!tcu) + return -ENOMEM; + + tcu->tx_client.dev = &pdev->dev; + tcu->rx_client.dev = &pdev->dev; + tcu->rx_client.rx_callback = tegra_tcu_receive; + + tcu->tx = mbox_request_channel_byname(&tcu->tx_client, "tx"); + if (IS_ERR(tcu->tx)) { + err = PTR_ERR(tcu->tx); + dev_err(&pdev->dev, "failed to get tx mailbox: %d\n", err); + return err; + } + + tcu->rx = mbox_request_channel_byname(&tcu->rx_client, "rx"); + if (IS_ERR(tcu->rx)) { + err = PTR_ERR(tcu->rx); + dev_err(&pdev->dev, "failed to get rx mailbox: %d\n", err); + goto free_tx; + } + +#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE) + /* setup the console */ + strcpy(tcu->console.name, "ttyTCU"); + tcu->console.device = uart_console_device; + tcu->console.flags = CON_PRINTBUFFER | CON_ANYTIME; + tcu->console.index = -1; + tcu->console.write = tegra_tcu_console_write; + tcu->console.setup = tegra_tcu_console_setup; + tcu->console.data = &tcu->driver; +#endif + + /* setup the driver */ + tcu->driver.owner = THIS_MODULE; + tcu->driver.driver_name = "tegra-tcu"; + tcu->driver.dev_name = "ttyTCU"; +#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE) + tcu->driver.cons = &tcu->console; +#endif + tcu->driver.nr = 1; + + err = uart_register_driver(&tcu->driver); + if (err) { + dev_err(&pdev->dev, "failed to register UART driver: %d\n", + err); + goto free_rx; + } + + /* setup the port */ + port = &tcu->port; + spin_lock_init(&port->lock); + port->dev = &pdev->dev; + port->type = PORT_TEGRA_TCU; + port->ops = &tegra_tcu_uart_ops; + port->fifosize = 1; + port->iotype = UPIO_MEM; + port->flags = UPF_BOOT_AUTOCONF; + port->private_data = tcu; + + err = uart_add_one_port(&tcu->driver, port); + if (err) { + dev_err(&pdev->dev, "failed to add UART port: %d\n", err); + goto unregister_uart; + } + + platform_set_drvdata(pdev, tcu); +#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE) + register_console(&tcu->console); +#endif + + return 0; + +unregister_uart: + uart_unregister_driver(&tcu->driver); +free_rx: + mbox_free_channel(tcu->rx); +free_tx: + mbox_free_channel(tcu->tx); + + return err; +} + +static int tegra_tcu_remove(struct platform_device *pdev) +{ + struct tegra_tcu *tcu = platform_get_drvdata(pdev); + +#if IS_ENABLED(CONFIG_SERIAL_TEGRA_TCU_CONSOLE) + unregister_console(&tcu->console); +#endif + uart_remove_one_port(&tcu->driver, &tcu->port); + uart_unregister_driver(&tcu->driver); + mbox_free_channel(tcu->rx); + mbox_free_channel(tcu->tx); + + return 0; +} + +static const struct of_device_id tegra_tcu_match[] = { + { .compatible = "nvidia,tegra194-tcu" }, + { } +}; + +static struct platform_driver tegra_tcu_driver = { + .driver = { + .name = "tegra-tcu", + .of_match_table = tegra_tcu_match, + }, + .probe = tegra_tcu_probe, + .remove = tegra_tcu_remove, +}; +module_platform_driver(tegra_tcu_driver); + +MODULE_AUTHOR("Mikko Perttunen "); +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("NVIDIA Tegra Combined UART driver"); diff --git a/include/uapi/linux/serial_core.h b/include/uapi/linux/serial_core.h index df4a7534e239..6009ee2c2e99 100644 --- a/include/uapi/linux/serial_core.h +++ b/include/uapi/linux/serial_core.h @@ -79,6 +79,9 @@ /* Nuvoton UART */ #define PORT_NPCM 40 +/* NVIDIA Tegra Combined UART */ +#define PORT_TEGRA_TCU 41 + /* Intel EG20 */ #define PORT_PCH_8LINE 44 #define PORT_PCH_2LINE 45 -- cgit v1.2.3 From 80df2704a375bb4b3c9c5cce9c00052361b16d61 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 28 Jan 2019 15:08:23 +0800 Subject: sctp: introduce SCTP_FUTURE/CURRENT/ALL_ASSOC This patch is to add 3 constants SCTP_FUTURE_ASSOC, SCTP_CURRENT_ASSOC and SCTP_ALL_ASSOC for reserved assoc_ids, as defined in rfc6458#section-7.2. And add the process for them when doing lookup and inserting in sctp_id2assoc and sctp_assoc_set_id. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 4 ++++ net/sctp/associola.c | 7 +++++-- net/sctp/socket.c | 2 +- 3 files changed, 10 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index d584073532b8..b8f2c4d56532 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -59,6 +59,10 @@ typedef __s32 sctp_assoc_t; +#define SCTP_FUTURE_ASSOC 0 +#define SCTP_CURRENT_ASSOC 1 +#define SCTP_ALL_ASSOC 2 + /* The following symbols come from the Sockets API Extensions for * SCTP . */ diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 201c888604e4..b99f163e33ac 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -1651,8 +1651,11 @@ int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp) if (preload) idr_preload(gfp); spin_lock_bh(&sctp_assocs_id_lock); - /* 0 is not a valid assoc_id, must be >= 1 */ - ret = idr_alloc_cyclic(&sctp_assocs_id, asoc, 1, 0, GFP_NOWAIT); + /* 0, 1, 2 are used as SCTP_FUTURE_ASSOC, SCTP_CURRENT_ASSOC and + * SCTP_ALL_ASSOC, so an available id must be > SCTP_ALL_ASSOC. + */ + ret = idr_alloc_cyclic(&sctp_assocs_id, asoc, SCTP_ALL_ASSOC + 1, 0, + GFP_NOWAIT); spin_unlock_bh(&sctp_assocs_id_lock); if (preload) idr_preload_end(); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index f93c3cf9e567..a52d132470ea 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -248,7 +248,7 @@ struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id) } /* Otherwise this is a UDP-style socket. */ - if (!id || (id == (sctp_assoc_t)-1)) + if (id <= SCTP_ALL_ASSOC) return NULL; spin_lock_bh(&sctp_assocs_id_lock); -- cgit v1.2.3 From d0a060be573bfbf8753a15dca35497db5e968bb0 Mon Sep 17 00:00:00 2001 From: Kristina Martsenko Date: Wed, 30 Jan 2019 12:02:44 +0000 Subject: arm64: add ptrace regsets for ptrauth key management Add two new ptrace regsets, which can be used to request and change the pointer authentication keys of a thread. NT_ARM_PACA_KEYS gives access to the instruction/data address keys, and NT_ARM_PACG_KEYS to the generic authentication key. The keys are also part of the core dump file of the process. The regsets are only exposed if the kernel is compiled with CONFIG_CHECKPOINT_RESTORE=y, as the only intended use case is checkpointing and restoring processes that are using pointer authentication. (This can be changed later if there are other use cases.) Reviewed-by: Dave Martin Signed-off-by: Kristina Martsenko Signed-off-by: Catalin Marinas --- Documentation/arm64/pointer-authentication.txt | 5 + arch/arm64/include/uapi/asm/ptrace.h | 13 +++ arch/arm64/kernel/ptrace.c | 147 +++++++++++++++++++++++++ include/uapi/linux/elf.h | 2 + 4 files changed, 167 insertions(+) (limited to 'include/uapi/linux') diff --git a/Documentation/arm64/pointer-authentication.txt b/Documentation/arm64/pointer-authentication.txt index a25cd21290e9..5baca42ba146 100644 --- a/Documentation/arm64/pointer-authentication.txt +++ b/Documentation/arm64/pointer-authentication.txt @@ -78,6 +78,11 @@ bits can vary between the two. Note that the masks apply to TTBR0 addresses, and are not valid to apply to TTBR1 addresses (e.g. kernel pointers). +Additionally, when CONFIG_CHECKPOINT_RESTORE is also set, the kernel +will expose the NT_ARM_PACA_KEYS and NT_ARM_PACG_KEYS regsets (struct +user_pac_address_keys and struct user_pac_generic_keys). These can be +used to get and set the keys for a thread. + Virtualization -------------- diff --git a/arch/arm64/include/uapi/asm/ptrace.h b/arch/arm64/include/uapi/asm/ptrace.h index 28d77c9ed531..d78623acb649 100644 --- a/arch/arm64/include/uapi/asm/ptrace.h +++ b/arch/arm64/include/uapi/asm/ptrace.h @@ -233,6 +233,19 @@ struct user_pac_mask { __u64 insn_mask; }; +/* pointer authentication keys (NT_ARM_PACA_KEYS, NT_ARM_PACG_KEYS) */ + +struct user_pac_address_keys { + __uint128_t apiakey; + __uint128_t apibkey; + __uint128_t apdakey; + __uint128_t apdbkey; +}; + +struct user_pac_generic_keys { + __uint128_t apgakey; +}; + #endif /* __ASSEMBLY__ */ #endif /* _UAPI__ASM_PTRACE_H */ diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 9dce33b0e260..a86413be5a2d 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -979,6 +979,131 @@ static int pac_mask_get(struct task_struct *target, return user_regset_copyout(&pos, &count, &kbuf, &ubuf, &uregs, 0, -1); } + +#ifdef CONFIG_CHECKPOINT_RESTORE +static __uint128_t pac_key_to_user(const struct ptrauth_key *key) +{ + return (__uint128_t)key->hi << 64 | key->lo; +} + +static struct ptrauth_key pac_key_from_user(__uint128_t ukey) +{ + struct ptrauth_key key = { + .lo = (unsigned long)ukey, + .hi = (unsigned long)(ukey >> 64), + }; + + return key; +} + +static void pac_address_keys_to_user(struct user_pac_address_keys *ukeys, + const struct ptrauth_keys *keys) +{ + ukeys->apiakey = pac_key_to_user(&keys->apia); + ukeys->apibkey = pac_key_to_user(&keys->apib); + ukeys->apdakey = pac_key_to_user(&keys->apda); + ukeys->apdbkey = pac_key_to_user(&keys->apdb); +} + +static void pac_address_keys_from_user(struct ptrauth_keys *keys, + const struct user_pac_address_keys *ukeys) +{ + keys->apia = pac_key_from_user(ukeys->apiakey); + keys->apib = pac_key_from_user(ukeys->apibkey); + keys->apda = pac_key_from_user(ukeys->apdakey); + keys->apdb = pac_key_from_user(ukeys->apdbkey); +} + +static int pac_address_keys_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct ptrauth_keys *keys = &target->thread.keys_user; + struct user_pac_address_keys user_keys; + + if (!system_supports_address_auth()) + return -EINVAL; + + pac_address_keys_to_user(&user_keys, keys); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &user_keys, 0, -1); +} + +static int pac_address_keys_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct ptrauth_keys *keys = &target->thread.keys_user; + struct user_pac_address_keys user_keys; + int ret; + + if (!system_supports_address_auth()) + return -EINVAL; + + pac_address_keys_to_user(&user_keys, keys); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &user_keys, 0, -1); + if (ret) + return ret; + pac_address_keys_from_user(keys, &user_keys); + + return 0; +} + +static void pac_generic_keys_to_user(struct user_pac_generic_keys *ukeys, + const struct ptrauth_keys *keys) +{ + ukeys->apgakey = pac_key_to_user(&keys->apga); +} + +static void pac_generic_keys_from_user(struct ptrauth_keys *keys, + const struct user_pac_generic_keys *ukeys) +{ + keys->apga = pac_key_from_user(ukeys->apgakey); +} + +static int pac_generic_keys_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + struct ptrauth_keys *keys = &target->thread.keys_user; + struct user_pac_generic_keys user_keys; + + if (!system_supports_generic_auth()) + return -EINVAL; + + pac_generic_keys_to_user(&user_keys, keys); + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &user_keys, 0, -1); +} + +static int pac_generic_keys_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + struct ptrauth_keys *keys = &target->thread.keys_user; + struct user_pac_generic_keys user_keys; + int ret; + + if (!system_supports_generic_auth()) + return -EINVAL; + + pac_generic_keys_to_user(&user_keys, keys); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &user_keys, 0, -1); + if (ret) + return ret; + pac_generic_keys_from_user(keys, &user_keys); + + return 0; +} +#endif /* CONFIG_CHECKPOINT_RESTORE */ #endif /* CONFIG_ARM64_PTR_AUTH */ enum aarch64_regset { @@ -995,6 +1120,10 @@ enum aarch64_regset { #endif #ifdef CONFIG_ARM64_PTR_AUTH REGSET_PAC_MASK, +#ifdef CONFIG_CHECKPOINT_RESTORE + REGSET_PACA_KEYS, + REGSET_PACG_KEYS, +#endif #endif }; @@ -1074,6 +1203,24 @@ static const struct user_regset aarch64_regsets[] = { .get = pac_mask_get, /* this cannot be set dynamically */ }, +#ifdef CONFIG_CHECKPOINT_RESTORE + [REGSET_PACA_KEYS] = { + .core_note_type = NT_ARM_PACA_KEYS, + .n = sizeof(struct user_pac_address_keys) / sizeof(__uint128_t), + .size = sizeof(__uint128_t), + .align = sizeof(__uint128_t), + .get = pac_address_keys_get, + .set = pac_address_keys_set, + }, + [REGSET_PACG_KEYS] = { + .core_note_type = NT_ARM_PACG_KEYS, + .n = sizeof(struct user_pac_generic_keys) / sizeof(__uint128_t), + .size = sizeof(__uint128_t), + .align = sizeof(__uint128_t), + .get = pac_generic_keys_get, + .set = pac_generic_keys_set, + }, +#endif #endif }; diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index e4d6ddd93567..34c02e4290fe 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -421,6 +421,8 @@ typedef struct elf64_shdr { #define NT_ARM_SYSTEM_CALL 0x404 /* ARM system call number */ #define NT_ARM_SVE 0x405 /* ARM Scalable Vector Extension registers */ #define NT_ARM_PAC_MASK 0x406 /* ARM pointer authentication code masks */ +#define NT_ARM_PACA_KEYS 0x407 /* ARM pointer authentication address keys */ +#define NT_ARM_PACG_KEYS 0x408 /* ARM pointer authentication generic key */ #define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ #define NT_VMCOREDD 0x700 /* Vmcore Device Dump Note */ #define NT_MIPS_DSP 0x800 /* MIPS DSP ASE registers */ -- cgit v1.2.3 From d83525ca62cf8ebe3271d14c36fb900c294274a2 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 31 Jan 2019 15:40:04 -0800 Subject: bpf: introduce bpf_spin_lock Introduce 'struct bpf_spin_lock' and bpf_spin_lock/unlock() helpers to let bpf program serialize access to other variables. Example: struct hash_elem { int cnt; struct bpf_spin_lock lock; }; struct hash_elem * val = bpf_map_lookup_elem(&hash_map, &key); if (val) { bpf_spin_lock(&val->lock); val->cnt++; bpf_spin_unlock(&val->lock); } Restrictions and safety checks: - bpf_spin_lock is only allowed inside HASH and ARRAY maps. - BTF description of the map is mandatory for safety analysis. - bpf program can take one bpf_spin_lock at a time, since two or more can cause dead locks. - only one 'struct bpf_spin_lock' is allowed per map element. It drastically simplifies implementation yet allows bpf program to use any number of bpf_spin_locks. - when bpf_spin_lock is taken the calls (either bpf2bpf or helpers) are not allowed. - bpf program must bpf_spin_unlock() before return. - bpf program can access 'struct bpf_spin_lock' only via bpf_spin_lock()/bpf_spin_unlock() helpers. - load/store into 'struct bpf_spin_lock lock;' field is not allowed. - to use bpf_spin_lock() helper the BTF description of map value must be a struct and have 'struct bpf_spin_lock anyname;' field at the top level. Nested lock inside another struct is not allowed. - syscall map_lookup doesn't copy bpf_spin_lock field to user space. - syscall map_update and program map_update do not update bpf_spin_lock field. - bpf_spin_lock cannot be on the stack or inside networking packet. bpf_spin_lock can only be inside HASH or ARRAY map value. - bpf_spin_lock is available to root only and to all program types. - bpf_spin_lock is not allowed in inner maps of map-in-map. - ld_abs is not allowed inside spin_lock-ed region. - tracing progs and socket filter progs cannot use bpf_spin_lock due to insufficient preemption checks Implementation details: - cgroup-bpf class of programs can nest with xdp/tc programs. Hence bpf_spin_lock is equivalent to spin_lock_irqsave. Other solutions to avoid nested bpf_spin_lock are possible. Like making sure that all networking progs run with softirq disabled. spin_lock_irqsave is the simplest and doesn't add overhead to the programs that don't use it. - arch_spinlock_t is used when its implemented as queued_spin_lock - archs can force their own arch_spinlock_t - on architectures where queued_spin_lock is not available and sizeof(arch_spinlock_t) != sizeof(__u32) trivial lock is used. - presence of bpf_spin_lock inside map value could have been indicated via extra flag during map_create, but specifying it via BTF is cleaner. It provides introspection for map key/value and reduces user mistakes. Next steps: - allow bpf_spin_lock in other map types (like cgroup local storage) - introduce BPF_F_LOCK flag for bpf_map_update() syscall and helper to request kernel to grab bpf_spin_lock before rewriting the value. That will serialize access to map elements. Acked-by: Peter Zijlstra (Intel) Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 37 +++++++++- include/linux/bpf_verifier.h | 1 + include/linux/btf.h | 1 + include/uapi/linux/bpf.h | 7 +- kernel/Kconfig.locks | 3 + kernel/bpf/arraymap.c | 7 +- kernel/bpf/btf.c | 42 +++++++++++ kernel/bpf/core.c | 2 + kernel/bpf/hashtab.c | 21 +++--- kernel/bpf/helpers.c | 80 ++++++++++++++++++++ kernel/bpf/map_in_map.c | 5 ++ kernel/bpf/syscall.c | 21 +++++- kernel/bpf/verifier.c | 169 ++++++++++++++++++++++++++++++++++++++++++- net/core/filter.c | 16 +++- 14 files changed, 386 insertions(+), 26 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0394f1f9213b..2ae615b48bb8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -72,14 +72,15 @@ struct bpf_map { u32 value_size; u32 max_entries; u32 map_flags; - u32 pages; + int spin_lock_off; /* >=0 valid offset, <0 error */ u32 id; int numa_node; u32 btf_key_type_id; u32 btf_value_type_id; struct btf *btf; + u32 pages; bool unpriv_array; - /* 55 bytes hole */ + /* 51 bytes hole */ /* The 3rd and 4th cacheline with misc members to avoid false sharing * particularly with refcounting. @@ -91,6 +92,34 @@ struct bpf_map { char name[BPF_OBJ_NAME_LEN]; }; +static inline bool map_value_has_spin_lock(const struct bpf_map *map) +{ + return map->spin_lock_off >= 0; +} + +static inline void check_and_init_map_lock(struct bpf_map *map, void *dst) +{ + if (likely(!map_value_has_spin_lock(map))) + return; + *(struct bpf_spin_lock *)(dst + map->spin_lock_off) = + (struct bpf_spin_lock){}; +} + +/* copy everything but bpf_spin_lock */ +static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) +{ + if (unlikely(map_value_has_spin_lock(map))) { + u32 off = map->spin_lock_off; + + memcpy(dst, src, off); + memcpy(dst + off + sizeof(struct bpf_spin_lock), + src + off + sizeof(struct bpf_spin_lock), + map->value_size - off - sizeof(struct bpf_spin_lock)); + } else { + memcpy(dst, src, map->value_size); + } +} + struct bpf_offload_dev; struct bpf_offloaded_map; @@ -162,6 +191,7 @@ enum bpf_arg_type { ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock */ + ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ }; /* type of values returned from helper functions */ @@ -879,7 +909,8 @@ extern const struct bpf_func_proto bpf_msg_redirect_hash_proto; extern const struct bpf_func_proto bpf_msg_redirect_map_proto; extern const struct bpf_func_proto bpf_sk_redirect_hash_proto; extern const struct bpf_func_proto bpf_sk_redirect_map_proto; - +extern const struct bpf_func_proto bpf_spin_lock_proto; +extern const struct bpf_func_proto bpf_spin_unlock_proto; extern const struct bpf_func_proto bpf_get_local_storage_proto; /* Shared helpers among cBPF and eBPF. */ diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 0620e418dde5..69f7a3449eda 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -148,6 +148,7 @@ struct bpf_verifier_state { /* call stack tracking */ struct bpf_func_state *frame[MAX_CALL_FRAMES]; u32 curframe; + u32 active_spin_lock; bool speculative; }; diff --git a/include/linux/btf.h b/include/linux/btf.h index 12502e25e767..455d31b55828 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -50,6 +50,7 @@ u32 btf_id(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); +int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t); #ifdef CONFIG_BPF_SYSCALL const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 60b99b730a41..86f7c438d40f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2422,7 +2422,9 @@ union bpf_attr { FN(map_peek_elem), \ FN(msg_push_data), \ FN(msg_pop_data), \ - FN(rc_pointer_rel), + FN(rc_pointer_rel), \ + FN(spin_lock), \ + FN(spin_unlock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -3056,4 +3058,7 @@ struct bpf_line_info { __u32 line_col; }; +struct bpf_spin_lock { + __u32 val; +}; #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 84d882f3e299..fbba478ae522 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -242,6 +242,9 @@ config QUEUED_SPINLOCKS def_bool y if ARCH_USE_QUEUED_SPINLOCKS depends on SMP +config BPF_ARCH_SPINLOCK + bool + config ARCH_USE_QUEUED_RWLOCKS bool diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 25632a75d630..d6d979910a2a 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -270,9 +270,10 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); else - memcpy(array->value + - array->elem_size * (index & array->index_mask), - value, map->value_size); + copy_map_value(map, + array->value + + array->elem_size * (index & array->index_mask), + value); return 0; } diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 3d661f0606fe..7019c1f05cab 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -355,6 +355,11 @@ static bool btf_type_is_struct(const struct btf_type *t) return kind == BTF_KIND_STRUCT || kind == BTF_KIND_UNION; } +static bool __btf_type_is_struct(const struct btf_type *t) +{ + return BTF_INFO_KIND(t->info) == BTF_KIND_STRUCT; +} + static bool btf_type_is_array(const struct btf_type *t) { return BTF_INFO_KIND(t->info) == BTF_KIND_ARRAY; @@ -2045,6 +2050,43 @@ static void btf_struct_log(struct btf_verifier_env *env, btf_verifier_log(env, "size=%u vlen=%u", t->size, btf_type_vlen(t)); } +/* find 'struct bpf_spin_lock' in map value. + * return >= 0 offset if found + * and < 0 in case of error + */ +int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t) +{ + const struct btf_member *member; + u32 i, off = -ENOENT; + + if (!__btf_type_is_struct(t)) + return -EINVAL; + + for_each_member(i, t, member) { + const struct btf_type *member_type = btf_type_by_id(btf, + member->type); + if (!__btf_type_is_struct(member_type)) + continue; + if (member_type->size != sizeof(struct bpf_spin_lock)) + continue; + if (strcmp(__btf_name_by_offset(btf, member_type->name_off), + "bpf_spin_lock")) + continue; + if (off != -ENOENT) + /* only one 'struct bpf_spin_lock' is allowed */ + return -E2BIG; + off = btf_member_bit_offset(t, member); + if (off % 8) + /* valid C code cannot generate such BTF */ + return -EINVAL; + off /= 8; + if (off % __alignof__(struct bpf_spin_lock)) + /* valid struct bpf_spin_lock will be 4 byte aligned */ + return -EINVAL; + } + return off; +} + static void btf_struct_seq_show(const struct btf *btf, const struct btf_type *t, u32 type_id, void *data, u8 bits_offset, struct seq_file *m) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index f13c543b7b36..ef88b167959d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2002,6 +2002,8 @@ const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_map_push_elem_proto __weak; const struct bpf_func_proto bpf_map_pop_elem_proto __weak; const struct bpf_func_proto bpf_map_peek_elem_proto __weak; +const struct bpf_func_proto bpf_spin_lock_proto __weak; +const struct bpf_func_proto bpf_spin_unlock_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4b7c76765d9d..6d3b22c5ad68 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -718,21 +718,12 @@ static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) BITS_PER_LONG == 64; } -static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) -{ - u32 size = htab->map.value_size; - - if (percpu || fd_htab_map_needs_adjust(htab)) - size = round_up(size, 8); - return size; -} - static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { - u32 size = htab_size_value(htab, percpu); + u32 size = htab->map.value_size; bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; @@ -770,10 +761,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = ERR_PTR(-ENOMEM); goto dec_count; } + check_and_init_map_lock(&htab->map, + l_new->key + round_up(key_size, 8)); } memcpy(l_new->key, key, key_size); if (percpu) { + size = round_up(size, 8); if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { @@ -791,8 +785,13 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); - } else { + } else if (fd_htab_map_needs_adjust(htab)) { + size = round_up(size, 8); memcpy(l_new->key + round_up(key_size, 8), value, size); + } else { + copy_map_value(&htab->map, + l_new->key + round_up(key_size, 8), + value); } l_new->hash = hash; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a74972b07e74..fbe544761628 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -221,6 +221,86 @@ const struct bpf_func_proto bpf_get_current_comm_proto = { .arg2_type = ARG_CONST_SIZE, }; +#if defined(CONFIG_QUEUED_SPINLOCKS) || defined(CONFIG_BPF_ARCH_SPINLOCK) + +static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +{ + arch_spinlock_t *l = (void *)lock; + union { + __u32 val; + arch_spinlock_t lock; + } u = { .lock = __ARCH_SPIN_LOCK_UNLOCKED }; + + compiletime_assert(u.val == 0, "__ARCH_SPIN_LOCK_UNLOCKED not 0"); + BUILD_BUG_ON(sizeof(*l) != sizeof(__u32)); + BUILD_BUG_ON(sizeof(*lock) != sizeof(__u32)); + arch_spin_lock(l); +} + +static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +{ + arch_spinlock_t *l = (void *)lock; + + arch_spin_unlock(l); +} + +#else + +static inline void __bpf_spin_lock(struct bpf_spin_lock *lock) +{ + atomic_t *l = (void *)lock; + + BUILD_BUG_ON(sizeof(*l) != sizeof(*lock)); + do { + atomic_cond_read_relaxed(l, !VAL); + } while (atomic_xchg(l, 1)); +} + +static inline void __bpf_spin_unlock(struct bpf_spin_lock *lock) +{ + atomic_t *l = (void *)lock; + + atomic_set_release(l, 0); +} + +#endif + +static DEFINE_PER_CPU(unsigned long, irqsave_flags); + +notrace BPF_CALL_1(bpf_spin_lock, struct bpf_spin_lock *, lock) +{ + unsigned long flags; + + local_irq_save(flags); + __bpf_spin_lock(lock); + __this_cpu_write(irqsave_flags, flags); + return 0; +} + +const struct bpf_func_proto bpf_spin_lock_proto = { + .func = bpf_spin_lock, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_SPIN_LOCK, +}; + +notrace BPF_CALL_1(bpf_spin_unlock, struct bpf_spin_lock *, lock) +{ + unsigned long flags; + + flags = __this_cpu_read(irqsave_flags); + __bpf_spin_unlock(lock); + local_irq_restore(flags); + return 0; +} + +const struct bpf_func_proto bpf_spin_unlock_proto = { + .func = bpf_spin_unlock, + .gpl_only = false, + .ret_type = RET_VOID, + .arg1_type = ARG_PTR_TO_SPIN_LOCK, +}; + #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 52378d3e34b3..583346a0ab29 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -37,6 +37,11 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-EINVAL); } + if (map_value_has_spin_lock(inner_map)) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + inner_map_meta_size = sizeof(*inner_map_meta); /* In some cases verifier needs to access beyond just base map. */ if (inner_map->ops == &array_map_ops) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b155cd17c1bd..ebf0a673cb83 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -463,7 +463,7 @@ int map_check_no_btf(const struct bpf_map *map, return -ENOTSUPP; } -static int map_check_btf(const struct bpf_map *map, const struct btf *btf, +static int map_check_btf(struct bpf_map *map, const struct btf *btf, u32 btf_key_id, u32 btf_value_id) { const struct btf_type *key_type, *value_type; @@ -478,6 +478,21 @@ static int map_check_btf(const struct bpf_map *map, const struct btf *btf, if (!value_type || value_size != map->value_size) return -EINVAL; + map->spin_lock_off = btf_find_spin_lock(btf, value_type); + + if (map_value_has_spin_lock(map)) { + if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) + return -ENOTSUPP; + if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > + map->value_size) { + WARN_ONCE(1, + "verifier bug spin_lock_off %d value_size %d\n", + map->spin_lock_off, map->value_size); + return -EFAULT; + } + } + if (map->ops->map_check_btf) ret = map->ops->map_check_btf(map, btf, key_type, value_type); @@ -542,6 +557,8 @@ static int map_create(union bpf_attr *attr) map->btf = btf; map->btf_key_type_id = attr->btf_key_type_id; map->btf_value_type_id = attr->btf_value_type_id; + } else { + map->spin_lock_off = -EINVAL; } err = security_bpf_map_alloc(map); @@ -740,7 +757,7 @@ static int map_lookup_elem(union bpf_attr *attr) err = -ENOENT; } else { err = 0; - memcpy(value, ptr, value_size); + copy_map_value(map, value, ptr); } rcu_read_unlock(); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8c1c21cd50b4..38892bdee651 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -213,6 +213,7 @@ struct bpf_call_arg_meta { s64 msize_smax_value; u64 msize_umax_value; int ptr_id; + int func_id; }; static DEFINE_MUTEX(bpf_verifier_lock); @@ -351,6 +352,12 @@ static bool reg_is_refcounted(const struct bpf_reg_state *reg) return type_is_refcounted(reg->type); } +static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) +{ + return reg->type == PTR_TO_MAP_VALUE && + map_value_has_spin_lock(reg->map_ptr); +} + static bool reg_is_refcounted_or_null(const struct bpf_reg_state *reg) { return type_is_refcounted_or_null(reg->type); @@ -712,6 +719,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state, } dst_state->speculative = src->speculative; dst_state->curframe = src->curframe; + dst_state->active_spin_lock = src->active_spin_lock; for (i = 0; i <= src->curframe; i++) { dst = dst_state->frame[i]; if (!dst) { @@ -1483,6 +1491,21 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, if (err) verbose(env, "R%d max value is outside of the array range\n", regno); + + if (map_value_has_spin_lock(reg->map_ptr)) { + u32 lock = reg->map_ptr->spin_lock_off; + + /* if any part of struct bpf_spin_lock can be touched by + * load/store reject this program. + * To check that [x1, x2) overlaps with [y1, y2) + * it is sufficient to check x1 < y2 && y1 < x2. + */ + if (reg->smin_value + off < lock + sizeof(struct bpf_spin_lock) && + lock < reg->umax_value + off + size) { + verbose(env, "bpf_spin_lock cannot be accessed directly by load/store\n"); + return -EACCES; + } + } return err; } @@ -2192,6 +2215,91 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +/* Implementation details: + * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL + * Two bpf_map_lookups (even with the same key) will have different reg->id. + * For traditional PTR_TO_MAP_VALUE the verifier clears reg->id after + * value_or_null->value transition, since the verifier only cares about + * the range of access to valid map value pointer and doesn't care about actual + * address of the map element. + * For maps with 'struct bpf_spin_lock' inside map value the verifier keeps + * reg->id > 0 after value_or_null->value transition. By doing so + * two bpf_map_lookups will be considered two different pointers that + * point to different bpf_spin_locks. + * The verifier allows taking only one bpf_spin_lock at a time to avoid + * dead-locks. + * Since only one bpf_spin_lock is allowed the checks are simpler than + * reg_is_refcounted() logic. The verifier needs to remember only + * one spin_lock instead of array of acquired_refs. + * cur_state->active_spin_lock remembers which map value element got locked + * and clears it after bpf_spin_unlock. + */ +static int process_spin_lock(struct bpf_verifier_env *env, int regno, + bool is_lock) +{ + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; + struct bpf_verifier_state *cur = env->cur_state; + bool is_const = tnum_is_const(reg->var_off); + struct bpf_map *map = reg->map_ptr; + u64 val = reg->var_off.value; + + if (reg->type != PTR_TO_MAP_VALUE) { + verbose(env, "R%d is not a pointer to map_value\n", regno); + return -EINVAL; + } + if (!is_const) { + verbose(env, + "R%d doesn't have constant offset. bpf_spin_lock has to be at the constant offset\n", + regno); + return -EINVAL; + } + if (!map->btf) { + verbose(env, + "map '%s' has to have BTF in order to use bpf_spin_lock\n", + map->name); + return -EINVAL; + } + if (!map_value_has_spin_lock(map)) { + if (map->spin_lock_off == -E2BIG) + verbose(env, + "map '%s' has more than one 'struct bpf_spin_lock'\n", + map->name); + else if (map->spin_lock_off == -ENOENT) + verbose(env, + "map '%s' doesn't have 'struct bpf_spin_lock'\n", + map->name); + else + verbose(env, + "map '%s' is not a struct type or bpf_spin_lock is mangled\n", + map->name); + return -EINVAL; + } + if (map->spin_lock_off != val + reg->off) { + verbose(env, "off %lld doesn't point to 'struct bpf_spin_lock'\n", + val + reg->off); + return -EINVAL; + } + if (is_lock) { + if (cur->active_spin_lock) { + verbose(env, + "Locking two bpf_spin_locks are not allowed\n"); + return -EINVAL; + } + cur->active_spin_lock = reg->id; + } else { + if (!cur->active_spin_lock) { + verbose(env, "bpf_spin_unlock without taking a lock\n"); + return -EINVAL; + } + if (cur->active_spin_lock != reg->id) { + verbose(env, "bpf_spin_unlock of different lock\n"); + return -EINVAL; + } + cur->active_spin_lock = 0; + } + return 0; +} + static bool arg_type_is_mem_ptr(enum bpf_arg_type type) { return type == ARG_PTR_TO_MEM || @@ -2268,6 +2376,17 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, return -EFAULT; } meta->ptr_id = reg->id; + } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { + if (meta->func_id == BPF_FUNC_spin_lock) { + if (process_spin_lock(env, regno, true)) + return -EACCES; + } else if (meta->func_id == BPF_FUNC_spin_unlock) { + if (process_spin_lock(env, regno, false)) + return -EACCES; + } else { + verbose(env, "verifier internal error\n"); + return -EFAULT; + } } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be @@ -2887,6 +3006,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn return err; } + meta.func_id = func_id; /* check args */ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &meta); if (err) @@ -4473,7 +4593,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { reg->type = PTR_TO_SOCKET; } - if (is_null || !reg_is_refcounted(reg)) { + if (is_null || !(reg_is_refcounted(reg) || + reg_may_point_to_spin_lock(reg))) { /* We don't need id from this point onwards anymore, * thus we should better reset it, so that state * pruning has chances to take effect. @@ -4871,6 +4992,11 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return err; } + if (env->cur_state->active_spin_lock) { + verbose(env, "BPF_LD_[ABS|IND] cannot be used inside bpf_spin_lock-ed region\n"); + return -EINVAL; + } + if (regs[BPF_REG_6].type != PTR_TO_CTX) { verbose(env, "at the time of BPF_LD_ABS|IND R6 != pointer to skb\n"); @@ -5607,8 +5733,11 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and * everything else matches, we are OK. - * We don't care about the 'id' value, because nothing - * uses it for PTR_TO_MAP_VALUE (only for ..._OR_NULL) + * 'id' is not compared, since it's only used for maps with + * bpf_spin_lock inside map element and in such cases if + * the rest of the prog is valid for one map element then + * it's valid for all map elements regardless of the key + * used in bpf_map_lookup() */ return memcmp(rold, rcur, offsetof(struct bpf_reg_state, id)) == 0 && range_within(rold, rcur) && @@ -5811,6 +5940,9 @@ static bool states_equal(struct bpf_verifier_env *env, if (old->speculative && !cur->speculative) return false; + if (old->active_spin_lock != cur->active_spin_lock) + return false; + /* for states to be equal callsites have to be the same * and all frame states need to be equivalent */ @@ -6229,6 +6361,12 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } + if (env->cur_state->active_spin_lock && + (insn->src_reg == BPF_PSEUDO_CALL || + insn->imm != BPF_FUNC_spin_unlock)) { + verbose(env, "function calls are not allowed while holding a lock\n"); + return -EINVAL; + } if (insn->src_reg == BPF_PSEUDO_CALL) err = check_func_call(env, insn, &env->insn_idx); else @@ -6259,6 +6397,11 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } + if (env->cur_state->active_spin_lock) { + verbose(env, "bpf_spin_unlock is missing\n"); + return -EINVAL; + } + if (state->curframe) { /* exit from nested function */ env->prev_insn_idx = env->insn_idx; @@ -6356,6 +6499,19 @@ static int check_map_prealloc(struct bpf_map *map) !(map->map_flags & BPF_F_NO_PREALLOC); } +static bool is_tracing_prog_type(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_KPROBE: + case BPF_PROG_TYPE_TRACEPOINT: + case BPF_PROG_TYPE_PERF_EVENT: + case BPF_PROG_TYPE_RAW_TRACEPOINT: + return true; + default: + return false; + } +} + static int check_map_prog_compatibility(struct bpf_verifier_env *env, struct bpf_map *map, struct bpf_prog *prog) @@ -6378,6 +6534,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, } } + if ((is_tracing_prog_type(prog->type) || + prog->type == BPF_PROG_TYPE_SOCKET_FILTER) && + map_value_has_spin_lock(map)) { + verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && !bpf_offload_prog_map_match(prog, map)) { verbose(env, "offload device mismatch between prog and map\n"); diff --git a/net/core/filter.c b/net/core/filter.c index 41984ad4b9b4..3a49f68eda10 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5314,10 +5314,20 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_tail_call_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + default: + break; + } + + if (!capable(CAP_SYS_ADMIN)) + return NULL; + + switch (func_id) { + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; case BPF_FUNC_trace_printk: - if (capable(CAP_SYS_ADMIN)) - return bpf_get_trace_printk_proto(); - /* else, fall through */ + return bpf_get_trace_printk_proto(); default: return NULL; } -- cgit v1.2.3 From 96049f3afd50fe8db69fa0068cdca822e747b1e4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 31 Jan 2019 15:40:09 -0800 Subject: bpf: introduce BPF_F_LOCK flag Introduce BPF_F_LOCK flag for map_lookup and map_update syscall commands and for map_update() helper function. In all these cases take a lock of existing element (which was provided in BTF description) before copying (in or out) the rest of map value. Implementation details that are part of uapi: Array: The array map takes the element lock for lookup/update. Hash: hash map also takes the lock for lookup/update and tries to avoid the bucket lock. If old element exists it takes the element lock and updates the element in place. If element doesn't exist it allocates new one and inserts into hash table while holding the bucket lock. In rare case the hashmap has to take both the bucket lock and the element lock to update old value in place. Cgroup local storage: It is similar to array. update in place and lookup are done with lock taken. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/arraymap.c | 24 ++++++++++++++++-------- kernel/bpf/hashtab.c | 42 +++++++++++++++++++++++++++++++++++++++--- kernel/bpf/helpers.c | 16 ++++++++++++++++ kernel/bpf/local_storage.c | 14 +++++++++++++- kernel/bpf/syscall.c | 25 +++++++++++++++++++++++-- 7 files changed, 110 insertions(+), 14 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2ae615b48bb8..bd169a7bcc93 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -119,6 +119,8 @@ static inline void copy_map_value(struct bpf_map *map, void *dst, void *src) memcpy(dst, src, map->value_size); } } +void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, + bool lock_src); struct bpf_offload_dev; struct bpf_offloaded_map; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 86f7c438d40f..1777fa0c61e4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -267,6 +267,7 @@ enum bpf_attach_type { #define BPF_ANY 0 /* create new element or update existing */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ #define BPF_EXIST 2 /* update existing element */ +#define BPF_F_LOCK 4 /* spin_lock-ed map_lookup/map_update */ /* flags for BPF_MAP_CREATE command */ #define BPF_F_NO_PREALLOC (1U << 0) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index d6d979910a2a..c72e0d8e1e65 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -253,8 +253,9 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, { struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; + char *val; - if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; @@ -262,18 +263,25 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (unlikely(map_flags == BPF_NOEXIST)) + if (unlikely(map_flags & BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + if (unlikely((map_flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map))) + return -EINVAL; + + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); - else - copy_map_value(map, - array->value + - array->elem_size * (index & array->index_mask), - value); + } else { + val = array->value + + array->elem_size * (index & array->index_mask); + if (map_flags & BPF_F_LOCK) + copy_map_value_locked(map, val, value, false); + else + copy_map_value(map, val, value); + } return 0; } diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 6d3b22c5ad68..937776531998 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -804,11 +804,11 @@ dec_count: static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, u64 map_flags) { - if (l_old && map_flags == BPF_NOEXIST) + if (l_old && (map_flags & ~BPF_F_LOCK) == BPF_NOEXIST) /* elem already exists */ return -EEXIST; - if (!l_old && map_flags == BPF_EXIST) + if (!l_old && (map_flags & ~BPF_F_LOCK) == BPF_EXIST) /* elem doesn't exist, cannot update it */ return -ENOENT; @@ -827,7 +827,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u32 key_size, hash; int ret; - if (unlikely(map_flags > BPF_EXIST)) + if (unlikely((map_flags & ~BPF_F_LOCK) > BPF_EXIST)) /* unknown flags */ return -EINVAL; @@ -840,6 +840,28 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, b = __select_bucket(htab, hash); head = &b->head; + if (unlikely(map_flags & BPF_F_LOCK)) { + if (unlikely(!map_value_has_spin_lock(map))) + return -EINVAL; + /* find an element without taking the bucket lock */ + l_old = lookup_nulls_elem_raw(head, hash, key, key_size, + htab->n_buckets); + ret = check_flags(htab, l_old, map_flags); + if (ret) + return ret; + if (l_old) { + /* grab the element lock and update value in place */ + copy_map_value_locked(map, + l_old->key + round_up(key_size, 8), + value, false); + return 0; + } + /* fall through, grab the bucket lock and lookup again. + * 99.9% chance that the element won't be found, + * but second lookup under lock has to be done. + */ + } + /* bpf_map_update_elem() can be called in_irq() */ raw_spin_lock_irqsave(&b->lock, flags); @@ -849,6 +871,20 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, if (ret) goto err; + if (unlikely(l_old && (map_flags & BPF_F_LOCK))) { + /* first lookup without the bucket lock didn't find the element, + * but second lookup with the bucket lock found it. + * This case is highly unlikely, but has to be dealt with: + * grab the element lock in addition to the bucket lock + * and update element in place + */ + copy_map_value_locked(map, + l_old->key + round_up(key_size, 8), + value, false); + ret = 0; + goto err; + } + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false, l_old); if (IS_ERR(l_new)) { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index fbe544761628..a411fc17d265 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -301,6 +301,22 @@ const struct bpf_func_proto bpf_spin_unlock_proto = { .arg1_type = ARG_PTR_TO_SPIN_LOCK, }; +void copy_map_value_locked(struct bpf_map *map, void *dst, void *src, + bool lock_src) +{ + struct bpf_spin_lock *lock; + + if (lock_src) + lock = src + map->spin_lock_off; + else + lock = dst + map->spin_lock_off; + preempt_disable(); + ____bpf_spin_lock(lock); + copy_map_value(map, dst, src); + ____bpf_spin_unlock(lock); + preempt_enable(); +} + #ifdef CONFIG_CGROUPS BPF_CALL_0(bpf_get_current_cgroup_id) { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 0295427f06e2..6b572e2de7fb 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -131,7 +131,14 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (flags != BPF_ANY && flags != BPF_EXIST) + if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) + return -EINVAL; + + if (unlikely(flags & BPF_NOEXIST)) + return -EINVAL; + + if (unlikely((flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map))) return -EINVAL; storage = cgroup_storage_lookup((struct bpf_cgroup_storage_map *)map, @@ -139,6 +146,11 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, if (!storage) return -ENOENT; + if (flags & BPF_F_LOCK) { + copy_map_value_locked(map, storage->buf->data, value, false); + return 0; + } + new = kmalloc_node(sizeof(struct bpf_storage_buffer) + map->value_size, __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index b29e6dc44650..0834958f1dc4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -682,7 +682,7 @@ static void *__bpf_copy_key(void __user *ukey, u64 key_size) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value +#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD flags static int map_lookup_elem(union bpf_attr *attr) { @@ -698,6 +698,9 @@ static int map_lookup_elem(union bpf_attr *attr) if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) return -EINVAL; + if (attr->flags & ~BPF_F_LOCK) + return -EINVAL; + f = fdget(ufd); map = __bpf_map_get(f); if (IS_ERR(map)) @@ -708,6 +711,12 @@ static int map_lookup_elem(union bpf_attr *attr) goto err_put; } + if ((attr->flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + err = -EINVAL; + goto err_put; + } + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); @@ -758,7 +767,13 @@ static int map_lookup_elem(union bpf_attr *attr) err = -ENOENT; } else { err = 0; - copy_map_value(map, value, ptr); + if (attr->flags & BPF_F_LOCK) + /* lock 'ptr' and copy everything but lock */ + copy_map_value_locked(map, value, ptr, true); + else + copy_map_value(map, value, ptr); + /* mask lock, since value wasn't zero inited */ + check_and_init_map_lock(map, value); } rcu_read_unlock(); } @@ -818,6 +833,12 @@ static int map_update_elem(union bpf_attr *attr) goto err_put; } + if ((attr->flags & BPF_F_LOCK) && + !map_value_has_spin_lock(map)) { + err = -EINVAL; + goto err_put; + } + key = __bpf_copy_key(ukey, map->key_size); if (IS_ERR(key)) { err = PTR_ERR(key); -- cgit v1.2.3 From fb99bce7120014307dde57b3d7def6977a9a62a1 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Wed, 30 Jan 2019 21:58:05 +0000 Subject: net: tls: Support 256 bit keys Wire up support for 256 bit keys from the setsockopt to the crypto framework Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 5 +++- include/uapi/linux/tls.h | 15 ++++++++++ net/tls/tls_main.c | 33 +++++++++++++++++++-- net/tls/tls_sw.c | 29 +++++++++++++++--- tools/testing/selftests/net/tls.c | 62 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 137 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tls.h b/include/net/tls.h index 4592606e136a..da616db48413 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -206,7 +206,10 @@ struct cipher_context { union tls_crypto_context { struct tls_crypto_info info; - struct tls12_crypto_info_aes_gcm_128 aes_gcm_128; + union { + struct tls12_crypto_info_aes_gcm_128 aes_gcm_128; + struct tls12_crypto_info_aes_gcm_256 aes_gcm_256; + }; }; struct tls_context { diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index ff02287495ac..9affceaa3db4 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -59,6 +59,13 @@ #define TLS_CIPHER_AES_GCM_128_TAG_SIZE 16 #define TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE 8 +#define TLS_CIPHER_AES_GCM_256 52 +#define TLS_CIPHER_AES_GCM_256_IV_SIZE 8 +#define TLS_CIPHER_AES_GCM_256_KEY_SIZE 32 +#define TLS_CIPHER_AES_GCM_256_SALT_SIZE 4 +#define TLS_CIPHER_AES_GCM_256_TAG_SIZE 16 +#define TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE 8 + #define TLS_SET_RECORD_TYPE 1 #define TLS_GET_RECORD_TYPE 2 @@ -75,4 +82,12 @@ struct tls12_crypto_info_aes_gcm_128 { unsigned char rec_seq[TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE]; }; +struct tls12_crypto_info_aes_gcm_256 { + struct tls_crypto_info info; + unsigned char iv[TLS_CIPHER_AES_GCM_256_IV_SIZE]; + unsigned char key[TLS_CIPHER_AES_GCM_256_KEY_SIZE]; + unsigned char salt[TLS_CIPHER_AES_GCM_256_SALT_SIZE]; + unsigned char rec_seq[TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE]; +}; + #endif /* _UAPI_LINUX_TLS_H */ diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index d36d095cbcf0..0f028cfdf835 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -372,6 +372,30 @@ static int do_tls_getsockopt_tx(struct sock *sk, char __user *optval, rc = -EFAULT; break; } + case TLS_CIPHER_AES_GCM_256: { + struct tls12_crypto_info_aes_gcm_256 * + crypto_info_aes_gcm_256 = + container_of(crypto_info, + struct tls12_crypto_info_aes_gcm_256, + info); + + if (len != sizeof(*crypto_info_aes_gcm_256)) { + rc = -EINVAL; + goto out; + } + lock_sock(sk); + memcpy(crypto_info_aes_gcm_256->iv, + ctx->tx.iv + TLS_CIPHER_AES_GCM_256_SALT_SIZE, + TLS_CIPHER_AES_GCM_256_IV_SIZE); + memcpy(crypto_info_aes_gcm_256->rec_seq, ctx->tx.rec_seq, + TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE); + release_sock(sk); + if (copy_to_user(optval, + crypto_info_aes_gcm_256, + sizeof(*crypto_info_aes_gcm_256))) + rc = -EFAULT; + break; + } default: rc = -EINVAL; } @@ -412,6 +436,7 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, { struct tls_crypto_info *crypto_info; struct tls_context *ctx = tls_get_ctx(sk); + size_t optsize; int rc = 0; int conf; @@ -444,8 +469,12 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, } switch (crypto_info->cipher_type) { - case TLS_CIPHER_AES_GCM_128: { - if (optlen != sizeof(struct tls12_crypto_info_aes_gcm_128)) { + case TLS_CIPHER_AES_GCM_128: + case TLS_CIPHER_AES_GCM_256: { + optsize = crypto_info->cipher_type == TLS_CIPHER_AES_GCM_128 ? + sizeof(struct tls12_crypto_info_aes_gcm_128) : + sizeof(struct tls12_crypto_info_aes_gcm_256); + if (optlen != optsize) { rc = -EINVAL; goto err_crypto_info; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 3f2a6af27e62..9326c06c2ffe 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1999,6 +1999,7 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) { struct tls_crypto_info *crypto_info; struct tls12_crypto_info_aes_gcm_128 *gcm_128_info; + struct tls12_crypto_info_aes_gcm_256 *gcm_256_info; struct tls_sw_context_tx *sw_ctx_tx = NULL; struct tls_sw_context_rx *sw_ctx_rx = NULL; struct cipher_context *cctx; @@ -2006,7 +2007,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) struct strp_callbacks cb; u16 nonce_size, tag_size, iv_size, rec_seq_size; struct crypto_tfm *tfm; - char *iv, *rec_seq; + char *iv, *rec_seq, *key, *salt; + size_t keysize; int rc = 0; if (!ctx) { @@ -2067,6 +2069,24 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq; gcm_128_info = (struct tls12_crypto_info_aes_gcm_128 *)crypto_info; + keysize = TLS_CIPHER_AES_GCM_128_KEY_SIZE; + key = gcm_128_info->key; + salt = gcm_128_info->salt; + break; + } + case TLS_CIPHER_AES_GCM_256: { + nonce_size = TLS_CIPHER_AES_GCM_256_IV_SIZE; + tag_size = TLS_CIPHER_AES_GCM_256_TAG_SIZE; + iv_size = TLS_CIPHER_AES_GCM_256_IV_SIZE; + iv = ((struct tls12_crypto_info_aes_gcm_256 *)crypto_info)->iv; + rec_seq_size = TLS_CIPHER_AES_GCM_256_REC_SEQ_SIZE; + rec_seq = + ((struct tls12_crypto_info_aes_gcm_256 *)crypto_info)->rec_seq; + gcm_256_info = + (struct tls12_crypto_info_aes_gcm_256 *)crypto_info; + keysize = TLS_CIPHER_AES_GCM_256_KEY_SIZE; + key = gcm_256_info->key; + salt = gcm_256_info->salt; break; } default: @@ -2090,7 +2110,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) rc = -ENOMEM; goto free_priv; } - memcpy(cctx->iv, gcm_128_info->salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + /* Note: 128 & 256 bit salt are the same size */ + memcpy(cctx->iv, salt, TLS_CIPHER_AES_GCM_128_SALT_SIZE); memcpy(cctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); cctx->rec_seq_size = rec_seq_size; cctx->rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL); @@ -2110,8 +2131,8 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) ctx->push_pending_record = tls_sw_push_pending_record; - rc = crypto_aead_setkey(*aead, gcm_128_info->key, - TLS_CIPHER_AES_GCM_128_KEY_SIZE); + rc = crypto_aead_setkey(*aead, key, keysize); + if (rc) goto free_aead; diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c index ff68ed19c0ef..c356f481de79 100644 --- a/tools/testing/selftests/net/tls.c +++ b/tools/testing/selftests/net/tls.c @@ -763,4 +763,66 @@ TEST_F(tls, control_msg) EXPECT_EQ(memcmp(buf, test_str, send_len), 0); } +TEST(keysizes) { + struct tls12_crypto_info_aes_gcm_256 tls12; + struct sockaddr_in addr; + int sfd, ret, fd, cfd; + socklen_t len; + bool notls; + + notls = false; + len = sizeof(addr); + + memset(&tls12, 0, sizeof(tls12)); + tls12.info.version = TLS_1_2_VERSION; + tls12.info.cipher_type = TLS_CIPHER_AES_GCM_256; + + addr.sin_family = AF_INET; + addr.sin_addr.s_addr = htonl(INADDR_ANY); + addr.sin_port = 0; + + fd = socket(AF_INET, SOCK_STREAM, 0); + sfd = socket(AF_INET, SOCK_STREAM, 0); + + ret = bind(sfd, &addr, sizeof(addr)); + ASSERT_EQ(ret, 0); + ret = listen(sfd, 10); + ASSERT_EQ(ret, 0); + + ret = getsockname(sfd, &addr, &len); + ASSERT_EQ(ret, 0); + + ret = connect(fd, &addr, sizeof(addr)); + ASSERT_EQ(ret, 0); + + ret = setsockopt(fd, IPPROTO_TCP, TCP_ULP, "tls", sizeof("tls")); + if (ret != 0) { + notls = true; + printf("Failure setting TCP_ULP, testing without tls\n"); + } + + if (!notls) { + ret = setsockopt(fd, SOL_TLS, TLS_TX, &tls12, + sizeof(tls12)); + EXPECT_EQ(ret, 0); + } + + cfd = accept(sfd, &addr, &len); + ASSERT_GE(cfd, 0); + + if (!notls) { + ret = setsockopt(cfd, IPPROTO_TCP, TCP_ULP, "tls", + sizeof("tls")); + EXPECT_EQ(ret, 0); + + ret = setsockopt(cfd, SOL_TLS, TLS_RX, &tls12, + sizeof(tls12)); + EXPECT_EQ(ret, 0); + } + + close(sfd); + close(fd); + close(cfd); +} + TEST_HARNESS_MAIN -- cgit v1.2.3 From 130b392c6cd6b2aed1b7eb32253d4920babb4891 Mon Sep 17 00:00:00 2001 From: Dave Watson Date: Wed, 30 Jan 2019 21:58:31 +0000 Subject: net: tls: Add tls 1.3 support TLS 1.3 has minor changes from TLS 1.2 at the record layer. * Header now hardcodes the same version and application content type in the header. * The real content type is appended after the data, before encryption (or after decryption). * The IV is xored with the sequence number, instead of concatinating four bytes of IV with the explicit IV. * Zero-padding: No exlicit length is given, we search backwards from the end of the decrypted data for the first non-zero byte, which is the content type. Currently recv supports reading zero-padding, but there is no way for send to add zero padding. Signed-off-by: Dave Watson Signed-off-by: David S. Miller --- include/net/tls.h | 66 +++++++++++++++++------- include/uapi/linux/tls.h | 4 ++ net/tls/tls_device.c | 5 +- net/tls/tls_device_fallback.c | 3 +- net/tls/tls_main.c | 3 +- net/tls/tls_sw.c | 116 ++++++++++++++++++++++++++++++++++-------- 6 files changed, 154 insertions(+), 43 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tls.h b/include/net/tls.h index 754b130672f0..004bf01ce868 100644 --- a/include/net/tls.h +++ b/include/net/tls.h @@ -119,6 +119,9 @@ struct tls_rec { /* AAD | msg_encrypted.sg.data (data contains overhead for hdr & iv & tag) */ struct scatterlist sg_aead_out[2]; + char content_type; + struct scatterlist sg_content_type; + char aad_space[TLS_AAD_SPACE_SIZE]; u8 iv_data[TLS_CIPHER_AES_GCM_128_IV_SIZE + TLS_CIPHER_AES_GCM_128_SALT_SIZE]; @@ -203,6 +206,7 @@ struct cipher_context { u16 rec_seq_size; char *rec_seq; u16 aad_size; + u16 tail_size; }; union tls_crypto_context { @@ -397,49 +401,77 @@ static inline bool tls_bigint_increment(unsigned char *seq, int len) } static inline void tls_advance_record_sn(struct sock *sk, - struct cipher_context *ctx) + struct cipher_context *ctx, + int version) { if (tls_bigint_increment(ctx->rec_seq, ctx->rec_seq_size)) tls_err_abort(sk, EBADMSG); - tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - ctx->iv_size); + + if (version != TLS_1_3_VERSION) { + tls_bigint_increment(ctx->iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, + ctx->iv_size); + } } static inline void tls_fill_prepend(struct tls_context *ctx, char *buf, size_t plaintext_len, - unsigned char record_type) + unsigned char record_type, + int version) { size_t pkt_len, iv_size = ctx->tx.iv_size; - pkt_len = plaintext_len + iv_size + ctx->tx.tag_size; + pkt_len = plaintext_len + ctx->tx.tag_size; + if (version != TLS_1_3_VERSION) { + pkt_len += iv_size; + + memcpy(buf + TLS_NONCE_OFFSET, + ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size); + } /* we cover nonce explicit here as well, so buf should be of * size KTLS_DTLS_HEADER_SIZE + KTLS_DTLS_NONCE_EXPLICIT_SIZE */ - buf[0] = record_type; - buf[1] = TLS_VERSION_MINOR(ctx->crypto_send.info.version); - buf[2] = TLS_VERSION_MAJOR(ctx->crypto_send.info.version); + buf[0] = version == TLS_1_3_VERSION ? + TLS_RECORD_TYPE_DATA : record_type; + /* Note that VERSION must be TLS_1_2 for both TLS1.2 and TLS1.3 */ + buf[1] = TLS_1_2_VERSION_MINOR; + buf[2] = TLS_1_2_VERSION_MAJOR; /* we can use IV for nonce explicit according to spec */ buf[3] = pkt_len >> 8; buf[4] = pkt_len & 0xFF; - memcpy(buf + TLS_NONCE_OFFSET, - ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv_size); } static inline void tls_make_aad(char *buf, size_t size, char *record_sequence, int record_sequence_size, - unsigned char record_type) + unsigned char record_type, + int version) +{ + if (version != TLS_1_3_VERSION) { + memcpy(buf, record_sequence, record_sequence_size); + buf += 8; + } else { + size += TLS_CIPHER_AES_GCM_128_TAG_SIZE; + } + + buf[0] = version == TLS_1_3_VERSION ? + TLS_RECORD_TYPE_DATA : record_type; + buf[1] = TLS_1_2_VERSION_MAJOR; + buf[2] = TLS_1_2_VERSION_MINOR; + buf[3] = size >> 8; + buf[4] = size & 0xFF; +} + +static inline void xor_iv_with_seq(int version, char *iv, char *seq) { - memcpy(buf, record_sequence, record_sequence_size); + int i; - buf[8] = record_type; - buf[9] = TLS_1_2_VERSION_MAJOR; - buf[10] = TLS_1_2_VERSION_MINOR; - buf[11] = size >> 8; - buf[12] = size & 0xFF; + if (version == TLS_1_3_VERSION) { + for (i = 0; i < 8; i++) + iv[i + 4] ^= seq[i]; + } } static inline struct tls_context *tls_get_ctx(const struct sock *sk) diff --git a/include/uapi/linux/tls.h b/include/uapi/linux/tls.h index 9affceaa3db4..401d6f01de6a 100644 --- a/include/uapi/linux/tls.h +++ b/include/uapi/linux/tls.h @@ -51,6 +51,10 @@ #define TLS_1_2_VERSION_MINOR 0x3 #define TLS_1_2_VERSION TLS_VERSION_NUMBER(TLS_1_2) +#define TLS_1_3_VERSION_MAJOR 0x3 +#define TLS_1_3_VERSION_MINOR 0x4 +#define TLS_1_3_VERSION TLS_VERSION_NUMBER(TLS_1_3) + /* Supported ciphers */ #define TLS_CIPHER_AES_GCM_128 51 #define TLS_CIPHER_AES_GCM_128_IV_SIZE 8 diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index d753e362d2d9..7ee9008b2187 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -257,7 +257,8 @@ static int tls_push_record(struct sock *sk, tls_fill_prepend(ctx, skb_frag_address(frag), record->len - ctx->tx.prepend_size, - record_type); + record_type, + ctx->crypto_send.info.version); /* HW doesn't care about the data in the tag, because it fills it. */ dummy_tag_frag.page = skb_frag_page(frag); @@ -270,7 +271,7 @@ static int tls_push_record(struct sock *sk, spin_unlock_irq(&offload_ctx->lock); offload_ctx->open_record = NULL; set_bit(TLS_PENDING_CLOSED_RECORD, &ctx->flags); - tls_advance_record_sn(sk, &ctx->tx); + tls_advance_record_sn(sk, &ctx->tx, ctx->crypto_send.info.version); for (i = 0; i < record->num_frags; i++) { frag = &record->frags[i]; diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 450a6dbc5a88..54c3a758f2a7 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -73,7 +73,8 @@ static int tls_enc_record(struct aead_request *aead_req, len -= TLS_CIPHER_AES_GCM_128_IV_SIZE; tls_make_aad(aad, len - TLS_CIPHER_AES_GCM_128_TAG_SIZE, - (char *)&rcd_sn, sizeof(rcd_sn), buf[0]); + (char *)&rcd_sn, sizeof(rcd_sn), buf[0], + TLS_1_2_VERSION); memcpy(iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, buf + TLS_HEADER_SIZE, TLS_CIPHER_AES_GCM_128_IV_SIZE); diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 0f028cfdf835..d1c2fd9a3f63 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -463,7 +463,8 @@ static int do_tls_setsockopt_conf(struct sock *sk, char __user *optval, } /* check version */ - if (crypto_info->version != TLS_1_2_VERSION) { + if (crypto_info->version != TLS_1_2_VERSION && + crypto_info->version != TLS_1_3_VERSION) { rc = -ENOTSUPP; goto err_crypto_info; } diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 34f3523f668e..06d7ae97b929 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -120,6 +120,34 @@ static int skb_nsg(struct sk_buff *skb, int offset, int len) return __skb_nsg(skb, offset, len, 0); } +static int padding_length(struct tls_sw_context_rx *ctx, + struct tls_context *tls_ctx, struct sk_buff *skb) +{ + struct strp_msg *rxm = strp_msg(skb); + int sub = 0; + + /* Determine zero-padding length */ + if (tls_ctx->crypto_recv.info.version == TLS_1_3_VERSION) { + char content_type = 0; + int err; + int back = 17; + + while (content_type == 0) { + if (back > rxm->full_len) + return -EBADMSG; + err = skb_copy_bits(skb, + rxm->offset + rxm->full_len - back, + &content_type, 1); + if (content_type) + break; + sub++; + back++; + } + ctx->control = content_type; + } + return sub; +} + static void tls_decrypt_done(struct crypto_async_request *req, int err) { struct aead_request *aead_req = (struct aead_request *)req; @@ -142,7 +170,7 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err) tls_err_abort(skb->sk, err); } else { struct strp_msg *rxm = strp_msg(skb); - + rxm->full_len -= padding_length(ctx, tls_ctx, skb); rxm->offset += tls_ctx->rx.prepend_size; rxm->full_len -= tls_ctx->rx.overhead_size; } @@ -448,6 +476,8 @@ static int tls_do_encryption(struct sock *sk, int rc; memcpy(rec->iv_data, tls_ctx->tx.iv, sizeof(rec->iv_data)); + xor_iv_with_seq(tls_ctx->crypto_send.info.version, rec->iv_data, + tls_ctx->tx.rec_seq); sge->offset += tls_ctx->tx.prepend_size; sge->length -= tls_ctx->tx.prepend_size; @@ -483,7 +513,8 @@ static int tls_do_encryption(struct sock *sk, /* Unhook the record from context if encryption is not failure */ ctx->open_rec = NULL; - tls_advance_record_sn(sk, &tls_ctx->tx); + tls_advance_record_sn(sk, &tls_ctx->tx, + tls_ctx->crypto_send.info.version); return rc; } @@ -640,7 +671,17 @@ static int tls_push_record(struct sock *sk, int flags, i = msg_pl->sg.end; sk_msg_iter_var_prev(i); - sg_mark_end(sk_msg_elem(msg_pl, i)); + + rec->content_type = record_type; + if (tls_ctx->crypto_send.info.version == TLS_1_3_VERSION) { + /* Add content type to end of message. No padding added */ + sg_set_buf(&rec->sg_content_type, &rec->content_type, 1); + sg_mark_end(&rec->sg_content_type); + sg_chain(msg_pl->sg.data, msg_pl->sg.end + 1, + &rec->sg_content_type); + } else { + sg_mark_end(sk_msg_elem(msg_pl, i)); + } i = msg_pl->sg.start; sg_chain(rec->sg_aead_in, 2, rec->inplace_crypto ? @@ -653,18 +694,22 @@ static int tls_push_record(struct sock *sk, int flags, i = msg_en->sg.start; sg_chain(rec->sg_aead_out, 2, &msg_en->sg.data[i]); - tls_make_aad(rec->aad_space, msg_pl->sg.size, + tls_make_aad(rec->aad_space, msg_pl->sg.size + tls_ctx->tx.tail_size, tls_ctx->tx.rec_seq, tls_ctx->tx.rec_seq_size, - record_type); + record_type, + tls_ctx->crypto_send.info.version); tls_fill_prepend(tls_ctx, page_address(sg_page(&msg_en->sg.data[i])) + - msg_en->sg.data[i].offset, msg_pl->sg.size, - record_type); + msg_en->sg.data[i].offset, + msg_pl->sg.size + tls_ctx->tx.tail_size, + record_type, + tls_ctx->crypto_send.info.version); tls_ctx->pending_open_record_frags = false; - rc = tls_do_encryption(sk, tls_ctx, ctx, req, msg_pl->sg.size, i); + rc = tls_do_encryption(sk, tls_ctx, ctx, req, + msg_pl->sg.size + tls_ctx->tx.tail_size, i); if (rc < 0) { if (rc != -EINPROGRESS) { tls_err_abort(sk, EBADMSG); @@ -1292,7 +1337,8 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, u8 *aad, *iv, *mem = NULL; struct scatterlist *sgin = NULL; struct scatterlist *sgout = NULL; - const int data_len = rxm->full_len - tls_ctx->rx.overhead_size; + const int data_len = rxm->full_len - tls_ctx->rx.overhead_size + + tls_ctx->rx.tail_size; if (*zc && (out_iov || out_sg)) { if (out_iov) @@ -1343,12 +1389,20 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb, kfree(mem); return err; } - memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + if (tls_ctx->crypto_recv.info.version == TLS_1_3_VERSION) + memcpy(iv, tls_ctx->rx.iv, crypto_aead_ivsize(ctx->aead_recv)); + else + memcpy(iv, tls_ctx->rx.iv, TLS_CIPHER_AES_GCM_128_SALT_SIZE); + + xor_iv_with_seq(tls_ctx->crypto_recv.info.version, iv, + tls_ctx->rx.rec_seq); /* Prepare AAD */ - tls_make_aad(aad, rxm->full_len - tls_ctx->rx.overhead_size, + tls_make_aad(aad, rxm->full_len - tls_ctx->rx.overhead_size + + tls_ctx->rx.tail_size, tls_ctx->rx.rec_seq, tls_ctx->rx.rec_seq_size, - ctx->control); + ctx->control, + tls_ctx->crypto_recv.info.version); /* Prepare sgin */ sg_init_table(sgin, n_sgin); @@ -1405,6 +1459,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, { struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx); + int version = tls_ctx->crypto_recv.info.version; struct strp_msg *rxm = strp_msg(skb); int err = 0; @@ -1417,13 +1472,17 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb, err = decrypt_internal(sk, skb, dest, NULL, chunk, zc, async); if (err < 0) { if (err == -EINPROGRESS) - tls_advance_record_sn(sk, &tls_ctx->rx); + tls_advance_record_sn(sk, &tls_ctx->rx, + version); return err; } + + rxm->full_len -= padding_length(ctx, tls_ctx, skb); + rxm->offset += tls_ctx->rx.prepend_size; rxm->full_len -= tls_ctx->rx.overhead_size; - tls_advance_record_sn(sk, &tls_ctx->rx); + tls_advance_record_sn(sk, &tls_ctx->rx, version); ctx->decrypted = true; ctx->saved_data_ready(sk); } else { @@ -1611,7 +1670,8 @@ int tls_sw_recvmsg(struct sock *sk, to_decrypt = rxm->full_len - tls_ctx->rx.overhead_size; if (to_decrypt <= len && !is_kvec && !is_peek && - ctx->control == TLS_RECORD_TYPE_DATA) + ctx->control == TLS_RECORD_TYPE_DATA && + tls_ctx->crypto_recv.info.version != TLS_1_3_VERSION) zc = true; err = decrypt_skb_update(sk, skb, &msg->msg_iter, @@ -1835,9 +1895,12 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) data_len = ((header[4] & 0xFF) | (header[3] << 8)); - cipher_overhead = tls_ctx->rx.tag_size + tls_ctx->rx.iv_size; + cipher_overhead = tls_ctx->rx.tag_size; + if (tls_ctx->crypto_recv.info.version != TLS_1_3_VERSION) + cipher_overhead += tls_ctx->rx.iv_size; - if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead) { + if (data_len > TLS_MAX_PAYLOAD_SIZE + cipher_overhead + + tls_ctx->rx.tail_size) { ret = -EMSGSIZE; goto read_failure; } @@ -1846,12 +1909,12 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb) goto read_failure; } - if (header[1] != TLS_VERSION_MINOR(tls_ctx->crypto_recv.info.version) || - header[2] != TLS_VERSION_MAJOR(tls_ctx->crypto_recv.info.version)) { + /* Note that both TLS1.3 and TLS1.2 use TLS_1_2 version here */ + if (header[1] != TLS_1_2_VERSION_MINOR || + header[2] != TLS_1_2_VERSION_MAJOR) { ret = -EINVAL; goto read_failure; } - #ifdef CONFIG_TLS_DEVICE handle_device_resync(strp->sk, TCP_SKB_CB(skb)->seq + rxm->offset, *(u64*)tls_ctx->rx.rec_seq); @@ -2100,10 +2163,19 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) goto free_priv; } - cctx->aad_size = TLS_AAD_SPACE_SIZE; + if (crypto_info->version == TLS_1_3_VERSION) { + nonce_size = 0; + cctx->aad_size = TLS_HEADER_SIZE; + cctx->tail_size = 1; + } else { + cctx->aad_size = TLS_AAD_SPACE_SIZE; + cctx->tail_size = 0; + } + cctx->prepend_size = TLS_HEADER_SIZE + nonce_size; cctx->tag_size = tag_size; - cctx->overhead_size = cctx->prepend_size + cctx->tag_size; + cctx->overhead_size = cctx->prepend_size + cctx->tag_size + + cctx->tail_size; cctx->iv_size = iv_size; cctx->iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, GFP_KERNEL); -- cgit v1.2.3 From f9cf22882c606f3ffe06f620bb6d03b9eff18d3d Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 31 Jan 2019 10:50:40 -0800 Subject: devlink: add device information API ethtool -i has served us well for a long time, but its showing its limitations more and more. The device information should also be reported per device not per-netdev. Lay foundation for a simple devlink-based way of reading device info. Add driver name and device serial number as initial pieces of information exposed via this new API. v3: - rename helpers (Jiri); - rename driver name attr (Jiri); - remove double spacing in commit message (Jiri). RFC v2: - wrap the skb into an opaque structure (Jiri); - allow the serial number of be any length (Jiri & Andrew); - add driver name (Jonathan). Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 18 +++++++ include/uapi/linux/devlink.h | 5 ++ net/core/devlink.c | 112 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 85c9eabaf056..a6d0a530483d 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -429,6 +429,7 @@ enum devlink_param_wol_types { } struct devlink_region; +struct devlink_info_req; typedef void devlink_snapshot_data_dest_t(const void *data); @@ -484,6 +485,8 @@ struct devlink_ops { int (*eswitch_encap_mode_get)(struct devlink *devlink, u8 *p_encap_mode); int (*eswitch_encap_mode_set)(struct devlink *devlink, u8 encap_mode, struct netlink_ext_ack *extack); + int (*info_get)(struct devlink *devlink, struct devlink_info_req *req, + struct netlink_ext_ack *extack); }; static inline void *devlink_priv(struct devlink *devlink) @@ -607,6 +610,10 @@ u32 devlink_region_shapshot_id_get(struct devlink *devlink); int devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, u8 *data, u32 snapshot_id, devlink_snapshot_data_dest_t *data_destructor); +int devlink_info_serial_number_put(struct devlink_info_req *req, + const char *sn); +int devlink_info_driver_name_put(struct devlink_info_req *req, + const char *name); #else @@ -905,6 +912,17 @@ devlink_region_snapshot_create(struct devlink_region *region, u64 data_len, return 0; } +static inline int +devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) +{ + return 0; +} + +static inline int +devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) +{ + return 0; +} #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 61b4447a6c5b..142710d45093 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -94,6 +94,8 @@ enum devlink_command { DEVLINK_CMD_PORT_PARAM_NEW, DEVLINK_CMD_PORT_PARAM_DEL, + DEVLINK_CMD_INFO_GET, /* can dump */ + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -290,6 +292,9 @@ enum devlink_attr { DEVLINK_ATTR_REGION_CHUNK_ADDR, /* u64 */ DEVLINK_ATTR_REGION_CHUNK_LEN, /* u64 */ + DEVLINK_ATTR_INFO_DRIVER_NAME, /* string */ + DEVLINK_ATTR_INFO_SERIAL_NUMBER, /* string */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index e6f170caf449..f456f6aa3d40 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3714,6 +3714,110 @@ out: return 0; } +struct devlink_info_req { + struct sk_buff *msg; +}; + +int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) +{ + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name); +} +EXPORT_SYMBOL_GPL(devlink_info_driver_name_put); + +int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) +{ + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_SERIAL_NUMBER, sn); +} +EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); + +static int +devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, + enum devlink_command cmd, u32 portid, + u32 seq, int flags, struct netlink_ext_ack *extack) +{ + struct devlink_info_req req; + void *hdr; + int err; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + err = -EMSGSIZE; + if (devlink_nl_put_handle(msg, devlink)) + goto err_cancel_msg; + + req.msg = msg; + err = devlink->ops->info_get(devlink, &req, extack); + if (err) + goto err_cancel_msg; + + genlmsg_end(msg, hdr); + return 0; + +err_cancel_msg: + genlmsg_cancel(msg, hdr); + return err; +} + +static int devlink_nl_cmd_info_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct sk_buff *msg; + int err; + + if (!devlink->ops || !devlink->ops->info_get) + return -EOPNOTSUPP; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, + info->snd_portid, info->snd_seq, 0, + info->extack); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + if (idx < start) { + idx++; + continue; + } + + mutex_lock(&devlink->lock); + err = devlink_nl_info_fill(msg, devlink, DEVLINK_CMD_INFO_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, + cb->extack); + mutex_unlock(&devlink->lock); + if (err) + break; + idx++; + } + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -3974,6 +4078,14 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_INFO_GET, + .doit = devlink_nl_cmd_info_get_doit, + .dumpit = devlink_nl_cmd_info_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From fc6fae7dd987dccce3f322c32dc26b52d69ad00e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 31 Jan 2019 10:50:41 -0800 Subject: devlink: add version reporting to devlink info API ethtool -i has a few fixed-size fields which can be used to report firmware version and expansion ROM version. Unfortunately, modern hardware has more firmware components. There is usually some datapath microcode, management controller, PXE drivers, and a CPLD load. Running ethtool -i on modern controllers reveals the fact that vendors cram multiple values into firmware version field. Here are some examples from systems I could lay my hands on quickly: tg3: "FFV20.2.17 bc 5720-v1.39" i40e: "6.01 0x800034a4 1.1747.0" nfp: "0.0.3.5 0.25 sriov-2.1.16 nic" Add a new devlink API to allow retrieving multiple versions, and provide user-readable name for those versions. While at it break down the versions into three categories: - fixed - this is the board/fixed component version, usually vendors report information like the board version in the PCI VPD, but it will benefit from naming and common API as well; - running - this is the running firmware version; - stored - this is firmware in the flash, after firmware update this value will reflect the flashed version, while the running version may only be updated after reboot. v3: - add per-type helpers instead of using the special argument (Jiri). RFCv2: - remove the nesting in attr DEVLINK_ATTR_INFO_VERSIONS (now versions are mixed with other info attrs)l - have the driver report versions from the same callback as other info. Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 33 +++++++++++++++++++++++++ include/uapi/linux/devlink.h | 5 ++++ net/core/devlink.c | 57 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 95 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index a6d0a530483d..6dc0ef964392 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -614,6 +614,15 @@ int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn); int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name); +int devlink_info_version_fixed_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value); +int devlink_info_version_stored_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value); +int devlink_info_version_running_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value); #else @@ -923,6 +932,30 @@ devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) { return 0; } + +static inline int +devlink_info_version_fixed_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return 0; +} + +static inline int +devlink_info_version_stored_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return 0; +} + +static inline int +devlink_info_version_running_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return 0; +} #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 142710d45093..7fffd879c328 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -294,6 +294,11 @@ enum devlink_attr { DEVLINK_ATTR_INFO_DRIVER_NAME, /* string */ DEVLINK_ATTR_INFO_SERIAL_NUMBER, /* string */ + DEVLINK_ATTR_INFO_VERSION_FIXED, /* nested */ + DEVLINK_ATTR_INFO_VERSION_RUNNING, /* nested */ + DEVLINK_ATTR_INFO_VERSION_STORED, /* nested */ + DEVLINK_ATTR_INFO_VERSION_NAME, /* string */ + DEVLINK_ATTR_INFO_VERSION_VALUE, /* string */ /* add new attributes above here, update the policy in devlink.c */ diff --git a/net/core/devlink.c b/net/core/devlink.c index f456f6aa3d40..e31b6d617837 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3730,6 +3730,63 @@ int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) } EXPORT_SYMBOL_GPL(devlink_info_serial_number_put); +static int devlink_info_version_put(struct devlink_info_req *req, int attr, + const char *version_name, + const char *version_value) +{ + struct nlattr *nest; + int err; + + nest = nla_nest_start(req->msg, attr); + if (!nest) + return -EMSGSIZE; + + err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_NAME, + version_name); + if (err) + goto nla_put_failure; + + err = nla_put_string(req->msg, DEVLINK_ATTR_INFO_VERSION_VALUE, + version_value); + if (err) + goto nla_put_failure; + + nla_nest_end(req->msg, nest); + + return 0; + +nla_put_failure: + nla_nest_cancel(req->msg, nest); + return err; +} + +int devlink_info_version_fixed_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_FIXED, + version_name, version_value); +} +EXPORT_SYMBOL_GPL(devlink_info_version_fixed_put); + +int devlink_info_version_stored_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_STORED, + version_name, version_value); +} +EXPORT_SYMBOL_GPL(devlink_info_version_stored_put); + +int devlink_info_version_running_put(struct devlink_info_req *req, + const char *version_name, + const char *version_value) +{ + return devlink_info_version_put(req, DEVLINK_ATTR_INFO_VERSION_RUNNING, + version_name, version_value); +} +EXPORT_SYMBOL_GPL(devlink_info_version_running_put); + static int devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, -- cgit v1.2.3 From bcb3fc3247e5a7ceb467ca0cfdaa4c1b830dd8f9 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 2 Feb 2019 07:34:47 -0800 Subject: arch: sparc: Override struct __kernel_old_timeval struct __kernel_old_timeval is supposed to have the same layout as struct timeval. But, it was inadvarently missed that __kernel_suseconds has a different definition for sparc64. Provide an asm-specific override that fixes it. Reported-by: Arnd Bergmann Suggested-by: Arnd Bergmann Signed-off-by: Deepa Dinamani Acked-by: Willem de Bruijn Cc: sparclinux@vger.kernel.org Signed-off-by: David S. Miller --- arch/sparc/include/uapi/asm/posix_types.h | 10 ++++++++++ include/uapi/linux/time.h | 2 ++ 2 files changed, 12 insertions(+) (limited to 'include/uapi/linux') diff --git a/arch/sparc/include/uapi/asm/posix_types.h b/arch/sparc/include/uapi/asm/posix_types.h index fec499d6efb7..f139e0048628 100644 --- a/arch/sparc/include/uapi/asm/posix_types.h +++ b/arch/sparc/include/uapi/asm/posix_types.h @@ -19,6 +19,16 @@ typedef unsigned short __kernel_old_gid_t; typedef int __kernel_suseconds_t; #define __kernel_suseconds_t __kernel_suseconds_t +typedef long __kernel_long_t; +typedef unsigned long __kernel_ulong_t; +#define __kernel_long_t __kernel_long_t + +struct __kernel_old_timeval { + __kernel_long_t tv_sec; + __kernel_suseconds_t tv_usec; +}; +#define __kernel_old_timeval __kernel_old_timeval + #else /* sparc 32 bit */ diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 6b56a2208be7..04d5587f30d3 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -63,10 +63,12 @@ struct __kernel_itimerspec { * here, this is probably because it is not y2038 safe and needs to * be changed to use another interface. */ +#ifndef __kernel_old_timeval struct __kernel_old_timeval { __kernel_long_t tv_sec; __kernel_long_t tv_usec; }; +#endif /* * The IDs of the various system clocks (for POSIX.1b interval timers): -- cgit v1.2.3 From 98bb03c865d7ddf7a9a2eb80f9a0dd5f261c56ad Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 2 Feb 2019 07:34:49 -0800 Subject: socket: Add struct __kernel_sock_timeval The new type is meant to be used as a y2038 safe structure to be used as part of cmsg data. Presently the SO_TIMESTAMP socket option uses struct timeval for timestamps. This is not y2038 safe. Subsequent patches in the series add new y2038 safe socket option to be used in the place of SO_TIMESTAMP_OLD. struct __kernel_sock_timeval will be used as the timestamp format at that time. struct __kernel_sock_timeval also maintains the same layout across 32 bit and 64 bit ABIs. Signed-off-by: Deepa Dinamani Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/time.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 04d5587f30d3..b8ad1b86b942 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -70,6 +70,11 @@ struct __kernel_old_timeval { }; #endif +struct __kernel_sock_timeval { + __s64 tv_sec; + __s64 tv_usec; +}; + /* * The IDs of the various system clocks (for POSIX.1b interval timers): */ -- cgit v1.2.3 From 9718475e69084de15c3930ce35672a7dc6da866b Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Sat, 2 Feb 2019 07:34:51 -0800 Subject: socket: Add SO_TIMESTAMPING_NEW Add SO_TIMESTAMPING_NEW variant of socket timestamp options. This is the y2038 safe versions of the SO_TIMESTAMPING_OLD for all architectures. Signed-off-by: Deepa Dinamani Acked-by: Willem de Bruijn Cc: chris@zankel.net Cc: fenghua.yu@intel.com Cc: rth@twiddle.net Cc: tglx@linutronix.de Cc: ubraun@linux.ibm.com Cc: linux-alpha@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: linux-s390@vger.kernel.org Cc: linux-xtensa@linux-xtensa.org Cc: sparclinux@vger.kernel.org Signed-off-by: David S. Miller --- arch/alpha/include/uapi/asm/socket.h | 5 +++-- arch/mips/include/uapi/asm/socket.h | 5 +++-- arch/parisc/include/uapi/asm/socket.h | 5 +++-- arch/sparc/include/uapi/asm/socket.h | 5 +++-- include/linux/socket.h | 8 ++++++++ include/uapi/asm-generic/socket.h | 5 +++-- include/uapi/linux/errqueue.h | 4 ++++ net/core/scm.c | 27 +++++++++++++++++++++++++++ net/core/sock.c | 8 +++++++- net/ipv4/tcp.c | 30 +++++++++++++++++------------- net/smc/af_smc.c | 3 ++- net/socket.c | 13 ++++++++----- 12 files changed, 88 insertions(+), 30 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index aab11eec7c22..934ea6268f1a 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -117,19 +117,20 @@ #define SO_TIMESTAMP_NEW 63 #define SO_TIMESTAMPNS_NEW 64 +#define SO_TIMESTAMPING_NEW 65 #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 #define SO_TIMESTAMP SO_TIMESTAMP_OLD #define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#define SO_TIMESTAMPING SO_TIMESTAMPING_OLD #else #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW) #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW) +#define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW) #endif -#define SO_TIMESTAMPING SO_TIMESTAMPING_OLD - #define SCM_TIMESTAMP SO_TIMESTAMP #define SCM_TIMESTAMPNS SO_TIMESTAMPNS #define SCM_TIMESTAMPING SO_TIMESTAMPING diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 11014f684d9f..110f9506d64f 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -128,19 +128,20 @@ #define SO_TIMESTAMP_NEW 63 #define SO_TIMESTAMPNS_NEW 64 +#define SO_TIMESTAMPING_NEW 65 #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 #define SO_TIMESTAMP SO_TIMESTAMP_OLD #define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#define SO_TIMESTAMPING SO_TIMESTAMPING_OLD #else #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW) #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW) +#define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW) #endif -#define SO_TIMESTAMPING SO_TIMESTAMPING_OLD - #define SCM_TIMESTAMP SO_TIMESTAMP #define SCM_TIMESTAMPNS SO_TIMESTAMPNS #define SCM_TIMESTAMPING SO_TIMESTAMPING diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index cbc4b89c2fe4..bee2a9dde656 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -109,19 +109,20 @@ #define SO_TIMESTAMP_NEW 0x4038 #define SO_TIMESTAMPNS_NEW 0x4039 +#define SO_TIMESTAMPING_NEW 0x403A #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 #define SO_TIMESTAMP SO_TIMESTAMP_OLD #define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#define SO_TIMESTAMPING SO_TIMESTAMPING_OLD #else #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW) #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW) +#define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW) #endif -#define SO_TIMESTAMPING SO_TIMESTAMPING_OLD - #define SCM_TIMESTAMP SO_TIMESTAMP #define SCM_TIMESTAMPNS SO_TIMESTAMPNS #define SCM_TIMESTAMPING SO_TIMESTAMPING diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 85127425b294..2b38dda51426 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -110,19 +110,20 @@ #define SO_TIMESTAMP_NEW 0x0041 #define SO_TIMESTAMPNS_NEW 0x0042 +#define SO_TIMESTAMPING_NEW 0x0043 #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 #define SO_TIMESTAMP SO_TIMESTAMP_OLD #define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#define SO_TIMESTAMPING SO_TIMESTAMPING_OLD #else #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW) #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW) +#define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW) #endif -#define SO_TIMESTAMPING SO_TIMESTAMPING_OLD - #define SCM_TIMESTAMP SO_TIMESTAMP #define SCM_TIMESTAMPNS SO_TIMESTAMPNS #define SCM_TIMESTAMPING SO_TIMESTAMPING diff --git a/include/linux/socket.h b/include/linux/socket.h index ab2041a00e01..6016daeecee4 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -349,9 +349,17 @@ struct ucred { extern int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *kaddr); extern int put_cmsg(struct msghdr*, int level, int type, int len, void *data); +struct timespec64; struct __kernel_timespec; struct old_timespec32; +struct scm_timestamping_internal { + struct timespec64 ts[3]; +}; + +extern void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss); +extern void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss); + /* The __sys_...msg variants allow MSG_CMSG_COMPAT iff * forbid_cmsg_compat==false */ diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index f22d3f7162f8..2713e0fa68ef 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -112,6 +112,7 @@ #define SO_TIMESTAMP_NEW 63 #define SO_TIMESTAMPNS_NEW 64 +#define SO_TIMESTAMPING_NEW 65 #if !defined(__KERNEL__) @@ -119,13 +120,13 @@ /* on 64-bit and x32, avoid the ?: operator */ #define SO_TIMESTAMP SO_TIMESTAMP_OLD #define SO_TIMESTAMPNS SO_TIMESTAMPNS_OLD +#define SO_TIMESTAMPING SO_TIMESTAMPING_OLD #else #define SO_TIMESTAMP (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMP_OLD : SO_TIMESTAMP_NEW) #define SO_TIMESTAMPNS (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPNS_OLD : SO_TIMESTAMPNS_NEW) +#define SO_TIMESTAMPING (sizeof(time_t) == sizeof(__kernel_long_t) ? SO_TIMESTAMPING_OLD : SO_TIMESTAMPING_NEW) #endif -#define SO_TIMESTAMPING SO_TIMESTAMPING_OLD - #define SCM_TIMESTAMP SO_TIMESTAMP #define SCM_TIMESTAMPNS SO_TIMESTAMPNS #define SCM_TIMESTAMPING SO_TIMESTAMPING diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index c0151200f7d1..d955b9e32288 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -41,6 +41,10 @@ struct scm_timestamping { struct timespec ts[3]; }; +struct scm_timestamping64 { + struct __kernel_timespec ts[3]; +}; + /* The type of scm_timestamping, passed in sock_extended_err ee_info. * This defines the type of ts[0]. For SCM_TSTAMP_SND only, if ts[0] * is zero, then this is a hardware timestamp and recorded in ts[2]. diff --git a/net/core/scm.c b/net/core/scm.c index b1ff8a441748..52ef219cf6df 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -252,6 +253,32 @@ out: } EXPORT_SYMBOL(put_cmsg); +void put_cmsg_scm_timestamping64(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) +{ + struct scm_timestamping64 tss; + int i; + + for (i = 0; i < ARRAY_SIZE(tss.ts); i++) { + tss.ts[i].tv_sec = tss_internal->ts[i].tv_sec; + tss.ts[i].tv_nsec = tss_internal->ts[i].tv_nsec; + } + + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_NEW, sizeof(tss), &tss); +} +EXPORT_SYMBOL(put_cmsg_scm_timestamping64); + +void put_cmsg_scm_timestamping(struct msghdr *msg, struct scm_timestamping_internal *tss_internal) +{ + struct scm_timestamping tss; + int i; + + for (i = 0; i < ARRAY_SIZE(tss.ts); i++) + tss.ts[i] = timespec64_to_timespec(tss_internal->ts[i]); + + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, sizeof(tss), &tss); +} +EXPORT_SYMBOL(put_cmsg_scm_timestamping); + void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) { struct cmsghdr __user *cm diff --git a/net/core/sock.c b/net/core/sock.c index 14b987eab10c..a9d1ecce96e5 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -890,6 +890,8 @@ set_rcvbuf: } break; + case SO_TIMESTAMPING_NEW: + sock_set_flag(sk, SOCK_TSTAMP_NEW); case SO_TIMESTAMPING_OLD: if (val & ~SOF_TIMESTAMPING_MASK) { ret = -EINVAL; @@ -921,9 +923,13 @@ set_rcvbuf: if (val & SOF_TIMESTAMPING_RX_SOFTWARE) sock_enable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE); - else + else { + if (optname == SO_TIMESTAMPING_NEW) + sock_reset_flag(sk, SOCK_TSTAMP_NEW); + sock_disable_timestamp(sk, (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); + } break; case SO_RCVLOWAT: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4e9388bf104a..cab6b2f2f61d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1844,22 +1844,22 @@ out: #endif static void tcp_update_recv_tstamps(struct sk_buff *skb, - struct scm_timestamping *tss) + struct scm_timestamping_internal *tss) { if (skb->tstamp) - tss->ts[0] = ktime_to_timespec(skb->tstamp); + tss->ts[0] = ktime_to_timespec64(skb->tstamp); else - tss->ts[0] = (struct timespec) {0}; + tss->ts[0] = (struct timespec64) {0}; if (skb_hwtstamps(skb)->hwtstamp) - tss->ts[2] = ktime_to_timespec(skb_hwtstamps(skb)->hwtstamp); + tss->ts[2] = ktime_to_timespec64(skb_hwtstamps(skb)->hwtstamp); else - tss->ts[2] = (struct timespec) {0}; + tss->ts[2] = (struct timespec64) {0}; } /* Similar to __sock_recv_timestamp, but does not require an skb */ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, - struct scm_timestamping *tss) + struct scm_timestamping_internal *tss) { int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); bool has_timestamping = false; @@ -1873,8 +1873,10 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_NEW, sizeof(kts), &kts); } else { + struct timespec ts_old = timespec64_to_timespec(tss->ts[0]); + put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPNS_OLD, - sizeof(tss->ts[0]), &tss->ts[0]); + sizeof(ts_old), &ts_old); } } else { if (new_tstamp) { @@ -1898,20 +1900,22 @@ static void tcp_recv_timestamp(struct msghdr *msg, const struct sock *sk, if (sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) has_timestamping = true; else - tss->ts[0] = (struct timespec) {0}; + tss->ts[0] = (struct timespec64) {0}; } if (tss->ts[2].tv_sec || tss->ts[2].tv_nsec) { if (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) has_timestamping = true; else - tss->ts[2] = (struct timespec) {0}; + tss->ts[2] = (struct timespec64) {0}; } if (has_timestamping) { - tss->ts[1] = (struct timespec) {0}; - put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMPING_OLD, - sizeof(*tss), tss); + tss->ts[1] = (struct timespec64) {0}; + if (sock_flag(sk, SOCK_TSTAMP_NEW)) + put_cmsg_scm_timestamping64(msg, tss); + else + put_cmsg_scm_timestamping(msg, tss); } } @@ -1952,7 +1956,7 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, long timeo; struct sk_buff *skb, *last; u32 urg_hole = 0; - struct scm_timestamping tss; + struct scm_timestamping_internal tss; bool has_tss = false; bool has_cmsg; diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index c4e56602e0c6..369870b0ef79 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -291,7 +291,8 @@ static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk, (1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_WIFI_STATUS) | \ (1UL << SOCK_NOFCS) | \ - (1UL << SOCK_FILTER_LOCKED)) + (1UL << SOCK_FILTER_LOCKED) | \ + (1UL << SOCK_TSTAMP_NEW)) /* copy only relevant settings and flags of SOL_SOCKET level from smc to * clc socket (since smc is not called for these options from net/core) */ diff --git a/net/socket.c b/net/socket.c index 1de96abd78d3..d51930689b98 100644 --- a/net/socket.c +++ b/net/socket.c @@ -706,7 +706,8 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, { int need_software_tstamp = sock_flag(sk, SOCK_RCVTSTAMP); int new_tstamp = sock_flag(sk, SOCK_TSTAMP_NEW); - struct scm_timestamping tss; + struct scm_timestamping_internal tss; + int empty = 1, false_tstamp = 0; struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb); @@ -752,20 +753,22 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, memset(&tss, 0, sizeof(tss)); if ((sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) && - ktime_to_timespec_cond(skb->tstamp, tss.ts + 0)) + ktime_to_timespec64_cond(skb->tstamp, tss.ts + 0)) empty = 0; if (shhwtstamps && (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE) && !skb_is_swtx_tstamp(skb, false_tstamp) && - ktime_to_timespec_cond(shhwtstamps->hwtstamp, tss.ts + 2)) { + ktime_to_timespec64_cond(shhwtstamps->hwtstamp, tss.ts + 2)) { empty = 0; if ((sk->sk_tsflags & SOF_TIMESTAMPING_OPT_PKTINFO) && !skb_is_err_queue(skb)) put_ts_pktinfo(msg, skb); } if (!empty) { - put_cmsg(msg, SOL_SOCKET, - SO_TIMESTAMPING_OLD, sizeof(tss), &tss); + if (sock_flag(sk, SOCK_TSTAMP_NEW)) + put_cmsg_scm_timestamping64(msg, &tss); + else + put_cmsg_scm_timestamping(msg, &tss); if (skb_is_err_queue(skb) && skb->len && SKB_EXT_ERR(skb)->opt_stats) -- cgit v1.2.3 From bff5731d43efbdf0bbd2d73cab32fe6435ea1046 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 1 Feb 2019 17:56:28 -0800 Subject: net: devlink: report cell size of shared buffers Shared buffer allocation is usually done in cell increments. Drivers will either round up the allocation or refuse the configuration if it's not an exact multiple of cell size. Drivers know exactly the cell size of shared buffer, so help out users by providing this information in dumps. Signed-off-by: Jakub Kicinski Reviewed-by: Dirk van der Merwe Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c | 1 + drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c | 1 + include/net/devlink.h | 1 + include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 3 +++ 5 files changed, 8 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c index 12c61e0cc570..80066f437a65 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_buffers.c @@ -713,6 +713,7 @@ int mlxsw_sp_sb_pool_get(struct mlxsw_core *mlxsw_core, pool_info->pool_type = (enum devlink_sb_pool_type) dir; pool_info->size = mlxsw_sp_cells_bytes(mlxsw_sp, pr->size); pool_info->threshold_type = (enum devlink_sb_threshold_type) pr->mode; + pool_info->cell_size = mlxsw_sp->sb->cell_size; return 0; } diff --git a/drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c b/drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c index 814360ed3a20..ea2e3f829aba 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_shared_buf.c @@ -48,6 +48,7 @@ int nfp_shared_buf_pool_get(struct nfp_pf *pf, unsigned int sb, u16 pool_index, pool_info->pool_type = le32_to_cpu(get_data.pool_type); pool_info->threshold_type = le32_to_cpu(get_data.threshold_type); pool_info->size = le32_to_cpu(get_data.size) * unit_size; + pool_info->cell_size = unit_size; return 0; } diff --git a/include/net/devlink.h b/include/net/devlink.h index 1c8523920f66..74d992a68a06 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -62,6 +62,7 @@ struct devlink_sb_pool_info { enum devlink_sb_pool_type pool_type; u32 size; enum devlink_sb_threshold_type threshold_type; + u32 cell_size; }; /** diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 7fffd879c328..054b2d1a4537 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -300,6 +300,8 @@ enum devlink_attr { DEVLINK_ATTR_INFO_VERSION_NAME, /* string */ DEVLINK_ATTR_INFO_VERSION_VALUE, /* string */ + DEVLINK_ATTR_SB_POOL_CELL_SIZE, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index eb839d74bcc0..52bf27491fb8 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -932,6 +932,9 @@ static int devlink_nl_sb_pool_fill(struct sk_buff *msg, struct devlink *devlink, if (nla_put_u8(msg, DEVLINK_ATTR_SB_POOL_THRESHOLD_TYPE, pool_info.threshold_type)) goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_SB_POOL_CELL_SIZE, + pool_info.cell_size)) + goto nla_put_failure; genlmsg_end(msg, hdr); return 0; -- cgit v1.2.3 From a46c52d9f2659498f0c0871f7f2333a692c243fe Mon Sep 17 00:00:00 2001 From: wenxu Date: Tue, 29 Jan 2019 15:51:17 +0800 Subject: netfilter: nft_tunnel: Add NFTA_TUNNEL_MODE options nft "tunnel" expr match both the tun_info of RX and TX. This patch provide the NFTA_TUNNEL_MODE to individually match the tun_info of RX or TX. Signed-off-by: wenxu Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 9 +++++++++ net/netfilter/nft_tunnel.c | 34 ++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 030302893d96..a66c8de006cc 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1727,10 +1727,19 @@ enum nft_tunnel_keys { }; #define NFT_TUNNEL_MAX (__NFT_TUNNEL_MAX - 1) +enum nft_tunnel_mode { + NFT_TUNNEL_MODE_NONE, + NFT_TUNNEL_MODE_RX, + NFT_TUNNEL_MODE_TX, + __NFT_TUNNEL_MODE_MAX +}; +#define NFT_TUNNEL_MODE_MAX (__NFT_TUNNEL_MODE_MAX - 1) + enum nft_tunnel_attributes { NFTA_TUNNEL_UNSPEC, NFTA_TUNNEL_KEY, NFTA_TUNNEL_DREG, + NFTA_TUNNEL_MODE, __NFTA_TUNNEL_MAX }; #define NFTA_TUNNEL_MAX (__NFTA_TUNNEL_MAX - 1) diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 3a15f219e4e7..ea28588c5eed 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -15,6 +15,7 @@ struct nft_tunnel { enum nft_tunnel_keys key:8; enum nft_registers dreg:8; + enum nft_tunnel_mode mode:8; }; static void nft_tunnel_get_eval(const struct nft_expr *expr, @@ -29,14 +30,32 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr, switch (priv->key) { case NFT_TUNNEL_PATH: - nft_reg_store8(dest, !!tun_info); + if (!tun_info) { + nft_reg_store8(dest, false); + return; + } + if (priv->mode == NFT_TUNNEL_MODE_NONE || + (priv->mode == NFT_TUNNEL_MODE_RX && + !(tun_info->mode & IP_TUNNEL_INFO_TX)) || + (priv->mode == NFT_TUNNEL_MODE_TX && + (tun_info->mode & IP_TUNNEL_INFO_TX))) + nft_reg_store8(dest, true); + else + nft_reg_store8(dest, false); break; case NFT_TUNNEL_ID: if (!tun_info) { regs->verdict.code = NFT_BREAK; return; } - *dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id)); + if (priv->mode == NFT_TUNNEL_MODE_NONE || + (priv->mode == NFT_TUNNEL_MODE_RX && + !(tun_info->mode & IP_TUNNEL_INFO_TX)) || + (priv->mode == NFT_TUNNEL_MODE_TX && + (tun_info->mode & IP_TUNNEL_INFO_TX))) + *dest = ntohl(tunnel_id_to_key32(tun_info->key.tun_id)); + else + regs->verdict.code = NFT_BREAK; break; default: WARN_ON(1); @@ -47,6 +66,7 @@ static void nft_tunnel_get_eval(const struct nft_expr *expr, static const struct nla_policy nft_tunnel_policy[NFTA_TUNNEL_MAX + 1] = { [NFTA_TUNNEL_KEY] = { .type = NLA_U32 }, [NFTA_TUNNEL_DREG] = { .type = NLA_U32 }, + [NFTA_TUNNEL_MODE] = { .type = NLA_U32 }, }; static int nft_tunnel_get_init(const struct nft_ctx *ctx, @@ -74,6 +94,14 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, priv->dreg = nft_parse_register(tb[NFTA_TUNNEL_DREG]); + if (tb[NFTA_TUNNEL_MODE]) { + priv->mode = ntohl(nla_get_be32(tb[NFTA_TUNNEL_MODE])); + if (priv->mode > NFT_TUNNEL_MODE_MAX) + return -EOPNOTSUPP; + } else { + priv->mode = NFT_TUNNEL_MODE_NONE; + } + return nft_validate_register_store(ctx, priv->dreg, NULL, NFT_DATA_VALUE, len); } @@ -87,6 +115,8 @@ static int nft_tunnel_get_dump(struct sk_buff *skb, goto nla_put_failure; if (nft_dump_register(skb, NFTA_TUNNEL_DREG, priv->dreg)) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_TUNNEL_MODE, htonl(priv->mode))) + goto nla_put_failure; return 0; nla_put_failure: -- cgit v1.2.3 From 3eb450367d0823226515ee24712ed08eccb33eb9 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Tue, 23 Oct 2018 23:21:14 -0400 Subject: rds: add type of service(tos) infrastructure RDS Service type (TOS) is user-defined and needs to be configured via RDS IOCTL interface. It must be set before initiating any traffic and once set the TOS can not be changed. All out-going traffic from the socket will be associated with its TOS. Reviewed-by: Sowmini Varadhan Signed-off-by: Santosh Shilimkar [yanjun.zhu@oracle.com: Adapted original patch with ipv6 changes] Signed-off-by: Zhu Yanjun --- include/uapi/linux/rds.h | 11 +++++++++++ net/rds/af_rds.c | 35 ++++++++++++++++++++++++++++++++++- net/rds/connection.c | 20 +++++++++++--------- net/rds/ib.c | 1 + net/rds/ib_cm.c | 2 +- net/rds/rdma_transport.c | 1 + net/rds/rds.h | 9 +++++++-- net/rds/recv.c | 1 + net/rds/send.c | 6 +++--- net/rds/tcp.c | 1 + net/rds/tcp_listen.c | 2 +- 11 files changed, 72 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h index 8b73cb603c5f..5d0f76c780e5 100644 --- a/include/uapi/linux/rds.h +++ b/include/uapi/linux/rds.h @@ -69,6 +69,12 @@ #define RDS_TRANS_COUNT 3 #define RDS_TRANS_NONE (~0) +/* IOCTLS commands for SOL_RDS */ +#define SIOCRDSSETTOS (SIOCPROTOPRIVATE) +#define SIOCRDSGETTOS (SIOCPROTOPRIVATE + 1) + +typedef __u8 rds_tos_t; + /* * Control message types for SOL_RDS. * @@ -149,6 +155,7 @@ struct rds_info_connection { __be32 faddr; __u8 transport[TRANSNAMSIZ]; /* null term ascii */ __u8 flags; + __u8 tos; } __attribute__((packed)); struct rds6_info_connection { @@ -171,6 +178,7 @@ struct rds_info_message { __be16 lport; __be16 fport; __u8 flags; + __u8 tos; } __attribute__((packed)); struct rds6_info_message { @@ -214,6 +222,7 @@ struct rds_info_tcp_socket { __u32 last_sent_nxt; __u32 last_expected_una; __u32 last_seen_una; + __u8 tos; } __attribute__((packed)); struct rds6_info_tcp_socket { @@ -240,6 +249,7 @@ struct rds_info_rdma_connection { __u32 max_send_sge; __u32 rdma_mr_max; __u32 rdma_mr_size; + __u8 tos; }; struct rds6_info_rdma_connection { @@ -253,6 +263,7 @@ struct rds6_info_rdma_connection { __u32 max_send_sge; __u32 rdma_mr_max; __u32 rdma_mr_size; + __u8 tos; }; /* RDS message Receive Path Latency points */ diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 65571a6273c3..904515858037 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -254,7 +254,38 @@ static __poll_t rds_poll(struct file *file, struct socket *sock, static int rds_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - return -ENOIOCTLCMD; + struct rds_sock *rs = rds_sk_to_rs(sock->sk); + rds_tos_t tos; + + switch (cmd) { + case SIOCRDSSETTOS: + if (get_user(tos, (rds_tos_t __user *)arg)) + return -EFAULT; + + if (rs->rs_transport && + rs->rs_transport->t_type == RDS_TRANS_TCP) + tos = 0; + + spin_lock_bh(&rds_sock_lock); + if (rs->rs_tos || rs->rs_conn) { + spin_unlock_bh(&rds_sock_lock); + return -EINVAL; + } + rs->rs_tos = tos; + spin_unlock_bh(&rds_sock_lock); + break; + case SIOCRDSGETTOS: + spin_lock_bh(&rds_sock_lock); + tos = rs->rs_tos; + spin_unlock_bh(&rds_sock_lock); + if (put_user(tos, (rds_tos_t __user *)arg)) + return -EFAULT; + break; + default: + return -ENOIOCTLCMD; + } + + return 0; } static int rds_cancel_sent_to(struct rds_sock *rs, char __user *optval, @@ -650,6 +681,8 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; rs->rs_rx_traces = 0; + rs->rs_tos = 0; + rs->rs_conn = NULL; spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); diff --git a/net/rds/connection.c b/net/rds/connection.c index 1ab14b68ecc8..7ea134f9a825 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -84,7 +84,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, - int dev_if) + u8 tos, int dev_if) { struct rds_connection *conn, *ret = NULL; @@ -92,6 +92,7 @@ static struct rds_connection *rds_conn_lookup(struct net *net, if (ipv6_addr_equal(&conn->c_faddr, faddr) && ipv6_addr_equal(&conn->c_laddr, laddr) && conn->c_trans == trans && + conn->c_tos == tos && net == rds_conn_net(conn) && conn->c_dev_if == dev_if) { ret = conn; @@ -160,7 +161,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, - gfp_t gfp, + gfp_t gfp, u8 tos, int is_outgoing, int dev_if) { @@ -172,7 +173,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, int npaths = (trans->t_mp_capable ? RDS_MPATH_WORKERS : 1); rcu_read_lock(); - conn = rds_conn_lookup(net, head, laddr, faddr, trans, dev_if); + conn = rds_conn_lookup(net, head, laddr, faddr, trans, tos, dev_if); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && @@ -206,6 +207,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, conn->c_isv6 = !ipv6_addr_v4mapped(laddr); conn->c_faddr = *faddr; conn->c_dev_if = dev_if; + conn->c_tos = tos; #if IS_ENABLED(CONFIG_IPV6) /* If the local address is link local, set c_bound_if to be the @@ -298,7 +300,7 @@ static struct rds_connection *__rds_conn_create(struct net *net, struct rds_connection *found; found = rds_conn_lookup(net, head, laddr, faddr, trans, - dev_if); + tos, dev_if); if (found) { struct rds_conn_path *cp; int i; @@ -333,10 +335,10 @@ out: struct rds_connection *rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, - struct rds_transport *trans, gfp_t gfp, - int dev_if) + struct rds_transport *trans, u8 tos, + gfp_t gfp, int dev_if) { - return __rds_conn_create(net, laddr, faddr, trans, gfp, 0, dev_if); + return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 0, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create); @@ -344,9 +346,9 @@ struct rds_connection *rds_conn_create_outgoing(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, - gfp_t gfp, int dev_if) + u8 tos, gfp_t gfp, int dev_if) { - return __rds_conn_create(net, laddr, faddr, trans, gfp, 1, dev_if); + return __rds_conn_create(net, laddr, faddr, trans, gfp, tos, 1, dev_if); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); diff --git a/net/rds/ib.c b/net/rds/ib.c index 9d7b7586f240..21b6588b71ca 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -301,6 +301,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, iinfo->src_addr = conn->c_laddr.s6_addr32[3]; iinfo->dst_addr = conn->c_faddr.s6_addr32[3]; + iinfo->tos = conn->c_tos; memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid)); memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid)); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index a1c3ad380ec8..70518e329a9e 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -786,7 +786,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, /* RDS/IB is not currently netns aware, thus init_net */ conn = rds_conn_create(&init_net, daddr6, saddr6, - &rds_ib_transport, GFP_KERNEL, ifindex); + &rds_ib_transport, 0, GFP_KERNEL, ifindex); if (IS_ERR(conn)) { rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn)); conn = NULL; diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 63cbc6b8560c..e37f91537d29 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -115,6 +115,7 @@ static int rds_rdma_cm_event_handler_cmn(struct rdma_cm_id *cm_id, pr_warn("RDS/RDMA: conn <%pI6c, %pI6c> rejected, dropping connection\n", &conn->c_laddr, &conn->c_faddr); conn->c_proposed_version = RDS_PROTOCOL_COMPAT_VERSION; + conn->c_tos = 0; rds_conn_drop(conn); } rdsdebug("Connection rejected: %s\n", diff --git a/net/rds/rds.h b/net/rds/rds.h index 660023f08553..7e52b92092d7 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -158,6 +158,9 @@ struct rds_connection { unsigned int c_version; possible_net_t c_net; + /* TOS */ + u8 c_tos; + struct list_head c_map_item; unsigned long c_map_queued; @@ -652,6 +655,7 @@ struct rds_sock { u8 rs_rx_traces; u8 rs_rx_trace[RDS_MSG_RX_DGRAM_TRACE_MAX]; struct rds_msg_zcopy_queue rs_zcookie_queue; + u8 rs_tos; }; static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk) @@ -760,13 +764,14 @@ void rds_conn_exit(void); struct rds_connection *rds_conn_create(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, - struct rds_transport *trans, gfp_t gfp, + struct rds_transport *trans, + u8 tos, gfp_t gfp, int dev_if); struct rds_connection *rds_conn_create_outgoing(struct net *net, const struct in6_addr *laddr, const struct in6_addr *faddr, struct rds_transport *trans, - gfp_t gfp, int dev_if); + u8 tos, gfp_t gfp, int dev_if); void rds_conn_shutdown(struct rds_conn_path *cpath); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); diff --git a/net/rds/recv.c b/net/rds/recv.c index 6bb6b16ca270..853de4876088 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -782,6 +782,7 @@ void rds_inc_info_copy(struct rds_incoming *inc, minfo.seq = be64_to_cpu(inc->i_hdr.h_sequence); minfo.len = be32_to_cpu(inc->i_hdr.h_len); + minfo.tos = inc->i_conn->c_tos; if (flip) { minfo.laddr = daddr; diff --git a/net/rds/send.c b/net/rds/send.c index fd8b687d5c05..c555e121b908 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1277,12 +1277,12 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len) /* rds_conn_create has a spinlock that runs with IRQ off. * Caching the conn in the socket helps a lot. */ - if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) + if (rs->rs_conn && ipv6_addr_equal(&rs->rs_conn->c_faddr, &daddr)) { conn = rs->rs_conn; - else { + } else { conn = rds_conn_create_outgoing(sock_net(sock->sk), &rs->rs_bound_addr, &daddr, - rs->rs_transport, + rs->rs_transport, 0, sock->sk->sk_allocation, scope_id); if (IS_ERR(conn)) { diff --git a/net/rds/tcp.c b/net/rds/tcp.c index c16f0a362c32..eb6851952cbf 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -267,6 +267,7 @@ static void rds_tcp_tc_info(struct socket *rds_sock, unsigned int len, tsinfo.last_sent_nxt = tc->t_last_sent_nxt; tsinfo.last_expected_una = tc->t_last_expected_una; tsinfo.last_seen_una = tc->t_last_seen_una; + tsinfo.tos = tc->t_cpath->cp_conn->c_tos; rds_info_copy(iter, &tsinfo, sizeof(tsinfo)); } diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index c12203f646da..810a3a49e947 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -200,7 +200,7 @@ int rds_tcp_accept_one(struct socket *sock) conn = rds_conn_create(sock_net(sock->sk), my_addr, peer_addr, - &rds_tcp_transport, GFP_KERNEL, dev_if); + &rds_tcp_transport, 0, GFP_KERNEL, dev_if); if (IS_ERR(conn)) { ret = PTR_ERR(conn); -- cgit v1.2.3 From 2c620ff93d9fbd5d644760d4c21d389078ec1080 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Mon, 2 Jul 2018 22:44:20 -0700 Subject: time: Add struct __kernel_timex struct timex uses struct timeval internally. struct timeval is not y2038 safe. Introduce a new UAPI type struct __kernel_timex that is y2038 safe. struct __kernel_timex uses a timeval type that is similar to struct __kernel_timespec which preserves the same structure size across 32 bit and 64 bit ABIs. struct __kernel_timex also restructures other members of the structure to make the structure the same on 64 bit and 32 bit architectures. Note that struct __kernel_timex is the same as struct timex on a 64 bit architecture. The above solution is similar to other new y2038 syscalls that are being introduced: both 32 bit and 64 bit ABIs have a common entry, and the compat entry supports the old 32 bit syscall interface. Alternatives considered were: 1. Add new time type to struct timex that makes use of padded bits. This time type could be based on the struct __kernel_timespec. modes will use a flag to notify which time structure should be used internally. This needs some application level changes on both 64 bit and 32 bit architectures. Although 64 bit machines could continue to use the older timeval structure without any changes. 2. Add a new u8 type to struct timex that makes use of padded bits. This can be used to save higher order tv_sec bits. modes will use a flag to notify presence of such a type. This will need some application level changes on 32 bit architectures. 3. Add a new compat_timex structure that differs in only the size of the time type; keep rest of struct timex the same. This requires extra syscalls to manage all 3 cases on 64 bit architectures. This will not need any application level changes but will add more complexity from kernel side. Signed-off-by: Deepa Dinamani Signed-off-by: Arnd Bergmann --- include/linux/timex.h | 7 +++++++ include/uapi/linux/timex.h | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index 39c25dbebfe8..7f40e9e42ecc 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -53,6 +53,13 @@ #ifndef _LINUX_TIMEX_H #define _LINUX_TIMEX_H +/* CONFIG_64BIT_TIME enables new 64 bit time_t syscalls in the compat path + * and 32-bit emulation. + */ +#ifndef CONFIG_64BIT_TIME +#define __kernel_timex timex +#endif + #include #define ADJ_ADJTIME 0x8000 /* switch between adjtime/adjtimex modes */ diff --git a/include/uapi/linux/timex.h b/include/uapi/linux/timex.h index 92685d826444..a1c6b73016a5 100644 --- a/include/uapi/linux/timex.h +++ b/include/uapi/linux/timex.h @@ -92,6 +92,47 @@ struct timex { int :32; int :32; int :32; }; +struct __kernel_timex_timeval { + __kernel_time64_t tv_sec; + long long tv_usec; +}; + +#ifndef __kernel_timex +struct __kernel_timex { + unsigned int modes; /* mode selector */ + int :32; /* pad */ + long long offset; /* time offset (usec) */ + long long freq; /* frequency offset (scaled ppm) */ + long long maxerror;/* maximum error (usec) */ + long long esterror;/* estimated error (usec) */ + int status; /* clock command/status */ + int :32; /* pad */ + long long constant;/* pll time constant */ + long long precision;/* clock precision (usec) (read only) */ + long long tolerance;/* clock frequency tolerance (ppm) + * (read only) + */ + struct __kernel_timex_timeval time; /* (read only, except for ADJ_SETOFFSET) */ + long long tick; /* (modified) usecs between clock ticks */ + + long long ppsfreq;/* pps frequency (scaled ppm) (ro) */ + long long jitter; /* pps jitter (us) (ro) */ + int shift; /* interval duration (s) (shift) (ro) */ + int :32; /* pad */ + long long stabil; /* pps stability (scaled ppm) (ro) */ + long long jitcnt; /* jitter limit exceeded (ro) */ + long long calcnt; /* calibration intervals (ro) */ + long long errcnt; /* calibration errors (ro) */ + long long stbcnt; /* stability limit exceeded (ro) */ + + int tai; /* TAI offset (ro) */ + + int :32; int :32; int :32; int :32; + int :32; int :32; int :32; int :32; + int :32; int :32; int :32; +}; +#endif + /* * Mode codes (timex.mode) */ -- cgit v1.2.3 From c70a772fda11570ebddecbce1543a3fda008db4a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 7 Jan 2019 00:00:34 +0100 Subject: y2038: remove struct definition redirects We now use 64-bit time_t on all architectures, so the __kernel_timex, __kernel_timeval and __kernel_timespec redirects can be removed after having served their purpose. This makes it all much less confusing, as the __kernel_* types now always refer to the same layout based on 64-bit time_t across all 32-bit and 64-bit architectures. Signed-off-by: Arnd Bergmann --- include/linux/time64.h | 8 -------- include/linux/timex.h | 7 ------- include/uapi/linux/time.h | 4 ---- include/uapi/linux/timex.h | 2 -- 4 files changed, 21 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/time64.h b/include/linux/time64.h index 05634afba0db..f38d382ffec1 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -7,14 +7,6 @@ typedef __s64 time64_t; typedef __u64 timeu64_t; -/* CONFIG_64BIT_TIME enables new 64 bit time_t syscalls in the compat path - * and 32-bit emulation. - */ -#ifndef CONFIG_64BIT_TIME -#define __kernel_timespec timespec -#define __kernel_itimerspec itimerspec -#endif - #include struct timespec64 { diff --git a/include/linux/timex.h b/include/linux/timex.h index 4aff9f0d1367..ce0859763670 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -53,13 +53,6 @@ #ifndef _LINUX_TIMEX_H #define _LINUX_TIMEX_H -/* CONFIG_64BIT_TIME enables new 64 bit time_t syscalls in the compat path - * and 32-bit emulation. - */ -#ifndef CONFIG_64BIT_TIME -#define __kernel_timex timex -#endif - #include #define ADJ_ADJTIME 0x8000 /* switch between adjtime/adjtimex modes */ diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index 6b56a2208be7..b03f8717c312 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -42,19 +42,15 @@ struct itimerval { struct timeval it_value; /* current value */ }; -#ifndef __kernel_timespec struct __kernel_timespec { __kernel_time64_t tv_sec; /* seconds */ long long tv_nsec; /* nanoseconds */ }; -#endif -#ifndef __kernel_itimerspec struct __kernel_itimerspec { struct __kernel_timespec it_interval; /* timer period */ struct __kernel_timespec it_value; /* timer expiration */ }; -#endif /* * legacy timeval structure, only embedded in structures that diff --git a/include/uapi/linux/timex.h b/include/uapi/linux/timex.h index a1c6b73016a5..9f517f9010bb 100644 --- a/include/uapi/linux/timex.h +++ b/include/uapi/linux/timex.h @@ -97,7 +97,6 @@ struct __kernel_timex_timeval { long long tv_usec; }; -#ifndef __kernel_timex struct __kernel_timex { unsigned int modes; /* mode selector */ int :32; /* pad */ @@ -131,7 +130,6 @@ struct __kernel_timex { int :32; int :32; int :32; int :32; int :32; int :32; int :32; }; -#endif /* * Mode codes (timex.mode) -- cgit v1.2.3 From e9e0c8903009477b630e37a8b6364b26a00720da Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 10 Jan 2019 19:04:34 +0200 Subject: fanotify: encode file identifier for FAN_REPORT_FID When user requests the flag FAN_REPORT_FID in fanotify_init(), a unique file identifier of the event target object will be reported with the event. The file identifier includes the filesystem's fsid (i.e. from statfs(2)) and an NFS file handle of the file (i.e. from name_to_handle_at(2)). The file identifier makes holding the path reference and passing a file descriptor to user redundant, so those are disabled in a group with FAN_REPORT_FID. Encode fid and store it in event for a group with FAN_REPORT_FID. Up to 12 bytes of file handle on 32bit arch (16 bytes on 64bit arch) are stored inline in fanotify_event struct. Larger file handles are stored in an external allocated buffer. On failure to encode fid, we print a warning and queue the event without the fid information. [JK: Fold part of later patched into this one to use exportfs_encode_inode_fh() right away] Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 84 +++++++++++++++++++++++++++++++++++--- fs/notify/fanotify/fanotify.h | 78 +++++++++++++++++++++++++++++++++-- fs/notify/fanotify/fanotify_user.c | 6 +-- include/uapi/linux/fanotify.h | 1 + 4 files changed, 156 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index d8e3b6e50844..dd33227e518a 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "fanotify.h" @@ -25,10 +26,18 @@ static bool should_merge(struct fsnotify_event *old_fsn, old = FANOTIFY_E(old_fsn); new = FANOTIFY_E(new_fsn); - if (old_fsn->inode == new_fsn->inode && old->pid == new->pid && - old->path.mnt == new->path.mnt && - old->path.dentry == new->path.dentry) - return true; + if (old_fsn->inode != new_fsn->inode || old->pid != new->pid || + old->fh_type != new->fh_type || old->fh_len != new->fh_len) + return false; + + if (fanotify_event_has_path(old)) { + return old->path.mnt == new->path.mnt && + old->path.dentry == new->path.dentry; + } else if (fanotify_event_has_fid(old)) { + return fanotify_fid_equal(&old->fid, &new->fid, old->fh_len); + } + + /* Do not merge events if we failed to encode fid */ return false; } @@ -143,6 +152,60 @@ static u32 fanotify_group_event_mask(struct fsnotify_iter_info *iter_info, ~marks_ignored_mask; } +static int fanotify_encode_fid(struct fanotify_event *event, + const struct path *path, gfp_t gfp) +{ + struct fanotify_fid *fid = &event->fid; + int dwords, bytes = 0; + struct kstatfs stat; + int err, type; + + stat.f_fsid.val[0] = stat.f_fsid.val[1] = 0; + fid->ext_fh = NULL; + dwords = 0; + err = -ENOENT; + type = exportfs_encode_inode_fh(d_inode(path->dentry), NULL, &dwords, + NULL); + if (!dwords) + goto out_err; + + err = vfs_statfs(path, &stat); + if (err) + goto out_err; + + bytes = dwords << 2; + if (bytes > FANOTIFY_INLINE_FH_LEN) { + /* Treat failure to allocate fh as failure to allocate event */ + err = -ENOMEM; + fid->ext_fh = kmalloc(bytes, gfp); + if (!fid->ext_fh) + goto out_err; + } + + type = exportfs_encode_inode_fh(d_inode(path->dentry), + fanotify_fid_fh(fid, bytes), &dwords, + NULL); + err = -EINVAL; + if (!type || type == FILEID_INVALID || bytes != dwords << 2) + goto out_err; + + fid->fsid = stat.f_fsid; + event->fh_len = bytes; + + return type; + +out_err: + pr_warn_ratelimited("fanotify: failed to encode fid (fsid=%x.%x, " + "type=%d, bytes=%d, err=%i)\n", + stat.f_fsid.val[0], stat.f_fsid.val[1], + type, bytes, err); + kfree(fid->ext_fh); + fid->ext_fh = NULL; + event->fh_len = 0; + + return FILEID_INVALID; +} + struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group, struct inode *inode, u32 mask, const struct path *path) @@ -181,10 +244,16 @@ init: __maybe_unused event->pid = get_pid(task_pid(current)); else event->pid = get_pid(task_tgid(current)); - if (path) { + event->fh_len = 0; + if (path && FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + /* Report the event without a file identifier on encode error */ + event->fh_type = fanotify_encode_fid(event, path, gfp); + } else if (path) { + event->fh_type = FILEID_ROOT; event->path = *path; path_get(&event->path); } else { + event->fh_type = FILEID_INVALID; event->path.mnt = NULL; event->path.dentry = NULL; } @@ -281,7 +350,10 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) struct fanotify_event *event; event = FANOTIFY_E(fsn_event); - path_put(&event->path); + if (fanotify_event_has_path(event)) + path_put(&event->path); + else if (fanotify_event_has_ext_fh(event)) + kfree(event->fid.ext_fh); put_pid(event->pid); if (fanotify_is_perm_event(event->mask)) { kmem_cache_free(fanotify_perm_event_cachep, diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 898b5b2bc1c7..271482fb9611 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -2,11 +2,49 @@ #include #include #include +#include extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; +/* + * 3 dwords are sufficient for most local fs (64bit ino, 32bit generation). + * For 32bit arch, fid increases the size of fanotify_event by 12 bytes and + * fh_* fields increase the size of fanotify_event by another 4 bytes. + * For 64bit arch, fid increases the size of fanotify_fid by 8 bytes and + * fh_* fields are packed in a hole after mask. + */ +#if BITS_PER_LONG == 32 +#define FANOTIFY_INLINE_FH_LEN (3 << 2) +#else +#define FANOTIFY_INLINE_FH_LEN (4 << 2) +#endif + +struct fanotify_fid { + __kernel_fsid_t fsid; + union { + unsigned char fh[FANOTIFY_INLINE_FH_LEN]; + unsigned char *ext_fh; + }; +}; + +static inline void *fanotify_fid_fh(struct fanotify_fid *fid, + unsigned int fh_len) +{ + return fh_len <= FANOTIFY_INLINE_FH_LEN ? fid->fh : fid->ext_fh; +} + +static inline bool fanotify_fid_equal(struct fanotify_fid *fid1, + struct fanotify_fid *fid2, + unsigned int fh_len) +{ + return fid1->fsid.val[0] == fid2->fsid.val[0] && + fid1->fsid.val[1] == fid2->fsid.val[1] && + !memcmp(fanotify_fid_fh(fid1, fh_len), + fanotify_fid_fh(fid2, fh_len), fh_len); +} + /* * Structure for normal fanotify events. It gets allocated in * fanotify_handle_event() and freed when the information is retrieved by @@ -16,13 +54,47 @@ struct fanotify_event { struct fsnotify_event fse; u32 mask; /* - * We hold ref to this path so it may be dereferenced at any point - * during this object's lifetime + * Those fields are outside fanotify_fid to pack fanotify_event nicely + * on 64bit arch and to use fh_type as an indication of whether path + * or fid are used in the union: + * FILEID_ROOT (0) for path, > 0 for fid, FILEID_INVALID for neither. */ - struct path path; + u8 fh_type; + u8 fh_len; + u16 pad; + union { + /* + * We hold ref to this path so it may be dereferenced at any + * point during this object's lifetime + */ + struct path path; + /* + * With FAN_REPORT_FID, we do not hold any reference on the + * victim object. Instead we store its NFS file handle and its + * filesystem's fsid as a unique identifier. + */ + struct fanotify_fid fid; + }; struct pid *pid; }; +static inline bool fanotify_event_has_path(struct fanotify_event *event) +{ + return event->fh_type == FILEID_ROOT; +} + +static inline bool fanotify_event_has_fid(struct fanotify_event *event) +{ + return event->fh_type != FILEID_ROOT && + event->fh_type != FILEID_INVALID; +} + +static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event) +{ + return fanotify_event_has_fid(event) && + event->fh_len > FANOTIFY_INLINE_FH_LEN; +} + /* * Structure for permission fanotify events. It gets allocated and freed in * fanotify_handle_event() since we wait there for user response. When the diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 096503bd0edb..c965fcf4979e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -181,7 +181,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fanotify_event_metadata metadata; struct fanotify_event *event; struct file *f = NULL; - int fd, ret; + int ret, fd = FAN_NOFD; pr_debug("%s: group=%p event=%p\n", __func__, group, fsn_event); @@ -193,9 +193,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, metadata.mask = event->mask & FANOTIFY_OUTGOING_EVENTS; metadata.pid = pid_vnr(event->pid); - if (unlikely(event->mask & FAN_Q_OVERFLOW)) { - fd = FAN_NOFD; - } else { + if (fanotify_event_has_path(event)) { fd = create_fd(group, event, &f); if (fd < 0) return fd; diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 909c98fcace2..d07f3cbc2786 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -44,6 +44,7 @@ /* Flags to determine fanotify event format */ #define FAN_REPORT_TID 0x00000100 /* event->pid is thread id */ +#define FAN_REPORT_FID 0x00000200 /* Report unique file id */ /* Deprecated - do not use this in programs and do not add new flags here! */ #define FAN_ALL_INIT_FLAGS (FAN_CLOEXEC | FAN_NONBLOCK | \ -- cgit v1.2.3 From 5e469c830fdb5a1ebaa69b375b87f583326fd296 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 10 Jan 2019 19:04:35 +0200 Subject: fanotify: copy event fid info to user If group requested FAN_REPORT_FID and event has file identifier, copy that information to user reading the event after event metadata. fid information is formatted as struct fanotify_event_info_fid that includes a generic header struct fanotify_event_info_header, so that other info types could be defined in the future using the same header. metadata->event_len includes the length of the fid information. The fid information includes the filesystem's fsid (see statfs(2)) followed by an NFS file handle of the file that could be passed as an argument to open_by_handle_at(2). Cc: Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.h | 5 +++ fs/notify/fanotify/fanotify_user.c | 82 +++++++++++++++++++++++++++++++++++--- include/uapi/linux/fanotify.h | 20 ++++++++++ 3 files changed, 102 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 271482fb9611..4aafc7144c3d 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -95,6 +95,11 @@ static inline bool fanotify_event_has_ext_fh(struct fanotify_event *event) event->fh_len > FANOTIFY_INLINE_FH_LEN; } +static inline void *fanotify_event_fh(struct fanotify_event *event) +{ + return fanotify_fid_fh(&event->fid, event->fh_len); +} + /* * Structure for permission fanotify events. It gets allocated and freed in * fanotify_handle_event() since we wait there for user response. When the diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index c965fcf4979e..cd82dd713c91 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -47,6 +47,18 @@ struct kmem_cache *fanotify_mark_cache __read_mostly; struct kmem_cache *fanotify_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; +#define FANOTIFY_EVENT_ALIGN 4 + +static int fanotify_event_info_len(struct fanotify_event *event) +{ + if (!fanotify_event_has_fid(event)) + return 0; + + return roundup(sizeof(struct fanotify_event_info_fid) + + sizeof(struct file_handle) + event->fh_len, + FANOTIFY_EVENT_ALIGN); +} + /* * Get an fsnotify notification event if one exists and is small * enough to fit in "count". Return an error pointer if the count @@ -57,6 +69,9 @@ struct kmem_cache *fanotify_perm_event_cachep __read_mostly; static struct fsnotify_event *get_one_event(struct fsnotify_group *group, size_t count) { + size_t event_size = FAN_EVENT_METADATA_LEN; + struct fanotify_event *event; + assert_spin_locked(&group->notification_lock); pr_debug("%s: group=%p count=%zd\n", __func__, group, count); @@ -64,11 +79,18 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, if (fsnotify_notify_queue_is_empty(group)) return NULL; - if (FAN_EVENT_METADATA_LEN > count) + if (FAN_GROUP_FLAG(group, FAN_REPORT_FID)) { + event = FANOTIFY_E(fsnotify_peek_first_event(group)); + event_size += fanotify_event_info_len(event); + } + + if (event_size > count) return ERR_PTR(-EINVAL); - /* held the notification_lock the whole time, so this is the - * same event we peeked above */ + /* + * Held the notification_lock the whole time, so this is the + * same event we peeked above + */ return fsnotify_remove_first_event(group); } @@ -174,6 +196,48 @@ static int process_access_response(struct fsnotify_group *group, return 0; } +static int copy_fid_to_user(struct fanotify_event *event, char __user *buf) +{ + struct fanotify_event_info_fid info = { }; + struct file_handle handle = { }; + size_t fh_len = event->fh_len; + size_t len = fanotify_event_info_len(event); + + if (!len) + return 0; + + if (WARN_ON_ONCE(len < sizeof(info) + sizeof(handle) + fh_len)) + return -EFAULT; + + /* Copy event info fid header followed by vaiable sized file handle */ + info.hdr.info_type = FAN_EVENT_INFO_TYPE_FID; + info.hdr.len = len; + info.fsid = event->fid.fsid; + if (copy_to_user(buf, &info, sizeof(info))) + return -EFAULT; + + buf += sizeof(info); + len -= sizeof(info); + handle.handle_type = event->fh_type; + handle.handle_bytes = fh_len; + if (copy_to_user(buf, &handle, sizeof(handle))) + return -EFAULT; + + buf += sizeof(handle); + len -= sizeof(handle); + if (copy_to_user(buf, fanotify_event_fh(event), fh_len)) + return -EFAULT; + + /* Pad with 0's */ + buf += fh_len; + len -= fh_len; + WARN_ON_ONCE(len < 0 || len >= FANOTIFY_EVENT_ALIGN); + if (len > 0 && clear_user(buf, len)) + return -EFAULT; + + return 0; +} + static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fsnotify_event *fsn_event, char __user *buf, size_t count) @@ -197,6 +261,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, fd = create_fd(group, event, &f); if (fd < 0) return fd; + } else if (fanotify_event_has_fid(event)) { + metadata.event_len += fanotify_event_info_len(event); } metadata.fd = fd; @@ -208,14 +274,20 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, if (WARN_ON_ONCE(metadata.event_len > count)) goto out_close_fd; - if (copy_to_user(buf, &metadata, metadata.event_len)) + if (copy_to_user(buf, &metadata, FAN_EVENT_METADATA_LEN)) goto out_close_fd; if (fanotify_is_perm_event(event->mask)) FANOTIFY_PE(fsn_event)->fd = fd; - if (fd != FAN_NOFD) + if (fanotify_event_has_path(event)) { fd_install(fd, f); + } else if (fanotify_event_has_fid(event)) { + ret = copy_fid_to_user(event, buf + FAN_EVENT_METADATA_LEN); + if (ret < 0) + return ret; + } + return metadata.event_len; out_close_fd: diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index d07f3cbc2786..959ae2bdc7ca 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -107,6 +107,26 @@ struct fanotify_event_metadata { __s32 pid; }; +#define FAN_EVENT_INFO_TYPE_FID 1 + +/* Variable length info record following event metadata */ +struct fanotify_event_info_header { + __u8 info_type; + __u8 pad; + __u16 len; +}; + +/* Unique file identifier info record */ +struct fanotify_event_info_fid { + struct fanotify_event_info_header hdr; + __kernel_fsid_t fsid; + /* + * Following is an opaque struct file_handle that can be passed as + * an argument to open_by_handle_at(2). + */ + unsigned char handle[0]; +}; + struct fanotify_response { __s32 fd; __u32 response; -- cgit v1.2.3 From 235328d1fa4251c6dcb32351219bb553a58838d2 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 10 Jan 2019 19:04:43 +0200 Subject: fanotify: add support for create/attrib/move/delete events Add support for events with data type FSNOTIFY_EVENT_INODE (e.g. create/attrib/move/delete) for inode and filesystem mark types. The "inode" events do not carry enough information (i.e. path) to report event->fd, so we do not allow setting a mask for those events unless group supports reporting fid. The "inode" events are not supported on a mount mark, because they do not carry enough information (i.e. path) to be filtered by mount point. The "dirent" events (create/move/delete) report the fid of the parent directory where events took place without specifying the filename of the child. In the future, fanotify may get support for reporting filename information for those events. Cc: Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/notify/fanotify/fanotify.c | 9 ++++++++- fs/notify/fanotify/fanotify_user.c | 12 ++++++++++++ include/linux/fanotify.h | 22 ++++++++++++++++++++-- include/uapi/linux/fanotify.h | 8 ++++++++ 4 files changed, 48 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 974239b03442..158c69acb04d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -313,9 +313,16 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_ACCESS != FS_ACCESS); BUILD_BUG_ON(FAN_MODIFY != FS_MODIFY); + BUILD_BUG_ON(FAN_ATTRIB != FS_ATTRIB); BUILD_BUG_ON(FAN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE); BUILD_BUG_ON(FAN_CLOSE_WRITE != FS_CLOSE_WRITE); BUILD_BUG_ON(FAN_OPEN != FS_OPEN); + BUILD_BUG_ON(FAN_MOVED_TO != FS_MOVED_TO); + BUILD_BUG_ON(FAN_MOVED_FROM != FS_MOVED_FROM); + BUILD_BUG_ON(FAN_CREATE != FS_CREATE); + BUILD_BUG_ON(FAN_DELETE != FS_DELETE); + BUILD_BUG_ON(FAN_DELETE_SELF != FS_DELETE_SELF); + BUILD_BUG_ON(FAN_MOVE_SELF != FS_MOVE_SELF); BUILD_BUG_ON(FAN_EVENT_ON_CHILD != FS_EVENT_ON_CHILD); BUILD_BUG_ON(FAN_Q_OVERFLOW != FS_Q_OVERFLOW); BUILD_BUG_ON(FAN_OPEN_PERM != FS_OPEN_PERM); @@ -324,7 +331,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, BUILD_BUG_ON(FAN_OPEN_EXEC != FS_OPEN_EXEC); BUILD_BUG_ON(FAN_OPEN_EXEC_PERM != FS_OPEN_EXEC_PERM); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 12); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 19); mask = fanotify_group_event_mask(group, iter_info, mask, data, data_type); diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index bf06fd6ef761..6c61a06d0ef5 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -976,6 +976,18 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, group->priority == FS_PRIO_0) goto fput_and_out; + /* + * Events with data type inode do not carry enough information to report + * event->fd, so we do not allow setting a mask for inode events unless + * group supports reporting fid. + * inode events are not supported on a mount mark, because they do not + * carry enough information (i.e. path) to be filtered by mount point. + */ + if (mask & FANOTIFY_INODE_EVENTS && + (!FAN_GROUP_FLAG(group, FAN_REPORT_FID) || + mark_type == FAN_MARK_MOUNT)) + goto fput_and_out; + if (flags & FAN_MARK_FLUSH) { ret = 0; if (mark_type == FAN_MARK_MOUNT) diff --git a/include/linux/fanotify.h b/include/linux/fanotify.h index f59be967f72b..e9d45387089f 100644 --- a/include/linux/fanotify.h +++ b/include/linux/fanotify.h @@ -35,10 +35,28 @@ FAN_MARK_IGNORED_SURV_MODIFY | \ FAN_MARK_FLUSH) -/* Events that user can request to be notified on */ -#define FANOTIFY_EVENTS (FAN_ACCESS | FAN_MODIFY | \ +/* + * Events that can be reported with data type FSNOTIFY_EVENT_PATH. + * Note that FAN_MODIFY can also be reported with data type + * FSNOTIFY_EVENT_INODE. + */ +#define FANOTIFY_PATH_EVENTS (FAN_ACCESS | FAN_MODIFY | \ FAN_CLOSE | FAN_OPEN | FAN_OPEN_EXEC) +/* + * Directory entry modification events - reported only to directory + * where entry is modified and not to a watching parent. + */ +#define FANOTIFY_DIRENT_EVENTS (FAN_MOVE | FAN_CREATE | FAN_DELETE) + +/* Events that can only be reported with data type FSNOTIFY_EVENT_INODE */ +#define FANOTIFY_INODE_EVENTS (FANOTIFY_DIRENT_EVENTS | \ + FAN_ATTRIB | FAN_MOVE_SELF | FAN_DELETE_SELF) + +/* Events that user can request to be notified on */ +#define FANOTIFY_EVENTS (FANOTIFY_PATH_EVENTS | \ + FANOTIFY_INODE_EVENTS) + /* Events that require a permission response from user */ #define FANOTIFY_PERM_EVENTS (FAN_OPEN_PERM | FAN_ACCESS_PERM | \ FAN_OPEN_EXEC_PERM) diff --git a/include/uapi/linux/fanotify.h b/include/uapi/linux/fanotify.h index 959ae2bdc7ca..b9effa6f8503 100644 --- a/include/uapi/linux/fanotify.h +++ b/include/uapi/linux/fanotify.h @@ -7,9 +7,16 @@ /* the following events that user-space can register for */ #define FAN_ACCESS 0x00000001 /* File was accessed */ #define FAN_MODIFY 0x00000002 /* File was modified */ +#define FAN_ATTRIB 0x00000004 /* Metadata changed */ #define FAN_CLOSE_WRITE 0x00000008 /* Writtable file closed */ #define FAN_CLOSE_NOWRITE 0x00000010 /* Unwrittable file closed */ #define FAN_OPEN 0x00000020 /* File was opened */ +#define FAN_MOVED_FROM 0x00000040 /* File was moved from X */ +#define FAN_MOVED_TO 0x00000080 /* File was moved to Y */ +#define FAN_CREATE 0x00000100 /* Subfile was created */ +#define FAN_DELETE 0x00000200 /* Subfile was deleted */ +#define FAN_DELETE_SELF 0x00000400 /* Self was deleted */ +#define FAN_MOVE_SELF 0x00000800 /* Self was moved */ #define FAN_OPEN_EXEC 0x00001000 /* File was opened for exec */ #define FAN_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ @@ -24,6 +31,7 @@ /* helper events */ #define FAN_CLOSE (FAN_CLOSE_WRITE | FAN_CLOSE_NOWRITE) /* close */ +#define FAN_MOVE (FAN_MOVED_FROM | FAN_MOVED_TO) /* moves */ /* flags used for fanotify_init() */ #define FAN_CLOEXEC 0x00000001 -- cgit v1.2.3 From 1db64e8733f653814f041ffe1428524494ef6123 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 7 Feb 2019 11:36:32 +0200 Subject: devlink: Add devlink formatted message (fmsg) API Devlink fmsg is a mechanism to pass descriptors between drivers and devlink, in json-like format. The API allows the driver to add nested attributes such as object, object pair and value array, in addition to attributes such as name and value. Driver can use this API to fill the fmsg context in a format which will be translated by the devlink to the netlink message later. There is no memory allocation in advance (other than the initial list head), and it dynamically allocates messages descriptors and add them to the list on the fly. When it needs to send the data using SKBs to the netlink layer, it fragments the data between different SKBs. In order to do this fragmentation, it uses virtual nests attributes, to avoid actual nesting use which cannot be divided between different SKBs. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 149 +++++++++++++ include/uapi/linux/devlink.h | 8 + net/core/devlink.c | 483 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 640 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 74d992a68a06..7c5722e816aa 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -448,6 +448,8 @@ struct devlink_info_req; typedef void devlink_snapshot_data_dest_t(const void *data); +struct devlink_fmsg; + struct devlink_ops { int (*reload)(struct devlink *devlink, struct netlink_ext_ack *extack); int (*port_type_set)(struct devlink_port *devlink_port, @@ -639,6 +641,37 @@ int devlink_info_version_running_put(struct devlink_info_req *req, const char *version_name, const char *version_value); +int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg); +int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg); + +int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name); +int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg); + +int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, + const char *name); +int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg); + +int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value); +int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value); +int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value); +int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value); +int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value); +int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, + u16 value_len); + +int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, + bool value); +int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, + u8 value); +int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, + u32 value); +int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, + u64 value); +int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, + const char *value); +int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, + const void *value, u16 value_len); + #else static inline struct devlink *devlink_alloc(const struct devlink_ops *ops, @@ -971,6 +1004,122 @@ devlink_info_version_running_put(struct devlink_info_req *req, { return 0; } + +static inline int +devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg) +{ + return 0; +} + +static inline int +devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg) +{ + return 0; +} + +static inline int +devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) +{ + return 0; +} + +static inline int +devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg) +{ + return 0; +} + +static inline int +devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, + const char *name) +{ + return 0; +} + +static inline int +devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) +{ + return 0; +} + +static inline int +devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) +{ + return 0; +} + +static inline int +devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) +{ + return 0; +} + +static inline int +devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) +{ + return 0; +} + +static inline int +devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) +{ + return 0; +} + +static inline int +devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) +{ + return 0; +} + +static inline int +devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, + u16 value_len) +{ + return 0; +} + +static inline int +devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, + bool value) +{ + return 0; +} + +static inline int +devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, + u8 value) +{ + return 0; +} + +static inline int +devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, + u32 value) +{ + return 0; +} + +static inline int +devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, + u64 value) +{ + return 0; +} + +static inline int +devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, + const char *value) +{ + return 0; +} + +static inline int +devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, + const void *value, u16 value_len) +{ + return 0; +} #endif #if IS_REACHABLE(CONFIG_NET_DEVLINK) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 054b2d1a4537..076692209a9b 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -302,6 +302,14 @@ enum devlink_attr { DEVLINK_ATTR_SB_POOL_CELL_SIZE, /* u32 */ + DEVLINK_ATTR_FMSG, /* nested */ + DEVLINK_ATTR_FMSG_OBJ_NEST_START, /* flag */ + DEVLINK_ATTR_FMSG_PAIR_NEST_START, /* flag */ + DEVLINK_ATTR_FMSG_ARR_NEST_START, /* flag */ + DEVLINK_ATTR_FMSG_NEST_END, /* flag */ + DEVLINK_ATTR_FMSG_OBJ_NAME, /* string */ + DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, /* u8 */ + DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA, /* dynamic */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index cd0d393bc62d..03883697fcf0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -3879,6 +3879,489 @@ static int devlink_nl_cmd_info_get_dumpit(struct sk_buff *msg, return msg->len; } +struct devlink_fmsg_item { + struct list_head list; + int attrtype; + u8 nla_type; + u16 len; + int value[0]; +}; + +struct devlink_fmsg { + struct list_head item_list; +}; + +static struct devlink_fmsg *devlink_fmsg_alloc(void) +{ + struct devlink_fmsg *fmsg; + + fmsg = kzalloc(sizeof(*fmsg), GFP_KERNEL); + if (!fmsg) + return NULL; + + INIT_LIST_HEAD(&fmsg->item_list); + + return fmsg; +} + +static void devlink_fmsg_free(struct devlink_fmsg *fmsg) +{ + struct devlink_fmsg_item *item, *tmp; + + list_for_each_entry_safe(item, tmp, &fmsg->item_list, list) { + list_del(&item->list); + kfree(item); + } + kfree(fmsg); +} + +static int devlink_fmsg_nest_common(struct devlink_fmsg *fmsg, + int attrtype) +{ + struct devlink_fmsg_item *item; + + item = kzalloc(sizeof(*item), GFP_KERNEL); + if (!item) + return -ENOMEM; + + item->attrtype = attrtype; + list_add_tail(&item->list, &fmsg->item_list); + + return 0; +} + +int devlink_fmsg_obj_nest_start(struct devlink_fmsg *fmsg) +{ + return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_OBJ_NEST_START); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_start); + +static int devlink_fmsg_nest_end(struct devlink_fmsg *fmsg) +{ + return devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_NEST_END); +} + +int devlink_fmsg_obj_nest_end(struct devlink_fmsg *fmsg) +{ + return devlink_fmsg_nest_end(fmsg); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_obj_nest_end); + +#define DEVLINK_FMSG_MAX_SIZE (GENLMSG_DEFAULT_SIZE - GENL_HDRLEN - NLA_HDRLEN) + +static int devlink_fmsg_put_name(struct devlink_fmsg *fmsg, const char *name) +{ + struct devlink_fmsg_item *item; + + if (strlen(name) + 1 > DEVLINK_FMSG_MAX_SIZE) + return -EMSGSIZE; + + item = kzalloc(sizeof(*item) + strlen(name) + 1, GFP_KERNEL); + if (!item) + return -ENOMEM; + + item->nla_type = NLA_NUL_STRING; + item->len = strlen(name) + 1; + item->attrtype = DEVLINK_ATTR_FMSG_OBJ_NAME; + memcpy(&item->value, name, item->len); + list_add_tail(&item->list, &fmsg->item_list); + + return 0; +} + +int devlink_fmsg_pair_nest_start(struct devlink_fmsg *fmsg, const char *name) +{ + int err; + + err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_PAIR_NEST_START); + if (err) + return err; + + err = devlink_fmsg_put_name(fmsg, name); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_start); + +int devlink_fmsg_pair_nest_end(struct devlink_fmsg *fmsg) +{ + return devlink_fmsg_nest_end(fmsg); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_pair_nest_end); + +int devlink_fmsg_arr_pair_nest_start(struct devlink_fmsg *fmsg, + const char *name) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_nest_common(fmsg, DEVLINK_ATTR_FMSG_ARR_NEST_START); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_start); + +int devlink_fmsg_arr_pair_nest_end(struct devlink_fmsg *fmsg) +{ + int err; + + err = devlink_fmsg_nest_end(fmsg); + if (err) + return err; + + err = devlink_fmsg_nest_end(fmsg); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_arr_pair_nest_end); + +static int devlink_fmsg_put_value(struct devlink_fmsg *fmsg, + const void *value, u16 value_len, + u8 value_nla_type) +{ + struct devlink_fmsg_item *item; + + if (value_len > DEVLINK_FMSG_MAX_SIZE) + return -EMSGSIZE; + + item = kzalloc(sizeof(*item) + value_len, GFP_KERNEL); + if (!item) + return -ENOMEM; + + item->nla_type = value_nla_type; + item->len = value_len; + item->attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; + memcpy(&item->value, value, item->len); + list_add_tail(&item->list, &fmsg->item_list); + + return 0; +} + +int devlink_fmsg_bool_put(struct devlink_fmsg *fmsg, bool value) +{ + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_FLAG); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_bool_put); + +int devlink_fmsg_u8_put(struct devlink_fmsg *fmsg, u8 value) +{ + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U8); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u8_put); + +int devlink_fmsg_u32_put(struct devlink_fmsg *fmsg, u32 value) +{ + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U32); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u32_put); + +int devlink_fmsg_u64_put(struct devlink_fmsg *fmsg, u64 value) +{ + return devlink_fmsg_put_value(fmsg, &value, sizeof(value), NLA_U64); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u64_put); + +int devlink_fmsg_string_put(struct devlink_fmsg *fmsg, const char *value) +{ + return devlink_fmsg_put_value(fmsg, value, strlen(value) + 1, + NLA_NUL_STRING); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_string_put); + +int devlink_fmsg_binary_put(struct devlink_fmsg *fmsg, const void *value, + u16 value_len) +{ + return devlink_fmsg_put_value(fmsg, value, value_len, NLA_BINARY); +} +EXPORT_SYMBOL_GPL(devlink_fmsg_binary_put); + +int devlink_fmsg_bool_pair_put(struct devlink_fmsg *fmsg, const char *name, + bool value) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_bool_put(fmsg, value); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_bool_pair_put); + +int devlink_fmsg_u8_pair_put(struct devlink_fmsg *fmsg, const char *name, + u8 value) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_u8_put(fmsg, value); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u8_pair_put); + +int devlink_fmsg_u32_pair_put(struct devlink_fmsg *fmsg, const char *name, + u32 value) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_u32_put(fmsg, value); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u32_pair_put); + +int devlink_fmsg_u64_pair_put(struct devlink_fmsg *fmsg, const char *name, + u64 value) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_u64_put(fmsg, value); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_u64_pair_put); + +int devlink_fmsg_string_pair_put(struct devlink_fmsg *fmsg, const char *name, + const char *value) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_string_put(fmsg, value); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_string_pair_put); + +int devlink_fmsg_binary_pair_put(struct devlink_fmsg *fmsg, const char *name, + const void *value, u16 value_len) +{ + int err; + + err = devlink_fmsg_pair_nest_start(fmsg, name); + if (err) + return err; + + err = devlink_fmsg_binary_put(fmsg, value, value_len); + if (err) + return err; + + err = devlink_fmsg_pair_nest_end(fmsg); + if (err) + return err; + + return 0; +} +EXPORT_SYMBOL_GPL(devlink_fmsg_binary_pair_put); + +static int +devlink_fmsg_item_fill_type(struct devlink_fmsg_item *msg, struct sk_buff *skb) +{ + switch (msg->nla_type) { + case NLA_FLAG: + case NLA_U8: + case NLA_U32: + case NLA_U64: + case NLA_NUL_STRING: + case NLA_BINARY: + return nla_put_u8(skb, DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, + msg->nla_type); + default: + return -EINVAL; + } +} + +static int +devlink_fmsg_item_fill_data(struct devlink_fmsg_item *msg, struct sk_buff *skb) +{ + int attrtype = DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA; + u8 tmp; + + switch (msg->nla_type) { + case NLA_FLAG: + /* Always provide flag data, regardless of its value */ + tmp = *(bool *) msg->value; + + return nla_put_u8(skb, attrtype, tmp); + case NLA_U8: + return nla_put_u8(skb, attrtype, *(u8 *) msg->value); + case NLA_U32: + return nla_put_u32(skb, attrtype, *(u32 *) msg->value); + case NLA_U64: + return nla_put_u64_64bit(skb, attrtype, *(u64 *) msg->value, + DEVLINK_ATTR_PAD); + case NLA_NUL_STRING: + return nla_put_string(skb, attrtype, (char *) &msg->value); + case NLA_BINARY: + return nla_put(skb, attrtype, msg->len, (void *) &msg->value); + default: + return -EINVAL; + } +} + +static int +devlink_fmsg_prepare_skb(struct devlink_fmsg *fmsg, struct sk_buff *skb, + int *start) +{ + struct devlink_fmsg_item *item; + struct nlattr *fmsg_nlattr; + int i = 0; + int err; + + fmsg_nlattr = nla_nest_start(skb, DEVLINK_ATTR_FMSG); + if (!fmsg_nlattr) + return -EMSGSIZE; + + list_for_each_entry(item, &fmsg->item_list, list) { + if (i < *start) { + i++; + continue; + } + + switch (item->attrtype) { + case DEVLINK_ATTR_FMSG_OBJ_NEST_START: + case DEVLINK_ATTR_FMSG_PAIR_NEST_START: + case DEVLINK_ATTR_FMSG_ARR_NEST_START: + case DEVLINK_ATTR_FMSG_NEST_END: + err = nla_put_flag(skb, item->attrtype); + break; + case DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA: + err = devlink_fmsg_item_fill_type(item, skb); + if (err) + break; + err = devlink_fmsg_item_fill_data(item, skb); + break; + case DEVLINK_ATTR_FMSG_OBJ_NAME: + err = nla_put_string(skb, item->attrtype, + (char *) &item->value); + break; + default: + err = -EINVAL; + break; + } + if (!err) + *start = ++i; + else + break; + } + + nla_nest_end(skb, fmsg_nlattr); + return err; +} + +static int devlink_fmsg_snd(struct devlink_fmsg *fmsg, + struct genl_info *info, + enum devlink_command cmd, int flags) +{ + struct nlmsghdr *nlh; + struct sk_buff *skb; + bool last = false; + int index = 0; + void *hdr; + int err; + + while (!last) { + int tmp_index = index; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, flags | NLM_F_MULTI, cmd); + if (!hdr) { + err = -EMSGSIZE; + goto nla_put_failure; + } + + err = devlink_fmsg_prepare_skb(fmsg, skb, &index); + if (!err) + last = true; + else if (err != -EMSGSIZE || tmp_index == index) + goto nla_put_failure; + + genlmsg_end(skb, hdr); + err = genlmsg_reply(skb, info); + if (err) + return err; + } + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = -EMSGSIZE; + goto nla_put_failure; + } + err = genlmsg_reply(skb, info); + if (err) + return err; + + return 0; + +nla_put_failure: + nlmsg_free(skb); + return err; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, -- cgit v1.2.3 From 7afe335a8bede4e2839b0e0fa36ef629fe4a0206 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 7 Feb 2019 11:36:35 +0200 Subject: devlink: Add health get command Add devlink health get command to provide reporter/s data for user space. Add the ability to get data per reporter or dump data from all available reporters. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 11 ++++ net/core/devlink.c | 149 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 076692209a9b..d8f20d6ce139 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -96,6 +96,8 @@ enum devlink_command { DEVLINK_CMD_INFO_GET, /* can dump */ + DEVLINK_CMD_HEALTH_REPORTER_GET, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -310,6 +312,15 @@ enum devlink_attr { DEVLINK_ATTR_FMSG_OBJ_NAME, /* string */ DEVLINK_ATTR_FMSG_OBJ_VALUE_TYPE, /* u8 */ DEVLINK_ATTR_FMSG_OBJ_VALUE_DATA, /* dynamic */ + + DEVLINK_ATTR_HEALTH_REPORTER, /* nested */ + DEVLINK_ATTR_HEALTH_REPORTER_NAME, /* string */ + DEVLINK_ATTR_HEALTH_REPORTER_STATE, /* u8 */ + DEVLINK_ATTR_HEALTH_REPORTER_ERR, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_RECOVER, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, /* u8 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 3eaa290831aa..86f7c0e5d4bc 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4572,6 +4572,146 @@ int devlink_health_report(struct devlink_health_reporter *reporter, } EXPORT_SYMBOL_GPL(devlink_health_report); +static struct devlink_health_reporter * +devlink_health_reporter_get_from_info(struct devlink *devlink, + struct genl_info *info) +{ + char *reporter_name; + + if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]) + return NULL; + + reporter_name = + nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]); + return devlink_health_reporter_find_by_name(devlink, reporter_name); +} + +static int +devlink_nl_health_reporter_fill(struct sk_buff *msg, + struct devlink *devlink, + struct devlink_health_reporter *reporter, + enum devlink_command cmd, u32 portid, + u32 seq, int flags) +{ + struct nlattr *reporter_attr; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(msg, devlink)) + goto genlmsg_cancel; + + reporter_attr = nla_nest_start(msg, DEVLINK_ATTR_HEALTH_REPORTER); + if (!reporter_attr) + goto genlmsg_cancel; + if (nla_put_string(msg, DEVLINK_ATTR_HEALTH_REPORTER_NAME, + reporter->ops->name)) + goto reporter_nest_cancel; + if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, + reporter->health_state)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR, + reporter->error_count, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER, + reporter->recovery_count, DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, + reporter->graceful_period, + DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, + reporter->auto_recover)) + goto reporter_nest_cancel; + if (reporter->dump_fmsg && + nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, + jiffies_to_msecs(reporter->dump_ts), + DEVLINK_ATTR_PAD)) + goto reporter_nest_cancel; + + nla_nest_end(msg, reporter_attr); + genlmsg_end(msg, hdr); + return 0; + +reporter_nest_cancel: + nla_nest_end(msg, reporter_attr); +genlmsg_cancel: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +static int devlink_nl_cmd_health_reporter_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + struct sk_buff *msg; + int err; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + err = devlink_nl_health_reporter_fill(msg, devlink, reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + info->snd_portid, info->snd_seq, + 0); + if (err) { + nlmsg_free(msg); + return err; + } + + return genlmsg_reply(msg, info); +} + +static int +devlink_nl_cmd_health_reporter_get_dumpit(struct sk_buff *msg, + struct netlink_callback *cb) +{ + struct devlink_health_reporter *reporter; + struct devlink *devlink; + int start = cb->args[0]; + int idx = 0; + int err; + + mutex_lock(&devlink_mutex); + list_for_each_entry(devlink, &devlink_list, list) { + if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) + continue; + mutex_lock(&devlink->lock); + list_for_each_entry(reporter, &devlink->reporter_list, + list) { + if (idx < start) { + idx++; + continue; + } + err = devlink_nl_health_reporter_fill(msg, devlink, + reporter, + DEVLINK_CMD_HEALTH_REPORTER_GET, + NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + NLM_F_MULTI); + if (err) { + mutex_unlock(&devlink->lock); + goto out; + } + idx++; + } + mutex_unlock(&devlink->lock); + } +out: + mutex_unlock(&devlink_mutex); + + cb->args[0] = idx; + return msg->len; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -4597,6 +4737,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_PARAM_VALUE_CMODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, + [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -4840,6 +4981,14 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_GET, + .doit = devlink_nl_cmd_health_reporter_get_doit, + .dumpit = devlink_nl_cmd_health_reporter_get_dumpit, + .policy = devlink_nl_policy, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + /* can be retrieved by unprivileged users */ + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From a1e55ec0a0c6969cb7e9d9080a84041bb7b2b6e6 Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 7 Feb 2019 11:36:36 +0200 Subject: devlink: Add health set command Add devlink health set command, in order to set configuration parameters for a specific reporter. Supported parameters are: - graceful_period: Time interval between auto recoveries (in msec) - auto_recover: Determines if the devlink shall execute recover upon receiving error for the reporter Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 36 ++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index d8f20d6ce139..b03065a99884 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -97,6 +97,7 @@ enum devlink_command { DEVLINK_CMD_INFO_GET, /* can dump */ DEVLINK_CMD_HEALTH_REPORTER_GET, + DEVLINK_CMD_HEALTH_REPORTER_SET, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 86f7c0e5d4bc..0b231fb76e59 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4712,6 +4712,33 @@ out: return msg->len; } +static int +devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->recover && + (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] || + info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER])) + return -EOPNOTSUPP; + + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]) + reporter->graceful_period = + nla_get_u64(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD]); + + if (info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]) + reporter->auto_recover = + nla_get_u8(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER]); + + return 0; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -4738,6 +4765,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_REGION_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_REGION_SNAPSHOT_ID] = { .type = NLA_U32 }, [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, + [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -4989,6 +5018,13 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, /* can be retrieved by unprivileged users */ }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_SET, + .doit = devlink_nl_cmd_health_reporter_set_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From 20a0943a5b237f7d59dc581e9e3637f5c87f1fde Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 7 Feb 2019 11:36:37 +0200 Subject: devlink: Add health recover command Add devlink health recover command to the uapi, in order to allow the user to execute a recover operation over a specific reporter. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index b03065a99884..a3a97e6edad8 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -98,6 +98,7 @@ enum devlink_command { DEVLINK_CMD_HEALTH_REPORTER_GET, DEVLINK_CMD_HEALTH_REPORTER_SET, + DEVLINK_CMD_HEALTH_REPORTER_RECOVER, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 0b231fb76e59..0e6b0e034863 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4739,6 +4739,19 @@ devlink_nl_cmd_health_reporter_set_doit(struct sk_buff *skb, return 0; } +static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + return devlink_health_reporter_recover(reporter, NULL); +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -5025,6 +5038,13 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_RECOVER, + .doit = devlink_nl_cmd_health_reporter_recover_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From fca42a2794e31379855c7d687055da43a6e05eef Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 7 Feb 2019 11:36:38 +0200 Subject: devlink: Add health diagnose command Add devlink health diagnose command, in order to run a diagnose operation over a specific reporter. It is expected from driver's callback for diagnose command to fill it via the devlink fmsg API. Devlink will parse it and convert it to netlink nla API in order to pass it to the user. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 46 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index a3a97e6edad8..09be37137841 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -99,6 +99,7 @@ enum devlink_command { DEVLINK_CMD_HEALTH_REPORTER_GET, DEVLINK_CMD_HEALTH_REPORTER_SET, DEVLINK_CMD_HEALTH_REPORTER_RECOVER, + DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 0e6b0e034863..1e8613db2bf7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4752,6 +4752,45 @@ static int devlink_nl_cmd_health_reporter_recover_doit(struct sk_buff *skb, return devlink_health_reporter_recover(reporter, NULL); } +static int devlink_nl_cmd_health_reporter_diagnose_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + struct devlink_fmsg *fmsg; + int err; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->diagnose) + return -EOPNOTSUPP; + + fmsg = devlink_fmsg_alloc(); + if (!fmsg) + return -ENOMEM; + + err = devlink_fmsg_obj_nest_start(fmsg); + if (err) + goto out; + + err = reporter->ops->diagnose(reporter, fmsg); + if (err) + goto out; + + err = devlink_fmsg_obj_nest_end(fmsg); + if (err) + goto out; + + err = devlink_fmsg_snd(fmsg, info, + DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, 0); + +out: + devlink_fmsg_free(fmsg); + return err; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -5045,6 +5084,13 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, + .doit = devlink_nl_cmd_health_reporter_diagnose_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From 35455e23e6f3cffe20e2b948e57597a8dc240b1e Mon Sep 17 00:00:00 2001 From: Eran Ben Elisha Date: Thu, 7 Feb 2019 11:36:39 +0200 Subject: devlink: Add health dump {get,clear} commands Add devlink health dump commands, in order to run an dump operation over a specific reporter. The supported operations are dump_get in order to get last saved dump (if not exist, dump now) and dump_clear to clear last saved dump. It is expected from driver's callback for diagnose command to fill it via the devlink fmsg API. Devlink will parse it and convert it to netlink nla API in order to pass it to the user. Signed-off-by: Eran Ben Elisha Reviewed-by: Moshe Shemesh Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 63 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 65 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 09be37137841..72d9f7c89190 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -100,6 +100,8 @@ enum devlink_command { DEVLINK_CMD_HEALTH_REPORTER_SET, DEVLINK_CMD_HEALTH_REPORTER_RECOVER, DEVLINK_CMD_HEALTH_REPORTER_DIAGNOSE, + DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, + DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, /* add new commands above here */ __DEVLINK_CMD_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 1e8613db2bf7..7fbdba547d4f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4791,6 +4791,53 @@ out: return err; } +static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + int err; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->dump) + return -EOPNOTSUPP; + + mutex_lock(&reporter->dump_lock); + err = devlink_health_do_dump(reporter, NULL); + if (err) + goto out; + + err = devlink_fmsg_snd(reporter->dump_fmsg, info, + DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, 0); + +out: + mutex_unlock(&reporter->dump_lock); + return err; +} + +static int +devlink_nl_cmd_health_reporter_dump_clear_doit(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_health_reporter *reporter; + + reporter = devlink_health_reporter_get_from_info(devlink, info); + if (!reporter) + return -EINVAL; + + if (!reporter->ops->dump) + return -EOPNOTSUPP; + + mutex_lock(&reporter->dump_lock); + devlink_health_dump_clear(reporter); + mutex_unlock(&reporter->dump_lock); + return 0; +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -5091,6 +5138,22 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, + .doit = devlink_nl_cmd_health_reporter_dump_get_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, + }, + { + .cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, + .doit = devlink_nl_cmd_health_reporter_dump_clear_doit, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From 998a8a8387ff5f65da456d1fc448dbb926fb5d78 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 7 Feb 2019 21:41:46 +0100 Subject: net: phy: let genphy_c45_read_link manage the devices to check Let genphy_c45_read_link manage the devices to check, this removes overhead from callers. Add C22EXT to the list of excluded devices because it doesn't implement the status register. According to the 802.3 clause 45 spec registers 29.0 - 29.4 are reserved. At the moment we have very few clause 45 PHY drivers, so we are lacking experience whether other drivers will have to exclude further devices, or may need to check PHY XS. If we should figure out that list of devices to check needs to be configurable, I think best will be to add a device list member to struct phy_driver. v2: - adjusted commit message - exclude also device C22EXT from link checking Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/marvell10g.c | 10 +--------- drivers/net/phy/phy-c45.c | 18 ++++++++++-------- include/linux/phy.h | 2 +- include/uapi/linux/mdio.h | 2 ++ 4 files changed, 14 insertions(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/phy/marvell10g.c b/drivers/net/phy/marvell10g.c index 296a537cdfcb..96a79c6c7810 100644 --- a/drivers/net/phy/marvell10g.c +++ b/drivers/net/phy/marvell10g.c @@ -428,16 +428,8 @@ static int mv3310_read_10gbr_status(struct phy_device *phydev) static int mv3310_read_status(struct phy_device *phydev) { - u32 mmd_mask = phydev->c45_ids.devices_in_package; int val; - /* The vendor devads do not report link status. Avoid the PHYXS - * instance as there are three, and its status depends on the MAC - * being appropriately configured for the negotiated speed. - */ - mmd_mask &= ~(BIT(MDIO_MMD_VEND1) | BIT(MDIO_MMD_VEND2) | - BIT(MDIO_MMD_PHYXS)); - phydev->speed = SPEED_UNKNOWN; phydev->duplex = DUPLEX_UNKNOWN; linkmode_zero(phydev->lp_advertising); @@ -453,7 +445,7 @@ static int mv3310_read_status(struct phy_device *phydev) if (val & MDIO_STAT1_LSTATUS) return mv3310_read_10gbr_status(phydev); - val = genphy_c45_read_link(phydev, mmd_mask); + val = genphy_c45_read_link(phydev); if (val < 0) return val; diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index c92d0fb7ec4f..6adfe1f6319e 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -118,17 +118,24 @@ EXPORT_SYMBOL_GPL(genphy_c45_aneg_done); /** * genphy_c45_read_link - read the overall link status from the MMDs * @phydev: target phy_device struct - * @mmd_mask: MMDs to read status from * * Read the link status from the specified MMDs, and if they all indicate * that the link is up, set phydev->link to 1. If an error is encountered, * a negative errno will be returned, otherwise zero. */ -int genphy_c45_read_link(struct phy_device *phydev, u32 mmd_mask) +int genphy_c45_read_link(struct phy_device *phydev) { + u32 mmd_mask = phydev->c45_ids.devices_in_package; int val, devad; bool link = true; + /* The vendor devads and C22EXT do not report link status. Avoid the + * PHYXS instance as its status may depend on the MAC being + * appropriately configured for the negotiated speed. + */ + mmd_mask &= ~(MDIO_DEVS_VEND1 | MDIO_DEVS_VEND2 | MDIO_DEVS_C22EXT | + MDIO_DEVS_PHYXS); + while (mmd_mask && link) { devad = __ffs(mmd_mask); mmd_mask &= ~BIT(devad); @@ -266,16 +273,11 @@ EXPORT_SYMBOL_GPL(gen10g_config_aneg); int gen10g_read_status(struct phy_device *phydev) { - u32 mmd_mask = phydev->c45_ids.devices_in_package; - /* For now just lie and say it's 10G all the time */ phydev->speed = SPEED_10000; phydev->duplex = DUPLEX_FULL; - /* Avoid reading the vendor MMDs */ - mmd_mask &= ~(BIT(MDIO_MMD_VEND1) | BIT(MDIO_MMD_VEND2)); - - return genphy_c45_read_link(phydev, mmd_mask); + return genphy_c45_read_link(phydev); } EXPORT_SYMBOL_GPL(gen10g_read_status); diff --git a/include/linux/phy.h b/include/linux/phy.h index 237dd035858a..f41bf651f6a0 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -1094,7 +1094,7 @@ int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum, /* Clause 45 PHY */ int genphy_c45_restart_aneg(struct phy_device *phydev); int genphy_c45_aneg_done(struct phy_device *phydev); -int genphy_c45_read_link(struct phy_device *phydev, u32 mmd_mask); +int genphy_c45_read_link(struct phy_device *phydev); int genphy_c45_read_lpa(struct phy_device *phydev); int genphy_c45_read_pma(struct phy_device *phydev); int genphy_c45_pma_setup_forced(struct phy_device *phydev); diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index d435b00d64ad..2e6e309f0847 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -123,6 +123,8 @@ #define MDIO_DEVS_TC MDIO_DEVS_PRESENT(MDIO_MMD_TC) #define MDIO_DEVS_AN MDIO_DEVS_PRESENT(MDIO_MMD_AN) #define MDIO_DEVS_C22EXT MDIO_DEVS_PRESENT(MDIO_MMD_C22EXT) +#define MDIO_DEVS_VEND1 MDIO_DEVS_PRESENT(MDIO_MMD_VEND1) +#define MDIO_DEVS_VEND2 MDIO_DEVS_PRESENT(MDIO_MMD_VEND2) /* Control register 2. */ #define MDIO_PMA_CTRL2_TYPE 0x000f /* PMA/PMD type selection */ -- cgit v1.2.3 From 3b5e74e0afe3382f9354b657714ac40673b7c597 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 8 Feb 2019 19:25:22 +0100 Subject: net: phy: disregard "Clause 22 registers present" bit in get_phy_c45_devs_in_pkg Bit 0 in register 1.5 doesn't represent a device but is a flag that Clause 22 registers are present. Therefore disregard this bit when populating the device list. If code needs this information it should read register 1.5 directly instead of accessing the device list. Because this bit doesn't represent a device don't define a MDIO_MMD_XYZ constant, just define a MDIO_DEVS_XYZ constant for the flag in the device list bitmap. v2: - make masking of bit 0 more explicit - improve commit message Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/phy/phy_device.c | 3 +++ include/uapi/linux/mdio.h | 1 + 2 files changed, 4 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 9369e1323c39..d4fc1fd8af41 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -684,6 +684,9 @@ static int get_phy_c45_devs_in_pkg(struct mii_bus *bus, int addr, int dev_addr, return -EIO; *devices_in_package |= (phy_reg & 0xffff); + /* Bit 0 doesn't represent a device, it indicates c22 regs presence */ + *devices_in_package &= ~BIT(0); + return 0; } diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index 2e6e309f0847..0e012b168e4d 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -115,6 +115,7 @@ /* Device present registers. */ #define MDIO_DEVS_PRESENT(devad) (1 << (devad)) +#define MDIO_DEVS_C22PRESENT MDIO_DEVS_PRESENT(0) #define MDIO_DEVS_PMAPMD MDIO_DEVS_PRESENT(MDIO_MMD_PMAPMD) #define MDIO_DEVS_WIS MDIO_DEVS_PRESENT(MDIO_MMD_WIS) #define MDIO_DEVS_PCS MDIO_DEVS_PRESENT(MDIO_MMD_PCS) -- cgit v1.2.3 From 180cf62cec0418a64dade18b3575047af46c6335 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Sat, 9 Feb 2019 00:05:36 +0100 Subject: batman-adv: Fix typo "reseved" -> "reserved" checkpatch.pl complains since commit 45e417022023 ("scripts/spelling.txt: add more spellings to spelling.txt") about an additional spelling mistake in batman-adv:` CHECK: 'reseved' may be misspelled - perhaps 'reserved'? #232: FILE: include/uapi/linux/batadv_packet.h:232: + * @flags: reseved for routing relevant flags - currently always 0 Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batadv_packet.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batadv_packet.h b/include/uapi/linux/batadv_packet.h index 7eb2936a8e22..c99336f4eefe 100644 --- a/include/uapi/linux/batadv_packet.h +++ b/include/uapi/linux/batadv_packet.h @@ -229,7 +229,7 @@ struct batadv_ogm_packet { * @packet_type: batman-adv packet type, part of the general header * @version: batman-adv protocol version, part of the general header * @ttl: time to live for this packet, part of the general header - * @flags: reseved for routing relevant flags - currently always 0 + * @flags: reserved for routing relevant flags - currently always 0 * @seqno: sequence number * @orig: originator mac address * @tvlv_len: length of the appended tvlv buffer (in bytes) -- cgit v1.2.3 From 60040513536097584c3d55b39acdfa7080645d80 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 12:14:56 +0100 Subject: batman-adv: Prepare framework for mesh genl config The batman-adv configuration interface was implemented solely using sysfs. This approach was condemned by non-batadv developers as "huge mistake". Instead a netlink/genl based implementation was suggested. The main objects for this configuration is the mesh/soft-interface object. Its actual object in memory already contains most of the available configuration settings. The genl interface reflects this by allowing to get/set it using the mesh specific commands. The BATADV_CMD_GET_MESH_INFO (or short version BATADV_CMD_GET_MESH) is reused as get command because it already provides the content of other information from the mesh/soft-interface which are not yet configuration specific. The set command BATADV_CMD_SET_MESH will also notify interested userspace listeners of the "config" mcast group using the BATADV_CMD_SET_MESH command message type that settings might have been changed and what the current values are. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 16 +++- net/batman-adv/netlink.c | 160 ++++++++++++++++++++++++---------------- 2 files changed, 111 insertions(+), 65 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index a28e76a7e0a2..b3394aac9a88 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -27,6 +27,7 @@ #define BATADV_NL_NAME "batadv" +#define BATADV_NL_MCAST_GROUP_CONFIG "config" #define BATADV_NL_MCAST_GROUP_TPMETER "tpmeter" /** @@ -372,10 +373,14 @@ enum batadv_nl_commands { BATADV_CMD_UNSPEC, /** - * @BATADV_CMD_GET_MESH_INFO: Query basic information about batman-adv - * device + * @BATADV_CMD_GET_MESH: Get attributes from softif/mesh */ - BATADV_CMD_GET_MESH_INFO, + BATADV_CMD_GET_MESH, + + /** + * @BATADV_CMD_GET_MESH_INFO: Alias for @BATADV_CMD_GET_MESH + */ + BATADV_CMD_GET_MESH_INFO = BATADV_CMD_GET_MESH, /** * @BATADV_CMD_TP_METER: Start a tp meter session @@ -443,6 +448,11 @@ enum batadv_nl_commands { */ BATADV_CMD_GET_MCAST_FLAGS, + /** + * @BATADV_CMD_SET_MESH: Set attributes for softif/mesh + */ + BATADV_CMD_SET_MESH, + /* add new commands above here */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index e56c40d1bfe0..179cc7cfcb70 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -62,6 +62,7 @@ struct genl_family batadv_netlink_family; /* multicast groups */ enum batadv_netlink_multicast_groups { + BATADV_NL_MCGRP_CONFIG, BATADV_NL_MCGRP_TPMETER, }; @@ -78,6 +79,7 @@ enum batadv_genl_ops_flags { }; static const struct genl_multicast_group batadv_netlink_mcgrps[] = { + [BATADV_NL_MCGRP_CONFIG] = { .name = BATADV_NL_MCAST_GROUP_CONFIG }, [BATADV_NL_MCGRP_TPMETER] = { .name = BATADV_NL_MCAST_GROUP_TPMETER }, }; @@ -138,20 +140,29 @@ batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) } /** - * batadv_netlink_mesh_info_put() - fill in generic information about mesh - * interface - * @msg: netlink message to be sent back - * @soft_iface: interface for which the data should be taken + * batadv_netlink_mesh_fill() - Fill message with mesh attributes + * @msg: Netlink message to dump into + * @bat_priv: the bat priv with all the soft interface information + * @cmd: type of message to generate + * @portid: Port making netlink request + * @seq: sequence number for message + * @flags: Additional flags for message * - * Return: 0 on success, < 0 on error + * Return: 0 on success or negative error number in case of failure */ -static int -batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface) +static int batadv_netlink_mesh_fill(struct sk_buff *msg, + struct batadv_priv *bat_priv, + enum batadv_nl_commands cmd, + u32 portid, u32 seq, int flags) { - struct batadv_priv *bat_priv = netdev_priv(soft_iface); + struct net_device *soft_iface = bat_priv->soft_iface; struct batadv_hard_iface *primary_if = NULL; struct net_device *hard_iface; - int ret = -ENOBUFS; + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd); + if (!hdr) + return -ENOBUFS; if (nla_put_string(msg, BATADV_ATTR_VERSION, BATADV_SOURCE_VERSION) || nla_put_string(msg, BATADV_ATTR_ALGO_NAME, @@ -162,16 +173,16 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface) soft_iface->dev_addr) || nla_put_u8(msg, BATADV_ATTR_TT_TTVN, (u8)atomic_read(&bat_priv->tt.vn))) - goto out; + goto nla_put_failure; #ifdef CONFIG_BATMAN_ADV_BLA if (nla_put_u16(msg, BATADV_ATTR_BLA_CRC, ntohs(bat_priv->bla.claim_dest.group))) - goto out; + goto nla_put_failure; #endif if (batadv_mcast_mesh_info_put(msg, bat_priv)) - goto out; + goto nla_put_failure; primary_if = batadv_primary_if_get_selected(bat_priv); if (primary_if && primary_if->if_status == BATADV_IF_ACTIVE) { @@ -183,77 +194,95 @@ batadv_netlink_mesh_info_put(struct sk_buff *msg, struct net_device *soft_iface) hard_iface->name) || nla_put(msg, BATADV_ATTR_HARD_ADDRESS, ETH_ALEN, hard_iface->dev_addr)) - goto out; + goto nla_put_failure; } - ret = 0; + if (primary_if) + batadv_hardif_put(primary_if); - out: + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: if (primary_if) batadv_hardif_put(primary_if); - return ret; + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; } /** - * batadv_netlink_get_mesh_info() - handle incoming BATADV_CMD_GET_MESH_INFO - * netlink request - * @skb: received netlink message - * @info: receiver information + * batadv_netlink_notify_mesh() - send softif attributes to listener + * @bat_priv: the bat priv with all the soft interface information * * Return: 0 on success, < 0 on error */ -static int -batadv_netlink_get_mesh_info(struct sk_buff *skb, struct genl_info *info) +static int batadv_netlink_notify_mesh(struct batadv_priv *bat_priv) { - struct net *net = genl_info_net(info); - struct net_device *soft_iface; - struct sk_buff *msg = NULL; - void *msg_head; - int ifindex; + struct sk_buff *msg; int ret; - if (!info->attrs[BATADV_ATTR_MESH_IFINDEX]) - return -EINVAL; - - ifindex = nla_get_u32(info->attrs[BATADV_ATTR_MESH_IFINDEX]); - if (!ifindex) - return -EINVAL; + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; - soft_iface = dev_get_by_index(net, ifindex); - if (!soft_iface || !batadv_softif_is_valid(soft_iface)) { - ret = -ENODEV; - goto out; + ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_SET_MESH, + 0, 0, 0); + if (ret < 0) { + nlmsg_free(msg); + return ret; } - msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - ret = -ENOMEM; - goto out; - } + genlmsg_multicast_netns(&batadv_netlink_family, + dev_net(bat_priv->soft_iface), msg, 0, + BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); - msg_head = genlmsg_put(msg, info->snd_portid, info->snd_seq, - &batadv_netlink_family, 0, - BATADV_CMD_GET_MESH_INFO); - if (!msg_head) { - ret = -ENOBUFS; - goto out; - } + return 0; +} - ret = batadv_netlink_mesh_info_put(msg, soft_iface); +/** + * batadv_netlink_get_mesh() - Get softif attributes + * @skb: Netlink message with request data + * @info: receiver information + * + * Return: 0 on success or negative error number in case of failure + */ +static int batadv_netlink_get_mesh(struct sk_buff *skb, struct genl_info *info) +{ + struct batadv_priv *bat_priv = info->user_ptr[0]; + struct sk_buff *msg; + int ret; - out: - if (soft_iface) - dev_put(soft_iface); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; - if (ret) { - if (msg) - nlmsg_free(msg); + ret = batadv_netlink_mesh_fill(msg, bat_priv, BATADV_CMD_GET_MESH, + info->snd_portid, info->snd_seq, 0); + if (ret < 0) { + nlmsg_free(msg); return ret; } - genlmsg_end(msg, msg_head); - return genlmsg_reply(msg, info); + ret = genlmsg_reply(msg, info); + + return ret; +} + +/** + * batadv_netlink_set_mesh() - Set softif attributes + * @skb: Netlink message with request data + * @info: receiver information + * + * Return: 0 on success or negative error number in case of failure + */ +static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) +{ + struct batadv_priv *bat_priv = info->user_ptr[0]; + + batadv_netlink_notify_mesh(bat_priv); + + return 0; } /** @@ -600,10 +629,11 @@ static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb, static const struct genl_ops batadv_netlink_ops[] = { { - .cmd = BATADV_CMD_GET_MESH_INFO, - .flags = GENL_ADMIN_PERM, + .cmd = BATADV_CMD_GET_MESH, + /* can be retrieved by unprivileged users */ .policy = batadv_netlink_policy, - .doit = batadv_netlink_get_mesh_info, + .doit = batadv_netlink_get_mesh, + .internal_flags = BATADV_FLAG_NEED_MESH, }, { .cmd = BATADV_CMD_TP_METER, @@ -685,7 +715,13 @@ static const struct genl_ops batadv_netlink_ops[] = { .policy = batadv_netlink_policy, .dumpit = batadv_mcast_flags_dump, }, - + { + .cmd = BATADV_CMD_SET_MESH, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .doit = batadv_netlink_set_mesh, + .internal_flags = BATADV_FLAG_NEED_MESH, + }, }; struct genl_family batadv_netlink_family __ro_after_init = { -- cgit v1.2.3 From 5c55a40fa801df2d807141319c0fdbb3939c3947 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 12:33:17 +0100 Subject: batman-adv: Prepare framework for hardif genl config The batman-adv configuration interface was implemented solely using sysfs. This approach was condemned by non-batadv developers as "huge mistake". Instead a netlink/genl based implementation was suggested. Beside the mesh/soft-interface specific configuration, the slave/hard-interface have B.A.T.M.A.N. V specific configuration settings. The genl interface reflects this by allowing to get/set it using the hard-interface specific commands. The BATADV_CMD_GET_HARDIFS (or short version BATADV_CMD_GET_HARDIF) is reused as get command because it already allow sto dump the content of other information from the slave/hard-interface which are not yet configuration specific. The set command BATADV_CMD_SET_HARDIF will also notify interested userspace listeners of the "config" mcast group using the BATADV_CMD_SET_HARDIF command message type that settings might have been changed and what the current values are. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 16 ++- net/batman-adv/netlink.c | 240 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 233 insertions(+), 23 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index b3394aac9a88..62456a087ef6 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -398,9 +398,15 @@ enum batadv_nl_commands { BATADV_CMD_GET_ROUTING_ALGOS, /** - * @BATADV_CMD_GET_HARDIFS: Query list of hard interfaces + * @BATADV_CMD_GET_HARDIF: Get attributes from a hardif of the + * current softif */ - BATADV_CMD_GET_HARDIFS, + BATADV_CMD_GET_HARDIF, + + /** + * @BATADV_CMD_GET_HARDIFS: Alias for @BATADV_CMD_GET_HARDIF + */ + BATADV_CMD_GET_HARDIFS = BATADV_CMD_GET_HARDIF, /** * @BATADV_CMD_GET_TRANSTABLE_LOCAL: Query list of local translations @@ -453,6 +459,12 @@ enum batadv_nl_commands { */ BATADV_CMD_SET_MESH, + /** + * @BATADV_CMD_SET_HARDIF: Set attributes for hardif of the + * current softif + */ + BATADV_CMD_SET_HARDIF, + /* add new commands above here */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 179cc7cfcb70..a184e4225fb5 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -76,6 +77,13 @@ enum batadv_genl_ops_flags { * saved in info->user_ptr[0] */ BATADV_FLAG_NEED_MESH = BIT(0), + + /** + * @BATADV_FLAG_NEED_HARDIF: request requires valid hard interface in + * attribute BATADV_ATTR_HARD_IFINDEX and expects a pointer to it to be + * saved in info->user_ptr[1] + */ + BATADV_FLAG_NEED_HARDIF = BIT(1), }; static const struct genl_multicast_group batadv_netlink_mcgrps[] = { @@ -446,29 +454,38 @@ batadv_netlink_tp_meter_cancel(struct sk_buff *skb, struct genl_info *info) } /** - * batadv_netlink_dump_hardif_entry() - Dump one hard interface into a message + * batadv_netlink_hardif_fill() - Fill message with hardif attributes * @msg: Netlink message to dump into + * @bat_priv: the bat priv with all the soft interface information + * @hard_iface: hard interface which was modified + * @cmd: type of message to generate * @portid: Port making netlink request + * @seq: sequence number for message + * @flags: Additional flags for message * @cb: Control block containing additional options - * @hard_iface: Hard interface to dump * - * Return: error code, or 0 on success + * Return: 0 on success or negative error number in case of failure */ -static int -batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, - struct netlink_callback *cb, - struct batadv_hard_iface *hard_iface) +static int batadv_netlink_hardif_fill(struct sk_buff *msg, + struct batadv_priv *bat_priv, + struct batadv_hard_iface *hard_iface, + enum batadv_nl_commands cmd, + u32 portid, u32 seq, int flags, + struct netlink_callback *cb) { struct net_device *net_dev = hard_iface->net_dev; void *hdr; - hdr = genlmsg_put(msg, portid, cb->nlh->nlmsg_seq, - &batadv_netlink_family, NLM_F_MULTI, - BATADV_CMD_GET_HARDIFS); + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd); if (!hdr) - return -EMSGSIZE; + return -ENOBUFS; + + if (cb) + genl_dump_check_consistent(cb, hdr); - genl_dump_check_consistent(cb, hdr); + if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, + bat_priv->soft_iface->ifindex)) + goto nla_put_failure; if (nla_put_u32(msg, BATADV_ATTR_HARD_IFINDEX, net_dev->ifindex) || @@ -486,24 +503,107 @@ batadv_netlink_dump_hardif_entry(struct sk_buff *msg, u32 portid, genlmsg_end(msg, hdr); return 0; - nla_put_failure: +nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } /** - * batadv_netlink_dump_hardifs() - Dump all hard interface into a messages + * batadv_netlink_notify_hardif() - send hardif attributes to listener + * @bat_priv: the bat priv with all the soft interface information + * @hard_iface: hard interface which was modified + * + * Return: 0 on success, < 0 on error + */ +static int batadv_netlink_notify_hardif(struct batadv_priv *bat_priv, + struct batadv_hard_iface *hard_iface) +{ + struct sk_buff *msg; + int ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface, + BATADV_CMD_SET_HARDIF, 0, 0, 0, NULL); + if (ret < 0) { + nlmsg_free(msg); + return ret; + } + + genlmsg_multicast_netns(&batadv_netlink_family, + dev_net(bat_priv->soft_iface), msg, 0, + BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); + + return 0; +} + +/** + * batadv_netlink_get_hardif() - Get hardif attributes + * @skb: Netlink message with request data + * @info: receiver information + * + * Return: 0 on success or negative error number in case of failure + */ +static int batadv_netlink_get_hardif(struct sk_buff *skb, + struct genl_info *info) +{ + struct batadv_hard_iface *hard_iface = info->user_ptr[1]; + struct batadv_priv *bat_priv = info->user_ptr[0]; + struct sk_buff *msg; + int ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + ret = batadv_netlink_hardif_fill(msg, bat_priv, hard_iface, + BATADV_CMD_GET_HARDIF, + info->snd_portid, info->snd_seq, 0, + NULL); + if (ret < 0) { + nlmsg_free(msg); + return ret; + } + + ret = genlmsg_reply(msg, info); + + return ret; +} + +/** + * batadv_netlink_set_hardif() - Set hardif attributes + * @skb: Netlink message with request data + * @info: receiver information + * + * Return: 0 on success or negative error number in case of failure + */ +static int batadv_netlink_set_hardif(struct sk_buff *skb, + struct genl_info *info) +{ + struct batadv_hard_iface *hard_iface = info->user_ptr[1]; + struct batadv_priv *bat_priv = info->user_ptr[0]; + + batadv_netlink_notify_hardif(bat_priv, hard_iface); + + return 0; +} + +/** + * batadv_netlink_dump_hardif() - Dump all hard interface into a messages * @msg: Netlink message to dump into * @cb: Parameters from query * * Return: error code, or length of reply message on success */ static int -batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb) +batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb) { struct net *net = sock_net(cb->skb->sk); struct net_device *soft_iface; struct batadv_hard_iface *hard_iface; + struct batadv_priv *bat_priv; int ifindex; int portid = NETLINK_CB(cb->skb).portid; int skip = cb->args[0]; @@ -523,6 +623,8 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb) return -ENODEV; } + bat_priv = netdev_priv(soft_iface); + rtnl_lock(); cb->seq = batadv_hardif_generation << 1 | 1; @@ -533,8 +635,10 @@ batadv_netlink_dump_hardifs(struct sk_buff *msg, struct netlink_callback *cb) if (i++ < skip) continue; - if (batadv_netlink_dump_hardif_entry(msg, portid, cb, - hard_iface)) { + if (batadv_netlink_hardif_fill(msg, bat_priv, hard_iface, + BATADV_CMD_GET_HARDIF, + portid, cb->nlh->nlmsg_seq, + NLM_F_MULTI, cb)) { i--; break; } @@ -583,6 +687,52 @@ err_put_softif: return ERR_PTR(-EINVAL); } +/** + * batadv_get_hardif_from_info() - Retrieve hardif from genl attributes + * @bat_priv: the bat priv with all the soft interface information + * @net: the applicable net namespace + * @info: receiver information + * + * Return: Pointer to hard interface (with increased refcnt) on success, error + * pointer on error + */ +static struct batadv_hard_iface * +batadv_get_hardif_from_info(struct batadv_priv *bat_priv, struct net *net, + struct genl_info *info) +{ + struct batadv_hard_iface *hard_iface; + struct net_device *hard_dev; + unsigned int hardif_index; + + if (!info->attrs[BATADV_ATTR_HARD_IFINDEX]) + return ERR_PTR(-EINVAL); + + hardif_index = nla_get_u32(info->attrs[BATADV_ATTR_HARD_IFINDEX]); + + hard_dev = dev_get_by_index(net, hardif_index); + if (!hard_dev) + return ERR_PTR(-ENODEV); + + hard_iface = batadv_hardif_get_by_netdev(hard_dev); + if (!hard_iface) + goto err_put_harddev; + + if (hard_iface->soft_iface != bat_priv->soft_iface) + goto err_put_hardif; + + /* hard_dev is referenced by hard_iface and not needed here */ + dev_put(hard_dev); + + return hard_iface; + +err_put_hardif: + batadv_hardif_put(hard_iface); +err_put_harddev: + dev_put(hard_dev); + + return ERR_PTR(-EINVAL); +} + /** * batadv_pre_doit() - Prepare batman-adv genl doit request * @ops: requested netlink operation @@ -595,8 +745,21 @@ static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); + struct batadv_hard_iface *hard_iface; + struct batadv_priv *bat_priv = NULL; struct net_device *soft_iface; - struct batadv_priv *bat_priv; + u8 user_ptr1_flags; + u8 mesh_dep_flags; + int ret; + + user_ptr1_flags = BATADV_FLAG_NEED_HARDIF; + if (WARN_ON(hweight8(ops->internal_flags & user_ptr1_flags) > 1)) + return -EINVAL; + + mesh_dep_flags = BATADV_FLAG_NEED_HARDIF; + if (WARN_ON((ops->internal_flags & mesh_dep_flags) && + (~ops->internal_flags & BATADV_FLAG_NEED_MESH))) + return -EINVAL; if (ops->internal_flags & BATADV_FLAG_NEED_MESH) { soft_iface = batadv_get_softif_from_info(net, info); @@ -607,7 +770,23 @@ static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, info->user_ptr[0] = bat_priv; } + if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF) { + hard_iface = batadv_get_hardif_from_info(bat_priv, net, info); + if (IS_ERR(hard_iface)) { + ret = PTR_ERR(hard_iface); + goto err_put_softif; + } + + info->user_ptr[1] = hard_iface; + } + return 0; + +err_put_softif: + if (bat_priv) + dev_put(bat_priv->soft_iface); + + return ret; } /** @@ -619,8 +798,16 @@ static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { + struct batadv_hard_iface *hard_iface; struct batadv_priv *bat_priv; + if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF && + info->user_ptr[1]) { + hard_iface = info->user_ptr[1]; + + batadv_hardif_put(hard_iface); + } + if (ops->internal_flags & BATADV_FLAG_NEED_MESH && info->user_ptr[0]) { bat_priv = info->user_ptr[0]; dev_put(bat_priv->soft_iface); @@ -656,10 +843,13 @@ static const struct genl_ops batadv_netlink_ops[] = { .dumpit = batadv_algo_dump, }, { - .cmd = BATADV_CMD_GET_HARDIFS, - .flags = GENL_ADMIN_PERM, + .cmd = BATADV_CMD_GET_HARDIF, + /* can be retrieved by unprivileged users */ .policy = batadv_netlink_policy, - .dumpit = batadv_netlink_dump_hardifs, + .dumpit = batadv_netlink_dump_hardif, + .doit = batadv_netlink_get_hardif, + .internal_flags = BATADV_FLAG_NEED_MESH | + BATADV_FLAG_NEED_HARDIF, }, { .cmd = BATADV_CMD_GET_TRANSTABLE_LOCAL, @@ -722,6 +912,14 @@ static const struct genl_ops batadv_netlink_ops[] = { .doit = batadv_netlink_set_mesh, .internal_flags = BATADV_FLAG_NEED_MESH, }, + { + .cmd = BATADV_CMD_SET_HARDIF, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .doit = batadv_netlink_set_hardif, + .internal_flags = BATADV_FLAG_NEED_MESH | + BATADV_FLAG_NEED_HARDIF, + }, }; struct genl_family batadv_netlink_family __ro_after_init = { -- cgit v1.2.3 From 49e7e37cd98122126e8da58df2fe2261c6e83df2 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 12:41:08 +0100 Subject: batman-adv: Prepare framework for vlan genl config The batman-adv configuration interface was implemented solely using sysfs. This approach was condemned by non-batadv developers as "huge mistake". Instead a netlink/genl based implementation was suggested. Beside the mesh/soft-interface specific configuration, the VLANs on top of the mesh/soft-interface have configuration settings. The genl interface reflects this by allowing to get/set it using the vlan specific commands BATADV_CMD_GET_VLAN/BATADV_CMD_SET_VLAN. The set command BATADV_CMD_SET_MESH will also notify interested userspace listeners of the "config" mcast group using the BATADV_CMD_SET_VLAN command message type that settings might have been changed and what the current values are. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 17 ++++ net/batman-adv/netlink.c | 191 +++++++++++++++++++++++++++++++++++++++- 2 files changed, 206 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 62456a087ef6..10f3cc34fe16 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -345,6 +345,11 @@ enum batadv_nl_attrs { */ BATADV_ATTR_MCAST_FLAGS_PRIV, + /** + * @BATADV_ATTR_VLANID: VLAN id on top of soft interface + */ + BATADV_ATTR_VLANID, + /* add attributes above here, update the policy in netlink.c */ /** @@ -465,6 +470,18 @@ enum batadv_nl_commands { */ BATADV_CMD_SET_HARDIF, + /** + * @BATADV_CMD_GET_VLAN: Get attributes from a VLAN of the + * current softif + */ + BATADV_CMD_GET_VLAN, + + /** + * @BATADV_CMD_SET_VLAN: Set attributes for VLAN of the + * current softif + */ + BATADV_CMD_SET_VLAN, + /* add new commands above here */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index a184e4225fb5..d40913bb9031 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -84,6 +85,13 @@ enum batadv_genl_ops_flags { * saved in info->user_ptr[1] */ BATADV_FLAG_NEED_HARDIF = BIT(1), + + /** + * @BATADV_FLAG_NEED_VLAN: request requires valid vlan in + * attribute BATADV_ATTR_VLANID and expects a pointer to it to be + * saved in info->user_ptr[1] + */ + BATADV_FLAG_NEED_VLAN = BIT(2), }; static const struct genl_multicast_group batadv_netlink_mcgrps[] = { @@ -130,6 +138,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_DAT_CACHE_VID] = { .type = NLA_U16 }, [BATADV_ATTR_MCAST_FLAGS] = { .type = NLA_U32 }, [BATADV_ATTR_MCAST_FLAGS_PRIV] = { .type = NLA_U32 }, + [BATADV_ATTR_VLANID] = { .type = NLA_U16 }, }; /** @@ -653,6 +662,123 @@ batadv_netlink_dump_hardif(struct sk_buff *msg, struct netlink_callback *cb) return msg->len; } +/** + * batadv_netlink_vlan_fill() - Fill message with vlan attributes + * @msg: Netlink message to dump into + * @bat_priv: the bat priv with all the soft interface information + * @vlan: vlan which was modified + * @cmd: type of message to generate + * @portid: Port making netlink request + * @seq: sequence number for message + * @flags: Additional flags for message + * + * Return: 0 on success or negative error number in case of failure + */ +static int batadv_netlink_vlan_fill(struct sk_buff *msg, + struct batadv_priv *bat_priv, + struct batadv_softif_vlan *vlan, + enum batadv_nl_commands cmd, + u32 portid, u32 seq, int flags) +{ + void *hdr; + + hdr = genlmsg_put(msg, portid, seq, &batadv_netlink_family, flags, cmd); + if (!hdr) + return -ENOBUFS; + + if (nla_put_u32(msg, BATADV_ATTR_MESH_IFINDEX, + bat_priv->soft_iface->ifindex)) + goto nla_put_failure; + + if (nla_put_u32(msg, BATADV_ATTR_VLANID, vlan->vid & VLAN_VID_MASK)) + goto nla_put_failure; + + genlmsg_end(msg, hdr); + return 0; + +nla_put_failure: + genlmsg_cancel(msg, hdr); + return -EMSGSIZE; +} + +/** + * batadv_netlink_notify_vlan() - send vlan attributes to listener + * @bat_priv: the bat priv with all the soft interface information + * @vlan: vlan which was modified + * + * Return: 0 on success, < 0 on error + */ +static int batadv_netlink_notify_vlan(struct batadv_priv *bat_priv, + struct batadv_softif_vlan *vlan) +{ + struct sk_buff *msg; + int ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan, + BATADV_CMD_SET_VLAN, 0, 0, 0); + if (ret < 0) { + nlmsg_free(msg); + return ret; + } + + genlmsg_multicast_netns(&batadv_netlink_family, + dev_net(bat_priv->soft_iface), msg, 0, + BATADV_NL_MCGRP_CONFIG, GFP_KERNEL); + + return 0; +} + +/** + * batadv_netlink_get_vlan() - Get vlan attributes + * @skb: Netlink message with request data + * @info: receiver information + * + * Return: 0 on success or negative error number in case of failure + */ +static int batadv_netlink_get_vlan(struct sk_buff *skb, struct genl_info *info) +{ + struct batadv_softif_vlan *vlan = info->user_ptr[1]; + struct batadv_priv *bat_priv = info->user_ptr[0]; + struct sk_buff *msg; + int ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + ret = batadv_netlink_vlan_fill(msg, bat_priv, vlan, BATADV_CMD_GET_VLAN, + info->snd_portid, info->snd_seq, 0); + if (ret < 0) { + nlmsg_free(msg); + return ret; + } + + ret = genlmsg_reply(msg, info); + + return ret; +} + +/** + * batadv_netlink_set_vlan() - Get vlan attributes + * @skb: Netlink message with request data + * @info: receiver information + * + * Return: 0 on success or negative error number in case of failure + */ +static int batadv_netlink_set_vlan(struct sk_buff *skb, struct genl_info *info) +{ + struct batadv_softif_vlan *vlan = info->user_ptr[1]; + struct batadv_priv *bat_priv = info->user_ptr[0]; + + batadv_netlink_notify_vlan(bat_priv, vlan); + + return 0; +} + /** * batadv_get_softif_from_info() - Retrieve soft interface from genl attributes * @net: the applicable net namespace @@ -733,6 +859,34 @@ err_put_harddev: return ERR_PTR(-EINVAL); } +/** + * batadv_get_vlan_from_info() - Retrieve vlan from genl attributes + * @bat_priv: the bat priv with all the soft interface information + * @net: the applicable net namespace + * @info: receiver information + * + * Return: Pointer to vlan on success (with increased refcnt), error pointer + * on error + */ +static struct batadv_softif_vlan * +batadv_get_vlan_from_info(struct batadv_priv *bat_priv, struct net *net, + struct genl_info *info) +{ + struct batadv_softif_vlan *vlan; + u16 vid; + + if (!info->attrs[BATADV_ATTR_VLANID]) + return ERR_PTR(-EINVAL); + + vid = nla_get_u16(info->attrs[BATADV_ATTR_VLANID]); + + vlan = batadv_softif_vlan_get(bat_priv, vid | BATADV_VLAN_HAS_TAG); + if (!vlan) + return ERR_PTR(-ENOENT); + + return vlan; +} + /** * batadv_pre_doit() - Prepare batman-adv genl doit request * @ops: requested netlink operation @@ -747,16 +901,17 @@ static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct net *net = genl_info_net(info); struct batadv_hard_iface *hard_iface; struct batadv_priv *bat_priv = NULL; + struct batadv_softif_vlan *vlan; struct net_device *soft_iface; u8 user_ptr1_flags; u8 mesh_dep_flags; int ret; - user_ptr1_flags = BATADV_FLAG_NEED_HARDIF; + user_ptr1_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN; if (WARN_ON(hweight8(ops->internal_flags & user_ptr1_flags) > 1)) return -EINVAL; - mesh_dep_flags = BATADV_FLAG_NEED_HARDIF; + mesh_dep_flags = BATADV_FLAG_NEED_HARDIF | BATADV_FLAG_NEED_VLAN; if (WARN_ON((ops->internal_flags & mesh_dep_flags) && (~ops->internal_flags & BATADV_FLAG_NEED_MESH))) return -EINVAL; @@ -780,6 +935,16 @@ static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, info->user_ptr[1] = hard_iface; } + if (ops->internal_flags & BATADV_FLAG_NEED_VLAN) { + vlan = batadv_get_vlan_from_info(bat_priv, net, info); + if (IS_ERR(vlan)) { + ret = PTR_ERR(vlan); + goto err_put_softif; + } + + info->user_ptr[1] = vlan; + } + return 0; err_put_softif: @@ -799,6 +964,7 @@ static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct batadv_hard_iface *hard_iface; + struct batadv_softif_vlan *vlan; struct batadv_priv *bat_priv; if (ops->internal_flags & BATADV_FLAG_NEED_HARDIF && @@ -808,6 +974,11 @@ static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb, batadv_hardif_put(hard_iface); } + if (ops->internal_flags & BATADV_FLAG_NEED_VLAN && info->user_ptr[1]) { + vlan = info->user_ptr[1]; + batadv_softif_vlan_put(vlan); + } + if (ops->internal_flags & BATADV_FLAG_NEED_MESH && info->user_ptr[0]) { bat_priv = info->user_ptr[0]; dev_put(bat_priv->soft_iface); @@ -920,6 +1091,22 @@ static const struct genl_ops batadv_netlink_ops[] = { .internal_flags = BATADV_FLAG_NEED_MESH | BATADV_FLAG_NEED_HARDIF, }, + { + .cmd = BATADV_CMD_GET_VLAN, + /* can be retrieved by unprivileged users */ + .policy = batadv_netlink_policy, + .doit = batadv_netlink_get_vlan, + .internal_flags = BATADV_FLAG_NEED_MESH | + BATADV_FLAG_NEED_VLAN, + }, + { + .cmd = BATADV_CMD_SET_VLAN, + .flags = GENL_ADMIN_PERM, + .policy = batadv_netlink_policy, + .doit = batadv_netlink_set_vlan, + .internal_flags = BATADV_FLAG_NEED_MESH | + BATADV_FLAG_NEED_VLAN, + }, }; struct genl_family batadv_netlink_family __ro_after_init = { -- cgit v1.2.3 From 9ab4cee5ced970019a5d0f3a43cf85671ca7b38f Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 12:46:14 +0100 Subject: batman-adv: Add aggregated_ogms mesh genl configuration The mesh interface can delay OGM messages to aggregate different ogms together in a single OGM packet. The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the configuration of this feature using the BATADV_ATTR_AGGREGATED_OGMS_ENABLED attribute. Setting the u8 to zero will disable this feature and setting it to something else is enabling this feature. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 6 ++++++ net/batman-adv/netlink.c | 12 ++++++++++++ 2 files changed, 18 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 10f3cc34fe16..f8941e80d6b4 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -350,6 +350,12 @@ enum batadv_nl_attrs { */ BATADV_ATTR_VLANID, + /** + * @BATADV_ATTR_AGGREGATED_OGMS_ENABLED: whether the batman protocol + * messages of the mesh interface shall be aggregated or not. + */ + BATADV_ATTR_AGGREGATED_OGMS_ENABLED, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index d40913bb9031..82665dc2eb2b 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -139,6 +139,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_MCAST_FLAGS] = { .type = NLA_U32 }, [BATADV_ATTR_MCAST_FLAGS_PRIV] = { .type = NLA_U32 }, [BATADV_ATTR_VLANID] = { .type = NLA_U16 }, + [BATADV_ATTR_AGGREGATED_OGMS_ENABLED] = { .type = NLA_U8 }, }; /** @@ -214,6 +215,10 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, goto nla_put_failure; } + if (nla_put_u8(msg, BATADV_ATTR_AGGREGATED_OGMS_ENABLED, + !!atomic_read(&bat_priv->aggregated_ogms))) + goto nla_put_failure; + if (primary_if) batadv_hardif_put(primary_if); @@ -296,6 +301,13 @@ static int batadv_netlink_get_mesh(struct sk_buff *skb, struct genl_info *info) static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) { struct batadv_priv *bat_priv = info->user_ptr[0]; + struct nlattr *attr; + + if (info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED]) { + attr = info->attrs[BATADV_ATTR_AGGREGATED_OGMS_ENABLED]; + + atomic_set(&bat_priv->aggregated_ogms, !!nla_get_u8(attr)); + } batadv_netlink_notify_mesh(bat_priv); -- cgit v1.2.3 From e43d16b87dc2cad18799cfd1142f4acae4135ea4 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 12:51:55 +0100 Subject: batman-adv: Add ap_isolation mesh/vlan genl configuration The mesh interface can drop messages between clients to implement a mesh-wide AP isolation. The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH and BATADV_CMD_SET_VLAN/BATADV_CMD_GET_VLAN commands allow to set/get the configuration of this feature using the BATADV_ATTR_AP_ISOLATION_ENABLED attribute. Setting the u8 to zero will disable this feature and setting it to something else is enabling this feature. This feature also requires that skbuff which should be handled as isolated are marked. The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the mark/mask using the u32 attributes BATADV_ATTR_ISOLATION_MARK and BATADV_ATTR_ISOLATION_MASK. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 19 +++++++++ net/batman-adv/netlink.c | 89 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index f8941e80d6b4..a4dadafe08dd 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -356,6 +356,25 @@ enum batadv_nl_attrs { */ BATADV_ATTR_AGGREGATED_OGMS_ENABLED, + /** + * @BATADV_ATTR_AP_ISOLATION_ENABLED: whether the data traffic going + * from a wireless client to another wireless client will be silently + * dropped. + */ + BATADV_ATTR_AP_ISOLATION_ENABLED, + + /** + * @BATADV_ATTR_ISOLATION_MARK: the isolation mark which is used to + * classify clients as "isolated" by the Extended Isolation feature. + */ + BATADV_ATTR_ISOLATION_MARK, + + /** + * @BATADV_ATTR_ISOLATION_MASK: the isolation (bit)mask which is used to + * classify clients as "isolated" by the Extended Isolation feature. + */ + BATADV_ATTR_ISOLATION_MASK, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 82665dc2eb2b..fc80f003ed6e 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -140,6 +140,9 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_MCAST_FLAGS_PRIV] = { .type = NLA_U32 }, [BATADV_ATTR_VLANID] = { .type = NLA_U16 }, [BATADV_ATTR_AGGREGATED_OGMS_ENABLED] = { .type = NLA_U8 }, + [BATADV_ATTR_AP_ISOLATION_ENABLED] = { .type = NLA_U8 }, + [BATADV_ATTR_ISOLATION_MARK] = { .type = NLA_U32 }, + [BATADV_ATTR_ISOLATION_MASK] = { .type = NLA_U32 }, }; /** @@ -157,6 +160,52 @@ batadv_netlink_get_ifindex(const struct nlmsghdr *nlh, int attrtype) return attr ? nla_get_u32(attr) : 0; } +/** + * batadv_netlink_mesh_fill_ap_isolation() - Add ap_isolation softif attribute + * @msg: Netlink message to dump into + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or negative error number in case of failure + */ +static int batadv_netlink_mesh_fill_ap_isolation(struct sk_buff *msg, + struct batadv_priv *bat_priv) +{ + struct batadv_softif_vlan *vlan; + u8 ap_isolation; + + vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); + if (!vlan) + return 0; + + ap_isolation = atomic_read(&vlan->ap_isolation); + batadv_softif_vlan_put(vlan); + + return nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED, + !!ap_isolation); +} + +/** + * batadv_option_set_ap_isolation() - Set ap_isolation from genl msg + * @attr: parsed BATADV_ATTR_AP_ISOLATION_ENABLED attribute + * @bat_priv: the bat priv with all the soft interface information + * + * Return: 0 on success or negative error number in case of failure + */ +static int batadv_netlink_set_mesh_ap_isolation(struct nlattr *attr, + struct batadv_priv *bat_priv) +{ + struct batadv_softif_vlan *vlan; + + vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS); + if (!vlan) + return -ENOENT; + + atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr)); + batadv_softif_vlan_put(vlan); + + return 0; +} + /** * batadv_netlink_mesh_fill() - Fill message with mesh attributes * @msg: Netlink message to dump into @@ -219,6 +268,17 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, !!atomic_read(&bat_priv->aggregated_ogms))) goto nla_put_failure; + if (batadv_netlink_mesh_fill_ap_isolation(msg, bat_priv)) + goto nla_put_failure; + + if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MARK, + bat_priv->isolation_mark)) + goto nla_put_failure; + + if (nla_put_u32(msg, BATADV_ATTR_ISOLATION_MASK, + bat_priv->isolation_mark_mask)) + goto nla_put_failure; + if (primary_if) batadv_hardif_put(primary_if); @@ -309,6 +369,24 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) atomic_set(&bat_priv->aggregated_ogms, !!nla_get_u8(attr)); } + if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) { + attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]; + + batadv_netlink_set_mesh_ap_isolation(attr, bat_priv); + } + + if (info->attrs[BATADV_ATTR_ISOLATION_MARK]) { + attr = info->attrs[BATADV_ATTR_ISOLATION_MARK]; + + bat_priv->isolation_mark = nla_get_u32(attr); + } + + if (info->attrs[BATADV_ATTR_ISOLATION_MASK]) { + attr = info->attrs[BATADV_ATTR_ISOLATION_MASK]; + + bat_priv->isolation_mark_mask = nla_get_u32(attr); + } + batadv_netlink_notify_mesh(bat_priv); return 0; @@ -705,6 +783,10 @@ static int batadv_netlink_vlan_fill(struct sk_buff *msg, if (nla_put_u32(msg, BATADV_ATTR_VLANID, vlan->vid & VLAN_VID_MASK)) goto nla_put_failure; + if (nla_put_u8(msg, BATADV_ATTR_AP_ISOLATION_ENABLED, + !!atomic_read(&vlan->ap_isolation))) + goto nla_put_failure; + genlmsg_end(msg, hdr); return 0; @@ -785,6 +867,13 @@ static int batadv_netlink_set_vlan(struct sk_buff *skb, struct genl_info *info) { struct batadv_softif_vlan *vlan = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; + struct nlattr *attr; + + if (info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]) { + attr = info->attrs[BATADV_ATTR_AP_ISOLATION_ENABLED]; + + atomic_set(&vlan->ap_isolation, !!nla_get_u8(attr)); + } batadv_netlink_notify_vlan(bat_priv, vlan); -- cgit v1.2.3 From d7e52506b680826d6ff7ce73e6a90a3b9defc741 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 12:55:44 +0100 Subject: batman-adv: Add bonding mesh genl configuration The mesh interface can use multiple slave/hard-interface ports at the same time to transport the traffic to other nodes. The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the configuration of this feature using the BATADV_ATTR_BONDING_ENABLED attribute. Setting the u8 to zero will disable this feature and setting it to something else is enabling this feature. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 6 ++++++ net/batman-adv/netlink.c | 11 +++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index a4dadafe08dd..f74ff261ec8f 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -375,6 +375,12 @@ enum batadv_nl_attrs { */ BATADV_ATTR_ISOLATION_MASK, + /** + * @BATADV_ATTR_BONDING_ENABLED: whether the data traffic going through + * the mesh will be sent using multiple interfaces at the same time. + */ + BATADV_ATTR_BONDING_ENABLED, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index fc80f003ed6e..310a2c339fd1 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -143,6 +143,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_AP_ISOLATION_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_ISOLATION_MARK] = { .type = NLA_U32 }, [BATADV_ATTR_ISOLATION_MASK] = { .type = NLA_U32 }, + [BATADV_ATTR_BONDING_ENABLED] = { .type = NLA_U8 }, }; /** @@ -279,6 +280,10 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, bat_priv->isolation_mark_mask)) goto nla_put_failure; + if (nla_put_u8(msg, BATADV_ATTR_BONDING_ENABLED, + !!atomic_read(&bat_priv->bonding))) + goto nla_put_failure; + if (primary_if) batadv_hardif_put(primary_if); @@ -387,6 +392,12 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) bat_priv->isolation_mark_mask = nla_get_u32(attr); } + if (info->attrs[BATADV_ATTR_BONDING_ENABLED]) { + attr = info->attrs[BATADV_ATTR_BONDING_ENABLED]; + + atomic_set(&bat_priv->bonding, !!nla_get_u8(attr)); + } + batadv_netlink_notify_mesh(bat_priv); return 0; -- cgit v1.2.3 From 43ff6105a527aaa1a8e00163b0b0aedbbc0c4522 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 13:03:39 +0100 Subject: batman-adv: Add bridge_loop_avoidance mesh genl configuration The mesh interface can try to detect loops in the same mesh caused by (indirectly) bridged mesh/soft-interfaces of different nodes. Some of the loops can also be resolved without breaking the mesh. The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the configuration of this feature using the BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED attribute. Setting the u8 to zero will disable this feature and setting it to something else is enabling this feature. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 7 +++++++ net/batman-adv/netlink.c | 17 +++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index f74ff261ec8f..3cb35c661056 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -381,6 +381,13 @@ enum batadv_nl_attrs { */ BATADV_ATTR_BONDING_ENABLED, + /** + * @BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED: whether the bridge loop + * avoidance feature is enabled. This feature detects and avoids loops + * between the mesh and devices bridged with the soft interface + */ + BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 310a2c339fd1..40c940da9498 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -144,6 +144,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_ISOLATION_MARK] = { .type = NLA_U32 }, [BATADV_ATTR_ISOLATION_MASK] = { .type = NLA_U32 }, [BATADV_ATTR_BONDING_ENABLED] = { .type = NLA_U8 }, + [BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED] = { .type = NLA_U8 }, }; /** @@ -284,6 +285,12 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, !!atomic_read(&bat_priv->bonding))) goto nla_put_failure; +#ifdef CONFIG_BATMAN_ADV_BLA + if (nla_put_u8(msg, BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED, + !!atomic_read(&bat_priv->bridge_loop_avoidance))) + goto nla_put_failure; +#endif /* CONFIG_BATMAN_ADV_BLA */ + if (primary_if) batadv_hardif_put(primary_if); @@ -398,6 +405,16 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) atomic_set(&bat_priv->bonding, !!nla_get_u8(attr)); } +#ifdef CONFIG_BATMAN_ADV_BLA + if (info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]) { + attr = info->attrs[BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED]; + + atomic_set(&bat_priv->bridge_loop_avoidance, + !!nla_get_u8(attr)); + batadv_bla_status_update(bat_priv->soft_iface); + } +#endif /* CONFIG_BATMAN_ADV_BLA */ + batadv_netlink_notify_mesh(bat_priv); return 0; -- cgit v1.2.3 From a1c8de80329609ba68ff860074070efb1e14ade4 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 13:06:42 +0100 Subject: batman-adv: Add distributed_arp_table mesh genl configuration The mesh interface can use a distributed hash table to answer ARP requests without flooding the request through the whole mesh. The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the configuration of this feature using the BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED attribute. Setting the u8 to zero will disable this feature and setting it to something else is enabling this feature. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 8 ++++++++ net/batman-adv/netlink.c | 17 +++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 3cb35c661056..f303a1496476 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -388,6 +388,14 @@ enum batadv_nl_attrs { */ BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED, + /** + * @BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED: whether the distributed + * arp table feature is enabled. This feature uses a distributed hash + * table to answer ARP requests without flooding the request through + * the whole mesh. + */ + BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 40c940da9498..3028c2a5c782 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -145,6 +145,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_ISOLATION_MASK] = { .type = NLA_U32 }, [BATADV_ATTR_BONDING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED] = { .type = NLA_U8 }, + [BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED] = { .type = NLA_U8 }, }; /** @@ -291,6 +292,12 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_BLA */ +#ifdef CONFIG_BATMAN_ADV_DAT + if (nla_put_u8(msg, BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED, + !!atomic_read(&bat_priv->distributed_arp_table))) + goto nla_put_failure; +#endif /* CONFIG_BATMAN_ADV_DAT */ + if (primary_if) batadv_hardif_put(primary_if); @@ -415,6 +422,16 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) } #endif /* CONFIG_BATMAN_ADV_BLA */ +#ifdef CONFIG_BATMAN_ADV_DAT + if (info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]) { + attr = info->attrs[BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED]; + + atomic_set(&bat_priv->distributed_arp_table, + !!nla_get_u8(attr)); + batadv_dat_status_update(bat_priv->soft_iface); + } +#endif /* CONFIG_BATMAN_ADV_DAT */ + batadv_netlink_notify_mesh(bat_priv); return 0; -- cgit v1.2.3 From 3e15b06eb7e410ef9f1b9673be094b3e10eacf93 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 13:09:49 +0100 Subject: batman-adv: Add fragmentation mesh genl configuration The mesh interface can fragment unicast packets when the packet size exceeds the outgoing slave/hard-interface MTU. The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the configuration of this feature using the BATADV_ATTR_FRAGMENTATION_ENABLED attribute. Setting the u8 to zero will disable this feature and setting it to something else is enabling this feature. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 7 +++++++ net/batman-adv/netlink.c | 12 ++++++++++++ 2 files changed, 19 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index f303a1496476..847841b8de5d 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -396,6 +396,13 @@ enum batadv_nl_attrs { */ BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED, + /** + * @BATADV_ATTR_FRAGMENTATION_ENABLED: whether the data traffic going + * through the mesh will be fragmented or silently discarded if the + * packet size exceeds the outgoing interface MTU. + */ + BATADV_ATTR_FRAGMENTATION_ENABLED, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 3028c2a5c782..5b2160d2f482 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -146,6 +146,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_BONDING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED] = { .type = NLA_U8 }, + [BATADV_ATTR_FRAGMENTATION_ENABLED] = { .type = NLA_U8 }, }; /** @@ -298,6 +299,10 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_DAT */ + if (nla_put_u8(msg, BATADV_ATTR_FRAGMENTATION_ENABLED, + !!atomic_read(&bat_priv->fragmentation))) + goto nla_put_failure; + if (primary_if) batadv_hardif_put(primary_if); @@ -432,6 +437,13 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) } #endif /* CONFIG_BATMAN_ADV_DAT */ + if (info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED]) { + attr = info->attrs[BATADV_ATTR_FRAGMENTATION_ENABLED]; + + atomic_set(&bat_priv->fragmentation, !!nla_get_u8(attr)); + batadv_update_min_mtu(bat_priv->soft_iface); + } + batadv_netlink_notify_mesh(bat_priv); return 0; -- cgit v1.2.3 From e2d0d35b5b0ce420505e88255fd5922ed035bb8d Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 13:15:00 +0100 Subject: batman-adv: Add gateway mesh genl configuration The mesh/soft-interface can optimize the handling of DHCP packets. Instead of flooding them through the whole mesh, it can be forwarded as unicast to a specific gateway server. The originator which injects the packets in the mesh has to select (based on sel_class thresholds) a responsible gateway server. This is done by switching this originator to the gw_mode client. The servers announce their forwarding bandwidth (download/upload) when the gw_mode server was selected. The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the configuration of this feature using the attributes: * u8 BATADV_ATTR_GW_MODE (0 == off, 1 == client, 2 == server) * u32 BATADV_ATTR_GW_BANDWIDTH_DOWN (in 100 kbit/s steps) * u32 BATADV_ATTR_GW_BANDWIDTH_UP (in 100 kbit/s steps) * u32 BATADV_ATTR_GW_SEL_CLASS Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 40 ++++++++++++++++++ net/batman-adv/gateway_client.c | 1 - net/batman-adv/gateway_common.c | 1 + net/batman-adv/gateway_common.h | 6 --- net/batman-adv/netlink.c | 92 +++++++++++++++++++++++++++++++++++++++++ net/batman-adv/soft-interface.c | 2 +- net/batman-adv/sysfs.c | 1 + 7 files changed, 135 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 847841b8de5d..165272be6878 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -139,6 +139,20 @@ enum batadv_mcast_flags_priv { BATADV_MCAST_FLAGS_QUERIER_IPV6_SHADOWING = (1 << 4), }; +/** + * enum batadv_gw_modes - gateway mode of node + */ +enum batadv_gw_modes { + /** @BATADV_GW_MODE_OFF: gw mode disabled */ + BATADV_GW_MODE_OFF, + + /** @BATADV_GW_MODE_CLIENT: send DHCP requests to gw servers */ + BATADV_GW_MODE_CLIENT, + + /** @BATADV_GW_MODE_SERVER: announce itself as gatway server */ + BATADV_GW_MODE_SERVER, +}; + /** * enum batadv_nl_attrs - batman-adv netlink attributes */ @@ -403,6 +417,32 @@ enum batadv_nl_attrs { */ BATADV_ATTR_FRAGMENTATION_ENABLED, + /** + * @BATADV_ATTR_GW_BANDWIDTH_DOWN: defines the download bandwidth which + * is propagated by this node if %BATADV_ATTR_GW_BANDWIDTH_MODE was set + * to 'server'. + */ + BATADV_ATTR_GW_BANDWIDTH_DOWN, + + /** + * @BATADV_ATTR_GW_BANDWIDTH_UP: defines the upload bandwidth which + * is propagated by this node if %BATADV_ATTR_GW_BANDWIDTH_MODE was set + * to 'server'. + */ + BATADV_ATTR_GW_BANDWIDTH_UP, + + /** + * @BATADV_ATTR_GW_MODE: defines the state of the gateway features. + * Possible values are specified in enum batadv_gw_modes + */ + BATADV_ATTR_GW_MODE, + + /** + * @BATADV_ATTR_GW_SEL_CLASS: defines the selection criteria this node + * will use to choose a gateway if gw_mode was set to 'client'. + */ + BATADV_ATTR_GW_SEL_CLASS, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c index 0a6b9b30eabd..f5811f61aa92 100644 --- a/net/batman-adv/gateway_client.c +++ b/net/batman-adv/gateway_client.c @@ -47,7 +47,6 @@ #include #include -#include "gateway_common.h" #include "hard-interface.h" #include "log.h" #include "netlink.h" diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c index 53d03adc9bfc..e064de45e22c 100644 --- a/net/batman-adv/gateway_common.c +++ b/net/batman-adv/gateway_common.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "gateway_client.h" #include "log.h" diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h index 89bae100a0b0..128467a0fb89 100644 --- a/net/batman-adv/gateway_common.h +++ b/net/batman-adv/gateway_common.h @@ -25,12 +25,6 @@ struct net_device; -enum batadv_gw_modes { - BATADV_GW_MODE_OFF, - BATADV_GW_MODE_CLIENT, - BATADV_GW_MODE_SERVER, -}; - /** * enum batadv_bandwidth_units - bandwidth unit types */ diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 5b2160d2f482..473467afea91 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -51,6 +51,7 @@ #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" #include "gateway_client.h" +#include "gateway_common.h" #include "hard-interface.h" #include "multicast.h" #include "originator.h" @@ -147,6 +148,10 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_BRIDGE_LOOP_AVOIDANCE_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_DISTRIBUTED_ARP_TABLE_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_FRAGMENTATION_ENABLED] = { .type = NLA_U8 }, + [BATADV_ATTR_GW_BANDWIDTH_DOWN] = { .type = NLA_U32 }, + [BATADV_ATTR_GW_BANDWIDTH_UP] = { .type = NLA_U32 }, + [BATADV_ATTR_GW_MODE] = { .type = NLA_U8 }, + [BATADV_ATTR_GW_SEL_CLASS] = { .type = NLA_U32 }, }; /** @@ -303,6 +308,28 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, !!atomic_read(&bat_priv->fragmentation))) goto nla_put_failure; + if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_DOWN, + atomic_read(&bat_priv->gw.bandwidth_down))) + goto nla_put_failure; + + if (nla_put_u32(msg, BATADV_ATTR_GW_BANDWIDTH_UP, + atomic_read(&bat_priv->gw.bandwidth_up))) + goto nla_put_failure; + + if (nla_put_u8(msg, BATADV_ATTR_GW_MODE, + atomic_read(&bat_priv->gw.mode))) + goto nla_put_failure; + + if (bat_priv->algo_ops->gw.get_best_gw_node && + bat_priv->algo_ops->gw.is_eligible) { + /* GW selection class is not available if the routing algorithm + * in use does not implement the GW API + */ + if (nla_put_u32(msg, BATADV_ATTR_GW_SEL_CLASS, + atomic_read(&bat_priv->gw.sel_class))) + goto nla_put_failure; + } + if (primary_if) batadv_hardif_put(primary_if); @@ -444,6 +471,71 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) batadv_update_min_mtu(bat_priv->soft_iface); } + if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]) { + attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_DOWN]; + + atomic_set(&bat_priv->gw.bandwidth_down, nla_get_u32(attr)); + batadv_gw_tvlv_container_update(bat_priv); + } + + if (info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP]) { + attr = info->attrs[BATADV_ATTR_GW_BANDWIDTH_UP]; + + atomic_set(&bat_priv->gw.bandwidth_up, nla_get_u32(attr)); + batadv_gw_tvlv_container_update(bat_priv); + } + + if (info->attrs[BATADV_ATTR_GW_MODE]) { + u8 gw_mode; + + attr = info->attrs[BATADV_ATTR_GW_MODE]; + gw_mode = nla_get_u8(attr); + + if (gw_mode <= BATADV_GW_MODE_SERVER) { + /* Invoking batadv_gw_reselect() is not enough to really + * de-select the current GW. It will only instruct the + * gateway client code to perform a re-election the next + * time that this is needed. + * + * When gw client mode is being switched off the current + * GW must be de-selected explicitly otherwise no GW_ADD + * uevent is thrown on client mode re-activation. This + * is operation is performed in + * batadv_gw_check_client_stop(). + */ + batadv_gw_reselect(bat_priv); + + /* always call batadv_gw_check_client_stop() before + * changing the gateway state + */ + batadv_gw_check_client_stop(bat_priv); + atomic_set(&bat_priv->gw.mode, gw_mode); + batadv_gw_tvlv_container_update(bat_priv); + } + } + + if (info->attrs[BATADV_ATTR_GW_SEL_CLASS] && + bat_priv->algo_ops->gw.get_best_gw_node && + bat_priv->algo_ops->gw.is_eligible) { + /* setting the GW selection class is allowed only if the routing + * algorithm in use implements the GW API + */ + + u32 sel_class_max = 0xffffffffu; + u32 sel_class; + + attr = info->attrs[BATADV_ATTR_GW_SEL_CLASS]; + sel_class = nla_get_u32(attr); + + if (!bat_priv->algo_ops->gw.store_sel_class) + sel_class_max = BATADV_TQ_MAX_VALUE; + + if (sel_class >= 1 && sel_class <= sel_class_max) { + atomic_set(&bat_priv->gw.sel_class, sel_class); + batadv_gw_reselect(bat_priv); + } + } + batadv_netlink_notify_mesh(bat_priv); return 0; diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index b14fb3462af7..12028c287de5 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -50,13 +50,13 @@ #include #include #include +#include #include "bat_algo.h" #include "bridge_loop_avoidance.h" #include "debugfs.h" #include "distributed-arp-table.h" #include "gateway_client.h" -#include "gateway_common.h" #include "hard-interface.h" #include "multicast.h" #include "network-coding.h" diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c index e1b816262c53..64fd5932a555 100644 --- a/net/batman-adv/sysfs.c +++ b/net/batman-adv/sysfs.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "bridge_loop_avoidance.h" #include "distributed-arp-table.h" -- cgit v1.2.3 From bfc7f1be57b8a5ea738ce5db62b82234e4901abf Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 13:19:38 +0100 Subject: batman-adv: Add hop_penalty mesh genl configuration The TQ (B.A.T.M.A.N. IV) and throughput values (B.A.T.M.A.N. V) are reduced when they are forwarded. One of the reductions is the penalty for traversing an additional hop. This hop_penalty (0-255) defines the percentage of reduction (0-100%). The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the configuration of this feature using the u8 BATADV_ATTR_HOP_PENALTY attribute. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 6 ++++++ net/batman-adv/netlink.c | 11 +++++++++++ 2 files changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 165272be6878..b37cb923332e 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -443,6 +443,12 @@ enum batadv_nl_attrs { */ BATADV_ATTR_GW_SEL_CLASS, + /** + * @BATADV_ATTR_HOP_PENALTY: defines the penalty which will be applied + * to an originator message's tq-field on every hop. + */ + BATADV_ATTR_HOP_PENALTY, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 473467afea91..ce6e6f078765 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -152,6 +152,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_GW_BANDWIDTH_UP] = { .type = NLA_U32 }, [BATADV_ATTR_GW_MODE] = { .type = NLA_U8 }, [BATADV_ATTR_GW_SEL_CLASS] = { .type = NLA_U32 }, + [BATADV_ATTR_HOP_PENALTY] = { .type = NLA_U8 }, }; /** @@ -330,6 +331,10 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, goto nla_put_failure; } + if (nla_put_u8(msg, BATADV_ATTR_HOP_PENALTY, + atomic_read(&bat_priv->hop_penalty))) + goto nla_put_failure; + if (primary_if) batadv_hardif_put(primary_if); @@ -536,6 +541,12 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) } } + if (info->attrs[BATADV_ATTR_HOP_PENALTY]) { + attr = info->attrs[BATADV_ATTR_HOP_PENALTY]; + + atomic_set(&bat_priv->hop_penalty, nla_get_u8(attr)); + } + batadv_netlink_notify_mesh(bat_priv); return 0; -- cgit v1.2.3 From b85bd091098a52f7bf00d2725b536455f82ba0d0 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 13:22:33 +0100 Subject: batman-adv: Add log_level mesh genl configuration In contrast to other modules, batman-adv allows to set the debug message verbosity per mesh/soft-interface and not per module (via modparam). The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the configuration of this feature using the u32 (bitmask) BATADV_ATTR_LOG_LEVEL attribute. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 6 ++++++ net/batman-adv/netlink.c | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index b37cb923332e..6d36e4b47eb4 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -449,6 +449,12 @@ enum batadv_nl_attrs { */ BATADV_ATTR_HOP_PENALTY, + /** + * @BATADV_ATTR_LOG_LEVEL: bitmask with to define which debug messages + * should be send to the debug log/trace ring buffer + */ + BATADV_ATTR_LOG_LEVEL, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index ce6e6f078765..8c019d46815c 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -53,6 +53,7 @@ #include "gateway_client.h" #include "gateway_common.h" #include "hard-interface.h" +#include "log.h" #include "multicast.h" #include "originator.h" #include "soft-interface.h" @@ -153,6 +154,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_GW_MODE] = { .type = NLA_U8 }, [BATADV_ATTR_GW_SEL_CLASS] = { .type = NLA_U32 }, [BATADV_ATTR_HOP_PENALTY] = { .type = NLA_U8 }, + [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 }, }; /** @@ -335,6 +337,12 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, atomic_read(&bat_priv->hop_penalty))) goto nla_put_failure; +#ifdef CONFIG_BATMAN_ADV_DEBUG + if (nla_put_u32(msg, BATADV_ATTR_LOG_LEVEL, + atomic_read(&bat_priv->log_level))) + goto nla_put_failure; +#endif /* CONFIG_BATMAN_ADV_DEBUG */ + if (primary_if) batadv_hardif_put(primary_if); @@ -547,6 +555,15 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) atomic_set(&bat_priv->hop_penalty, nla_get_u8(attr)); } +#ifdef CONFIG_BATMAN_ADV_DEBUG + if (info->attrs[BATADV_ATTR_LOG_LEVEL]) { + attr = info->attrs[BATADV_ATTR_LOG_LEVEL]; + + atomic_set(&bat_priv->log_level, + nla_get_u32(attr) & BATADV_DBG_ALL); + } +#endif /* CONFIG_BATMAN_ADV_DEBUG */ + batadv_netlink_notify_mesh(bat_priv); return 0; -- cgit v1.2.3 From f75b56bc91122e2934e2cb458f98727c41d535c7 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 13:25:05 +0100 Subject: batman-adv: Add multicast forceflood mesh genl configuration The mesh interface can optimize the flooding of multicast packets based on the content of the global translation tables. To disable this behavior and use the broadcast-like flooding of the packets, forceflood has to be enabled. The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the configuration of this feature using the BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED attribute. Setting the u8 to zero will disable this feature (allowing multicast optimizations) and setting it to something else is enabling this feature (forcing simple flooding). Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 9 +++++++++ net/batman-adv/netlink.c | 15 +++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 6d36e4b47eb4..38caaaae8a05 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -455,6 +455,15 @@ enum batadv_nl_attrs { */ BATADV_ATTR_LOG_LEVEL, + /** + * @BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED: whether multicast + * optimizations should be replaced by simple broadcast-like flooding + * of multicast packets. If set to non-zero then all nodes in the mesh + * are going to use classic flooding for any multicast packet with no + * optimizations. + */ + BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 8c019d46815c..475bd15f806c 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -155,6 +155,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_GW_SEL_CLASS] = { .type = NLA_U32 }, [BATADV_ATTR_HOP_PENALTY] = { .type = NLA_U8 }, [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 }, + [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 }, }; /** @@ -343,6 +344,12 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_DEBUG */ +#ifdef CONFIG_BATMAN_ADV_MCAST + if (nla_put_u8(msg, BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED, + !atomic_read(&bat_priv->multicast_mode))) + goto nla_put_failure; +#endif /* CONFIG_BATMAN_ADV_MCAST */ + if (primary_if) batadv_hardif_put(primary_if); @@ -564,6 +571,14 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) } #endif /* CONFIG_BATMAN_ADV_DEBUG */ +#ifdef CONFIG_BATMAN_ADV_MCAST + if (info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]) { + attr = info->attrs[BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED]; + + atomic_set(&bat_priv->multicast_mode, !nla_get_u8(attr)); + } +#endif /* CONFIG_BATMAN_ADV_MCAST */ + batadv_netlink_notify_mesh(bat_priv); return 0; -- cgit v1.2.3 From 6c57cde6800bae2361b8ac14a5924ffc592b3a90 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 13:26:14 +0100 Subject: batman-adv: Add network_coding mesh genl configuration The mesh interface can use (in an homogeneous mesh) network coding, a mechanism that aims to increase the overall network throughput by fusing multiple packets in one transmission. The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the configuration of this feature using the BATADV_ATTR_NETWORK_CODING_ENABLED attribute. Setting the u8 to zero will disable this feature and setting it to something else is enabling this feature. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 7 +++++++ net/batman-adv/netlink.c | 17 +++++++++++++++++ 2 files changed, 24 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 38caaaae8a05..a4239c147bde 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -464,6 +464,13 @@ enum batadv_nl_attrs { */ BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED, + /** + * @BATADV_ATTR_NETWORK_CODING_ENABLED: whether Network Coding (using + * some magic to send fewer wifi packets but still the same content) is + * enabled or not. + */ + BATADV_ATTR_NETWORK_CODING_ENABLED, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 475bd15f806c..6f01f92e6ab3 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -55,6 +55,7 @@ #include "hard-interface.h" #include "log.h" #include "multicast.h" +#include "network-coding.h" #include "originator.h" #include "soft-interface.h" #include "tp_meter.h" @@ -156,6 +157,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_HOP_PENALTY] = { .type = NLA_U8 }, [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 }, [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 }, + [BATADV_ATTR_NETWORK_CODING_ENABLED] = { .type = NLA_U8 }, }; /** @@ -350,6 +352,12 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_MCAST */ +#ifdef CONFIG_BATMAN_ADV_NC + if (nla_put_u8(msg, BATADV_ATTR_NETWORK_CODING_ENABLED, + !!atomic_read(&bat_priv->network_coding))) + goto nla_put_failure; +#endif /* CONFIG_BATMAN_ADV_NC */ + if (primary_if) batadv_hardif_put(primary_if); @@ -579,6 +587,15 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) } #endif /* CONFIG_BATMAN_ADV_MCAST */ +#ifdef CONFIG_BATMAN_ADV_NC + if (info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]) { + attr = info->attrs[BATADV_ATTR_NETWORK_CODING_ENABLED]; + + atomic_set(&bat_priv->network_coding, !!nla_get_u8(attr)); + batadv_nc_status_update(bat_priv->soft_iface); + } +#endif /* CONFIG_BATMAN_ADV_NC */ + batadv_netlink_notify_mesh(bat_priv); return 0; -- cgit v1.2.3 From 7b751b39f018a828a04692e199c044087102e96c Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 13:28:02 +0100 Subject: batman-adv: Add orig_interval mesh genl configuration The OGM packets are transmitted every orig_interval milliseconds. This value can be changed using the configuration interface. The BATADV_CMD_SET_MESH/BATADV_CMD_GET_MESH commands allow to set/get the configuration of this feature using the u32 BATADV_ATTR_ORIG_INTERVAL attribute. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 6 ++++++ net/batman-adv/netlink.c | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index a4239c147bde..6bedd4889c37 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -471,6 +471,12 @@ enum batadv_nl_attrs { */ BATADV_ATTR_NETWORK_CODING_ENABLED, + /** + * @BATADV_ATTR_ORIG_INTERVAL: defines the interval in milliseconds in + * which batman sends its protocol messages. + */ + BATADV_ATTR_ORIG_INTERVAL, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 6f01f92e6ab3..e25c139ba0e1 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -158,6 +158,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_LOG_LEVEL] = { .type = NLA_U32 }, [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_NETWORK_CODING_ENABLED] = { .type = NLA_U8 }, + [BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 }, }; /** @@ -358,6 +359,10 @@ static int batadv_netlink_mesh_fill(struct sk_buff *msg, goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_NC */ + if (nla_put_u32(msg, BATADV_ATTR_ORIG_INTERVAL, + atomic_read(&bat_priv->orig_interval))) + goto nla_put_failure; + if (primary_if) batadv_hardif_put(primary_if); @@ -596,6 +601,18 @@ static int batadv_netlink_set_mesh(struct sk_buff *skb, struct genl_info *info) } #endif /* CONFIG_BATMAN_ADV_NC */ + if (info->attrs[BATADV_ATTR_ORIG_INTERVAL]) { + u32 orig_interval; + + attr = info->attrs[BATADV_ATTR_ORIG_INTERVAL]; + orig_interval = nla_get_u32(attr); + + orig_interval = min_t(u32, orig_interval, INT_MAX); + orig_interval = max_t(u32, orig_interval, 2 * BATADV_JITTER); + + atomic_set(&bat_priv->orig_interval, orig_interval); + } + batadv_netlink_notify_mesh(bat_priv); return 0; -- cgit v1.2.3 From a108008290405545b43b9c7975344bc59af2341b Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 13:30:04 +0100 Subject: batman-adv: Add elp_interval hardif genl configuration The ELP packets are transmitted every elp_interval milliseconds on an slave/hard-interface. This value can be changed using the configuration interface. The BATADV_CMD_SET_HARDIF/BATADV_CMD_GET_HARDIF commands allow to set/get the configuration of this feature using the u32 BATADV_ATTR_ELP_INTERVAL attribute. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 6 ++++++ net/batman-adv/netlink.c | 17 +++++++++++++++++ 2 files changed, 23 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index 6bedd4889c37..f966e497361b 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -477,6 +477,12 @@ enum batadv_nl_attrs { */ BATADV_ATTR_ORIG_INTERVAL, + /** + * @BATADV_ATTR_ELP_INTERVAL: defines the interval in milliseconds in + * which batman emits probing packets for neighbor sensing (ELP). + */ + BATADV_ATTR_ELP_INTERVAL, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index e25c139ba0e1..09187e6e92a3 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -159,6 +159,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_MULTICAST_FORCEFLOOD_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_NETWORK_CODING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 }, + [BATADV_ATTR_ELP_INTERVAL] = { .type = NLA_U32 }, }; /** @@ -825,6 +826,12 @@ static int batadv_netlink_hardif_fill(struct sk_buff *msg, goto nla_put_failure; } +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL, + atomic_read(&hard_iface->bat_v.elp_interval))) + goto nla_put_failure; +#endif /* CONFIG_BATMAN_ADV_BATMAN_V */ + genlmsg_end(msg, hdr); return 0; @@ -910,6 +917,16 @@ static int batadv_netlink_set_hardif(struct sk_buff *skb, struct batadv_hard_iface *hard_iface = info->user_ptr[1]; struct batadv_priv *bat_priv = info->user_ptr[0]; +#ifdef CONFIG_BATMAN_ADV_BATMAN_V + struct nlattr *attr; + + if (info->attrs[BATADV_ATTR_ELP_INTERVAL]) { + attr = info->attrs[BATADV_ATTR_ELP_INTERVAL]; + + atomic_set(&hard_iface->bat_v.elp_interval, nla_get_u32(attr)); + } +#endif /* CONFIG_BATMAN_ADV_BATMAN_V */ + batadv_netlink_notify_hardif(bat_priv, hard_iface); return 0; -- cgit v1.2.3 From 9a182242f17c06fad620663e6fdf992e97661e66 Mon Sep 17 00:00:00 2001 From: Sven Eckelmann Date: Fri, 23 Nov 2018 13:31:23 +0100 Subject: batman-adv: Add throughput_override hardif genl configuration The B.A.T.M.A.N. V implementation tries to estimate the link throughput of an interface to an originator using different automatic methods. It is still possible to overwrite it the link throughput for all reachable originators via this interface. The BATADV_CMD_SET_HARDIF/BATADV_CMD_GET_HARDIF commands allow to set/get the configuration of this feature using the u32 BATADV_ATTR_THROUGHPUT_OVERRIDE attribute. The used unit is in 100 Kbit/s. If the value is set to 0 then batman-adv will try to estimate the throughput by itself. Signed-off-by: Sven Eckelmann Signed-off-by: Simon Wunderlich --- include/uapi/linux/batman_adv.h | 8 ++++++++ net/batman-adv/netlink.c | 12 ++++++++++++ 2 files changed, 20 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/batman_adv.h b/include/uapi/linux/batman_adv.h index f966e497361b..305bf316dd03 100644 --- a/include/uapi/linux/batman_adv.h +++ b/include/uapi/linux/batman_adv.h @@ -483,6 +483,14 @@ enum batadv_nl_attrs { */ BATADV_ATTR_ELP_INTERVAL, + /** + * @BATADV_ATTR_THROUGHPUT_OVERRIDE: defines the throughput value to be + * used by B.A.T.M.A.N. V when estimating the link throughput using + * this interface. If the value is set to 0 then batman-adv will try to + * estimate the throughput by itself. + */ + BATADV_ATTR_THROUGHPUT_OVERRIDE, + /* add attributes above here, update the policy in netlink.c */ /** diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index 09187e6e92a3..476b4c6017c9 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -160,6 +160,7 @@ static const struct nla_policy batadv_netlink_policy[NUM_BATADV_ATTR] = { [BATADV_ATTR_NETWORK_CODING_ENABLED] = { .type = NLA_U8 }, [BATADV_ATTR_ORIG_INTERVAL] = { .type = NLA_U32 }, [BATADV_ATTR_ELP_INTERVAL] = { .type = NLA_U32 }, + [BATADV_ATTR_THROUGHPUT_OVERRIDE] = { .type = NLA_U32 }, }; /** @@ -830,6 +831,10 @@ static int batadv_netlink_hardif_fill(struct sk_buff *msg, if (nla_put_u32(msg, BATADV_ATTR_ELP_INTERVAL, atomic_read(&hard_iface->bat_v.elp_interval))) goto nla_put_failure; + + if (nla_put_u32(msg, BATADV_ATTR_THROUGHPUT_OVERRIDE, + atomic_read(&hard_iface->bat_v.throughput_override))) + goto nla_put_failure; #endif /* CONFIG_BATMAN_ADV_BATMAN_V */ genlmsg_end(msg, hdr); @@ -925,6 +930,13 @@ static int batadv_netlink_set_hardif(struct sk_buff *skb, atomic_set(&hard_iface->bat_v.elp_interval, nla_get_u32(attr)); } + + if (info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE]) { + attr = info->attrs[BATADV_ATTR_THROUGHPUT_OVERRIDE]; + + atomic_set(&hard_iface->bat_v.throughput_override, + nla_get_u32(attr)); + } #endif /* CONFIG_BATMAN_ADV_BATMAN_V */ batadv_netlink_notify_hardif(bat_priv, hard_iface); -- cgit v1.2.3 From 257eeded20b34219d5484cfc415b3e39093f37b8 Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Sun, 10 Feb 2019 14:24:59 +0200 Subject: net: Move all TC actions identifiers to one place Move all the TC identifiers to one place, to the same enum that defines the identifier of police action. This makes it easier choose numbers for new actions since they are now defined in one place. We preserve the original values for binary compatibility. New IDs should be added inside the enum. Signed-off-by: Eli Cohen Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 43 ++++++++++++++++++++++++++++--- include/uapi/linux/tc_act/tc_bpf.h | 2 -- include/uapi/linux/tc_act/tc_connmark.h | 2 -- include/uapi/linux/tc_act/tc_csum.h | 2 -- include/uapi/linux/tc_act/tc_gact.h | 1 - include/uapi/linux/tc_act/tc_ife.h | 1 - include/uapi/linux/tc_act/tc_ipt.h | 3 --- include/uapi/linux/tc_act/tc_mirred.h | 1 - include/uapi/linux/tc_act/tc_nat.h | 2 -- include/uapi/linux/tc_act/tc_pedit.h | 2 -- include/uapi/linux/tc_act/tc_sample.h | 2 -- include/uapi/linux/tc_act/tc_skbedit.h | 2 -- include/uapi/linux/tc_act/tc_skbmod.h | 2 -- include/uapi/linux/tc_act/tc_tunnel_key.h | 2 -- include/uapi/linux/tc_act/tc_vlan.h | 2 -- net/sched/act_simple.c | 2 -- tools/include/uapi/linux/tc_act/tc_bpf.h | 2 -- 17 files changed, 40 insertions(+), 33 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 02ac251be8c4..7ab55f97e7c4 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -63,12 +63,49 @@ enum { #define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) #define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN +/* These macros are put here for binary compatibility with userspace apps that + * make use of them. For kernel code and new userspace apps, use the TCA_ID_* + * versions. + */ +#define TCA_ACT_GACT 5 +#define TCA_ACT_IPT 6 +#define TCA_ACT_PEDIT 7 +#define TCA_ACT_MIRRED 8 +#define TCA_ACT_NAT 9 +#define TCA_ACT_XT 10 +#define TCA_ACT_SKBEDIT 11 +#define TCA_ACT_VLAN 12 +#define TCA_ACT_BPF 13 +#define TCA_ACT_CONNMARK 14 +#define TCA_ACT_SKBMOD 15 +#define TCA_ACT_CSUM 16 +#define TCA_ACT_TUNNEL_KEY 17 +#define TCA_ACT_SIMP 22 +#define TCA_ACT_IFE 25 +#define TCA_ACT_SAMPLE 26 + /* Action type identifiers*/ enum { - TCA_ID_UNSPEC=0, - TCA_ID_POLICE=1, + TCA_ID_UNSPEC = 0, + TCA_ID_POLICE = 1, + TCA_ID_GACT = TCA_ACT_GACT, + TCA_ID_IPT = TCA_ACT_IPT, + TCA_ID_PEDIT = TCA_ACT_PEDIT, + TCA_ID_MIRRED = TCA_ACT_MIRRED, + TCA_ID_NAT = TCA_ACT_NAT, + TCA_ID_XT = TCA_ACT_XT, + TCA_ID_SKBEDIT = TCA_ACT_SKBEDIT, + TCA_ID_VLAN = TCA_ACT_VLAN, + TCA_ID_BPF = TCA_ACT_BPF, + TCA_ID_CONNMARK = TCA_ACT_CONNMARK, + TCA_ID_SKBMOD = TCA_ACT_SKBMOD, + TCA_ID_CSUM = TCA_ACT_CSUM, + TCA_ID_TUNNEL_KEY = TCA_ACT_TUNNEL_KEY, + TCA_ID_SIMP = TCA_ACT_SIMP, + TCA_ID_IFE = TCA_ACT_IFE, + TCA_ID_SAMPLE = TCA_ACT_SAMPLE, /* other actions go here */ - __TCA_ID_MAX=255 + __TCA_ID_MAX = 255 }; #define TCA_ID_MAX __TCA_ID_MAX diff --git a/include/uapi/linux/tc_act/tc_bpf.h b/include/uapi/linux/tc_act/tc_bpf.h index 6e89a5df49a4..653c4f94f76e 100644 --- a/include/uapi/linux/tc_act/tc_bpf.h +++ b/include/uapi/linux/tc_act/tc_bpf.h @@ -13,8 +13,6 @@ #include -#define TCA_ACT_BPF 13 - struct tc_act_bpf { tc_gen; }; diff --git a/include/uapi/linux/tc_act/tc_connmark.h b/include/uapi/linux/tc_act/tc_connmark.h index 80caa47b1933..9f8f6f709feb 100644 --- a/include/uapi/linux/tc_act/tc_connmark.h +++ b/include/uapi/linux/tc_act/tc_connmark.h @@ -5,8 +5,6 @@ #include #include -#define TCA_ACT_CONNMARK 14 - struct tc_connmark { tc_gen; __u16 zone; diff --git a/include/uapi/linux/tc_act/tc_csum.h b/include/uapi/linux/tc_act/tc_csum.h index 0ecf4d29e2f3..94b2044929de 100644 --- a/include/uapi/linux/tc_act/tc_csum.h +++ b/include/uapi/linux/tc_act/tc_csum.h @@ -5,8 +5,6 @@ #include #include -#define TCA_ACT_CSUM 16 - enum { TCA_CSUM_UNSPEC, TCA_CSUM_PARMS, diff --git a/include/uapi/linux/tc_act/tc_gact.h b/include/uapi/linux/tc_act/tc_gact.h index 94273c3b81b0..37e5392e02c7 100644 --- a/include/uapi/linux/tc_act/tc_gact.h +++ b/include/uapi/linux/tc_act/tc_gact.h @@ -5,7 +5,6 @@ #include #include -#define TCA_ACT_GACT 5 struct tc_gact { tc_gen; diff --git a/include/uapi/linux/tc_act/tc_ife.h b/include/uapi/linux/tc_act/tc_ife.h index 2f48490ef386..8c401f185675 100644 --- a/include/uapi/linux/tc_act/tc_ife.h +++ b/include/uapi/linux/tc_act/tc_ife.h @@ -6,7 +6,6 @@ #include #include -#define TCA_ACT_IFE 25 /* Flag bits for now just encoding/decoding; mutually exclusive */ #define IFE_ENCODE 1 #define IFE_DECODE 0 diff --git a/include/uapi/linux/tc_act/tc_ipt.h b/include/uapi/linux/tc_act/tc_ipt.h index b743c8bddd13..c48d7da6750d 100644 --- a/include/uapi/linux/tc_act/tc_ipt.h +++ b/include/uapi/linux/tc_act/tc_ipt.h @@ -4,9 +4,6 @@ #include -#define TCA_ACT_IPT 6 -#define TCA_ACT_XT 10 - enum { TCA_IPT_UNSPEC, TCA_IPT_TABLE, diff --git a/include/uapi/linux/tc_act/tc_mirred.h b/include/uapi/linux/tc_act/tc_mirred.h index 5dd671cf5776..2500a0005d05 100644 --- a/include/uapi/linux/tc_act/tc_mirred.h +++ b/include/uapi/linux/tc_act/tc_mirred.h @@ -5,7 +5,6 @@ #include #include -#define TCA_ACT_MIRRED 8 #define TCA_EGRESS_REDIR 1 /* packet redirect to EGRESS*/ #define TCA_EGRESS_MIRROR 2 /* mirror packet to EGRESS */ #define TCA_INGRESS_REDIR 3 /* packet redirect to INGRESS*/ diff --git a/include/uapi/linux/tc_act/tc_nat.h b/include/uapi/linux/tc_act/tc_nat.h index 086be842587b..21399c2c6130 100644 --- a/include/uapi/linux/tc_act/tc_nat.h +++ b/include/uapi/linux/tc_act/tc_nat.h @@ -5,8 +5,6 @@ #include #include -#define TCA_ACT_NAT 9 - enum { TCA_NAT_UNSPEC, TCA_NAT_PARMS, diff --git a/include/uapi/linux/tc_act/tc_pedit.h b/include/uapi/linux/tc_act/tc_pedit.h index 24ec792dacc1..f3e61b04fa01 100644 --- a/include/uapi/linux/tc_act/tc_pedit.h +++ b/include/uapi/linux/tc_act/tc_pedit.h @@ -5,8 +5,6 @@ #include #include -#define TCA_ACT_PEDIT 7 - enum { TCA_PEDIT_UNSPEC, TCA_PEDIT_TM, diff --git a/include/uapi/linux/tc_act/tc_sample.h b/include/uapi/linux/tc_act/tc_sample.h index bd7e9f03abd2..fee1bcc20793 100644 --- a/include/uapi/linux/tc_act/tc_sample.h +++ b/include/uapi/linux/tc_act/tc_sample.h @@ -6,8 +6,6 @@ #include #include -#define TCA_ACT_SAMPLE 26 - struct tc_sample { tc_gen; }; diff --git a/include/uapi/linux/tc_act/tc_skbedit.h b/include/uapi/linux/tc_act/tc_skbedit.h index 6de6071ebed6..800e93377218 100644 --- a/include/uapi/linux/tc_act/tc_skbedit.h +++ b/include/uapi/linux/tc_act/tc_skbedit.h @@ -23,8 +23,6 @@ #include -#define TCA_ACT_SKBEDIT 11 - #define SKBEDIT_F_PRIORITY 0x1 #define SKBEDIT_F_QUEUE_MAPPING 0x2 #define SKBEDIT_F_MARK 0x4 diff --git a/include/uapi/linux/tc_act/tc_skbmod.h b/include/uapi/linux/tc_act/tc_skbmod.h index 38c072f66f2f..c525b3503797 100644 --- a/include/uapi/linux/tc_act/tc_skbmod.h +++ b/include/uapi/linux/tc_act/tc_skbmod.h @@ -13,8 +13,6 @@ #include -#define TCA_ACT_SKBMOD 15 - #define SKBMOD_F_DMAC 0x1 #define SKBMOD_F_SMAC 0x2 #define SKBMOD_F_ETYPE 0x4 diff --git a/include/uapi/linux/tc_act/tc_tunnel_key.h b/include/uapi/linux/tc_act/tc_tunnel_key.h index be384d63e1b5..41c8b462c177 100644 --- a/include/uapi/linux/tc_act/tc_tunnel_key.h +++ b/include/uapi/linux/tc_act/tc_tunnel_key.h @@ -14,8 +14,6 @@ #include -#define TCA_ACT_TUNNEL_KEY 17 - #define TCA_TUNNEL_KEY_ACT_SET 1 #define TCA_TUNNEL_KEY_ACT_RELEASE 2 diff --git a/include/uapi/linux/tc_act/tc_vlan.h b/include/uapi/linux/tc_act/tc_vlan.h index 0d7b5fd6605b..168995b54a70 100644 --- a/include/uapi/linux/tc_act/tc_vlan.h +++ b/include/uapi/linux/tc_act/tc_vlan.h @@ -13,8 +13,6 @@ #include -#define TCA_ACT_VLAN 12 - #define TCA_VLAN_ACT_POP 1 #define TCA_VLAN_ACT_PUSH 2 #define TCA_VLAN_ACT_MODIFY 3 diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 902957beceb3..b2b16d440154 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -19,8 +19,6 @@ #include #include -#define TCA_ACT_SIMP 22 - #include #include diff --git a/tools/include/uapi/linux/tc_act/tc_bpf.h b/tools/include/uapi/linux/tc_act/tc_bpf.h index 6e89a5df49a4..653c4f94f76e 100644 --- a/tools/include/uapi/linux/tc_act/tc_bpf.h +++ b/tools/include/uapi/linux/tc_act/tc_bpf.h @@ -13,8 +13,6 @@ #include -#define TCA_ACT_BPF 13 - struct tc_act_bpf { tc_gen; }; -- cgit v1.2.3 From eddd2cf195d6fb5e4bbc91a0fe4be55110f559ab Mon Sep 17 00:00:00 2001 From: Eli Cohen Date: Sun, 10 Feb 2019 14:25:00 +0200 Subject: net: Change TCA_ACT_* to TCA_ID_* to match that of TCA_ID_POLICE Modify the kernel users of the TCA_ACT_* macros to use TCA_ID_*. For example, use TCA_ID_GACT instead of TCA_ACT_GACT. This will align with TCA_ID_POLICE and also differentiates these identifier, used in struct tc_action_ops type field, from other macros starting with TCA_ACT_. To make things clearer, we name the enum defining the TCA_ID_* identifiers and also change the "type" field of struct tc_action to id. Signed-off-by: Eli Cohen Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/act_api.h | 2 +- include/net/tc_act/tc_csum.h | 2 +- include/net/tc_act/tc_gact.h | 2 +- include/net/tc_act/tc_mirred.h | 4 ++-- include/net/tc_act/tc_pedit.h | 2 +- include/net/tc_act/tc_sample.h | 2 +- include/net/tc_act/tc_skbedit.h | 2 +- include/net/tc_act/tc_tunnel_key.h | 4 ++-- include/net/tc_act/tc_vlan.h | 2 +- include/uapi/linux/pkt_cls.h | 2 +- net/sched/act_api.c | 2 +- net/sched/act_bpf.c | 2 +- net/sched/act_connmark.c | 2 +- net/sched/act_csum.c | 2 +- net/sched/act_gact.c | 2 +- net/sched/act_ife.c | 2 +- net/sched/act_ipt.c | 4 ++-- net/sched/act_mirred.c | 2 +- net/sched/act_nat.c | 2 +- net/sched/act_pedit.c | 2 +- net/sched/act_police.c | 2 +- net/sched/act_sample.c | 2 +- net/sched/act_simple.c | 2 +- net/sched/act_skbedit.c | 2 +- net/sched/act_skbmod.c | 2 +- net/sched/act_tunnel_key.c | 2 +- net/sched/act_vlan.c | 2 +- 27 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/act_api.h b/include/net/act_api.h index dbc795ec659e..c745e9ccfab2 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -80,7 +80,7 @@ static inline void tcf_tm_dump(struct tcf_t *dtm, const struct tcf_t *stm) struct tc_action_ops { struct list_head head; char kind[IFNAMSIZ]; - __u32 type; /* TBD to match kind */ + enum tca_id id; /* identifier should match kind */ size_t size; struct module *owner; int (*act)(struct sk_buff *, const struct tc_action *, diff --git a/include/net/tc_act/tc_csum.h b/include/net/tc_act/tc_csum.h index 32d2454c0479..68269e4581b7 100644 --- a/include/net/tc_act/tc_csum.h +++ b/include/net/tc_act/tc_csum.h @@ -21,7 +21,7 @@ struct tcf_csum { static inline bool is_tcf_csum(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->type == TCA_ACT_CSUM) + if (a->ops && a->ops->id == TCA_ID_CSUM) return true; #endif return false; diff --git a/include/net/tc_act/tc_gact.h b/include/net/tc_act/tc_gact.h index ef8dd0db70ce..ee8d005f56fc 100644 --- a/include/net/tc_act/tc_gact.h +++ b/include/net/tc_act/tc_gact.h @@ -22,7 +22,7 @@ static inline bool __is_tcf_gact_act(const struct tc_action *a, int act, #ifdef CONFIG_NET_CLS_ACT struct tcf_gact *gact; - if (a->ops && a->ops->type != TCA_ACT_GACT) + if (a->ops && a->ops->id != TCA_ID_GACT) return false; gact = to_gact(a); diff --git a/include/net/tc_act/tc_mirred.h b/include/net/tc_act/tc_mirred.h index a2e9cbca5c9e..c757585a05b0 100644 --- a/include/net/tc_act/tc_mirred.h +++ b/include/net/tc_act/tc_mirred.h @@ -17,7 +17,7 @@ struct tcf_mirred { static inline bool is_tcf_mirred_egress_redirect(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->type == TCA_ACT_MIRRED) + if (a->ops && a->ops->id == TCA_ID_MIRRED) return to_mirred(a)->tcfm_eaction == TCA_EGRESS_REDIR; #endif return false; @@ -26,7 +26,7 @@ static inline bool is_tcf_mirred_egress_redirect(const struct tc_action *a) static inline bool is_tcf_mirred_egress_mirror(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->type == TCA_ACT_MIRRED) + if (a->ops && a->ops->id == TCA_ID_MIRRED) return to_mirred(a)->tcfm_eaction == TCA_EGRESS_MIRROR; #endif return false; diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h index fac3ad4a86de..748cf87a4d7e 100644 --- a/include/net/tc_act/tc_pedit.h +++ b/include/net/tc_act/tc_pedit.h @@ -23,7 +23,7 @@ struct tcf_pedit { static inline bool is_tcf_pedit(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->type == TCA_ACT_PEDIT) + if (a->ops && a->ops->id == TCA_ID_PEDIT) return true; #endif return false; diff --git a/include/net/tc_act/tc_sample.h b/include/net/tc_act/tc_sample.h index 01dbfea32672..0a559d4b6f0f 100644 --- a/include/net/tc_act/tc_sample.h +++ b/include/net/tc_act/tc_sample.h @@ -20,7 +20,7 @@ struct tcf_sample { static inline bool is_tcf_sample(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT - return a->ops && a->ops->type == TCA_ACT_SAMPLE; + return a->ops && a->ops->id == TCA_ID_SAMPLE; #else return false; #endif diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index 911bbac838a2..85c5c4756d92 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -44,7 +44,7 @@ static inline bool is_tcf_skbedit_mark(const struct tc_action *a) #ifdef CONFIG_NET_CLS_ACT u32 flags; - if (a->ops && a->ops->type == TCA_ACT_SKBEDIT) { + if (a->ops && a->ops->id == TCA_ID_SKBEDIT) { rcu_read_lock(); flags = rcu_dereference(to_skbedit(a)->params)->flags; rcu_read_unlock(); diff --git a/include/net/tc_act/tc_tunnel_key.h b/include/net/tc_act/tc_tunnel_key.h index 46b8c7f1c8d5..23d5b8b19f3e 100644 --- a/include/net/tc_act/tc_tunnel_key.h +++ b/include/net/tc_act/tc_tunnel_key.h @@ -34,7 +34,7 @@ static inline bool is_tcf_tunnel_set(const struct tc_action *a) struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params = rtnl_dereference(t->params); - if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY) + if (a->ops && a->ops->id == TCA_ID_TUNNEL_KEY) return params->tcft_action == TCA_TUNNEL_KEY_ACT_SET; #endif return false; @@ -46,7 +46,7 @@ static inline bool is_tcf_tunnel_release(const struct tc_action *a) struct tcf_tunnel_key *t = to_tunnel_key(a); struct tcf_tunnel_key_params *params = rtnl_dereference(t->params); - if (a->ops && a->ops->type == TCA_ACT_TUNNEL_KEY) + if (a->ops && a->ops->id == TCA_ID_TUNNEL_KEY) return params->tcft_action == TCA_TUNNEL_KEY_ACT_RELEASE; #endif return false; diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 22ae260d6869..fe39ed502bef 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -30,7 +30,7 @@ struct tcf_vlan { static inline bool is_tcf_vlan(const struct tc_action *a) { #ifdef CONFIG_NET_CLS_ACT - if (a->ops && a->ops->type == TCA_ACT_VLAN) + if (a->ops && a->ops->id == TCA_ID_VLAN) return true; #endif return false; diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7ab55f97e7c4..51a0496f78ea 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -85,7 +85,7 @@ enum { #define TCA_ACT_SAMPLE 26 /* Action type identifiers*/ -enum { +enum tca_id { TCA_ID_UNSPEC = 0, TCA_ID_POLICE = 1, TCA_ID_GACT = TCA_ACT_GACT, diff --git a/net/sched/act_api.c b/net/sched/act_api.c index d4b8355737d8..aecf1bf233c8 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -543,7 +543,7 @@ int tcf_register_action(struct tc_action_ops *act, write_lock(&act_mod_lock); list_for_each_entry(a, &act_base, head) { - if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { + if (act->id == a->id || (strcmp(act->kind, a->kind) == 0)) { write_unlock(&act_mod_lock); unregister_pernet_subsys(ops); return -EEXIST; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index c7633843e223..aa5c38d11a30 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -396,7 +396,7 @@ static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", - .type = TCA_ACT_BPF, + .id = TCA_ID_BPF, .owner = THIS_MODULE, .act = tcf_bpf_act, .dump = tcf_bpf_dump, diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 8475913f2070..5d24993cccfe 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -204,7 +204,7 @@ static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_connmark_ops = { .kind = "connmark", - .type = TCA_ACT_CONNMARK, + .id = TCA_ID_CONNMARK, .owner = THIS_MODULE, .act = tcf_connmark_act, .dump = tcf_connmark_dump, diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 3dc25b7806d7..945fb34ae721 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -660,7 +660,7 @@ static size_t tcf_csum_get_fill_size(const struct tc_action *act) static struct tc_action_ops act_csum_ops = { .kind = "csum", - .type = TCA_ACT_CSUM, + .id = TCA_ID_CSUM, .owner = THIS_MODULE, .act = tcf_csum_act, .dump = tcf_csum_dump, diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index b61c20ebb314..93da0004e9f4 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -253,7 +253,7 @@ static size_t tcf_gact_get_fill_size(const struct tc_action *act) static struct tc_action_ops act_gact_ops = { .kind = "gact", - .type = TCA_ACT_GACT, + .id = TCA_ID_GACT, .owner = THIS_MODULE, .act = tcf_gact_act, .stats_update = tcf_gact_stats_update, diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 30b63fa23ee2..9b1f2b3990ee 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -864,7 +864,7 @@ static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_ife_ops = { .kind = "ife", - .type = TCA_ACT_IFE, + .id = TCA_ID_IFE, .owner = THIS_MODULE, .act = tcf_ife_act, .dump = tcf_ife_dump, diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 8af6c11d2482..1bad190710ad 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -338,7 +338,7 @@ static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_ipt_ops = { .kind = "ipt", - .type = TCA_ACT_IPT, + .id = TCA_ID_IPT, .owner = THIS_MODULE, .act = tcf_ipt_act, .dump = tcf_ipt_dump, @@ -387,7 +387,7 @@ static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_xt_ops = { .kind = "xt", - .type = TCA_ACT_XT, + .id = TCA_ID_XT, .owner = THIS_MODULE, .act = tcf_ipt_act, .dump = tcf_ipt_dump, diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index c8cf4d10c435..6692fd054617 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -400,7 +400,7 @@ static void tcf_mirred_put_dev(struct net_device *dev) static struct tc_action_ops act_mirred_ops = { .kind = "mirred", - .type = TCA_ACT_MIRRED, + .id = TCA_ID_MIRRED, .owner = THIS_MODULE, .act = tcf_mirred_act, .stats_update = tcf_stats_update, diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index c5c1e23add77..543eab9193f1 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -304,7 +304,7 @@ static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_nat_ops = { .kind = "nat", - .type = TCA_ACT_NAT, + .id = TCA_ID_NAT, .owner = THIS_MODULE, .act = tcf_nat_act, .dump = tcf_nat_dump, diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 3663d3b615a4..a80373878df7 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -470,7 +470,7 @@ static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_pedit_ops = { .kind = "pedit", - .type = TCA_ACT_PEDIT, + .id = TCA_ID_PEDIT, .owner = THIS_MODULE, .act = tcf_pedit_act, .dump = tcf_pedit_dump, diff --git a/net/sched/act_police.c b/net/sched/act_police.c index ec8ec55e0fe8..8271a6263824 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -366,7 +366,7 @@ MODULE_LICENSE("GPL"); static struct tc_action_ops act_police_ops = { .kind = "police", - .type = TCA_ID_POLICE, + .id = TCA_ID_POLICE, .owner = THIS_MODULE, .act = tcf_police_act, .dump = tcf_police_dump, diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 1a0c682fd734..203e399e5c85 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -233,7 +233,7 @@ static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_sample_ops = { .kind = "sample", - .type = TCA_ACT_SAMPLE, + .id = TCA_ID_SAMPLE, .owner = THIS_MODULE, .act = tcf_sample_act, .dump = tcf_sample_dump, diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index b2b16d440154..d54cb608dbaf 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -195,7 +195,7 @@ static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_simp_ops = { .kind = "simple", - .type = TCA_ACT_SIMP, + .id = TCA_ID_SIMP, .owner = THIS_MODULE, .act = tcf_simp_act, .dump = tcf_simp_dump, diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 64dba3708fce..39f8a67ea940 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -305,7 +305,7 @@ static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_skbedit_ops = { .kind = "skbedit", - .type = TCA_ACT_SKBEDIT, + .id = TCA_ID_SKBEDIT, .owner = THIS_MODULE, .act = tcf_skbedit_act, .dump = tcf_skbedit_dump, diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 59710a183bd3..7bac1d78e7a3 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -260,7 +260,7 @@ static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_skbmod_ops = { .kind = "skbmod", - .type = TCA_ACT_SKBMOD, + .id = TCA_ACT_SKBMOD, .owner = THIS_MODULE, .act = tcf_skbmod_act, .dump = tcf_skbmod_dump, diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 8b43fe0130f7..9104b8e36482 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -563,7 +563,7 @@ static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_tunnel_key_ops = { .kind = "tunnel_key", - .type = TCA_ACT_TUNNEL_KEY, + .id = TCA_ID_TUNNEL_KEY, .owner = THIS_MODULE, .act = tunnel_key_act, .dump = tunnel_key_dump, diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 93fdaf707313..ac0061599225 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -297,7 +297,7 @@ static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index) static struct tc_action_ops act_vlan_ops = { .kind = "vlan", - .type = TCA_ACT_VLAN, + .id = TCA_ID_VLAN, .owner = THIS_MODULE, .act = tcf_vlan_act, .dump = tcf_vlan_dump, -- cgit v1.2.3 From 46f8bc92758c6259bcf945e9216098661c1587cd Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 9 Feb 2019 23:22:20 -0800 Subject: bpf: Add a bpf_sock pointer to __sk_buff and a bpf_sk_fullsock helper In kernel, it is common to check "skb->sk && sk_fullsock(skb->sk)" before accessing the fields in sock. For example, in __netdev_pick_tx: static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb, struct net_device *sb_dev) { /* ... */ struct sock *sk = skb->sk; if (queue_index != new_index && sk && sk_fullsock(sk) && rcu_access_pointer(sk->sk_dst_cache)) sk_tx_queue_set(sk, new_index); /* ... */ return queue_index; } This patch adds a "struct bpf_sock *sk" pointer to the "struct __sk_buff" where a few of the convert_ctx_access() in filter.c has already been accessing the skb->sk sock_common's fields, e.g. sock_ops_convert_ctx_access(). "__sk_buff->sk" is a PTR_TO_SOCK_COMMON_OR_NULL in the verifier. Some of the fileds in "bpf_sock" will not be directly accessible through the "__sk_buff->sk" pointer. It is limited by the new "bpf_sock_common_is_valid_access()". e.g. The existing "type", "protocol", "mark" and "priority" in bpf_sock are not allowed. The newly added "struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk)" can be used to get a sk with all accessible fields in "bpf_sock". This helper is added to both cg_skb and sched_(cls|act). int cg_skb_foo(struct __sk_buff *skb) { struct bpf_sock *sk; sk = skb->sk; if (!sk) return 1; sk = bpf_sk_fullsock(sk); if (!sk) return 1; if (sk->family != AF_INET6 || sk->protocol != IPPROTO_TCP) return 1; /* some_traffic_shaping(); */ return 1; } (1) The sk is read only (2) There is no new "struct bpf_sock_common" introduced. (3) Future kernel sock's members could be added to bpf_sock only instead of repeatedly adding at multiple places like currently in bpf_sock_ops_md, bpf_sock_addr_md, sk_reuseport_md...etc. (4) After "sk = skb->sk", the reg holding sk is in type PTR_TO_SOCK_COMMON_OR_NULL. (5) After bpf_sk_fullsock(), the return type will be in type PTR_TO_SOCKET_OR_NULL which is the same as the return type of bpf_sk_lookup_xxx(). However, bpf_sk_fullsock() does not take refcnt. The acquire_reference_state() is only depending on the return type now. To avoid it, a new is_acquire_function() is checked before calling acquire_reference_state(). (6) The WARN_ON in "release_reference_state()" is no longer an internal verifier bug. When reg->id is not found in state->refs[], it means the bpf_prog does something wrong like "bpf_sk_release(bpf_sk_fullsock(skb->sk))" where reference has never been acquired by calling "bpf_sk_fullsock(skb->sk)". A -EINVAL and a verbose are done instead of WARN_ON. A test is added to the test_verifier in a later patch. Since the WARN_ON in "release_reference_state()" is no longer needed, "__release_reference_state()" is folded into "release_reference_state()" also. Acked-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 12 +++++ include/uapi/linux/bpf.h | 12 ++++- kernel/bpf/verifier.c | 132 +++++++++++++++++++++++++++++++++-------------- net/core/filter.c | 42 +++++++++++++++ 4 files changed, 157 insertions(+), 41 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bd169a7bcc93..a60463b45b54 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -194,6 +194,7 @@ enum bpf_arg_type { ARG_ANYTHING, /* any (initialized) argument is ok */ ARG_PTR_TO_SOCKET, /* pointer to bpf_sock */ ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */ + ARG_PTR_TO_SOCK_COMMON, /* pointer to sock_common */ }; /* type of values returned from helper functions */ @@ -256,6 +257,8 @@ enum bpf_reg_type { PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ PTR_TO_SOCKET, /* reg points to struct bpf_sock */ PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ + PTR_TO_SOCK_COMMON, /* reg points to sock_common */ + PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -920,6 +923,9 @@ void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); #if defined(CONFIG_NET) +bool bpf_sock_common_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info); bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info); u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, @@ -928,6 +934,12 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size); #else +static inline bool bpf_sock_common_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + return false; +} static inline bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1777fa0c61e4..5d79cba74ddc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2329,6 +2329,14 @@ union bpf_attr { * "**y**". * Return * 0 + * + * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in bpf_sock can be accessed. + * Return + * A **struct bpf_sock** pointer on success, or NULL in + * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2425,7 +2433,8 @@ union bpf_attr { FN(msg_pop_data), \ FN(rc_pointer_rel), \ FN(spin_lock), \ - FN(spin_unlock), + FN(spin_unlock), \ + FN(sk_fullsock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2545,6 +2554,7 @@ struct __sk_buff { __u64 tstamp; __u32 wire_len; __u32 gso_segs; + __bpf_md_ptr(struct bpf_sock *, sk); }; struct bpf_tunnel_key { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 516dfc6d78de..b755d55a3791 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -331,10 +331,17 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) type == PTR_TO_PACKET_META; } +static bool type_is_sk_pointer(enum bpf_reg_type type) +{ + return type == PTR_TO_SOCKET || + type == PTR_TO_SOCK_COMMON; +} + static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || - type == PTR_TO_SOCKET_OR_NULL; + type == PTR_TO_SOCKET_OR_NULL || + type == PTR_TO_SOCK_COMMON_OR_NULL; } static bool type_is_refcounted(enum bpf_reg_type type) @@ -377,6 +384,12 @@ static bool is_release_function(enum bpf_func_id func_id) return func_id == BPF_FUNC_sk_release; } +static bool is_acquire_function(enum bpf_func_id func_id) +{ + return func_id == BPF_FUNC_sk_lookup_tcp || + func_id == BPF_FUNC_sk_lookup_udp; +} + /* string representation of 'enum bpf_reg_type' */ static const char * const reg_type_str[] = { [NOT_INIT] = "?", @@ -392,6 +405,8 @@ static const char * const reg_type_str[] = { [PTR_TO_FLOW_KEYS] = "flow_keys", [PTR_TO_SOCKET] = "sock", [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", + [PTR_TO_SOCK_COMMON] = "sock_common", + [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", }; static char slot_type_char[] = { @@ -618,13 +633,10 @@ static int acquire_reference_state(struct bpf_verifier_env *env, int insn_idx) } /* release function corresponding to acquire_reference_state(). Idempotent. */ -static int __release_reference_state(struct bpf_func_state *state, int ptr_id) +static int release_reference_state(struct bpf_func_state *state, int ptr_id) { int i, last_idx; - if (!ptr_id) - return -EFAULT; - last_idx = state->acquired_refs - 1; for (i = 0; i < state->acquired_refs; i++) { if (state->refs[i].id == ptr_id) { @@ -636,21 +648,7 @@ static int __release_reference_state(struct bpf_func_state *state, int ptr_id) return 0; } } - return -EFAULT; -} - -/* variation on the above for cases where we expect that there must be an - * outstanding reference for the specified ptr_id. - */ -static int release_reference_state(struct bpf_verifier_env *env, int ptr_id) -{ - struct bpf_func_state *state = cur_func(env); - int err; - - err = __release_reference_state(state, ptr_id); - if (WARN_ON_ONCE(err != 0)) - verbose(env, "verifier internal error: can't release reference\n"); - return err; + return -EINVAL; } static int transfer_reference_state(struct bpf_func_state *dst, @@ -1209,6 +1207,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case CONST_PTR_TO_MAP: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: return true; default: return false; @@ -1647,6 +1647,7 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; struct bpf_insn_access_aux info = {}; + bool valid; if (reg->smin_value < 0) { verbose(env, "R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", @@ -1654,15 +1655,28 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, return -EACCES; } - if (!bpf_sock_is_valid_access(off, size, t, &info)) { - verbose(env, "invalid bpf_sock access off=%d size=%d\n", - off, size); - return -EACCES; + switch (reg->type) { + case PTR_TO_SOCK_COMMON: + valid = bpf_sock_common_is_valid_access(off, size, t, &info); + break; + case PTR_TO_SOCKET: + valid = bpf_sock_is_valid_access(off, size, t, &info); + break; + default: + valid = false; } - env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; - return 0; + if (valid) { + env->insn_aux_data[insn_idx].ctx_field_size = + info.ctx_field_size; + return 0; + } + + verbose(env, "R%d invalid %s access off=%d size=%d\n", + regno, reg_type_str[reg->type], off, size); + + return -EACCES; } static bool __is_pointer_value(bool allow_ptr_leaks, @@ -1688,8 +1702,14 @@ static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { const struct bpf_reg_state *reg = reg_state(env, regno); - return reg->type == PTR_TO_CTX || - reg->type == PTR_TO_SOCKET; + return reg->type == PTR_TO_CTX; +} + +static bool is_sk_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = reg_state(env, regno); + + return type_is_sk_pointer(reg->type); } static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) @@ -1800,6 +1820,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_SOCKET: pointer_desc = "sock "; break; + case PTR_TO_SOCK_COMMON: + pointer_desc = "sock_common "; + break; default: break; } @@ -2003,11 +2026,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * PTR_TO_PACKET[_META,_END]. In the latter * case, we know the offset is zero. */ - if (reg_type == SCALAR_VALUE) + if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, value_regno); - else + } else { mark_reg_known_zero(env, regs, value_regno); + if (reg_type_may_be_null(reg_type)) + regs[value_regno].id = ++env->id_gen; + } regs[value_regno].type = reg_type; } @@ -2053,9 +2079,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_flow_keys_access(env, off, size); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); - } else if (reg->type == PTR_TO_SOCKET) { + } else if (type_is_sk_pointer(reg->type)) { if (t == BPF_WRITE) { - verbose(env, "cannot write into socket\n"); + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str[reg->type]); return -EACCES; } err = check_sock_access(env, insn_idx, regno, off, size, t); @@ -2102,7 +2129,8 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins if (is_ctx_reg(env, insn->dst_reg) || is_pkt_reg(env, insn->dst_reg) || - is_flow_key_reg(env, insn->dst_reg)) { + is_flow_key_reg(env, insn->dst_reg) || + is_sk_reg(env, insn->dst_reg)) { verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); @@ -2369,6 +2397,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_ctx_reg(env, reg, regno); if (err < 0) return err; + } else if (arg_type == ARG_PTR_TO_SOCK_COMMON) { + expected_type = PTR_TO_SOCK_COMMON; + /* Any sk pointer can be ARG_PTR_TO_SOCK_COMMON */ + if (!type_is_sk_pointer(type)) + goto err_type; } else if (arg_type == ARG_PTR_TO_SOCKET) { expected_type = PTR_TO_SOCKET; if (type != expected_type) @@ -2783,7 +2816,7 @@ static int release_reference(struct bpf_verifier_env *env, for (i = 0; i <= vstate->curframe; i++) release_reg_references(env, vstate->frame[i], meta->ptr_id); - return release_reference_state(env, meta->ptr_id); + return release_reference_state(cur_func(env), meta->ptr_id); } static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -3049,8 +3082,11 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } } else if (is_release_function(func_id)) { err = release_reference(env, &meta); - if (err) + if (err) { + verbose(env, "func %s#%d reference has not been acquired before\n", + func_id_name(func_id), func_id); return err; + } } regs = cur_regs(env); @@ -3099,12 +3135,19 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].id = ++env->id_gen; } } else if (fn->ret_type == RET_PTR_TO_SOCKET_OR_NULL) { - int id = acquire_reference_state(env, insn_idx); - if (id < 0) - return id; mark_reg_known_zero(env, regs, BPF_REG_0); regs[BPF_REG_0].type = PTR_TO_SOCKET_OR_NULL; - regs[BPF_REG_0].id = id; + if (is_acquire_function(func_id)) { + int id = acquire_reference_state(env, insn_idx); + + if (id < 0) + return id; + /* For release_reference() */ + regs[BPF_REG_0].id = id; + } else { + /* For mark_ptr_or_null_reg() */ + regs[BPF_REG_0].id = ++env->id_gen; + } } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -3364,6 +3407,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_PACKET_END: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -4597,6 +4642,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { reg->type = PTR_TO_SOCKET; + } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { + reg->type = PTR_TO_SOCK_COMMON; } if (is_null || !(reg_is_refcounted(reg) || reg_may_point_to_spin_lock(reg))) { @@ -4621,7 +4668,7 @@ static void mark_ptr_or_null_regs(struct bpf_verifier_state *vstate, u32 regno, int i, j; if (reg_is_refcounted_or_null(®s[regno]) && is_null) - __release_reference_state(state, id); + release_reference_state(state, id); for (i = 0; i < MAX_BPF_REG; i++) mark_ptr_or_null_reg(state, ®s[i], id, is_null); @@ -5790,6 +5837,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_FLOW_KEYS: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -6110,6 +6159,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_CTX: case PTR_TO_SOCKET: case PTR_TO_SOCKET_OR_NULL: + case PTR_TO_SOCK_COMMON: + case PTR_TO_SOCK_COMMON_OR_NULL: return false; default: return true; @@ -7112,6 +7163,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) convert_ctx_access = ops->convert_ctx_access; break; case PTR_TO_SOCKET: + case PTR_TO_SOCK_COMMON: convert_ctx_access = bpf_sock_convert_ctx_access; break; default: diff --git a/net/core/filter.c b/net/core/filter.c index 3a49f68eda10..401d2e0aebf8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1793,6 +1793,20 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk) +{ + sk = sk_to_full_sk(sk); + + return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_sk_fullsock_proto = { + .func = bpf_sk_fullsock, + .gpl_only = false, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { @@ -5406,6 +5420,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) switch (func_id) { case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_sk_fullsock: + return &bpf_sk_fullsock_proto; default: return sk_filter_func_proto(func_id, prog); } @@ -5477,6 +5493,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_uid_proto; case BPF_FUNC_fib_lookup: return &bpf_skb_fib_lookup_proto; + case BPF_FUNC_sk_fullsock: + return &bpf_sk_fullsock_proto; #ifdef CONFIG_XFRM case BPF_FUNC_skb_get_xfrm_state: return &bpf_skb_get_xfrm_state_proto; @@ -5764,6 +5782,11 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != sizeof(__u64)) return false; break; + case offsetof(struct __sk_buff, sk): + if (type == BPF_WRITE || size != sizeof(__u64)) + return false; + info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL; + break; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { @@ -5950,6 +5973,18 @@ static bool __sock_filter_check_size(int off, int size, return size == size_default; } +bool bpf_sock_common_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + switch (off) { + case bpf_ctx_range_till(struct bpf_sock, type, priority): + return false; + default: + return bpf_sock_is_valid_access(off, size, type, info); + } +} + bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { @@ -6748,6 +6783,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, off += offsetof(struct qdisc_skb_cb, pkt_len); *target_size = 4; *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off); + break; + + case offsetof(struct __sk_buff, sk): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, sk)); + break; } return insn - insn_buf; -- cgit v1.2.3 From aa65d6960a98fc15a96ce361b26e9fd55c9bccc5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 9 Feb 2019 23:22:21 -0800 Subject: bpf: Add state, dst_ip4, dst_ip6 and dst_port to bpf_sock This patch adds "state", "dst_ip4", "dst_ip6" and "dst_port" to the bpf_sock. The userspace has already been using "state", e.g. inet_diag (ss -t) and getsockopt(TCP_INFO). This patch also allows narrow load on the following existing fields: "family", "type", "protocol" and "src_port". Unlike IP address, the load offset is resticted to the first byte for them but it can be relaxed later if there is a use case. This patch also folds __sock_filter_check_size() into bpf_sock_is_valid_access() since it is not called by any where else. All bpf_sock checking is in one place. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 17 ++++----- net/core/filter.c | 99 +++++++++++++++++++++++++++++++++++++----------- 2 files changed, 85 insertions(+), 31 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5d79cba74ddc..d8f91777c5b6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2606,15 +2606,14 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; - __u32 src_ip4; /* Allows 1,2,4-byte read. - * Stored in network byte order. - */ - __u32 src_ip6[4]; /* Allows 1,2,4-byte read. - * Stored in network byte order. - */ - __u32 src_port; /* Allows 4-byte read. - * Stored in host byte order - */ + /* IP address also allows 1 and 2 bytes access */ + __u32 src_ip4; + __u32 src_ip6[4]; + __u32 src_port; /* host byte order */ + __u32 dst_port; /* network byte order */ + __u32 dst_ip4; + __u32 dst_ip6[4]; + __u32 state; }; struct bpf_sock_tuple { diff --git a/net/core/filter.c b/net/core/filter.c index 401d2e0aebf8..01bb64bf2b5e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5958,21 +5958,6 @@ full_access: return true; } -static bool __sock_filter_check_size(int off, int size, - struct bpf_insn_access_aux *info) -{ - const int size_default = sizeof(__u32); - - switch (off) { - case bpf_ctx_range(struct bpf_sock, src_ip4): - case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): - bpf_ctx_record_field_size(info, size_default); - return bpf_ctx_narrow_access_ok(off, size, size_default); - } - - return size == size_default; -} - bool bpf_sock_common_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) @@ -5988,13 +5973,29 @@ bool bpf_sock_common_is_valid_access(int off, int size, bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (off < 0 || off >= sizeof(struct bpf_sock)) return false; if (off % size != 0) return false; - if (!__sock_filter_check_size(off, size, info)) - return false; - return true; + + switch (off) { + case offsetof(struct bpf_sock, state): + case offsetof(struct bpf_sock, family): + case offsetof(struct bpf_sock, type): + case offsetof(struct bpf_sock, protocol): + case offsetof(struct bpf_sock, dst_port): + case offsetof(struct bpf_sock, src_port): + case bpf_ctx_range(struct bpf_sock, src_ip4): + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + case bpf_ctx_range(struct bpf_sock, dst_ip4): + case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); + } + + return size == size_default; } static bool sock_filter_is_valid_access(int off, int size, @@ -6838,24 +6839,32 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, break; case offsetof(struct bpf_sock, family): - BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2); - - *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg, - offsetof(struct sock, sk_family)); + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_family), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, + skc_family, + FIELD_SIZEOF(struct sock_common, + skc_family), + target_size)); break; case offsetof(struct bpf_sock, type): + BUILD_BUG_ON(HWEIGHT32(SK_FL_TYPE_MASK) != BITS_PER_BYTE * 2); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + *target_size = 2; break; case offsetof(struct bpf_sock, protocol): + BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE); *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, offsetof(struct sock, __sk_flags_offset)); *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); + *target_size = 1; break; case offsetof(struct bpf_sock, src_ip4): @@ -6867,6 +6876,15 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, target_size)); break; + case offsetof(struct bpf_sock, dst_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_daddr, + FIELD_SIZEOF(struct sock_common, + skc_daddr), + target_size)); + break; + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): #if IS_ENABLED(CONFIG_IPV6) off = si->off; @@ -6885,6 +6903,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, #endif break; + case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, dst_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, + skc_v6_daddr.s6_addr32[0], + FIELD_SIZEOF(struct sock_common, + skc_v6_daddr.s6_addr32[0]), + target_size) + off); +#else + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); + *target_size = 4; +#endif + break; + case offsetof(struct bpf_sock, src_port): *insn++ = BPF_LDX_MEM( BPF_FIELD_SIZEOF(struct sock_common, skc_num), @@ -6894,6 +6929,26 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, skc_num), target_size)); break; + + case offsetof(struct bpf_sock, dst_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_dport), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_dport, + FIELD_SIZEOF(struct sock_common, + skc_dport), + target_size)); + break; + + case offsetof(struct bpf_sock, state): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_state), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_state, + FIELD_SIZEOF(struct sock_common, + skc_state), + target_size)); + break; } return insn - insn_buf; -- cgit v1.2.3 From 655a51e536c09d15ffa3603b1b6fce2b45b85a1f Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Sat, 9 Feb 2019 23:22:24 -0800 Subject: bpf: Add struct bpf_tcp_sock and BPF_FUNC_tcp_sock This patch adds a helper function BPF_FUNC_tcp_sock and it is currently available for cg_skb and sched_(cls|act): struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk); int cg_skb_foo(struct __sk_buff *skb) { struct bpf_tcp_sock *tp; struct bpf_sock *sk; __u32 snd_cwnd; sk = skb->sk; if (!sk) return 1; tp = bpf_tcp_sock(sk); if (!tp) return 1; snd_cwnd = tp->snd_cwnd; /* ... */ return 1; } A 'struct bpf_tcp_sock' is also added to the uapi bpf.h to provide read-only access. bpf_tcp_sock has all the existing tcp_sock's fields that has already been exposed by the bpf_sock_ops. i.e. no new tcp_sock's fields are exposed in bpf.h. This helper returns a pointer to the tcp_sock. If it is not a tcp_sock or it cannot be traced back to a tcp_sock by sk_to_full_sk(), it returns NULL. Hence, the caller needs to check for NULL before accessing it. The current use case is to expose members from tcp_sock to allow a cg_skb_bpf_prog to provide per cgroup traffic policing/shaping. Acked-by: Alexei Starovoitov Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 30 ++++++++++++++++++ include/uapi/linux/bpf.h | 51 ++++++++++++++++++++++++++++++- kernel/bpf/verifier.c | 31 +++++++++++++++++-- net/core/filter.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 188 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a60463b45b54..7f58828755fd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -204,6 +204,7 @@ enum bpf_return_type { RET_PTR_TO_MAP_VALUE, /* returns a pointer to map elem value */ RET_PTR_TO_MAP_VALUE_OR_NULL, /* returns a pointer to map elem value or NULL */ RET_PTR_TO_SOCKET_OR_NULL, /* returns a pointer to a socket or NULL */ + RET_PTR_TO_TCP_SOCK_OR_NULL, /* returns a pointer to a tcp_sock or NULL */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs @@ -259,6 +260,8 @@ enum bpf_reg_type { PTR_TO_SOCKET_OR_NULL, /* reg points to struct bpf_sock or NULL */ PTR_TO_SOCK_COMMON, /* reg points to sock_common */ PTR_TO_SOCK_COMMON_OR_NULL, /* reg points to sock_common or NULL */ + PTR_TO_TCP_SOCK, /* reg points to struct tcp_sock */ + PTR_TO_TCP_SOCK_OR_NULL, /* reg points to struct tcp_sock or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -956,4 +959,31 @@ static inline u32 bpf_sock_convert_ctx_access(enum bpf_access_type type, } #endif +#ifdef CONFIG_INET +bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info); + +u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size); +#else +static inline bool bpf_tcp_sock_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + return false; +} + +static inline u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, + u32 *target_size) +{ + return 0; +} +#endif /* CONFIG_INET */ + #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d8f91777c5b6..25c8c0e62ecf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2337,6 +2337,15 @@ union bpf_attr { * Return * A **struct bpf_sock** pointer on success, or NULL in * case of failure. + * + * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * + * Return + * A **struct bpf_tcp_sock** pointer on success, or NULL in + * case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2434,7 +2443,8 @@ union bpf_attr { FN(rc_pointer_rel), \ FN(spin_lock), \ FN(spin_unlock), \ - FN(sk_fullsock), + FN(sk_fullsock), \ + FN(tcp_sock), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2616,6 +2626,45 @@ struct bpf_sock { __u32 state; }; +struct bpf_tcp_sock { + __u32 snd_cwnd; /* Sending congestion window */ + __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + __u32 rtt_min; + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + __u32 snd_una; /* First byte we want an ack for */ + __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 ecn_flags; /* ECN status bits. */ + __u32 rate_delivered; /* saved rate sample: packets delivered */ + __u32 rate_interval_us; /* saved rate sample: time elapsed */ + __u32 packets_out; /* Packets which are "in flight" */ + __u32 retrans_out; /* Retransmitted packets out */ + __u32 total_retrans; /* Total retransmits for entire connection */ + __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. + */ + __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn + * total number of data segments in. + */ + __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ + __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. + */ +}; + struct bpf_sock_tuple { union { struct { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index b755d55a3791..1b9496c41383 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -334,14 +334,16 @@ static bool type_is_pkt_pointer(enum bpf_reg_type type) static bool type_is_sk_pointer(enum bpf_reg_type type) { return type == PTR_TO_SOCKET || - type == PTR_TO_SOCK_COMMON; + type == PTR_TO_SOCK_COMMON || + type == PTR_TO_TCP_SOCK; } static bool reg_type_may_be_null(enum bpf_reg_type type) { return type == PTR_TO_MAP_VALUE_OR_NULL || type == PTR_TO_SOCKET_OR_NULL || - type == PTR_TO_SOCK_COMMON_OR_NULL; + type == PTR_TO_SOCK_COMMON_OR_NULL || + type == PTR_TO_TCP_SOCK_OR_NULL; } static bool type_is_refcounted(enum bpf_reg_type type) @@ -407,6 +409,8 @@ static const char * const reg_type_str[] = { [PTR_TO_SOCKET_OR_NULL] = "sock_or_null", [PTR_TO_SOCK_COMMON] = "sock_common", [PTR_TO_SOCK_COMMON_OR_NULL] = "sock_common_or_null", + [PTR_TO_TCP_SOCK] = "tcp_sock", + [PTR_TO_TCP_SOCK_OR_NULL] = "tcp_sock_or_null", }; static char slot_type_char[] = { @@ -1209,6 +1213,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: return true; default: return false; @@ -1662,6 +1668,9 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, case PTR_TO_SOCKET: valid = bpf_sock_is_valid_access(off, size, t, &info); break; + case PTR_TO_TCP_SOCK: + valid = bpf_tcp_sock_is_valid_access(off, size, t, &info); + break; default: valid = false; } @@ -1823,6 +1832,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, case PTR_TO_SOCK_COMMON: pointer_desc = "sock_common "; break; + case PTR_TO_TCP_SOCK: + pointer_desc = "tcp_sock "; + break; default: break; } @@ -3148,6 +3160,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn /* For mark_ptr_or_null_reg() */ regs[BPF_REG_0].id = ++env->id_gen; } + } else if (fn->ret_type == RET_PTR_TO_TCP_SOCK_OR_NULL) { + mark_reg_known_zero(env, regs, BPF_REG_0); + regs[BPF_REG_0].type = PTR_TO_TCP_SOCK_OR_NULL; + regs[BPF_REG_0].id = ++env->id_gen; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -3409,6 +3425,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: verbose(env, "R%d pointer arithmetic on %s prohibited\n", dst, reg_type_str[ptr_reg->type]); return -EACCES; @@ -4644,6 +4662,8 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, reg->type = PTR_TO_SOCKET; } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { reg->type = PTR_TO_SOCK_COMMON; + } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { + reg->type = PTR_TO_TCP_SOCK; } if (is_null || !(reg_is_refcounted(reg) || reg_may_point_to_spin_lock(reg))) { @@ -5839,6 +5859,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: /* Only valid matches are exact, which memcmp() above * would have accepted */ @@ -6161,6 +6183,8 @@ static bool reg_type_mismatch_ok(enum bpf_reg_type type) case PTR_TO_SOCKET_OR_NULL: case PTR_TO_SOCK_COMMON: case PTR_TO_SOCK_COMMON_OR_NULL: + case PTR_TO_TCP_SOCK: + case PTR_TO_TCP_SOCK_OR_NULL: return false; default: return true; @@ -7166,6 +7190,9 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) case PTR_TO_SOCK_COMMON: convert_ctx_access = bpf_sock_convert_ctx_access; break; + case PTR_TO_TCP_SOCK: + convert_ctx_access = bpf_tcp_sock_convert_ctx_access; + break; default: continue; } diff --git a/net/core/filter.c b/net/core/filter.c index c0d7b9ef279f..353735575204 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5315,6 +5315,79 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = { .arg5_type = ARG_ANYTHING, }; +bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked)) + return false; + + if (off % size != 0) + return false; + + switch (off) { + case offsetof(struct bpf_tcp_sock, bytes_received): + case offsetof(struct bpf_tcp_sock, bytes_acked): + return size == sizeof(__u64); + default: + return size == sizeof(__u32); + } +} + +u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + +#define BPF_TCP_SOCK_GET_COMMON(FIELD) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD) > \ + FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\ + si->dst_reg, si->src_reg, \ + offsetof(struct tcp_sock, FIELD)); \ + } while (0) + + CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock, + BPF_TCP_SOCK_GET_COMMON); + + if (insn > insn_buf) + return insn - insn_buf; + + switch (si->off) { + case offsetof(struct bpf_tcp_sock, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct tcp_sock, rtt_min) + + offsetof(struct minmax_sample, v)); + break; + } + + return insn - insn_buf; +} + +BPF_CALL_1(bpf_tcp_sock, struct sock *, sk) +{ + sk = sk_to_full_sk(sk); + + if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP) + return (unsigned long)sk; + + return (unsigned long)NULL; +} + +static const struct bpf_func_proto bpf_tcp_sock_proto = { + .func = bpf_tcp_sock, + .gpl_only = false, + .ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL, + .arg1_type = ARG_PTR_TO_SOCK_COMMON, +}; + #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5470,6 +5543,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_local_storage_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; +#ifdef CONFIG_INET + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; +#endif default: return sk_filter_func_proto(func_id, prog); } @@ -5560,6 +5637,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_lookup_udp_proto; case BPF_FUNC_sk_release: return &bpf_sk_release_proto; + case BPF_FUNC_tcp_sock: + return &bpf_tcp_sock_proto; #endif default: return bpf_base_func_proto(func_id); -- cgit v1.2.3 From d9a9ea94f748f47b1d75c6c5e33edcf74476c445 Mon Sep 17 00:00:00 2001 From: Chad Austin Date: Mon, 7 Jan 2019 16:53:17 -0800 Subject: fuse: support clients that don't implement 'opendir' Allow filesystems to return ENOSYS from opendir, preventing the kernel from sending opendir and releasedir messages in the future. This avoids userspace transitions when filesystems don't need to keep track of state per directory handle. A new capability flag, FUSE_NO_OPENDIR_SUPPORT, parallels FUSE_NO_OPEN_SUPPORT, indicating the new semantics for returning ENOSYS from opendir. Signed-off-by: Chad Austin Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 11 +++++++---- fs/fuse/fuse_i.h | 3 +++ fs/fuse/inode.c | 3 ++- include/uapi/linux/fuse.h | 7 ++++++- 4 files changed, 18 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 8ee0446a8322..cc6ffd23b80f 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -90,7 +90,7 @@ static void fuse_file_put(struct fuse_file *ff, bool sync, bool isdir) if (refcount_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; - if (ff->fc->no_open && !isdir) { + if (isdir ? ff->fc->no_opendir : ff->fc->no_open) { /* * Drop the release request when client does not * implement 'open' @@ -125,7 +125,7 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, ff->fh = 0; ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */ - if (!fc->no_open || isdir) { + if (isdir ? !fc->no_opendir : !fc->no_open) { struct fuse_open_out outarg; int err; @@ -134,11 +134,14 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, ff->fh = outarg.fh; ff->open_flags = outarg.open_flags; - } else if (err != -ENOSYS || isdir) { + } else if (err != -ENOSYS) { fuse_file_free(ff); return err; } else { - fc->no_open = 1; + if (isdir) + fc->no_opendir = 1; + else + fc->no_open = 1; } } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 033e30af519f..0920c0c032a0 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -630,6 +630,9 @@ struct fuse_conn { /** Is open/release not implemented by fs? */ unsigned no_open:1; + /** Is opendir/releasedir not implemented by fs? */ + unsigned no_opendir:1; + /** Is fsync not implemented by fs? */ unsigned no_fsync:1; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2bbb7c59d6da..1b3f3b67d9f0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -972,7 +972,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO | FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT | FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | - FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS; + FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | + FUSE_NO_OPENDIR_SUPPORT; req->in.h.opcode = FUSE_INIT; req->in.numargs = 1; req->in.args[0].size = sizeof(*arg); diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index b4967d48bfda..2ac598614a8f 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -122,6 +122,9 @@ * - add FOPEN_CACHE_DIR * - add FUSE_MAX_PAGES, add max_pages to init_out * - add FUSE_CACHE_SYMLINKS + * + * 7.29 + * - add FUSE_NO_OPENDIR_SUPPORT flag */ #ifndef _LINUX_FUSE_H @@ -157,7 +160,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 28 +#define FUSE_KERNEL_MINOR_VERSION 29 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -259,6 +262,7 @@ struct fuse_file_lock { * FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED * FUSE_MAX_PAGES: init_out.max_pages contains the max number of req pages * FUSE_CACHE_SYMLINKS: cache READLINK responses + * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -284,6 +288,7 @@ struct fuse_file_lock { #define FUSE_ABORT_ERROR (1 << 21) #define FUSE_MAX_PAGES (1 << 22) #define FUSE_CACHE_SYMLINKS (1 << 23) +#define FUSE_NO_OPENDIR_SUPPORT (1 << 24) /** * CUSE INIT request/reply flags -- cgit v1.2.3 From 7fd8afa8933a095a97995885740999f174e61b60 Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Mon, 11 Feb 2019 15:25:29 +0100 Subject: net: phy: Add generic support for 2.5GBaseT and 5GBaseT The 802.3bz specification, based on previous by the NBASET alliance, defines the 2.5GBaseT and 5GBaseT link modes for ethernet traffic on cat5e, cat6 and cat7 cables. These mode integrate with the already defined C45 MDIO PMA/PMD registers set that added 10G support, by defining some previously reserved bits, and adding a new register (2.5G/5G Extended abilities). This commit adds the required definitions in include/uapi/linux/mdio.h to support these modes, and detect when a link-partner advertises them. It also adds support for these mode in the generic C45 PHY infrastructure. Signed-off-by: Maxime Chevallier Reviewed-by: Heiner Kallweit Reviewed-by: Andrew Lunn Signed-off-by: David S. Miller --- drivers/net/phy/phy-c45.c | 37 +++++++++++++++++++++++++++++++++++++ include/uapi/linux/mdio.h | 16 ++++++++++++++++ 2 files changed, 53 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/net/phy/phy-c45.c b/drivers/net/phy/phy-c45.c index 6f028de4dae1..7af5fa81daf6 100644 --- a/drivers/net/phy/phy-c45.c +++ b/drivers/net/phy/phy-c45.c @@ -47,6 +47,16 @@ int genphy_c45_pma_setup_forced(struct phy_device *phydev) /* Assume 1000base-T */ ctrl2 |= MDIO_PMA_CTRL2_1000BT; break; + case SPEED_2500: + ctrl1 |= MDIO_CTRL1_SPEED2_5G; + /* Assume 2.5Gbase-T */ + ctrl2 |= MDIO_PMA_CTRL2_2_5GBT; + break; + case SPEED_5000: + ctrl1 |= MDIO_CTRL1_SPEED5G; + /* Assume 5Gbase-T */ + ctrl2 |= MDIO_PMA_CTRL2_5GBT; + break; case SPEED_10000: ctrl1 |= MDIO_CTRL1_SPEED10G; /* Assume 10Gbase-T */ @@ -194,6 +204,12 @@ int genphy_c45_read_lpa(struct phy_device *phydev) if (val < 0) return val; + if (val & MDIO_AN_10GBT_STAT_LP2_5G) + linkmode_set_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, + phydev->lp_advertising); + if (val & MDIO_AN_10GBT_STAT_LP5G) + linkmode_set_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT, + phydev->lp_advertising); if (val & MDIO_AN_10GBT_STAT_LP10G) linkmode_set_bit(ETHTOOL_LINK_MODE_10000baseT_Full_BIT, phydev->lp_advertising); @@ -224,6 +240,12 @@ int genphy_c45_read_pma(struct phy_device *phydev) case MDIO_PMA_CTRL1_SPEED1000: phydev->speed = SPEED_1000; break; + case MDIO_CTRL1_SPEED2_5G: + phydev->speed = SPEED_2500; + break; + case MDIO_CTRL1_SPEED5G: + phydev->speed = SPEED_5000; + break; case MDIO_CTRL1_SPEED10G: phydev->speed = SPEED_10000; break; @@ -339,6 +361,21 @@ int genphy_c45_pma_read_abilities(struct phy_device *phydev) linkmode_mod_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, phydev->supported, val & MDIO_PMA_EXTABLE_10BT); + + if (val & MDIO_PMA_EXTABLE_NBT) { + val = phy_read_mmd(phydev, MDIO_MMD_PMAPMD, + MDIO_PMA_NG_EXTABLE); + if (val < 0) + return val; + + linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, + phydev->supported, + val & MDIO_PMA_NG_EXTABLE_2_5GBT); + + linkmode_mod_bit(ETHTOOL_LINK_MODE_5000baseT_Full_BIT, + phydev->supported, + val & MDIO_PMA_NG_EXTABLE_5GBT); + } } return 0; diff --git a/include/uapi/linux/mdio.h b/include/uapi/linux/mdio.h index 0e012b168e4d..0a552061ff1c 100644 --- a/include/uapi/linux/mdio.h +++ b/include/uapi/linux/mdio.h @@ -45,6 +45,7 @@ #define MDIO_AN_ADVERTISE 16 /* AN advertising (base page) */ #define MDIO_AN_LPA 19 /* AN LP abilities (base page) */ #define MDIO_PCS_EEE_ABLE 20 /* EEE Capability register */ +#define MDIO_PMA_NG_EXTABLE 21 /* 2.5G/5G PMA/PMD extended ability */ #define MDIO_PCS_EEE_WK_ERR 22 /* EEE wake error counter */ #define MDIO_PHYXS_LNSTAT 24 /* PHY XGXS lane state */ #define MDIO_AN_EEE_ADV 60 /* EEE advertisement */ @@ -92,6 +93,10 @@ #define MDIO_CTRL1_SPEED10G (MDIO_CTRL1_SPEEDSELEXT | 0x00) /* 10PASS-TS/2BASE-TL */ #define MDIO_CTRL1_SPEED10P2B (MDIO_CTRL1_SPEEDSELEXT | 0x04) +/* 2.5 Gb/s */ +#define MDIO_CTRL1_SPEED2_5G (MDIO_CTRL1_SPEEDSELEXT | 0x18) +/* 5 Gb/s */ +#define MDIO_CTRL1_SPEED5G (MDIO_CTRL1_SPEEDSELEXT | 0x1c) /* Status register 1. */ #define MDIO_STAT1_LPOWERABLE 0x0002 /* Low-power ability */ @@ -145,6 +150,8 @@ #define MDIO_PMA_CTRL2_1000BKX 0x000d /* 1000BASE-KX type */ #define MDIO_PMA_CTRL2_100BTX 0x000e /* 100BASE-TX type */ #define MDIO_PMA_CTRL2_10BT 0x000f /* 10BASE-T type */ +#define MDIO_PMA_CTRL2_2_5GBT 0x0030 /* 2.5GBaseT type */ +#define MDIO_PMA_CTRL2_5GBT 0x0031 /* 5GBaseT type */ #define MDIO_PCS_CTRL2_TYPE 0x0003 /* PCS type selection */ #define MDIO_PCS_CTRL2_10GBR 0x0000 /* 10GBASE-R type */ #define MDIO_PCS_CTRL2_10GBX 0x0001 /* 10GBASE-X type */ @@ -198,6 +205,7 @@ #define MDIO_PMA_EXTABLE_1000BKX 0x0040 /* 1000BASE-KX ability */ #define MDIO_PMA_EXTABLE_100BTX 0x0080 /* 100BASE-TX ability */ #define MDIO_PMA_EXTABLE_10BT 0x0100 /* 10BASE-T ability */ +#define MDIO_PMA_EXTABLE_NBT 0x4000 /* 2.5/5GBASE-T ability */ /* PHY XGXS lane state register. */ #define MDIO_PHYXS_LNSTAT_SYNC0 0x0001 @@ -234,9 +242,13 @@ #define MDIO_PCS_10GBRT_STAT2_BER 0x3f00 /* AN 10GBASE-T control register. */ +#define MDIO_AN_10GBT_CTRL_ADV2_5G 0x0080 /* Advertise 2.5GBASE-T */ +#define MDIO_AN_10GBT_CTRL_ADV5G 0x0100 /* Advertise 5GBASE-T */ #define MDIO_AN_10GBT_CTRL_ADV10G 0x1000 /* Advertise 10GBASE-T */ /* AN 10GBASE-T status register. */ +#define MDIO_AN_10GBT_STAT_LP2_5G 0x0020 /* LP is 2.5GBT capable */ +#define MDIO_AN_10GBT_STAT_LP5G 0x0040 /* LP is 5GBT capable */ #define MDIO_AN_10GBT_STAT_LPTRR 0x0200 /* LP training reset req. */ #define MDIO_AN_10GBT_STAT_LPLTABLE 0x0400 /* LP loop timing ability */ #define MDIO_AN_10GBT_STAT_LP10G 0x0800 /* LP is 10GBT capable */ @@ -265,6 +277,10 @@ #define MDIO_EEE_10GKX4 0x0020 /* 10G KX4 EEE cap */ #define MDIO_EEE_10GKR 0x0040 /* 10G KR EEE cap */ +/* 2.5G/5G Extended abilities register. */ +#define MDIO_PMA_NG_EXTABLE_2_5GBT 0x0001 /* 2.5GBASET ability */ +#define MDIO_PMA_NG_EXTABLE_5GBT 0x0002 /* 5GBASET ability */ + /* LASI RX_ALARM control/status registers. */ #define MDIO_PMA_LASI_RX_PHYXSLFLT 0x0001 /* PHY XS RX local fault */ #define MDIO_PMA_LASI_RX_PCSLFLT 0x0008 /* PCS RX local fault */ -- cgit v1.2.3 From 3e0bd37ce0e4a574df6d87a901e13bcb46e10301 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Wed, 13 Feb 2019 11:53:35 -0800 Subject: bpf: add plumbing for BPF_LWT_ENCAP_IP in bpf_lwt_push_encap This patch adds all needed plumbing in preparation to allowing bpf programs to do IP encapping via bpf_lwt_push_encap. Actual implementation is added in the next patch in the patchset. Of note: - bpf_lwt_push_encap can now be called from BPF_PROG_TYPE_LWT_XMIT prog types in addition to BPF_PROG_TYPE_LWT_IN; - if the skb being encapped has GSO set, encapsulation is limited to IPIP/IP+GRE/IP+GUE (both IPv4 and IPv6); - as route lookups are different for ingress vs egress, the single external bpf_lwt_push_encap BPF helper is routed internally to either bpf_lwt_in_push_encap or bpf_lwt_xmit_push_encap BPF_CALLs, depending on prog type. v8 changes: fixed a typo. Signed-off-by: Peter Oskolkov Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 26 ++++++++++++++++++++++++-- net/core/filter.c | 48 +++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 67 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 25c8c0e62ecf..bcdd2474eee7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2016,6 +2016,19 @@ union bpf_attr { * Only works if *skb* contains an IPv6 packet. Insert a * Segment Routing Header (**struct ipv6_sr_hdr**) inside * the IPv6 header. + * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to LWT_BPF_MAX_HEADROOM total + * bytes in all prepended headers. Please note that + * if skb_is_gso(skb) is true, no more than two headers + * can be prepended, and the inner header, if present, + * should be either GRE or UDP/GUE. + * + * BPF_LWT_ENCAP_SEG6*** types can be called by bpf programs of + * type BPF_PROG_TYPE_LWT_IN; BPF_LWT_ENCAP_IP type can be called + * by bpf programs of types BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT. * * A call to this helper is susceptible to change the underlaying * packet buffer. Therefore, at load time, all checks on pointers @@ -2517,7 +2530,8 @@ enum bpf_hdr_start_off { /* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_SEG6, - BPF_LWT_ENCAP_SEG6_INLINE + BPF_LWT_ENCAP_SEG6_INLINE, + BPF_LWT_ENCAP_IP, }; #define __bpf_md_ptr(type, name) \ @@ -2606,7 +2620,15 @@ enum bpf_ret_code { BPF_DROP = 2, /* 3-6 reserved */ BPF_REDIRECT = 7, - /* >127 are reserved for prog type specific return codes */ + /* >127 are reserved for prog type specific return codes. + * + * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been + * changed and should be routed based on its new L3 header. + * (This is an L3 redirect, as opposed to L2 redirect + * represented by BPF_REDIRECT above). + */ + BPF_LWT_REROUTE = 128, }; struct bpf_sock { diff --git a/net/core/filter.c b/net/core/filter.c index 353735575204..12c88c21b6b8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4815,7 +4815,15 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len } #endif /* CONFIG_IPV6_SEG6_BPF */ -BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) +static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, + bool ingress) +{ + return -EINVAL; /* Implemented in the next patch. */ +} +#endif + +BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, u32, len) { switch (type) { @@ -4823,14 +4831,41 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr, case BPF_LWT_ENCAP_SEG6: case BPF_LWT_ENCAP_SEG6_INLINE: return bpf_push_seg6_encap(skb, type, hdr, len); +#endif +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) + case BPF_LWT_ENCAP_IP: + return bpf_push_ip_encap(skb, hdr, len, true /* ingress */); +#endif + default: + return -EINVAL; + } +} + +BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type, + void *, hdr, u32, len) +{ + switch (type) { +#if IS_ENABLED(CONFIG_LWTUNNEL_BPF) + case BPF_LWT_ENCAP_IP: + return bpf_push_ip_encap(skb, hdr, len, false /* egress */); #endif default: return -EINVAL; } } -static const struct bpf_func_proto bpf_lwt_push_encap_proto = { - .func = bpf_lwt_push_encap, +static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = { + .func = bpf_lwt_in_push_encap, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_MEM, + .arg4_type = ARG_CONST_SIZE +}; + +static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = { + .func = bpf_lwt_xmit_push_encap, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_PTR_TO_CTX, @@ -5417,7 +5452,8 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_lwt_seg6_adjust_srh || func == bpf_lwt_seg6_action || #endif - func == bpf_lwt_push_encap) + func == bpf_lwt_in_push_encap || + func == bpf_lwt_xmit_push_encap) return true; return false; @@ -5815,7 +5851,7 @@ lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_lwt_push_encap: - return &bpf_lwt_push_encap_proto; + return &bpf_lwt_in_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } @@ -5851,6 +5887,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_l4_csum_replace_proto; case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; + case BPF_FUNC_lwt_push_encap: + return &bpf_lwt_xmit_push_encap_proto; default: return lwt_out_func_proto(func_id, prog); } -- cgit v1.2.3 From ca5e9aba753ed15d173c7a7b88e4d402b7ca8121 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 12 Feb 2019 19:26:03 -0800 Subject: time: Add time_types.h sys/time.h is the mandated include for many time related defines. However, linux/time.h overlaps sys/time.h significantly and this makes including both from userspace or one from the other impossible. This also means that userspace can get away with including sys/time.h whenever it needs linux/time.h and this is what's been happening in the user world usually. But, we have new data types that we plan to use in the uapi time interfaces also defined in the linux/time.h. But, we are unable to use these types when sys/time.h is included. Hence, move the new types to a new header, time_types.h. We intend to eventually have all the uapi defines that the kernel uses defined in this header. Note that the plan is to replace uapi interfaces with timeval to use __kernel_old_timeval, timespec to use __kernel_old_timespec etc. Reported-by: Ran Rozenstein Fixes: 9718475e6908 ("socket: Add SO_TIMESTAMPING_NEW") Signed-off-by: Deepa Dinamani Signed-off-by: David S. Miller --- include/uapi/linux/time.h | 36 +----------------------------------- include/uapi/linux/time_types.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 35 deletions(-) create mode 100644 include/uapi/linux/time_types.h (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/time.h b/include/uapi/linux/time.h index b8ad1b86b942..958932effc5e 100644 --- a/include/uapi/linux/time.h +++ b/include/uapi/linux/time.h @@ -3,7 +3,7 @@ #define _UAPI_LINUX_TIME_H #include - +#include #ifndef _STRUCT_TIMESPEC #define _STRUCT_TIMESPEC @@ -23,7 +23,6 @@ struct timezone { int tz_dsttime; /* type of dst correction */ }; - /* * Names of the interval timers, and structure * defining a timer setting: @@ -42,39 +41,6 @@ struct itimerval { struct timeval it_value; /* current value */ }; -#ifndef __kernel_timespec -struct __kernel_timespec { - __kernel_time64_t tv_sec; /* seconds */ - long long tv_nsec; /* nanoseconds */ -}; -#endif - -#ifndef __kernel_itimerspec -struct __kernel_itimerspec { - struct __kernel_timespec it_interval; /* timer period */ - struct __kernel_timespec it_value; /* timer expiration */ -}; -#endif - -/* - * legacy timeval structure, only embedded in structures that - * traditionally used 'timeval' to pass time intervals (not absolute - * times). Do not add new users. If user space fails to compile - * here, this is probably because it is not y2038 safe and needs to - * be changed to use another interface. - */ -#ifndef __kernel_old_timeval -struct __kernel_old_timeval { - __kernel_long_t tv_sec; - __kernel_long_t tv_usec; -}; -#endif - -struct __kernel_sock_timeval { - __s64 tv_sec; - __s64 tv_usec; -}; - /* * The IDs of the various system clocks (for POSIX.1b interval timers): */ diff --git a/include/uapi/linux/time_types.h b/include/uapi/linux/time_types.h new file mode 100644 index 000000000000..459070c61d47 --- /dev/null +++ b/include/uapi/linux/time_types.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_TIME_TYPES_H +#define _UAPI_LINUX_TIME_TYPES_H + +#include + +#ifndef __kernel_timespec +struct __kernel_timespec { + __kernel_time64_t tv_sec; /* seconds */ + long long tv_nsec; /* nanoseconds */ +}; +#endif + +#ifndef __kernel_itimerspec +struct __kernel_itimerspec { + struct __kernel_timespec it_interval; /* timer period */ + struct __kernel_timespec it_value; /* timer expiration */ +}; +#endif + +/* + * legacy timeval structure, only embedded in structures that + * traditionally used 'timeval' to pass time intervals (not absolute + * times). Do not add new users. If user space fails to compile + * here, this is probably because it is not y2038 safe and needs to + * be changed to use another interface. + */ +#ifndef __kernel_old_timeval +struct __kernel_old_timeval { + __kernel_long_t tv_sec; + __kernel_long_t tv_usec; +}; +#endif + +struct __kernel_sock_timeval { + __s64 tv_sec; + __s64 tv_usec; +}; + +#endif /* _UAPI_LINUX_TIME_TYPES_H */ -- cgit v1.2.3 From 460a2db0273efe15488a3e4b88da2678eb403178 Mon Sep 17 00:00:00 2001 From: Deepa Dinamani Date: Tue, 12 Feb 2019 19:26:04 -0800 Subject: errqueue.h: Include time_types.h Now that we have a separate header for struct __kernel_timespec, include it directly without relying on userspace to do it. Reported-by: Ran Rozenstein Signed-off-by: Deepa Dinamani Signed-off-by: David S. Miller --- include/uapi/linux/errqueue.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/errqueue.h b/include/uapi/linux/errqueue.h index d955b9e32288..28491dac074b 100644 --- a/include/uapi/linux/errqueue.h +++ b/include/uapi/linux/errqueue.h @@ -3,6 +3,7 @@ #define _UAPI_LINUX_ERRQUEUE_H #include +#include struct sock_extended_err { __u32 ee_errno; -- cgit v1.2.3 From 76726ccb7f461c83040e7082cf95fe1dea2afd1f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 14 Feb 2019 13:40:44 -0800 Subject: devlink: add flash update command Add devlink flash update command. Advanced NICs have firmware stored in flash and often cryptographically secured. Updating that flash is handled by management firmware. Ethtool has a flash update command which served us well, however, it has two shortcomings: - it takes rtnl_lock unnecessarily - really flash update has nothing to do with networking, so using a networking device as a handle is suboptimal, which leads us to the second one: - it requires a functioning netdev - in case device enters an error state and can't spawn a netdev (e.g. communication with the device fails) there is no netdev to use as a handle for flashing. Devlink already has the ability to report the firmware versions, now with the ability to update the firmware/flash we will be able to recover devices in bad state. To enable updates of sub-components of the FW allow passing component name. This name should correspond to one of the versions reported in devlink info. v1: - replace target id with component name (Jiri). Signed-off-by: Jakub Kicinski Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 3 +++ include/uapi/linux/devlink.h | 6 ++++++ net/core/devlink.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index c6d88759b7d5..18d7a051f412 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -521,6 +521,9 @@ struct devlink_ops { struct netlink_ext_ack *extack); int (*info_get)(struct devlink *devlink, struct devlink_info_req *req, struct netlink_ext_ack *extack); + int (*flash_update)(struct devlink *devlink, const char *file_name, + const char *component, + struct netlink_ext_ack *extack); }; static inline void *devlink_priv(struct devlink *devlink) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 72d9f7c89190..53de8802a000 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -103,6 +103,8 @@ enum devlink_command { DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, DEVLINK_CMD_HEALTH_REPORTER_DUMP_CLEAR, + DEVLINK_CMD_FLASH_UPDATE, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 @@ -326,6 +328,10 @@ enum devlink_attr { DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, /* u64 */ DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, /* u64 */ DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, /* u8 */ + + DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME, /* string */ + DEVLINK_ATTR_FLASH_UPDATE_COMPONENT, /* string */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 1d7502a5a651..4a1ad0b13e52 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -2662,6 +2662,27 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) return devlink->ops->reload(devlink, info->extack); } +static int devlink_nl_cmd_flash_update(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const char *file_name, *component; + struct nlattr *nla_component; + + if (!devlink->ops->flash_update) + return -EOPNOTSUPP; + + if (!info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]) + return -EINVAL; + file_name = nla_data(info->attrs[DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME]); + + nla_component = info->attrs[DEVLINK_ATTR_FLASH_UPDATE_COMPONENT]; + component = nla_component ? nla_data(nla_component) : NULL; + + return devlink->ops->flash_update(devlink, file_name, component, + info->extack); +} + static const struct devlink_param devlink_param_generic[] = { { .id = DEVLINK_PARAM_GENERIC_ID_INT_ERR_RESET, @@ -4883,6 +4904,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_HEALTH_REPORTER_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD] = { .type = NLA_U64 }, [DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER] = { .type = NLA_U8 }, + [DEVLINK_ATTR_FLASH_UPDATE_FILE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_FLASH_UPDATE_COMPONENT] = { .type = NLA_NUL_STRING }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -5171,6 +5194,13 @@ static const struct genl_ops devlink_nl_ops[] = { .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | DEVLINK_NL_FLAG_NO_LOCK, }, + { + .cmd = DEVLINK_CMD_FLASH_UPDATE, + .doit = devlink_nl_cmd_flash_update, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { -- cgit v1.2.3 From 746c9398f5ac2b3f5730da4ed09e99ef4bb50b4a Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 8 Feb 2019 01:02:55 -0500 Subject: arch: move common mmap flags to linux/mman.h Now that we have 3 mmap flags shared by all architectures, let's move them into the common header. This will help discourage future architectures from duplicating code. Signed-off-by: Michael S. Tsirkin Signed-off-by: Arnd Bergmann --- arch/alpha/include/uapi/asm/mman.h | 4 +--- arch/mips/include/uapi/asm/mman.h | 4 +--- arch/parisc/include/uapi/asm/mman.h | 4 +--- arch/xtensa/include/uapi/asm/mman.h | 4 +--- include/uapi/asm-generic/mman-common.h | 4 +--- include/uapi/linux/mman.h | 4 ++++ 6 files changed, 9 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index f9d4e6b6d4bd..ac23379b7a87 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -10,9 +10,7 @@ #define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ #define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ -#define MAP_SHARED 0x01 /* Share changes */ -#define MAP_PRIVATE 0x02 /* Changes are private */ -#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ +/* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x0f /* Mask for type of mapping (OSF/1 is _wrong_) */ #define MAP_FIXED 0x100 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x10 /* don't use a file */ diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 3035ca499cd8..c2b40969eb1f 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -27,9 +27,7 @@ /* * Flags for mmap */ -#define MAP_SHARED 0x001 /* Share changes */ -#define MAP_PRIVATE 0x002 /* Changes are private */ -#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */ +/* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x00f /* Mask for type of mapping */ #define MAP_FIXED 0x010 /* Interpret addr exactly */ diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 870fbf8c7088..c98162f494db 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -10,9 +10,7 @@ #define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ #define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ -#define MAP_SHARED 0x01 /* Share changes */ -#define MAP_PRIVATE 0x02 /* Changes are private */ -#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ +/* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x2b /* Mask for type of mapping, includes bits 0x08 and 0x20 */ #define MAP_FIXED 0x04 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x10 /* don't use a file */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 58f29a9d895d..be726062412b 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -34,9 +34,7 @@ /* * Flags for mmap */ -#define MAP_SHARED 0x001 /* Share changes */ -#define MAP_PRIVATE 0x002 /* Changes are private */ -#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */ +/* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x00f /* Mask for type of mapping */ #define MAP_FIXED 0x010 /* Interpret addr exactly */ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index e7ee32861d51..abd238d0f7a4 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -15,9 +15,7 @@ #define PROT_GROWSDOWN 0x01000000 /* mprotect flag: extend change to start of growsdown vma */ #define PROT_GROWSUP 0x02000000 /* mprotect flag: extend change to end of growsup vma */ -#define MAP_SHARED 0x01 /* Share changes */ -#define MAP_PRIVATE 0x02 /* Changes are private */ -#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ +/* 0x01 - 0x03 are defined in linux/mman.h */ #define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_ANONYMOUS 0x20 /* don't use a file */ diff --git a/include/uapi/linux/mman.h b/include/uapi/linux/mman.h index d0f515d53299..fc1a64c3447b 100644 --- a/include/uapi/linux/mman.h +++ b/include/uapi/linux/mman.h @@ -12,6 +12,10 @@ #define OVERCOMMIT_ALWAYS 1 #define OVERCOMMIT_NEVER 2 +#define MAP_SHARED 0x01 /* Share changes */ +#define MAP_PRIVATE 0x02 /* Changes are private */ +#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */ + /* * Huge page size encoding when MAP_HUGETLB is specified, and a huge page * size other than the default is desired. See hugetlb_encode.h. -- cgit v1.2.3 From a7fe4ca72b1fe7877de5672640d0b4e023d0fdca Mon Sep 17 00:00:00 2001 From: Vivek Kasireddy Date: Thu, 7 Feb 2019 22:18:43 -0500 Subject: media: v4l: Add 32-bit packed YUV formats The formats added in this patch include: V4L2_PIX_FMT_AYUV32 V4L2_PIX_FMT_XYUV32 V4L2_PIX_FMT_VUYA32 V4L2_PIX_FMT_VUYX32 These formats enable the trasmission of alpha channel data to other drivers and userspace applications in addition to YUV data. For example, buffers generated by drivers in one of these formats can be used by the Weston compositor to display as a texture or flipped directly onto the overlay planes with the help of a DRM driver. Signed-off-by: Vivek Kasireddy Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst | 170 ++++++++++++++++++++- drivers/media/v4l2-core/v4l2-ioctl.c | 4 + include/uapi/linux/videodev2.h | 4 + 3 files changed, 177 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst b/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst index f53e8f57a003..7fcee1c11ac4 100644 --- a/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst +++ b/Documentation/media/uapi/v4l/pixfmt-packed-yuv.rst @@ -190,6 +190,170 @@ component of each pixel in one 16 or 32 bit word. - Cr\ :sub:`2` - Cr\ :sub:`1` - Cr\ :sub:`0` + - + * .. _V4L2-PIX-FMT-AYUV32: + + - ``V4L2_PIX_FMT_AYUV32`` + - 'AYUV' + + - a\ :sub:`7` + - a\ :sub:`6` + - a\ :sub:`5` + - a\ :sub:`4` + - a\ :sub:`3` + - a\ :sub:`2` + - a\ :sub:`1` + - a\ :sub:`0` + + - Y'\ :sub:`7` + - Y'\ :sub:`6` + - Y'\ :sub:`5` + - Y'\ :sub:`4` + - Y'\ :sub:`3` + - Y'\ :sub:`2` + - Y'\ :sub:`1` + - Y'\ :sub:`0` + + - Cb\ :sub:`7` + - Cb\ :sub:`6` + - Cb\ :sub:`5` + - Cb\ :sub:`4` + - Cb\ :sub:`3` + - Cb\ :sub:`2` + - Cb\ :sub:`1` + - Cb\ :sub:`0` + + - Cr\ :sub:`7` + - Cr\ :sub:`6` + - Cr\ :sub:`5` + - Cr\ :sub:`4` + - Cr\ :sub:`3` + - Cr\ :sub:`2` + - Cr\ :sub:`1` + - Cr\ :sub:`0` + - + * .. _V4L2-PIX-FMT-XYUV32: + + - ``V4L2_PIX_FMT_XYUV32`` + - 'XYUV' + + - + - + - + - + - + - + - + - + + - Y'\ :sub:`7` + - Y'\ :sub:`6` + - Y'\ :sub:`5` + - Y'\ :sub:`4` + - Y'\ :sub:`3` + - Y'\ :sub:`2` + - Y'\ :sub:`1` + - Y'\ :sub:`0` + + - Cb\ :sub:`7` + - Cb\ :sub:`6` + - Cb\ :sub:`5` + - Cb\ :sub:`4` + - Cb\ :sub:`3` + - Cb\ :sub:`2` + - Cb\ :sub:`1` + - Cb\ :sub:`0` + + - Cr\ :sub:`7` + - Cr\ :sub:`6` + - Cr\ :sub:`5` + - Cr\ :sub:`4` + - Cr\ :sub:`3` + - Cr\ :sub:`2` + - Cr\ :sub:`1` + - Cr\ :sub:`0` + - + * .. _V4L2-PIX-FMT-VUYA32: + + - ``V4L2_PIX_FMT_VUYA32`` + - 'VUYA' + + - Cr\ :sub:`7` + - Cr\ :sub:`6` + - Cr\ :sub:`5` + - Cr\ :sub:`4` + - Cr\ :sub:`3` + - Cr\ :sub:`2` + - Cr\ :sub:`1` + - Cr\ :sub:`0` + + - Cb\ :sub:`7` + - Cb\ :sub:`6` + - Cb\ :sub:`5` + - Cb\ :sub:`4` + - Cb\ :sub:`3` + - Cb\ :sub:`2` + - Cb\ :sub:`1` + - Cb\ :sub:`0` + + - Y'\ :sub:`7` + - Y'\ :sub:`6` + - Y'\ :sub:`5` + - Y'\ :sub:`4` + - Y'\ :sub:`3` + - Y'\ :sub:`2` + - Y'\ :sub:`1` + - Y'\ :sub:`0` + + - a\ :sub:`7` + - a\ :sub:`6` + - a\ :sub:`5` + - a\ :sub:`4` + - a\ :sub:`3` + - a\ :sub:`2` + - a\ :sub:`1` + - a\ :sub:`0` + - + * .. _V4L2-PIX-FMT-VUYX32: + + - ``V4L2_PIX_FMT_VUYX32`` + - 'VUYX' + + - Cr\ :sub:`7` + - Cr\ :sub:`6` + - Cr\ :sub:`5` + - Cr\ :sub:`4` + - Cr\ :sub:`3` + - Cr\ :sub:`2` + - Cr\ :sub:`1` + - Cr\ :sub:`0` + + - Cb\ :sub:`7` + - Cb\ :sub:`6` + - Cb\ :sub:`5` + - Cb\ :sub:`4` + - Cb\ :sub:`3` + - Cb\ :sub:`2` + - Cb\ :sub:`1` + - Cb\ :sub:`0` + + - Y'\ :sub:`7` + - Y'\ :sub:`6` + - Y'\ :sub:`5` + - Y'\ :sub:`4` + - Y'\ :sub:`3` + - Y'\ :sub:`2` + - Y'\ :sub:`1` + - Y'\ :sub:`0` + + - + - + - + - + - + - + - + - .. raw:: latex @@ -202,4 +366,8 @@ component of each pixel in one 16 or 32 bit word. #) The value of a = alpha bits is undefined when reading from the driver, ignored when writing to the driver, except when alpha blending has been negotiated for a :ref:`Video Overlay ` or - :ref:`Video Output Overlay `. + :ref:`Video Output Overlay ` for the formats Y444, YUV555 and + YUV4. However, for formats AYUV32 and VUYA32, the alpha component is + expected to contain a meaningful value that can be used by drivers + and applications. And, the formats XYUV32 and VUYX32 contain undefined + alpha values that must be ignored by all applications and drivers. diff --git a/drivers/media/v4l2-core/v4l2-ioctl.c b/drivers/media/v4l2-core/v4l2-ioctl.c index 206b7348797e..6f707466b5d2 100644 --- a/drivers/media/v4l2-core/v4l2-ioctl.c +++ b/drivers/media/v4l2-core/v4l2-ioctl.c @@ -1220,6 +1220,10 @@ static void v4l_fill_fmtdesc(struct v4l2_fmtdesc *fmt) case V4L2_PIX_FMT_YUV555: descr = "16-bit A/XYUV 1-5-5-5"; break; case V4L2_PIX_FMT_YUV565: descr = "16-bit YUV 5-6-5"; break; case V4L2_PIX_FMT_YUV32: descr = "32-bit A/XYUV 8-8-8-8"; break; + case V4L2_PIX_FMT_AYUV32: descr = "32-bit AYUV 8-8-8-8"; break; + case V4L2_PIX_FMT_XYUV32: descr = "32-bit XYUV 8-8-8-8"; break; + case V4L2_PIX_FMT_VUYA32: descr = "32-bit VUYA 8-8-8-8"; break; + case V4L2_PIX_FMT_VUYX32: descr = "32-bit VUYX 8-8-8-8"; break; case V4L2_PIX_FMT_YUV410: descr = "Planar YUV 4:1:0"; break; case V4L2_PIX_FMT_YUV420: descr = "Planar YUV 4:2:0"; break; case V4L2_PIX_FMT_HI240: descr = "8-bit Dithered RGB (BTTV)"; break; diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 9a920f071ff9..1db220da3bcc 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -562,6 +562,10 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_YUV555 v4l2_fourcc('Y', 'U', 'V', 'O') /* 16 YUV-5-5-5 */ #define V4L2_PIX_FMT_YUV565 v4l2_fourcc('Y', 'U', 'V', 'P') /* 16 YUV-5-6-5 */ #define V4L2_PIX_FMT_YUV32 v4l2_fourcc('Y', 'U', 'V', '4') /* 32 YUV-8-8-8-8 */ +#define V4L2_PIX_FMT_AYUV32 v4l2_fourcc('A', 'Y', 'U', 'V') /* 32 AYUV-8-8-8-8 */ +#define V4L2_PIX_FMT_XYUV32 v4l2_fourcc('X', 'Y', 'U', 'V') /* 32 XYUV-8-8-8-8 */ +#define V4L2_PIX_FMT_VUYA32 v4l2_fourcc('V', 'U', 'Y', 'A') /* 32 VUYA-8-8-8-8 */ +#define V4L2_PIX_FMT_VUYX32 v4l2_fourcc('V', 'U', 'Y', 'X') /* 32 VUYX-8-8-8-8 */ #define V4L2_PIX_FMT_HI240 v4l2_fourcc('H', 'I', '2', '4') /* 8 8-bit color */ #define V4L2_PIX_FMT_HM12 v4l2_fourcc('H', 'M', '1', '2') /* 8 YUV 4:2:0 16x16 macroblocks */ #define V4L2_PIX_FMT_M420 v4l2_fourcc('M', '4', '2', '0') /* 12 YUV 4:2:0 2 lines y, 1 line uv interleaved */ -- cgit v1.2.3 From 721074b03411327e7bf41555d4cc7c18f49313f7 Mon Sep 17 00:00:00 2001 From: Patrick Lerda Date: Thu, 17 Jan 2019 03:50:13 -0500 Subject: media: rc: rcmm decoder and encoder media: add support for RCMM infrared remote controls. Signed-off-by: Patrick Lerda Signed-off-by: Sean Young Signed-off-by: Mauro Carvalho Chehab --- Documentation/media/lirc.h.rst.exceptions | 3 + MAINTAINERS | 5 + drivers/media/rc/Kconfig | 13 ++ drivers/media/rc/Makefile | 1 + drivers/media/rc/ir-rcmm-decoder.c | 254 ++++++++++++++++++++++++++++++ drivers/media/rc/rc-core-priv.h | 5 + drivers/media/rc/rc-main.c | 9 ++ include/media/rc-map.h | 14 +- include/uapi/linux/lirc.h | 6 + tools/include/uapi/linux/lirc.h | 12 ++ tools/testing/selftests/ir/ir_loopback.c | 9 ++ 11 files changed, 328 insertions(+), 3 deletions(-) create mode 100644 drivers/media/rc/ir-rcmm-decoder.c (limited to 'include/uapi/linux') diff --git a/Documentation/media/lirc.h.rst.exceptions b/Documentation/media/lirc.h.rst.exceptions index 379b9e7df5d0..7a8b8ff4f076 100644 --- a/Documentation/media/lirc.h.rst.exceptions +++ b/Documentation/media/lirc.h.rst.exceptions @@ -60,6 +60,9 @@ ignore symbol RC_PROTO_SHARP ignore symbol RC_PROTO_XMP ignore symbol RC_PROTO_CEC ignore symbol RC_PROTO_IMON +ignore symbol RC_PROTO_RCMM12 +ignore symbol RC_PROTO_RCMM24 +ignore symbol RC_PROTO_RCMM32 # Undocumented macros diff --git a/MAINTAINERS b/MAINTAINERS index 914d333ee73d..9662ad6c58e4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16536,6 +16536,11 @@ M: David Härdeman S: Maintained F: drivers/media/rc/winbond-cir.c +RCMM REMOTE CONTROLS DECODER +M: Patrick Lerda +S: Maintained +F: drivers/media/rc/ir-rcmm-decoder.c + WINSYSTEMS EBC-C384 WATCHDOG DRIVER M: William Breathitt Gray L: linux-watchdog@vger.kernel.org diff --git a/drivers/media/rc/Kconfig b/drivers/media/rc/Kconfig index 8a216068a35a..8e95f6eac89d 100644 --- a/drivers/media/rc/Kconfig +++ b/drivers/media/rc/Kconfig @@ -133,6 +133,19 @@ config IR_IMON_DECODER remote control and you would like to use it with a raw IR receiver, or if you wish to use an encoder to transmit this IR. +config IR_RCMM_DECODER + tristate "Enable IR raw decoder for the RC-MM protocol" + depends on RC_CORE + help + Enable this option when you have IR with RC-MM protocol, and + you need the software decoder. The driver supports 12, + 24 and 32 bits RC-MM variants. You can enable or disable the + different modes using the following RC protocol keywords: + 'rc-mm-12', 'rc-mm-24' and 'rc-mm-32'. + + To compile this driver as a module, choose M here: the module + will be called ir-rcmm-decoder. + endif #RC_DECODERS menuconfig RC_DEVICES diff --git a/drivers/media/rc/Makefile b/drivers/media/rc/Makefile index 92c163816849..48d23433b3c0 100644 --- a/drivers/media/rc/Makefile +++ b/drivers/media/rc/Makefile @@ -16,6 +16,7 @@ obj-$(CONFIG_IR_SHARP_DECODER) += ir-sharp-decoder.o obj-$(CONFIG_IR_MCE_KBD_DECODER) += ir-mce_kbd-decoder.o obj-$(CONFIG_IR_XMP_DECODER) += ir-xmp-decoder.o obj-$(CONFIG_IR_IMON_DECODER) += ir-imon-decoder.o +obj-$(CONFIG_IR_RCMM_DECODER) += ir-rcmm-decoder.o # stand-alone IR receivers/transmitters obj-$(CONFIG_RC_ATI_REMOTE) += ati_remote.o diff --git a/drivers/media/rc/ir-rcmm-decoder.c b/drivers/media/rc/ir-rcmm-decoder.c new file mode 100644 index 000000000000..f1096ac1e5c5 --- /dev/null +++ b/drivers/media/rc/ir-rcmm-decoder.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0+ +// ir-rcmm-decoder.c - A decoder for the RCMM IR protocol +// +// Copyright (C) 2018 by Patrick Lerda + +#include "rc-core-priv.h" +#include +#include + +#define RCMM_UNIT 166667 /* nanosecs */ +#define RCMM_PREFIX_PULSE 416666 /* 166666.666666666*2.5 */ +#define RCMM_PULSE_0 277777 /* 166666.666666666*(1+2/3) */ +#define RCMM_PULSE_1 444444 /* 166666.666666666*(2+2/3) */ +#define RCMM_PULSE_2 611111 /* 166666.666666666*(3+2/3) */ +#define RCMM_PULSE_3 777778 /* 166666.666666666*(4+2/3) */ + +enum rcmm_state { + STATE_INACTIVE, + STATE_LOW, + STATE_BUMP, + STATE_VALUE, + STATE_FINISHED, +}; + +static bool rcmm_mode(const struct rcmm_dec *data) +{ + return !((0x000c0000 & data->bits) == 0x000c0000); +} + +static int rcmm_miscmode(struct rc_dev *dev, struct rcmm_dec *data) +{ + switch (data->count) { + case 24: + if (dev->enabled_protocols & RC_PROTO_BIT_RCMM24) { + rc_keydown(dev, RC_PROTO_RCMM24, data->bits, 0); + data->state = STATE_INACTIVE; + return 0; + } + return -1; + + case 12: + if (dev->enabled_protocols & RC_PROTO_BIT_RCMM12) { + rc_keydown(dev, RC_PROTO_RCMM12, data->bits, 0); + data->state = STATE_INACTIVE; + return 0; + } + return -1; + } + + return -1; +} + +/** + * ir_rcmm_decode() - Decode one RCMM pulse or space + * @dev: the struct rc_dev descriptor of the device + * @ev: the struct ir_raw_event descriptor of the pulse/space + * + * This function returns -EINVAL if the pulse violates the state machine + */ +static int ir_rcmm_decode(struct rc_dev *dev, struct ir_raw_event ev) +{ + struct rcmm_dec *data = &dev->raw->rcmm; + u32 scancode; + u8 toggle; + int value; + + if (!(dev->enabled_protocols & (RC_PROTO_BIT_RCMM32 | + RC_PROTO_BIT_RCMM24 | + RC_PROTO_BIT_RCMM12))) + return 0; + + if (!is_timing_event(ev)) { + if (ev.reset) + data->state = STATE_INACTIVE; + return 0; + } + + switch (data->state) { + case STATE_INACTIVE: + if (!ev.pulse) + break; + + if (!eq_margin(ev.duration, RCMM_PREFIX_PULSE, RCMM_UNIT / 2)) + break; + + data->state = STATE_LOW; + data->count = 0; + data->bits = 0; + return 0; + + case STATE_LOW: + if (ev.pulse) + break; + + if (!eq_margin(ev.duration, RCMM_PULSE_0, RCMM_UNIT / 2)) + break; + + data->state = STATE_BUMP; + return 0; + + case STATE_BUMP: + if (!ev.pulse) + break; + + if (!eq_margin(ev.duration, RCMM_UNIT, RCMM_UNIT / 2)) + break; + + data->state = STATE_VALUE; + return 0; + + case STATE_VALUE: + if (ev.pulse) + break; + + if (eq_margin(ev.duration, RCMM_PULSE_0, RCMM_UNIT / 2)) + value = 0; + else if (eq_margin(ev.duration, RCMM_PULSE_1, RCMM_UNIT / 2)) + value = 1; + else if (eq_margin(ev.duration, RCMM_PULSE_2, RCMM_UNIT / 2)) + value = 2; + else if (eq_margin(ev.duration, RCMM_PULSE_3, RCMM_UNIT / 2)) + value = 3; + else + value = -1; + + if (value == -1) { + if (!rcmm_miscmode(dev, data)) + return 0; + break; + } + + data->bits <<= 2; + data->bits |= value; + + data->count += 2; + + if (data->count < 32) + data->state = STATE_BUMP; + else + data->state = STATE_FINISHED; + + return 0; + + case STATE_FINISHED: + if (!ev.pulse) + break; + + if (!eq_margin(ev.duration, RCMM_UNIT, RCMM_UNIT / 2)) + break; + + if (rcmm_mode(data)) { + toggle = !!(0x8000 & data->bits); + scancode = data->bits & ~0x8000; + } else { + toggle = 0; + scancode = data->bits; + } + + if (dev->enabled_protocols & RC_PROTO_BIT_RCMM32) { + rc_keydown(dev, RC_PROTO_RCMM32, scancode, toggle); + data->state = STATE_INACTIVE; + return 0; + } + + break; + } + + data->state = STATE_INACTIVE; + return -EINVAL; +} + +static const int rcmmspace[] = { + RCMM_PULSE_0, + RCMM_PULSE_1, + RCMM_PULSE_2, + RCMM_PULSE_3, +}; + +static int ir_rcmm_rawencoder(struct ir_raw_event **ev, unsigned int max, + unsigned int n, u32 data) +{ + int i; + int ret; + + ret = ir_raw_gen_pulse_space(ev, &max, RCMM_PREFIX_PULSE, RCMM_PULSE_0); + if (ret) + return ret; + + for (i = n - 2; i >= 0; i -= 2) { + const unsigned int space = rcmmspace[(data >> i) & 3]; + + ret = ir_raw_gen_pulse_space(ev, &max, RCMM_UNIT, space); + if (ret) + return ret; + } + + return ir_raw_gen_pulse_space(ev, &max, RCMM_UNIT, RCMM_PULSE_3 * 2); +} + +static int ir_rcmm_encode(enum rc_proto protocol, u32 scancode, + struct ir_raw_event *events, unsigned int max) +{ + struct ir_raw_event *e = events; + int ret; + + switch (protocol) { + case RC_PROTO_RCMM32: + ret = ir_rcmm_rawencoder(&e, max, 32, scancode); + break; + case RC_PROTO_RCMM24: + ret = ir_rcmm_rawencoder(&e, max, 24, scancode); + break; + case RC_PROTO_RCMM12: + ret = ir_rcmm_rawencoder(&e, max, 12, scancode); + break; + default: + ret = -EINVAL; + } + + if (ret < 0) + return ret; + + return e - events; +} + +static struct ir_raw_handler rcmm_handler = { + .protocols = RC_PROTO_BIT_RCMM32 | + RC_PROTO_BIT_RCMM24 | + RC_PROTO_BIT_RCMM12, + .decode = ir_rcmm_decode, + .encode = ir_rcmm_encode, + .carrier = 36000, + .min_timeout = RCMM_PULSE_3 + RCMM_UNIT, +}; + +static int __init ir_rcmm_decode_init(void) +{ + ir_raw_handler_register(&rcmm_handler); + + pr_info("IR RCMM protocol handler initialized\n"); + return 0; +} + +static void __exit ir_rcmm_decode_exit(void) +{ + ir_raw_handler_unregister(&rcmm_handler); +} + +module_init(ir_rcmm_decode_init); +module_exit(ir_rcmm_decode_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick Lerda"); +MODULE_DESCRIPTION("RCMM IR protocol decoder"); diff --git a/drivers/media/rc/rc-core-priv.h b/drivers/media/rc/rc-core-priv.h index c2cbe7f6266c..9f21b3e8b377 100644 --- a/drivers/media/rc/rc-core-priv.h +++ b/drivers/media/rc/rc-core-priv.h @@ -131,6 +131,11 @@ struct ir_raw_event_ctrl { unsigned int bits; bool stick_keyboard; } imon; + struct rcmm_dec { + int state; + unsigned int count; + u32 bits; + } rcmm; }; /* Mutex for locking raw IR processing and handler change */ diff --git a/drivers/media/rc/rc-main.c b/drivers/media/rc/rc-main.c index 66a174979b3c..dc38e9c0a2ff 100644 --- a/drivers/media/rc/rc-main.c +++ b/drivers/media/rc/rc-main.c @@ -70,6 +70,12 @@ static const struct { [RC_PROTO_CEC] = { .name = "cec", .repeat_period = 0 }, [RC_PROTO_IMON] = { .name = "imon", .scancode_bits = 0x7fffffff, .repeat_period = 114 }, + [RC_PROTO_RCMM12] = { .name = "rc-mm-12", + .scancode_bits = 0x00000fff, .repeat_period = 114 }, + [RC_PROTO_RCMM24] = { .name = "rc-mm-24", + .scancode_bits = 0x00ffffff, .repeat_period = 114 }, + [RC_PROTO_RCMM32] = { .name = "rc-mm-32", + .scancode_bits = 0xffffffff, .repeat_period = 114 }, }; /* Used to keep track of known keymaps */ @@ -1006,6 +1012,9 @@ static const struct { { RC_PROTO_BIT_XMP, "xmp", "ir-xmp-decoder" }, { RC_PROTO_BIT_CEC, "cec", NULL }, { RC_PROTO_BIT_IMON, "imon", "ir-imon-decoder" }, + { RC_PROTO_BIT_RCMM12 | + RC_PROTO_BIT_RCMM24 | + RC_PROTO_BIT_RCMM32, "rc-mm", "ir-rcmm-decoder" }, }; /** diff --git a/include/media/rc-map.h b/include/media/rc-map.h index d621acadfbf3..e5e86d595645 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -37,6 +37,9 @@ #define RC_PROTO_BIT_XMP BIT_ULL(RC_PROTO_XMP) #define RC_PROTO_BIT_CEC BIT_ULL(RC_PROTO_CEC) #define RC_PROTO_BIT_IMON BIT_ULL(RC_PROTO_IMON) +#define RC_PROTO_BIT_RCMM12 BIT_ULL(RC_PROTO_RCMM12) +#define RC_PROTO_BIT_RCMM24 BIT_ULL(RC_PROTO_RCMM24) +#define RC_PROTO_BIT_RCMM32 BIT_ULL(RC_PROTO_RCMM32) #define RC_PROTO_BIT_ALL \ (RC_PROTO_BIT_UNKNOWN | RC_PROTO_BIT_OTHER | \ @@ -51,7 +54,8 @@ RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | \ RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \ RC_PROTO_BIT_XMP | RC_PROTO_BIT_CEC | \ - RC_PROTO_BIT_IMON) + RC_PROTO_BIT_IMON | RC_PROTO_BIT_RCMM12 | \ + RC_PROTO_BIT_RCMM24 | RC_PROTO_BIT_RCMM32) /* All rc protocols for which we have decoders */ #define RC_PROTO_BIT_ALL_IR_DECODER \ (RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \ @@ -64,7 +68,9 @@ RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \ RC_PROTO_BIT_RC6_6A_24 | RC_PROTO_BIT_RC6_6A_32 | \ RC_PROTO_BIT_RC6_MCE | RC_PROTO_BIT_SHARP | \ - RC_PROTO_BIT_XMP | RC_PROTO_BIT_IMON) + RC_PROTO_BIT_XMP | RC_PROTO_BIT_IMON | \ + RC_PROTO_BIT_RCMM12 | RC_PROTO_BIT_RCMM24 | \ + RC_PROTO_BIT_RCMM32) #define RC_PROTO_BIT_ALL_IR_ENCODER \ (RC_PROTO_BIT_RC5 | RC_PROTO_BIT_RC5X_20 | \ @@ -77,7 +83,9 @@ RC_PROTO_BIT_RC6_0 | RC_PROTO_BIT_RC6_6A_20 | \ RC_PROTO_BIT_RC6_6A_24 | \ RC_PROTO_BIT_RC6_6A_32 | RC_PROTO_BIT_RC6_MCE | \ - RC_PROTO_BIT_SHARP | RC_PROTO_BIT_IMON) + RC_PROTO_BIT_SHARP | RC_PROTO_BIT_IMON | \ + RC_PROTO_BIT_RCMM12 | RC_PROTO_BIT_RCMM24 | \ + RC_PROTO_BIT_RCMM32) #define RC_SCANCODE_UNKNOWN(x) (x) #define RC_SCANCODE_OTHER(x) (x) diff --git a/include/uapi/linux/lirc.h b/include/uapi/linux/lirc.h index 6b319581882f..45fcbf99d72e 100644 --- a/include/uapi/linux/lirc.h +++ b/include/uapi/linux/lirc.h @@ -192,6 +192,9 @@ struct lirc_scancode { * @RC_PROTO_XMP: XMP protocol * @RC_PROTO_CEC: CEC protocol * @RC_PROTO_IMON: iMon Pad protocol + * @RC_PROTO_RCMM12: RC-MM protocol 12 bits + * @RC_PROTO_RCMM24: RC-MM protocol 24 bits + * @RC_PROTO_RCMM32: RC-MM protocol 32 bits */ enum rc_proto { RC_PROTO_UNKNOWN = 0, @@ -218,6 +221,9 @@ enum rc_proto { RC_PROTO_XMP = 21, RC_PROTO_CEC = 22, RC_PROTO_IMON = 23, + RC_PROTO_RCMM12 = 24, + RC_PROTO_RCMM24 = 25, + RC_PROTO_RCMM32 = 26, }; #endif diff --git a/tools/include/uapi/linux/lirc.h b/tools/include/uapi/linux/lirc.h index f189931042a7..45fcbf99d72e 100644 --- a/tools/include/uapi/linux/lirc.h +++ b/tools/include/uapi/linux/lirc.h @@ -133,6 +133,12 @@ #define LIRC_SET_WIDEBAND_RECEIVER _IOW('i', 0x00000023, __u32) +/* + * Return the recording timeout, which is either set by + * the ioctl LIRC_SET_REC_TIMEOUT or by the kernel after setting the protocols. + */ +#define LIRC_GET_REC_TIMEOUT _IOR('i', 0x00000024, __u32) + /* * struct lirc_scancode - decoded scancode with protocol for use with * LIRC_MODE_SCANCODE @@ -186,6 +192,9 @@ struct lirc_scancode { * @RC_PROTO_XMP: XMP protocol * @RC_PROTO_CEC: CEC protocol * @RC_PROTO_IMON: iMon Pad protocol + * @RC_PROTO_RCMM12: RC-MM protocol 12 bits + * @RC_PROTO_RCMM24: RC-MM protocol 24 bits + * @RC_PROTO_RCMM32: RC-MM protocol 32 bits */ enum rc_proto { RC_PROTO_UNKNOWN = 0, @@ -212,6 +221,9 @@ enum rc_proto { RC_PROTO_XMP = 21, RC_PROTO_CEC = 22, RC_PROTO_IMON = 23, + RC_PROTO_RCMM12 = 24, + RC_PROTO_RCMM24 = 25, + RC_PROTO_RCMM32 = 26, }; #endif diff --git a/tools/testing/selftests/ir/ir_loopback.c b/tools/testing/selftests/ir/ir_loopback.c index 858c19caf224..570a7358942c 100644 --- a/tools/testing/selftests/ir/ir_loopback.c +++ b/tools/testing/selftests/ir/ir_loopback.c @@ -51,6 +51,10 @@ static const struct { { RC_PROTO_RC6_6A_32, "rc-6-6a-32", 0xffffffff, "rc-6" }, { RC_PROTO_RC6_MCE, "rc-6-mce", 0x00007fff, "rc-6" }, { RC_PROTO_SHARP, "sharp", 0x1fff, "sharp" }, + { RC_PROTO_IMON, "imon", 0x7fffffff, "imon" }, + { RC_PROTO_RCMM12, "rcmm-12", 0x00000fff, "rcmm" }, + { RC_PROTO_RCMM24, "rcmm-24", 0x00ffffff, "rcmm" }, + { RC_PROTO_RCMM32, "rcmm-32", 0xffffffff, "rcmm" }, }; int lirc_open(const char *rc) @@ -139,6 +143,11 @@ int main(int argc, char **argv) (((scancode >> 8) ^ ~scancode) & 0xff) == 0) continue; + if (rc_proto == RC_PROTO_RCMM32 && + (scancode & 0x000c0000) != 0x000c0000 && + scancode & 0x00008000) + continue; + struct lirc_scancode lsc = { .rc_proto = rc_proto, .scancode = scancode -- cgit v1.2.3 From 2736d94f351b92749c07efef01f7c10548e39ad8 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Tue, 22 Jan 2019 10:50:10 +0200 Subject: ethtool: Added support for 50Gbps per lane link modes Added support for 50Gbps per lane link modes. Define various 50G, 100G and 200G link modes using it. Signed-off-by: Aya Levin Reviewed-by: Eran Ben Elisha Signed-off-by: Saeed Mahameed --- include/uapi/linux/ethtool.h | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 17be76aeb468..378c52308d89 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1453,6 +1453,21 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_FEC_NONE_BIT = 49, ETHTOOL_LINK_MODE_FEC_RS_BIT = 50, ETHTOOL_LINK_MODE_FEC_BASER_BIT = 51, + ETHTOOL_LINK_MODE_50000baseKR_Full_BIT = 52, + ETHTOOL_LINK_MODE_50000baseSR_Full_BIT = 53, + ETHTOOL_LINK_MODE_50000baseCR_Full_BIT = 54, + ETHTOOL_LINK_MODE_50000baseLR_ER_FR_Full_BIT = 55, + ETHTOOL_LINK_MODE_50000baseDR_Full_BIT = 56, + ETHTOOL_LINK_MODE_100000baseKR2_Full_BIT = 57, + ETHTOOL_LINK_MODE_100000baseSR2_Full_BIT = 58, + ETHTOOL_LINK_MODE_100000baseCR2_Full_BIT = 59, + ETHTOOL_LINK_MODE_100000baseLR2_ER2_FR2_Full_BIT = 60, + ETHTOOL_LINK_MODE_100000baseDR2_Full_BIT = 61, + ETHTOOL_LINK_MODE_200000baseKR4_Full_BIT = 62, + ETHTOOL_LINK_MODE_200000baseSR4_Full_BIT = 63, + ETHTOOL_LINK_MODE_200000baseLR4_ER4_FR4_Full_BIT = 64, + ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT = 65, + ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66, /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_* @@ -1461,7 +1476,7 @@ enum ethtool_link_mode_bit_indices { */ __ETHTOOL_LINK_MODE_LAST - = ETHTOOL_LINK_MODE_FEC_BASER_BIT, + = ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT, }; #define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \ @@ -1569,6 +1584,7 @@ enum ethtool_link_mode_bit_indices { #define SPEED_50000 50000 #define SPEED_56000 56000 #define SPEED_100000 100000 +#define SPEED_200000 200000 #define SPEED_UNKNOWN -1 -- cgit v1.2.3 From fadccd8fc2d06cf7fd222245d7e04b00fae946cf Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 Feb 2019 09:37:13 +0100 Subject: nvme_ioctl.h: remove duplicate GPL boilerplate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We already have a ЅPDX header, so no need to duplicate the information. Signed-off-by: Christoph Hellwig Reviewed-by: Sagi Grimberg --- include/linux/nvme.h | 10 +--------- include/uapi/linux/nvme_ioctl.h | 9 --------- 2 files changed, 1 insertion(+), 18 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index bbcc83886899..baa49e6a23cc 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -1,15 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for the NVM Express interface * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _LINUX_NVME_H diff --git a/include/uapi/linux/nvme_ioctl.h b/include/uapi/linux/nvme_ioctl.h index 6e74b1eaf541..1c215ea1798e 100644 --- a/include/uapi/linux/nvme_ioctl.h +++ b/include/uapi/linux/nvme_ioctl.h @@ -2,15 +2,6 @@ /* * Definitions for the NVM Express ioctl interface * Copyright (c) 2011-2014, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. */ #ifndef _UAPI_LINUX_NVME_IOCTL_H -- cgit v1.2.3 From 61697a6abd24acba941359c6268a94f4afe4a53d Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Fri, 18 Jan 2019 14:19:26 -0500 Subject: dm: eliminate 'split_discard_bios' flag from DM target interface There is no need to have DM core split discards on behalf of a DM target now that blk_queue_split() handles splitting discards based on the queue_limits. A DM target just needs to set max_discard_sectors, discard_granularity, etc, in queue_limits. Signed-off-by: Mike Snitzer --- drivers/md/dm-cache-target.c | 1 - drivers/md/dm-raid.c | 14 +++++++++----- drivers/md/dm-thin.c | 1 - drivers/md/dm-zoned-target.c | 1 - drivers/md/dm.c | 25 ++++++------------------- include/linux/device-mapper.h | 6 ------ include/uapi/linux/dm-ioctl.h | 4 ++-- 7 files changed, 17 insertions(+), 35 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index b29a8327eed1..adc529f12b6b 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2496,7 +2496,6 @@ static int cache_create(struct cache_args *ca, struct cache **result) ti->num_discard_bios = 1; ti->discards_supported = true; - ti->split_discard_bios = false; ti->per_io_data_size = sizeof(struct per_bio_data); diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index adcfe8ae10aa..9fdef6897316 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -2986,11 +2986,6 @@ static void configure_discard_support(struct raid_set *rs) } } - /* - * RAID1 and RAID10 personalities require bio splitting, - * RAID0/4/5/6 don't and process large discard bios properly. - */ - ti->split_discard_bios = !!(rs_is_raid1(rs) || rs_is_raid10(rs)); ti->num_discard_bios = 1; } @@ -3747,6 +3742,15 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, chunk_size); blk_limits_io_opt(limits, chunk_size * mddev_data_stripes(rs)); + + /* + * RAID1 and RAID10 personalities require bio splitting, + * RAID0/4/5/6 don't and process large discard bios properly. + */ + if (rs_is_raid1(rs) || rs_is_raid10(rs)) { + limits->discard_granularity = chunk_size; + limits->max_discard_sectors = chunk_size; + } } static void raid_postsuspend(struct dm_target *ti) diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index e83b63608262..0d9ded0f5e50 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -4227,7 +4227,6 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) if (tc->pool->pf.discard_enabled) { ti->discards_supported = true; ti->num_discard_bios = 1; - ti->split_discard_bios = false; } mutex_unlock(&dm_thin_pool_table.mutex); diff --git a/drivers/md/dm-zoned-target.c b/drivers/md/dm-zoned-target.c index 6af5babe6837..8865c1709e16 100644 --- a/drivers/md/dm-zoned-target.c +++ b/drivers/md/dm-zoned-target.c @@ -727,7 +727,6 @@ static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv) ti->per_io_data_size = sizeof(struct dmz_bioctx); ti->flush_supported = true; ti->discards_supported = true; - ti->split_discard_bios = true; /* The exposed capacity is the number of chunks that can be mapped */ ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift; diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7a774fcd0194..55f12df3589d 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1478,17 +1478,10 @@ static unsigned get_num_write_zeroes_bios(struct dm_target *ti) return ti->num_write_zeroes_bios; } -typedef bool (*is_split_required_fn)(struct dm_target *ti); - -static bool is_split_required_for_discard(struct dm_target *ti) -{ - return ti->split_discard_bios; -} - static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti, - unsigned num_bios, bool is_split_required) + unsigned num_bios) { - unsigned len; + unsigned len = ci->sector_count; /* * Even though the device advertised support for this type of @@ -1499,11 +1492,6 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target * if (!num_bios) return -EOPNOTSUPP; - if (!is_split_required) - len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); - else - len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); - __send_duplicate_bios(ci, ti, num_bios, &len); ci->sector += len; @@ -1514,23 +1502,22 @@ static int __send_changing_extent_only(struct clone_info *ci, struct dm_target * static int __send_discard(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti), - is_split_required_for_discard(ti)); + return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti)); } static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti), false); + return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti)); } static int __send_write_same(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti), false); + return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti)); } static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti) { - return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti), false); + return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti)); } static bool is_abnormal_io(struct bio *bio) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index e528baebad69..0f5b3d7c6cb3 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -315,12 +315,6 @@ struct dm_target { * whether or not its underlying devices have support. */ bool discards_supported:1; - - /* - * Set if the target required discard bios to be split - * on max_io_len boundary. - */ - bool split_discard_bios:1; }; /* Each target can link one of these into the table */ diff --git a/include/uapi/linux/dm-ioctl.h b/include/uapi/linux/dm-ioctl.h index d1e49514977b..f396a82dfd3e 100644 --- a/include/uapi/linux/dm-ioctl.h +++ b/include/uapi/linux/dm-ioctl.h @@ -270,9 +270,9 @@ enum { #define DM_DEV_SET_GEOMETRY _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl) #define DM_VERSION_MAJOR 4 -#define DM_VERSION_MINOR 39 +#define DM_VERSION_MINOR 40 #define DM_VERSION_PATCHLEVEL 0 -#define DM_VERSION_EXTRA "-ioctl (2018-04-03)" +#define DM_VERSION_EXTRA "-ioctl (2019-01-18)" /* Status bits */ #define DM_READONLY_FLAG (1 << 0) /* In/Out */ -- cgit v1.2.3 From 54719527fd06e80fce52b98537414035cd21e8d4 Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Thu, 21 Feb 2019 14:12:01 +0200 Subject: devlink: Rename devlink health attributes Rename devlink health attributes for better reflect the attributes use. Add COUNT prefix on error counter attribute and recovery counter attribute. Fixes: 7afe335a8bed ("devlink: Add health get command") Signed-off-by: Aya Levin Signed-off-by: Eran Ben Elisha Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/devlink.h | 4 ++-- net/core/devlink.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 53de8802a000..5bb4ea67d84f 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -323,8 +323,8 @@ enum devlink_attr { DEVLINK_ATTR_HEALTH_REPORTER, /* nested */ DEVLINK_ATTR_HEALTH_REPORTER_NAME, /* string */ DEVLINK_ATTR_HEALTH_REPORTER_STATE, /* u8 */ - DEVLINK_ATTR_HEALTH_REPORTER_ERR, /* u64 */ - DEVLINK_ATTR_HEALTH_REPORTER_RECOVER, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, /* u64 */ + DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, /* u64 */ DEVLINK_ATTR_HEALTH_REPORTER_DUMP_TS, /* u64 */ DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, /* u64 */ DEVLINK_ATTR_HEALTH_REPORTER_AUTO_RECOVER, /* u8 */ diff --git a/net/core/devlink.c b/net/core/devlink.c index 04d98550c78c..5135997ecbe7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4650,10 +4650,10 @@ devlink_nl_health_reporter_fill(struct sk_buff *msg, if (nla_put_u8(msg, DEVLINK_ATTR_HEALTH_REPORTER_STATE, reporter->health_state)) goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR, + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_ERR_COUNT, reporter->error_count, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; - if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER, + if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_RECOVER_COUNT, reporter->recovery_count, DEVLINK_ATTR_PAD)) goto reporter_nest_cancel; if (nla_put_u64_64bit(msg, DEVLINK_ATTR_HEALTH_REPORTER_GRACEFUL_PERIOD, -- cgit v1.2.3 From ca8d4794f669e721fb5198f6d142e42dd8080239 Mon Sep 17 00:00:00 2001 From: Callum Sinclair Date: Mon, 18 Feb 2019 10:07:52 +1300 Subject: ipmr: ip6mr: Create new sockopt to clear mfc cache or vifs Currently the only way to clear the forwarding cache was to delete the entries one by one using the MRT_DEL_MFC socket option or to destroy and recreate the socket. Create a new socket option which with the use of optional flags can clear any combination of multicast entries (static or not static) and multicast vifs (static or not static). Calling the new socket option MRT_FLUSH with the flags MRT_FLUSH_MFC and MRT_FLUSH_VIFS will clear all entries and vifs on the socket except for static entries. Signed-off-by: Callum Sinclair Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/mroute.h | 9 ++++- include/uapi/linux/mroute6.h | 9 ++++- net/ipv4/ipmr.c | 75 +++++++++++++++++++++++++++--------------- net/ipv6/ip6mr.c | 78 +++++++++++++++++++++++++++++--------------- 4 files changed, 115 insertions(+), 56 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/mroute.h b/include/uapi/linux/mroute.h index 5d37a9ccce63..11c8c1fc1124 100644 --- a/include/uapi/linux/mroute.h +++ b/include/uapi/linux/mroute.h @@ -28,12 +28,19 @@ #define MRT_TABLE (MRT_BASE+9) /* Specify mroute table ID */ #define MRT_ADD_MFC_PROXY (MRT_BASE+10) /* Add a (*,*|G) mfc entry */ #define MRT_DEL_MFC_PROXY (MRT_BASE+11) /* Del a (*,*|G) mfc entry */ -#define MRT_MAX (MRT_BASE+11) +#define MRT_FLUSH (MRT_BASE+12) /* Flush all mfc entries and/or vifs */ +#define MRT_MAX (MRT_BASE+12) #define SIOCGETVIFCNT SIOCPROTOPRIVATE /* IP protocol privates */ #define SIOCGETSGCNT (SIOCPROTOPRIVATE+1) #define SIOCGETRPF (SIOCPROTOPRIVATE+2) +/* MRT_FLUSH optional flags */ +#define MRT_FLUSH_MFC 1 /* Flush multicast entries */ +#define MRT_FLUSH_MFC_STATIC 2 /* Flush static multicast entries */ +#define MRT_FLUSH_VIFS 4 /* Flush multicast vifs */ +#define MRT_FLUSH_VIFS_STATIC 8 /* Flush static multicast vifs */ + #define MAXVIFS 32 typedef unsigned long vifbitmap_t; /* User mode code depends on this lot */ typedef unsigned short vifi_t; diff --git a/include/uapi/linux/mroute6.h b/include/uapi/linux/mroute6.h index 9999cc006390..c36177a86516 100644 --- a/include/uapi/linux/mroute6.h +++ b/include/uapi/linux/mroute6.h @@ -31,12 +31,19 @@ #define MRT6_TABLE (MRT6_BASE+9) /* Specify mroute table ID */ #define MRT6_ADD_MFC_PROXY (MRT6_BASE+10) /* Add a (*,*|G) mfc entry */ #define MRT6_DEL_MFC_PROXY (MRT6_BASE+11) /* Del a (*,*|G) mfc entry */ -#define MRT6_MAX (MRT6_BASE+11) +#define MRT6_FLUSH (MRT6_BASE+12) /* Flush all mfc entries and/or vifs */ +#define MRT6_MAX (MRT6_BASE+12) #define SIOCGETMIFCNT_IN6 SIOCPROTOPRIVATE /* IP protocol privates */ #define SIOCGETSGCNT_IN6 (SIOCPROTOPRIVATE+1) #define SIOCGETRPF (SIOCPROTOPRIVATE+2) +/* MRT6_FLUSH optional flags */ +#define MRT6_FLUSH_MFC 1 /* Flush multicast entries */ +#define MRT6_FLUSH_MFC_STATIC 2 /* Flush static multicast entries */ +#define MRT6_FLUSH_MIFS 4 /* Flushing multicast vifs */ +#define MRT6_FLUSH_MIFS_STATIC 8 /* Flush static multicast vifs */ + #define MAXMIFS 32 typedef unsigned long mifbitmap_t; /* User mode code depends on this lot */ typedef unsigned short mifi_t; diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e536970557dd..2c931120c494 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -110,7 +110,7 @@ static int ipmr_cache_report(struct mr_table *mrt, static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc, int cmd); static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); -static void mroute_clean_tables(struct mr_table *mrt, bool all); +static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES @@ -415,7 +415,8 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id) static void ipmr_free_table(struct mr_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); - mroute_clean_tables(mrt, true); + mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC | + MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } @@ -1296,7 +1297,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, } /* Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr_table *mrt, bool all) +static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct net *net = read_pnet(&mrt->net); struct mr_mfc *c, *tmp; @@ -1305,35 +1306,44 @@ static void mroute_clean_tables(struct mr_table *mrt, bool all) int i; /* Shut down all active vif entries */ - for (i = 0; i < mrt->maxvif; i++) { - if (!all && (mrt->vif_table[i].flags & VIFF_STATIC)) - continue; - vif_delete(mrt, i, 0, &list); + if (flags & (MRT_FLUSH_VIFS | MRT_FLUSH_VIFS_STATIC)) { + for (i = 0; i < mrt->maxvif; i++) { + if (((mrt->vif_table[i].flags & VIFF_STATIC) && + !(flags & MRT_FLUSH_VIFS_STATIC)) || + (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT_FLUSH_VIFS))) + continue; + vif_delete(mrt, i, 0, &list); + } + unregister_netdevice_many(&list); } - unregister_netdevice_many(&list); /* Wipe the cache */ - list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { - if (!all && (c->mfc_flags & MFC_STATIC)) - continue; - rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); - list_del_rcu(&c->list); - cache = (struct mfc_cache *)c; - call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache, - mrt->id); - mroute_netlink_event(mrt, cache, RTM_DELROUTE); - mr_cache_put(c); - } - - if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { - spin_lock_bh(&mfc_unres_lock); - list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { - list_del(&c->list); + if (flags & (MRT_FLUSH_MFC | MRT_FLUSH_MFC_STATIC)) { + list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { + if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC_STATIC)) || + (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT_FLUSH_MFC))) + continue; + rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params); + list_del_rcu(&c->list); cache = (struct mfc_cache *)c; + call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache, + mrt->id); mroute_netlink_event(mrt, cache, RTM_DELROUTE); - ipmr_destroy_unres(mrt, cache); + mr_cache_put(c); + } + } + + if (flags & MRT_FLUSH_MFC) { + if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { + list_del(&c->list); + cache = (struct mfc_cache *)c; + mroute_netlink_event(mrt, cache, RTM_DELROUTE); + ipmr_destroy_unres(mrt, cache); + } + spin_unlock_bh(&mfc_unres_lock); } - spin_unlock_bh(&mfc_unres_lock); } } @@ -1354,7 +1364,7 @@ static void mrtsock_destruct(struct sock *sk) NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); - mroute_clean_tables(mrt, false); + mroute_clean_tables(mrt, MRT_FLUSH_VIFS | MRT_FLUSH_MFC); } } rtnl_unlock(); @@ -1479,6 +1489,17 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, sk == rtnl_dereference(mrt->mroute_sk), parent); break; + case MRT_FLUSH: + if (optlen != sizeof(val)) { + ret = -EINVAL; + break; + } + if (get_user(val, (int __user *)optval)) { + ret = -EFAULT; + break; + } + mroute_clean_tables(mrt, val); + break; /* Control PIM assert. */ case MRT_ASSERT: if (optlen != sizeof(val)) { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index cc01aa3f2b5e..3594f1d9c68c 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -97,7 +97,7 @@ static void mr6_netlink_event(struct mr_table *mrt, struct mfc6_cache *mfc, static void mrt6msg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt); static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb); -static void mroute_clean_tables(struct mr_table *mrt, bool all); +static void mroute_clean_tables(struct mr_table *mrt, int flags); static void ipmr_expire_process(struct timer_list *t); #ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES @@ -393,7 +393,8 @@ static struct mr_table *ip6mr_new_table(struct net *net, u32 id) static void ip6mr_free_table(struct mr_table *mrt) { del_timer_sync(&mrt->ipmr_expire_timer); - mroute_clean_tables(mrt, true); + mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC | + MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC); rhltable_destroy(&mrt->mfc_hash); kfree(mrt); } @@ -1496,42 +1497,51 @@ static int ip6mr_mfc_add(struct net *net, struct mr_table *mrt, * Close the multicast socket, and clear the vif tables etc */ -static void mroute_clean_tables(struct mr_table *mrt, bool all) +static void mroute_clean_tables(struct mr_table *mrt, int flags) { struct mr_mfc *c, *tmp; LIST_HEAD(list); int i; /* Shut down all active vif entries */ - for (i = 0; i < mrt->maxvif; i++) { - if (!all && (mrt->vif_table[i].flags & VIFF_STATIC)) - continue; - mif6_delete(mrt, i, 0, &list); + if (flags & (MRT6_FLUSH_MIFS | MRT6_FLUSH_MIFS_STATIC)) { + for (i = 0; i < mrt->maxvif; i++) { + if (((mrt->vif_table[i].flags & VIFF_STATIC) && + !(flags & MRT6_FLUSH_MIFS_STATIC)) || + (!(mrt->vif_table[i].flags & VIFF_STATIC) && !(flags & MRT6_FLUSH_MIFS))) + continue; + mif6_delete(mrt, i, 0, &list); + } + unregister_netdevice_many(&list); } - unregister_netdevice_many(&list); /* Wipe the cache */ - list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { - if (!all && (c->mfc_flags & MFC_STATIC)) - continue; - rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); - list_del_rcu(&c->list); - call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), - FIB_EVENT_ENTRY_DEL, - (struct mfc6_cache *)c, mrt->id); - mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); - mr_cache_put(c); + if (flags & (MRT6_FLUSH_MFC | MRT6_FLUSH_MFC_STATIC)) { + list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) { + if (((c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC_STATIC)) || + (!(c->mfc_flags & MFC_STATIC) && !(flags & MRT6_FLUSH_MFC))) + continue; + rhltable_remove(&mrt->mfc_hash, &c->mnode, ip6mr_rht_params); + list_del_rcu(&c->list); + call_ip6mr_mfc_entry_notifiers(read_pnet(&mrt->net), + FIB_EVENT_ENTRY_DEL, + (struct mfc6_cache *)c, mrt->id); + mr6_netlink_event(mrt, (struct mfc6_cache *)c, RTM_DELROUTE); + mr_cache_put(c); + } } - if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { - spin_lock_bh(&mfc_unres_lock); - list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { - list_del(&c->list); - mr6_netlink_event(mrt, (struct mfc6_cache *)c, - RTM_DELROUTE); - ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); + if (flags & MRT6_FLUSH_MFC) { + if (atomic_read(&mrt->cache_resolve_queue_len) != 0) { + spin_lock_bh(&mfc_unres_lock); + list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) { + list_del(&c->list); + mr6_netlink_event(mrt, (struct mfc6_cache *)c, + RTM_DELROUTE); + ip6mr_destroy_unres(mrt, (struct mfc6_cache *)c); + } + spin_unlock_bh(&mfc_unres_lock); } - spin_unlock_bh(&mfc_unres_lock); } } @@ -1587,7 +1597,7 @@ int ip6mr_sk_done(struct sock *sk) NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); - mroute_clean_tables(mrt, false); + mroute_clean_tables(mrt, MRT6_FLUSH_MIFS | MRT6_FLUSH_MFC); err = 0; break; } @@ -1703,6 +1713,20 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns rtnl_unlock(); return ret; + case MRT6_FLUSH: + { + int flags; + + if (optlen != sizeof(flags)) + return -EINVAL; + if (get_user(flags, (int __user *)optval)) + return -EFAULT; + rtnl_lock(); + mroute_clean_tables(mrt, flags); + rtnl_unlock(); + return 0; + } + /* * Control PIM assert (to activate pim will activate assert) */ -- cgit v1.2.3 From e728fdf0628971d43cb4e48860defc6e8a553761 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 22 Feb 2019 19:25:59 +0100 Subject: net: phy: improve definition of __ETHTOOL_LINK_MODE_MASK_NBITS The way to define __ETHTOOL_LINK_MODE_MASK_NBITS seems to be overly complicated, go with a standard approach instead. Whilst we're at it, move the comment to the right place. v2: - rebased Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- include/linux/ethtool.h | 4 ---- include/uapi/linux/ethtool.h | 17 +++++++++-------- 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 19a8de5326fb..e6ebc9761822 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -98,10 +98,6 @@ static inline u32 ethtool_rxfh_indir_default(u32 index, u32 n_rx_rings) return index % n_rx_rings; } -/* number of link mode bits/ulongs handled internally by kernel */ -#define __ETHTOOL_LINK_MODE_MASK_NBITS \ - (__ETHTOOL_LINK_MODE_LAST + 1) - /* declare a link mode bitmap */ #define __ETHTOOL_DECLARE_LINK_MODE_MASK(name) \ DECLARE_BITMAP(name, __ETHTOOL_LINK_MODE_MASK_NBITS) diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 378c52308d89..3652b239dad1 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1432,6 +1432,13 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_56000baseSR4_Full_BIT = 29, ETHTOOL_LINK_MODE_56000baseLR4_Full_BIT = 30, ETHTOOL_LINK_MODE_25000baseCR_Full_BIT = 31, + + /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit + * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_* + * macro for bits > 31. The only way to use indices > 31 is to + * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. + */ + ETHTOOL_LINK_MODE_25000baseKR_Full_BIT = 32, ETHTOOL_LINK_MODE_25000baseSR_Full_BIT = 33, ETHTOOL_LINK_MODE_50000baseCR2_Full_BIT = 34, @@ -1469,14 +1476,8 @@ enum ethtool_link_mode_bit_indices { ETHTOOL_LINK_MODE_200000baseDR4_Full_BIT = 65, ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT = 66, - /* Last allowed bit for __ETHTOOL_LINK_MODE_LEGACY_MASK is bit - * 31. Please do NOT define any SUPPORTED_* or ADVERTISED_* - * macro for bits > 31. The only way to use indices > 31 is to - * use the new ETHTOOL_GLINKSETTINGS/ETHTOOL_SLINKSETTINGS API. - */ - - __ETHTOOL_LINK_MODE_LAST - = ETHTOOL_LINK_MODE_200000baseCR4_Full_BIT, + /* must be last entry */ + __ETHTOOL_LINK_MODE_MASK_NBITS }; #define __ETHTOOL_LINK_MODE_LEGACY_MASK(base_name) \ -- cgit v1.2.3 From 228a73abde5c04428678e917b271f8526cfd90ed Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Fri, 4 Jan 2019 13:31:54 +0800 Subject: btrfs: introduce new ioctl to unregister a btrfs device Support for a new command that can be used eg. as a command $ btrfs device scan --forget [dev]' (the final name may change though) to undo the effects of 'btrfs device scan [dev]'. For this purpose this patch proposes to use ioctl #5 as it was empty and is next to the SCAN ioctl. The new ioctl BTRFS_IOC_FORGET_DEV works only on the control device (/dev/btrfs-control) to unregister one or all devices, devices that are not mounted. The argument is struct btrfs_ioctl_vol_args, ::name specifies the device path. To unregister all device, the path is an empty string. Again, the devices are removed only if they aren't part of a mounte filesystem. This new ioctl provides: - release of unwanted btrfs_fs_devices and btrfs_devices structures from memory if the device is not going to be mounted - ability to mount filesystem in degraded mode, when one devices is corrupted like in split brain raid1 - running test cases which would require reloading the kernel module but this is not possible eg. due to mounted filesystem or built-in Signed-off-by: Anand Jain Reviewed-by: David Sterba [ update changelog ] Signed-off-by: David Sterba --- fs/btrfs/super.c | 3 +++ fs/btrfs/volumes.c | 11 +++++++++++ fs/btrfs/volumes.h | 1 + include/uapi/linux/btrfs.h | 2 ++ 4 files changed, 17 insertions(+) (limited to 'include/uapi/linux') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 0a3f122dd61f..f9d13a30aa8a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2190,6 +2190,9 @@ static long btrfs_control_ioctl(struct file *file, unsigned int cmd, ret = PTR_ERR_OR_ZERO(device); mutex_unlock(&uuid_mutex); break; + case BTRFS_IOC_FORGET_DEV: + ret = btrfs_forget_devices(vol->name); + break; case BTRFS_IOC_DEVICES_READY: mutex_lock(&uuid_mutex); device = btrfs_scan_one_device(vol->name, FMODE_READ, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7ca31c83e730..fe122e6099ae 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1446,6 +1446,17 @@ static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr, return 0; } +int btrfs_forget_devices(const char *path) +{ + int ret; + + mutex_lock(&uuid_mutex); + ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL); + mutex_unlock(&uuid_mutex); + + return ret; +} + /* * Look for a btrfs signature on a device. This may be called out of the mount path * and we are not allowed to call set_blocksize during the scan. The superblock diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 656ea8d85770..3ad9d58d1b66 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -416,6 +416,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); +int btrfs_forget_devices(const char *path); int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index e0763bc4158e..c195896d478f 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -837,6 +837,8 @@ enum btrfs_err_code { struct btrfs_ioctl_vol_args) #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_FORGET_DEV _IOW(BTRFS_IOCTL_MAGIC, 5, \ + struct btrfs_ioctl_vol_args) /* trans start and trans end are dangerous, and only for * use by applications that know how to avoid the * resulting deadlocks -- cgit v1.2.3 From 3f7ae5f3dc5295ac17d6521130ed8a8f8a723fbf Mon Sep 17 00:00:00 2001 From: "Mohit P. Tahiliani" Date: Tue, 26 Feb 2019 00:39:59 +0530 Subject: net: sched: pie: add more cases to auto-tune alpha and beta The current implementation scales the local alpha and beta variables in the calculate_probability function by the same amount for all values of drop probability below 1%. RFC 8033 suggests using additional cases for auto-tuning alpha and beta when the drop probability is less than 1%. In order to add more auto-tuning cases, MAX_PROB must be scaled by u64 instead of u32 to prevent underflow when scaling the local alpha and beta variables in the calculate_probability function. Signed-off-by: Mohit P. Tahiliani Signed-off-by: Dhaval Khandla Signed-off-by: Hrishikesh Hiraskar Signed-off-by: Manish Kumar B Signed-off-by: Sachin D. Patil Signed-off-by: Leslie Monis Acked-by: Dave Taht Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 2 +- net/sched/sch_pie.c | 65 +++++++++++++++++++++--------------------- 2 files changed, 33 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 0d18b1d1fbbc..1eb572ef3f27 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -954,7 +954,7 @@ enum { #define TCA_PIE_MAX (__TCA_PIE_MAX - 1) struct tc_pie_xstats { - __u32 prob; /* current probability */ + __u64 prob; /* current probability */ __u32 delay; /* current delay in ms */ __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ __u32 packets_in; /* total number of packets enqueued */ diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index d88ab53593b3..30f158582499 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -33,7 +33,7 @@ #define QUEUE_THRESHOLD 16384 #define DQCOUNT_INVALID -1 -#define MAX_PROB 0xffffffff +#define MAX_PROB 0xffffffffffffffff #define PIE_SCALE 8 /* parameters used */ @@ -49,7 +49,7 @@ struct pie_params { /* variables used */ struct pie_vars { - u32 prob; /* probability but scaled by u32 limit. */ + u64 prob; /* probability but scaled by u64 limit. */ psched_time_t burst_time; psched_time_t qdelay; psched_time_t qdelay_old; @@ -99,8 +99,8 @@ static void pie_vars_init(struct pie_vars *vars) static bool drop_early(struct Qdisc *sch, u32 packet_size) { struct pie_sched_data *q = qdisc_priv(sch); - u32 rnd; - u32 local_prob = q->vars.prob; + u64 rnd; + u64 local_prob = q->vars.prob; u32 mtu = psched_mtu(qdisc_dev(sch)); /* If there is still burst allowance left skip random early drop */ @@ -124,11 +124,11 @@ static bool drop_early(struct Qdisc *sch, u32 packet_size) * probablity. Smaller packets will have lower drop prob in this case */ if (q->params.bytemode && packet_size <= mtu) - local_prob = (local_prob / mtu) * packet_size; + local_prob = (u64)packet_size * div_u64(local_prob, mtu); else local_prob = q->vars.prob; - rnd = prandom_u32(); + prandom_bytes(&rnd, 8); if (rnd < local_prob) return true; @@ -317,9 +317,10 @@ static void calculate_probability(struct Qdisc *sch) u32 qlen = sch->qstats.backlog; /* queue size in bytes */ psched_time_t qdelay = 0; /* in pschedtime */ psched_time_t qdelay_old = q->vars.qdelay; /* in pschedtime */ - s32 delta = 0; /* determines the change in probability */ - u32 oldprob; - u32 alpha, beta; + s64 delta = 0; /* determines the change in probability */ + u64 oldprob; + u64 alpha, beta; + u32 power; bool update_prob = true; q->vars.qdelay_old = q->vars.qdelay; @@ -339,38 +340,36 @@ static void calculate_probability(struct Qdisc *sch) * value for alpha as 0.125. In this implementation, we use values 0-32 * passed from user space to represent this. Also, alpha and beta have * unit of HZ and need to be scaled before they can used to update - * probability. alpha/beta are updated locally below by 1) scaling them - * appropriately 2) scaling down by 16 to come to 0-2 range. - * Please see paper for details. - * - * We scale alpha and beta differently depending on whether we are in - * light, medium or high dropping mode. + * probability. alpha/beta are updated locally below by scaling down + * by 16 to come to 0-2 range. */ - if (q->vars.prob < MAX_PROB / 100) { - alpha = - (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; - beta = - (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 7; - } else if (q->vars.prob < MAX_PROB / 10) { - alpha = - (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; - beta = - (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 5; - } else { - alpha = - (q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; - beta = - (q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + alpha = ((u64)q->params.alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + beta = ((u64)q->params.beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4; + + /* We scale alpha and beta differently depending on how heavy the + * congestion is. Please see RFC 8033 for details. + */ + if (q->vars.prob < MAX_PROB / 10) { + alpha >>= 1; + beta >>= 1; + + power = 100; + while (q->vars.prob < div_u64(MAX_PROB, power) && + power <= 1000000) { + alpha >>= 2; + beta >>= 2; + power *= 10; + } } /* alpha and beta should be between 0 and 32, in multiples of 1/16 */ - delta += alpha * ((qdelay - q->params.target)); - delta += beta * ((qdelay - qdelay_old)); + delta += alpha * (u64)(qdelay - q->params.target); + delta += beta * (u64)(qdelay - qdelay_old); oldprob = q->vars.prob; /* to ensure we increase probability in steps of no more than 2% */ - if (delta > (s32)(MAX_PROB / (100 / 2)) && + if (delta > (s64)(MAX_PROB / (100 / 2)) && q->vars.prob >= MAX_PROB / 10) delta = (MAX_PROB / 100) * 2; -- cgit v1.2.3 From e5567f5f67621877726f99be040af9fbedda37dc Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 19 Feb 2019 11:04:51 -0800 Subject: PCI/ATS: Add pci_prg_resp_pasid_required() interface. Return the PRG Response PASID Required bit in the Page Request Status Register. As per PCIe spec r4.0, sec 10.5.2.3, if this bit is Set, the device expects a PASID TLP Prefix on PRG Response Messages when the corresponding Page Requests had a PASID TLP Prefix. If Clear, the device does not expect PASID TLP Prefixes on any PRG Response Message, and the device behavior is undefined if the device receives a PRG Response Message with a PASID TLP Prefix. Also the device behavior is undefined if this bit is Set and the device receives a PRG Response Message with no PASID TLP Prefix when the corresponding Page Requests had a PASID TLP Prefix. This function will be used by drivers like IOMMU, if it is required to check the status of the PRG Response PASID Required bit before enabling the PASID support of the device. Cc: Ashok Raj Cc: Jacob Pan Cc: Keith Busch Suggested-by: Ashok Raj Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Bjorn Helgaas Signed-off-by: Joerg Roedel --- drivers/pci/ats.c | 30 ++++++++++++++++++++++++++++++ include/linux/pci-ats.h | 5 +++++ include/uapi/linux/pci_regs.h | 1 + 3 files changed, 36 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 5b78f3b1b918..420cd0a578d0 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -368,6 +368,36 @@ int pci_pasid_features(struct pci_dev *pdev) } EXPORT_SYMBOL_GPL(pci_pasid_features); +/** + * pci_prg_resp_pasid_required - Return PRG Response PASID Required bit + * status. + * @pdev: PCI device structure + * + * Returns 1 if PASID is required in PRG Response Message, 0 otherwise. + * + * Even though the PRG response PASID status is read from PRI Status + * Register, since this API will mainly be used by PASID users, this + * function is defined within #ifdef CONFIG_PCI_PASID instead of + * CONFIG_PCI_PRI. + */ +int pci_prg_resp_pasid_required(struct pci_dev *pdev) +{ + u16 status; + int pos; + + pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_PRI); + if (!pos) + return 0; + + pci_read_config_word(pdev, pos + PCI_PRI_STATUS, &status); + + if (status & PCI_PRI_STATUS_PASID) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_prg_resp_pasid_required); + #define PASID_NUMBER_SHIFT 8 #define PASID_NUMBER_MASK (0x1f << PASID_NUMBER_SHIFT) /** diff --git a/include/linux/pci-ats.h b/include/linux/pci-ats.h index 7c4b8e27268c..facfd6a18fe1 100644 --- a/include/linux/pci-ats.h +++ b/include/linux/pci-ats.h @@ -40,6 +40,7 @@ void pci_disable_pasid(struct pci_dev *pdev); void pci_restore_pasid_state(struct pci_dev *pdev); int pci_pasid_features(struct pci_dev *pdev); int pci_max_pasids(struct pci_dev *pdev); +int pci_prg_resp_pasid_required(struct pci_dev *pdev); #else /* CONFIG_PCI_PASID */ @@ -66,6 +67,10 @@ static inline int pci_max_pasids(struct pci_dev *pdev) return -EINVAL; } +static int pci_prg_resp_pasid_required(struct pci_dev *pdev) +{ + return 0; +} #endif /* CONFIG_PCI_PASID */ diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index e1e9888c85e6..898be572b010 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -880,6 +880,7 @@ #define PCI_PRI_STATUS_RF 0x001 /* Response Failure */ #define PCI_PRI_STATUS_UPRGI 0x002 /* Unexpected PRG index */ #define PCI_PRI_STATUS_STOPPED 0x100 /* PRI Stopped */ +#define PCI_PRI_STATUS_PASID 0x8000 /* PRG Response PASID Required */ #define PCI_PRI_MAX_REQ 0x08 /* PRI max reqs supported */ #define PCI_PRI_ALLOC_REQ 0x0c /* PRI max reqs allowed */ #define PCI_EXT_CAP_PRI_SIZEOF 16 -- cgit v1.2.3 From 8c938ddc6df3bbe72809db1be6c9f3af83f5d7a9 Mon Sep 17 00:00:00 2001 From: Kuppuswamy Sathyanarayanan Date: Tue, 19 Feb 2019 11:06:09 -0800 Subject: PCI/ATS: Add pci_ats_page_aligned() interface Return the Page Aligned Request bit in the ATS Capability Register. As per PCIe spec r4.0, sec 10.5.1.2, if the Page Aligned Request bit is set, it indicates the Untranslated Addresses generated by the device are always aligned to a 4096 byte boundary. An IOMMU that can only translate page-aligned addresses can only be used with devices that always produce aligned Untranslated Addresses. This interface will be used by drivers for such IOMMUs to determine whether devices can use the ATS service. Cc: Ashok Raj Cc: Jacob Pan Cc: Keith Busch Suggested-by: Ashok Raj Signed-off-by: Kuppuswamy Sathyanarayanan Acked-by: Bjorn Helgaas Signed-off-by: Joerg Roedel --- drivers/pci/ats.c | 27 +++++++++++++++++++++++++++ include/linux/pci.h | 2 ++ include/uapi/linux/pci_regs.h | 1 + 3 files changed, 30 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/pci/ats.c b/drivers/pci/ats.c index 420cd0a578d0..97c08146534a 100644 --- a/drivers/pci/ats.c +++ b/drivers/pci/ats.c @@ -142,6 +142,33 @@ int pci_ats_queue_depth(struct pci_dev *dev) } EXPORT_SYMBOL_GPL(pci_ats_queue_depth); +/** + * pci_ats_page_aligned - Return Page Aligned Request bit status. + * @pdev: the PCI device + * + * Returns 1, if the Untranslated Addresses generated by the device + * are always aligned or 0 otherwise. + * + * Per PCIe spec r4.0, sec 10.5.1.2, if the Page Aligned Request bit + * is set, it indicates the Untranslated Addresses generated by the + * device are always aligned to a 4096 byte boundary. + */ +int pci_ats_page_aligned(struct pci_dev *pdev) +{ + u16 cap; + + if (!pdev->ats_cap) + return 0; + + pci_read_config_word(pdev, pdev->ats_cap + PCI_ATS_CAP, &cap); + + if (cap & PCI_ATS_CAP_PAGE_ALIGNED) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(pci_ats_page_aligned); + #ifdef CONFIG_PCI_PRI /** * pci_enable_pri - Enable PRI capability diff --git a/include/linux/pci.h b/include/linux/pci.h index 65f1d8c2f082..9724a8c0496b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1524,11 +1524,13 @@ void pci_ats_init(struct pci_dev *dev); int pci_enable_ats(struct pci_dev *dev, int ps); void pci_disable_ats(struct pci_dev *dev); int pci_ats_queue_depth(struct pci_dev *dev); +int pci_ats_page_aligned(struct pci_dev *dev); #else static inline void pci_ats_init(struct pci_dev *d) { } static inline int pci_enable_ats(struct pci_dev *d, int ps) { return -ENODEV; } static inline void pci_disable_ats(struct pci_dev *d) { } static inline int pci_ats_queue_depth(struct pci_dev *d) { return -ENODEV; } +static inline int pci_ats_page_aligned(struct pci_dev *dev) { return 0; } #endif #ifdef CONFIG_PCIE_PTM diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 898be572b010..5c98133f2c94 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -866,6 +866,7 @@ #define PCI_ATS_CAP 0x04 /* ATS Capability Register */ #define PCI_ATS_CAP_QDEP(x) ((x) & 0x1f) /* Invalidate Queue Depth */ #define PCI_ATS_MAX_QDEP 32 /* Max Invalidate Queue Depth */ +#define PCI_ATS_CAP_PAGE_ALIGNED 0x0020 /* Page Aligned Request */ #define PCI_ATS_CTRL 0x06 /* ATS Control Register */ #define PCI_ATS_CTRL_ENABLE 0x8000 /* ATS Enable */ #define PCI_ATS_CTRL_STU(x) ((x) & 0x1f) /* Smallest Translation Unit */ -- cgit v1.2.3 From 5f8f8b93aeb8371c54af08bece2bd04bc2d48707 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Feb 2019 14:28:40 -0800 Subject: bpf: expose program stats via bpf_prog_info Return bpf program run_time_ns and run_cnt via bpf_prog_info Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 2 ++ kernel/bpf/syscall.c | 5 +++++ 2 files changed, 7 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index bcdd2474eee7..2e308e90ffea 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2813,6 +2813,8 @@ struct bpf_prog_info { __u32 jited_line_info_rec_size; __u32 nr_prog_tags; __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 31cf66fc3f5c..174581dfe225 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2152,6 +2152,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info); struct bpf_prog_info info = {}; u32 info_len = attr->info.info_len; + struct bpf_prog_stats stats; char __user *uinsns; u32 ulen; int err; @@ -2191,6 +2192,10 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, if (err) return err; + bpf_prog_get_stats(prog, &stats); + info.run_time_ns = stats.nsecs; + info.run_cnt = stats.cnt; + if (!capable(CAP_SYS_ADMIN)) { info.jited_prog_len = 0; info.xlated_prog_len = 0; -- cgit v1.2.3 From 2b188cc1bb857a9d4701ae59aa7768b5124e262e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 7 Jan 2019 10:46:33 -0700 Subject: Add io_uring IO interface The submission queue (SQ) and completion queue (CQ) rings are shared between the application and the kernel. This eliminates the need to copy data back and forth to submit and complete IO. IO submissions use the io_uring_sqe data structure, and completions are generated in the form of io_uring_cqe data structures. The SQ ring is an index into the io_uring_sqe array, which makes it possible to submit a batch of IOs without them being contiguous in the ring. The CQ ring is always contiguous, as completion events are inherently unordered, and hence any io_uring_cqe entry can point back to an arbitrary submission. Two new system calls are added for this: io_uring_setup(entries, params) Sets up an io_uring instance for doing async IO. On success, returns a file descriptor that the application can mmap to gain access to the SQ ring, CQ ring, and io_uring_sqes. io_uring_enter(fd, to_submit, min_complete, flags, sigset, sigsetsize) Initiates IO against the rings mapped to this fd, or waits for them to complete, or both. The behavior is controlled by the parameters passed in. If 'to_submit' is non-zero, then we'll try and submit new IO. If IORING_ENTER_GETEVENTS is set, the kernel will wait for 'min_complete' events, if they aren't already available. It's valid to set IORING_ENTER_GETEVENTS and 'min_complete' == 0 at the same time, this allows the kernel to return already completed events without waiting for them. This is useful only for polling, as for IRQ driven IO, the application can just check the CQ ring without entering the kernel. With this setup, it's possible to do async IO with a single system call. Future developments will enable polled IO with this interface, and polled submission as well. The latter will enable an application to do IO without doing ANY system calls at all. For IRQ driven IO, an application only needs to enter the kernel for completions if it wants to wait for them to occur. Each io_uring is backed by a workqueue, to support buffered async IO as well. We will only punt to an async context if the command would need to wait for IO on the device side. Any data that can be accessed directly in the page cache is done inline. This avoids the slowness issue of usual threadpools, since cached data is accessed as quickly as a sync interface. Sample application: http://git.kernel.dk/cgit/fio/plain/t/io_uring.c Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- arch/x86/entry/syscalls/syscall_32.tbl | 2 + arch/x86/entry/syscalls/syscall_64.tbl | 2 + fs/Makefile | 1 + fs/io_uring.c | 1255 ++++++++++++++++++++++++++++++++ include/linux/fs.h | 9 + include/linux/sched/user.h | 2 +- include/linux/syscalls.h | 6 + include/uapi/asm-generic/unistd.h | 6 +- include/uapi/linux/io_uring.h | 95 +++ init/Kconfig | 9 + kernel/sys_ni.c | 2 + net/unix/garbage.c | 3 + 12 files changed, 1390 insertions(+), 2 deletions(-) create mode 100644 fs/io_uring.c create mode 100644 include/uapi/linux/io_uring.h (limited to 'include/uapi/linux') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 3cf7b533b3d1..481c126259e9 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -398,3 +398,5 @@ 384 i386 arch_prctl sys_arch_prctl __ia32_compat_sys_arch_prctl 385 i386 io_pgetevents sys_io_pgetevents __ia32_compat_sys_io_pgetevents 386 i386 rseq sys_rseq __ia32_sys_rseq +425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup +426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index f0b1709a5ffb..6a32a430c8e0 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -343,6 +343,8 @@ 332 common statx __x64_sys_statx 333 common io_pgetevents __x64_sys_io_pgetevents 334 common rseq __x64_sys_rseq +425 common io_uring_setup __x64_sys_io_uring_setup +426 common io_uring_enter __x64_sys_io_uring_enter # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/Makefile b/fs/Makefile index 293733f61594..8e15d6fc4340 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_IO_URING) += io_uring.o obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FILE_LOCKING) += locks.o diff --git a/fs/io_uring.c b/fs/io_uring.c new file mode 100644 index 000000000000..f68052290426 --- /dev/null +++ b/fs/io_uring.c @@ -0,0 +1,1255 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Shared application/kernel submission and completion ring pairs, for + * supporting fast/efficient IO. + * + * A note on the read/write ordering memory barriers that are matched between + * the application and kernel side. When the application reads the CQ ring + * tail, it must use an appropriate smp_rmb() to order with the smp_wmb() + * the kernel uses after writing the tail. Failure to do so could cause a + * delay in when the application notices that completion events available. + * This isn't a fatal condition. Likewise, the application must use an + * appropriate smp_wmb() both before writing the SQ tail, and after writing + * the SQ tail. The first one orders the sqe writes with the tail write, and + * the latter is paired with the smp_rmb() the kernel will issue before + * reading the SQ tail on submission. + * + * Also see the examples in the liburing library: + * + * git://git.kernel.dk/liburing + * + * io_uring also uses READ/WRITE_ONCE() for _any_ store or load that happens + * from data shared between the kernel and application. This is done both + * for ordering purposes, but also to ensure that once a value is loaded from + * data that the application could potentially modify, it remains stable. + * + * Copyright (C) 2018-2019 Jens Axboe + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "internal.h" + +#define IORING_MAX_ENTRIES 4096 + +struct io_uring { + u32 head ____cacheline_aligned_in_smp; + u32 tail ____cacheline_aligned_in_smp; +}; + +struct io_sq_ring { + struct io_uring r; + u32 ring_mask; + u32 ring_entries; + u32 dropped; + u32 flags; + u32 array[]; +}; + +struct io_cq_ring { + struct io_uring r; + u32 ring_mask; + u32 ring_entries; + u32 overflow; + struct io_uring_cqe cqes[]; +}; + +struct io_ring_ctx { + struct { + struct percpu_ref refs; + } ____cacheline_aligned_in_smp; + + struct { + unsigned int flags; + bool compat; + bool account_mem; + + /* SQ ring */ + struct io_sq_ring *sq_ring; + unsigned cached_sq_head; + unsigned sq_entries; + unsigned sq_mask; + struct io_uring_sqe *sq_sqes; + } ____cacheline_aligned_in_smp; + + /* IO offload */ + struct workqueue_struct *sqo_wq; + struct mm_struct *sqo_mm; + + struct { + /* CQ ring */ + struct io_cq_ring *cq_ring; + unsigned cached_cq_tail; + unsigned cq_entries; + unsigned cq_mask; + struct wait_queue_head cq_wait; + struct fasync_struct *cq_fasync; + } ____cacheline_aligned_in_smp; + + struct user_struct *user; + + struct completion ctx_done; + + struct { + struct mutex uring_lock; + wait_queue_head_t wait; + } ____cacheline_aligned_in_smp; + + struct { + spinlock_t completion_lock; + } ____cacheline_aligned_in_smp; + +#if defined(CONFIG_UNIX) + struct socket *ring_sock; +#endif +}; + +struct sqe_submit { + const struct io_uring_sqe *sqe; + unsigned short index; + bool has_user; +}; + +struct io_kiocb { + struct kiocb rw; + + struct sqe_submit submit; + + struct io_ring_ctx *ctx; + struct list_head list; + unsigned int flags; +#define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ + u64 user_data; + + struct work_struct work; +}; + +#define IO_PLUG_THRESHOLD 2 + +static struct kmem_cache *req_cachep; + +static const struct file_operations io_uring_fops; + +struct sock *io_uring_get_socket(struct file *file) +{ +#if defined(CONFIG_UNIX) + if (file->f_op == &io_uring_fops) { + struct io_ring_ctx *ctx = file->private_data; + + return ctx->ring_sock->sk; + } +#endif + return NULL; +} +EXPORT_SYMBOL(io_uring_get_socket); + +static void io_ring_ctx_ref_free(struct percpu_ref *ref) +{ + struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs); + + complete(&ctx->ctx_done); +} + +static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) +{ + struct io_ring_ctx *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) { + kfree(ctx); + return NULL; + } + + ctx->flags = p->flags; + init_waitqueue_head(&ctx->cq_wait); + init_completion(&ctx->ctx_done); + mutex_init(&ctx->uring_lock); + init_waitqueue_head(&ctx->wait); + spin_lock_init(&ctx->completion_lock); + return ctx; +} + +static void io_commit_cqring(struct io_ring_ctx *ctx) +{ + struct io_cq_ring *ring = ctx->cq_ring; + + if (ctx->cached_cq_tail != READ_ONCE(ring->r.tail)) { + /* order cqe stores with ring update */ + smp_store_release(&ring->r.tail, ctx->cached_cq_tail); + + /* + * Write sider barrier of tail update, app has read side. See + * comment at the top of this file. + */ + smp_wmb(); + + if (wq_has_sleeper(&ctx->cq_wait)) { + wake_up_interruptible(&ctx->cq_wait); + kill_fasync(&ctx->cq_fasync, SIGIO, POLL_IN); + } + } +} + +static struct io_uring_cqe *io_get_cqring(struct io_ring_ctx *ctx) +{ + struct io_cq_ring *ring = ctx->cq_ring; + unsigned tail; + + tail = ctx->cached_cq_tail; + /* See comment at the top of the file */ + smp_rmb(); + if (tail + 1 == READ_ONCE(ring->r.head)) + return NULL; + + ctx->cached_cq_tail++; + return &ring->cqes[tail & ctx->cq_mask]; +} + +static void io_cqring_fill_event(struct io_ring_ctx *ctx, u64 ki_user_data, + long res, unsigned ev_flags) +{ + struct io_uring_cqe *cqe; + + /* + * If we can't get a cq entry, userspace overflowed the + * submission (by quite a lot). Increment the overflow count in + * the ring. + */ + cqe = io_get_cqring(ctx); + if (cqe) { + WRITE_ONCE(cqe->user_data, ki_user_data); + WRITE_ONCE(cqe->res, res); + WRITE_ONCE(cqe->flags, ev_flags); + } else { + unsigned overflow = READ_ONCE(ctx->cq_ring->overflow); + + WRITE_ONCE(ctx->cq_ring->overflow, overflow + 1); + } +} + +static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data, + long res, unsigned ev_flags) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->completion_lock, flags); + io_cqring_fill_event(ctx, ki_user_data, res, ev_flags); + io_commit_cqring(ctx); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); +} + +static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) +{ + percpu_ref_put_many(&ctx->refs, refs); + + if (waitqueue_active(&ctx->wait)) + wake_up(&ctx->wait); +} + +static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + if (!percpu_ref_tryget(&ctx->refs)) + return NULL; + + req = kmem_cache_alloc(req_cachep, __GFP_NOWARN); + if (req) { + req->ctx = ctx; + req->flags = 0; + return req; + } + + io_ring_drop_ctx_refs(ctx, 1); + return NULL; +} + +static void io_free_req(struct io_kiocb *req) +{ + io_ring_drop_ctx_refs(req->ctx, 1); + kmem_cache_free(req_cachep, req); +} + +static void kiocb_end_write(struct kiocb *kiocb) +{ + if (kiocb->ki_flags & IOCB_WRITE) { + struct inode *inode = file_inode(kiocb->ki_filp); + + /* + * Tell lockdep we inherited freeze protection from submission + * thread. + */ + if (S_ISREG(inode->i_mode)) + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); + file_end_write(kiocb->ki_filp); + } +} + +static void io_complete_rw(struct kiocb *kiocb, long res, long res2) +{ + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + + kiocb_end_write(kiocb); + + fput(kiocb->ki_filp); + io_cqring_add_event(req->ctx, req->user_data, res, 0); + io_free_req(req); +} + +/* + * If we tracked the file through the SCM inflight mechanism, we could support + * any file. For now, just ensure that anything potentially problematic is done + * inline. + */ +static bool io_file_supports_async(struct file *file) +{ + umode_t mode = file_inode(file)->i_mode; + + if (S_ISBLK(mode) || S_ISCHR(mode)) + return true; + if (S_ISREG(mode) && file->f_op != &io_uring_fops) + return true; + + return false; +} + +static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) +{ + struct kiocb *kiocb = &req->rw; + unsigned ioprio; + int fd, ret; + + /* For -EAGAIN retry, everything is already prepped */ + if (kiocb->ki_filp) + return 0; + + fd = READ_ONCE(sqe->fd); + kiocb->ki_filp = fget(fd); + if (unlikely(!kiocb->ki_filp)) + return -EBADF; + if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) + force_nonblock = false; + kiocb->ki_pos = READ_ONCE(sqe->off); + kiocb->ki_flags = iocb_flags(kiocb->ki_filp); + kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); + + ioprio = READ_ONCE(sqe->ioprio); + if (ioprio) { + ret = ioprio_check_cap(ioprio); + if (ret) + goto out_fput; + + kiocb->ki_ioprio = ioprio; + } else + kiocb->ki_ioprio = get_current_ioprio(); + + ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); + if (unlikely(ret)) + goto out_fput; + if (force_nonblock) { + kiocb->ki_flags |= IOCB_NOWAIT; + req->flags |= REQ_F_FORCE_NONBLOCK; + } + if (kiocb->ki_flags & IOCB_HIPRI) { + ret = -EINVAL; + goto out_fput; + } + + kiocb->ki_complete = io_complete_rw; + return 0; +out_fput: + fput(kiocb->ki_filp); + return ret; +} + +static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) +{ + switch (ret) { + case -EIOCBQUEUED: + break; + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* + * We can't just restart the syscall, since previously + * submitted sqes may already be in progress. Just fail this + * IO with EINTR. + */ + ret = -EINTR; + /* fall through */ + default: + kiocb->ki_complete(kiocb, ret, 0); + } +} + +static int io_import_iovec(struct io_ring_ctx *ctx, int rw, + const struct sqe_submit *s, struct iovec **iovec, + struct iov_iter *iter) +{ + const struct io_uring_sqe *sqe = s->sqe; + void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); + size_t sqe_len = READ_ONCE(sqe->len); + + if (!s->has_user) + return -EFAULT; + +#ifdef CONFIG_COMPAT + if (ctx->compat) + return compat_import_iovec(rw, buf, sqe_len, UIO_FASTIOV, + iovec, iter); +#endif + + return import_iovec(rw, buf, sqe_len, UIO_FASTIOV, iovec, iter); +} + +static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s, + bool force_nonblock) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct kiocb *kiocb = &req->rw; + struct iov_iter iter; + struct file *file; + ssize_t ret; + + ret = io_prep_rw(req, s->sqe, force_nonblock); + if (ret) + return ret; + file = kiocb->ki_filp; + + ret = -EBADF; + if (unlikely(!(file->f_mode & FMODE_READ))) + goto out_fput; + ret = -EINVAL; + if (unlikely(!file->f_op->read_iter)) + goto out_fput; + + ret = io_import_iovec(req->ctx, READ, s, &iovec, &iter); + if (ret) + goto out_fput; + + ret = rw_verify_area(READ, file, &kiocb->ki_pos, iov_iter_count(&iter)); + if (!ret) { + ssize_t ret2; + + /* Catch -EAGAIN return for forced non-blocking submission */ + ret2 = call_read_iter(file, kiocb, &iter); + if (!force_nonblock || ret2 != -EAGAIN) + io_rw_done(kiocb, ret2); + else + ret = -EAGAIN; + } + kfree(iovec); +out_fput: + /* Hold on to the file for -EAGAIN */ + if (unlikely(ret && ret != -EAGAIN)) + fput(file); + return ret; +} + +static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, + bool force_nonblock) +{ + struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs; + struct kiocb *kiocb = &req->rw; + struct iov_iter iter; + struct file *file; + ssize_t ret; + + ret = io_prep_rw(req, s->sqe, force_nonblock); + if (ret) + return ret; + /* Hold on to the file for -EAGAIN */ + if (force_nonblock && !(kiocb->ki_flags & IOCB_DIRECT)) + return -EAGAIN; + + ret = -EBADF; + file = kiocb->ki_filp; + if (unlikely(!(file->f_mode & FMODE_WRITE))) + goto out_fput; + ret = -EINVAL; + if (unlikely(!file->f_op->write_iter)) + goto out_fput; + + ret = io_import_iovec(req->ctx, WRITE, s, &iovec, &iter); + if (ret) + goto out_fput; + + ret = rw_verify_area(WRITE, file, &kiocb->ki_pos, + iov_iter_count(&iter)); + if (!ret) { + /* + * Open-code file_start_write here to grab freeze protection, + * which will be released by another thread in + * io_complete_rw(). Fool lockdep by telling it the lock got + * released so that it doesn't complain about the held lock when + * we return to userspace. + */ + if (S_ISREG(file_inode(file)->i_mode)) { + __sb_start_write(file_inode(file)->i_sb, + SB_FREEZE_WRITE, true); + __sb_writers_release(file_inode(file)->i_sb, + SB_FREEZE_WRITE); + } + kiocb->ki_flags |= IOCB_WRITE; + io_rw_done(kiocb, call_write_iter(file, kiocb, &iter)); + } + kfree(iovec); +out_fput: + if (unlikely(ret)) + fput(file); + return ret; +} + +/* + * IORING_OP_NOP just posts a completion event, nothing else. + */ +static int io_nop(struct io_kiocb *req, u64 user_data) +{ + struct io_ring_ctx *ctx = req->ctx; + long err = 0; + + /* + * Twilight zone - it's possible that someone issued an opcode that + * has a file attached, then got -EAGAIN on submission, and changed + * the sqe before we retried it from async context. Avoid dropping + * a file reference for this malicious case, and flag the error. + */ + if (req->rw.ki_filp) { + err = -EBADF; + fput(req->rw.ki_filp); + } + io_cqring_add_event(ctx, user_data, err, 0); + io_free_req(req); + return 0; +} + +static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct sqe_submit *s, bool force_nonblock) +{ + ssize_t ret; + int opcode; + + if (unlikely(s->index >= ctx->sq_entries)) + return -EINVAL; + req->user_data = READ_ONCE(s->sqe->user_data); + + opcode = READ_ONCE(s->sqe->opcode); + switch (opcode) { + case IORING_OP_NOP: + ret = io_nop(req, req->user_data); + break; + case IORING_OP_READV: + ret = io_read(req, s, force_nonblock); + break; + case IORING_OP_WRITEV: + ret = io_write(req, s, force_nonblock); + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static void io_sq_wq_submit_work(struct work_struct *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct sqe_submit *s = &req->submit; + const struct io_uring_sqe *sqe = s->sqe; + struct io_ring_ctx *ctx = req->ctx; + mm_segment_t old_fs = get_fs(); + int ret; + + /* Ensure we clear previously set forced non-block flag */ + req->flags &= ~REQ_F_FORCE_NONBLOCK; + req->rw.ki_flags &= ~IOCB_NOWAIT; + + if (!mmget_not_zero(ctx->sqo_mm)) { + ret = -EFAULT; + goto err; + } + + use_mm(ctx->sqo_mm); + set_fs(USER_DS); + s->has_user = true; + + ret = __io_submit_sqe(ctx, req, s, false); + + set_fs(old_fs); + unuse_mm(ctx->sqo_mm); + mmput(ctx->sqo_mm); +err: + if (ret) { + io_cqring_add_event(ctx, sqe->user_data, ret, 0); + io_free_req(req); + } + + /* async context always use a copy of the sqe */ + kfree(sqe); +} + +static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s) +{ + struct io_kiocb *req; + ssize_t ret; + + /* enforce forwards compatibility on users */ + if (unlikely(s->sqe->flags)) + return -EINVAL; + + req = io_get_req(ctx); + if (unlikely(!req)) + return -EAGAIN; + + req->rw.ki_filp = NULL; + + ret = __io_submit_sqe(ctx, req, s, true); + if (ret == -EAGAIN) { + struct io_uring_sqe *sqe_copy; + + sqe_copy = kmalloc(sizeof(*sqe_copy), GFP_KERNEL); + if (sqe_copy) { + memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy)); + s->sqe = sqe_copy; + + memcpy(&req->submit, s, sizeof(*s)); + INIT_WORK(&req->work, io_sq_wq_submit_work); + queue_work(ctx->sqo_wq, &req->work); + ret = 0; + } + } + if (ret) + io_free_req(req); + + return ret; +} + +static void io_commit_sqring(struct io_ring_ctx *ctx) +{ + struct io_sq_ring *ring = ctx->sq_ring; + + if (ctx->cached_sq_head != READ_ONCE(ring->r.head)) { + /* + * Ensure any loads from the SQEs are done at this point, + * since once we write the new head, the application could + * write new data to them. + */ + smp_store_release(&ring->r.head, ctx->cached_sq_head); + + /* + * write side barrier of head update, app has read side. See + * comment at the top of this file + */ + smp_wmb(); + } +} + +/* + * Undo last io_get_sqring() + */ +static void io_drop_sqring(struct io_ring_ctx *ctx) +{ + ctx->cached_sq_head--; +} + +/* + * Fetch an sqe, if one is available. Note that s->sqe will point to memory + * that is mapped by userspace. This means that care needs to be taken to + * ensure that reads are stable, as we cannot rely on userspace always + * being a good citizen. If members of the sqe are validated and then later + * used, it's important that those reads are done through READ_ONCE() to + * prevent a re-load down the line. + */ +static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) +{ + struct io_sq_ring *ring = ctx->sq_ring; + unsigned head; + + /* + * The cached sq head (or cq tail) serves two purposes: + * + * 1) allows us to batch the cost of updating the user visible + * head updates. + * 2) allows the kernel side to track the head on its own, even + * though the application is the one updating it. + */ + head = ctx->cached_sq_head; + /* See comment at the top of this file */ + smp_rmb(); + if (head == READ_ONCE(ring->r.tail)) + return false; + + head = READ_ONCE(ring->array[head & ctx->sq_mask]); + if (head < ctx->sq_entries) { + s->index = head; + s->sqe = &ctx->sq_sqes[head]; + ctx->cached_sq_head++; + return true; + } + + /* drop invalid entries */ + ctx->cached_sq_head++; + ring->dropped++; + /* See comment at the top of this file */ + smp_wmb(); + return false; +} + +static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) +{ + int i, ret = 0, submit = 0; + struct blk_plug plug; + + if (to_submit > IO_PLUG_THRESHOLD) + blk_start_plug(&plug); + + for (i = 0; i < to_submit; i++) { + struct sqe_submit s; + + if (!io_get_sqring(ctx, &s)) + break; + + s.has_user = true; + ret = io_submit_sqe(ctx, &s); + if (ret) { + io_drop_sqring(ctx); + break; + } + + submit++; + } + io_commit_sqring(ctx); + + if (to_submit > IO_PLUG_THRESHOLD) + blk_finish_plug(&plug); + + return submit ? submit : ret; +} + +static unsigned io_cqring_events(struct io_cq_ring *ring) +{ + return READ_ONCE(ring->r.tail) - READ_ONCE(ring->r.head); +} + +/* + * Wait until events become available, if we don't already have some. The + * application must reap them itself, as they reside on the shared cq ring. + */ +static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, + const sigset_t __user *sig, size_t sigsz) +{ + struct io_cq_ring *ring = ctx->cq_ring; + sigset_t ksigmask, sigsaved; + DEFINE_WAIT(wait); + int ret; + + /* See comment at the top of this file */ + smp_rmb(); + if (io_cqring_events(ring) >= min_events) + return 0; + + if (sig) { + ret = set_user_sigmask(sig, &ksigmask, &sigsaved, sigsz); + if (ret) + return ret; + } + + do { + prepare_to_wait(&ctx->wait, &wait, TASK_INTERRUPTIBLE); + + ret = 0; + /* See comment at the top of this file */ + smp_rmb(); + if (io_cqring_events(ring) >= min_events) + break; + + schedule(); + + ret = -EINTR; + if (signal_pending(current)) + break; + } while (1); + + finish_wait(&ctx->wait, &wait); + + if (sig) + restore_user_sigmask(sig, &sigsaved); + + return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0; +} + +static int io_sq_offload_start(struct io_ring_ctx *ctx) +{ + int ret; + + mmgrab(current->mm); + ctx->sqo_mm = current->mm; + + /* Do QD, or 2 * CPUS, whatever is smallest */ + ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE, + min(ctx->sq_entries - 1, 2 * num_online_cpus())); + if (!ctx->sqo_wq) { + ret = -ENOMEM; + goto err; + } + + return 0; +err: + mmdrop(ctx->sqo_mm); + ctx->sqo_mm = NULL; + return ret; +} + +static void io_unaccount_mem(struct user_struct *user, unsigned long nr_pages) +{ + atomic_long_sub(nr_pages, &user->locked_vm); +} + +static int io_account_mem(struct user_struct *user, unsigned long nr_pages) +{ + unsigned long page_limit, cur_pages, new_pages; + + /* Don't allow more pages than we can safely lock */ + page_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + do { + cur_pages = atomic_long_read(&user->locked_vm); + new_pages = cur_pages + nr_pages; + if (new_pages > page_limit) + return -ENOMEM; + } while (atomic_long_cmpxchg(&user->locked_vm, cur_pages, + new_pages) != cur_pages); + + return 0; +} + +static void io_mem_free(void *ptr) +{ + struct page *page = virt_to_head_page(ptr); + + if (put_page_testzero(page)) + free_compound_page(page); +} + +static void *io_mem_alloc(size_t size) +{ + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_COMP | + __GFP_NORETRY; + + return (void *) __get_free_pages(gfp_flags, get_order(size)); +} + +static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries) +{ + struct io_sq_ring *sq_ring; + struct io_cq_ring *cq_ring; + size_t bytes; + + bytes = struct_size(sq_ring, array, sq_entries); + bytes += array_size(sizeof(struct io_uring_sqe), sq_entries); + bytes += struct_size(cq_ring, cqes, cq_entries); + + return (bytes + PAGE_SIZE - 1) / PAGE_SIZE; +} + +static void io_ring_ctx_free(struct io_ring_ctx *ctx) +{ + if (ctx->sqo_wq) + destroy_workqueue(ctx->sqo_wq); + if (ctx->sqo_mm) + mmdrop(ctx->sqo_mm); +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) + sock_release(ctx->ring_sock); +#endif + + io_mem_free(ctx->sq_ring); + io_mem_free(ctx->sq_sqes); + io_mem_free(ctx->cq_ring); + + percpu_ref_exit(&ctx->refs); + if (ctx->account_mem) + io_unaccount_mem(ctx->user, + ring_pages(ctx->sq_entries, ctx->cq_entries)); + free_uid(ctx->user); + kfree(ctx); +} + +static __poll_t io_uring_poll(struct file *file, poll_table *wait) +{ + struct io_ring_ctx *ctx = file->private_data; + __poll_t mask = 0; + + poll_wait(file, &ctx->cq_wait, wait); + /* See comment at the top of this file */ + smp_rmb(); + if (READ_ONCE(ctx->sq_ring->r.tail) + 1 != ctx->cached_sq_head) + mask |= EPOLLOUT | EPOLLWRNORM; + if (READ_ONCE(ctx->cq_ring->r.head) != ctx->cached_cq_tail) + mask |= EPOLLIN | EPOLLRDNORM; + + return mask; +} + +static int io_uring_fasync(int fd, struct file *file, int on) +{ + struct io_ring_ctx *ctx = file->private_data; + + return fasync_helper(fd, file, on, &ctx->cq_fasync); +} + +static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) +{ + mutex_lock(&ctx->uring_lock); + percpu_ref_kill(&ctx->refs); + mutex_unlock(&ctx->uring_lock); + + wait_for_completion(&ctx->ctx_done); + io_ring_ctx_free(ctx); +} + +static int io_uring_release(struct inode *inode, struct file *file) +{ + struct io_ring_ctx *ctx = file->private_data; + + file->private_data = NULL; + io_ring_ctx_wait_and_kill(ctx); + return 0; +} + +static int io_uring_mmap(struct file *file, struct vm_area_struct *vma) +{ + loff_t offset = (loff_t) vma->vm_pgoff << PAGE_SHIFT; + unsigned long sz = vma->vm_end - vma->vm_start; + struct io_ring_ctx *ctx = file->private_data; + unsigned long pfn; + struct page *page; + void *ptr; + + switch (offset) { + case IORING_OFF_SQ_RING: + ptr = ctx->sq_ring; + break; + case IORING_OFF_SQES: + ptr = ctx->sq_sqes; + break; + case IORING_OFF_CQ_RING: + ptr = ctx->cq_ring; + break; + default: + return -EINVAL; + } + + page = virt_to_head_page(ptr); + if (sz > (PAGE_SIZE << compound_order(page))) + return -EINVAL; + + pfn = virt_to_phys(ptr) >> PAGE_SHIFT; + return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); +} + +SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, + u32, min_complete, u32, flags, const sigset_t __user *, sig, + size_t, sigsz) +{ + struct io_ring_ctx *ctx; + long ret = -EBADF; + int submitted = 0; + struct fd f; + + if (flags & ~IORING_ENTER_GETEVENTS) + return -EINVAL; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + ret = -EOPNOTSUPP; + if (f.file->f_op != &io_uring_fops) + goto out_fput; + + ret = -ENXIO; + ctx = f.file->private_data; + if (!percpu_ref_tryget(&ctx->refs)) + goto out_fput; + + ret = 0; + if (to_submit) { + to_submit = min(to_submit, ctx->sq_entries); + + mutex_lock(&ctx->uring_lock); + submitted = io_ring_submit(ctx, to_submit); + mutex_unlock(&ctx->uring_lock); + + if (submitted < 0) + goto out_ctx; + } + if (flags & IORING_ENTER_GETEVENTS) { + min_complete = min(min_complete, ctx->cq_entries); + + /* + * The application could have included the 'to_submit' count + * in how many events it wanted to wait for. If we failed to + * submit the desired count, we may need to adjust the number + * of events to poll/wait for. + */ + if (submitted < to_submit) + min_complete = min_t(unsigned, submitted, min_complete); + + ret = io_cqring_wait(ctx, min_complete, sig, sigsz); + } + +out_ctx: + io_ring_drop_ctx_refs(ctx, 1); +out_fput: + fdput(f); + return submitted ? submitted : ret; +} + +static const struct file_operations io_uring_fops = { + .release = io_uring_release, + .mmap = io_uring_mmap, + .poll = io_uring_poll, + .fasync = io_uring_fasync, +}; + +static int io_allocate_scq_urings(struct io_ring_ctx *ctx, + struct io_uring_params *p) +{ + struct io_sq_ring *sq_ring; + struct io_cq_ring *cq_ring; + size_t size; + + sq_ring = io_mem_alloc(struct_size(sq_ring, array, p->sq_entries)); + if (!sq_ring) + return -ENOMEM; + + ctx->sq_ring = sq_ring; + sq_ring->ring_mask = p->sq_entries - 1; + sq_ring->ring_entries = p->sq_entries; + ctx->sq_mask = sq_ring->ring_mask; + ctx->sq_entries = sq_ring->ring_entries; + + size = array_size(sizeof(struct io_uring_sqe), p->sq_entries); + if (size == SIZE_MAX) + return -EOVERFLOW; + + ctx->sq_sqes = io_mem_alloc(size); + if (!ctx->sq_sqes) { + io_mem_free(ctx->sq_ring); + return -ENOMEM; + } + + cq_ring = io_mem_alloc(struct_size(cq_ring, cqes, p->cq_entries)); + if (!cq_ring) { + io_mem_free(ctx->sq_ring); + io_mem_free(ctx->sq_sqes); + return -ENOMEM; + } + + ctx->cq_ring = cq_ring; + cq_ring->ring_mask = p->cq_entries - 1; + cq_ring->ring_entries = p->cq_entries; + ctx->cq_mask = cq_ring->ring_mask; + ctx->cq_entries = cq_ring->ring_entries; + return 0; +} + +/* + * Allocate an anonymous fd, this is what constitutes the application + * visible backing of an io_uring instance. The application mmaps this + * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled, + * we have to tie this fd to a socket for file garbage collection purposes. + */ +static int io_uring_get_fd(struct io_ring_ctx *ctx) +{ + struct file *file; + int ret; + +#if defined(CONFIG_UNIX) + ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP, + &ctx->ring_sock); + if (ret) + return ret; +#endif + + ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (ret < 0) + goto err; + + file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx, + O_RDWR | O_CLOEXEC); + if (IS_ERR(file)) { + put_unused_fd(ret); + ret = PTR_ERR(file); + goto err; + } + +#if defined(CONFIG_UNIX) + ctx->ring_sock->file = file; +#endif + fd_install(ret, file); + return ret; +err: +#if defined(CONFIG_UNIX) + sock_release(ctx->ring_sock); + ctx->ring_sock = NULL; +#endif + return ret; +} + +static int io_uring_create(unsigned entries, struct io_uring_params *p) +{ + struct user_struct *user = NULL; + struct io_ring_ctx *ctx; + bool account_mem; + int ret; + + if (!entries || entries > IORING_MAX_ENTRIES) + return -EINVAL; + + /* + * Use twice as many entries for the CQ ring. It's possible for the + * application to drive a higher depth than the size of the SQ ring, + * since the sqes are only used at submission time. This allows for + * some flexibility in overcommitting a bit. + */ + p->sq_entries = roundup_pow_of_two(entries); + p->cq_entries = 2 * p->sq_entries; + + user = get_uid(current_user()); + account_mem = !capable(CAP_IPC_LOCK); + + if (account_mem) { + ret = io_account_mem(user, + ring_pages(p->sq_entries, p->cq_entries)); + if (ret) { + free_uid(user); + return ret; + } + } + + ctx = io_ring_ctx_alloc(p); + if (!ctx) { + if (account_mem) + io_unaccount_mem(user, ring_pages(p->sq_entries, + p->cq_entries)); + free_uid(user); + return -ENOMEM; + } + ctx->compat = in_compat_syscall(); + ctx->account_mem = account_mem; + ctx->user = user; + + ret = io_allocate_scq_urings(ctx, p); + if (ret) + goto err; + + ret = io_sq_offload_start(ctx); + if (ret) + goto err; + + ret = io_uring_get_fd(ctx); + if (ret < 0) + goto err; + + memset(&p->sq_off, 0, sizeof(p->sq_off)); + p->sq_off.head = offsetof(struct io_sq_ring, r.head); + p->sq_off.tail = offsetof(struct io_sq_ring, r.tail); + p->sq_off.ring_mask = offsetof(struct io_sq_ring, ring_mask); + p->sq_off.ring_entries = offsetof(struct io_sq_ring, ring_entries); + p->sq_off.flags = offsetof(struct io_sq_ring, flags); + p->sq_off.dropped = offsetof(struct io_sq_ring, dropped); + p->sq_off.array = offsetof(struct io_sq_ring, array); + + memset(&p->cq_off, 0, sizeof(p->cq_off)); + p->cq_off.head = offsetof(struct io_cq_ring, r.head); + p->cq_off.tail = offsetof(struct io_cq_ring, r.tail); + p->cq_off.ring_mask = offsetof(struct io_cq_ring, ring_mask); + p->cq_off.ring_entries = offsetof(struct io_cq_ring, ring_entries); + p->cq_off.overflow = offsetof(struct io_cq_ring, overflow); + p->cq_off.cqes = offsetof(struct io_cq_ring, cqes); + return ret; +err: + io_ring_ctx_wait_and_kill(ctx); + return ret; +} + +/* + * Sets up an aio uring context, and returns the fd. Applications asks for a + * ring size, we return the actual sq/cq ring sizes (among other things) in the + * params structure passed in. + */ +static long io_uring_setup(u32 entries, struct io_uring_params __user *params) +{ + struct io_uring_params p; + long ret; + int i; + + if (copy_from_user(&p, params, sizeof(p))) + return -EFAULT; + for (i = 0; i < ARRAY_SIZE(p.resv); i++) { + if (p.resv[i]) + return -EINVAL; + } + + if (p.flags) + return -EINVAL; + + ret = io_uring_create(entries, &p); + if (ret < 0) + return ret; + + if (copy_to_user(params, &p, sizeof(p))) + return -EFAULT; + + return ret; +} + +SYSCALL_DEFINE2(io_uring_setup, u32, entries, + struct io_uring_params __user *, params) +{ + return io_uring_setup(entries, params); +} + +static int __init io_uring_init(void) +{ + req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); + return 0; +}; +__initcall(io_uring_init); diff --git a/include/linux/fs.h b/include/linux/fs.h index dedcc2e9265c..61aa210f0c2b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3517,4 +3517,13 @@ extern void inode_nohighmem(struct inode *inode); extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice); +#if defined(CONFIG_IO_URING) +extern struct sock *io_uring_get_socket(struct file *file); +#else +static inline struct sock *io_uring_get_socket(struct file *file) +{ + return NULL; +} +#endif + #endif /* _LINUX_FS_H */ diff --git a/include/linux/sched/user.h b/include/linux/sched/user.h index 39ad98c09c58..c7b5f86b91a1 100644 --- a/include/linux/sched/user.h +++ b/include/linux/sched/user.h @@ -40,7 +40,7 @@ struct user_struct { kuid_t uid; #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ - defined(CONFIG_NET) + defined(CONFIG_NET) || defined(CONFIG_IO_URING) atomic_long_t locked_vm; #endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 257cccba3062..3072dbaa7869 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -69,6 +69,7 @@ struct file_handle; struct sigaltstack; struct rseq; union bpf_attr; +struct io_uring_params; #include #include @@ -309,6 +310,11 @@ asmlinkage long sys_io_pgetevents_time32(aio_context_t ctx_id, struct io_event __user *events, struct old_timespec32 __user *timeout, const struct __aio_sigset *sig); +asmlinkage long sys_io_uring_setup(u32 entries, + struct io_uring_params __user *p); +asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, + u32 min_complete, u32 flags, + const sigset_t __user *sig, size_t sigsz); /* fs/xattr.c */ asmlinkage long sys_setxattr(const char __user *path, const char __user *name, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index d90127298f12..87871e7b7ea7 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -740,9 +740,13 @@ __SC_COMP(__NR_io_pgetevents, sys_io_pgetevents, compat_sys_io_pgetevents) __SYSCALL(__NR_rseq, sys_rseq) #define __NR_kexec_file_load 294 __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) +#define __NR_io_uring_setup 425 +__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) +#define __NR_io_uring_enter 426 +__SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) #undef __NR_syscalls -#define __NR_syscalls 295 +#define __NR_syscalls 427 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h new file mode 100644 index 000000000000..ac692823d6f4 --- /dev/null +++ b/include/uapi/linux/io_uring.h @@ -0,0 +1,95 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Header file for the io_uring interface. + * + * Copyright (C) 2019 Jens Axboe + * Copyright (C) 2019 Christoph Hellwig + */ +#ifndef LINUX_IO_URING_H +#define LINUX_IO_URING_H + +#include +#include + +/* + * IO submission data structure (Submission Queue Entry) + */ +struct io_uring_sqe { + __u8 opcode; /* type of operation for this sqe */ + __u8 flags; /* as of now unused */ + __u16 ioprio; /* ioprio for the request */ + __s32 fd; /* file descriptor to do IO on */ + __u64 off; /* offset into file */ + __u64 addr; /* pointer to buffer or iovecs */ + __u32 len; /* buffer size or number of iovecs */ + union { + __kernel_rwf_t rw_flags; + __u32 __resv; + }; + __u64 user_data; /* data to be passed back at completion time */ + __u64 __pad2[3]; +}; + +#define IORING_OP_NOP 0 +#define IORING_OP_READV 1 +#define IORING_OP_WRITEV 2 + +/* + * IO completion data structure (Completion Queue Entry) + */ +struct io_uring_cqe { + __u64 user_data; /* sqe->data submission passed back */ + __s32 res; /* result code for this event */ + __u32 flags; +}; + +/* + * Magic offsets for the application to mmap the data it needs + */ +#define IORING_OFF_SQ_RING 0ULL +#define IORING_OFF_CQ_RING 0x8000000ULL +#define IORING_OFF_SQES 0x10000000ULL + +/* + * Filled with the offset for mmap(2) + */ +struct io_sqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 flags; + __u32 dropped; + __u32 array; + __u32 resv1; + __u64 resv2; +}; + +struct io_cqring_offsets { + __u32 head; + __u32 tail; + __u32 ring_mask; + __u32 ring_entries; + __u32 overflow; + __u32 cqes; + __u64 resv[2]; +}; + +/* + * io_uring_enter(2) flags + */ +#define IORING_ENTER_GETEVENTS (1U << 0) + +/* + * Passed in for io_uring_setup(2). Copied back with updated info on success + */ +struct io_uring_params { + __u32 sq_entries; + __u32 cq_entries; + __u32 flags; + __u32 resv[7]; + struct io_sqring_offsets sq_off; + struct io_cqring_offsets cq_off; +}; + +#endif diff --git a/init/Kconfig b/init/Kconfig index c9386a365eea..53b54214a36e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1414,6 +1414,15 @@ config AIO by some high performance threaded applications. Disabling this option saves about 7k. +config IO_URING + bool "Enable IO uring support" if EXPERT + select ANON_INODES + default y + help + This option enables support for the io_uring interface, enabling + applications to submit and complete IO through submission and + completion rings that are shared between the kernel and application. + config ADVISE_SYSCALLS bool "Enable madvise/fadvise syscalls" if EXPERT default y diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ab9d0e3c6d50..ee5e523564bb 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -46,6 +46,8 @@ COND_SYSCALL(io_getevents); COND_SYSCALL(io_pgetevents); COND_SYSCALL_COMPAT(io_getevents); COND_SYSCALL_COMPAT(io_pgetevents); +COND_SYSCALL(io_uring_setup); +COND_SYSCALL(io_uring_enter); /* fs/xattr.c */ diff --git a/net/unix/garbage.c b/net/unix/garbage.c index c36757e72844..f81854d74c7d 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -108,6 +108,9 @@ struct sock *unix_get_socket(struct file *filp) /* PF_UNIX ? */ if (s && sock->ops && sock->ops->family == PF_UNIX) u_sock = s; + } else { + /* Could be an io_uring instance */ + u_sock = io_uring_get_socket(filp); } return u_sock; } -- cgit v1.2.3 From c992fe2925d776be066d9f6cc13f9ea11d78b657 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 11 Jan 2019 09:43:02 -0700 Subject: io_uring: add fsync support Add a new fsync opcode, which either syncs a range if one is passed, or the whole file if the offset and length fields are both cleared to zero. A flag is provided to use fdatasync semantics, that is only force out metadata which is required to retrieve the file data, but not others like metadata. Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/io_uring.c | 54 +++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/io_uring.h | 8 ++++++- 2 files changed, 61 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index f68052290426..745413d92712 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -24,6 +24,7 @@ * data that the application could potentially modify, it remains stable. * * Copyright (C) 2018-2019 Jens Axboe + * Copyright (c) 2018-2019 Christoph Hellwig */ #include #include @@ -557,6 +558,56 @@ static int io_nop(struct io_kiocb *req, u64 user_data) return 0; } +static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + int fd; + + /* Prep already done */ + if (req->rw.ki_filp) + return 0; + + if (unlikely(sqe->addr || sqe->ioprio)) + return -EINVAL; + + fd = READ_ONCE(sqe->fd); + req->rw.ki_filp = fget(fd); + if (unlikely(!req->rw.ki_filp)) + return -EBADF; + + return 0; +} + +static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, + bool force_nonblock) +{ + loff_t sqe_off = READ_ONCE(sqe->off); + loff_t sqe_len = READ_ONCE(sqe->len); + loff_t end = sqe_off + sqe_len; + unsigned fsync_flags; + int ret; + + fsync_flags = READ_ONCE(sqe->fsync_flags); + if (unlikely(fsync_flags & ~IORING_FSYNC_DATASYNC)) + return -EINVAL; + + ret = io_prep_fsync(req, sqe); + if (ret) + return ret; + + /* fsync always requires a blocking context */ + if (force_nonblock) + return -EAGAIN; + + ret = vfs_fsync_range(req->rw.ki_filp, sqe_off, + end > 0 ? end : LLONG_MAX, + fsync_flags & IORING_FSYNC_DATASYNC); + + fput(req->rw.ki_filp); + io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); + io_free_req(req); + return 0; +} + static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct sqe_submit *s, bool force_nonblock) { @@ -578,6 +629,9 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_WRITEV: ret = io_write(req, s, force_nonblock); break; + case IORING_OP_FSYNC: + ret = io_fsync(req, s->sqe, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ac692823d6f4..4589d56d0b68 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -24,7 +24,7 @@ struct io_uring_sqe { __u32 len; /* buffer size or number of iovecs */ union { __kernel_rwf_t rw_flags; - __u32 __resv; + __u32 fsync_flags; }; __u64 user_data; /* data to be passed back at completion time */ __u64 __pad2[3]; @@ -33,6 +33,12 @@ struct io_uring_sqe { #define IORING_OP_NOP 0 #define IORING_OP_READV 1 #define IORING_OP_WRITEV 2 +#define IORING_OP_FSYNC 3 + +/* + * sqe->fsync_flags + */ +#define IORING_FSYNC_DATASYNC (1U << 0) /* * IO completion data structure (Completion Queue Entry) -- cgit v1.2.3 From def596e9557c91d9846fc4d84d26f2c564644416 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Jan 2019 08:59:42 -0700 Subject: io_uring: support for IO polling Add support for a polled io_uring instance. When a read or write is submitted to a polled io_uring, the application must poll for completions on the CQ ring through io_uring_enter(2). Polled IO may not generate IRQ completions, hence they need to be actively found by the application itself. To use polling, io_uring_setup() must be used with the IORING_SETUP_IOPOLL flag being set. It is illegal to mix and match polled and non-polled IO on an io_uring. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/io_uring.c | 275 ++++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/io_uring.h | 5 + 2 files changed, 271 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 745413d92712..578797a4f318 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -124,6 +124,14 @@ struct io_ring_ctx { struct { spinlock_t completion_lock; + bool poll_multi_file; + /* + * ->poll_list is protected by the ctx->uring_lock for + * io_uring instances that don't use IORING_SETUP_SQPOLL. + * For SQPOLL, only the single threaded io_sq_thread() will + * manipulate the list, hence no extra locking is needed there. + */ + struct list_head poll_list; } ____cacheline_aligned_in_smp; #if defined(CONFIG_UNIX) @@ -135,6 +143,7 @@ struct sqe_submit { const struct io_uring_sqe *sqe; unsigned short index; bool has_user; + bool needs_lock; }; struct io_kiocb { @@ -146,12 +155,15 @@ struct io_kiocb { struct list_head list; unsigned int flags; #define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ +#define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ u64 user_data; + u64 error; struct work_struct work; }; #define IO_PLUG_THRESHOLD 2 +#define IO_IOPOLL_BATCH 8 static struct kmem_cache *req_cachep; @@ -196,6 +208,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) mutex_init(&ctx->uring_lock); init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); + INIT_LIST_HEAD(&ctx->poll_list); return ctx; } @@ -297,12 +310,153 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx) return NULL; } +static void io_free_req_many(struct io_ring_ctx *ctx, void **reqs, int *nr) +{ + if (*nr) { + kmem_cache_free_bulk(req_cachep, *nr, reqs); + io_ring_drop_ctx_refs(ctx, *nr); + *nr = 0; + } +} + static void io_free_req(struct io_kiocb *req) { io_ring_drop_ctx_refs(req->ctx, 1); kmem_cache_free(req_cachep, req); } +/* + * Find and free completed poll iocbs + */ +static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, + struct list_head *done) +{ + void *reqs[IO_IOPOLL_BATCH]; + struct io_kiocb *req; + int to_free = 0; + + while (!list_empty(done)) { + req = list_first_entry(done, struct io_kiocb, list); + list_del(&req->list); + + io_cqring_fill_event(ctx, req->user_data, req->error, 0); + + reqs[to_free++] = req; + (*nr_events)++; + + fput(req->rw.ki_filp); + if (to_free == ARRAY_SIZE(reqs)) + io_free_req_many(ctx, reqs, &to_free); + } + io_commit_cqring(ctx); + + io_free_req_many(ctx, reqs, &to_free); +} + +static int io_do_iopoll(struct io_ring_ctx *ctx, unsigned int *nr_events, + long min) +{ + struct io_kiocb *req, *tmp; + LIST_HEAD(done); + bool spin; + int ret; + + /* + * Only spin for completions if we don't have multiple devices hanging + * off our complete list, and we're under the requested amount. + */ + spin = !ctx->poll_multi_file && *nr_events < min; + + ret = 0; + list_for_each_entry_safe(req, tmp, &ctx->poll_list, list) { + struct kiocb *kiocb = &req->rw; + + /* + * Move completed entries to our local list. If we find a + * request that requires polling, break out and complete + * the done list first, if we have entries there. + */ + if (req->flags & REQ_F_IOPOLL_COMPLETED) { + list_move_tail(&req->list, &done); + continue; + } + if (!list_empty(&done)) + break; + + ret = kiocb->ki_filp->f_op->iopoll(kiocb, spin); + if (ret < 0) + break; + + if (ret && spin) + spin = false; + ret = 0; + } + + if (!list_empty(&done)) + io_iopoll_complete(ctx, nr_events, &done); + + return ret; +} + +/* + * Poll for a mininum of 'min' events. Note that if min == 0 we consider that a + * non-spinning poll check - we'll still enter the driver poll loop, but only + * as a non-spinning completion check. + */ +static int io_iopoll_getevents(struct io_ring_ctx *ctx, unsigned int *nr_events, + long min) +{ + while (!list_empty(&ctx->poll_list)) { + int ret; + + ret = io_do_iopoll(ctx, nr_events, min); + if (ret < 0) + return ret; + if (!min || *nr_events >= min) + return 0; + } + + return 1; +} + +/* + * We can't just wait for polled events to come to us, we have to actively + * find and complete them. + */ +static void io_iopoll_reap_events(struct io_ring_ctx *ctx) +{ + if (!(ctx->flags & IORING_SETUP_IOPOLL)) + return; + + mutex_lock(&ctx->uring_lock); + while (!list_empty(&ctx->poll_list)) { + unsigned int nr_events = 0; + + io_iopoll_getevents(ctx, &nr_events, 1); + } + mutex_unlock(&ctx->uring_lock); +} + +static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events, + long min) +{ + int ret = 0; + + do { + int tmin = 0; + + if (*nr_events < min) + tmin = min - *nr_events; + + ret = io_iopoll_getevents(ctx, nr_events, tmin); + if (ret <= 0) + break; + ret = 0; + } while (min && !*nr_events && !need_resched()); + + return ret; +} + static void kiocb_end_write(struct kiocb *kiocb) { if (kiocb->ki_flags & IOCB_WRITE) { @@ -329,6 +483,53 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2) io_free_req(req); } +static void io_complete_rw_iopoll(struct kiocb *kiocb, long res, long res2) +{ + struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); + + kiocb_end_write(kiocb); + + req->error = res; + if (res != -EAGAIN) + req->flags |= REQ_F_IOPOLL_COMPLETED; +} + +/* + * After the iocb has been issued, it's safe to be found on the poll list. + * Adding the kiocb to the list AFTER submission ensures that we don't + * find it from a io_iopoll_getevents() thread before the issuer is done + * accessing the kiocb cookie. + */ +static void io_iopoll_req_issued(struct io_kiocb *req) +{ + struct io_ring_ctx *ctx = req->ctx; + + /* + * Track whether we have multiple files in our lists. This will impact + * how we do polling eventually, not spinning if we're on potentially + * different devices. + */ + if (list_empty(&ctx->poll_list)) { + ctx->poll_multi_file = false; + } else if (!ctx->poll_multi_file) { + struct io_kiocb *list_req; + + list_req = list_first_entry(&ctx->poll_list, struct io_kiocb, + list); + if (list_req->rw.ki_filp != req->rw.ki_filp) + ctx->poll_multi_file = true; + } + + /* + * For fast devices, IO may have already completed. If it has, add + * it to the front so we find it first. + */ + if (req->flags & REQ_F_IOPOLL_COMPLETED) + list_add(&req->list, &ctx->poll_list); + else + list_add_tail(&req->list, &ctx->poll_list); +} + /* * If we tracked the file through the SCM inflight mechanism, we could support * any file. For now, just ensure that anything potentially problematic is done @@ -349,6 +550,7 @@ static bool io_file_supports_async(struct file *file) static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, bool force_nonblock) { + struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; unsigned ioprio; int fd, ret; @@ -384,12 +586,22 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_flags |= IOCB_NOWAIT; req->flags |= REQ_F_FORCE_NONBLOCK; } - if (kiocb->ki_flags & IOCB_HIPRI) { - ret = -EINVAL; - goto out_fput; - } + if (ctx->flags & IORING_SETUP_IOPOLL) { + ret = -EOPNOTSUPP; + if (!(kiocb->ki_flags & IOCB_DIRECT) || + !kiocb->ki_filp->f_op->iopoll) + goto out_fput; - kiocb->ki_complete = io_complete_rw; + req->error = 0; + kiocb->ki_flags |= IOCB_HIPRI; + kiocb->ki_complete = io_complete_rw_iopoll; + } else { + if (kiocb->ki_flags & IOCB_HIPRI) { + ret = -EINVAL; + goto out_fput; + } + kiocb->ki_complete = io_complete_rw; + } return 0; out_fput: fput(kiocb->ki_filp); @@ -543,6 +755,9 @@ static int io_nop(struct io_kiocb *req, u64 user_data) struct io_ring_ctx *ctx = req->ctx; long err = 0; + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + /* * Twilight zone - it's possible that someone issued an opcode that * has a file attached, then got -EAGAIN on submission, and changed @@ -566,6 +781,8 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->rw.ki_filp) return 0; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; if (unlikely(sqe->addr || sqe->ioprio)) return -EINVAL; @@ -637,7 +854,22 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, break; } - return ret; + if (ret) + return ret; + + if (ctx->flags & IORING_SETUP_IOPOLL) { + if (req->error == -EAGAIN) + return -EAGAIN; + + /* workqueue context doesn't hold uring_lock, grab it now */ + if (s->needs_lock) + mutex_lock(&ctx->uring_lock); + io_iopoll_req_issued(req); + if (s->needs_lock) + mutex_unlock(&ctx->uring_lock); + } + + return 0; } static void io_sq_wq_submit_work(struct work_struct *work) @@ -661,8 +893,19 @@ static void io_sq_wq_submit_work(struct work_struct *work) use_mm(ctx->sqo_mm); set_fs(USER_DS); s->has_user = true; + s->needs_lock = true; - ret = __io_submit_sqe(ctx, req, s, false); + do { + ret = __io_submit_sqe(ctx, req, s, false); + /* + * We can get EAGAIN for polled IO even though we're forcing + * a sync submission from here, since we can't wait for + * request slots on the block side. + */ + if (ret != -EAGAIN) + break; + cond_resched(); + } while (1); set_fs(old_fs); unuse_mm(ctx->sqo_mm); @@ -799,6 +1042,8 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) break; s.has_user = true; + s.needs_lock = false; + ret = io_submit_sqe(ctx, &s); if (ret) { io_drop_sqring(ctx); @@ -947,6 +1192,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) destroy_workqueue(ctx->sqo_wq); if (ctx->sqo_mm) mmdrop(ctx->sqo_mm); + + io_iopoll_reap_events(ctx); + #if defined(CONFIG_UNIX) if (ctx->ring_sock) sock_release(ctx->ring_sock); @@ -993,6 +1241,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); + io_iopoll_reap_events(ctx); wait_for_completion(&ctx->ctx_done); io_ring_ctx_free(ctx); } @@ -1074,6 +1323,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, goto out_ctx; } if (flags & IORING_ENTER_GETEVENTS) { + unsigned nr_events = 0; + min_complete = min(min_complete, ctx->cq_entries); /* @@ -1085,7 +1336,13 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (submitted < to_submit) min_complete = min_t(unsigned, submitted, min_complete); - ret = io_cqring_wait(ctx, min_complete, sig, sigsz); + if (ctx->flags & IORING_SETUP_IOPOLL) { + mutex_lock(&ctx->uring_lock); + ret = io_iopoll_check(ctx, &nr_events, min_complete); + mutex_unlock(&ctx->uring_lock); + } else { + ret = io_cqring_wait(ctx, min_complete, sig, sigsz); + } } out_ctx: @@ -1282,7 +1539,7 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) return -EINVAL; } - if (p.flags) + if (p.flags & ~IORING_SETUP_IOPOLL) return -EINVAL; ret = io_uring_create(entries, &p); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 4589d56d0b68..5c457ea396e6 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -30,6 +30,11 @@ struct io_uring_sqe { __u64 __pad2[3]; }; +/* + * io_uring_setup() flags + */ +#define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ + #define IORING_OP_NOP 0 #define IORING_OP_READV 1 #define IORING_OP_WRITEV 2 -- cgit v1.2.3 From edafccee56ff31678a091ddb7219aba9b28bc3cb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 9 Jan 2019 09:16:05 -0700 Subject: io_uring: add support for pre-mapped user IO buffers If we have fixed user buffers, we can map them into the kernel when we setup the io_uring. That avoids the need to do get_user_pages() for each and every IO. To utilize this feature, the application must call io_uring_register() after having setup an io_uring instance, passing in IORING_REGISTER_BUFFERS as the opcode. The argument must be a pointer to an iovec array, and the nr_args should contain how many iovecs the application wishes to map. If successful, these buffers are now mapped into the kernel, eligible for IO. To use these fixed buffers, the application must use the IORING_OP_READ_FIXED and IORING_OP_WRITE_FIXED opcodes, and then set sqe->index to the desired buffer index. sqe->addr..sqe->addr+seq->len must point to somewhere inside the indexed buffer. The application may register buffers throughout the lifetime of the io_uring instance. It can call io_uring_register() with IORING_UNREGISTER_BUFFERS as the opcode to unregister the current set of buffers, and then register a new set. The application need not unregister buffers explicitly before shutting down the io_uring instance. It's perfectly valid to setup a larger buffer, and then sometimes only use parts of it for an IO. As long as the range is within the originally mapped region, it will work just fine. For now, buffers must not be file backed. If file backed buffers are passed in, the registration will fail with -1/EOPNOTSUPP. This restriction may be relaxed in the future. RLIMIT_MEMLOCK is used to check how much memory we can pin. A somewhat arbitrary 1G per buffer size is also imposed. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + fs/io_uring.c | 374 +++++++++++++++++++++++++++++++-- include/linux/syscalls.h | 2 + include/uapi/asm-generic/unistd.h | 4 +- include/uapi/linux/io_uring.h | 13 +- kernel/sys_ni.c | 1 + 7 files changed, 381 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 481c126259e9..2eefd2a7c1ce 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -400,3 +400,4 @@ 386 i386 rseq sys_rseq __ia32_sys_rseq 425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup 426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter +427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 6a32a430c8e0..65c026185e61 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -345,6 +345,7 @@ 334 common rseq __x64_sys_rseq 425 common io_uring_setup __x64_sys_io_uring_setup 426 common io_uring_enter __x64_sys_io_uring_enter +427 common io_uring_register __x64_sys_io_uring_register # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/fs/io_uring.c b/fs/io_uring.c index 31f43ed894ba..c0c0f68568b5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,8 @@ #include #include #include +#include +#include #include @@ -81,6 +84,13 @@ struct io_cq_ring { struct io_uring_cqe cqes[]; }; +struct io_mapped_ubuf { + u64 ubuf; + size_t len; + struct bio_vec *bvec; + unsigned int nr_bvecs; +}; + struct io_ring_ctx { struct { struct percpu_ref refs; @@ -113,6 +123,10 @@ struct io_ring_ctx { struct fasync_struct *cq_fasync; } ____cacheline_aligned_in_smp; + /* if used, fixed mapped user buffers */ + unsigned nr_user_bufs; + struct io_mapped_ubuf *user_bufs; + struct user_struct *user; struct completion ctx_done; @@ -732,6 +746,46 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) } } +static int io_import_fixed(struct io_ring_ctx *ctx, int rw, + const struct io_uring_sqe *sqe, + struct iov_iter *iter) +{ + size_t len = READ_ONCE(sqe->len); + struct io_mapped_ubuf *imu; + unsigned index, buf_index; + size_t offset; + u64 buf_addr; + + /* attempt to use fixed buffers without having provided iovecs */ + if (unlikely(!ctx->user_bufs)) + return -EFAULT; + + buf_index = READ_ONCE(sqe->buf_index); + if (unlikely(buf_index >= ctx->nr_user_bufs)) + return -EFAULT; + + index = array_index_nospec(buf_index, ctx->nr_user_bufs); + imu = &ctx->user_bufs[index]; + buf_addr = READ_ONCE(sqe->addr); + + /* overflow */ + if (buf_addr + len < buf_addr) + return -EFAULT; + /* not inside the mapped region */ + if (buf_addr < imu->ubuf || buf_addr + len > imu->ubuf + imu->len) + return -EFAULT; + + /* + * May not be a start of buffer, set size appropriately + * and advance us to the beginning. + */ + offset = buf_addr - imu->ubuf; + iov_iter_bvec(iter, rw, imu->bvec, imu->nr_bvecs, offset + len); + if (offset) + iov_iter_advance(iter, offset); + return 0; +} + static int io_import_iovec(struct io_ring_ctx *ctx, int rw, const struct sqe_submit *s, struct iovec **iovec, struct iov_iter *iter) @@ -739,6 +793,23 @@ static int io_import_iovec(struct io_ring_ctx *ctx, int rw, const struct io_uring_sqe *sqe = s->sqe; void __user *buf = u64_to_user_ptr(READ_ONCE(sqe->addr)); size_t sqe_len = READ_ONCE(sqe->len); + u8 opcode; + + /* + * We're reading ->opcode for the second time, but the first read + * doesn't care whether it's _FIXED or not, so it doesn't matter + * whether ->opcode changes concurrently. The first read does care + * about whether it is a READ or a WRITE, so we don't trust this read + * for that purpose and instead let the caller pass in the read/write + * flag. + */ + opcode = READ_ONCE(sqe->opcode); + if (opcode == IORING_OP_READ_FIXED || + opcode == IORING_OP_WRITE_FIXED) { + ssize_t ret = io_import_fixed(ctx, rw, sqe, iter); + *iovec = NULL; + return ret; + } if (!s->has_user) return -EFAULT; @@ -886,7 +957,7 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; - if (unlikely(sqe->addr || sqe->ioprio)) + if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; fd = READ_ONCE(sqe->fd); @@ -945,9 +1016,19 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, ret = io_nop(req, req->user_data); break; case IORING_OP_READV: + if (unlikely(s->sqe->buf_index)) + return -EINVAL; ret = io_read(req, s, force_nonblock, state); break; case IORING_OP_WRITEV: + if (unlikely(s->sqe->buf_index)) + return -EINVAL; + ret = io_write(req, s, force_nonblock, state); + break; + case IORING_OP_READ_FIXED: + ret = io_read(req, s, force_nonblock, state); + break; + case IORING_OP_WRITE_FIXED: ret = io_write(req, s, force_nonblock, state); break; case IORING_OP_FSYNC: @@ -976,28 +1057,46 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, return 0; } +static inline bool io_sqe_needs_user(const struct io_uring_sqe *sqe) +{ + u8 opcode = READ_ONCE(sqe->opcode); + + return !(opcode == IORING_OP_READ_FIXED || + opcode == IORING_OP_WRITE_FIXED); +} + static void io_sq_wq_submit_work(struct work_struct *work) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); struct sqe_submit *s = &req->submit; const struct io_uring_sqe *sqe = s->sqe; struct io_ring_ctx *ctx = req->ctx; - mm_segment_t old_fs = get_fs(); + mm_segment_t old_fs; + bool needs_user; int ret; /* Ensure we clear previously set forced non-block flag */ req->flags &= ~REQ_F_FORCE_NONBLOCK; req->rw.ki_flags &= ~IOCB_NOWAIT; - if (!mmget_not_zero(ctx->sqo_mm)) { - ret = -EFAULT; - goto err; - } - - use_mm(ctx->sqo_mm); - set_fs(USER_DS); - s->has_user = true; s->needs_lock = true; + s->has_user = false; + + /* + * If we're doing IO to fixed buffers, we don't need to get/set + * user context + */ + needs_user = io_sqe_needs_user(s->sqe); + if (needs_user) { + if (!mmget_not_zero(ctx->sqo_mm)) { + ret = -EFAULT; + goto err; + } + use_mm(ctx->sqo_mm); + old_fs = get_fs(); + set_fs(USER_DS); + s->has_user = true; + } do { ret = __io_submit_sqe(ctx, req, s, false, NULL); @@ -1011,9 +1110,11 @@ static void io_sq_wq_submit_work(struct work_struct *work) cond_resched(); } while (1); - set_fs(old_fs); - unuse_mm(ctx->sqo_mm); - mmput(ctx->sqo_mm); + if (needs_user) { + set_fs(old_fs); + unuse_mm(ctx->sqo_mm); + mmput(ctx->sqo_mm); + } err: if (ret) { io_cqring_add_event(ctx, sqe->user_data, ret, 0); @@ -1317,6 +1418,198 @@ static unsigned long ring_pages(unsigned sq_entries, unsigned cq_entries) return (bytes + PAGE_SIZE - 1) / PAGE_SIZE; } +static int io_sqe_buffer_unregister(struct io_ring_ctx *ctx) +{ + int i, j; + + if (!ctx->user_bufs) + return -ENXIO; + + for (i = 0; i < ctx->nr_user_bufs; i++) { + struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; + + for (j = 0; j < imu->nr_bvecs; j++) + put_page(imu->bvec[j].bv_page); + + if (ctx->account_mem) + io_unaccount_mem(ctx->user, imu->nr_bvecs); + kfree(imu->bvec); + imu->nr_bvecs = 0; + } + + kfree(ctx->user_bufs); + ctx->user_bufs = NULL; + ctx->nr_user_bufs = 0; + return 0; +} + +static int io_copy_iov(struct io_ring_ctx *ctx, struct iovec *dst, + void __user *arg, unsigned index) +{ + struct iovec __user *src; + +#ifdef CONFIG_COMPAT + if (ctx->compat) { + struct compat_iovec __user *ciovs; + struct compat_iovec ciov; + + ciovs = (struct compat_iovec __user *) arg; + if (copy_from_user(&ciov, &ciovs[index], sizeof(ciov))) + return -EFAULT; + + dst->iov_base = (void __user *) (unsigned long) ciov.iov_base; + dst->iov_len = ciov.iov_len; + return 0; + } +#endif + src = (struct iovec __user *) arg; + if (copy_from_user(dst, &src[index], sizeof(*dst))) + return -EFAULT; + return 0; +} + +static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + struct vm_area_struct **vmas = NULL; + struct page **pages = NULL; + int i, j, got_pages = 0; + int ret = -EINVAL; + + if (ctx->user_bufs) + return -EBUSY; + if (!nr_args || nr_args > UIO_MAXIOV) + return -EINVAL; + + ctx->user_bufs = kcalloc(nr_args, sizeof(struct io_mapped_ubuf), + GFP_KERNEL); + if (!ctx->user_bufs) + return -ENOMEM; + + for (i = 0; i < nr_args; i++) { + struct io_mapped_ubuf *imu = &ctx->user_bufs[i]; + unsigned long off, start, end, ubuf; + int pret, nr_pages; + struct iovec iov; + size_t size; + + ret = io_copy_iov(ctx, &iov, arg, i); + if (ret) + break; + + /* + * Don't impose further limits on the size and buffer + * constraints here, we'll -EINVAL later when IO is + * submitted if they are wrong. + */ + ret = -EFAULT; + if (!iov.iov_base || !iov.iov_len) + goto err; + + /* arbitrary limit, but we need something */ + if (iov.iov_len > SZ_1G) + goto err; + + ubuf = (unsigned long) iov.iov_base; + end = (ubuf + iov.iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ubuf >> PAGE_SHIFT; + nr_pages = end - start; + + if (ctx->account_mem) { + ret = io_account_mem(ctx->user, nr_pages); + if (ret) + goto err; + } + + ret = 0; + if (!pages || nr_pages > got_pages) { + kfree(vmas); + kfree(pages); + pages = kmalloc_array(nr_pages, sizeof(struct page *), + GFP_KERNEL); + vmas = kmalloc_array(nr_pages, + sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!pages || !vmas) { + ret = -ENOMEM; + if (ctx->account_mem) + io_unaccount_mem(ctx->user, nr_pages); + goto err; + } + got_pages = nr_pages; + } + + imu->bvec = kmalloc_array(nr_pages, sizeof(struct bio_vec), + GFP_KERNEL); + ret = -ENOMEM; + if (!imu->bvec) { + if (ctx->account_mem) + io_unaccount_mem(ctx->user, nr_pages); + goto err; + } + + ret = 0; + down_read(¤t->mm->mmap_sem); + pret = get_user_pages_longterm(ubuf, nr_pages, FOLL_WRITE, + pages, vmas); + if (pret == nr_pages) { + /* don't support file backed memory */ + for (j = 0; j < nr_pages; j++) { + struct vm_area_struct *vma = vmas[j]; + + if (vma->vm_file && + !is_file_hugepages(vma->vm_file)) { + ret = -EOPNOTSUPP; + break; + } + } + } else { + ret = pret < 0 ? pret : -EFAULT; + } + up_read(¤t->mm->mmap_sem); + if (ret) { + /* + * if we did partial map, or found file backed vmas, + * release any pages we did get + */ + if (pret > 0) { + for (j = 0; j < pret; j++) + put_page(pages[j]); + } + if (ctx->account_mem) + io_unaccount_mem(ctx->user, nr_pages); + goto err; + } + + off = ubuf & ~PAGE_MASK; + size = iov.iov_len; + for (j = 0; j < nr_pages; j++) { + size_t vec_len; + + vec_len = min_t(size_t, size, PAGE_SIZE - off); + imu->bvec[j].bv_page = pages[j]; + imu->bvec[j].bv_len = vec_len; + imu->bvec[j].bv_offset = off; + off = 0; + size -= vec_len; + } + /* store original address for later verification */ + imu->ubuf = ubuf; + imu->len = iov.iov_len; + imu->nr_bvecs = nr_pages; + + ctx->nr_user_bufs++; + } + kfree(pages); + kfree(vmas); + return 0; +err: + kfree(pages); + kfree(vmas); + io_sqe_buffer_unregister(ctx); + return ret; +} + static void io_ring_ctx_free(struct io_ring_ctx *ctx) { if (ctx->sqo_wq) @@ -1325,6 +1618,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) mmdrop(ctx->sqo_mm); io_iopoll_reap_events(ctx); + io_sqe_buffer_unregister(ctx); #if defined(CONFIG_UNIX) if (ctx->ring_sock) @@ -1689,6 +1983,60 @@ SYSCALL_DEFINE2(io_uring_setup, u32, entries, return io_uring_setup(entries, params); } +static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, + void __user *arg, unsigned nr_args) +{ + int ret; + + percpu_ref_kill(&ctx->refs); + wait_for_completion(&ctx->ctx_done); + + switch (opcode) { + case IORING_REGISTER_BUFFERS: + ret = io_sqe_buffer_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_BUFFERS: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_sqe_buffer_unregister(ctx); + break; + default: + ret = -EINVAL; + break; + } + + /* bring the ctx back to life */ + reinit_completion(&ctx->ctx_done); + percpu_ref_reinit(&ctx->refs); + return ret; +} + +SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, + void __user *, arg, unsigned int, nr_args) +{ + struct io_ring_ctx *ctx; + long ret = -EBADF; + struct fd f; + + f = fdget(fd); + if (!f.file) + return -EBADF; + + ret = -EOPNOTSUPP; + if (f.file->f_op != &io_uring_fops) + goto out_fput; + + ctx = f.file->private_data; + + mutex_lock(&ctx->uring_lock); + ret = __io_uring_register(ctx, opcode, arg, nr_args); + mutex_unlock(&ctx->uring_lock); +out_fput: + fdput(f); + return ret; +} + static int __init io_uring_init(void) { req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3072dbaa7869..3681c05ac538 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -315,6 +315,8 @@ asmlinkage long sys_io_uring_setup(u32 entries, asmlinkage long sys_io_uring_enter(unsigned int fd, u32 to_submit, u32 min_complete, u32 flags, const sigset_t __user *sig, size_t sigsz); +asmlinkage long sys_io_uring_register(unsigned int fd, unsigned int op, + void __user *arg, unsigned int nr_args); /* fs/xattr.c */ asmlinkage long sys_setxattr(const char __user *path, const char __user *name, diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 87871e7b7ea7..d346229a1eb0 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -744,9 +744,11 @@ __SYSCALL(__NR_kexec_file_load, sys_kexec_file_load) __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) #define __NR_io_uring_enter 426 __SYSCALL(__NR_io_uring_enter, sys_io_uring_enter) +#define __NR_io_uring_register 427 +__SYSCALL(__NR_io_uring_register, sys_io_uring_register) #undef __NR_syscalls -#define __NR_syscalls 427 +#define __NR_syscalls 428 /* * 32 bit systems traditionally used different diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 5c457ea396e6..cf28f7a11f12 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -27,7 +27,10 @@ struct io_uring_sqe { __u32 fsync_flags; }; __u64 user_data; /* data to be passed back at completion time */ - __u64 __pad2[3]; + union { + __u16 buf_index; /* index into fixed buffers, if used */ + __u64 __pad2[3]; + }; }; /* @@ -39,6 +42,8 @@ struct io_uring_sqe { #define IORING_OP_READV 1 #define IORING_OP_WRITEV 2 #define IORING_OP_FSYNC 3 +#define IORING_OP_READ_FIXED 4 +#define IORING_OP_WRITE_FIXED 5 /* * sqe->fsync_flags @@ -103,4 +108,10 @@ struct io_uring_params { struct io_cqring_offsets cq_off; }; +/* + * io_uring_register(2) opcodes and arguments + */ +#define IORING_REGISTER_BUFFERS 0 +#define IORING_UNREGISTER_BUFFERS 1 + #endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ee5e523564bb..1bb6604dc19f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -48,6 +48,7 @@ COND_SYSCALL_COMPAT(io_getevents); COND_SYSCALL_COMPAT(io_pgetevents); COND_SYSCALL(io_uring_setup); COND_SYSCALL(io_uring_enter); +COND_SYSCALL(io_uring_register); /* fs/xattr.c */ -- cgit v1.2.3 From 6b06314c47e141031be043539900d80d2c7ba10f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Jan 2019 22:13:58 -0700 Subject: io_uring: add file set registration We normally have to fget/fput for each IO we do on a file. Even with the batching we do, the cost of the atomic inc/dec of the file usage count adds up. This adds IORING_REGISTER_FILES, and IORING_UNREGISTER_FILES opcodes for the io_uring_register(2) system call. The arguments passed in must be an array of __s32 holding file descriptors, and nr_args should hold the number of file descriptors the application wishes to pin for the duration of the io_uring instance (or until IORING_UNREGISTER_FILES is called). When used, the application must set IOSQE_FIXED_FILE in the sqe->flags member. Then, instead of setting sqe->fd to the real fd, it sets sqe->fd to the index in the array passed in to IORING_REGISTER_FILES. Files are automatically unregistered when the io_uring instance is torn down. An application need only unregister if it wishes to register a new set of fds. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/io_uring.c | 311 +++++++++++++++++++++++++++++++++++++----- include/uapi/linux/io_uring.h | 9 +- 2 files changed, 288 insertions(+), 32 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index c0c0f68568b5..5eaa1c4a3f7c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include @@ -61,6 +62,7 @@ #include "internal.h" #define IORING_MAX_ENTRIES 4096 +#define IORING_MAX_FIXED_FILES 1024 struct io_uring { u32 head ____cacheline_aligned_in_smp; @@ -123,6 +125,14 @@ struct io_ring_ctx { struct fasync_struct *cq_fasync; } ____cacheline_aligned_in_smp; + /* + * If used, fixed file set. Writers must ensure that ->refs is dead, + * readers must ensure that ->refs is alive as long as the file* is + * used. Only updated through io_uring_register(2). + */ + struct file **user_files; + unsigned nr_user_files; + /* if used, fixed mapped user buffers */ unsigned nr_user_bufs; struct io_mapped_ubuf *user_bufs; @@ -170,6 +180,7 @@ struct io_kiocb { unsigned int flags; #define REQ_F_FORCE_NONBLOCK 1 /* inline submission attempt */ #define REQ_F_IOPOLL_COMPLETED 2 /* polled IO has completed */ +#define REQ_F_FIXED_FILE 4 /* ctx owns file */ u64 user_data; u64 error; @@ -404,15 +415,17 @@ static void io_iopoll_complete(struct io_ring_ctx *ctx, unsigned int *nr_events, * Batched puts of the same file, to avoid dirtying the * file usage count multiple times, if avoidable. */ - if (!file) { - file = req->rw.ki_filp; - file_count = 1; - } else if (file == req->rw.ki_filp) { - file_count++; - } else { - fput_many(file, file_count); - file = req->rw.ki_filp; - file_count = 1; + if (!(req->flags & REQ_F_FIXED_FILE)) { + if (!file) { + file = req->rw.ki_filp; + file_count = 1; + } else if (file == req->rw.ki_filp) { + file_count++; + } else { + fput_many(file, file_count); + file = req->rw.ki_filp; + file_count = 1; + } } if (to_free == ARRAY_SIZE(reqs)) @@ -544,13 +557,19 @@ static void kiocb_end_write(struct kiocb *kiocb) } } +static void io_fput(struct io_kiocb *req) +{ + if (!(req->flags & REQ_F_FIXED_FILE)) + fput(req->rw.ki_filp); +} + static void io_complete_rw(struct kiocb *kiocb, long res, long res2) { struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw); kiocb_end_write(kiocb); - fput(kiocb->ki_filp); + io_fput(req); io_cqring_add_event(req->ctx, req->user_data, res, 0); io_free_req(req); } @@ -666,19 +685,29 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, { struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; - unsigned ioprio; + unsigned ioprio, flags; int fd, ret; /* For -EAGAIN retry, everything is already prepped */ if (kiocb->ki_filp) return 0; + flags = READ_ONCE(sqe->flags); fd = READ_ONCE(sqe->fd); - kiocb->ki_filp = io_file_get(state, fd); - if (unlikely(!kiocb->ki_filp)) - return -EBADF; - if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) - force_nonblock = false; + + if (flags & IOSQE_FIXED_FILE) { + if (unlikely(!ctx->user_files || + (unsigned) fd >= ctx->nr_user_files)) + return -EBADF; + kiocb->ki_filp = ctx->user_files[fd]; + req->flags |= REQ_F_FIXED_FILE; + } else { + kiocb->ki_filp = io_file_get(state, fd); + if (unlikely(!kiocb->ki_filp)) + return -EBADF; + if (force_nonblock && !io_file_supports_async(kiocb->ki_filp)) + force_nonblock = false; + } kiocb->ki_pos = READ_ONCE(sqe->off); kiocb->ki_flags = iocb_flags(kiocb->ki_filp); kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); @@ -718,10 +747,14 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, } return 0; out_fput: - /* in case of error, we didn't use this file reference. drop it. */ - if (state) - state->used_refs--; - io_file_put(state, kiocb->ki_filp); + if (!(flags & IOSQE_FIXED_FILE)) { + /* + * in case of error, we didn't use this file reference. drop it. + */ + if (state) + state->used_refs--; + io_file_put(state, kiocb->ki_filp); + } return ret; } @@ -863,7 +896,7 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s, out_fput: /* Hold on to the file for -EAGAIN */ if (unlikely(ret && ret != -EAGAIN)) - fput(file); + io_fput(req); return ret; } @@ -917,7 +950,7 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, kfree(iovec); out_fput: if (unlikely(ret)) - fput(file); + io_fput(req); return ret; } @@ -940,7 +973,7 @@ static int io_nop(struct io_kiocb *req, u64 user_data) */ if (req->rw.ki_filp) { err = -EBADF; - fput(req->rw.ki_filp); + io_fput(req); } io_cqring_add_event(ctx, user_data, err, 0); io_free_req(req); @@ -949,21 +982,32 @@ static int io_nop(struct io_kiocb *req, u64 user_data) static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) { + struct io_ring_ctx *ctx = req->ctx; + unsigned flags; int fd; /* Prep already done */ if (req->rw.ki_filp) return 0; - if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) return -EINVAL; fd = READ_ONCE(sqe->fd); - req->rw.ki_filp = fget(fd); - if (unlikely(!req->rw.ki_filp)) - return -EBADF; + flags = READ_ONCE(sqe->flags); + + if (flags & IOSQE_FIXED_FILE) { + if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files)) + return -EBADF; + req->rw.ki_filp = ctx->user_files[fd]; + req->flags |= REQ_F_FIXED_FILE; + } else { + req->rw.ki_filp = fget(fd); + if (unlikely(!req->rw.ki_filp)) + return -EBADF; + } return 0; } @@ -993,7 +1037,7 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, end > 0 ? end : LLONG_MAX, fsync_flags & IORING_FSYNC_DATASYNC); - fput(req->rw.ki_filp); + io_fput(req); io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); io_free_req(req); return 0; @@ -1132,7 +1176,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s, ssize_t ret; /* enforce forwards compatibility on users */ - if (unlikely(s->sqe->flags)) + if (unlikely(s->sqe->flags & ~IOSQE_FIXED_FILE)) return -EINVAL; req = io_get_req(ctx, state); @@ -1344,6 +1388,201 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, return READ_ONCE(ring->r.head) == READ_ONCE(ring->r.tail) ? ret : 0; } +static void __io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ +#if defined(CONFIG_UNIX) + if (ctx->ring_sock) { + struct sock *sock = ctx->ring_sock->sk; + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL) + kfree_skb(skb); + } +#else + int i; + + for (i = 0; i < ctx->nr_user_files; i++) + fput(ctx->user_files[i]); +#endif +} + +static int io_sqe_files_unregister(struct io_ring_ctx *ctx) +{ + if (!ctx->user_files) + return -ENXIO; + + __io_sqe_files_unregister(ctx); + kfree(ctx->user_files); + ctx->user_files = NULL; + ctx->nr_user_files = 0; + return 0; +} + +static void io_finish_async(struct io_ring_ctx *ctx) +{ + if (ctx->sqo_wq) { + destroy_workqueue(ctx->sqo_wq); + ctx->sqo_wq = NULL; + } +} + +#if defined(CONFIG_UNIX) +static void io_destruct_skb(struct sk_buff *skb) +{ + struct io_ring_ctx *ctx = skb->sk->sk_user_data; + + io_finish_async(ctx); + unix_destruct_scm(skb); +} + +/* + * Ensure the UNIX gc is aware of our file set, so we are certain that + * the io_uring can be safely unregistered on process exit, even if we have + * loops in the file referencing. + */ +static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) +{ + struct sock *sk = ctx->ring_sock->sk; + struct scm_fp_list *fpl; + struct sk_buff *skb; + int i; + + if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + unsigned long inflight = ctx->user->unix_inflight + nr; + + if (inflight > task_rlimit(current, RLIMIT_NOFILE)) + return -EMFILE; + } + + fpl = kzalloc(sizeof(*fpl), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + + skb = alloc_skb(0, GFP_KERNEL); + if (!skb) { + kfree(fpl); + return -ENOMEM; + } + + skb->sk = sk; + skb->destructor = io_destruct_skb; + + fpl->user = get_uid(ctx->user); + for (i = 0; i < nr; i++) { + fpl->fp[i] = get_file(ctx->user_files[i + offset]); + unix_inflight(fpl->user, fpl->fp[i]); + } + + fpl->max = fpl->count = nr; + UNIXCB(skb).fp = fpl; + refcount_add(skb->truesize, &sk->sk_wmem_alloc); + skb_queue_head(&sk->sk_receive_queue, skb); + + for (i = 0; i < nr; i++) + fput(fpl->fp[i]); + + return 0; +} + +/* + * If UNIX sockets are enabled, fd passing can cause a reference cycle which + * causes regular reference counting to break down. We rely on the UNIX + * garbage collection to take care of this problem for us. + */ +static int io_sqe_files_scm(struct io_ring_ctx *ctx) +{ + unsigned left, total; + int ret = 0; + + total = 0; + left = ctx->nr_user_files; + while (left) { + unsigned this_files = min_t(unsigned, left, SCM_MAX_FD); + int ret; + + ret = __io_sqe_files_scm(ctx, this_files, total); + if (ret) + break; + left -= this_files; + total += this_files; + } + + if (!ret) + return 0; + + while (total < ctx->nr_user_files) { + fput(ctx->user_files[total]); + total++; + } + + return ret; +} +#else +static int io_sqe_files_scm(struct io_ring_ctx *ctx) +{ + return 0; +} +#endif + +static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, + unsigned nr_args) +{ + __s32 __user *fds = (__s32 __user *) arg; + int fd, ret = 0; + unsigned i; + + if (ctx->user_files) + return -EBUSY; + if (!nr_args) + return -EINVAL; + if (nr_args > IORING_MAX_FIXED_FILES) + return -EMFILE; + + ctx->user_files = kcalloc(nr_args, sizeof(struct file *), GFP_KERNEL); + if (!ctx->user_files) + return -ENOMEM; + + for (i = 0; i < nr_args; i++) { + ret = -EFAULT; + if (copy_from_user(&fd, &fds[i], sizeof(fd))) + break; + + ctx->user_files[i] = fget(fd); + + ret = -EBADF; + if (!ctx->user_files[i]) + break; + /* + * Don't allow io_uring instances to be registered. If UNIX + * isn't enabled, then this causes a reference cycle and this + * instance can never get freed. If UNIX is enabled we'll + * handle it just fine, but there's still no point in allowing + * a ring fd as it doesn't support regular read/write anyway. + */ + if (ctx->user_files[i]->f_op == &io_uring_fops) { + fput(ctx->user_files[i]); + break; + } + ctx->nr_user_files++; + ret = 0; + } + + if (ret) { + for (i = 0; i < ctx->nr_user_files; i++) + fput(ctx->user_files[i]); + + kfree(ctx->user_files); + ctx->nr_user_files = 0; + return ret; + } + + ret = io_sqe_files_scm(ctx); + if (ret) + io_sqe_files_unregister(ctx); + + return ret; +} + static int io_sq_offload_start(struct io_ring_ctx *ctx) { int ret; @@ -1612,13 +1851,13 @@ err: static void io_ring_ctx_free(struct io_ring_ctx *ctx) { - if (ctx->sqo_wq) - destroy_workqueue(ctx->sqo_wq); + io_finish_async(ctx); if (ctx->sqo_mm) mmdrop(ctx->sqo_mm); io_iopoll_reap_events(ctx); io_sqe_buffer_unregister(ctx); + io_sqe_files_unregister(ctx); #if defined(CONFIG_UNIX) if (ctx->ring_sock) @@ -1858,6 +2097,7 @@ static int io_uring_get_fd(struct io_ring_ctx *ctx) #if defined(CONFIG_UNIX) ctx->ring_sock->file = file; + ctx->ring_sock->sk->sk_user_data = ctx; #endif fd_install(ret, file); return ret; @@ -2001,6 +2241,15 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode, break; ret = io_sqe_buffer_unregister(ctx); break; + case IORING_REGISTER_FILES: + ret = io_sqe_files_register(ctx, arg, nr_args); + break; + case IORING_UNREGISTER_FILES: + ret = -EINVAL; + if (arg || nr_args) + break; + ret = io_sqe_files_unregister(ctx); + break; default: ret = -EINVAL; break; diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index cf28f7a11f12..6257478d55e9 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -16,7 +16,7 @@ */ struct io_uring_sqe { __u8 opcode; /* type of operation for this sqe */ - __u8 flags; /* as of now unused */ + __u8 flags; /* IOSQE_ flags */ __u16 ioprio; /* ioprio for the request */ __s32 fd; /* file descriptor to do IO on */ __u64 off; /* offset into file */ @@ -33,6 +33,11 @@ struct io_uring_sqe { }; }; +/* + * sqe->flags + */ +#define IOSQE_FIXED_FILE (1U << 0) /* use fixed fileset */ + /* * io_uring_setup() flags */ @@ -113,5 +118,7 @@ struct io_uring_params { */ #define IORING_REGISTER_BUFFERS 0 #define IORING_UNREGISTER_BUFFERS 1 +#define IORING_REGISTER_FILES 2 +#define IORING_UNREGISTER_FILES 3 #endif -- cgit v1.2.3 From 6c271ce2f1d572f7fa225700a13cfe7ced492434 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 10 Jan 2019 11:22:30 -0700 Subject: io_uring: add submission polling This enables an application to do IO, without ever entering the kernel. By using the SQ ring to fill in new sqes and watching for completions on the CQ ring, we can submit and reap IOs without doing a single system call. The kernel side thread will poll for new submissions, and in case of HIPRI/polled IO, it'll also poll for completions. By default, we allow 1 second of active spinning. This can by changed by passing in a different grace period at io_uring_register(2) time. If the thread exceeds this idle time without having any work to do, it will set: sq_ring->flags |= IORING_SQ_NEED_WAKEUP. The application will have to call io_uring_enter() to start things back up again. If IO is kept busy, that will never be needed. Basically an application that has this feature enabled will guard it's io_uring_enter(2) call with: read_barrier(); if (*sq_ring->flags & IORING_SQ_NEED_WAKEUP) io_uring_enter(fd, 0, 0, IORING_ENTER_SQ_WAKEUP); instead of calling it unconditionally. It's mandatory to use fixed files with this feature. Failure to do so will result in the application getting an -EBADF CQ entry when submitting IO. Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- fs/io_uring.c | 249 ++++++++++++++++++++++++++++++++++++++++-- include/uapi/linux/io_uring.h | 12 +- 2 files changed, 253 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index 5eaa1c4a3f7c..0a4caedf82c1 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -108,12 +109,16 @@ struct io_ring_ctx { unsigned cached_sq_head; unsigned sq_entries; unsigned sq_mask; + unsigned sq_thread_idle; struct io_uring_sqe *sq_sqes; } ____cacheline_aligned_in_smp; /* IO offload */ struct workqueue_struct *sqo_wq; + struct task_struct *sqo_thread; /* if using sq thread polling */ struct mm_struct *sqo_mm; + wait_queue_head_t sqo_wait; + unsigned sqo_stop; struct { /* CQ ring */ @@ -168,6 +173,7 @@ struct sqe_submit { unsigned short index; bool has_user; bool needs_lock; + bool needs_fixed_file; }; struct io_kiocb { @@ -327,6 +333,8 @@ static void io_cqring_add_event(struct io_ring_ctx *ctx, u64 ki_user_data, if (waitqueue_active(&ctx->wait)) wake_up(&ctx->wait); + if (waitqueue_active(&ctx->sqo_wait)) + wake_up(&ctx->sqo_wait); } static void io_ring_drop_ctx_refs(struct io_ring_ctx *ctx, unsigned refs) @@ -680,9 +688,10 @@ static bool io_file_supports_async(struct file *file) return false; } -static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, +static int io_prep_rw(struct io_kiocb *req, const struct sqe_submit *s, bool force_nonblock, struct io_submit_state *state) { + const struct io_uring_sqe *sqe = s->sqe; struct io_ring_ctx *ctx = req->ctx; struct kiocb *kiocb = &req->rw; unsigned ioprio, flags; @@ -702,6 +711,8 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, kiocb->ki_filp = ctx->user_files[fd]; req->flags |= REQ_F_FIXED_FILE; } else { + if (s->needs_fixed_file) + return -EBADF; kiocb->ki_filp = io_file_get(state, fd); if (unlikely(!kiocb->ki_filp)) return -EBADF; @@ -865,7 +876,7 @@ static ssize_t io_read(struct io_kiocb *req, const struct sqe_submit *s, struct file *file; ssize_t ret; - ret = io_prep_rw(req, s->sqe, force_nonblock, state); + ret = io_prep_rw(req, s, force_nonblock, state); if (ret) return ret; file = kiocb->ki_filp; @@ -909,7 +920,7 @@ static ssize_t io_write(struct io_kiocb *req, const struct sqe_submit *s, struct file *file; ssize_t ret; - ret = io_prep_rw(req, s->sqe, force_nonblock, state); + ret = io_prep_rw(req, s, force_nonblock, state); if (ret) return ret; /* Hold on to the file for -EAGAIN */ @@ -1301,6 +1312,169 @@ static bool io_get_sqring(struct io_ring_ctx *ctx, struct sqe_submit *s) return false; } +static int io_submit_sqes(struct io_ring_ctx *ctx, struct sqe_submit *sqes, + unsigned int nr, bool has_user, bool mm_fault) +{ + struct io_submit_state state, *statep = NULL; + int ret, i, submitted = 0; + + if (nr > IO_PLUG_THRESHOLD) { + io_submit_state_start(&state, ctx, nr); + statep = &state; + } + + for (i = 0; i < nr; i++) { + if (unlikely(mm_fault)) { + ret = -EFAULT; + } else { + sqes[i].has_user = has_user; + sqes[i].needs_lock = true; + sqes[i].needs_fixed_file = true; + ret = io_submit_sqe(ctx, &sqes[i], statep); + } + if (!ret) { + submitted++; + continue; + } + + io_cqring_add_event(ctx, sqes[i].sqe->user_data, ret, 0); + } + + if (statep) + io_submit_state_end(&state); + + return submitted; +} + +static int io_sq_thread(void *data) +{ + struct sqe_submit sqes[IO_IOPOLL_BATCH]; + struct io_ring_ctx *ctx = data; + struct mm_struct *cur_mm = NULL; + mm_segment_t old_fs; + DEFINE_WAIT(wait); + unsigned inflight; + unsigned long timeout; + + old_fs = get_fs(); + set_fs(USER_DS); + + timeout = inflight = 0; + while (!kthread_should_stop() && !ctx->sqo_stop) { + bool all_fixed, mm_fault = false; + int i; + + if (inflight) { + unsigned nr_events = 0; + + if (ctx->flags & IORING_SETUP_IOPOLL) { + /* + * We disallow the app entering submit/complete + * with polling, but we still need to lock the + * ring to prevent racing with polled issue + * that got punted to a workqueue. + */ + mutex_lock(&ctx->uring_lock); + io_iopoll_check(ctx, &nr_events, 0); + mutex_unlock(&ctx->uring_lock); + } else { + /* + * Normal IO, just pretend everything completed. + * We don't have to poll completions for that. + */ + nr_events = inflight; + } + + inflight -= nr_events; + if (!inflight) + timeout = jiffies + ctx->sq_thread_idle; + } + + if (!io_get_sqring(ctx, &sqes[0])) { + /* + * We're polling. If we're within the defined idle + * period, then let us spin without work before going + * to sleep. + */ + if (inflight || !time_after(jiffies, timeout)) { + cpu_relax(); + continue; + } + + /* + * Drop cur_mm before scheduling, we can't hold it for + * long periods (or over schedule()). Do this before + * adding ourselves to the waitqueue, as the unuse/drop + * may sleep. + */ + if (cur_mm) { + unuse_mm(cur_mm); + mmput(cur_mm); + cur_mm = NULL; + } + + prepare_to_wait(&ctx->sqo_wait, &wait, + TASK_INTERRUPTIBLE); + + /* Tell userspace we may need a wakeup call */ + ctx->sq_ring->flags |= IORING_SQ_NEED_WAKEUP; + smp_wmb(); + + if (!io_get_sqring(ctx, &sqes[0])) { + if (kthread_should_stop()) { + finish_wait(&ctx->sqo_wait, &wait); + break; + } + if (signal_pending(current)) + flush_signals(current); + schedule(); + finish_wait(&ctx->sqo_wait, &wait); + + ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + smp_wmb(); + continue; + } + finish_wait(&ctx->sqo_wait, &wait); + + ctx->sq_ring->flags &= ~IORING_SQ_NEED_WAKEUP; + smp_wmb(); + } + + i = 0; + all_fixed = true; + do { + if (all_fixed && io_sqe_needs_user(sqes[i].sqe)) + all_fixed = false; + + i++; + if (i == ARRAY_SIZE(sqes)) + break; + } while (io_get_sqring(ctx, &sqes[i])); + + /* Unless all new commands are FIXED regions, grab mm */ + if (!all_fixed && !cur_mm) { + mm_fault = !mmget_not_zero(ctx->sqo_mm); + if (!mm_fault) { + use_mm(ctx->sqo_mm); + cur_mm = ctx->sqo_mm; + } + } + + inflight += io_submit_sqes(ctx, sqes, i, cur_mm != NULL, + mm_fault); + + /* Commit SQ ring head once we've consumed all SQEs */ + io_commit_sqring(ctx); + } + + set_fs(old_fs); + if (cur_mm) { + unuse_mm(cur_mm); + mmput(cur_mm); + } + return 0; +} + static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) { struct io_submit_state state, *statep = NULL; @@ -1319,6 +1493,7 @@ static int io_ring_submit(struct io_ring_ctx *ctx, unsigned int to_submit) s.has_user = true; s.needs_lock = false; + s.needs_fixed_file = false; ret = io_submit_sqe(ctx, &s, statep); if (ret) { @@ -1418,8 +1593,20 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return 0; } +static void io_sq_thread_stop(struct io_ring_ctx *ctx) +{ + if (ctx->sqo_thread) { + ctx->sqo_stop = 1; + mb(); + kthread_stop(ctx->sqo_thread); + ctx->sqo_thread = NULL; + } +} + static void io_finish_async(struct io_ring_ctx *ctx) { + io_sq_thread_stop(ctx); + if (ctx->sqo_wq) { destroy_workqueue(ctx->sqo_wq); ctx->sqo_wq = NULL; @@ -1583,13 +1770,47 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg, return ret; } -static int io_sq_offload_start(struct io_ring_ctx *ctx) +static int io_sq_offload_start(struct io_ring_ctx *ctx, + struct io_uring_params *p) { int ret; + init_waitqueue_head(&ctx->sqo_wait); mmgrab(current->mm); ctx->sqo_mm = current->mm; + ctx->sq_thread_idle = msecs_to_jiffies(p->sq_thread_idle); + if (!ctx->sq_thread_idle) + ctx->sq_thread_idle = HZ; + + ret = -EINVAL; + if (!cpu_possible(p->sq_thread_cpu)) + goto err; + + if (ctx->flags & IORING_SETUP_SQPOLL) { + if (p->flags & IORING_SETUP_SQ_AFF) { + int cpu; + + cpu = array_index_nospec(p->sq_thread_cpu, NR_CPUS); + ctx->sqo_thread = kthread_create_on_cpu(io_sq_thread, + ctx, cpu, + "io_uring-sq"); + } else { + ctx->sqo_thread = kthread_create(io_sq_thread, ctx, + "io_uring-sq"); + } + if (IS_ERR(ctx->sqo_thread)) { + ret = PTR_ERR(ctx->sqo_thread); + ctx->sqo_thread = NULL; + goto err; + } + wake_up_process(ctx->sqo_thread); + } else if (p->flags & IORING_SETUP_SQ_AFF) { + /* Can't have SQ_AFF without SQPOLL */ + ret = -EINVAL; + goto err; + } + /* Do QD, or 2 * CPUS, whatever is smallest */ ctx->sqo_wq = alloc_workqueue("io_ring-wq", WQ_UNBOUND | WQ_FREEZABLE, min(ctx->sq_entries - 1, 2 * num_online_cpus())); @@ -1600,6 +1821,7 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx) return 0; err: + io_sq_thread_stop(ctx); mmdrop(ctx->sqo_mm); ctx->sqo_mm = NULL; return ret; @@ -1959,7 +2181,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, int submitted = 0; struct fd f; - if (flags & ~IORING_ENTER_GETEVENTS) + if (flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP)) return -EINVAL; f = fdget(fd); @@ -1975,6 +2197,18 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (!percpu_ref_tryget(&ctx->refs)) goto out_fput; + /* + * For SQ polling, the thread will do all submissions and completions. + * Just return the requested submit count, and wake the thread if + * we were asked to. + */ + if (ctx->flags & IORING_SETUP_SQPOLL) { + if (flags & IORING_ENTER_SQ_WAKEUP) + wake_up(&ctx->sqo_wait); + submitted = to_submit; + goto out_ctx; + } + ret = 0; if (to_submit) { to_submit = min(to_submit, ctx->sq_entries); @@ -2156,7 +2390,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p) if (ret) goto err; - ret = io_sq_offload_start(ctx); + ret = io_sq_offload_start(ctx, p); if (ret) goto err; @@ -2204,7 +2438,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params) return -EINVAL; } - if (p.flags & ~IORING_SETUP_IOPOLL) + if (p.flags & ~(IORING_SETUP_IOPOLL | IORING_SETUP_SQPOLL | + IORING_SETUP_SQ_AFF)) return -EINVAL; ret = io_uring_create(entries, &p); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 6257478d55e9..0ec74bab8dbe 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -42,6 +42,8 @@ struct io_uring_sqe { * io_uring_setup() flags */ #define IORING_SETUP_IOPOLL (1U << 0) /* io_context is polled */ +#define IORING_SETUP_SQPOLL (1U << 1) /* SQ poll thread */ +#define IORING_SETUP_SQ_AFF (1U << 2) /* sq_thread_cpu is valid */ #define IORING_OP_NOP 0 #define IORING_OP_READV 1 @@ -86,6 +88,11 @@ struct io_sqring_offsets { __u64 resv2; }; +/* + * sq_ring->flags + */ +#define IORING_SQ_NEED_WAKEUP (1U << 0) /* needs io_uring_enter wakeup */ + struct io_cqring_offsets { __u32 head; __u32 tail; @@ -100,6 +107,7 @@ struct io_cqring_offsets { * io_uring_enter(2) flags */ #define IORING_ENTER_GETEVENTS (1U << 0) +#define IORING_ENTER_SQ_WAKEUP (1U << 1) /* * Passed in for io_uring_setup(2). Copied back with updated info on success @@ -108,7 +116,9 @@ struct io_uring_params { __u32 sq_entries; __u32 cq_entries; __u32 flags; - __u32 resv[7]; + __u32 sq_thread_cpu; + __u32 sq_thread_idle; + __u32 resv[5]; struct io_sqring_offsets sq_off; struct io_cqring_offsets cq_off; }; -- cgit v1.2.3 From f7c917ba11a67632a8452ea99fe132f626a7a2cc Mon Sep 17 00:00:00 2001 From: brakmo Date: Fri, 1 Mar 2019 12:38:46 -0800 Subject: bpf: add bpf helper bpf_skb_ecn_set_ce This patch adds a new bpf helper BPF_FUNC_skb_ecn_set_ce "int bpf_skb_ecn_set_ce(struct sk_buff *skb)". It is added to BPF_PROG_TYPE_CGROUP_SKB typed bpf_prog which currently can be attached to the ingress and egress path. The helper is needed because his type of bpf_prog cannot modify the skb directly. This helper is used to set the ECN field of ECN capable IP packets to ce (congestion encountered) in the IPv6 or IPv4 header of the skb. It can be used by a bpf_prog to manage egress or ingress network bandwdith limit per cgroupv2 by inducing an ECN response in the TCP sender. This works best when using DCTCP. Signed-off-by: Lawrence Brakmo Signed-off-by: Martin KaFai Lau Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 10 +++++++++- net/core/filter.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 2e308e90ffea..3c38ac9a92a7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2359,6 +2359,13 @@ union bpf_attr { * Return * A **struct bpf_tcp_sock** pointer on success, or NULL in * case of failure. + * + * int bpf_skb_ecn_set_ce(struct sk_buf *skb) + * Description + * Sets ECN of IP header to ce (congestion encountered) if + * current value is ect (ECN capable). Works with IPv6 and IPv4. + * Return + * 1 if set, 0 if not set. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2457,7 +2464,8 @@ union bpf_attr { FN(spin_lock), \ FN(spin_unlock), \ FN(sk_fullsock), \ - FN(tcp_sock), + FN(tcp_sock), \ + FN(skb_ecn_set_ce), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 85749f6ec789..558ca72f2254 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5426,6 +5426,32 @@ static const struct bpf_func_proto bpf_tcp_sock_proto = { .arg1_type = ARG_PTR_TO_SOCK_COMMON, }; +BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb) +{ + unsigned int iphdr_len; + + if (skb->protocol == cpu_to_be16(ETH_P_IP)) + iphdr_len = sizeof(struct iphdr); + else if (skb->protocol == cpu_to_be16(ETH_P_IPV6)) + iphdr_len = sizeof(struct ipv6hdr); + else + return 0; + + if (skb_headlen(skb) < iphdr_len) + return 0; + + if (skb_cloned(skb) && !skb_clone_writable(skb, iphdr_len)) + return 0; + + return INET_ECN_set_ce(skb); +} + +static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = { + .func = bpf_skb_ecn_set_ce, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; #endif /* CONFIG_INET */ bool bpf_helper_changes_pkt_data(void *func) @@ -5585,6 +5611,8 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) #ifdef CONFIG_INET case BPF_FUNC_tcp_sock: return &bpf_tcp_sock_proto; + case BPF_FUNC_skb_ecn_set_ce: + return &bpf_skb_ecn_set_ce_proto; #endif default: return sk_filter_func_proto(func_id, prog); -- cgit v1.2.3 From 0b5c7efdfc6e389ec6840579fe90bdb6f42b08dc Mon Sep 17 00:00:00 2001 From: Kevin Darbyshire-Bryant Date: Fri, 1 Mar 2019 16:04:05 +0100 Subject: sch_cake: Permit use of connmarks as tin classifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add flag 'FWMARK' to enable use of firewall connmarks as tin selector. The connmark (skbuff->mark) needs to be in the range 1->tin_cnt ie. for diffserv3 the mark needs to be 1->3. Background Typically CAKE uses DSCP as the basis for tin selection. DSCP values are relatively easily changed as part of the egress path, usually with iptables & the mangle table, ingress is more challenging. CAKE is often used on the WAN interface of a residential gateway where passthrough of DSCP from the ISP is either missing or set to unhelpful values thus use of ingress DSCP values for tin selection isn't helpful in that environment. An approach to solving the ingress tin selection problem is to use CAKE's understanding of tc filters. Naive tc filters could match on source/destination port numbers and force tin selection that way, but multiple filters don't scale particularly well as each filter must be traversed whether it matches or not. e.g. a simple example to map 3 firewall marks to tins: MAJOR=$( tc qdisc show dev $DEV | head -1 | awk '{print $3}' ) tc filter add dev $DEV parent $MAJOR protocol all handle 0x01 fw action skbedit priority ${MAJOR}1 tc filter add dev $DEV parent $MAJOR protocol all handle 0x02 fw action skbedit priority ${MAJOR}2 tc filter add dev $DEV parent $MAJOR protocol all handle 0x03 fw action skbedit priority ${MAJOR}3 Another option is to use eBPF cls_act with tc filters e.g. MAJOR=$( tc qdisc show dev $DEV | head -1 | awk '{print $3}' ) tc filter add dev $DEV parent $MAJOR bpf da obj my-bpf-fwmark-to-class.o This has the disadvantages of a) needing someone to write & maintain the bpf program, b) a bpf toolchain to compile it and c) needing to hardcode the major number in the bpf program so it matches the cake instance (or forcing the cake instance to a particular major number) since the major number cannot be passed to the bpf program via tc command line. As already hinted at by the previous examples, it would be helpful to associate tins with something that survives the Internet path and ideally allows tin selection on both egress and ingress. Netfilter's conntrack permits setting an identifying mark on a connection which can also be restored to an ingress packet with tc action connmark e.g. tc filter add dev eth0 parent ffff: protocol all prio 10 u32 \ match u32 0 0 flowid 1:1 action connmark action mirred egress redirect dev ifb1 Since tc's connmark action has restored any connmark into skb->mark, any of the previous solutions are based upon it and in one form or another copy that mark to the skb->priority field where again CAKE picks this up. This change cuts out at least one of the (less intuitive & non-scalable) middlemen and permit direct access to skb->mark. Signed-off-by: Kevin Darbyshire-Bryant Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 1 + net/sched/sch_cake.c | 34 +++++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 7 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 1eb572ef3f27..7ee74c3474bf 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1021,6 +1021,7 @@ enum { TCA_CAKE_INGRESS, TCA_CAKE_ACK_FILTER, TCA_CAKE_SPLIT_GSO, + TCA_CAKE_FWMARK, __TCA_CAKE_MAX }; #define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1) diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 4d688b3b471b..f1cc4779699b 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -258,7 +258,8 @@ enum { CAKE_FLAG_AUTORATE_INGRESS = BIT(1), CAKE_FLAG_INGRESS = BIT(2), CAKE_FLAG_WASH = BIT(3), - CAKE_FLAG_SPLIT_GSO = BIT(4) + CAKE_FLAG_SPLIT_GSO = BIT(4), + CAKE_FLAG_FWMARK = BIT(5) }; /* COBALT operates the Codel and BLUE algorithms in parallel, in order to @@ -1566,12 +1567,20 @@ static struct cake_tin_data *cake_select_tin(struct Qdisc *sch, if (q->rate_flags & CAKE_FLAG_WASH) cake_wash_diffserv(skb); } else if (q->tin_mode != CAKE_DIFFSERV_BESTEFFORT) { - /* extract the Diffserv Precedence field, if it exists */ - /* and clear DSCP bits if washing */ - tin = q->tin_index[cake_handle_diffserv(skb, - q->rate_flags & CAKE_FLAG_WASH)]; - if (unlikely(tin >= q->tin_cnt)) - tin = 0; + if (q->rate_flags & CAKE_FLAG_FWMARK && /* use fw mark */ + skb->mark && + skb->mark <= q->tin_cnt) { + tin = q->tin_order[skb->mark - 1]; + if (q->rate_flags & CAKE_FLAG_WASH) + cake_wash_diffserv(skb); + } else { + /* extract the Diffserv Precedence field, if it exists */ + /* and clear DSCP bits if washing */ + tin = q->tin_index[cake_handle_diffserv(skb, + q->rate_flags & CAKE_FLAG_WASH)]; + if (unlikely(tin >= q->tin_cnt)) + tin = 0; + } } else { tin = 0; if (q->rate_flags & CAKE_FLAG_WASH) @@ -2624,6 +2633,13 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt, q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO; } + if (tb[TCA_CAKE_FWMARK]) { + if (!!nla_get_u32(tb[TCA_CAKE_FWMARK])) + q->rate_flags |= CAKE_FLAG_FWMARK; + else + q->rate_flags &= ~CAKE_FLAG_FWMARK; + } + if (q->tins) { sch_tree_lock(sch); cake_reconfigure(sch); @@ -2783,6 +2799,10 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb) !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO))) goto nla_put_failure; + if (nla_put_u32(skb, TCA_CAKE_FWMARK, + !!(q->rate_flags & CAKE_FLAG_FWMARK))) + goto nla_put_failure; + return nla_nest_end(skb, opts); nla_put_failure: -- cgit v1.2.3 From 9036b2fe092a107856edd1a3bad48b83f2b45000 Mon Sep 17 00:00:00 2001 From: Francesco Ruggeri Date: Fri, 1 Mar 2019 15:31:03 -0800 Subject: net: ipv6: add socket option IPV6_ROUTER_ALERT_ISOLATE By default IPv6 socket with IPV6_ROUTER_ALERT socket option set will receive all IPv6 RA packets from all namespaces. IPV6_ROUTER_ALERT_ISOLATE socket option restricts packets received by the socket to be only from the socket's namespace. Signed-off-by: Maxim Martynov Signed-off-by: Francesco Ruggeri Reviewed-by: David Ahern Signed-off-by: David S. Miller --- include/linux/ipv6.h | 3 ++- include/uapi/linux/in6.h | 1 + net/ipv6/ip6_output.c | 6 ++++++ net/ipv6/ipv6_sockglue.c | 10 ++++++++++ 4 files changed, 19 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 6d45ce784bea..ea7c7906591e 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -281,7 +281,8 @@ struct ipv6_pinfo { dontfrag:1, autoflowlabel:1, autoflowlabel_set:1, - mc_all:1; + mc_all:1, + rtalert_isolate:1; __u8 min_hopcount; __u8 tclass; __be32 rcv_flowinfo; diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index 71d82fe15b03..9f2273a08356 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -178,6 +178,7 @@ struct in6_flowlabel_req { #define IPV6_JOIN_ANYCAST 27 #define IPV6_LEAVE_ANYCAST 28 #define IPV6_MULTICAST_ALL 29 +#define IPV6_ROUTER_ALERT_ISOLATE 30 /* IPV6_MTU_DISCOVER values */ #define IPV6_PMTUDISC_DONT 0 diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 5f9fa0302b5a..edbd12067170 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -300,6 +300,12 @@ static int ip6_call_ra_chain(struct sk_buff *skb, int sel) if (sk && ra->sel == sel && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == skb->dev->ifindex)) { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (np && np->rtalert_isolate && + !net_eq(sock_net(sk), dev_net(skb->dev))) { + continue; + } if (last) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); if (skb2) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 973e215c3114..40f21fef25ff 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -787,6 +787,12 @@ done: goto e_inval; retv = ip6_ra_control(sk, val); break; + case IPV6_ROUTER_ALERT_ISOLATE: + if (optlen < sizeof(int)) + goto e_inval; + np->rtalert_isolate = valbool; + retv = 0; + break; case IPV6_MTU_DISCOVER: if (optlen < sizeof(int)) goto e_inval; @@ -1358,6 +1364,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->rxopt.bits.recvfragsize; break; + case IPV6_ROUTER_ALERT_ISOLATE: + val = np->rtalert_isolate; + break; + default: return -ENOPROTOOPT; } -- cgit v1.2.3 From ca215086b14b89a0e70fc211314944aa6ce50020 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 5 Mar 2019 15:42:23 -0800 Subject: mm: convert PG_balloon to PG_offline PG_balloon was introduced to implement page migration/compaction for pages inflated in virtio-balloon. Nowadays, it is only a marker that a page is part of virtio-balloon and therefore logically offline. We also want to make use of this flag in other balloon drivers - for inflated pages or when onlining a section but keeping some pages offline (e.g. used right now by XEN and Hyper-V via set_online_page_callback()). We are going to expose this flag to dump tools like makedumpfile. But instead of exposing PG_balloon, let's generalize the concept of marking pages as logically offline, so it can be reused for other purposes later on. Rename PG_balloon to PG_offline. This is an indicator that the page is logically offline, the content stale and that it should not be touched (e.g. a hypervisor would have to allocate backing storage in order for the guest to dump an unused page). We can then e.g. exclude such pages from dumps. We replace and reuse KPF_BALLOON (23), as this shouldn't really harm (and for now the semantics stay the same). In following patches, we will make use of this bit also in other balloon drivers. While at it, document PGTABLE. [akpm@linux-foundation.org: fix comment text, per David] Link: http://lkml.kernel.org/r/20181119101616.8901-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Konstantin Khlebnikov Acked-by: Michael S. Tsirkin Acked-by: Pankaj gupta Cc: Jonathan Corbet Cc: Alexey Dobriyan Cc: Mike Rapoport Cc: Christian Hansen Cc: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Stephen Rothwell Cc: Matthew Wilcox Cc: Michal Hocko Cc: Pavel Tatashin Cc: Alexander Duyck Cc: Naoya Horiguchi Cc: Miles Chen Cc: David Rientjes Cc: Kazuhito Hagio Cc: Arnd Bergmann Cc: Baoquan He Cc: Borislav Petkov Cc: Boris Ostrovsky Cc: Dave Young Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Juergen Gross Cc: Julien Freche Cc: Kairui Song Cc: "K. Y. Srinivasan" Cc: Len Brown Cc: Lianbo Jiang Cc: Michal Hocko Cc: Nadav Amit Cc: Omar Sandoval Cc: Pavel Machek Cc: Rafael J. Wysocki Cc: "Rafael J. Wysocki" Cc: Stefano Stabellini Cc: Stephen Hemminger Cc: Vitaly Kuznetsov Cc: Xavier Deguillard Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/mm/pagemap.rst | 9 ++++++--- fs/proc/page.c | 4 ++-- include/linux/balloon_compaction.h | 8 ++++---- include/linux/page-flags.h | 11 +++++++---- include/uapi/linux/kernel-page-flags.h | 2 +- tools/vm/page-types.c | 2 +- 6 files changed, 21 insertions(+), 15 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst index 3f7bade2c231..340a5aee9b80 100644 --- a/Documentation/admin-guide/mm/pagemap.rst +++ b/Documentation/admin-guide/mm/pagemap.rst @@ -75,9 +75,10 @@ number of times a page is mapped. 20. NOPAGE 21. KSM 22. THP - 23. BALLOON + 23. OFFLINE 24. ZERO_PAGE 25. IDLE + 26. PGTABLE * ``/proc/kpagecgroup``. This file contains a 64-bit inode number of the memory cgroup each page is charged to, indexed by PFN. Only available when @@ -118,8 +119,8 @@ Short descriptions to the page flags identical memory pages dynamically shared between one or more processes 22 - THP contiguous pages which construct transparent hugepages -23 - BALLOON - balloon compaction page +23 - OFFLINE + page is logically offline 24 - ZERO_PAGE zero page for pfn_zero or huge_zero page 25 - IDLE @@ -128,6 +129,8 @@ Short descriptions to the page flags Note that this flag may be stale in case the page was accessed via a PTE. To make sure the flag is up-to-date one has to read ``/sys/kernel/mm/page_idle/bitmap`` first. +26 - PGTABLE + page is in use as a page table IO related page flags --------------------- diff --git a/fs/proc/page.c b/fs/proc/page.c index 40b05e0d4274..544d1ee15aee 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -152,8 +152,8 @@ u64 stable_page_flags(struct page *page) else if (page_count(page) == 0 && is_free_buddy_page(page)) u |= 1 << KPF_BUDDY; - if (PageBalloon(page)) - u |= 1 << KPF_BALLOON; + if (PageOffline(page)) + u |= 1 << KPF_OFFLINE; if (PageTable(page)) u |= 1 << KPF_PGTABLE; diff --git a/include/linux/balloon_compaction.h b/include/linux/balloon_compaction.h index cbe50da5a59d..f111c780ef1d 100644 --- a/include/linux/balloon_compaction.h +++ b/include/linux/balloon_compaction.h @@ -95,7 +95,7 @@ extern int balloon_page_migrate(struct address_space *mapping, static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { - __SetPageBalloon(page); + __SetPageOffline(page); __SetPageMovable(page, balloon->inode->i_mapping); set_page_private(page, (unsigned long)balloon); list_add(&page->lru, &balloon->pages); @@ -111,7 +111,7 @@ static inline void balloon_page_insert(struct balloon_dev_info *balloon, */ static inline void balloon_page_delete(struct page *page) { - __ClearPageBalloon(page); + __ClearPageOffline(page); __ClearPageMovable(page); set_page_private(page, 0); /* @@ -141,13 +141,13 @@ static inline gfp_t balloon_mapping_gfp_mask(void) static inline void balloon_page_insert(struct balloon_dev_info *balloon, struct page *page) { - __SetPageBalloon(page); + __SetPageOffline(page); list_add(&page->lru, &balloon->pages); } static inline void balloon_page_delete(struct page *page) { - __ClearPageBalloon(page); + __ClearPageOffline(page); list_del(&page->lru); } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 39b4494e29f1..808b4183e30d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -671,7 +671,7 @@ PAGEFLAG_FALSE(DoubleMap) /* Reserve 0x0000007f to catch underflows of page_mapcount */ #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 -#define PG_balloon 0x00000100 +#define PG_offline 0x00000100 #define PG_kmemcg 0x00000200 #define PG_table 0x00000400 @@ -706,10 +706,13 @@ static __always_inline void __ClearPage##uname(struct page *page) \ PAGE_TYPE_OPS(Buddy, buddy) /* - * PageBalloon() is true for pages that are on the balloon page list - * (see mm/balloon_compaction.c). + * PageOffline() indicates that the page is logically offline although the + * containing section is online. (e.g. inflated in a balloon driver or + * not onlined when onlining the section). + * The content of these pages is effectively stale. Such pages should not + * be touched (read/write/dump/save) except by their owner. */ -PAGE_TYPE_OPS(Balloon, balloon) +PAGE_TYPE_OPS(Offline, offline) /* * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index 21b9113c69da..6f2f2720f3ac 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -32,7 +32,7 @@ #define KPF_KSM 21 #define KPF_THP 22 -#define KPF_BALLOON 23 +#define KPF_OFFLINE 23 #define KPF_ZERO_PAGE 24 #define KPF_IDLE 25 #define KPF_PGTABLE 26 diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 1ff3a6c0367b..6f64b2b93234 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -133,7 +133,7 @@ static const char * const page_flag_names[] = { [KPF_NOPAGE] = "n:nopage", [KPF_KSM] = "x:ksm", [KPF_THP] = "t:thp", - [KPF_BALLOON] = "o:balloon", + [KPF_OFFLINE] = "o:offline", [KPF_PGTABLE] = "g:pgtable", [KPF_ZERO_PAGE] = "z:zero_page", [KPF_IDLE] = "i:idle_page", -- cgit v1.2.3 From ab3948f58ff841e51feb845720624665ef5b7ef3 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 5 Mar 2019 15:47:54 -0800 Subject: mm/memfd: add an F_SEAL_FUTURE_WRITE seal to memfd Android uses ashmem for sharing memory regions. We are looking forward to migrating all usecases of ashmem to memfd so that we can possibly remove the ashmem driver in the future from staging while also benefiting from using memfd and contributing to it. Note staging drivers are also not ABI and generally can be removed at anytime. One of the main usecases Android has is the ability to create a region and mmap it as writeable, then add protection against making any "future" writes while keeping the existing already mmap'ed writeable-region active. This allows us to implement a usecase where receivers of the shared memory buffer can get a read-only view, while the sender continues to write to the buffer. See CursorWindow documentation in Android for more details: https://developer.android.com/reference/android/database/CursorWindow This usecase cannot be implemented with the existing F_SEAL_WRITE seal. To support the usecase, this patch adds a new F_SEAL_FUTURE_WRITE seal which prevents any future mmap and write syscalls from succeeding while keeping the existing mmap active. A better way to do F_SEAL_FUTURE_WRITE seal was discussed [1] last week where we don't need to modify core VFS structures to get the same behavior of the seal. This solves several side-effects pointed by Andy. self-tests are provided in later patch to verify the expected semantics. [1] https://lore.kernel.org/lkml/20181111173650.GA256781@google.com/ Thanks a lot to Andy for suggestions to improve code. Link: http://lkml.kernel.org/r/20190112203816.85534-2-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Acked-by: John Stultz Cc: Andy Lutomirski Cc: Minchan Kim Cc: Jann Horn Cc: Al Viro Cc: Andy Lutomirski Cc: Hugh Dickins Cc: J. Bruce Fields Cc: Jeff Layton Cc: Marc-Andr Lureau Cc: Matthew Wilcox Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 2 +- include/uapi/linux/fcntl.h | 1 + mm/memfd.c | 3 ++- mm/shmem.c | 25 ++++++++++++++++++++++--- 4 files changed, 26 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a7fa037b876b..b0eef008de67 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) inode_lock(inode); /* protected by i_mutex */ - if (info->seals & F_SEAL_WRITE) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { inode_unlock(inode); return -EPERM; } diff --git a/include/uapi/linux/fcntl.h b/include/uapi/linux/fcntl.h index 6448cdd9a350..a2f8658f1c55 100644 --- a/include/uapi/linux/fcntl.h +++ b/include/uapi/linux/fcntl.h @@ -41,6 +41,7 @@ #define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */ #define F_SEAL_GROW 0x0004 /* prevent file from growing */ #define F_SEAL_WRITE 0x0008 /* prevent writes */ +#define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */ /* (1U << 31) is reserved for signed error codes */ /* diff --git a/mm/memfd.c b/mm/memfd.c index 97264c79d2cd..650e65a46b9c 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -131,7 +131,8 @@ static unsigned int *memfd_file_seals_ptr(struct file *file) #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ - F_SEAL_WRITE) + F_SEAL_WRITE | \ + F_SEAL_FUTURE_WRITE) static int memfd_add_seals(struct file *file, unsigned int seals) { diff --git a/mm/shmem.c b/mm/shmem.c index 283a1833dafc..b3db3779a30a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2190,6 +2190,24 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { + struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + + if (info->seals & F_SEAL_FUTURE_WRITE) { + /* + * New PROT_WRITE and MAP_SHARED mmaps are not allowed when + * "future write" seal active. + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return -EPERM; + + /* + * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED + * read-only mapping, take care to not allow mprotect to revert + * protections. + */ + vma->vm_flags &= ~(VM_MAYWRITE); + } + file_accessed(file); vma->vm_ops = &shmem_vm_ops; if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && @@ -2440,8 +2458,9 @@ shmem_write_begin(struct file *file, struct address_space *mapping, pgoff_t index = pos >> PAGE_SHIFT; /* i_mutex is held by caller */ - if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) { - if (info->seals & F_SEAL_WRITE) + if (unlikely(info->seals & (F_SEAL_GROW | + F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) return -EPERM; if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size) return -EPERM; @@ -2704,7 +2723,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); /* protected by i_mutex */ - if (info->seals & F_SEAL_WRITE) { + if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; } -- cgit v1.2.3 From 221c5eb2338232f7340386de1c43decc32682e58 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 17 Jan 2019 09:41:58 -0700 Subject: io_uring: add support for IORING_OP_POLL This is basically a direct port of bfe4037e722e, which implements a one-shot poll command through aio. Description below is based on that commit as well. However, instead of adding a POLL command and relying on io_cancel(2) to remove it, we mimic the epoll(2) interface of having a command to add a poll notification, IORING_OP_POLL_ADD, and one to remove it again, IORING_OP_POLL_REMOVE. To poll for a file descriptor the application should submit an sqe of type IORING_OP_POLL. It will poll the fd for the events specified in the poll_events field. Unlike poll or epoll without EPOLLONESHOT this interface always works in one shot mode, that is once the sqe is completed, it will have to be resubmitted. Reviewed-by: Hannes Reinecke Based-on-code-from: Christoph Hellwig Signed-off-by: Jens Axboe --- fs/io_uring.c | 263 +++++++++++++++++++++++++++++++++++++++++- include/uapi/linux/io_uring.h | 3 + 2 files changed, 265 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/fs/io_uring.c b/fs/io_uring.c index d2a3a1bc85cc..85b914bd823c 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -161,6 +161,7 @@ struct io_ring_ctx { * manipulate the list, hence no extra locking is needed there. */ struct list_head poll_list; + struct list_head cancel_list; } ____cacheline_aligned_in_smp; #if defined(CONFIG_UNIX) @@ -176,8 +177,20 @@ struct sqe_submit { bool needs_fixed_file; }; +struct io_poll_iocb { + struct file *file; + struct wait_queue_head *head; + __poll_t events; + bool woken; + bool canceled; + struct wait_queue_entry wait; +}; + struct io_kiocb { - struct kiocb rw; + union { + struct kiocb rw; + struct io_poll_iocb poll; + }; struct sqe_submit submit; @@ -261,6 +274,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p) init_waitqueue_head(&ctx->wait); spin_lock_init(&ctx->completion_lock); INIT_LIST_HEAD(&ctx->poll_list); + INIT_LIST_HEAD(&ctx->cancel_list); return ctx; } @@ -1058,6 +1072,246 @@ static int io_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static void io_poll_remove_one(struct io_kiocb *req) +{ + struct io_poll_iocb *poll = &req->poll; + + spin_lock(&poll->head->lock); + WRITE_ONCE(poll->canceled, true); + if (!list_empty(&poll->wait.entry)) { + list_del_init(&poll->wait.entry); + queue_work(req->ctx->sqo_wq, &req->work); + } + spin_unlock(&poll->head->lock); + + list_del_init(&req->list); +} + +static void io_poll_remove_all(struct io_ring_ctx *ctx) +{ + struct io_kiocb *req; + + spin_lock_irq(&ctx->completion_lock); + while (!list_empty(&ctx->cancel_list)) { + req = list_first_entry(&ctx->cancel_list, struct io_kiocb,list); + io_poll_remove_one(req); + } + spin_unlock_irq(&ctx->completion_lock); +} + +/* + * Find a running poll command that matches one specified in sqe->addr, + * and remove it if found. + */ +static int io_poll_remove(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_ring_ctx *ctx = req->ctx; + struct io_kiocb *poll_req, *next; + int ret = -ENOENT; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->len || sqe->buf_index || + sqe->poll_events) + return -EINVAL; + + spin_lock_irq(&ctx->completion_lock); + list_for_each_entry_safe(poll_req, next, &ctx->cancel_list, list) { + if (READ_ONCE(sqe->addr) == poll_req->user_data) { + io_poll_remove_one(poll_req); + ret = 0; + break; + } + } + spin_unlock_irq(&ctx->completion_lock); + + io_cqring_add_event(req->ctx, sqe->user_data, ret, 0); + io_free_req(req); + return 0; +} + +static void io_poll_complete(struct io_kiocb *req, __poll_t mask) +{ + io_cqring_add_event(req->ctx, req->user_data, mangle_poll(mask), 0); + io_fput(req); + io_free_req(req); +} + +static void io_poll_complete_work(struct work_struct *work) +{ + struct io_kiocb *req = container_of(work, struct io_kiocb, work); + struct io_poll_iocb *poll = &req->poll; + struct poll_table_struct pt = { ._key = poll->events }; + struct io_ring_ctx *ctx = req->ctx; + __poll_t mask = 0; + + if (!READ_ONCE(poll->canceled)) + mask = vfs_poll(poll->file, &pt) & poll->events; + + /* + * Note that ->ki_cancel callers also delete iocb from active_reqs after + * calling ->ki_cancel. We need the ctx_lock roundtrip here to + * synchronize with them. In the cancellation case the list_del_init + * itself is not actually needed, but harmless so we keep it in to + * avoid further branches in the fast path. + */ + spin_lock_irq(&ctx->completion_lock); + if (!mask && !READ_ONCE(poll->canceled)) { + add_wait_queue(poll->head, &poll->wait); + spin_unlock_irq(&ctx->completion_lock); + return; + } + list_del_init(&req->list); + spin_unlock_irq(&ctx->completion_lock); + + io_poll_complete(req, mask); +} + +static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + void *key) +{ + struct io_poll_iocb *poll = container_of(wait, struct io_poll_iocb, + wait); + struct io_kiocb *req = container_of(poll, struct io_kiocb, poll); + struct io_ring_ctx *ctx = req->ctx; + __poll_t mask = key_to_poll(key); + + poll->woken = true; + + /* for instances that support it check for an event match first: */ + if (mask) { + unsigned long flags; + + if (!(mask & poll->events)) + return 0; + + /* try to complete the iocb inline if we can: */ + if (spin_trylock_irqsave(&ctx->completion_lock, flags)) { + list_del(&req->list); + spin_unlock_irqrestore(&ctx->completion_lock, flags); + + list_del_init(&poll->wait.entry); + io_poll_complete(req, mask); + return 1; + } + } + + list_del_init(&poll->wait.entry); + queue_work(ctx->sqo_wq, &req->work); + return 1; +} + +struct io_poll_table { + struct poll_table_struct pt; + struct io_kiocb *req; + int error; +}; + +static void io_poll_queue_proc(struct file *file, struct wait_queue_head *head, + struct poll_table_struct *p) +{ + struct io_poll_table *pt = container_of(p, struct io_poll_table, pt); + + if (unlikely(pt->req->poll.head)) { + pt->error = -EINVAL; + return; + } + + pt->error = 0; + pt->req->poll.head = head; + add_wait_queue(head, &pt->req->poll.wait); +} + +static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_poll_iocb *poll = &req->poll; + struct io_ring_ctx *ctx = req->ctx; + struct io_poll_table ipt; + unsigned flags; + __poll_t mask; + u16 events; + int fd; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index) + return -EINVAL; + + INIT_WORK(&req->work, io_poll_complete_work); + events = READ_ONCE(sqe->poll_events); + poll->events = demangle_poll(events) | EPOLLERR | EPOLLHUP; + + flags = READ_ONCE(sqe->flags); + fd = READ_ONCE(sqe->fd); + + if (flags & IOSQE_FIXED_FILE) { + if (unlikely(!ctx->user_files || fd >= ctx->nr_user_files)) + return -EBADF; + poll->file = ctx->user_files[fd]; + req->flags |= REQ_F_FIXED_FILE; + } else { + poll->file = fget(fd); + } + if (unlikely(!poll->file)) + return -EBADF; + + poll->head = NULL; + poll->woken = false; + poll->canceled = false; + + ipt.pt._qproc = io_poll_queue_proc; + ipt.pt._key = poll->events; + ipt.req = req; + ipt.error = -EINVAL; /* same as no support for IOCB_CMD_POLL */ + + /* initialized the list so that we can do list_empty checks */ + INIT_LIST_HEAD(&poll->wait.entry); + init_waitqueue_func_entry(&poll->wait, io_poll_wake); + + /* one for removal from waitqueue, one for this function */ + refcount_set(&req->refs, 2); + + mask = vfs_poll(poll->file, &ipt.pt) & poll->events; + if (unlikely(!poll->head)) { + /* we did not manage to set up a waitqueue, done */ + goto out; + } + + spin_lock_irq(&ctx->completion_lock); + spin_lock(&poll->head->lock); + if (poll->woken) { + /* wake_up context handles the rest */ + mask = 0; + ipt.error = 0; + } else if (mask || ipt.error) { + /* if we get an error or a mask we are done */ + WARN_ON_ONCE(list_empty(&poll->wait.entry)); + list_del_init(&poll->wait.entry); + } else { + /* actually waiting for an event */ + list_add_tail(&req->list, &ctx->cancel_list); + } + spin_unlock(&poll->head->lock); + spin_unlock_irq(&ctx->completion_lock); + +out: + if (unlikely(ipt.error)) { + if (!(flags & IOSQE_FIXED_FILE)) + fput(poll->file); + /* + * Drop one of our refs to this req, __io_submit_sqe() will + * drop the other one since we're returning an error. + */ + io_free_req(req); + return ipt.error; + } + + if (mask) + io_poll_complete(req, mask); + io_free_req(req); + return 0; +} + static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, const struct sqe_submit *s, bool force_nonblock, struct io_submit_state *state) @@ -1093,6 +1347,12 @@ static int __io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, case IORING_OP_FSYNC: ret = io_fsync(req, s->sqe, force_nonblock); break; + case IORING_OP_POLL_ADD: + ret = io_poll_add(req, s->sqe); + break; + case IORING_OP_POLL_REMOVE: + ret = io_poll_remove(req, s->sqe); + break; default: ret = -EINVAL; break; @@ -2131,6 +2391,7 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) percpu_ref_kill(&ctx->refs); mutex_unlock(&ctx->uring_lock); + io_poll_remove_all(ctx); io_iopoll_reap_events(ctx); wait_for_completion(&ctx->ctx_done); io_ring_ctx_free(ctx); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index 0ec74bab8dbe..e23408692118 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -25,6 +25,7 @@ struct io_uring_sqe { union { __kernel_rwf_t rw_flags; __u32 fsync_flags; + __u16 poll_events; }; __u64 user_data; /* data to be passed back at completion time */ union { @@ -51,6 +52,8 @@ struct io_uring_sqe { #define IORING_OP_FSYNC 3 #define IORING_OP_READ_FIXED 4 #define IORING_OP_WRITE_FIXED 5 +#define IORING_OP_POLL_ADD 6 +#define IORING_OP_POLL_REMOVE 7 /* * sqe->fsync_flags -- cgit v1.2.3 From 54d50897d544c874562253e2a8f70dfcad22afe8 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Mar 2019 16:27:14 -0800 Subject: linux/kernel.h: split *_MAX and *_MIN macros into tends to be cluttered because we often put various sort of unrelated stuff in it. So, we have split out a sensible chunk of code into a separate header from time to time. This commit splits out the *_MAX and *_MIN defines. The standard header contains various MAX, MIN constants including numerial limits. [1] I think it makes sense to move in-kernel MAX, MIN constants into include/linux/limits.h. We already have include/uapi/linux/limits.h to contain some user-space constants. I changed its include guard to _UAPI_LINUX_LIMITS_H. This change has no impact to the user-space because scripts/headers_install.sh rips off the '_UAPI' prefix from the include guards of exported headers. [1] http://pubs.opengroup.org/onlinepubs/009604499/basedefs/limits.h.html Link: http://lkml.kernel.org/r/1549156242-20806-2-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Alex Elder Cc: Alexey Dobriyan Cc: Zhang Yanmin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 29 +---------------------------- include/linux/limits.h | 36 ++++++++++++++++++++++++++++++++++++ include/uapi/linux/limits.h | 4 ++-- 3 files changed, 39 insertions(+), 30 deletions(-) create mode 100644 include/linux/limits.h (limited to 'include/uapi/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index a9ff66977e10..34a5036debd3 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -4,6 +4,7 @@ #include +#include #include #include #include @@ -17,34 +18,6 @@ #include #include -#define USHRT_MAX ((unsigned short)~0U) -#define SHRT_MAX ((short)(USHRT_MAX>>1)) -#define SHRT_MIN ((short)(-SHRT_MAX - 1)) -#define INT_MAX ((int)(~0U>>1)) -#define INT_MIN (-INT_MAX - 1) -#define UINT_MAX (~0U) -#define LONG_MAX ((long)(~0UL>>1)) -#define LONG_MIN (-LONG_MAX - 1) -#define ULONG_MAX (~0UL) -#define LLONG_MAX ((long long)(~0ULL>>1)) -#define LLONG_MIN (-LLONG_MAX - 1) -#define ULLONG_MAX (~0ULL) -#define SIZE_MAX (~(size_t)0) -#define PHYS_ADDR_MAX (~(phys_addr_t)0) - -#define U8_MAX ((u8)~0U) -#define S8_MAX ((s8)(U8_MAX>>1)) -#define S8_MIN ((s8)(-S8_MAX - 1)) -#define U16_MAX ((u16)~0U) -#define S16_MAX ((s16)(U16_MAX>>1)) -#define S16_MIN ((s16)(-S16_MAX - 1)) -#define U32_MAX ((u32)~0U) -#define S32_MAX ((s32)(U32_MAX>>1)) -#define S32_MIN ((s32)(-S32_MAX - 1)) -#define U64_MAX ((u64)~0ULL) -#define S64_MAX ((s64)(U64_MAX>>1)) -#define S64_MIN ((s64)(-S64_MAX - 1)) - #define STACK_MAGIC 0xdeadbeef /** diff --git a/include/linux/limits.h b/include/linux/limits.h new file mode 100644 index 000000000000..76afcd24ff8c --- /dev/null +++ b/include/linux/limits.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_LIMITS_H +#define _LINUX_LIMITS_H + +#include +#include + +#define USHRT_MAX ((unsigned short)~0U) +#define SHRT_MAX ((short)(USHRT_MAX >> 1)) +#define SHRT_MIN ((short)(-SHRT_MAX - 1)) +#define INT_MAX ((int)(~0U >> 1)) +#define INT_MIN (-INT_MAX - 1) +#define UINT_MAX (~0U) +#define LONG_MAX ((long)(~0UL >> 1)) +#define LONG_MIN (-LONG_MAX - 1) +#define ULONG_MAX (~0UL) +#define LLONG_MAX ((long long)(~0ULL >> 1)) +#define LLONG_MIN (-LLONG_MAX - 1) +#define ULLONG_MAX (~0ULL) +#define SIZE_MAX (~(size_t)0) +#define PHYS_ADDR_MAX (~(phys_addr_t)0) + +#define U8_MAX ((u8)~0U) +#define S8_MAX ((s8)(U8_MAX >> 1)) +#define S8_MIN ((s8)(-S8_MAX - 1)) +#define U16_MAX ((u16)~0U) +#define S16_MAX ((s16)(U16_MAX >> 1)) +#define S16_MIN ((s16)(-S16_MAX - 1)) +#define U32_MAX ((u32)~0U) +#define S32_MAX ((s32)(U32_MAX >> 1)) +#define S32_MIN ((s32)(-S32_MAX - 1)) +#define U64_MAX ((u64)~0ULL) +#define S64_MAX ((s64)(U64_MAX >> 1)) +#define S64_MIN ((s64)(-S64_MAX - 1)) + +#endif /* _LINUX_LIMITS_H */ diff --git a/include/uapi/linux/limits.h b/include/uapi/linux/limits.h index c3547f07605c..6bcbe3068761 100644 --- a/include/uapi/linux/limits.h +++ b/include/uapi/linux/limits.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _LINUX_LIMITS_H -#define _LINUX_LIMITS_H +#ifndef _UAPI_LINUX_LIMITS_H +#define _UAPI_LINUX_LIMITS_H #define NR_OPEN 1024 -- cgit v1.2.3 From 60d6d04ca3abb34d5e89f030dbea440d9715a168 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Mar 2019 16:29:09 -0800 Subject: autofs: add ignore mount option Add an autofs file system mount option that can be used to provide a generic indicator to applications that the mount entry should be ignored when displaying mount information. In other OSes that provide autofs and that provide a mount list to user space based on the kernel mount list a no-op mount option ("ignore" is the one use on the most common OS) is allowed so that autofs file system users can optionally use it. The idea is that it be used by user space programs to exclude autofs mounts from consideration when reading the mounts list. Prior to the change to link /etc/mtab to /proc/self/mounts all I needed to do to achieve this was to use mount(2) and not update the mtab but now that no longer works. I know the symlinking happened a long time ago and I considered doing this then but, at the time I couldn't remember the commonly used option name and thought persuading the various utility maintainers would be too hard. But now I have a RHEL request to do this for compatibility for a widely used product so I want to go ahead with it and try and enlist the help of some utility package maintainers. Clearly, without the option nothing can be done so it's at least a start. Link: http://lkml.kernel.org/r/154725123970.11260.6113771566924907275.stgit@pluto-themaw-net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/autofs_i.h | 1 + fs/autofs/inode.c | 9 ++++++++- include/uapi/linux/auto_fs.h | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 3e59f0ed777b..b735f2b1e462 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -105,6 +105,7 @@ struct autofs_wait_queue { #define AUTOFS_SBI_CATATONIC 0x0001 #define AUTOFS_SBI_STRICTEXPIRE 0x0002 +#define AUTOFS_SBI_IGNORE 0x0004 struct autofs_sb_info { u32 magic; diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 078992eee299..8647ecaa89fc 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -89,6 +89,8 @@ static int autofs_show_options(struct seq_file *m, struct dentry *root) seq_printf(m, ",indirect"); if (sbi->flags & AUTOFS_SBI_STRICTEXPIRE) seq_printf(m, ",strictexpire"); + if (sbi->flags & AUTOFS_SBI_IGNORE) + seq_printf(m, ",ignore"); #ifdef CONFIG_CHECKPOINT_RESTORE if (sbi->pipe) seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); @@ -111,7 +113,8 @@ static const struct super_operations autofs_sops = { }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, - Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire}; + Opt_indirect, Opt_direct, Opt_offset, Opt_strictexpire, + Opt_ignore}; static const match_table_t tokens = { {Opt_fd, "fd=%u"}, @@ -124,6 +127,7 @@ static const match_table_t tokens = { {Opt_direct, "direct"}, {Opt_offset, "offset"}, {Opt_strictexpire, "strictexpire"}, + {Opt_ignore, "ignore"}, {Opt_err, NULL} }; @@ -206,6 +210,9 @@ static int parse_options(char *options, case Opt_strictexpire: sbi->flags |= AUTOFS_SBI_STRICTEXPIRE; break; + case Opt_ignore: + sbi->flags |= AUTOFS_SBI_IGNORE; + break; default: return 1; } diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index 082119630b49..1f7925afad2d 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -23,7 +23,7 @@ #define AUTOFS_MIN_PROTO_VERSION 3 #define AUTOFS_MAX_PROTO_VERSION 5 -#define AUTOFS_PROTO_SUBVERSION 4 +#define AUTOFS_PROTO_SUBVERSION 5 /* * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed -- cgit v1.2.3 From 6eb3c3d0a52dca337e327ae8868ca1f44a712e02 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 7 Mar 2019 16:29:26 -0800 Subject: exec: increase BINPRM_BUF_SIZE to 256 Large enterprise clients often run applications out of networked file systems where the IT mandated layout of project volumes can end up leading to paths that are longer than 128 characters. Bumping this up to the next order of two solves this problem in all but the most egregious case while still fitting into a 512b slab. [oleg@redhat.com: update comment, per Kees] Link: http://lkml.kernel.org/r/20181112160956.GA28472@redhat.com Signed-off-by: Oleg Nesterov Reported-by: Ben Woodard Reviewed-by: Andrew Morton Acked-by: Michal Hocko Acked-by: Kees Cook Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- include/uapi/linux/binfmts.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/fs/exec.c b/fs/exec.c index bf0ace3841ad..2e0033348d8e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1563,7 +1563,7 @@ static void bprm_fill_uid(struct linux_binprm *bprm) /* * Fill the binprm structure from the inode. - * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes + * Check permissions, then read the first BINPRM_BUF_SIZE bytes * * This may be called multiple times for binary chains (scripts for example). */ diff --git a/include/uapi/linux/binfmts.h b/include/uapi/linux/binfmts.h index 4abad03a8853..689025d9c185 100644 --- a/include/uapi/linux/binfmts.h +++ b/include/uapi/linux/binfmts.h @@ -16,6 +16,6 @@ struct pt_regs; #define MAX_ARG_STRINGS 0x7FFFFFFF /* sizeof(linux_binprm->buf) */ -#define BINPRM_BUF_SIZE 128 +#define BINPRM_BUF_SIZE 256 #endif /* _UAPI_LINUX_BINFMTS_H */ -- cgit v1.2.3