From 9cf0a0b4b64ae103cf0e7dfaa72b44ecda24c0eb Mon Sep 17 00:00:00 2001 From: Alexei Avshalom Lazar Date: Mon, 13 Aug 2018 15:33:00 +0300 Subject: cfg80211: Add support for 60GHz band channels 5 and 6 The current support in the 60GHz band is for channels 1-4. Add support for channels 5 and 6. This requires enlarging ieee80211_channel.center_freq from u16 to u32. Signed-off-by: Alexei Avshalom Lazar Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 7acc16f34942..023989604fc6 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4338,7 +4338,7 @@ enum nl80211_txrate_gi { * enum nl80211_band - Frequency band * @NL80211_BAND_2GHZ: 2.4 GHz ISM band * @NL80211_BAND_5GHZ: around 5 GHz band (4.9 - 5.7 GHz) - * @NL80211_BAND_60GHZ: around 60 GHz band (58.32 - 64.80 GHz) + * @NL80211_BAND_60GHZ: around 60 GHz band (58.32 - 69.12 GHz) * @NUM_NL80211_BANDS: number of bands, avoid using this in userspace * since newer kernel versions may support more bands */ -- cgit v1.2.3 From 9c06602b1b920ed6b546632bdbbc1f400eea5242 Mon Sep 17 00:00:00 2001 From: Balaji Pothunoori Date: Thu, 19 Jul 2018 18:56:27 +0530 Subject: cfg80211: clarify frames covered by average ACK signal report Modify the API to include all ACK frames in average ACK signal strength reporting, not just ACKs for data frames. Make exposing the data conditional on implementing the extended feature flag. This is how it was really implemented in mac80211, update the code there to use the new defines and clean up some of the setting code. Keep nl80211.h source compatibility by keeping the old names. 
Signed-off-by: Balaji Pothunoori [rewrite commit log, change compatibility to be old=new instead of the other way around, update kernel-doc, roll in mac80211 changes, make mac80211 depend on valid bit instead of HW flag] Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 18 +++++++++++------- net/mac80211/sta_info.c | 6 +++--- net/wireless/nl80211.c | 7 ++++--- 3 files changed, 18 insertions(+), 13 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 023989604fc6..1766a12b231c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3050,8 +3050,7 @@ enum nl80211_sta_bss_param { * received from the station (u64, usec) * @NL80211_STA_INFO_PAD: attribute used for padding for 64-bit alignment * @NL80211_STA_INFO_ACK_SIGNAL: signal strength of the last ACK frame(u8, dBm) - * @NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG: avg signal strength of (data) - * ACK frame (s8, dBm) + * @NL80211_STA_INFO_ACK_SIGNAL_AVG: avg signal strength of ACK frames (s8, dBm) * @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute */ @@ -3091,13 +3090,17 @@ enum nl80211_sta_info { NL80211_STA_INFO_RX_DURATION, NL80211_STA_INFO_PAD, NL80211_STA_INFO_ACK_SIGNAL, - NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG, + NL80211_STA_INFO_ACK_SIGNAL_AVG, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, NL80211_STA_INFO_MAX = __NL80211_STA_INFO_AFTER_LAST - 1 }; +/* we renamed this - stay compatible */ +#define NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG NL80211_STA_INFO_ACK_SIGNAL_AVG + + /** * enum nl80211_tid_stats - per TID statistics attributes * @__NL80211_TID_STATS_INVALID: attribute number 0 is reserved @@ -5213,9 +5216,8 @@ enum nl80211_feature_flags { * "radar detected" event. * @NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211: Driver supports sending and * receiving control port frames over nl80211 instead of the netdevice. 
- * @NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT: This Driver support data ack - * rssi if firmware support, this flag is to intimate about ack rssi - * support to nl80211. + * @NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT: This driver/device supports + * (average) ACK signal strength reporting. * @NL80211_EXT_FEATURE_TXQS: Driver supports FQ-CoDel-enabled intermediate * TXQs. * @NL80211_EXT_FEATURE_SCAN_RANDOM_SN: Driver/device supports randomizing the @@ -5255,7 +5257,9 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_HIGH_ACCURACY_SCAN, NL80211_EXT_FEATURE_DFS_OFFLOAD, NL80211_EXT_FEATURE_CONTROL_PORT_OVER_NL80211, - NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT, + NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT, + /* we renamed this - stay compatible */ + NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT = NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT, NL80211_EXT_FEATURE_TXQS, NL80211_EXT_FEATURE_SCAN_RANDOM_SN, NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT, diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index f34202242d24..a231d623b2d2 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2323,13 +2323,13 @@ void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo, sinfo->filled |= BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL); } - if (ieee80211_hw_check(&sta->local->hw, REPORTS_TX_ACK_STATUS) && - !(sinfo->filled & BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG))) { + if (!(sinfo->filled & BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG)) && + sta->status_stats.ack_signal_filled) { sinfo->avg_ack_signal = -(s8)ewma_avg_signal_read( &sta->status_stats.avg_ack_signal); sinfo->filled |= - BIT_ULL(NL80211_STA_INFO_DATA_ACK_SIGNAL_AVG); + BIT_ULL(NL80211_STA_INFO_ACK_SIGNAL_AVG); } } diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 5fb9b7dd9831..62e6679de481 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4724,10 +4724,11 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO_U64(RX_DROP_MISC, 
rx_dropped_misc); PUT_SINFO_U64(BEACON_RX, rx_beacon); PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8); - PUT_SINFO(ACK_SIGNAL, ack_signal, u8); if (wiphy_ext_feature_isset(&rdev->wiphy, - NL80211_EXT_FEATURE_DATA_ACK_SIGNAL_SUPPORT)) - PUT_SINFO(DATA_ACK_SIGNAL_AVG, avg_ack_signal, s8); + NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT)) { + PUT_SINFO(ACK_SIGNAL, ack_signal, u8); + PUT_SINFO(ACK_SIGNAL_AVG, avg_ack_signal, s8); + } #undef PUT_SINFO #undef PUT_SINFO_U64 -- cgit v1.2.3 From 9b3004953503462a4fab31b85e44ae446d48f0bd Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Tue, 28 Aug 2018 19:56:58 +0200 Subject: ethtool: drop get_settings and set_settings callbacks Since [gs]et_settings ethtool_ops callbacks have been deprecated in February 2016, all in tree NIC drivers have been converted to provide [gs]et_link_ksettings() and out of tree drivers have had enough time to do the same. Drop get_settings() and set_settings() and implement both ETHTOOL_[GS]SET and ETHTOOL_[GS]LINKSETTINGS only using [gs]et_link_ksettings(). Signed-off-by: Michal Kubecek Signed-off-by: David S. Miller --- Documentation/ABI/testing/sysfs-class-net | 4 +- include/linux/ethtool.h | 33 ++----- include/uapi/linux/ethtool.h | 15 +-- net/core/ethtool.c | 158 +++++++----------------------- 4 files changed, 50 insertions(+), 160 deletions(-) (limited to 'include/uapi') diff --git a/Documentation/ABI/testing/sysfs-class-net b/Documentation/ABI/testing/sysfs-class-net index 2f1788111cd9..e2e0fe553ad8 100644 --- a/Documentation/ABI/testing/sysfs-class-net +++ b/Documentation/ABI/testing/sysfs-class-net @@ -117,7 +117,7 @@ Description: full: full duplex Note: This attribute is only valid for interfaces that implement - the ethtool get_settings method (mostly Ethernet). + the ethtool get_link_ksettings method (mostly Ethernet). What: /sys/class/net//flags Date: April 2005 @@ -224,7 +224,7 @@ Description: an integer representing the link speed in Mbits/sec. 
Note: this attribute is only valid for interfaces that implement - the ethtool get_settings method (mostly Ethernet ). + the ethtool get_link_ksettings method (mostly Ethernet). What: /sys/class/net//tx_queue_len Date: April 2005 diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index f8a2245b70ac..afd9596ce636 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -183,14 +183,6 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, /** * struct ethtool_ops - optional netdev operations - * @get_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings - * API. Get various device settings including Ethernet link - * settings. The @cmd parameter is expected to have been cleared - * before get_settings is called. Returns a negative error code - * or zero. - * @set_settings: DEPRECATED, use %get_link_ksettings/%set_link_ksettings - * API. Set various device settings including Ethernet link - * settings. Returns a negative error code or zero. * @get_drvinfo: Report driver/device information. Should only set the * @driver, @version, @fw_version and @bus_info fields. If not * implemented, the @driver and @bus_info fields will be filled in @@ -297,19 +289,16 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * a TX queue has this number, return -EINVAL. If only a RX queue or a TX * queue has this number, ignore the inapplicable fields. * Returns a negative error code or zero. - * @get_link_ksettings: When defined, takes precedence over the - * %get_settings method. Get various device settings - * including Ethernet link settings. The %cmd and - * %link_mode_masks_nwords fields should be ignored (use - * %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), any - * change to them will be overwritten by kernel. Returns a - * negative error code or zero. - * @set_link_ksettings: When defined, takes precedence over the - * %set_settings method. Set various device settings including - * Ethernet link settings. 
The %cmd and %link_mode_masks_nwords - * fields should be ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS - * instead of the latter), any change to them will be overwritten - * by kernel. Returns a negative error code or zero. + * @get_link_ksettings: Get various device settings including Ethernet link + * settings. The %cmd and %link_mode_masks_nwords fields should be + * ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), + * any change to them will be overwritten by kernel. Returns a negative + * error code or zero. + * @set_link_ksettings: Set various device settings including Ethernet link + * settings. The %cmd and %link_mode_masks_nwords fields should be + * ignored (use %__ETHTOOL_LINK_MODE_MASK_NBITS instead of the latter), + * any change to them will be overwritten by kernel. Returns a negative + * error code or zero. * @get_fecparam: Get the network device Forward Error Correction parameters. * @set_fecparam: Set the network device Forward Error Correction parameters. * @get_ethtool_phy_stats: Return extended statistics about the PHY device. @@ -329,8 +318,6 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, * of the generic netdev features interface. */ struct ethtool_ops { - int (*get_settings)(struct net_device *, struct ethtool_cmd *); - int (*set_settings)(struct net_device *, struct ethtool_cmd *); void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); int (*get_regs_len)(struct net_device *); void (*get_regs)(struct net_device *, struct ethtool_regs *, void *); diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index dc69391d2bba..c8f8e2455bf3 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -91,10 +91,6 @@ * %ETHTOOL_GSET to get the current values before making specific * changes and then applying them with %ETHTOOL_SSET. 
* - * Drivers that implement set_settings() should validate all fields - * other than @cmd that are not described as read-only or deprecated, - * and must ignore all fields described as read-only. - * * Deprecated fields should be ignored by both users and drivers. */ struct ethtool_cmd { @@ -1800,14 +1796,9 @@ enum ethtool_reset_flags { * rejected. * * Deprecated %ethtool_cmd fields transceiver, maxtxpkt and maxrxpkt - * are not available in %ethtool_link_settings. Until all drivers are - * converted to ignore them or to the new %ethtool_link_settings API, - * for both queries and changes, users should always try - * %ETHTOOL_GLINKSETTINGS first, and if it fails with -ENOTSUPP stick - * only to %ETHTOOL_GSET and %ETHTOOL_SSET consistently. If it - * succeeds, then users should stick to %ETHTOOL_GLINKSETTINGS and - * %ETHTOOL_SLINKSETTINGS (which would support drivers implementing - * either %ethtool_cmd or %ethtool_link_settings). + * are not available in %ethtool_link_settings. These fields will be + * always set to zero in %ETHTOOL_GSET reply and %ETHTOOL_SSET will + * fail if any of them is set to non-zero value. * * Users should assume that all fields not marked read-only are * writable and subject to validation by the driver. They should use diff --git a/net/core/ethtool.c b/net/core/ethtool.c index c9993c6c2fd4..9d4e56d97080 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -539,47 +539,17 @@ struct ethtool_link_usettings { } link_modes; }; -/* Internal kernel helper to query a device ethtool_link_settings. - * - * Backward compatibility note: for compatibility with legacy drivers - * that implement only the ethtool_cmd API, this has to work with both - * drivers implementing get_link_ksettings API and drivers - * implementing get_settings API. 
When drivers implement get_settings - * and report ethtool_cmd deprecated fields - * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored - * because the resulting struct ethtool_link_settings does not report them. - */ +/* Internal kernel helper to query a device ethtool_link_settings. */ int __ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings *link_ksettings) { - int err; - struct ethtool_cmd cmd; - ASSERT_RTNL(); - if (dev->ethtool_ops->get_link_ksettings) { - memset(link_ksettings, 0, sizeof(*link_ksettings)); - return dev->ethtool_ops->get_link_ksettings(dev, - link_ksettings); - } - - /* driver doesn't support %ethtool_link_ksettings API. revert to - * legacy %ethtool_cmd API, unless it's not supported either. - * TODO: remove when ethtool_ops::get_settings disappears internally - */ - if (!dev->ethtool_ops->get_settings) + if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; - memset(&cmd, 0, sizeof(cmd)); - cmd.cmd = ETHTOOL_GSET; - err = dev->ethtool_ops->get_settings(dev, &cmd); - if (err < 0) - return err; - - /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt - */ - convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd); - return err; + memset(link_ksettings, 0, sizeof(*link_ksettings)); + return dev->ethtool_ops->get_link_ksettings(dev, link_ksettings); } EXPORT_SYMBOL(__ethtool_get_link_ksettings); @@ -635,16 +605,7 @@ store_link_ksettings_for_user(void __user *to, return 0; } -/* Query device for its ethtool_link_settings. - * - * Backward compatibility note: this function must fail when driver - * does not implement ethtool::get_link_ksettings, even if legacy - * ethtool_ops::get_settings is implemented. 
This tells new versions - * of ethtool that they should use the legacy API %ETHTOOL_GSET for - * this driver, so that they can correctly access the ethtool_cmd - * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver - * implements ethtool_ops::get_settings anymore. - */ +/* Query device for its ethtool_link_settings. */ static int ethtool_get_link_ksettings(struct net_device *dev, void __user *useraddr) { @@ -652,7 +613,6 @@ static int ethtool_get_link_ksettings(struct net_device *dev, struct ethtool_link_ksettings link_ksettings; ASSERT_RTNL(); - if (!dev->ethtool_ops->get_link_ksettings) return -EOPNOTSUPP; @@ -699,16 +659,7 @@ static int ethtool_get_link_ksettings(struct net_device *dev, return store_link_ksettings_for_user(useraddr, &link_ksettings); } -/* Update device ethtool_link_settings. - * - * Backward compatibility note: this function must fail when driver - * does not implement ethtool::set_link_ksettings, even if legacy - * ethtool_ops::set_settings is implemented. This tells new versions - * of ethtool that they should use the legacy API %ETHTOOL_SSET for - * this driver, so that they can correctly update the ethtool_cmd - * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver - * implements ethtool_ops::get_settings anymore. - */ +/* Update device ethtool_link_settings. */ static int ethtool_set_link_ksettings(struct net_device *dev, void __user *useraddr) { @@ -746,51 +697,31 @@ static int ethtool_set_link_ksettings(struct net_device *dev, /* Query device for its ethtool_cmd settings. * - * Backward compatibility note: for compatibility with legacy ethtool, - * this has to work with both drivers implementing get_link_ksettings - * API and drivers implementing get_settings API. 
When drivers - * implement get_link_ksettings and report higher link mode bits, a - * kernel warning is logged once (with name of 1st driver/device) to - * recommend user to upgrade ethtool, but the command is successful - * (only the lower link mode bits reported back to user). + * Backward compatibility note: for compatibility with legacy ethtool, this is + * now implemented via get_link_ksettings. When driver reports higher link mode + * bits, a kernel warning is logged once (with name of 1st driver/device) to + * recommend user to upgrade ethtool, but the command is successful (only the + * lower link mode bits reported back to user). Deprecated fields from + * ethtool_cmd (transceiver/maxrxpkt/maxtxpkt) are always set to zero. */ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) { + struct ethtool_link_ksettings link_ksettings; struct ethtool_cmd cmd; + int err; ASSERT_RTNL(); + if (!dev->ethtool_ops->get_link_ksettings) + return -EOPNOTSUPP; - if (dev->ethtool_ops->get_link_ksettings) { - /* First, use link_ksettings API if it is supported */ - int err; - struct ethtool_link_ksettings link_ksettings; - - memset(&link_ksettings, 0, sizeof(link_ksettings)); - err = dev->ethtool_ops->get_link_ksettings(dev, - &link_ksettings); - if (err < 0) - return err; - convert_link_ksettings_to_legacy_settings(&cmd, - &link_ksettings); - - /* send a sensible cmd tag back to user */ - cmd.cmd = ETHTOOL_GSET; - } else { - /* driver doesn't support %ethtool_link_ksettings - * API. revert to legacy %ethtool_cmd API, unless it's - * not supported either. 
- */ - int err; - - if (!dev->ethtool_ops->get_settings) - return -EOPNOTSUPP; + memset(&link_ksettings, 0, sizeof(link_ksettings)); + err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings); + if (err < 0) + return err; + convert_link_ksettings_to_legacy_settings(&cmd, &link_ksettings); - memset(&cmd, 0, sizeof(cmd)); - cmd.cmd = ETHTOOL_GSET; - err = dev->ethtool_ops->get_settings(dev, &cmd); - if (err < 0) - return err; - } + /* send a sensible cmd tag back to user */ + cmd.cmd = ETHTOOL_GSET; if (copy_to_user(useraddr, &cmd, sizeof(cmd))) return -EFAULT; @@ -800,48 +731,29 @@ static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) /* Update device link settings with given ethtool_cmd. * - * Backward compatibility note: for compatibility with legacy ethtool, - * this has to work with both drivers implementing set_link_ksettings - * API and drivers implementing set_settings API. When drivers - * implement set_link_ksettings and user's request updates deprecated - * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel - * warning is logged once (with name of 1st driver/device) to - * recommend user to upgrade ethtool, and the request is rejected. + * Backward compatibility note: for compatibility with legacy ethtool, this is + * now always implemented via set_link_settings. When user's request updates + * deprecated ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel + * warning is logged once (with name of 1st driver/device) to recommend user to + * upgrade ethtool, and the request is rejected. */ static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) { + struct ethtool_link_ksettings link_ksettings; struct ethtool_cmd cmd; ASSERT_RTNL(); if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; - - /* first, try new %ethtool_link_ksettings API. 
*/ - if (dev->ethtool_ops->set_link_ksettings) { - struct ethtool_link_ksettings link_ksettings; - - if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, - &cmd)) - return -EINVAL; - - link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS; - link_ksettings.base.link_mode_masks_nwords - = __ETHTOOL_LINK_MODE_MASK_NU32; - return dev->ethtool_ops->set_link_ksettings(dev, - &link_ksettings); - } - - /* legacy %ethtool_cmd API */ - - /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings - * disappears internally - */ - - if (!dev->ethtool_ops->set_settings) + if (!dev->ethtool_ops->set_link_ksettings) return -EOPNOTSUPP; - return dev->ethtool_ops->set_settings(dev, &cmd); + if (!convert_legacy_settings_to_link_ksettings(&link_ksettings, &cmd)) + return -EINVAL; + link_ksettings.base.link_mode_masks_nwords = + __ETHTOOL_LINK_MODE_MASK_NU32; + return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings); } static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, -- cgit v1.2.3 From b9de3963cc2b373a655636335cb8c4ed12fc9d3b Mon Sep 17 00:00:00 2001 From: Florent Fourcot Date: Thu, 30 Aug 2018 16:39:23 +0200 Subject: net/sched: fix type of htb statistics tokens and ctokens are defined as s64 in htb_class structure, and clamped to 32bits value during netlink dumps: cl->xstats.tokens = clamp_t(s64, PSCHED_NS2TICKS(cl->tokens), INT_MIN, INT_MAX); Defining it as u32 is working since userspace (tc) is printing it as signed int, but a correct definition from the beginning is probably better. In the same time, 'giants' structure member is unused since years, so update the comment to mark it unused. Signed-off-by: Florent Fourcot Signed-off-by: David S. 
Miller --- include/uapi/linux/pkt_sched.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 8975fd1a1421..e9b7244ac381 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -395,9 +395,9 @@ enum { struct tc_htb_xstats { __u32 lends; __u32 borrows; - __u32 giants; /* too big packets (rate will not be accurate) */ - __u32 tokens; - __u32 ctokens; + __u32 giants; /* unused since 'Make HTB scheduler work with TSO.' */ + __s32 tokens; + __s32 ctokens; }; /* HFSC section */ -- cgit v1.2.3 From 2b815b04dfe45d1278fd4137675fe1398f656b0a Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Fri, 31 Aug 2018 15:00:37 +0200 Subject: nl80211: Add CAN_REPLACE_PTK0 API Drivers able to correctly replace an in-use key should set @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0 to allow the user space (e.g. hostapd or wpa_supplicant) to rekey PTK keys. The user space must detect a PTK rekey attempt and only go ahead with it when the driver has set this flag. If the driver is not supporting the feature the user space either must not replace the PTK key or perform a full re-association instead. Ignoring this flag and continuing to rekey the connection can still work but has to be considered insecure and broken. Depending on the driver it can leak clear text packets or freeze the connection and is only supported to allow the user space to be updated. 
Signed-off-by: Alexander Wetzel Reviewed-by: Denis Kenzior Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 1766a12b231c..cfc94178d608 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -5226,6 +5226,11 @@ enum nl80211_feature_flags { * except for supported rates from the probe request content if requested * by the %NL80211_SCAN_FLAG_MIN_PREQ_CONTENT flag. * + * @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0: Driver/device confirm that they are + * able to rekey an in-use key correctly. Userspace must not rekey PTK keys + * if this flag is not set. Ignoring this can leak clear text packets and/or + * freeze the connection. + * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. */ @@ -5263,6 +5268,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_TXQS, NL80211_EXT_FEATURE_SCAN_RANDOM_SN, NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT, + NL80211_EXT_FEATURE_CAN_REPLACE_PTK0, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.2.3 From fa788d986a3aac5069378ed04697bd06f83d3488 Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Mon, 3 Sep 2018 16:23:36 +0200 Subject: packet: add sockopt to ignore outgoing packets Currently, the only way to ignore outgoing packets on a packet socket is via the BPF filter. With MSG_ZEROCOPY, packets that are looped into AF_PACKET are copied in dev_queue_xmit_nit(), and this copy happens even if the filter run from packet_rcv() would reject them. So the presence of a packet socket on the interface takes away the benefits of MSG_ZEROCOPY, even if the packet socket is not interested in outgoing packets. (Even when MSG_ZEROCOPY is not used, the skb is unnecessarily cloned, but the cost for that is much lower.) 
Add a socket option to allow AF_PACKET sockets to ignore outgoing packets to solve this. Note that the *BSDs already have something similar: BIOCSSEESENT/BIOCSDIRECTION and BIOCSDIRFILT. The first intended user is lldpd. Signed-off-by: Vincent Whitchurch Signed-off-by: David S. Miller --- include/linux/netdevice.h | 1 + include/uapi/linux/if_packet.h | 1 + net/core/dev.c | 3 +++ net/packet/af_packet.c | 17 +++++++++++++++++ 4 files changed, 22 insertions(+) (limited to 'include/uapi') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4271f6b4e419..e2b3bd750c98 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2343,6 +2343,7 @@ static inline struct sk_buff *call_gro_receive_sk(gro_receive_sk_t cb, struct packet_type { __be16 type; /* This is really htons(ether_type). */ + bool ignore_outgoing; struct net_device *dev; /* NULL is wildcarded here */ int (*func) (struct sk_buff *, struct net_device *, diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 67b61d91d89b..467b654bd4c7 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -57,6 +57,7 @@ struct sockaddr_ll { #define PACKET_QDISC_BYPASS 20 #define PACKET_ROLLOVER_STATS 21 #define PACKET_FANOUT_DATA 22 +#define PACKET_IGNORE_OUTGOING 23 #define PACKET_FANOUT_HASH 0 #define PACKET_FANOUT_LB 1 diff --git a/net/core/dev.c b/net/core/dev.c index 82114e1111e6..ca78dc5a79a3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1969,6 +1969,9 @@ void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) rcu_read_lock(); again: list_for_each_entry_rcu(ptype, ptype_list, list) { + if (ptype->ignore_outgoing) + continue; + /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) */ diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 75c92a87e7b2..f85f67b5c1f4 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3805,6 +3805,20 @@ 
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv return fanout_set_data(po, optval, optlen); } + case PACKET_IGNORE_OUTGOING: + { + int val; + + if (optlen != sizeof(val)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(val))) + return -EFAULT; + if (val < 0 || val > 1) + return -EINVAL; + + po->prot_hook.ignore_outgoing = !!val; + return 0; + } case PACKET_TX_HAS_OFF: { unsigned int val; @@ -3928,6 +3942,9 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, ((u32)po->fanout->flags << 24)) : 0); break; + case PACKET_IGNORE_OUTGOING: + val = po->prot_hook.ignore_outgoing; + break; case PACKET_ROLLOVER_STATS: if (!po->rollover) return -EINVAL; -- cgit v1.2.3 From 9f3c057c146fce335c160e95ca893d5bc34e7d00 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Sep 2018 21:53:48 +0200 Subject: if_addr: add IFA_TARGET_NETNSID This adds a new IFA_TARGET_NETNSID property to be used by address families such as PF_INET and PF_INET6. The IFA_TARGET_NETNSID property can be used to send a network namespace identifier as part of a request. If a IFA_TARGET_NETNSID property is identified it will be used to retrieve the target network namespace in which the request is to be made. Signed-off-by: Christian Brauner Cc: Jiri Benc Cc: Nicolas Dichtel Signed-off-by: David S. 
Miller --- include/uapi/linux/if_addr.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_addr.h b/include/uapi/linux/if_addr.h index ebaf5701c9db..dfcf3ce0097f 100644 --- a/include/uapi/linux/if_addr.h +++ b/include/uapi/linux/if_addr.h @@ -34,6 +34,7 @@ enum { IFA_MULTICAST, IFA_FLAGS, IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */ + IFA_TARGET_NETNSID, __IFA_MAX, }; -- cgit v1.2.3 From 19d8f1ad12fd746e60707a58d954980013c7a35a Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 4 Sep 2018 21:53:52 +0200 Subject: if_link: add IFLA_TARGET_NETNSID alias This adds IFLA_TARGET_NETNSID as an alias for IFLA_IF_NETNSID for RTM_*LINK requests. The new name is clearer and also aligns with the newly introduced IFA_TARGET_NETNSID property for RTM_*ADDR requests. Signed-off-by: Christian Brauner Suggested-by: Nicolas Dichtel Cc: Jiri Benc Signed-off-by: David S. Miller --- include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/if_link.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 43391e2d1153..29d49b989acd 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -161,6 +161,7 @@ enum { IFLA_EVENT, IFLA_NEW_NETNSID, IFLA_IF_NETNSID, + IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ IFLA_CARRIER_UP_COUNT, IFLA_CARRIER_DOWN_COUNT, IFLA_NEW_IFINDEX, diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index cf01b6824244..1c73d63068b1 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -161,6 +161,7 @@ enum { IFLA_EVENT, IFLA_NEW_NETNSID, IFLA_IF_NETNSID, + IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ IFLA_CARRIER_UP_COUNT, IFLA_CARRIER_DOWN_COUNT, IFLA_NEW_IFINDEX, -- cgit v1.2.3 From 86c55361e569400b6286f30283a9c143a18c20d9 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Fri, 7 Sep 2018
17:22:21 +0300 Subject: net: sched: cls_flower: dump offload count value Change flower in_hw_count type to fixed-size u32 and dump it as TCA_FLOWER_IN_HW_COUNT. This change is necessary to properly test shared blocks and re-offload functionality. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/sch_generic.h | 2 +- include/uapi/linux/pkt_cls.h | 2 ++ net/sched/cls_flower.c | 5 ++++- 3 files changed, 7 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index fdaa5506e6f7..d326fd553b58 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -362,7 +362,7 @@ static inline void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) } static inline void -tc_cls_offload_cnt_update(struct tcf_block *block, unsigned int *cnt, +tc_cls_offload_cnt_update(struct tcf_block *block, u32 *cnt, u32 *flags, bool add) { if (add) { diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index be382fb0592d..401d0c1e612d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -483,6 +483,8 @@ enum { TCA_FLOWER_KEY_ENC_OPTS, TCA_FLOWER_KEY_ENC_OPTS_MASK, + TCA_FLOWER_IN_HW_COUNT, + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 6fd9bdd93796..4b8dd37dd4f8 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -98,7 +98,7 @@ struct cls_fl_filter { struct list_head list; u32 handle; u32 flags; - unsigned int in_hw_count; + u32 in_hw_count; struct rcu_work rwork; struct net_device *hw_dev; }; @@ -1880,6 +1880,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count)) + goto nla_put_failure; + if (tcf_exts_dump(skb, &f->exts)) goto nla_put_failure; -- cgit v1.2.3 From 
435f2e7cc0b783615d7fbcf08f5f00d289f9caeb Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Tue, 11 Sep 2018 09:39:53 +0300 Subject: net: bridge: add support for sticky fdb entries Add support for entries which are "sticky", i.e. will not change their port if they show up from a different one. A new ndm flag is introduced for that purpose - NTF_STICKY. We allow to set it only to non-local entries. Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/uapi/linux/neighbour.h | 1 + net/bridge/br_fdb.c | 19 ++++++++++++++++--- net/bridge/br_private.h | 1 + 3 files changed, 18 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/neighbour.h b/include/uapi/linux/neighbour.h index 904db6148476..998155444e0d 100644 --- a/include/uapi/linux/neighbour.h +++ b/include/uapi/linux/neighbour.h @@ -43,6 +43,7 @@ enum { #define NTF_PROXY 0x08 /* == ATF_PUBL */ #define NTF_EXT_LEARNED 0x10 #define NTF_OFFLOADED 0x20 +#define NTF_STICKY 0x40 #define NTF_ROUTER 0x80 /* diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 502f66349530..a56ed7f2a3a3 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -584,7 +584,7 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, unsigned long now = jiffies; /* fastpath: update of existing entry */ - if (unlikely(source != fdb->dst)) { + if (unlikely(source != fdb->dst && !fdb->is_sticky)) { fdb->dst = source; fdb_modified = true; /* Take over HW learned entry */ @@ -656,6 +656,8 @@ static int fdb_fill_info(struct sk_buff *skb, const struct net_bridge *br, ndm->ndm_flags |= NTF_OFFLOADED; if (fdb->added_by_external_learn) ndm->ndm_flags |= NTF_EXT_LEARNED; + if (fdb->is_sticky) + ndm->ndm_flags |= NTF_STICKY; if (nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->key.addr)) goto nla_put_failure; @@ -772,8 +774,10 @@ skip: /* Update (create or replace) forwarding database entry */ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port 
*source, - const __u8 *addr, __u16 state, __u16 flags, __u16 vid) + const u8 *addr, u16 state, u16 flags, u16 vid, + u8 ndm_flags) { + u8 is_sticky = !!(ndm_flags & NTF_STICKY); struct net_bridge_fdb_entry *fdb; bool modified = false; @@ -789,6 +793,9 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, return -EINVAL; } + if (is_sticky && (state & NUD_PERMANENT)) + return -EINVAL; + fdb = br_fdb_find(br, addr, vid); if (fdb == NULL) { if (!(flags & NLM_F_CREATE)) @@ -832,6 +839,12 @@ static int fdb_add_entry(struct net_bridge *br, struct net_bridge_port *source, modified = true; } + + if (is_sticky != fdb->is_sticky) { + fdb->is_sticky = is_sticky; + modified = true; + } + fdb->added_by_user = 1; fdb->used = jiffies; @@ -865,7 +878,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, } else { spin_lock_bh(&br->hash_lock); err = fdb_add_entry(br, p, addr, ndm->ndm_state, - nlh_flags, vid); + nlh_flags, vid, ndm->ndm_flags); spin_unlock_bh(&br->hash_lock); } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 11ed2029985f..d21035a17f4c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -181,6 +181,7 @@ struct net_bridge_fdb_entry { struct hlist_node fdb_node; unsigned char is_local:1, is_static:1, + is_sticky:1, added_by_user:1, added_by_external_learn:1, offloaded:1; -- cgit v1.2.3 From 52d0d404d39dd9eac71a181615d6ca15e23d8e38 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 12 Sep 2018 10:04:21 +0800 Subject: geneve: add ttl inherit support Similar with commit 72f6d71e491e6 ("vxlan: add ttl inherit support"), currently ttl == 0 means "use whatever default value" on geneve instead of inherit inner ttl. To respect compatibility with old behavior, let's add a new IFLA_GENEVE_TTL_INHERIT for geneve ttl inherit support. Reported-by: Jianlin Shi Suggested-by: Jiri Benc Signed-off-by: Hangbin Liu Reviewed-by: Jiri Benc Signed-off-by: David S. 
Miller --- drivers/net/geneve.c | 41 ++++++++++++++++++++++++++++++-------- include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/if_link.h | 1 + 3 files changed, 35 insertions(+), 8 deletions(-) (limited to 'include/uapi') diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index 6acb6b5718b9..6625fabe2c88 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -69,6 +69,7 @@ struct geneve_dev { struct gro_cells gro_cells; bool collect_md; bool use_udp6_rx_checksums; + bool ttl_inherit; }; struct geneve_sock { @@ -843,7 +844,11 @@ static int geneve_xmit_skb(struct sk_buff *skb, struct net_device *dev, ttl = key->ttl; } else { tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, ip_hdr(skb), skb); - ttl = key->ttl ? : ip4_dst_hoplimit(&rt->dst); + if (geneve->ttl_inherit) + ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb); + else + ttl = key->ttl; + ttl = ttl ? : ip4_dst_hoplimit(&rt->dst); } df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0; @@ -889,7 +894,11 @@ static int geneve6_xmit_skb(struct sk_buff *skb, struct net_device *dev, } else { prio = ip_tunnel_ecn_encap(ip6_tclass(fl6.flowlabel), ip_hdr(skb), skb); - ttl = key->ttl ? : ip6_dst_hoplimit(dst); + if (geneve->ttl_inherit) + ttl = ip_tunnel_get_ttl(ip_hdr(skb), skb); + else + ttl = key->ttl; + ttl = ttl ? 
: ip6_dst_hoplimit(dst); } err = geneve_build_skb(dst, skb, info, xnet, sizeof(struct ipv6hdr)); if (unlikely(err)) @@ -1091,6 +1100,7 @@ static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = { [IFLA_GENEVE_UDP_CSUM] = { .type = NLA_U8 }, [IFLA_GENEVE_UDP_ZERO_CSUM6_TX] = { .type = NLA_U8 }, [IFLA_GENEVE_UDP_ZERO_CSUM6_RX] = { .type = NLA_U8 }, + [IFLA_GENEVE_TTL_INHERIT] = { .type = NLA_U8 }, }; static int geneve_validate(struct nlattr *tb[], struct nlattr *data[], @@ -1170,7 +1180,8 @@ static bool geneve_dst_addr_equal(struct ip_tunnel_info *a, static int geneve_configure(struct net *net, struct net_device *dev, struct netlink_ext_ack *extack, const struct ip_tunnel_info *info, - bool metadata, bool ipv6_rx_csum) + bool metadata, bool ipv6_rx_csum, + bool ttl_inherit) { struct geneve_net *gn = net_generic(net, geneve_net_id); struct geneve_dev *t, *geneve = netdev_priv(dev); @@ -1219,6 +1230,7 @@ static int geneve_configure(struct net *net, struct net_device *dev, geneve->info = *info; geneve->collect_md = metadata; geneve->use_udp6_rx_checksums = ipv6_rx_csum; + geneve->ttl_inherit = ttl_inherit; err = register_netdevice(dev); if (err) @@ -1237,7 +1249,8 @@ static void init_tnl_info(struct ip_tunnel_info *info, __u16 dst_port) static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack, struct ip_tunnel_info *info, bool *metadata, - bool *use_udp6_rx_checksums, bool changelink) + bool *use_udp6_rx_checksums, bool *ttl_inherit, + bool changelink) { int attrtype; @@ -1315,6 +1328,9 @@ static int geneve_nl2info(struct nlattr *tb[], struct nlattr *data[], if (data[IFLA_GENEVE_TTL]) info->key.ttl = nla_get_u8(data[IFLA_GENEVE_TTL]); + if (data[IFLA_GENEVE_TTL_INHERIT]) + *ttl_inherit = true; + if (data[IFLA_GENEVE_TOS]) info->key.tos = nla_get_u8(data[IFLA_GENEVE_TOS]); @@ -1438,17 +1454,18 @@ static int geneve_newlink(struct net *net, struct net_device *dev, { bool use_udp6_rx_checksums = false; struct 
ip_tunnel_info info; + bool ttl_inherit = false; bool metadata = false; int err; init_tnl_info(&info, GENEVE_UDP_PORT); err = geneve_nl2info(tb, data, extack, &info, &metadata, - &use_udp6_rx_checksums, false); + &use_udp6_rx_checksums, &ttl_inherit, false); if (err) return err; err = geneve_configure(net, dev, extack, &info, metadata, - use_udp6_rx_checksums); + use_udp6_rx_checksums, ttl_inherit); if (err) return err; @@ -1511,6 +1528,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[], struct ip_tunnel_info info; bool metadata; bool use_udp6_rx_checksums; + bool ttl_inherit; int err; /* If the geneve device is configured for metadata (or externally @@ -1523,8 +1541,9 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[], memcpy(&info, &geneve->info, sizeof(info)); metadata = geneve->collect_md; use_udp6_rx_checksums = geneve->use_udp6_rx_checksums; + ttl_inherit = geneve->ttl_inherit; err = geneve_nl2info(tb, data, extack, &info, &metadata, - &use_udp6_rx_checksums, true); + &use_udp6_rx_checksums, &ttl_inherit, true); if (err) return err; @@ -1537,6 +1556,7 @@ static int geneve_changelink(struct net_device *dev, struct nlattr *tb[], geneve->info = info; geneve->collect_md = metadata; geneve->use_udp6_rx_checksums = use_udp6_rx_checksums; + geneve->ttl_inherit = ttl_inherit; geneve_unquiesce(geneve, gs4, gs6); return 0; @@ -1562,6 +1582,7 @@ static size_t geneve_get_size(const struct net_device *dev) nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_CSUM */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_TX */ nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_UDP_ZERO_CSUM6_RX */ + nla_total_size(sizeof(__u8)) + /* IFLA_GENEVE_TTL_INHERIT */ 0; } @@ -1569,6 +1590,7 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) { struct geneve_dev *geneve = netdev_priv(dev); struct ip_tunnel_info *info = &geneve->info; + bool ttl_inherit = geneve->ttl_inherit; bool metadata = 
geneve->collect_md; __u8 tmp_vni[3]; __u32 vni; @@ -1614,6 +1636,9 @@ static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev) goto nla_put_failure; #endif + if (nla_put_u8(skb, IFLA_GENEVE_TTL_INHERIT, ttl_inherit)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -1650,7 +1675,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name, return dev; init_tnl_info(&info, dst_port); - err = geneve_configure(net, dev, NULL, &info, true, true); + err = geneve_configure(net, dev, NULL, &info, true, true, false); if (err) { free_netdev(dev); return ERR_PTR(err); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 29d49b989acd..58faab897201 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -555,6 +555,7 @@ enum { IFLA_GENEVE_UDP_ZERO_CSUM6_TX, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, IFLA_GENEVE_LABEL, + IFLA_GENEVE_TTL_INHERIT, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 1c73d63068b1..141cbfdc5865 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -542,6 +542,7 @@ enum { IFLA_GENEVE_UDP_ZERO_CSUM6_TX, IFLA_GENEVE_UDP_ZERO_CSUM6_RX, IFLA_GENEVE_LABEL, + IFLA_GENEVE_TTL_INHERIT, __IFLA_GENEVE_MAX }; #define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) -- cgit v1.2.3 From 15033f0457dca569b284bef0c8d3ad55fb37eacb Mon Sep 17 00:00:00 2001 From: Andre Naujoks Date: Mon, 10 Sep 2018 10:27:15 +0200 Subject: ipv6: Add sockopt IPV6_MULTICAST_ALL analogue to IP_MULTICAST_ALL The socket option will be enabled by default to ensure current behaviour is not changed. This is the same for the IPv4 version. A socket bound to in6addr_any and a specific port will receive all traffic on that port. 
Analogous to IP_MULTICAST_ALL, disable this behaviour, if one or more multicast groups were joined (using said socket) and only pass on multicast traffic from groups, which were explicitly joined via this socket. Without this option disabled, a socket (system even) joined to multiple multicast groups is very hard to get right. Filtering by destination address has to take place in user space to avoid receiving multicast traffic from other multicast groups, which might have traffic on the same port. The extension of the IP_MULTICAST_ALL socket option to just apply to ipv6, too, is not done to avoid changing the behaviour of current applications. Signed-off-by: Andre Naujoks Acked-By: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- include/linux/ipv6.h | 3 ++- include/uapi/linux/in6.h | 1 + net/ipv6/af_inet6.c | 1 + net/ipv6/ipv6_sockglue.c | 11 +++++++++++ net/ipv6/mcast.c | 2 +- 5 files changed, 16 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 8415bf1a9776..495e834c1367 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -274,7 +274,8 @@ struct ipv6_pinfo { */ dontfrag:1, autoflowlabel:1, - autoflowlabel_set:1; + autoflowlabel_set:1, + mc_all:1; __u8 min_hopcount; __u8 tclass; __be32 rcv_flowinfo; diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h index ed291e55f024..71d82fe15b03 100644 --- a/include/uapi/linux/in6.h +++ b/include/uapi/linux/in6.h @@ -177,6 +177,7 @@ struct in6_flowlabel_req { #define IPV6_V6ONLY 26 #define IPV6_JOIN_ANYCAST 27 #define IPV6_LEAVE_ANYCAST 28 +#define IPV6_MULTICAST_ALL 29 /* IPV6_MTU_DISCOVER values */ #define IPV6_PMTUDISC_DONT 0 diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 9a4261e50272..77ef8478234f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -209,6 +209,7 @@ lookup_protocol: np->hop_limit = -1; np->mcast_hops = IPV6_DEFAULT_MCASTHOPS; np->mc_loop = 1; + np->mc_all = 1; np->pmtudisc = 
IPV6_PMTUDISC_WANT; np->repflow = net->ipv6.sysctl.flowlabel_reflect; sk->sk_ipv6only = net->ipv6.sysctl.bindv6only; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index c0cac9cc3a28..381ce38940ae 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -674,6 +674,13 @@ done: retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); break; } + case IPV6_MULTICAST_ALL: + if (optlen < sizeof(int)) + goto e_inval; + np->mc_all = valbool; + retv = 0; + break; + case MCAST_JOIN_GROUP: case MCAST_LEAVE_GROUP: { @@ -1266,6 +1273,10 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, val = np->mcast_oif; break; + case IPV6_MULTICAST_ALL: + val = np->mc_all; + break; + case IPV6_UNICAST_IF: val = (__force int)htonl((__u32) np->ucast_oif); break; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 4ae54aaca373..6895e1dc0b03 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -636,7 +636,7 @@ bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, } if (!mc) { rcu_read_unlock(); - return true; + return np->mc_all; } read_lock(&mc->sflock); psl = mc->sflist; -- cgit v1.2.3 From d58e468b1112dcd1d5193c0a89ff9f98b5a3e8b9 Mon Sep 17 00:00:00 2001 From: Petar Penkov Date: Fri, 14 Sep 2018 07:46:18 -0700 Subject: flow_dissector: implements flow dissector BPF hook Adds a hook for programs of type BPF_PROG_TYPE_FLOW_DISSECTOR and attach type BPF_FLOW_DISSECTOR that is executed in the flow dissector path. The BPF program is per-network namespace. 
Signed-off-by: Petar Penkov Signed-off-by: Willem de Bruijn Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/linux/skbuff.h | 7 +++ include/net/net_namespace.h | 3 + include/net/sch_generic.h | 12 +++- include/uapi/linux/bpf.h | 26 +++++++++ kernel/bpf/syscall.c | 8 +++ kernel/bpf/verifier.c | 32 +++++++++++ net/core/filter.c | 70 +++++++++++++++++++++++ net/core/flow_dissector.c | 134 ++++++++++++++++++++++++++++++++++++++++++++ 10 files changed, 291 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 523481a3471b..988a00797bcd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -212,6 +212,7 @@ enum bpf_reg_type { PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ + PTR_TO_FLOW_KEYS, /* reg points to bpf_flow_keys */ }; /* The information passed from prog-specific *_is_valid_access diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index cd26c090e7c0..22083712dd18 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -32,6 +32,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) #ifdef CONFIG_INET BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) #endif +BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector) BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 17a13e4785fc..ce0e863f02a2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -243,6 +243,8 @@ struct scatterlist; struct pipe_inode_info; struct iov_iter; struct napi_struct; +struct bpf_prog; +union bpf_attr; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { @@ -1192,6 +1194,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct 
flow_dissector_key *key, unsigned int key_count); +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog); + +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); + bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 9b5fdc50519a..99d4148e0f90 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -43,6 +43,7 @@ struct ctl_table_header; struct net_generic; struct uevent_sock; struct netns_ipvs; +struct bpf_prog; #define NETDEV_HASHBITS 8 @@ -145,6 +146,8 @@ struct net { #endif struct net_generic __rcu *gen; + struct bpf_prog __rcu *flow_dissector_prog; + /* Note : following structs are cache line aligned */ #ifdef CONFIG_XFRM struct netns_xfrm xfrm; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a6d00093f35e..1b81ba85fd2d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -19,6 +19,7 @@ struct Qdisc_ops; struct qdisc_walker; struct tcf_walker; struct module; +struct bpf_flow_keys; typedef int tc_setup_cb_t(enum tc_setup_type type, void *type_data, void *cb_priv); @@ -307,9 +308,14 @@ struct tcf_proto { }; struct qdisc_skb_cb { - unsigned int pkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; + union { + struct { + unsigned int pkt_len; + u16 slave_dev_queue_mapping; + u16 tc_classid; + }; + struct bpf_flow_keys *flow_keys; + }; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 66917a4eba27..aa5ccd2385ed 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -152,6 +152,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, }; enum bpf_attach_type { @@ -172,6 +173,7 @@ enum bpf_attach_type { 
BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP6_SENDMSG, BPF_LIRC_MODE2, + BPF_FLOW_DISSECTOR, __MAX_BPF_ATTACH_TYPE }; @@ -2333,6 +2335,7 @@ struct __sk_buff { /* ... here. */ __u32 data_meta; + struct bpf_flow_keys *flow_keys; }; struct bpf_tunnel_key { @@ -2778,4 +2781,27 @@ enum bpf_task_fd_type { BPF_FD_TYPE_URETPROBE, /* filename + offset */ }; +struct bpf_flow_keys { + __u16 nhoff; + __u16 thoff; + __u16 addr_proto; /* ETH_P_* of valid addrs */ + __u8 is_frag; + __u8 is_first_frag; + __u8 is_encap; + __u8 ip_proto; + __be16 n_proto; + __be16 sport; + __be16 dport; + union { + struct { + __be32 ipv4_src; + __be32 ipv4_dst; + }; + struct { + __u32 ipv6_src[4]; /* in6_addr; network order */ + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + }; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3c9636f03bb2..b3c2d09bcf7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1615,6 +1615,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_LIRC_MODE2: ptype = BPF_PROG_TYPE_LIRC_MODE2; break; + case BPF_FLOW_DISSECTOR: + ptype = BPF_PROG_TYPE_FLOW_DISSECTOR; + break; default: return -EINVAL; } @@ -1636,6 +1639,9 @@ static int bpf_prog_attach(const union bpf_attr *attr) case BPF_PROG_TYPE_LIRC_MODE2: ret = lirc_prog_attach(attr, prog); break; + case BPF_PROG_TYPE_FLOW_DISSECTOR: + ret = skb_flow_dissector_bpf_prog_attach(attr, prog); + break; default: ret = cgroup_bpf_prog_attach(attr, ptype, prog); } @@ -1688,6 +1694,8 @@ static int bpf_prog_detach(const union bpf_attr *attr) return sockmap_get_from_fd(attr, BPF_PROG_TYPE_SK_SKB, NULL); case BPF_LIRC_MODE2: return lirc_prog_detach(attr); + case BPF_FLOW_DISSECTOR: + return skb_flow_dissector_bpf_prog_detach(attr); default: return -EINVAL; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6ff1bac1795d..8ccbff4fff93 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -261,6 +261,7 @@ static const char * 
const reg_type_str[] = { [PTR_TO_PACKET] = "pkt", [PTR_TO_PACKET_META] = "pkt_meta", [PTR_TO_PACKET_END] = "pkt_end", + [PTR_TO_FLOW_KEYS] = "flow_keys", }; static char slot_type_char[] = { @@ -965,6 +966,7 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_PACKET: case PTR_TO_PACKET_META: case PTR_TO_PACKET_END: + case PTR_TO_FLOW_KEYS: case CONST_PTR_TO_MAP: return true; default: @@ -1238,6 +1240,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, case BPF_PROG_TYPE_LWT_XMIT: case BPF_PROG_TYPE_SK_SKB: case BPF_PROG_TYPE_SK_MSG: + case BPF_PROG_TYPE_FLOW_DISSECTOR: if (meta) return meta->pkt_access; @@ -1321,6 +1324,18 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, return -EACCES; } +static int check_flow_keys_access(struct bpf_verifier_env *env, int off, + int size) +{ + if (size < 0 || off < 0 || + (u64)off + size > sizeof(struct bpf_flow_keys)) { + verbose(env, "invalid access to flow keys off=%d size=%d\n", + off, size); + return -EACCES; + } + return 0; +} + static bool __is_pointer_value(bool allow_ptr_leaks, const struct bpf_reg_state *reg) { @@ -1422,6 +1437,9 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, * right in front, treat it the very same way. 
*/ return check_pkt_ptr_alignment(env, reg, off, size, strict); + case PTR_TO_FLOW_KEYS: + pointer_desc = "flow keys "; + break; case PTR_TO_MAP_VALUE: pointer_desc = "value "; break; @@ -1692,6 +1710,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_packet_access(env, regno, off, size, false); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_FLOW_KEYS) { + if (t == BPF_WRITE && value_regno >= 0 && + is_pointer_value(env, value_regno)) { + verbose(env, "R%d leaks addr into flow keys\n", + value_regno); + return -EACCES; + } + + err = check_flow_keys_access(env, off, size); + if (!err && t == BPF_READ && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); } else { verbose(env, "R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -1839,6 +1868,8 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, case PTR_TO_PACKET_META: return check_packet_access(env, regno, reg->off, access_size, zero_size_allowed); + case PTR_TO_FLOW_KEYS: + return check_flow_keys_access(env, reg->off, access_size); case PTR_TO_MAP_VALUE: return check_map_access(env, regno, reg->off, access_size, zero_size_allowed); @@ -4366,6 +4397,7 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, case PTR_TO_CTX: case CONST_PTR_TO_MAP: case PTR_TO_PACKET_END: + case PTR_TO_FLOW_KEYS: /* Only valid matches are exact, which memcmp() above * would have accepted */ diff --git a/net/core/filter.c b/net/core/filter.c index bf5b6efd369a..9cc76f134ddb 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5123,6 +5123,17 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +static const struct bpf_func_proto * +flow_dissector_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_skb_load_bytes: + return &bpf_skb_load_bytes_proto; + 
default: + return bpf_base_func_proto(func_id); + } +} + static const struct bpf_func_proto * lwt_out_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -5241,6 +5252,10 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type if (size != size_default) return false; break; + case bpf_ctx_range(struct __sk_buff, flow_keys): + if (size != sizeof(struct bpf_flow_keys *)) + return false; + break; default: /* Only narrow read access allowed for now. */ if (type == BPF_WRITE) { @@ -5266,6 +5281,7 @@ static bool sk_filter_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data): case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, data_end): + case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5291,6 +5307,7 @@ static bool lwt_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range_till(struct __sk_buff, family, local_port): case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, flow_keys): return false; } @@ -5501,6 +5518,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_end): info->reg_type = PTR_TO_PACKET_END; break; + case bpf_ctx_range(struct __sk_buff, flow_keys): case bpf_ctx_range_till(struct __sk_buff, family, local_port): return false; } @@ -5702,6 +5720,7 @@ static bool sk_skb_is_valid_access(int off, int size, switch (off) { case bpf_ctx_range(struct __sk_buff, tc_classid): case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range(struct __sk_buff, flow_keys): return false; } @@ -5761,6 +5780,39 @@ static bool sk_msg_is_valid_access(int off, int size, return true; } +static bool flow_dissector_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (type 
== BPF_WRITE) { + switch (off) { + case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]): + break; + default: + return false; + } + } + + switch (off) { + case bpf_ctx_range(struct __sk_buff, data): + info->reg_type = PTR_TO_PACKET; + break; + case bpf_ctx_range(struct __sk_buff, data_end): + info->reg_type = PTR_TO_PACKET_END; + break; + case bpf_ctx_range(struct __sk_buff, flow_keys): + info->reg_type = PTR_TO_FLOW_KEYS; + break; + case bpf_ctx_range(struct __sk_buff, tc_classid): + case bpf_ctx_range(struct __sk_buff, data_meta): + case bpf_ctx_range_till(struct __sk_buff, family, local_port): + return false; + } + + return bpf_skb_is_valid_access(off, size, type, prog, info); +} + static u32 bpf_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -6055,6 +6107,15 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, bpf_target_off(struct sock_common, skc_num, 2, target_size)); break; + + case offsetof(struct __sk_buff, flow_keys): + off = si->off; + off -= offsetof(struct __sk_buff, flow_keys); + off += offsetof(struct sk_buff, cb); + off += offsetof(struct qdisc_skb_cb, flow_keys); + *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, + si->src_reg, off); + break; } return insn - insn_buf; @@ -7018,6 +7079,15 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = { const struct bpf_prog_ops sk_msg_prog_ops = { }; +const struct bpf_verifier_ops flow_dissector_verifier_ops = { + .get_func_proto = flow_dissector_func_proto, + .is_valid_access = flow_dissector_is_valid_access, + .convert_ctx_access = bpf_convert_ctx_access, +}; + +const struct bpf_prog_ops flow_dissector_prog_ops = { +}; + int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index ce9eeeb7c024..5c5dd74b5b3b 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -25,6 +25,9 @@ #include #include #include +#include + +static 
DEFINE_MUTEX(flow_dissector_mutex); static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -62,6 +65,44 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, } EXPORT_SYMBOL(skb_flow_dissector_init); +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog) +{ + struct bpf_prog *attached; + struct net *net; + + net = current->nsproxy->net_ns; + mutex_lock(&flow_dissector_mutex); + attached = rcu_dereference_protected(net->flow_dissector_prog, + lockdep_is_held(&flow_dissector_mutex)); + if (attached) { + /* Only one BPF program can be attached at a time */ + mutex_unlock(&flow_dissector_mutex); + return -EEXIST; + } + rcu_assign_pointer(net->flow_dissector_prog, prog); + mutex_unlock(&flow_dissector_mutex); + return 0; +} + +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr) +{ + struct bpf_prog *attached; + struct net *net; + + net = current->nsproxy->net_ns; + mutex_lock(&flow_dissector_mutex); + attached = rcu_dereference_protected(net->flow_dissector_prog, + lockdep_is_held(&flow_dissector_mutex)); + if (!attached) { + mutex_unlock(&flow_dissector_mutex); + return -ENOENT; + } + bpf_prog_put(attached); + RCU_INIT_POINTER(net->flow_dissector_prog, NULL); + mutex_unlock(&flow_dissector_mutex); + return 0; +} /** * skb_flow_get_be16 - extract be16 entity * @skb: sk_buff to extract from @@ -588,6 +629,60 @@ static bool skb_flow_dissect_allowed(int *num_hdrs) return (*num_hdrs <= MAX_FLOW_DISSECT_HDRS); } +static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, + struct flow_dissector *flow_dissector, + void *target_container) +{ + struct flow_dissector_key_control *key_control; + struct flow_dissector_key_basic *key_basic; + struct flow_dissector_key_addrs *key_addrs; + struct flow_dissector_key_ports *key_ports; + + key_control = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_CONTROL, + target_container); + 
key_control->thoff = flow_keys->thoff; + if (flow_keys->is_frag) + key_control->flags |= FLOW_DIS_IS_FRAGMENT; + if (flow_keys->is_first_frag) + key_control->flags |= FLOW_DIS_FIRST_FRAG; + if (flow_keys->is_encap) + key_control->flags |= FLOW_DIS_ENCAPSULATION; + + key_basic = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_BASIC, + target_container); + key_basic->n_proto = flow_keys->n_proto; + key_basic->ip_proto = flow_keys->ip_proto; + + if (flow_keys->addr_proto == ETH_P_IP && + dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_IPV4_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV4_ADDRS, + target_container); + key_addrs->v4addrs.src = flow_keys->ipv4_src; + key_addrs->v4addrs.dst = flow_keys->ipv4_dst; + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + } else if (flow_keys->addr_proto == ETH_P_IPV6 && + dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS)) { + key_addrs = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_IPV6_ADDRS, + target_container); + memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src, + sizeof(key_addrs->v6addrs)); + key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; + } + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_PORTS)) { + key_ports = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_PORTS, + target_container); + key_ports->src = flow_keys->sport; + key_ports->dst = flow_keys->dport; + } +} + /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -619,6 +714,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_vlan *key_vlan; enum flow_dissect_ret fdret; enum flow_dissector_key_id dissector_vlan = FLOW_DISSECTOR_KEY_MAX; + struct bpf_prog *attached; int num_hdrs = 0; u8 ip_proto = 0; bool ret; @@ -658,6 +754,44 @@ bool __skb_flow_dissect(const struct sk_buff *skb, 
FLOW_DISSECTOR_KEY_BASIC, target_container); + rcu_read_lock(); + attached = skb ? rcu_dereference(dev_net(skb->dev)->flow_dissector_prog) + : NULL; + if (attached) { + /* Note that even though the const qualifier is discarded + * throughout the execution of the BPF program, all changes(the + * control block) are reverted after the BPF program returns. + * Therefore, __skb_flow_dissect does not alter the skb. + */ + struct bpf_flow_keys flow_keys = {}; + struct bpf_skb_data_end cb_saved; + struct bpf_skb_data_end *cb; + u32 result; + + cb = (struct bpf_skb_data_end *)skb->cb; + + /* Save Control Block */ + memcpy(&cb_saved, cb, sizeof(cb_saved)); + memset(cb, 0, sizeof(cb_saved)); + + /* Pass parameters to the BPF program */ + cb->qdisc_cb.flow_keys = &flow_keys; + flow_keys.nhoff = nhoff; + + bpf_compute_data_pointers((struct sk_buff *)skb); + result = BPF_PROG_RUN(attached, skb); + + /* Restore state */ + memcpy(cb, &cb_saved, sizeof(cb_saved)); + + __skb_flow_bpf_to_target(&flow_keys, flow_dissector, + target_container); + key_control->thoff = min_t(u16, key_control->thoff, skb->len); + rcu_read_unlock(); + return result == BPF_OK; + } + rcu_read_unlock(); + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); -- cgit v1.2.3 From 02b408fae3d5552d10d1189fc0bd7e5b1e76af71 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 29 Aug 2018 00:19:00 +0200 Subject: netfilter: nf_tables: rt: allow checking if dst has xfrm attached Useful e.g. to avoid NATting inner headers of to-be-encrypted packets. 
Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_rt.c | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index e23290ffdc77..6c44cbbb2cda 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -826,12 +826,14 @@ enum nft_meta_keys { * @NFT_RT_NEXTHOP4: routing nexthop for IPv4 * @NFT_RT_NEXTHOP6: routing nexthop for IPv6 * @NFT_RT_TCPMSS: fetch current path tcp mss + * @NFT_RT_XFRM: boolean, skb->dst->xfrm != NULL */ enum nft_rt_keys { NFT_RT_CLASSID, NFT_RT_NEXTHOP4, NFT_RT_NEXTHOP6, NFT_RT_TCPMSS, + NFT_RT_XFRM, __NFT_RT_MAX }; #define NFT_RT_MAX (__NFT_RT_MAX - 1) diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 76dba9f6b6f6..f35fa33913ae 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -90,6 +90,11 @@ static void nft_rt_get_eval(const struct nft_expr *expr, case NFT_RT_TCPMSS: nft_reg_store16(dest, get_tcpmss(pkt, dst)); break; +#ifdef CONFIG_XFRM + case NFT_RT_XFRM: + nft_reg_store8(dest, !!dst->xfrm); + break; +#endif default: WARN_ON(1); goto err; @@ -130,6 +135,11 @@ static int nft_rt_get_init(const struct nft_ctx *ctx, case NFT_RT_TCPMSS: len = sizeof(u16); break; +#ifdef CONFIG_XFRM + case NFT_RT_XFRM: + len = sizeof(u8); + break; +#endif default: return -EOPNOTSUPP; } @@ -164,6 +174,7 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp case NFT_RT_NEXTHOP4: case NFT_RT_NEXTHOP6: case NFT_RT_CLASSID: + case NFT_RT_XFRM: return 0; case NFT_RT_TCPMSS: hooks = (1 << NF_INET_FORWARD) | -- cgit v1.2.3 From 6c47260250fc6114ce2012db13e1cd3938a27b73 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Sep 2018 18:09:40 +0200 Subject: netfilter: nf_tables: add xfrm expression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit supports fetching saddr/daddr of tunnel mode states, request id and spi. If direction is 'in', use inbound skb secpath, else dst->xfrm. Joint work with Máté Eckl. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 29 +++ net/netfilter/Kconfig | 7 + net/netfilter/Makefile | 1 + net/netfilter/nft_xfrm.c | 293 +++++++++++++++++++++++++++++++ 4 files changed, 330 insertions(+) create mode 100644 net/netfilter/nft_xfrm.c (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 6c44cbbb2cda..702e4f0bec56 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1514,6 +1514,35 @@ enum nft_devices_attributes { }; #define NFTA_DEVICE_MAX (__NFTA_DEVICE_MAX - 1) +/* + * enum nft_xfrm_attributes - nf_tables xfrm expr netlink attributes + * + * @NFTA_XFRM_DREG: destination register (NLA_U32) + * @NFTA_XFRM_KEY: enum nft_xfrm_keys (NLA_U32) + * @NFTA_XFRM_DIR: direction (NLA_U8) + * @NFTA_XFRM_SPNUM: index in secpath array (NLA_U32) + */ +enum nft_xfrm_attributes { + NFTA_XFRM_UNSPEC, + NFTA_XFRM_DREG, + NFTA_XFRM_KEY, + NFTA_XFRM_DIR, + NFTA_XFRM_SPNUM, + __NFTA_XFRM_MAX +}; +#define NFTA_XFRM_MAX (__NFTA_XFRM_MAX - 1) + +enum nft_xfrm_keys { + NFT_XFRM_KEY_UNSPEC, + NFT_XFRM_KEY_DADDR_IP4, + NFT_XFRM_KEY_DADDR_IP6, + NFT_XFRM_KEY_SADDR_IP4, + NFT_XFRM_KEY_SADDR_IP6, + NFT_XFRM_KEY_REQID, + NFT_XFRM_KEY_SPI, + __NFT_XFRM_KEY_MAX, +}; +#define NFT_XFRM_KEY_MAX (__NFT_XFRM_KEY_MAX - 1) /** * enum nft_trace_attributes - nf_tables trace netlink attributes diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index f61c306de1d0..2ab870ef233a 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -625,6 +625,13 @@ config NFT_FIB_INET The lookup will be delegated to the IPv4 or IPv6 FIB depending on the protocol of the packet. 
+config NFT_XFRM + tristate "Netfilter nf_tables xfrm/IPSec security association matching" + depends on XFRM + help + This option adds an expression that you can use to extract properties + of a packets security association. + config NFT_SOCKET tristate "Netfilter nf_tables socket match support" depends on IPV6 || IPV6=n diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 16895e045b66..4ddf3ef51ece 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -113,6 +113,7 @@ obj-$(CONFIG_NFT_FIB_NETDEV) += nft_fib_netdev.o obj-$(CONFIG_NFT_SOCKET) += nft_socket.o obj-$(CONFIG_NFT_OSF) += nft_osf.o obj-$(CONFIG_NFT_TPROXY) += nft_tproxy.o +obj-$(CONFIG_NFT_XFRM) += nft_xfrm.o # nf_tables netdev obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c new file mode 100644 index 000000000000..3cf71a2e375b --- /dev/null +++ b/net/netfilter/nft_xfrm.c @@ -0,0 +1,293 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Generic part shared by ipv4 and ipv6 backends. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const struct nla_policy nft_xfrm_policy[NFTA_XFRM_MAX + 1] = { + [NFTA_XFRM_KEY] = { .type = NLA_U32 }, + [NFTA_XFRM_DIR] = { .type = NLA_U8 }, + [NFTA_XFRM_SPNUM] = { .type = NLA_U32 }, + [NFTA_XFRM_DREG] = { .type = NLA_U32 }, +}; + +struct nft_xfrm { + enum nft_xfrm_keys key:8; + enum nft_registers dreg:8; + u8 dir; + u8 spnum; +}; + +static int nft_xfrm_get_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_xfrm *priv = nft_expr_priv(expr); + unsigned int len = 0; + u32 spnum = 0; + u8 dir; + + if (!tb[NFTA_XFRM_KEY] || !tb[NFTA_XFRM_DIR] || !tb[NFTA_XFRM_DREG]) + return -EINVAL; + + switch (ctx->family) { + case NFPROTO_IPV4: + case NFPROTO_IPV6: + case NFPROTO_INET: + break; + default: + return -EOPNOTSUPP; + } + + priv->key = ntohl(nla_get_u32(tb[NFTA_XFRM_KEY])); + switch (priv->key) { + case NFT_XFRM_KEY_REQID: + case NFT_XFRM_KEY_SPI: + len = sizeof(u32); + break; + case NFT_XFRM_KEY_DADDR_IP4: + case NFT_XFRM_KEY_SADDR_IP4: + len = sizeof(struct in_addr); + break; + case NFT_XFRM_KEY_DADDR_IP6: + case NFT_XFRM_KEY_SADDR_IP6: + len = sizeof(struct in6_addr); + break; + default: + return -EINVAL; + } + + dir = nla_get_u8(tb[NFTA_XFRM_DIR]); + switch (dir) { + case XFRM_POLICY_IN: + case XFRM_POLICY_OUT: + priv->dir = dir; + break; + default: + return -EINVAL; + } + + if (tb[NFTA_XFRM_SPNUM]) + spnum = ntohl(nla_get_be32(tb[NFTA_XFRM_SPNUM])); + + if (spnum >= XFRM_MAX_DEPTH) + return -ERANGE; + + priv->spnum = spnum; + + priv->dreg = nft_parse_register(tb[NFTA_XFRM_DREG]); + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, len); +} + +/* Return true if key asks for daddr/saddr and current + * state does have a valid address (BEET, TUNNEL). 
+ */ +static bool xfrm_state_addr_ok(enum nft_xfrm_keys k, u8 family, u8 mode) +{ + switch (k) { + case NFT_XFRM_KEY_DADDR_IP4: + case NFT_XFRM_KEY_SADDR_IP4: + if (family == NFPROTO_IPV4) + break; + return false; + case NFT_XFRM_KEY_DADDR_IP6: + case NFT_XFRM_KEY_SADDR_IP6: + if (family == NFPROTO_IPV6) + break; + return false; + default: + return true; + } + + return mode == XFRM_MODE_BEET || mode == XFRM_MODE_TUNNEL; +} + +static void nft_xfrm_state_get_key(const struct nft_xfrm *priv, + struct nft_regs *regs, + const struct xfrm_state *state, + u8 family) +{ + u32 *dest = &regs->data[priv->dreg]; + + if (!xfrm_state_addr_ok(priv->key, family, state->props.mode)) { + regs->verdict.code = NFT_BREAK; + return; + } + + switch (priv->key) { + case NFT_XFRM_KEY_UNSPEC: + case __NFT_XFRM_KEY_MAX: + WARN_ON_ONCE(1); + break; + case NFT_XFRM_KEY_DADDR_IP4: + *dest = state->id.daddr.a4; + return; + case NFT_XFRM_KEY_DADDR_IP6: + memcpy(dest, &state->id.daddr.in6, sizeof(struct in6_addr)); + return; + case NFT_XFRM_KEY_SADDR_IP4: + *dest = state->props.saddr.a4; + return; + case NFT_XFRM_KEY_SADDR_IP6: + memcpy(dest, &state->props.saddr.in6, sizeof(struct in6_addr)); + return; + case NFT_XFRM_KEY_REQID: + *dest = state->props.reqid; + return; + case NFT_XFRM_KEY_SPI: + *dest = state->id.spi; + return; + } + + regs->verdict.code = NFT_BREAK; +} + +static void nft_xfrm_get_eval_in(const struct nft_xfrm *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct sec_path *sp = pkt->skb->sp; + const struct xfrm_state *state; + + if (sp == NULL || sp->len <= priv->spnum) { + regs->verdict.code = NFT_BREAK; + return; + } + + state = sp->xvec[priv->spnum]; + nft_xfrm_state_get_key(priv, regs, state, nft_pf(pkt)); +} + +static void nft_xfrm_get_eval_out(const struct nft_xfrm *priv, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct dst_entry *dst = skb_dst(pkt->skb); + int i; + + for (i = 0; dst && dst->xfrm; + dst = ((const struct
xfrm_dst *)dst)->child, i++) { + if (i < priv->spnum) + continue; + + nft_xfrm_state_get_key(priv, regs, dst->xfrm, nft_pf(pkt)); + return; + } + + regs->verdict.code = NFT_BREAK; +} + +static void nft_xfrm_get_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_xfrm *priv = nft_expr_priv(expr); + + switch (priv->dir) { + case XFRM_POLICY_IN: + nft_xfrm_get_eval_in(priv, regs, pkt); + break; + case XFRM_POLICY_OUT: + nft_xfrm_get_eval_out(priv, regs, pkt); + break; + default: + WARN_ON_ONCE(1); + regs->verdict.code = NFT_BREAK; + break; + } +} + +static int nft_xfrm_get_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_xfrm *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_XFRM_DREG, priv->dreg)) + return -1; + + if (nla_put_be32(skb, NFTA_XFRM_KEY, htonl(priv->key))) + return -1; + if (nla_put_u8(skb, NFTA_XFRM_DIR, priv->dir)) + return -1; + if (nla_put_be32(skb, NFTA_XFRM_SPNUM, htonl(priv->spnum))) + return -1; + + return 0; +} + +static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, + const struct nft_data **data) +{ + const struct nft_xfrm *priv = nft_expr_priv(expr); + unsigned int hooks; + + switch (priv->dir) { + case XFRM_POLICY_IN: + hooks = (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_PRE_ROUTING); + break; + case XFRM_POLICY_OUT: + hooks = (1 << NF_INET_FORWARD) | + (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING); + break; + default: + WARN_ON_ONCE(1); + return -EINVAL; + } + + return nft_chain_validate_hooks(ctx->chain, hooks); +} + + +static struct nft_expr_type nft_xfrm_type; +static const struct nft_expr_ops nft_xfrm_get_ops = { + .type = &nft_xfrm_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_xfrm)), + .eval = nft_xfrm_get_eval, + .init = nft_xfrm_get_init, + .dump = nft_xfrm_get_dump, + .validate = nft_xfrm_validate, +}; + +static struct nft_expr_type nft_xfrm_type 
__read_mostly = { + .name = "xfrm", + .ops = &nft_xfrm_get_ops, + .policy = nft_xfrm_policy, + .maxattr = NFTA_XFRM_MAX, + .owner = THIS_MODULE, +}; + +static int __init nft_xfrm_module_init(void) +{ + return nft_register_expr(&nft_xfrm_type); +} + +static void __exit nft_xfrm_module_exit(void) +{ + nft_unregister_expr(&nft_xfrm_type); +} + +module_init(nft_xfrm_module_init); +module_exit(nft_xfrm_module_exit); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("nf_tables: xfrm/IPSec matching"); +MODULE_AUTHOR("Florian Westphal "); +MODULE_AUTHOR("Máté Eckl "); +MODULE_ALIAS_NFT_EXPR("xfrm"); -- cgit v1.2.3 From 0d704967f4a49cc2212350b3e4a8231f8b4283ed Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 4 Sep 2018 12:07:55 +0200 Subject: netfilter: xt_cgroup: shrink size of v2 path cgroup v2 path field is PATH_MAX which is too large, this is placing too much pressure on memory allocation for people with many rules doing cgroup v1 classid matching, side effects of this are bug reports like: https://bugzilla.kernel.org/show_bug.cgi?id=200639 This patch registers a new revision that shrinks the cgroup path to 512 bytes, which is the same approach we follow in similar extensions that have a path field. 
Cc: Tejun Heo Signed-off-by: Pablo Neira Ayuso Acked-by: Tejun Heo --- include/uapi/linux/netfilter/xt_cgroup.h | 16 +++++++ net/netfilter/xt_cgroup.c | 72 ++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/xt_cgroup.h b/include/uapi/linux/netfilter/xt_cgroup.h index e96dfa1b34f7..b74e370d6133 100644 --- a/include/uapi/linux/netfilter/xt_cgroup.h +++ b/include/uapi/linux/netfilter/xt_cgroup.h @@ -22,4 +22,20 @@ struct xt_cgroup_info_v1 { void *priv __attribute__((aligned(8))); }; +#define XT_CGROUP_PATH_MAX 512 + +struct xt_cgroup_info_v2 { + __u8 has_path; + __u8 has_classid; + __u8 invert_path; + __u8 invert_classid; + union { + char path[XT_CGROUP_PATH_MAX]; + __u32 classid; + }; + + /* kernel internal data */ + void *priv __attribute__((aligned(8))); +}; + #endif /* _UAPI_XT_CGROUP_H */ diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 5d92e1781980..5cb1ecb29ea4 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -68,6 +68,38 @@ static int cgroup_mt_check_v1(const struct xt_mtchk_param *par) return 0; } +static int cgroup_mt_check_v2(const struct xt_mtchk_param *par) +{ + struct xt_cgroup_info_v2 *info = par->matchinfo; + struct cgroup *cgrp; + + if ((info->invert_path & ~1) || (info->invert_classid & ~1)) + return -EINVAL; + + if (!info->has_path && !info->has_classid) { + pr_info("xt_cgroup: no path or classid specified\n"); + return -EINVAL; + } + + if (info->has_path && info->has_classid) { + pr_info_ratelimited("path and classid specified\n"); + return -EINVAL; + } + + info->priv = NULL; + if (info->has_path) { + cgrp = cgroup_get_from_path(info->path); + if (IS_ERR(cgrp)) { + pr_info_ratelimited("invalid path, errno=%ld\n", + PTR_ERR(cgrp)); + return -EINVAL; + } + info->priv = cgrp; + } + + return 0; +} + static bool cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) { @@ -99,6 +131,24 @@ static bool 
cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) info->invert_classid; } +static bool cgroup_mt_v2(const struct sk_buff *skb, struct xt_action_param *par) +{ + const struct xt_cgroup_info_v2 *info = par->matchinfo; + struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data; + struct cgroup *ancestor = info->priv; + struct sock *sk = skb->sk; + + if (!sk || !sk_fullsock(sk) || !net_eq(xt_net(par), sock_net(sk))) + return false; + + if (ancestor) + return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^ + info->invert_path; + else + return (info->classid == sock_cgroup_classid(skcd)) ^ + info->invert_classid; +} + static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) { struct xt_cgroup_info_v1 *info = par->matchinfo; @@ -107,6 +157,14 @@ static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par) cgroup_put(info->priv); } +static void cgroup_mt_destroy_v2(const struct xt_mtdtor_param *par) +{ + struct xt_cgroup_info_v2 *info = par->matchinfo; + + if (info->priv) + cgroup_put(info->priv); +} + static struct xt_match cgroup_mt_reg[] __read_mostly = { { .name = "cgroup", @@ -134,6 +192,20 @@ static struct xt_match cgroup_mt_reg[] __read_mostly = { (1 << NF_INET_POST_ROUTING) | (1 << NF_INET_LOCAL_IN), }, + { + .name = "cgroup", + .revision = 2, + .family = NFPROTO_UNSPEC, + .checkentry = cgroup_mt_check_v2, + .match = cgroup_mt_v2, + .matchsize = sizeof(struct xt_cgroup_info_v2), + .usersize = offsetof(struct xt_cgroup_info_v2, priv), + .destroy = cgroup_mt_destroy_v2, + .me = THIS_MODULE, + .hooks = (1 << NF_INET_LOCAL_OUT) | + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), + }, }; static int __init cgroup_mt_init(void) -- cgit v1.2.3 From 30f8eb55873ef078f5f02f636061d9399debbeab Mon Sep 17 00:00:00 2001 From: Håkon Bugge Date: Fri, 21 Sep 2018 12:39:29 +0200 Subject: net: if_arp: Fix incorrect indents MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixing incorrect 
indents and align comments. Signed-off-by: Håkon Bugge Signed-off-by: David S. Miller --- include/uapi/linux/if_arp.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h index 4605527ca41b..b68b4b3d9172 100644 --- a/include/uapi/linux/if_arp.h +++ b/include/uapi/linux/if_arp.h @@ -114,18 +114,18 @@ /* ARP ioctl request. */ struct arpreq { - struct sockaddr arp_pa; /* protocol address */ - struct sockaddr arp_ha; /* hardware address */ - int arp_flags; /* flags */ - struct sockaddr arp_netmask; /* netmask (only for proxy arps) */ - char arp_dev[16]; + struct sockaddr arp_pa; /* protocol address */ + struct sockaddr arp_ha; /* hardware address */ + int arp_flags; /* flags */ + struct sockaddr arp_netmask; /* netmask (only for proxy arps) */ + char arp_dev[16]; }; struct arpreq_old { - struct sockaddr arp_pa; /* protocol address */ - struct sockaddr arp_ha; /* hardware address */ - int arp_flags; /* flags */ - struct sockaddr arp_netmask; /* netmask (only for proxy arps) */ + struct sockaddr arp_pa; /* protocol address */ + struct sockaddr arp_ha; /* hardware address */ + int arp_flags; /* flags */ + struct sockaddr arp_netmask; /* netmask (only for proxy arps) */ }; /* ARP Flag values. */ -- cgit v1.2.3 From 6a12709da354ea149fdf86c4c9aba5b5033e9cf2 Mon Sep 17 00:00:00 2001 From: Håkon Bugge Date: Fri, 21 Sep 2018 12:39:30 +0200 Subject: net: if_arp: use define instead of hard-coded value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit uapi/linux/if_arp.h includes linux/netdevice.h, which uses IFNAMSIZ. Hence, use it instead of hard-coded value. Signed-off-by: Håkon Bugge Signed-off-by: David S. 
Miller --- include/uapi/linux/if_arp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h index b68b4b3d9172..c3cc5a9e5eaf 100644 --- a/include/uapi/linux/if_arp.h +++ b/include/uapi/linux/if_arp.h @@ -118,7 +118,7 @@ struct arpreq { struct sockaddr arp_ha; /* hardware address */ int arp_flags; /* flags */ struct sockaddr arp_netmask; /* netmask (only for proxy arps) */ - char arp_dev[16]; + char arp_dev[IFNAMSIZ]; }; struct arpreq_old { -- cgit v1.2.3 From 5e111210a44301304f9054e995bf33f69b6de76f Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Fri, 21 Sep 2018 07:13:54 -0400 Subject: net/core: Add new basic hardware counter Add a new hardware specific basic counter, TCA_STATS_BASIC_HW. This can be used to count packets/bytes processed by hardware offload. Signed-off-by: Eelco Chaudron Signed-off-by: David S. Miller --- include/net/gen_stats.h | 4 +++ include/uapi/linux/gen_stats.h | 1 + net/core/gen_stats.c | 73 +++++++++++++++++++++++++++++++----------- 3 files changed, 59 insertions(+), 19 deletions(-) (limited to 'include/uapi') diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 883bb9085f15..946bd53a9f81 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -44,6 +44,10 @@ void __gnet_stats_copy_basic(const seqcount_t *running, struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b); +int gnet_stats_copy_basic_hw(const seqcount_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b); int gnet_stats_copy_rate_est(struct gnet_dump *d, struct net_rate_estimator __rcu **ptr); int gnet_stats_copy_queue(struct gnet_dump *d, diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h index 24a861c0d29d..065408e16a80 100644 --- a/include/uapi/linux/gen_stats.h +++ 
b/include/uapi/linux/gen_stats.h @@ -12,6 +12,7 @@ enum { TCA_STATS_APP, TCA_STATS_RATE_EST64, TCA_STATS_PAD, + TCA_STATS_BASIC_HW, __TCA_STATS_MAX, }; #define TCA_STATS_MAX (__TCA_STATS_MAX - 1) diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 188d693cb251..65a2e820364f 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -162,30 +162,18 @@ __gnet_stats_copy_basic(const seqcount_t *running, } EXPORT_SYMBOL(__gnet_stats_copy_basic); -/** - * gnet_stats_copy_basic - copy basic statistics into statistic TLV - * @running: seqcount_t pointer - * @d: dumping handle - * @cpu: copy statistic per cpu - * @b: basic statistics - * - * Appends the basic statistics to the top level TLV created by - * gnet_stats_start_copy(). - * - * Returns 0 on success or -1 with the statistic lock released - * if the room in the socket buffer was not sufficient. - */ int -gnet_stats_copy_basic(const seqcount_t *running, - struct gnet_dump *d, - struct gnet_stats_basic_cpu __percpu *cpu, - struct gnet_stats_basic_packed *b) +___gnet_stats_copy_basic(const seqcount_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b, + int type) { struct gnet_stats_basic_packed bstats = {0}; __gnet_stats_copy_basic(running, &bstats, cpu, b); - if (d->compat_tc_stats) { + if (d->compat_tc_stats && type == TCA_STATS_BASIC) { d->tc_stats.bytes = bstats.bytes; d->tc_stats.packets = bstats.packets; } @@ -196,13 +184,60 @@ gnet_stats_copy_basic(const seqcount_t *running, memset(&sb, 0, sizeof(sb)); sb.bytes = bstats.bytes; sb.packets = bstats.packets; - return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb), + return gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); } return 0; } + +/** + * gnet_stats_copy_basic - copy basic statistics into statistic TLV + * @running: seqcount_t pointer + * @d: dumping handle + * @cpu: copy statistic per cpu + * @b: basic statistics + * + * Appends the basic statistics to the top 
level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_basic(const seqcount_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +{ + return ___gnet_stats_copy_basic(running, d, cpu, b, + TCA_STATS_BASIC); +} EXPORT_SYMBOL(gnet_stats_copy_basic); +/** + * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV + * @running: seqcount_t pointer + * @d: dumping handle + * @cpu: copy statistic per cpu + * @b: basic statistics + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_basic_hw(const seqcount_t *running, + struct gnet_dump *d, + struct gnet_stats_basic_cpu __percpu *cpu, + struct gnet_stats_basic_packed *b) +{ + return ___gnet_stats_copy_basic(running, d, cpu, b, + TCA_STATS_BASIC_HW); +} +EXPORT_SYMBOL(gnet_stats_copy_basic_hw); + /** * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV * @d: dumping handle -- cgit v1.2.3 From fb961945457f5177072c968aa38fee910ab893b9 Mon Sep 17 00:00:00 2001 From: Christian Göttsche Date: Sun, 23 Sep 2018 20:26:15 +0200 Subject: netfilter: nf_tables: add SECMARK support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the ability to set the security context of packets within the nf_tables framework. Add a nft_object for holding security contexts in the kernel and manipulating packets on the wire. Convert the security context strings at rule addition time to security identifiers. This is the same behavior like in xt_SECMARK and offers better performance than computing it per packet. Set the maximum security context length to 256. 
Signed-off-by: Christian Göttsche Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 4 ++ include/uapi/linux/netfilter/nf_tables.h | 18 +++++- net/netfilter/nf_tables_core.c | 28 ++++++-- net/netfilter/nft_meta.c | 108 +++++++++++++++++++++++++++++++ 4 files changed, 153 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 8da837d2aaf9..2046d104f323 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -16,6 +16,10 @@ extern struct nft_expr_type nft_meta_type; extern struct nft_expr_type nft_rt_type; extern struct nft_expr_type nft_exthdr_type; +#ifdef CONFIG_NETWORK_SECMARK +extern struct nft_object_type nft_secmark_obj_type; +#endif + int nf_tables_core_module_init(void); void nf_tables_core_module_exit(void); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 702e4f0bec56..5444e76870bb 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1176,6 +1176,21 @@ enum nft_quota_attributes { }; #define NFTA_QUOTA_MAX (__NFTA_QUOTA_MAX - 1) +/** + * enum nft_secmark_attributes - nf_tables secmark object netlink attributes + * + * @NFTA_SECMARK_CTX: security context (NLA_STRING) + */ +enum nft_secmark_attributes { + NFTA_SECMARK_UNSPEC, + NFTA_SECMARK_CTX, + __NFTA_SECMARK_MAX, +}; +#define NFTA_SECMARK_MAX (__NFTA_SECMARK_MAX - 1) + +/* Max security context length */ +#define NFT_SECMARK_CTX_MAXLEN 256 + /** * enum nft_reject_types - nf_tables reject expression reject types * @@ -1432,7 +1447,8 @@ enum nft_ct_timeout_timeout_attributes { #define NFT_OBJECT_CONNLIMIT 5 #define NFT_OBJECT_TUNNEL 6 #define NFT_OBJECT_CT_TIMEOUT 7 -#define __NFT_OBJECT_MAX 8 +#define NFT_OBJECT_SECMARK 8 +#define __NFT_OBJECT_MAX 9 #define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) /** diff 
--git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index ffd5c0f9412b..3fbce3b9c5ec 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -249,12 +249,24 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_exthdr_type, }; +static struct nft_object_type *nft_basic_objects[] = { +#ifdef CONFIG_NETWORK_SECMARK + &nft_secmark_obj_type, +#endif +}; + int __init nf_tables_core_module_init(void) { - int err, i; + int err, i, j = 0; + + for (i = 0; i < ARRAY_SIZE(nft_basic_objects); i++) { + err = nft_register_obj(nft_basic_objects[i]); + if (err) + goto err; + } - for (i = 0; i < ARRAY_SIZE(nft_basic_types); i++) { - err = nft_register_expr(nft_basic_types[i]); + for (j = 0; j < ARRAY_SIZE(nft_basic_types); j++) { + err = nft_register_expr(nft_basic_types[j]); if (err) goto err; } @@ -262,8 +274,12 @@ int __init nf_tables_core_module_init(void) return 0; err: + while (j-- > 0) + nft_unregister_expr(nft_basic_types[j]); + while (i-- > 0) - nft_unregister_expr(nft_basic_types[i]); + nft_unregister_obj(nft_basic_objects[i]); + return err; } @@ -274,4 +290,8 @@ void nf_tables_core_module_exit(void) i = ARRAY_SIZE(nft_basic_types); while (i-- > 0) nft_unregister_expr(nft_basic_types[i]); + + i = ARRAY_SIZE(nft_basic_objects); + while (i-- > 0) + nft_unregister_obj(nft_basic_objects[i]); } diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 297fe7d97c18..91fd6e677ad7 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -543,3 +543,111 @@ struct nft_expr_type nft_meta_type __read_mostly = { .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, }; + +#ifdef CONFIG_NETWORK_SECMARK +struct nft_secmark { + u32 secid; + char *ctx; +}; + +static const struct nla_policy nft_secmark_policy[NFTA_SECMARK_MAX + 1] = { + [NFTA_SECMARK_CTX] = { .type = NLA_STRING, .len = NFT_SECMARK_CTX_MAXLEN }, +}; + +static int nft_secmark_compute_secid(struct nft_secmark *priv) +{ + u32 tmp_secid = 0; + int err; + + 
err = security_secctx_to_secid(priv->ctx, strlen(priv->ctx), &tmp_secid); + if (err) + return err; + + if (!tmp_secid) + return -ENOENT; + + err = security_secmark_relabel_packet(tmp_secid); + if (err) + return err; + + priv->secid = tmp_secid; + return 0; +} + +static void nft_secmark_obj_eval(struct nft_object *obj, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_secmark *priv = nft_obj_data(obj); + struct sk_buff *skb = pkt->skb; + + skb->secmark = priv->secid; +} + +static int nft_secmark_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_secmark *priv = nft_obj_data(obj); + int err; + + if (tb[NFTA_SECMARK_CTX] == NULL) + return -EINVAL; + + priv->ctx = nla_strdup(tb[NFTA_SECMARK_CTX], GFP_KERNEL); + if (!priv->ctx) + return -ENOMEM; + + err = nft_secmark_compute_secid(priv); + if (err) { + kfree(priv->ctx); + return err; + } + + security_secmark_refcount_inc(); + + return 0; +} + +static int nft_secmark_obj_dump(struct sk_buff *skb, struct nft_object *obj, + bool reset) +{ + struct nft_secmark *priv = nft_obj_data(obj); + int err; + + if (nla_put_string(skb, NFTA_SECMARK_CTX, priv->ctx)) + return -1; + + if (reset) { + err = nft_secmark_compute_secid(priv); + if (err) + return err; + } + + return 0; +} + +static void nft_secmark_obj_destroy(const struct nft_ctx *ctx, struct nft_object *obj) +{ + struct nft_secmark *priv = nft_obj_data(obj); + + security_secmark_refcount_dec(); + + kfree(priv->ctx); +} + +static const struct nft_object_ops nft_secmark_obj_ops = { + .type = &nft_secmark_obj_type, + .size = sizeof(struct nft_secmark), + .init = nft_secmark_obj_init, + .eval = nft_secmark_obj_eval, + .dump = nft_secmark_obj_dump, + .destroy = nft_secmark_obj_destroy, +}; +struct nft_object_type nft_secmark_obj_type __read_mostly = { + .type = NFT_OBJECT_SECMARK, + .ops = &nft_secmark_obj_ops, + .maxattr = NFTA_SECMARK_MAX, + .policy = nft_secmark_policy, + .owner = 
THIS_MODULE, +}; +#endif /* CONFIG_NETWORK_SECMARK */ -- cgit v1.2.3 From b741f1630346defcbc8cc60f1a2bdae8b3b0036f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Fri, 28 Sep 2018 14:45:43 +0000 Subject: bpf: introduce per-cpu cgroup local storage This commit introduced per-cpu cgroup local storage. Per-cpu cgroup local storage is very similar to simple cgroup storage (let's call it shared), except all the data is per-cpu. The main goal of per-cpu variant is to implement super fast counters (e.g. packet counters), which require neither lookups nor atomic operations. From userspace's point of view, accessing a per-cpu cgroup storage is similar to other per-cpu map types (e.g. per-cpu hashmaps and arrays). Writing to a per-cpu cgroup storage is not atomic, but is performed by copying longs, so some minimal atomicity is here, exactly as with other per-cpu maps. Signed-off-by: Roman Gushchin Cc: Daniel Borkmann Cc: Alexei Starovoitov Acked-by: Song Liu Signed-off-by: Daniel Borkmann --- include/linux/bpf-cgroup.h | 20 +++++- include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/helpers.c | 8 ++- kernel/bpf/local_storage.c | 150 +++++++++++++++++++++++++++++++++++++++------ kernel/bpf/syscall.c | 11 +++- kernel/bpf/verifier.c | 15 +++-- 8 files changed, 179 insertions(+), 28 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 7e0c9a1d48b7..588dd5f0bd85 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -37,7 +37,10 @@ struct bpf_storage_buffer { }; struct bpf_cgroup_storage { - struct bpf_storage_buffer *buf; + union { + struct bpf_storage_buffer *buf; + void __percpu *percpu_buf; + }; struct bpf_cgroup_storage_map *map; struct bpf_cgroup_storage_key key; struct list_head list; @@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, static inline enum bpf_cgroup_storage_type
cgroup_storage_type( struct bpf_map *map) { + if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) + return BPF_CGROUP_STORAGE_PERCPU; + return BPF_CGROUP_STORAGE_SHARED; } @@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map); void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map); +int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); +int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, + void *value, u64 flags); + /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ @@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; } static inline void bpf_cgroup_storage_free( struct bpf_cgroup_storage *storage) {} +static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, + void *value) { + return 0; +} +static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, + void *key, void *value, u64 flags) { + return 0; +} #define cgroup_bpf_enabled (0) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index b457fbe7b70b..018299a595c8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -274,6 +274,7 @@ struct bpf_prog_offload { enum bpf_cgroup_storage_type { BPF_CGROUP_STORAGE_SHARED, + BPF_CGROUP_STORAGE_PERCPU, __BPF_CGROUP_STORAGE_MAX }; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index c9bd6fb765b0..5432f4c9f50e 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) #endif #ifdef CONFIG_CGROUP_BPF BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops) 
+BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index aa5ccd2385ed..e2070d819e04 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -127,6 +127,7 @@ enum bpf_map_type { BPF_MAP_TYPE_SOCKHASH, BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, }; enum bpf_prog_type { diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index e42f8789b7ea..6502115e8f55 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags) */ enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); struct bpf_cgroup_storage *storage; + void *ptr; storage = this_cpu_read(bpf_cgroup_storage[stype]); - return (unsigned long)&READ_ONCE(storage->buf)->data[0]; + if (stype == BPF_CGROUP_STORAGE_SHARED) + ptr = &READ_ONCE(storage->buf)->data[0]; + else + ptr = this_cpu_ptr(storage->percpu_buf); + + return (unsigned long)ptr; } const struct bpf_func_proto bpf_get_local_storage_proto = { diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 6742292fb39e..944eb297465f 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return 0; } +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, + void *value) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + int cpu, off = 0; + u32 size; + + rcu_read_lock(); + storage = cgroup_storage_lookup(map, key, false); + if (!storage) { + rcu_read_unlock(); + return -ENOENT; + } + + /* per_cpu areas are zero-filled and bpf programs can only + * access 
'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(_map->value_size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(storage->percpu_buf, cpu), size); + off += size; + } + rcu_read_unlock(); + return 0; +} + +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, + void *value, u64 map_flags) +{ + struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct bpf_cgroup_storage_key *key = _key; + struct bpf_cgroup_storage *storage; + int cpu, off = 0; + u32 size; + + if (map_flags != BPF_ANY && map_flags != BPF_EXIST) + return -EINVAL; + + rcu_read_lock(); + storage = cgroup_storage_lookup(map, key, false); + if (!storage) { + rcu_read_unlock(); + return -ENOENT; + } + + /* the user space will provide round_up(value_size, 8) bytes that + * will be copied into per-cpu area. bpf programs can only access + * value_size of it. During lookup the same extra bytes will be + * returned or zeros which were zero-filled by percpu_alloc, + * so no kernel data leaks possible + */ + size = round_up(_map->value_size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu), + value + off, size); + off += size; + } + rcu_read_unlock(); + return 0; +} + static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, void *_next_key) { @@ -287,60 +352,105 @@ void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *_map) spin_unlock_bh(&map->lock); } +static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) +{ + size_t size; + + if (cgroup_storage_type(map) == BPF_CGROUP_STORAGE_SHARED) { + size = sizeof(struct bpf_storage_buffer) + map->value_size; + *pages = round_up(sizeof(struct bpf_cgroup_storage) + size, + PAGE_SIZE) >> PAGE_SHIFT; + } else { + size = map->value_size; + *pages = round_up(round_up(size, 8) * num_possible_cpus(), + PAGE_SIZE) >> PAGE_SHIFT; + } + + return size; +} + struct 
bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { struct bpf_cgroup_storage *storage; struct bpf_map *map; + gfp_t flags; + size_t size; u32 pages; map = prog->aux->cgroup_storage[stype]; if (!map) return NULL; - pages = round_up(sizeof(struct bpf_cgroup_storage) + - sizeof(struct bpf_storage_buffer) + - map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + size = bpf_cgroup_storage_calculate_size(map, &pages); + if (bpf_map_charge_memlock(map, pages)) return ERR_PTR(-EPERM); storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), __GFP_ZERO | GFP_USER, map->numa_node); - if (!storage) { - bpf_map_uncharge_memlock(map, pages); - return ERR_PTR(-ENOMEM); - } + if (!storage) + goto enomem; - storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, __GFP_ZERO | GFP_USER, - map->numa_node); - if (!storage->buf) { - bpf_map_uncharge_memlock(map, pages); - kfree(storage); - return ERR_PTR(-ENOMEM); + flags = __GFP_ZERO | GFP_USER; + + if (stype == BPF_CGROUP_STORAGE_SHARED) { + storage->buf = kmalloc_node(size, flags, map->numa_node); + if (!storage->buf) + goto enomem; + } else { + storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); + if (!storage->percpu_buf) + goto enomem; } storage->map = (struct bpf_cgroup_storage_map *)map; return storage; + +enomem: + bpf_map_uncharge_memlock(map, pages); + kfree(storage); + return ERR_PTR(-ENOMEM); +} + +static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu) +{ + struct bpf_cgroup_storage *storage = + container_of(rcu, struct bpf_cgroup_storage, rcu); + + kfree(storage->buf); + kfree(storage); +} + +static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu) +{ + struct bpf_cgroup_storage *storage = + container_of(rcu, struct bpf_cgroup_storage, rcu); + + free_percpu(storage->percpu_buf); + kfree(storage); } void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) { - u32 pages; + enum bpf_cgroup_storage_type stype; struct 
bpf_map *map; + u32 pages; if (!storage) return; map = &storage->map->map; - pages = round_up(sizeof(struct bpf_cgroup_storage) + - sizeof(struct bpf_storage_buffer) + - map->value_size, PAGE_SIZE) >> PAGE_SHIFT; + + bpf_cgroup_storage_calculate_size(map, &pages); bpf_map_uncharge_memlock(map, pages); - kfree_rcu(storage->buf, rcu); - kfree_rcu(storage, rcu); + stype = cgroup_storage_type(map); + if (stype == BPF_CGROUP_STORAGE_SHARED) + call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); + else + call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu); } void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8c91d2b41b1e..5742df21598c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -686,7 +686,8 @@ static int map_lookup_elem(union bpf_attr *attr) if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else if (IS_FD_MAP(map)) value_size = sizeof(u32); @@ -705,6 +706,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); } else if (IS_FD_ARRAY(map)) { @@ -774,7 +777,8 @@ static int map_update_elem(union bpf_attr *attr) if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY || + map->map_type == 
BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) value_size = round_up(map->value_size, 8) * num_possible_cpus(); else value_size = map->value_size; @@ -809,6 +813,9 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) { + err = bpf_percpu_cgroup_storage_update(map, key, value, + attr->flags); } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e90899df585d..a8cc83a970d1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2074,6 +2074,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_MAP_TYPE_CGROUP_STORAGE: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: if (func_id != BPF_FUNC_get_local_storage) goto error; break; @@ -2164,7 +2165,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, goto error; break; case BPF_FUNC_get_local_storage: - if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE) + if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && + map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) goto error; break; case BPF_FUNC_sk_select_reuseport: @@ -5049,6 +5051,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return 0; } +static bool bpf_map_is_cgroup_storage(struct bpf_map *map) +{ + return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE || + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE); +} + /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ @@ -5139,10 +5147,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) } env->used_maps[env->used_map_cnt++] = map; - if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE && + if 
(bpf_map_is_cgroup_storage(map) && bpf_cgroup_storage_assign(env->prog, map)) { - verbose(env, - "only one cgroup storage is allowed\n"); + verbose(env, "only one cgroup storage of each type is allowed\n"); fdput(f); return -EBUSY; } -- cgit v1.2.3 From 81e54d08d9d845053111f30045a93f3eb1c3ca96 Mon Sep 17 00:00:00 2001 From: Pradeep Kumar Chitrapu Date: Thu, 20 Sep 2018 17:30:09 -0700 Subject: cfg80211: support FTM responder configuration/statistics Allow userspace to enable fine timing measurement responder functionality with configurable lci/civic parameters in AP mode. This can be done at AP start or changing beacon parameters. A new EXT_FEATURE flag is introduced for drivers to advertise the capability. Also nl80211 API support for retrieving statistics is added. Signed-off-by: Johannes Berg Signed-off-by: Pradeep Kumar Chitrapu [remove unused cfg80211_ftm_responder_params, clarify docs, move validation into policy] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 52 +++++++++++++++++ include/uapi/linux/nl80211.h | 90 +++++++++++++++++++++++++++++ net/wireless/nl80211.c | 132 +++++++++++++++++++++++++++++++++++++++++-- net/wireless/rdev-ops.h | 15 +++++ net/wireless/trace.h | 44 +++++++++++++++ 5 files changed, 328 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9f3ed79c39d7..deb313105014 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -775,6 +775,12 @@ struct cfg80211_crypto_settings { * @assocresp_ies_len: length of assocresp_ies in octets * @probe_resp_len: length of probe response template (@probe_resp) * @probe_resp: probe response template (AP mode only) + * @ftm_responder: enable FTM responder functionality; -1 for no change + * (which also implies no change in LCI/civic location data) + * @lci: LCI subelement content + * @civicloc: Civic location subelement content + * @lci_len: LCI data length + * @civicloc_len: Civic location data length */ struct 
cfg80211_beacon_data { const u8 *head, *tail; @@ -782,12 +788,17 @@ struct cfg80211_beacon_data { const u8 *proberesp_ies; const u8 *assocresp_ies; const u8 *probe_resp; + const u8 *lci; + const u8 *civicloc; + s8 ftm_responder; size_t head_len, tail_len; size_t beacon_ies_len; size_t proberesp_ies_len; size_t assocresp_ies_len; size_t probe_resp_len; + size_t lci_len; + size_t civicloc_len; }; struct mac_address { @@ -2796,6 +2807,40 @@ struct cfg80211_external_auth_params { u16 status; }; +/** + * cfg80211_ftm_responder_stats - FTM responder statistics + * + * @filled: bitflag of flags using the bits of &enum nl80211_ftm_stats to + * indicate the relevant values in this struct for them + * @success_num: number of FTM sessions in which all frames were successfully + * answered + * @partial_num: number of FTM sessions in which part of frames were + * successfully answered + * @failed_num: number of failed FTM sessions + * @asap_num: number of ASAP FTM sessions + * @non_asap_num: number of non-ASAP FTM sessions + * @total_duration_ms: total sessions durations - gives an indication + * of how much time the responder was busy + * @unknown_triggers_num: number of unknown FTM triggers - triggers from + * initiators that didn't finish successfully the negotiation phase with + * the responder + * @reschedule_requests_num: number of FTM reschedule requests - initiator asks + * for a new scheduling although it already has scheduled FTM slot + * @out_of_window_triggers_num: total FTM triggers out of scheduled window + */ +struct cfg80211_ftm_responder_stats { + u32 filled; + u32 success_num; + u32 partial_num; + u32 failed_num; + u32 asap_num; + u32 non_asap_num; + u64 total_duration_ms; + u32 unknown_triggers_num; + u32 reschedule_requests_num; + u32 out_of_window_triggers_num; +}; + /** * struct cfg80211_ops - backend description for wireless configuration * @@ -3128,6 +3173,9 @@ struct cfg80211_external_auth_params { * * @tx_control_port: TX a control port frame (EAPoL). 
The noencrypt parameter * tells the driver that the frame should not be encrypted. + * + * @get_ftm_responder_stats: Retrieve FTM responder statistics, if available. + * Statistics should be cumulative, currently no way to reset is provided. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -3433,6 +3481,10 @@ struct cfg80211_ops { const u8 *buf, size_t len, const u8 *dest, const __be16 proto, const bool noencrypt); + + int (*get_ftm_responder_stats)(struct wiphy *wiphy, + struct net_device *dev, + struct cfg80211_ftm_responder_stats *ftm_stats); }; /* diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index cfc94178d608..dc6d5a1ef470 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -1033,6 +1033,9 @@ * %NL80211_ATTR_CHANNEL_WIDTH,%NL80211_ATTR_NSS attributes with its * address(specified in %NL80211_ATTR_MAC). * + * @NL80211_CMD_GET_FTM_RESPONDER_STATS: Retrieve FTM responder statistics, in + * the %NL80211_ATTR_FTM_RESPONDER_STATS attribute. + * * @NL80211_CMD_MAX: highest used command number * @__NL80211_CMD_AFTER_LAST: internal use */ @@ -1245,6 +1248,8 @@ enum nl80211_commands { NL80211_CMD_CONTROL_PORT_FRAME, + NL80211_CMD_GET_FTM_RESPONDER_STATS, + /* add new commands above here */ /* used to define NL80211_CMD_MAX below */ @@ -2241,6 +2246,14 @@ enum nl80211_commands { * association request when used with NL80211_CMD_NEW_STATION). Can be set * only if %NL80211_STA_FLAG_WME is set. * + * @NL80211_ATTR_FTM_RESPONDER: nested attribute which user-space can include + * in %NL80211_CMD_START_AP or %NL80211_CMD_SET_BEACON for fine timing + * measurement (FTM) responder functionality and containing parameters as + * possible, see &enum nl80211_ftm_responder_attr + * + * @NL80211_ATTR_FTM_RESPONDER_STATS: Nested attribute with FTM responder + * statistics, see &enum nl80211_ftm_responder_stats. 
+ * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2682,6 +2695,10 @@ enum nl80211_attrs { NL80211_ATTR_HE_CAPABILITY, + NL80211_ATTR_FTM_RESPONDER, + + NL80211_ATTR_FTM_RESPONDER_STATS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -5225,6 +5242,8 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT: Driver/device can omit all data * except for supported rates from the probe request content if requested * by the %NL80211_SCAN_FLAG_MIN_PREQ_CONTENT flag. + * @NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER: Driver supports enabling fine + * timing measurement responder role. * * @NL80211_EXT_FEATURE_CAN_REPLACE_PTK0: Driver/device confirm that they are * able to rekey an in-use key correctly. Userspace must not rekey PTK keys @@ -5269,6 +5288,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_SCAN_RANDOM_SN, NL80211_EXT_FEATURE_SCAN_MIN_PREQ_CONTENT, NL80211_EXT_FEATURE_CAN_REPLACE_PTK0, + NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -5808,4 +5828,74 @@ enum nl80211_external_auth_action { NL80211_EXTERNAL_AUTH_ABORT, }; +/** + * enum nl80211_ftm_responder_attributes - fine timing measurement + * responder attributes + * @__NL80211_FTM_RESP_ATTR_INVALID: Invalid + * @NL80211_FTM_RESP_ATTR_ENABLED: FTM responder is enabled + * @NL80211_FTM_RESP_ATTR_LCI: The content of Measurement Report Element + * (9.4.2.22 in 802.11-2016) with type 8 - LCI (9.4.2.22.10) + * @NL80211_FTM_RESP_ATTR_CIVIC: The content of Measurement Report Element + * (9.4.2.22 in 802.11-2016) with type 11 - Civic (Section 9.4.2.22.13) + * @__NL80211_FTM_RESP_ATTR_LAST: Internal + * @NL80211_FTM_RESP_ATTR_MAX: highest FTM responder attribute. 
+ */ +enum nl80211_ftm_responder_attributes { + __NL80211_FTM_RESP_ATTR_INVALID, + + NL80211_FTM_RESP_ATTR_ENABLED, + NL80211_FTM_RESP_ATTR_LCI, + NL80211_FTM_RESP_ATTR_CIVICLOC, + + /* keep last */ + __NL80211_FTM_RESP_ATTR_LAST, + NL80211_FTM_RESP_ATTR_MAX = __NL80211_FTM_RESP_ATTR_LAST - 1, +}; + +/* + * enum nl80211_ftm_responder_stats - FTM responder statistics + * + * These attribute types are used with %NL80211_ATTR_FTM_RESPONDER_STATS + * when getting FTM responder statistics. + * + * @__NL80211_FTM_STATS_INVALID: attribute number 0 is reserved + * @NL80211_FTM_STATS_SUCCESS_NUM: number of FTM sessions in which all frames + * were successfully answered (u32) + * @NL80211_FTM_STATS_PARTIAL_NUM: number of FTM sessions in which part of the + * frames were successfully answered (u32) + * @NL80211_FTM_STATS_FAILED_NUM: number of failed FTM sessions (u32) + * @NL80211_FTM_STATS_ASAP_NUM: number of ASAP sessions (u32) + * @NL80211_FTM_STATS_NON_ASAP_NUM: number of non-ASAP sessions (u32) + * @NL80211_FTM_STATS_TOTAL_DURATION_MSEC: total sessions durations - gives an + * indication of how much time the responder was busy (u64, msec) + * @NL80211_FTM_STATS_UNKNOWN_TRIGGERS_NUM: number of unknown FTM triggers - + * triggers from initiators that didn't finish successfully the negotiation + * phase with the responder (u32) + * @NL80211_FTM_STATS_RESCHEDULE_REQUESTS_NUM: number of FTM reschedule requests + * - initiator asks for a new scheduling although it already has scheduled + * FTM slot (u32) + * @NL80211_FTM_STATS_OUT_OF_WINDOW_TRIGGERS_NUM: number of FTM triggers out of + * scheduled window (u32) + * @NL80211_FTM_STATS_PAD: used for padding, ignore + * @__NL80211_FTM_STATS_AFTER_LAST: Internal + * @NL80211_FTM_STATS_MAX: highest possible FTM responder stats attribute + */ +enum nl80211_ftm_responder_stats { + __NL80211_FTM_STATS_INVALID, + NL80211_FTM_STATS_SUCCESS_NUM, + NL80211_FTM_STATS_PARTIAL_NUM, + NL80211_FTM_STATS_FAILED_NUM, + NL80211_FTM_STATS_ASAP_NUM, +
NL80211_FTM_STATS_NON_ASAP_NUM, + NL80211_FTM_STATS_TOTAL_DURATION_MSEC, + NL80211_FTM_STATS_UNKNOWN_TRIGGERS_NUM, + NL80211_FTM_STATS_RESCHEDULE_REQUESTS_NUM, + NL80211_FTM_STATS_OUT_OF_WINDOW_TRIGGERS_NUM, + NL80211_FTM_STATS_PAD, + + /* keep last */ + __NL80211_FTM_STATS_AFTER_LAST, + NL80211_FTM_STATS_MAX = __NL80211_FTM_STATS_AFTER_LAST - 1 +}; + #endif /* __LINUX_NL80211_H */ diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 90788ebe794e..235a43185e8d 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -201,6 +201,15 @@ cfg80211_get_dev_from_info(struct net *netns, struct genl_info *info) } /* policy for the attributes */ +static const struct nla_policy +nl80211_ftm_responder_policy[NL80211_FTM_RESP_ATTR_MAX + 1] = { + [NL80211_FTM_RESP_ATTR_ENABLED] = { .type = NLA_FLAG, }, + [NL80211_FTM_RESP_ATTR_LCI] = { .type = NLA_BINARY, + .len = U8_MAX }, + [NL80211_FTM_RESP_ATTR_CIVICLOC] = { .type = NLA_BINARY, + .len = U8_MAX }, +}; + static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_WIPHY] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_NAME] = { .type = NLA_NUL_STRING, @@ -430,6 +439,11 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { [NL80211_ATTR_TXQ_QUANTUM] = { .type = NLA_U32 }, [NL80211_ATTR_HE_CAPABILITY] = { .type = NLA_BINARY, .len = NL80211_HE_MAX_CAPABILITY_LEN }, + + [NL80211_ATTR_FTM_RESPONDER] = { + .type = NLA_NESTED, + .validation_data = nl80211_ftm_responder_policy, + }, }; /* policy for the key attributes */ @@ -3989,10 +4003,12 @@ static int validate_beacon_tx_rate(struct cfg80211_registered_device *rdev, return 0; } -static int nl80211_parse_beacon(struct nlattr *attrs[], +static int nl80211_parse_beacon(struct cfg80211_registered_device *rdev, + struct nlattr *attrs[], struct cfg80211_beacon_data *bcn) { bool haveinfo = false; + int err; if (!is_valid_ie_attr(attrs[NL80211_ATTR_BEACON_TAIL]) || !is_valid_ie_attr(attrs[NL80211_ATTR_IE]) || @@ -4043,6 +4059,35 @@ 
static int nl80211_parse_beacon(struct nlattr *attrs[], bcn->probe_resp_len = nla_len(attrs[NL80211_ATTR_PROBE_RESP]); } + if (attrs[NL80211_ATTR_FTM_RESPONDER]) { + struct nlattr *tb[NL80211_FTM_RESP_ATTR_MAX + 1]; + + err = nla_parse_nested(tb, NL80211_FTM_RESP_ATTR_MAX, + attrs[NL80211_ATTR_FTM_RESPONDER], + NULL, NULL); + if (err) + return err; + + if (tb[NL80211_FTM_RESP_ATTR_ENABLED] && + wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_ENABLE_FTM_RESPONDER)) + bcn->ftm_responder = 1; + else + return -EOPNOTSUPP; + + if (tb[NL80211_FTM_RESP_ATTR_LCI]) { + bcn->lci = nla_data(tb[NL80211_FTM_RESP_ATTR_LCI]); + bcn->lci_len = nla_len(tb[NL80211_FTM_RESP_ATTR_LCI]); + } + + if (tb[NL80211_FTM_RESP_ATTR_CIVICLOC]) { + bcn->civicloc = nla_data(tb[NL80211_FTM_RESP_ATTR_CIVICLOC]); + bcn->civicloc_len = nla_len(tb[NL80211_FTM_RESP_ATTR_CIVICLOC]); + } + } else { + bcn->ftm_responder = -1; + } + return 0; } @@ -4189,7 +4234,7 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info) !info->attrs[NL80211_ATTR_BEACON_HEAD]) return -EINVAL; - err = nl80211_parse_beacon(info->attrs, &params.beacon); + err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon); if (err) return err; @@ -4373,7 +4418,7 @@ static int nl80211_set_beacon(struct sk_buff *skb, struct genl_info *info) if (!wdev->beacon_interval) return -EINVAL; - err = nl80211_parse_beacon(info->attrs, &params); + err = nl80211_parse_beacon(rdev, info->attrs, &params); if (err) return err; @@ -7935,7 +7980,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) if (!need_new_beacon) goto skip_beacons; - err = nl80211_parse_beacon(info->attrs, &params.beacon_after); + err = nl80211_parse_beacon(rdev, info->attrs, &params.beacon_after); if (err) return err; @@ -7945,7 +7990,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) if (err) return err; - err = nl80211_parse_beacon(csa_attrs, &params.beacon_csa); + err = nl80211_parse_beacon(rdev, csa_attrs,
&params.beacon_csa); if (err) return err; @@ -12984,6 +13029,76 @@ static int nl80211_tx_control_port(struct sk_buff *skb, struct genl_info *info) return err; } +static int nl80211_get_ftm_responder_stats(struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_ftm_responder_stats ftm_stats = {}; + struct sk_buff *msg; + void *hdr; + struct nlattr *ftm_stats_attr; + int err; + + if (wdev->iftype != NL80211_IFTYPE_AP || !wdev->beacon_interval) + return -EOPNOTSUPP; + + err = rdev_get_ftm_responder_stats(rdev, dev, &ftm_stats); + if (err) + return err; + + if (!ftm_stats.filled) + return -ENODATA; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!msg) + return -ENOMEM; + + hdr = nl80211hdr_put(msg, info->snd_portid, info->snd_seq, 0, + NL80211_CMD_GET_FTM_RESPONDER_STATS); + if (!hdr) + return -ENOBUFS; + + if (nla_put_u32(msg, NL80211_ATTR_IFINDEX, dev->ifindex)) + goto nla_put_failure; + + ftm_stats_attr = nla_nest_start(msg, NL80211_ATTR_FTM_RESPONDER_STATS); + if (!ftm_stats_attr) + goto nla_put_failure; + +#define SET_FTM(field, name, type) \ + do { if ((ftm_stats.filled & BIT(NL80211_FTM_STATS_ ## name)) && \ + nla_put_ ## type(msg, NL80211_FTM_STATS_ ## name, \ + ftm_stats.field)) \ + goto nla_put_failure; } while (0) +#define SET_FTM_U64(field, name) \ + do { if ((ftm_stats.filled & BIT(NL80211_FTM_STATS_ ## name)) && \ + nla_put_u64_64bit(msg, NL80211_FTM_STATS_ ## name, \ + ftm_stats.field, NL80211_FTM_STATS_PAD)) \ + goto nla_put_failure; } while (0) + + SET_FTM(success_num, SUCCESS_NUM, u32); + SET_FTM(partial_num, PARTIAL_NUM, u32); + SET_FTM(failed_num, FAILED_NUM, u32); + SET_FTM(asap_num, ASAP_NUM, u32); + SET_FTM(non_asap_num, NON_ASAP_NUM, u32); + SET_FTM_U64(total_duration_ms, TOTAL_DURATION_MSEC); + SET_FTM(unknown_triggers_num, UNKNOWN_TRIGGERS_NUM, u32); +
SET_FTM(reschedule_requests_num, RESCHEDULE_REQUESTS_NUM, u32); + SET_FTM(out_of_window_triggers_num, OUT_OF_WINDOW_TRIGGERS_NUM, u32); +#undef SET_FTM + + nla_nest_end(msg, ftm_stats_attr); + + genlmsg_end(msg, hdr); + return genlmsg_reply(msg, info); + +nla_put_failure: + nlmsg_free(msg); + return -ENOBUFS; +} + #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 @@ -13895,6 +14010,13 @@ static const struct genl_ops nl80211_ops[] = { .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, + { + .cmd = NL80211_CMD_GET_FTM_RESPONDER_STATS, + .doit = nl80211_get_ftm_responder_stats, + .policy = nl80211_policy, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, + }, }; static struct genl_family nl80211_fam __ro_after_init = { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 364f5d67f05b..51380b5c32f2 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -1232,4 +1232,19 @@ rdev_external_auth(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct cfg80211_ftm_responder_stats *ftm_stats) +{ + int ret = -EOPNOTSUPP; + + trace_rdev_get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats); + if (rdev->ops->get_ftm_responder_stats) + ret = rdev->ops->get_ftm_responder_stats(&rdev->wiphy, dev, + ftm_stats); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + #endif /* __CFG80211_RDEV_OPS */ diff --git a/net/wireless/trace.h b/net/wireless/trace.h index e51348e24ff5..7e0380192445 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3250,6 +3250,50 @@ DEFINE_EVENT(wiphy_wdev_evt, rdev_get_txq_stats, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), TP_ARGS(wiphy, wdev) ); + +TRACE_EVENT(rdev_get_ftm_responder_stats, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, + struct 
cfg80211_ftm_responder_stats *ftm_stats), + + TP_ARGS(wiphy, netdev, ftm_stats), + + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(u64, timestamp) + __field(u32, success_num) + __field(u32, partial_num) + __field(u32, failed_num) + __field(u32, asap_num) + __field(u32, non_asap_num) + __field(u64, duration) + __field(u32, unknown_triggers) + __field(u32, reschedule) + __field(u32, out_of_window) + ), + + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->success_num = ftm_stats->success_num; + __entry->partial_num = ftm_stats->partial_num; + __entry->failed_num = ftm_stats->failed_num; + __entry->asap_num = ftm_stats->asap_num; + __entry->non_asap_num = ftm_stats->non_asap_num; + __entry->duration = ftm_stats->total_duration_ms; + __entry->unknown_triggers = ftm_stats->unknown_triggers_num; + __entry->reschedule = ftm_stats->reschedule_requests_num; + __entry->out_of_window = ftm_stats->out_of_window_triggers_num; + ), + + TP_printk(WIPHY_PR_FMT "Ftm responder stats: success %u, partial %u, " + "failed %u, asap %u, non asap %u, total duration %llu, unknown " + "triggers %u, rescheduled %u, out of window %u", WIPHY_PR_ARG, + __entry->success_num, __entry->partial_num, __entry->failed_num, + __entry->asap_num, __entry->non_asap_num, __entry->duration, + __entry->unknown_triggers, __entry->reschedule, + __entry->out_of_window) +); #endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */ #undef TRACE_INCLUDE_PATH -- cgit v1.2.3 From 6acc9b432e6714d72d7d77ec7c27f6f8358d0c71 Mon Sep 17 00:00:00 2001 From: Joe Stringer Date: Tue, 2 Oct 2018 13:35:36 -0700 Subject: bpf: Add helper to retrieve socket in BPF This patch adds new BPF helper functions, bpf_sk_lookup_tcp() and bpf_sk_lookup_udp() which allows BPF programs to find out if there is a socket listening on this host, and returns a socket pointer which the BPF program can then access to determine, for instance, whether to forward or drop traffic. 
bpf_sk_lookup_xxx() may take a reference on the socket, so when a BPF program makes use of this function, it must subsequently pass the returned pointer into the newly added bpf_sk_release() to return the reference. By way of example, the following pseudocode would filter inbound connections at XDP if there is no corresponding service listening for the traffic: struct bpf_sock_tuple tuple; struct bpf_sock *sk; populate_tuple(ctx, &tuple); // Extract the 5tuple from the packet sk = bpf_sk_lookup_tcp(ctx, &tuple, sizeof tuple, netns, 0); if (!sk) { // Couldn't find a socket listening for this traffic. Drop. return TC_ACT_SHOT; } bpf_sk_release(sk, 0); return TC_ACT_OK; Signed-off-by: Joe Stringer Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 93 +++++++++++++++++- kernel/bpf/verifier.c | 8 +- net/core/filter.c | 151 ++++++++++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 93 +++++++++++++++++- tools/testing/selftests/bpf/bpf_helpers.h | 12 +++ 5 files changed, 354 insertions(+), 3 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e2070d819e04..f9187b41dff6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2144,6 +2144,77 @@ union bpf_attr { * request in the skb. * Return * 0 on success, or a negative error in case of failure. + * + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup.
+ * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this is the netns of the device in the skb. For socket hooks, + * this is the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this is the netns of the device in the skb. For socket hooks, + * this is the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero.
+ * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * int bpf_sk_release(struct bpf_sock *sk) + * Description + * Release the reference held by *sock*. *sock* must be a non-NULL + * pointer that was returned from bpf_sk_lookup_xxx\ (). + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2229,7 +2300,10 @@ union bpf_attr { FN(get_current_cgroup_id), \ FN(get_local_storage), \ FN(sk_select_reuseport), \ - FN(skb_ancestor_cgroup_id), + FN(skb_ancestor_cgroup_id), \ + FN(sk_lookup_tcp), \ + FN(sk_lookup_udp), \ + FN(sk_release), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2399,6 +2473,23 @@ struct bpf_sock { */ }; +struct bpf_sock_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cd0d8bc00bd1..73c81bef6ae8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -153,6 +153,12 @@ static const struct bpf_verifier_ops * const bpf_verifier_ops[] = { * PTR_TO_MAP_VALUE, PTR_TO_SOCKET_OR_NULL becomes PTR_TO_SOCKET when the type * passes through a NULL-check conditional. For the branch wherein the state is * changed to CONST_IMM, the verifier releases the reference. + * + * For each helper function that allocates a reference, such as + * bpf_sk_lookup_tcp(), there is a corresponding release function, such as + * bpf_sk_release(). When a reference type passes into the release function, + * the verifier also releases the reference. 
If any unchecked or unreleased + * reference remains at the end of the program, the verifier rejects it. */ /* verifier_state + insn_idx are pushed to stack when branch is encountered */ @@ -300,7 +306,7 @@ static bool arg_type_is_refcounted(enum bpf_arg_type type) */ static bool is_release_function(enum bpf_func_id func_id) { - return false; + return func_id == BPF_FUNC_sk_release; } /* string representation of 'enum bpf_reg_type' */ diff --git a/net/core/filter.c b/net/core/filter.c index b2cb186252e4..591c698bc517 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -58,13 +58,17 @@ #include #include #include +#include #include #include #include +#include +#include #include #include #include #include +#include #include #include #include @@ -4813,6 +4817,141 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { }; #endif /* CONFIG_IPV6_SEG6_BPF */ +struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, + struct sk_buff *skb, u8 family, u8 proto) +{ + int dif = skb->dev->ifindex; + bool refcounted = false; + struct sock *sk = NULL; + + if (family == AF_INET) { + __be32 src4 = tuple->ipv4.saddr; + __be32 dst4 = tuple->ipv4.daddr; + int sdif = inet_sdif(skb); + + if (proto == IPPROTO_TCP) + sk = __inet_lookup(net, &tcp_hashinfo, skb, 0, + src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &refcounted); + else + sk = __udp4_lib_lookup(net, src4, tuple->ipv4.sport, + dst4, tuple->ipv4.dport, + dif, sdif, &udp_table, skb); +#if IS_ENABLED(CONFIG_IPV6) + } else { + struct in6_addr *src6 = (struct in6_addr *)&tuple->ipv6.saddr; + struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; + int sdif = inet6_sdif(skb); + + if (proto == IPPROTO_TCP) + sk = __inet6_lookup(net, &tcp_hashinfo, skb, 0, + src6, tuple->ipv6.sport, + dst6, tuple->ipv6.dport, + dif, sdif, &refcounted); + else + sk = __udp6_lib_lookup(net, src6, tuple->ipv6.sport, + dst6, tuple->ipv6.dport, + dif, sdif, &udp_table, skb); +#endif + } + + if 
(unlikely(sk && !refcounted && !sock_flag(sk, SOCK_RCU_FREE))) { + WARN_ONCE(1, "Found non-RCU, unreferenced socket!"); + sk = NULL; + } + return sk; +} + +/* bpf_sk_lookup performs the core lookup for different types of sockets, + * taking a reference on the socket if it doesn't have the flag SOCK_RCU_FREE. + * Returns the socket as an 'unsigned long' to simplify the casting in the + * callers to satisfy BPF_CALL declarations. + */ +static unsigned long +bpf_sk_lookup(struct sk_buff *skb, struct bpf_sock_tuple *tuple, u32 len, + u8 proto, u64 netns_id, u64 flags) +{ + struct net *caller_net; + struct sock *sk = NULL; + u8 family = AF_UNSPEC; + struct net *net; + + family = len == sizeof(tuple->ipv4) ? AF_INET : AF_INET6; + if (unlikely(family == AF_UNSPEC || netns_id > U32_MAX || flags)) + goto out; + + if (skb->dev) + caller_net = dev_net(skb->dev); + else + caller_net = sock_net(skb->sk); + if (netns_id) { + net = get_net_ns_by_id(caller_net, netns_id); + if (unlikely(!net)) + goto out; + sk = sk_lookup(net, tuple, skb, family, proto); + put_net(net); + } else { + net = caller_net; + sk = sk_lookup(net, tuple, skb, family, proto); + } + + if (sk) + sk = sk_to_full_sk(sk); +out: + return (unsigned long) sk; +} + +BPF_CALL_5(bpf_sk_lookup_tcp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return bpf_sk_lookup(skb, tuple, len, IPPROTO_TCP, netns_id, flags); +} + +static const struct bpf_func_proto bpf_sk_lookup_tcp_proto = { + .func = bpf_sk_lookup_tcp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_5(bpf_sk_lookup_udp, struct sk_buff *, skb, + struct bpf_sock_tuple *, tuple, u32, len, u64, netns_id, u64, flags) +{ + return bpf_sk_lookup(skb, tuple, len, IPPROTO_UDP, netns_id, flags); +} + +static const struct 
bpf_func_proto bpf_sk_lookup_udp_proto = { + .func = bpf_sk_lookup_udp, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_PTR_TO_SOCKET_OR_NULL, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, + .arg4_type = ARG_ANYTHING, + .arg5_type = ARG_ANYTHING, +}; + +BPF_CALL_1(bpf_sk_release, struct sock *, sk) +{ + if (!sock_flag(sk, SOCK_RCU_FREE)) + sock_gen_put(sk); + return 0; +} + +static const struct bpf_func_proto bpf_sk_release_proto = { + .func = bpf_sk_release, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_SOCKET, +}; + bool bpf_helper_changes_pkt_data(void *func) { if (func == bpf_skb_vlan_push || @@ -5019,6 +5158,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_skb_ancestor_cgroup_id: return &bpf_skb_ancestor_cgroup_id_proto; #endif + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; default: return bpf_base_func_proto(func_id); } @@ -5119,6 +5264,12 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_hash_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; + case BPF_FUNC_sk_lookup_tcp: + return &bpf_sk_lookup_tcp_proto; + case BPF_FUNC_sk_lookup_udp: + return &bpf_sk_lookup_udp_proto; + case BPF_FUNC_sk_release: + return &bpf_sk_release_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e2070d819e04..f9187b41dff6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -2144,6 +2144,77 @@ union bpf_attr { * request in the skb. * Return * 0 on success, or a negative error in case of failure. 
+ * + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this in the netns of the device in the skb. For socket hooks, + * this in the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u32 netns, u64 flags) + * Description + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-NULL, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. 
+ * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is zero, then the socket lookup table in the + * netns associated with the *ctx* will be used. For the TC hooks, + * this in the netns of the device in the skb. For socket hooks, + * this in the netns of the socket. If *netns* is non-zero, then + * it specifies the ID of the netns relative to the netns + * associated with the *ctx*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to *struct bpf_sock*, or NULL in case of failure. + * + * int bpf_sk_release(struct bpf_sock *sk) + * Description + * Release the reference held by *sock*. *sock* must be a non-NULL + * pointer that was returned from bpf_sk_lookup_xxx\ (). + * Return + * 0 on success, or a negative error in case of failure. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2229,7 +2300,10 @@ union bpf_attr { FN(get_current_cgroup_id), \ FN(get_local_storage), \ FN(sk_select_reuseport), \ - FN(skb_ancestor_cgroup_id), + FN(skb_ancestor_cgroup_id), \ + FN(sk_lookup_tcp), \ + FN(sk_lookup_udp), \ + FN(sk_release), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -2399,6 +2473,23 @@ struct bpf_sock { */ }; +struct bpf_sock_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + #define XDP_PACKET_HEADROOM 256 /* User return codes for XDP prog type. 
diff --git a/tools/testing/selftests/bpf/bpf_helpers.h b/tools/testing/selftests/bpf/bpf_helpers.h index e4be7730222d..1d407b3494f9 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -143,6 +143,18 @@ static unsigned long long (*bpf_skb_cgroup_id)(void *ctx) = (void *) BPF_FUNC_skb_cgroup_id; static unsigned long long (*bpf_skb_ancestor_cgroup_id)(void *ctx, int level) = (void *) BPF_FUNC_skb_ancestor_cgroup_id; +static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, + struct bpf_sock_tuple *tuple, + int size, unsigned int netns_id, + unsigned long long flags) = + (void *) BPF_FUNC_sk_lookup_tcp; +static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, + struct bpf_sock_tuple *tuple, + int size, unsigned int netns_id, + unsigned long long flags) = + (void *) BPF_FUNC_sk_lookup_udp; +static int (*bpf_sk_release)(struct bpf_sock *sk) = + (void *) BPF_FUNC_sk_release; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions -- cgit v1.2.3 From e9837e55b0200da544a095a1fca36efd7fd3ba30 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Mon, 1 Oct 2018 18:23:08 -0700 Subject: netfilter: xt_quota: fix the behavior of xt_quota module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A major flaw of the current xt_quota module is that quota in a specific rule gets reset every time there is a rule change in the same table. It makes the xt_quota module not very useful in a table in which iptables rules are changed at run time. This fix introduces a new counter that is visible to userspace as the remaining quota of the current rule. When userspace restores the rules in a table, it can restore the counter to the remaining quota instead of resetting it to the full quota. 
Signed-off-by: Chenbo Feng Suggested-by: Maciej Żenczykowski Reviewed-by: Maciej Żenczykowski Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_quota.h | 8 +++-- net/netfilter/xt_quota.c | 55 +++++++++++++-------------------- 2 files changed, 27 insertions(+), 36 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/xt_quota.h b/include/uapi/linux/netfilter/xt_quota.h index f3ba5d9e58b6..d72fd52adbba 100644 --- a/include/uapi/linux/netfilter/xt_quota.h +++ b/include/uapi/linux/netfilter/xt_quota.h @@ -15,9 +15,11 @@ struct xt_quota_info { __u32 flags; __u32 pad; __aligned_u64 quota; - - /* Used internally by the kernel */ - struct xt_quota_priv *master; +#ifdef __KERNEL__ + atomic64_t counter; +#else + __aligned_u64 remain; +#endif }; #endif /* _XT_QUOTA_H */ diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index 10d61a6eed71..6afa7f468a73 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -11,11 +11,6 @@ #include #include -struct xt_quota_priv { - spinlock_t lock; - uint64_t quota; -}; - MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sam Johnston "); MODULE_DESCRIPTION("Xtables: countdown quota match"); @@ -26,54 +21,48 @@ static bool quota_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct xt_quota_info *q = (void *)par->matchinfo; - struct xt_quota_priv *priv = q->master; + u64 current_count = atomic64_read(&q->counter); bool ret = q->flags & XT_QUOTA_INVERT; - - spin_lock_bh(&priv->lock); - if (priv->quota >= skb->len) { - priv->quota -= skb->len; - ret = !ret; - } else { - /* we do not allow even small packets from now on */ - priv->quota = 0; - } - spin_unlock_bh(&priv->lock); - - return ret; + u64 old_count, new_count; + + do { + if (current_count == 1) + return ret; + if (current_count <= skb->len) { + atomic64_set(&q->counter, 1); + return ret; + } + old_count = current_count; + new_count = current_count - skb->len; + current_count = atomic64_cmpxchg(&q->counter, 
old_count, + new_count); + } while (current_count != old_count); + return !ret; } static int quota_mt_check(const struct xt_mtchk_param *par) { struct xt_quota_info *q = par->matchinfo; + BUILD_BUG_ON(sizeof(atomic64_t) != sizeof(__aligned_u64)); + if (q->flags & ~XT_QUOTA_MASK) return -EINVAL; + if (atomic64_read(&q->counter) > q->quota + 1) + return -ERANGE; - q->master = kmalloc(sizeof(*q->master), GFP_KERNEL); - if (q->master == NULL) - return -ENOMEM; - - spin_lock_init(&q->master->lock); - q->master->quota = q->quota; + if (atomic64_read(&q->counter) == 0) + atomic64_set(&q->counter, q->quota + 1); return 0; } -static void quota_mt_destroy(const struct xt_mtdtor_param *par) -{ - const struct xt_quota_info *q = par->matchinfo; - - kfree(q->master); -} - static struct xt_match quota_mt_reg __read_mostly = { .name = "quota", .revision = 0, .family = NFPROTO_UNSPEC, .match = quota_mt, .checkentry = quota_mt_check, - .destroy = quota_mt_destroy, .matchsize = sizeof(struct xt_quota_info), - .usersize = offsetof(struct xt_quota_info, master), .me = THIS_MODULE, }; -- cgit v1.2.3 From bbb4c4323a4d9cb5ca04db904aa3050a7586839a Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 4 Oct 2018 14:27:55 +0100 Subject: dns: Allow the dns resolver to retrieve a server set Allow the DNS resolver to retrieve a set of servers and their associated addresses, ports, preference and weight ratings. In terms of communication with userspace, "srv=1" is added to the callout string (the '1' indicating the maximum data version supported by the kernel) to ask the userspace side for this. If the userspace side doesn't recognise it, it will ignore the option and return the usual text address list. If the userspace side does recognise it, it will return some binary data that begins with a zero byte that would cause the string parsers to give an error. The second byte identifies the content type and the third byte contains the version of the data in the blob (this may be between 1 and the version specified in the callout data).
The remainder of the payload is version-specific. In version 1, the payload looks like (note that this is packed): u8 Non-string marker (ie. 0) u8 Content (0 => Server list) u8 Version (ie. 1) u8 Source (eg. DNS_RECORD_FROM_DNS_SRV) u8 Status (eg. DNS_LOOKUP_GOOD) u8 Number of servers foreach-server { u16 Name length (LE) u16 Priority (as per SRV record) (LE) u16 Weight (as per SRV record) (LE) u16 Port (LE) u8 Source (eg. DNS_RECORD_FROM_NSS) u8 Status (eg. DNS_LOOKUP_GOT_NOT_FOUND) u8 Protocol (eg. DNS_SERVER_PROTOCOL_UDP) u8 Number of addresses char[] Name (not NUL-terminated) foreach-address { u8 Family (AF_INET{,6}) union { u8[4] ipv4_addr u8[16] ipv6_addr } } } This can then be used to fetch a whole cell's VL-server configuration for AFS, for example. Signed-off-by: David Howells Signed-off-by: David S. Miller --- include/linux/dns_resolver.h | 4 +- include/uapi/linux/dns_resolver.h | 116 ++++++++++++++++++++++++++++++++++++++ net/dns_resolver/dns_key.c | 67 +++++++++++++++++++++- net/dns_resolver/dns_query.c | 5 +- 4 files changed, 182 insertions(+), 10 deletions(-) create mode 100644 include/uapi/linux/dns_resolver.h (limited to 'include/uapi') diff --git a/include/linux/dns_resolver.h b/include/linux/dns_resolver.h index 6ac3cad9aef1..34a744a1bafc 100644 --- a/include/linux/dns_resolver.h +++ b/include/linux/dns_resolver.h @@ -24,11 +24,9 @@ #ifndef _LINUX_DNS_RESOLVER_H #define _LINUX_DNS_RESOLVER_H -#ifdef __KERNEL__ +#include extern int dns_query(const char *type, const char *name, size_t namelen, const char *options, char **_result, time64_t *_expiry); -#endif /* KERNEL */ - #endif /* _LINUX_DNS_RESOLVER_H */ diff --git a/include/uapi/linux/dns_resolver.h b/include/uapi/linux/dns_resolver.h new file mode 100644 index 000000000000..129745f9c794 --- /dev/null +++ b/include/uapi/linux/dns_resolver.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* DNS resolver interface definitions. 
+ * + * Copyright (C) 2018 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_DNS_RESOLVER_H +#define _UAPI_LINUX_DNS_RESOLVER_H + +#include + +/* + * Type of payload. + */ +enum dns_payload_content_type { + DNS_PAYLOAD_IS_SERVER_LIST = 0, /* List of servers, requested by srv=1 */ +}; + +/* + * Type of address that might be found in an address record. + */ +enum dns_payload_address_type { + DNS_ADDRESS_IS_IPV4 = 0, /* 4-byte AF_INET address */ + DNS_ADDRESS_IS_IPV6 = 1, /* 16-byte AF_INET6 address */ +}; + +/* + * Type of protocol used to access a server. + */ +enum dns_payload_protocol_type { + DNS_SERVER_PROTOCOL_UNSPECIFIED = 0, + DNS_SERVER_PROTOCOL_UDP = 1, /* Use UDP to talk to the server */ + DNS_SERVER_PROTOCOL_TCP = 2, /* Use TCP to talk to the server */ +}; + +/* + * Source of record included in DNS resolver payload. + */ +enum dns_record_source { + DNS_RECORD_UNAVAILABLE = 0, /* No source available (empty record) */ + DNS_RECORD_FROM_CONFIG = 1, /* From local configuration data */ + DNS_RECORD_FROM_DNS_A = 2, /* From DNS A or AAAA record */ + DNS_RECORD_FROM_DNS_AFSDB = 3, /* From DNS AFSDB record */ + DNS_RECORD_FROM_DNS_SRV = 4, /* From DNS SRV record */ + DNS_RECORD_FROM_NSS = 5, /* From NSS */ + NR__dns_record_source +}; + +/* + * Status of record included in DNS resolver payload. 
+ */ +enum dns_lookup_status { + DNS_LOOKUP_NOT_DONE = 0, /* No lookup has been made */ + DNS_LOOKUP_GOOD = 1, /* Good records obtained */ + DNS_LOOKUP_GOOD_WITH_BAD = 2, /* Good records, some decoding errors */ + DNS_LOOKUP_BAD = 3, /* Couldn't decode results */ + DNS_LOOKUP_GOT_NOT_FOUND = 4, /* Got a "Not Found" result */ + DNS_LOOKUP_GOT_LOCAL_FAILURE = 5, /* Local failure during lookup */ + DNS_LOOKUP_GOT_TEMP_FAILURE = 6, /* Temporary failure during lookup */ + DNS_LOOKUP_GOT_NS_FAILURE = 7, /* Name server failure */ + NR__dns_lookup_status +}; + +/* + * Header at the beginning of binary format payload. + */ +struct dns_payload_header { + __u8 zero; /* Zero byte: marks this as not being text */ + __u8 content; /* enum dns_payload_content_type */ + __u8 version; /* Encoding version */ +} __packed; + +/* + * Header at the beginning of a V1 server list. This is followed directly by + * the server records. Each server records begins with a struct of type + * dns_server_list_v1_server. + */ +struct dns_server_list_v1_header { + struct dns_payload_header hdr; + __u8 source; /* enum dns_record_source */ + __u8 status; /* enum dns_lookup_status */ + __u8 nr_servers; /* Number of server records following this */ +} __packed; + +/* + * Header at the beginning of each V1 server record. This is followed by the + * characters of the name with no NUL-terminator, followed by the address + * records for that server. Each address record begins with a struct of type + * struct dns_server_list_v1_address. + */ +struct dns_server_list_v1_server { + __u16 name_len; /* Length of name (LE) */ + __u16 priority; /* Priority (as SRV record) (LE) */ + __u16 weight; /* Weight (as SRV record) (LE) */ + __u16 port; /* UDP/TCP port number (LE) */ + __u8 source; /* enum dns_record_source */ + __u8 status; /* enum dns_lookup_status */ + __u8 protocol; /* enum dns_payload_protocol_type */ + __u8 nr_addrs; +} __packed; + +/* + * Header at the beginning of each V1 address record. 
This is followed by the + * bytes of the address, 4 for IPV4 and 16 for IPV6. + */ +struct dns_server_list_v1_address { + __u8 address_type; /* enum dns_payload_address_type */ +} __packed; + +#endif /* _UAPI_LINUX_DNS_RESOLVER_H */ diff --git a/net/dns_resolver/dns_key.c b/net/dns_resolver/dns_key.c index 7f4534828f6c..a65d553e730d 100644 --- a/net/dns_resolver/dns_key.c +++ b/net/dns_resolver/dns_key.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -48,27 +49,86 @@ const struct cred *dns_resolver_cache; /* * Preparse instantiation data for a dns_resolver key. * - * The data must be a NUL-terminated string, with the NUL char accounted in - * datalen. + * For normal hostname lookups, the data must be a NUL-terminated string, with + * the NUL char accounted in datalen. * * If the data contains a '#' characters, then we take the clause after each * one to be an option of the form 'key=value'. The actual data of interest is * the string leading up to the first '#'. For instance: * * "ip1,ip2,...#foo=bar" + * + * For server list requests, the data must begin with a NUL char and be + * followed by a byte indicating the version of the data format. Version 1 + * looks something like (note this is packed): + * + * u8 Non-string marker (ie. 0) + * u8 Content (DNS_PAYLOAD_IS_*) + * u8 Version (e.g. 
1) + * u8 Source of server list + * u8 Lookup status of server list + * u8 Number of servers + * foreach-server { + * __le16 Name length + * __le16 Priority (as per SRV record, low first) + * __le16 Weight (as per SRV record, higher first) + * __le16 Port + * u8 Source of address list + * u8 Lookup status of address list + * u8 Protocol (DNS_SERVER_PROTOCOL_*) + * u8 Number of addresses + * char[] Name (not NUL-terminated) + * foreach-address { + * u8 Family (DNS_ADDRESS_IS_*) + * union { + * u8[4] ipv4_addr + * u8[16] ipv6_addr + * } + * } + * } + * */ static int dns_resolver_preparse(struct key_preparsed_payload *prep) { + const struct dns_payload_header *bin; struct user_key_payload *upayload; unsigned long derrno; int ret; int datalen = prep->datalen, result_len = 0; const char *data = prep->data, *end, *opt; + if (datalen <= 1 || !data) + return -EINVAL; + + if (data[0] == 0) { + /* It may be a server list. */ + if (datalen <= sizeof(*bin)) + return -EINVAL; + + bin = (const struct dns_payload_header *)data; + kenter("[%u,%u],%u", bin->content, bin->version, datalen); + if (bin->content != DNS_PAYLOAD_IS_SERVER_LIST) { + pr_warn_ratelimited( + "dns_resolver: Unsupported content type (%u)\n", + bin->content); + return -EINVAL; + } + + if (bin->version != 1) { + pr_warn_ratelimited( + "dns_resolver: Unsupported server list version (%u)\n", + bin->version); + return -EINVAL; + } + + result_len = datalen; + goto store_result; + } + kenter("'%*.*s',%u", datalen, datalen, data, datalen); - if (datalen <= 1 || !data || data[datalen - 1] != '\0') + if (!data || data[datalen - 1] != '\0') return -EINVAL; datalen--; @@ -144,6 +204,7 @@ dns_resolver_preparse(struct key_preparsed_payload *prep) return 0; } +store_result: kdebug("store result"); prep->quotalen = result_len; diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c index 49da67034f29..76338c38738a 100644 --- a/net/dns_resolver/dns_query.c +++ b/net/dns_resolver/dns_query.c @@ -148,12 +148,9 
@@ int dns_query(const char *type, const char *name, size_t namelen, if (_result) { ret = -ENOMEM; - *_result = kmalloc(len + 1, GFP_KERNEL); + *_result = kmemdup_nul(upayload->data, len, GFP_KERNEL); if (!*_result) goto put; - - memcpy(*_result, upayload->data, len); - (*_result)[len] = '\0'; } if (_expiry) -- cgit v1.2.3 From 5a781ccbd19e4664babcbe4b4ead7aa2b9283d22 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Fri, 28 Sep 2018 17:59:43 -0700 Subject: tc: Add support for configuring the taprio scheduler This traffic scheduler allows traffic classes states (transmission allowed/not allowed, in the simplest case) to be scheduled, according to a pre-generated time sequence. This is the basis of the IEEE 802.1Qbv specification. Example configuration: tc qdisc replace dev enp3s0 parent root handle 100 taprio \ num_tc 3 \ map 2 2 1 0 2 2 2 2 2 2 2 2 2 2 2 2 \ queues 1@0 1@1 2@2 \ base-time 1528743495910289987 \ sched-entry S 01 300000 \ sched-entry S 02 300000 \ sched-entry S 04 300000 \ clockid CLOCK_TAI The configuration format is similar to mqprio. The main difference is the presence of a schedule, built by multiple "sched-entry" definitions, each entry has the following format: sched-entry The only supported is "S", which means "SetGateStates", following the IEEE 802.1Qbv-2015 definition (Table 8-6). is a bitmask where each bit is a associated with a traffic class, so bit 0 (the least significant bit) being "on" means that traffic class 0 is "active" for that schedule entry. is a time duration in nanoseconds that specifies for how long that state defined by and should be held before moving to the next entry. This schedule is circular, that is, after the last entry is executed it starts from the first one, indefinitely. 
The other parameters can be defined as follows: - base-time: specifies the instant when the schedule starts, if 'base-time' is a time in the past, the schedule will start at base-time + (N * cycle-time) where N is the smallest integer so the resulting time is greater than "now", and "cycle-time" is the sum of all the intervals of the entries in the schedule; - clockid: specifies the reference clock to be used; The parameters should be similar to what the IEEE 802.1Q family of specification defines. Signed-off-by: Vinicius Costa Gomes Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 46 ++ net/sched/Kconfig | 11 + net/sched/Makefile | 1 + net/sched/sch_taprio.c | 962 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 1020 insertions(+) create mode 100644 net/sched/sch_taprio.c (limited to 'include/uapi') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index e9b7244ac381..89ee47c2f17d 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -1084,4 +1084,50 @@ enum { CAKE_ATM_MAX }; + +/* TAPRIO */ +enum { + TC_TAPRIO_CMD_SET_GATES = 0x00, + TC_TAPRIO_CMD_SET_AND_HOLD = 0x01, + TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02, +}; + +enum { + TCA_TAPRIO_SCHED_ENTRY_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */ + TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */ + __TCA_TAPRIO_SCHED_ENTRY_MAX, +}; +#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1) + +/* The format for schedule entry list is: + * [TCA_TAPRIO_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_SCHED_ENTRY] + * [TCA_TAPRIO_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] + */ +enum { + TCA_TAPRIO_SCHED_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY, + __TCA_TAPRIO_SCHED_MAX, +}; + +#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1) + +enum { + TCA_TAPRIO_ATTR_UNSPEC, + TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt 
*/ + TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */ + TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ + TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ + TCA_TAPRIO_PAD, + __TCA_TAPRIO_ATTR_MAX, +}; + +#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1) + #endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index e95741388311..1b9afdee5ba9 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -194,6 +194,17 @@ config NET_SCH_ETF To compile this code as a module, choose M here: the module will be called sch_etf. +config NET_SCH_TAPRIO + tristate "Time Aware Priority (taprio) Scheduler" + help + Say Y here if you want to use the Time Aware Priority (taprio) packet + scheduling algorithm. + + See the top of for more details. + + To compile this code as a module, choose M here: the + module will be called sch_taprio. + config NET_SCH_GRED tristate "Generic Random Early Detection (GRED)" ---help--- diff --git a/net/sched/Makefile b/net/sched/Makefile index f0403f49edcb..8a40431d7b5c 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -57,6 +57,7 @@ obj-$(CONFIG_NET_SCH_HHF) += sch_hhf.o obj-$(CONFIG_NET_SCH_PIE) += sch_pie.o obj-$(CONFIG_NET_SCH_CBS) += sch_cbs.o obj-$(CONFIG_NET_SCH_ETF) += sch_etf.o +obj-$(CONFIG_NET_SCH_TAPRIO) += sch_taprio.o obj-$(CONFIG_NET_CLS_U32) += cls_u32.o obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c new file mode 100644 index 000000000000..206e4dbed12f --- /dev/null +++ b/net/sched/sch_taprio.c @@ -0,0 +1,962 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* net/sched/sch_taprio.c Time Aware Priority Scheduler + * + * Authors: Vinicius Costa Gomes + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TAPRIO_ALL_GATES_OPEN -1 + +struct sched_entry { + struct list_head list; + + /* The instant that this entry "closes" 
and the next one + * should open, the qdisc will make some effort so that no + * packet leaves after this time. + */ + ktime_t close_time; + atomic_t budget; + int index; + u32 gate_mask; + u32 interval; + u8 command; +}; + +struct taprio_sched { + struct Qdisc **qdiscs; + struct Qdisc *root; + s64 base_time; + int clockid; + int picos_per_byte; /* Using picoseconds because for 10Gbps+ + * speeds it's sub-nanoseconds per byte + */ + size_t num_entries; + + /* Protects the update side of the RCU protected current_entry */ + spinlock_t current_entry_lock; + struct sched_entry __rcu *current_entry; + struct list_head entries; + ktime_t (*get_time)(void); + struct hrtimer advance_timer; +}; + +static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, + struct sk_buff **to_free) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct Qdisc *child; + int queue; + + queue = skb_get_queue_mapping(skb); + + child = q->qdiscs[queue]; + if (unlikely(!child)) + return qdisc_drop(skb, sch, to_free); + + qdisc_qstats_backlog_inc(sch, skb); + sch->q.qlen++; + + return qdisc_enqueue(skb, child, to_free); +} + +static struct sk_buff *taprio_peek(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct sched_entry *entry; + struct sk_buff *skb; + u32 gate_mask; + int i; + + rcu_read_lock(); + entry = rcu_dereference(q->current_entry); + gate_mask = entry ? 
entry->gate_mask : -1; + rcu_read_unlock(); + + if (!gate_mask) + return NULL; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct Qdisc *child = q->qdiscs[i]; + int prio; + u8 tc; + + if (unlikely(!child)) + continue; + + skb = child->ops->peek(child); + if (!skb) + continue; + + prio = skb->priority; + tc = netdev_get_prio_tc_map(dev, prio); + + if (!(gate_mask & BIT(tc))) + return NULL; + + return skb; + } + + return NULL; +} + +static inline int length_to_duration(struct taprio_sched *q, int len) +{ + return (len * q->picos_per_byte) / 1000; +} + +static struct sk_buff *taprio_dequeue(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct sched_entry *entry; + struct sk_buff *skb; + u32 gate_mask; + int i; + + rcu_read_lock(); + entry = rcu_dereference(q->current_entry); + /* if there's no entry, it means that the schedule didn't + * start yet, so force all gates to be open, this is in + * accordance to IEEE 802.1Qbv-2015 Section 8.6.9.4.5 + * "AdminGateSates" + */ + gate_mask = entry ? entry->gate_mask : TAPRIO_ALL_GATES_OPEN; + rcu_read_unlock(); + + if (!gate_mask) + return NULL; + + for (i = 0; i < dev->num_tx_queues; i++) { + struct Qdisc *child = q->qdiscs[i]; + ktime_t guard; + int prio; + int len; + u8 tc; + + if (unlikely(!child)) + continue; + + skb = child->ops->peek(child); + if (!skb) + continue; + + prio = skb->priority; + tc = netdev_get_prio_tc_map(dev, prio); + + if (!(gate_mask & BIT(tc))) + continue; + + len = qdisc_pkt_len(skb); + guard = ktime_add_ns(q->get_time(), + length_to_duration(q, len)); + + /* In the case that there's no gate entry, there's no + * guard band ... + */ + if (gate_mask != TAPRIO_ALL_GATES_OPEN && + ktime_after(guard, entry->close_time)) + return NULL; + + /* ... and no budget. 
*/ + if (gate_mask != TAPRIO_ALL_GATES_OPEN && + atomic_sub_return(len, &entry->budget) < 0) + return NULL; + + skb = child->ops->dequeue(child); + if (unlikely(!skb)) + return NULL; + + qdisc_bstats_update(sch, skb); + qdisc_qstats_backlog_dec(sch, skb); + sch->q.qlen--; + + return skb; + } + + return NULL; +} + +static bool should_restart_cycle(const struct taprio_sched *q, + const struct sched_entry *entry) +{ + WARN_ON(!entry); + + return list_is_last(&entry->list, &q->entries); +} + +static enum hrtimer_restart advance_sched(struct hrtimer *timer) +{ + struct taprio_sched *q = container_of(timer, struct taprio_sched, + advance_timer); + struct sched_entry *entry, *next; + struct Qdisc *sch = q->root; + ktime_t close_time; + + spin_lock(&q->current_entry_lock); + entry = rcu_dereference_protected(q->current_entry, + lockdep_is_held(&q->current_entry_lock)); + + /* This is the case that it's the first time that the schedule + * runs, so it only happens once per schedule. The first entry + * is pre-calculated during the schedule initialization. 
+ */ + if (unlikely(!entry)) { + next = list_first_entry(&q->entries, struct sched_entry, + list); + close_time = next->close_time; + goto first_run; + } + + if (should_restart_cycle(q, entry)) + next = list_first_entry(&q->entries, struct sched_entry, + list); + else + next = list_next_entry(entry, list); + + close_time = ktime_add_ns(entry->close_time, next->interval); + + next->close_time = close_time; + atomic_set(&next->budget, + (next->interval * 1000) / q->picos_per_byte); + +first_run: + rcu_assign_pointer(q->current_entry, next); + spin_unlock(&q->current_entry_lock); + + hrtimer_set_expires(&q->advance_timer, close_time); + + rcu_read_lock(); + __netif_schedule(sch); + rcu_read_unlock(); + + return HRTIMER_RESTART; +} + +static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { + [TCA_TAPRIO_SCHED_ENTRY_INDEX] = { .type = NLA_U32 }, + [TCA_TAPRIO_SCHED_ENTRY_CMD] = { .type = NLA_U8 }, + [TCA_TAPRIO_SCHED_ENTRY_GATE_MASK] = { .type = NLA_U32 }, + [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, +}; + +static const struct nla_policy entry_list_policy[TCA_TAPRIO_SCHED_MAX + 1] = { + [TCA_TAPRIO_SCHED_ENTRY] = { .type = NLA_NESTED }, +}; + +static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { + [TCA_TAPRIO_ATTR_PRIOMAP] = { + .len = sizeof(struct tc_mqprio_qopt) + }, + [TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] = { .type = NLA_NESTED }, + [TCA_TAPRIO_ATTR_SCHED_BASE_TIME] = { .type = NLA_S64 }, + [TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] = { .type = NLA_NESTED }, + [TCA_TAPRIO_ATTR_SCHED_CLOCKID] = { .type = NLA_S32 }, +}; + +static int fill_sched_entry(struct nlattr **tb, struct sched_entry *entry, + struct netlink_ext_ack *extack) +{ + u32 interval = 0; + + if (tb[TCA_TAPRIO_SCHED_ENTRY_CMD]) + entry->command = nla_get_u8( + tb[TCA_TAPRIO_SCHED_ENTRY_CMD]); + + if (tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]) + entry->gate_mask = nla_get_u32( + tb[TCA_TAPRIO_SCHED_ENTRY_GATE_MASK]); + + if (tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]) 
+ interval = nla_get_u32( + tb[TCA_TAPRIO_SCHED_ENTRY_INTERVAL]); + + if (interval == 0) { + NL_SET_ERR_MSG(extack, "Invalid interval for schedule entry"); + return -EINVAL; + } + + entry->interval = interval; + + return 0; +} + +static int parse_sched_entry(struct nlattr *n, struct sched_entry *entry, + int index, struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; + int err; + + err = nla_parse_nested(tb, TCA_TAPRIO_SCHED_ENTRY_MAX, n, + entry_policy, NULL); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + entry->index = index; + + return fill_sched_entry(tb, entry, extack); +} + +/* Returns the number of entries in case of success */ +static int parse_sched_single_entry(struct nlattr *n, + struct taprio_sched *q, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb_entry[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { }; + struct nlattr *tb_list[TCA_TAPRIO_SCHED_MAX + 1] = { }; + struct sched_entry *entry; + bool found = false; + u32 index; + int err; + + err = nla_parse_nested(tb_list, TCA_TAPRIO_SCHED_MAX, + n, entry_list_policy, NULL); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + if (!tb_list[TCA_TAPRIO_SCHED_ENTRY]) { + NL_SET_ERR_MSG(extack, "Single-entry must include an entry"); + return -EINVAL; + } + + err = nla_parse_nested(tb_entry, TCA_TAPRIO_SCHED_ENTRY_MAX, + tb_list[TCA_TAPRIO_SCHED_ENTRY], + entry_policy, NULL); + if (err < 0) { + NL_SET_ERR_MSG(extack, "Could not parse nested entry"); + return -EINVAL; + } + + if (!tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]) { + NL_SET_ERR_MSG(extack, "Entry must specify an index\n"); + return -EINVAL; + } + + index = nla_get_u32(tb_entry[TCA_TAPRIO_SCHED_ENTRY_INDEX]); + if (index >= q->num_entries) { + NL_SET_ERR_MSG(extack, "Index for single entry exceeds number of entries in schedule"); + return -EINVAL; + } + + list_for_each_entry(entry, &q->entries, list) { + if 
(entry->index == index) { + found = true; + break; + } + } + + if (!found) { + NL_SET_ERR_MSG(extack, "Could not find entry"); + return -ENOENT; + } + + err = fill_sched_entry(tb_entry, entry, extack); + if (err < 0) + return err; + + return q->num_entries; +} + +static int parse_sched_list(struct nlattr *list, + struct taprio_sched *q, + struct netlink_ext_ack *extack) +{ + struct nlattr *n; + int err, rem; + int i = 0; + + if (!list) + return -EINVAL; + + nla_for_each_nested(n, list, rem) { + struct sched_entry *entry; + + if (nla_type(n) != TCA_TAPRIO_SCHED_ENTRY) { + NL_SET_ERR_MSG(extack, "Attribute is not of type 'entry'"); + continue; + } + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + NL_SET_ERR_MSG(extack, "Not enough memory for entry"); + return -ENOMEM; + } + + err = parse_sched_entry(n, entry, i, extack); + if (err < 0) { + kfree(entry); + return err; + } + + list_add_tail(&entry->list, &q->entries); + i++; + } + + q->num_entries = i; + + return i; +} + +/* Returns the number of entries in case of success */ +static int parse_taprio_opt(struct nlattr **tb, struct taprio_sched *q, + struct netlink_ext_ack *extack) +{ + int err = 0; + int clockid; + + if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST] && + tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) + return -EINVAL; + + if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY] && q->num_entries == 0) + return -EINVAL; + + if (q->clockid == -1 && !tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) + return -EINVAL; + + if (tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]) + q->base_time = nla_get_s64( + tb[TCA_TAPRIO_ATTR_SCHED_BASE_TIME]); + + if (tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]) { + clockid = nla_get_s32(tb[TCA_TAPRIO_ATTR_SCHED_CLOCKID]); + + /* We only support static clockids and we don't allow + * for it to be modified after the first init. 
+ */ + if (clockid < 0 || (q->clockid != -1 && q->clockid != clockid)) + return -EINVAL; + + q->clockid = clockid; + } + + if (tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST]) + err = parse_sched_list( + tb[TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST], q, extack); + else if (tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY]) + err = parse_sched_single_entry( + tb[TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY], q, extack); + + /* parse_sched_* return the number of entries in the schedule, + * a schedule with zero entries is an error. + */ + if (err == 0) { + NL_SET_ERR_MSG(extack, "The schedule should contain at least one entry"); + return -EINVAL; + } + + return err; +} + +static int taprio_parse_mqprio_opt(struct net_device *dev, + struct tc_mqprio_qopt *qopt, + struct netlink_ext_ack *extack) +{ + int i, j; + + if (!qopt) { + NL_SET_ERR_MSG(extack, "'mqprio' configuration is necessary"); + return -EINVAL; + } + + /* Verify num_tc is not out of max range */ + if (qopt->num_tc > TC_MAX_QUEUE) { + NL_SET_ERR_MSG(extack, "Number of traffic classes is outside valid range"); + return -EINVAL; + } + + /* taprio imposes that traffic classes map 1:n to tx queues */ + if (qopt->num_tc > dev->num_tx_queues) { + NL_SET_ERR_MSG(extack, "Number of traffic classes is greater than number of HW queues"); + return -EINVAL; + } + + /* Verify priority mapping uses valid tcs */ + for (i = 0; i < TC_BITMASK + 1; i++) { + if (qopt->prio_tc_map[i] >= qopt->num_tc) { + NL_SET_ERR_MSG(extack, "Invalid traffic class in priority to traffic class mapping"); + return -EINVAL; + } + } + + for (i = 0; i < qopt->num_tc; i++) { + unsigned int last = qopt->offset[i] + qopt->count[i]; + + /* Verify the queue count is in tx range being equal to the + * real_num_tx_queues indicates the last queue is in use. 
+ */ + if (qopt->offset[i] >= dev->num_tx_queues || + !qopt->count[i] || + last > dev->real_num_tx_queues) { + NL_SET_ERR_MSG(extack, "Invalid queue in traffic class to queue mapping"); + return -EINVAL; + } + + /* Verify that the offset and counts do not overlap */ + for (j = i + 1; j < qopt->num_tc; j++) { + if (last > qopt->offset[j]) { + NL_SET_ERR_MSG(extack, "Detected overlap in the traffic class to queue mapping"); + return -EINVAL; + } + } + } + + return 0; +} + +static ktime_t taprio_get_start_time(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct sched_entry *entry; + ktime_t now, base, cycle; + s64 n; + + base = ns_to_ktime(q->base_time); + cycle = 0; + + /* Calculate the cycle_time, by summing all the intervals. + */ + list_for_each_entry(entry, &q->entries, list) + cycle = ktime_add_ns(cycle, entry->interval); + + if (!cycle) + return base; + + now = q->get_time(); + + if (ktime_after(base, now)) + return base; + + /* Schedule the start time for the beginning of the next + * cycle. 
+ */ + n = div64_s64(ktime_sub_ns(now, base), cycle); + + return ktime_add_ns(base, (n + 1) * cycle); +} + +static void taprio_start_sched(struct Qdisc *sch, ktime_t start) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct sched_entry *first; + unsigned long flags; + + spin_lock_irqsave(&q->current_entry_lock, flags); + + first = list_first_entry(&q->entries, struct sched_entry, + list); + + first->close_time = ktime_add_ns(start, first->interval); + atomic_set(&first->budget, + (first->interval * 1000) / q->picos_per_byte); + rcu_assign_pointer(q->current_entry, NULL); + + spin_unlock_irqrestore(&q->current_entry_lock, flags); + + hrtimer_start(&q->advance_timer, start, HRTIMER_MODE_ABS); +} + +static int taprio_change(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TAPRIO_ATTR_MAX + 1] = { }; + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_mqprio_qopt *mqprio = NULL; + struct ethtool_link_ksettings ecmd; + int i, err, size; + s64 link_speed; + ktime_t start; + + err = nla_parse_nested(tb, TCA_TAPRIO_ATTR_MAX, opt, + taprio_policy, extack); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_TAPRIO_ATTR_PRIOMAP]) + mqprio = nla_data(tb[TCA_TAPRIO_ATTR_PRIOMAP]); + + err = taprio_parse_mqprio_opt(dev, mqprio, extack); + if (err < 0) + return err; + + /* A schedule with less than one entry is an error */ + size = parse_taprio_opt(tb, q, extack); + if (size < 0) + return size; + + hrtimer_init(&q->advance_timer, q->clockid, HRTIMER_MODE_ABS); + q->advance_timer.function = advance_sched; + + switch (q->clockid) { + case CLOCK_REALTIME: + q->get_time = ktime_get_real; + break; + case CLOCK_MONOTONIC: + q->get_time = ktime_get; + break; + case CLOCK_BOOTTIME: + q->get_time = ktime_get_boottime; + break; + case CLOCK_TAI: + q->get_time = ktime_get_clocktai; + break; + default: + return -ENOTSUPP; + } + + for (i = 0; i < dev->num_tx_queues; i++) { + struct 
netdev_queue *dev_queue; + struct Qdisc *qdisc; + + dev_queue = netdev_get_tx_queue(dev, i); + qdisc = qdisc_create_dflt(dev_queue, + &pfifo_qdisc_ops, + TC_H_MAKE(TC_H_MAJ(sch->handle), + TC_H_MIN(i + 1)), + extack); + if (!qdisc) + return -ENOMEM; + + if (i < dev->real_num_tx_queues) + qdisc_hash_add(qdisc, false); + + q->qdiscs[i] = qdisc; + } + + if (mqprio) { + netdev_set_num_tc(dev, mqprio->num_tc); + for (i = 0; i < mqprio->num_tc; i++) + netdev_set_tc_queue(dev, i, + mqprio->count[i], + mqprio->offset[i]); + + /* Always use supplied priority mappings */ + for (i = 0; i < TC_BITMASK + 1; i++) + netdev_set_prio_tc_map(dev, i, + mqprio->prio_tc_map[i]); + } + + if (!__ethtool_get_link_ksettings(dev, &ecmd)) + link_speed = ecmd.base.speed; + else + link_speed = SPEED_1000; + + q->picos_per_byte = div64_s64(NSEC_PER_SEC * 1000LL * 8, + link_speed * 1000 * 1000); + + start = taprio_get_start_time(sch); + if (!start) + return 0; + + taprio_start_sched(sch, start); + + return 0; +} + +static void taprio_destroy(struct Qdisc *sch) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct sched_entry *entry, *n; + unsigned int i; + + hrtimer_cancel(&q->advance_timer); + + if (q->qdiscs) { + for (i = 0; i < dev->num_tx_queues && q->qdiscs[i]; i++) + qdisc_put(q->qdiscs[i]); + + kfree(q->qdiscs); + } + q->qdiscs = NULL; + + netdev_set_num_tc(dev, 0); + + list_for_each_entry_safe(entry, n, &q->entries, list) { + list_del(&entry->list); + kfree(entry); + } +} + +static int taprio_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + + INIT_LIST_HEAD(&q->entries); + spin_lock_init(&q->current_entry_lock); + + /* We may overwrite the configuration later */ + hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); + + q->root = sch; + + /* We only support static clockids. 
Use an invalid value as default + * and get the valid one on taprio_change(). + */ + q->clockid = -1; + + if (sch->parent != TC_H_ROOT) + return -EOPNOTSUPP; + + if (!netif_is_multiqueue(dev)) + return -EOPNOTSUPP; + + /* pre-allocate qdisc, attachment can't fail */ + q->qdiscs = kcalloc(dev->num_tx_queues, + sizeof(q->qdiscs[0]), + GFP_KERNEL); + + if (!q->qdiscs) + return -ENOMEM; + + if (!opt) + return -EINVAL; + + return taprio_change(sch, opt, extack); +} + +static struct netdev_queue *taprio_queue_get(struct Qdisc *sch, + unsigned long cl) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx = cl - 1; + + if (ntx >= dev->num_tx_queues) + return NULL; + + return netdev_get_tx_queue(dev, ntx); +} + +static int taprio_graft(struct Qdisc *sch, unsigned long cl, + struct Qdisc *new, struct Qdisc **old, + struct netlink_ext_ack *extack) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + if (!dev_queue) + return -EINVAL; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + *old = q->qdiscs[cl - 1]; + q->qdiscs[cl - 1] = new; + + if (new) + new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT; + + if (dev->flags & IFF_UP) + dev_activate(dev); + + return 0; +} + +static int dump_entry(struct sk_buff *msg, + const struct sched_entry *entry) +{ + struct nlattr *item; + + item = nla_nest_start(msg, TCA_TAPRIO_SCHED_ENTRY); + if (!item) + return -ENOSPC; + + if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INDEX, entry->index)) + goto nla_put_failure; + + if (nla_put_u8(msg, TCA_TAPRIO_SCHED_ENTRY_CMD, entry->command)) + goto nla_put_failure; + + if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, + entry->gate_mask)) + goto nla_put_failure; + + if (nla_put_u32(msg, TCA_TAPRIO_SCHED_ENTRY_INTERVAL, + entry->interval)) + goto nla_put_failure; + + return nla_nest_end(msg, item); + +nla_put_failure: + nla_nest_cancel(msg, item); + return -1; +} + +static int 
taprio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + struct tc_mqprio_qopt opt = { 0 }; + struct nlattr *nest, *entry_list; + struct sched_entry *entry; + unsigned int i; + + opt.num_tc = netdev_get_num_tc(dev); + memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); + + for (i = 0; i < netdev_get_num_tc(dev); i++) { + opt.count[i] = dev->tc_to_txq[i].count; + opt.offset[i] = dev->tc_to_txq[i].offset; + } + + nest = nla_nest_start(skb, TCA_OPTIONS); + if (!nest) + return -ENOSPC; + + if (nla_put(skb, TCA_TAPRIO_ATTR_PRIOMAP, sizeof(opt), &opt)) + goto options_error; + + if (nla_put_s64(skb, TCA_TAPRIO_ATTR_SCHED_BASE_TIME, + q->base_time, TCA_TAPRIO_PAD)) + goto options_error; + + if (nla_put_s32(skb, TCA_TAPRIO_ATTR_SCHED_CLOCKID, q->clockid)) + goto options_error; + + entry_list = nla_nest_start(skb, TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST); + if (!entry_list) + goto options_error; + + list_for_each_entry(entry, &q->entries, list) { + if (dump_entry(skb, entry) < 0) + goto options_error; + } + + nla_nest_end(skb, entry_list); + + return nla_nest_end(skb, nest); + +options_error: + nla_nest_cancel(skb, nest); + return -1; +} + +static struct Qdisc *taprio_leaf(struct Qdisc *sch, unsigned long cl) +{ + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + if (!dev_queue) + return NULL; + + return dev_queue->qdisc_sleeping; +} + +static unsigned long taprio_find(struct Qdisc *sch, u32 classid) +{ + unsigned int ntx = TC_H_MIN(classid); + + if (!taprio_queue_get(sch, ntx)) + return 0; + return ntx; +} + +static int taprio_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle |= TC_H_MIN(cl); + tcm->tcm_info = dev_queue->qdisc_sleeping->handle; + + return 0; +} + +static int taprio_dump_class_stats(struct Qdisc 
*sch, unsigned long cl, + struct gnet_dump *d) + __releases(d->lock) + __acquires(d->lock) +{ + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; + if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || + gnet_stats_copy_queue(d, NULL, &sch->qstats, sch->q.qlen) < 0) + return -1; + return 0; +} + +static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct net_device *dev = qdisc_dev(sch); + unsigned long ntx; + + if (arg->stop) + return; + + arg->count = arg->skip; + for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { + if (arg->fn(sch, ntx + 1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct netdev_queue *taprio_select_queue(struct Qdisc *sch, + struct tcmsg *tcm) +{ + return taprio_queue_get(sch, TC_H_MIN(tcm->tcm_parent)); +} + +static const struct Qdisc_class_ops taprio_class_ops = { + .graft = taprio_graft, + .leaf = taprio_leaf, + .find = taprio_find, + .walk = taprio_walk, + .dump = taprio_dump_class, + .dump_stats = taprio_dump_class_stats, + .select_queue = taprio_select_queue, +}; + +static struct Qdisc_ops taprio_qdisc_ops __read_mostly = { + .cl_ops = &taprio_class_ops, + .id = "taprio", + .priv_size = sizeof(struct taprio_sched), + .init = taprio_init, + .destroy = taprio_destroy, + .peek = taprio_peek, + .dequeue = taprio_dequeue, + .enqueue = taprio_enqueue, + .dump = taprio_dump, + .owner = THIS_MODULE, +}; + +static int __init taprio_module_init(void) +{ + return register_qdisc(&taprio_qdisc_ops); +} + +static void __exit taprio_module_exit(void) +{ + unregister_qdisc(&taprio_qdisc_ops); +} + +module_init(taprio_module_init); +module_exit(taprio_module_exit); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 89d35528d17d25819a755a2b52931e911baebc66 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 7 Oct 2018 20:16:27 -0700 Subject: netlink: Add new socket option to enable strict checking on dumps Add a new socket option, 
NETLINK_DUMP_STRICT_CHK, that userspace can use via setsockopt to request strict checking of headers and attributes on dump requests. To get dump features such as kernel side filtering based on data in the header or attributes appended to the dump request, userspace must call setsockopt() for NETLINK_DUMP_STRICT_CHK and a non-zero value. Since the netlink sock and its flags are private to the af_netlink code, the strict checking flag is passed to dump handlers via a flag in the netlink_callback struct. For old userspace on new kernel there is no impact as all of the data checks in later patches are wrapped in a check on the new strict flag. For new userspace on old kernel, the setsockopt will fail and even if new userspace sets data in the headers and appended attributes the kernel will silently ignore it. Moving forward when the setsockopt succeeds, the new userspace on old kernel means the dump request can pass an attribute the kernel does not understand. The dump will then fail as the older kernel does not understand it. New userspace on new kernel setting the socket option gets the benefit of the improved data dump. Kernel side the NETLINK_DUMP_STRICT_CHK uapi is converted to a generic NETLINK_F_STRICT_CHK flag which can potentially be leveraged for tighter checking on the NEW, DEL, and SET commands. Signed-off-by: David Ahern Acked-by: Christian Brauner Signed-off-by: David S. 
Miller --- include/linux/netlink.h | 1 + include/uapi/linux/netlink.h | 1 + net/netlink/af_netlink.c | 21 ++++++++++++++++++++- net/netlink/af_netlink.h | 1 + 4 files changed, 23 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 88c8a2d83eb3..72580f1a72a2 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -179,6 +179,7 @@ struct netlink_callback { struct netlink_ext_ack *extack; u16 family; u16 min_dump_alloc; + bool strict_check; unsigned int prev_seq, seq; long args[6]; }; diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 776bc92e9118..486ed1f0c0bc 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -155,6 +155,7 @@ enum nlmsgerr_attrs { #define NETLINK_LIST_MEMBERSHIPS 9 #define NETLINK_CAP_ACK 10 #define NETLINK_EXT_ACK 11 +#define NETLINK_DUMP_STRICT_CHK 12 struct nl_pktinfo { __u32 group; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 7ac585f33a9e..e613a9f89600 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1706,6 +1706,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, nlk->flags &= ~NETLINK_F_EXT_ACK; err = 0; break; + case NETLINK_DUMP_STRICT_CHK: + if (val) + nlk->flags |= NETLINK_F_STRICT_CHK; + else + nlk->flags &= ~NETLINK_F_STRICT_CHK; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1799,6 +1806,15 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; + case NETLINK_DUMP_STRICT_CHK: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_F_STRICT_CHK ? 
1 : 0; + if (put_user(len, optlen) || put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2282,9 +2298,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, const struct nlmsghdr *nlh, struct netlink_dump_control *control) { + struct netlink_sock *nlk, *nlk2; struct netlink_callback *cb; struct sock *sk; - struct netlink_sock *nlk; int ret; refcount_inc(&skb->users); @@ -2318,6 +2334,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, cb->min_dump_alloc = control->min_dump_alloc; cb->skb = skb; + nlk2 = nlk_sk(NETLINK_CB(skb).sk); + cb->strict_check = !!(nlk2->flags & NETLINK_F_STRICT_CHK); + if (control->start) { ret = control->start(cb); if (ret) diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 962de7b3c023..5f454c8de6a4 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -15,6 +15,7 @@ #define NETLINK_F_LISTEN_ALL_NSID 0x10 #define NETLINK_F_CAP_ACK 0x20 #define NETLINK_F_EXT_ACK 0x40 +#define NETLINK_F_STRICT_CHK 0x80 #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) -- cgit v1.2.3 From 0d4e14a32dcab9c4bd559d02874120fbb86b1322 Mon Sep 17 00:00:00 2001 From: Ankita Bajaj Date: Thu, 27 Sep 2018 18:01:57 +0300 Subject: nl80211: Add per peer statistics to compute FCS error rate Add support for drivers to report the total number of MPDUs received and the number of MPDUs received with an FCS error from a specific peer. These counters will be incremented only when the TA of the frame matches the MAC address of the peer irrespective of FCS error. It should be noted that the TA field in the frame might be corrupted when there is an FCS error and TA matching logic would fail in such cases. Hence, FCS error counter might not be fully accurate, but it can provide help in detecting bad RX links in significant number of cases. 
This FCS error counter without full accuracy can be used, e.g., to trigger a kick-out of a connected client with a bad link in AP mode to force such a client to roam to another AP. Signed-off-by: Ankita Bajaj Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 7 +++++++ include/uapi/linux/nl80211.h | 8 ++++++++ net/wireless/nl80211.c | 2 ++ 3 files changed, 17 insertions(+) (limited to 'include/uapi') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 0e16e723dcef..1fa41b7a1be3 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1303,6 +1303,10 @@ struct cfg80211_tid_stats { * @ack_signal: signal strength (in dBm) of the last ACK frame. * @avg_ack_signal: average rssi value of ack packet for the no of msdu's has * been sent. + * @rx_mpdu_count: number of MPDUs received from this station + * @fcs_err_count: number of packets (MPDUs) received from this station with + * an FCS error. This counter should be incremented only when TA of the + * received packet with an FCS error matches the peer MAC address. 
*/ struct station_info { u64 filled; @@ -1349,6 +1353,9 @@ struct station_info { struct cfg80211_tid_stats *pertid; s8 ack_signal; s8 avg_ack_signal; + + u32 rx_mpdu_count; + u32 fcs_err_count; }; #if IS_ENABLED(CONFIG_CFG80211) diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index dc6d5a1ef470..6d610bae30a9 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3068,6 +3068,12 @@ enum nl80211_sta_bss_param { * @NL80211_STA_INFO_PAD: attribute used for padding for 64-bit alignment * @NL80211_STA_INFO_ACK_SIGNAL: signal strength of the last ACK frame(u8, dBm) * @NL80211_STA_INFO_ACK_SIGNAL_AVG: avg signal strength of ACK frames (s8, dBm) + * @NL80211_STA_INFO_RX_MPDUS: total number of received packets (MPDUs) + * (u32, from this station) + * @NL80211_STA_INFO_FCS_ERROR_COUNT: total number of packets (MPDUs) received + * with an FCS error (u32, from this station). This count may not include + * some packets with an FCS error due to TA corruption. Hence this counter + * might not be fully accurate. 
* @__NL80211_STA_INFO_AFTER_LAST: internal * @NL80211_STA_INFO_MAX: highest possible station info attribute */ @@ -3108,6 +3114,8 @@ enum nl80211_sta_info { NL80211_STA_INFO_PAD, NL80211_STA_INFO_ACK_SIGNAL, NL80211_STA_INFO_ACK_SIGNAL_AVG, + NL80211_STA_INFO_RX_MPDUS, + NL80211_STA_INFO_FCS_ERROR_COUNT, /* keep last */ __NL80211_STA_INFO_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 758bb069d000..744b5851bbf9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4761,6 +4761,8 @@ static int nl80211_send_station(struct sk_buff *msg, u32 cmd, u32 portid, PUT_SINFO_U64(RX_DROP_MISC, rx_dropped_misc); PUT_SINFO_U64(BEACON_RX, rx_beacon); PUT_SINFO(BEACON_SIGNAL_AVG, rx_beacon_signal_avg, u8); + PUT_SINFO(RX_MPDUS, rx_mpdu_count, u32); + PUT_SINFO(FCS_ERROR_COUNT, fcs_err_count, u32); if (wiphy_ext_feature_isset(&rdev->wiphy, NL80211_EXT_FEATURE_ACK_SIGNAL_SUPPORT)) { PUT_SINFO(ACK_SIGNAL, ack_signal, u8); -- cgit v1.2.3 From 9163a0fc1f0c0980f117cc25f4fa6ba9b0750a36 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Fri, 12 Oct 2018 13:41:16 +0300 Subject: net: bridge: add support for per-port vlan stats This patch adds an option to have per-port vlan stats instead of the default global stats. The option can be set only when there are no port vlans in the bridge since we need to allocate the stats if it is set when vlans are being added to ports (and respectively free them when being deleted). Also bump RTNL_MAX_TYPE as the bridge is the largest user of options. The current stats design allows us to add these without any changes to the fast-path, it all comes down to the per-vlan stats pointer which, if this option is enabled, will be allocated for each port vlan instead of using the global bridge-wide one. CC: bridge@lists.linux-foundation.org CC: Roopa Prabhu Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. 
Miller --- include/uapi/linux/if_link.h | 1 + net/bridge/br_netlink.c | 14 ++++++++++++- net/bridge/br_private.h | 2 ++ net/bridge/br_sysfs_br.c | 17 +++++++++++++++ net/bridge/br_vlan.c | 49 ++++++++++++++++++++++++++++++++++++++++++-- net/core/rtnetlink.c | 2 +- 6 files changed, 81 insertions(+), 4 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 58faab897201..1debfa42cba1 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -287,6 +287,7 @@ enum { IFLA_BR_MCAST_STATS_ENABLED, IFLA_BR_MCAST_IGMP_VERSION, IFLA_BR_MCAST_MLD_VERSION, + IFLA_BR_VLAN_STATS_PER_PORT, __IFLA_BR_MAX, }; diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index e5a5bc5d5232..3345f1984542 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1034,6 +1034,7 @@ static const struct nla_policy br_policy[IFLA_BR_MAX + 1] = { [IFLA_BR_MCAST_STATS_ENABLED] = { .type = NLA_U8 }, [IFLA_BR_MCAST_IGMP_VERSION] = { .type = NLA_U8 }, [IFLA_BR_MCAST_MLD_VERSION] = { .type = NLA_U8 }, + [IFLA_BR_VLAN_STATS_PER_PORT] = { .type = NLA_U8 }, }; static int br_changelink(struct net_device *brdev, struct nlattr *tb[], @@ -1114,6 +1115,14 @@ static int br_changelink(struct net_device *brdev, struct nlattr *tb[], if (err) return err; } + + if (data[IFLA_BR_VLAN_STATS_PER_PORT]) { + __u8 per_port = nla_get_u8(data[IFLA_BR_VLAN_STATS_PER_PORT]); + + err = br_vlan_set_stats_per_port(br, per_port); + if (err) + return err; + } #endif if (data[IFLA_BR_GROUP_FWD_MASK]) { @@ -1327,6 +1336,7 @@ static size_t br_get_size(const struct net_device *brdev) nla_total_size(sizeof(__be16)) + /* IFLA_BR_VLAN_PROTOCOL */ nla_total_size(sizeof(u16)) + /* IFLA_BR_VLAN_DEFAULT_PVID */ nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_ENABLED */ + nla_total_size(sizeof(u8)) + /* IFLA_BR_VLAN_STATS_PER_PORT */ #endif nla_total_size(sizeof(u16)) + /* IFLA_BR_GROUP_FWD_MASK */ nla_total_size(sizeof(struct 
ifla_bridge_id)) + /* IFLA_BR_ROOT_ID */ @@ -1417,7 +1427,9 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev) if (nla_put_be16(skb, IFLA_BR_VLAN_PROTOCOL, br->vlan_proto) || nla_put_u16(skb, IFLA_BR_VLAN_DEFAULT_PVID, br->default_pvid) || nla_put_u8(skb, IFLA_BR_VLAN_STATS_ENABLED, - br_opt_get(br, BROPT_VLAN_STATS_ENABLED))) + br_opt_get(br, BROPT_VLAN_STATS_ENABLED)) || + nla_put_u8(skb, IFLA_BR_VLAN_STATS_PER_PORT, + br_opt_get(br, BROPT_VLAN_STATS_PER_PORT))) return -EMSGSIZE; #endif #ifdef CONFIG_BRIDGE_IGMP_SNOOPING diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 57229b9d800f..10ee39fdca5c 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -320,6 +320,7 @@ enum net_bridge_opts { BROPT_HAS_IPV6_ADDR, BROPT_NEIGH_SUPPRESS_ENABLED, BROPT_MTU_SET_BY_USER, + BROPT_VLAN_STATS_PER_PORT, }; struct net_bridge { @@ -859,6 +860,7 @@ int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val); int __br_vlan_set_proto(struct net_bridge *br, __be16 proto); int br_vlan_set_proto(struct net_bridge *br, unsigned long val); int br_vlan_set_stats(struct net_bridge *br, unsigned long val); +int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val); int br_vlan_init(struct net_bridge *br); int br_vlan_set_default_pvid(struct net_bridge *br, unsigned long val); int __br_vlan_set_default_pvid(struct net_bridge *br, u16 pvid); diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index c93c5724609e..60182bef6341 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -803,6 +803,22 @@ static ssize_t vlan_stats_enabled_store(struct device *d, return store_bridge_parm(d, buf, len, br_vlan_set_stats); } static DEVICE_ATTR_RW(vlan_stats_enabled); + +static ssize_t vlan_stats_per_port_show(struct device *d, + struct device_attribute *attr, + char *buf) +{ + struct net_bridge *br = to_bridge(d); + return sprintf(buf, "%u\n", br_opt_get(br, BROPT_VLAN_STATS_PER_PORT));
+} + +static ssize_t vlan_stats_per_port_store(struct device *d, + struct device_attribute *attr, + const char *buf, size_t len) +{ + return store_bridge_parm(d, buf, len, br_vlan_set_stats_per_port); +} +static DEVICE_ATTR_RW(vlan_stats_per_port); #endif static struct attribute *bridge_attrs[] = { @@ -856,6 +872,7 @@ static struct attribute *bridge_attrs[] = { &dev_attr_vlan_protocol.attr, &dev_attr_default_pvid.attr, &dev_attr_vlan_stats_enabled.attr, + &dev_attr_vlan_stats_per_port.attr, #endif NULL }; diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 5942e03dd845..9b707234e4ae 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -190,6 +190,19 @@ static void br_vlan_put_master(struct net_bridge_vlan *masterv) } } +static void nbp_vlan_rcu_free(struct rcu_head *rcu) +{ + struct net_bridge_vlan *v; + + v = container_of(rcu, struct net_bridge_vlan, rcu); + WARN_ON(br_vlan_is_master(v)); + /* if we had per-port stats configured then free them here */ + if (v->brvlan->stats != v->stats) + free_percpu(v->stats); + v->stats = NULL; + kfree(v); +} + /* This is the shared VLAN add function which works for both ports and bridge * devices. 
There are four possible calls to this function in terms of the * vlan entry type: @@ -245,7 +258,15 @@ static int __vlan_add(struct net_bridge_vlan *v, u16 flags) if (!masterv) goto out_filt; v->brvlan = masterv; - v->stats = masterv->stats; + if (br_opt_get(br, BROPT_VLAN_STATS_PER_PORT)) { + v->stats = netdev_alloc_pcpu_stats(struct br_vlan_stats); + if (!v->stats) { + err = -ENOMEM; + goto out_filt; + } + } else { + v->stats = masterv->stats; + } } else { err = br_switchdev_port_vlan_add(dev, v->vid, flags); if (err && err != -EOPNOTSUPP) @@ -329,7 +350,7 @@ static int __vlan_del(struct net_bridge_vlan *v) rhashtable_remove_fast(&vg->vlan_hash, &v->vnode, br_vlan_rht_params); __vlan_del_list(v); - kfree_rcu(v, rcu); + call_rcu(&v->rcu, nbp_vlan_rcu_free); } br_vlan_put_master(masterv); @@ -830,6 +851,30 @@ int br_vlan_set_stats(struct net_bridge *br, unsigned long val) return 0; } +int br_vlan_set_stats_per_port(struct net_bridge *br, unsigned long val) +{ + struct net_bridge_port *p; + + /* allow to change the option if there are no port vlans configured */ + list_for_each_entry(p, &br->port_list, list) { + struct net_bridge_vlan_group *vg = nbp_vlan_group(p); + + if (vg->num_vlans) + return -EBUSY; + } + + switch (val) { + case 0: + case 1: + br_opt_toggle(br, BROPT_VLAN_STATS_PER_PORT, !!val); + break; + default: + return -EINVAL; + } + + return 0; +} + static bool vlan_default_pvid(struct net_bridge_vlan_group *vg, u16 vid) { struct net_bridge_vlan *v; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 46328a10034a..0958c7be2c22 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -59,7 +59,7 @@ #include #include -#define RTNL_MAX_TYPE 48 +#define RTNL_MAX_TYPE 49 #define RTNL_SLAVE_MAX_TYPE 36 struct rtnl_link { -- cgit v1.2.3 From 61414f5ec9834df8aa4f55c90de16b71a3d6ca8d Mon Sep 17 00:00:00 2001 From: "Maciej W. 
Rozycki" Date: Tue, 9 Oct 2018 23:57:43 +0100 Subject: FDDI: defza: Add support for DEC FDDIcontroller 700 TURBOchannel adapter Add support for the DEC FDDIcontroller 700 (DEFZA), Digital Equipment Corporation's first-generation FDDI network interface adapter, made for TURBOchannel and based on a discrete version of what eventually became Motorola's widely used CAMEL chipset. The CAMEL chipset is present for example in the DEC FDDIcontroller TURBOchannel, EISA and PCI adapters (DEFTA/DEFEA/DEFPA) that we support with the `defxx' driver, however the host bus interface logic and the firmware API are different in the DEFZA and hence a separate driver is required. There isn't much to say about the driver except that it works, but there is one peculiarity to mention. The adapter implements two Tx/Rx queue pairs. Of these one pair is the usual network Tx/Rx queue pair, in this case used by the adapter to exchange frames with the ring, via the RMC (Ring Memory Controller) chip. The Tx queue is handled directly by the RMC chip and resides in onboard packet memory. The Rx queue is maintained via DMA in host memory by adapter's firmware copying received data stored by the RMC in onboard packet memory. The other pair is used to communicate SMT frames with adapter's firmware. Any SMT frame received from the RMC via the Rx queue must be queued back by the driver to the SMT Rx queue for the firmware to process. Similarly the firmware uses the SMT Tx queue to supply the driver with SMT frames that must be queued back to the Tx queue for the RMC to send to the ring. This solution was chosen because the designers ran out of PCB space and could not squeeze in more logic onto the board that would be required to handle this SMT frame traffic without the need to involve the driver, as with the later DEFTA/DEFEA/DEFPA adapters. Finally the driver does some Frame Control byte decoding, so to avoid magic numbers some macros are added to <linux/if_fddi.h>. Signed-off-by: Maciej W.
Rozycki Signed-off-by: David S. Miller --- Documentation/networking/00-INDEX | 2 + Documentation/networking/defza.txt | 57 ++ MAINTAINERS | 5 + drivers/net/fddi/Kconfig | 11 + drivers/net/fddi/Makefile | 1 + drivers/net/fddi/defza.c | 1535 ++++++++++++++++++++++++++++++++++++ drivers/net/fddi/defza.h | 791 +++++++++++++++++++ include/uapi/linux/if_fddi.h | 21 +- 8 files changed, 2420 insertions(+), 3 deletions(-) create mode 100644 Documentation/networking/defza.txt create mode 100644 drivers/net/fddi/defza.c create mode 100644 drivers/net/fddi/defza.h (limited to 'include/uapi') diff --git a/Documentation/networking/00-INDEX b/Documentation/networking/00-INDEX index f4f2b5d6c8d8..2d239770b95f 100644 --- a/Documentation/networking/00-INDEX +++ b/Documentation/networking/00-INDEX @@ -56,6 +56,8 @@ de4x5.txt - the Digital EtherWORKS DE4?? and DE5?? PCI Ethernet driver decnet.txt - info on using the DECnet networking layer in Linux. +defza.txt + - the DEC FDDIcontroller 700 (DEFZA-xx) TURBOchannel FDDI driver dl2k.txt - README for D-Link DL2000-based Gigabit Ethernet Adapters (dl2k.ko). dm9000.txt diff --git a/Documentation/networking/defza.txt b/Documentation/networking/defza.txt new file mode 100644 index 000000000000..663e4a906751 --- /dev/null +++ b/Documentation/networking/defza.txt @@ -0,0 +1,57 @@ +Notes on the DEC FDDIcontroller 700 (DEFZA-xx) driver v.1.1.4. + + +DEC FDDIcontroller 700 is DEC's first-generation TURBOchannel FDDI +network card, designed in 1990 specifically for the DECstation 5000 +model 200 workstation. The board is a single attachment station and +it was manufactured in two variations, both of which are supported. + +First is the SAS MMF DEFZA-AA option, the original design implementing +the standard MMF-PMD, however with a pair of ST connectors rather than +the usual MIC connector. 
The other one is the SAS ThinWire/STP DEFZA-CA +option, denoted 700-C, with the network medium selectable by a switch +between the DEC proprietary ThinWire-PMD using a BNC connector and the +standard STP-PMD using a DE-9F connector. This option can interface to +a DECconcentrator 500 device and, in the case of the STP-PMD, also other +FDDI equipment and was designed to make it easier to transition from +existing IEEE 802.3 10BASE2 Ethernet and IEEE 802.5 Token Ring networks +by providing means to reuse existing cabling. + +This driver handles any number of cards installed in a single system. +They get fddi0, fddi1, etc. interface names assigned in the order of +increasing TURBOchannel slot numbers. + +The board only supports DMA on the receive side. Transmission involves +the use of PIO. As a result under a heavy transmission load there will +be a significant impact on system performance. + +The board supports a 64-entry CAM for matching destination addresses. +Two entries are preoccupied by the Directed Beacon and Ring Purger +multicast addresses and the rest is used as a multicast filter. An +all-multi mode is also supported for LLC frames and it is used if +requested explicitly or if the CAM overflows. The promiscuous mode +supports separate enables for LLC and SMT frames, but this driver +doesn't support changing them individually. + + +Known problems: + +None. + + +To do: + +5. MAC address change. The card does not support changing the Media + Access Controller's address registers but a similar effect can be + achieved by adding an alias to the CAM. There is no way to disable + matching against the original address though. + +7. Queueing incoming/outgoing SMT frames in the driver if the SMT + receive/RMC transmit ring is full. (?) + +8. Retrieving/reporting FDDI/SNMP stats. + + +Both success and failure reports are welcome. + +Maciej W. 
Rozycki diff --git a/MAINTAINERS b/MAINTAINERS index 6d5161def3f3..031127139f3b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4170,6 +4170,11 @@ S: Maintained F: drivers/platform/x86/dell-smbios-wmi.c F: tools/wmi/dell-smbios-example.c +DEFZA FDDI NETWORK DRIVER +M: "Maciej W. Rozycki" +S: Maintained +F: drivers/net/fddi/defza.* + DELL LAPTOP DRIVER M: Matthew Garrett M: Pali Rohár diff --git a/drivers/net/fddi/Kconfig b/drivers/net/fddi/Kconfig index 3a424c864f4d..d62e8c6205f7 100644 --- a/drivers/net/fddi/Kconfig +++ b/drivers/net/fddi/Kconfig @@ -15,6 +15,17 @@ config FDDI if FDDI +config DEFZA + tristate "DEC FDDIcontroller 700/700-C (DEFZA-xx) support" + depends on FDDI && TC + help + This is support for the DEC FDDIcontroller 700 (DEFZA-AA, fiber) + and 700-C (DEFZA-CA, copper) TURBOchannel network cards which + can connect you to a local FDDI network. + + To compile this driver as a module, choose M here: the module + will be called defza. If unsure, say N. + config DEFXX tristate "Digital DEFTA/DEFEA/DEFPA adapter support" depends on FDDI && (PCI || EISA || TC) diff --git a/drivers/net/fddi/Makefile b/drivers/net/fddi/Makefile index 36da19c9a8aa..194b52cc20b0 100644 --- a/drivers/net/fddi/Makefile +++ b/drivers/net/fddi/Makefile @@ -3,4 +3,5 @@ # obj-$(CONFIG_DEFXX) += defxx.o +obj-$(CONFIG_DEFZA) += defza.o obj-$(CONFIG_SKFP) += skfp/ diff --git a/drivers/net/fddi/defza.c b/drivers/net/fddi/defza.c new file mode 100644 index 000000000000..7d01b70f7ed8 --- /dev/null +++ b/drivers/net/fddi/defza.c @@ -0,0 +1,1535 @@ +// SPDX-License-Identifier: GPL-2.0 +/* FDDI network adapter driver for DEC FDDIcontroller 700/700-C devices. + * + * Copyright (c) 2018 Maciej W. Rozycki + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * References: + * + * Dave Sawyer & Phil Weeks & Frank Itkowsky, + * "DEC FDDIcontroller 700 Port Specification", + * Revision 1.1, Digital Equipment Corporation + */ + +/* ------------------------------------------------------------------------- */ +/* FZA configurable parameters. */ + +/* The number of transmit ring descriptors; either 0 for 512 or 1 for 1024. */ +#define FZA_RING_TX_MODE 0 + +/* The number of receive ring descriptors; from 2 up to 256. */ +#define FZA_RING_RX_SIZE 256 + +/* End of FZA configurable parameters. No need to change anything below. */ +/* ------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "defza.h" + +#define DRV_NAME "defza" +#define DRV_VERSION "v.1.1.4" +#define DRV_RELDATE "Oct 6 2018" + +static char version[] = + DRV_NAME ": " DRV_VERSION " " DRV_RELDATE " Maciej W. Rozycki\n"; + +MODULE_AUTHOR("Maciej W. Rozycki "); +MODULE_DESCRIPTION("DEC FDDIcontroller 700 (DEFZA-xx) driver"); +MODULE_LICENSE("GPL"); + +static int loopback; +module_param(loopback, int, 0644); + +/* Ring Purger Multicast */ +static u8 hw_addr_purger[8] = { 0x09, 0x00, 0x2b, 0x02, 0x01, 0x05 }; +/* Directed Beacon Multicast */ +static u8 hw_addr_beacon[8] = { 0x01, 0x80, 0xc2, 0x00, 0x01, 0x00 }; + +/* Shorthands for MMIO accesses that we require to be strongly ordered + * WRT preceding MMIO accesses. + */ +#define readw_o readw_relaxed +#define readl_o readl_relaxed + +#define writew_o writew_relaxed +#define writel_o writel_relaxed + +/* Shorthands for MMIO accesses that we are happy with being weakly ordered + * WRT preceding MMIO accesses. 
+ */ +#define readw_u readw_relaxed +#define readl_u readl_relaxed +#define readq_u readq_relaxed + +#define writew_u writew_relaxed +#define writel_u writel_relaxed +#define writeq_u writeq_relaxed + +static inline struct sk_buff *fza_alloc_skb_irq(struct net_device *dev, + unsigned int length) +{ + return __netdev_alloc_skb(dev, length, GFP_ATOMIC); +} + +static inline struct sk_buff *fza_alloc_skb(struct net_device *dev, + unsigned int length) +{ + return __netdev_alloc_skb(dev, length, GFP_KERNEL); +} + +static inline void fza_skb_align(struct sk_buff *skb, unsigned int v) +{ + unsigned long x, y; + + x = (unsigned long)skb->data; + y = ALIGN(x, v); + + skb_reserve(skb, y - x); +} + +static inline void fza_reads(const void __iomem *from, void *to, + unsigned long size) +{ + if (sizeof(unsigned long) == 8) { + const u64 __iomem *src = from; + const u32 __iomem *src_trail; + u64 *dst = to; + u32 *dst_trail; + + for (size = (size + 3) / 4; size > 1; size -= 2) + *dst++ = readq_u(src++); + if (size) { + src_trail = (u32 __iomem *)src; + dst_trail = (u32 *)dst; + *dst_trail = readl_u(src_trail); + } + } else { + const u32 __iomem *src = from; + u32 *dst = to; + + for (size = (size + 3) / 4; size; size--) + *dst++ = readl_u(src++); + } +} + +static inline void fza_writes(const void *from, void __iomem *to, + unsigned long size) +{ + if (sizeof(unsigned long) == 8) { + const u64 *src = from; + const u32 *src_trail; + u64 __iomem *dst = to; + u32 __iomem *dst_trail; + + for (size = (size + 3) / 4; size > 1; size -= 2) + writeq_u(*src++, dst++); + if (size) { + src_trail = (u32 *)src; + dst_trail = (u32 __iomem *)dst; + writel_u(*src_trail, dst_trail); + } + } else { + const u32 *src = from; + u32 __iomem *dst = to; + + for (size = (size + 3) / 4; size; size--) + writel_u(*src++, dst++); + } +} + +static inline void fza_moves(const void __iomem *from, void __iomem *to, + unsigned long size) +{ + if (sizeof(unsigned long) == 8) { + const u64 __iomem *src = from; + const 
u32 __iomem *src_trail; + u64 __iomem *dst = to; + u32 __iomem *dst_trail; + + for (size = (size + 3) / 4; size > 1; size -= 2) + writeq_u(readq_u(src++), dst++); + if (size) { + src_trail = (u32 __iomem *)src; + dst_trail = (u32 __iomem *)dst; + writel_u(readl_u(src_trail), dst_trail); + } + } else { + const u32 __iomem *src = from; + u32 __iomem *dst = to; + + for (size = (size + 3) / 4; size; size--) + writel_u(readl_u(src++), dst++); + } +} + +static inline void fza_zeros(void __iomem *to, unsigned long size) +{ + if (sizeof(unsigned long) == 8) { + u64 __iomem *dst = to; + u32 __iomem *dst_trail; + + for (size = (size + 3) / 4; size > 1; size -= 2) + writeq_u(0, dst++); + if (size) { + dst_trail = (u32 __iomem *)dst; + writel_u(0, dst_trail); + } + } else { + u32 __iomem *dst = to; + + for (size = (size + 3) / 4; size; size--) + writel_u(0, dst++); + } +} + +static inline void fza_regs_dump(struct fza_private *fp) +{ + pr_debug("%s: iomem registers:\n", fp->name); + pr_debug(" reset: 0x%04x\n", readw_o(&fp->regs->reset)); + pr_debug(" interrupt event: 0x%04x\n", readw_u(&fp->regs->int_event)); + pr_debug(" status: 0x%04x\n", readw_u(&fp->regs->status)); + pr_debug(" interrupt mask: 0x%04x\n", readw_u(&fp->regs->int_mask)); + pr_debug(" control A: 0x%04x\n", readw_u(&fp->regs->control_a)); + pr_debug(" control B: 0x%04x\n", readw_u(&fp->regs->control_b)); +} + +static inline void fza_do_reset(struct fza_private *fp) +{ + /* Reset the board. */ + writew_o(FZA_RESET_INIT, &fp->regs->reset); + readw_o(&fp->regs->reset); /* Synchronize. */ + readw_o(&fp->regs->reset); /* Read it back for a small delay. */ + writew_o(FZA_RESET_CLR, &fp->regs->reset); + + /* Enable all interrupt events we handle. */ + writew_o(fp->int_mask, &fp->regs->int_mask); + readw_o(&fp->regs->int_mask); /* Synchronize. */ +} + +static inline void fza_do_shutdown(struct fza_private *fp) +{ + /* Disable the driver mode. 
*/ + writew_o(FZA_CONTROL_B_IDLE, &fp->regs->control_b); + + /* And reset the board. */ + writew_o(FZA_RESET_INIT, &fp->regs->reset); + readw_o(&fp->regs->reset); /* Synchronize. */ + writew_o(FZA_RESET_CLR, &fp->regs->reset); + readw_o(&fp->regs->reset); /* Synchronize. */ +} + +static int fza_reset(struct fza_private *fp) +{ + unsigned long flags; + uint status, state; + long t; + + pr_info("%s: resetting the board...\n", fp->name); + + spin_lock_irqsave(&fp->lock, flags); + fp->state_chg_flag = 0; + fza_do_reset(fp); + spin_unlock_irqrestore(&fp->lock, flags); + + /* DEC says RESET needs up to 30 seconds to complete. My DEFZA-AA + * rev. C03 happily finishes in 9.7 seconds. :-) But we need to + * be on the safe side... + */ + t = wait_event_timeout(fp->state_chg_wait, fp->state_chg_flag, + 45 * HZ); + status = readw_u(&fp->regs->status); + state = FZA_STATUS_GET_STATE(status); + if (fp->state_chg_flag == 0) { + pr_err("%s: RESET timed out!, state %x\n", fp->name, state); + return -EIO; + } + if (state != FZA_STATE_UNINITIALIZED) { + pr_err("%s: RESET failed!, state %x, failure ID %x\n", + fp->name, state, FZA_STATUS_GET_TEST(status)); + return -EIO; + } + pr_info("%s: OK\n", fp->name); + pr_debug("%s: RESET: %lums elapsed\n", fp->name, + (45 * HZ - t) * 1000 / HZ); + + return 0; +} + +static struct fza_ring_cmd __iomem *fza_cmd_send(struct net_device *dev, + int command) +{ + struct fza_private *fp = netdev_priv(dev); + struct fza_ring_cmd __iomem *ring = fp->ring_cmd + fp->ring_cmd_index; + unsigned int old_mask, new_mask; + union fza_cmd_buf __iomem *buf; + struct netdev_hw_addr *ha; + int i; + + old_mask = fp->int_mask; + new_mask = old_mask & ~FZA_MASK_STATE_CHG; + writew_u(new_mask, &fp->regs->int_mask); + readw_o(&fp->regs->int_mask); /* Synchronize. 
*/ + fp->int_mask = new_mask; + + buf = fp->mmio + readl_u(&ring->buffer); + + if ((readl_u(&ring->cmd_own) & FZA_RING_OWN_MASK) != + FZA_RING_OWN_HOST) { + pr_warn("%s: command buffer full, command: %u!\n", fp->name, + command); + return NULL; + } + + switch (command) { + case FZA_RING_CMD_INIT: + writel_u(FZA_RING_TX_MODE, &buf->init.tx_mode); + writel_u(FZA_RING_RX_SIZE, &buf->init.hst_rx_size); + fza_zeros(&buf->init.counters, sizeof(buf->init.counters)); + break; + + case FZA_RING_CMD_MODCAM: + i = 0; + fza_writes(&hw_addr_purger, &buf->cam.hw_addr[i++], + sizeof(*buf->cam.hw_addr)); + fza_writes(&hw_addr_beacon, &buf->cam.hw_addr[i++], + sizeof(*buf->cam.hw_addr)); + netdev_for_each_mc_addr(ha, dev) { + if (i >= FZA_CMD_CAM_SIZE) + break; + fza_writes(ha->addr, &buf->cam.hw_addr[i++], + sizeof(*buf->cam.hw_addr)); + } + while (i < FZA_CMD_CAM_SIZE) + fza_zeros(&buf->cam.hw_addr[i++], + sizeof(*buf->cam.hw_addr)); + break; + + case FZA_RING_CMD_PARAM: + writel_u(loopback, &buf->param.loop_mode); + writel_u(fp->t_max, &buf->param.t_max); + writel_u(fp->t_req, &buf->param.t_req); + writel_u(fp->tvx, &buf->param.tvx); + writel_u(fp->lem_threshold, &buf->param.lem_threshold); + fza_writes(&fp->station_id, &buf->param.station_id, + sizeof(buf->param.station_id)); + /* Convert to milliseconds due to buggy firmware. */ + writel_u(fp->rtoken_timeout / 12500, + &buf->param.rtoken_timeout); + writel_u(fp->ring_purger, &buf->param.ring_purger); + break; + + case FZA_RING_CMD_MODPROM: + if (dev->flags & IFF_PROMISC) { + writel_u(1, &buf->modprom.llc_prom); + writel_u(1, &buf->modprom.smt_prom); + } else { + writel_u(0, &buf->modprom.llc_prom); + writel_u(0, &buf->modprom.smt_prom); + } + if (dev->flags & IFF_ALLMULTI || + netdev_mc_count(dev) > FZA_CMD_CAM_SIZE - 2) + writel_u(1, &buf->modprom.llc_multi); + else + writel_u(0, &buf->modprom.llc_multi); + writel_u(1, &buf->modprom.llc_bcast); + break; + } + + /* Trigger the command. 
*/ + writel_u(FZA_RING_OWN_FZA | command, &ring->cmd_own); + writew_o(FZA_CONTROL_A_CMD_POLL, &fp->regs->control_a); + + fp->ring_cmd_index = (fp->ring_cmd_index + 1) % FZA_RING_CMD_SIZE; + + fp->int_mask = old_mask; + writew_u(fp->int_mask, &fp->regs->int_mask); + + return ring; +} + +static int fza_init_send(struct net_device *dev, + struct fza_cmd_init *__iomem *init) +{ + struct fza_private *fp = netdev_priv(dev); + struct fza_ring_cmd __iomem *ring; + unsigned long flags; + u32 stat; + long t; + + spin_lock_irqsave(&fp->lock, flags); + fp->cmd_done_flag = 0; + ring = fza_cmd_send(dev, FZA_RING_CMD_INIT); + spin_unlock_irqrestore(&fp->lock, flags); + if (!ring) + /* This should never happen in the uninitialized state, + * so do not try to recover and just consider it fatal. + */ + return -ENOBUFS; + + /* INIT may take quite a long time (160ms for my C03). */ + t = wait_event_timeout(fp->cmd_done_wait, fp->cmd_done_flag, 3 * HZ); + if (fp->cmd_done_flag == 0) { + pr_err("%s: INIT command timed out!, state %x\n", fp->name, + FZA_STATUS_GET_STATE(readw_u(&fp->regs->status))); + return -EIO; + } + stat = readl_u(&ring->stat); + if (stat != FZA_RING_STAT_SUCCESS) { + pr_err("%s: INIT command failed!, status %02x, state %x\n", + fp->name, stat, + FZA_STATUS_GET_STATE(readw_u(&fp->regs->status))); + return -EIO; + } + pr_debug("%s: INIT: %lums elapsed\n", fp->name, + (3 * HZ - t) * 1000 / HZ); + + if (init) + *init = fp->mmio + readl_u(&ring->buffer); + return 0; +} + +static void fza_rx_init(struct fza_private *fp) +{ + int i; + + /* Fill the host receive descriptor ring. 
*/ + for (i = 0; i < FZA_RING_RX_SIZE; i++) { + writel_o(0, &fp->ring_hst_rx[i].rmc); + writel_o((fp->rx_dma[i] + 0x1000) >> 9, + &fp->ring_hst_rx[i].buffer1); + writel_o(fp->rx_dma[i] >> 9 | FZA_RING_OWN_FZA, + &fp->ring_hst_rx[i].buf0_own); + } +} + +static void fza_set_rx_mode(struct net_device *dev) +{ + fza_cmd_send(dev, FZA_RING_CMD_MODCAM); + fza_cmd_send(dev, FZA_RING_CMD_MODPROM); +} + +union fza_buffer_txp { + struct fza_buffer_tx *data_ptr; + struct fza_buffer_tx __iomem *mmio_ptr; +}; + +static int fza_do_xmit(union fza_buffer_txp ub, int len, + struct net_device *dev, int smt) +{ + struct fza_private *fp = netdev_priv(dev); + struct fza_buffer_tx __iomem *rmc_tx_ptr; + int i, first, frag_len, left_len; + u32 own, rmc; + + if (((((fp->ring_rmc_txd_index - 1 + fp->ring_rmc_tx_size) - + fp->ring_rmc_tx_index) % fp->ring_rmc_tx_size) * + FZA_TX_BUFFER_SIZE) < len) + return 1; + + first = fp->ring_rmc_tx_index; + + left_len = len; + frag_len = FZA_TX_BUFFER_SIZE; + /* First descriptor is relinquished last. */ + own = FZA_RING_TX_OWN_HOST; + /* First descriptor carries frame length; we don't use cut-through. */ + rmc = FZA_RING_TX_SOP | FZA_RING_TX_VBC | len; + do { + i = fp->ring_rmc_tx_index; + rmc_tx_ptr = &fp->buffer_tx[i]; + + if (left_len < FZA_TX_BUFFER_SIZE) + frag_len = left_len; + left_len -= frag_len; + + /* Length must be a multiple of 4 as only word writes are + * permitted! + */ + frag_len = (frag_len + 3) & ~3; + if (smt) + fza_moves(ub.mmio_ptr, rmc_tx_ptr, frag_len); + else + fza_writes(ub.data_ptr, rmc_tx_ptr, frag_len); + + if (left_len == 0) + rmc |= FZA_RING_TX_EOP; /* Mark last frag. */ + + writel_o(rmc, &fp->ring_rmc_tx[i].rmc); + writel_o(own, &fp->ring_rmc_tx[i].own); + + ub.data_ptr++; + fp->ring_rmc_tx_index = (fp->ring_rmc_tx_index + 1) % + fp->ring_rmc_tx_size; + + /* Settings for intermediate frags. 
*/ + own = FZA_RING_TX_OWN_RMC; + rmc = 0; + } while (left_len > 0); + + if (((((fp->ring_rmc_txd_index - 1 + fp->ring_rmc_tx_size) - + fp->ring_rmc_tx_index) % fp->ring_rmc_tx_size) * + FZA_TX_BUFFER_SIZE) < dev->mtu + dev->hard_header_len) { + netif_stop_queue(dev); + pr_debug("%s: queue stopped\n", fp->name); + } + + writel_o(FZA_RING_TX_OWN_RMC, &fp->ring_rmc_tx[first].own); + + /* Go, go, go! */ + writew_o(FZA_CONTROL_A_TX_POLL, &fp->regs->control_a); + + return 0; +} + +static int fza_do_recv_smt(struct fza_buffer_tx *data_ptr, int len, + u32 rmc, struct net_device *dev) +{ + struct fza_private *fp = netdev_priv(dev); + struct fza_buffer_tx __iomem *smt_rx_ptr; + u32 own; + int i; + + i = fp->ring_smt_rx_index; + own = readl_o(&fp->ring_smt_rx[i].own); + if ((own & FZA_RING_OWN_MASK) == FZA_RING_OWN_FZA) + return 1; + + smt_rx_ptr = fp->mmio + readl_u(&fp->ring_smt_rx[i].buffer); + + /* Length must be a multiple of 4 as only word writes are permitted! */ + fza_writes(data_ptr, smt_rx_ptr, (len + 3) & ~3); + + writel_o(rmc, &fp->ring_smt_rx[i].rmc); + writel_o(FZA_RING_OWN_FZA, &fp->ring_smt_rx[i].own); + + fp->ring_smt_rx_index = + (fp->ring_smt_rx_index + 1) % fp->ring_smt_rx_size; + + /* Grab it! */ + writew_o(FZA_CONTROL_A_SMT_RX_POLL, &fp->regs->control_a); + + return 0; +} + +static void fza_tx(struct net_device *dev) +{ + struct fza_private *fp = netdev_priv(dev); + u32 own, rmc; + int i; + + while (1) { + i = fp->ring_rmc_txd_index; + if (i == fp->ring_rmc_tx_index) + break; + own = readl_o(&fp->ring_rmc_tx[i].own); + if ((own & FZA_RING_OWN_MASK) == FZA_RING_TX_OWN_RMC) + break; + + rmc = readl_u(&fp->ring_rmc_tx[i].rmc); + /* Only process the first descriptor. */ + if ((rmc & FZA_RING_TX_SOP) != 0) { + if ((rmc & FZA_RING_TX_DCC_MASK) == + FZA_RING_TX_DCC_SUCCESS) { + int pkt_len = (rmc & FZA_RING_PBC_MASK) - 3; + /* Omit PRH. 
*/ + + fp->stats.tx_packets++; + fp->stats.tx_bytes += pkt_len; + } else { + fp->stats.tx_errors++; + switch (rmc & FZA_RING_TX_DCC_MASK) { + case FZA_RING_TX_DCC_DTP_SOP: + case FZA_RING_TX_DCC_DTP: + case FZA_RING_TX_DCC_ABORT: + fp->stats.tx_aborted_errors++; + break; + case FZA_RING_TX_DCC_UNDRRUN: + fp->stats.tx_fifo_errors++; + break; + case FZA_RING_TX_DCC_PARITY: + default: + break; + } + } + } + + fp->ring_rmc_txd_index = (fp->ring_rmc_txd_index + 1) % + fp->ring_rmc_tx_size; + } + + if (((((fp->ring_rmc_txd_index - 1 + fp->ring_rmc_tx_size) - + fp->ring_rmc_tx_index) % fp->ring_rmc_tx_size) * + FZA_TX_BUFFER_SIZE) >= dev->mtu + dev->hard_header_len) { + if (fp->queue_active) { + netif_wake_queue(dev); + pr_debug("%s: queue woken\n", fp->name); + } + } +} + +static inline int fza_rx_err(struct fza_private *fp, + const u32 rmc, const u8 fc) +{ + int len, min_len, max_len; + + len = rmc & FZA_RING_PBC_MASK; + + if (unlikely((rmc & FZA_RING_RX_BAD) != 0)) { + fp->stats.rx_errors++; + + /* Check special status codes. */ + if ((rmc & (FZA_RING_RX_CRC | FZA_RING_RX_RRR_MASK | + FZA_RING_RX_DA_MASK | FZA_RING_RX_SA_MASK)) == + (FZA_RING_RX_CRC | FZA_RING_RX_RRR_DADDR | + FZA_RING_RX_DA_CAM | FZA_RING_RX_SA_ALIAS)) { + if (len >= 8190) + fp->stats.rx_length_errors++; + return 1; + } + if ((rmc & (FZA_RING_RX_CRC | FZA_RING_RX_RRR_MASK | + FZA_RING_RX_DA_MASK | FZA_RING_RX_SA_MASK)) == + (FZA_RING_RX_CRC | FZA_RING_RX_RRR_DADDR | + FZA_RING_RX_DA_CAM | FZA_RING_RX_SA_CAM)) { + /* Halt the interface to trigger a reset. */ + writew_o(FZA_CONTROL_A_HALT, &fp->regs->control_a); + readw_o(&fp->regs->control_a); /* Synchronize. */ + return 1; + } + + /* Check the MAC status. 
*/ + switch (rmc & FZA_RING_RX_RRR_MASK) { + case FZA_RING_RX_RRR_OK: + if ((rmc & FZA_RING_RX_CRC) != 0) + fp->stats.rx_crc_errors++; + else if ((rmc & FZA_RING_RX_FSC_MASK) == 0 || + (rmc & FZA_RING_RX_FSB_ERR) != 0) + fp->stats.rx_frame_errors++; + return 1; + case FZA_RING_RX_RRR_SADDR: + case FZA_RING_RX_RRR_DADDR: + case FZA_RING_RX_RRR_ABORT: + /* Halt the interface to trigger a reset. */ + writew_o(FZA_CONTROL_A_HALT, &fp->regs->control_a); + readw_o(&fp->regs->control_a); /* Synchronize. */ + return 1; + case FZA_RING_RX_RRR_LENGTH: + fp->stats.rx_frame_errors++; + return 1; + default: + return 1; + } + } + + /* Packet received successfully; validate the length. */ + switch (fc & FDDI_FC_K_FORMAT_MASK) { + case FDDI_FC_K_FORMAT_MANAGEMENT: + if ((fc & FDDI_FC_K_CLASS_MASK) == FDDI_FC_K_CLASS_ASYNC) + min_len = 37; + else + min_len = 17; + break; + case FDDI_FC_K_FORMAT_LLC: + min_len = 20; + break; + default: + min_len = 17; + break; + } + max_len = 4495; + if (len < min_len || len > max_len) { + fp->stats.rx_errors++; + fp->stats.rx_length_errors++; + return 1; + } + + return 0; +} + +static void fza_rx(struct net_device *dev) +{ + struct fza_private *fp = netdev_priv(dev); + struct sk_buff *skb, *newskb; + struct fza_fddihdr *frame; + dma_addr_t dma, newdma; + u32 own, rmc, buf; + int i, len; + u8 fc; + + while (1) { + i = fp->ring_hst_rx_index; + own = readl_o(&fp->ring_hst_rx[i].buf0_own); + if ((own & FZA_RING_OWN_MASK) == FZA_RING_OWN_FZA) + break; + + rmc = readl_u(&fp->ring_hst_rx[i].rmc); + skb = fp->rx_skbuff[i]; + dma = fp->rx_dma[i]; + + /* The RMC doesn't count the preamble and the starting + * delimiter. We fix it up here for a total of 3 octets. + */ + dma_rmb(); + len = (rmc & FZA_RING_PBC_MASK) + 3; + frame = (struct fza_fddihdr *)skb->data; + + /* We need to get at real FC. 
*/ + dma_sync_single_for_cpu(fp->bdev, + dma + + ((u8 *)&frame->hdr.fc - (u8 *)frame), + sizeof(frame->hdr.fc), + DMA_FROM_DEVICE); + fc = frame->hdr.fc; + + if (fza_rx_err(fp, rmc, fc)) + goto err_rx; + + /* We have to 512-byte-align RX buffers... */ + newskb = fza_alloc_skb_irq(dev, FZA_RX_BUFFER_SIZE + 511); + if (newskb) { + fza_skb_align(newskb, 512); + newdma = dma_map_single(fp->bdev, newskb->data, + FZA_RX_BUFFER_SIZE, + DMA_FROM_DEVICE); + if (dma_mapping_error(fp->bdev, newdma)) { + dev_kfree_skb_irq(newskb); + newskb = NULL; + } + } + if (newskb) { + int pkt_len = len - 7; /* Omit P, SD and FCS. */ + int is_multi; + int rx_stat; + + dma_unmap_single(fp->bdev, dma, FZA_RX_BUFFER_SIZE, + DMA_FROM_DEVICE); + + /* Queue SMT frames to the SMT receive ring. */ + if ((fc & (FDDI_FC_K_CLASS_MASK | + FDDI_FC_K_FORMAT_MASK)) == + (FDDI_FC_K_CLASS_ASYNC | + FDDI_FC_K_FORMAT_MANAGEMENT) && + (rmc & FZA_RING_RX_DA_MASK) != + FZA_RING_RX_DA_PROM) { + if (fza_do_recv_smt((struct fza_buffer_tx *) + skb->data, len, rmc, + dev)) { + writel_o(FZA_CONTROL_A_SMT_RX_OVFL, + &fp->regs->control_a); + } + } + + is_multi = ((frame->hdr.daddr[0] & 0x01) != 0); + + skb_reserve(skb, 3); /* Skip over P and SD. */ + skb_put(skb, pkt_len); /* And cut off FCS. 
*/ + skb->protocol = fddi_type_trans(skb, dev); + + rx_stat = netif_rx(skb); + if (rx_stat != NET_RX_DROP) { + fp->stats.rx_packets++; + fp->stats.rx_bytes += pkt_len; + if (is_multi) + fp->stats.multicast++; + } else { + fp->stats.rx_dropped++; + } + + skb = newskb; + dma = newdma; + fp->rx_skbuff[i] = skb; + fp->rx_dma[i] = dma; + } else { + fp->stats.rx_dropped++; + pr_notice("%s: memory squeeze, dropping packet\n", + fp->name); + } + +err_rx: + writel_o(0, &fp->ring_hst_rx[i].rmc); + buf = (dma + 0x1000) >> 9; + writel_o(buf, &fp->ring_hst_rx[i].buffer1); + buf = dma >> 9 | FZA_RING_OWN_FZA; + writel_o(buf, &fp->ring_hst_rx[i].buf0_own); + fp->ring_hst_rx_index = + (fp->ring_hst_rx_index + 1) % fp->ring_hst_rx_size; + } +} + +static void fza_tx_smt(struct net_device *dev) +{ + struct fza_private *fp = netdev_priv(dev); + struct fza_buffer_tx __iomem *smt_tx_ptr, *skb_data_ptr; + int i, len; + u32 own; + + while (1) { + i = fp->ring_smt_tx_index; + own = readl_o(&fp->ring_smt_tx[i].own); + if ((own & FZA_RING_OWN_MASK) == FZA_RING_OWN_FZA) + break; + + smt_tx_ptr = fp->mmio + readl_u(&fp->ring_smt_tx[i].buffer); + len = readl_u(&fp->ring_smt_tx[i].rmc) & FZA_RING_PBC_MASK; + + /* Queue the frame to the RMC transmit ring. 
*/ + if (!netif_queue_stopped(dev)) + fza_do_xmit((union fza_buffer_txp) + { .mmio_ptr = smt_tx_ptr }, + len, dev, 1); + + writel_o(FZA_RING_OWN_FZA, &fp->ring_smt_tx[i].own); + fp->ring_smt_tx_index = + (fp->ring_smt_tx_index + 1) % fp->ring_smt_tx_size; + } +} + +static void fza_uns(struct net_device *dev) +{ + struct fza_private *fp = netdev_priv(dev); + u32 own; + int i; + + while (1) { + i = fp->ring_uns_index; + own = readl_o(&fp->ring_uns[i].own); + if ((own & FZA_RING_OWN_MASK) == FZA_RING_OWN_FZA) + break; + + if (readl_u(&fp->ring_uns[i].id) == FZA_RING_UNS_RX_OVER) { + fp->stats.rx_errors++; + fp->stats.rx_over_errors++; + } + + writel_o(FZA_RING_OWN_FZA, &fp->ring_uns[i].own); + fp->ring_uns_index = + (fp->ring_uns_index + 1) % FZA_RING_UNS_SIZE; + } +} + +static void fza_tx_flush(struct net_device *dev) +{ + struct fza_private *fp = netdev_priv(dev); + u32 own; + int i; + + /* Clean up the SMT TX ring. */ + i = fp->ring_smt_tx_index; + do { + writel_o(FZA_RING_OWN_FZA, &fp->ring_smt_tx[i].own); + fp->ring_smt_tx_index = + (fp->ring_smt_tx_index + 1) % fp->ring_smt_tx_size; + + } while (i != fp->ring_smt_tx_index); + + /* Clean up the RMC TX ring. */ + i = fp->ring_rmc_tx_index; + do { + own = readl_o(&fp->ring_rmc_tx[i].own); + if ((own & FZA_RING_OWN_MASK) == FZA_RING_TX_OWN_RMC) { + u32 rmc = readl_u(&fp->ring_rmc_tx[i].rmc); + + writel_u(rmc | FZA_RING_TX_DTP, + &fp->ring_rmc_tx[i].rmc); + } + fp->ring_rmc_tx_index = + (fp->ring_rmc_tx_index + 1) % fp->ring_rmc_tx_size; + + } while (i != fp->ring_rmc_tx_index); + + /* Done. */ + writew_o(FZA_CONTROL_A_FLUSH_DONE, &fp->regs->control_a); +} + +static irqreturn_t fza_interrupt(int irq, void *dev_id) +{ + struct net_device *dev = dev_id; + struct fza_private *fp = netdev_priv(dev); + uint int_event; + + /* Get interrupt events. */ + int_event = readw_o(&fp->regs->int_event) & fp->int_mask; + if (int_event == 0) + return IRQ_NONE; + + /* Clear the events. 
*/ + writew_u(int_event, &fp->regs->int_event); + + /* Now handle the events. The order matters. */ + + /* Command finished interrupt. */ + if ((int_event & FZA_EVENT_CMD_DONE) != 0) { + fp->irq_count_cmd_done++; + + spin_lock(&fp->lock); + fp->cmd_done_flag = 1; + wake_up(&fp->cmd_done_wait); + spin_unlock(&fp->lock); + } + + /* Transmit finished interrupt. */ + if ((int_event & FZA_EVENT_TX_DONE) != 0) { + fp->irq_count_tx_done++; + fza_tx(dev); + } + + /* Host receive interrupt. */ + if ((int_event & FZA_EVENT_RX_POLL) != 0) { + fp->irq_count_rx_poll++; + fza_rx(dev); + } + + /* SMT transmit interrupt. */ + if ((int_event & FZA_EVENT_SMT_TX_POLL) != 0) { + fp->irq_count_smt_tx_poll++; + fza_tx_smt(dev); + } + + /* Transmit ring flush request. */ + if ((int_event & FZA_EVENT_FLUSH_TX) != 0) { + fp->irq_count_flush_tx++; + fza_tx_flush(dev); + } + + /* Link status change interrupt. */ + if ((int_event & FZA_EVENT_LINK_ST_CHG) != 0) { + uint status; + + fp->irq_count_link_st_chg++; + status = readw_u(&fp->regs->status); + if (FZA_STATUS_GET_LINK(status) == FZA_LINK_ON) { + netif_carrier_on(dev); + pr_info("%s: link available\n", fp->name); + } else { + netif_carrier_off(dev); + pr_info("%s: link unavailable\n", fp->name); + } + } + + /* Unsolicited event interrupt. */ + if ((int_event & FZA_EVENT_UNS_POLL) != 0) { + fp->irq_count_uns_poll++; + fza_uns(dev); + } + + /* State change interrupt. 
*/ + if ((int_event & FZA_EVENT_STATE_CHG) != 0) { + uint status, state; + + fp->irq_count_state_chg++; + + status = readw_u(&fp->regs->status); + state = FZA_STATUS_GET_STATE(status); + pr_debug("%s: state change: %x\n", fp->name, state); + switch (state) { + case FZA_STATE_RESET: + break; + + case FZA_STATE_UNINITIALIZED: + netif_carrier_off(dev); + del_timer_sync(&fp->reset_timer); + fp->ring_cmd_index = 0; + fp->ring_uns_index = 0; + fp->ring_rmc_tx_index = 0; + fp->ring_rmc_txd_index = 0; + fp->ring_hst_rx_index = 0; + fp->ring_smt_tx_index = 0; + fp->ring_smt_rx_index = 0; + if (fp->state > state) { + pr_info("%s: OK\n", fp->name); + fza_cmd_send(dev, FZA_RING_CMD_INIT); + } + break; + + case FZA_STATE_INITIALIZED: + if (fp->state > state) { + fza_set_rx_mode(dev); + fza_cmd_send(dev, FZA_RING_CMD_PARAM); + } + break; + + case FZA_STATE_RUNNING: + case FZA_STATE_MAINTENANCE: + fp->state = state; + fza_rx_init(fp); + fp->queue_active = 1; + netif_wake_queue(dev); + pr_debug("%s: queue woken\n", fp->name); + break; + + case FZA_STATE_HALTED: + fp->queue_active = 0; + netif_stop_queue(dev); + pr_debug("%s: queue stopped\n", fp->name); + del_timer_sync(&fp->reset_timer); + pr_warn("%s: halted, reason: %x\n", fp->name, + FZA_STATUS_GET_HALT(status)); + fza_regs_dump(fp); + pr_info("%s: resetting the board...\n", fp->name); + fza_do_reset(fp); + fp->timer_state = 0; + fp->reset_timer.expires = jiffies + 45 * HZ; + add_timer(&fp->reset_timer); + break; + + default: + pr_warn("%s: undefined state: %x\n", fp->name, state); + break; + } + + spin_lock(&fp->lock); + fp->state_chg_flag = 1; + wake_up(&fp->state_chg_wait); + spin_unlock(&fp->lock); + } + + return IRQ_HANDLED; +} + +static void fza_reset_timer(struct timer_list *t) +{ + struct fza_private *fp = from_timer(fp, t, reset_timer); + + if (!fp->timer_state) { + pr_err("%s: RESET timed out!\n", fp->name); + pr_info("%s: trying harder...\n", fp->name); + + /* Assert the board reset. 
*/ + writew_o(FZA_RESET_INIT, &fp->regs->reset); + readw_o(&fp->regs->reset); /* Synchronize. */ + + fp->timer_state = 1; + fp->reset_timer.expires = jiffies + HZ; + } else { + /* Clear the board reset. */ + writew_u(FZA_RESET_CLR, &fp->regs->reset); + + /* Enable all interrupt events we handle. */ + writew_o(fp->int_mask, &fp->regs->int_mask); + readw_o(&fp->regs->int_mask); /* Synchronize. */ + + fp->timer_state = 0; + fp->reset_timer.expires = jiffies + 45 * HZ; + } + add_timer(&fp->reset_timer); +} + +static int fza_set_mac_address(struct net_device *dev, void *addr) +{ + return -EOPNOTSUPP; +} + +static netdev_tx_t fza_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct fza_private *fp = netdev_priv(dev); + unsigned int old_mask, new_mask; + int ret; + u8 fc; + + skb_push(skb, 3); /* Make room for PRH. */ + + /* Decode FC to set PRH. */ + fc = skb->data[3]; + skb->data[0] = 0; + skb->data[1] = 0; + skb->data[2] = FZA_PRH2_NORMAL; + if ((fc & FDDI_FC_K_CLASS_MASK) == FDDI_FC_K_CLASS_SYNC) + skb->data[0] |= FZA_PRH0_FRAME_SYNC; + switch (fc & FDDI_FC_K_FORMAT_MASK) { + case FDDI_FC_K_FORMAT_MANAGEMENT: + if ((fc & FDDI_FC_K_CONTROL_MASK) == 0) { + /* Token. */ + skb->data[0] |= FZA_PRH0_TKN_TYPE_IMM; + skb->data[1] |= FZA_PRH1_TKN_SEND_NONE; + } else { + /* SMT or MAC. */ + skb->data[0] |= FZA_PRH0_TKN_TYPE_UNR; + skb->data[1] |= FZA_PRH1_TKN_SEND_UNR; + } + skb->data[1] |= FZA_PRH1_CRC_NORMAL; + break; + case FDDI_FC_K_FORMAT_LLC: + case FDDI_FC_K_FORMAT_FUTURE: + skb->data[0] |= FZA_PRH0_TKN_TYPE_UNR; + skb->data[1] |= FZA_PRH1_CRC_NORMAL | FZA_PRH1_TKN_SEND_UNR; + break; + case FDDI_FC_K_FORMAT_IMPLEMENTOR: + skb->data[0] |= FZA_PRH0_TKN_TYPE_UNR; + skb->data[1] |= FZA_PRH1_TKN_SEND_ORIG; + break; + } + + /* SMT transmit interrupts may sneak frames into the RMC + * transmit ring. We disable them while queueing a frame + * to maintain consistency. 
+ */ + old_mask = fp->int_mask; + new_mask = old_mask & ~FZA_MASK_SMT_TX_POLL; + writew_u(new_mask, &fp->regs->int_mask); + readw_o(&fp->regs->int_mask); /* Synchronize. */ + fp->int_mask = new_mask; + ret = fza_do_xmit((union fza_buffer_txp) + { .data_ptr = (struct fza_buffer_tx *)skb->data }, + skb->len, dev, 0); + fp->int_mask = old_mask; + writew_u(fp->int_mask, &fp->regs->int_mask); + + if (ret) { + /* Probably an SMT packet filled the remaining space, + * so just stop the queue, but don't report it as an error. + */ + netif_stop_queue(dev); + pr_debug("%s: queue stopped\n", fp->name); + fp->stats.tx_dropped++; + } + + dev_kfree_skb(skb); + + return ret; +} + +static int fza_open(struct net_device *dev) +{ + struct fza_private *fp = netdev_priv(dev); + struct fza_ring_cmd __iomem *ring; + struct sk_buff *skb; + unsigned long flags; + dma_addr_t dma; + int ret, i; + u32 stat; + long t; + + for (i = 0; i < FZA_RING_RX_SIZE; i++) { + /* We have to 512-byte-align RX buffers... */ + skb = fza_alloc_skb(dev, FZA_RX_BUFFER_SIZE + 511); + if (skb) { + fza_skb_align(skb, 512); + dma = dma_map_single(fp->bdev, skb->data, + FZA_RX_BUFFER_SIZE, + DMA_FROM_DEVICE); + if (dma_mapping_error(fp->bdev, dma)) { + dev_kfree_skb(skb); + skb = NULL; + } + } + if (!skb) { + for (--i; i >= 0; i--) { + dma_unmap_single(fp->bdev, fp->rx_dma[i], + FZA_RX_BUFFER_SIZE, + DMA_FROM_DEVICE); + dev_kfree_skb(fp->rx_skbuff[i]); + fp->rx_dma[i] = 0; + fp->rx_skbuff[i] = NULL; + } + return -ENOMEM; + } + fp->rx_skbuff[i] = skb; + fp->rx_dma[i] = dma; + } + + ret = fza_init_send(dev, NULL); + if (ret != 0) + return ret; + + /* Purger and Beacon multicasts need to be supplied before PARAM. 
*/ + fza_set_rx_mode(dev); + + spin_lock_irqsave(&fp->lock, flags); + fp->cmd_done_flag = 0; + ring = fza_cmd_send(dev, FZA_RING_CMD_PARAM); + spin_unlock_irqrestore(&fp->lock, flags); + if (!ring) + return -ENOBUFS; + + t = wait_event_timeout(fp->cmd_done_wait, fp->cmd_done_flag, 3 * HZ); + if (fp->cmd_done_flag == 0) { + pr_err("%s: PARAM command timed out!, state %x\n", fp->name, + FZA_STATUS_GET_STATE(readw_u(&fp->regs->status))); + return -EIO; + } + stat = readl_u(&ring->stat); + if (stat != FZA_RING_STAT_SUCCESS) { + pr_err("%s: PARAM command failed!, status %02x, state %x\n", + fp->name, stat, + FZA_STATUS_GET_STATE(readw_u(&fp->regs->status))); + return -EIO; + } + pr_debug("%s: PARAM: %lums elapsed\n", fp->name, + (3 * HZ - t) * 1000 / HZ); + + return 0; +} + +static int fza_close(struct net_device *dev) +{ + struct fza_private *fp = netdev_priv(dev); + unsigned long flags; + uint state; + long t; + int i; + + netif_stop_queue(dev); + pr_debug("%s: queue stopped\n", fp->name); + + del_timer_sync(&fp->reset_timer); + spin_lock_irqsave(&fp->lock, flags); + fp->state = FZA_STATE_UNINITIALIZED; + fp->state_chg_flag = 0; + /* Shut the interface down. */ + writew_o(FZA_CONTROL_A_SHUT, &fp->regs->control_a); + readw_o(&fp->regs->control_a); /* Synchronize. */ + spin_unlock_irqrestore(&fp->lock, flags); + + /* DEC says SHUT needs up to 10 seconds to complete. 
*/ + t = wait_event_timeout(fp->state_chg_wait, fp->state_chg_flag, + 15 * HZ); + state = FZA_STATUS_GET_STATE(readw_o(&fp->regs->status)); + if (fp->state_chg_flag == 0) { + pr_err("%s: SHUT timed out!, state %x\n", fp->name, state); + return -EIO; + } + if (state != FZA_STATE_UNINITIALIZED) { + pr_err("%s: SHUT failed!, state %x\n", fp->name, state); + return -EIO; + } + pr_debug("%s: SHUT: %lums elapsed\n", fp->name, + (15 * HZ - t) * 1000 / HZ); + + for (i = 0; i < FZA_RING_RX_SIZE; i++) + if (fp->rx_skbuff[i]) { + dma_unmap_single(fp->bdev, fp->rx_dma[i], + FZA_RX_BUFFER_SIZE, DMA_FROM_DEVICE); + dev_kfree_skb(fp->rx_skbuff[i]); + fp->rx_dma[i] = 0; + fp->rx_skbuff[i] = NULL; + } + + return 0; +} + +static struct net_device_stats *fza_get_stats(struct net_device *dev) +{ + struct fza_private *fp = netdev_priv(dev); + + return &fp->stats; +} + +static int fza_probe(struct device *bdev) +{ + static const struct net_device_ops netdev_ops = { + .ndo_open = fza_open, + .ndo_stop = fza_close, + .ndo_start_xmit = fza_start_xmit, + .ndo_set_rx_mode = fza_set_rx_mode, + .ndo_set_mac_address = fza_set_mac_address, + .ndo_get_stats = fza_get_stats, + }; + static int version_printed; + char rom_rev[4], fw_rev[4], rmc_rev[4]; + struct tc_dev *tdev = to_tc_dev(bdev); + struct fza_cmd_init __iomem *init; + resource_size_t start, len; + struct net_device *dev; + struct fza_private *fp; + uint smt_ver, pmd_type; + void __iomem *mmio; + uint hw_addr[2]; + int ret, i; + + if (!version_printed) { + pr_info("%s", version); + version_printed = 1; + } + + dev = alloc_fddidev(sizeof(*fp)); + if (!dev) + return -ENOMEM; + SET_NETDEV_DEV(dev, bdev); + + fp = netdev_priv(dev); + dev_set_drvdata(bdev, dev); + + fp->bdev = bdev; + fp->name = dev_name(bdev); + + /* Request the I/O MEM resource. 
*/ + start = tdev->resource.start; + len = tdev->resource.end - start + 1; + if (!request_mem_region(start, len, dev_name(bdev))) { + pr_err("%s: cannot reserve MMIO region\n", fp->name); + ret = -EBUSY; + goto err_out_kfree; + } + + /* MMIO mapping setup. */ + mmio = ioremap_nocache(start, len); + if (!mmio) { + pr_err("%s: cannot map MMIO\n", fp->name); + ret = -ENOMEM; + goto err_out_resource; + } + + /* Initialize the new device structure. */ + switch (loopback) { + case FZA_LOOP_NORMAL: + case FZA_LOOP_INTERN: + case FZA_LOOP_EXTERN: + break; + default: + loopback = FZA_LOOP_NORMAL; + } + + fp->mmio = mmio; + dev->irq = tdev->interrupt; + + pr_info("%s: DEC FDDIcontroller 700 or 700-C at 0x%08llx, irq %d\n", + fp->name, (long long)tdev->resource.start, dev->irq); + pr_debug("%s: mapped at: 0x%p\n", fp->name, mmio); + + fp->regs = mmio + FZA_REG_BASE; + fp->ring_cmd = mmio + FZA_RING_CMD; + fp->ring_uns = mmio + FZA_RING_UNS; + + init_waitqueue_head(&fp->state_chg_wait); + init_waitqueue_head(&fp->cmd_done_wait); + spin_lock_init(&fp->lock); + fp->int_mask = FZA_MASK_NORMAL; + + timer_setup(&fp->reset_timer, fza_reset_timer, 0); + + /* Sanitize the board. */ + fza_regs_dump(fp); + fza_do_shutdown(fp); + + ret = request_irq(dev->irq, fza_interrupt, IRQF_SHARED, fp->name, dev); + if (ret != 0) { + pr_err("%s: unable to get IRQ %d!\n", fp->name, dev->irq); + goto err_out_map; + } + + /* Enable the driver mode. */ + writew_o(FZA_CONTROL_B_DRIVER, &fp->regs->control_b); + + /* For some reason transmit done interrupts can trigger during + * reset. This avoids a division error in the handler. 
+ */ + fp->ring_rmc_tx_size = FZA_RING_TX_SIZE; + + ret = fza_reset(fp); + if (ret != 0) + goto err_out_irq; + + ret = fza_init_send(dev, &init); + if (ret != 0) + goto err_out_irq; + + fza_reads(&init->hw_addr, &hw_addr, sizeof(hw_addr)); + memcpy(dev->dev_addr, &hw_addr, FDDI_K_ALEN); + + fza_reads(&init->rom_rev, &rom_rev, sizeof(rom_rev)); + fza_reads(&init->fw_rev, &fw_rev, sizeof(fw_rev)); + fza_reads(&init->rmc_rev, &rmc_rev, sizeof(rmc_rev)); + for (i = 3; i >= 0 && rom_rev[i] == ' '; i--) + rom_rev[i] = 0; + for (i = 3; i >= 0 && fw_rev[i] == ' '; i--) + fw_rev[i] = 0; + for (i = 3; i >= 0 && rmc_rev[i] == ' '; i--) + rmc_rev[i] = 0; + + fp->ring_rmc_tx = mmio + readl_u(&init->rmc_tx); + fp->ring_rmc_tx_size = readl_u(&init->rmc_tx_size); + fp->ring_hst_rx = mmio + readl_u(&init->hst_rx); + fp->ring_hst_rx_size = readl_u(&init->hst_rx_size); + fp->ring_smt_tx = mmio + readl_u(&init->smt_tx); + fp->ring_smt_tx_size = readl_u(&init->smt_tx_size); + fp->ring_smt_rx = mmio + readl_u(&init->smt_rx); + fp->ring_smt_rx_size = readl_u(&init->smt_rx_size); + + fp->buffer_tx = mmio + FZA_TX_BUFFER_ADDR(readl_u(&init->rmc_tx)); + + fp->t_max = readl_u(&init->def_t_max); + fp->t_req = readl_u(&init->def_t_req); + fp->tvx = readl_u(&init->def_tvx); + fp->lem_threshold = readl_u(&init->lem_threshold); + fza_reads(&init->def_station_id, &fp->station_id, + sizeof(fp->station_id)); + fp->rtoken_timeout = readl_u(&init->rtoken_timeout); + fp->ring_purger = readl_u(&init->ring_purger); + + smt_ver = readl_u(&init->smt_ver); + pmd_type = readl_u(&init->pmd_type); + + pr_debug("%s: INIT parameters:\n", fp->name); + pr_debug(" tx_mode: %u\n", readl_u(&init->tx_mode)); + pr_debug(" hst_rx_size: %u\n", readl_u(&init->hst_rx_size)); + pr_debug(" rmc_rev: %.4s\n", rmc_rev); + pr_debug(" rom_rev: %.4s\n", rom_rev); + pr_debug(" fw_rev: %.4s\n", fw_rev); + pr_debug(" mop_type: %u\n", readl_u(&init->mop_type)); + pr_debug(" hst_rx: 0x%08x\n", readl_u(&init->hst_rx)); + pr_debug(" 
rmc_tx: 0x%08x\n", readl_u(&init->rmc_tx)); + pr_debug(" rmc_tx_size: %u\n", readl_u(&init->rmc_tx_size)); + pr_debug(" smt_tx: 0x%08x\n", readl_u(&init->smt_tx)); + pr_debug(" smt_tx_size: %u\n", readl_u(&init->smt_tx_size)); + pr_debug(" smt_rx: 0x%08x\n", readl_u(&init->smt_rx)); + pr_debug(" smt_rx_size: %u\n", readl_u(&init->smt_rx_size)); + /* TC systems are always LE, so don't bother swapping. */ + pr_debug(" hw_addr: 0x%02x%02x%02x%02x%02x%02x%02x%02x\n", + (readl_u(&init->hw_addr[0]) >> 0) & 0xff, + (readl_u(&init->hw_addr[0]) >> 8) & 0xff, + (readl_u(&init->hw_addr[0]) >> 16) & 0xff, + (readl_u(&init->hw_addr[0]) >> 24) & 0xff, + (readl_u(&init->hw_addr[1]) >> 0) & 0xff, + (readl_u(&init->hw_addr[1]) >> 8) & 0xff, + (readl_u(&init->hw_addr[1]) >> 16) & 0xff, + (readl_u(&init->hw_addr[1]) >> 24) & 0xff); + pr_debug(" def_t_req: %u\n", readl_u(&init->def_t_req)); + pr_debug(" def_tvx: %u\n", readl_u(&init->def_tvx)); + pr_debug(" def_t_max: %u\n", readl_u(&init->def_t_max)); + pr_debug(" lem_threshold: %u\n", readl_u(&init->lem_threshold)); + /* Don't bother swapping, see above. 
*/ + pr_debug(" def_station_id: 0x%02x%02x%02x%02x%02x%02x%02x%02x\n", + (readl_u(&init->def_station_id[0]) >> 0) & 0xff, + (readl_u(&init->def_station_id[0]) >> 8) & 0xff, + (readl_u(&init->def_station_id[0]) >> 16) & 0xff, + (readl_u(&init->def_station_id[0]) >> 24) & 0xff, + (readl_u(&init->def_station_id[1]) >> 0) & 0xff, + (readl_u(&init->def_station_id[1]) >> 8) & 0xff, + (readl_u(&init->def_station_id[1]) >> 16) & 0xff, + (readl_u(&init->def_station_id[1]) >> 24) & 0xff); + pr_debug(" pmd_type_alt: %u\n", readl_u(&init->pmd_type_alt)); + pr_debug(" smt_ver: %u\n", readl_u(&init->smt_ver)); + pr_debug(" rtoken_timeout: %u\n", readl_u(&init->rtoken_timeout)); + pr_debug(" ring_purger: %u\n", readl_u(&init->ring_purger)); + pr_debug(" smt_ver_max: %u\n", readl_u(&init->smt_ver_max)); + pr_debug(" smt_ver_min: %u\n", readl_u(&init->smt_ver_min)); + pr_debug(" pmd_type: %u\n", readl_u(&init->pmd_type)); + + pr_info("%s: model %s, address %pMF\n", + fp->name, + pmd_type == FZA_PMD_TYPE_TW ? + "700-C (DEFZA-CA), ThinWire PMD selected" : + pmd_type == FZA_PMD_TYPE_STP ? + "700-C (DEFZA-CA), STP PMD selected" : + "700 (DEFZA-AA), MMF PMD", + dev->dev_addr); + pr_info("%s: ROM rev. %.4s, firmware rev. %.4s, RMC rev. %.4s, " + "SMT ver. %u\n", fp->name, rom_rev, fw_rev, rmc_rev, smt_ver); + + /* Now that we fetched initial parameters just shut the interface + * until opened. + */ + ret = fza_close(dev); + if (ret != 0) + goto err_out_irq; + + /* The FZA-specific entries in the device structure. 
*/ + dev->netdev_ops = &netdev_ops; + + ret = register_netdev(dev); + if (ret != 0) + goto err_out_irq; + + pr_info("%s: registered as %s\n", fp->name, dev->name); + fp->name = (const char *)dev->name; + + get_device(bdev); + return 0; + +err_out_irq: + del_timer_sync(&fp->reset_timer); + fza_do_shutdown(fp); + free_irq(dev->irq, dev); + +err_out_map: + iounmap(mmio); + +err_out_resource: + release_mem_region(start, len); + +err_out_kfree: + free_netdev(dev); + + pr_err("%s: initialization failure, aborting!\n", fp->name); + return ret; +} + +static int fza_remove(struct device *bdev) +{ + struct net_device *dev = dev_get_drvdata(bdev); + struct fza_private *fp = netdev_priv(dev); + struct tc_dev *tdev = to_tc_dev(bdev); + resource_size_t start, len; + + put_device(bdev); + + unregister_netdev(dev); + + del_timer_sync(&fp->reset_timer); + fza_do_shutdown(fp); + free_irq(dev->irq, dev); + + iounmap(fp->mmio); + + start = tdev->resource.start; + len = tdev->resource.end - start + 1; + release_mem_region(start, len); + + free_netdev(dev); + + return 0; +} + +static struct tc_device_id const fza_tc_table[] = { + { "DEC ", "PMAF-AA " }, + { } +}; +MODULE_DEVICE_TABLE(tc, fza_tc_table); + +static struct tc_driver fza_driver = { + .id_table = fza_tc_table, + .driver = { + .name = "defza", + .bus = &tc_bus_type, + .probe = fza_probe, + .remove = fza_remove, + }, +}; + +static int fza_init(void) +{ + return tc_register_driver(&fza_driver); +} + +static void fza_exit(void) +{ + tc_unregister_driver(&fza_driver); +} + +module_init(fza_init); +module_exit(fza_exit); diff --git a/drivers/net/fddi/defza.h b/drivers/net/fddi/defza.h new file mode 100644 index 000000000000..b06acf32738e --- /dev/null +++ b/drivers/net/fddi/defza.h @@ -0,0 +1,791 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* FDDI network adapter driver for DEC FDDIcontroller 700/700-C devices. + * + * Copyright (c) 2018 Maciej W. 
Rozycki + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * References: + * + * Dave Sawyer & Phil Weeks & Frank Itkowsky, + * "DEC FDDIcontroller 700 Port Specification", + * Revision 1.1, Digital Equipment Corporation + */ + +#include +#include +#include +#include +#include + +/* IOmem register offsets. */ +#define FZA_REG_BASE 0x100000 /* register base address */ +#define FZA_REG_RESET 0x100200 /* reset, r/w */ +#define FZA_REG_INT_EVENT 0x100400 /* interrupt event, r/w1c */ +#define FZA_REG_STATUS 0x100402 /* status, r/o */ +#define FZA_REG_INT_MASK 0x100404 /* interrupt mask, r/w */ +#define FZA_REG_CONTROL_A 0x100500 /* control A, r/w1s */ +#define FZA_REG_CONTROL_B 0x100502 /* control B, r/w */ + +/* Reset register constants. Bits 1:0 are r/w, others are fixed at 0. */ +#define FZA_RESET_DLU 0x0002 /* OR with INIT to blast flash memory */ +#define FZA_RESET_INIT 0x0001 /* switch into the reset state */ +#define FZA_RESET_CLR 0x0000 /* run self-test and return to work */ + +/* Interrupt event register constants. All bits are r/w1c. 
*/ +#define FZA_EVENT_DLU_DONE 0x0800 /* flash memory write complete */ +#define FZA_EVENT_FLUSH_TX 0x0400 /* transmit ring flush request */ +#define FZA_EVENT_PM_PARITY_ERR 0x0200 /* onboard packet memory parity err */ +#define FZA_EVENT_HB_PARITY_ERR 0x0100 /* host bus parity error */ +#define FZA_EVENT_NXM_ERR 0x0080 /* non-existent memory access error; + * also raised for unaligned and + * unsupported partial-word accesses + */ +#define FZA_EVENT_LINK_ST_CHG 0x0040 /* link status change */ +#define FZA_EVENT_STATE_CHG 0x0020 /* adapter state change */ +#define FZA_EVENT_UNS_POLL 0x0010 /* unsolicited event service request */ +#define FZA_EVENT_CMD_DONE 0x0008 /* command done ack */ +#define FZA_EVENT_SMT_TX_POLL 0x0004 /* SMT frame transmit request */ +#define FZA_EVENT_RX_POLL 0x0002 /* receive request (packet avail.) */ +#define FZA_EVENT_TX_DONE 0x0001 /* RMC transmit done ack */ + +/* Status register constants. All bits are r/o. */ +#define FZA_STATUS_DLU_SHIFT 0xc /* down line upgrade status bits */ +#define FZA_STATUS_DLU_MASK 0x03 +#define FZA_STATUS_LINK_SHIFT 0xb /* link status bits */ +#define FZA_STATUS_LINK_MASK 0x01 +#define FZA_STATUS_STATE_SHIFT 0x8 /* adapter state bits */ +#define FZA_STATUS_STATE_MASK 0x07 +#define FZA_STATUS_HALT_SHIFT 0x0 /* halt reason bits */ +#define FZA_STATUS_HALT_MASK 0xff +#define FZA_STATUS_TEST_SHIFT 0x0 /* test failure bits */ +#define FZA_STATUS_TEST_MASK 0xff + +#define FZA_STATUS_GET_DLU(x) (((x) >> FZA_STATUS_DLU_SHIFT) & \ + FZA_STATUS_DLU_MASK) +#define FZA_STATUS_GET_LINK(x) (((x) >> FZA_STATUS_LINK_SHIFT) & \ + FZA_STATUS_LINK_MASK) +#define FZA_STATUS_GET_STATE(x) (((x) >> FZA_STATUS_STATE_SHIFT) & \ + FZA_STATUS_STATE_MASK) +#define FZA_STATUS_GET_HALT(x) (((x) >> FZA_STATUS_HALT_SHIFT) & \ + FZA_STATUS_HALT_MASK) +#define FZA_STATUS_GET_TEST(x) (((x) >> FZA_STATUS_TEST_SHIFT) & \ + FZA_STATUS_TEST_MASK) + +#define FZA_DLU_FAILURE 0x0 /* DLU catastrophic error; brain dead */ +#define FZA_DLU_ERROR 0x1 /* 
DLU error; old firmware intact */ +#define FZA_DLU_SUCCESS 0x2 /* DLU OK; new firmware loaded */ + +#define FZA_LINK_OFF 0x0 /* link unavailable */ +#define FZA_LINK_ON 0x1 /* link available */ + +#define FZA_STATE_RESET 0x0 /* resetting */ +#define FZA_STATE_UNINITIALIZED 0x1 /* after a reset */ +#define FZA_STATE_INITIALIZED 0x2 /* initialized */ +#define FZA_STATE_RUNNING 0x3 /* running (link active) */ +#define FZA_STATE_MAINTENANCE 0x4 /* running (link looped back) */ +#define FZA_STATE_HALTED 0x5 /* halted (error condition) */ + +#define FZA_HALT_UNKNOWN 0x00 /* unknown reason */ +#define FZA_HALT_HOST 0x01 /* host-directed HALT */ +#define FZA_HALT_HB_PARITY 0x02 /* host bus parity error */ +#define FZA_HALT_NXM 0x03 /* adapter non-existent memory ref. */ +#define FZA_HALT_SW 0x04 /* adapter software fault */ +#define FZA_HALT_HW 0x05 /* adapter hardware fault */ +#define FZA_HALT_PC_TRACE 0x06 /* PC Trace path test */ +#define FZA_HALT_DLSW 0x07 /* data link software fault */ +#define FZA_HALT_DLHW 0x08 /* data link hardware fault */ + +#define FZA_TEST_FATAL 0x00 /* self-test catastrophic failure */ +#define FZA_TEST_68K 0x01 /* 68000 CPU */ +#define FZA_TEST_SRAM_BWADDR 0x02 /* SRAM byte/word address */ +#define FZA_TEST_SRAM_DBUS 0x03 /* SRAM data bus */ +#define FZA_TEST_SRAM_STUCK1 0x04 /* SRAM stuck-at range 1 */ +#define FZA_TEST_SRAM_STUCK2 0x05 /* SRAM stuck-at range 2 */ +#define FZA_TEST_SRAM_COUPL1 0x06 /* SRAM coupling range 1 */ +#define FZA_TEST_SRAM_COUPL2 0x07 /* SRAM coupling */ +#define FZA_TEST_FLASH_CRC 0x08 /* Flash CRC */ +#define FZA_TEST_ROM 0x09 /* option ROM */ +#define FZA_TEST_PHY_CSR 0x0a /* PHY CSR */ +#define FZA_TEST_MAC_BIST 0x0b /* MAC BiST */ +#define FZA_TEST_MAC_CSR 0x0c /* MAC CSR */ +#define FZA_TEST_MAC_ADDR_UNIQ 0x0d /* MAC unique address */ +#define FZA_TEST_ELM_BIST 0x0e /* ELM BiST */ +#define FZA_TEST_ELM_CSR 0x0f /* ELM CSR */ +#define FZA_TEST_ELM_ADDR_UNIQ 0x10 /* ELM unique address */ +#define FZA_TEST_CAM 
0x11 /* CAM */ +#define FZA_TEST_NIROM 0x12 /* NI ROM checksum */ +#define FZA_TEST_SC_LOOP 0x13 /* SC loopback packet */ +#define FZA_TEST_LM_LOOP 0x14 /* LM loopback packet */ +#define FZA_TEST_EB_LOOP 0x15 /* EB loopback packet */ +#define FZA_TEST_SC_LOOP_BYPS 0x16 /* SC bypass loopback packet */ +#define FZA_TEST_LM_LOOP_LOCAL 0x17 /* LM local loopback packet */ +#define FZA_TEST_EB_LOOP_LOCAL 0x18 /* EB local loopback packet */ +#define FZA_TEST_CDC_LOOP 0x19 /* CDC loopback packet */ +#define FZA_TEST_FIBER_LOOP 0x1A /* FIBER loopback packet */ +#define FZA_TEST_CAM_MATCH_LOOP 0x1B /* CAM match packet loopback */ +#define FZA_TEST_68K_IRQ_STUCK 0x1C /* 68000 interrupt line stuck-at */ +#define FZA_TEST_IRQ_PRESENT 0x1D /* interrupt present register */ +#define FZA_TEST_RMC_BIST 0x1E /* RMC BiST */ +#define FZA_TEST_RMC_CSR 0x1F /* RMC CSR */ +#define FZA_TEST_RMC_ADDR_UNIQ 0x20 /* RMC unique address */ +#define FZA_TEST_PM_DPATH 0x21 /* packet memory data path */ +#define FZA_TEST_PM_ADDR 0x22 /* packet memory address */ +#define FZA_TEST_RES_23 0x23 /* reserved */ +#define FZA_TEST_PM_DESC 0x24 /* packet memory descriptor */ +#define FZA_TEST_PM_OWN 0x25 /* packet memory own bit */ +#define FZA_TEST_PM_PARITY 0x26 /* packet memory parity */ +#define FZA_TEST_PM_BSWAP 0x27 /* packet memory byte swap */ +#define FZA_TEST_PM_WSWAP 0x28 /* packet memory word swap */ +#define FZA_TEST_PM_REF 0x29 /* packet memory refresh */ +#define FZA_TEST_PM_CSR 0x2A /* PM CSR */ +#define FZA_TEST_PORT_STATUS 0x2B /* port status register */ +#define FZA_TEST_HOST_IRQMASK 0x2C /* host interrupt mask */ +#define FZA_TEST_TIMER_IRQ1 0x2D /* RTOS timer */ +#define FZA_TEST_FORCE_IRQ1 0x2E /* force RTOS IRQ1 */ +#define FZA_TEST_TIMER_IRQ5 0x2F /* IRQ5 backoff timer */ +#define FZA_TEST_FORCE_IRQ5 0x30 /* force IRQ5 */ +#define FZA_TEST_RES_31 0x31 /* reserved */ +#define FZA_TEST_IC_PRIO 0x32 /* interrupt controller priority */ +#define FZA_TEST_PM_FULL 0x33 /* full packet memory 
*/ +#define FZA_TEST_PMI_DMA 0x34 /* PMI DMA */ + +/* Interrupt mask register constants. All bits are r/w. */ +#define FZA_MASK_RESERVED 0xf000 /* unused */ +#define FZA_MASK_DLU_DONE 0x0800 /* flash memory write complete */ +#define FZA_MASK_FLUSH_TX 0x0400 /* transmit ring flush request */ +#define FZA_MASK_PM_PARITY_ERR 0x0200 /* onboard packet memory parity error + */ +#define FZA_MASK_HB_PARITY_ERR 0x0100 /* host bus parity error */ +#define FZA_MASK_NXM_ERR 0x0080 /* adapter non-existent memory + * reference + */ +#define FZA_MASK_LINK_ST_CHG 0x0040 /* link status change */ +#define FZA_MASK_STATE_CHG 0x0020 /* adapter state change */ +#define FZA_MASK_UNS_POLL 0x0010 /* unsolicited event service request */ +#define FZA_MASK_CMD_DONE 0x0008 /* command ring entry processed */ +#define FZA_MASK_SMT_TX_POLL 0x0004 /* SMT frame transmit request */ +#define FZA_MASK_RCV_POLL 0x0002 /* receive request (packet available) + */ +#define FZA_MASK_TX_DONE 0x0001 /* RMC transmit done acknowledge */ + +/* Which interrupts to receive: 0/1 is mask/unmask. */ +#define FZA_MASK_NONE 0x0000 +#define FZA_MASK_NORMAL \ + ((~(FZA_MASK_RESERVED | FZA_MASK_DLU_DONE | \ + FZA_MASK_PM_PARITY_ERR | FZA_MASK_HB_PARITY_ERR | \ + FZA_MASK_NXM_ERR)) & 0xffff) + +/* Control A register constants. */ +#define FZA_CONTROL_A_HB_PARITY_ERR 0x8000 /* host bus parity error */ +#define FZA_CONTROL_A_NXM_ERR 0x4000 /* adapter non-existent memory + * reference + */ +#define FZA_CONTROL_A_SMT_RX_OVFL 0x0040 /* SMT receive overflow */ +#define FZA_CONTROL_A_FLUSH_DONE 0x0020 /* flush tx request complete */ +#define FZA_CONTROL_A_SHUT 0x0010 /* turn the interface off */ +#define FZA_CONTROL_A_HALT 0x0008 /* halt the controller */ +#define FZA_CONTROL_A_CMD_POLL 0x0004 /* command ring poll */ +#define FZA_CONTROL_A_SMT_RX_POLL 0x0002 /* SMT receive ring poll */ +#define FZA_CONTROL_A_TX_POLL 0x0001 /* transmit poll */ + +/* Control B register constants. All bits are r/w. 
+ * + * Possible values: + * 0x0000 after booting into REX, + * 0x0003 after issuing `boot #/mop'. + */ +#define FZA_CONTROL_B_CONSOLE 0x0002 /* OR with DRIVER for console + * (TC firmware) mode + */ +#define FZA_CONTROL_B_DRIVER 0x0001 /* driver mode */ +#define FZA_CONTROL_B_IDLE 0x0000 /* no driver installed */ + +#define FZA_RESET_PAD \ + (FZA_REG_RESET - FZA_REG_BASE) +#define FZA_INT_EVENT_PAD \ + (FZA_REG_INT_EVENT - FZA_REG_RESET - sizeof(u16)) +#define FZA_CONTROL_A_PAD \ + (FZA_REG_CONTROL_A - FZA_REG_INT_MASK - sizeof(u16)) + +/* Layout of registers. */ +struct fza_regs { + u8 pad0[FZA_RESET_PAD]; + u16 reset; /* reset register */ + u8 pad1[FZA_INT_EVENT_PAD]; + u16 int_event; /* interrupt event register */ + u16 status; /* status register */ + u16 int_mask; /* interrupt mask register */ + u8 pad2[FZA_CONTROL_A_PAD]; + u16 control_a; /* control A register */ + u16 control_b; /* control B register */ +}; + +/* Command descriptor ring entry. */ +struct fza_ring_cmd { + u32 cmd_own; /* bit 31: ownership, bits [30:0]: command */ + u32 stat; /* command status */ + u32 buffer; /* address of the buffer in the FZA space */ + u32 pad0; +}; + +#define FZA_RING_CMD 0x200400 /* command ring address */ +#define FZA_RING_CMD_SIZE 0x40 /* command descriptor ring + * size + */ +/* Command constants. */ +#define FZA_RING_CMD_MASK 0x7fffffff +#define FZA_RING_CMD_NOP 0x00000000 /* nop */ +#define FZA_RING_CMD_INIT 0x00000001 /* initialize */ +#define FZA_RING_CMD_MODCAM 0x00000002 /* modify CAM */ +#define FZA_RING_CMD_PARAM 0x00000003 /* set system parameters */ +#define FZA_RING_CMD_MODPROM 0x00000004 /* modify promiscuous mode */ +#define FZA_RING_CMD_SETCHAR 0x00000005 /* set link characteristics */ +#define FZA_RING_CMD_RDCNTR 0x00000006 /* read counters */ +#define FZA_RING_CMD_STATUS 0x00000007 /* get link status */ +#define FZA_RING_CMD_RDCAM 0x00000008 /* read CAM */ + +/* Command status constants.
*/ +#define FZA_RING_STAT_SUCCESS 0x00000000 + +/* Unsolicited event descriptor ring entry. */ +struct fza_ring_uns { + u32 own; /* bit 31: ownership, bits [30:0]: reserved */ + u32 id; /* event ID */ + u32 buffer; /* address of the buffer in the FZA space */ + u32 pad0; /* reserved */ +}; + +#define FZA_RING_UNS 0x200800 /* unsolicited ring address */ +#define FZA_RING_UNS_SIZE 0x40 /* unsolicited descriptor ring + * size + */ +/* Unsolicited event constants. */ +#define FZA_RING_UNS_UND 0x00000000 /* undefined event ID */ +#define FZA_RING_UNS_INIT_IN 0x00000001 /* ring init initiated */ +#define FZA_RING_UNS_INIT_RX 0x00000002 /* ring init received */ +#define FZA_RING_UNS_BEAC_IN 0x00000003 /* ring beaconing initiated */ +#define FZA_RING_UNS_DUP_ADDR 0x00000004 /* duplicate address detected */ +#define FZA_RING_UNS_DUP_TOK 0x00000005 /* duplicate token detected */ +#define FZA_RING_UNS_PURG_ERR 0x00000006 /* ring purger error */ +#define FZA_RING_UNS_STRIP_ERR 0x00000007 /* bridge strip error */ +#define FZA_RING_UNS_OP_OSC 0x00000008 /* ring op oscillation */ +#define FZA_RING_UNS_BEAC_RX 0x00000009 /* directed beacon received */ +#define FZA_RING_UNS_PCT_IN 0x0000000a /* PC trace initiated */ +#define FZA_RING_UNS_PCT_RX 0x0000000b /* PC trace received */ +#define FZA_RING_UNS_TX_UNDER 0x0000000c /* transmit underrun */ +#define FZA_RING_UNS_TX_FAIL 0x0000000d /* transmit failure */ +#define FZA_RING_UNS_RX_OVER 0x0000000e /* receive overrun */ + +/* RMC (Ring Memory Control) transmit descriptor ring entry. */ +struct fza_ring_rmc_tx { + u32 rmc; /* RMC information */ + u32 avl; /* available for host (unused by RMC) */ + u32 own; /* bit 31: ownership, bits [30:0]: reserved */ + u32 pad0; /* reserved */ +}; + +#define FZA_TX_BUFFER_ADDR(x) (0x200000 | (((x) & 0xffff) << 5)) +#define FZA_TX_BUFFER_SIZE 512 +struct fza_buffer_tx { + u32 data[FZA_TX_BUFFER_SIZE / sizeof(u32)]; +}; + +/* Transmit ring RMC constants. 
*/ +#define FZA_RING_TX_SOP 0x80000000 /* start of packet */ +#define FZA_RING_TX_EOP 0x40000000 /* end of packet */ +#define FZA_RING_TX_DTP 0x20000000 /* discard this packet */ +#define FZA_RING_TX_VBC 0x10000000 /* valid buffer byte count */ +#define FZA_RING_TX_DCC_MASK 0x0f000000 /* DMA completion code */ +#define FZA_RING_TX_DCC_SUCCESS 0x01000000 /* transmit succeeded */ +#define FZA_RING_TX_DCC_DTP_SOP 0x02000000 /* DTP set at SOP */ +#define FZA_RING_TX_DCC_DTP 0x04000000 /* DTP set within packet */ +#define FZA_RING_TX_DCC_ABORT 0x05000000 /* MAC-requested abort */ +#define FZA_RING_TX_DCC_PARITY 0x06000000 /* xmit data parity error */ +#define FZA_RING_TX_DCC_UNDRRUN 0x07000000 /* transmit underrun */ +#define FZA_RING_TX_XPO_MASK 0x003fe000 /* transmit packet offset */ + +/* Host receive descriptor ring entry. */ +struct fza_ring_hst_rx { + u32 buf0_own; /* bit 31: ownership, bits [30:23]: unused, + * bits [22:0]: right-shifted address of the + * buffer in system memory (low buffer) + */ + u32 buffer1; /* bits [31:23]: unused, + * bits [22:0]: right-shifted address of the + * buffer in system memory (high buffer) + */ + u32 rmc; /* RMC information */ + u32 pad0; +}; + +#define FZA_RX_BUFFER_SIZE (4096 + 512) /* buffer length */ + +/* Receive ring RMC constants. 
*/ +#define FZA_RING_RX_SOP 0x80000000 /* start of packet */ +#define FZA_RING_RX_EOP 0x40000000 /* end of packet */ +#define FZA_RING_RX_FSC_MASK 0x38000000 /* # of frame status bits */ +#define FZA_RING_RX_FSB_MASK 0x07c00000 /* frame status bits */ +#define FZA_RING_RX_FSB_ERR 0x04000000 /* error detected */ +#define FZA_RING_RX_FSB_ADDR 0x02000000 /* address recognized */ +#define FZA_RING_RX_FSB_COP 0x01000000 /* frame copied */ +#define FZA_RING_RX_FSB_F0 0x00800000 /* first additional flag */ +#define FZA_RING_RX_FSB_F1 0x00400000 /* second additional flag */ +#define FZA_RING_RX_BAD 0x00200000 /* bad packet */ +#define FZA_RING_RX_CRC 0x00100000 /* CRC error */ +#define FZA_RING_RX_RRR_MASK 0x000e0000 /* MAC receive status bits */ +#define FZA_RING_RX_RRR_OK 0x00000000 /* receive OK */ +#define FZA_RING_RX_RRR_SADDR 0x00020000 /* source address matched */ +#define FZA_RING_RX_RRR_DADDR 0x00040000 /* dest address not matched */ +#define FZA_RING_RX_RRR_ABORT 0x00060000 /* RMC abort */ +#define FZA_RING_RX_RRR_LENGTH 0x00080000 /* invalid length */ +#define FZA_RING_RX_RRR_FRAG 0x000a0000 /* fragment */ +#define FZA_RING_RX_RRR_FORMAT 0x000c0000 /* format error */ +#define FZA_RING_RX_RRR_RESET 0x000e0000 /* MAC reset */ +#define FZA_RING_RX_DA_MASK 0x00018000 /* daddr match status bits */ +#define FZA_RING_RX_DA_NONE 0x00000000 /* no match */ +#define FZA_RING_RX_DA_PROM 0x00008000 /* promiscuous match */ +#define FZA_RING_RX_DA_CAM 0x00010000 /* CAM entry match */ +#define FZA_RING_RX_DA_LOCAL 0x00018000 /* link addr or LLC bcast */ +#define FZA_RING_RX_SA_MASK 0x00006000 /* saddr match status bits */ +#define FZA_RING_RX_SA_NONE 0x00000000 /* no match */ +#define FZA_RING_RX_SA_ALIAS 0x00002000 /* alias address match */ +#define FZA_RING_RX_SA_CAM 0x00004000 /* CAM entry match */ +#define FZA_RING_RX_SA_LOCAL 0x00006000 /* link address match */ + +/* SMT (Station Management) transmit/receive descriptor ring entry. 
*/ +struct fza_ring_smt { + u32 own; /* bit 31: ownership, bits [30:0]: unused */ + u32 rmc; /* RMC information */ + u32 buffer; /* address of the buffer */ + u32 pad0; /* reserved */ +}; + +/* Ownership constants. + * + * Only an owner is permitted to process a given ring entry. + * RMC transmit ring meanings are reversed. + */ +#define FZA_RING_OWN_MASK 0x80000000 +#define FZA_RING_OWN_FZA 0x00000000 /* permit FZA, forbid host */ +#define FZA_RING_OWN_HOST 0x80000000 /* permit host, forbid FZA */ +#define FZA_RING_TX_OWN_RMC 0x80000000 /* permit RMC, forbid host */ +#define FZA_RING_TX_OWN_HOST 0x00000000 /* permit host, forbid RMC */ + +/* RMC constants. */ +#define FZA_RING_PBC_MASK 0x00001fff /* frame length */ + +/* Layout of counter buffers. */ + +struct fza_counter { + u32 msw; + u32 lsw; +}; + +struct fza_counters { + struct fza_counter sys_buf; /* system buffer unavailable */ + struct fza_counter tx_under; /* transmit underruns */ + struct fza_counter tx_fail; /* transmit failures */ + struct fza_counter rx_over; /* receive data overruns */ + struct fza_counter frame_cnt; /* frame count */ + struct fza_counter error_cnt; /* error count */ + struct fza_counter lost_cnt; /* lost count */ + struct fza_counter rinit_in; /* ring initialization initiated */ + struct fza_counter rinit_rx; /* ring initialization received */ + struct fza_counter beac_in; /* ring beacon initiated */ + struct fza_counter dup_addr; /* duplicate address test failures */ + struct fza_counter dup_tok; /* duplicate token detected */ + struct fza_counter purg_err; /* ring purge errors */ + struct fza_counter strip_err; /* bridge strip errors */ + struct fza_counter pct_in; /* traces initiated */ + struct fza_counter pct_rx; /* traces received */ + struct fza_counter lem_rej; /* LEM rejects */ + struct fza_counter tne_rej; /* TNE expiry rejects */ + struct fza_counter lem_event; /* LEM events */ + struct fza_counter lct_rej; /* LCT rejects */ + struct fza_counter conn_cmpl; /* connections 
completed */ + struct fza_counter el_buf; /* elasticity buffer errors */ +}; + +/* Layout of command buffers. */ + +/* INIT command buffer. + * + * Values of default link parameters given are as obtained from a + * DEFZA-AA rev. C03 board. The board counts time in units of 80ns. + */ +struct fza_cmd_init { + u32 tx_mode; /* transmit mode */ + u32 hst_rx_size; /* host receive ring entries */ + + struct fza_counters counters; /* counters */ + + u8 rmc_rev[4]; /* RMC revision */ + u8 rom_rev[4]; /* ROM revision */ + u8 fw_rev[4]; /* firmware revision */ + + u32 mop_type; /* MOP device type */ + + u32 hst_rx; /* base of host rx descriptor ring */ + u32 rmc_tx; /* base of RMC tx descriptor ring */ + u32 rmc_tx_size; /* size of RMC tx descriptor ring */ + u32 smt_tx; /* base of SMT tx descriptor ring */ + u32 smt_tx_size; /* size of SMT tx descriptor ring */ + u32 smt_rx; /* base of SMT rx descriptor ring */ + u32 smt_rx_size; /* size of SMT rx descriptor ring */ + + u32 hw_addr[2]; /* link address */ + + u32 def_t_req; /* default Requested TTRT (T_REQ) -- + * C03: 100000 [80ns] + */ + u32 def_tvx; /* default Valid Transmission Time + * (TVX) -- C03: 32768 [80ns] + */ + u32 def_t_max; /* default Maximum TTRT (T_MAX) -- + * C03: 2162688 [80ns] + */ + u32 lem_threshold; /* default LEM threshold -- C03: 8 */ + u32 def_station_id[2]; /* default station ID */ + + u32 pmd_type_alt; /* alternative PMD type code */ + + u32 smt_ver; /* SMT version */ + + u32 rtoken_timeout; /* default restricted token timeout + * -- C03: 12500000 [80ns] + */ + u32 ring_purger; /* default ring purger enable -- + * C03: 1 + */ + + u32 smt_ver_max; /* max SMT version ID */ + u32 smt_ver_min; /* min SMT version ID */ + u32 pmd_type; /* PMD type code */ +}; + +/* INIT command PMD type codes. */ +#define FZA_PMD_TYPE_MMF 0 /* Multimode fiber */ +#define FZA_PMD_TYPE_TW 101 /* ThinWire */ +#define FZA_PMD_TYPE_STP 102 /* STP */ + +/* MODCAM/RDCAM command buffer. 
*/ +#define FZA_CMD_CAM_SIZE 64 /* CAM address entry count */ +struct fza_cmd_cam { + u32 hw_addr[FZA_CMD_CAM_SIZE][2]; /* CAM address entries */ +}; + +/* PARAM command buffer. + * + * Permitted ranges given are as defined by the spec and obtained from a + * DEFZA-AA rev. C03 board, respectively. The rtoken_timeout field is + * erroneously interpreted in units of ms. + */ +struct fza_cmd_param { + u32 loop_mode; /* loopback mode */ + u32 t_max; /* Maximum TTRT (T_MAX) + * def: ??? [80ns] + * C03: [t_req+1,4294967295] [80ns] + */ + u32 t_req; /* Requested TTRT (T_REQ) + * def: [50000,2097151] [80ns] + * C03: [50001,t_max-1] [80ns] + */ + u32 tvx; /* Valid Transmission Time (TVX) + * def: [29375,65280] [80ns] + * C03: [29376,65279] [80ns] + */ + u32 lem_threshold; /* LEM threshold */ + u32 station_id[2]; /* station ID */ + u32 rtoken_timeout; /* restricted token timeout + * def: [0,125000000] [80ns] + * C03: [0,9999] [ms] + */ + u32 ring_purger; /* ring purger enable: 0|1 */ +}; + +/* Loopback modes for the PARAM command. */ +#define FZA_LOOP_NORMAL 0 +#define FZA_LOOP_INTERN 1 +#define FZA_LOOP_EXTERN 2 + +/* MODPROM command buffer. */ +struct fza_cmd_modprom { + u32 llc_prom; /* LLC promiscuous enable */ + u32 smt_prom; /* SMT promiscuous enable */ + u32 llc_multi; /* LLC multicast promiscuous enable */ + u32 llc_bcast; /* LLC broadcast promiscuous enable */ +}; + +/* SETCHAR command buffer. + * + * Permitted ranges are as for the PARAM command. + */ +struct fza_cmd_setchar { + u32 t_max; /* Maximum TTRT (T_MAX) */ + u32 t_req; /* Requested TTRT (T_REQ) */ + u32 tvx; /* Valid Transmission Time (TVX) */ + u32 lem_threshold; /* LEM threshold */ + u32 rtoken_timeout; /* restricted token timeout */ + u32 ring_purger; /* ring purger enable */ +}; + +/* RDCNTR command buffer. */ +struct fza_cmd_rdcntr { + struct fza_counters counters; /* counters */ +}; + +/* STATUS command buffer. 
*/ +struct fza_cmd_status { + u32 led_state; /* LED state */ + u32 rmt_state; /* ring management state */ + u32 link_state; /* link state */ + u32 dup_addr; /* duplicate address flag */ + u32 ring_purger; /* ring purger state */ + u32 t_neg; /* negotiated TTRT [80ns] */ + u32 una[2]; /* upstream neighbour address */ + u32 una_timeout; /* UNA timed out */ + u32 strip_mode; /* frame strip mode */ + u32 yield_mode; /* claim token yield mode */ + u32 phy_state; /* PHY state */ + u32 neigh_phy; /* neighbour PHY type */ + u32 reject; /* reject reason */ + u32 phy_lee; /* PHY link error estimate [-log10] */ + u32 una_old[2]; /* old upstream neighbour address */ + u32 rmt_mac; /* remote MAC indicated */ + u32 ring_err; /* ring error reason */ + u32 beac_rx[2]; /* sender of last directed beacon */ + u32 un_dup_addr; /* upstream neighbr dup address flag */ + u32 dna[2]; /* downstream neighbour address */ + u32 dna_old[2]; /* old downstream neighbour address */ +}; + +/* Common command buffer. */ +union fza_cmd_buf { + struct fza_cmd_init init; + struct fza_cmd_cam cam; + struct fza_cmd_param param; + struct fza_cmd_modprom modprom; + struct fza_cmd_setchar setchar; + struct fza_cmd_rdcntr rdcntr; + struct fza_cmd_status status; +}; + +/* MAC (Media Access Controller) chip packet request header constants. */ + +/* Packet request header byte #0. 
*/ +#define FZA_PRH0_FMT_TYPE_MASK 0xc0 /* type of packet, always zero */ +#define FZA_PRH0_TOK_TYPE_MASK 0x30 /* type of token required + * to send this frame + */ +#define FZA_PRH0_TKN_TYPE_ANY 0x30 /* use either token type */ +#define FZA_PRH0_TKN_TYPE_UNR 0x20 /* use an unrestricted token */ +#define FZA_PRH0_TKN_TYPE_RST 0x10 /* use a restricted token */ +#define FZA_PRH0_TKN_TYPE_IMM 0x00 /* send immediately, no token required + */ +#define FZA_PRH0_FRAME_MASK 0x08 /* type of frame to send */ +#define FZA_PRH0_FRAME_SYNC 0x08 /* send a synchronous frame */ +#define FZA_PRH0_FRAME_ASYNC 0x00 /* send an asynchronous frame */ +#define FZA_PRH0_MODE_MASK 0x04 /* send mode */ +#define FZA_PRH0_MODE_IMMED 0x04 /* an immediate mode, send regardless + * of the ring operational state + */ +#define FZA_PRH0_MODE_NORMAL 0x00 /* a normal mode, send only if ring + * operational + */ +#define FZA_PRH0_SF_MASK 0x02 /* send frame first */ +#define FZA_PRH0_SF_FIRST 0x02 /* send this frame first + * with this token capture + */ +#define FZA_PRH0_SF_NORMAL 0x00 /* treat this frame normally */ +#define FZA_PRH0_BCN_MASK 0x01 /* beacon frame */ +#define FZA_PRH0_BCN_BEACON 0x01 /* send the frame only + * if in the beacon state + */ +#define FZA_PRH0_BCN_DATA 0x00 /* send the frame only + * if in the data state + */ +/* Packet request header byte #1.
*/ + /* bit 7 always zero */ +#define FZA_PRH1_SL_MASK 0x40 /* send frame last */ +#define FZA_PRH1_SL_LAST 0x40 /* send this frame last, releasing + * the token afterwards + */ +#define FZA_PRH1_SL_NORMAL 0x00 /* treat this frame normally */ +#define FZA_PRH1_CRC_MASK 0x20 /* CRC append */ +#define FZA_PRH1_CRC_NORMAL 0x20 /* calculate the CRC and append it + * as the FCS field to the frame + */ +#define FZA_PRH1_CRC_SKIP 0x00 /* leave the frame as is */ +#define FZA_PRH1_TKN_SEND_MASK 0x18 /* type of token to send after the + * frame if this is the last frame + */ +#define FZA_PRH1_TKN_SEND_ORIG 0x18 /* send a token of the same type as the + * originally captured one + */ +#define FZA_PRH1_TKN_SEND_RST 0x10 /* send a restricted token */ +#define FZA_PRH1_TKN_SEND_UNR 0x08 /* send an unrestricted token */ +#define FZA_PRH1_TKN_SEND_NONE 0x00 /* send no token */ +#define FZA_PRH1_EXTRA_FS_MASK 0x07 /* send extra frame status indicators + */ +#define FZA_PRH1_EXTRA_FS_ST 0x07 /* TR RR ST II */ +#define FZA_PRH1_EXTRA_FS_SS 0x06 /* TR RR SS II */ +#define FZA_PRH1_EXTRA_FS_SR 0x05 /* TR RR SR II */ +#define FZA_PRH1_EXTRA_FS_NONE1 0x04 /* TR RR II II */ +#define FZA_PRH1_EXTRA_FS_RT 0x03 /* TR RR RT II */ +#define FZA_PRH1_EXTRA_FS_RS 0x02 /* TR RR RS II */ +#define FZA_PRH1_EXTRA_FS_RR 0x01 /* TR RR RR II */ +#define FZA_PRH1_EXTRA_FS_NONE 0x00 /* TR RR II II */ +/* Packet request header byte #2. */ +#define FZA_PRH2_NORMAL 0x00 /* always zero */ + +/* PRH used for LLC frames. */ +#define FZA_PRH0_LLC (FZA_PRH0_TKN_TYPE_UNR) +#define FZA_PRH1_LLC (FZA_PRH1_CRC_NORMAL | FZA_PRH1_TKN_SEND_UNR) +#define FZA_PRH2_LLC (FZA_PRH2_NORMAL) + +/* PRH used for SMT frames. 
*/ +#define FZA_PRH0_SMT (FZA_PRH0_TKN_TYPE_UNR) +#define FZA_PRH1_SMT (FZA_PRH1_CRC_NORMAL | FZA_PRH1_TKN_SEND_UNR) +#define FZA_PRH2_SMT (FZA_PRH2_NORMAL) + +#if ((FZA_RING_RX_SIZE) < 2) || ((FZA_RING_RX_SIZE) > 256) +# error FZA_RING_RX_SIZE has to be from 2 up to 256 +#endif +#if ((FZA_RING_TX_MODE) != 0) && ((FZA_RING_TX_MODE) != 1) +# error FZA_RING_TX_MODE has to be either 0 or 1 +#endif + +#define FZA_RING_TX_SIZE (512 << (FZA_RING_TX_MODE)) + +struct fza_private { + struct device *bdev; /* pointer to the bus device */ + const char *name; /* printable device name */ + void __iomem *mmio; /* MMIO ioremap cookie */ + struct fza_regs __iomem *regs; /* pointer to FZA registers */ + + struct sk_buff *rx_skbuff[FZA_RING_RX_SIZE]; + /* all skbs assigned to the host + * receive descriptors + */ + dma_addr_t rx_dma[FZA_RING_RX_SIZE]; + /* their corresponding DMA addresses */ + + struct fza_ring_cmd __iomem *ring_cmd; + /* pointer to the command descriptor + * ring + */ + int ring_cmd_index; /* index to the command descriptor ring + * for the next command + */ + struct fza_ring_uns __iomem *ring_uns; + /* pointer to the unsolicited + * descriptor ring + */ + int ring_uns_index; /* index to the unsolicited descriptor + * ring for the next event + */ + + struct fza_ring_rmc_tx __iomem *ring_rmc_tx; + /* pointer to the RMC transmit + * descriptor ring (obtained from the + * INIT command) + */ + int ring_rmc_tx_size; /* number of entries in the RMC + * transmit descriptor ring (obtained + * from the INIT command) + */ + int ring_rmc_tx_index; /* index to the RMC transmit descriptor + * ring for the next transmission + */ + int ring_rmc_txd_index; /* index to the RMC transmit descriptor + * ring for the next transmit done + * acknowledge + */ + + struct fza_ring_hst_rx __iomem *ring_hst_rx; + /* pointer to the host receive + * descriptor ring (obtained from the + * INIT command) + */ + int ring_hst_rx_size; /* number of entries in the host + * receive descriptor ring (set 
by the + * INIT command) + */ + int ring_hst_rx_index; /* index to the host receive descriptor + * ring for the next reception + */ + + struct fza_ring_smt __iomem *ring_smt_tx; + /* pointer to the SMT transmit + * descriptor ring (obtained from the + * INIT command) + */ + int ring_smt_tx_size; /* number of entries in the SMT + * transmit descriptor ring (obtained + * from the INIT command) + */ + int ring_smt_tx_index; /* index to the SMT transmit descriptor + * ring for the next transmission + */ + + struct fza_ring_smt __iomem *ring_smt_rx; + /* pointer to the SMT receive + * descriptor ring (obtained from the + * INIT command) + */ + int ring_smt_rx_size; /* number of entries in the SMT + * receive descriptor ring (obtained + * from the INIT command) + */ + int ring_smt_rx_index; /* index to the SMT receive descriptor + * ring for the next reception + */ + + struct fza_buffer_tx __iomem *buffer_tx; + /* pointer to the RMC transmit buffers + */ + + uint state; /* adapter expected state */ + + spinlock_t lock; /* for device & private data access */ + uint int_mask; /* interrupt source selector */ + + int cmd_done_flag; /* command completion trigger */ + wait_queue_head_t cmd_done_wait; + + int state_chg_flag; /* state change trigger */ + wait_queue_head_t state_chg_wait; + + struct timer_list reset_timer; /* RESET time-out trigger */ + int timer_state; /* RESET trigger state */ + + int queue_active; /* whether to enable queueing */ + + struct net_device_stats stats; + + uint irq_count_flush_tx; /* transmit flush irqs */ + uint irq_count_uns_poll; /* unsolicited event irqs */ + uint irq_count_smt_tx_poll; /* SMT transmit irqs */ + uint irq_count_rx_poll; /* host receive irqs */ + uint irq_count_tx_done; /* transmit done irqs */ + uint irq_count_cmd_done; /* command done irqs */ + uint irq_count_state_chg; /* state change irqs */ + uint irq_count_link_st_chg; /* link status change irqs */ + + uint t_max; /* T_MAX */ + uint t_req; /* T_REQ */ + uint tvx; /*
TVX */ + uint lem_threshold; /* LEM threshold */ + uint station_id[2]; /* station ID */ + uint rtoken_timeout; /* restricted token timeout */ + uint ring_purger; /* ring purger enable flag */ +}; + +struct fza_fddihdr { + u8 pa[2]; /* preamble */ + u8 sd; /* starting delimiter */ + struct fddihdr hdr; +} __packed; diff --git a/include/uapi/linux/if_fddi.h b/include/uapi/linux/if_fddi.h index 75eed8b62823..7239aa9c0766 100644 --- a/include/uapi/linux/if_fddi.h +++ b/include/uapi/linux/if_fddi.h @@ -6,9 +6,10 @@ * * Global definitions for the ANSI FDDI interface. * - * Version: @(#)if_fddi.h 1.0.2 Sep 29 2004 + * Version: @(#)if_fddi.h 1.0.3 Oct 6 2018 * - * Author: Lawrence V. Stefani, + * Author: Lawrence V. Stefani, + * Maintainer: Maciej W. Rozycki, * * if_fddi.h is based on previous if_ether.h and if_tr.h work by * Fred N. van Kempen, @@ -45,7 +46,21 @@ #define FDDI_K_OUI_LEN 3 /* Octets in OUI in 802.2 SNAP header */ -/* Define FDDI Frame Control (FC) Byte values */ +/* Define FDDI Frame Control (FC) Byte masks */ +#define FDDI_FC_K_CLASS_MASK 0x80 /* class bit */ +#define FDDI_FC_K_CLASS_SYNC 0x80 +#define FDDI_FC_K_CLASS_ASYNC 0x00 +#define FDDI_FC_K_ALEN_MASK 0x40 /* address length bit */ +#define FDDI_FC_K_ALEN_48 0x40 +#define FDDI_FC_K_ALEN_16 0x00 +#define FDDI_FC_K_FORMAT_MASK 0x30 /* format bits */ +#define FDDI_FC_K_FORMAT_FUTURE 0x30 +#define FDDI_FC_K_FORMAT_IMPLEMENTOR 0x20 +#define FDDI_FC_K_FORMAT_LLC 0x10 +#define FDDI_FC_K_FORMAT_MANAGEMENT 0x00 +#define FDDI_FC_K_CONTROL_MASK 0x0f /* control bits */ + +/* Define FDDI Frame Control (FC) Byte specific values */ #define FDDI_FC_K_VOID 0x00 #define FDDI_FC_K_NON_RESTRICTED_TOKEN 0x80 #define FDDI_FC_K_RESTRICTED_TOKEN 0xC0 -- cgit v1.2.3 From 9771b8ccdfa6dcb1ac5128ca7fe8649f3092d392 Mon Sep 17 00:00:00 2001 From: "Justin.Lee1@Dell.com" Date: Thu, 11 Oct 2018 18:07:37 +0000 Subject: net/ncsi: Extend NC-SI Netlink interface to allow user space to send NC-SI command The new command 
(NCSI_CMD_SEND_CMD) is added to allow user space application to send NC-SI command to the network card. Also, add a new attribute (NCSI_ATTR_DATA) for transferring request and response. The work flow is as below. Request: User space application -> Netlink interface (msg) -> new Netlink handler - ncsi_send_cmd_nl() -> ncsi_xmit_cmd() Response: Response received - ncsi_rcv_rsp() -> internal response handler - ncsi_rsp_handler_xxx() -> ncsi_rsp_handler_netlink() -> ncsi_send_netlink_rsp () -> Netlink interface (msg) -> user space application Command timeout - ncsi_request_timeout() -> ncsi_send_netlink_timeout () -> Netlink interface (msg with zero data length) -> user space application Error: Error detected -> ncsi_send_netlink_err () -> Netlink interface (err msg) -> user space application Signed-off-by: Justin Lee Reviewed-by: Samuel Mendoza-Jonas Signed-off-by: David S. Miller --- include/uapi/linux/ncsi.h | 6 ++ net/ncsi/internal.h | 7 ++ net/ncsi/ncsi-cmd.c | 8 ++ net/ncsi/ncsi-manage.c | 16 ++++ net/ncsi/ncsi-netlink.c | 204 ++++++++++++++++++++++++++++++++++++++++++++++ net/ncsi/ncsi-netlink.h | 12 +++ net/ncsi/ncsi-rsp.c | 67 +++++++++++++-- 7 files changed, 315 insertions(+), 5 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/ncsi.h b/include/uapi/linux/ncsi.h index 4c292ecbb748..0a26a5576645 100644 --- a/include/uapi/linux/ncsi.h +++ b/include/uapi/linux/ncsi.h @@ -23,6 +23,9 @@ * optionally the preferred NCSI_ATTR_CHANNEL_ID. * @NCSI_CMD_CLEAR_INTERFACE: clear any preferred package/channel combination. * Requires NCSI_ATTR_IFINDEX. + * @NCSI_CMD_SEND_CMD: send NC-SI command to network card. + * Requires NCSI_ATTR_IFINDEX, NCSI_ATTR_PACKAGE_ID + * and NCSI_ATTR_CHANNEL_ID. 
* @NCSI_CMD_MAX: highest command number */ enum ncsi_nl_commands { @@ -30,6 +33,7 @@ enum ncsi_nl_commands { NCSI_CMD_PKG_INFO, NCSI_CMD_SET_INTERFACE, NCSI_CMD_CLEAR_INTERFACE, + NCSI_CMD_SEND_CMD, __NCSI_CMD_AFTER_LAST, NCSI_CMD_MAX = __NCSI_CMD_AFTER_LAST - 1 @@ -43,6 +47,7 @@ enum ncsi_nl_commands { * @NCSI_ATTR_PACKAGE_LIST: nested array of NCSI_PKG_ATTR attributes * @NCSI_ATTR_PACKAGE_ID: package ID * @NCSI_ATTR_CHANNEL_ID: channel ID + * @NCSI_ATTR_DATA: command payload * @NCSI_ATTR_MAX: highest attribute number */ enum ncsi_nl_attrs { @@ -51,6 +56,7 @@ enum ncsi_nl_attrs { NCSI_ATTR_PACKAGE_LIST, NCSI_ATTR_PACKAGE_ID, NCSI_ATTR_CHANNEL_ID, + NCSI_ATTR_DATA, __NCSI_ATTR_AFTER_LAST, NCSI_ATTR_MAX = __NCSI_ATTR_AFTER_LAST - 1 diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 3d0a33b874f5..13c9b5eeb3b7 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -175,6 +175,8 @@ struct ncsi_package; #define NCSI_RESERVED_CHANNEL 0x1f #define NCSI_CHANNEL_INDEX(c) ((c) & ((1 << NCSI_PACKAGE_SHIFT) - 1)) #define NCSI_TO_CHANNEL(p, c) (((p) << NCSI_PACKAGE_SHIFT) | (c)) +#define NCSI_MAX_PACKAGE 8 +#define NCSI_MAX_CHANNEL 32 struct ncsi_channel { unsigned char id; @@ -220,11 +222,15 @@ struct ncsi_request { bool used; /* Request that has been assigned */ unsigned int flags; /* NCSI request property */ #define NCSI_REQ_FLAG_EVENT_DRIVEN 1 +#define NCSI_REQ_FLAG_NETLINK_DRIVEN 2 struct ncsi_dev_priv *ndp; /* Associated NCSI device */ struct sk_buff *cmd; /* Associated NCSI command packet */ struct sk_buff *rsp; /* Associated NCSI response packet */ struct timer_list timer; /* Timer on waiting for response */ bool enabled; /* Time has been enabled or not */ + u32 snd_seq; /* netlink sending sequence number */ + u32 snd_portid; /* netlink portid of sender */ + struct nlmsghdr nlhdr; /* netlink message header */ }; enum { @@ -310,6 +316,7 @@ struct ncsi_cmd_arg { unsigned int dwords[4]; }; unsigned char *data; /* NCSI OEM data */ + struct genl_info *info; 
/* Netlink information */ }; extern struct list_head ncsi_dev_list; diff --git a/net/ncsi/ncsi-cmd.c b/net/ncsi/ncsi-cmd.c index 82b7d9201db8..356af474e43c 100644 --- a/net/ncsi/ncsi-cmd.c +++ b/net/ncsi/ncsi-cmd.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "internal.h" #include "ncsi-pkt.h" @@ -346,6 +347,13 @@ int ncsi_xmit_cmd(struct ncsi_cmd_arg *nca) if (!nr) return -ENOMEM; + /* track netlink information */ + if (nca->req_flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) { + nr->snd_seq = nca->info->snd_seq; + nr->snd_portid = nca->info->snd_portid; + nr->nlhdr = *nca->info->nlhdr; + } + /* Prepare the packet */ nca->id = nr->id; ret = nch->handler(nr->cmd, nca); diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 091284760d21..6aa0614d2d28 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -19,6 +19,7 @@ #include #include #include +#include #include "internal.h" #include "ncsi-pkt.h" @@ -406,6 +407,9 @@ static void ncsi_request_timeout(struct timer_list *t) { struct ncsi_request *nr = from_timer(nr, t, timer); struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_cmd_pkt *cmd; + struct ncsi_package *np; + struct ncsi_channel *nc; unsigned long flags; /* If the request already had associated response, @@ -419,6 +423,18 @@ static void ncsi_request_timeout(struct timer_list *t) } spin_unlock_irqrestore(&ndp->lock, flags); + if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) { + if (nr->cmd) { + /* Find the package */ + cmd = (struct ncsi_cmd_pkt *) + skb_network_header(nr->cmd); + ncsi_find_package_and_channel(ndp, + cmd->cmd.common.channel, + &np, &nc); + ncsi_send_netlink_timeout(nr, np, nc); + } + } + /* Release the request */ ncsi_free_request(nr); } diff --git a/net/ncsi/ncsi-netlink.c b/net/ncsi/ncsi-netlink.c index 32cb7751d216..33314381b4f5 100644 --- a/net/ncsi/ncsi-netlink.c +++ b/net/ncsi/ncsi-netlink.c @@ -19,6 +19,7 @@ #include #include "internal.h" +#include "ncsi-pkt.h" #include "ncsi-netlink.h" static struct 
genl_family ncsi_genl_family; @@ -28,6 +29,7 @@ static const struct nla_policy ncsi_genl_policy[NCSI_ATTR_MAX + 1] = { [NCSI_ATTR_PACKAGE_LIST] = { .type = NLA_NESTED }, [NCSI_ATTR_PACKAGE_ID] = { .type = NLA_U32 }, [NCSI_ATTR_CHANNEL_ID] = { .type = NLA_U32 }, + [NCSI_ATTR_DATA] = { .type = NLA_BINARY, .len = 2048 }, }; static struct ncsi_dev_priv *ndp_from_ifindex(struct net *net, u32 ifindex) @@ -365,6 +367,202 @@ static int ncsi_clear_interface_nl(struct sk_buff *msg, struct genl_info *info) return 0; } +static int ncsi_send_cmd_nl(struct sk_buff *msg, struct genl_info *info) +{ + struct ncsi_dev_priv *ndp; + struct ncsi_pkt_hdr *hdr; + struct ncsi_cmd_arg nca; + unsigned char *data; + u32 package_id; + u32 channel_id; + int len, ret; + + if (!info || !info->attrs) { + ret = -EINVAL; + goto out; + } + + if (!info->attrs[NCSI_ATTR_IFINDEX]) { + ret = -EINVAL; + goto out; + } + + if (!info->attrs[NCSI_ATTR_PACKAGE_ID]) { + ret = -EINVAL; + goto out; + } + + if (!info->attrs[NCSI_ATTR_CHANNEL_ID]) { + ret = -EINVAL; + goto out; + } + + if (!info->attrs[NCSI_ATTR_DATA]) { + ret = -EINVAL; + goto out; + } + + ndp = ndp_from_ifindex(get_net(sock_net(msg->sk)), + nla_get_u32(info->attrs[NCSI_ATTR_IFINDEX])); + if (!ndp) { + ret = -ENODEV; + goto out; + } + + package_id = nla_get_u32(info->attrs[NCSI_ATTR_PACKAGE_ID]); + channel_id = nla_get_u32(info->attrs[NCSI_ATTR_CHANNEL_ID]); + + if (package_id >= NCSI_MAX_PACKAGE || channel_id >= NCSI_MAX_CHANNEL) { + ret = -ERANGE; + goto out_netlink; + } + + len = nla_len(info->attrs[NCSI_ATTR_DATA]); + if (len < sizeof(struct ncsi_pkt_hdr)) { + netdev_info(ndp->ndev.dev, "NCSI: no command to send %u\n", + package_id); + ret = -EINVAL; + goto out_netlink; + } else { + data = (unsigned char *)nla_data(info->attrs[NCSI_ATTR_DATA]); + } + + hdr = (struct ncsi_pkt_hdr *)data; + + nca.ndp = ndp; + nca.package = (unsigned char)package_id; + nca.channel = (unsigned char)channel_id; + nca.type = hdr->type; + nca.req_flags = 
NCSI_REQ_FLAG_NETLINK_DRIVEN; + nca.info = info; + nca.payload = ntohs(hdr->length); + nca.data = data + sizeof(*hdr); + + ret = ncsi_xmit_cmd(&nca); +out_netlink: + if (ret != 0) { + netdev_err(ndp->ndev.dev, + "NCSI: Error %d sending command\n", + ret); + ncsi_send_netlink_err(ndp->ndev.dev, + info->snd_seq, + info->snd_portid, + info->nlhdr, + ret); + } +out: + return ret; +} + +int ncsi_send_netlink_rsp(struct ncsi_request *nr, + struct ncsi_package *np, + struct ncsi_channel *nc) +{ + struct sk_buff *skb; + struct net *net; + void *hdr; + int rc; + + net = dev_net(nr->rsp->dev); + + skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq, + &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD); + if (!hdr) { + kfree_skb(skb); + return -EMSGSIZE; + } + + nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->rsp->dev->ifindex); + if (np) + nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id); + if (nc) + nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id); + else + nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL); + + rc = nla_put(skb, NCSI_ATTR_DATA, nr->rsp->len, (void *)nr->rsp->data); + if (rc) + goto err; + + genlmsg_end(skb, hdr); + return genlmsg_unicast(net, skb, nr->snd_portid); + +err: + kfree_skb(skb); + return rc; +} + +int ncsi_send_netlink_timeout(struct ncsi_request *nr, + struct ncsi_package *np, + struct ncsi_channel *nc) +{ + struct sk_buff *skb; + struct net *net; + void *hdr; + + skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_put(skb, nr->snd_portid, nr->snd_seq, + &ncsi_genl_family, 0, NCSI_CMD_SEND_CMD); + if (!hdr) { + kfree_skb(skb); + return -EMSGSIZE; + } + + net = dev_net(nr->cmd->dev); + + nla_put_u32(skb, NCSI_ATTR_IFINDEX, nr->cmd->dev->ifindex); + + if (np) + nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, np->id); + else + nla_put_u32(skb, NCSI_ATTR_PACKAGE_ID, + NCSI_PACKAGE_INDEX((((struct ncsi_pkt_hdr *) + 
nr->cmd->data)->channel))); + + if (nc) + nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, nc->id); + else + nla_put_u32(skb, NCSI_ATTR_CHANNEL_ID, NCSI_RESERVED_CHANNEL); + + genlmsg_end(skb, hdr); + return genlmsg_unicast(net, skb, nr->snd_portid); +} + +int ncsi_send_netlink_err(struct net_device *dev, + u32 snd_seq, + u32 snd_portid, + struct nlmsghdr *nlhdr, + int err) +{ + struct nlmsghdr *nlh; + struct nlmsgerr *nle; + struct sk_buff *skb; + struct net *net; + + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + + net = dev_net(dev); + + nlh = nlmsg_put(skb, snd_portid, snd_seq, + NLMSG_ERROR, sizeof(*nle), 0); + nle = (struct nlmsgerr *)nlmsg_data(nlh); + nle->error = err; + memcpy(&nle->msg, nlhdr, sizeof(*nlh)); + + nlmsg_end(skb, nlh); + + return nlmsg_unicast(net->genl_sock, skb, snd_portid); +} + static const struct genl_ops ncsi_ops[] = { { .cmd = NCSI_CMD_PKG_INFO, @@ -385,6 +583,12 @@ static const struct genl_ops ncsi_ops[] = { .doit = ncsi_clear_interface_nl, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NCSI_CMD_SEND_CMD, + .policy = ncsi_genl_policy, + .doit = ncsi_send_cmd_nl, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_family ncsi_genl_family __ro_after_init = { diff --git a/net/ncsi/ncsi-netlink.h b/net/ncsi/ncsi-netlink.h index 91a5c256f8c4..c4a46887a932 100644 --- a/net/ncsi/ncsi-netlink.h +++ b/net/ncsi/ncsi-netlink.h @@ -14,6 +14,18 @@ #include "internal.h" +int ncsi_send_netlink_rsp(struct ncsi_request *nr, + struct ncsi_package *np, + struct ncsi_channel *nc); +int ncsi_send_netlink_timeout(struct ncsi_request *nr, + struct ncsi_package *np, + struct ncsi_channel *nc); +int ncsi_send_netlink_err(struct net_device *dev, + u32 snd_seq, + u32 snd_portid, + struct nlmsghdr *nlhdr, + int err); + int ncsi_init_netlink(struct net_device *dev); int ncsi_unregister_netlink(struct net_device *dev); diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index d66b34749027..85fa59afae34 100644 --- a/net/ncsi/ncsi-rsp.c +++ 
b/net/ncsi/ncsi-rsp.c @@ -16,9 +16,11 @@ #include #include #include +#include #include "internal.h" #include "ncsi-pkt.h" +#include "ncsi-netlink.h" static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, unsigned short payload) @@ -32,15 +34,25 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, * before calling this function. */ h = (struct ncsi_rsp_pkt_hdr *)skb_network_header(nr->rsp); - if (h->common.revision != NCSI_PKT_REVISION) + + if (h->common.revision != NCSI_PKT_REVISION) { + netdev_dbg(nr->ndp->ndev.dev, + "NCSI: unsupported header revision\n"); return -EINVAL; - if (ntohs(h->common.length) != payload) + } + if (ntohs(h->common.length) != payload) { + netdev_dbg(nr->ndp->ndev.dev, + "NCSI: payload length mismatched\n"); return -EINVAL; + } /* Check on code and reason */ if (ntohs(h->code) != NCSI_PKT_RSP_C_COMPLETED || - ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) - return -EINVAL; + ntohs(h->reason) != NCSI_PKT_RSP_R_NO_ERROR) { + netdev_dbg(nr->ndp->ndev.dev, + "NCSI: non zero response/reason code\n"); + return -EPERM; + } /* Validate checksum, which might be zeroes if the * sender doesn't support checksum according to NCSI @@ -52,8 +64,11 @@ static int ncsi_validate_rsp_pkt(struct ncsi_request *nr, checksum = ncsi_calculate_checksum((unsigned char *)h, sizeof(*h) + payload - 4); - if (*pchecksum != htonl(checksum)) + + if (*pchecksum != htonl(checksum)) { + netdev_dbg(nr->ndp->ndev.dev, "NCSI: checksum mismatched\n"); return -EINVAL; + } return 0; } @@ -941,6 +956,26 @@ static int ncsi_rsp_handler_gpuuid(struct ncsi_request *nr) return 0; } +static int ncsi_rsp_handler_netlink(struct ncsi_request *nr) +{ + struct ncsi_dev_priv *ndp = nr->ndp; + struct ncsi_rsp_pkt *rsp; + struct ncsi_package *np; + struct ncsi_channel *nc; + int ret; + + /* Find the package */ + rsp = (struct ncsi_rsp_pkt *)skb_network_header(nr->rsp); + ncsi_find_package_and_channel(ndp, rsp->rsp.common.channel, + &np, &nc); + if (!np) + return -ENODEV; + + ret = 
ncsi_send_netlink_rsp(nr, np, nc); + + return ret; +} + static struct ncsi_rsp_handler { unsigned char type; int payload; @@ -1043,6 +1078,17 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, netdev_warn(ndp->ndev.dev, "NCSI: 'bad' packet ignored for type 0x%x\n", hdr->type); + + if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) { + if (ret == -EPERM) + goto out_netlink; + else + ncsi_send_netlink_err(ndp->ndev.dev, + nr->snd_seq, + nr->snd_portid, + &nr->nlhdr, + ret); + } goto out; } @@ -1052,6 +1098,17 @@ int ncsi_rcv_rsp(struct sk_buff *skb, struct net_device *dev, netdev_err(ndp->ndev.dev, "NCSI: Handler for packet type 0x%x returned %d\n", hdr->type, ret); + +out_netlink: + if (nr->flags == NCSI_REQ_FLAG_NETLINK_DRIVEN) { + ret = ncsi_rsp_handler_netlink(nr); + if (ret) { + netdev_err(ndp->ndev.dev, + "NCSI: Netlink handler for packet type 0x%x returned %d\n", + hdr->type, ret); + } + } + out: ncsi_free_request(nr); return ret; -- cgit v1.2.3 From a218dc82f0b5c6c8ad3d58c9870ed69e26c08b3e Mon Sep 17 00:00:00 2001 From: Fernando Fernandez Mancera Date: Wed, 10 Oct 2018 09:57:13 +0200 Subject: netfilter: nft_osf: Add ttl option support Add ttl option support to the nftables "osf" expression. 
Signed-off-by: Fernando Fernandez Mancera Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/nfnetlink_osf.h | 3 ++- include/uapi/linux/netfilter/nf_tables.h | 7 +++++ net/netfilter/nfnetlink_osf.c | 46 +++++++++++++++----------------- net/netfilter/nft_osf.c | 15 ++++++++++- 4 files changed, 44 insertions(+), 27 deletions(-) (limited to 'include/uapi') diff --git a/include/linux/netfilter/nfnetlink_osf.h b/include/linux/netfilter/nfnetlink_osf.h index ecf7dab81e9e..c6000046c966 100644 --- a/include/linux/netfilter/nfnetlink_osf.h +++ b/include/linux/netfilter/nfnetlink_osf.h @@ -27,6 +27,7 @@ bool nf_osf_match(const struct sk_buff *skb, u_int8_t family, const struct list_head *nf_osf_fingers); const char *nf_osf_find(const struct sk_buff *skb, - const struct list_head *nf_osf_fingers); + const struct list_head *nf_osf_fingers, + const int ttl_check); #endif /* _NFOSF_H */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 5444e76870bb..579974b0bf0d 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1511,9 +1511,16 @@ enum nft_flowtable_hook_attributes { }; #define NFTA_FLOWTABLE_HOOK_MAX (__NFTA_FLOWTABLE_HOOK_MAX - 1) +/** + * enum nft_osf_attributes - nftables osf expression netlink attributes + * + * @NFTA_OSF_DREG: destination register (NLA_U32: nft_registers) + * @NFTA_OSF_TTL: Value of the TTL osf option (NLA_U8) + */ enum nft_osf_attributes { NFTA_OSF_UNSPEC, NFTA_OSF_DREG, + NFTA_OSF_TTL, __NFTA_OSF_MAX, }; #define NFTA_OSF_MAX (__NFTA_OSF_MAX - 1) diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 00db27dfd2ff..6f41dd74729d 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -30,32 +30,27 @@ EXPORT_SYMBOL_GPL(nf_osf_fingers); static inline int nf_osf_ttl(const struct sk_buff *skb, int ttl_check, unsigned char f_ttl) { + struct in_device *in_dev = __in_dev_get_rcu(skb->dev); const 
struct iphdr *ip = ip_hdr(skb); - - if (ttl_check != -1) { - if (ttl_check == NF_OSF_TTL_TRUE) - return ip->ttl == f_ttl; - if (ttl_check == NF_OSF_TTL_NOCHECK) - return 1; - else if (ip->ttl <= f_ttl) - return 1; - else { - struct in_device *in_dev = __in_dev_get_rcu(skb->dev); - int ret = 0; - - for_ifa(in_dev) { - if (inet_ifa_match(ip->saddr, ifa)) { - ret = (ip->ttl == f_ttl); - break; - } - } - endfor_ifa(in_dev); - - return ret; + int ret = 0; + + if (ttl_check == NF_OSF_TTL_TRUE) + return ip->ttl == f_ttl; + if (ttl_check == NF_OSF_TTL_NOCHECK) + return 1; + else if (ip->ttl <= f_ttl) + return 1; + + for_ifa(in_dev) { + if (inet_ifa_match(ip->saddr, ifa)) { + ret = (ip->ttl == f_ttl); + break; } } - return ip->ttl == f_ttl; + endfor_ifa(in_dev); + + return ret; } struct nf_osf_hdr_ctx { @@ -213,7 +208,7 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family, if (!tcp) return false; - ttl_check = (info->flags & NF_OSF_TTL) ? info->ttl : -1; + ttl_check = (info->flags & NF_OSF_TTL) ? 
info->ttl : 0; list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { @@ -257,7 +252,8 @@ nf_osf_match(const struct sk_buff *skb, u_int8_t family, EXPORT_SYMBOL_GPL(nf_osf_match); const char *nf_osf_find(const struct sk_buff *skb, - const struct list_head *nf_osf_fingers) + const struct list_head *nf_osf_fingers, + const int ttl_check) { const struct iphdr *ip = ip_hdr(skb); const struct nf_osf_user_finger *f; @@ -275,7 +271,7 @@ const char *nf_osf_find(const struct sk_buff *skb, list_for_each_entry_rcu(kf, &nf_osf_fingers[ctx.df], finger_entry) { f = &kf->finger; - if (!nf_osf_match_one(skb, f, -1, &ctx)) + if (!nf_osf_match_one(skb, f, ttl_check, &ctx)) continue; genre = f->genre; diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index a35fb59ace73..0b452fd470c4 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -6,10 +6,12 @@ struct nft_osf { enum nft_registers dreg:8; + u8 ttl; }; static const struct nla_policy nft_osf_policy[NFTA_OSF_MAX + 1] = { [NFTA_OSF_DREG] = { .type = NLA_U32 }, + [NFTA_OSF_TTL] = { .type = NLA_U8 }, }; static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, @@ -33,7 +35,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, return; } - os_name = nf_osf_find(skb, nf_osf_fingers); + os_name = nf_osf_find(skb, nf_osf_fingers, priv->ttl); if (!os_name) strncpy((char *)dest, "unknown", NFT_OSF_MAXGENRELEN); else @@ -46,6 +48,14 @@ static int nft_osf_init(const struct nft_ctx *ctx, { struct nft_osf *priv = nft_expr_priv(expr); int err; + u8 ttl; + + if (nla_get_u8(tb[NFTA_OSF_TTL])) { + ttl = nla_get_u8(tb[NFTA_OSF_TTL]); + if (ttl > 2) + return -EINVAL; + priv->ttl = ttl; + } priv->dreg = nft_parse_register(tb[NFTA_OSF_DREG]); err = nft_validate_register_store(ctx, priv->dreg, NULL, @@ -60,6 +70,9 @@ static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr) { const struct nft_osf *priv = nft_expr_priv(expr); + if (nla_put_u8(skb, 
NFTA_OSF_TTL, priv->ttl)) + goto nla_put_failure; + if (nft_dump_register(skb, NFTA_OSF_DREG, priv->dreg)) goto nla_put_failure; -- cgit v1.2.3 From b55cbc8d9b44aaee94f19e995a5f241d453763ee Mon Sep 17 00:00:00 2001 From: Nicolas Dichtel Date: Wed, 17 Oct 2018 16:24:48 +0200 Subject: bpf: fix doc of bpf_skb_adjust_room() in uapi len_diff is signed. Fixes: fa15601ab31e ("bpf: add documentation for eBPF helpers (33-41)") CC: Quentin Monnet Signed-off-by: Nicolas Dichtel Reviewed-by: Quentin Monnet Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f9187b41dff6..5e46f6732781 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1433,7 +1433,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags) + * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f9187b41dff6..5e46f6732781 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1433,7 +1433,7 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * - * int bpf_skb_adjust_room(struct sk_buff *skb, u32 len_diff, u32 mode, u64 flags) + * int bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) * Description * Grow or shrink the room for data in the packet associated to * *skb* by *len_diff*, and according to the selected *mode*. 
-- cgit v1.2.3 From af510ebd8913bee016492832f532ed919b51c09c Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Fri, 19 Oct 2018 11:48:24 +0200 Subject: Revert "netfilter: xt_quota: fix the behavior of xt_quota module" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This reverts commit e9837e55b0200da544a095a1fca36efd7fd3ba30. When talking to Maze and Chenbo, we agreed to keep this back by now due to problems in the ruleset listing path with 32-bit arches. Signed-off-by: Maciej Żenczykowski Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/xt_quota.h | 8 ++--- net/netfilter/xt_quota.c | 55 ++++++++++++++++++++------------- 2 files changed, 36 insertions(+), 27 deletions(-) (limited to 'include/uapi') diff --git a/include/uapi/linux/netfilter/xt_quota.h b/include/uapi/linux/netfilter/xt_quota.h index d72fd52adbba..f3ba5d9e58b6 100644 --- a/include/uapi/linux/netfilter/xt_quota.h +++ b/include/uapi/linux/netfilter/xt_quota.h @@ -15,11 +15,9 @@ struct xt_quota_info { __u32 flags; __u32 pad; __aligned_u64 quota; -#ifdef __KERNEL__ - atomic64_t counter; -#else - __aligned_u64 remain; -#endif + + /* Used internally by the kernel */ + struct xt_quota_priv *master; }; #endif /* _XT_QUOTA_H */ diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c index fceae245eb03..10d61a6eed71 100644 --- a/net/netfilter/xt_quota.c +++ b/net/netfilter/xt_quota.c @@ -11,6 +11,11 @@ #include #include +struct xt_quota_priv { + spinlock_t lock; + uint64_t quota; +}; + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Sam Johnston "); MODULE_DESCRIPTION("Xtables: countdown quota match"); @@ -21,48 +26,54 @@ static bool quota_mt(const struct sk_buff *skb, struct xt_action_param *par) { struct xt_quota_info *q = (void *)par->matchinfo; - u64 current_count = atomic64_read(&q->counter); + struct xt_quota_priv *priv = q->master; bool ret = q->flags & XT_QUOTA_INVERT; - u64 old_count, new_count; - - do { - if (current_count == 1) - return 
ret; - if (current_count <= skb->len) { - atomic64_set(&q->counter, 1); - return ret; - } - old_count = current_count; - new_count = current_count - skb->len; - current_count = atomic64_cmpxchg(&q->counter, old_count, - new_count); - } while (current_count != old_count); - return !ret; + + spin_lock_bh(&priv->lock); + if (priv->quota >= skb->len) { + priv->quota -= skb->len; + ret = !ret; + } else { + /* we do not allow even small packets from now on */ + priv->quota = 0; + } + spin_unlock_bh(&priv->lock); + + return ret; } static int quota_mt_check(const struct xt_mtchk_param *par) { struct xt_quota_info *q = par->matchinfo; - BUILD_BUG_ON(sizeof(atomic64_t) != sizeof(__u64)); - if (q->flags & ~XT_QUOTA_MASK) return -EINVAL; - if (atomic64_read(&q->counter) > q->quota + 1) - return -ERANGE; - if (atomic64_read(&q->counter) == 0) - atomic64_set(&q->counter, q->quota + 1); + q->master = kmalloc(sizeof(*q->master), GFP_KERNEL); + if (q->master == NULL) + return -ENOMEM; + + spin_lock_init(&q->master->lock); + q->master->quota = q->quota; return 0; } +static void quota_mt_destroy(const struct xt_mtdtor_param *par) +{ + const struct xt_quota_info *q = par->matchinfo; + + kfree(q->master); +} + static struct xt_match quota_mt_reg __read_mostly = { .name = "quota", .revision = 0, .family = NFPROTO_UNSPEC, .match = quota_mt, .checkentry = quota_mt_check, + .destroy = quota_mt_destroy, .matchsize = sizeof(struct xt_quota_info), + .usersize = offsetof(struct xt_quota_info, master), .me = THIS_MODULE, }; -- cgit v1.2.3 From f1a2e44a3aeccb3ff18d3ccc0b0203e70b95bd92 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Thu, 18 Oct 2018 15:16:25 +0200 Subject: bpf: add queue and stack maps Queue/stack maps implement a FIFO/LIFO data storage for ebpf programs. These maps support peek, pop and push operations that are exposed to eBPF programs through the new bpf_map[peek/pop/push] helpers. 
Those operations are exposed to userspace applications through the already existing syscalls in the following way: BPF_MAP_LOOKUP_ELEM -> peek BPF_MAP_LOOKUP_AND_DELETE_ELEM -> pop BPF_MAP_UPDATE_ELEM -> push Queue/stack maps are implemented using a buffer, tail and head indexes, hence BPF_F_NO_PREALLOC is not supported. As opposed to other maps, queue and stack maps do not use RCU for protecting map values; the bpf_map[peek/pop] helpers have an ARG_PTR_TO_UNINIT_MAP_VALUE argument that is a pointer to a memory zone in which to save the value of a map. Basically the same as ARG_PTR_TO_UNINIT_MEM, but the size does not have to be passed as an extra argument. Our main motivation for implementing queue/stack maps was to keep track of a pool of elements, like network ports in a SNAT; however, we foresee other use cases, for example saving the last N kernel events in a map and then analysing them from userspace. Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 6 + include/linux/bpf_types.h | 2 + include/uapi/linux/bpf.h | 29 ++++- kernel/bpf/Makefile | 2 +- kernel/bpf/core.c | 3 + kernel/bpf/helpers.c | 43 +++++++ kernel/bpf/queue_stack_maps.c | 288 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 + kernel/bpf/verifier.c | 19 ++- net/core/filter.c | 6 + 10 files changed, 401 insertions(+), 3 deletions(-) create mode 100644 kernel/bpf/queue_stack_maps.c (limited to 'include/uapi') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 0f8b863e0229..33014ae73103 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -39,6 +39,9 @@ struct bpf_map_ops { void *(*map_lookup_elem)(struct bpf_map *map, void *key); int (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags); int (*map_delete_elem)(struct bpf_map *map, void *key); + int (*map_push_elem)(struct bpf_map *map, void *value, u64 flags); + int (*map_pop_elem)(struct bpf_map *map, void *value); + int (*map_peek_elem)(struct bpf_map
*map, void *value); /* funcs called by prog_array and perf_event_array map */ void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, @@ -811,6 +814,9 @@ static inline int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_map_push_elem_proto; +extern const struct bpf_func_proto bpf_map_pop_elem_proto; +extern const struct bpf_func_proto bpf_map_peek_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 7bad4e1947ed..44d9ab4809bd 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -69,3 +69,5 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_XSKMAP, xsk_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, reuseport_array_ops) #endif #endif +BPF_MAP_TYPE(BPF_MAP_TYPE_QUEUE, queue_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_STACK, stack_map_ops) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 5e46f6732781..70082cb626b4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -128,6 +128,8 @@ enum bpf_map_type { BPF_MAP_TYPE_CGROUP_STORAGE, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, + BPF_MAP_TYPE_QUEUE, + BPF_MAP_TYPE_STACK, }; enum bpf_prog_type { @@ -462,6 +464,28 @@ union bpf_attr { * Return * 0 on success, or a negative error in case of failure. * + * int bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is removed to + * make room for this. + * Return + * 0 on success, or a negative error in case of failure. 
+ * + * int bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * int bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. + * * int bpf_probe_read(void *dst, u32 size, const void *src) * Description * For tracing programs, safely attempt to read *size* bytes from @@ -2303,7 +2327,10 @@ union bpf_attr { FN(skb_ancestor_cgroup_id), \ FN(sk_lookup_tcp), \ FN(sk_lookup_udp), \ - FN(sk_release), + FN(sk_release), \ + FN(map_push_elem), \ + FN(map_pop_elem), \ + FN(map_peek_elem), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index ff8262626b8f..4c2fa3ac56f6 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -3,7 +3,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o -obj-$(CONFIG_BPF_SYSCALL) += local_storage.o +obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_SYSCALL) += btf.o ifeq ($(CONFIG_NET),y) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index defcf4df6d91..7c7eeea8cffc 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1783,6 +1783,9 @@ BPF_CALL_0(bpf_user_rnd_u32) const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; +const struct bpf_func_proto bpf_map_push_elem_proto __weak; +const struct bpf_func_proto bpf_map_pop_elem_proto __weak; +const struct bpf_func_proto bpf_map_peek_elem_proto __weak; const struct 
bpf_func_proto bpf_get_prandom_u32_proto __weak; const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 6502115e8f55..ab0d5e3f9892 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -76,6 +76,49 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { .arg2_type = ARG_PTR_TO_MAP_KEY, }; +BPF_CALL_3(bpf_map_push_elem, struct bpf_map *, map, void *, value, u64, flags) +{ + return map->ops->map_push_elem(map, value, flags); +} + +const struct bpf_func_proto bpf_map_push_elem_proto = { + .func = bpf_map_push_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_MAP_VALUE, + .arg3_type = ARG_ANYTHING, +}; + +BPF_CALL_2(bpf_map_pop_elem, struct bpf_map *, map, void *, value) +{ + return map->ops->map_pop_elem(map, value); +} + +const struct bpf_func_proto bpf_map_pop_elem_proto = { + .func = bpf_map_pop_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, +}; + +BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) +{ + return map->ops->map_peek_elem(map, value); +} + +const struct bpf_func_proto bpf_map_peek_elem_proto = { + .func = bpf_map_pop_elem, + .gpl_only = false, + .pkt_access = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_UNINIT_MAP_VALUE, +}; + const struct bpf_func_proto bpf_get_prandom_u32_proto = { .func = bpf_user_rnd_u32, .gpl_only = false, diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c new file mode 100644 index 000000000000..12a93fb37449 --- /dev/null +++ b/kernel/bpf/queue_stack_maps.c @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * queue_stack_maps.c: BPF queue and stack maps + * + * Copyright (c) 2018 Politecnico di Torino + */ +#include +#include +#include +#include 
"percpu_freelist.h" + +#define QUEUE_STACK_CREATE_FLAG_MASK \ + (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) + + +struct bpf_queue_stack { + struct bpf_map map; + raw_spinlock_t lock; + u32 head, tail; + u32 size; /* max_entries + 1 */ + + char elements[0] __aligned(8); +}; + +static struct bpf_queue_stack *bpf_queue_stack(struct bpf_map *map) +{ + return container_of(map, struct bpf_queue_stack, map); +} + +static bool queue_stack_map_is_empty(struct bpf_queue_stack *qs) +{ + return qs->head == qs->tail; +} + +static bool queue_stack_map_is_full(struct bpf_queue_stack *qs) +{ + u32 head = qs->head + 1; + + if (unlikely(head >= qs->size)) + head = 0; + + return head == qs->tail; +} + +/* Called from syscall */ +static int queue_stack_map_alloc_check(union bpf_attr *attr) +{ + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 0 || + attr->map_flags & ~QUEUE_STACK_CREATE_FLAG_MASK) + return -EINVAL; + + if (attr->value_size > KMALLOC_MAX_SIZE) + /* if value_size is bigger, the user space won't be able to + * access the elements. 
+ */ + return -E2BIG; + + return 0; +} + +static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) +{ + int ret, numa_node = bpf_map_attr_numa_node(attr); + struct bpf_queue_stack *qs; + u32 size, value_size; + u64 queue_size, cost; + + size = attr->max_entries + 1; + value_size = attr->value_size; + + queue_size = sizeof(*qs) + (u64) value_size * size; + + cost = queue_size; + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + ret = bpf_map_precharge_memlock(cost); + if (ret < 0) + return ERR_PTR(ret); + + qs = bpf_map_area_alloc(queue_size, numa_node); + if (!qs) + return ERR_PTR(-ENOMEM); + + memset(qs, 0, sizeof(*qs)); + + bpf_map_init_from_attr(&qs->map, attr); + + qs->map.pages = cost; + qs->size = size; + + raw_spin_lock_init(&qs->lock); + + return &qs->map; +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void queue_stack_map_free(struct bpf_map *map) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + + /* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0, + * so the programs (can be more than one that used this map) were + * disconnected from events. 
Wait for outstanding critical sections in + * these programs to complete + */ + synchronize_rcu(); + + bpf_map_area_free(qs); +} + +static int __queue_map_get(struct bpf_map *map, void *value, bool delete) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long flags; + int err = 0; + void *ptr; + + raw_spin_lock_irqsave(&qs->lock, flags); + + if (queue_stack_map_is_empty(qs)) { + err = -ENOENT; + goto out; + } + + ptr = &qs->elements[qs->tail * qs->map.value_size]; + memcpy(value, ptr, qs->map.value_size); + + if (delete) { + if (unlikely(++qs->tail >= qs->size)) + qs->tail = 0; + } + +out: + raw_spin_unlock_irqrestore(&qs->lock, flags); + return err; +} + + +static int __stack_map_get(struct bpf_map *map, void *value, bool delete) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long flags; + int err = 0; + void *ptr; + u32 index; + + raw_spin_lock_irqsave(&qs->lock, flags); + + if (queue_stack_map_is_empty(qs)) { + err = -ENOENT; + goto out; + } + + index = qs->head - 1; + if (unlikely(index >= qs->size)) + index = qs->size - 1; + + ptr = &qs->elements[index * qs->map.value_size]; + memcpy(value, ptr, qs->map.value_size); + + if (delete) + qs->head = index; + +out: + raw_spin_unlock_irqrestore(&qs->lock, flags); + return err; +} + +/* Called from syscall or from eBPF program */ +static int queue_map_peek_elem(struct bpf_map *map, void *value) +{ + return __queue_map_get(map, value, false); +} + +/* Called from syscall or from eBPF program */ +static int stack_map_peek_elem(struct bpf_map *map, void *value) +{ + return __stack_map_get(map, value, false); +} + +/* Called from syscall or from eBPF program */ +static int queue_map_pop_elem(struct bpf_map *map, void *value) +{ + return __queue_map_get(map, value, true); +} + +/* Called from syscall or from eBPF program */ +static int stack_map_pop_elem(struct bpf_map *map, void *value) +{ + return __stack_map_get(map, value, true); +} + +/* Called from syscall or from eBPF program */ 
+static int queue_stack_map_push_elem(struct bpf_map *map, void *value, + u64 flags) +{ + struct bpf_queue_stack *qs = bpf_queue_stack(map); + unsigned long irq_flags; + int err = 0; + void *dst; + + /* BPF_EXIST is used to force making room for a new element in case the + * map is full + */ + bool replace = (flags & BPF_EXIST); + + /* Check supported flags for queue and stack maps */ + if (flags & BPF_NOEXIST || flags > BPF_EXIST) + return -EINVAL; + + raw_spin_lock_irqsave(&qs->lock, irq_flags); + + if (queue_stack_map_is_full(qs)) { + if (!replace) { + err = -E2BIG; + goto out; + } + /* advance tail pointer to overwrite oldest element */ + if (unlikely(++qs->tail >= qs->size)) + qs->tail = 0; + } + + dst = &qs->elements[qs->head * qs->map.value_size]; + memcpy(dst, value, qs->map.value_size); + + if (unlikely(++qs->head >= qs->size)) + qs->head = 0; + +out: + raw_spin_unlock_irqrestore(&qs->lock, irq_flags); + return err; +} + +/* Called from syscall or from eBPF program */ +static void *queue_stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 flags) +{ + return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int queue_stack_map_delete_elem(struct bpf_map *map, void *key) +{ + return -EINVAL; +} + +/* Called from syscall */ +static int queue_stack_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) +{ + return -EINVAL; +} + +const struct bpf_map_ops queue_map_ops = { + .map_alloc_check = queue_stack_map_alloc_check, + .map_alloc = queue_stack_map_alloc, + .map_free = queue_stack_map_free, + .map_lookup_elem = queue_stack_map_lookup_elem, + .map_update_elem = queue_stack_map_update_elem, + .map_delete_elem = queue_stack_map_delete_elem, + .map_push_elem = queue_stack_map_push_elem, + .map_pop_elem = queue_map_pop_elem, + .map_peek_elem = queue_map_peek_elem, + 
.map_get_next_key = queue_stack_map_get_next_key, +}; + +const struct bpf_map_ops stack_map_ops = { + .map_alloc_check = queue_stack_map_alloc_check, + .map_alloc = queue_stack_map_alloc, + .map_free = queue_stack_map_free, + .map_lookup_elem = queue_stack_map_lookup_elem, + .map_update_elem = queue_stack_map_update_elem, + .map_delete_elem = queue_stack_map_delete_elem, + .map_push_elem = queue_stack_map_push_elem, + .map_pop_elem = stack_map_pop_elem, + .map_peek_elem = stack_map_peek_elem, + .map_get_next_key = queue_stack_map_get_next_key, +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 78d9dd95e25f..1617407f9ee5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -727,6 +727,9 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_fd_htab_map_lookup_elem(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) { err = bpf_fd_reuseport_array_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_peek_elem(map, value); } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -857,6 +860,9 @@ static int map_update_elem(union bpf_attr *attr) /* rcu_read_lock() is not needed */ err = bpf_fd_reuseport_array_update_elem(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_push_elem(map, value, attr->flags); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d84c91ac3b70..7d6d9cf9ebd5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2324,6 +2324,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (func_id != BPF_FUNC_sk_select_reuseport) goto error; break; + case BPF_MAP_TYPE_QUEUE: + case BPF_MAP_TYPE_STACK: + if (func_id != BPF_FUNC_map_peek_elem && + func_id != 
BPF_FUNC_map_pop_elem && + func_id != BPF_FUNC_map_push_elem) + goto error; + break; default: break; } @@ -2380,6 +2387,13 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) goto error; break; + case BPF_FUNC_map_peek_elem: + case BPF_FUNC_map_pop_elem: + case BPF_FUNC_map_push_elem: + if (map->map_type != BPF_MAP_TYPE_QUEUE && + map->map_type != BPF_MAP_TYPE_STACK) + goto error; + break; default: break; } @@ -2675,7 +2689,10 @@ record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, if (func_id != BPF_FUNC_tail_call && func_id != BPF_FUNC_map_lookup_elem && func_id != BPF_FUNC_map_update_elem && - func_id != BPF_FUNC_map_delete_elem) + func_id != BPF_FUNC_map_delete_elem && + func_id != BPF_FUNC_map_push_elem && + func_id != BPF_FUNC_map_pop_elem && + func_id != BPF_FUNC_map_peek_elem) return 0; if (meta->map_ptr == NULL) { diff --git a/net/core/filter.c b/net/core/filter.c index 1a3ac6c46873..ea48ec789b5c 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4876,6 +4876,12 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_map_push_elem: + return &bpf_map_push_elem_proto; + case BPF_FUNC_map_pop_elem: + return &bpf_map_pop_elem_proto; + case BPF_FUNC_map_peek_elem: + return &bpf_map_peek_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; case BPF_FUNC_get_smp_processor_id: -- cgit v1.2.3 From bd513cd08f10cbe28856f99ae951e86e86803861 Mon Sep 17 00:00:00 2001 From: Mauricio Vasquez B Date: Thu, 18 Oct 2018 15:16:30 +0200 Subject: bpf: add MAP_LOOKUP_AND_DELETE_ELEM syscall The previous patch implemented a bpf queue/stack maps that provided the peek/pop/push functions. 
There is no direct relationship between those functions and the current map syscalls, hence a new MAP_LOOKUP_AND_DELETE_ELEM syscall command is added. It is mapped to the pop operation in the queue/stack maps and is still to be implemented for other kinds of maps. Signed-off-by: Mauricio Vasquez B Acked-by: Song Liu Signed-off-by: Alexei Starovoitov --- include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 66 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) (limited to 'include/uapi') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 70082cb626b4..a2fb333290dc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -103,6 +103,7 @@ enum bpf_cmd { BPF_BTF_LOAD, BPF_BTF_GET_FD_BY_ID, BPF_TASK_FD_QUERY, + BPF_MAP_LOOKUP_AND_DELETE_ELEM, }; enum bpf_map_type { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 1617407f9ee5..49ae64a26562 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -999,6 +999,69 @@ err_put: return err; } +#define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value + +static int map_lookup_and_delete_elem(union bpf_attr *attr) +{ + void __user *ukey = u64_to_user_ptr(attr->key); + void __user *uvalue = u64_to_user_ptr(attr->value); + int ufd = attr->map_fd; + struct bpf_map *map; + void *key, *value, *ptr; + u32 value_size; + struct fd f; + int err; + + if (CHECK_ATTR(BPF_MAP_LOOKUP_AND_DELETE_ELEM)) + return -EINVAL; + + f = fdget(ufd); + map = __bpf_map_get(f); + if (IS_ERR(map)) + return PTR_ERR(map); + + if (!(f.file->f_mode & FMODE_CAN_WRITE)) { + err = -EPERM; + goto err_put; + } + + key = __bpf_copy_key(ukey, map->key_size); + if (IS_ERR(key)) { + err = PTR_ERR(key); + goto err_put; + } + + value_size = map->value_size; + + err = -ENOMEM; + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); + if (!value) + goto free_key; + + if (map->map_type == BPF_MAP_TYPE_QUEUE || + map->map_type == BPF_MAP_TYPE_STACK) { + err = map->ops->map_pop_elem(map, value); +
} else { + err = -ENOTSUPP; + } + + if (err) + goto free_value; + + if (copy_to_user(uvalue, value, value_size) != 0) + goto free_value; + + err = 0; + +free_value: + kfree(value); +free_key: + kfree(key); +err_put: + fdput(f); + return err; +} + static const struct bpf_prog_ops * const bpf_prog_types[] = { #define BPF_PROG_TYPE(_id, _name) \ [_id] = & _name ## _prog_ops, @@ -2472,6 +2535,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_TASK_FD_QUERY: err = bpf_task_fd_query(&attr, uattr); break; + case BPF_MAP_LOOKUP_AND_DELETE_ELEM: + err = map_lookup_and_delete_elem(&attr); + break; default: err = -EINVAL; break; -- cgit v1.2.3 From 6fff607e2f14bd7c63c06c464a6f93b8efbabe28 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 19 Oct 2018 19:56:49 -0700 Subject: bpf: sk_msg program helper bpf_msg_push_data This allows the user to push data into a msg using sk_msg program types. The format is as follows: bpf_msg_push_data(msg, offset, len, flags). This will insert 'len' bytes at offset 'offset'. For example, to prepend 10 bytes at the front of the message the user can call bpf_msg_push_data(msg, 0, 10, 0); This will invalidate data bounds, so the BPF user will have to recheck data bounds after calling this. After this the msg size will have been updated and the user is free to write into the added bytes. We allow any offset/len as long as it is within the (data, data_end) range. However, a copy will be required if the ring is full, and it is possible for the helper to fail with ENOMEM or EINVAL errors, which need to be handled by the BPF program. This can be used similarly to XDP metadata to pass data between the sk_msg layer and lower layers.
Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/skmsg.h | 5 ++ include/uapi/linux/bpf.h | 20 ++++++- net/core/filter.c | 134 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 158 insertions(+), 1 deletion(-) (limited to 'include/uapi') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 84e18863f6a4..2a11e9d91dfa 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -207,6 +207,11 @@ static inline struct scatterlist *sk_msg_elem(struct sk_msg *msg, int which) return &msg->sg.data[which]; } +static inline struct scatterlist sk_msg_elem_cpy(struct sk_msg *msg, int which) +{ + return msg->sg.data[which]; +} + static inline struct page *sk_msg_page(struct sk_msg *msg, int which) { return sg_page(sk_msg_elem(msg, which)); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a2fb333290dc..852dc17ab47a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -2240,6 +2240,23 @@ union bpf_attr { * pointer that was returned from bpf_sk_lookup_xxx\ (). * Return * 0 on success, or a negative error in case of failure. + * + * int bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * Description + * For socket policies, insert *len* bytes into msg at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the msg. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail if under memory pressure (a malloc + * fails); in these cases BPF programs will get an appropriate + * error and BPF programs will need to handle them. + * + * Return + * 0 on success, or a negative error in case of failure.
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -2331,7 +2348,8 @@ union bpf_attr { FN(sk_release), \ FN(map_push_elem), \ FN(map_pop_elem), \ - FN(map_peek_elem), + FN(map_peek_elem), \ + FN(msg_push_data), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 5fd5139e8638..35c6933c2622 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2297,6 +2297,137 @@ static const struct bpf_func_proto bpf_msg_pull_data_proto = { .arg4_type = ARG_ANYTHING, }; +BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start, + u32, len, u64, flags) +{ + struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge; + u32 new, i = 0, l, space, copy = 0, offset = 0; + u8 *raw, *to, *from; + struct page *page; + + if (unlikely(flags)) + return -EINVAL; + + /* First find the starting scatterlist element */ + i = msg->sg.start; + do { + l = sk_msg_elem(msg, i)->length; + + if (start < offset + l) + break; + offset += l; + sk_msg_iter_var_next(i); + } while (i != msg->sg.end); + + if (start >= offset + l) + return -EINVAL; + + space = MAX_MSG_FRAGS - sk_msg_elem_used(msg); + + /* If no space is available we fall back to copy; we need at + * least one scatterlist elem available to push data into + * when start aligns to the beginning of an element or two + * when it falls inside an element. We handle the start equals + * offset case because it's the common case for inserting a + * header.
+ */ + if (!space || (space == 1 && start != offset)) + copy = msg->sg.data[i].length; + + page = alloc_pages(__GFP_NOWARN | GFP_ATOMIC | __GFP_COMP, + get_order(copy + len)); + if (unlikely(!page)) + return -ENOMEM; + + if (copy) { + int front, back; + + raw = page_address(page); + + psge = sk_msg_elem(msg, i); + front = start - offset; + back = psge->length - front; + from = sg_virt(psge); + + if (front) + memcpy(raw, from, front); + + if (back) { + from += front; + to = raw + front + len; + + memcpy(to, from, back); + } + + put_page(sg_page(psge)); + } else if (start - offset) { + psge = sk_msg_elem(msg, i); + rsge = sk_msg_elem_cpy(msg, i); + + psge->length = start - offset; + rsge.length -= psge->length; + rsge.offset += start; + + sk_msg_iter_var_next(i); + sg_unmark_end(psge); + sk_msg_iter_next(msg, end); + } + + /* Slot(s) to place newly allocated data */ + new = i; + + /* Shift one or two slots as needed */ + if (!copy) { + sge = sk_msg_elem_cpy(msg, i); + + sk_msg_iter_var_next(i); + sg_unmark_end(&sge); + sk_msg_iter_next(msg, end); + + nsge = sk_msg_elem_cpy(msg, i); + if (rsge.length) { + sk_msg_iter_var_next(i); + nnsge = sk_msg_elem_cpy(msg, i); + } + + while (i != msg->sg.end) { + msg->sg.data[i] = sge; + sge = nsge; + sk_msg_iter_var_next(i); + if (rsge.length) { + nsge = nnsge; + nnsge = sk_msg_elem_cpy(msg, i); + } else { + nsge = sk_msg_elem_cpy(msg, i); + } + } + } + + /* Place newly allocated data buffer */ + sk_mem_charge(msg->sk, len); + msg->sg.size += len; + msg->sg.copy[new] = false; + sg_set_page(&msg->sg.data[new], page, len + copy, 0); + if (rsge.length) { + get_page(sg_page(&rsge)); + sk_msg_iter_var_next(new); + msg->sg.data[new] = rsge; + } + + sk_msg_compute_data_pointers(msg); + return 0; +} + +static const struct bpf_func_proto bpf_msg_push_data_proto = { + .func = bpf_msg_push_data, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_ANYTHING, + 
.arg4_type = ARG_ANYTHING, +}; + BPF_CALL_1(bpf_get_cgroup_classid, const struct sk_buff *, skb) { return task_get_classid(skb); @@ -4854,6 +4985,7 @@ bool bpf_helper_changes_pkt_data(void *func) func == bpf_xdp_adjust_head || func == bpf_xdp_adjust_meta || func == bpf_msg_pull_data || + func == bpf_msg_push_data || func == bpf_xdp_adjust_tail || #if IS_ENABLED(CONFIG_IPV6_SEG6_BPF) func == bpf_lwt_seg6_store_bytes || @@ -5130,6 +5262,8 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_cork_bytes_proto; case BPF_FUNC_msg_pull_data: return &bpf_msg_pull_data_proto; + case BPF_FUNC_msg_push_data: + return &bpf_msg_push_data_proto; case BPF_FUNC_get_local_storage: return &bpf_get_local_storage_proto; default: -- cgit v1.2.3