From 4a4b8169501b18c3450ac735a7e277b24886a651 Mon Sep 17 00:00:00 2001 From: Andrew Zaborowski Date: Fri, 10 Feb 2017 10:02:31 +0100 Subject: cfg80211: Accept multiple RSSI thresholds for CQM Change the SET CQM command's RSSI threshold attribute to accept any number of thresholds as a sorted array. The API should be backwards compatible so that if one s32 threshold value is passed, the old mechanism is enabled. The netlink event generated is the same in both cases. cfg80211 handles an arbitrary number of RSSI thresholds but drivers have to provide a method (set_cqm_rssi_range_config) that configures a range set by a high and a low value. Drivers have to call back when the RSSI goes out of that range and there's no additional event for each time the range is reconfigured as there was with the current one-threshold API. This method doesn't have a hysteresis parameter because there's no benefit to the cfg80211 code from having the hysteresis be handled by hardware/driver in terms of the number of wakeups. At the same time it would likely be less consistent between drivers if offloaded or done in the drivers. Signed-off-by: Andrew Zaborowski Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5ed257c4cd4e..9a499b15cfbc 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3942,7 +3942,10 @@ enum nl80211_ps_state { * @__NL80211_ATTR_CQM_INVALID: invalid * @NL80211_ATTR_CQM_RSSI_THOLD: RSSI threshold in dBm. This value specifies * the threshold for the RSSI level at which an event will be sent. Zero - * to disable. + * to disable. Alternatively, if %NL80211_EXT_FEATURE_CQM_RSSI_LIST is + * set, multiple values can be supplied as a low-to-high sorted array of + * threshold values in dBm. Events will be sent when the RSSI value + * crosses any of the thresholds. * @NL80211_ATTR_CQM_RSSI_HYST: RSSI hysteresis in dBm. This value specifies * the minimum amount the RSSI level must change after an event before a * new event may be issued (to reduce effects of RSSI oscillation). @@ -4753,6 +4756,9 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI: The driver supports sched_scan * for reporting BSSs with better RSSI than the current connected BSS * (%NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI). + * @NL80211_EXT_FEATURE_CQM_RSSI_LIST: With this driver the + * %NL80211_ATTR_CQM_RSSI_THOLD attribute accepts a list of zero or more + * RSSI threshold values to monitor rather than exactly one threshold. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4771,6 +4777,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED, NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI, + NL80211_EXT_FEATURE_CQM_RSSI_LIST, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, -- cgit v1.3 From b35a51c7dd25a823767969e3089542d7478777e9 Mon Sep 17 00:00:00 2001 From: Vasanthakumar Thiagarajan Date: Mon, 27 Feb 2017 17:04:33 +0530 Subject: cfg80211: Make pre-CAC results valid only for ETSI domain DFS requirement for ETSI domain (section 4.7.1.4 in ETSI EN 301 893 V1.8.1) is the only one which explicitly states that once DFS channel is marked as available afer the CAC, this channel will remain in available state even moving to a different operating channel. But the same is not explicitly stated in FCC DFS requirement. Also, Pre-CAC requriements are not explicitly mentioned in FCC requirement. Current implementation in keeping DFS channel in available state is same as described in ETSI domain. For non-ETSI DFS domain, this patch gives a grace period of 2 seconds since the completion of successful CAC before moving the channel's DFS state to 'usable' from 'available' state. The same grace period is checked against the channel's dfs_state_entered timestamp while deciding if a DFS channel is available for operation. There is a new radar event, NL80211_RADAR_PRE_CAC_EXPIRED, reported when DFS channel is moved from available to usable state after the grace period. Also make sure the DFS channel state is reset to usable once the beaconing operation on that channel is brought down (like stop_ap, leave_ibss and leave_mesh) in non-ETSI domain. Signed-off-by: Vasanthakumar Thiagarajan Signed-off-by: Johannes Berg --- include/uapi/linux/nl80211.h | 5 +++ net/wireless/ap.c | 5 +++ net/wireless/chan.c | 101 +++++++++++++++++++++++++++++++++++++++++++ net/wireless/core.h | 10 +++++ net/wireless/ibss.c | 1 + net/wireless/mesh.c | 1 + net/wireless/mlme.c | 40 +++++++++++++---- net/wireless/reg.c | 28 ++++++++++++ net/wireless/reg.h | 14 ++++++ 9 files changed, 196 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 9a499b15cfbc..cd4dfef58fab 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -4913,12 +4913,17 @@ enum nl80211_smps_mode { * change to the channel status. * @NL80211_RADAR_NOP_FINISHED: The Non-Occupancy Period for this channel is * over, channel becomes usable. + * @NL80211_RADAR_PRE_CAC_EXPIRED: Channel Availability Check done on this + * non-operating channel is expired and no longer valid. New CAC must + * be done on this channel before starting the operation. This is not + * applicable for ETSI dfs domain where pre-CAC is valid for ever. */ enum nl80211_radar_event { NL80211_RADAR_DETECTED, NL80211_RADAR_CAC_FINISHED, NL80211_RADAR_CAC_ABORTED, NL80211_RADAR_NOP_FINISHED, + NL80211_RADAR_PRE_CAC_EXPIRED, }; /** diff --git a/net/wireless/ap.c b/net/wireless/ap.c index bdad1f951561..25666d3009be 100644 --- a/net/wireless/ap.c +++ b/net/wireless/ap.c @@ -32,6 +32,11 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, rdev_set_qos_map(rdev, dev, NULL); if (notify) nl80211_send_ap_stopped(wdev); + + /* Should we apply the grace period during beaconing interface + * shutdown also? + */ + cfg80211_sched_dfs_chan_update(rdev); } return err; diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 5497d022fada..099f13c0c39e 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -456,6 +456,107 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, return (r1 + r2 > 0); } +/* + * Checks if center frequency of chan falls with in the bandwidth + * range of chandef. + */ +bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, + struct ieee80211_channel *chan) +{ + int width; + u32 cf_offset, freq; + + if (chandef->chan->center_freq == chan->center_freq) + return true; + + width = cfg80211_chandef_get_width(chandef); + if (width <= 20) + return false; + + cf_offset = width / 2 - 10; + + for (freq = chandef->center_freq1 - width / 2 + 10; + freq <= chandef->center_freq1 + width / 2 - 10; freq += 20) { + if (chan->center_freq == freq) + return true; + } + + if (!chandef->center_freq2) + return false; + + for (freq = chandef->center_freq2 - width / 2 + 10; + freq <= chandef->center_freq2 + width / 2 - 10; freq += 20) { + if (chan->center_freq == freq) + return true; + } + + return false; +} + +bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev) +{ + bool active = false; + + ASSERT_WDEV_LOCK(wdev); + + if (!wdev->chandef.chan) + return false; + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + active = wdev->beacon_interval != 0; + break; + case NL80211_IFTYPE_ADHOC: + active = wdev->ssid_len != 0; + break; + case NL80211_IFTYPE_MESH_POINT: + active = wdev->mesh_id_len != 0; + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_OCB: + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_MONITOR: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_WDS: + case NL80211_IFTYPE_P2P_DEVICE: + /* Can NAN type be considered as beaconing interface? */ + case NL80211_IFTYPE_NAN: + break; + case NL80211_IFTYPE_UNSPECIFIED: + case NUM_NL80211_IFTYPES: + WARN_ON(1); + } + + return active; +} + +bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy, + struct ieee80211_channel *chan) +{ + struct wireless_dev *wdev; + + ASSERT_RTNL(); + + if (!(chan->flags & IEEE80211_CHAN_RADAR)) + return false; + + list_for_each_entry(wdev, &wiphy->wdev_list, list) { + wdev_lock(wdev); + if (!cfg80211_beaconing_iface_active(wdev)) { + wdev_unlock(wdev); + continue; + } + + if (cfg80211_is_sub_chan(&wdev->chandef, chan)) { + wdev_unlock(wdev); + return true; + } + wdev_unlock(wdev); + } + + return false; +} static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy, u32 center_freq, diff --git a/net/wireless/core.h b/net/wireless/core.h index efa690a7ef8d..519a29ebde5b 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -466,6 +466,16 @@ unsigned int cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef); +void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev); + +bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy, + struct ieee80211_channel *chan); + +bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev); + +bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, + struct ieee80211_channel *chan); + static inline unsigned int elapsed_jiffies_msecs(unsigned long start) { unsigned long end = jiffies; diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 364f900a3dc4..10bf040a0982 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -190,6 +190,7 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) if (!nowext) wdev->wext.ibss.ssid_len = 0; #endif + cfg80211_sched_dfs_chan_update(rdev); } void cfg80211_clear_ibss(struct net_device *dev, bool nowext) diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index 2d8518a37eab..ec0b1c20ac99 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -262,6 +262,7 @@ int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, wdev->beacon_interval = 0; memset(&wdev->chandef, 0, sizeof(wdev->chandef)); rdev_set_qos_map(rdev, dev, NULL); + cfg80211_sched_dfs_chan_update(rdev); } return err; diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 22b3d9990065..cd29366a5206 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -745,6 +745,12 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, } EXPORT_SYMBOL(cfg80211_rx_mgmt); +void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev) +{ + cancel_delayed_work(&rdev->dfs_update_channels_wk); + queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, 0); +} + void cfg80211_dfs_channels_update_work(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); @@ -755,6 +761,8 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work) struct wiphy *wiphy; bool check_again = false; unsigned long timeout, next_time = 0; + unsigned long time_dfs_update; + enum nl80211_radar_event radar_event; int bandid, i; rdev = container_of(delayed_work, struct cfg80211_registered_device, @@ -770,11 +778,27 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work) for (i = 0; i < sband->n_channels; i++) { c = &sband->channels[i]; - if (c->dfs_state != NL80211_DFS_UNAVAILABLE) + if (!(c->flags & IEEE80211_CHAN_RADAR)) + continue; + + if (c->dfs_state != NL80211_DFS_UNAVAILABLE && + c->dfs_state != NL80211_DFS_AVAILABLE) continue; - timeout = c->dfs_state_entered + msecs_to_jiffies( - IEEE80211_DFS_MIN_NOP_TIME_MS); + if (c->dfs_state == NL80211_DFS_UNAVAILABLE) { + time_dfs_update = IEEE80211_DFS_MIN_NOP_TIME_MS; + radar_event = NL80211_RADAR_NOP_FINISHED; + } else { + if (regulatory_pre_cac_allowed(wiphy) || + cfg80211_any_wiphy_oper_chan(wiphy, c)) + continue; + + time_dfs_update = REG_PRE_CAC_EXPIRY_GRACE_MS; + radar_event = NL80211_RADAR_PRE_CAC_EXPIRED; + } + + timeout = c->dfs_state_entered + + msecs_to_jiffies(time_dfs_update); if (time_after_eq(jiffies, timeout)) { c->dfs_state = NL80211_DFS_USABLE; @@ -784,8 +808,8 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work) NL80211_CHAN_NO_HT); nl80211_radar_notify(rdev, &chandef, - NL80211_RADAR_NOP_FINISHED, - NULL, GFP_ATOMIC); + radar_event, NULL, + GFP_ATOMIC); continue; } @@ -810,7 +834,6 @@ void cfg80211_radar_event(struct wiphy *wiphy, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - unsigned long timeout; trace_cfg80211_radar_event(wiphy, chandef); @@ -820,9 +843,7 @@ void cfg80211_radar_event(struct wiphy *wiphy, */ cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_UNAVAILABLE); - timeout = msecs_to_jiffies(IEEE80211_DFS_MIN_NOP_TIME_MS); - queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, - timeout); + cfg80211_sched_dfs_chan_update(rdev); nl80211_radar_notify(rdev, chandef, NL80211_RADAR_DETECTED, NULL, gfp); } @@ -851,6 +872,7 @@ void cfg80211_cac_event(struct net_device *netdev, msecs_to_jiffies(wdev->cac_time_ms); WARN_ON(!time_after_eq(jiffies, timeout)); cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE); + cfg80211_sched_dfs_chan_update(rdev); break; case NL80211_RADAR_CAC_ABORTED: break; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 753efcd51fa3..e59b192459e8 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -3120,6 +3120,34 @@ bool regulatory_indoor_allowed(void) return reg_is_indoor; } +bool regulatory_pre_cac_allowed(struct wiphy *wiphy) +{ + const struct ieee80211_regdomain *regd = NULL; + const struct ieee80211_regdomain *wiphy_regd = NULL; + bool pre_cac_allowed = false; + + rcu_read_lock(); + + regd = rcu_dereference(cfg80211_regdomain); + wiphy_regd = rcu_dereference(wiphy->regd); + if (!wiphy_regd) { + if (regd->dfs_region == NL80211_DFS_ETSI) + pre_cac_allowed = true; + + rcu_read_unlock(); + + return pre_cac_allowed; + } + + if (regd->dfs_region == wiphy_regd->dfs_region && + wiphy_regd->dfs_region == NL80211_DFS_ETSI) + pre_cac_allowed = true; + + rcu_read_unlock(); + + return pre_cac_allowed; +} + int __init regulatory_init(void) { int err = 0; diff --git a/net/wireless/reg.h b/net/wireless/reg.h index f6ced316b5a4..ff078f093989 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -143,4 +143,18 @@ int cfg80211_get_unii(int freq); */ bool regulatory_indoor_allowed(void); +/* + * Grace period to timeout pre-CAC results on the dfs channels. This timeout + * value is used for Non-ETSI domain. + * TODO: May be make this timeout available through regdb? + */ +#define REG_PRE_CAC_EXPIRY_GRACE_MS 2000 + +/** + * regulatory_pre_cac_allowed - if pre-CAC allowed in the current dfs domain + * @wiphy: wiphy for which pre-CAC capability is checked. + + * Pre-CAC is allowed only in ETSI domain. + */ +bool regulatory_pre_cac_allowed(struct wiphy *wiphy); #endif /* __NET_WIRELESS_REG_H */ -- cgit v1.3 From 3206caded81ad9bdb2e7ff4c0b94ec5913df8618 Mon Sep 17 00:00:00 2001 From: Laura Garcia Liebana Date: Thu, 2 Mar 2017 17:00:14 +0100 Subject: netfilter: nft_hash: support of symmetric hash This patch provides symmetric hash support according to source ip address and port, and destination ip address and port. For this purpose, the __skb_get_hash_symmetric() is used to identify the flow as it uses FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL flag by default. The new attribute NFTA_HASH_TYPE has been included to support different types of hashing functions. Currently supported NFT_HASH_JENKINS through jhash and NFT_HASH_SYM through symhash. The main difference between both types are: - jhash requires an expression with sreg, symhash doesn't. - symhash supports modulus and offset, but not seed. Examples: nft add rule ip nat prerouting ct mark set jhash ip saddr mod 2 nft add rule ip nat prerouting ct mark set symhash mod 2 By default, jenkins hash will be used if no hash type is provided for compatibility reasons. Signed-off-by: Laura Garcia Liebana Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 13 +++++ net/netfilter/nft_hash.c | 99 +++++++++++++++++++++++++++++++- 2 files changed, 111 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 05215d30fe5c..4f7d75682c59 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -815,6 +815,17 @@ enum nft_rt_keys { NFT_RT_NEXTHOP6, }; +/** + * enum nft_hash_types - nf_tables hash expression types + * + * @NFT_HASH_JENKINS: Jenkins Hash + * @NFT_HASH_SYM: Symmetric Hash + */ +enum nft_hash_types { + NFT_HASH_JENKINS, + NFT_HASH_SYM, +}; + /** * enum nft_hash_attributes - nf_tables hash expression netlink attributes * @@ -824,6 +835,7 @@ enum nft_rt_keys { * @NFTA_HASH_MODULUS: modulus value (NLA_U32) * @NFTA_HASH_SEED: seed value (NLA_U32) * @NFTA_HASH_OFFSET: add this offset value to hash result (NLA_U32) + * @NFTA_HASH_TYPE: hash operation (NLA_U32: nft_hash_types) */ enum nft_hash_attributes { NFTA_HASH_UNSPEC, @@ -833,6 +845,7 @@ enum nft_hash_attributes { NFTA_HASH_MODULUS, NFTA_HASH_SEED, NFTA_HASH_OFFSET, + NFTA_HASH_TYPE, __NFTA_HASH_MAX, }; #define NFTA_HASH_MAX (__NFTA_HASH_MAX - 1) diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index ccb834ef049b..a6a4633725bb 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -38,6 +38,25 @@ static void nft_jhash_eval(const struct nft_expr *expr, regs->data[priv->dreg] = h + priv->offset; } +struct nft_symhash { + enum nft_registers dreg:8; + u32 modulus; + u32 offset; +}; + +static void nft_symhash_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_symhash *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + u32 h; + + h = reciprocal_scale(__skb_get_hash_symmetric(skb), priv->modulus); + + regs->data[priv->dreg] = h + priv->offset; +} + static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_SREG] = { .type = NLA_U32 }, [NFTA_HASH_DREG] = { .type = NLA_U32 }, @@ -45,6 +64,7 @@ static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_MODULUS] = { .type = NLA_U32 }, [NFTA_HASH_SEED] = { .type = NLA_U32 }, [NFTA_HASH_OFFSET] = { .type = NLA_U32 }, + [NFTA_HASH_TYPE] = { .type = NLA_U32 }, }; static int nft_jhash_init(const struct nft_ctx *ctx, @@ -92,6 +112,32 @@ static int nft_jhash_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } +static int nft_symhash_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_symhash *priv = nft_expr_priv(expr); + + if (!tb[NFTA_HASH_DREG] || + !tb[NFTA_HASH_MODULUS]) + return -EINVAL; + + if (tb[NFTA_HASH_OFFSET]) + priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); + + priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); + + priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); + if (priv->modulus <= 1) + return -ERANGE; + + if (priv->offset + priv->modulus - 1 < priv->offset) + return -EOVERFLOW; + + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, sizeof(u32)); +} + static int nft_jhash_dump(struct sk_buff *skb, const struct nft_expr *expr) { @@ -110,6 +156,28 @@ static int nft_jhash_dump(struct sk_buff *skb, if (priv->offset != 0) if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset))) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HASH_TYPE, htonl(NFT_HASH_JENKINS))) + goto nla_put_failure; + return 0; + +nla_put_failure: + return -1; +} + +static int nft_symhash_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_symhash *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_HASH_DREG, priv->dreg)) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus))) + goto nla_put_failure; + if (priv->offset != 0) + if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HASH_TYPE, htonl(NFT_HASH_SYM))) + goto nla_put_failure; return 0; nla_put_failure: @@ -125,9 +193,38 @@ static const struct nft_expr_ops nft_jhash_ops = { .dump = nft_jhash_dump, }; +static const struct nft_expr_ops nft_symhash_ops = { + .type = &nft_hash_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)), + .eval = nft_symhash_eval, + .init = nft_symhash_init, + .dump = nft_symhash_dump, +}; + +static const struct nft_expr_ops * +nft_hash_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + u32 type; + + if (!tb[NFTA_HASH_TYPE]) + return &nft_jhash_ops; + + type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE])); + switch (type) { + case NFT_HASH_SYM: + return &nft_symhash_ops; + case NFT_HASH_JENKINS: + return &nft_jhash_ops; + default: + break; + } + return ERR_PTR(-EOPNOTSUPP); +} + static struct nft_expr_type nft_hash_type __read_mostly = { .name = "hash", - .ops = &nft_jhash_ops, + .select_ops = &nft_hash_select_ops, .policy = nft_hash_policy, .maxattr = NFTA_HASH_MAX, .owner = THIS_MODULE, -- cgit v1.3 From df789fe752065f2ce761ba434125e335b514899f Mon Sep 17 00:00:00 2001 From: David Forster Date: Thu, 23 Feb 2017 16:27:18 +0000 Subject: ipv6: Provide ipv6 version of "disable_policy" sysctl This provides equivalent functionality to the existing ipv4 "disable_policy" systcl. ie. Allows IPsec processing to be skipped on terminating packets on a per-interface basis. Signed-off-by: David Forster Signed-off-by: David S. Miller --- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + net/ipv6/addrconf.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 71be5b330d21..f0d79bd054ca 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -70,6 +70,7 @@ struct ipv6_devconf { #endif __u32 enhanced_dad; __u32 addr_gen_mode; + __s32 disable_policy; struct ctl_table_header *sysctl_header; }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 8ef9e75e004e..d8f6a1ac9af4 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -183,6 +183,7 @@ enum { DEVCONF_SEG6_REQUIRE_HMAC, DEVCONF_ENHANCED_DAD, DEVCONF_ADDR_GEN_MODE, + DEVCONF_DISABLE_POLICY, DEVCONF_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 363172527e43..8c69768a5c46 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -245,6 +245,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { #endif .enhanced_dad = 1, .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, + .disable_policy = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -297,6 +298,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { #endif .enhanced_dad = 1, .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, + .disable_policy = 0, }; /* Check if a valid qdisc is available */ @@ -944,6 +946,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, const struct in6_addr *peer_addr, int pfxlen, int scope, u32 flags, u32 valid_lft, u32 prefered_lft) { + struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; struct rt6_info *rt; unsigned int hash; @@ -990,6 +993,10 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } + if (net->ipv6.devconf_all->disable_policy || + idev->cnf.disable_policy) + rt->dst.flags |= DST_NOPOLICY; + neigh_parms_data_state_setall(idev->nd_parms); ifa->addr = *addr; @@ -5003,6 +5010,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, #endif array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; + array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; } static inline size_t inet6_ifla6_size(void) @@ -5827,6 +5835,105 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, return ret; } +static +void addrconf_set_nopolicy(struct rt6_info *rt, int action) +{ + if (rt) { + if (action) + rt->dst.flags |= DST_NOPOLICY; + else + rt->dst.flags &= ~DST_NOPOLICY; + } +} + +static +void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) +{ + struct inet6_ifaddr *ifa; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + spin_lock(&ifa->lock); + if (ifa->rt) { + struct rt6_info *rt = ifa->rt; + struct fib6_table *table = rt->rt6i_table; + int cpu; + + read_lock(&table->tb6_lock); + addrconf_set_nopolicy(ifa->rt, val); + if (rt->rt6i_pcpu) { + for_each_possible_cpu(cpu) { + struct rt6_info **rtp; + + rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu); + addrconf_set_nopolicy(*rtp, val); + } + } + read_unlock(&table->tb6_lock); + } + spin_unlock(&ifa->lock); + } + read_unlock_bh(&idev->lock); +} + +static +int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) +{ + struct inet6_dev *idev; + struct net *net; + + if (!rtnl_trylock()) + return restart_syscall(); + + *valp = val; + + net = (struct net *)ctl->extra2; + if (valp == &net->ipv6.devconf_dflt->disable_policy) { + rtnl_unlock(); + return 0; + } + + if (valp == &net->ipv6.devconf_all->disable_policy) { + struct net_device *dev; + + for_each_netdev(net, dev) { + idev = __in6_dev_get(dev); + if (idev) + addrconf_disable_policy_idev(idev, val); + } + } else { + idev = (struct inet6_dev *)ctl->extra1; + addrconf_disable_policy_idev(idev, val); + } + + rtnl_unlock(); + return 0; +} + +static +int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + struct ctl_table lctl; + int ret; + + lctl = *ctl; + lctl.data = &val; + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); + + if (write && (*valp != val)) + ret = addrconf_disable_policy(ctl, valp, val); + + if (ret) + *ppos = pos; + + return ret; +} + static int minus_one = -1; static const int one = 1; static const int two_five_five = 255; @@ -6184,6 +6291,13 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = addrconf_sysctl_addr_gen_mode, }, + { + .procname = "disable_policy", + .data = &ipv6_devconf.disable_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_disable_policy, + }, { /* sentinel */ } -- cgit v1.3 From 49b499718fa1b0d639663cfd03085b9bfd23cdc8 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Wed, 8 Mar 2017 16:03:32 +0100 Subject: net: sched: make default fifo qdiscs appear in the dump The original reason [1] for having hidden qdiscs (potential scalability issues in qdisc_match_from_root() with single linked list in case of large amount of qdiscs) has been invalidated by 59cc1f61f0 ("net: sched: convert qdisc linked list to hashtable"). This allows us for bringing more clarity and determinism into the dump by making default pfifo qdiscs visible. We're not turning this on by default though, at it was deemed [2] too intrusive / unnecessary change of default behavior towards userspace. Instead, TCA_DUMP_INVISIBLE netlink attribute is introduced, which allows applications to request complete qdisc hierarchy dump, including the ones that have always been implicit/invisible. Singleton noop_qdisc stays invisible, as teaching the whole infrastructure about singletons would require quite some surgery with very little gain (seeing no qdisc or seeing noop qdisc in the dump is probably setting the same user expectation). [1] http://lkml.kernel.org/r/1460732328.10638.74.camel@edumazet-glaptop3.roam.corp.google.com [2] http://lkml.kernel.org/r/20161021.105935.1907696543877061916.davem@davemloft.net Signed-off-by: Jiri Kosina Signed-off-by: David S. Miller --- include/net/pkt_sched.h | 2 +- include/net/sch_generic.h | 1 + include/uapi/linux/rtnetlink.h | 1 + net/sched/sch_api.c | 42 ++++++++++++++++++++++++++++++------------ net/sched/sch_cbq.c | 5 +++++ net/sched/sch_drr.c | 2 ++ net/sched/sch_dsmark.c | 2 ++ net/sched/sch_generic.c | 2 +- net/sched/sch_hfsc.c | 4 ++++ net/sched/sch_htb.c | 2 ++ net/sched/sch_mq.c | 2 +- net/sched/sch_mqprio.c | 2 +- net/sched/sch_multiq.c | 2 ++ net/sched/sch_prio.c | 5 ++++- net/sched/sch_qfq.c | 2 ++ net/sched/sch_red.c | 2 ++ net/sched/sch_sfb.c | 2 ++ net/sched/sch_tbf.c | 2 ++ 18 files changed, 65 insertions(+), 17 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index f1b76b8e6d2d..bec46f63f10c 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -92,7 +92,7 @@ int unregister_qdisc(struct Qdisc_ops *qops); void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); -void qdisc_hash_add(struct Qdisc *q); +void qdisc_hash_add(struct Qdisc *q, bool invisible); void qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle); diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index aeec4086afb2..65d502610314 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -66,6 +66,7 @@ struct Qdisc { #define TCQ_F_NOPARENT 0x40 /* root of its hierarchy : * qdisc_tree_decrease_qlen() should stop. */ +#define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 6546917d605a..75fcf5eff093 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -545,6 +545,7 @@ enum { TCA_STATS2, TCA_STAB, TCA_PAD, + TCA_DUMP_INVISIBLE, __TCA_MAX }; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index bcf49cd22786..62567bfe52c7 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -274,7 +274,7 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) return NULL; } -void qdisc_hash_add(struct Qdisc *q) +void qdisc_hash_add(struct Qdisc *q, bool invisible) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { struct Qdisc *root = qdisc_dev(q)->qdisc; @@ -282,6 +282,8 @@ void qdisc_hash_add(struct Qdisc *q) WARN_ON_ONCE(root == &noop_qdisc); ASSERT_RTNL(); hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle); + if (invisible) + q->flags |= TCQ_F_INVISIBLE; } } EXPORT_SYMBOL(qdisc_hash_add); @@ -1003,7 +1005,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, goto err_out4; } - qdisc_hash_add(sch); + qdisc_hash_add(sch, false); return sch; } @@ -1401,9 +1403,14 @@ nla_put_failure: return -1; } -static bool tc_qdisc_dump_ignore(struct Qdisc *q) +static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible) { - return (q->flags & TCQ_F_BUILTIN) ? true : false; + if (q->flags & TCQ_F_BUILTIN) + return true; + if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible) + return true; + + return false; } static int qdisc_notify(struct net *net, struct sk_buff *oskb, @@ -1417,12 +1424,12 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (old && !tc_qdisc_dump_ignore(old)) { + if (old && !tc_qdisc_dump_ignore(old, false)) { if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) goto err_out; } - if (new && !tc_qdisc_dump_ignore(new)) { + if (new && !tc_qdisc_dump_ignore(new, false)) { if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) goto err_out; @@ -1439,7 +1446,8 @@ err_out: static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, struct netlink_callback *cb, - int *q_idx_p, int s_q_idx, bool recur) + int *q_idx_p, int s_q_idx, bool recur, + bool dump_invisible) { int ret = 0, q_idx = *q_idx_p; struct Qdisc *q; @@ -1452,7 +1460,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, if (q_idx < s_q_idx) { q_idx++; } else { - if (!tc_qdisc_dump_ignore(q) && + if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) @@ -1474,7 +1482,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, q_idx++; continue; } - if (!tc_qdisc_dump_ignore(q) && + if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) @@ -1496,12 +1504,21 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) int idx, q_idx; int s_idx, s_q_idx; struct net_device *dev; + const struct nlmsghdr *nlh = cb->nlh; + struct tcmsg *tcm = nlmsg_data(nlh); + struct nlattr *tca[TCA_MAX + 1]; + int err; s_idx = cb->args[0]; s_q_idx = q_idx = cb->args[1]; idx = 0; ASSERT_RTNL(); + + err = nlmsg_parse(nlh, sizeof(*tcm), tca, TCA_MAX, NULL); + if (err < 0) + return err; + for_each_netdev(net, dev) { struct netdev_queue *dev_queue; @@ -1512,13 +1529,14 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) q_idx = 0; if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, - true) < 0) + true, tca[TCA_DUMP_INVISIBLE]) < 0) goto done; dev_queue = dev_ingress_queue(dev); if (dev_queue && tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, - &q_idx, s_q_idx, false) < 0) + &q_idx, s_q_idx, false, + tca[TCA_DUMP_INVISIBLE]) < 0) goto done; cont: @@ -1762,7 +1780,7 @@ static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, { struct qdisc_dump_args arg; - if (tc_qdisc_dump_ignore(q) || + if (tc_qdisc_dump_ignore(q, false) || *t_p < s_t || !q->ops->cl_ops || (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle)) { diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index d6ca18dc04c3..cf93e5ff3d63 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1161,6 +1161,8 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) sch->handle); if (!q->link.q) q->link.q = &noop_qdisc; + else + qdisc_hash_add(q->link.q, true); q->link.priority = TC_CBQ_MAXPRIO - 1; q->link.priority2 = TC_CBQ_MAXPRIO - 1; @@ -1600,6 +1602,9 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); if (!cl->q) cl->q = &noop_qdisc; + else + qdisc_hash_add(cl->q, true); + cl->common.classid = classid; cl->tparent = parent; cl->qdisc = sch; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index bb4cbdf75004..9fe67e257dfa 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -117,6 +117,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, &pfifo_qdisc_ops, classid); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; + else + qdisc_hash_add(cl->qdisc, true); if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 802ac7c2e5e8..1b98cb2160ff 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -368,6 +368,8 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle); if (p->q == NULL) p->q = &noop_qdisc; + else + qdisc_hash_add(p->q, true); pr_debug("%s: qdisc %p\n", __func__, p->q); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index b052b27a984e..3e64d23e098c 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -795,7 +795,7 @@ static void attach_default_qdiscs(struct net_device *dev) } #ifdef CONFIG_NET_SCHED if (dev->qdisc) - qdisc_hash_add(dev->qdisc); + qdisc_hash_add(dev->qdisc, false); #endif } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 3ffaa6fb0990..0198c6cdda49 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1066,6 +1066,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, &pfifo_qdisc_ops, classid); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; + else + qdisc_hash_add(cl->qdisc, true); INIT_LIST_HEAD(&cl->children); cl->vt_tree = RB_ROOT; cl->cf_tree = RB_ROOT; @@ -1425,6 +1427,8 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) sch->handle); if (q->root.qdisc == NULL) q->root.qdisc = &noop_qdisc; + else + qdisc_hash_add(q->root.qdisc, true); INIT_LIST_HEAD(&q->root.children); q->root.vt_tree = RB_ROOT; q->root.cf_tree = RB_ROOT; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 4cd5fb134bc9..95867033542e 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1460,6 +1460,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, qdisc_class_hash_insert(&q->clhash, &cl->common); if (parent) parent->children++; + if (cl->un.leaf.q != &noop_qdisc) + qdisc_hash_add(cl->un.leaf.q, true); } else { if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 20b7f1646f69..cadfdd4f1e52 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -84,7 +84,7 @@ static void mq_attach(struct Qdisc *sch) qdisc_destroy(old); #ifdef CONFIG_NET_SCHED if (ntx < dev->real_num_tx_queues) - qdisc_hash_add(qdisc); + qdisc_hash_add(qdisc, false); #endif } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 922683418e53..b851e209da4d 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -175,7 +175,7 @@ static void mqprio_attach(struct Qdisc *sch) if (old) qdisc_destroy(old); if (ntx < dev->real_num_tx_queues) - qdisc_hash_add(qdisc); + qdisc_hash_add(qdisc, false); } kfree(priv->qdiscs); priv->qdiscs = NULL; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index e7839a0d0eaa..43a3a10b3c81 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -217,6 +217,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) sch_tree_lock(sch); old = q->queues[i]; q->queues[i] = child; + if (child != &noop_qdisc) + qdisc_hash_add(child, true); if (old != &noop_qdisc) { qdisc_tree_reduce_backlog(old, diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index d4d7db267b6e..92c2e6d448d7 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -192,8 +192,11 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) qdisc_destroy(child); } - for (i = oldbands; i < q->bands; i++) + for (i = oldbands; i < q->bands; i++) { q->queues[i] = queues[i]; + if (q->queues[i] != &noop_qdisc) + qdisc_hash_add(q->queues[i], true); + } sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index f9e712ce2d15..6c85f3e9239b 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -494,6 +494,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, goto destroy_class; } + if (cl->qdisc != &noop_qdisc) + qdisc_hash_add(cl->qdisc, true); sch_tree_lock(sch); qdisc_class_hash_insert(&q->clhash, &cl->common); sch_tree_unlock(sch); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 249b2a18acbd..799ea6dd69b2 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -191,6 +191,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) return PTR_ERR(child); } + if (child != &noop_qdisc) + qdisc_hash_add(child, true); sch_tree_lock(sch); q->flags = ctl->flags; q->limit = ctl->limit; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index fe6963d21519..ae862f172c94 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -513,6 +513,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt) if (IS_ERR(child)) return PTR_ERR(child); + if (child != &noop_qdisc) + qdisc_hash_add(child, true); sch_tree_lock(sch); qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 303355c449ab..40c29a801391 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -396,6 +396,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) q->qdisc->qstats.backlog); qdisc_destroy(q->qdisc); q->qdisc = child; + if (child != &noop_qdisc); + qdisc_hash_add(child, true); } q->limit = qopt->limit; if (tb[TCA_TBF_PBURST]) -- cgit v1.3 From c95129d127c6d3d9fca189c6f94c539a7f086b1a Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 10 Mar 2017 12:11:06 +0800 Subject: sctp: add support for generating assoc reset event notification This patch is to add Association Reset Event described in rfc6525 section 6.1.2. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/ulpevent.h | 4 ++++ include/uapi/linux/sctp.h | 15 +++++++++++++++ net/sctp/ulpevent.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index 324b5965fc4d..2ab7ed44de52 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -132,6 +132,10 @@ struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event( const struct sctp_association *asoc, __u16 flags, __u16 stream_num, __u16 *stream_list, gfp_t gfp); +struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event( + const struct sctp_association *asoc, __u16 flags, + __u32 local_tsn, __u32 remote_tsn, gfp_t gfp); + void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, struct msghdr *); void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event, diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index d3ae381fcf33..77358297c2f9 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -502,6 +502,17 @@ struct sctp_stream_reset_event { __u16 strreset_stream_list[]; }; +#define SCTP_ASSOC_RESET_DENIED 0x0004 +#define SCTP_ASSOC_RESET_FAILED 0x0008 +struct sctp_assoc_reset_event { + __u16 assocreset_type; + __u16 assocreset_flags; + __u32 assocreset_length; + sctp_assoc_t assocreset_assoc_id; + __u32 assocreset_local_tsn; + __u32 assocreset_remote_tsn; +}; + /* * Described in Section 7.3 * Ancillary Data and Notification Interest Options @@ -518,6 +529,7 @@ struct sctp_event_subscribe { __u8 sctp_authentication_event; __u8 sctp_sender_dry_event; __u8 sctp_stream_reset_event; + __u8 sctp_assoc_reset_event; }; /* @@ -543,6 +555,7 @@ union sctp_notification { struct sctp_authkey_event sn_authkey_event; struct sctp_sender_dry_event sn_sender_dry_event; struct sctp_stream_reset_event sn_strreset_event; + struct sctp_assoc_reset_event sn_assocreset_event; }; /* Section 5.3.1 @@ -572,6 +585,8 @@ enum sctp_sn_type { #define SCTP_SENDER_DRY_EVENT SCTP_SENDER_DRY_EVENT SCTP_STREAM_RESET_EVENT, #define SCTP_STREAM_RESET_EVENT SCTP_STREAM_RESET_EVENT + SCTP_ASSOC_RESET_EVENT, +#define SCTP_ASSOC_RESET_EVENT SCTP_ASSOC_RESET_EVENT }; /* Notification error codes used to fill up the error fields in some diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index c8881bc542a0..420d7f35256a 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -883,6 +883,34 @@ struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event( return event; } +struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event( + const struct sctp_association *asoc, __u16 flags, __u32 local_tsn, + __u32 remote_tsn, gfp_t gfp) +{ + struct sctp_assoc_reset_event *areset; + struct sctp_ulpevent *event; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_assoc_reset_event), + MSG_NOTIFICATION, gfp); + if (!event) + return NULL; + + skb = sctp_event2skb(event); + areset = (struct sctp_assoc_reset_event *) + skb_put(skb, sizeof(struct sctp_assoc_reset_event)); + + areset->assocreset_type = SCTP_ASSOC_RESET_EVENT; + areset->assocreset_flags = flags; + areset->assocreset_length = sizeof(struct sctp_assoc_reset_event); + sctp_ulpevent_set_owner(event, asoc); + areset->assocreset_assoc_id = sctp_assoc2id(asoc); + areset->assocreset_local_tsn = local_tsn; + areset->assocreset_remote_tsn = remote_tsn; + + return event; +} + /* Return the notification type, assuming this is a notification * event. */ -- cgit v1.3 From b444153fb5a647448c2080ad28656ad183cae4fc Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 10 Mar 2017 12:11:08 +0800 Subject: sctp: add support for generating add stream change event notification This patch is to add Stream Change Event described in rfc6525 section 6.1.3. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/ulpevent.h | 4 ++++ include/uapi/linux/sctp.h | 15 +++++++++++++++ net/sctp/ulpevent.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 47 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index 2ab7ed44de52..1060494ac230 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -136,6 +136,10 @@ struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event( const struct sctp_association *asoc, __u16 flags, __u32 local_tsn, __u32 remote_tsn, gfp_t gfp); +struct sctp_ulpevent *sctp_ulpevent_make_stream_change_event( + const struct sctp_association *asoc, __u16 flags, + __u32 strchange_instrms, __u32 strchange_outstrms, gfp_t gfp); + void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, struct msghdr *); void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event, diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 77358297c2f9..fd652e6501b4 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -513,6 +513,17 @@ struct sctp_assoc_reset_event { __u32 assocreset_remote_tsn; }; +#define SCTP_ASSOC_CHANGE_DENIED 0x0004 +#define SCTP_ASSOC_CHANGE_FAILED 0x0008 +struct sctp_stream_change_event { + __u16 strchange_type; + __u16 strchange_flags; + __u32 strchange_length; + sctp_assoc_t strchange_assoc_id; + __u16 strchange_instrms; + __u16 strchange_outstrms; +}; + /* * Described in Section 7.3 * Ancillary Data and Notification Interest Options @@ -530,6 +541,7 @@ struct sctp_event_subscribe { __u8 sctp_sender_dry_event; __u8 sctp_stream_reset_event; __u8 sctp_assoc_reset_event; + __u8 sctp_stream_change_event; }; /* @@ -556,6 +568,7 @@ union sctp_notification { struct sctp_sender_dry_event sn_sender_dry_event; struct sctp_stream_reset_event sn_strreset_event; struct sctp_assoc_reset_event sn_assocreset_event; + struct sctp_stream_change_event sn_strchange_event; }; /* Section 5.3.1 @@ -587,6 +600,8 @@ enum sctp_sn_type { #define SCTP_STREAM_RESET_EVENT SCTP_STREAM_RESET_EVENT SCTP_ASSOC_RESET_EVENT, #define SCTP_ASSOC_RESET_EVENT SCTP_ASSOC_RESET_EVENT + SCTP_STREAM_CHANGE_EVENT, +#define SCTP_STREAM_CHANGE_EVENT SCTP_STREAM_CHANGE_EVENT }; /* Notification error codes used to fill up the error fields in some diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 420d7f35256a..ec2b3e013c2f 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -911,6 +911,34 @@ struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event( return event; } +struct sctp_ulpevent *sctp_ulpevent_make_stream_change_event( + const struct sctp_association *asoc, __u16 flags, + __u32 strchange_instrms, __u32 strchange_outstrms, gfp_t gfp) +{ + struct sctp_stream_change_event *schange; + struct sctp_ulpevent *event; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_stream_change_event), + MSG_NOTIFICATION, gfp); + if (!event) + return NULL; + + skb = sctp_event2skb(event); + schange = (struct sctp_stream_change_event *) + skb_put(skb, sizeof(struct sctp_stream_change_event)); + + schange->strchange_type = SCTP_STREAM_CHANGE_EVENT; + schange->strchange_flags = flags; + schange->strchange_length = sizeof(struct sctp_stream_change_event); + sctp_ulpevent_set_owner(event, asoc); + schange->strchange_assoc_id = sctp_assoc2id(asoc); + schange->strchange_instrms = strchange_instrms; + schange->strchange_outstrms = strchange_outstrms; + + return event; +} + /* Return the notification type, assuming this is a notification * event. */ -- cgit v1.3 From c0d8bab6ae518cedfb5246e99ece43fe51d79b56 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 10 Mar 2017 12:11:12 +0800 Subject: sctp: add get and set sockopt for reconf_enable This patchset is to add SCTP_RECONFIG_SUPPORTED sockopt, it would set and get asoc reconf_enable value when asoc_id is set, or it would set and get ep reconf_enalbe value if asoc_id is 0. It is also to add sysctl interface for users to set the default value for reconf_enable. After this patch, stream reconf will work. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/uapi/linux/sctp.h | 1 + net/sctp/socket.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++ net/sctp/sysctl.c | 7 ++++ 3 files changed, 89 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index fd652e6501b4..7212870ef5d7 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -115,6 +115,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_PR_SUPPORTED 113 #define SCTP_DEFAULT_PRINFO 114 #define SCTP_PR_ASSOC_STATUS 115 +#define SCTP_RECONFIG_SUPPORTED 117 #define SCTP_ENABLE_STREAM_RESET 118 #define SCTP_RESET_STREAMS 119 #define SCTP_RESET_ASSOC 120 diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6f0a9be50f50..24e28cfb542b 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3758,6 +3758,39 @@ out: return retval; } +static int sctp_setsockopt_reconfig_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(¶ms, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + asoc->reconf_enable = !!params.assoc_value; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + sp->ep->reconf_enable = !!params.assoc_value; + } else { + goto out; + } + + retval = 0; + +out: + return retval; +} + static int sctp_setsockopt_enable_strreset(struct sock *sk, char __user *optval, unsigned int optlen) @@ -4038,6 +4071,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_DEFAULT_PRINFO: retval = sctp_setsockopt_default_prinfo(sk, optval, optlen); break; + case SCTP_RECONFIG_SUPPORTED: + retval = sctp_setsockopt_reconfig_supported(sk, optval, optlen); + break; case SCTP_ENABLE_STREAM_RESET: retval = sctp_setsockopt_enable_strreset(sk, optval, optlen); break; @@ -6540,6 +6576,47 @@ out: return retval; } +static int sctp_getsockopt_reconfig_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + params.assoc_value = asoc->reconf_enable; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + params.assoc_value = sp->ep->reconf_enable; + } else { + retval = -EINVAL; + goto out; + } + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, ¶ms, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt_enable_strreset(struct sock *sk, int len, char __user *optval, int __user *optlen) @@ -6748,6 +6825,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_pr_assocstatus(sk, len, optval, optlen); break; + case SCTP_RECONFIG_SUPPORTED: + retval = sctp_getsockopt_reconfig_supported(sk, len, optval, + optlen); + break; case SCTP_ENABLE_STREAM_RESET: retval = sctp_getsockopt_enable_strreset(sk, len, optval, optlen); diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index daf8554fd42a..0e732f68c2bf 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -274,6 +274,13 @@ static struct ctl_table sctp_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "reconf_enable", + .data = &init_net.sctp.reconf_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "auth_enable", .data = &init_net.sctp.auth_enable, -- cgit v1.3 From 1a64edf54f55d7956cf5a0d95898bc1f84f9b818 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 8 Mar 2017 16:48:44 +0100 Subject: netfilter: nft_ct: add helper set support this allows to assign connection tracking helpers to connections via nft objref infrastructure. The idea is to first specifiy a helper object: table ip filter { ct helper some-name { type "ftp" protocol tcp l3proto ip } } and then assign it via nft add ... ct helper set "some-name" helper assignment works for new conntracks only as we cannot expand the conntrack extension area once it has been committed to the main conntrack table. ipv4 and ipv6 protocols are tracked stored separately so we can also handle families that observe both ipv4 and ipv6 traffic. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 12 ++- net/netfilter/nft_ct.c | 171 +++++++++++++++++++++++++++++++ 2 files changed, 182 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 4f7d75682c59..34c8d08b687a 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1259,10 +1259,20 @@ enum nft_fib_flags { NFTA_FIB_F_OIF = 1 << 4, /* restrict to oif */ }; +enum nft_ct_helper_attributes { + NFTA_CT_HELPER_UNSPEC, + NFTA_CT_HELPER_NAME, + NFTA_CT_HELPER_L3PROTO, + NFTA_CT_HELPER_L4PROTO, + __NFTA_CT_HELPER_MAX, +}; +#define NFTA_CT_HELPER_MAX (__NFTA_CT_HELPER_MAX - 1) + #define NFT_OBJECT_UNSPEC 0 #define NFT_OBJECT_COUNTER 1 #define NFT_OBJECT_QUOTA 2 -#define __NFT_OBJECT_MAX 3 +#define NFT_OBJECT_CT_HELPER 3 +#define __NFT_OBJECT_MAX 4 #define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) /** diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index bf548a7a71ec..4144ae845bdd 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -32,6 +32,12 @@ struct nft_ct { }; }; +struct nft_ct_helper_obj { + struct nf_conntrack_helper *helper4; + struct nf_conntrack_helper *helper6; + u8 l4proto; +}; + #ifdef CONFIG_NF_CONNTRACK_ZONES static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; @@ -730,6 +736,162 @@ static struct nft_expr_type nft_notrack_type __read_mostly = { .owner = THIS_MODULE, }; +static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_ct_helper_obj *priv = nft_obj_data(obj); + struct nf_conntrack_helper *help4, *help6; + char name[NF_CT_HELPER_NAME_LEN]; + int family = ctx->afi->family; + + if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO]) + return -EINVAL; + + priv->l4proto = nla_get_u8(tb[NFTA_CT_HELPER_L4PROTO]); + if (!priv->l4proto) + return -ENOENT; + + nla_strlcpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); + + if (tb[NFTA_CT_HELPER_L3PROTO]) + family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO])); + + help4 = NULL; + help6 = NULL; + + switch (family) { + case NFPROTO_IPV4: + if (ctx->afi->family == NFPROTO_IPV6) + return -EINVAL; + + help4 = nf_conntrack_helper_try_module_get(name, family, + priv->l4proto); + break; + case NFPROTO_IPV6: + if (ctx->afi->family == NFPROTO_IPV4) + return -EINVAL; + + help6 = nf_conntrack_helper_try_module_get(name, family, + priv->l4proto); + break; + case NFPROTO_NETDEV: /* fallthrough */ + case NFPROTO_BRIDGE: /* same */ + case NFPROTO_INET: + help4 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV4, + priv->l4proto); + help6 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV6, + priv->l4proto); + break; + default: + return -EAFNOSUPPORT; + } + + /* && is intentional; only error if INET found neither ipv4 or ipv6 */ + if (!help4 && !help6) + return -ENOENT; + + priv->helper4 = help4; + priv->helper6 = help6; + + return 0; +} + +static void nft_ct_helper_obj_destroy(struct nft_object *obj) +{ + struct nft_ct_helper_obj *priv = nft_obj_data(obj); + + if (priv->helper4) + module_put(priv->helper4->me); + if (priv->helper6) + module_put(priv->helper6->me); +} + +static void nft_ct_helper_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_ct_helper_obj *priv = nft_obj_data(obj); + struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb); + struct nf_conntrack_helper *to_assign = NULL; + struct nf_conn_help *help; + + if (!ct || + nf_ct_is_confirmed(ct) || + nf_ct_is_template(ct) || + priv->l4proto != nf_ct_protonum(ct)) + return; + + switch (nf_ct_l3num(ct)) { + case NFPROTO_IPV4: + to_assign = priv->helper4; + break; + case NFPROTO_IPV6: + to_assign = priv->helper6; + break; + default: + WARN_ON_ONCE(1); + return; + } + + if (!to_assign) + return; + + if (test_bit(IPS_HELPER_BIT, &ct->status)) + return; + + help = nf_ct_helper_ext_add(ct, to_assign, GFP_ATOMIC); + if (help) { + rcu_assign_pointer(help->helper, to_assign); + set_bit(IPS_HELPER_BIT, &ct->status); + } +} + +static int nft_ct_helper_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + const struct nft_ct_helper_obj *priv = nft_obj_data(obj); + const struct nf_conntrack_helper *helper = priv->helper4; + u16 family; + + if (nla_put_string(skb, NFTA_CT_HELPER_NAME, helper->name)) + return -1; + + if (nla_put_u8(skb, NFTA_CT_HELPER_L4PROTO, priv->l4proto)) + return -1; + + if (priv->helper4 && priv->helper6) + family = NFPROTO_INET; + else if (priv->helper6) + family = NFPROTO_IPV6; + else + family = NFPROTO_IPV4; + + if (nla_put_be16(skb, NFTA_CT_HELPER_L3PROTO, htons(family))) + return -1; + + return 0; +} + +static const struct nla_policy nft_ct_helper_policy[NFTA_CT_HELPER_MAX + 1] = { + [NFTA_CT_HELPER_NAME] = { .type = NLA_STRING, + .len = NF_CT_HELPER_NAME_LEN - 1 }, + [NFTA_CT_HELPER_L3PROTO] = { .type = NLA_U16 }, + [NFTA_CT_HELPER_L4PROTO] = { .type = NLA_U8 }, +}; + +static struct nft_object_type nft_ct_helper_obj __read_mostly = { + .type = NFT_OBJECT_CT_HELPER, + .size = sizeof(struct nft_ct_helper_obj), + .maxattr = NFTA_CT_HELPER_MAX, + .policy = nft_ct_helper_policy, + .eval = nft_ct_helper_obj_eval, + .init = nft_ct_helper_obj_init, + .destroy = nft_ct_helper_obj_destroy, + .dump = nft_ct_helper_obj_dump, + .owner = THIS_MODULE, +}; + static int __init nft_ct_module_init(void) { int err; @@ -744,7 +906,14 @@ static int __init nft_ct_module_init(void) if (err < 0) goto err1; + err = nft_register_obj(&nft_ct_helper_obj); + if (err < 0) + goto err2; + return 0; + +err2: + nft_unregister_expr(&nft_notrack_type); err1: nft_unregister_expr(&nft_ct_type); return err; @@ -752,6 +921,7 @@ err1: static void __exit nft_ct_module_exit(void) { + nft_unregister_obj(&nft_ct_helper_obj); nft_unregister_expr(&nft_notrack_type); nft_unregister_expr(&nft_ct_type); } @@ -763,3 +933,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_ALIAS_NFT_EXPR("ct"); MODULE_ALIAS_NFT_EXPR("notrack"); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); -- cgit v1.3 From 055c4b34b94f696d9bd9aad53a11378a0fc409c9 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 10 Mar 2017 18:08:02 +0100 Subject: netfilter: nft_fib: Support existence check Instead of the actual interface index or name, set destination register to just 1 or 0 depending on whether the lookup succeeded or not if NFTA_FIB_F_PRESENT was set in userspace. Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_fib.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 1 + net/ipv4/netfilter/nft_fib_ipv4.c | 4 ++-- net/ipv6/netfilter/nft_fib_ipv6.c | 2 +- net/netfilter/nft_fib.c | 14 +++++++++----- 5 files changed, 14 insertions(+), 9 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 5ceb2205e4e3..381af9469e6a 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -32,6 +32,6 @@ void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs, void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); -void nft_fib_store_result(void *reg, enum nft_fib_result r, +void nft_fib_store_result(void *reg, const struct nft_fib *priv, const struct nft_pktinfo *pkt, int index); #endif diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 34c8d08b687a..8f3842690d17 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -1257,6 +1257,7 @@ enum nft_fib_flags { NFTA_FIB_F_MARK = 1 << 2, /* use skb->mark */ NFTA_FIB_F_IIF = 1 << 3, /* restrict to iif */ NFTA_FIB_F_OIF = 1 << 4, /* restrict to oif */ + NFTA_FIB_F_PRESENT = 1 << 5, /* check existence only */ }; enum nft_ct_helper_attributes { diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 2981291910dd..f4e4462cb5bb 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -90,7 +90,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv->result, pkt, + nft_fib_store_result(dest, priv, pkt, nft_in(pkt)->ifindex); return; } @@ -99,7 +99,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (ipv4_is_zeronet(iph->saddr)) { if (ipv4_is_lbcast(iph->daddr) || ipv4_is_local_multicast(iph->daddr)) { - nft_fib_store_result(dest, priv->result, pkt, + nft_fib_store_result(dest, priv, pkt, get_ifindex(pkt->skb->dev)); return; } diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 765facf03d45..e8d88d82636b 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -159,7 +159,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv->result, pkt, + nft_fib_store_result(dest, priv, pkt, nft_in(pkt)->ifindex); return; } diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index fd0b19303b0d..21df8cccea65 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -24,7 +24,8 @@ const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = { EXPORT_SYMBOL(nft_fib_policy); #define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \ - NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF) + NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \ + NFTA_FIB_F_PRESENT) int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) @@ -133,19 +134,22 @@ int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr) } EXPORT_SYMBOL_GPL(nft_fib_dump); -void nft_fib_store_result(void *reg, enum nft_fib_result r, +void nft_fib_store_result(void *reg, const struct nft_fib *priv, const struct nft_pktinfo *pkt, int index) { struct net_device *dev; u32 *dreg = reg; - switch (r) { + switch (priv->result) { case NFT_FIB_RESULT_OIF: - *dreg = index; + *dreg = (priv->flags & NFTA_FIB_F_PRESENT) ? !!index : index; break; case NFT_FIB_RESULT_OIFNAME: dev = dev_get_by_index_rcu(nft_net(pkt), index); - strncpy(reg, dev ? dev->name : "", IFNAMSIZ); + if (priv->flags & NFTA_FIB_F_PRESENT) + *dreg = !!dev; + else + strncpy(reg, dev ? dev->name : "", IFNAMSIZ); break; default: WARN_ON_ONCE(1); -- cgit v1.3 From 5b441ac8784c1e7f3c619f14da4c3f52e87348d5 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Fri, 10 Mar 2017 20:43:24 +0000 Subject: mpls: allow TTL propagation to IP packets to be configured Provide the ability to control on a per-route basis whether the TTL value from an MPLS packet is propagated to an IPv4/IPv6 packet when the last label is popped as per the theoretical model in RFC 3443 through a new route attribute, RTA_TTL_PROPAGATE which can be 0 to mean disable propagation and 1 to mean enable propagation. In order to provide the ability to change the behaviour for packets arriving with IPv4/IPv6 Explicit Null labels and to provide an easy way for a user to change the behaviour for all existing routes without having to reprogram them, a global knob is provided. This is done through the addition of a new per-namespace sysctl, "net.mpls.ip_ttl_propagate", which defaults to enabled. If the per-route attribute is set (either enabled or disabled) then it overrides the global configuration. Signed-off-by: Robert Shearman Acked-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/mpls-sysctl.txt | 11 ++++ include/net/netns/mpls.h | 2 + include/uapi/linux/rtnetlink.h | 1 + net/mpls/af_mpls.c | 87 +++++++++++++++++++++++++++++--- net/mpls/internal.h | 7 +++ 5 files changed, 100 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/mpls-sysctl.txt b/Documentation/networking/mpls-sysctl.txt index 15d8d16934fd..9badd1d6685f 100644 --- a/Documentation/networking/mpls-sysctl.txt +++ b/Documentation/networking/mpls-sysctl.txt @@ -19,6 +19,17 @@ platform_labels - INTEGER Possible values: 0 - 1048575 Default: 0 +ip_ttl_propagate - BOOL + Control whether TTL is propagated from the IPv4/IPv6 header to + the MPLS header on imposing labels and propagated from the + MPLS header to the IPv4/IPv6 header on popping the last label. + + If disabled, the MPLS transport network will appear as a + single hop to transit traffic. + + 0 - disabled / RFC 3443 [Short] Pipe Model + 1 - enabled / RFC 3443 Uniform Model (default) + conf//input - BOOL Control whether packets can be input on this interface. diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h index d29203651c01..08652eedabb2 100644 --- a/include/net/netns/mpls.h +++ b/include/net/netns/mpls.h @@ -9,8 +9,10 @@ struct mpls_route; struct ctl_table_header; struct netns_mpls { + int ip_ttl_propagate; size_t platform_labels; struct mpls_route __rcu * __rcu *platform_label; + struct ctl_table_header *ctl; }; diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 75fcf5eff093..3dd72aee4d32 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -319,6 +319,7 @@ enum rtattr_type_t { RTA_EXPIRES, RTA_PAD, RTA_UID, + RTA_TTL_PROPAGATE, __RTA_MAX }; diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 3818686182b2..0e1046f21af8 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -32,6 +32,7 @@ #define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1) static int zero = 0; +static int one = 1; static int label_limit = (1 << 20) - 1; static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, @@ -220,8 +221,8 @@ out: return &rt->rt_nh[nh_index]; } -static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, - struct mpls_entry_decoded dec) +static bool mpls_egress(struct net *net, struct mpls_route *rt, + struct sk_buff *skb, struct mpls_entry_decoded dec) { enum mpls_payload_type payload_type; bool success = false; @@ -246,22 +247,46 @@ static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, switch (payload_type) { case MPT_IPV4: { struct iphdr *hdr4 = ip_hdr(skb); + u8 new_ttl; skb->protocol = htons(ETH_P_IP); + + /* If propagating TTL, take the decremented TTL from + * the incoming MPLS header, otherwise decrement the + * TTL, but only if not 0 to avoid underflow. + */ + if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED || + (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT && + net->mpls.ip_ttl_propagate)) + new_ttl = dec.ttl; + else + new_ttl = hdr4->ttl ? hdr4->ttl - 1 : 0; + csum_replace2(&hdr4->check, htons(hdr4->ttl << 8), - htons(dec.ttl << 8)); - hdr4->ttl = dec.ttl; + htons(new_ttl << 8)); + hdr4->ttl = new_ttl; success = true; break; } case MPT_IPV6: { struct ipv6hdr *hdr6 = ipv6_hdr(skb); skb->protocol = htons(ETH_P_IPV6); - hdr6->hop_limit = dec.ttl; + + /* If propagating TTL, take the decremented TTL from + * the incoming MPLS header, otherwise decrement the + * hop limit, but only if not 0 to avoid underflow. + */ + if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED || + (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT && + net->mpls.ip_ttl_propagate)) + hdr6->hop_limit = dec.ttl; + else if (hdr6->hop_limit) + hdr6->hop_limit = hdr6->hop_limit - 1; success = true; break; } case MPT_UNSPEC: + /* Should have decided which protocol it is by now */ break; } @@ -361,7 +386,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ - if (!mpls_egress(rt, skb, dec)) + if (!mpls_egress(dev_net(out_dev), rt, skb, dec)) goto err; } else { bool bos; @@ -412,6 +437,7 @@ static struct packet_type mpls_packet_type __read_mostly = { static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = { [RTA_DST] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, + [RTA_TTL_PROPAGATE] = { .type = NLA_U8 }, }; struct mpls_route_config { @@ -421,6 +447,7 @@ struct mpls_route_config { u8 rc_via_alen; u8 rc_via[MAX_VIA_ALEN]; u32 rc_label; + u8 rc_ttl_propagate; u8 rc_output_labels; u32 rc_output_label[MAX_NEW_LABELS]; u32 rc_nlflags; @@ -856,6 +883,7 @@ static int mpls_route_add(struct mpls_route_config *cfg) rt->rt_protocol = cfg->rc_protocol; rt->rt_payload_type = cfg->rc_payload_type; + rt->rt_ttl_propagate = cfg->rc_ttl_propagate; if (cfg->rc_mp) err = mpls_nh_build_multi(cfg, rt); @@ -1576,6 +1604,7 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->rc_label = LABEL_NOT_SPECIFIED; cfg->rc_protocol = rtm->rtm_protocol; cfg->rc_via_table = MPLS_NEIGH_TABLE_UNSPEC; + cfg->rc_ttl_propagate = MPLS_TTL_PROP_DEFAULT; cfg->rc_nlflags = nlh->nlmsg_flags; cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->rc_nlinfo.nlh = nlh; @@ -1622,6 +1651,17 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->rc_mp_len = nla_len(nla); break; } + case RTA_TTL_PROPAGATE: + { + u8 ttl_propagate = nla_get_u8(nla); + + if (ttl_propagate > 1) + goto errout; + cfg->rc_ttl_propagate = ttl_propagate ? + MPLS_TTL_PROP_ENABLED : + MPLS_TTL_PROP_DISABLED; + break; + } default: /* Unsupported attribute */ goto errout; @@ -1682,6 +1722,15 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, if (nla_put_labels(skb, RTA_DST, 1, &label)) goto nla_put_failure; + + if (rt->rt_ttl_propagate != MPLS_TTL_PROP_DEFAULT) { + bool ttl_propagate = + rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED; + + if (nla_put_u8(skb, RTA_TTL_PROPAGATE, + ttl_propagate)) + goto nla_put_failure; + } if (rt->rt_nhn == 1) { const struct mpls_nh *nh = rt->rt_nh; @@ -1792,7 +1841,8 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt) { size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) - + nla_total_size(4); /* RTA_DST */ + + nla_total_size(4) /* RTA_DST */ + + nla_total_size(1); /* RTA_TTL_PROPAGATE */ if (rt->rt_nhn == 1) { struct mpls_nh *nh = rt->rt_nh; @@ -1876,6 +1926,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) RCU_INIT_POINTER(rt0->rt_nh->nh_dev, lo); rt0->rt_protocol = RTPROT_KERNEL; rt0->rt_payload_type = MPT_IPV4; + rt0->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; rt0->rt_nh->nh_via_table = NEIGH_LINK_TABLE; rt0->rt_nh->nh_via_alen = lo->addr_len; memcpy(__mpls_nh_via(rt0, rt0->rt_nh), lo->dev_addr, @@ -1889,6 +1940,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) RCU_INIT_POINTER(rt2->rt_nh->nh_dev, lo); rt2->rt_protocol = RTPROT_KERNEL; rt2->rt_payload_type = MPT_IPV6; + rt0->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; rt2->rt_nh->nh_via_table = NEIGH_LINK_TABLE; rt2->rt_nh->nh_via_alen = lo->addr_len; memcpy(__mpls_nh_via(rt2, rt2->rt_nh), lo->dev_addr, @@ -1970,6 +2022,9 @@ static int mpls_platform_labels(struct ctl_table *table, int write, return ret; } +#define MPLS_NS_SYSCTL_OFFSET(field) \ + (&((struct net *)0)->field) + static const struct ctl_table mpls_table[] = { { .procname = "platform_labels", @@ -1978,21 +2033,37 @@ static const struct ctl_table mpls_table[] = { .mode = 0644, .proc_handler = mpls_platform_labels, }, + { + .procname = "ip_ttl_propagate", + .data = MPLS_NS_SYSCTL_OFFSET(mpls.ip_ttl_propagate), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, { } }; static int mpls_net_init(struct net *net) { struct ctl_table *table; + int i; net->mpls.platform_labels = 0; net->mpls.platform_label = NULL; + net->mpls.ip_ttl_propagate = 1; table = kmemdup(mpls_table, sizeof(mpls_table), GFP_KERNEL); if (table == NULL) return -ENOMEM; - table[0].data = net; + /* Table data contains only offsets relative to the base of + * the mdev at this point, so make them absolute. + */ + for (i = 0; i < ARRAY_SIZE(mpls_table) - 1; i++) + table[i].data = (char *)net + (uintptr_t)table[i].data; + net->mpls.ctl = register_net_sysctl(net, "net/mpls", table); if (net->mpls.ctl == NULL) { kfree(table); diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 76360d8b9579..62928d8fabd1 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -90,6 +90,12 @@ struct mpls_nh { /* next hop label forwarding entry */ u8 nh_via_table; }; +enum mpls_ttl_propagation { + MPLS_TTL_PROP_DEFAULT, + MPLS_TTL_PROP_ENABLED, + MPLS_TTL_PROP_DISABLED, +}; + /* The route, nexthops and vias are stored together in the same memory * block: * @@ -116,6 +122,7 @@ struct mpls_route { /* next hop label forwarding entry */ u8 rt_protocol; u8 rt_payload_type; u8 rt_max_alen; + u8 rt_ttl_propagate; unsigned int rt_nhn; unsigned int rt_nhn_alive; struct mpls_nh rt_nh[0]; -- cgit v1.3 From a59166e470868d92f0813977817e99e699398af5 Mon Sep 17 00:00:00 2001 From: Robert Shearman Date: Fri, 10 Mar 2017 20:43:25 +0000 Subject: mpls: allow TTL propagation from IP packets to be configured Allow TTL propagation from IP packets to MPLS packets to be configured. Add a new optional LWT attribute, MPLS_IPTUNNEL_TTL, which allows the TTL to be set in the resulting MPLS packet, with the value of 0 having the semantics of enabling propagation of the TTL from the IP header (i.e. non-zero values disable propagation). Also allow the configuration to be overridden globally by reusing the same sysctl to control whether the TTL is propagated from IP packets into the MPLS header. If the per-LWT attribute is set then it overrides the global configuration. If the TTL isn't propagated then a default TTL value is used which can be configured via a new sysctl, "net.mpls.default_ttl". This is kept separate from the configuration of whether IP TTL propagation is enabled as it can be used in the future when non-IP payloads are supported (i.e. where there is no payload TTL that can be propagated). Signed-off-by: Robert Shearman Acked-by: David Ahern Tested-by: David Ahern Signed-off-by: David S. Miller --- Documentation/networking/mpls-sysctl.txt | 8 ++++ include/net/mpls_iptunnel.h | 2 + include/net/netns/mpls.h | 1 + include/uapi/linux/mpls_iptunnel.h | 2 + net/mpls/af_mpls.c | 11 +++++ net/mpls/mpls_iptunnel.c | 73 ++++++++++++++++++++++++++------ 6 files changed, 84 insertions(+), 13 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/mpls-sysctl.txt b/Documentation/networking/mpls-sysctl.txt index 9badd1d6685f..2f24a1912a48 100644 --- a/Documentation/networking/mpls-sysctl.txt +++ b/Documentation/networking/mpls-sysctl.txt @@ -30,6 +30,14 @@ ip_ttl_propagate - BOOL 0 - disabled / RFC 3443 [Short] Pipe Model 1 - enabled / RFC 3443 Uniform Model (default) +default_ttl - BOOL + Default TTL value to use for MPLS packets where it cannot be + propagated from an IP header, either because one isn't present + or ip_ttl_propagate has been disabled. + + Possible values: 1 - 255 + Default: 255 + conf//input - BOOL Control whether packets can be input on this interface. diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h index 179253f9dcfd..a18af6a16eb5 100644 --- a/include/net/mpls_iptunnel.h +++ b/include/net/mpls_iptunnel.h @@ -19,6 +19,8 @@ struct mpls_iptunnel_encap { u32 label[MAX_NEW_LABELS]; u8 labels; + u8 ttl_propagate; + u8 default_ttl; }; static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate) diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h index 08652eedabb2..6608b3693385 100644 --- a/include/net/netns/mpls.h +++ b/include/net/netns/mpls.h @@ -10,6 +10,7 @@ struct ctl_table_header; struct netns_mpls { int ip_ttl_propagate; + int default_ttl; size_t platform_labels; struct mpls_route __rcu * __rcu *platform_label; diff --git a/include/uapi/linux/mpls_iptunnel.h b/include/uapi/linux/mpls_iptunnel.h index d80a0498f77e..f5e45095b0bb 100644 --- a/include/uapi/linux/mpls_iptunnel.h +++ b/include/uapi/linux/mpls_iptunnel.h @@ -16,11 +16,13 @@ /* MPLS tunnel attributes * [RTA_ENCAP] = { * [MPLS_IPTUNNEL_DST] + * [MPLS_IPTUNNEL_TTL] * } */ enum { MPLS_IPTUNNEL_UNSPEC, MPLS_IPTUNNEL_DST, + MPLS_IPTUNNEL_TTL, __MPLS_IPTUNNEL_MAX, }; #define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1) diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 0e1046f21af8..0c5d111abe36 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -34,6 +34,7 @@ static int zero = 0; static int one = 1; static int label_limit = (1 << 20) - 1; +static int ttl_max = 255; static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, @@ -2042,6 +2043,15 @@ static const struct ctl_table mpls_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "default_ttl", + .data = MPLS_NS_SYSCTL_OFFSET(mpls.default_ttl), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &ttl_max, + }, { } }; @@ -2053,6 +2063,7 @@ static int mpls_net_init(struct net *net) net->mpls.platform_labels = 0; net->mpls.platform_label = NULL; net->mpls.ip_ttl_propagate = 1; + net->mpls.default_ttl = 255; table = kmemdup(mpls_table, sizeof(mpls_table), GFP_KERNEL); if (table == NULL) diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index e4e4424f9eb1..22f71fce0bfb 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -29,6 +29,7 @@ static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = { [MPLS_IPTUNNEL_DST] = { .type = NLA_U32 }, + [MPLS_IPTUNNEL_TTL] = { .type = NLA_U8 }, }; static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) @@ -49,6 +50,7 @@ static int mpls_xmit(struct sk_buff *skb) struct rtable *rt = NULL; struct rt6_info *rt6 = NULL; struct mpls_dev *out_mdev; + struct net *net; int err = 0; bool bos; int i; @@ -56,17 +58,7 @@ static int mpls_xmit(struct sk_buff *skb) /* Find the output device */ out_dev = dst->dev; - - /* Obtain the ttl */ - if (dst->ops->family == AF_INET) { - ttl = ip_hdr(skb)->ttl; - rt = (struct rtable *)dst; - } else if (dst->ops->family == AF_INET6) { - ttl = ipv6_hdr(skb)->hop_limit; - rt6 = (struct rt6_info *)dst; - } else { - goto drop; - } + net = dev_net(out_dev); skb_orphan(skb); @@ -78,6 +70,38 @@ static int mpls_xmit(struct sk_buff *skb) tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate); + /* Obtain the ttl using the following set of rules. + * + * LWT ttl propagation setting: + * - disabled => use default TTL value from LWT + * - enabled => use TTL value from IPv4/IPv6 header + * - default => + * Global ttl propagation setting: + * - disabled => use default TTL value from global setting + * - enabled => use TTL value from IPv4/IPv6 header + */ + if (dst->ops->family == AF_INET) { + if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) + ttl = tun_encap_info->default_ttl; + else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT && + !net->mpls.ip_ttl_propagate) + ttl = net->mpls.default_ttl; + else + ttl = ip_hdr(skb)->ttl; + rt = (struct rtable *)dst; + } else if (dst->ops->family == AF_INET6) { + if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) + ttl = tun_encap_info->default_ttl; + else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT && + !net->mpls.ip_ttl_propagate) + ttl = net->mpls.default_ttl; + else + ttl = ipv6_hdr(skb)->hop_limit; + rt6 = (struct rt6_info *)dst; + } else { + goto drop; + } + /* Verify the destination can hold the packet */ new_header_size = mpls_encap_size(tun_encap_info); mtu = mpls_dev_mtu(out_dev); @@ -160,6 +184,17 @@ static int mpls_build_state(struct nlattr *nla, &tun_encap_info->labels, tun_encap_info->label); if (ret) goto errout; + + tun_encap_info->ttl_propagate = MPLS_TTL_PROP_DEFAULT; + + if (tb[MPLS_IPTUNNEL_TTL]) { + tun_encap_info->default_ttl = nla_get_u8(tb[MPLS_IPTUNNEL_TTL]); + /* TTL 0 implies propagate from IP header */ + tun_encap_info->ttl_propagate = tun_encap_info->default_ttl ? + MPLS_TTL_PROP_DISABLED : + MPLS_TTL_PROP_ENABLED; + } + newts->type = LWTUNNEL_ENCAP_MPLS; newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; newts->headroom = mpls_encap_size(tun_encap_info); @@ -186,6 +221,10 @@ static int mpls_fill_encap_info(struct sk_buff *skb, tun_encap_info->label)) goto nla_put_failure; + if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT && + nla_put_u8(skb, MPLS_IPTUNNEL_TTL, tun_encap_info->default_ttl)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -195,10 +234,16 @@ nla_put_failure: static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate) { struct mpls_iptunnel_encap *tun_encap_info; + int nlsize; tun_encap_info = mpls_lwtunnel_encap(lwtstate); - return nla_total_size(tun_encap_info->labels * 4); + nlsize = nla_total_size(tun_encap_info->labels * 4); + + if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT) + nlsize += nla_total_size(1); + + return nlsize; } static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) @@ -207,7 +252,9 @@ static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b); int l; - if (a_hdr->labels != b_hdr->labels) + if (a_hdr->labels != b_hdr->labels || + a_hdr->ttl_propagate != b_hdr->ttl_propagate || + a_hdr->default_ttl != b_hdr->default_ttl) return 1; for (l = 0; l < MAX_NEW_LABELS; l++) -- cgit v1.3 From 2026fecf516bc04df20cb50874957cd8c364fb4e Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 15 Mar 2017 10:39:18 -0700 Subject: mqprio: Change handling of hw u8 to allow for multiple hardware offload modes This patch is meant to allow for support of multiple hardware offload type for a single device. There is currently no bounds checking for the hw member of the mqprio_qopt structure. This results in us being able to pass values from 1 to 255 with all being treated the same. On retreiving the value it is returned as 1 for anything 1 or greater being set. With this change we are currently adding limited bounds checking by defining an enum and using those values to limit the reported hardware offloads. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/uapi/linux/pkt_sched.h | 8 ++++++++ net/sched/sch_mqprio.c | 26 ++++++++++++++++---------- 2 files changed, 24 insertions(+), 10 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index df7451d35131..099bf5528fed 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -617,6 +617,14 @@ struct tc_drr_stats { #define TC_QOPT_BITMASK 15 #define TC_QOPT_MAX_QUEUE 16 +enum { + TC_MQPRIO_HW_OFFLOAD_NONE, /* no offload requested */ + TC_MQPRIO_HW_OFFLOAD_TCS, /* offload TCs, no queue counts */ + __TC_MQPRIO_HW_OFFLOAD_MAX +}; + +#define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1) + struct tc_mqprio_qopt { __u8 num_tc; __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index b851e209da4d..5f55bf149d9f 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -21,7 +21,7 @@ struct mqprio_sched { struct Qdisc **qdiscs; - int hw_owned; + int hw_offload; }; static void mqprio_destroy(struct Qdisc *sch) @@ -39,7 +39,7 @@ static void mqprio_destroy(struct Qdisc *sch) kfree(priv->qdiscs); } - if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc) + if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); else netdev_set_num_tc(dev, 0); @@ -59,15 +59,20 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) return -EINVAL; } - /* net_device does not support requested operation */ - if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) - return -EINVAL; + /* Limit qopt->hw to maximum supported offload value. Drivers have + * the option of overriding this later if they don't support the a + * given offload type. + */ + if (qopt->hw > TC_MQPRIO_HW_OFFLOAD_MAX) + qopt->hw = TC_MQPRIO_HW_OFFLOAD_MAX; - /* if hw owned qcount and qoffset are taken from LLD so - * no reason to verify them here + /* If hardware offload is requested we will leave it to the device + * to either populate the queue counts itself or to validate the + * provided queue counts. If ndo_setup_tc is not present then + * hardware doesn't support offload and we should return an error. */ if (qopt->hw) - return 0; + return dev->netdev_ops->ndo_setup_tc ? 0 : -EINVAL; for (i = 0; i < qopt->num_tc; i++) { unsigned int last = qopt->offset[i] + qopt->count[i]; @@ -142,10 +147,11 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO, { .tc = qopt->num_tc }}; - priv->hw_owned = 1; err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); if (err) return err; + + priv->hw_offload = qopt->hw; } else { netdev_set_num_tc(dev, qopt->num_tc); for (i = 0; i < qopt->num_tc; i++) @@ -243,7 +249,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) opt.num_tc = netdev_get_num_tc(dev); memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); - opt.hw = priv->hw_owned; + opt.hw = priv->hw_offload; for (i = 0; i < netdev_get_num_tc(dev); i++) { opt.count[i] = dev->tc_to_txq[i].count; -- cgit v1.3 From 4396e46187ca5070219b81773c4e65088dac50cc Mon Sep 17 00:00:00 2001 From: Soheil Hassas Yeganeh Date: Wed, 15 Mar 2017 16:30:46 -0400 Subject: tcp: remove tcp_tw_recycle The tcp_tw_recycle was already broken for connections behind NAT, since the per-destination timestamp is not monotonically increasing for multiple machines behind a single destination address. After the randomization of TCP timestamp offsets in commit 8a5bd45f6616 (tcp: randomize tcp timestamp offsets for each connection), the tcp_tw_recycle is broken for all types of connections for the same reason: the timestamps received from a single machine is not monotonically increasing, anymore. Remove tcp_tw_recycle, since it is not functional. Also, remove the PAWSPassive SNMP counter since it is only used for tcp_tw_recycle, and simplify tcp_v4_route_req and tcp_v6_route_req since the strict argument is only set when tcp_tw_recycle is enabled. Signed-off-by: Soheil Hassas Yeganeh Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Cc: Lutz Vieweg Cc: Florian Westphal Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 5 ----- include/net/netns/ipv4.h | 1 - include/net/tcp.h | 3 +-- include/uapi/linux/snmp.h | 1 - net/ipv4/proc.c | 1 - net/ipv4/sysctl_net_ipv4.c | 7 ------- net/ipv4/tcp_input.c | 30 +++++------------------------- net/ipv4/tcp_ipv4.c | 15 ++------------- net/ipv6/tcp_ipv6.c | 5 +---- 9 files changed, 9 insertions(+), 59 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index ab0230461377..ed3d0791eb27 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -640,11 +640,6 @@ tcp_tso_win_divisor - INTEGER building larger TSO frames. Default: 3 -tcp_tw_recycle - BOOLEAN - Enable fast recycling TIME-WAIT sockets. Default value is 0. - It should not be changed without advice/request of technical - experts. - tcp_tw_reuse - BOOLEAN Allow to reuse TIME-WAIT sockets for new connections when it is safe from protocol viewpoint. Default value is 0. diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 622d2da27135..2e9d649ba169 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -33,7 +33,6 @@ struct inet_timewait_death_row { atomic_t tw_count; struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp; - int sysctl_tw_recycle; int sysctl_max_tw_buckets; }; diff --git a/include/net/tcp.h b/include/net/tcp.h index c81f3b958d44..e614ad4d613e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1810,8 +1810,7 @@ struct tcp_request_sock_ops { __u16 *mss); #endif struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl, - const struct request_sock *req, - bool *strict); + const struct request_sock *req); __u32 (*init_seq_tsoff)(const struct sk_buff *skb, u32 *tsoff); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 3b2bed7ca9a4..cec0e171d20c 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -177,7 +177,6 @@ enum LINUX_MIB_TIMEWAITED, /* TimeWaited */ LINUX_MIB_TIMEWAITRECYCLED, /* TimeWaitRecycled */ LINUX_MIB_TIMEWAITKILLED, /* TimeWaitKilled */ - LINUX_MIB_PAWSPASSIVEREJECTED, /* PAWSPassiveRejected */ LINUX_MIB_PAWSACTIVEREJECTED, /* PAWSActiveRejected */ LINUX_MIB_PAWSESTABREJECTED, /* PAWSEstabRejected */ LINUX_MIB_DELAYEDACKS, /* DelayedACKs */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 69cf49e8356d..4ccbf464d1ac 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -199,7 +199,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED), SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED), SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), - SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d6880a6149ee..11aaef0939b2 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -980,13 +980,6 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_tw_recycle", - .data = &init_net.ipv4.tcp_death_row.sysctl_tw_recycle, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_max_syn_backlog", .data = &init_net.ipv4.sysctl_max_syn_backlog, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index aafec0676d3e..bb09c7095988 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6327,31 +6327,11 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, af_ops->init_seq_tsoff(skb, &tcp_rsk(req)->ts_off); if (!want_cookie && !isn) { - /* VJ's idea. We save last timestamp seen - * from the destination in peer table, when entering - * state TIME-WAIT, and check against it before - * accepting new connection request. - * - * If "isn" is not zero, this request hit alive - * timewait bucket, so that all the necessary checks - * are made in the function processing timewait state. - */ - if (net->ipv4.tcp_death_row.sysctl_tw_recycle) { - bool strict; - - dst = af_ops->route_req(sk, &fl, req, &strict); - - if (dst && strict && - !tcp_peer_is_proven(req, dst)) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); - goto drop_and_release; - } - } /* Kill the following clause, if you dislike this way. */ - else if (!net->ipv4.sysctl_tcp_syncookies && - (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (net->ipv4.sysctl_max_syn_backlog >> 2)) && - !tcp_peer_is_proven(req, dst)) { + if (!net->ipv4.sysctl_tcp_syncookies && + (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (net->ipv4.sysctl_max_syn_backlog >> 2)) && + !tcp_peer_is_proven(req, dst)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. @@ -6367,7 +6347,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, isn = af_ops->init_seq_tsoff(skb, &tcp_rsk(req)->ts_off); } if (!dst) { - dst = af_ops->route_req(sk, &fl, req, NULL); + dst = af_ops->route_req(sk, &fl, req); if (!dst) goto drop_and_free; } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index d8b401fff9fe..7482b5d11861 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1213,19 +1213,9 @@ static void tcp_v4_init_req(struct request_sock *req, static struct dst_entry *tcp_v4_route_req(const struct sock *sk, struct flowi *fl, - const struct request_sock *req, - bool *strict) + const struct request_sock *req) { - struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req); - - if (strict) { - if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr) - *strict = true; - else - *strict = false; - } - - return dst; + return inet_csk_route_req(sk, &fl->u.ip4, req); } struct request_sock_ops tcp_request_sock_ops __read_mostly = { @@ -2462,7 +2452,6 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_tw_reuse = 0; cnt = tcp_hashinfo.ehash_mask + 1; - net->ipv4.tcp_death_row.sysctl_tw_recycle = 0; net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 853cb43e3e3c..0f08d718a002 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -722,11 +722,8 @@ static void tcp_v6_init_req(struct request_sock *req, static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct flowi *fl, - const struct request_sock *req, - bool *strict) + const struct request_sock *req) { - if (strict) - *strict = true; return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); } -- cgit v1.3 From 798c166173ffb50128993641fcf791df51bed48e Mon Sep 17 00:00:00 2001 From: andy zhou Date: Mon, 20 Mar 2017 16:32:29 -0700 Subject: openvswitch: Optimize sample action for the clone use cases With the introduction of open flow 'clone' action, the OVS user space can now translate the 'clone' action into kernel datapath 'sample' action, with 100% probability, to ensure that the clone semantics, which is that the packet seen by the clone action is the same as the packet seen by the action after clone, is faithfully carried out in the datapath. While the sample action in the datpath has the matching semantics, its implementation is only optimized for its original use. Specifically, there are two limitation: First, there is a 3 level of nesting restriction, enforced at the flow downloading time. This limit turns out to be too restrictive for the 'clone' use case. Second, the implementation avoid recursive call only if the sample action list has a single userspace action. The main optimization implemented in this series removes the static nesting limit check, instead, implement the run time recursion limit check, and recursion avoidance similar to that of the 'recirc' action. This optimization solve both #1 and #2 issues above. One related optimization attempts to avoid copying flow key as long as the actions enclosed does not change the flow key. The detection is performed only once at the flow downloading time. Another related optimization is to rewrite the action list at flow downloading time in order to save the fast path from parsing the sample action list in its original form repeatedly. Signed-off-by: Andy Zhou Acked-by: Pravin B Shelar Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 15 +++++ net/openvswitch/actions.c | 107 ++++++++++++++--------------- net/openvswitch/datapath.h | 2 - net/openvswitch/flow_netlink.c | 141 +++++++++++++++++++++++++++------------ 4 files changed, 167 insertions(+), 98 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 7f41f7d0000f..66d1c3ccfd8e 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -578,10 +578,25 @@ enum ovs_sample_attr { OVS_SAMPLE_ATTR_PROBABILITY, /* u32 number */ OVS_SAMPLE_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ __OVS_SAMPLE_ATTR_MAX, + +#ifdef __KERNEL__ + OVS_SAMPLE_ATTR_ARG /* struct sample_arg */ +#endif }; #define OVS_SAMPLE_ATTR_MAX (__OVS_SAMPLE_ATTR_MAX - 1) +#ifdef __KERNEL__ +struct sample_arg { + bool exec; /* When true, actions in sample will not + * change flow keys. False otherwise. + */ + u32 probability; /* Same value as + * 'OVS_SAMPLE_ATTR_PROBABILITY'. + */ +}; +#endif + /** * enum ovs_userspace_attr - Attributes for %OVS_ACTION_ATTR_USERSPACE action. * @OVS_USERSPACE_ATTR_PID: u32 Netlink PID to which the %OVS_PACKET_CMD_ACTION diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 8c9c60cd359f..3529f7b87a44 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -928,73 +928,70 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, return ovs_dp_upcall(dp, skb, key, &upcall, cutlen); } +/* When 'last' is true, sample() should always consume the 'skb'. + * Otherwise, sample() should keep 'skb' intact regardless what + * actions are executed within sample(). + */ static int sample(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, - const struct nlattr *actions, int actions_len) + bool last) { - const struct nlattr *acts_list = NULL; - const struct nlattr *a; - int rem; - u32 cutlen = 0; + struct nlattr *actions; + struct nlattr *sample_arg; + struct sw_flow_key *orig_key = key; + int rem = nla_len(attr); + int err = 0; + const struct sample_arg *arg; - for (a = nla_data(attr), rem = nla_len(attr); rem > 0; - a = nla_next(a, &rem)) { - u32 probability; + /* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */ + sample_arg = nla_data(attr); + arg = nla_data(sample_arg); + actions = nla_next(sample_arg, &rem); - switch (nla_type(a)) { - case OVS_SAMPLE_ATTR_PROBABILITY: - probability = nla_get_u32(a); - if (!probability || prandom_u32() > probability) - return 0; - break; - - case OVS_SAMPLE_ATTR_ACTIONS: - acts_list = a; - break; - } + if ((arg->probability != U32_MAX) && + (!arg->probability || prandom_u32() > arg->probability)) { + if (last) + consume_skb(skb); + return 0; } - rem = nla_len(acts_list); - a = nla_data(acts_list); - - /* Actions list is empty, do nothing */ - if (unlikely(!rem)) + /* Unless the last action, sample works on the clone of SKB. */ + skb = last ? skb : skb_clone(skb, GFP_ATOMIC); + if (!skb) { + /* Out of memory, skip this sample action. + */ return 0; + } - /* The only known usage of sample action is having a single user-space - * action, or having a truncate action followed by a single user-space - * action. Treat this usage as a special case. - * The output_userspace() should clone the skb to be sent to the - * user space. This skb will be consumed by its caller. + /* In case the sample actions won't change 'key', + * it can be used directly to execute sample actions. + * Otherwise, allocate a new key from the + * next recursion level of 'flow_keys'. If + * successful, execute the sample actions without + * deferring. + * + * Defer the sample actions if the recursion + * limit has been reached. */ - if (unlikely(nla_type(a) == OVS_ACTION_ATTR_TRUNC)) { - struct ovs_action_trunc *trunc = nla_data(a); - - if (skb->len > trunc->max_len) - cutlen = skb->len - trunc->max_len; - - a = nla_next(a, &rem); + if (!arg->exec) { + __this_cpu_inc(exec_actions_level); + key = clone_key(key); } - if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && - nla_is_last(a, rem))) - return output_userspace(dp, skb, key, a, actions, - actions_len, cutlen); + if (key) { + err = do_execute_actions(dp, skb, key, actions, rem); + } else if (!add_deferred_actions(skb, orig_key, actions, rem)) { - skb = skb_clone(skb, GFP_ATOMIC); - if (!skb) - /* Skip the sample action when out of memory. */ - return 0; - - if (!add_deferred_actions(skb, key, nla_data(acts_list), - nla_len(acts_list))) { if (net_ratelimit()) - pr_warn("%s: deferred actions limit reached, dropping sample action\n", + pr_warn("%s: deferred action limit reached, drop sample action\n", ovs_dp_name(dp)); - kfree_skb(skb); } - return 0; + + if (!arg->exec) + __this_cpu_dec(exec_actions_level); + + return err; } static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, @@ -1244,9 +1241,15 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, err = execute_masked_set_action(skb, key, nla_data(a)); break; - case OVS_ACTION_ATTR_SAMPLE: - err = sample(dp, skb, key, a, attr, len); + case OVS_ACTION_ATTR_SAMPLE: { + bool last = nla_is_last(a, rem); + + err = sample(dp, skb, key, a, last); + if (last) + return err; + break; + } case OVS_ACTION_ATTR_CT: if (!is_flow_key_valid(key)) { diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 1c6e9377436d..da931bdef8a7 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -34,8 +34,6 @@ #define DP_MAX_PORTS USHRT_MAX #define DP_VPORT_HASH_BUCKETS 1024 -#define SAMPLE_ACTION_DEPTH 3 - /** * struct dp_stats_percpu - per-cpu packet processing statistics for a given * datapath. diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 6f5fa50f716d..2acfb5af2c45 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2014 Nicira, Inc. + * Copyright (c) 2007-2017 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -59,6 +59,39 @@ struct ovs_len_tbl { #define OVS_ATTR_NESTED -1 #define OVS_ATTR_VARIABLE -2 +static bool actions_may_change_flow(const struct nlattr *actions) +{ + struct nlattr *nla; + int rem; + + nla_for_each_nested(nla, actions, rem) { + u16 action = nla_type(nla); + + switch (action) { + case OVS_ACTION_ATTR_OUTPUT: + case OVS_ACTION_ATTR_RECIRC: + case OVS_ACTION_ATTR_TRUNC: + case OVS_ACTION_ATTR_USERSPACE: + break; + + case OVS_ACTION_ATTR_CT: + case OVS_ACTION_ATTR_HASH: + case OVS_ACTION_ATTR_POP_ETH: + case OVS_ACTION_ATTR_POP_MPLS: + case OVS_ACTION_ATTR_POP_VLAN: + case OVS_ACTION_ATTR_PUSH_ETH: + case OVS_ACTION_ATTR_PUSH_MPLS: + case OVS_ACTION_ATTR_PUSH_VLAN: + case OVS_ACTION_ATTR_SAMPLE: + case OVS_ACTION_ATTR_SET: + case OVS_ACTION_ATTR_SET_MASKED: + default: + return true; + } + } + return false; +} + static void update_range(struct sw_flow_match *match, size_t offset, size_t size, bool is_mask) { @@ -2021,18 +2054,20 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, - int depth, struct sw_flow_actions **sfa, + struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log); static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, - const struct sw_flow_key *key, int depth, + const struct sw_flow_key *key, struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, bool log) + __be16 eth_type, __be16 vlan_tci, + bool log, bool last) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; const struct nlattr *a; - int rem, start, err, st_acts; + int rem, start, err; + struct sample_arg arg; memset(attrs, 0, sizeof(attrs)); nla_for_each_nested(a, attr, rem) { @@ -2056,20 +2091,32 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log); if (start < 0) return start; - err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, - nla_data(probability), sizeof(u32), log); + + /* When both skb and flow may be changed, put the sample + * into a deferred fifo. On the other hand, if only skb + * may be modified, the actions can be executed in place. + * + * Do this analysis at the flow installation time. + * Set 'clone_action->exec' to true if the actions can be + * executed without being deferred. + * + * If the sample is the last action, it can always be excuted + * rather than deferred. + */ + arg.exec = last || !actions_may_change_flow(actions); + arg.probability = nla_get_u32(probability); + + err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_ARG, &arg, sizeof(arg), + log); if (err) return err; - st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log); - if (st_acts < 0) - return st_acts; - err = __ovs_nla_copy_actions(net, actions, key, depth + 1, sfa, + err = __ovs_nla_copy_actions(net, actions, key, sfa, eth_type, vlan_tci, log); + if (err) return err; - add_nested_action_end(*sfa, st_acts); add_nested_action_end(*sfa, start); return 0; @@ -2406,16 +2453,13 @@ static int copy_action(const struct nlattr *from, static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, - int depth, struct sw_flow_actions **sfa, + struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log) { u8 mac_proto = ovs_key_mac_proto(key); const struct nlattr *a; int rem, err; - if (depth >= SAMPLE_ACTION_DEPTH) - return -EOVERFLOW; - nla_for_each_nested(a, attr, rem) { /* Expected argument lengths, (u32)-1 for variable length. */ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { @@ -2553,13 +2597,17 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return err; break; - case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(net, a, key, depth, sfa, - eth_type, vlan_tci, log); + case OVS_ACTION_ATTR_SAMPLE: { + bool last = nla_is_last(a, rem); + + err = validate_and_copy_sample(net, a, key, sfa, + eth_type, vlan_tci, + log, last); if (err) return err; skip_copy = true; break; + } case OVS_ACTION_ATTR_CT: err = ovs_ct_copy_action(net, a, key, sfa, log); @@ -2613,7 +2661,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return PTR_ERR(*sfa); (*sfa)->orig_len = nla_len(attr); - err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type, + err = __ovs_nla_copy_actions(net, attr, key, sfa, key->eth.type, key->eth.vlan.tci, log); if (err) ovs_nla_free_flow_actions(*sfa); @@ -2621,39 +2669,44 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return err; } -static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) +static int sample_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) { - const struct nlattr *a; - struct nlattr *start; - int err = 0, rem; + struct nlattr *start, *ac_start = NULL, *sample_arg; + int err = 0, rem = nla_len(attr); + const struct sample_arg *arg; + struct nlattr *actions; start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE); if (!start) return -EMSGSIZE; - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - struct nlattr *st_sample; + sample_arg = nla_data(attr); + arg = nla_data(sample_arg); + actions = nla_next(sample_arg, &rem); - switch (type) { - case OVS_SAMPLE_ATTR_PROBABILITY: - if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, - sizeof(u32), nla_data(a))) - return -EMSGSIZE; - break; - case OVS_SAMPLE_ATTR_ACTIONS: - st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); - if (!st_sample) - return -EMSGSIZE; - err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); - if (err) - return err; - nla_nest_end(skb, st_sample); - break; - } + if (nla_put_u32(skb, OVS_SAMPLE_ATTR_PROBABILITY, arg->probability)) { + err = -EMSGSIZE; + goto out; + } + + ac_start = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); + if (!ac_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(actions, rem, skb); + +out: + if (err) { + nla_nest_cancel(skb, ac_start); + nla_nest_cancel(skb, start); + } else { + nla_nest_end(skb, ac_start); + nla_nest_end(skb, start); } - nla_nest_end(skb, start); return err; } -- cgit v1.3 From bbea124bc99df968011e76eba105fe964a4eceab Mon Sep 17 00:00:00 2001 From: Joel Scherpelz Date: Wed, 22 Mar 2017 18:19:04 +0900 Subject: net: ipv6: Add sysctl for minimum prefix len acceptable in RIOs. This commit adds a new sysctl accept_ra_rt_info_min_plen that defines the minimum acceptable prefix length of Route Information Options. The new sysctl is intended to be used together with accept_ra_rt_info_max_plen to configure a range of acceptable prefix lengths. It is useful to prevent misconfigurations from unintentionally blackholing too much of the IPv6 address space (e.g., home routers announcing RIOs for fc00::/7, which is incorrect). Signed-off-by: Joel Scherpelz Acked-by: Lorenzo Colitti Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 13 +++++++++++-- include/linux/ipv6.h | 1 + include/uapi/linux/ipv6.h | 1 + include/uapi/linux/sysctl.h | 1 + net/ipv6/addrconf.c | 10 ++++++++++ net/ipv6/ndisc.c | 2 ++ 6 files changed, 26 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index b57308e76b1d..eaee2c8d4c00 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -1461,11 +1461,20 @@ accept_ra_pinfo - BOOLEAN Functional default: enabled if accept_ra is enabled. disabled if accept_ra is disabled. +accept_ra_rt_info_min_plen - INTEGER + Minimum prefix length of Route Information in RA. + + Route Information w/ prefix smaller than this variable shall + be ignored. + + Functional default: 0 if accept_ra_rtr_pref is enabled. + -1 if accept_ra_rtr_pref is disabled. + accept_ra_rt_info_max_plen - INTEGER Maximum prefix length of Route Information in RA. - Route Information w/ prefix larger than or equal to this - variable shall be ignored. + Route Information w/ prefix larger than this variable shall + be ignored. Functional default: 0 if accept_ra_rtr_pref is enabled. -1 if accept_ra_rtr_pref is disabled. diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index f0d79bd054ca..e1b442996f81 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -37,6 +37,7 @@ struct ipv6_devconf { __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; #ifdef CONFIG_IPV6_ROUTE_INFO + __s32 accept_ra_rt_info_min_plen; __s32 accept_ra_rt_info_max_plen; #endif #endif diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index d8f6a1ac9af4..2ae59178189d 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -184,6 +184,7 @@ enum { DEVCONF_ENHANCED_DAD, DEVCONF_ADDR_GEN_MODE, DEVCONF_DISABLE_POLICY, + DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN, DEVCONF_MAX }; diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index d2b12152e358..e13d48058b8d 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -568,6 +568,7 @@ enum { NET_IPV6_PROXY_NDP=23, NET_IPV6_ACCEPT_SOURCE_ROUTE=25, NET_IPV6_ACCEPT_RA_FROM_LOCAL=26, + NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27, __NET_IPV6_MAX }; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8c69768a5c46..dff5beb26a01 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -224,6 +224,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_ra_rtr_pref = 1, .rtr_probe_interval = 60 * HZ, #ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_min_plen = 0, .accept_ra_rt_info_max_plen = 0, #endif #endif @@ -277,6 +278,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_ra_rtr_pref = 1, .rtr_probe_interval = 60 * HZ, #ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_min_plen = 0, .accept_ra_rt_info_max_plen = 0, #endif #endif @@ -4979,6 +4981,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_RTR_PROBE_INTERVAL] = jiffies_to_msecs(cnf->rtr_probe_interval); #ifdef CONFIG_IPV6_ROUTE_INFO + array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] = cnf->accept_ra_rt_info_min_plen; array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; #endif #endif @@ -6121,6 +6124,13 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec_jiffies, }, #ifdef CONFIG_IPV6_ROUTE_INFO + { + .procname = "accept_ra_rt_info_min_plen", + .data = &ipv6_devconf.accept_ra_rt_info_min_plen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "accept_ra_rt_info_max_plen", .data = &ipv6_devconf.accept_ra_rt_info_max_plen, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 112ccbc0a8ac..b5812b3f7539 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1418,6 +1418,8 @@ skip_linkparms: if (ri->prefix_len == 0 && !in6_dev->cnf.accept_ra_defrtr) continue; + if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen) + continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) continue; rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, -- cgit v1.3 From 56f668dfe00dcf086734f1c42ea999398fad6572 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2017 10:00:33 -0700 Subject: bpf: Add array of maps support This patch adds a few helper funcs to enable map-in-map support (i.e. outer_map->inner_map). The first outer_map type BPF_MAP_TYPE_ARRAY_OF_MAPS is also added in this patch. The next patch will introduce a hash of maps type. Any bpf map type can be acted as an inner_map. The exception is BPF_MAP_TYPE_PROG_ARRAY because the extra level of indirection makes it harder to verify the owner_prog_type and owner_jited. Multi-level map-in-map is not supported (i.e. map->map is ok but not map->map->map). When adding an inner_map to an outer_map, it currently checks the map_type, key_size, value_size, map_flags, max_entries and ops. The verifier also uses those map's properties to do static analysis. map_flags is needed because we need to ensure BPF_PROG_TYPE_PERF_EVENT is using a preallocated hashtab for the inner_hash also. ops and max_entries are needed to generate inlined map-lookup instructions. For simplicity reason, a simple '==' test is used for both map_flags and max_entries. The equality of ops is implied by the equality of map_type. During outer_map creation time, an inner_map_fd is needed to create an outer_map. However, the inner_map_fd's life time does not depend on the outer_map. The inner_map_fd is merely used to initialize the inner_map_meta of the outer_map. Also, for the outer_map: * It allows element update and delete from syscall * It allows element lookup from bpf_prog The above is similar to the current fd_array pattern. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 2 + kernel/bpf/Makefile | 2 +- kernel/bpf/arraymap.c | 63 +++++++++++++++++++++++++++++++ kernel/bpf/map_in_map.c | 97 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/map_in_map.h | 23 ++++++++++++ kernel/bpf/syscall.c | 7 +++- kernel/bpf/verifier.c | 42 ++++++++++++++++----- 8 files changed, 225 insertions(+), 12 deletions(-) create mode 100644 kernel/bpf/map_in_map.c create mode 100644 kernel/bpf/map_in_map.h (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index da8c64ca8dc9..3f3cdf9b15e8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -50,6 +50,7 @@ struct bpf_map { const struct bpf_map_ops *ops; struct work_struct work; atomic_t usercnt; + struct bpf_map *inner_map_meta; }; struct bpf_map_type_list { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0539a0ceef38..1701ec1e7de3 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -96,6 +96,7 @@ enum bpf_map_type { BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_LPM_TRIE, + BPF_MAP_TYPE_ARRAY_OF_MAPS, }; enum bpf_prog_type { @@ -152,6 +153,7 @@ union bpf_attr { __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ __u32 map_flags; /* prealloc or not */ + __u32 inner_map_fd; /* fd pointing to the inner map */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1ce4f4fd7fd..e1e5e658f2db 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 4d7d5d0ed76a..bc9da93db403 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -17,6 +17,8 @@ #include #include +#include "map_in_map.h" + static void bpf_array_free_percpu(struct bpf_array *array) { int i; @@ -602,3 +604,64 @@ static int __init register_cgroup_array_map(void) } late_initcall(register_cgroup_array_map); #endif + +static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_array_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void array_of_map_free(struct bpf_map *map) +{ + /* map->inner_map_meta is only accessed by syscall which + * is protected by fdget/fdput. + */ + bpf_map_meta_free(map->inner_map_meta); + bpf_fd_array_map_clear(map); + fd_array_map_free(map); +} + +static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = array_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +static const struct bpf_map_ops array_of_map_ops = { + .map_alloc = array_of_map_alloc, + .map_free = array_of_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_of_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, +}; + +static struct bpf_map_type_list array_of_map_type __ro_after_init = { + .ops = &array_of_map_ops, + .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, +}; + +static int __init register_array_of_map(void) +{ + bpf_register_map_type(&array_of_map_type); + return 0; +} +late_initcall(register_array_of_map); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c new file mode 100644 index 000000000000..59bcdf821ae4 --- /dev/null +++ b/kernel/bpf/map_in_map.c @@ -0,0 +1,97 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include + +#include "map_in_map.h" + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) +{ + struct bpf_map *inner_map, *inner_map_meta; + struct fd f; + + f = fdget(inner_map_ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + /* prog_array->owner_prog_type and owner_jited + * is a runtime binding. Doing static check alone + * in the verifier is not enough. + */ + if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + + /* Does not support >1 level map-in-map */ + if (inner_map->inner_map_meta) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + if (!inner_map_meta) { + fdput(f); + return ERR_PTR(-ENOMEM); + } + + inner_map_meta->map_type = inner_map->map_type; + inner_map_meta->key_size = inner_map->key_size; + inner_map_meta->value_size = inner_map->value_size; + inner_map_meta->map_flags = inner_map->map_flags; + inner_map_meta->ops = inner_map->ops; + inner_map_meta->max_entries = inner_map->max_entries; + + fdput(f); + return inner_map_meta; +} + +void bpf_map_meta_free(struct bpf_map *map_meta) +{ + kfree(map_meta); +} + +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1) +{ + /* No need to compare ops because it is covered by map_type */ + return meta0->map_type == meta1->map_type && + meta0->key_size == meta1->key_size && + meta0->value_size == meta1->value_size && + meta0->map_flags == meta1->map_flags && + meta0->max_entries == meta1->max_entries; +} + +void *bpf_map_fd_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int ufd) +{ + struct bpf_map *inner_map; + struct fd f; + + f = fdget(ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) + inner_map = bpf_map_inc(inner_map, false); + else + inner_map = ERR_PTR(-EINVAL); + + fdput(f); + return inner_map; +} + +void bpf_map_fd_put_ptr(void *ptr) +{ + /* ptr->ops->map_free() has to go through one + * rcu grace period by itself. + */ + bpf_map_put(ptr); +} diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h new file mode 100644 index 000000000000..177fadb689dc --- /dev/null +++ b/kernel/bpf/map_in_map.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __MAP_IN_MAP_H__ +#define __MAP_IN_MAP_H__ + +#include + +struct file; +struct bpf_map; + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); +void bpf_map_meta_free(struct bpf_map *map_meta); +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1); +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, + int ufd); +void bpf_map_fd_put_ptr(void *ptr); + +#endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 48c914b983bd..6e24fdf1f373 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -215,7 +215,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD map_flags +#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -352,6 +352,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + err = -ENOTSUPP; } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -438,7 +440,8 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_array_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9bf82267f2f9..3b8f528c5473 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1199,6 +1199,9 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + if (func_id != BPF_FUNC_map_lookup_elem) + goto error; default: break; } @@ -2101,14 +2104,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, struct bpf_reg_state *reg = ®s[regno]; if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { - reg->type = type; + if (type == UNKNOWN_VALUE) { + __mark_reg_unknown_value(regs, regno); + } else if (reg->map_ptr->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = reg->map_ptr->inner_map_meta; + } else { + reg->type = type; + } /* We don't need id from this point onwards anymore, thus we * should better reset it, so that state pruning has chances * to take effect. */ reg->id = 0; - if (type == UNKNOWN_VALUE) - __mark_reg_unknown_value(regs, regno); } } @@ -3033,16 +3041,32 @@ process_bpf_exit: return 0; } +static int check_map_prealloc(struct bpf_map *map) +{ + return (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH) || + !(map->map_flags & BPF_F_NO_PREALLOC); +} + static int check_map_prog_compatibility(struct bpf_map *map, struct bpf_prog *prog) { - if (prog->type == BPF_PROG_TYPE_PERF_EVENT && - (map->map_type == BPF_MAP_TYPE_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && - (map->map_flags & BPF_F_NO_PREALLOC)) { - verbose("perf_event programs can only use preallocated hash map\n"); - return -EINVAL; + /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use + * preallocated hash maps, since doing memory allocation + * in overflow_handler can crash depending on where nmi got + * triggered. + */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { + if (!check_map_prealloc(map)) { + verbose("perf_event programs can only use preallocated hash map\n"); + return -EINVAL; + } + if (map->inner_map_meta && + !check_map_prealloc(map->inner_map_meta)) { + verbose("perf_event programs can only use preallocated inner hash map\n"); + return -EINVAL; + } } return 0; } -- cgit v1.3 From bcc6b1b7ebf857a9fe56202e2be3361131588c15 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 22 Mar 2017 10:00:34 -0700 Subject: bpf: Add hash of maps support This patch adds hash of maps support (hashmap->bpf_map). BPF_MAP_TYPE_HASH_OF_MAPS is added. A map-in-map contains a pointer to another map and lets call this pointer 'inner_map_ptr'. Notes on deleting inner_map_ptr from a hash map: 1. For BPF_F_NO_PREALLOC map-in-map, when deleting an inner_map_ptr, the htab_elem itself will go through a rcu grace period and the inner_map_ptr resides in the htab_elem. 2. For pre-allocated htab_elem (!BPF_F_NO_PREALLOC), when deleting an inner_map_ptr, the htab_elem may get reused immediately. This situation is similar to the existing prealloc-ated use cases. However, the bpf_map_fd_put_ptr() calls bpf_map_put() which calls inner_map->ops->map_free(inner_map) which will go through a rcu grace period (i.e. all bpf_map's map_free currently goes through a rcu grace period). Hence, the inner_map_ptr is still safe for the rcu reader side. This patch also includes BPF_MAP_TYPE_HASH_OF_MAPS to the check_map_prealloc() in the verifier. preallocation is a must for BPF_PROG_TYPE_PERF_EVENT. Hence, even we don't expect heavy updates to map-in-map, enforcing BPF_F_NO_PREALLOC for map-in-map is impossible without disallowing BPF_PROG_TYPE_PERF_EVENT from using map-in-map first. Signed-off-by: Martin KaFai Lau Acked-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 2 + include/uapi/linux/bpf.h | 1 + kernel/bpf/hashtab.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 8 +++- kernel/bpf/verifier.c | 4 +- 5 files changed, 134 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3f3cdf9b15e8..2ae39a3e9ead 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -277,6 +277,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); void bpf_fd_array_map_clear(struct bpf_map *map); +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1701ec1e7de3..ce6f029ac368 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -97,6 +97,7 @@ enum bpf_map_type { BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_LPM_TRIE, BPF_MAP_TYPE_ARRAY_OF_MAPS, + BPF_MAP_TYPE_HASH_OF_MAPS, }; enum bpf_prog_type { diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 000153acb6d5..343fb5394c95 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -16,6 +16,7 @@ #include #include "percpu_freelist.h" #include "bpf_lru_list.h" +#include "map_in_map.h" struct bucket { struct hlist_nulls_head head; @@ -88,6 +89,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size return *(void __percpu **)(l->key + key_size); } +static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) +{ + return *(void **)(l->key + roundup(map->key_size, 8)); +} + static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) { return (struct htab_elem *) (htab->elems + i * htab->elem_size); @@ -603,6 +609,14 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { + struct bpf_map *map = &htab->map; + + if (map->ops->map_fd_put_ptr) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + if (l->state == HTAB_EXTRA_ELEM_USED) { l->state = HTAB_EXTRA_ELEM_FREE; return; @@ -1057,6 +1071,7 @@ static void delete_all_elements(struct bpf_htab *htab) } } } + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -1213,12 +1228,118 @@ static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, }; +static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map; + + if (attr->value_size != sizeof(u32)) + return ERR_PTR(-EINVAL); + + /* pointer is stored internally */ + attr->value_size = sizeof(void *); + map = htab_map_alloc(attr); + attr->value_size = sizeof(u32); + + return map; +} + +static void fd_htab_map_free(struct bpf_map *map) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_node *n; + struct hlist_nulls_head *head; + struct htab_elem *l; + int i; + + for (i = 0; i < htab->n_buckets; i++) { + head = select_bucket(htab, i); + + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + } + + htab_map_free(map); +} + +/* only called from syscall */ +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags) +{ + void *ptr; + int ret; + u32 ufd = *(u32 *)value; + + ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + ret = htab_map_update_elem(map, key, &ptr, map_flags); + if (ret) + map->ops->map_fd_put_ptr(ptr); + + return ret; +} + +static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_htab_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = htab_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +static void htab_of_map_free(struct bpf_map *map) +{ + bpf_map_meta_free(map->inner_map_meta); + fd_htab_map_free(map); +} + +static const struct bpf_map_ops htab_of_map_ops = { + .map_alloc = htab_of_map_alloc, + .map_free = htab_of_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_of_map_lookup_elem, + .map_delete_elem = htab_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, +}; + +static struct bpf_map_type_list htab_of_map_type __ro_after_init = { + .ops = &htab_of_map_ops, + .type = BPF_MAP_TYPE_HASH_OF_MAPS, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); bpf_register_map_type(&htab_percpu_type); bpf_register_map_type(&htab_lru_type); bpf_register_map_type(&htab_lru_percpu_type); + bpf_register_map_type(&htab_of_map_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 6e24fdf1f373..c35ebfe6d84d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -352,7 +352,8 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = bpf_stackmap_copy(map, key, value); - } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { err = -ENOTSUPP; } else { rcu_read_lock(); @@ -446,6 +447,11 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + rcu_read_lock(); + err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + attr->flags); + rcu_read_unlock(); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3b8f528c5473..09923cc5c7c7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1200,6 +1200,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) goto error; break; case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: if (func_id != BPF_FUNC_map_lookup_elem) goto error; default: @@ -3044,7 +3045,8 @@ process_bpf_exit: static int check_map_prealloc(struct bpf_map *map) { return (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_PERCPU_HASH) || + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || !(map->map_flags & BPF_F_NO_PREALLOC); } -- cgit v1.3 From 91b8270f2a4d1d9b268de90451cdca63a70052d6 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 22 Mar 2017 17:27:34 -0700 Subject: Add a helper function to get socket cookie in eBPF Retrieve the socket cookie generated by sock_gen_cookie() from a sk_buff with a known socket. Generates a new cookie if one was not yet set.If the socket pointer inside sk_buff is NULL, 0 is returned. The helper function coud be useful in monitoring per socket networking traffic statistics and provide a unique socket identifier per namespace. Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Willem de Bruijn Signed-off-by: Chenbo Feng Signed-off-by: David S. Miller --- include/linux/sock_diag.h | 1 + include/uapi/linux/bpf.h | 9 ++++++++- net/core/filter.c | 17 +++++++++++++++++ net/core/sock_diag.c | 2 +- tools/include/uapi/linux/bpf.h | 3 ++- 5 files changed, 29 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index a0596ca0e80a..a2f8109bb215 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -24,6 +24,7 @@ void sock_diag_unregister(const struct sock_diag_handler *h); void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); +u64 sock_gen_cookie(struct sock *sk); int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie); void sock_diag_save_cookie(struct sock *sk, __u32 *cookie); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ce6f029ac368..cdfc5595fbc1 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -459,6 +459,12 @@ union bpf_attr { * Return: * > 0 length of the string including the trailing NUL on success * < 0 error + * + * u64 bpf_bpf_get_socket_cookie(skb) + * Get the cookie for the socket stored inside sk_buff. + * @skb: pointer to skb + * Return: 8 Bytes non-decreasing number on success or 0 if the socket + * field is missing inside sk_buff */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -506,7 +512,8 @@ union bpf_attr { FN(get_numa_node_id), \ FN(skb_change_head), \ FN(xdp_adjust_head), \ - FN(probe_read_str), + FN(probe_read_str), \ + FN(get_socket_cookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index c7f0ccd1c0d3..35b0f97c3fdf 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -2606,6 +2607,18 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) +{ + return skb->sk ? sock_gen_cookie(skb->sk) : 0; +} + +static const struct bpf_func_proto bpf_get_socket_cookie_proto = { + .func = bpf_get_socket_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -2640,6 +2653,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; default: return bpf_base_func_proto(func_id); } @@ -2699,6 +2714,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; default: return bpf_base_func_proto(func_id); } diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 8d11ee75a100..fb9d0e2fd148 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -19,7 +19,7 @@ static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); static DEFINE_MUTEX(sock_diag_table_mutex); static struct workqueue_struct *broadcast_wq; -static u64 sock_gen_cookie(struct sock *sk) +u64 sock_gen_cookie(struct sock *sk) { while (1) { u64 res = atomic64_read(&sk->sk_cookie); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ce6f029ac368..a3851859e5f3 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -506,7 +506,8 @@ union bpf_attr { FN(get_numa_node_id), \ FN(skb_change_head), \ FN(xdp_adjust_head), \ - FN(probe_read_str), + FN(probe_read_str), \ + FN(get_socket_cookie), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.3 From 6acc5c2910689fc6ee181bf63085c5efff6a42bd Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 22 Mar 2017 17:27:35 -0700 Subject: Add a eBPF helper function to retrieve socket uid Returns the owner uid of the socket inside a sk_buff. This is useful to perform per-UID accounting of network traffic or per-UID packet filtering. The socket need to be a fullsock otherwise overflowuid is returned. Signed-off-by: Chenbo Feng Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 9 ++++++++- net/core/filter.c | 22 ++++++++++++++++++++++ tools/include/uapi/linux/bpf.h | 3 ++- 3 files changed, 32 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index cdfc5595fbc1..28317a04c34d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -465,6 +465,12 @@ union bpf_attr { * @skb: pointer to skb * Return: 8 Bytes non-decreasing number on success or 0 if the socket * field is missing inside sk_buff + * + * u32 bpf_get_socket_uid(skb) + * Get the owner uid of the socket stored inside sk_buff. + * @skb: pointer to skb + * Return: uid of the socket owner on success or 0 if the socket pointer + * inside sk_buff is NULL */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -513,7 +519,8 @@ union bpf_attr { FN(skb_change_head), \ FN(xdp_adjust_head), \ FN(probe_read_str), \ - FN(get_socket_cookie), + FN(get_socket_cookie), \ + FN(get_socket_uid), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/net/core/filter.c b/net/core/filter.c index 35b0f97c3fdf..dfb9f61a2fd5 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2619,6 +2619,24 @@ static const struct bpf_func_proto bpf_get_socket_cookie_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) +{ + struct sock *sk = sk_to_full_sk(skb->sk); + kuid_t kuid; + + if (!sk || !sk_fullsock(sk)) + return overflowuid; + kuid = sock_net_uid(sock_net(sk), sk); + return from_kuid_munged(sock_net(sk)->user_ns, kuid); +} + +static const struct bpf_func_proto bpf_get_socket_uid_proto = { + .func = bpf_get_socket_uid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -2655,6 +2673,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_skb_load_bytes_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; default: return bpf_base_func_proto(func_id); } @@ -2716,6 +2736,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_skb_under_cgroup_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; default: return bpf_base_func_proto(func_id); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a3851859e5f3..1ea08ce35567 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -507,7 +507,8 @@ union bpf_attr { FN(skb_change_head), \ FN(xdp_adjust_head), \ FN(probe_read_str), \ - FN(get_socket_cookie), + FN(get_socket_cookie), \ + FN(get_socket_uid), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call -- cgit v1.3 From ae6336b57ede8cdf801b04e6d943617bb945e3f9 Mon Sep 17 00:00:00 2001 From: Jonas Bonn Date: Fri, 24 Mar 2017 23:23:20 +0100 Subject: gtp: rename SGSN netlink attribute This is a mostly cosmetic rename of the SGSN netlink attribute to the GTP link. The justification for this is that we will be making the module support decapsulation of "downstream" SGSN packets, in which case the netlink parameter actually refers to the upstream GGSN peer. Renaming the parameter makes the relationship clearer. The legacy name is maintained as a define in the header file in order to not break existing code. Signed-off-by: Jonas Bonn Acked-by: Pablo Neira Ayuso Acked-by: Harald Welte Signed-off-by: David S. Miller --- drivers/net/gtp.c | 22 +++++++++++----------- include/uapi/linux/gtp.h | 3 ++- 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 3e1854f34420..1f6d911e77c7 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -56,7 +56,7 @@ struct pdp_ctx { u16 af; struct in_addr ms_addr_ip4; - struct in_addr sgsn_addr_ip4; + struct in_addr peer_addr_ip4; struct sock *sk; struct net_device *dev; @@ -489,17 +489,17 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, } netdev_dbg(dev, "found PDP context %p\n", pctx); - rt = ip4_route_output_gtp(&fl4, pctx->sk, pctx->sgsn_addr_ip4.s_addr); + rt = ip4_route_output_gtp(&fl4, pctx->sk, pctx->peer_addr_ip4.s_addr); if (IS_ERR(rt)) { netdev_dbg(dev, "no route to SSGN %pI4\n", - &pctx->sgsn_addr_ip4.s_addr); + &pctx->peer_addr_ip4.s_addr); dev->stats.tx_carrier_errors++; goto err; } if (rt->dst.dev == dev) { netdev_dbg(dev, "circular route to SSGN %pI4\n", - &pctx->sgsn_addr_ip4.s_addr); + &pctx->peer_addr_ip4.s_addr); dev->stats.collisions++; goto err_rt; } @@ -866,8 +866,8 @@ static void ipv4_pdp_fill(struct pdp_ctx *pctx, struct genl_info *info) { pctx->gtp_version = nla_get_u32(info->attrs[GTPA_VERSION]); pctx->af = AF_INET; - pctx->sgsn_addr_ip4.s_addr = - nla_get_be32(info->attrs[GTPA_SGSN_ADDRESS]); + pctx->peer_addr_ip4.s_addr = + nla_get_be32(info->attrs[GTPA_PEER_ADDRESS]); pctx->ms_addr_ip4.s_addr = nla_get_be32(info->attrs[GTPA_MS_ADDRESS]); @@ -957,13 +957,13 @@ static int ipv4_pdp_add(struct gtp_dev *gtp, struct sock *sk, switch (pctx->gtp_version) { case GTP_V0: netdev_dbg(dev, "GTPv0-U: new PDP ctx id=%llx ssgn=%pI4 ms=%pI4 (pdp=%p)\n", - pctx->u.v0.tid, &pctx->sgsn_addr_ip4, + pctx->u.v0.tid, &pctx->peer_addr_ip4, &pctx->ms_addr_ip4, pctx); break; case GTP_V1: netdev_dbg(dev, "GTPv1-U: new PDP ctx id=%x/%x ssgn=%pI4 ms=%pI4 (pdp=%p)\n", pctx->u.v1.i_tei, pctx->u.v1.o_tei, - &pctx->sgsn_addr_ip4, &pctx->ms_addr_ip4, pctx); + &pctx->peer_addr_ip4, &pctx->ms_addr_ip4, pctx); break; } @@ -994,7 +994,7 @@ static int gtp_genl_new_pdp(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[GTPA_VERSION] || !info->attrs[GTPA_LINK] || - !info->attrs[GTPA_SGSN_ADDRESS] || + !info->attrs[GTPA_PEER_ADDRESS] || !info->attrs[GTPA_MS_ADDRESS]) return -EINVAL; @@ -1126,7 +1126,7 @@ static int gtp_genl_fill_info(struct sk_buff *skb, u32 snd_portid, u32 snd_seq, goto nlmsg_failure; if (nla_put_u32(skb, GTPA_VERSION, pctx->gtp_version) || - nla_put_be32(skb, GTPA_SGSN_ADDRESS, pctx->sgsn_addr_ip4.s_addr) || + nla_put_be32(skb, GTPA_PEER_ADDRESS, pctx->peer_addr_ip4.s_addr) || nla_put_be32(skb, GTPA_MS_ADDRESS, pctx->ms_addr_ip4.s_addr)) goto nla_put_failure; @@ -1237,7 +1237,7 @@ static struct nla_policy gtp_genl_policy[GTPA_MAX + 1] = { [GTPA_LINK] = { .type = NLA_U32, }, [GTPA_VERSION] = { .type = NLA_U32, }, [GTPA_TID] = { .type = NLA_U64, }, - [GTPA_SGSN_ADDRESS] = { .type = NLA_U32, }, + [GTPA_PEER_ADDRESS] = { .type = NLA_U32, }, [GTPA_MS_ADDRESS] = { .type = NLA_U32, }, [GTPA_FLOW] = { .type = NLA_U16, }, [GTPA_NET_NS_FD] = { .type = NLA_U32, }, diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h index 72a04a0e8cce..57d1edb8efd9 100644 --- a/include/uapi/linux/gtp.h +++ b/include/uapi/linux/gtp.h @@ -19,7 +19,8 @@ enum gtp_attrs { GTPA_LINK, GTPA_VERSION, GTPA_TID, /* for GTPv0 only */ - GTPA_SGSN_ADDRESS, + GTPA_PEER_ADDRESS, /* Remote GSN peer, either SGSN or GGSN */ +#define GTPA_SGSN_ADDRESS GTPA_PEER_ADDRESS /* maintain legacy attr name */ GTPA_MS_ADDRESS, GTPA_FLOW, GTPA_NET_NS_FD, -- cgit v1.3 From 91ed81f9abc76d5a61b07cb8286c680c9330b7a1 Mon Sep 17 00:00:00 2001 From: Jonas Bonn Date: Fri, 24 Mar 2017 23:23:21 +0100 Subject: gtp: support SGSN-side tunnels The GTP-tunnel driver is explicitly GGSN-side as it searches for PDP contexts based on the incoming packets _destination_ address. If we want to place ourselves on the SGSN side of the tunnel, then we want to be identifying PDP contexts based on _source_ address. Let it be noted that in a "real" configuration this module would never be used: the SGSN normally does not see IP packets as input. The justification for this functionality is for PGW load-testing applications where the input to the SGSN is locally generally IP traffic. This patch adds a "role" argument at GTP-link creation time to specify whether we are on the GGSN or SGSN side of the tunnel; this flag is then used to determine which part of the IP packet to use in determining the PDP context. Signed-off-by: Jonas Bonn Acked-by: Pablo Neira Ayuso Acked-by: Harald Welte Signed-off-by: David S. Miller --- drivers/net/gtp.c | 42 ++++++++++++++++++++++++++++++------------ include/uapi/linux/if_link.h | 7 +++++++ 2 files changed, 37 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/gtp.c b/drivers/net/gtp.c index 1f6d911e77c7..4fea1b3dfbb4 100644 --- a/drivers/net/gtp.c +++ b/drivers/net/gtp.c @@ -74,6 +74,7 @@ struct gtp_dev { struct net_device *dev; + unsigned int role; unsigned int hash_size; struct hlist_head *tid_hash; struct hlist_head *addr_hash; @@ -154,8 +155,8 @@ static struct pdp_ctx *ipv4_pdp_find(struct gtp_dev *gtp, __be32 ms_addr) return NULL; } -static bool gtp_check_src_ms_ipv4(struct sk_buff *skb, struct pdp_ctx *pctx, - unsigned int hdrlen) +static bool gtp_check_ms_ipv4(struct sk_buff *skb, struct pdp_ctx *pctx, + unsigned int hdrlen, unsigned int role) { struct iphdr *iph; @@ -164,27 +165,31 @@ static bool gtp_check_src_ms_ipv4(struct sk_buff *skb, struct pdp_ctx *pctx, iph = (struct iphdr *)(skb->data + hdrlen); - return iph->saddr == pctx->ms_addr_ip4.s_addr; + if (role == GTP_ROLE_SGSN) + return iph->daddr == pctx->ms_addr_ip4.s_addr; + else + return iph->saddr == pctx->ms_addr_ip4.s_addr; } -/* Check if the inner IP source address in this packet is assigned to any +/* Check if the inner IP address in this packet is assigned to any * existing mobile subscriber. */ -static bool gtp_check_src_ms(struct sk_buff *skb, struct pdp_ctx *pctx, - unsigned int hdrlen) +static bool gtp_check_ms(struct sk_buff *skb, struct pdp_ctx *pctx, + unsigned int hdrlen, unsigned int role) { switch (ntohs(skb->protocol)) { case ETH_P_IP: - return gtp_check_src_ms_ipv4(skb, pctx, hdrlen); + return gtp_check_ms_ipv4(skb, pctx, hdrlen, role); } return false; } -static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb, unsigned int hdrlen) +static int gtp_rx(struct pdp_ctx *pctx, struct sk_buff *skb, + unsigned int hdrlen, unsigned int role) { struct pcpu_sw_netstats *stats; - if (!gtp_check_src_ms(skb, pctx, hdrlen)) { + if (!gtp_check_ms(skb, pctx, hdrlen, role)) { netdev_dbg(pctx->dev, "No PDP ctx for this MS\n"); return 1; } @@ -239,7 +244,7 @@ static int gtp0_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) return 1; } - return gtp_rx(pctx, skb, hdrlen); + return gtp_rx(pctx, skb, hdrlen, gtp->role); } static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) @@ -281,7 +286,7 @@ static int gtp1u_udp_encap_recv(struct gtp_dev *gtp, struct sk_buff *skb) return 1; } - return gtp_rx(pctx, skb, hdrlen); + return gtp_rx(pctx, skb, hdrlen, gtp->role); } static void gtp_encap_destroy(struct sock *sk) @@ -481,7 +486,11 @@ static int gtp_build_skb_ip4(struct sk_buff *skb, struct net_device *dev, * Prepend PDP header with TEI/TID from PDP ctx. */ iph = ip_hdr(skb); - pctx = ipv4_pdp_find(gtp, iph->daddr); + if (gtp->role == GTP_ROLE_SGSN) + pctx = ipv4_pdp_find(gtp, iph->saddr); + else + pctx = ipv4_pdp_find(gtp, iph->daddr); + if (!pctx) { netdev_dbg(dev, "no PDP ctx found for %pI4, skip\n", &iph->daddr); @@ -685,6 +694,7 @@ static const struct nla_policy gtp_policy[IFLA_GTP_MAX + 1] = { [IFLA_GTP_FD0] = { .type = NLA_U32 }, [IFLA_GTP_FD1] = { .type = NLA_U32 }, [IFLA_GTP_PDP_HASHSIZE] = { .type = NLA_U32 }, + [IFLA_GTP_ROLE] = { .type = NLA_U32 }, }; static int gtp_validate(struct nlattr *tb[], struct nlattr *data[]) @@ -810,6 +820,7 @@ static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]) { struct sock *sk1u = NULL; struct sock *sk0 = NULL; + unsigned int role = GTP_ROLE_GGSN; if (data[IFLA_GTP_FD0]) { u32 fd0 = nla_get_u32(data[IFLA_GTP_FD0]); @@ -830,8 +841,15 @@ static int gtp_encap_enable(struct gtp_dev *gtp, struct nlattr *data[]) } } + if (data[IFLA_GTP_ROLE]) { + role = nla_get_u32(data[IFLA_GTP_ROLE]); + if (role > GTP_ROLE_SGSN) + return -EINVAL; + } + gtp->sk0 = sk0; gtp->sk1u = sk1u; + gtp->role = role; return 0; } diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 320fc1e747ee..8b405afb2376 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -538,11 +538,18 @@ enum { #define IFLA_PPP_MAX (__IFLA_PPP_MAX - 1) /* GTP section */ + +enum ifla_gtp_role { + GTP_ROLE_GGSN = 0, + GTP_ROLE_SGSN, +}; + enum { IFLA_GTP_UNSPEC, IFLA_GTP_FD0, IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, + IFLA_GTP_ROLE, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) -- cgit v1.3 From 1555d204e743b6956d2be294a317121f6112238d Mon Sep 17 00:00:00 2001 From: Arkadi Sharshevsky Date: Tue, 28 Mar 2017 17:24:10 +0200 Subject: devlink: Support for pipeline debug (dpipe) The pipeline debug is used to export the pipeline abstractions for the main objects - tables, headers and entries. The only support for set is for changing the counter parameter on specific table. The basic structures: Header - can represent a real protocol header information or internal metadata. Generic protocol headers like IPv4 can be shared between drivers. Each driver can add local headers. Field - part of a header. Can represent protocol field or specific ASIC metadata field. Hardware special metadata fields can be mapped to different resources, for example switch ASIC ports can have internal number which from the systems point of view is mapped to netdeivce ifindex. Match - represent specific match rule. Can describe match on specific field or header. The header index should be specified as well in order to support several header instances of the same type (tunneling). Action - represents specific action rule. Actions can describe operations on specific field values for example like set, increment, etc. And header operation like add and delete. Value - represents value which can be associated with specific match or action. Table - represents a hardware block which can be described with match/ action behavior. The match/action can be done on the packets data or on the internal metadata that it gathered along the packets traversal throw the pipeline which is vendor specific and should be exported in order to provide understanding of ASICs behavior. Entry - represents single record in a specific table. The entry is identified by specific combination of values for match/action. Prior to accessing the tables/entries the drivers provide the header/ field data base which is used by driver to user-space. The data base is split between the shared headers and unique headers. Signed-off-by: Arkadi Sharshevsky Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- include/net/devlink.h | 259 ++++++++++++++ include/uapi/linux/devlink.h | 67 +++- net/core/devlink.c | 836 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1161 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index d29e5fc82582..24de13f8c94f 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -25,6 +25,8 @@ struct devlink { struct list_head list; struct list_head port_list; struct list_head sb_list; + struct list_head dpipe_table_list; + struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; possible_net_t _net; @@ -49,6 +51,178 @@ struct devlink_sb_pool_info { enum devlink_sb_threshold_type threshold_type; }; +/** + * struct devlink_dpipe_field - dpipe field object + * @name: field name + * @id: index inside the headers field array + * @bitwidth: bitwidth + * @mapping_type: mapping type + */ +struct devlink_dpipe_field { + const char *name; + unsigned int id; + unsigned int bitwidth; + enum devlink_dpipe_field_mapping_type mapping_type; +}; + +/** + * struct devlink_dpipe_header - dpipe header object + * @name: header name + * @id: index, global/local detrmined by global bit + * @fields: fields + * @fields_count: number of fields + * @global: indicates if header is shared like most protocol header + * or driver specific + */ +struct devlink_dpipe_header { + const char *name; + unsigned int id; + struct devlink_dpipe_field *fields; + unsigned int fields_count; + bool global; +}; + +/** + * struct devlink_dpipe_match - represents match operation + * @type: type of match + * @header_index: header index (packets can have several headers of same + * type like in case of tunnels) + * @header: header + * @fieled_id: field index + */ +struct devlink_dpipe_match { + enum devlink_dpipe_match_type type; + unsigned int header_index; + struct devlink_dpipe_header *header; + unsigned int field_id; +}; + +/** + * struct devlink_dpipe_action - represents action operation + * @type: type of action + * @header_index: header index (packets can have several headers of same + * type like in case of tunnels) + * @header: header + * @fieled_id: field index + */ +struct devlink_dpipe_action { + enum devlink_dpipe_action_type type; + unsigned int header_index; + struct devlink_dpipe_header *header; + unsigned int field_id; +}; + +/** + * struct devlink_dpipe_value - represents value of match/action + * @action: action + * @match: match + * @mapping_value: in case the field has some mapping this value + * specified the mapping value + * @mapping_valid: specify if mapping value is valid + * @value_size: value size + * @value: value + * @mask: bit mask + */ +struct devlink_dpipe_value { + union { + struct devlink_dpipe_action *action; + struct devlink_dpipe_match *match; + }; + unsigned int mapping_value; + bool mapping_valid; + unsigned int value_size; + void *value; + void *mask; +}; + +/** + * struct devlink_dpipe_entry - table entry object + * @index: index of the entry in the table + * @match_values: match values + * @matche_values_count: count of matches tuples + * @action_values: actions values + * @action_values_count: count of actions values + * @counter: value of counter + * @counter_valid: Specify if value is valid from hardware + */ +struct devlink_dpipe_entry { + u64 index; + struct devlink_dpipe_value *match_values; + unsigned int match_values_count; + struct devlink_dpipe_value *action_values; + unsigned int action_values_count; + u64 counter; + bool counter_valid; +}; + +/** + * struct devlink_dpipe_dump_ctx - context provided to driver in order + * to dump + * @info: info + * @cmd: devlink command + * @skb: skb + * @nest: top attribute + * @hdr: hdr + */ +struct devlink_dpipe_dump_ctx { + struct genl_info *info; + enum devlink_command cmd; + struct sk_buff *skb; + struct nlattr *nest; + void *hdr; +}; + +struct devlink_dpipe_table_ops; + +/** + * struct devlink_dpipe_table - table object + * @priv: private + * @name: table name + * @size: maximum number of entries + * @counters_enabled: indicates if counters are active + * @counter_control_extern: indicates if counter control is in dpipe or + * external tool + * @table_ops: table operations + * @rcu: rcu + */ +struct devlink_dpipe_table { + void *priv; + struct list_head list; + const char *name; + u64 size; + bool counters_enabled; + bool counter_control_extern; + struct devlink_dpipe_table_ops *table_ops; + struct rcu_head rcu; +}; + +/** + * struct devlink_dpipe_table_ops - dpipe_table ops + * @actions_dump - dumps all tables actions + * @matches_dump - dumps all tables matches + * @entries_dump - dumps all active entries in the table + * @counters_set_update - when changing the counter status hardware sync + * maybe needed to allocate/free counter related + * resources + */ +struct devlink_dpipe_table_ops { + int (*actions_dump)(void *priv, struct sk_buff *skb); + int (*matches_dump)(void *priv, struct sk_buff *skb); + int (*entries_dump)(void *priv, bool counters_enabled, + struct devlink_dpipe_dump_ctx *dump_ctx); + int (*counters_set_update)(void *priv, bool enable); +}; + +/** + * struct devlink_dpipe_headers - dpipe headers + * @headers - header array can be shared (global bit) or driver specific + * @headers_count - count of headers + */ +struct devlink_dpipe_headers { + struct devlink_dpipe_header **headers; + unsigned int headers_count; +}; + struct devlink_ops { int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); @@ -132,6 +306,26 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u16 egress_pools_count, u16 ingress_tc_count, u16 egress_tc_count); void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index); +int devlink_dpipe_table_register(struct devlink *devlink, + const char *table_name, + struct devlink_dpipe_table_ops *table_ops, + void *priv, u64 size, + bool counter_control_extern); +void devlink_dpipe_table_unregister(struct devlink *devlink, + const char *table_name); +int devlink_dpipe_headers_register(struct devlink *devlink, + struct devlink_dpipe_headers *dpipe_headers); +void devlink_dpipe_headers_unregister(struct devlink *devlink); +bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, + const char *table_name); +int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx); +int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx, + struct devlink_dpipe_entry *entry); +int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx); +int devlink_dpipe_action_put(struct sk_buff *skb, + struct devlink_dpipe_action *action); +int devlink_dpipe_match_put(struct sk_buff *skb, + struct devlink_dpipe_match *match); #else @@ -200,6 +394,71 @@ static inline void devlink_sb_unregister(struct devlink *devlink, { } +static inline int +devlink_dpipe_table_register(struct devlink *devlink, + const char *table_name, + struct devlink_dpipe_table_ops *table_ops, + void *priv, u64 size, + bool counter_control_extern) +{ + return 0; +} + +static inline void devlink_dpipe_table_unregister(struct devlink *devlink, + const char *table_name) +{ +} + +static inline int devlink_dpipe_headers_register(struct devlink *devlink, + struct devlink_dpipe_headers * + dpipe_headers) +{ + return 0; +} + +static inline void devlink_dpipe_headers_unregister(struct devlink *devlink) +{ +} + +static inline bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, + const char *table_name) +{ + return false; +} + +static inline int +devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) +{ + return 0; +} + +static inline int +devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx, + struct devlink_dpipe_entry *entry) +{ + return 0; +} + +static inline int +devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx) +{ + return 0; +} + +static inline int +devlink_dpipe_action_put(struct sk_buff *skb, + struct devlink_dpipe_action *action) +{ + return 0; +} + +static inline int +devlink_dpipe_match_put(struct sk_buff *skb, + struct devlink_dpipe_match *match) +{ + return 0; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 0f1f3a12e23c..b47bee277347 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -65,8 +65,12 @@ enum devlink_command { #define DEVLINK_CMD_ESWITCH_MODE_SET /* obsolete, never use this! */ \ DEVLINK_CMD_ESWITCH_SET - /* add new commands above here */ + DEVLINK_CMD_DPIPE_TABLE_GET, + DEVLINK_CMD_DPIPE_ENTRIES_GET, + DEVLINK_CMD_DPIPE_HEADERS_GET, + DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 }; @@ -148,10 +152,71 @@ enum devlink_attr { DEVLINK_ATTR_ESWITCH_MODE, /* u16 */ DEVLINK_ATTR_ESWITCH_INLINE_MODE, /* u8 */ + DEVLINK_ATTR_DPIPE_TABLES, /* nested */ + DEVLINK_ATTR_DPIPE_TABLE, /* nested */ + DEVLINK_ATTR_DPIPE_TABLE_NAME, /* string */ + DEVLINK_ATTR_DPIPE_TABLE_SIZE, /* u64 */ + DEVLINK_ATTR_DPIPE_TABLE_MATCHES, /* nested */ + DEVLINK_ATTR_DPIPE_TABLE_ACTIONS, /* nested */ + DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED, /* u8 */ + + DEVLINK_ATTR_DPIPE_ENTRIES, /* nested */ + DEVLINK_ATTR_DPIPE_ENTRY, /* nested */ + DEVLINK_ATTR_DPIPE_ENTRY_INDEX, /* u64 */ + DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES, /* nested */ + DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES, /* nested */ + DEVLINK_ATTR_DPIPE_ENTRY_COUNTER, /* u64 */ + + DEVLINK_ATTR_DPIPE_MATCH, /* nested */ + DEVLINK_ATTR_DPIPE_MATCH_VALUE, /* nested */ + DEVLINK_ATTR_DPIPE_MATCH_TYPE, /* u32 */ + + DEVLINK_ATTR_DPIPE_ACTION, /* nested */ + DEVLINK_ATTR_DPIPE_ACTION_VALUE, /* nested */ + DEVLINK_ATTR_DPIPE_ACTION_TYPE, /* u32 */ + + DEVLINK_ATTR_DPIPE_VALUE, + DEVLINK_ATTR_DPIPE_VALUE_MASK, + DEVLINK_ATTR_DPIPE_VALUE_MAPPING, /* u32 */ + + DEVLINK_ATTR_DPIPE_HEADERS, /* nested */ + DEVLINK_ATTR_DPIPE_HEADER, /* nested */ + DEVLINK_ATTR_DPIPE_HEADER_NAME, /* string */ + DEVLINK_ATTR_DPIPE_HEADER_ID, /* u32 */ + DEVLINK_ATTR_DPIPE_HEADER_FIELDS, /* nested */ + DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, /* u8 */ + DEVLINK_ATTR_DPIPE_HEADER_INDEX, /* u32 */ + + DEVLINK_ATTR_DPIPE_FIELD, /* nested */ + DEVLINK_ATTR_DPIPE_FIELD_NAME, /* string */ + DEVLINK_ATTR_DPIPE_FIELD_ID, /* u32 */ + DEVLINK_ATTR_DPIPE_FIELD_BITWIDTH, /* u32 */ + DEVLINK_ATTR_DPIPE_FIELD_MAPPING_TYPE, /* u32 */ + + DEVLINK_ATTR_PAD, + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, DEVLINK_ATTR_MAX = __DEVLINK_ATTR_MAX - 1 }; +/* Mapping between internal resource described by the field and system + * structure + */ +enum devlink_dpipe_field_mapping_type { + DEVLINK_DPIPE_FIELD_MAPPING_TYPE_NONE, + DEVLINK_DPIPE_FIELD_MAPPING_TYPE_IFINDEX, +}; + +/* Match type - specify the type of the match */ +enum devlink_dpipe_match_type { + DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT, +}; + +/* Action type - specify the action type */ +enum devlink_dpipe_action_type { + DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY, +}; + #endif /* _UAPI_LINUX_DEVLINK_H_ */ diff --git a/net/core/devlink.c b/net/core/devlink.c index e9c1e6acfb6d..24b766003a61 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1493,8 +1493,686 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, if (err) return err; } + return 0; +} + +int devlink_dpipe_match_put(struct sk_buff *skb, + struct devlink_dpipe_match *match) +{ + struct devlink_dpipe_header *header = match->header; + struct devlink_dpipe_field *field = &header->fields[match->field_id]; + struct nlattr *match_attr; + + match_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_MATCH); + if (!match_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_MATCH_TYPE, match->type) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, match->header_index) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || + nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) + goto nla_put_failure; + + nla_nest_end(skb, match_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, match_attr); + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_match_put); + +static int devlink_dpipe_matches_put(struct devlink_dpipe_table *table, + struct sk_buff *skb) +{ + struct nlattr *matches_attr; + + matches_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE_MATCHES); + if (!matches_attr) + return -EMSGSIZE; + + if (table->table_ops->matches_dump(table->priv, skb)) + goto nla_put_failure; + + nla_nest_end(skb, matches_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, matches_attr); + return -EMSGSIZE; +} + +int devlink_dpipe_action_put(struct sk_buff *skb, + struct devlink_dpipe_action *action) +{ + struct devlink_dpipe_header *header = action->header; + struct devlink_dpipe_field *field = &header->fields[action->field_id]; + struct nlattr *action_attr; + + action_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_ACTION); + if (!action_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_ACTION_TYPE, action->type) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, action->header_index) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || + nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) + goto nla_put_failure; + + nla_nest_end(skb, action_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, action_attr); + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_action_put); + +static int devlink_dpipe_actions_put(struct devlink_dpipe_table *table, + struct sk_buff *skb) +{ + struct nlattr *actions_attr; + + actions_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE_ACTIONS); + if (!actions_attr) + return -EMSGSIZE; + + if (table->table_ops->actions_dump(table->priv, skb)) + goto nla_put_failure; + + nla_nest_end(skb, actions_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, actions_attr); + return -EMSGSIZE; +} + +static int devlink_dpipe_table_put(struct sk_buff *skb, + struct devlink_dpipe_table *table) +{ + struct nlattr *table_attr; + + table_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE); + if (!table_attr) + return -EMSGSIZE; + + if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_TABLE_NAME, table->name) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table->size, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u8(skb, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED, + table->counters_enabled)) + goto nla_put_failure; + + if (devlink_dpipe_matches_put(table, skb)) + goto nla_put_failure; + + if (devlink_dpipe_actions_put(table, skb)) + goto nla_put_failure; + + nla_nest_end(skb, table_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, table_attr); + return -EMSGSIZE; +} + +static int devlink_dpipe_send_and_alloc_skb(struct sk_buff **pskb, + struct genl_info *info) +{ + int err; + + if (*pskb) { + err = genlmsg_reply(*pskb, info); + if (err) + return err; + } + *pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!*pskb) + return -ENOMEM; + return 0; +} + +static int devlink_dpipe_tables_fill(struct genl_info *info, + enum devlink_command cmd, int flags, + struct list_head *dpipe_tables, + const char *table_name) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_dpipe_table *table; + struct nlattr *tables_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + bool incomplete; + void *hdr; + int i; + int err; + + table = list_first_entry(dpipe_tables, + struct devlink_dpipe_table, list); +start_again: + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + tables_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLES); + if (!tables_attr) + goto nla_put_failure; + + i = 0; + incomplete = false; + list_for_each_entry_from(table, dpipe_tables, list) { + if (!table_name) { + err = devlink_dpipe_table_put(skb, table); + if (err) { + if (!i) + goto err_table_put; + incomplete = true; + break; + } + } else { + if (!strcmp(table->name, table_name)) { + err = devlink_dpipe_table_put(skb, table); + if (err) + break; + } + } + i++; + } + + nla_nest_end(skb, tables_attr); + genlmsg_end(skb, hdr); + if (incomplete) + goto start_again; + +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_table_put: +err_skb_send_alloc: + genlmsg_cancel(skb, hdr); + nlmsg_free(skb); + return err; +} + +static int devlink_nl_cmd_dpipe_table_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const char *table_name = NULL; + + if (info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]) + table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); + + return devlink_dpipe_tables_fill(info, DEVLINK_CMD_DPIPE_TABLE_GET, 0, + &devlink->dpipe_table_list, + table_name); +} + +static int devlink_dpipe_value_put(struct sk_buff *skb, + struct devlink_dpipe_value *value) +{ + if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE, + value->value_size, value->value)) + return -EMSGSIZE; + if (value->mask) + if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE_MASK, + value->value_size, value->mask)) + return -EMSGSIZE; + if (value->mapping_valid) + if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_VALUE_MAPPING, + value->mapping_value)) + return -EMSGSIZE; + return 0; +} + +static int devlink_dpipe_action_value_put(struct sk_buff *skb, + struct devlink_dpipe_value *value) +{ + if (!value->action) + return -EINVAL; + if (devlink_dpipe_action_put(skb, value->action)) + return -EMSGSIZE; + if (devlink_dpipe_value_put(skb, value)) + return -EMSGSIZE; + return 0; +} + +static int devlink_dpipe_action_values_put(struct sk_buff *skb, + struct devlink_dpipe_value *values, + unsigned int values_count) +{ + struct nlattr *action_attr; + int i; + int err; + + for (i = 0; i < values_count; i++) { + action_attr = nla_nest_start(skb, + DEVLINK_ATTR_DPIPE_ACTION_VALUE); + if (!action_attr) + return -EMSGSIZE; + err = devlink_dpipe_action_value_put(skb, &values[i]); + if (err) + goto err_action_value_put; + nla_nest_end(skb, action_attr); + } + return 0; + +err_action_value_put: + nla_nest_cancel(skb, action_attr); + return err; +} + +static int devlink_dpipe_match_value_put(struct sk_buff *skb, + struct devlink_dpipe_value *value) +{ + if (!value->match) + return -EINVAL; + if (devlink_dpipe_match_put(skb, value->match)) + return -EMSGSIZE; + if (devlink_dpipe_value_put(skb, value)) + return -EMSGSIZE; + return 0; +} + +static int devlink_dpipe_match_values_put(struct sk_buff *skb, + struct devlink_dpipe_value *values, + unsigned int values_count) +{ + struct nlattr *match_attr; + int i; + int err; + + for (i = 0; i < values_count; i++) { + match_attr = nla_nest_start(skb, + DEVLINK_ATTR_DPIPE_MATCH_VALUE); + if (!match_attr) + return -EMSGSIZE; + err = devlink_dpipe_match_value_put(skb, &values[i]); + if (err) + goto err_match_value_put; + nla_nest_end(skb, match_attr); + } + return 0; + +err_match_value_put: + nla_nest_cancel(skb, match_attr); + return err; +} + +static int devlink_dpipe_entry_put(struct sk_buff *skb, + struct devlink_dpipe_entry *entry) +{ + struct nlattr *entry_attr, *matches_attr, *actions_attr; + int err; + + entry_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_INDEX, entry->index, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (entry->counter_valid) + if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_COUNTER, + entry->counter, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + matches_attr = nla_nest_start(skb, + DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES); + if (!matches_attr) + goto nla_put_failure; + + err = devlink_dpipe_match_values_put(skb, entry->match_values, + entry->match_values_count); + if (err) { + nla_nest_cancel(skb, matches_attr); + goto err_match_values_put; + } + nla_nest_end(skb, matches_attr); + + actions_attr = nla_nest_start(skb, + DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES); + if (!actions_attr) + goto nla_put_failure; + + err = devlink_dpipe_action_values_put(skb, entry->action_values, + entry->action_values_count); + if (err) { + nla_nest_cancel(skb, actions_attr); + goto err_action_values_put; + } + nla_nest_end(skb, actions_attr); + nla_nest_end(skb, entry_attr); return 0; + +nla_put_failure: + err = -EMSGSIZE; +err_match_values_put: +err_action_values_put: + nla_nest_cancel(skb, entry_attr); + return err; +} + +static struct devlink_dpipe_table * +devlink_dpipe_table_find(struct list_head *dpipe_tables, + const char *table_name) +{ + struct devlink_dpipe_table *table; + + list_for_each_entry_rcu(table, dpipe_tables, list) { + if (!strcmp(table->name, table_name)) + return table; + } + return NULL; +} + +int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) +{ + struct devlink *devlink; + int err; + + err = devlink_dpipe_send_and_alloc_skb(&dump_ctx->skb, + dump_ctx->info); + if (err) + return err; + + dump_ctx->hdr = genlmsg_put(dump_ctx->skb, + dump_ctx->info->snd_portid, + dump_ctx->info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, + dump_ctx->cmd); + if (!dump_ctx->hdr) + goto nla_put_failure; + + devlink = dump_ctx->info->user_ptr[0]; + if (devlink_nl_put_handle(dump_ctx->skb, devlink)) + goto nla_put_failure; + dump_ctx->nest = nla_nest_start(dump_ctx->skb, + DEVLINK_ATTR_DPIPE_ENTRIES); + if (!dump_ctx->nest) + goto nla_put_failure; + return 0; + +nla_put_failure: + genlmsg_cancel(dump_ctx->skb, dump_ctx->hdr); + nlmsg_free(dump_ctx->skb); + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_prepare); + +int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx, + struct devlink_dpipe_entry *entry) +{ + return devlink_dpipe_entry_put(dump_ctx->skb, entry); +} +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_append); + +int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx) +{ + nla_nest_end(dump_ctx->skb, dump_ctx->nest); + genlmsg_end(dump_ctx->skb, dump_ctx->hdr); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_close); + +static int devlink_dpipe_entries_fill(struct genl_info *info, + enum devlink_command cmd, int flags, + struct devlink_dpipe_table *table) +{ + struct devlink_dpipe_dump_ctx dump_ctx; + struct nlmsghdr *nlh; + int err; + + dump_ctx.skb = NULL; + dump_ctx.cmd = cmd; + dump_ctx.info = info; + + err = table->table_ops->entries_dump(table->priv, + table->counters_enabled, + &dump_ctx); + if (err) + goto err_entries_dump; + +send_done: + nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + return genlmsg_reply(dump_ctx.skb, info); + +err_entries_dump: +err_skb_send_alloc: + genlmsg_cancel(dump_ctx.skb, dump_ctx.hdr); + nlmsg_free(dump_ctx.skb); + return err; +} + +static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_dpipe_table *table; + const char *table_name; + + if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]) + return -EINVAL; + + table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + if (!table) + return -EINVAL; + + if (!table->table_ops->entries_dump) + return -EINVAL; + + return devlink_dpipe_entries_fill(info, DEVLINK_CMD_DPIPE_ENTRIES_GET, + 0, table); +} + +static int devlink_dpipe_fields_put(struct sk_buff *skb, + const struct devlink_dpipe_header *header) +{ + struct devlink_dpipe_field *field; + struct nlattr *field_attr; + int i; + + for (i = 0; i < header->fields_count; i++) { + field = &header->fields[i]; + field_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_FIELD); + if (!field_attr) + return -EMSGSIZE; + if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_FIELD_NAME, field->name) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_BITWIDTH, field->bitwidth) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_MAPPING_TYPE, field->mapping_type)) + goto nla_put_failure; + nla_nest_end(skb, field_attr); + } + return 0; + +nla_put_failure: + nla_nest_cancel(skb, field_attr); + return -EMSGSIZE; +} + +static int devlink_dpipe_header_put(struct sk_buff *skb, + struct devlink_dpipe_header *header) +{ + struct nlattr *fields_attr, *header_attr; + int err; + + header_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADER); + if (!header) + return -EMSGSIZE; + + if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_HEADER_NAME, header->name) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || + nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) + goto nla_put_failure; + + fields_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADER_FIELDS); + if (!fields_attr) + goto nla_put_failure; + + err = devlink_dpipe_fields_put(skb, header); + if (err) { + nla_nest_cancel(skb, fields_attr); + goto nla_put_failure; + } + nla_nest_end(skb, fields_attr); + nla_nest_end(skb, header_attr); + return 0; + +nla_put_failure: + err = -EMSGSIZE; + nla_nest_cancel(skb, header_attr); + return err; +} + +static int devlink_dpipe_headers_fill(struct genl_info *info, + enum devlink_command cmd, int flags, + struct devlink_dpipe_headers * + dpipe_headers) +{ + struct devlink *devlink = info->user_ptr[0]; + struct nlattr *headers_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + void *hdr; + int i, j; + int err; + + i = 0; +start_again: + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + headers_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADERS); + if (!headers_attr) + goto nla_put_failure; + + j = 0; + for (; i < dpipe_headers->headers_count; i++) { + err = devlink_dpipe_header_put(skb, dpipe_headers->headers[i]); + if (err) { + if (!j) + goto err_table_put; + break; + } + j++; + } + nla_nest_end(skb, headers_attr); + genlmsg_end(skb, hdr); + if (i != dpipe_headers->headers_count) + goto start_again; + +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_table_put: +err_skb_send_alloc: + genlmsg_cancel(skb, hdr); + nlmsg_free(skb); + return err; +} + +static int devlink_nl_cmd_dpipe_headers_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + + if (!devlink->dpipe_headers) + return -EOPNOTSUPP; + return devlink_dpipe_headers_fill(info, DEVLINK_CMD_DPIPE_HEADERS_GET, + 0, devlink->dpipe_headers); +} + +static int devlink_dpipe_table_counters_set(struct devlink *devlink, + const char *table_name, + bool enable) +{ + struct devlink_dpipe_table *table; + + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + if (!table) + return -EINVAL; + + if (table->counter_control_extern) + return -EOPNOTSUPP; + + if (!(table->counters_enabled ^ enable)) + return 0; + + table->counters_enabled = enable; + if (table->table_ops->counters_set_update) + table->table_ops->counters_set_update(table->priv, enable); + return 0; +} + +static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const char *table_name; + bool counters_enable; + + if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME] || + !info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]) + return -EINVAL; + + table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); + counters_enable = !!nla_get_u8(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]); + + return devlink_dpipe_table_counters_set(devlink, table_name, + counters_enable); } static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { @@ -1512,6 +2190,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 }, [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -1644,6 +2324,34 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, + .doit = devlink_nl_cmd_dpipe_table_get, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET, + .doit = devlink_nl_cmd_dpipe_entries_get, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET, + .doit = devlink_nl_cmd_dpipe_headers_get, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET, + .doit = devlink_nl_cmd_dpipe_table_counters_set, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -1680,6 +2388,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) devlink_net_set(devlink, &init_net); INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); + INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); return devlink; } EXPORT_SYMBOL_GPL(devlink_alloc); @@ -1880,6 +2589,133 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) } EXPORT_SYMBOL_GPL(devlink_sb_unregister); +/** + * devlink_dpipe_headers_register - register dpipe headers + * + * @devlink: devlink + * @dpipe_headers: dpipe header array + * + * Register the headers supported by hardware. + */ +int devlink_dpipe_headers_register(struct devlink *devlink, + struct devlink_dpipe_headers *dpipe_headers) +{ + mutex_lock(&devlink_mutex); + devlink->dpipe_headers = dpipe_headers; + mutex_unlock(&devlink_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register); + +/** + * devlink_dpipe_headers_unregister - unregister dpipe headers + * + * @devlink: devlink + * + * Unregister the headers supported by hardware. + */ +void devlink_dpipe_headers_unregister(struct devlink *devlink) +{ + mutex_lock(&devlink_mutex); + devlink->dpipe_headers = NULL; + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister); + +/** + * devlink_dpipe_table_counter_enabled - check if counter allocation + * required + * @devlink: devlink + * @table_name: tables name + * + * Used by driver to check if counter allocation is required. + * After counter allocation is turned on the table entries + * are updated to include counter statistics. + * + * After that point on the driver must respect the counter + * state so that each entry added to the table is added + * with a counter. + */ +bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, + const char *table_name) +{ + struct devlink_dpipe_table *table; + bool enabled; + + rcu_read_lock(); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + enabled = false; + if (table) + enabled = table->counters_enabled; + rcu_read_unlock(); + return enabled; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled); + +/** + * devlink_dpipe_table_register - register dpipe table + * + * @devlink: devlink + * @table_name: table name + * @table_ops: table ops + * @priv: priv + * @size: size + * @counter_control_extern: external control for counters + */ +int devlink_dpipe_table_register(struct devlink *devlink, + const char *table_name, + struct devlink_dpipe_table_ops *table_ops, + void *priv, u64 size, + bool counter_control_extern) +{ + struct devlink_dpipe_table *table; + + if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name)) + return -EEXIST; + + table = kzalloc(sizeof(*table), GFP_KERNEL); + if (!table) + return -ENOMEM; + + table->name = table_name; + table->table_ops = table_ops; + table->priv = priv; + table->size = size; + table->counter_control_extern = counter_control_extern; + + mutex_lock(&devlink_mutex); + list_add_tail_rcu(&table->list, &devlink->dpipe_table_list); + mutex_unlock(&devlink_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_table_register); + +/** + * devlink_dpipe_table_unregister - unregister dpipe table + * + * @devlink: devlink + * @table_name: table name + */ +void devlink_dpipe_table_unregister(struct devlink *devlink, + const char *table_name) +{ + struct devlink_dpipe_table *table; + + mutex_lock(&devlink_mutex); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + if (!table) + goto unlock; + list_del_rcu(&table->list); + mutex_unlock(&devlink_mutex); + kfree_rcu(table, rcu); + return; +unlock: + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); -- cgit v1.3 From 983701eb0676db96c604db8a3f7ae936a7029ff5 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Tue, 28 Mar 2017 14:28:01 -0700 Subject: rtnetlink: Add RTM_DELNETCONF Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/uapi/linux/rtnetlink.h | 2 ++ security/selinux/nlmsgtab.c | 1 + 2 files changed, 3 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 3dd72aee4d32..cce061382e40 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -122,6 +122,8 @@ enum { RTM_NEWNETCONF = 80, #define RTM_NEWNETCONF RTM_NEWNETCONF + RTM_DELNETCONF, +#define RTM_DELNETCONF RTM_DELNETCONF RTM_GETNETCONF = 82, #define RTM_GETNETCONF RTM_GETNETCONF diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c index 2ca9cde939d4..8e67bb4c9cab 100644 --- a/security/selinux/nlmsgtab.c +++ b/security/selinux/nlmsgtab.c @@ -69,6 +69,7 @@ static struct nlmsg_perm nlmsg_route_perms[] = { RTM_GETDCB, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_SETDCB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_NEWNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, + { RTM_DELNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_GETNETCONF, NETLINK_ROUTE_SOCKET__NLMSG_READ }, { RTM_NEWMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, { RTM_DELMDB, NETLINK_ROUTE_SOCKET__NLMSG_WRITE }, -- cgit v1.3 From a3caf7440dedd2399f90f27ff11ac390bf03e6c4 Mon Sep 17 00:00:00 2001 From: Vidyullatha Kanchanapally Date: Fri, 31 Mar 2017 00:22:34 +0300 Subject: cfg80211: Add support for FILS shared key authentication offload Enhance nl80211 and cfg80211 connect request and response APIs to support FILS shared key authentication offload. The new nl80211 attributes can be used to provide additional information to the driver to establish a FILS connection. Also enhance the set/del PMKSA to allow support for adding and deleting PMKSA based on FILS cache identifier. Add a new feature flag that drivers can use to advertize support for FILS shared key authentication and association in station mode when using their own SME. Signed-off-by: Vidyullatha Kanchanapally Signed-off-by: Jouni Malinen Signed-off-by: Johannes Berg --- include/linux/ieee80211.h | 13 +++++++ include/net/cfg80211.h | 57 +++++++++++++++++++++++++++- include/uapi/linux/nl80211.h | 86 ++++++++++++++++++++++++++++++++++++++++-- net/wireless/nl80211.c | 90 +++++++++++++++++++++++++++++++++++++++++--- net/wireless/sme.c | 25 +++++++++++- 5 files changed, 259 insertions(+), 12 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 22bf0676d928..294fa6273a62 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -1723,6 +1723,9 @@ enum ieee80211_statuscode { WLAN_STATUS_REJECT_DSE_BAND = 96, WLAN_STATUS_DENIED_WITH_SUGGESTED_BAND_AND_CHANNEL = 99, WLAN_STATUS_DENIED_DUE_TO_SPECTRUM_MANAGEMENT = 103, + /* 802.11ai */ + WLAN_STATUS_FILS_AUTHENTICATION_FAILURE = 108, + WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 109, }; @@ -2104,6 +2107,12 @@ enum ieee80211_key_len { #define FILS_NONCE_LEN 16 #define FILS_MAX_KEK_LEN 64 +#define FILS_ERP_MAX_USERNAME_LEN 16 +#define FILS_ERP_MAX_REALM_LEN 253 +#define FILS_ERP_MAX_RRK_LEN 64 + +#define PMK_MAX_LEN 48 + /* Public action codes */ enum ieee80211_pub_actioncode { WLAN_PUB_ACTION_EXT_CHANSW_ANN = 4, @@ -2355,6 +2364,10 @@ enum ieee80211_sa_query_action { #define WLAN_AKM_SUITE_TDLS SUITE(0x000FAC, 7) #define WLAN_AKM_SUITE_SAE SUITE(0x000FAC, 8) #define WLAN_AKM_SUITE_FT_OVER_SAE SUITE(0x000FAC, 9) +#define WLAN_AKM_SUITE_FILS_SHA256 SUITE(0x000FAC, 14) +#define WLAN_AKM_SUITE_FILS_SHA384 SUITE(0x000FAC, 15) +#define WLAN_AKM_SUITE_FT_FILS_SHA256 SUITE(0x000FAC, 16) +#define WLAN_AKM_SUITE_FT_FILS_SHA384 SUITE(0x000FAC, 17) #define WLAN_MAX_KEY_LEN 32 diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index da12d5b86e1b..042137d7d226 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -2073,6 +2073,19 @@ struct cfg80211_bss_selection { * the BSSID of the current association, i.e., to the value that is * included in the Current AP address field of the Reassociation Request * frame. + * @fils_erp_username: EAP re-authentication protocol (ERP) username part of the + * NAI or %NULL if not specified. This is used to construct FILS wrapped + * data IE. + * @fils_erp_username_len: Length of @fils_erp_username in octets. + * @fils_erp_realm: EAP re-authentication protocol (ERP) realm part of NAI or + * %NULL if not specified. This specifies the domain name of ER server and + * is used to construct FILS wrapped data IE. + * @fils_erp_realm_len: Length of @fils_erp_realm in octets. + * @fils_erp_next_seq_num: The next sequence number to use in the FILS ERP + * messages. This is also used to construct FILS wrapped data IE. + * @fils_erp_rrk: ERP re-authentication Root Key (rRK) used to derive additional + * keys in FILS or %NULL if not specified. + * @fils_erp_rrk_len: Length of @fils_erp_rrk in octets. */ struct cfg80211_connect_params { struct ieee80211_channel *channel; @@ -2098,6 +2111,13 @@ struct cfg80211_connect_params { bool pbss; struct cfg80211_bss_selection bss_select; const u8 *prev_bssid; + const u8 *fils_erp_username; + size_t fils_erp_username_len; + const u8 *fils_erp_realm; + size_t fils_erp_realm_len; + u16 fils_erp_next_seq_num; + const u8 *fils_erp_rrk; + size_t fils_erp_rrk_len; }; /** @@ -2136,12 +2156,27 @@ enum wiphy_params_flags { * This structure is passed to the set/del_pmksa() method for PMKSA * caching. * - * @bssid: The AP's BSSID. - * @pmkid: The PMK material itself. + * @bssid: The AP's BSSID (may be %NULL). + * @pmkid: The identifier to refer a PMKSA. + * @pmk: The PMK for the PMKSA identified by @pmkid. This is used for key + * derivation by a FILS STA. Otherwise, %NULL. + * @pmk_len: Length of the @pmk. The length of @pmk can differ depending on + * the hash algorithm used to generate this. + * @ssid: SSID to specify the ESS within which a PMKSA is valid when using FILS + * cache identifier (may be %NULL). + * @ssid_len: Length of the @ssid in octets. + * @cache_id: 2-octet cache identifier advertized by a FILS AP identifying the + * scope of PMKSA. This is valid only if @ssid_len is non-zero (may be + * %NULL). */ struct cfg80211_pmksa { const u8 *bssid; const u8 *pmkid; + const u8 *pmk; + size_t pmk_len; + const u8 *ssid; + size_t ssid_len; + const u8 *cache_id; }; /** @@ -5153,6 +5188,17 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp) * @req_ie_len: Association request IEs length * @resp_ie: Association response IEs (may be %NULL) * @resp_ie_len: Association response IEs length + * @fils_kek: KEK derived from a successful FILS connection (may be %NULL) + * @fils_kek_len: Length of @fils_kek in octets + * @update_erp_next_seq_num: Boolean value to specify whether the value in + * @fils_erp_next_seq_num is valid. + * @fils_erp_next_seq_num: The next sequence number to use in ERP message in + * FILS Authentication. This value should be specified irrespective of the + * status for a FILS connection. + * @pmk: A new PMK if derived from a successful FILS connection (may be %NULL). + * @pmk_len: Length of @pmk in octets + * @pmkid: A new PMKID if derived from a successful FILS connection or the PMKID + * used for this FILS connection (may be %NULL). * @timeout_reason: Reason for connection timeout. This is used when the * connection fails due to a timeout instead of an explicit rejection from * the AP. %NL80211_TIMEOUT_UNSPECIFIED is used when the timeout reason is @@ -5168,6 +5214,13 @@ struct cfg80211_connect_resp_params { size_t req_ie_len; const u8 *resp_ie; size_t resp_ie_len; + const u8 *fils_kek; + size_t fils_kek_len; + bool update_erp_next_seq_num; + u16 fils_erp_next_seq_num; + const u8 *pmk; + size_t pmk_len; + const u8 *pmkid; enum nl80211_timeout_reason timeout_reason; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index cd4dfef58fab..6095a6c4c412 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -172,6 +172,42 @@ * Multiple such rules can be created. */ +/** + * DOC: FILS shared key authentication offload + * + * FILS shared key authentication offload can be advertized by drivers by + * setting @NL80211_EXT_FEATURE_FILS_SK_OFFLOAD flag. The drivers that support + * FILS shared key authentication offload should be able to construct the + * authentication and association frames for FILS shared key authentication and + * eventually do a key derivation as per IEEE 802.11ai. The below additional + * parameters should be given to driver in %NL80211_CMD_CONNECT. + * %NL80211_ATTR_FILS_ERP_USERNAME - used to construct keyname_nai + * %NL80211_ATTR_FILS_ERP_REALM - used to construct keyname_nai + * %NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used to construct erp message + * %NL80211_ATTR_FILS_ERP_RRK - used to generate the rIK and rMSK + * rIK should be used to generate an authentication tag on the ERP message and + * rMSK should be used to derive a PMKSA. + * rIK, rMSK should be generated and keyname_nai, sequence number should be used + * as specified in IETF RFC 6696. + * + * When FILS shared key authentication is completed, driver needs to provide the + * below additional parameters to userspace. + * %NL80211_ATTR_FILS_KEK - used for key renewal + * %NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used in further EAP-RP exchanges + * %NL80211_ATTR_PMKID - used to identify the PMKSA used/generated + * %Nl80211_ATTR_PMK - used to update PMKSA cache in userspace + * The PMKSA can be maintained in userspace persistently so that it can be used + * later after reboots or wifi turn off/on also. + * + * %NL80211_ATTR_FILS_CACHE_ID is the cache identifier advertized by a FILS + * capable AP supporting PMK caching. It specifies the scope within which the + * PMKSAs are cached in an ESS. %NL80211_CMD_SET_PMKSA and + * %NL80211_CMD_DEL_PMKSA are enhanced to allow support for PMKSA caching based + * on FILS cache identifier. Additionally %NL80211_ATTR_PMK is used with + * %NL80211_SET_PMKSA to specify the PMK corresponding to a PMKSA for driver to + * use in a FILS shared key connection with PMKSA caching. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -370,10 +406,18 @@ * @NL80211_CMD_NEW_SURVEY_RESULTS: survey data notification (as a reply to * NL80211_CMD_GET_SURVEY and on the "scan" multicast group) * - * @NL80211_CMD_SET_PMKSA: Add a PMKSA cache entry, using %NL80211_ATTR_MAC - * (for the BSSID) and %NL80211_ATTR_PMKID. + * @NL80211_CMD_SET_PMKSA: Add a PMKSA cache entry using %NL80211_ATTR_MAC + * (for the BSSID), %NL80211_ATTR_PMKID, and optionally %NL80211_ATTR_PMK + * (PMK is used for PTKSA derivation in case of FILS shared key offload) or + * using %NL80211_ATTR_SSID, %NL80211_ATTR_FILS_CACHE_ID, + * %NL80211_ATTR_PMKID, and %NL80211_ATTR_PMK in case of FILS + * authentication where %NL80211_ATTR_FILS_CACHE_ID is the identifier + * advertized by a FILS capable AP identifying the scope of PMKSA in an + * ESS. * @NL80211_CMD_DEL_PMKSA: Delete a PMKSA cache entry, using %NL80211_ATTR_MAC - * (for the BSSID) and %NL80211_ATTR_PMKID. + * (for the BSSID) and %NL80211_ATTR_PMKID or using %NL80211_ATTR_SSID, + * %NL80211_ATTR_FILS_CACHE_ID, and %NL80211_ATTR_PMKID in case of FILS + * authentication. * @NL80211_CMD_FLUSH_PMKSA: Flush all PMKSA cache entries. * * @NL80211_CMD_REG_CHANGE: indicates to userspace the regulatory domain @@ -2012,6 +2056,31 @@ enum nl80211_commands { * u32 attribute with an &enum nl80211_timeout_reason value. This is used, * e.g., with %NL80211_CMD_CONNECT event. * + * @NL80211_ATTR_FILS_ERP_USERNAME: EAP Re-authentication Protocol (ERP) + * username part of NAI used to refer keys rRK and rIK. This is used with + * %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_ERP_REALM: EAP Re-authentication Protocol (ERP) realm part + * of NAI specifying the domain name of the ER server. This is used with + * %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM: Unsigned 16-bit ERP next sequence number + * to use in ERP messages. This is used in generating the FILS wrapped data + * for FILS authentication and is used with %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_ERP_RRK: ERP re-authentication Root Key (rRK) for the + * NAI specified by %NL80211_ATTR_FILS_ERP_USERNAME and + * %NL80211_ATTR_FILS_ERP_REALM. This is used for generating rIK and rMSK + * from successful FILS authentication and is used with + * %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_CACHE_ID: A 2-octet identifier advertized by a FILS AP + * identifying the scope of PMKSAs. This is used with + * @NL80211_CMD_SET_PMKSA and @NL80211_CMD_DEL_PMKSA. + * + * @NL80211_ATTR_PMK: PMK for the PMKSA identified by %NL80211_ATTR_PMKID. + * This is used with @NL80211_CMD_SET_PMKSA. + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2423,6 +2492,14 @@ enum nl80211_attrs { NL80211_ATTR_TIMEOUT_REASON, + NL80211_ATTR_FILS_ERP_USERNAME, + NL80211_ATTR_FILS_ERP_REALM, + NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM, + NL80211_ATTR_FILS_ERP_RRK, + NL80211_ATTR_FILS_CACHE_ID, + + NL80211_ATTR_PMK, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -4759,6 +4836,8 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_CQM_RSSI_LIST: With this driver the * %NL80211_ATTR_CQM_RSSI_THOLD attribute accepts a list of zero or more * RSSI threshold values to monitor rather than exactly one threshold. + * @NL80211_EXT_FEATURE_FILS_SK_OFFLOAD: Driver SME supports FILS shared key + * authentication with %NL80211_CMD_CONNECT. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4778,6 +4857,7 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED, NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI, NL80211_EXT_FEATURE_CQM_RSSI_LIST, + NL80211_EXT_FEATURE_FILS_SK_OFFLOAD, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 3d635c865281..9910aae08f1a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -410,6 +410,15 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = sizeof(struct nl80211_bss_select_rssi_adjust) }, [NL80211_ATTR_TIMEOUT_REASON] = { .type = NLA_U32 }, + [NL80211_ATTR_FILS_ERP_USERNAME] = { .type = NLA_BINARY, + .len = FILS_ERP_MAX_USERNAME_LEN }, + [NL80211_ATTR_FILS_ERP_REALM] = { .type = NLA_BINARY, + .len = FILS_ERP_MAX_REALM_LEN }, + [NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] = { .type = NLA_U16 }, + [NL80211_ATTR_FILS_ERP_RRK] = { .type = NLA_BINARY, + .len = FILS_ERP_MAX_RRK_LEN }, + [NL80211_ATTR_FILS_CACHE_ID] = { .len = 2 }, + [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, }; /* policy for the key attributes */ @@ -3832,6 +3841,19 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, return false; return true; case NL80211_CMD_CONNECT: + /* SAE not supported yet */ + if (auth_type == NL80211_AUTHTYPE_SAE) + return false; + /* FILS with SK PFS or PK not supported yet */ + if (auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || + auth_type == NL80211_AUTHTYPE_FILS_PK) + return false; + if (!wiphy_ext_feature_isset( + &rdev->wiphy, + NL80211_EXT_FEATURE_FILS_SK_OFFLOAD) && + auth_type == NL80211_AUTHTYPE_FILS_SK) + return false; + return true; case NL80211_CMD_START_AP: /* SAE not supported yet */ if (auth_type == NL80211_AUTHTYPE_SAE) @@ -8906,6 +8928,35 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) } } + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_FILS_SK_OFFLOAD) && + info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] && + info->attrs[NL80211_ATTR_FILS_ERP_REALM] && + info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] && + info->attrs[NL80211_ATTR_FILS_ERP_RRK]) { + connect.fils_erp_username = + nla_data(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]); + connect.fils_erp_username_len = + nla_len(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]); + connect.fils_erp_realm = + nla_data(info->attrs[NL80211_ATTR_FILS_ERP_REALM]); + connect.fils_erp_realm_len = + nla_len(info->attrs[NL80211_ATTR_FILS_ERP_REALM]); + connect.fils_erp_next_seq_num = + nla_get_u16( + info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM]); + connect.fils_erp_rrk = + nla_data(info->attrs[NL80211_ATTR_FILS_ERP_RRK]); + connect.fils_erp_rrk_len = + nla_len(info->attrs[NL80211_ATTR_FILS_ERP_RRK]); + } else if (info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] || + info->attrs[NL80211_ATTR_FILS_ERP_REALM] || + info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] || + info->attrs[NL80211_ATTR_FILS_ERP_RRK]) { + kzfree(connkeys); + return -EINVAL; + } + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, @@ -9025,14 +9076,28 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) memset(&pmksa, 0, sizeof(struct cfg80211_pmksa)); - if (!info->attrs[NL80211_ATTR_MAC]) - return -EINVAL; - if (!info->attrs[NL80211_ATTR_PMKID]) return -EINVAL; pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]); - pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_MAC]) { + pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + } else if (info->attrs[NL80211_ATTR_SSID] && + info->attrs[NL80211_ATTR_FILS_CACHE_ID] && + (info->genlhdr->cmd == NL80211_CMD_DEL_PMKSA || + info->attrs[NL80211_ATTR_PMK])) { + pmksa.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + pmksa.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + pmksa.cache_id = + nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]); + } else { + return -EINVAL; + } + if (info->attrs[NL80211_ATTR_PMK]) { + pmksa.pmk = nla_data(info->attrs[NL80211_ATTR_PMK]); + pmksa.pmk_len = nla_len(info->attrs[NL80211_ATTR_PMK]); + } if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) @@ -13471,7 +13536,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, struct sk_buff *msg; void *hdr; - msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len, gfp); + msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len + + cr->fils_kek_len + cr->pmk_len + + (cr->pmkid ? WLAN_PMKID_LEN : 0), gfp); if (!msg) return; @@ -13496,7 +13563,18 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_REQ_IE, cr->req_ie_len, cr->req_ie)) || (cr->resp_ie && nla_put(msg, NL80211_ATTR_RESP_IE, cr->resp_ie_len, - cr->resp_ie))) + cr->resp_ie)) || + (cr->update_erp_next_seq_num && + nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM, + cr->fils_erp_next_seq_num)) || + (cr->status == WLAN_STATUS_SUCCESS && + ((cr->fils_kek && + nla_put(msg, NL80211_ATTR_FILS_KEK, cr->fils_kek_len, + cr->fils_kek)) || + (cr->pmk && + nla_put(msg, NL80211_ATTR_PMK, cr->pmk_len, cr->pmk)) || + (cr->pmkid && + nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->pmkid))))) goto nla_put_failure; genlmsg_end(msg, hdr); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index ebd7adc27246..6459bb7c21f7 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -805,7 +805,9 @@ void cfg80211_connect_done(struct net_device *dev, } ev = kzalloc(sizeof(*ev) + (params->bssid ? ETH_ALEN : 0) + - params->req_ie_len + params->resp_ie_len, gfp); + params->req_ie_len + params->resp_ie_len + + params->fils_kek_len + params->pmk_len + + (params->pmkid ? WLAN_PMKID_LEN : 0), gfp); if (!ev) { cfg80211_put_bss(wdev->wiphy, params->bss); return; @@ -832,6 +834,27 @@ void cfg80211_connect_done(struct net_device *dev, params->resp_ie_len); next += params->resp_ie_len; } + if (params->fils_kek_len) { + ev->cr.fils_kek = next; + ev->cr.fils_kek_len = params->fils_kek_len; + memcpy((void *)ev->cr.fils_kek, params->fils_kek, + params->fils_kek_len); + next += params->fils_kek_len; + } + if (params->pmk_len) { + ev->cr.pmk = next; + ev->cr.pmk_len = params->pmk_len; + memcpy((void *)ev->cr.pmk, params->pmk, params->pmk_len); + next += params->pmk_len; + } + if (params->pmkid) { + ev->cr.pmkid = next; + memcpy((void *)ev->cr.pmkid, params->pmkid, WLAN_PMKID_LEN); + next += WLAN_PMKID_LEN; + } + ev->cr.update_erp_next_seq_num = params->update_erp_next_seq_num; + if (params->update_erp_next_seq_num) + ev->cr.fils_erp_next_seq_num = params->fils_erp_next_seq_num; if (params->bss) cfg80211_hold_bss(bss_from_pub(params->bss)); ev->cr.bss = params->bss; -- cgit v1.3 From 1cf1cae963c2e6032aebe1637e995bc2f5d330f4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Mar 2017 21:45:38 -0700 Subject: bpf: introduce BPF_PROG_TEST_RUN command development and testing of networking bpf programs is quite cumbersome. Despite availability of user space bpf interpreters the kernel is the ultimate authority and execution environment. Current test frameworks for TC include creation of netns, veth, qdiscs and use of various packet generators just to test functionality of a bpf program. XDP testing is even more complicated, since qemu needs to be started with gro/gso disabled and precise queue configuration, transferring of xdp program from host into guest, attaching to virtio/eth0 and generating traffic from the host while capturing the results from the guest. Moreover analyzing performance bottlenecks in XDP program is impossible in virtio environment, since cost of running the program is tiny comparing to the overhead of virtio packet processing, so performance testing can only be done on physical nic with another server generating traffic. Furthermore ongoing changes to user space control plane of production applications cannot be run on the test servers leaving bpf programs stubbed out for testing. Last but not least, the upstream llvm changes are validated by the bpf backend testsuite which has no ability to test the code generated. To improve this situation introduce BPF_PROG_TEST_RUN command to test and performance benchmark bpf programs. Joint work with Daniel Borkmann. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- include/linux/bpf.h | 7 ++ include/uapi/linux/bpf.h | 12 ++++ kernel/bpf/syscall.c | 27 +++++++- net/Makefile | 2 +- net/bpf/Makefile | 1 + net/bpf/test_run.c | 172 +++++++++++++++++++++++++++++++++++++++++++++++ net/core/filter.c | 5 ++ 7 files changed, 223 insertions(+), 3 deletions(-) create mode 100644 net/bpf/Makefile create mode 100644 net/bpf/test_run.c (limited to 'include/uapi/linux') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2ae39a3e9ead..bbb513da5075 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -169,6 +169,8 @@ struct bpf_verifier_ops { const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog); + int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); }; struct bpf_prog_type_list { @@ -233,6 +235,11 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); +int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 28317a04c34d..a1d95386f562 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -81,6 +81,7 @@ enum bpf_cmd { BPF_OBJ_GET, BPF_PROG_ATTACH, BPF_PROG_DETACH, + BPF_PROG_TEST_RUN, }; enum bpf_map_type { @@ -189,6 +190,17 @@ union bpf_attr { __u32 attach_type; __u32 attach_flags; }; + + struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ + __u32 prog_fd; + __u32 retval; + __u32 data_size_in; + __u32 data_size_out; + __aligned_u64 data_in; + __aligned_u64 data_out; + __u32 repeat; + __u32 duration; + } test; } __attribute__((aligned(8))); /* BPF helper function descriptions: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c35ebfe6d84d..ab0cf4c43690 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -973,6 +973,28 @@ static int bpf_prog_detach(const union bpf_attr *attr) } #endif /* CONFIG_CGROUP_BPF */ +#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration + +static int bpf_prog_test_run(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_prog *prog; + int ret = -ENOTSUPP; + + if (CHECK_ATTR(BPF_PROG_TEST_RUN)) + return -EINVAL; + + prog = bpf_prog_get(attr->test.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->aux->ops->test_run) + ret = prog->aux->ops->test_run(prog, attr, uattr); + + bpf_prog_put(prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1039,7 +1061,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; - #ifdef CONFIG_CGROUP_BPF case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); @@ -1048,7 +1069,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_prog_detach(&attr); break; #endif - + case BPF_PROG_TEST_RUN: + err = bpf_prog_test_run(&attr, uattr); + break; default: err = -EINVAL; break; diff --git a/net/Makefile b/net/Makefile index 9b681550e3a3..9086ffbb5085 100644 --- a/net/Makefile +++ b/net/Makefile @@ -12,7 +12,7 @@ obj-$(CONFIG_NET) += $(tmp-y) # LLC has to be linked before the files in net/802/ obj-$(CONFIG_LLC) += llc/ -obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ +obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_XFRM) += xfrm/ diff --git a/net/bpf/Makefile b/net/bpf/Makefile new file mode 100644 index 000000000000..27b2992a0692 --- /dev/null +++ b/net/bpf/Makefile @@ -0,0 +1 @@ +obj-y := test_run.o diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c new file mode 100644 index 000000000000..8a6d0a37c30c --- /dev/null +++ b/net/bpf/test_run.c @@ -0,0 +1,172 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include + +static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx) +{ + u32 ret; + + preempt_disable(); + rcu_read_lock(); + ret = BPF_PROG_RUN(prog, ctx); + rcu_read_unlock(); + preempt_enable(); + + return ret; +} + +static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) +{ + u64 time_start, time_spent = 0; + u32 ret = 0, i; + + if (!repeat) + repeat = 1; + time_start = ktime_get_ns(); + for (i = 0; i < repeat; i++) { + ret = bpf_test_run_one(prog, ctx); + if (need_resched()) { + if (signal_pending(current)) + break; + time_spent += ktime_get_ns() - time_start; + cond_resched(); + time_start = ktime_get_ns(); + } + } + time_spent += ktime_get_ns() - time_start; + do_div(time_spent, repeat); + *time = time_spent > U32_MAX ? U32_MAX : (u32)time_spent; + + return ret; +} + +static int bpf_test_finish(union bpf_attr __user *uattr, const void *data, + u32 size, u32 retval, u32 duration) +{ + void __user *data_out = u64_to_user_ptr(uattr->test.data_out); + int err = -EFAULT; + + if (data_out && copy_to_user(data_out, data, size)) + goto out; + if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size))) + goto out; + if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) + goto out; + if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration))) + goto out; + err = 0; +out: + return err; +} + +static void *bpf_test_init(const union bpf_attr *kattr, u32 size, + u32 headroom, u32 tailroom) +{ + void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + void *data; + + if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) + return ERR_PTR(-EINVAL); + + data = kzalloc(size + headroom + tailroom, GFP_USER); + if (!data) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(data + headroom, data_in, size)) { + kfree(data); + return ERR_PTR(-EFAULT); + } + return data; +} + +int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + bool is_l2 = false, is_direct_pkt_access = false; + u32 size = kattr->test.data_size_in; + u32 repeat = kattr->test.repeat; + u32 retval, duration; + struct sk_buff *skb; + void *data; + int ret; + + data = bpf_test_init(kattr, size, NET_SKB_PAD, + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); + if (IS_ERR(data)) + return PTR_ERR(data); + + switch (prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + is_l2 = true; + /* fall through */ + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + is_direct_pkt_access = true; + break; + default: + break; + } + + skb = build_skb(data, 0); + if (!skb) { + kfree(data); + return -ENOMEM; + } + + skb_reserve(skb, NET_SKB_PAD); + __skb_put(skb, size); + skb->protocol = eth_type_trans(skb, current->nsproxy->net_ns->loopback_dev); + skb_reset_network_header(skb); + + if (is_l2) + __skb_push(skb, ETH_HLEN); + if (is_direct_pkt_access) + bpf_compute_data_end(skb); + retval = bpf_test_run(prog, skb, repeat, &duration); + if (!is_l2) + __skb_push(skb, ETH_HLEN); + size = skb->len; + /* bpf program can never convert linear skb to non-linear */ + if (WARN_ON_ONCE(skb_is_nonlinear(skb))) + size = skb_headlen(skb); + ret = bpf_test_finish(uattr, skb->data, size, retval, duration); + kfree_skb(skb); + return ret; +} + +int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + u32 size = kattr->test.data_size_in; + u32 repeat = kattr->test.repeat; + struct xdp_buff xdp = {}; + u32 retval, duration; + void *data; + int ret; + + data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM, 0); + if (IS_ERR(data)) + return PTR_ERR(data); + + xdp.data_hard_start = data; + xdp.data = data + XDP_PACKET_HEADROOM; + xdp.data_end = xdp.data + size; + + retval = bpf_test_run(prog, &xdp, repeat, &duration); + if (xdp.data != data + XDP_PACKET_HEADROOM) + size = xdp.data_end - xdp.data; + ret = bpf_test_finish(uattr, xdp.data, size, retval, duration); + kfree(data); + return ret; +} diff --git a/net/core/filter.c b/net/core/filter.c index dfb9f61a2fd5..15e9a81ffebe 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3309,24 +3309,28 @@ static const struct bpf_verifier_ops tc_cls_act_ops = { .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, + .test_run = bpf_prog_test_run_skb, }; static const struct bpf_verifier_ops xdp_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, + .test_run = bpf_prog_test_run_xdp, }; static const struct bpf_verifier_ops cg_skb_ops = { .get_func_proto = cg_skb_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, + .test_run = bpf_prog_test_run_skb, }; static const struct bpf_verifier_ops lwt_inout_ops = { .get_func_proto = lwt_inout_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, + .test_run = bpf_prog_test_run_skb, }; static const struct bpf_verifier_ops lwt_xmit_ops = { @@ -3334,6 +3338,7 @@ static const struct bpf_verifier_ops lwt_xmit_ops = { .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, + .test_run = bpf_prog_test_run_skb, }; static const struct bpf_verifier_ops cg_sock_ops = { -- cgit v1.3 From d229d48d183fbc1391908decc7d2bcf09ca2f38f Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sat, 1 Apr 2017 17:07:46 +0800 Subject: sctp: add SCTP_PR_STREAM_STATUS sockopt for prsctp Before when implementing sctp prsctp, SCTP_PR_STREAM_STATUS wasn't added, as it needs to save abandoned_(un)sent for every stream. After sctp stream reconf is added in sctp, assoc has structure sctp_stream_out to save per stream info. This patch is to add SCTP_PR_STREAM_STATUS by putting the prsctp per stream statistics into sctp_stream_out. v1->v2: fix an indent issue. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 2 ++ include/uapi/linux/sctp.h | 1 + net/sctp/chunk.c | 14 +++++++++-- net/sctp/outqueue.c | 10 ++++++++ net/sctp/socket.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 84 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 592decebac75..3e61a54424a1 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1315,6 +1315,8 @@ struct sctp_inithdr_host { struct sctp_stream_out { __u16 ssn; __u8 state; + __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; + __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; }; struct sctp_stream_in { diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index 7212870ef5d7..ced9d8b97426 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -115,6 +115,7 @@ typedef __s32 sctp_assoc_t; #define SCTP_PR_SUPPORTED 113 #define SCTP_DEFAULT_PRINFO 114 #define SCTP_PR_ASSOC_STATUS 115 +#define SCTP_PR_STREAM_STATUS 116 #define SCTP_RECONFIG_SUPPORTED 117 #define SCTP_ENABLE_STREAM_RESET 118 #define SCTP_RESET_STREAMS 119 diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index e3621cb4827f..697721a7a3f1 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -306,14 +306,24 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { - if (chunk->sent_count) + struct sctp_stream_out *streamout = + &chunk->asoc->stream->out[chunk->sinfo.sinfo_stream]; + + if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; - else + streamout->abandoned_sent[SCTP_PR_INDEX(TTL)]++; + } else { chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; + streamout->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; + } return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { + struct sctp_stream_out *streamout = + &chunk->asoc->stream->out[chunk->sinfo.sinfo_stream]; + chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; + streamout->abandoned_sent[SCTP_PR_INDEX(RTX)]++; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 025ccff67072..3f78d7f06e14 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -353,6 +353,8 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, struct sctp_chunk *chk, *temp; list_for_each_entry_safe(chk, temp, queue, transmitted_list) { + struct sctp_stream_out *streamout; + if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) continue; @@ -361,8 +363,10 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, sctp_insert_list(&asoc->outqueue.abandoned, &chk->transmitted_list); + streamout = &asoc->stream->out[chk->sinfo.sinfo_stream]; asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; + streamout->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; if (!chk->tsn_gap_acked) { if (chk->transport) @@ -396,6 +400,12 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, q->out_qlen -= chk->skb->len; asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; + if (chk->sinfo.sinfo_stream < asoc->stream->outcnt) { + struct sctp_stream_out *streamout = + &asoc->stream->out[chk->sinfo.sinfo_stream]; + + streamout->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; + } msg_len -= SCTP_DATA_SNDSIZE(chk) + sizeof(struct sk_buff) + diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ccc08fc39722..6489446925e6 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6576,6 +6576,61 @@ out: return retval; } +static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_stream_out *streamout; + struct sctp_association *asoc; + struct sctp_prstatus params; + int retval = -EINVAL; + int policy; + + if (len < sizeof(params)) + goto out; + + len = sizeof(params); + if (copy_from_user(¶ms, optval, len)) { + retval = -EFAULT; + goto out; + } + + policy = params.sprstat_policy; + if (policy & ~SCTP_PR_SCTP_MASK) + goto out; + + asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); + if (!asoc || params.sprstat_sid >= asoc->stream->outcnt) + goto out; + + streamout = &asoc->stream->out[params.sprstat_sid]; + if (policy == SCTP_PR_SCTP_NONE) { + params.sprstat_abandoned_unsent = 0; + params.sprstat_abandoned_sent = 0; + for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { + params.sprstat_abandoned_unsent += + streamout->abandoned_unsent[policy]; + params.sprstat_abandoned_sent += + streamout->abandoned_sent[policy]; + } + } else { + params.sprstat_abandoned_unsent = + streamout->abandoned_unsent[__SCTP_PR_INDEX(policy)]; + params.sprstat_abandoned_sent = + streamout->abandoned_sent[__SCTP_PR_INDEX(policy)]; + } + + if (put_user(len, optlen) || copy_to_user(optval, ¶ms, len)) { + retval = -EFAULT; + goto out; + } + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt_reconfig_supported(struct sock *sk, int len, char __user *optval, int __user *optlen) @@ -6825,6 +6880,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_pr_assocstatus(sk, len, optval, optlen); break; + case SCTP_PR_STREAM_STATUS: + retval = sctp_getsockopt_pr_streamstatus(sk, len, optval, + optlen); + break; case SCTP_RECONFIG_SUPPORTED: retval = sctp_getsockopt_reconfig_supported(sk, len, optval, optlen); -- cgit v1.3 From 1f37b177fd36790be4f281d538a8c9de67013606 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 2 Apr 2017 14:30:06 -0700 Subject: phy/ethtool: Add missing SPEED_ strings Add all the currently available SPEED_ strings. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- drivers/net/phy/phy.c | 14 ++++++++++++++ include/uapi/linux/ethtool.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include/uapi/linux') diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 867c42154087..6811d1ef4ef2 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -50,8 +50,22 @@ static const char *phy_speed_to_str(int speed) return "1Gbps"; case SPEED_2500: return "2.5Gbps"; + case SPEED_5000: + return "5Gbps"; case SPEED_10000: return "10Gbps"; + case SPEED_20000: + return "20Gbps"; + case SPEED_25000: + return "25Gbps"; + case SPEED_40000: + return "40Gbps"; + case SPEED_50000: + return "50Gbps"; + case SPEED_56000: + return "56Gbps"; + case SPEED_100000: + return "100Gbps"; case SPEED_UNKNOWN: return "Unknown"; default: diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 3dc91a46e8b8..5f4ea28eabe4 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1487,6 +1487,7 @@ enum ethtool_link_mode_bit_indices { */ /* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. */ +/* Update drivers/net/phy/phy.c:phy_speed_to_str() when adding new values */ #define SPEED_10 10 #define SPEED_100 100 #define SPEED_1000 1000 -- cgit v1.3 From 457c79e54487b076cafa0e1ec5f177e751c54087 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Mon, 3 Apr 2017 18:13:32 -0700 Subject: netlink/diag: report flags for netlink sockets cb_running is reported in /proc/self/net/netlink and it is reported by the ss tool, when it gets information from the proc files. sock_diag is a new interface which is used instead of proc files, so it looks reasonable that this interface has to report no less information about sockets than proc files. We use these flags to dump and restore netlink sockets. Signed-off-by: Andrei Vagin Signed-off-by: David S. Miller --- include/uapi/linux/netlink_diag.h | 10 ++++++++++ net/netlink/af_netlink.c | 8 -------- net/netlink/af_netlink.h | 8 ++++++++ net/netlink/diag.c | 25 +++++++++++++++++++++++++ 4 files changed, 43 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h index 76b4d87c83a8..6dcd4de3397b 100644 --- a/include/uapi/linux/netlink_diag.h +++ b/include/uapi/linux/netlink_diag.h @@ -38,6 +38,7 @@ enum { NETLINK_DIAG_GROUPS, NETLINK_DIAG_RX_RING, NETLINK_DIAG_TX_RING, + NETLINK_DIAG_FLAGS, __NETLINK_DIAG_MAX, }; @@ -52,5 +53,14 @@ enum { /* deprecated since 4.6 */ #define NDIAG_SHOW_RING_CFG 0x00000004 /* show ring configuration */ #endif +#define NDIAG_SHOW_FLAGS 0x00000008 /* show flags of a netlink socket */ + +/* flags */ +#define NDIAG_FLAG_CB_RUNNING 0x00000001 +#define NDIAG_FLAG_PKTINFO 0x00000002 +#define NDIAG_FLAG_BROADCAST_ERROR 0x00000004 +#define NDIAG_FLAG_NO_ENOBUFS 0x00000008 +#define NDIAG_FLAG_LISTEN_ALL_NSID 0x00000010 +#define NDIAG_FLAG_CAP_ACK 0x00000020 #endif diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 596eaff66649..fc232441cf23 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -78,14 +78,6 @@ struct listeners { /* state bits */ #define NETLINK_S_CONGESTED 0x0 -/* flags */ -#define NETLINK_F_KERNEL_SOCKET 0x1 -#define NETLINK_F_RECV_PKTINFO 0x2 -#define NETLINK_F_BROADCAST_SEND_ERROR 0x4 -#define NETLINK_F_RECV_NO_ENOBUFS 0x8 -#define NETLINK_F_LISTEN_ALL_NSID 0x10 -#define NETLINK_F_CAP_ACK 0x20 - static inline int netlink_is_kernel(struct sock *sk) { return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET; diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 4fdb38318977..f792f8d7f982 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -6,6 +6,14 @@ #include #include +/* flags */ +#define NETLINK_F_KERNEL_SOCKET 0x1 +#define NETLINK_F_RECV_PKTINFO 0x2 +#define NETLINK_F_BROADCAST_SEND_ERROR 0x4 +#define NETLINK_F_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_LISTEN_ALL_NSID 0x10 +#define NETLINK_F_CAP_ACK 0x20 + #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) diff --git a/net/netlink/diag.c b/net/netlink/diag.c index a5546249fb10..8faa20b4d457 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -19,6 +19,27 @@ static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) nlk->groups); } +static int sk_diag_put_flags(struct sock *sk, struct sk_buff *skb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + u32 flags = 0; + + if (nlk->cb_running) + flags |= NDIAG_FLAG_CB_RUNNING; + if (nlk->flags & NETLINK_F_RECV_PKTINFO) + flags |= NDIAG_FLAG_PKTINFO; + if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR) + flags |= NDIAG_FLAG_BROADCAST_ERROR; + if (nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) + flags |= NDIAG_FLAG_NO_ENOBUFS; + if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) + flags |= NDIAG_FLAG_LISTEN_ALL_NSID; + if (nlk->flags & NETLINK_F_CAP_ACK) + flags |= NDIAG_FLAG_CAP_ACK; + + return nla_put_u32(skb, NETLINK_DIAG_FLAGS, flags); +} + static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_diag_req *req, u32 portid, u32 seq, u32 flags, int sk_ino) @@ -52,6 +73,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) goto out_nlmsg_trim; + if ((req->ndiag_show & NDIAG_SHOW_FLAGS) && + sk_diag_put_flags(sk, skb)) + goto out_nlmsg_trim; + nlmsg_end(skb, nlh); return 0; -- cgit v1.3 From def12888c161e6fec0702e5ec9c3962846e3a21d Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Tue, 4 Apr 2017 09:23:42 -0400 Subject: rtnl: Add support for netdev event to link messages When netdev events happen, a rtnetlink_event() handler will send messages for every event in it's white list. These messages contain current information about a particular device, but they do not include the iformation about which event just happened. The consumer of the message has to try to infer this information. In some cases (ex: NETDEV_NOTIFY_PEERS), that is not possible. This patch adds a new extension to RTM_NEWLINK message called IFLA_EVENT that would have an encoding of the which event triggered this message. This would allow the the message consumer to easily determine if it is interested in a particular event or not. Signed-off-by: Vladislav Yasevich Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 3 +- include/uapi/linux/if_link.h | 21 ++++++++++ net/core/dev.c | 2 +- net/core/rtnetlink.c | 92 +++++++++++++++++++++++++++++++++++++++----- 4 files changed, 107 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 57e54847b0b9..0459018173cf 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -18,7 +18,8 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned change, gfp_t flags); + unsigned change, unsigned long event, + gfp_t flags); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8b405afb2376..97f6d302f627 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -157,6 +157,7 @@ enum { IFLA_GSO_MAX_SIZE, IFLA_PAD, IFLA_XDP, + IFLA_EVENT, __IFLA_MAX }; @@ -899,4 +900,24 @@ enum { #define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) +enum { + IFLA_EVENT_UNSPEC, + IFLA_EVENT_REBOOT, + IFLA_EVENT_CHANGE_MTU, + IFLA_EVENT_CHANGE_ADDR, + IFLA_EVENT_CHANGE_NAME, + IFLA_EVENT_FEAT_CHANGE, + IFLA_EVENT_BONDING_FAILOVER, + IFLA_EVENT_POST_TYPE_CHANGE, + IFLA_EVENT_NOTIFY_PEERS, + IFLA_EVENT_CHANGE_UPPER, + IFLA_EVENT_RESEND_IGMP, + IFLA_EVENT_PRE_CHANGE_MTU, + IFLA_EVENT_CHANGE_INFO_DATA, + IFLA_EVENT_PRE_CHANGE_UPPER, + IFLA_EVENT_CHANGE_LOWER_STATE, + IFLA_EVENT_UDP_TUNNEL_PUSH_INFO, + IFLA_EVENT_CHANGE_TX_QUEUE_LEN, +}; + #endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/net/core/dev.c b/net/core/dev.c index ef9fe60ee294..7efb4178ffef 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6840,7 +6840,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, GFP_KERNEL); /* diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 58419da7961b..b2bd4c9ee860 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -944,6 +944,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size(dev) /* IFLA_XDP */ + + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1276,9 +1277,70 @@ err_cancel: return err; } +static int rtnl_fill_link_event(struct sk_buff *skb, unsigned long event) +{ + u32 rtnl_event; + + switch (event) { + case NETDEV_REBOOT: + rtnl_event = IFLA_EVENT_REBOOT; + break; + case NETDEV_CHANGEMTU: + rtnl_event = IFLA_EVENT_CHANGE_MTU; + break; + case NETDEV_CHANGEADDR: + rtnl_event = IFLA_EVENT_CHANGE_ADDR; + break; + case NETDEV_CHANGENAME: + rtnl_event = IFLA_EVENT_CHANGE_NAME; + break; + case NETDEV_FEAT_CHANGE: + rtnl_event = IFLA_EVENT_FEAT_CHANGE; + break; + case NETDEV_BONDING_FAILOVER: + rtnl_event = IFLA_EVENT_BONDING_FAILOVER; + break; + case NETDEV_POST_TYPE_CHANGE: + rtnl_event = IFLA_EVENT_POST_TYPE_CHANGE; + break; + case NETDEV_NOTIFY_PEERS: + rtnl_event = IFLA_EVENT_NOTIFY_PEERS; + break; + case NETDEV_CHANGEUPPER: + rtnl_event = IFLA_EVENT_CHANGE_UPPER; + break; + case NETDEV_RESEND_IGMP: + rtnl_event = IFLA_EVENT_RESEND_IGMP; + break; + case NETDEV_PRECHANGEMTU: + rtnl_event = IFLA_EVENT_PRE_CHANGE_MTU; + break; + case NETDEV_CHANGEINFODATA: + rtnl_event = IFLA_EVENT_CHANGE_INFO_DATA; + break; + case NETDEV_PRECHANGEUPPER: + rtnl_event = IFLA_EVENT_PRE_CHANGE_UPPER; + break; + case NETDEV_CHANGELOWERSTATE: + rtnl_event = IFLA_EVENT_CHANGE_LOWER_STATE; + break; + case NETDEV_UDP_TUNNEL_PUSH_INFO: + rtnl_event = IFLA_EVENT_UDP_TUNNEL_PUSH_INFO; + break; + case NETDEV_CHANGE_TX_QUEUE_LEN: + rtnl_event = IFLA_EVENT_CHANGE_TX_QUEUE_LEN; + break; + default: + return 0; + } + + return nla_put_u32(skb, IFLA_EVENT, rtnl_event); +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags, u32 ext_filter_mask) + unsigned int flags, u32 ext_filter_mask, + unsigned long event) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1327,6 +1389,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; + if (rtnl_fill_link_event(skb, event)) + goto nla_put_failure; + if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure; @@ -1461,6 +1526,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, + [IFLA_EVENT] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1619,7 +1685,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask); + ext_filter_mask, 0); /* If we ran out of room on the first message, * we're in trouble */ @@ -2710,7 +2776,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh) return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask); + nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2782,7 +2848,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned int change, gfp_t flags) + unsigned int change, + unsigned long event, gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2793,7 +2860,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2814,18 +2881,25 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); } -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) +static void rtmsg_ifinfo_event(int type, struct net_device *dev, + unsigned int change, unsigned long event, + gfp_t flags) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } + +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) +{ + rtmsg_ifinfo_event(type, dev, change, 0, flags); +} EXPORT_SYMBOL(rtmsg_ifinfo); static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -4132,7 +4206,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_CHANGELOWERSTATE: case NETDEV_UDP_TUNNEL_PUSH_INFO: case NETDEV_CHANGE_TX_QUEUE_LEN: - rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); + rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, event, GFP_KERNEL); break; default: break; -- cgit v1.3 From 261a0a54d1ebcd04fb4a1c3f971b2c3b7ae06925 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 6 Apr 2017 16:10:42 +0200 Subject: netlink: uapi: use hex numbers for NLM_F_* flags It's rather confusing that the netlink message flags are numbered 1, 2, 4, 8, 16, 32, , 0x100. Make that more understandable by numbering the lower ones with hex constants as well. Signed-off-by: Johannes Berg Signed-off-by: David S. Miller --- include/uapi/linux/netlink.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index f3946a27bd07..b2c9c26ea30f 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -50,12 +50,12 @@ struct nlmsghdr { /* Flags values */ -#define NLM_F_REQUEST 1 /* It is request message. */ -#define NLM_F_MULTI 2 /* Multipart message, terminated by NLMSG_DONE */ -#define NLM_F_ACK 4 /* Reply with ack, with zero or error code */ -#define NLM_F_ECHO 8 /* Echo this request */ -#define NLM_F_DUMP_INTR 16 /* Dump was inconsistent due to sequence change */ -#define NLM_F_DUMP_FILTERED 32 /* Dump was filtered as requested */ +#define NLM_F_REQUEST 0x01 /* It is request message. */ +#define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */ +#define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */ +#define NLM_F_ECHO 0x08 /* Echo this request */ +#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */ +#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ /* Modifiers to GET request */ #define NLM_F_ROOT 0x100 /* specify tree root */ -- cgit v1.3 From bf74b20d00b13919db7ae5d1015636e76f56f6ae Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 9 Apr 2017 14:45:21 -0700 Subject: Revert "rtnl: Add support for netdev event to link messages" This reverts commit def12888c161e6fec0702e5ec9c3962846e3a21d. As per discussion between Roopa Prabhu and David Ahern, it is advisable that we instead have the code collect the setlink triggered events into a bitmask emitted in the IFLA_EVENT netlink attribute. Signed-off-by: David S. Miller --- include/linux/rtnetlink.h | 3 +- include/uapi/linux/if_link.h | 21 ---------- net/core/dev.c | 2 +- net/core/rtnetlink.c | 92 +++++--------------------------------------- 4 files changed, 11 insertions(+), 107 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index 0459018173cf..57e54847b0b9 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -18,8 +18,7 @@ extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned change, unsigned long event, - gfp_t flags); + unsigned change, gfp_t flags); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 97f6d302f627..8b405afb2376 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -157,7 +157,6 @@ enum { IFLA_GSO_MAX_SIZE, IFLA_PAD, IFLA_XDP, - IFLA_EVENT, __IFLA_MAX }; @@ -900,24 +899,4 @@ enum { #define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) -enum { - IFLA_EVENT_UNSPEC, - IFLA_EVENT_REBOOT, - IFLA_EVENT_CHANGE_MTU, - IFLA_EVENT_CHANGE_ADDR, - IFLA_EVENT_CHANGE_NAME, - IFLA_EVENT_FEAT_CHANGE, - IFLA_EVENT_BONDING_FAILOVER, - IFLA_EVENT_POST_TYPE_CHANGE, - IFLA_EVENT_NOTIFY_PEERS, - IFLA_EVENT_CHANGE_UPPER, - IFLA_EVENT_RESEND_IGMP, - IFLA_EVENT_PRE_CHANGE_MTU, - IFLA_EVENT_CHANGE_INFO_DATA, - IFLA_EVENT_PRE_CHANGE_UPPER, - IFLA_EVENT_CHANGE_LOWER_STATE, - IFLA_EVENT_UDP_TUNNEL_PUSH_INFO, - IFLA_EVENT_CHANGE_TX_QUEUE_LEN, -}; - #endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/net/core/dev.c b/net/core/dev.c index 7efb4178ffef..ef9fe60ee294 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6840,7 +6840,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, + skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, GFP_KERNEL); /* diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b2bd4c9ee860..58419da7961b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -944,7 +944,6 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ + rtnl_xdp_size(dev) /* IFLA_XDP */ - + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1277,70 +1276,9 @@ err_cancel: return err; } -static int rtnl_fill_link_event(struct sk_buff *skb, unsigned long event) -{ - u32 rtnl_event; - - switch (event) { - case NETDEV_REBOOT: - rtnl_event = IFLA_EVENT_REBOOT; - break; - case NETDEV_CHANGEMTU: - rtnl_event = IFLA_EVENT_CHANGE_MTU; - break; - case NETDEV_CHANGEADDR: - rtnl_event = IFLA_EVENT_CHANGE_ADDR; - break; - case NETDEV_CHANGENAME: - rtnl_event = IFLA_EVENT_CHANGE_NAME; - break; - case NETDEV_FEAT_CHANGE: - rtnl_event = IFLA_EVENT_FEAT_CHANGE; - break; - case NETDEV_BONDING_FAILOVER: - rtnl_event = IFLA_EVENT_BONDING_FAILOVER; - break; - case NETDEV_POST_TYPE_CHANGE: - rtnl_event = IFLA_EVENT_POST_TYPE_CHANGE; - break; - case NETDEV_NOTIFY_PEERS: - rtnl_event = IFLA_EVENT_NOTIFY_PEERS; - break; - case NETDEV_CHANGEUPPER: - rtnl_event = IFLA_EVENT_CHANGE_UPPER; - break; - case NETDEV_RESEND_IGMP: - rtnl_event = IFLA_EVENT_RESEND_IGMP; - break; - case NETDEV_PRECHANGEMTU: - rtnl_event = IFLA_EVENT_PRE_CHANGE_MTU; - break; - case NETDEV_CHANGEINFODATA: - rtnl_event = IFLA_EVENT_CHANGE_INFO_DATA; - break; - case NETDEV_PRECHANGEUPPER: - rtnl_event = IFLA_EVENT_PRE_CHANGE_UPPER; - break; - case NETDEV_CHANGELOWERSTATE: - rtnl_event = IFLA_EVENT_CHANGE_LOWER_STATE; - break; - case NETDEV_UDP_TUNNEL_PUSH_INFO: - rtnl_event = IFLA_EVENT_UDP_TUNNEL_PUSH_INFO; - break; - case NETDEV_CHANGE_TX_QUEUE_LEN: - rtnl_event = IFLA_EVENT_CHANGE_TX_QUEUE_LEN; - break; - default: - return 0; - } - - return nla_put_u32(skb, IFLA_EVENT, rtnl_event); -} - static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags, u32 ext_filter_mask, - unsigned long event) + unsigned int flags, u32 ext_filter_mask) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1389,9 +1327,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) goto nla_put_failure; - if (rtnl_fill_link_event(skb, event)) - goto nla_put_failure; - if (rtnl_fill_link_ifmap(skb, dev)) goto nla_put_failure; @@ -1526,7 +1461,6 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_LINK_NETNSID] = { .type = NLA_S32 }, [IFLA_PROTO_DOWN] = { .type = NLA_U8 }, [IFLA_XDP] = { .type = NLA_NESTED }, - [IFLA_EVENT] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1685,7 +1619,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0); + ext_filter_mask); /* If we ran out of room on the first message, * we're in trouble */ @@ -2776,7 +2710,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh) return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, 0, 0, ext_filter_mask, 0); + nlh->nlmsg_seq, 0, 0, ext_filter_mask); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2848,8 +2782,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, - unsigned int change, - unsigned long event, gfp_t flags) + unsigned int change, gfp_t flags) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -2860,7 +2793,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, if (skb == NULL) goto errout; - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0, event); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2881,25 +2814,18 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); } -static void rtmsg_ifinfo_event(int type, struct net_device *dev, - unsigned int change, unsigned long event, - gfp_t flags) +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, + gfp_t flags) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags); + skb = rtmsg_ifinfo_build_skb(type, dev, change, flags); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } - -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) -{ - rtmsg_ifinfo_event(type, dev, change, 0, flags); -} EXPORT_SYMBOL(rtmsg_ifinfo); static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -4206,7 +4132,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_CHANGELOWERSTATE: case NETDEV_UDP_TUNNEL_PUSH_INFO: case NETDEV_CHANGE_TX_QUEUE_LEN: - rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, event, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); break; default: break; -- cgit v1.3 From 3c60a531b9e175693a2d61f6bfd7ffacce4146cd Mon Sep 17 00:00:00 2001 From: Alexander Alemayhu Date: Sat, 8 Apr 2017 22:08:10 +0200 Subject: bpf: fix comment typo o s/bpf_bpf_get_socket_cookie/bpf_get_socket_cookie Signed-off-by: Alexander Alemayhu Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a1d95386f562..1e062bb54eec 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -472,7 +472,7 @@ union bpf_attr { * > 0 length of the string including the trailing NUL on success * < 0 error * - * u64 bpf_bpf_get_socket_cookie(skb) + * u64 bpf_get_socket_cookie(skb) * Get the cookie for the socket stored inside sk_buff. * @skb: pointer to skb * Return: 8 Bytes non-decreasing number on success or 0 if the socket diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a1d95386f562..1e062bb54eec 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -472,7 +472,7 @@ union bpf_attr { * > 0 length of the string including the trailing NUL on success * < 0 error * - * u64 bpf_bpf_get_socket_cookie(skb) + * u64 bpf_get_socket_cookie(skb) * Get the cookie for the socket stored inside sk_buff. * @skb: pointer to skb * Return: 8 Bytes non-decreasing number on success or 0 if the socket -- cgit v1.3 From 2d4bc93368f5a0ddb57c8c885cdad9c9b7a10ed5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 12 Apr 2017 14:34:04 +0200 Subject: netlink: extended ACK reporting Add the base infrastructure and UAPI for netlink extended ACK reporting. All "manual" calls to netlink_ack() pass NULL for now and thus don't get extended ACK reporting. Big thanks goes to Pablo Neira Ayuso for not only bringing up the whole topic at netconf (again) but also coming up with the nlattr passing trick and various other ideas. Signed-off-by: Johannes Berg Reviewed-by: David Ahern Signed-off-by: David S. Miller --- crypto/crypto_user.c | 3 +- drivers/infiniband/core/netlink.c | 5 +-- drivers/scsi/scsi_netlink.c | 2 +- include/linux/netlink.h | 26 +++++++++++++- include/net/netlink.h | 3 +- include/uapi/linux/netlink.h | 32 ++++++++++++++++++ kernel/audit.c | 2 +- net/core/rtnetlink.c | 3 +- net/core/sock_diag.c | 3 +- net/decnet/netfilter/dn_rtmsg.c | 2 +- net/hsr/hsr_netlink.c | 4 +-- net/netfilter/ipset/ip_set_core.c | 2 +- net/netfilter/nfnetlink.c | 22 ++++++------ net/netlink/af_netlink.c | 71 ++++++++++++++++++++++++++++++++++----- net/netlink/af_netlink.h | 1 + net/netlink/genetlink.c | 3 +- net/xfrm/xfrm_user.c | 3 +- 17 files changed, 153 insertions(+), 34 deletions(-) (limited to 'include/uapi/linux') diff --git a/crypto/crypto_user.c b/crypto/crypto_user.c index a90404a0c5ff..4a44830741c1 100644 --- a/crypto/crypto_user.c +++ b/crypto/crypto_user.c @@ -483,7 +483,8 @@ static const struct crypto_link { [CRYPTO_MSG_DELRNG - CRYPTO_MSG_BASE] = { .doit = crypto_del_rng }, }; -static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct nlattr *attrs[CRYPTOCFGA_MAX+1]; const struct crypto_link *link; diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c index 10469b0088b5..b784055423c8 100644 --- a/drivers/infiniband/core/netlink.c +++ b/drivers/infiniband/core/netlink.c @@ -146,7 +146,8 @@ nla_put_failure: } EXPORT_SYMBOL(ibnl_put_attr); -static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct ibnl_client *client; int type = nlh->nlmsg_type; @@ -209,7 +210,7 @@ static void ibnl_rcv_reply_skb(struct sk_buff *skb) if (nlh->nlmsg_flags & NLM_F_REQUEST) return; - ibnl_rcv_msg(skb, nlh); + ibnl_rcv_msg(skb, nlh, NULL); msglen = NLMSG_ALIGN(nlh->nlmsg_len); if (msglen > skb->len) diff --git a/drivers/scsi/scsi_netlink.c b/drivers/scsi/scsi_netlink.c index 109802f776ed..50e624fb8307 100644 --- a/drivers/scsi/scsi_netlink.c +++ b/drivers/scsi/scsi_netlink.c @@ -111,7 +111,7 @@ scsi_nl_rcv_msg(struct sk_buff *skb) next_msg: if ((err) || (nlh->nlmsg_flags & NLM_F_ACK)) - netlink_ack(skb, nlh, err); + netlink_ack(skb, nlh, err, NULL); skb_pull(skb, rlen); } diff --git a/include/linux/netlink.h b/include/linux/netlink.h index da14ab61f363..60e7137f840d 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -62,11 +62,35 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) return __netlink_kernel_create(net, unit, THIS_MODULE, cfg); } +/** + * struct netlink_ext_ack - netlink extended ACK report struct + * @_msg: message string to report - don't access directly, use + * %NL_SET_ERR_MSG + * @bad_attr: attribute with error + */ +struct netlink_ext_ack { + const char *_msg; + const struct nlattr *bad_attr; +}; + +/* Always use this macro, this allows later putting the + * message into a separate section or such for things + * like translation or listing all possible messages. + * Currently string formatting is not supported (due + * to the lack of an output buffer.) + */ +#define NL_SET_ERR_MSG(extack, msg) do { \ + static const char _msg[] = (msg); \ + \ + (extack)->_msg = _msg; \ +} while (0) + extern void netlink_kernel_release(struct sock *sk); extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups); extern int netlink_change_ngroups(struct sock *sk, unsigned int groups); extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group); -extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); +extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, + const struct netlink_ext_ack *extack); extern int netlink_has_listeners(struct sock *sk, unsigned int group); extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); diff --git a/include/net/netlink.h b/include/net/netlink.h index b239fcd33d80..a064ec3e2ee1 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -233,7 +233,8 @@ struct nl_info { }; int netlink_rcv_skb(struct sk_buff *skb, - int (*cb)(struct sk_buff *, struct nlmsghdr *)); + int (*cb)(struct sk_buff *, struct nlmsghdr *, + struct netlink_ext_ack *)); int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags); diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index b2c9c26ea30f..7df88770e029 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -69,6 +69,10 @@ struct nlmsghdr { #define NLM_F_CREATE 0x400 /* Create, if it does not exist */ #define NLM_F_APPEND 0x800 /* Add to end of list */ +/* Flags for ACK message */ +#define NLM_F_CAPPED 0x100 /* request was capped */ +#define NLM_F_ACK_TLVS 0x200 /* extended ACK TVLs were included */ + /* 4.4BSD ADD NLM_F_CREATE|NLM_F_EXCL 4.4BSD CHANGE NLM_F_REPLACE @@ -101,6 +105,33 @@ struct nlmsghdr { struct nlmsgerr { int error; struct nlmsghdr msg; + /* + * followed by the message contents unless NETLINK_CAP_ACK was set + * or the ACK indicates success (error == 0) + * message length is aligned with NLMSG_ALIGN() + */ + /* + * followed by TLVs defined in enum nlmsgerr_attrs + * if NETLINK_EXT_ACK was set + */ +}; + +/** + * enum nlmsgerr_attrs - nlmsgerr attributes + * @NLMSGERR_ATTR_UNUSED: unused + * @NLMSGERR_ATTR_MSG: error message string (string) + * @NLMSGERR_ATTR_OFFS: offset of the invalid attribute in the original + * message, counting from the beginning of the header (u32) + * @__NLMSGERR_ATTR_MAX: number of attributes + * @NLMSGERR_ATTR_MAX: highest attribute number + */ +enum nlmsgerr_attrs { + NLMSGERR_ATTR_UNUSED, + NLMSGERR_ATTR_MSG, + NLMSGERR_ATTR_OFFS, + + __NLMSGERR_ATTR_MAX, + NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 }; #define NETLINK_ADD_MEMBERSHIP 1 @@ -115,6 +146,7 @@ struct nlmsgerr { #define NETLINK_LISTEN_ALL_NSID 8 #define NETLINK_LIST_MEMBERSHIPS 9 #define NETLINK_CAP_ACK 10 +#define NETLINK_EXT_ACK 11 struct nl_pktinfo { __u32 group; diff --git a/kernel/audit.c b/kernel/audit.c index 2f4964cfde0b..d54bf5932374 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1402,7 +1402,7 @@ static void audit_receive_skb(struct sk_buff *skb) err = audit_receive_msg(skb, nlh); /* if err or if this message says it wants a response */ if (err || (nlh->nlmsg_flags & NLM_F_ACK)) - netlink_ack(skb, nlh, err); + netlink_ack(skb, nlh, err, NULL); nlh = nlmsg_next(nlh, &len); } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c138b6b75e59..3cc4a627a537 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -4046,7 +4046,8 @@ out: /* Process one rtnetlink message. */ -static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); rtnl_doit_func doit; diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index fb9d0e2fd148..217f4e3b82f6 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -238,7 +238,8 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } -static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { int ret; diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 85f2fdc360c2..c8bf5136a72b 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -96,7 +96,7 @@ static unsigned int dnrmg_hook(void *priv, } -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err), NULL); return; } while (0) static inline void dnrmg_receive_user_skb(struct sk_buff *skb) { diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 1ab30e7d3f99..81dac16933fc 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -350,7 +350,7 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) return 0; invalid: - netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL); + netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL); return 0; nla_put_failure: @@ -432,7 +432,7 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) return 0; invalid: - netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL); + netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL); return 0; nla_put_failure: diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index c296f9b606d4..26356bf8cebf 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -1305,7 +1305,7 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) * manually :-( */ if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(cb->skb, nlh, ret); + netlink_ack(cb->skb, nlh, ret, NULL); return ret; } } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 68eda920160e..181d3bb800e6 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -148,7 +148,8 @@ int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, EXPORT_SYMBOL_GPL(nfnetlink_unicast); /* Process one complete nfnetlink message. */ -static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); const struct nfnl_callback *nc; @@ -261,7 +262,7 @@ static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb) struct nfnl_err *nfnl_err, *next; list_for_each_entry_safe(nfnl_err, next, err_list, head) { - netlink_ack(skb, nfnl_err->nlh, nfnl_err->err); + netlink_ack(skb, nfnl_err->nlh, nfnl_err->err, NULL); nfnl_err_del(nfnl_err); } } @@ -284,13 +285,13 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, int err; if (subsys_id >= NFNL_SUBSYS_COUNT) - return netlink_ack(skb, nlh, -EINVAL); + return netlink_ack(skb, nlh, -EINVAL, NULL); replay: status = 0; skb = netlink_skb_clone(oskb, GFP_KERNEL); if (!skb) - return netlink_ack(oskb, nlh, -ENOMEM); + return netlink_ack(oskb, nlh, -ENOMEM, NULL); nfnl_lock(subsys_id); ss = nfnl_dereference_protected(subsys_id); @@ -304,20 +305,20 @@ replay: #endif { nfnl_unlock(subsys_id); - netlink_ack(oskb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return kfree_skb(skb); } } if (!ss->commit || !ss->abort) { nfnl_unlock(subsys_id); - netlink_ack(oskb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return kfree_skb(skb); } if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) { nfnl_unlock(subsys_id); - netlink_ack(oskb, nlh, -ERESTART); + netlink_ack(oskb, nlh, -ERESTART, NULL); return kfree_skb(skb); } @@ -407,7 +408,8 @@ ack: * pointing to the batch header. */ nfnl_err_reset(&err_list); - netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM); + netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM, + NULL); status |= NFNL_BATCH_FAILURE; goto done; } @@ -467,7 +469,7 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy); if (err < 0) { - netlink_ack(skb, nlh, err); + netlink_ack(skb, nlh, err, NULL); return; } if (cda[NFNL_BATCH_GENID]) @@ -493,7 +495,7 @@ static void nfnetlink_rcv(struct sk_buff *skb) return; if (!netlink_net_capable(skb, CAP_NET_ADMIN)) { - netlink_ack(skb, nlh, -EPERM); + netlink_ack(skb, nlh, -EPERM, NULL); return; } diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index fc232441cf23..c1564768000e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1652,6 +1652,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, nlk->flags &= ~NETLINK_F_CAP_ACK; err = 0; break; + case NETLINK_EXT_ACK: + if (val) + nlk->flags |= NETLINK_F_EXT_ACK; + else + nlk->flags &= ~NETLINK_F_EXT_ACK; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1736,6 +1743,15 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; + case NETLINK_EXT_ACK: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_F_EXT_ACK ? 1 : 0; + if (put_user(len, optlen) || put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2267,21 +2283,40 @@ error_free: } EXPORT_SYMBOL(__netlink_dump_start); -void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) +void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, + const struct netlink_ext_ack *extack) { struct sk_buff *skb; struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); + size_t tlvlen = 0; struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); + unsigned int flags = 0; /* Error messages get the original request appened, unless the user - * requests to cap the error message. + * requests to cap the error message, and get extra error data if + * requested. */ - if (!(nlk->flags & NETLINK_F_CAP_ACK) && err) - payload += nlmsg_len(nlh); + if (err) { + if (!(nlk->flags & NETLINK_F_CAP_ACK)) + payload += nlmsg_len(nlh); + else + flags |= NLM_F_CAPPED; + if (nlk->flags & NETLINK_F_EXT_ACK && extack) { + if (extack->_msg) + tlvlen += nla_total_size(strlen(extack->_msg) + 1); + if (extack->bad_attr) + tlvlen += nla_total_size(sizeof(u32)); + } + } else { + flags |= NLM_F_CAPPED; + } - skb = nlmsg_new(payload, GFP_KERNEL); + if (tlvlen) + flags |= NLM_F_ACK_TLVS; + + skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) { struct sock *sk; @@ -2297,17 +2332,35 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) } rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - NLMSG_ERROR, payload, 0); + NLMSG_ERROR, payload, flags); errmsg = nlmsg_data(rep); errmsg->error = err; memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); + + if (err && nlk->flags & NETLINK_F_EXT_ACK && extack) { + if (extack->_msg) + WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, + extack->_msg)); + if (extack->bad_attr && + !WARN_ON((u8 *)extack->bad_attr < in_skb->data || + (u8 *)extack->bad_attr >= in_skb->data + + in_skb->len)) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, + (u8 *)extack->bad_attr - + in_skb->data)); + } + + nlmsg_end(skb, rep); + netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); } EXPORT_SYMBOL(netlink_ack); int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, - struct nlmsghdr *)) + struct nlmsghdr *, + struct netlink_ext_ack *)) { + struct netlink_ext_ack extack = {}; struct nlmsghdr *nlh; int err; @@ -2328,13 +2381,13 @@ int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; - err = cb(skb, nlh); + err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) - netlink_ack(skb, nlh, err); + netlink_ack(skb, nlh, err, &extack); skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index f792f8d7f982..3490f2430532 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -13,6 +13,7 @@ #define NETLINK_F_RECV_NO_ENOBUFS 0x8 #define NETLINK_F_LISTEN_ALL_NSID 0x10 #define NETLINK_F_CAP_ACK 0x20 +#define NETLINK_F_EXT_ACK 0x40 #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 92e0981f7404..57b2e3648bc0 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -605,7 +605,8 @@ out: return err; } -static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { const struct genl_family *family; int err; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 4f7e62ddc17e..e93d5c0471b2 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2448,7 +2448,8 @@ static const struct xfrm_link { [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, }; -static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *attrs[XFRMA_MAX+1]; -- cgit v1.3 From ba0dc5f6e0ba5a5d2f575bcdb35e5d1960cf7c04 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 12 Apr 2017 14:34:06 +0200 Subject: netlink: allow sending extended ACK with cookie on success Now that we have extended error reporting and a new message format for netlink ACK messages, also extend this to be able to return arbitrary cookie data on success. This will allow, for example, nl80211 to not send an extra message for cookies identifying newly created objects, but return those directly in the ACK message. The cookie data size is currently limited to 20 bytes (since Jamal talked about using SHA1 for identifiers.) Thanks to Jamal Hadi Salim for bringing up this idea during the discussions. Signed-off-by: Johannes Berg Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller --- include/linux/netlink.h | 7 +++++++ include/uapi/linux/netlink.h | 4 ++++ net/netlink/af_netlink.c | 33 ++++++++++++++++++++++----------- 3 files changed, 33 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 60e7137f840d..8d2a8924705c 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -62,15 +62,22 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) return __netlink_kernel_create(net, unit, THIS_MODULE, cfg); } +/* this can be increased when necessary - don't expose to userland */ +#define NETLINK_MAX_COOKIE_LEN 20 + /** * struct netlink_ext_ack - netlink extended ACK report struct * @_msg: message string to report - don't access directly, use * %NL_SET_ERR_MSG * @bad_attr: attribute with error + * @cookie: cookie data to return to userspace (for success) + * @cookie_len: actual cookie data length */ struct netlink_ext_ack { const char *_msg; const struct nlattr *bad_attr; + u8 cookie[NETLINK_MAX_COOKIE_LEN]; + u8 cookie_len; }; /* Always use this macro, this allows later putting the diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index 7df88770e029..f86127a46cfc 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -122,6 +122,9 @@ struct nlmsgerr { * @NLMSGERR_ATTR_MSG: error message string (string) * @NLMSGERR_ATTR_OFFS: offset of the invalid attribute in the original * message, counting from the beginning of the header (u32) + * @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to + * be used - in the success case - to identify a created + * object or operation or similar (binary) * @__NLMSGERR_ATTR_MAX: number of attributes * @NLMSGERR_ATTR_MAX: highest attribute number */ @@ -129,6 +132,7 @@ enum nlmsgerr_attrs { NLMSGERR_ATTR_UNUSED, NLMSGERR_ATTR_MSG, NLMSGERR_ATTR_OFFS, + NLMSGERR_ATTR_COOKIE, __NLMSGERR_ATTR_MAX, NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c1564768000e..ee841f00a6ec 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2311,6 +2311,10 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, } } else { flags |= NLM_F_CAPPED; + + if (nlk->flags & NETLINK_F_EXT_ACK && + extack && extack->cookie_len) + tlvlen += nla_total_size(extack->cookie_len); } if (tlvlen) @@ -2337,17 +2341,24 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, errmsg->error = err; memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); - if (err && nlk->flags & NETLINK_F_EXT_ACK && extack) { - if (extack->_msg) - WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, - extack->_msg)); - if (extack->bad_attr && - !WARN_ON((u8 *)extack->bad_attr < in_skb->data || - (u8 *)extack->bad_attr >= in_skb->data + - in_skb->len)) - WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, - (u8 *)extack->bad_attr - - in_skb->data)); + if (nlk->flags & NETLINK_F_EXT_ACK && extack) { + if (err) { + if (extack->_msg) + WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, + extack->_msg)); + if (extack->bad_attr && + !WARN_ON((u8 *)extack->bad_attr < in_skb->data || + (u8 *)extack->bad_attr >= in_skb->data + + in_skb->len)) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, + (u8 *)extack->bad_attr - + in_skb->data)); + } else { + if (extack->cookie_len) + WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, + extack->cookie_len, + extack->cookie)); + } } nlmsg_end(skb, rep); -- cgit v1.3 From d77e38e612a017480157fe6d2c1422f42cb5b7e3 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Fri, 14 Apr 2017 10:06:10 +0200 Subject: xfrm: Add an IPsec hardware offloading API This patch adds all the bits that are needed to do IPsec hardware offload for IPsec states and ESP packets. We add xfrmdev_ops to the net_device. xfrmdev_ops has function pointers that are needed to manage the xfrm states in the hardware and to do a per packet offloading decision. Joint work with: Ilan Tayari Guy Shapiro Yossi Kuperman Signed-off-by: Guy Shapiro Signed-off-by: Ilan Tayari Signed-off-by: Yossi Kuperman Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 14 +++++ include/net/xfrm.h | 65 +++++++++++++++++++++- include/uapi/linux/xfrm.h | 8 +++ net/ipv4/esp4.c | 7 +-- net/ipv4/xfrm4_output.c | 3 +- net/ipv6/esp6.c | 4 +- net/ipv6/xfrm6_output.c | 9 ++- net/xfrm/Makefile | 3 +- net/xfrm/xfrm_device.c | 138 +++++++++++++++++++++++++++++++++++++++++++++- net/xfrm/xfrm_input.c | 41 +++++++++++++- net/xfrm/xfrm_output.c | 44 +++++++++++++-- net/xfrm/xfrm_policy.c | 10 ++-- net/xfrm/xfrm_state.c | 74 +++++++++++++++++++++++++ net/xfrm/xfrm_user.c | 28 ++++++++++ 14 files changed, 424 insertions(+), 24 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5bb03d181848..b3eb83db0223 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -824,6 +824,16 @@ struct netdev_xdp { }; }; +#ifdef CONFIG_XFRM_OFFLOAD +struct xfrmdev_ops { + int (*xdo_dev_state_add) (struct xfrm_state *x); + void (*xdo_dev_state_delete) (struct xfrm_state *x); + void (*xdo_dev_state_free) (struct xfrm_state *x); + bool (*xdo_dev_offload_ok) (struct sk_buff *skb, + struct xfrm_state *x); +}; +#endif + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1697,6 +1707,10 @@ struct net_device { const struct ndisc_ops *ndisc_ops; #endif +#ifdef CONFIG_XFRM + const struct xfrmdev_ops *xfrmdev_ops; +#endif + const struct header_ops *header_ops; unsigned int flags; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 54515d989365..17603bf190c1 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -120,6 +120,13 @@ struct xfrm_state_walk { struct xfrm_address_filter *filter; }; +struct xfrm_state_offload { + struct net_device *dev; + unsigned long offload_handle; + unsigned int num_exthdrs; + u8 flags; +}; + /* Full description of state of transformer. */ struct xfrm_state { possible_net_t xs_net; @@ -207,6 +214,8 @@ struct xfrm_state { struct xfrm_lifetime_cur curlft; struct tasklet_hrtimer mtimer; + struct xfrm_state_offload xso; + /* used to fix curlft->add_time when changing date */ long saved_tmo; @@ -1453,7 +1462,6 @@ struct xfrm6_tunnel { void xfrm_init(void); void xfrm4_init(void); int xfrm_state_init(struct net *net); -void xfrm_dev_init(void); void xfrm_state_fini(struct net *net); void xfrm4_state_init(void); void xfrm4_protocol_init(void); @@ -1559,6 +1567,7 @@ struct xfrmk_spdinfo { struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid); +int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); @@ -1641,6 +1650,11 @@ static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) } #endif +struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr, + int family); + struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp); void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type); @@ -1846,6 +1860,55 @@ static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) } #endif +#ifdef CONFIG_XFRM_OFFLOAD +void __net_init xfrm_dev_init(void); +int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, + struct xfrm_user_offload *xuo); +bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); + +static inline void xfrm_dev_state_delete(struct xfrm_state *x) +{ + struct xfrm_state_offload *xso = &x->xso; + + if (xso->dev) + xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); +} + +static inline void xfrm_dev_state_free(struct xfrm_state *x) +{ + struct xfrm_state_offload *xso = &x->xso; + struct net_device *dev = xso->dev; + + if (dev && dev->xfrmdev_ops) { + dev->xfrmdev_ops->xdo_dev_state_free(x); + xso->dev = NULL; + dev_put(dev); + } +} +#else +static inline void __net_init xfrm_dev_init(void) +{ +} + +static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo) +{ + return 0; +} + +static inline void xfrm_dev_state_delete(struct xfrm_state *x) +{ +} + +static inline void xfrm_dev_state_free(struct xfrm_state *x) +{ +} + +static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) +{ + return false; +} +#endif + static inline int xfrm_mark_get(struct nlattr **attrs, struct xfrm_mark *m) { if (attrs[XFRMA_MARK]) diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 1fc62b239f1b..2b384ff09fa0 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -303,6 +303,7 @@ enum xfrm_attr_type_t { XFRMA_PROTO, /* __u8 */ XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ XFRMA_PAD, + XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ __XFRMA_MAX #define XFRMA_MAX (__XFRMA_MAX - 1) @@ -494,6 +495,13 @@ struct xfrm_address_filter { __u8 dplen; }; +struct xfrm_user_offload { + int ifindex; + __u8 flags; +}; +#define XFRM_OFFLOAD_IPV6 1 +#define XFRM_OFFLOAD_INBOUND 2 + #ifndef __KERNEL__ /* backwards compatibility for userspace */ #define XFRMGRP_ACQUIRE 1 diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b1e24446e297..c6aba234b6e9 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -435,9 +435,6 @@ skip_cow2: aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv); aead_request_set_ad(req, assoclen); - seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + - ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); - memset(iv, 0, ivlen); memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8), min(ivlen, 8)); @@ -470,6 +467,7 @@ static int esp_input_done2(struct sk_buff *skb, int err) { const struct iphdr *iph; struct xfrm_state *x = xfrm_input_state(skb); + struct xfrm_offload *xo = xfrm_offload(skb); struct crypto_aead *aead = x->data; int alen = crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); @@ -478,7 +476,8 @@ static int esp_input_done2(struct sk_buff *skb, int err) u8 nexthdr[2]; int padlen; - kfree(ESP_SKB_CB(skb)->tmp); + if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) + kfree(ESP_SKB_CB(skb)->tmp); if (unlikely(err)) goto out; diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 7ee6518afa86..94b8702603bc 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -29,7 +29,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) goto out; mtu = dst_mtu(skb_dst(skb)); - if (skb->len > mtu) { + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && skb_gso_network_seglen(skb) > ip_skb_dst_mtu(skb->sk, skb))) { skb->protocol = htons(ETH_P_IP); if (skb->sk) diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index ff54faa75631..3d3757d20d0a 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -450,6 +450,7 @@ error: static int esp_input_done2(struct sk_buff *skb, int err) { struct xfrm_state *x = xfrm_input_state(skb); + struct xfrm_offload *xo = xfrm_offload(skb); struct crypto_aead *aead = x->data; int alen = crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); @@ -458,7 +459,8 @@ static int esp_input_done2(struct sk_buff *skb, int err) int padlen; u8 nexthdr[2]; - kfree(ESP_SKB_CB(skb)->tmp); + if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) + kfree(ESP_SKB_CB(skb)->tmp); if (unlikely(err)) goto out; diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 4d09ce6fa90e..8ae87d4ec5ff 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -73,11 +73,16 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) int mtu, ret = 0; struct dst_entry *dst = skb_dst(skb); + if (skb->ignore_df) + goto out; + mtu = dst_mtu(dst); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (!skb->ignore_df && skb->len > mtu) { + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && + skb_gso_network_seglen(skb) > ip6_skb_dst_mtu(skb))) { skb->dev = dst->dev; skb->protocol = htons(ETH_P_IPV6); @@ -89,7 +94,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ret = -EMSGSIZE; } - +out: return ret; } diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 55b2ac300995..abf81b329dc1 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -4,7 +4,8 @@ obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ xfrm_input.o xfrm_output.o \ - xfrm_sysctl.o xfrm_replay.o xfrm_device.o + xfrm_sysctl.o xfrm_replay.o +obj-$(CONFIG_XFRM_OFFLOAD) += xfrm_device.o obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 34a260a61be9..9bac2ba9052c 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -22,13 +22,149 @@ #include #include +int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, + struct xfrm_user_offload *xuo) +{ + int err; + struct dst_entry *dst; + struct net_device *dev; + struct xfrm_state_offload *xso = &x->xso; + xfrm_address_t *saddr; + xfrm_address_t *daddr; + + if (!x->type_offload) + return 0; + + /* We don't yet support UDP encapsulation, TFC padding and ESN. */ + if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN)) + return 0; + + dev = dev_get_by_index(net, xuo->ifindex); + if (!dev) { + if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { + saddr = &x->props.saddr; + daddr = &x->id.daddr; + } else { + saddr = &x->id.daddr; + daddr = &x->props.saddr; + } + + dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, x->props.family); + if (IS_ERR(dst)) + return 0; + + dev = dst->dev; + + dev_hold(dev); + dst_release(dst); + } + + if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { + dev_put(dev); + return 0; + } + + xso->dev = dev; + xso->num_exthdrs = 1; + xso->flags = xuo->flags; + + err = dev->xfrmdev_ops->xdo_dev_state_add(x); + if (err) { + dev_put(dev); + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(xfrm_dev_state_add); + +bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) +{ + int mtu; + struct dst_entry *dst = skb_dst(skb); + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct net_device *dev = x->xso.dev; + + if (!x->type_offload || x->encap) + return false; + + if ((x->xso.offload_handle && (dev == dst->path->dev)) && + !dst->child->xfrm && x->type->get_mtu) { + mtu = x->type->get_mtu(x, xdst->child_mtu_cached); + + if (skb->len <= mtu) + goto ok; + + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + goto ok; + } + + return false; + +ok: + if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_offload_ok) + return x->xso.dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x); + + return true; +} +EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok); + +int xfrm_dev_register(struct net_device *dev) +{ + if ((dev->features & NETIF_F_HW_ESP) && !dev->xfrmdev_ops) + return NOTIFY_BAD; + if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) && + !(dev->features & NETIF_F_HW_ESP)) + return NOTIFY_BAD; + + return NOTIFY_DONE; +} + +static int xfrm_dev_unregister(struct net_device *dev) +{ + return NOTIFY_DONE; +} + +static int xfrm_dev_feat_change(struct net_device *dev) +{ + if ((dev->features & NETIF_F_HW_ESP) && !dev->xfrmdev_ops) + return NOTIFY_BAD; + else if (!(dev->features & NETIF_F_HW_ESP)) + dev->xfrmdev_ops = NULL; + + if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) && + !(dev->features & NETIF_F_HW_ESP)) + return NOTIFY_BAD; + + return NOTIFY_DONE; +} + +static int xfrm_dev_down(struct net_device *dev) +{ + if (dev->hw_features & NETIF_F_HW_ESP) + xfrm_dev_state_flush(dev_net(dev), dev, true); + + xfrm_garbage_collect(dev_net(dev)); + + return NOTIFY_DONE; +} + static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { + case NETDEV_REGISTER: + return xfrm_dev_register(dev); + + case NETDEV_UNREGISTER: + return xfrm_dev_unregister(dev); + + case NETDEV_FEAT_CHANGE: + return xfrm_dev_feat_change(dev); + case NETDEV_DOWN: - xfrm_garbage_collect(dev_net(dev)); + return xfrm_dev_down(dev); } return NOTIFY_DONE; } diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 46bdb4fbed0b..362d655eac27 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -107,6 +107,8 @@ struct sec_path *secpath_dup(struct sec_path *src) sp->len = 0; sp->olen = 0; + memset(sp->ovec, 0, sizeof(sp->ovec[XFRM_MAX_OFFLOAD_DEPTH])); + if (src) { int i; @@ -207,8 +209,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) unsigned int family; int decaps = 0; int async = 0; - struct xfrm_offload *xo; bool xfrm_gro = false; + bool crypto_done = false; + struct xfrm_offload *xo = xfrm_offload(skb); if (encap_type < 0) { x = xfrm_input_state(skb); @@ -226,6 +229,37 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) goto lock; } + if (xo && (xo->flags & CRYPTO_DONE)) { + crypto_done = true; + x = xfrm_input_state(skb); + family = XFRM_SPI_SKB_CB(skb)->family; + + if (!(xo->status & CRYPTO_SUCCESS)) { + if (xo->status & + (CRYPTO_TRANSPORT_AH_AUTH_FAILED | + CRYPTO_TRANSPORT_ESP_AUTH_FAILED | + CRYPTO_TUNNEL_AH_AUTH_FAILED | + CRYPTO_TUNNEL_ESP_AUTH_FAILED)) { + + xfrm_audit_state_icvfail(x, skb, + x->type->proto); + x->stats.integrity_failed++; + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); + goto drop; + } + + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto drop; + } + + if ((err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); + goto drop; + } + + goto lock; + } + daddr = (xfrm_address_t *)(skb_network_header(skb) + XFRM_SPI_SKB_CB(skb)->daddroff); family = XFRM_SPI_SKB_CB(skb)->family; @@ -311,7 +345,10 @@ lock: skb_dst_force(skb); dev_hold(skb->dev); - nexthdr = x->type->input(x, skb); + if (crypto_done) + nexthdr = x->type_offload->input_tail(x, skb); + else + nexthdr = x->type->input(x, skb); if (nexthdr == -EINPROGRESS) return 0; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 8ba29fe58352..a15088613a6c 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -99,12 +99,13 @@ static int xfrm_output_one(struct sk_buff *skb, int err) skb_dst_force(skb); - /* Inner headers are invalid now. */ - skb->encapsulation = 0; - - err = x->type->output(x, skb); - if (err == -EINPROGRESS) - goto out; + if (xfrm_offload(skb)) { + x->type_offload->encap(x, skb); + } else { + err = x->type->output(x, skb); + if (err == -EINPROGRESS) + goto out; + } resume: if (err) { @@ -200,8 +201,38 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb int xfrm_output(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); + struct xfrm_state *x = skb_dst(skb)->xfrm; int err; + secpath_reset(skb); + + if (xfrm_dev_offload_ok(skb, x)) { + struct sec_path *sp; + + sp = secpath_dup(skb->sp); + if (!sp) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + return -ENOMEM; + } + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + + sp->olen++; + sp->xvec[skb->sp->len++] = x; + xfrm_state_hold(x); + + if (skb_is_gso(skb)) { + skb_shinfo(skb)->gso_type |= SKB_GSO_ESP; + + return xfrm_output2(net, sk, skb); + } + + if (x->xso.dev && x->xso.dev->features & NETIF_F_HW_ESP_TX_CSUM) + goto out; + } + if (skb_is_gso(skb)) return xfrm_output_gso(net, sk, skb); @@ -214,6 +245,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) } } +out: return xfrm_output2(net, sk, skb); } EXPORT_SYMBOL_GPL(xfrm_output); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 7befca2a0773..dd44ddc1aea5 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -116,11 +116,10 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa return afinfo; } -static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, - int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - int family) +struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr, + int family) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; @@ -135,6 +134,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, return dst; } +EXPORT_SYMBOL(__xfrm_dst_lookup); static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, int oif, diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 47fefe97d1e3..fc3c5aa38754 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -440,6 +440,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) x->type->destructor(x); xfrm_put_type(x->type); } + xfrm_dev_state_free(x); security_xfrm_state_free(x); kfree(x); } @@ -609,6 +610,8 @@ int __xfrm_state_delete(struct xfrm_state *x) net->xfrm.state_num--; spin_unlock(&net->xfrm.xfrm_state_lock); + xfrm_dev_state_delete(x); + /* All xfrm_state objects are created by xfrm_state_alloc. * The xfrm_state_alloc call gives a reference, and that * is what we are dropping here. @@ -653,12 +656,41 @@ xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) return err; } + +static inline int +xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) +{ + int i, err = 0; + + for (i = 0; i <= net->xfrm.state_hmask; i++) { + struct xfrm_state *x; + struct xfrm_state_offload *xso; + + hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { + xso = &x->xso; + + if (xso->dev == dev && + (err = security_xfrm_state_delete(x)) != 0) { + xfrm_audit_state_delete(x, 0, task_valid); + return err; + } + } + } + + return err; +} #else static inline int xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) { return 0; } + +static inline int +xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) +{ + return 0; +} #endif int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) @@ -701,6 +733,48 @@ out: } EXPORT_SYMBOL(xfrm_state_flush); +int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid) +{ + int i, err = 0, cnt = 0; + + spin_lock_bh(&net->xfrm.xfrm_state_lock); + err = xfrm_dev_state_flush_secctx_check(net, dev, task_valid); + if (err) + goto out; + + err = -ESRCH; + for (i = 0; i <= net->xfrm.state_hmask; i++) { + struct xfrm_state *x; + struct xfrm_state_offload *xso; +restart: + hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { + xso = &x->xso; + + if (!xfrm_state_kern(x) && xso->dev == dev) { + xfrm_state_hold(x); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + + err = xfrm_state_delete(x); + xfrm_audit_state_delete(x, err ? 0 : 1, + task_valid); + xfrm_state_put(x); + if (!err) + cnt++; + + spin_lock_bh(&net->xfrm.xfrm_state_lock); + goto restart; + } + } + } + if (cnt) + err = 0; + +out: + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + return err; +} +EXPORT_SYMBOL(xfrm_dev_state_flush); + void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) { spin_lock_bh(&net->xfrm.xfrm_state_lock); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 4f7e62ddc17e..de3332e3f9e2 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -595,6 +595,10 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; } + if (attrs[XFRMA_OFFLOAD_DEV] && + xfrm_dev_state_add(net, x, nla_data(attrs[XFRMA_OFFLOAD_DEV]))) + goto error; + if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) goto error; @@ -779,6 +783,23 @@ static int copy_sec_ctx(struct xfrm_sec_ctx *s, struct sk_buff *skb) return 0; } +static int copy_user_offload(struct xfrm_state_offload *xso, struct sk_buff *skb) +{ + struct xfrm_user_offload *xuo; + struct nlattr *attr; + + attr = nla_reserve(skb, XFRMA_OFFLOAD_DEV, sizeof(*xuo)); + if (attr == NULL) + return -EMSGSIZE; + + xuo = nla_data(attr); + + xuo->ifindex = xso->dev->ifindex; + xuo->flags = xso->flags; + + return 0; +} + static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) { struct xfrm_algo *algo; @@ -869,6 +890,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x, &x->replay); if (ret) goto out; + if(x->xso.dev) + ret = copy_user_offload(&x->xso, skb); + if (ret) + goto out; if (x->security) ret = copy_sec_ctx(x->security, skb); out: @@ -2406,6 +2431,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 }, [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, + [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@ -2622,6 +2648,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(*x->coaddr)); if (x->props.extra_flags) l += nla_total_size(sizeof(x->props.extra_flags)); + if (x->xso.dev) + l += nla_total_size(sizeof(x->xso)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); -- cgit v1.3 From cc41c84b7e7f2d7f6698bccc84890943fd021265 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 14 Apr 2017 20:31:08 +0200 Subject: netfilter: kill the fake untracked conntrack objects resurrect an old patch from Pablo Neira to remove the untracked objects. Currently, there are four possible states of an skb wrt. conntrack. 1. No conntrack attached, ct is NULL. 2. Normal (kmem cache allocated) ct attached. 3. a template (kmalloc'd), not in any hash tables at any point in time 4. the 'untracked' conntrack, a percpu nf_conn object, tagged via IPS_UNTRACKED_BIT in ct->status. Untracked is supposed to be identical to case 1. It exists only so users can check -m conntrack --ctstate UNTRACKED vs. -m conntrack --ctstate INVALID e.g. attempts to set connmark on INVALID or UNTRACKED conntracks is supposed to be a no-op. Thus currently we need to check ct == NULL || nf_ct_is_untracked(ct) in a lot of places in order to avoid altering untracked objects. The other consequence of the percpu untracked object is that all -j NOTRACK (and, later, kfree_skb of such skbs) result in an atomic op (inc/dec the untracked conntracks refcount). This adds a new kernel-private ctinfo state, IP_CT_UNTRACKED, to make the distinction instead. The (few) places that care about packet invalid (ct is NULL) vs. packet untracked now need to test ct == NULL vs. ctinfo == IP_CT_UNTRACKED, but all other places can omit the nf_ct_is_untracked() check. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 6 +-- include/net/netfilter/nf_conntrack.h | 10 +---- include/uapi/linux/netfilter/nf_conntrack_common.h | 6 ++- net/ipv4/netfilter/nf_dup_ipv4.c | 3 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 3 +- net/ipv6/netfilter/nf_dup_ipv6.c | 3 +- net/netfilter/nf_conntrack_core.c | 48 +++------------------- net/netfilter/nf_nat_core.c | 3 -- net/netfilter/nft_ct.c | 14 +++---- net/netfilter/xt_CT.c | 16 ++++---- net/netfilter/xt_conntrack.c | 11 +++-- net/netfilter/xt_state.c | 13 +++--- 12 files changed, 39 insertions(+), 97 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 8a4a57b887fb..9a75d9933e63 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1556,12 +1556,8 @@ static inline void ip_vs_notrack(struct sk_buff *skb) struct nf_conn *ct = nf_ct_get(skb, &ctinfo); if (!ct || !nf_ct_is_untracked(ct)) { - struct nf_conn *untracked; - nf_conntrack_put(&ct->ct_general); - untracked = nf_ct_untracked_get(); - nf_conntrack_get(&untracked->ct_general); - nf_ct_set(skb, untracked, IP_CT_NEW); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } #endif } diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 19605878da47..012b99f563e5 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -243,14 +243,6 @@ extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq); -/* Fake conntrack entry for untracked connections */ -DECLARE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked); -static inline struct nf_conn *nf_ct_untracked_get(void) -{ - return raw_cpu_ptr(&nf_conntrack_untracked); -} -void nf_ct_untracked_status_or(unsigned long bits); - /* Iterate over all conntracks: if iter returns true, it's deleted. */ void nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), @@ -283,7 +275,7 @@ static inline int nf_ct_is_dying(const struct nf_conn *ct) static inline int nf_ct_is_untracked(const struct nf_conn *ct) { - return test_bit(IPS_UNTRACKED_BIT, &ct->status); + return false; } /* Packet is received from loopback */ diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 6a8e33dd4ecb..b4a0a1940118 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -28,12 +28,14 @@ enum ip_conntrack_info { /* only for userspace compatibility */ #ifndef __KERNEL__ IP_CT_NEW_REPLY = IP_CT_NUMBER, +#else + IP_CT_UNTRACKED = 7, #endif }; #define NF_CT_STATE_INVALID_BIT (1 << 0) #define NF_CT_STATE_BIT(ctinfo) (1 << ((ctinfo) % IP_CT_IS_REPLY + 1)) -#define NF_CT_STATE_UNTRACKED_BIT (1 << (IP_CT_NUMBER + 1)) +#define NF_CT_STATE_UNTRACKED_BIT (1 << (IP_CT_UNTRACKED + 1)) /* Bitset representing status of connection. */ enum ip_conntrack_status { @@ -94,7 +96,7 @@ enum ip_conntrack_status { IPS_TEMPLATE_BIT = 11, IPS_TEMPLATE = (1 << IPS_TEMPLATE_BIT), - /* Conntrack is a fake untracked entry */ + /* Conntrack is a fake untracked entry. Obsolete and not used anymore */ IPS_UNTRACKED_BIT = 12, IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT), diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index f0dbff05fc28..39895b9ddeb9 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -69,8 +69,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Avoid counting cloned packets towards the original connection. */ nf_reset(skb); - nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW); - nf_conntrack_get(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif /* * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index d2c2ccbfbe72..d5f028e33f65 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -221,8 +221,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, type = icmp6h->icmp6_type - 130; if (type >= 0 && type < sizeof(noct_valid_new) && noct_valid_new[type]) { - nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW); - nf_conntrack_get(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); return NF_ACCEPT; } diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 888ecd106e5f..4a7ddeddbaab 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -58,8 +58,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_reset(skb); - nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW); - nf_conntrack_get(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_LOCAL_IN) { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index bcf1d2a6539e..03150f60714d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -180,14 +180,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; seqcount_t nf_conntrack_generation __read_mostly; - -/* nf_conn must be 8 bytes aligned, as the 3 LSB bits are used - * for the nfctinfo. We cheat by (ab)using the PER CPU cache line - * alignment to enforce this. - */ -DEFINE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked); -EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); - static unsigned int nf_conntrack_hash_rnd __read_mostly; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, @@ -1314,9 +1306,10 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, int ret; tmpl = nf_ct_get(skb, &ctinfo); - if (tmpl) { + if (tmpl || ctinfo == IP_CT_UNTRACKED) { /* Previously seen (loopback or untracked)? Ignore. */ - if (!nf_ct_is_template(tmpl)) { + if ((tmpl && !nf_ct_is_template(tmpl)) || + ctinfo == IP_CT_UNTRACKED) { NF_CT_STAT_INC_ATOMIC(net, ignore); return NF_ACCEPT; } @@ -1629,18 +1622,6 @@ void nf_ct_free_hashtable(void *hash, unsigned int size) } EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); -static int untrack_refs(void) -{ - int cnt = 0, cpu; - - for_each_possible_cpu(cpu) { - struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); - - cnt += atomic_read(&ct->ct_general.use) - 1; - } - return cnt; -} - void nf_conntrack_cleanup_start(void) { conntrack_gc_work.exiting = true; @@ -1650,8 +1631,6 @@ void nf_conntrack_cleanup_start(void) void nf_conntrack_cleanup_end(void) { RCU_INIT_POINTER(nf_ct_destroy, NULL); - while (untrack_refs() > 0) - schedule(); cancel_delayed_work_sync(&conntrack_gc_work.dwork); nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); @@ -1825,20 +1804,11 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); -void nf_ct_untracked_status_or(unsigned long bits) -{ - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(nf_conntrack_untracked, cpu).status |= bits; -} -EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); - int nf_conntrack_init_start(void) { int max_factor = 8; int ret = -ENOMEM; - int i, cpu; + int i; seqcount_init(&nf_conntrack_generation); @@ -1921,15 +1891,6 @@ int nf_conntrack_init_start(void) if (ret < 0) goto err_proto; - /* Set up fake conntrack: to never be deleted, not in any hashes */ - for_each_possible_cpu(cpu) { - struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); - write_pnet(&ct->ct_net, &init_net); - atomic_set(&ct->ct_general.use, 1); - } - /* - and look it like as a confirmed connection */ - nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); - conntrack_gc_work_init(&conntrack_gc_work); queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ); @@ -1977,6 +1938,7 @@ int nf_conntrack_init_net(struct net *net) int ret = -ENOMEM; int cpu; + BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); atomic_set(&net->ct.count, 0); net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 5e35643da650..9cbf49f9c1b7 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -861,9 +861,6 @@ static int __init nf_nat_init(void) nf_ct_helper_expectfn_register(&follow_master_nat); - /* Initialize fake conntrack so that NAT will skip it */ - nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); - BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, nfnetlink_parse_nat_setup); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 6e23dbbedd7f..6c6fd48b024c 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -72,12 +72,12 @@ static void nft_ct_get_eval(const struct nft_expr *expr, switch (priv->key) { case NFT_CT_STATE: - if (ct == NULL) - state = NF_CT_STATE_INVALID_BIT; - else if (nf_ct_is_untracked(ct)) + if (ct) + state = NF_CT_STATE_BIT(ctinfo); + else if (ctinfo == IP_CT_UNTRACKED) state = NF_CT_STATE_UNTRACKED_BIT; else - state = NF_CT_STATE_BIT(ctinfo); + state = NF_CT_STATE_INVALID_BIT; *dest = state; return; default: @@ -718,12 +718,10 @@ static void nft_notrack_eval(const struct nft_expr *expr, ct = nf_ct_get(pkt->skb, &ctinfo); /* Previously seen (loopback or untracked)? Ignore. */ - if (ct) + if (ct || ctinfo == IP_CT_UNTRACKED) return; - ct = nf_ct_untracked_get(); - atomic_inc(&ct->ct_general.use); - nf_ct_set(skb, ct, IP_CT_NEW); + nf_ct_set(skb, ct, IP_CT_UNTRACKED); } static struct nft_expr_type nft_notrack_type; diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index b008db0184b8..3cbe1bcf6a74 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -26,11 +26,12 @@ static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) if (skb->_nfct != 0) return XT_CONTINUE; - /* special case the untracked ct : we want the percpu object */ - if (!ct) - ct = nf_ct_untracked_get(); - atomic_inc(&ct->ct_general.use); - nf_ct_set(skb, ct, IP_CT_NEW); + if (ct) { + atomic_inc(&ct->ct_general.use); + nf_ct_set(skb, ct, IP_CT_NEW); + } else { + nf_ct_set(skb, ct, IP_CT_UNTRACKED); + } return XT_CONTINUE; } @@ -335,7 +336,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, struct nf_conn *ct = info->ct; struct nf_conn_help *help; - if (ct && !nf_ct_is_untracked(ct)) { + if (ct) { help = nfct_help(ct); if (help) module_put(help->helper->me); @@ -412,8 +413,7 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) if (skb->_nfct != 0) return XT_CONTINUE; - nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW); - nf_conntrack_get(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); return XT_CONTINUE; } diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index c0fb217bc649..39cf1d019240 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -172,12 +172,11 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, ct = nf_ct_get(skb, &ctinfo); - if (ct) { - if (nf_ct_is_untracked(ct)) - statebit = XT_CONNTRACK_STATE_UNTRACKED; - else - statebit = XT_CONNTRACK_STATE_BIT(ctinfo); - } else + if (ct) + statebit = XT_CONNTRACK_STATE_BIT(ctinfo); + else if (ctinfo == IP_CT_UNTRACKED) + statebit = XT_CONNTRACK_STATE_UNTRACKED; + else statebit = XT_CONNTRACK_STATE_INVALID; if (info->match_flags & XT_CONNTRACK_STATE) { diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index 5746a33789a5..5fbd79194d21 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -28,14 +28,13 @@ state_mt(const struct sk_buff *skb, struct xt_action_param *par) unsigned int statebit; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - if (!ct) + if (ct) + statebit = XT_STATE_BIT(ctinfo); + else if (ctinfo == IP_CT_UNTRACKED) + statebit = XT_STATE_UNTRACKED; + else statebit = XT_STATE_INVALID; - else { - if (nf_ct_is_untracked(ct)) - statebit = XT_STATE_UNTRACKED; - else - statebit = XT_STATE_BIT(ctinfo); - } + return (sinfo->statemask & statebit); } -- cgit v1.3 From 694a0055f039bc1d73aba10606ea74e798d2d759 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 15 Apr 2017 19:26:10 +0200 Subject: netfilter: nft_ct: allow to set ctnetlink event types of a connection By default the kernel emits all ctnetlink events for a connection. This allows to select the types of events to generate. This can be used to e.g. only send DESTROY events but no NEW/UPDATE ones and will work even if sysctl net.netfilter.nf_conntrack_events is set to 0. This was already possible via iptables' CT target, but the nft version has the advantage that it can also be used with already-established conntracks. The added nf_ct_is_template() check isn't a bug fix as we only support mark and labels (and unlike ecache the conntrack core doesn't copy those). Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nft_ct.c | 25 ++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 8f3842690d17..683f6f88fcac 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -901,6 +901,7 @@ enum nft_rt_attributes { * @NFT_CT_BYTES: conntrack bytes * @NFT_CT_AVGPKT: conntrack average bytes per packet * @NFT_CT_ZONE: conntrack zone + * @NFT_CT_EVENTMASK: ctnetlink events to be generated for this conntrack */ enum nft_ct_keys { NFT_CT_STATE, @@ -921,6 +922,7 @@ enum nft_ct_keys { NFT_CT_BYTES, NFT_CT_AVGPKT, NFT_CT_ZONE, + NFT_CT_EVENTMASK, }; /** diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 6c6fd48b024c..a34ceb38fc55 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -264,7 +264,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr, struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) + if (ct == NULL || nf_ct_is_template(ct)) return; switch (priv->key) { @@ -283,6 +283,22 @@ static void nft_ct_set_eval(const struct nft_expr *expr, ®s->data[priv->sreg], NF_CT_LABELS_MAX_SIZE / sizeof(u32)); break; +#endif +#ifdef CONFIG_NF_CONNTRACK_EVENTS + case NFT_CT_EVENTMASK: { + struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct); + u32 ctmask = regs->data[priv->sreg]; + + if (e) { + if (e->ctmask != ctmask) + e->ctmask = ctmask; + break; + } + + if (ctmask && !nf_ct_is_confirmed(ct)) + nf_ct_ecache_ext_add(ct, ctmask, 0, GFP_ATOMIC); + break; + } #endif default: break; @@ -538,6 +554,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, nft_ct_pcpu_template_refcnt++; len = sizeof(u16); break; +#endif +#ifdef CONFIG_NF_CONNTRACK_EVENTS + case NFT_CT_EVENTMASK: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = sizeof(u32); + break; #endif default: return -EOPNOTSUPP; -- cgit v1.3 From 01026edef9062b7d26ace74a5b4a5a33a2399501 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Apr 2017 17:27:32 +0200 Subject: nefilter: eache: reduce struct size from 32 to 24 byte Only "cache" needs to use ulong (its used with set_bit()), missed can use u16. Also add build-time assertion to ensure event bits fit. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_ecache.h | 4 ++-- include/uapi/linux/netfilter/nf_conntrack_common.h | 3 +++ net/netfilter/nf_conntrack_ecache.c | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 12d967b58726..2a10c6570fcc 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -20,11 +20,11 @@ enum nf_ct_ecache_state { struct nf_conntrack_ecache { unsigned long cache; /* bitops want long */ - unsigned long missed; /* missed events */ + u16 missed; /* missed events */ u16 ctmask; /* bitmask of ct events to be delivered */ u16 expmask; /* bitmask of expect events to be delivered */ + enum nf_ct_ecache_state state:8;/* ecache state */ u32 portid; /* netlink portid of destroyer */ - enum nf_ct_ecache_state state; /* ecache state */ }; static inline struct nf_conntrack_ecache * diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index b4a0a1940118..a8072cc7fa0b 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -119,6 +119,9 @@ enum ip_conntrack_events { IPCT_NATSEQADJ = IPCT_SEQADJ, IPCT_SECMARK, /* new security mark has been set */ IPCT_LABEL, /* new connlabel has been set */ +#ifdef __KERNEL__ + __IPCT_MAX +#endif }; enum ip_conntrack_expect_events { diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 6161e92d2980..515212948125 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -420,6 +420,9 @@ int nf_conntrack_ecache_init(void) int ret = nf_ct_extend_register(&event_extend); if (ret < 0) pr_err("nf_ct_event: Unable to register event extension.\n"); + + BUILD_BUG_ON(__IPCT_MAX >= 16); /* ctmask, missed use u16 */ + return ret; } -- cgit v1.3 From 0a473b82cb23e7a35c4be6e9765c8487a65e8f55 Mon Sep 17 00:00:00 2001 From: Craig Gallek Date: Wed, 19 Apr 2017 12:30:53 -0400 Subject: ip6_tunnel: Allow policy-based routing through tunnels This feature allows the administrator to set an fwmark for packets traversing a tunnel. This allows the use of independent routing tables for tunneled packets without the use of iptables. Signed-off-by: Craig Gallek Signed-off-by: David S. Miller --- include/net/ip6_tunnel.h | 2 ++ include/uapi/linux/if_tunnel.h | 3 +++ net/ipv6/ip6_gre.c | 14 +++++++++++++- net/ipv6/ip6_tunnel.c | 15 ++++++++++++++- net/ipv6/ip6_vti.c | 10 +++++++++- 5 files changed, 41 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 1b1cf33cbfb0..08fbc7f7d8d7 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -33,6 +33,8 @@ struct __ip6_tnl_parm { __be16 o_flags; __be32 i_key; __be32 o_key; + + __u32 fwmark; }; /* IPv6 tunnel */ diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 92f3c8677523..6792d1967d31 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -75,6 +75,7 @@ enum { IFLA_IPTUN_ENCAP_SPORT, IFLA_IPTUN_ENCAP_DPORT, IFLA_IPTUN_COLLECT_METADATA, + IFLA_IPTUN_FWMARK, __IFLA_IPTUN_MAX, }; #define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1) @@ -132,6 +133,7 @@ enum { IFLA_GRE_ENCAP_DPORT, IFLA_GRE_COLLECT_METADATA, IFLA_GRE_IGNORE_DF, + IFLA_GRE_FWMARK, __IFLA_GRE_MAX, }; @@ -147,6 +149,7 @@ enum { IFLA_VTI_OKEY, IFLA_VTI_LOCAL, IFLA_VTI_REMOTE, + IFLA_VTI_FWMARK, __IFLA_VTI_MAX, }; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 6fcb7cb49bb2..8d128ba79b66 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -544,6 +544,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) & IPV6_TCLASS_MASK; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; + else + fl6.flowi6_mark = t->parms.fwmark; fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); @@ -603,6 +605,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; + else + fl6.flowi6_mark = t->parms.fwmark; fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); @@ -780,6 +784,7 @@ static int ip6gre_tnl_change(struct ip6_tnl *t, t->parms.o_key = p->o_key; t->parms.i_flags = p->i_flags; t->parms.o_flags = p->o_flags; + t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6gre_tnl_link_config(t, set_mtu); return 0; @@ -1249,6 +1254,9 @@ static void ip6gre_netlink_parms(struct nlattr *data[], if (data[IFLA_GRE_FLAGS]) parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]); + + if (data[IFLA_GRE_FWMARK]) + parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); } static int ip6gre_tap_init(struct net_device *dev) @@ -1470,6 +1478,8 @@ static size_t ip6gre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_GRE_FWMARK */ + nla_total_size(4) + 0; } @@ -1490,7 +1500,8 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_TTL, p->hop_limit) || nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || - nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags)) + nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) || + nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, @@ -1525,6 +1536,7 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 75fac933c209..ad15d38b41e8 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1256,6 +1256,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) & IPV6_TCLASS_MASK; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; + else + fl6.flowi6_mark = t->parms.fwmark; } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); @@ -1338,6 +1340,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; + else + fl6.flowi6_mark = t->parms.fwmark; } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); @@ -1467,6 +1471,7 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; + t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); return 0; @@ -1918,6 +1923,9 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], if (data[IFLA_IPTUN_COLLECT_METADATA]) parms->collect_md = true; + + if (data[IFLA_IPTUN_FWMARK]) + parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[], @@ -2054,6 +2062,8 @@ static size_t ip6_tnl_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + + /* IFLA_IPTUN_FWMARK */ + nla_total_size(4) + 0; } @@ -2069,7 +2079,8 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || - nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto)) + nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) || + nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || @@ -2081,6 +2092,7 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) if (parm->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; + return 0; nla_put_failure: @@ -2109,6 +2121,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, + [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 3d8a3b63b4fd..d67ef56454b2 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -657,6 +657,7 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; t->parms.proto = p->proto; + t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); vti6_link_config(t); return 0; @@ -933,6 +934,9 @@ static void vti6_netlink_parms(struct nlattr *data[], if (data[IFLA_VTI_OKEY]) parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); + + if (data[IFLA_VTI_FWMARK]) + parms->fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } static int vti6_newlink(struct net *src_net, struct net_device *dev, @@ -998,6 +1002,8 @@ static size_t vti6_get_size(const struct net_device *dev) nla_total_size(4) + /* IFLA_VTI_OKEY */ nla_total_size(4) + + /* IFLA_VTI_FWMARK */ + nla_total_size(4) + 0; } @@ -1010,7 +1016,8 @@ static int vti6_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_in6_addr(skb, IFLA_VTI_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_VTI_REMOTE, &parm->raddr) || nla_put_be32(skb, IFLA_VTI_IKEY, parm->i_key) || - nla_put_be32(skb, IFLA_VTI_OKEY, parm->o_key)) + nla_put_be32(skb, IFLA_VTI_OKEY, parm->o_key) || + nla_put_u32(skb, IFLA_VTI_FWMARK, parm->fwmark)) goto nla_put_failure; return 0; @@ -1024,6 +1031,7 @@ static const struct nla_policy vti6_policy[IFLA_VTI_MAX + 1] = { [IFLA_VTI_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_VTI_IKEY] = { .type = NLA_U32 }, [IFLA_VTI_OKEY] = { .type = NLA_U32 }, + [IFLA_VTI_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vti6_link_ops __read_mostly = { -- cgit v1.3 From 7acf8a1e8a28b3d7407a8d8061a7d0766cfac2f4 Mon Sep 17 00:00:00 2001 From: Matthew Whitehead Date: Wed, 19 Apr 2017 12:37:10 -0400 Subject: Replace 2 jiffies with sysctl netdev_budget_usecs to enable softirq tuning Constants used for tuning are generally a bad idea, especially as hardware changes over time. Replace the constant 2 jiffies with sysctl variable netdev_budget_usecs to enable sysadmins to tune the softirq processing. Also document the variable. For example, a very fast machine might tune this to 1000 microseconds, while my regression testing 486DX-25 needs it to be 4000 microseconds on a nearly idle network to prevent time_squeeze from being incremented. Version 2: changed jiffies to microseconds for predictable units. Signed-off-by: Matthew Whitehead Signed-off-by: David S. Miller --- Documentation/sysctl/net.txt | 11 ++++++++++- include/linux/netdevice.h | 1 + include/uapi/linux/sysctl.h | 1 + kernel/sysctl_binary.c | 1 + net/core/dev.c | 4 +++- net/core/sysctl_net_core.c | 8 ++++++++ 6 files changed, 24 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/Documentation/sysctl/net.txt b/Documentation/sysctl/net.txt index 2ebabc93014a..14db18c970b1 100644 --- a/Documentation/sysctl/net.txt +++ b/Documentation/sysctl/net.txt @@ -188,7 +188,16 @@ netdev_budget Maximum number of packets taken from all interfaces in one polling cycle (NAPI poll). In one polling cycle interfaces which are registered to polling are -probed in a round-robin manner. +probed in a round-robin manner. Also, a polling cycle may not exceed +netdev_budget_usecs microseconds, even if netdev_budget has not been +exhausted. + +netdev_budget_usecs +--------------------- + +Maximum number of microseconds in one NAPI polling cycle. Polling +will exit when either netdev_budget_usecs have elapsed during the +poll cycle or the number of packets processed reaches netdev_budget. netdev_max_backlog ------------------ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 0f3c38ce5417..c49cf21f2b31 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3296,6 +3296,7 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev, void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); extern int netdev_budget; +extern unsigned int netdev_budget_usecs; /* Called by rtnetlink.c:rtnl_unlock() */ void netdev_run_todo(void); diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index e13d48058b8d..177f5f139b36 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -274,6 +274,7 @@ enum NET_CORE_AEVENT_ETIME=20, NET_CORE_AEVENT_RSEQTH=21, NET_CORE_WARNINGS=22, + NET_CORE_BUDGET_USECS=23, }; /* /proc/sys/net/ethernet */ diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index ece4b177052b..4ee3e49530d2 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -197,6 +197,7 @@ static const struct bin_table bin_net_core_table[] = { { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, { CTL_INT, NET_CORE_WARNINGS, "warnings" }, + { CTL_INT, NET_CORE_BUDGET_USECS, "netdev_budget_usecs" }, {}, }; diff --git a/net/core/dev.c b/net/core/dev.c index 5d33e2baab2b..1c53c055b197 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3441,6 +3441,7 @@ EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; +unsigned int __read_mostly netdev_budget_usecs = 2000; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ @@ -5307,7 +5308,8 @@ out_unlock: static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); - unsigned long time_limit = jiffies + 2; + unsigned long time_limit = jiffies + + usecs_to_jiffies(netdev_budget_usecs); int budget = netdev_budget; LIST_HEAD(list); LIST_HEAD(repoll); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7f9cc400eca0..ea23254b2457 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -452,6 +452,14 @@ static struct ctl_table net_core_table[] = { .extra1 = &one, .extra2 = &max_skb_frags, }, + { + .procname = "netdev_budget_usecs", + .data = &netdev_budget_usecs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, { } }; -- cgit v1.3 From b1d9fc41aab11f9520b2e0d57ae872e2ec5d6f32 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 19 Apr 2017 23:01:17 +0200 Subject: bpf: add napi_id read access to __sk_buff Add napi_id access to __sk_buff for socket filter program types, tc program types and other bpf_convert_ctx_access() users. Having access to skb->napi_id is useful for per RX queue listener siloing, f.e. in combination with SO_ATTACH_REUSEPORT_EBPF and when busy polling is used, meaning SO_REUSEPORT enabled listeners can then select the corresponding socket at SYN time already [1]. The skb is marked via skb_mark_napi_id() early in the receive path (e.g., napi_gro_receive()). Currently, sockets can only use SO_INCOMING_NAPI_ID from 6d4339028b35 ("net: Introduce SO_INCOMING_NAPI_ID") as a socket option to look up the NAPI ID associated with the queue for steering, which requires a prior sk_mark_napi_id() after the socket was looked up. Semantics for the __sk_buff napi_id access are similar, meaning if skb->napi_id is < MIN_NAPI_ID (e.g. outgoing packets using sender_cpu), then an invalid napi_id of 0 is returned to the program, otherwise a valid non-zero napi_id. [1] http://netdevconf.org/2.1/slides/apr6/dumazet-BUSY-POLLING-Netdev-2.1.pdf Suggested-by: Eric Dumazet Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + net/core/filter.c | 14 ++++++++++++++ tools/include/uapi/linux/bpf.h | 1 + tools/testing/selftests/bpf/test_verifier.c | 3 +++ 4 files changed, 19 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1e062bb54eec..e553529929f6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -603,6 +603,7 @@ struct __sk_buff { __u32 tc_classid; __u32 data; __u32 data_end; + __u32 napi_id; }; struct bpf_tunnel_key { diff --git a/net/core/filter.c b/net/core/filter.c index 085925834727..9a37860a80fc 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -53,6 +53,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -3201,6 +3202,19 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct __sk_buff, napi_id): +#if defined(CONFIG_NET_RX_BUSY_POLL) + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, napi_id) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sk_buff, napi_id)); + *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +#else + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1e062bb54eec..e553529929f6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -603,6 +603,7 @@ struct __sk_buff { __u32 tc_classid; __u32 data; __u32 data_end; + __u32 napi_id; }; struct bpf_tunnel_key { diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 6178b65fee59..95a8d5f3ab80 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -772,6 +772,9 @@ static struct bpf_test tests[] = { BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, offsetof(struct __sk_buff, vlan_tci)), BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_1, + offsetof(struct __sk_buff, napi_id)), + BPF_JMP_IMM(BPF_JGE, BPF_REG_0, 0, 0), BPF_EXIT_INSN(), }, .result = ACCEPT, -- cgit v1.3 From 1f4407e2548827e3e6e7b943640a2da90c611306 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Fri, 21 Apr 2017 15:59:52 -0400 Subject: net: Remove NET_CORE_BUDGET_USECS from sysctl binary interface. We are not supposed to add new entries to this thing any more. Thanks to Eric Dumazet for noticing this. Signed-off-by: David S. Miller --- include/uapi/linux/sysctl.h | 1 - kernel/sysctl_binary.c | 1 - 2 files changed, 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index 177f5f139b36..e13d48058b8d 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -274,7 +274,6 @@ enum NET_CORE_AEVENT_ETIME=20, NET_CORE_AEVENT_RSEQTH=21, NET_CORE_WARNINGS=22, - NET_CORE_BUDGET_USECS=23, }; /* /proc/sys/net/ethernet */ diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 4ee3e49530d2..ece4b177052b 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -197,7 +197,6 @@ static const struct bin_table bin_net_core_table[] = { { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, { CTL_INT, NET_CORE_WARNINGS, "warnings" }, - { CTL_INT, NET_CORE_BUDGET_USECS, "netdev_budget_usecs" }, {}, }; -- cgit v1.3 From f43e9b069aeaf0f3d51fa30ddc9c0003e86623b8 Mon Sep 17 00:00:00 2001 From: Roi Dayan Date: Sun, 25 Sep 2016 13:52:44 +0300 Subject: net/devlink: Add E-Switch encapsulation control This is an e-switch global knob to enable HW support for applying encapsulation/decapsulation to VF traffic as part of SRIOV e-switch offloading. The actual encap/decap is carried out (along with the matching and other actions) per offloaded e-switch rules, e.g as done when offloading the TC tunnel key action. Signed-off-by: Roi Dayan Reviewed-by: Or Gerlitz Acked-by: Jiri Pirko Signed-off-by: Saeed Mahameed --- include/net/devlink.h | 2 ++ include/uapi/linux/devlink.h | 7 +++++++ net/core/devlink.c | 26 +++++++++++++++++++++++--- 3 files changed, 32 insertions(+), 3 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/devlink.h b/include/net/devlink.h index 24de13f8c94f..ed7687bbf5d0 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -268,6 +268,8 @@ struct devlink_ops { int (*eswitch_mode_set)(struct devlink *devlink, u16 mode); int (*eswitch_inline_mode_get)(struct devlink *devlink, u8 *p_inline_mode); int (*eswitch_inline_mode_set)(struct devlink *devlink, u8 inline_mode); + int (*eswitch_encap_mode_get)(struct devlink *devlink, u8 *p_encap_mode); + int (*eswitch_encap_mode_set)(struct devlink *devlink, u8 encap_mode); }; static inline void *devlink_priv(struct devlink *devlink) diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index b47bee277347..b0e807ac53bb 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -119,6 +119,11 @@ enum devlink_eswitch_inline_mode { DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT, }; +enum devlink_eswitch_encap_mode { + DEVLINK_ESWITCH_ENCAP_MODE_NONE, + DEVLINK_ESWITCH_ENCAP_MODE_BASIC, +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! */ DEVLINK_ATTR_UNSPEC, @@ -195,6 +200,8 @@ enum devlink_attr { DEVLINK_ATTR_PAD, + DEVLINK_ATTR_ESWITCH_ENCAP_MODE, /* u8 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 0afac5800b57..b0b87a292e7c 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1397,10 +1397,10 @@ static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, u32 seq, int flags) { const struct devlink_ops *ops = devlink->ops; + u8 inline_mode, encap_mode; void *hdr; int err = 0; u16 mode; - u8 inline_mode; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) @@ -1429,6 +1429,15 @@ static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, goto nla_put_failure; } + if (ops->eswitch_encap_mode_get) { + err = ops->eswitch_encap_mode_get(devlink, &encap_mode); + if (err) + goto nla_put_failure; + err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode); + if (err) + goto nla_put_failure; + } + genlmsg_end(msg, hdr); return 0; @@ -1468,9 +1477,9 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; - u16 mode; - u8 inline_mode; + u8 inline_mode, encap_mode; int err = 0; + u16 mode; if (!ops) return -EOPNOTSUPP; @@ -1493,6 +1502,16 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, if (err) return err; } + + if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) { + if (!ops->eswitch_encap_mode_set) + return -EOPNOTSUPP; + encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]); + err = ops->eswitch_encap_mode_set(devlink, encap_mode); + if (err) + return err; + } + return 0; } @@ -2190,6 +2209,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 }, [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, }; -- cgit v1.3 From 531b374834c891ae2abf800693074df35a7d1a36 Mon Sep 17 00:00:00 2001 From: Gerard Garcia Date: Fri, 21 Apr 2017 10:10:44 +0100 Subject: VSOCK: Add vsockmon tap functions Add tap functions that can be used by the vsock transports to deliver packets to vsockmon virtual network devices. Signed-off-by: Gerard Garcia Signed-off-by: Stefan Hajnoczi Reviewed-by: Jorgen Hansen Signed-off-by: David S. Miller --- MAINTAINERS | 1 + include/net/af_vsock.h | 13 +++++ include/uapi/linux/if_arp.h | 1 + net/vmw_vsock/Makefile | 2 +- net/vmw_vsock/af_vsock_tap.c | 114 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 130 insertions(+), 1 deletion(-) create mode 100644 net/vmw_vsock/af_vsock_tap.c (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index b283d5ef7b68..fdab4f9e8ac9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13317,6 +13317,7 @@ L: netdev@vger.kernel.org S: Maintained F: include/linux/virtio_vsock.h F: include/uapi/linux/virtio_vsock.h +F: net/vmw_vsock/af_vsock_tap.c F: net/vmw_vsock/virtio_transport_common.c F: net/vmw_vsock/virtio_transport.c F: drivers/vhost/vsock.c diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index f32ed9ac181a..f9fb566e75cf 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -188,4 +188,17 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, void vsock_remove_sock(struct vsock_sock *vsk); void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); +/**** TAP ****/ + +struct vsock_tap { + struct net_device *dev; + struct module *module; + struct list_head list; +}; + +int vsock_init_tap(void); +int vsock_add_tap(struct vsock_tap *vt); +int vsock_remove_tap(struct vsock_tap *vt); +void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque); + #endif /* __AF_VSOCK_H__ */ diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h index 4d024d75d64b..cf73510b9238 100644 --- a/include/uapi/linux/if_arp.h +++ b/include/uapi/linux/if_arp.h @@ -95,6 +95,7 @@ #define ARPHRD_IP6GRE 823 /* GRE over IPv6 */ #define ARPHRD_NETLINK 824 /* Netlink header */ #define ARPHRD_6LOWPAN 825 /* IPv6 over LoWPAN */ +#define ARPHRD_VSOCKMON 826 /* Vsock monitor header */ #define ARPHRD_VOID 0xFFFF /* Void type, nothing is known */ #define ARPHRD_NONE 0xFFFE /* zero header length */ diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index bc27c70e0e59..09fc2eb29dc8 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o -vsock-y += af_vsock.o vsock_addr.o +vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \ vmci_transport_notify_qstate.o diff --git a/net/vmw_vsock/af_vsock_tap.c b/net/vmw_vsock/af_vsock_tap.c new file mode 100644 index 000000000000..98f09b539366 --- /dev/null +++ b/net/vmw_vsock/af_vsock_tap.c @@ -0,0 +1,114 @@ +/* + * Tap functions for AF_VSOCK sockets. + * + * Code based on net/netlink/af_netlink.c tap functions. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +static DEFINE_SPINLOCK(vsock_tap_lock); +static struct list_head vsock_tap_all __read_mostly = + LIST_HEAD_INIT(vsock_tap_all); + +int vsock_add_tap(struct vsock_tap *vt) +{ + if (unlikely(vt->dev->type != ARPHRD_VSOCKMON)) + return -EINVAL; + + __module_get(vt->module); + + spin_lock(&vsock_tap_lock); + list_add_rcu(&vt->list, &vsock_tap_all); + spin_unlock(&vsock_tap_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(vsock_add_tap); + +int vsock_remove_tap(struct vsock_tap *vt) +{ + struct vsock_tap *tmp; + bool found = false; + + spin_lock(&vsock_tap_lock); + + list_for_each_entry(tmp, &vsock_tap_all, list) { + if (vt == tmp) { + list_del_rcu(&vt->list); + found = true; + goto out; + } + } + + pr_warn("vsock_remove_tap: %p not found\n", vt); +out: + spin_unlock(&vsock_tap_lock); + + synchronize_net(); + + if (found) + module_put(vt->module); + + return found ? 0 : -ENODEV; +} +EXPORT_SYMBOL_GPL(vsock_remove_tap); + +static int __vsock_deliver_tap_skb(struct sk_buff *skb, + struct net_device *dev) +{ + int ret = 0; + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (nskb) { + dev_hold(dev); + + nskb->dev = dev; + ret = dev_queue_xmit(nskb); + if (unlikely(ret > 0)) + ret = net_xmit_errno(ret); + + dev_put(dev); + } + + return ret; +} + +static void __vsock_deliver_tap(struct sk_buff *skb) +{ + int ret; + struct vsock_tap *tmp; + + list_for_each_entry_rcu(tmp, &vsock_tap_all, list) { + ret = __vsock_deliver_tap_skb(skb, tmp->dev); + if (unlikely(ret)) + break; + } +} + +void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque) +{ + struct sk_buff *skb; + + rcu_read_lock(); + + if (likely(list_empty(&vsock_tap_all))) + goto out; + + skb = build_skb(opaque); + if (skb) { + __vsock_deliver_tap(skb); + consume_skb(skb); + } + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(vsock_deliver_tap); -- cgit v1.3 From 0b2e66448ba20eb30ea62345d6beb9ee2a1ce06b Mon Sep 17 00:00:00 2001 From: Gerard Garcia Date: Fri, 21 Apr 2017 10:10:45 +0100 Subject: VSOCK: Add vsockmon device Add vsockmon virtual network device that receives packets from the vsock transports and exposes them to user space. Based on the nlmon device. Signed-off-by: Gerard Garcia Signed-off-by: Stefan Hajnoczi Signed-off-by: David S. Miller --- MAINTAINERS | 2 + drivers/net/Kconfig | 8 ++ drivers/net/Makefile | 1 + drivers/net/vsockmon.c | 170 ++++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/Kbuild | 1 + include/uapi/linux/vsockmon.h | 60 +++++++++++++++ 6 files changed, 242 insertions(+) create mode 100644 drivers/net/vsockmon.c create mode 100644 include/uapi/linux/vsockmon.h (limited to 'include/uapi/linux') diff --git a/MAINTAINERS b/MAINTAINERS index fdab4f9e8ac9..28ea78b12d0c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13317,9 +13317,11 @@ L: netdev@vger.kernel.org S: Maintained F: include/linux/virtio_vsock.h F: include/uapi/linux/virtio_vsock.h +F: include/uapi/linux/vsockmon.h F: net/vmw_vsock/af_vsock_tap.c F: net/vmw_vsock/virtio_transport_common.c F: net/vmw_vsock/virtio_transport.c +F: drivers/net/vsockmon.c F: drivers/vhost/vsock.c F: drivers/vhost/vsock.h diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 100fbdc9b95c..83a1616903f8 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -355,6 +355,14 @@ config NET_VRF This option enables the support for mapping interfaces into VRF's. The support enables VRF devices. +config VSOCKMON + tristate "Virtual vsock monitoring device" + depends on VHOST_VSOCK + ---help--- + This option enables a monitoring net device for vsock sockets. It is + mostly intended for developers or support to debug vsock issues. If + unsure, say N. + endif # NET_CORE config SUNGEM_PHY diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 57fc47ad5ab3..b2f6556d8848 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_GENEVE) += geneve.o obj-$(CONFIG_GTP) += gtp.o obj-$(CONFIG_NLMON) += nlmon.o obj-$(CONFIG_NET_VRF) += vrf.o +obj-$(CONFIG_VSOCKMON) += vsockmon.o # # Networking Drivers diff --git a/drivers/net/vsockmon.c b/drivers/net/vsockmon.c new file mode 100644 index 000000000000..7f0136f2dd9d --- /dev/null +++ b/drivers/net/vsockmon.c @@ -0,0 +1,170 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +/* Virtio transport max packet size plus header */ +#define DEFAULT_MTU (VIRTIO_VSOCK_MAX_PKT_BUF_SIZE + \ + sizeof(struct af_vsockmon_hdr)) + +struct pcpu_lstats { + u64 rx_packets; + u64 rx_bytes; + struct u64_stats_sync syncp; +}; + +static int vsockmon_dev_init(struct net_device *dev) +{ + dev->lstats = netdev_alloc_pcpu_stats(struct pcpu_lstats); + if (!dev->lstats) + return -ENOMEM; + return 0; +} + +static void vsockmon_dev_uninit(struct net_device *dev) +{ + free_percpu(dev->lstats); +} + +struct vsockmon { + struct vsock_tap vt; +}; + +static int vsockmon_open(struct net_device *dev) +{ + struct vsockmon *vsockmon = netdev_priv(dev); + + vsockmon->vt.dev = dev; + vsockmon->vt.module = THIS_MODULE; + return vsock_add_tap(&vsockmon->vt); +} + +static int vsockmon_close(struct net_device *dev) +{ + struct vsockmon *vsockmon = netdev_priv(dev); + + return vsock_remove_tap(&vsockmon->vt); +} + +static netdev_tx_t vsockmon_xmit(struct sk_buff *skb, struct net_device *dev) +{ + int len = skb->len; + struct pcpu_lstats *stats = this_cpu_ptr(dev->lstats); + + u64_stats_update_begin(&stats->syncp); + stats->rx_bytes += len; + stats->rx_packets++; + u64_stats_update_end(&stats->syncp); + + dev_kfree_skb(skb); + + return NETDEV_TX_OK; +} + +static void +vsockmon_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) +{ + int i; + u64 bytes = 0, packets = 0; + + for_each_possible_cpu(i) { + const struct pcpu_lstats *vstats; + u64 tbytes, tpackets; + unsigned int start; + + vstats = per_cpu_ptr(dev->lstats, i); + + do { + start = u64_stats_fetch_begin_irq(&vstats->syncp); + tbytes = vstats->rx_bytes; + tpackets = vstats->rx_packets; + } while (u64_stats_fetch_retry_irq(&vstats->syncp, start)); + + packets += tpackets; + bytes += tbytes; + } + + stats->rx_packets = packets; + stats->tx_packets = 0; + + stats->rx_bytes = bytes; + stats->tx_bytes = 0; +} + +static int vsockmon_is_valid_mtu(int new_mtu) +{ + return new_mtu >= (int)sizeof(struct af_vsockmon_hdr); +} + +static int vsockmon_change_mtu(struct net_device *dev, int new_mtu) +{ + if (!vsockmon_is_valid_mtu(new_mtu)) + return -EINVAL; + + dev->mtu = new_mtu; + return 0; +} + +static const struct net_device_ops vsockmon_ops = { + .ndo_init = vsockmon_dev_init, + .ndo_uninit = vsockmon_dev_uninit, + .ndo_open = vsockmon_open, + .ndo_stop = vsockmon_close, + .ndo_start_xmit = vsockmon_xmit, + .ndo_get_stats64 = vsockmon_get_stats64, + .ndo_change_mtu = vsockmon_change_mtu, +}; + +static u32 always_on(struct net_device *dev) +{ + return 1; +} + +static const struct ethtool_ops vsockmon_ethtool_ops = { + .get_link = always_on, +}; + +static void vsockmon_setup(struct net_device *dev) +{ + dev->type = ARPHRD_VSOCKMON; + dev->priv_flags |= IFF_NO_QUEUE; + + dev->netdev_ops = &vsockmon_ops; + dev->ethtool_ops = &vsockmon_ethtool_ops; + dev->destructor = free_netdev; + + dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | + NETIF_F_HIGHDMA | NETIF_F_LLTX; + + dev->flags = IFF_NOARP; + + dev->mtu = DEFAULT_MTU; +} + +static struct rtnl_link_ops vsockmon_link_ops __read_mostly = { + .kind = "vsockmon", + .priv_size = sizeof(struct vsockmon), + .setup = vsockmon_setup, +}; + +static __init int vsockmon_register(void) +{ + return rtnl_link_register(&vsockmon_link_ops); +} + +static __exit void vsockmon_unregister(void) +{ + rtnl_link_unregister(&vsockmon_link_ops); +} + +module_init(vsockmon_register); +module_exit(vsockmon_unregister); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Gerard Garcia "); +MODULE_DESCRIPTION("Vsock monitoring device. Based on nlmon device."); +MODULE_ALIAS_RTNL_LINK("vsockmon"); diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index f8d9fed17ba9..6b0e2758585f 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -477,6 +477,7 @@ header-y += virtio_types.h header-y += virtio_vsock.h header-y += virtio_crypto.h header-y += vm_sockets.h +header-y += vsockmon.h header-y += vt.h header-y += vtpm_proxy.h header-y += wait.h diff --git a/include/uapi/linux/vsockmon.h b/include/uapi/linux/vsockmon.h new file mode 100644 index 000000000000..a08b522ef597 --- /dev/null +++ b/include/uapi/linux/vsockmon.h @@ -0,0 +1,60 @@ +#ifndef _UAPI_VSOCKMON_H +#define _UAPI_VSOCKMON_H + +#include + +/* + * vsockmon is the AF_VSOCK packet capture device. Packets captured have the + * following layout: + * + * +-----------------------------------+ + * | vsockmon header | + * | (struct af_vsockmon_hdr) | + * +-----------------------------------+ + * | transport header | + * | (af_vsockmon_hdr->len bytes long) | + * +-----------------------------------+ + * | payload | + * | (until end of packet) | + * +-----------------------------------+ + * + * The vsockmon header is a transport-independent description of the packet. + * It duplicates some of the information from the transport header so that + * no transport-specific knowledge is necessary to process packets. + * + * The transport header is useful for low-level transport-specific packet + * analysis. Transport type is given in af_vsockmon_hdr->transport and + * transport header length is given in af_vsockmon_hdr->len. + * + * If af_vsockmon_hdr->op is AF_VSOCK_OP_PAYLOAD then the payload follows the + * transport header. Other ops do not have a payload. + */ + +struct af_vsockmon_hdr { + __le64 src_cid; + __le64 dst_cid; + __le32 src_port; + __le32 dst_port; + __le16 op; /* enum af_vsockmon_op */ + __le16 transport; /* enum af_vsockmon_transport */ + __le16 len; /* Transport header length */ + __u8 reserved[2]; +}; + +enum af_vsockmon_op { + AF_VSOCK_OP_UNKNOWN = 0, + AF_VSOCK_OP_CONNECT = 1, + AF_VSOCK_OP_DISCONNECT = 2, + AF_VSOCK_OP_CONTROL = 3, + AF_VSOCK_OP_PAYLOAD = 4, +}; + +enum af_vsockmon_transport { + AF_VSOCK_TRANSPORT_UNKNOWN = 0, + AF_VSOCK_TRANSPORT_NO_INFO = 1, /* No transport information */ + + /* Transport header type: struct virtio_vsock_hdr */ + AF_VSOCK_TRANSPORT_VIRTIO = 2, +}; + +#endif -- cgit v1.3 From 4a69a864209e9ab436d4a58e8028ac96cc873d15 Mon Sep 17 00:00:00 2001 From: Mike Maloney Date: Fri, 21 Apr 2017 10:56:11 -0400 Subject: packet: add PACKET_FANOUT_FLAG_UNIQUEID to assign new fanout group id. Fanout uses a per net global namespace. A process that intends to create a new fanout group can accidentally join an existing group. It is not possible to detect this. Add socket option PACKET_FANOUT_FLAG_UNIQUEID. When specified the supplied fanout group id must be set to 0, and the kernel chooses an id that is not already in use. This is an ephemeral flag so that other sockets can be added to this group using setsockopt, but NOT specifying this flag. The current getsockopt(..., PACKET_FANOUT, ...) can be used to retrieve the new group id. We assume that there are not a lot of fanout groups and that this is not a high frequency call. The method assigns ids starting at zero and increases until it finds an unused id. It keeps track of the last assigned id, and uses it as a starting point to find new ids. Signed-off-by: Mike Maloney Acked-by: Willem de Bruijn Signed-off-by: David S. Miller --- include/uapi/linux/if_packet.h | 1 + net/packet/af_packet.c | 44 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 9e7edfd8141e..4df96a7dd4fa 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -66,6 +66,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_CBPF 6 #define PACKET_FANOUT_EBPF 7 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 +#define PACKET_FANOUT_FLAG_UNIQUEID 0x2000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 struct tpacket_stats { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8489beff5c25..94052f42058b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1496,6 +1496,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, DEFINE_MUTEX(fanout_mutex); EXPORT_SYMBOL_GPL(fanout_mutex); static LIST_HEAD(fanout_list); +static u16 fanout_next_id; static void __fanout_link(struct sock *sk, struct packet_sock *po) { @@ -1629,6 +1630,36 @@ static void fanout_release_data(struct packet_fanout *f) }; } +static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id) +{ + struct packet_fanout *f; + + list_for_each_entry(f, &fanout_list, list) { + if (f->id == candidate_id && + read_pnet(&f->net) == sock_net(sk)) { + return false; + } + } + return true; +} + +static bool fanout_find_new_id(struct sock *sk, u16 *new_id) +{ + u16 id = fanout_next_id; + + do { + if (__fanout_id_is_free(sk, id)) { + *new_id = id; + fanout_next_id = id + 1; + return true; + } + + id++; + } while (id != fanout_next_id); + + return false; +} + static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { struct packet_rollover *rollover = NULL; @@ -1676,6 +1707,19 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) po->rollover = rollover; } + if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { + if (id != 0) { + err = -EINVAL; + goto out; + } + if (!fanout_find_new_id(sk, &id)) { + err = -ENOMEM; + goto out; + } + /* ephemeral flag for the first socket in the group: drop it */ + flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8); + } + match = NULL; list_for_each_entry(f, &fanout_list, list) { if (f->id == id && -- cgit v1.3 From 120645513f55a4ac5543120d9e79925d30a0156f Mon Sep 17 00:00:00 2001 From: Jarno Rajahalme Date: Fri, 21 Apr 2017 16:48:06 -0700 Subject: openvswitch: Add eventmask support to CT action. Add a new optional conntrack action attribute OVS_CT_ATTR_EVENTMASK, which can be used in conjunction with the commit flag (OVS_CT_ATTR_COMMIT) to set the mask of bits specifying which conntrack events (IPCT_*) should be delivered via the Netfilter netlink multicast groups. Default behavior depends on the system configuration, but typically a lot of events are delivered. This can be very chatty for the NFNLGRP_CONNTRACK_UPDATE group, even if only some types of events are of interest. Netfilter core init_conntrack() adds the event cache extension, so we only need to set the ctmask value. However, if the system is configured without support for events, the setting will be skipped due to extension not being found. Signed-off-by: Jarno Rajahalme Reviewed-by: Greg Rose Acked-by: Joe Stringer Signed-off-by: David S. Miller --- include/uapi/linux/openvswitch.h | 12 ++++++++++++ net/openvswitch/conntrack.c | 27 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 66d1c3ccfd8e..61b7d36dfe34 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -693,6 +693,17 @@ struct ovs_action_hash { * nothing if the connection is already committed will check that the current * packet is in conntrack entry's original direction. If directionality does * not match, will delete the existing conntrack entry and commit a new one. + * @OVS_CT_ATTR_EVENTMASK: Mask of bits indicating which conntrack event types + * (enum ip_conntrack_events IPCT_*) should be reported. For any bit set to + * zero, the corresponding event type is not generated. Default behavior + * depends on system configuration, but typically all event types are + * generated, hence listening on NFNLGRP_CONNTRACK_UPDATE events may get a lot + * of events. Explicitly passing this attribute allows limiting the updates + * received to the events of interest. The bit 1 << IPCT_NEW, 1 << + * IPCT_RELATED, and 1 << IPCT_DESTROY must be set to ones for those events to + * be received on NFNLGRP_CONNTRACK_NEW and NFNLGRP_CONNTRACK_DESTROY groups, + * respectively. Remaining bits control the changes for which an event is + * delivered on the NFNLGRP_CONNTRACK_UPDATE group. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, @@ -704,6 +715,7 @@ enum ovs_ct_attr { related connections. */ OVS_CT_ATTR_NAT, /* Nested OVS_NAT_ATTR_* */ OVS_CT_ATTR_FORCE_COMMIT, /* No argument */ + OVS_CT_ATTR_EVENTMASK, /* u32 mask of IPCT_* events. */ __OVS_CT_ATTR_MAX }; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 58de4c2da673..4f7c3b5c080b 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -66,7 +66,9 @@ struct ovs_conntrack_info { u8 commit : 1; u8 nat : 3; /* enum ovs_ct_nat */ u8 force : 1; + u8 have_eventmask : 1; u16 family; + u32 eventmask; /* Mask of 1 << IPCT_*. */ struct md_mark mark; struct md_labels labels; #ifdef CONFIG_NF_NAT_NEEDED @@ -1007,6 +1009,20 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, if (!ct) return 0; + /* Set the conntrack event mask if given. NEW and DELETE events have + * their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener + * typically would receive many kinds of updates. Setting the event + * mask allows those events to be filtered. The set event mask will + * remain in effect for the lifetime of the connection unless changed + * by a further CT action with both the commit flag and the eventmask + * option. */ + if (info->have_eventmask) { + struct nf_conntrack_ecache *cache = nf_ct_ecache_find(ct); + + if (cache) + cache->ctmask = info->eventmask; + } + /* Apply changes before confirming the connection so that the initial * conntrack NEW netlink event carries the values given in the CT * action. @@ -1238,6 +1254,8 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { /* NAT length is checked when parsing the nested attributes. */ [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX }, #endif + [OVS_CT_ATTR_EVENTMASK] = { .minlen = sizeof(u32), + .maxlen = sizeof(u32) }, }; static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, @@ -1316,6 +1334,11 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, break; } #endif + case OVS_CT_ATTR_EVENTMASK: + info->have_eventmask = true; + info->eventmask = nla_get_u32(a); + break; + default: OVS_NLERR(log, "Unknown conntrack attr (%d)", type); @@ -1515,6 +1538,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, ct_info->helper->name)) return -EMSGSIZE; } + if (ct_info->have_eventmask && + nla_put_u32(skb, OVS_CT_ATTR_EVENTMASK, ct_info->eventmask)) + return -EMSGSIZE; + #ifdef CONFIG_NF_NAT_NEEDED if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb)) return -EMSGSIZE; -- cgit v1.3 From 46c2fa39877ed70415ee2b1acfb9129e956f6de4 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 20 Apr 2017 14:45:47 -0700 Subject: net/tcp_fastopen: Add snmp counter for blackhole detection This counter records the number of times the firewall blackhole issue is detected and active TFO is disabled. Signed-off-by: Wei Wang Acked-by: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller --- include/net/tcp.h | 2 +- include/uapi/linux/snmp.h | 1 + net/ipv4/proc.c | 1 + net/ipv4/tcp_fastopen.c | 5 +++-- net/ipv4/tcp_input.c | 4 ++-- 5 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/tcp.h b/include/net/tcp.h index c1abc2abbdcb..da28bef1d82b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1507,7 +1507,7 @@ struct tcp_fastopen_context { }; extern unsigned int sysctl_tcp_fastopen_blackhole_timeout; -void tcp_fastopen_active_disable(void); +void tcp_fastopen_active_disable(struct sock *sk); bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); void tcp_fastopen_active_timeout_reset(void); diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index cec0e171d20c..95cffcb21dfd 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -259,6 +259,7 @@ enum LINUX_MIB_TCPFASTOPENPASSIVEFAIL, /* TCPFastOpenPassiveFail */ LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ + LINUX_MIB_TCPFASTOPENBLACKHOLE, /* TCPFastOpenBlackholeDetect */ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ LINUX_MIB_BUSYPOLLRXPACKETS, /* BusyPollRxPackets */ LINUX_MIB_TCPAUTOCORKING, /* TCPAutoCorking */ diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 4ccbf464d1ac..fa44e752a9a3 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -281,6 +281,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), + SNMP_MIB_ITEM("TCPFastOpenBlackhole", LINUX_MIB_TCPFASTOPENBLACKHOLE), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index ff2d30ffc6f3..4af82b914dd4 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -410,10 +410,11 @@ static unsigned long tfo_active_disable_stamp __read_mostly; /* Disable active TFO and record current jiffies and * tfo_active_disable_times */ -void tcp_fastopen_active_disable(void) +void tcp_fastopen_active_disable(struct sock *sk) { atomic_inc(&tfo_active_disable_times); tfo_active_disable_stamp = jiffies; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE); } /* Reset tfo_active_disable_times to 0 */ @@ -469,7 +470,7 @@ void tcp_fastopen_active_disable_ofo_check(struct sock *sk) if (p && !rb_next(p)) { skb = rb_entry(p, struct sk_buff, rbnode); if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { - tcp_fastopen_active_disable(); + tcp_fastopen_active_disable(sk); return; } } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9f342a67dc74..5af2f04f8859 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5307,7 +5307,7 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, */ if (tp->syn_fastopen && !tp->data_segs_in && sk->sk_state == TCP_ESTABLISHED) - tcp_fastopen_active_disable(); + tcp_fastopen_active_disable(sk); tcp_send_challenge_ack(sk, skb); } goto discard; @@ -6061,7 +6061,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { /* Receive out of order FIN after close() */ if (tp->syn_fastopen && th->fin) - tcp_fastopen_active_disable(); + tcp_fastopen_active_disable(sk); tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; -- cgit v1.3 From a577d8f793ff2fd514915686079e3c09bcf0df11 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Sat, 22 Apr 2017 16:52:47 -0400 Subject: cls_flower: add support for matching MPLS fields (v2) Add support to the tc flower classifier to match based on fields in MPLS labels (TTL, Bottom of Stack, TC field, Label). Signed-off-by: Benjamin LaHaise Signed-off-by: Benjamin LaHaise Reviewed-by: Jakub Kicinski Cc: "David S. Miller" Cc: Simon Horman Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: Eric Dumazet Cc: Hadar Hen Zion Cc: Gao Feng Acked-by: Jiri Pirko Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 5 +++ net/sched/cls_flower.c | 74 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7a69f2a4ca0c..f1129e383b2a 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -432,6 +432,11 @@ enum { TCA_FLOWER_KEY_ARP_THA, /* ETH_ALEN */ TCA_FLOWER_KEY_ARP_THA_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_MPLS_TTL, /* u8 - 8 bits */ + TCA_FLOWER_KEY_MPLS_BOS, /* u8 - 1 bit */ + TCA_FLOWER_KEY_MPLS_TC, /* u8 - 3 bits */ + TCA_FLOWER_KEY_MPLS_LABEL, /* be32 - 20 bits */ + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 31ee3404aeb4..3ecf07666df3 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -47,6 +48,7 @@ struct fl_flow_key { struct flow_dissector_key_ipv6_addrs enc_ipv6; }; struct flow_dissector_key_ports enc_tp; + struct flow_dissector_key_mpls mpls; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -418,6 +420,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ARP_SHA_MASK] = { .len = ETH_ALEN }, [TCA_FLOWER_KEY_ARP_THA] = { .len = ETH_ALEN }, [TCA_FLOWER_KEY_ARP_THA_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_MPLS_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -433,6 +439,31 @@ static void fl_set_key_val(struct nlattr **tb, memcpy(mask, nla_data(tb[mask_type]), len); } +static void fl_set_key_mpls(struct nlattr **tb, + struct flow_dissector_key_mpls *key_val, + struct flow_dissector_key_mpls *key_mask) +{ + if (tb[TCA_FLOWER_KEY_MPLS_TTL]) { + key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); + key_mask->mpls_ttl = MPLS_TTL_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_BOS]) { + key_val->mpls_bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]); + key_mask->mpls_bos = MPLS_BOS_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_TC]) { + key_val->mpls_tc = + nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]) & MPLS_TC_MASK; + key_mask->mpls_tc = MPLS_TC_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) { + key_val->mpls_label = + nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]) & + MPLS_LABEL_MASK; + key_mask->mpls_label = MPLS_LABEL_MASK; + } +} + static void fl_set_key_vlan(struct nlattr **tb, struct flow_dissector_key_vlan *key_val, struct flow_dissector_key_vlan *key_mask) @@ -589,6 +620,9 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->icmp.code, TCA_FLOWER_KEY_ICMPV6_CODE_MASK, sizeof(key->icmp.code)); + } else if (key->basic.n_proto == htons(ETH_P_MPLS_UC) || + key->basic.n_proto == htons(ETH_P_MPLS_MC)) { + fl_set_key_mpls(tb, &key->mpls, &mask->mpls); } else if (key->basic.n_proto == htons(ETH_P_ARP) || key->basic.n_proto == htons(ETH_P_RARP)) { fl_set_key_val(tb, &key->arp.sip, TCA_FLOWER_KEY_ARP_SIP, @@ -724,6 +758,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FLOW_DISSECTOR_KEY_ICMP, icmp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ARP, arp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_MPLS, mpls); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, @@ -991,6 +1027,41 @@ static int fl_dump_key_val(struct sk_buff *skb, return 0; } +static int fl_dump_key_mpls(struct sk_buff *skb, + struct flow_dissector_key_mpls *mpls_key, + struct flow_dissector_key_mpls *mpls_mask) +{ + int err; + + if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask))) + return 0; + if (mpls_mask->mpls_ttl) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL, + mpls_key->mpls_ttl); + if (err) + return err; + } + if (mpls_mask->mpls_tc) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TC, + mpls_key->mpls_tc); + if (err) + return err; + } + if (mpls_mask->mpls_label) { + err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_LABEL, + mpls_key->mpls_label); + if (err) + return err; + } + if (mpls_mask->mpls_bos) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_BOS, + mpls_key->mpls_bos); + if (err) + return err; + } + return 0; +} + static int fl_dump_key_vlan(struct sk_buff *skb, struct flow_dissector_key_vlan *vlan_key, struct flow_dissector_key_vlan *vlan_mask) @@ -1096,6 +1167,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->basic.n_proto))) goto nla_put_failure; + if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls)) + goto nla_put_failure; + if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan)) goto nla_put_failure; -- cgit v1.3 From a8f820a380a2a06fc4fe1a54159067958f800929 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 25 Apr 2017 08:19:44 +0200 Subject: can: add Virtual CAN Tunnel driver (vxcan) Similar to the virtual ethernet driver veth, vxcan implements a local CAN traffic tunnel between two virtual CAN network devices. See Kconfig entry for details. Signed-off-by: Oliver Hartkopp Signed-off-by: Marc Kleine-Budde --- drivers/net/can/Kconfig | 18 +++ drivers/net/can/Makefile | 1 + drivers/net/can/vxcan.c | 316 +++++++++++++++++++++++++++++++++++++++++ include/uapi/linux/can/vxcan.h | 12 ++ 4 files changed, 347 insertions(+) create mode 100644 drivers/net/can/vxcan.c create mode 100644 include/uapi/linux/can/vxcan.h (limited to 'include/uapi/linux') diff --git a/drivers/net/can/Kconfig b/drivers/net/can/Kconfig index aa20b69d2a1a..ac4ff394bc56 100644 --- a/drivers/net/can/Kconfig +++ b/drivers/net/can/Kconfig @@ -9,6 +9,24 @@ config CAN_VCAN This driver can also be built as a module. If so, the module will be called vcan. +config CAN_VXCAN + tristate "Virtual CAN Tunnel (vxcan)" + ---help--- + Similar to the virtual ethernet driver veth, vxcan implements a + local CAN traffic tunnel between two virtual CAN network devices. + When creating a vxcan, two vxcan devices are created as pair. + When one end receives the packet it appears on its pair and vice + versa. The vxcan can be used for cross namespace communication. + + In opposite to vcan loopback devices the vxcan only forwards CAN + frames to its pair and does *not* provide a local echo of sent + CAN frames. To disable a potential echo in af_can.c the vxcan driver + announces IFF_ECHO in the interface flags. To have a clean start + in each namespace the CAN GW hop counter is set to zero. + + This driver can also be built as a module. If so, the module + will be called vxcan. + config CAN_SLCAN tristate "Serial / USB serial CAN Adaptors (slcan)" depends on TTY diff --git a/drivers/net/can/Makefile b/drivers/net/can/Makefile index 8581e2b3e87f..4aabbee133b8 100644 --- a/drivers/net/can/Makefile +++ b/drivers/net/can/Makefile @@ -3,6 +3,7 @@ # obj-$(CONFIG_CAN_VCAN) += vcan.o +obj-$(CONFIG_CAN_VXCAN) += vxcan.o obj-$(CONFIG_CAN_SLCAN) += slcan.o obj-$(CONFIG_CAN_DEV) += can-dev.o diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c new file mode 100644 index 000000000000..7fbb24795681 --- /dev/null +++ b/drivers/net/can/vxcan.c @@ -0,0 +1,316 @@ +/* + * vxcan.c - Virtual CAN Tunnel for cross namespace communication + * + * This code is derived from drivers/net/can/vcan.c for the virtual CAN + * specific parts and from drivers/net/veth.c to implement the netlink API + * for network interface pairs in a common and established way. + * + * Copyright (c) 2017 Oliver Hartkopp + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the version 2 of the GNU General Public License + * as published by the Free Software Foundation + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DRV_NAME "vxcan" + +MODULE_DESCRIPTION("Virtual CAN Tunnel"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Oliver Hartkopp "); +MODULE_ALIAS_RTNL_LINK(DRV_NAME); + +struct vxcan_priv { + struct net_device __rcu *peer; +}; + +static netdev_tx_t vxcan_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct vxcan_priv *priv = netdev_priv(dev); + struct net_device *peer; + struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + struct net_device_stats *peerstats, *srcstats = &dev->stats; + + if (can_dropped_invalid_skb(dev, skb)) + return NETDEV_TX_OK; + + rcu_read_lock(); + peer = rcu_dereference(priv->peer); + if (unlikely(!peer)) { + kfree_skb(skb); + dev->stats.tx_dropped++; + goto out_unlock; + } + + skb = can_create_echo_skb(skb); + if (!skb) + goto out_unlock; + + /* reset CAN GW hop counter */ + skb->csum_start = 0; + skb->pkt_type = PACKET_BROADCAST; + skb->dev = peer; + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (netif_rx_ni(skb) == NET_RX_SUCCESS) { + srcstats->tx_packets++; + srcstats->tx_bytes += cfd->len; + peerstats = &peer->stats; + peerstats->rx_packets++; + peerstats->rx_bytes += cfd->len; + } + +out_unlock: + rcu_read_unlock(); + return NETDEV_TX_OK; +} + + +static int vxcan_open(struct net_device *dev) +{ + struct vxcan_priv *priv = netdev_priv(dev); + struct net_device *peer = rtnl_dereference(priv->peer); + + if (!peer) + return -ENOTCONN; + + if (peer->flags & IFF_UP) { + netif_carrier_on(dev); + netif_carrier_on(peer); + } + return 0; +} + +static int vxcan_close(struct net_device *dev) +{ + struct vxcan_priv *priv = netdev_priv(dev); + struct net_device *peer = rtnl_dereference(priv->peer); + + netif_carrier_off(dev); + if (peer) + netif_carrier_off(peer); + + return 0; +} + +static int vxcan_get_iflink(const struct net_device *dev) +{ + struct vxcan_priv *priv = netdev_priv(dev); + struct net_device *peer; + int iflink; + + rcu_read_lock(); + peer = rcu_dereference(priv->peer); + iflink = peer ? peer->ifindex : 0; + rcu_read_unlock(); + + return iflink; +} + +static int vxcan_change_mtu(struct net_device *dev, int new_mtu) +{ + /* Do not allow changing the MTU while running */ + if (dev->flags & IFF_UP) + return -EBUSY; + + if (new_mtu != CAN_MTU && new_mtu != CANFD_MTU) + return -EINVAL; + + dev->mtu = new_mtu; + return 0; +} + +static const struct net_device_ops vxcan_netdev_ops = { + .ndo_open = vxcan_open, + .ndo_stop = vxcan_close, + .ndo_start_xmit = vxcan_xmit, + .ndo_get_iflink = vxcan_get_iflink, + .ndo_change_mtu = vxcan_change_mtu, +}; + +static void vxcan_setup(struct net_device *dev) +{ + dev->type = ARPHRD_CAN; + dev->mtu = CAN_MTU; + dev->hard_header_len = 0; + dev->addr_len = 0; + dev->tx_queue_len = 0; + dev->flags = (IFF_NOARP|IFF_ECHO); + dev->netdev_ops = &vxcan_netdev_ops; + dev->destructor = free_netdev; +} + +/* forward declaration for rtnl_create_link() */ +static struct rtnl_link_ops vxcan_link_ops; + +static int vxcan_newlink(struct net *net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct vxcan_priv *priv; + struct net_device *peer; + struct net *peer_net; + + struct nlattr *peer_tb[IFLA_MAX + 1], **tbp = tb; + char ifname[IFNAMSIZ]; + unsigned char name_assign_type; + struct ifinfomsg *ifmp = NULL; + int err; + + /* register peer device */ + if (data && data[VXCAN_INFO_PEER]) { + struct nlattr *nla_peer; + + nla_peer = data[VXCAN_INFO_PEER]; + ifmp = nla_data(nla_peer); + err = rtnl_nla_parse_ifla(peer_tb, + nla_data(nla_peer) + + sizeof(struct ifinfomsg), + nla_len(nla_peer) - + sizeof(struct ifinfomsg), + NULL); + if (err < 0) + return err; + + tbp = peer_tb; + } + + if (tbp[IFLA_IFNAME]) { + nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); + name_assign_type = NET_NAME_USER; + } else { + snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); + name_assign_type = NET_NAME_ENUM; + } + + peer_net = rtnl_link_get_net(net, tbp); + if (IS_ERR(peer_net)) + return PTR_ERR(peer_net); + + peer = rtnl_create_link(peer_net, ifname, name_assign_type, + &vxcan_link_ops, tbp); + if (IS_ERR(peer)) { + put_net(peer_net); + return PTR_ERR(peer); + } + + if (ifmp && dev->ifindex) + peer->ifindex = ifmp->ifi_index; + + err = register_netdevice(peer); + put_net(peer_net); + peer_net = NULL; + if (err < 0) { + free_netdev(peer); + return err; + } + + netif_carrier_off(peer); + + err = rtnl_configure_link(peer, ifmp); + if (err < 0) { + unregister_netdevice(peer); + return err; + } + + /* register first device */ + if (tb[IFLA_IFNAME]) + nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); + else + snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); + + err = register_netdevice(dev); + if (err < 0) { + unregister_netdevice(peer); + return err; + } + + netif_carrier_off(dev); + + /* cross link the device pair */ + priv = netdev_priv(dev); + rcu_assign_pointer(priv->peer, peer); + + priv = netdev_priv(peer); + rcu_assign_pointer(priv->peer, dev); + + return 0; +} + +static void vxcan_dellink(struct net_device *dev, struct list_head *head) +{ + struct vxcan_priv *priv; + struct net_device *peer; + + priv = netdev_priv(dev); + peer = rtnl_dereference(priv->peer); + + /* Note : dellink() is called from default_device_exit_batch(), + * before a rcu_synchronize() point. The devices are guaranteed + * not being freed before one RCU grace period. + */ + RCU_INIT_POINTER(priv->peer, NULL); + unregister_netdevice_queue(dev, head); + + if (peer) { + priv = netdev_priv(peer); + RCU_INIT_POINTER(priv->peer, NULL); + unregister_netdevice_queue(peer, head); + } +} + +static const struct nla_policy vxcan_policy[VXCAN_INFO_MAX + 1] = { + [VXCAN_INFO_PEER] = { .len = sizeof(struct ifinfomsg) }, +}; + +static struct net *vxcan_get_link_net(const struct net_device *dev) +{ + struct vxcan_priv *priv = netdev_priv(dev); + struct net_device *peer = rtnl_dereference(priv->peer); + + return peer ? dev_net(peer) : dev_net(dev); +} + +static struct rtnl_link_ops vxcan_link_ops = { + .kind = DRV_NAME, + .priv_size = sizeof(struct vxcan_priv), + .setup = vxcan_setup, + .newlink = vxcan_newlink, + .dellink = vxcan_dellink, + .policy = vxcan_policy, + .maxtype = VXCAN_INFO_MAX, + .get_link_net = vxcan_get_link_net, +}; + +static __init int vxcan_init(void) +{ + pr_info("vxcan: Virtual CAN Tunnel driver\n"); + + return rtnl_link_register(&vxcan_link_ops); +} + +static __exit void vxcan_exit(void) +{ + rtnl_link_unregister(&vxcan_link_ops); +} + +module_init(vxcan_init); +module_exit(vxcan_exit); diff --git a/include/uapi/linux/can/vxcan.h b/include/uapi/linux/can/vxcan.h new file mode 100644 index 000000000000..ffb0b7156f7e --- /dev/null +++ b/include/uapi/linux/can/vxcan.h @@ -0,0 +1,12 @@ +#ifndef _UAPI_CAN_VXCAN_H +#define _UAPI_CAN_VXCAN_H + +enum { + VXCAN_INFO_UNSPEC, + VXCAN_INFO_PEER, + + __VXCAN_INFO_MAX +#define VXCAN_INFO_MAX (__VXCAN_INFO_MAX - 1) +}; + +#endif -- cgit v1.3 From b5cdae3291f7be7a34e75affe4c0ec1f7f328b64 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 18 Apr 2017 15:36:58 -0400 Subject: net: Generic XDP This provides a generic SKB based non-optimized XDP path which is used if either the driver lacks a specific XDP implementation, or the user requests it via a new IFLA_XDP_FLAGS value named XDP_FLAGS_SKB_MODE. It is arguable that perhaps I should have required something like this as part of the initial XDP feature merge. I believe this is critical for two reasons: 1) Accessibility. More people can play with XDP with less dependencies. Yes I know we have XDP support in virtio_net, but that just creates another depedency for learning how to use this facility. I wrote this to make life easier for the XDP newbies. 2) As a model for what the expected semantics are. If there is a pure generic core implementation, it serves as a semantic example for driver folks adding XDP support. One thing I have not tried to address here is the issue of XDP_PACKET_HEADROOM, thanks to Daniel for spotting that. It seems incredibly expensive to do a skb_cow(skb, XDP_PACKET_HEADROOM) or whatever even if the XDP program doesn't try to push headers at all. I think we really need the verifier to somehow propagate whether certain XDP helpers are used or not. v5: - Handle both negative and positive offset after running prog - Fix mac length in XDP_TX case (Alexei) - Use rcu_dereference_protected() in free_netdev (kbuild test robot) v4: - Fix MAC header adjustmnet before calling prog (David Ahern) - Disable LRO when generic XDP is installed (Michael Chan) - Bypass qdisc et al. on XDP_TX and record the event (Alexei) - Do not perform generic XDP on reinjected packets (DaveM) v3: - Make sure XDP program sees packet at MAC header, push back MAC header if we do XDP_TX. (Alexei) - Elide GRO when generic XDP is in use. (Alexei) - Add XDP_FLAG_SKB_MODE flag which the user can use to request generic XDP even if the driver has an XDP implementation. (Alexei) - Report whether SKB mode is in use in rtnl_xdp_fill() via XDP_FLAGS attribute. (Daniel) v2: - Add some "fall through" comments in switch statements based upon feedback from Andrew Lunn - Use RCU for generic xdp_prog, thanks to Johannes Berg. Tested-by: Andy Gospodarek Tested-by: Jesper Dangaard Brouer Tested-by: David Ahern Signed-off-by: David S. Miller --- include/linux/netdevice.h | 8 +++ include/uapi/linux/if_link.h | 4 +- net/core/dev.c | 155 +++++++++++++++++++++++++++++++++++++++++-- net/core/gro_cells.c | 2 +- net/core/rtnetlink.c | 40 ++++++----- 5 files changed, 187 insertions(+), 22 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5d5267febd56..46d220c2bf92 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1905,9 +1905,17 @@ struct net_device { struct lock_class_key *qdisc_tx_busylock; struct lock_class_key *qdisc_running_key; bool proto_down; + struct bpf_prog __rcu *xdp_prog; }; #define to_net_dev(d) container_of(d, struct net_device, dev) +static inline bool netif_elide_gro(const struct net_device *dev) +{ + if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog) + return true; + return false; +} + #define NETDEV_ALIGN 32 static inline diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 8b405afb2376..633aa0276d32 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -887,7 +887,9 @@ enum { /* XDP section */ #define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) -#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST) +#define XDP_FLAGS_SKB_MODE (2U << 0) +#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ + XDP_FLAGS_SKB_MODE) enum { IFLA_XDP_UNSPEC, diff --git a/net/core/dev.c b/net/core/dev.c index db6e31564d06..1b3317c026c6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -95,6 +95,7 @@ #include #include #include +#include #include #include #include @@ -4251,6 +4252,125 @@ static int __netif_receive_skb(struct sk_buff *skb) return ret; } +static struct static_key generic_xdp_needed __read_mostly; + +static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) +{ + struct bpf_prog *new = xdp->prog; + int ret = 0; + + switch (xdp->command) { + case XDP_SETUP_PROG: { + struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); + + rcu_assign_pointer(dev->xdp_prog, new); + if (old) + bpf_prog_put(old); + + if (old && !new) { + static_key_slow_dec(&generic_xdp_needed); + } else if (new && !old) { + static_key_slow_inc(&generic_xdp_needed); + dev_disable_lro(dev); + } + break; + } + + case XDP_QUERY_PROG: + xdp->prog_attached = !!rcu_access_pointer(dev->xdp_prog); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct xdp_buff xdp; + u32 act = XDP_DROP; + void *orig_data; + int hlen, off; + u32 mac_len; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_cloned(skb)) + return XDP_PASS; + + if (skb_linearize(skb)) + goto do_drop; + + /* The XDP program wants to see the packet starting at the MAC + * header. + */ + mac_len = skb->data - skb_mac_header(skb); + hlen = skb_headlen(skb) + mac_len; + xdp.data = skb->data - mac_len; + xdp.data_end = xdp.data + hlen; + xdp.data_hard_start = skb->data - skb_headroom(skb); + orig_data = xdp.data; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + off = xdp.data - orig_data; + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + switch (act) { + case XDP_TX: + __skb_push(skb, mac_len); + /* fall through */ + case XDP_PASS: + break; + + default: + bpf_warn_invalid_xdp_action(act); + /* fall through */ + case XDP_ABORTED: + trace_xdp_exception(skb->dev, xdp_prog, act); + /* fall through */ + case XDP_DROP: + do_drop: + kfree_skb(skb); + break; + } + + return act; +} + +/* When doing generic XDP we have to bypass the qdisc layer and the + * network taps in order to match in-driver-XDP behavior. + */ +static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + bool free_skb = true; + int cpu, rc; + + txq = netdev_pick_tx(dev, skb, NULL); + cpu = smp_processor_id(); + HARD_TX_LOCK(dev, txq, cpu); + if (!netif_xmit_stopped(txq)) { + rc = netdev_start_xmit(skb, dev, txq, 0); + if (dev_xmit_complete(rc)) + free_skb = false; + } + HARD_TX_UNLOCK(dev, txq); + if (free_skb) { + trace_xdp_exception(dev, xdp_prog, XDP_TX); + kfree_skb(skb); + } +} + static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; @@ -4262,6 +4382,21 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); + if (static_key_false(&generic_xdp_needed)) { + struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); + + if (xdp_prog) { + u32 act = netif_receive_generic_xdp(skb, xdp_prog); + + if (act != XDP_PASS) { + rcu_read_unlock(); + if (act == XDP_TX) + generic_xdp_tx(skb, xdp_prog); + return NET_RX_DROP; + } + } + } + #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -4494,7 +4629,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff enum gro_result ret; int grow; - if (!(skb->dev->features & NETIF_F_GRO)) + if (netif_elide_gro(skb->dev)) goto normal; if (skb->csum_bad) @@ -6723,6 +6858,7 @@ EXPORT_SYMBOL(dev_change_proto_down); */ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) { + int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp); const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; struct netdev_xdp xdp; @@ -6730,14 +6866,16 @@ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) ASSERT_RTNL(); - if (!ops->ndo_xdp) - return -EOPNOTSUPP; + xdp_op = ops->ndo_xdp; + if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) + xdp_op = generic_xdp_install; + if (fd >= 0) { if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_QUERY_PROG; - err = ops->ndo_xdp(dev, &xdp); + err = xdp_op(dev, &xdp); if (err < 0) return err; if (xdp.prog_attached) @@ -6753,7 +6891,7 @@ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) xdp.command = XDP_SETUP_PROG; xdp.prog = prog; - err = ops->ndo_xdp(dev, &xdp); + err = xdp_op(dev, &xdp); if (err < 0 && prog) bpf_prog_put(prog); @@ -7793,6 +7931,7 @@ EXPORT_SYMBOL(alloc_netdev_mqs); void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; + struct bpf_prog *prog; might_sleep(); netif_free_tx_queues(dev); @@ -7811,6 +7950,12 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; + prog = rcu_dereference_protected(dev->xdp_prog, 1); + if (prog) { + bpf_prog_put(prog); + static_key_slow_dec(&generic_xdp_needed); + } + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index c98bbfbd26b8..814e58a3ce8b 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -13,7 +13,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) struct net_device *dev = skb->dev; struct gro_cell *cell; - if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO)) + if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) return netif_rx(skb); cell = this_cpu_ptr(gcells->cells); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 088f9c8b4196..9031a6c8bfa7 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -896,15 +896,13 @@ static size_t rtnl_port_size(const struct net_device *dev, return port_self_size; } -static size_t rtnl_xdp_size(const struct net_device *dev) +static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ - nla_total_size(1); /* XDP_ATTACHED */ + nla_total_size(1) + /* XDP_ATTACHED */ + nla_total_size(4); /* XDP_FLAGS */ - if (!dev->netdev_ops->ndo_xdp) - return 0; - else - return xdp_size; + return xdp_size; } static noinline size_t if_nlmsg_size(const struct net_device *dev, @@ -943,7 +941,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ - + rtnl_xdp_size(dev) /* IFLA_XDP */ + + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1251,23 +1249,35 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { - struct netdev_xdp xdp_op = {}; struct nlattr *xdp; + u32 xdp_flags = 0; + u8 val = 0; int err; - if (!dev->netdev_ops->ndo_xdp) - return 0; xdp = nla_nest_start(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; - xdp_op.command = XDP_QUERY_PROG; - err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); - if (err) - goto err_cancel; - err = nla_put_u8(skb, IFLA_XDP_ATTACHED, xdp_op.prog_attached); + if (rcu_access_pointer(dev->xdp_prog)) { + xdp_flags = XDP_FLAGS_SKB_MODE; + val = 1; + } else if (dev->netdev_ops->ndo_xdp) { + struct netdev_xdp xdp_op = {}; + + xdp_op.command = XDP_QUERY_PROG; + err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); + if (err) + goto err_cancel; + val = xdp_op.prog_attached; + } + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, val); if (err) goto err_cancel; + if (xdp_flags) { + err = nla_put_u32(skb, IFLA_XDP_FLAGS, xdp_flags); + if (err) + goto err_cancel; + } nla_nest_end(skb, xdp); return 0; -- cgit v1.3 From ca986ad9bcd3893c8b0b4cc2cafcc8cf1554409c Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Fri, 21 Apr 2017 13:05:00 +0100 Subject: nl80211: allow multiple active scheduled scan requests This patch implements the idea to have multiple scheduled scan requests running concurrently. It mainly illustrates how to deal with the incoming request from user-space in terms of backward compatibility. In order to use multiple scheduled scans user-space needs to provide a flag attribute NL80211_ATTR_SCHED_SCAN_MULTI to indicate support. If not the request is treated as a legacy scan. Drivers currently supporting scheduled scan are now indicating they support a single scheduled scan request. This obsoletes WIPHY_FLAG_SUPPORTS_SCHED_SCAN. Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel [clean up netlink destroy path to avoid allocations, code cleanups] Signed-off-by: Johannes Berg --- drivers/net/wireless/ath/ath6kl/cfg80211.c | 2 +- .../broadcom/brcm80211/brcmfmac/cfg80211.c | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 2 +- drivers/net/wireless/marvell/mwifiex/cfg80211.c | 2 +- drivers/net/wireless/ti/wlcore/main.c | 2 +- include/net/cfg80211.h | 9 +- include/uapi/linux/nl80211.h | 12 ++- net/wireless/core.c | 29 +++--- net/wireless/core.h | 11 +- net/wireless/nl80211.c | 63 ++++++++--- net/wireless/rdev-ops.h | 2 +- net/wireless/scan.c | 115 +++++++++++++++++---- net/wireless/trace.h | 18 ++-- 13 files changed, 205 insertions(+), 64 deletions(-) (limited to 'include/uapi/linux') diff --git a/drivers/net/wireless/ath/ath6kl/cfg80211.c b/drivers/net/wireless/ath/ath6kl/cfg80211.c index 0c118b7c362c..1906412afa70 100644 --- a/drivers/net/wireless/ath/ath6kl/cfg80211.c +++ b/drivers/net/wireless/ath/ath6kl/cfg80211.c @@ -3973,7 +3973,7 @@ int ath6kl_cfg80211_init(struct ath6kl *ar) WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD; if (test_bit(ATH6KL_FW_CAPABILITY_SCHED_SCAN_V2, ar->fw_capabilities)) - ar->wiphy->flags |= WIPHY_FLAG_SUPPORTS_SCHED_SCAN; + ar->wiphy->max_sched_scan_reqs = 1; if (test_bit(ATH6KL_FW_CAPABILITY_INACTIVITY_TIMEOUT, ar->fw_capabilities)) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index 8c7f1ef288c6..c71173dc9965 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -6374,11 +6374,11 @@ err: static void brcmf_wiphy_pno_params(struct wiphy *wiphy) { /* scheduled scan settings */ + wiphy->max_sched_scan_reqs = 1; wiphy->max_sched_scan_ssids = BRCMF_PNO_MAX_PFN_COUNT; wiphy->max_match_sets = BRCMF_PNO_MAX_PFN_COUNT; wiphy->max_sched_scan_ie_len = BRCMF_SCAN_IE_LEN_MAX; wiphy->max_sched_scan_plan_interval = BRCMF_PNO_SCHED_SCAN_MAX_PERIOD; - wiphy->flags |= WIPHY_FLAG_SUPPORTS_SCHED_SCAN; } #ifdef CONFIG_PM diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 5cdd95775ba6..8c58d47100a0 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -620,7 +620,7 @@ int iwl_mvm_mac_setup_register(struct iwl_mvm *mvm) else hw->wiphy->flags &= ~WIPHY_FLAG_PS_ON_BY_DEFAULT; - hw->wiphy->flags |= WIPHY_FLAG_SUPPORTS_SCHED_SCAN; + hw->wiphy->max_sched_scan_reqs = 1; hw->wiphy->max_sched_scan_ssids = PROBE_OPTION_MAX; hw->wiphy->max_match_sets = IWL_SCAN_MAX_PROFILES; /* we create the 802.11 header and zero length SSID IE. */ diff --git a/drivers/net/wireless/marvell/mwifiex/cfg80211.c b/drivers/net/wireless/marvell/mwifiex/cfg80211.c index 49b4c805b7d5..9927bd5aac56 100644 --- a/drivers/net/wireless/marvell/mwifiex/cfg80211.c +++ b/drivers/net/wireless/marvell/mwifiex/cfg80211.c @@ -4297,7 +4297,6 @@ int mwifiex_register_cfg80211(struct mwifiex_adapter *adapter) wiphy->flags |= WIPHY_FLAG_HAVE_AP_SME | WIPHY_FLAG_AP_PROBE_RESP_OFFLOAD | WIPHY_FLAG_AP_UAPSD | - WIPHY_FLAG_SUPPORTS_SCHED_SCAN | WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL | WIPHY_FLAG_HAS_CHANNEL_SWITCH | WIPHY_FLAG_PS_ON_BY_DEFAULT; @@ -4316,6 +4315,7 @@ int mwifiex_register_cfg80211(struct mwifiex_adapter *adapter) NL80211_PROBE_RESP_OFFLOAD_SUPPORT_WPS2 | NL80211_PROBE_RESP_OFFLOAD_SUPPORT_P2P; + wiphy->max_sched_scan_reqs = 1; wiphy->max_sched_scan_ssids = MWIFIEX_MAX_SSID_LIST_LENGTH; wiphy->max_sched_scan_ie_len = MWIFIEX_MAX_VSIE_LEN; wiphy->max_match_sets = MWIFIEX_MAX_SSID_LIST_LENGTH; diff --git a/drivers/net/wireless/ti/wlcore/main.c b/drivers/net/wireless/ti/wlcore/main.c index a21fda910529..382ec15ec1af 100644 --- a/drivers/net/wireless/ti/wlcore/main.c +++ b/drivers/net/wireless/ti/wlcore/main.c @@ -6128,6 +6128,7 @@ static int wl1271_init_ieee80211(struct wl1271 *wl) wl->hw->wiphy->max_scan_ie_len = WL1271_CMD_TEMPL_MAX_SIZE - sizeof(struct ieee80211_header); + wl->hw->wiphy->max_sched_scan_reqs = 1; wl->hw->wiphy->max_sched_scan_ie_len = WL1271_CMD_TEMPL_MAX_SIZE - sizeof(struct ieee80211_header); @@ -6135,7 +6136,6 @@ static int wl1271_init_ieee80211(struct wl1271 *wl) wl->hw->wiphy->flags |= WIPHY_FLAG_AP_UAPSD | WIPHY_FLAG_HAS_REMAIN_ON_CHANNEL | - WIPHY_FLAG_SUPPORTS_SCHED_SCAN | WIPHY_FLAG_HAS_CHANNEL_SWITCH; wl->hw->wiphy->features |= NL80211_FEATURE_AP_SCAN; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index af958938b3b1..43c0f389c273 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1678,6 +1678,8 @@ struct cfg80211_bss_select_adjust { * @rcu_head: RCU callback used to free the struct * @owner_nlportid: netlink portid of owner (if this should is a request * owned by a particular socket) + * @nl_owner_dead: netlink owner socket was closed - this request be freed + * @list: for keeping list of requests. * @delay: delay in seconds to use before starting the first scan * cycle. The driver may ignore this parameter and start * immediately (or at any other time), if this feature is not @@ -1722,6 +1724,8 @@ struct cfg80211_sched_scan_request { unsigned long scan_start; struct rcu_head rcu_head; u32 owner_nlportid; + bool nl_owner_dead; + struct list_head list; /* keep last */ struct ieee80211_channel *channels[0]; @@ -3213,7 +3217,7 @@ enum wiphy_flags { WIPHY_FLAG_CONTROL_PORT_PROTOCOL = BIT(7), WIPHY_FLAG_IBSS_RSN = BIT(8), WIPHY_FLAG_MESH_AUTH = BIT(10), - WIPHY_FLAG_SUPPORTS_SCHED_SCAN = BIT(11), + /* use hole at 11 */ /* use hole at 12 */ WIPHY_FLAG_SUPPORTS_FW_ROAM = BIT(13), WIPHY_FLAG_AP_UAPSD = BIT(14), @@ -3551,6 +3555,8 @@ struct wiphy_iftype_ext_capab { * this variable determines its size * @max_scan_ssids: maximum number of SSIDs the device can scan for in * any given scan + * @max_sched_scan_reqs: maximum number of scheduled scan requests that + * the device can run concurrently. * @max_sched_scan_ssids: maximum number of SSIDs the device can scan * for in any given scheduled scan * @max_match_sets: maximum number of match sets the device can handle @@ -3687,6 +3693,7 @@ struct wiphy { int bss_priv_size; u8 max_scan_ssids; + u8 max_sched_scan_reqs; u8 max_sched_scan_ssids; u8 max_match_sets; u16 max_scan_ie_len; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 6095a6c4c412..f34127d241e5 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -387,7 +387,9 @@ * are used. Extra IEs can also be passed from the userspace by * using the %NL80211_ATTR_IE attribute. The first cycle of the * scheduled scan can be delayed by %NL80211_ATTR_SCHED_SCAN_DELAY - * is supplied. + * is supplied. If the device supports multiple concurrent scheduled + * scans, it will allow such when the caller provides the flag attribute + * %NL80211_ATTR_SCHED_SCAN_MULTI to indicate user-space support for it. * @NL80211_CMD_STOP_SCHED_SCAN: stop a scheduled scan. Returns -ENOENT if * scheduled scan is not running. The caller may assume that as soon * as the call returns, it is safe to start a new scheduled scan again. @@ -2081,6 +2083,11 @@ enum nl80211_commands { * @NL80211_ATTR_PMK: PMK for the PMKSA identified by %NL80211_ATTR_PMKID. * This is used with @NL80211_CMD_SET_PMKSA. * + * @NL80211_ATTR_SCHED_SCAN_MULTI: flag attribute which user-space shall use to + * indicate that it supports multiple active scheduled scan requests. + * @NL80211_ATTR_SCHED_SCAN_MAX_REQS: indicates maximum number of scheduled + * scan request that may be active for the device (u32). + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2500,6 +2507,9 @@ enum nl80211_attrs { NL80211_ATTR_PMK, + NL80211_ATTR_SCHED_SCAN_MULTI, + NL80211_ATTR_SCHED_SCAN_MAX_REQS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, diff --git a/net/wireless/core.c b/net/wireless/core.c index 4ea28de3a636..a3c0c48afb85 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -330,14 +330,16 @@ static void cfg80211_destroy_iface_wk(struct work_struct *work) static void cfg80211_sched_scan_stop_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; + struct cfg80211_sched_scan_request *req, *tmp; rdev = container_of(work, struct cfg80211_registered_device, sched_scan_stop_wk); rtnl_lock(); - - __cfg80211_stop_sched_scan(rdev, false); - + list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { + if (req->nl_owner_dead) + cfg80211_stop_sched_scan_req(rdev, req, false); + } rtnl_unlock(); } @@ -452,6 +454,7 @@ use_default_name: spin_lock_init(&rdev->beacon_registrations_lock); spin_lock_init(&rdev->bss_lock); INIT_LIST_HEAD(&rdev->bss_list); + INIT_LIST_HEAD(&rdev->sched_scan_req_list); INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done); INIT_WORK(&rdev->sched_scan_results_wk, __cfg80211_sched_scan_results); INIT_LIST_HEAD(&rdev->mlme_unreg); @@ -1028,7 +1031,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct net_device *dev = wdev->netdev; - struct cfg80211_sched_scan_request *sched_scan_req; + struct cfg80211_sched_scan_request *pos, *tmp; ASSERT_RTNL(); ASSERT_WDEV_LOCK(wdev); @@ -1039,9 +1042,11 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, break; case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_STATION: - sched_scan_req = rtnl_dereference(rdev->sched_scan_req); - if (sched_scan_req && dev == sched_scan_req->dev) - __cfg80211_stop_sched_scan(rdev, false); + list_for_each_entry_safe(pos, tmp, &rdev->sched_scan_req_list, + list) { + if (dev == pos->dev) + cfg80211_stop_sched_scan_req(rdev, pos, false); + } #ifdef CONFIG_CFG80211_WEXT kfree(wdev->wext.ie); @@ -1116,7 +1121,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; - struct cfg80211_sched_scan_request *sched_scan_req; + struct cfg80211_sched_scan_request *pos, *tmp; if (!wdev) return NOTIFY_DONE; @@ -1193,10 +1198,10 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, ___cfg80211_scan_done(rdev, false); } - sched_scan_req = rtnl_dereference(rdev->sched_scan_req); - if (WARN_ON(sched_scan_req && - sched_scan_req->dev == wdev->netdev)) { - __cfg80211_stop_sched_scan(rdev, false); + list_for_each_entry_safe(pos, tmp, + &rdev->sched_scan_req_list, list) { + if (WARN_ON(pos && pos->dev == wdev->netdev)) + cfg80211_stop_sched_scan_req(rdev, pos, false); } rdev->opencount--; diff --git a/net/wireless/core.h b/net/wireless/core.h index f9b748e3425a..06eaf96053a8 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -74,7 +74,7 @@ struct cfg80211_registered_device { u32 bss_entries; struct cfg80211_scan_request *scan_req; /* protected by RTNL */ struct sk_buff *scan_msg; - struct cfg80211_sched_scan_request __rcu *sched_scan_req; + struct list_head sched_scan_req_list; unsigned long suspend_at; struct work_struct scan_done_wk; struct work_struct sched_scan_results_wk; @@ -416,9 +416,16 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, void __cfg80211_scan_done(struct work_struct *wk); void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool send_message); +void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev, + struct cfg80211_sched_scan_request *req); +int cfg80211_sched_scan_req_possible(struct cfg80211_registered_device *rdev, + bool want_multi); void __cfg80211_sched_scan_results(struct work_struct *wk); +int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev, + struct cfg80211_sched_scan_request *req, + bool driver_initiated); int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, - bool driver_initiated); + u64 reqid, bool driver_initiated); void cfg80211_upload_connect_keys(struct wireless_dev *wdev); int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 45f5f418e562..ac7e2314f9ec 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -419,6 +419,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = FILS_ERP_MAX_RRK_LEN }, [NL80211_ATTR_FILS_CACHE_ID] = { .len = 2 }, [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, + [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -1376,7 +1377,7 @@ static int nl80211_add_commands_unsplit(struct cfg80211_registered_device *rdev, CMD(tdls_mgmt, TDLS_MGMT); CMD(tdls_oper, TDLS_OPER); } - if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) + if (rdev->wiphy.max_sched_scan_reqs) CMD(sched_scan_start, START_SCHED_SCAN); CMD(probe_client, PROBE_CLIENT); CMD(set_noack_map, SET_NOACK_MAP); @@ -1815,6 +1816,11 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device *rdev, nla_put_flag(msg, NL80211_ATTR_WIPHY_SELF_MANAGED_REG)) goto nla_put_failure; + if (rdev->wiphy.max_sched_scan_reqs && + nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_MAX_REQS, + rdev->wiphy.max_sched_scan_reqs)) + goto nla_put_failure; + if (nla_put(msg, NL80211_ATTR_EXT_FEATURES, sizeof(rdev->wiphy.ext_features), rdev->wiphy.ext_features)) @@ -7336,14 +7342,16 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_sched_scan_request *sched_scan_req; + bool want_multi; int err; - if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) || - !rdev->ops->sched_scan_start) + if (!rdev->wiphy.max_sched_scan_reqs || !rdev->ops->sched_scan_start) return -EOPNOTSUPP; - if (rdev->sched_scan_req) - return -EINPROGRESS; + want_multi = info->attrs[NL80211_ATTR_SCHED_SCAN_MULTI]; + err = cfg80211_sched_scan_req_possible(rdev, want_multi); + if (err) + return err; sched_scan_req = nl80211_parse_sched_scan(&rdev->wiphy, wdev, info->attrs, @@ -7353,6 +7361,14 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, if (err) goto out_err; + /* leave request id zero for legacy request + * or if driver does not support multi-scheduled scan + */ + if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) { + while (!sched_scan_req->reqid) + sched_scan_req->reqid = rdev->wiphy.cookie_counter++; + } + err = rdev_sched_scan_start(rdev, dev, sched_scan_req); if (err) goto out_free; @@ -7363,7 +7379,7 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) sched_scan_req->owner_nlportid = info->snd_portid; - rcu_assign_pointer(rdev->sched_scan_req, sched_scan_req); + cfg80211_add_sched_scan_req(rdev, sched_scan_req); nl80211_send_sched_scan(sched_scan_req, NL80211_CMD_START_SCHED_SCAN); return 0; @@ -7377,13 +7393,27 @@ out_err: static int nl80211_stop_sched_scan(struct sk_buff *skb, struct genl_info *info) { + struct cfg80211_sched_scan_request *req; struct cfg80211_registered_device *rdev = info->user_ptr[0]; + u64 cookie; - if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) || - !rdev->ops->sched_scan_stop) + if (!rdev->wiphy.max_sched_scan_reqs || !rdev->ops->sched_scan_stop) return -EOPNOTSUPP; - return __cfg80211_stop_sched_scan(rdev, false); + if (info->attrs[NL80211_ATTR_COOKIE]) { + cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]); + return __cfg80211_stop_sched_scan(rdev, cookie, false); + } + + req = list_first_or_null_rcu(&rdev->sched_scan_req_list, + struct cfg80211_sched_scan_request, + list); + if (!req || req->reqid || + (req->owner_nlportid && + req->owner_nlportid != info->snd_portid)) + return -ENOENT; + + return cfg80211_stop_sched_scan_req(rdev, req, false); } static int nl80211_start_radar_detection(struct sk_buff *skb, @@ -14883,16 +14913,15 @@ static int nl80211_netlink_notify(struct notifier_block * nb, rcu_read_lock(); list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) { - struct cfg80211_sched_scan_request *sched_scan_req = - rcu_dereference(rdev->sched_scan_req); - - if (sched_scan_req && notify->portid && - sched_scan_req->owner_nlportid == notify->portid) { - sched_scan_req->owner_nlportid = 0; + struct cfg80211_sched_scan_request *sched_scan_req; - if (rdev->ops->sched_scan_stop && - rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) + list_for_each_entry_rcu(sched_scan_req, + &rdev->sched_scan_req_list, + list) { + if (sched_scan_req->owner_nlportid == notify->portid) { + sched_scan_req->nl_owner_dead = true; schedule_work(&rdev->sched_scan_stop_wk); + } } list_for_each_entry_rcu(wdev, &rdev->wiphy.wdev_list, list) { diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index e4a99989dd06..783f89c3e504 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -813,7 +813,7 @@ rdev_sched_scan_start(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *request) { int ret; - trace_rdev_sched_scan_start(&rdev->wiphy, dev, request); + trace_rdev_sched_scan_start(&rdev->wiphy, dev, request->reqid); ret = rdev->ops->sched_scan_start(&rdev->wiphy, dev, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 6f4996c0f4df..bd9feed95c1e 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -300,6 +300,70 @@ void cfg80211_scan_done(struct cfg80211_scan_request *request, } EXPORT_SYMBOL(cfg80211_scan_done); +void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev, + struct cfg80211_sched_scan_request *req) +{ + ASSERT_RTNL(); + + list_add_rcu(&req->list, &rdev->sched_scan_req_list); +} + +static void cfg80211_del_sched_scan_req(struct cfg80211_registered_device *rdev, + struct cfg80211_sched_scan_request *req) +{ + ASSERT_RTNL(); + + list_del_rcu(&req->list); + kfree_rcu(req, rcu_head); +} + +static struct cfg80211_sched_scan_request * +cfg80211_find_sched_scan_req(struct cfg80211_registered_device *rdev, u64 reqid) +{ + struct cfg80211_sched_scan_request *pos; + + ASSERT_RTNL(); + + list_for_each_entry(pos, &rdev->sched_scan_req_list, list) { + if (pos->reqid == reqid) + return pos; + } + return ERR_PTR(-ENOENT); +} + +/* + * Determines if a scheduled scan request can be handled. When a legacy + * scheduled scan is running no other scheduled scan is allowed regardless + * whether the request is for legacy or multi-support scan. When a multi-support + * scheduled scan is running a request for legacy scan is not allowed. In this + * case a request for multi-support scan can be handled if resources are + * available, ie. struct wiphy::max_sched_scan_reqs limit is not yet reached. + */ +int cfg80211_sched_scan_req_possible(struct cfg80211_registered_device *rdev, + bool want_multi) +{ + struct cfg80211_sched_scan_request *pos; + int i = 0; + + list_for_each_entry(pos, &rdev->sched_scan_req_list, list) { + /* request id zero means legacy in progress */ + if (!i && !pos->reqid) + return -EINPROGRESS; + i++; + } + + if (i) { + /* no legacy allowed when multi request(s) are active */ + if (!want_multi) + return -EINPROGRESS; + + /* resource limit reached */ + if (i == rdev->wiphy.max_sched_scan_reqs) + return -ENOSPC; + } + return 0; +} + void __cfg80211_sched_scan_results(struct work_struct *wk) { struct cfg80211_registered_device *rdev; @@ -310,10 +374,10 @@ void __cfg80211_sched_scan_results(struct work_struct *wk) rtnl_lock(); - request = rtnl_dereference(rdev->sched_scan_req); + request = cfg80211_find_sched_scan_req(rdev, 0); /* we don't have sched_scan_req anymore if the scan is stopping */ - if (request) { + if (!IS_ERR(request)) { if (request->flags & NL80211_SCAN_FLAG_FLUSH) { /* flush entries from previous scans */ spin_lock_bh(&rdev->bss_lock); @@ -329,10 +393,17 @@ void __cfg80211_sched_scan_results(struct work_struct *wk) void cfg80211_sched_scan_results(struct wiphy *wiphy) { + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct cfg80211_sched_scan_request *request; + trace_cfg80211_sched_scan_results(wiphy); /* ignore if we're not scanning */ - if (rcu_access_pointer(wiphy_to_rdev(wiphy)->sched_scan_req)) + rtnl_lock(); + request = cfg80211_find_sched_scan_req(rdev, 0); + rtnl_unlock(); + + if (!IS_ERR(request)) queue_work(cfg80211_wq, &wiphy_to_rdev(wiphy)->sched_scan_results_wk); } @@ -346,7 +417,7 @@ void cfg80211_sched_scan_stopped_rtnl(struct wiphy *wiphy) trace_cfg80211_sched_scan_stopped(wiphy); - __cfg80211_stop_sched_scan(rdev, true); + __cfg80211_stop_sched_scan(rdev, 0, true); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped_rtnl); @@ -358,34 +429,40 @@ void cfg80211_sched_scan_stopped(struct wiphy *wiphy) } EXPORT_SYMBOL(cfg80211_sched_scan_stopped); -int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, - bool driver_initiated) +int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev, + struct cfg80211_sched_scan_request *req, + bool driver_initiated) { - struct cfg80211_sched_scan_request *sched_scan_req; - struct net_device *dev; - ASSERT_RTNL(); - if (!rdev->sched_scan_req) - return -ENOENT; - - sched_scan_req = rtnl_dereference(rdev->sched_scan_req); - dev = sched_scan_req->dev; - if (!driver_initiated) { - int err = rdev_sched_scan_stop(rdev, dev); + int err = rdev_sched_scan_stop(rdev, req->dev); if (err) return err; } - nl80211_send_sched_scan(sched_scan_req, NL80211_CMD_SCHED_SCAN_STOPPED); + nl80211_send_sched_scan(req, NL80211_CMD_SCHED_SCAN_STOPPED); - RCU_INIT_POINTER(rdev->sched_scan_req, NULL); - kfree_rcu(sched_scan_req, rcu_head); + cfg80211_del_sched_scan_req(rdev, req); return 0; } +int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, + u64 reqid, bool driver_initiated) +{ + struct cfg80211_sched_scan_request *sched_scan_req; + + ASSERT_RTNL(); + + sched_scan_req = cfg80211_find_sched_scan_req(rdev, reqid); + if (IS_ERR(sched_scan_req)) + return PTR_ERR(sched_scan_req); + + return cfg80211_stop_sched_scan_req(rdev, sched_scan_req, + driver_initiated); +} + void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs) { diff --git a/net/wireless/trace.h b/net/wireless/trace.h index fd55786f0462..52935c48b342 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -1610,20 +1610,26 @@ DEFINE_EVENT(tx_rx_evt, rdev_set_antenna, TP_ARGS(wiphy, rx, tx) ); -TRACE_EVENT(rdev_sched_scan_start, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct cfg80211_sched_scan_request *request), - TP_ARGS(wiphy, netdev, request), +DECLARE_EVENT_CLASS(wiphy_netdev_id_evt, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id), + TP_ARGS(wiphy, netdev, id), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY + __field(u64, id) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; + __entry->id = id; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", id: %llu", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->id) +); + +DEFINE_EVENT(wiphy_netdev_id_evt, rdev_sched_scan_start, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id), + TP_ARGS(wiphy, netdev, id) ); TRACE_EVENT(rdev_tdls_mgmt, -- cgit v1.3 From 3007e3529ce1efd9c370a7b81633e45f730ae35b Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Fri, 21 Apr 2017 13:05:01 +0100 Subject: nl80211: add support for BSSIDs in scheduled scan matchsets This patch allows for the scheduled scan request to specify matchsets for specific BSSIDs. Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel [docs, netlink policy fix] Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 6 +++++- include/uapi/linux/nl80211.h | 4 ++++ net/wireless/nl80211.c | 40 ++++++++++++++++++++++++++++++---------- 3 files changed, 39 insertions(+), 11 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 43c0f389c273..4058518e267a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1613,11 +1613,15 @@ static inline void get_random_mask_addr(u8 *buf, const u8 *addr, const u8 *mask) /** * struct cfg80211_match_set - sets of attributes to match * - * @ssid: SSID to be matched; may be zero-length for no match (RSSI only) + * @ssid: SSID to be matched; may be zero-length in case of BSSID match + * or no match (RSSI only) + * @bssid: BSSID to be matched; may be all-zero BSSID in case of SSID match + * or no match (RSSI only) * @rssi_thold: don't report scan results below this threshold (in s32 dBm) */ struct cfg80211_match_set { struct cfg80211_ssid ssid; + u8 bssid[ETH_ALEN]; s32 rssi_thold; }; diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index f34127d241e5..b8c44b98f12d 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -3194,6 +3194,7 @@ enum nl80211_reg_rule_attr { * @__NL80211_SCHED_SCAN_MATCH_ATTR_INVALID: attribute number 0 is reserved * @NL80211_SCHED_SCAN_MATCH_ATTR_SSID: SSID to be used for matching, * only report BSS with matching SSID. + * (This cannot be used together with BSSID.) * @NL80211_SCHED_SCAN_MATCH_ATTR_RSSI: RSSI threshold (in dBm) for reporting a * BSS in scan results. Filtering is turned off if not specified. Note that * if this attribute is in a match set of its own, then it is treated as @@ -3209,6 +3210,8 @@ enum nl80211_reg_rule_attr { * BSS-es in the specified band is to be adjusted before doing * RSSI-based BSS selection. The attribute value is a packed structure * value as specified by &struct nl80211_bss_select_rssi_adjust. + * @NL80211_SCHED_SCAN_MATCH_ATTR_BSSID: BSSID to be used for matching + * (this cannot be used together with SSID). * @NL80211_SCHED_SCAN_MATCH_ATTR_MAX: highest scheduled scan filter * attribute number currently defined * @__NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST: internal use @@ -3220,6 +3223,7 @@ enum nl80211_sched_scan_match_attr { NL80211_SCHED_SCAN_MATCH_ATTR_RSSI, NL80211_SCHED_SCAN_MATCH_ATTR_RELATIVE_RSSI, NL80211_SCHED_SCAN_MATCH_ATTR_RSSI_ADJUST, + NL80211_SCHED_SCAN_MATCH_ATTR_BSSID, /* keep last */ __NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ac7e2314f9ec..dce69a87d4d0 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -497,6 +497,7 @@ static const struct nla_policy nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = { [NL80211_SCHED_SCAN_MATCH_ATTR_SSID] = { .type = NLA_BINARY, .len = IEEE80211_MAX_SSID_LEN }, + [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = { .len = ETH_ALEN }, [NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 }, }; @@ -7036,8 +7037,15 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, NULL); if (err) return ERR_PTR(err); + + /* SSID and BSSID are mutually exclusive */ + if (tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID] && + tb[NL80211_SCHED_SCAN_MATCH_ATTR_BSSID]) + return ERR_PTR(-EINVAL); + /* add other standalone attributes here */ - if (tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID]) { + if (tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID] || + tb[NL80211_SCHED_SCAN_MATCH_ATTR_BSSID]) { n_match_sets++; continue; } @@ -7208,7 +7216,7 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, nla_for_each_nested(attr, attrs[NL80211_ATTR_SCHED_SCAN_MATCH], tmp) { - struct nlattr *ssid, *rssi; + struct nlattr *ssid, *bssid, *rssi; err = nla_parse_nested(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX, @@ -7217,7 +7225,8 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, if (err) goto out_free; ssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID]; - if (ssid) { + bssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_BSSID]; + if (ssid || bssid) { if (WARN_ON(i >= n_match_sets)) { /* this indicates a programming error, * the loop above should have verified @@ -7227,14 +7236,25 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, goto out_free; } - if (nla_len(ssid) > IEEE80211_MAX_SSID_LEN) { - err = -EINVAL; - goto out_free; + if (ssid) { + if (nla_len(ssid) > IEEE80211_MAX_SSID_LEN) { + err = -EINVAL; + goto out_free; + } + memcpy(request->match_sets[i].ssid.ssid, + nla_data(ssid), nla_len(ssid)); + request->match_sets[i].ssid.ssid_len = + nla_len(ssid); + } + if (bssid) { + if (nla_len(bssid) != ETH_ALEN) { + err = -EINVAL; + goto out_free; + } + memcpy(request->match_sets[i].bssid, + nla_data(bssid), ETH_ALEN); } - memcpy(request->match_sets[i].ssid.ssid, - nla_data(ssid), nla_len(ssid)); - request->match_sets[i].ssid.ssid_len = - nla_len(ssid); + /* special attribute - old implementation w/a */ request->match_sets[i].rssi_thold = default_match_rssi; -- cgit v1.3 From 99f906e9ad7b6e79ffeda30f45906a8448b9d6a2 Mon Sep 17 00:00:00 2001 From: Mike Manning Date: Wed, 26 Apr 2017 14:48:09 +0100 Subject: bridge: add per-port broadcast flood flag Support for l2 multicast flood control was added in commit b6cb5ac8331b ("net: bridge: add per-port multicast flood flag"). It allows broadcast as it was introduced specifically for unknown multicast flood control. But as broadcast is a special case of multicast, this may also need to be disabled. For this purpose, introduce a flag to disable the flooding of received l2 broadcasts. This approach is backwards compatible and provides flexibility in filtering for the desired packet types. Cc: Nikolay Aleksandrov Signed-off-by: Mike Manning Reviewed-by: Nikolay Aleksandrov Signed-off-by: David S. Miller --- include/linux/if_bridge.h | 1 + include/uapi/linux/if_link.h | 1 + net/bridge/br_forward.c | 24 +++++++++++++++++------- net/bridge/br_if.c | 2 +- net/bridge/br_netlink.c | 3 +++ net/bridge/br_sysfs_if.c | 2 ++ 6 files changed, 25 insertions(+), 8 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index c5847dc75a93..0c16866a7aac 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -48,6 +48,7 @@ struct br_ip_list { #define BR_MCAST_FLOOD BIT(11) #define BR_MULTICAST_TO_UNICAST BIT(12) #define BR_VLAN_TUNNEL BIT(13) +#define BR_BCAST_FLOOD BIT(14) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 633aa0276d32..8e56ac70e0d1 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -323,6 +323,7 @@ enum { IFLA_BRPORT_MCAST_FLOOD, IFLA_BRPORT_MCAST_TO_UCAST, IFLA_BRPORT_VLAN_TUNNEL, + IFLA_BRPORT_BCAST_FLOOD, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 902af6ba481c..48fb17417fac 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -183,13 +183,23 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb, struct net_bridge_port *p; list_for_each_entry_rcu(p, &br->port_list, list) { - /* Do not flood unicast traffic to ports that turn it off */ - if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD)) - continue; - /* Do not flood if mc off, except for traffic we originate */ - if (pkt_type == BR_PKT_MULTICAST && - !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) - continue; + /* Do not flood unicast traffic to ports that turn it off, nor + * other traffic if flood off, except for traffic we originate + */ + switch (pkt_type) { + case BR_PKT_UNICAST: + if (!(p->flags & BR_FLOOD)) + continue; + break; + case BR_PKT_MULTICAST: + if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev) + continue; + break; + case BR_PKT_BROADCAST: + if (!(p->flags & BR_BCAST_FLOOD) && skb->dev != br->dev) + continue; + break; + } /* Do not flood to ports that enable proxy ARP */ if (p->flags & BR_PROXYARP) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f3544d96155c..7f8d05cf9065 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -361,7 +361,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; p->port_no = index; - p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD; + p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD; br_init_port(p); br_set_state(p, BR_STATE_DISABLED); br_stp_port_timer_init(p); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 650986473577..a572db710d4e 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -189,6 +189,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, !!(p->flags & BR_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD, !!(p->flags & BR_MCAST_FLOOD)) || + nla_put_u8(skb, IFLA_BRPORT_BCAST_FLOOD, + !!(p->flags & BR_BCAST_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, !!(p->flags & BR_PROXYARP_WIFI)) || @@ -683,6 +685,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST); + br_set_port_flag(p, tb, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 79aee759aba5..5d5d413a6cf8 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -173,6 +173,7 @@ BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); +BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -221,6 +222,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_proxyarp, &brport_attr_proxyarp_wifi, &brport_attr_multicast_flood, + &brport_attr_broadcast_flood, NULL }; -- cgit v1.3 From 5d4e3443287b28a6e9c3436adbadba2497880bd1 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Wed, 26 Apr 2017 16:41:23 -0700 Subject: bpf: Fix inaccurate helper function description The description inside uapi/linux/bpf.h about bpf_get_socket_uid helper function is no longer valid. It returns overflowuid rather than 0 when failed. Signed-off-by: Chenbo Feng Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e553529929f6..945a1f5f63c5 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -481,8 +481,7 @@ union bpf_attr { * u32 bpf_get_socket_uid(skb) * Get the owner uid of the socket stored inside sk_buff. * @skb: pointer to skb - * Return: uid of the socket owner on success or 0 if the socket pointer - * inside sk_buff is NULL + * Return: uid of the socket owner on success or overflowuid if failed. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ -- cgit v1.3 From 9da3242e6a83b6f315aa9c394c939da8e4ad7774 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 2 May 2017 10:12:00 +0200 Subject: net: sched: add helpers to handle extended actions Jump is now the only one using value action opcode. This is going to change soon. So introduce helpers to work with this. Convert TC_ACT_JUMP. This also fixes the TC_ACT_JUMP check, which is incorrectly done as a bit check, not a value check. Fixes: e0ee84ded796 ("net sched actions: Complete the JUMPX opcode") Signed-off-by: Jiri Pirko Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 15 ++++++++++++++- net/sched/act_api.c | 2 +- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/uapi/linux') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index f1129e383b2a..d613be3b3239 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -37,7 +37,20 @@ enum { #define TC_ACT_QUEUED 5 #define TC_ACT_REPEAT 6 #define TC_ACT_REDIRECT 7 -#define TC_ACT_JUMP 0x10000000 + +/* There is a special kind of actions called "extended actions", + * which need a value parameter. These have a local opcode located in + * the highest nibble, starting from 1. The rest of the bits + * are used to carry the value. These two parts together make + * a combined opcode. + */ +#define __TC_ACT_EXT_SHIFT 28 +#define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT) +#define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1) +#define TC_ACT_EXT_CMP(combined, opcode) \ + (((combined) & (~TC_ACT_EXT_VAL_MASK)) == opcode) + +#define TC_ACT_JUMP __TC_ACT_EXT(1) /* Action type identifiers*/ enum { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 7f2cd702bb27..a90e8f355c00 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -453,7 +453,7 @@ repeat: if (ret == TC_ACT_REPEAT) goto repeat; /* we need a ttl - JHS */ - if (ret & TC_ACT_JUMP) { + if (TC_ACT_EXT_CMP(ret, TC_ACT_JUMP)) { jmp_prgcnt = ret & TCA_ACT_MAX_PRIO_MASK; if (!jmp_prgcnt || (jmp_prgcnt > nr_actions)) { /* faulty opcode, stop pipeline */ -- cgit v1.3