From bde000ae459f2829ed88e967f7fa7665b4e3afaf Mon Sep 17 00:00:00 2001
From: Miquel Raynal
Date: Thu, 19 May 2022 17:05:09 +0200
Subject: net: mac802154: Follow the count of ongoing transmissions

In order to create a synchronous API for MLME command purposes, we need
to be able to track the end of the ongoing transmissions. Let's
introduce an atomic variable which is incremented when a transmission
starts and decremented when relevant, so that we know at any moment
whether there is an ongoing transmission.

The counter gets decremented in the following situations:
- The operation is asynchronous and there was a failure during the
  offloading process.
- The operation is synchronous and the synchronous operation failed.
- The operation finished, either successfully or not.

Signed-off-by: Miquel Raynal
Acked-by: Alexander Aring
Link: https://lore.kernel.org/r/20220519150516.443078-5-miquel.raynal@bootlin.com
Signed-off-by: Stefan Schmidt
---
 include/net/cfg802154.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index d8d8719315fd..678ff00c7d70 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -214,6 +214,9 @@ struct wpan_phy {
        /* the network namespace this phy lives in currently */
        possible_net_t _net;

+       /* Transmission monitoring */
+       atomic_t ongoing_txs;
+
        char priv[] __aligned(NETDEV_ALIGN);
 };

-- cgit v1.2.3
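To illustrate the bookkeeping this patch introduces, here is a minimal
sketch of how a transmit path is expected to balance the counter
(loosely modeled on ieee802154_tx(); not part of the patch, error
handling trimmed):

        /* Sketch only: how ongoing_txs is meant to stay balanced. */
        static netdev_tx_t my_tx(struct ieee802154_local *local, struct sk_buff *skb)
        {
                atomic_inc(&local->phy->ongoing_txs);

                /* async offload failed: undo the accounting now (case 1 above) */
                if (drv_xmit_async(local, skb)) {
                        atomic_dec(&local->phy->ongoing_txs);
                        kfree_skb(skb);
                        return NETDEV_TX_OK;
                }

                /* otherwise ieee802154_xmit_complete()/_error() perform the
                 * matching decrement once the transmission is over (case 3)
                 */
                return NETDEV_TX_OK;
        }

The following patches wire exactly this pattern into the core transmit
helpers.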
From 20a19d1df3e4079cbaa045ec89bbefb831d4705d Mon Sep 17 00:00:00 2001
From: Miquel Raynal
Date: Thu, 19 May 2022 17:05:10 +0200
Subject: net: mac802154: Bring the ability to hold the transmit queue

Create a hold_txs atomic variable and increment/decrement it when
relevant, i.e. when we want to hold the queue or release it: currently
all the "stopped" situations are suitable, but very soon we will more
extensively use this feature for MLME purposes.

Upon release, the atomic counter is decremented and checked. If it is
back to 0, then the netif queue gets woken up. This makes the whole
process fully transparent, provided that all the users of
ieee802154_wake/stop_queue() now call ieee802154_hold/release_queue()
instead.

Individual drivers should never call any of these helpers manually, in
order to avoid messing with the counters. There are other functions
more suited for this purpose which have been introduced, such as the
_xmit_complete() and _xmit_error() helpers, which will handle all that
for them.

One advantage is that, as drivers no longer call the stop/wake helpers
directly, we can safely stop exporting them and only declare the
hold/release ones in a header only accessible to the core.

Signed-off-by: Miquel Raynal
Acked-by: Alexander Aring
Link: https://lore.kernel.org/r/20220519150516.443078-6-miquel.raynal@bootlin.com
Signed-off-by: Stefan Schmidt
---
 include/net/cfg802154.h      |  6 +++--
 include/net/mac802154.h      | 27 -----------------------
 net/ieee802154/core.c        |  2 ++
 net/mac802154/cfg.c          |  4 ++--
 net/mac802154/ieee802154_i.h | 19 ++++++++++++++++
 net/mac802154/tx.c           |  6 ++---
 net/mac802154/util.c         | 52 ++++++++++++++++++++++++++++++++++++++------
 7 files changed, 75 insertions(+), 41 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 678ff00c7d70..e87f1b07f20f 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -11,7 +11,7 @@
 #include
 #include
-#include
+#include
 #include
 #include
@@ -214,8 +214,10 @@ struct wpan_phy {
        /* the network namespace this phy lives in currently */
        possible_net_t _net;

-       /* Transmission monitoring */
+       /* Transmission monitoring and control */
+       spinlock_t queue_lock;
        atomic_t ongoing_txs;
+       atomic_t hold_txs;

        char priv[] __aligned(NETDEV_ALIGN);
 };
diff --git a/include/net/mac802154.h b/include/net/mac802154.h
index bdac0ddbdcdb..357d25ef627a 100644
--- a/include/net/mac802154.h
+++ b/include/net/mac802154.h
@@ -460,33 +460,6 @@ void ieee802154_unregister_hw(struct ieee802154_hw *hw);
  */
 void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb,
                           u8 lqi);

-/**
- * ieee802154_wake_queue - wake ieee802154 queue
- * @hw: pointer as obtained from ieee802154_alloc_hw().
- *
- * Tranceivers usually have either one transmit framebuffer or one framebuffer
- * for both transmitting and receiving. Hence, the core currently only handles
- * one frame at a time for each phy, which means we had to stop the queue to
- * avoid new skb to come during the transmission. The queue then needs to be
- * woken up after the operation.
- *
- * Drivers should use this function instead of netif_wake_queue.
- */
-void ieee802154_wake_queue(struct ieee802154_hw *hw);
-
-/**
- * ieee802154_stop_queue - stop ieee802154 queue
- * @hw: pointer as obtained from ieee802154_alloc_hw().
- *
- * Tranceivers usually have either one transmit framebuffer or one framebuffer
- * for both transmitting and receiving. Hence, the core currently only handles
- * one frame at a time for each phy, which means we need to tell upper layers to
- * stop giving us new skbs while we are busy with the transmitted one. The queue
- * must then be stopped before transmitting.
- *
- * Drivers should use this function instead of netif_stop_queue.
- */
-void ieee802154_stop_queue(struct ieee802154_hw *hw);

 /**
  * ieee802154_xmit_complete - frame transmission complete
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index de259b5170ab..47a4de6df88b 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -130,6 +130,8 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size)

        init_waitqueue_head(&rdev->dev_wait);

+       spin_lock_init(&rdev->wpan_phy.queue_lock);
+
        return &rdev->wpan_phy;
 }
 EXPORT_SYMBOL(wpan_phy_new);
diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
index 1e4a9f74ed43..b51100fd9e3f 100644
--- a/net/mac802154/cfg.c
+++ b/net/mac802154/cfg.c
@@ -46,7 +46,7 @@ static int ieee802154_suspend(struct wpan_phy *wpan_phy)
        if (!local->open_count)
                goto suspend;

-       ieee802154_stop_queue(&local->hw);
+       ieee802154_hold_queue(local);
        synchronize_net();

        /* stop hardware - this must stop RX */
@@ -72,7 +72,7 @@ static int ieee802154_resume(struct wpan_phy *wpan_phy)
                return ret;

 wake_up:
-       ieee802154_wake_queue(&local->hw);
+       ieee802154_release_queue(local);
        local->suspended = false;
        return 0;
 }
diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h
index a8b7b9049f14..0c7ff9e0b632 100644
--- a/net/mac802154/ieee802154_i.h
+++ b/net/mac802154/ieee802154_i.h
@@ -130,6 +130,25 @@ netdev_tx_t
 ieee802154_subif_start_xmit(struct sk_buff *skb,
                            struct net_device *dev);
 enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer);

+/**
+ * ieee802154_hold_queue - hold ieee802154 queue
+ * @local: main mac object
+ *
+ * Hold a queue by incrementing an atomic counter and requesting the netif
+ * queues to be stopped. The queues cannot be woken up while the counter has not
+ * been reset with as many ieee802154_release_queue() calls as needed.
+ */
+void ieee802154_hold_queue(struct ieee802154_local *local);
+
+/**
+ * ieee802154_release_queue - release ieee802154 queue
+ * @local: main mac object
+ *
+ * Release a queue which is held by decrementing an atomic counter and wake it
+ * up only if the counter reaches 0.
+ */
+void ieee802154_release_queue(struct ieee802154_local *local);
+
 /* MIB callbacks */
 void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan);

diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c
index 33f64ecd96c7..6a53c83cf039 100644
--- a/net/mac802154/tx.c
+++ b/net/mac802154/tx.c
@@ -43,7 +43,7 @@ void ieee802154_xmit_sync_worker(struct work_struct *work)

 err_tx:
        /* Restart the netif queue on each sub_if_data object. */
-       ieee802154_wake_queue(&local->hw);
+       ieee802154_release_queue(local);
        atomic_dec(&local->phy->ongoing_txs);
        kfree_skb(skb);
        netdev_dbg(dev, "transmission failed\n");
@@ -75,7 +75,7 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb)
        }

        /* Stop the netif queue on each sub_if_data object. */
-       ieee802154_stop_queue(&local->hw);
+       ieee802154_hold_queue(local);
        atomic_inc(&local->phy->ongoing_txs);

        /* Drivers should preferably implement the async callback. In some rare
@@ -99,7 +99,7 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb)
        return NETDEV_TX_OK;

 err_wake_netif_queue:
-       ieee802154_wake_queue(&local->hw);
+       ieee802154_release_queue(local);
        atomic_dec(&local->phy->ongoing_txs);
 err_free_skb:
        kfree_skb(skb);
diff --git a/net/mac802154/util.c b/net/mac802154/util.c
index 76dc663e2af4..0ed8b5bcbe8a 100644
--- a/net/mac802154/util.c
+++ b/net/mac802154/util.c
@@ -13,7 +13,17 @@
 /* privid for wpan_phys to determine whether they belong to us or not */
 const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid;

-void ieee802154_wake_queue(struct ieee802154_hw *hw)
+/**
+ * ieee802154_wake_queue - wake ieee802154 queue
+ * @local: main mac object
+ *
+ * Tranceivers usually have either one transmit framebuffer or one framebuffer
+ * for both transmitting and receiving. Hence, the core currently only handles
+ * one frame at a time for each phy, which means we had to stop the queue to
+ * avoid new skb to come during the transmission. The queue then needs to be
+ * woken up after the operation.
+ */
+static void ieee802154_wake_queue(struct ieee802154_hw *hw)
 {
        struct ieee802154_local *local = hw_to_local(hw);
        struct ieee802154_sub_if_data *sdata;
@@ -27,9 +37,18 @@ void ieee802154_wake_queue(struct ieee802154_hw *hw)
        }
        rcu_read_unlock();
 }
-EXPORT_SYMBOL(ieee802154_wake_queue);

-void ieee802154_stop_queue(struct ieee802154_hw *hw)
+/**
+ * ieee802154_stop_queue - stop ieee802154 queue
+ * @local: main mac object
+ *
+ * Tranceivers usually have either one transmit framebuffer or one framebuffer
+ * for both transmitting and receiving. Hence, the core currently only handles
+ * one frame at a time for each phy, which means we need to tell upper layers to
+ * stop giving us new skbs while we are busy with the transmitted one. The queue
+ * must then be stopped before transmitting.
+ */
+static void ieee802154_stop_queue(struct ieee802154_hw *hw)
 {
        struct ieee802154_local *local = hw_to_local(hw);
        struct ieee802154_sub_if_data *sdata;
@@ -43,14 +62,33 @@ void ieee802154_stop_queue(struct ieee802154_hw *hw)
        }
        rcu_read_unlock();
 }
-EXPORT_SYMBOL(ieee802154_stop_queue);
+
+void ieee802154_hold_queue(struct ieee802154_local *local)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&local->phy->queue_lock, flags);
+       if (!atomic_fetch_inc(&local->phy->hold_txs))
+               ieee802154_stop_queue(&local->hw);
+       spin_unlock_irqrestore(&local->phy->queue_lock, flags);
+}
+
+void ieee802154_release_queue(struct ieee802154_local *local)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&local->phy->queue_lock, flags);
+       if (atomic_dec_and_test(&local->phy->hold_txs))
+               ieee802154_wake_queue(&local->hw);
+       spin_unlock_irqrestore(&local->phy->queue_lock, flags);
+}

 enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer)
 {
        struct ieee802154_local *local =
                container_of(timer, struct ieee802154_local, ifs_timer);

-       ieee802154_wake_queue(&local->hw);
+       ieee802154_release_queue(local);

        return HRTIMER_NORESTART;
 }
@@ -84,7 +122,7 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
                              hw->phy->sifs_period * NSEC_PER_USEC,
                              HRTIMER_MODE_REL);
        } else {
-               ieee802154_wake_queue(hw);
+               ieee802154_release_queue(local);
        }

        dev_consume_skb_any(skb);
@@ -98,7 +136,7 @@ void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb,
        struct ieee802154_local *local = hw_to_local(hw);

        local->tx_result = reason;
-       ieee802154_wake_queue(hw);
+       ieee802154_release_queue(local);
        dev_kfree_skb_any(skb);
        atomic_dec(&hw->phy->ongoing_txs);
 }
-- cgit v1.2.3
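Taken together, hold/release behave like a counted gate in front of the
netif queues, so holds may nest; a minimal usage sketch (not from the
patch itself):

        ieee802154_hold_queue(local);    /* first hold: netif queues stop */
        ieee802154_hold_queue(local);    /* nested hold: only bumps hold_txs */

        ieee802154_release_queue(local); /* hold_txs: 2 -> 1, still held */
        ieee802154_release_queue(local); /* hold_txs: 1 -> 0, queues wake up */

The queue_lock spinlock serializes each counter transition with the
corresponding stop/wake operation.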
From f0feb34904735ffa21fe7b0c50f9f9527ec74b7a Mon Sep 17 00:00:00 2001
From: Miquel Raynal
Date: Thu, 19 May 2022 17:05:13 +0200
Subject: net: mac802154: Introduce a tx queue flushing mechanism

Right now we are able to stop a queue but we have no indication if a
transmission is ongoing or not.

Thanks to recent additions, we can track the number of ongoing
transmissions so we know if the last transmission is over. Adding on
top of it an internal wait queue also allows us to be woken up
asynchronously when this happens. If, beforehand, we marked the queue
to be held and stopped it, we end up flushing and stopping the tx
queue.

Thanks to this feature, we will soon be able to introduce a synchronous
transmit API.

Signed-off-by: Miquel Raynal
Acked-by: Alexander Aring
Link: https://lore.kernel.org/r/20220519150516.443078-9-miquel.raynal@bootlin.com
Signed-off-by: Stefan Schmidt
---
 include/net/cfg802154.h      |  1 +
 net/ieee802154/core.c        |  1 +
 net/mac802154/cfg.c          |  2 +-
 net/mac802154/ieee802154_i.h |  1 +
 net/mac802154/tx.c           | 26 ++++++++++++++++++++++++--
 net/mac802154/util.c         |  6 ++++--
 6 files changed, 32 insertions(+), 5 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index e87f1b07f20f..0804d79669a4 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -218,6 +218,7 @@ struct wpan_phy {
        spinlock_t queue_lock;
        atomic_t ongoing_txs;
        atomic_t hold_txs;
+       wait_queue_head_t sync_txq;

        char priv[] __aligned(NETDEV_ALIGN);
 };
diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c
index 47a4de6df88b..57546e07e06a 100644
--- a/net/ieee802154/core.c
+++ b/net/ieee802154/core.c
@@ -129,6 +129,7 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size)
        wpan_phy_net_set(&rdev->wpan_phy, &init_net);

        init_waitqueue_head(&rdev->dev_wait);
+       init_waitqueue_head(&rdev->wpan_phy.sync_txq);

        spin_lock_init(&rdev->wpan_phy.queue_lock);

diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c
index b51100fd9e3f..93df24f75572 100644
--- a/net/mac802154/cfg.c
+++ b/net/mac802154/cfg.c
@@ -46,7 +46,7 @@ static int ieee802154_suspend(struct wpan_phy *wpan_phy)
        if (!local->open_count)
                goto suspend;

-       ieee802154_hold_queue(local);
+       ieee802154_sync_and_hold_queue(local);
        synchronize_net();

        /* stop hardware - this must stop RX */
diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h
index e34db1d49ef4..a057827fc48a 100644
--- a/net/mac802154/ieee802154_i.h
+++ b/net/mac802154/ieee802154_i.h
@@ -124,6 +124,7 @@ extern struct ieee802154_mlme_ops mac802154_mlme_wpan;
 void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb);
 void ieee802154_xmit_sync_worker(struct work_struct *work);
+int ieee802154_sync_and_hold_queue(struct ieee802154_local *local);
 netdev_tx_t
 ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev);
 netdev_tx_t
diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c
index 607019b8f8ab..38f74b8b6740 100644
--- a/net/mac802154/tx.c
+++ b/net/mac802154/tx.c
@@ -44,7 +44,8 @@ void ieee802154_xmit_sync_worker(struct work_struct *work)

 err_tx:
        /* Restart the netif queue on each sub_if_data object. */
        ieee802154_release_queue(local);
-       atomic_dec(&local->phy->ongoing_txs);
+       if (atomic_dec_and_test(&local->phy->ongoing_txs))
+               wake_up(&local->phy->sync_txq);
        kfree_skb(skb);
        netdev_dbg(dev, "transmission failed\n");
 }
@@ -100,12 +101,33 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb)

 err_wake_netif_queue:
        ieee802154_release_queue(local);
-       atomic_dec(&local->phy->ongoing_txs);
+       if (atomic_dec_and_test(&local->phy->ongoing_txs))
+               wake_up(&local->phy->sync_txq);
 err_free_skb:
        kfree_skb(skb);
        return NETDEV_TX_OK;
 }

+static int ieee802154_sync_queue(struct ieee802154_local *local)
+{
+       int ret;
+
+       ieee802154_hold_queue(local);
+       ieee802154_disable_queue(local);
+       wait_event(local->phy->sync_txq, !atomic_read(&local->phy->ongoing_txs));
+       ret = local->tx_result;
+       ieee802154_release_queue(local);
+
+       return ret;
+}
+
+int ieee802154_sync_and_hold_queue(struct ieee802154_local *local)
+{
+       ieee802154_hold_queue(local);
+
+       return ieee802154_sync_queue(local);
+}
+
 static netdev_tx_t
 ieee802154_hot_tx(struct ieee802154_local *local, struct sk_buff *skb)
 {
diff --git a/net/mac802154/util.c b/net/mac802154/util.c
index 999534f64485..5e1fcc7b0123 100644
--- a/net/mac802154/util.c
+++ b/net/mac802154/util.c
@@ -140,7 +140,8 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb,
        }

        dev_consume_skb_any(skb);
-       atomic_dec(&hw->phy->ongoing_txs);
+       if (atomic_dec_and_test(&hw->phy->ongoing_txs))
+               wake_up(&hw->phy->sync_txq);
 }
 EXPORT_SYMBOL(ieee802154_xmit_complete);

@@ -152,7 +153,8 @@ void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb,
        local->tx_result = reason;
        ieee802154_release_queue(local);
        dev_kfree_skb_any(skb);
-       atomic_dec(&hw->phy->ongoing_txs);
+       if (atomic_dec_and_test(&hw->phy->ongoing_txs))
+               wake_up(&hw->phy->sync_txq);
 }
 EXPORT_SYMBOL(ieee802154_xmit_error);
-- cgit v1.2.3
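A sketch of what this flushing mechanism is meant to enable for the
upcoming synchronous MLME transmit API (the MLME helper named here is
hypothetical at this point in the series):

        /* Wait for ongoing transmissions to end; the queue stays held. */
        ret = ieee802154_sync_and_hold_queue(local);
        if (ret < 0)
                return ret;

        ret = my_mlme_operation(local);  /* hypothetical MLME step */

        /* Drop the extra hold taken above so traffic can flow again. */
        ieee802154_release_queue(local);

Note that ieee802154_sync_and_hold_queue() takes one more hold than it
releases, which is what keeps the queue stopped after the flush.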
From 2b13db13af50a5dcdb944723c828915a50f0c3b2 Mon Sep 17 00:00:00 2001
From: Miquel Raynal
Date: Thu, 19 May 2022 17:05:15 +0200
Subject: net: mac802154: Add a warning in the hot path

We should never start a transmission after the queue has been stopped.
But because it might work we don't kill the function here but rather
loudly warn the user that something is wrong.

Set a flag when the queue should remain stopped. Reset this flag when
the queue actually gets restarted. Just check this value to know if a
transmission is legitimate, and warn if it is not.

Turn the flags variable into an unsigned long to allow the use of
atomic helpers on it.

Signed-off-by: Miquel Raynal
Acked-by: Alexander Aring
Link: https://lore.kernel.org/r/20220519150516.443078-11-miquel.raynal@bootlin.com
Signed-off-by: Stefan Schmidt
---
 include/net/cfg802154.h |  5 ++++-
 net/mac802154/tx.c      | 16 +++++++++++++++-
 net/mac802154/util.c    |  1 +
 3 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'include/net')

diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h
index 0804d79669a4..428cece22205 100644
--- a/include/net/cfg802154.h
+++ b/include/net/cfg802154.h
@@ -166,11 +166,14 @@ wpan_phy_cca_cmp(const struct wpan_phy_cca *a, const struct wpan_phy_cca *b)
  *     level setting.
  * @WPAN_PHY_FLAG_CCA_MODE: Indicates that transceiver will support cca mode
  *     setting.
+ * @WPAN_PHY_FLAG_STATE_QUEUE_STOPPED: Indicates that the transmit queue was
+ *     temporarily stopped.
  */
 enum wpan_phy_flags {
        WPAN_PHY_FLAG_TXPOWER = BIT(1),
        WPAN_PHY_FLAG_CCA_ED_LEVEL = BIT(2),
        WPAN_PHY_FLAG_CCA_MODE = BIT(3),
+       WPAN_PHY_FLAG_STATE_QUEUE_STOPPED = BIT(4),
 };

 struct wpan_phy {
@@ -182,7 +185,7 @@ struct wpan_phy {
         */
        const void *privid;

-       u32 flags;
+       unsigned long flags;

        /*
         * This is a PIB according to 802.15.4-2011.
diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c
index 4827391600f6..6188f42276e7 100644
--- a/net/mac802154/tx.c
+++ b/net/mac802154/tx.c
@@ -123,9 +123,13 @@ static int ieee802154_sync_queue(struct ieee802154_local *local)

 int ieee802154_sync_and_hold_queue(struct ieee802154_local *local)
 {
+       int ret;
+
        ieee802154_hold_queue(local);
+       ret = ieee802154_sync_queue(local);
+       set_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags);

-       return ieee802154_sync_queue(local);
+       return ret;
 }

 int ieee802154_mlme_op_pre(struct ieee802154_local *local)
@@ -172,9 +176,19 @@ int ieee802154_mlme_tx_one(struct ieee802154_local *local, struct sk_buff *skb)
        return ret;
 }

+static bool ieee802154_queue_is_stopped(struct ieee802154_local *local)
+{
+       return test_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags);
+}
+
 static netdev_tx_t
 ieee802154_hot_tx(struct ieee802154_local *local, struct sk_buff *skb)
 {
+       /* Warn if the net interface tries to transmit frames while the
+        * ieee802154 core assumes the queue is stopped.
+        */
+       WARN_ON_ONCE(ieee802154_queue_is_stopped(local));
+
        return ieee802154_tx(local, skb);
 }

diff --git a/net/mac802154/util.c b/net/mac802154/util.c
index 5e1fcc7b0123..60eb7bd3bfc1 100644
--- a/net/mac802154/util.c
+++ b/net/mac802154/util.c
@@ -29,6 +29,7 @@ static void ieee802154_wake_queue(struct ieee802154_hw *hw)
        struct ieee802154_sub_if_data *sdata;

        rcu_read_lock();
+       clear_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags);
        list_for_each_entry_rcu(sdata, &local->interfaces, list) {
                if (!sdata->dev)
                        continue;
-- cgit v1.2.3
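A rough sketch of the intended flag lifecycle across the helpers
touched above (the MLME helpers are visible as context in the tx.c
hunks):

        ret = ieee802154_sync_and_hold_queue(local); /* sets ..._QUEUE_STOPPED */
        if (ret < 0)
                return ret;

        ret = ieee802154_mlme_tx_one(local, skb);    /* legitimate while stopped */

        ieee802154_release_queue(local);             /* wake clears ..._QUEUE_STOPPED */

Any frame the net core would push through ieee802154_hot_tx() in
between now trips the WARN_ON_ONCE() instead of silently racing with
the MLME transmission.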
From 1d9e4c91db17c9bf6f94ac234a4d4f2bffd52b97 Mon Sep 17 00:00:00 2001
From: Benjamin Berg
Date: Tue, 6 Sep 2022 19:02:04 +0200
Subject: wifi: mac80211: add pointer from link STA to STA

While often not needed, this considerably simplifies going from a link
to the STA. This helps in cases such as debugfs where a single pointer
should allow accessing a specific link and the STA.

Signed-off-by: Benjamin Berg
Signed-off-by: Johannes Berg
---
 include/net/mac80211.h  | 3 +++
 net/mac80211/sta_info.c | 1 +
 2 files changed, 4 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index ac2bad57933f..7778a92d9582 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -2176,6 +2176,7 @@ struct ieee80211_sta_aggregates {
  * All link specific info for a STA link for a non MLD STA(single)
  * or a MLD STA(multiple entries) are stored here.
  *
+ * @sta: reference to owning STA
  * @addr: MAC address of the Link STA. For non-MLO STA this is same as the addr
  *     in ieee80211_sta. For MLO Link STA this addr can be same or different
  *     from addr in ieee80211_sta (representing MLD STA addr)
@@ -2196,6 +2197,8 @@ struct ieee80211_sta_aggregates {
  *
  */
 struct ieee80211_link_sta {
+       struct ieee80211_sta *sta;
+
        u8 addr[ETH_ALEN];
        u8 link_id;
        enum ieee80211_smps_mode smps_mode;
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index cebfd148bb40..71b1488bd390 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -511,6 +511,7 @@ static void sta_info_add_link(struct sta_info *sta,
        link_info->sta = sta;
        link_info->link_id = link_id;
        link_info->pub = link_sta;
+       link_info->pub->sta = &sta->sta;
        link_sta->link_id = link_id;
        rcu_assign_pointer(sta->link[link_id], link_info);
        rcu_assign_pointer(sta->sta.link[link_id], link_sta);
-- cgit v1.2.3
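For example, driver code that is handed only an ieee80211_link_sta can
now reach the owning STA without any container_of() gymnastics
(sketch, not part of the patch):

        static void my_show_link(struct ieee80211_link_sta *link_sta)
        {
                struct ieee80211_sta *sta = link_sta->sta;

                pr_debug("link %d belongs to station %pM\n",
                         link_sta->link_id, sta->addr);
        }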
From d2caad527c191563116809990081ab4fc0dafdb6 Mon Sep 17 00:00:00 2001
From: Benjamin Berg
Date: Tue, 6 Sep 2022 14:26:52 +0200
Subject: wifi: mac80211: add API to show the link STAs in debugfs

Create debugfs data per-link. For drivers, there is a new operation
link_sta_add_debugfs which will always be called.

For non-MLO, the station directory will be used directly rather than
creating a corresponding subdirectory. As such, non-MLO drivers can
simply continue to create the data from sta_debugfs_add.

Signed-off-by: Benjamin Berg
[add missing inlines if !CONFIG_MAC80211_DEBUGFS]
Signed-off-by: Johannes Berg
---
 include/net/mac80211.h     |  11 ++++
 net/mac80211/debugfs_sta.c | 127 +++++++++++++++++++++++++++++++++++++--------
 net/mac80211/debugfs_sta.h |  12 +++++
 net/mac80211/driver-ops.c  |  27 +++++++++-
 net/mac80211/driver-ops.h  |  16 ++++++
 net/mac80211/sta_info.c    |  25 +++++++++
 net/mac80211/sta_info.h    |   5 ++
 7 files changed, 200 insertions(+), 23 deletions(-)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index 7778a92d9582..c413050ec8dd 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -3790,6 +3790,13 @@ struct ieee80211_prep_tx_info {
  *     should be within a CONFIG_MAC80211_DEBUGFS conditional. This
  *     callback can sleep.
  *
+ * @link_sta_add_debugfs: Drivers can use this callback to add debugfs files
+ *     when a link is added to a mac80211 station. This callback
+ *     should be within a CPTCFG_MAC80211_DEBUGFS conditional. This
+ *     callback can sleep.
+ *     For non-MLO the callback will be called once for the deflink with the
+ *     station's directory rather than a separate subdirectory.
+ *
  * @sta_notify: Notifies low level driver about power state transition of an
  *     associated station, AP, IBSS/WDS/mesh peer etc. For a VIF operating
For a VIF operating * in AP mode, this callback will not be called when the flag @@ -4260,6 +4267,10 @@ struct ieee80211_ops { struct ieee80211_vif *vif, struct ieee80211_sta *sta, struct dentry *dir); + void (*link_sta_add_debugfs)(struct ieee80211_hw *hw, + struct ieee80211_vif *vif, + struct ieee80211_link_sta *link_sta, + struct dentry *dir); #endif void (*sta_notify)(struct ieee80211_hw *hw, struct ieee80211_vif *vif, enum sta_notify_cmd, struct ieee80211_sta *sta); diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index d3397c1248d3..68c07d4b95a5 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -5,7 +5,7 @@ * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2021 Intel Corporation + * Copyright (C) 2018 - 2022 Intel Corporation */ #include @@ -435,8 +435,16 @@ static ssize_t sta_agg_status_write(struct file *file, const char __user *userbu } STA_OPS_RW(agg_status); -static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) +/* link sta attributes */ +#define LINK_STA_OPS(name) \ +static const struct file_operations link_sta_ ##name## _ops = { \ + .read = link_sta_##name##_read, \ + .open = simple_open, \ + .llseek = generic_file_llseek, \ +} + +static ssize_t link_sta_ht_capa_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) { #define PRINT_HT_CAP(_cond, _str) \ do { \ @@ -446,8 +454,8 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, char *buf, *p; int i; ssize_t bufsz = 512; - struct sta_info *sta = file->private_data; - struct ieee80211_sta_ht_cap *htc = &sta->sta.deflink.ht_cap; + struct link_sta_info *link_sta = file->private_data; + struct ieee80211_sta_ht_cap *htc = &link_sta->pub->ht_cap; ssize_t ret; buf = kzalloc(bufsz, GFP_KERNEL); @@ -524,14 +532,14 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, kfree(buf); return ret; } -STA_OPS(ht_capa); +LINK_STA_OPS(ht_capa); -static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) +static ssize_t link_sta_vht_capa_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) { char *buf, *p; - struct sta_info *sta = file->private_data; - struct ieee80211_sta_vht_cap *vhtc = &sta->sta.deflink.vht_cap; + struct link_sta_info *link_sta = file->private_data; + struct ieee80211_sta_vht_cap *vhtc = &link_sta->pub->vht_cap; ssize_t ret; ssize_t bufsz = 512; @@ -638,15 +646,15 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, kfree(buf); return ret; } -STA_OPS(vht_capa); +LINK_STA_OPS(vht_capa); -static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) +static ssize_t link_sta_he_capa_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) { char *buf, *p; size_t buf_sz = PAGE_SIZE; - struct sta_info *sta = file->private_data; - struct ieee80211_sta_he_cap *hec = &sta->sta.deflink.he_cap; + struct link_sta_info *link_sta = file->private_data; + struct ieee80211_sta_he_cap *hec = &link_sta->pub->he_cap; struct ieee80211_he_mcs_nss_supp *nss = &hec->he_mcs_nss_supp; u8 ppe_size; u8 *cap; @@ -1011,7 +1019,7 @@ out: kfree(buf); return ret; } -STA_OPS(he_capa); +LINK_STA_OPS(he_capa); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, \ @@ -1048,12 +1056,7 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) 
DEBUGFS_ADD(num_ps_buf_frames); DEBUGFS_ADD(last_seq_ctrl); DEBUGFS_ADD(agg_status); - DEBUGFS_ADD(ht_capa); - DEBUGFS_ADD(vht_capa); - DEBUGFS_ADD(he_capa); - - DEBUGFS_ADD_COUNTER(rx_duplicates, deflink.rx_stats.num_duplicates); - DEBUGFS_ADD_COUNTER(rx_fragments, deflink.rx_stats.fragments); + /* FIXME: Kept here as the statistics are only done on the deflink */ DEBUGFS_ADD_COUNTER(tx_filtered, deflink.status_stats.filtered); if (local->ops->wake_tx_queue) { @@ -1076,3 +1079,83 @@ void ieee80211_sta_debugfs_remove(struct sta_info *sta) debugfs_remove_recursive(sta->debugfs_dir); sta->debugfs_dir = NULL; } + +#undef DEBUGFS_ADD +#undef DEBUGFS_ADD_COUNTER + +#define DEBUGFS_ADD(name) \ + debugfs_create_file(#name, 0400, \ + link_sta->debugfs_dir, link_sta, &link_sta_ ##name## _ops) +#define DEBUGFS_ADD_COUNTER(name, field) \ + debugfs_create_ulong(#name, 0400, link_sta->debugfs_dir, &link_sta->field) + +void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta) +{ + if (WARN_ON(!link_sta->sta->debugfs_dir)) + return; + + /* For non-MLO, leave the files in the main directory. */ + if (link_sta->sta->sta.valid_links) { + char link_dir_name[10]; + + snprintf(link_dir_name, sizeof(link_dir_name), + "link-%d", link_sta->link_id); + + link_sta->debugfs_dir = + debugfs_create_dir(link_dir_name, + link_sta->sta->debugfs_dir); + } else { + if (WARN_ON(link_sta != &link_sta->sta->deflink)) + return; + + link_sta->debugfs_dir = link_sta->sta->debugfs_dir; + } + + DEBUGFS_ADD(ht_capa); + DEBUGFS_ADD(vht_capa); + DEBUGFS_ADD(he_capa); + + DEBUGFS_ADD_COUNTER(rx_duplicates, rx_stats.num_duplicates); + DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments); +} + +void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta) +{ + if (!link_sta->debugfs_dir || !link_sta->sta->debugfs_dir) { + link_sta->debugfs_dir = NULL; + return; + } + + if (link_sta->debugfs_dir == link_sta->sta->debugfs_dir) { + WARN_ON(link_sta != &link_sta->sta->deflink); + link_sta->sta->debugfs_dir = NULL; + return; + } + + debugfs_remove_recursive(link_sta->debugfs_dir); + link_sta->debugfs_dir = NULL; +} + +void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta) +{ + if (WARN_ON(!link_sta->debugfs_dir)) + return; + + drv_link_sta_add_debugfs(link_sta->sta->local, link_sta->sta->sdata, + link_sta->pub, link_sta->debugfs_dir); +} + +void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta) +{ + if (!link_sta->debugfs_dir) + return; + + if (WARN_ON(link_sta->debugfs_dir == link_sta->sta->debugfs_dir)) + return; + + /* Recreate the directory excluding the driver data */ + debugfs_remove_recursive(link_sta->debugfs_dir); + link_sta->debugfs_dir = NULL; + + ieee80211_link_sta_debugfs_add(link_sta); +} diff --git a/net/mac80211/debugfs_sta.h b/net/mac80211/debugfs_sta.h index d2e7c27ad6d1..cde8148bdb18 100644 --- a/net/mac80211/debugfs_sta.h +++ b/net/mac80211/debugfs_sta.h @@ -7,9 +7,21 @@ #ifdef CONFIG_MAC80211_DEBUGFS void ieee80211_sta_debugfs_add(struct sta_info *sta); void ieee80211_sta_debugfs_remove(struct sta_info *sta); + +void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta); +void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta); + +void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta); +void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta); #else static inline void ieee80211_sta_debugfs_add(struct sta_info *sta) {} static inline void ieee80211_sta_debugfs_remove(struct sta_info *sta) {} + +static inline 
void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta) {} +static inline void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta) {} + +static inline void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta) {} +static inline void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta) {} #endif #endif /* __MAC80211_DEBUGFS_STA_H */ diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index 5392ffa18270..d737db4e07e2 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -7,6 +7,7 @@ #include "ieee80211_i.h" #include "trace.h" #include "driver-ops.h" +#include "debugfs_sta.h" int drv_start(struct ieee80211_local *local) { @@ -497,6 +498,11 @@ int drv_change_sta_links(struct ieee80211_local *local, struct ieee80211_sta *sta, u16 old_links, u16 new_links) { + struct sta_info *info = container_of(sta, struct sta_info, sta); + struct link_sta_info *link_sta; + unsigned long links_to_add; + unsigned long links_to_rem; + unsigned int link_id; int ret = -EOPNOTSUPP; might_sleep(); @@ -510,11 +516,30 @@ int drv_change_sta_links(struct ieee80211_local *local, if (old_links == new_links) return 0; + links_to_add = ~old_links & new_links; + links_to_rem = old_links & ~new_links; + + for_each_set_bit(link_id, &links_to_rem, IEEE80211_MLD_MAX_NUM_LINKS) { + link_sta = rcu_dereference_protected(info->link[link_id], + lockdep_is_held(&local->sta_mtx)); + + ieee80211_link_sta_debugfs_drv_remove(link_sta); + } + trace_drv_change_sta_links(local, sdata, sta, old_links, new_links); if (local->ops->change_sta_links) ret = local->ops->change_sta_links(&local->hw, &sdata->vif, sta, old_links, new_links); trace_drv_return_int(local, ret); - return ret; + if (ret) + return ret; + + for_each_set_bit(link_id, &links_to_add, IEEE80211_MLD_MAX_NUM_LINKS) { + link_sta = rcu_dereference_protected(info->link[link_id], + lockdep_is_held(&local->sta_mtx)); + ieee80211_link_sta_debugfs_drv_add(link_sta); + } + + return 0; } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 81e40b0a3b16..809bad53e15b 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -480,6 +480,22 @@ static inline void drv_sta_add_debugfs(struct ieee80211_local *local, local->ops->sta_add_debugfs(&local->hw, &sdata->vif, sta, dir); } + +static inline void drv_link_sta_add_debugfs(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_sta *link_sta, + struct dentry *dir) +{ + might_sleep(); + + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return; + + if (local->ops->link_sta_add_debugfs) + local->ops->link_sta_add_debugfs(&local->hw, &sdata->vif, + link_sta, dir); +} #endif static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 71b1488bd390..e6beaea4075e 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -366,6 +366,9 @@ static void sta_remove_link(struct sta_info *sta, unsigned int link_id, if (unhash) link_sta_info_hash_del(sta->local, link_sta); + if (test_sta_flag(sta, WLAN_STA_INSERTED)) + ieee80211_link_sta_debugfs_remove(link_sta); + if (link_sta != &sta->deflink) alloc = container_of(link_sta, typeof(*alloc), info); @@ -875,6 +878,26 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) ieee80211_sta_debugfs_add(sta); rate_control_add_sta_debugfs(sta); + if (sta->sta.valid_links) { + int i; + + for (i = 0; i < 
ARRAY_SIZE(sta->link); i++) {
+                       struct link_sta_info *link_sta;
+
+                       link_sta = rcu_dereference_protected(sta->link[i],
+                                                            lockdep_is_held(&local->sta_mtx));
+
+                       if (!link_sta)
+                               continue;
+
+                       ieee80211_link_sta_debugfs_add(link_sta);
+                       if (sdata->vif.active_links & BIT(i))
+                               ieee80211_link_sta_debugfs_drv_add(link_sta);
+               }
+       } else {
+               ieee80211_link_sta_debugfs_add(&sta->deflink);
+               ieee80211_link_sta_debugfs_drv_add(&sta->deflink);
+       }

        sinfo->generation = local->sta_generation;
        cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL);
@@ -2824,6 +2847,8 @@ int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id)

        sta_info_add_link(sta, link_id, &alloc->info, &alloc->sta);

+       ieee80211_link_sta_debugfs_add(&alloc->info);
+
        return 0;
 }

diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 2517ea714dc4..6e672bf9c79d 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -513,6 +513,7 @@ struct ieee80211_fragment_cache {
  * @status_stats.avg_ack_signal: average ACK signal
  * @cur_max_bandwidth: maximum bandwidth to use for TX to the station,
  *     taken from HT/VHT capabilities or VHT operating mode notification
+ * @debugfs_dir: debug filesystem directory dentry
  * @pub: public (driver visible) link STA data
  * TODO Move other link params from sta_info as required for MLD operation
  */
@@ -560,6 +561,10 @@ struct link_sta_info {

        enum ieee80211_sta_rx_bandwidth cur_max_bandwidth;

+#ifdef CONFIG_MAC80211_DEBUGFS
+       struct dentry *debugfs_dir;
+#endif
+
        struct ieee80211_link_sta *pub;
 };

-- cgit v1.2.3
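A sketch of how a driver might hook the new operation (all my_* names
are hypothetical; as noted above, for non-MLO stations dir is simply
the station's own directory):

        #ifdef CONFIG_MAC80211_DEBUGFS
        static void my_link_sta_add_debugfs(struct ieee80211_hw *hw,
                                            struct ieee80211_vif *vif,
                                            struct ieee80211_link_sta *link_sta,
                                            struct dentry *dir)
        {
                /* one file per link; link_stats is a hypothetical driver array */
                debugfs_create_u32("my_tx_retries", 0400, dir,
                                   &my_hw(hw)->link_stats[link_sta->link_id].retries);
        }
        #endif

        static const struct ieee80211_ops my_ops = {
        #ifdef CONFIG_MAC80211_DEBUGFS
                .link_sta_add_debugfs = my_link_sta_add_debugfs,
        #endif
                /* other mandatory ops omitted */
        };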
From 53ad07e9823bca10c26e71d662b58c3e80e8ff2a Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Tue, 6 Sep 2022 11:27:57 +0200
Subject: wifi: cfg80211: support reporting failed links

For assoc and connect result APIs, support reporting failed links;
they should still come with the BSS pointer in the case of assoc, so
they're released correctly. In the case of connect result, this is
optional.

Signed-off-by: Johannes Berg
---
 include/net/cfg80211.h |  7 +++++++
 net/wireless/mlme.c    |  4 ++++
 net/wireless/nl80211.c |  5 ++++-
 net/wireless/sme.c     | 14 ++++++++++++++
 4 files changed, 29 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index e09ff87146c1..4d35a4234417 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -6933,6 +6933,8 @@ void cfg80211_auth_timeout(struct net_device *dev, const u8 *addr);
  * @ap_mld_addr: AP MLD address (in case of MLO)
  * @links: per-link information indexed by link ID, use links[0] for
  *     non-MLO connections
+ * @links.status: Set this (along with a BSS pointer) for links that
+ *     were rejected by the AP.
  */
 struct cfg80211_rx_assoc_resp {
        const u8 *buf;
@@ -6944,6 +6946,7 @@ struct cfg80211_rx_assoc_resp {
        struct {
                const u8 *addr;
                struct cfg80211_bss *bss;
+               u16 status;
        } links[IEEE80211_MLD_MAX_NUM_LINKS];
 };

@@ -7454,6 +7457,9 @@ struct cfg80211_fils_resp_params {
  *     if the bss is expired during the connection, esp. for those drivers
  *     implementing connect op. Only one parameter among @bssid and @bss needs
  *     to be specified.
+ * @links.status: per-link status code, to report a status code that's not
+ *     %WLAN_STATUS_SUCCESS for a given link, it must also be in the
+ *     @valid_links bitmap and may have a BSS pointer (which is then released)
  */
 struct cfg80211_connect_resp_params {
        int status;
@@ -7470,6 +7476,7 @@ struct cfg80211_connect_resp_params {
                const u8 *addr;
                const u8 *bssid;
                struct cfg80211_bss *bss;
+               u16 status;
        } links[IEEE80211_MLD_MAX_NUM_LINKS];
 };

diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index 581df7f4c524..58e1fb18f85a 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -42,6 +42,10 @@ void cfg80211_rx_assoc_resp(struct net_device *dev,
        unsigned int link_id;

        for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) {
+               cr.links[link_id].status = data->links[link_id].status;
+               WARN_ON_ONCE(cr.links[link_id].status != WLAN_STATUS_SUCCESS &&
+                            (!cr.ap_mld_addr || !cr.links[link_id].bss));
+
                cr.links[link_id].bss = data->links[link_id].bss;
                if (!cr.links[link_id].bss)
                        continue;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index 8ff8b1c040f0..ad7393cd3d18 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -17745,6 +17745,7 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
                                link_info_size += (cr->links[link].bssid ||
                                                   cr->links[link].bss) ?
                                                  nla_total_size(ETH_ALEN) : 0;
+                               link_info_size += nla_total_size(sizeof(u16));
                        }
                }

@@ -17813,7 +17814,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev,
                             nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, bssid)) ||
                            (cr->links[link].addr &&
                             nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN,
-                                    cr->links[link].addr)))
+                                    cr->links[link].addr)) ||
+                           nla_put_u16(msg, NL80211_ATTR_STATUS_CODE,
+                                       cr->links[link].status))
                                goto nla_put_failure;

                        nla_nest_end(msg, nested_mlo_links);
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index d513536617bd..f94497e9db43 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -793,6 +793,10 @@ void __cfg80211_connect_result(struct net_device *dev,
        }

        for_each_valid_link(cr, link) {
+               /* don't do extra lookups for failures */
+               if (cr->links[link].status != WLAN_STATUS_SUCCESS)
+                       continue;
+
                if (cr->links[link].bss)
                        continue;

@@ -829,6 +833,16 @@ void __cfg80211_connect_result(struct net_device *dev,
        }

        memset(wdev->links, 0, sizeof(wdev->links));
+       for_each_valid_link(cr, link) {
+               if (cr->links[link].status == WLAN_STATUS_SUCCESS)
+                       continue;
+               cr->valid_links &= ~BIT(link);
+               /* don't require bss pointer for failed links */
+               if (!cr->links[link].bss)
+                       continue;
+               cfg80211_unhold_bss(bss_from_pub(cr->links[link].bss));
+               cfg80211_put_bss(wdev->wiphy, cr->links[link].bss);
+       }
        wdev->valid_links = cr->valid_links;
        for_each_valid_link(cr, link)
                wdev->links[link].client.current_bss =
-- cgit v1.2.3
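As an illustration, an MLO driver reporting an association where the
AP accepted link 0 but rejected link 1 could fill the new field
roughly like this (sketch, values illustrative; both links carry a BSS
pointer so cfg80211 can release the rejected one):

        struct cfg80211_rx_assoc_resp resp = {
                /* buf/len/ap_mld_addr etc. filled as before */
        };

        resp.links[0].bss = bss0;
        resp.links[0].status = WLAN_STATUS_SUCCESS;

        resp.links[1].bss = bss1;
        resp.links[1].status = WLAN_STATUS_UNSPECIFIED_FAILURE;

        cfg80211_rx_assoc_resp(ndev, &resp);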
From 1e0f8cc96b7162075d2e3b6bef856497884a3ae8 Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Tue, 6 Sep 2022 22:37:03 +0200
Subject: wifi: nl80211: use link ID in NL80211_CMD_SET_BSS

We clearly need the link ID here, to know the right BSS to configure.
Use/require it.

Signed-off-by: Johannes Berg
---
 include/net/cfg80211.h | 2 ++
 net/wireless/nl80211.c | 4 +++-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'include/net')

diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h
index 4d35a4234417..659dd1bee70f 100644
--- a/include/net/cfg80211.h
+++ b/include/net/cfg80211.h
@@ -2105,6 +2105,7 @@ struct mpath_info {
  *
  * Used to change BSS parameters (mainly for AP mode).
  *
+ * @link_id: link_id or -1 for non-MLD
  * @use_cts_prot: Whether to use CTS protection
  *     (0 = no, 1 = yes, -1 = do not change)
  * @use_short_preamble: Whether the use of short preambles is allowed
@@ -2122,6 +2123,7 @@ struct mpath_info {
  * @p2p_opp_ps: P2P opportunistic PS (-1 = no change)
  */
 struct bss_parameters {
+       int link_id;
        int use_cts_prot;
        int use_short_preamble;
        int use_short_slot_time;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index ad7393cd3d18..1d0277758d0e 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -7780,6 +7780,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info)
        int err;

        memset(&params, 0, sizeof(params));
+       params.link_id = nl80211_link_id_or_invalid(info->attrs);
        /* default to not changing parameters */
        params.use_cts_prot = -1;
        params.use_short_preamble = -1;
@@ -16564,7 +16565,8 @@ static const struct genl_small_ops nl80211_small_ops[] = {
                .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
                .doit = nl80211_set_bss,
                .flags = GENL_UNS_ADMIN_PERM,
-               .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP),
+               .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP |
+                                        NL80211_FLAG_MLO_VALID_LINK_ID),
        },
        {
                .cmd = NL80211_CMD_GET_REG,
-- cgit v1.2.3

From f3630c4f82ae43682bf84e6ddcbd7e97285d4699 Mon Sep 17 00:00:00 2001
From: Johannes Berg
Date: Tue, 27 Sep 2022 11:39:23 +0200
Subject: wifi: mac80211: add RCU _check() link access variants

We might sometimes need to use RCU and locking in the same code path,
so add the two variants link_conf_dereference_check() and
link_sta_dereference_check().

Signed-off-by: Johannes Berg
---
 include/net/mac80211.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include/net')

diff --git a/include/net/mac80211.h b/include/net/mac80211.h
index c413050ec8dd..cda4584dfd51 100644
--- a/include/net/mac80211.h
+++ b/include/net/mac80211.h
@@ -1915,6 +1915,10 @@ static inline bool lockdep_vif_mutex_held(struct ieee80211_vif *vif)
        rcu_dereference_protected((vif)->link_conf[link_id],    \
                                  lockdep_vif_mutex_held(vif))

+#define link_conf_dereference_check(vif, link_id)              \
+       rcu_dereference_check((vif)->link_conf[link_id],        \
+                             lockdep_vif_mutex_held(vif))
+
 /**
  * enum ieee80211_key_flags - key flags
  *
@@ -2311,6 +2315,10 @@ static inline bool lockdep_sta_mutex_held(struct ieee80211_sta *pubsta)
        rcu_dereference_protected((sta)->link[link_id],         \
                                  lockdep_sta_mutex_held(sta))

+#define link_sta_dereference_check(sta, link_id)               \
+       rcu_dereference_check((sta)->link[link_id],             \
+                             lockdep_sta_mutex_held(sta))
+
 #define for_each_sta_active_link(vif, sta, link_sta, link_id)                 \
        for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++)        \
                if ((!(vif)->active_links ||                                   \
-- cgit v1.2.3

From 0ff57171d6d225558c81a69439d5323e35b40549 Mon Sep 17 00:00:00 2001
From: Vinayak Yadawad
Date: Wed, 7 Sep 2022 18:14:48 +0530
Subject: cfg80211: Update Transition Disable policy during port authorization

In case of 4way handshake offload, the transition disable policy
updated by the AP during EAPOL 3/4 is not passed to the upper layer.
This results in a mismatch of the transition disable policy between
the upper layer and the driver. This patch addresses this issue by
updating the transition disable policy as part of the port
authorization indication.
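With this change, a driver doing 4-way handshake offload can forward
the transition-disable information it extracted from EAPOL frame 3/4,
e.g. (sketch):

        /* td_bitmap as parsed from the Transition Disable KDE */
        cfg80211_port_authorized(ndev, bssid, td_bitmap, td_bitmap_len,
                                 GFP_KERNEL);

Drivers with nothing to report keep passing NULL and 0, as the
brcmfmac hunk below shows.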
Signed-off-by: Vinayak Yadawad Signed-off-by: Johannes Berg --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c | 2 +- include/net/cfg80211.h | 4 +++- include/uapi/linux/nl80211.h | 3 +++ net/wireless/core.h | 5 ++++- net/wireless/nl80211.c | 8 +++++++- net/wireless/nl80211.h | 3 ++- net/wireless/sme.c | 12 ++++++++---- net/wireless/util.c | 4 +++- 8 files changed, 31 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index bf184c0e64cb..3f2336062217 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -6268,7 +6268,7 @@ done: brcmf_dbg(CONN, "Report roaming result\n"); if (profile->use_fwsup == BRCMF_PROFILE_FWSUP_1X && profile->is_ft) { - cfg80211_port_authorized(ndev, profile->bssid, GFP_KERNEL); + cfg80211_port_authorized(ndev, profile->bssid, NULL, 0, GFP_KERNEL); brcmf_dbg(CONN, "Report port authorized\n"); } diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 659dd1bee70f..11a370e64143 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -7683,6 +7683,8 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, * * @dev: network device * @bssid: the BSSID of the AP + * @td_bitmap: transition disable policy + * @td_bitmap_len: Length of transition disable policy * @gfp: allocation flags * * This function should be called by a driver that supports 4 way handshake @@ -7693,7 +7695,7 @@ void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, * indicate the 802.11 association. */ void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, - gfp_t gfp); + const u8* td_bitmap, u8 td_bitmap_len, gfp_t gfp); /** * cfg80211_disconnected - notify cfg80211 that connection was dropped diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index c32e7616a366..c14a91bbca7c 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -2749,6 +2749,8 @@ enum nl80211_commands { * When used with %NL80211_CMD_FRAME_TX_STATUS, indicates the ack RX * timestamp. When used with %NL80211_CMD_FRAME RX notification, indicates * the incoming frame RX timestamp. + * @NL80211_ATTR_TD_BITMAP: Transition Disable bitmap, for subsequent + * (re)associations. 
* @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -3276,6 +3278,7 @@ enum nl80211_attrs { NL80211_ATTR_TX_HW_TIMESTAMP, NL80211_ATTR_RX_HW_TIMESTAMP, + NL80211_ATTR_TD_BITMAP, /* add attributes here, update the policy in nl80211.c */ diff --git a/net/wireless/core.h b/net/wireless/core.h index 775e16cb99ed..af85d8909935 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -271,6 +271,8 @@ struct cfg80211_event { } ij; struct { u8 bssid[ETH_ALEN]; + const u8 *td_bitmap; + u8 td_bitmap_len; } pa; }; }; @@ -409,7 +411,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, bool wextev); void __cfg80211_roamed(struct wireless_dev *wdev, struct cfg80211_roam_info *info); -void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid); +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len); int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_autodisconnect_wk(struct work_struct *work); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1d0277758d0e..fe368af39554 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -17942,7 +17942,8 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, } void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid) + struct net_device *netdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len) { struct sk_buff *msg; void *hdr; @@ -17962,6 +17963,11 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) goto nla_put_failure; + if ((td_bitmap_len > 0) && td_bitmap) + if (nla_put(msg, NL80211_ATTR_TD_BITMAP, + td_bitmap_len, td_bitmap)) + goto nla_put_failure; + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 855d540ddfb9..ba9457e94c43 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -83,7 +83,8 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_roam_info *info, gfp_t gfp); void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid); + struct net_device *netdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len); void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct net_device *netdev, u16 reason, const u8 *ie, size_t ie_len, bool from_ap); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index f94497e9db43..4b5b6ee0fe01 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -1251,7 +1251,8 @@ out: } EXPORT_SYMBOL(cfg80211_roamed); -void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len) { ASSERT_WDEV_LOCK(wdev); @@ -1264,11 +1265,11 @@ void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) return; nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev, - bssid); + bssid, td_bitmap, td_bitmap_len); } void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, - gfp_t gfp) + const u8 *td_bitmap, u8 td_bitmap_len, gfp_t gfp) { struct 
wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -1278,12 +1279,15 @@ void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, if (WARN_ON(!bssid)) return; - ev = kzalloc(sizeof(*ev), gfp); + ev = kzalloc(sizeof(*ev) + td_bitmap_len, gfp); if (!ev) return; ev->type = EVENT_PORT_AUTHORIZED; memcpy(ev->pa.bssid, bssid, ETH_ALEN); + ev->pa.td_bitmap = ((u8 *)ev) + sizeof(*ev); + ev->pa.td_bitmap_len = td_bitmap_len; + memcpy((void *)ev->pa.td_bitmap, td_bitmap, td_bitmap_len); /* * Use the wdev event list so that if there are pending diff --git a/net/wireless/util.c b/net/wireless/util.c index 01493568a21d..f09d528e5199 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -988,7 +988,9 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) __cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; case EVENT_PORT_AUTHORIZED: - __cfg80211_port_authorized(wdev, ev->pa.bssid); + __cfg80211_port_authorized(wdev, ev->pa.bssid, + ev->pa.td_bitmap, + ev->pa.td_bitmap_len); break; } wdev_unlock(wdev); -- cgit v1.2.3 From c850e31f79f049af5022f07cd9961605b4470d0b Mon Sep 17 00:00:00 2001 From: Alexander Wetzel Date: Sun, 9 Oct 2022 18:30:38 +0200 Subject: wifi: mac80211: add internal handler for wake_tx_queue Start to align the TX handling to only use internal TX queues (iTXQs): Provide a handler for drivers not having a custom wake_tx_queue callback and update the documentation. Signed-off-by: Alexander Wetzel Signed-off-by: Johannes Berg --- include/net/mac80211.h | 51 +++++++++++++++++++++++++++++++------------------- net/mac80211/util.c | 46 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 19 deletions(-) (limited to 'include/net') diff --git a/include/net/mac80211.h b/include/net/mac80211.h index cda4584dfd51..721c450a9ccd 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -89,15 +89,13 @@ /** * DOC: mac80211 software tx queueing * - * mac80211 provides an optional intermediate queueing implementation designed - * to allow the driver to keep hardware queues short and provide some fairness - * between different stations/interfaces. - * In this model, the driver pulls data frames from the mac80211 queue instead - * of letting mac80211 push them via drv_tx(). - * Other frames (e.g. control or management) are still pushed using drv_tx(). + * mac80211 uses an intermediate queueing implementation, designed to allow the + * driver to keep hardware queues short and to provide some fairness between + * different stations/interfaces. * - * Drivers indicate that they use this model by implementing the .wake_tx_queue - * driver operation. + * Drivers must provide the .wake_tx_queue driver operation by either + * linking it to ieee80211_handle_wake_tx_queue() or implementing a custom + * handler. * * Intermediate queues (struct ieee80211_txq) are kept per-sta per-tid, with * another per-sta for non-data/non-mgmt and bufferable management frames, and @@ -106,9 +104,12 @@ * The driver is expected to initialize its private per-queue data for stations * and interfaces in the .add_interface and .sta_add ops. * - * The driver can't access the queue directly. To dequeue a frame from a - * txq, it calls ieee80211_tx_dequeue(). Whenever mac80211 adds a new frame to a - * queue, it calls the .wake_tx_queue driver op. + * The driver can't access the internal TX queues (iTXQs) directly. + * Whenever mac80211 adds a new frame to a queue, it calls the .wake_tx_queue + * driver op. 
+ * Drivers implementing a custom .wake_tx_queue op can get them by calling + * ieee80211_tx_dequeue(). Drivers using ieee80211_handle_wake_tx_queue() will + * simply get the individual frames pushed via the .tx driver operation. * * Drivers can optionally delegate responsibility for scheduling queues to * mac80211, to take advantage of airtime fairness accounting. In this case, to @@ -1826,7 +1827,7 @@ struct ieee80211_vif_cfg { * for this interface. * @drv_priv: data area for driver use, will always be aligned to * sizeof(void \*). - * @txq: the multicast data TX queue (if driver uses the TXQ abstraction) + * @txq: the multicast data TX queue * @txqs_stopped: per AC flag to indicate that intermediate TXQs are stopped, * protected by fq->lock. * @offload_flags: 802.3 -> 802.11 enapsulation offload flags, see @@ -2259,8 +2260,8 @@ struct ieee80211_link_sta { * For non MLO STA it will point to the deflink data. For MLO STA * ieee80211_sta_recalc_aggregates() must be called to update it. * @support_p2p_ps: indicates whether the STA supports P2P PS mechanism or not. - * @txq: per-TID data TX queues (if driver uses the TXQ abstraction); note that - * the last entry (%IEEE80211_NUM_TIDS) is used for non-data frames + * @txq: per-TID data TX queues; note that the last entry (%IEEE80211_NUM_TIDS) + * is used for non-data frames * @deflink: This holds the default link STA information, for non MLO STA all link * specific STA information is accessed through @deflink or through * link[0] which points to address of @deflink. For MLO Link STA @@ -5713,7 +5714,7 @@ void ieee80211_key_replay(struct ieee80211_key_conf *keyconf); * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * - * Drivers should use this function instead of netif_wake_queue. + * Drivers must use this function instead of netif_wake_queue. */ void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue); @@ -5722,7 +5723,7 @@ void ieee80211_wake_queue(struct ieee80211_hw *hw, int queue); * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * - * Drivers should use this function instead of netif_stop_queue. + * Drivers must use this function instead of netif_stop_queue. */ void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue); @@ -5731,7 +5732,7 @@ void ieee80211_stop_queue(struct ieee80211_hw *hw, int queue); * @hw: pointer as obtained from ieee80211_alloc_hw(). * @queue: queue number (counted from zero). * - * Drivers should use this function instead of netif_stop_queue. + * Drivers must use this function instead of netif_queue_stopped. * * Return: %true if the queue is stopped. %false otherwise. */ @@ -5742,7 +5743,7 @@ int ieee80211_queue_stopped(struct ieee80211_hw *hw, int queue); * ieee80211_stop_queues - stop all queues * @hw: pointer as obtained from ieee80211_alloc_hw(). * - * Drivers should use this function instead of netif_stop_queue. + * Drivers must use this function instead of netif_tx_stop_all_queues. */ void ieee80211_stop_queues(struct ieee80211_hw *hw); @@ -5750,7 +5751,7 @@ void ieee80211_stop_queues(struct ieee80211_hw *hw); * ieee80211_wake_queues - wake all queues * @hw: pointer as obtained from ieee80211_alloc_hw(). * - * Drivers should use this function instead of netif_wake_queue. + * Drivers must use this function instead of netif_tx_wake_all_queues. 
*/ void ieee80211_wake_queues(struct ieee80211_hw *hw); @@ -6971,6 +6972,18 @@ static inline struct sk_buff *ieee80211_tx_dequeue_ni(struct ieee80211_hw *hw, return skb; } +/** + * ieee80211_handle_wake_tx_queue - mac80211 handler for wake_tx_queue callback + * + * @hw: pointer as obtained from the wake_tx_queue() callback. + * @txq: pointer as obtained from the wake_tx_queue() callback. + * + * Drivers can use this function for the mandatory mac80211 wake_tx_queue + * callback in struct ieee80211_ops. They should not call this function + * directly. + */ +void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, + struct ieee80211_txq *txq); + /** * ieee80211_next_txq - get next tx queue to pull packets from * diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 40b75fa82b15..a4bf86f17c39 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -288,6 +288,52 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_ctstoself_duration); +static void wake_tx_push_queue(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_txq *queue) +{ + int q = sdata->vif.hw_queue[queue->ac]; + struct ieee80211_tx_control control = { + .sta = queue->sta, + }; + struct sk_buff *skb; + unsigned long flags; + bool q_stopped; + + while (1) { + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + q_stopped = local->queue_stop_reasons[q]; + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + + if (q_stopped) + break; + + skb = ieee80211_tx_dequeue(&local->hw, queue); + if (!skb) + break; + + drv_tx(local, &control, skb); + } +} + +/* wake_tx_queue handler for drivers not implementing a custom one */ +void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, + struct ieee80211_txq *txq) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif); + struct ieee80211_txq *queue; + + /* Use ieee80211_next_txq() for airtime fairness accounting */ + ieee80211_txq_schedule_start(hw, txq->ac); + while ((queue = ieee80211_next_txq(hw, txq->ac))) { + wake_tx_push_queue(local, sdata, queue); + ieee80211_return_txq(hw, queue, false); + } + ieee80211_txq_schedule_end(hw, txq->ac); +} +EXPORT_SYMBOL(ieee80211_handle_wake_tx_queue); + static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) { struct ieee80211_local *local = sdata->local; -- cgit v1.2.3 From ac8037c35bd1fb1799d39f52205f813e6585b58b Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Fri, 7 Oct 2022 10:53:05 +0200 Subject: mac802154: set filter at drv_start() The current filtering level is set on the first interface up on a wpan phy. If we support scan functionality we need to be able to change the filtering level on the fly on an operational phy and switch back again afterwards. This patch moves the receive mode parameters, e.g. the address filter and promiscuous mode, to the drv_start() functionality to allow changing the receive mode on an operational phy, not only on the first ifup. In the future this should be handled at the driver layer because each piece of hardware has its own way to enter a specific filtering level. For now, however, this makes it possible to switch to IEEE802154_FILTERING_NONE and back to IEEE802154_FILTERING_4_FRAME_FIELDS. Only IEEE802154_FILTERING_4_FRAME_FIELDS and IEEE802154_FILTERING_NONE are somewhat supported by current hardware.
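For reference, here is a sketch of the filtering levels this series juggles. The constant names mirror those used in the diff below; the authoritative definition and kernel-doc live in the ieee802154 headers, so treat the short comments as a summary rather than the canonical documentation:

enum ieee802154_filtering_level {
	IEEE802154_FILTERING_NONE,		/* deliver everything that was received */
	IEEE802154_FILTERING_1_FCS,		/* drop frames with an invalid checksum (FCS) */
	IEEE802154_FILTERING_2_PROMISCUOUS,	/* promiscuous mode as described in the spec */
	IEEE802154_FILTERING_3_SCAN,		/* scan mode, only beacons are of interest */
	IEEE802154_FILTERING_4_FRAME_FIELDS,	/* full validation of the frame fields */
};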
All other filtering levels can be supported in the future but will end in IEEE802154_FILTERING_NONE, as the receive part can kind of "emulate" those receive paths by running additional filtering routines in software. There are in total three filtering levels in the code: - the per-interface default level (should not be changed) - the required per-interface level (mac commands may play with it) - the actual per-PHY (hw) level that is currently in use Signed-off-by: Alexander Aring Link: https://lore.kernel.org/r/20221007085310.503366-4-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 7 +++-- net/mac802154/cfg.c | 2 +- net/mac802154/driver-ops.h | 71 +++++++++++++++++++++++++++++++++++++++++++- net/mac802154/ieee802154_i.h | 12 ++++++++ net/mac802154/iface.c | 44 ++++++++++----------------- net/mac802154/rx.c | 12 +++++++- 6 files changed, 115 insertions(+), 33 deletions(-) (limited to 'include/net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index 428cece22205..e1481f9cf049 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -223,6 +223,11 @@ struct wpan_phy { atomic_t hold_txs; wait_queue_head_t sync_txq; + /* Current filtering level on reception. * Only allowed to be changed if phy is not operational. */ + enum ieee802154_filtering_level filtering; + char priv[] __aligned(NETDEV_ALIGN); }; @@ -374,8 +379,6 @@ struct wpan_dev { bool lbt; - bool promiscuous_mode; - /* fallback for acknowledgment bit setting */ bool ackreq; }; diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index 93df24f75572..dc2d918fac68 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -67,7 +67,7 @@ static int ieee802154_resume(struct wpan_phy *wpan_phy) goto wake_up; /* restart hardware */ - ret = drv_start(local); + ret = drv_start(local, local->phy->filtering, &local->addr_filt); if (ret) return ret; diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h index c9d54088a567..a7af3f0ddb3e 100644 --- a/net/mac802154/driver-ops.h +++ b/net/mac802154/driver-ops.h @@ -129,12 +129,81 @@ drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) return ret; } -static inline int drv_start(struct ieee802154_local *local) +static inline int drv_start(struct ieee802154_local *local, + enum ieee802154_filtering_level level, + const struct ieee802154_hw_addr_filt *addr_filt) { int ret; might_sleep(); + /* setup receive mode parameters e.g. address mode */ + if (local->hw.flags & IEEE802154_HW_AFILT) { + ret = drv_set_pan_id(local, addr_filt->pan_id); + if (ret < 0) + return ret; + + ret = drv_set_short_addr(local, addr_filt->short_addr); + if (ret < 0) + return ret; + + ret = drv_set_extended_addr(local, addr_filt->ieee_addr); + if (ret < 0) + return ret; + } + + switch (level) { + case IEEE802154_FILTERING_NONE: + fallthrough; + case IEEE802154_FILTERING_1_FCS: + fallthrough; + case IEEE802154_FILTERING_2_PROMISCUOUS: + /* TODO: Requires a different receive mode setup e.g. + * at86rf233 hardware. + */ + fallthrough; + case IEEE802154_FILTERING_3_SCAN: + if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { + ret = drv_set_promiscuous_mode(local, true); + if (ret < 0) + return ret; + } else { + return -EOPNOTSUPP; + } + + /* In practice other filtering levels can be requested, but as + * for now most hardware/drivers only support + * IEEE802154_FILTERING_NONE, we fall back to this actual + * filtering level in hardware and make our own additional + * filtering in the mac802154 receive path.
+ * + * TODO: Move this logic to the device drivers as hardware may + * support higher-level filters. Hardware may also require + * a different order in which registers are set, which could currently + * be buggy, so all receive mode parameters need to be moved to the + * start() callback, letting the driver enter the mode before + * it turns on receive handling. + */ + local->phy->filtering = IEEE802154_FILTERING_NONE; + break; + case IEEE802154_FILTERING_4_FRAME_FIELDS: + /* Do not error out if IEEE802154_HW_PROMISCUOUS because we + * expect the hardware to operate at the level + * IEEE802154_FILTERING_4_FRAME_FIELDS anyway. + */ + if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { + ret = drv_set_promiscuous_mode(local, false); + if (ret < 0) + return ret; + } + + local->phy->filtering = IEEE802154_FILTERING_4_FRAME_FIELDS; + break; + default: + WARN_ON(1); + return -EINVAL; + } + trace_802154_drv_start(local); local->started = true; smp_mb(); diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 010365a6364e..509e0172fe82 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -26,6 +26,8 @@ struct ieee802154_local { struct ieee802154_hw hw; const struct ieee802154_ops *ops; + /* hardware address filter */ + struct ieee802154_hw_addr_filt addr_filt; /* ieee802154 phy */ struct wpan_phy *phy; @@ -82,6 +84,16 @@ struct ieee802154_sub_if_data { struct ieee802154_local *local; struct net_device *dev; + /* Each interface starts and works in a nominal state at a given filtering + * level given by iface_default_filtering, which is set once and for all at + * interface creation and should not evolve over time. For some MAC + * operations however, the filtering level may change temporarily, as + * reflected in the required_filtering field. The actual filtering at + * the PHY level may be different and is shown in struct wpan_phy.
+ */ + enum ieee802154_filtering_level iface_default_filtering; + enum ieee802154_filtering_level required_filtering; + unsigned long state; char name[IFNAMSIZ]; diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 500ed1b81250..d9b50884d34e 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -147,25 +147,12 @@ static int ieee802154_setup_hw(struct ieee802154_sub_if_data *sdata) struct wpan_dev *wpan_dev = &sdata->wpan_dev; int ret; - if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { - ret = drv_set_promiscuous_mode(local, - wpan_dev->promiscuous_mode); - if (ret < 0) - return ret; - } + sdata->required_filtering = sdata->iface_default_filtering; if (local->hw.flags & IEEE802154_HW_AFILT) { - ret = drv_set_pan_id(local, wpan_dev->pan_id); - if (ret < 0) - return ret; - - ret = drv_set_extended_addr(local, wpan_dev->extended_addr); - if (ret < 0) - return ret; - - ret = drv_set_short_addr(local, wpan_dev->short_addr); - if (ret < 0) - return ret; + local->addr_filt.pan_id = wpan_dev->pan_id; + local->addr_filt.ieee_addr = wpan_dev->extended_addr; + local->addr_filt.short_addr = wpan_dev->short_addr; } if (local->hw.flags & IEEE802154_HW_LBT) { @@ -206,7 +193,8 @@ static int mac802154_slave_open(struct net_device *dev) if (res) goto err; - res = drv_start(local); + res = drv_start(local, sdata->required_filtering, + &local->addr_filt); if (res) goto err; } @@ -223,15 +211,16 @@ err: static int ieee802154_check_mac_settings(struct ieee802154_local *local, - struct wpan_dev *wpan_dev, - struct wpan_dev *nwpan_dev) + struct ieee802154_sub_if_data *sdata, + struct ieee802154_sub_if_data *nsdata) { + struct wpan_dev *nwpan_dev = &nsdata->wpan_dev; + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + ASSERT_RTNL(); - if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { - if (wpan_dev->promiscuous_mode != nwpan_dev->promiscuous_mode) - return -EBUSY; - } + if (sdata->iface_default_filtering != nsdata->iface_default_filtering) + return -EBUSY; if (local->hw.flags & IEEE802154_HW_AFILT) { if (wpan_dev->pan_id != nwpan_dev->pan_id || @@ -285,8 +274,7 @@ ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata, /* check all phy mac sublayer settings are the same. * We have only one phy, different values makes trouble. 
*/ - ret = ieee802154_check_mac_settings(local, wpan_dev, - &nsdata->wpan_dev); + ret = ieee802154_check_mac_settings(local, sdata, nsdata); if (ret < 0) return ret; } @@ -586,7 +574,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, sdata->dev->priv_destructor = mac802154_wpan_free; sdata->dev->netdev_ops = &mac802154_wpan_ops; sdata->dev->ml_priv = &mac802154_mlme_wpan; - wpan_dev->promiscuous_mode = false; + sdata->iface_default_filtering = IEEE802154_FILTERING_4_FRAME_FIELDS; wpan_dev->header_ops = &ieee802154_header_ops; mutex_init(&sdata->sec_mtx); @@ -600,7 +588,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, case NL802154_IFTYPE_MONITOR: sdata->dev->needs_free_netdev = true; sdata->dev->netdev_ops = &mac802154_monitor_ops; - wpan_dev->promiscuous_mode = true; + sdata->iface_default_filtering = IEEE802154_FILTERING_NONE; break; default: BUG(); diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index b8ce84618a55..8543c28948a0 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -268,10 +268,20 @@ void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb) ieee802154_monitors_rx(local, skb); + /* TODO: Avoid delivering frames received at the level + * IEEE802154_FILTERING_NONE on interfaces not expecting it because of + * the missing auto ACK handling feature. + */ + + /* TODO: Handle upcoming receive path where the PHY is at the + * IEEE802154_FILTERING_NONE level during a scan. + */ + /* Check if transceiver doesn't validate the checksum. * If not we validate the checksum here. */ - if (local->hw.flags & IEEE802154_HW_RX_DROP_BAD_CKSUM) { + if (local->hw.flags & IEEE802154_HW_RX_DROP_BAD_CKSUM || + local->phy->filtering == IEEE802154_FILTERING_NONE) { crc = crc_ccitt(0, skb->data, skb->len); if (crc) { rcu_read_unlock(); -- cgit v1.2.3 From ea562d8c486eebd2707bcd193974078a2a47affc Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 7 Oct 2022 10:53:07 +0200 Subject: ieee802154: hwsim: Implement address filtering We have access to the address filters that should theoretically be applied, and we also have access to the actual filtering level in use, so let's add a proper frame validation sequence in hwsim.
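To make the acceptance rules below concrete, the PAN ID constraint boils down to the following check (an illustrative helper, not part of the patch):

/* Illustrative only: a destination PAN ID is accepted when it matches
 * either the filter's PAN ID or the broadcast PAN ID (0xffff).
 */
static bool hwsim_dest_pan_id_ok(__le16 dest_pan_id, __le16 filt_pan_id)
{
	return dest_pan_id == filt_pan_id ||
	       dest_pan_id == cpu_to_le16(IEEE802154_PANID_BROADCAST);
}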
Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221007085310.503366-6-miquel.raynal@bootlin.com [stefan@datenfreihafen.org: fixup some checkpatch warnings] Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mac802154_hwsim.c | 110 ++++++++++++++++++++++++++++++- include/net/ieee802154_netdev.h | 8 +++ 2 files changed, 116 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index 458be66b5195..75d802e0b685 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -139,6 +140,112 @@ static int hwsim_hw_addr_filt(struct ieee802154_hw *hw, return 0; } +static void hwsim_hw_receive(struct ieee802154_hw *hw, struct sk_buff *skb, + u8 lqi) +{ + struct ieee802154_hdr hdr; + struct hwsim_phy *phy = hw->priv; + struct hwsim_pib *pib; + + rcu_read_lock(); + pib = rcu_dereference(phy->pib); + + if (!pskb_may_pull(skb, 3)) { + dev_dbg(hw->parent, "invalid frame\n"); + goto drop; + } + + memcpy(&hdr, skb->data, 3); + + /* Level 4 filtering: Frame fields validity */ + if (hw->phy->filtering == IEEE802154_FILTERING_4_FRAME_FIELDS) { + /* a) Drop reserved frame types */ + switch (mac_cb(skb)->type) { + case IEEE802154_FC_TYPE_BEACON: + case IEEE802154_FC_TYPE_DATA: + case IEEE802154_FC_TYPE_ACK: + case IEEE802154_FC_TYPE_MAC_CMD: + break; + default: + dev_dbg(hw->parent, "unrecognized frame type 0x%x\n", + mac_cb(skb)->type); + goto drop; + } + + /* b) Drop reserved frame versions */ + switch (hdr.fc.version) { + case IEEE802154_2003_STD: + case IEEE802154_2006_STD: + case IEEE802154_STD: + break; + default: + dev_dbg(hw->parent, + "unrecognized frame version 0x%x\n", + hdr.fc.version); + goto drop; + } + + /* c) PAN ID constraints */ + if ((mac_cb(skb)->dest.mode == IEEE802154_ADDR_LONG || + mac_cb(skb)->dest.mode == IEEE802154_ADDR_SHORT) && + mac_cb(skb)->dest.pan_id != pib->filt.pan_id && + mac_cb(skb)->dest.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST)) { + dev_dbg(hw->parent, + "unrecognized PAN ID %04x\n", + le16_to_cpu(mac_cb(skb)->dest.pan_id)); + goto drop; + } + + /* d1) Short address constraints */ + if (mac_cb(skb)->dest.mode == IEEE802154_ADDR_SHORT && + mac_cb(skb)->dest.short_addr != pib->filt.short_addr && + mac_cb(skb)->dest.short_addr != cpu_to_le16(IEEE802154_ADDR_BROADCAST)) { + dev_dbg(hw->parent, + "unrecognized short address %04x\n", + le16_to_cpu(mac_cb(skb)->dest.short_addr)); + goto drop; + } + + /* d2) Extended address constraints */ + if (mac_cb(skb)->dest.mode == IEEE802154_ADDR_LONG && + mac_cb(skb)->dest.extended_addr != pib->filt.ieee_addr) { + dev_dbg(hw->parent, + "unrecognized long address 0x%016llx\n", + mac_cb(skb)->dest.extended_addr); + goto drop; + } + + /* d4) Specific PAN coordinator case (no parent) */ + if ((mac_cb(skb)->type == IEEE802154_FC_TYPE_DATA || + mac_cb(skb)->type == IEEE802154_FC_TYPE_MAC_CMD) && + mac_cb(skb)->dest.mode == IEEE802154_ADDR_NONE) { + dev_dbg(hw->parent, + "relaying is not supported\n"); + goto drop; + } + + /* e) Beacon frames follow specific PAN ID rules */ + if (mac_cb(skb)->type == IEEE802154_FC_TYPE_BEACON && + pib->filt.pan_id != cpu_to_le16(IEEE802154_PANID_BROADCAST) && + mac_cb(skb)->dest.pan_id != pib->filt.pan_id) { + dev_dbg(hw->parent, + "invalid beacon PAN ID %04x\n", + le16_to_cpu(mac_cb(skb)->dest.pan_id)); + goto drop; + } + } + + 
rcu_read_unlock(); + + ieee802154_rx_irqsafe(hw, skb, lqi); + + return; + +drop: + rcu_read_unlock(); + kfree_skb(skb); +} + static int hwsim_hw_xmit(struct ieee802154_hw *hw, struct sk_buff *skb) { struct hwsim_phy *current_phy = hw->priv; @@ -166,8 +273,7 @@ static int hwsim_hw_xmit(struct ieee802154_hw *hw, struct sk_buff *skb) einfo = rcu_dereference(e->info); if (newskb) - ieee802154_rx_irqsafe(e->endpoint->hw, newskb, - einfo->lqi); + hwsim_hw_receive(e->endpoint->hw, newskb, einfo->lqi); } } rcu_read_unlock(); diff --git a/include/net/ieee802154_netdev.h b/include/net/ieee802154_netdev.h index d0d188c3294b..1b82bbafe8c7 100644 --- a/include/net/ieee802154_netdev.h +++ b/include/net/ieee802154_netdev.h @@ -69,6 +69,14 @@ struct ieee802154_hdr_fc { #endif }; +enum ieee802154_frame_version { + IEEE802154_2003_STD, + IEEE802154_2006_STD, + IEEE802154_STD, + IEEE802154_RESERVED_STD, + IEEE802154_MULTIPURPOSE_STD = IEEE802154_2003_STD, +}; + struct ieee802154_hdr { struct ieee802154_hdr_fc fc; u8 seq; -- cgit v1.2.3 From a4b5b4c56dd8b1dd46b2f13cb09f5f8031978f86 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Fri, 7 Oct 2022 10:53:08 +0200 Subject: mac802154: Drop IEEE802154_HW_RX_DROP_BAD_CKSUM This IEEE802154_HW_RX_DROP_BAD_CKSUM flag was only used by hwsim to reflect the fact that it would not validate the checksum (FCS). So this was only useful while the only filtering level hwsim was capable of was "NONE". Now that the driver has been improved we no longer need this flag. Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221007085310.503366-7-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- drivers/net/ieee802154/mac802154_hwsim.c | 3 ++- include/net/mac802154.h | 4 ---- net/mac802154/rx.c | 7 ++----- 3 files changed, 4 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ieee802154/mac802154_hwsim.c b/drivers/net/ieee802154/mac802154_hwsim.c index 75d802e0b685..1db7da3ccc1a 100644 --- a/drivers/net/ieee802154/mac802154_hwsim.c +++ b/drivers/net/ieee802154/mac802154_hwsim.c @@ -287,6 +287,7 @@ static int hwsim_hw_start(struct ieee802154_hw *hw) struct hwsim_phy *phy = hw->priv; phy->suspended = false; + return 0; } @@ -933,7 +934,7 @@ static int hwsim_add_one(struct genl_info *info, struct device *dev, phy->idx = idx; INIT_LIST_HEAD(&phy->edges); - hw->flags = IEEE802154_HW_PROMISCUOUS | IEEE802154_HW_RX_DROP_BAD_CKSUM; + hw->flags = IEEE802154_HW_PROMISCUOUS; hw->parent = dev; err = ieee802154_register_hw(hw); diff --git a/include/net/mac802154.h b/include/net/mac802154.h index 357d25ef627a..4a3a9de9da73 100644 --- a/include/net/mac802154.h +++ b/include/net/mac802154.h @@ -111,9 +111,6 @@ struct ieee802154_hw { * promiscuous mode setting. * * @IEEE802154_HW_RX_OMIT_CKSUM: Indicates that receiver omits FCS. - * - * @IEEE802154_HW_RX_DROP_BAD_CKSUM: Indicates that receiver will not filter - * frames with bad checksum. */ enum ieee802154_hw_flags { IEEE802154_HW_TX_OMIT_CKSUM = BIT(0), @@ -123,7 +120,6 @@ enum ieee802154_hw_flags { IEEE802154_HW_AFILT = BIT(4), IEEE802154_HW_PROMISCUOUS = BIT(5), IEEE802154_HW_RX_OMIT_CKSUM = BIT(6), - IEEE802154_HW_RX_DROP_BAD_CKSUM = BIT(7), }; /* Indicates that receiver omits FCS and xmitter will add FCS on it's own. 
*/ diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index 8543c28948a0..80dd52bc6bf1 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -277,11 +277,8 @@ void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb) * IEEE802154_FILTERING_NONE level during a scan. */ - /* Check if transceiver doesn't validate the checksum. - * If not we validate the checksum here. - */ - if (local->hw.flags & IEEE802154_HW_RX_DROP_BAD_CKSUM || - local->phy->filtering == IEEE802154_FILTERING_NONE) { + /* Level 1 filtering: Check the FCS by software when relevant */ + if (local->hw.phy->filtering == IEEE802154_FILTERING_NONE) { crc = crc_ccitt(0, skb->data, skb->len); if (crc) { rcu_read_unlock(); -- cgit v1.2.3 From 6fdfdef7fdb57e6b9f768c9ca0718dcb5e727a85 Mon Sep 17 00:00:00 2001 From: Alexey Kodanev Date: Wed, 19 Oct 2022 21:07:33 +0300 Subject: sctp: remove unnecessary NULL check in sctp_association_init() '&asoc->ulpq' is passed to sctp_ulpq_init() as the first argument; sctp_ulpq_init() then initializes it and eventually returns the address of the struct member back. Therefore, in this case, the return pointer cannot be NULL. Moreover, it seems sctp_ulpq_init() has always been used only in sctp_association_init(), so there's really no need to return ulpq anymore. Detected using the static analysis tool - Svace. Signed-off-by: Alexey Kodanev Reviewed-by: Xin Long Link: https://lore.kernel.org/r/20221019180735.161388-1-aleksei.kodanev@bell-sw.com Signed-off-by: Jakub Kicinski --- include/net/sctp/ulpqueue.h | 3 +-- net/sctp/associola.c | 4 +--- net/sctp/ulpqueue.c | 5 +---- 3 files changed, 3 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/ulpqueue.h b/include/net/sctp/ulpqueue.h index 0eaf8650e3b2..60f6641290c3 100644 --- a/include/net/sctp/ulpqueue.h +++ b/include/net/sctp/ulpqueue.h @@ -35,8 +35,7 @@ struct sctp_ulpq { }; /* Prototypes. */ -struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *, - struct sctp_association *); +void sctp_ulpq_init(struct sctp_ulpq *ulpq, struct sctp_association *asoc); void sctp_ulpq_flush(struct sctp_ulpq *ulpq); void sctp_ulpq_free(struct sctp_ulpq *); diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 3460abceba44..63ba5551c13f 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -226,8 +226,7 @@ static struct sctp_association *sctp_association_init( /* Create an output queue. */ sctp_outq_init(asoc, &asoc->outqueue); - if (!sctp_ulpq_init(&asoc->ulpq, asoc)) - goto fail_init; + sctp_ulpq_init(&asoc->ulpq, asoc); if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, 0, gfp)) goto stream_free; @@ -277,7 +276,6 @@ static struct sctp_association *sctp_association_init( stream_free: sctp_stream_free(&asoc->stream); -fail_init: sock_put(asoc->base.sk); sctp_endpoint_put(asoc->ep); return NULL; diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 0a8510a0c5e6..24960dcb6a21 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -38,8 +38,7 @@ static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq); /* 1st Level Abstractions */ /* Initialize a ULP queue from a block of memory.
*/ -struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq, - struct sctp_association *asoc) +void sctp_ulpq_init(struct sctp_ulpq *ulpq, struct sctp_association *asoc) { memset(ulpq, 0, sizeof(struct sctp_ulpq)); @@ -48,8 +47,6 @@ struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq, skb_queue_head_init(&ulpq->reasm_uo); skb_queue_head_init(&ulpq->lobby); ulpq->pd_mode = 0; - - return ulpq; } -- cgit v1.2.3 From 1f8c4eeb945553baf868bbec7a8c59810df97a07 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 19 Oct 2022 15:36:02 -0700 Subject: inet6: Remove inet6_destroy_sock(). The last user of inet6_destroy_sock() is its wrapper inet6_cleanup_sock(). Let's rename inet6_destroy_sock() to inet6_cleanup_sock(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/transp_v6.h | 2 -- net/ipv6/af_inet6.c | 8 +------- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index b830463e3dff..d27b1caf3753 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -58,8 +58,6 @@ ip6_dgram_sock_seq_show(struct seq_file *seq, struct sock *sp, __u16 srcp, #define LOOPBACK4_IPV6 cpu_to_be32(0x7f000006) -void inet6_destroy_sock(struct sock *sk); - #define IPV6_SEQ_DGRAM_HEADER \ " sl " \ "local_address " \ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 6540551ea7ec..68075295d587 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -490,7 +490,7 @@ int inet6_release(struct socket *sock) } EXPORT_SYMBOL(inet6_release); -void inet6_destroy_sock(struct sock *sk) +void inet6_cleanup_sock(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; @@ -515,12 +515,6 @@ void inet6_destroy_sock(struct sock *sk) txopt_put(opt); } } -EXPORT_SYMBOL_GPL(inet6_destroy_sock); - -void inet6_cleanup_sock(struct sock *sk) -{ - inet6_destroy_sock(sk); -} EXPORT_SYMBOL_GPL(inet6_cleanup_sock); /* -- cgit v1.2.3 From 8a3854c7b8e4532063b14bed34115079b7d0cb36 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Thu, 20 Oct 2022 19:48:52 +0200 Subject: udp: track the forward memory release threshold in a hot cacheline When the receiver process and the BH run on different cores, udp_rmem_release() experiences a cache miss while accessing sk_rcvbuf, as the latter shares the same cacheline with sk_forward_alloc, written by the BH. With this patch, UDP tracks the rcvbuf value and its updates via custom SOL_SOCKET socket options, and copies the forward memory threshold value used by udp_rmem_release() in a different cacheline, already accessed by the above function and uncontended. Since the UDP socket init operation has grown a bit, factor out the common code between v4 and v6 in a shared helper. Overall the above gives a 10% peak throughput increase under UDP flood. Signed-off-by: Paolo Abeni Reviewed-by: Eric Dumazet Acked-by: Kuniyuki Iwashima Signed-off-by: David S.
Miller --- include/linux/udp.h | 3 +++ include/net/udp.h | 9 +++++++++ net/ipv4/udp.c | 18 +++++++++++++++--- net/ipv6/udp.c | 4 ++-- 4 files changed, 29 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/linux/udp.h b/include/linux/udp.h index e96da4157d04..5cdba00a904a 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -87,6 +87,9 @@ struct udp_sock { /* This field is dirtied by udp_recvmsg() */ int forward_deficit; + + /* This field follows the rcvbuf value, and is touched by udp_recvmsg */ + int forward_threshold; }; #define UDP_MAX_SEGMENTS (1 << 6UL) diff --git a/include/net/udp.h b/include/net/udp.h index fee053bcd17c..de4b528522bb 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -174,6 +174,15 @@ INDIRECT_CALLABLE_DECLARE(int udpv6_rcv(struct sk_buff *)); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); +static inline void udp_lib_init_sock(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + + skb_queue_head_init(&up->reader_queue); + up->forward_threshold = sk->sk_rcvbuf >> 2; + set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); +} + /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline int udp_lib_hash(struct sock *sk) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index c83e5271030b..e77c8f0e9087 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1448,7 +1448,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; - if (size < (sk->sk_rcvbuf >> 2) && + if (size < READ_ONCE(up->forward_threshold) && !skb_queue_empty(&up->reader_queue)) return; } else { @@ -1622,7 +1622,7 @@ static void udp_destruct_sock(struct sock *sk) int udp_init_sock(struct sock *sk) { - skb_queue_head_init(&udp_sk(sk)->reader_queue); + udp_lib_init_sock(sk); sk->sk_destruct = udp_destruct_sock; return 0; } @@ -2671,6 +2671,18 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int err = 0; int is_udplite = IS_UDPLITE(sk); + if (level == SOL_SOCKET) { + err = sk_setsockopt(sk, level, optname, optval, optlen); + + if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) { + sockopt_lock_sock(sk); + /* paired with READ_ONCE in udp_rmem_release() */ + WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2); + sockopt_release_sock(sk); + } + return err; + } + if (optlen < sizeof(int)) return -EINVAL; @@ -2784,7 +2796,7 @@ EXPORT_SYMBOL(udp_lib_setsockopt); int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE) + if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_push_pending_frames); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 2260406740d3..297f7cc06044 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -64,7 +64,7 @@ static void udpv6_destruct_sock(struct sock *sk) int udpv6_init_sock(struct sock *sk) { - skb_queue_head_init(&udp_sk(sk)->reader_queue); + udp_lib_init_sock(sk); sk->sk_destruct = udpv6_destruct_sock; return 0; } @@ -1669,7 +1669,7 @@ void udpv6_destroy_sock(struct sock *sk) int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_v6_push_pending_frames); --
cgit v1.2.3 From 0cafd77dcd032d1687efaba5598cf07bce85997f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 20 Oct 2022 23:20:18 +0000 Subject: net: add a refcount tracker for kernel sockets Commit ffa84b5ffb37 ("net: add netns refcount tracker to struct sock") added a tracker to sockets, but did not track kernel sockets. We still have syzbot reports hinting about netns being destroyed while some kernel TCP sockets had not been dismantled. This patch tracks kernel sockets, and adds a ref_tracker_dir_print() call to net_free() right before the netns is freed. Normally, each layer is responsible for properly releasing its kernel sockets before last call to net_free(). This debugging facility is enabled with CONFIG_NET_NS_REFCNT_TRACKER=y Signed-off-by: Eric Dumazet Reviewed-by: Kuniyuki Iwashima Tested-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/net_namespace.h | 30 ++++++++++++++++++++++-------- net/core/net_namespace.c | 5 +++++ net/core/sock.c | 14 ++++++++++++++ net/netlink/af_netlink.c | 11 +++++++++++ net/rds/tcp.c | 3 +++ 5 files changed, 55 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 8c3587d5c308..78beaa765c73 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -92,7 +92,9 @@ struct net { struct ns_common ns; struct ref_tracker_dir refcnt_tracker; - + struct ref_tracker_dir notrefcnt_tracker; /* tracker for objects not + * refcounted against netns + */ struct list_head dev_base_head; struct proc_dir_entry *proc_net; struct proc_dir_entry *proc_net_stat; @@ -320,19 +322,31 @@ static inline int check_net(const struct net *net) #endif -static inline void netns_tracker_alloc(struct net *net, - netns_tracker *tracker, gfp_t gfp) +static inline void __netns_tracker_alloc(struct net *net, + netns_tracker *tracker, + bool refcounted, + gfp_t gfp) { #ifdef CONFIG_NET_NS_REFCNT_TRACKER - ref_tracker_alloc(&net->refcnt_tracker, tracker, gfp); + ref_tracker_alloc(refcounted ? &net->refcnt_tracker : + &net->notrefcnt_tracker, + tracker, gfp); #endif } -static inline void netns_tracker_free(struct net *net, - netns_tracker *tracker) +static inline void netns_tracker_alloc(struct net *net, netns_tracker *tracker, + gfp_t gfp) +{ + __netns_tracker_alloc(net, tracker, true, gfp); +} + +static inline void __netns_tracker_free(struct net *net, + netns_tracker *tracker, + bool refcounted) { #ifdef CONFIG_NET_NS_REFCNT_TRACKER - ref_tracker_free(&net->refcnt_tracker, tracker); + ref_tracker_free(refcounted ? 
&net->refcnt_tracker : + &net->notrefcnt_tracker, tracker); #endif } @@ -346,7 +360,7 @@ static inline struct net *get_net_track(struct net *net, static inline void put_net_track(struct net *net, netns_tracker *tracker) { - netns_tracker_free(net, tracker); + __netns_tracker_free(net, tracker, true); put_net(net); } diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 0ec2f5906a27..12c68edf7682 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -309,6 +309,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128); + ref_tracker_dir_init(&net->notrefcnt_tracker, 128); refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); @@ -429,6 +430,10 @@ static void net_free(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); + + /* There should not be any trackers left there. */ + ref_tracker_dir_exit(&net->notrefcnt_tracker); + kmem_cache_free(net_cachep, net); } } diff --git a/net/core/sock.c b/net/core/sock.c index a3ba0358c77c..aa608dc0930b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2094,6 +2094,9 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); + } else { + __netns_tracker_alloc(net, &sk->ns_tracker, + false, priority); } sock_net_set(sk, net); @@ -2149,6 +2152,9 @@ static void __sk_destruct(struct rcu_head *head) if (likely(sk->sk_net_refcnt)) put_net_track(sock_net(sk), &sk->ns_tracker); + else + __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); + sk_prot_free(sk->sk_prot_creator, sk); } @@ -2237,6 +2243,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) if (likely(newsk->sk_net_refcnt)) { get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); sock_inuse_add(sock_net(newsk), 1); + } else { + /* Kernel sockets are not elevating the struct net refcount. + * Instead, use a tracker to more easily detect if a layer + * is not properly dismantling its kernel sockets at netns + * destroy time. + */ + __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, + false, priority); } sk_node_init(&newsk->sk_node); sock_lock_init(newsk); diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index a662e8a5ff84..f0c94d394ab1 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -812,6 +812,17 @@ static int netlink_release(struct socket *sock) } sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); + + /* Because struct net might disappear soon, do not keep a pointer. */ + if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) { + __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); + /* Because of deferred_put_nlk_sk and use of work queue, + * it is possible netns will be freed before this socket. 
+ */ + sock_net_set(sk, &init_net); + __netns_tracker_alloc(&init_net, &sk->ns_tracker, + false, GFP_KERNEL); + } call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 4444fd82b66d..c5b86066ff66 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -503,6 +503,9 @@ bool rds_tcp_tune(struct socket *sock) release_sock(sk); return false; } + /* Update ns_tracker to current stack trace and refcounted tracker */ + __netns_tracker_free(net, &sk->ns_tracker, false); + sk->sk_net_refcnt = 1; netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL); sock_inuse_add(net, 1); -- cgit v1.2.3 From 233baf9a1bc46f18ad3bec688f52ea5f818a8a25 Mon Sep 17 00:00:00 2001 From: xu xin Date: Thu, 20 Oct 2022 06:54:41 +0000 Subject: net: remove useless parameter of __sock_cmsg_send The parameter 'msg' has never been used by __sock_cmsg_send, so we can remove it safely. Reported-by: Zeal Robot Signed-off-by: xu xin Reviewed-by: Zhang Yunkai Acked-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/sock.h | 2 +- net/core/sock.c | 4 ++-- net/ipv4/ip_sockglue.c | 2 +- net/ipv6/datagram.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 9e464f6409a7..b1dacc4d68c9 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1901,7 +1901,7 @@ static inline void sockcm_init(struct sockcm_cookie *sockc, *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags }; } -int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, +int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc); diff --git a/net/core/sock.c b/net/core/sock.c index aa608dc0930b..2786c1107e53 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2744,7 +2744,7 @@ failure: } EXPORT_SYMBOL(sock_alloc_send_pskb); -int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, +int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { u32 tsflags; @@ -2798,7 +2798,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; - ret = __sock_cmsg_send(sk, msg, cmsg, sockc); + ret = __sock_cmsg_send(sk, cmsg, sockc); if (ret) return ret; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 6e19cad154f5..5f16807d3235 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -267,7 +267,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, } #endif if (cmsg->cmsg_level == SOL_SOCKET) { - err = __sock_cmsg_send(sk, msg, cmsg, &ipc->sockc); + err = __sock_cmsg_send(sk, cmsg, &ipc->sockc); if (err) return err; continue; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 5ecb56522f9d..df7e032ce87d 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -771,7 +771,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, } if (cmsg->cmsg_level == SOL_SOCKET) { - err = __sock_cmsg_send(sk, msg, cmsg, &ipc6->sockc); + err = __sock_cmsg_send(sk, cmsg, &ipc6->sockc); if (err) return err; continue; -- cgit v1.2.3 From 720ca52bcef225b967a339e0fffb6d0c7e962240 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 21 Oct 2022 09:03:04 -0700 Subject: net-memcg: avoid stalls when under memory pressure As Shakeel explains the commit under Fixes had the unintended side-effect of no longer pre-loading 
the cached memory allowance. Even though we previously dropped the first packet received when over the memory limit - the consecutive ones would get through by using the cache. The charging was happening in batches of 128kB, so we'd let in 128kB (truesize) worth of packets per drop. After the change we no longer force the charge, so there will be no cache-filling side effects. This causes significant drops and connection stalls for workloads which use a lot of page cache, since we can't reclaim page cache under GFP_NOWAIT. Some of the latency can be recovered by improving SACK reneg handling but nowhere near enough to get back to the pre-5.15 performance (the application I'm experimenting with still sees 5-10x worse latency). Apply the suggested workaround of using GFP_ATOMIC. We will now be more permissive than previously as we'll drop _no_ packets in softirq when under pressure. But I can't think of any good and simple way to address that within networking. Link: https://lore.kernel.org/all/20221012163300.795e7b86@kernel.org/ Suggested-by: Shakeel Butt Fixes: 4b1327be9fe5 ("net-memcg: pass in gfp_t mask to mem_cgroup_charge_skmem()") Acked-by: Shakeel Butt Acked-by: Roman Gushchin Link: https://lore.kernel.org/r/20221021160304.1362511-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/sock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 9e464f6409a7..22f8bab583dd 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2585,7 +2585,7 @@ static inline gfp_t gfp_any(void) static inline gfp_t gfp_memcg_charge(void) { - return in_softirq() ? GFP_NOWAIT : GFP_KERNEL; + return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) -- cgit v1.2.3 From 4fa86555d1cd338afc6e6308cc1ff890a014ec8c Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 21 Oct 2022 12:35:32 -0700 Subject: genetlink: piggy back on resv_op to default to a reject policy To keep backward compatibility we used to leave attribute parsing to the family if no policy is specified. This becomes tedious as we move to more strict validation. Families must define reject-all policies if they don't want any attributes accepted. Piggy back on the resv_start_op field as the switchover point. AFAICT only ethtool has added new commands since the resv_start_op was defined, and it has per-op policies so this should be a no-op. Nonetheless the patch should still go into v6.1 for consistency.
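As an illustration, a hypothetical family opting into the stricter behaviour could look like the sketch below (the family, its op table and the command constant are made up; only the resv_start_op semantics come from this patch):

/* Hypothetical genetlink family: commands numbered below resv_start_op
 * keep the legacy "no policy means no parsing" behaviour, while newer
 * commands without a policy get all TLV attributes rejected by the core.
 */
static struct genl_family foo_family = {
	.name		= "foo",
	.version	= 1,
	.ops		= foo_ops,		/* hypothetical op table */
	.n_ops		= ARRAY_SIZE(foo_ops),
	.resv_start_op	= FOO_CMD_NEWEST + 1,	/* hypothetical command */
};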
Link: https://lore.kernel.org/all/20221019125745.3f2e7659@kernel.org/ Link: https://lore.kernel.org/r/20221021193532.1511293-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/genetlink.h | 10 +++++++++- net/netlink/genetlink.c | 23 +++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 3d08e67b3cfc..9f97f73615b6 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -41,13 +41,21 @@ struct genl_info; * @mcgrps: multicast groups used by this family * @n_mcgrps: number of multicast groups * @resv_start_op: first operation for which reserved fields of the header - * can be validated, new families should leave this field at zero + * can be validated and policies are required (see below); + * new families should leave this field at zero * @mcgrp_offset: starting number of multicast group IDs in this family * (private) * @ops: the operations supported by this family * @n_ops: number of operations supported by this family * @small_ops: the small-struct operations supported by this family * @n_small_ops: number of small-struct operations supported by this family + * + * Attribute policies (the combination of @policy and @maxattr fields) + * can be attached at the family level or at the operation level. + * If both are present the per-operation policy takes precedence. + * For operations before @resv_start_op lack of policy means that the core + * will perform no attribute parsing or validation. For newer operations + * if policy is not provided core will reject all TLV attributes. */ struct genl_family { int id; /* private */ diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 39b7c00e4cef..b1fd059c9992 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -78,10 +78,29 @@ static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) | static unsigned long *mc_groups = &mc_group_start; static unsigned long mc_groups_longs = 1; +/* We need the last attribute with non-zero ID therefore a 2-entry array */ +static struct nla_policy genl_policy_reject_all[] = { + { .type = NLA_REJECT }, + { .type = NLA_REJECT }, +}; + static int genl_ctrl_event(int event, const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id); +static void +genl_op_fill_in_reject_policy(const struct genl_family *family, + struct genl_ops *op) +{ + BUILD_BUG_ON(ARRAY_SIZE(genl_policy_reject_all) - 1 != 1); + + if (op->policy || op->cmd < family->resv_start_op) + return; + + op->policy = genl_policy_reject_all; + op->maxattr = 1; +} + static const struct genl_family *genl_family_find_byid(unsigned int id) { return idr_find(&genl_fam_idr, id); @@ -113,6 +132,8 @@ static void genl_op_from_full(const struct genl_family *family, op->maxattr = family->maxattr; if (!op->policy) op->policy = family->policy; + + genl_op_fill_in_reject_policy(family, op); } static int genl_get_cmd_full(u32 cmd, const struct genl_family *family, @@ -142,6 +163,8 @@ static void genl_op_from_small(const struct genl_family *family, op->maxattr = family->maxattr; op->policy = family->policy; + + genl_op_fill_in_reject_policy(family, op); } static int genl_get_cmd_small(u32 cmd, const struct genl_family *family, -- cgit v1.2.3 From 4a6a676f8c16ec17d2f8d69ce3b5d680277ed0d2 Mon Sep 17 00:00:00 2001 From: Amritha Nambiar Date: Fri, 21 Oct 2022 00:58:39 -0700 Subject: act_skbedit: skbedit queue mapping for receive queue Add support for skbedit queue mapping action on 
receive side. This is supported only in hardware, so the skip_sw flag is enforced. This enables offloading filters for receive queue selection in the hardware using the skbedit action. Traffic arrives on the Rx queue requested in the skbedit action parameter. A new tc action flag TCA_ACT_FLAGS_AT_INGRESS is introduced to identify the traffic direction the action queue_mapping is requested on during filter addition. This is used to disallow offloading the skbedit queue mapping action on transmit side. Example: $tc filter add dev $IFACE ingress protocol ip flower dst_ip $DST_IP\ action skbedit queue_mapping $rxq_id skip_sw Reviewed-by: Sridhar Samudrala Signed-off-by: Amritha Nambiar Signed-off-by: Paolo Abeni --- include/net/act_api.h | 1 + include/net/flow_offload.h | 2 ++ include/net/tc_act/tc_skbedit.h | 29 +++++++++++++++++++++++++++++ net/sched/act_skbedit.c | 14 ++++++++++++-- net/sched/cls_api.c | 7 +++++++ 5 files changed, 51 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index 61f2ceb3939e..c94ea1a306e0 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -67,6 +67,7 @@ struct tc_action { #define TCA_ACT_FLAGS_BIND (1U << (TCA_ACT_FLAGS_USER_BITS + 1)) #define TCA_ACT_FLAGS_REPLACE (1U << (TCA_ACT_FLAGS_USER_BITS + 2)) #define TCA_ACT_FLAGS_NO_RTNL (1U << (TCA_ACT_FLAGS_USER_BITS + 3)) +#define TCA_ACT_FLAGS_AT_INGRESS (1U << (TCA_ACT_FLAGS_USER_BITS + 4)) /* Update lastuse only if needed, to avoid dirtying a cache line. * We use a temp variable to avoid fetching jiffies twice. diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index e343f9f8363e..7a60bc6d72c9 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -155,6 +155,7 @@ enum flow_action_id { FLOW_ACTION_MARK, FLOW_ACTION_PTYPE, FLOW_ACTION_PRIORITY, + FLOW_ACTION_RX_QUEUE_MAPPING, FLOW_ACTION_WAKE, FLOW_ACTION_QUEUE, FLOW_ACTION_SAMPLE, @@ -247,6 +248,7 @@ struct flow_action_entry { u32 csum_flags; /* FLOW_ACTION_CSUM */ u32 mark; /* FLOW_ACTION_MARK */ u16 ptype; /* FLOW_ACTION_PTYPE */ + u16 rx_queue; /* FLOW_ACTION_RX_QUEUE_MAPPING */ u32 priority; /* FLOW_ACTION_PRIORITY */ struct { /* FLOW_ACTION_QUEUE */ u32 ctx; diff --git a/include/net/tc_act/tc_skbedit.h b/include/net/tc_act/tc_skbedit.h index dc1079f28e13..9649600fb3dc 100644 --- a/include/net/tc_act/tc_skbedit.h +++ b/include/net/tc_act/tc_skbedit.h @@ -95,12 +95,41 @@ static inline u32 tcf_skbedit_priority(const struct tc_action *a) return priority; } +static inline u16 tcf_skbedit_rx_queue_mapping(const struct tc_action *a) +{ + u16 rx_queue; + + rcu_read_lock(); + rx_queue = rcu_dereference(to_skbedit(a)->params)->queue_mapping; + rcu_read_unlock(); + + return rx_queue; +} + /* Return true iff action is queue_mapping */ static inline bool is_tcf_skbedit_queue_mapping(const struct tc_action *a) { return is_tcf_skbedit_with_flag(a, SKBEDIT_F_QUEUE_MAPPING); } +/* Return true if action is on ingress traffic */ +static inline bool is_tcf_skbedit_ingress(u32 flags) +{ + return flags & TCA_ACT_FLAGS_AT_INGRESS; +} + +static inline bool is_tcf_skbedit_tx_queue_mapping(const struct tc_action *a) +{ + return is_tcf_skbedit_queue_mapping(a) && + !is_tcf_skbedit_ingress(a->tcfa_flags); +} + +static inline bool is_tcf_skbedit_rx_queue_mapping(const struct tc_action *a) +{ + return is_tcf_skbedit_queue_mapping(a) && + is_tcf_skbedit_ingress(a->tcfa_flags); +} + /* Return true iff action is inheritdsfield */ static inline bool 
is_tcf_skbedit_inheritdsfield(const struct tc_action *a) { diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 7f598784fd30..1710780c908a 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -148,6 +148,11 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, } if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) { + if (is_tcf_skbedit_ingress(act_flags) && + !(act_flags & TCA_ACT_FLAGS_SKIP_SW)) { + NL_SET_ERR_MSG_MOD(extack, "\"queue_mapping\" option on receive side is hardware only, use skip_sw"); + return -EOPNOTSUPP; + } flags |= SKBEDIT_F_QUEUE_MAPPING; queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); } @@ -374,9 +379,12 @@ static int tcf_skbedit_offload_act_setup(struct tc_action *act, void *entry_data } else if (is_tcf_skbedit_priority(act)) { entry->id = FLOW_ACTION_PRIORITY; entry->priority = tcf_skbedit_priority(act); - } else if (is_tcf_skbedit_queue_mapping(act)) { - NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"queue_mapping\" option is used"); + } else if (is_tcf_skbedit_tx_queue_mapping(act)) { + NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"queue_mapping\" option is used on transmit side"); return -EOPNOTSUPP; + } else if (is_tcf_skbedit_rx_queue_mapping(act)) { + entry->id = FLOW_ACTION_RX_QUEUE_MAPPING; + entry->rx_queue = tcf_skbedit_rx_queue_mapping(act); } else if (is_tcf_skbedit_inheritdsfield(act)) { NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"inheritdsfield\" option is used"); return -EOPNOTSUPP; @@ -394,6 +402,8 @@ static int tcf_skbedit_offload_act_setup(struct tc_action *act, void *entry_data fl_action->id = FLOW_ACTION_PTYPE; else if (is_tcf_skbedit_priority(act)) fl_action->id = FLOW_ACTION_PRIORITY; + else if (is_tcf_skbedit_rx_queue_mapping(act)) + fl_action->id = FLOW_ACTION_RX_QUEUE_MAPPING; else return -EOPNOTSUPP; } diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 50566db45949..23d1cfa4f58c 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1953,6 +1953,11 @@ static void tfilter_put(struct tcf_proto *tp, void *fh) tp->ops->put(tp, fh); } +static bool is_qdisc_ingress(__u32 classid) +{ + return (TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS)); +} + static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { @@ -2144,6 +2149,8 @@ replay: flags |= TCA_ACT_FLAGS_REPLACE; if (!rtnl_held) flags |= TCA_ACT_FLAGS_NO_RTNL; + if (is_qdisc_ingress(parent)) + flags |= TCA_ACT_FLAGS_AT_INGRESS; err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, flags, extack); if (err == 0) { -- cgit v1.2.3 From b261eda84ec136240a9ca753389853a3a1bccca2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 21 Oct 2022 13:44:34 -0700 Subject: soreuseport: Fix socket selection for SO_INCOMING_CPU. Kazuho Oku reported that setsockopt(SO_INCOMING_CPU) does not work with setsockopt(SO_REUSEPORT) since v4.6. With the combination of SO_REUSEPORT and SO_INCOMING_CPU, we could build a highly efficient server application. setsockopt(SO_INCOMING_CPU) associates a CPU with a TCP listener or UDP socket, and then incoming packets processed on the CPU will likely be distributed to the socket. Technically, a socket could even receive packets handled on another CPU if no sockets in the reuseport group have the same CPU receiving the flow. The logic exists in compute_score() so that a socket will get a higher score if it has the same CPU with the flow. 
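As a reminder of the userspace pattern involved, a per-CPU listener is set up roughly as follows (minimal sketch; headers, bind()/listen() and error handling omitted):

/* One listener per CPU, each pinned with SO_INCOMING_CPU so that flows
 * processed on that CPU are steered to its socket.
 */
int fd = socket(AF_INET, SOCK_STREAM, 0);
int one = 1;
int cpu = 3;	/* the CPU this listener is dedicated to */

setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &one, sizeof(one));
setsockopt(fd, SOL_SOCKET, SO_INCOMING_CPU, &cpu, sizeof(cpu));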
However, the score gets ignored after the blamed two commits, which introduced a faster socket selection algorithm for SO_REUSEPORT. This patch introduces a counter of sockets with SO_INCOMING_CPU in a reuseport group to check if we should iterate all sockets to find a proper one. We increment the counter when * calling listen() if the socket has SO_INCOMING_CPU and SO_REUSEPORT * enabling SO_INCOMING_CPU if the socket is in a reuseport group Also, we decrement it when * detaching a socket out of the group to apply SO_INCOMING_CPU to migrated TCP requests * disabling SO_INCOMING_CPU if the socket is in a reuseport group When the counter reaches 0, we can get back to the O(1) selection algorithm. The overall changes are negligible for the non-SO_INCOMING_CPU case, and the only notable thing is that we have to update sk_incoming_cpu under reuseport_lock. Otherwise, the race prevents transitioning to the O(n) algorithm and results in the wrong socket selection. cpu1 (setsockopt) cpu2 (listen) +-----------------+ +-------------+ lock_sock(sk1) lock_sock(sk2) reuseport_update_incoming_cpu(sk1, val) . | /* set CPU as 0 */ |- WRITE_ONCE(sk1->incoming_cpu, val) | | spin_lock_bh(&reuseport_lock) | reuseport_grow(sk2, reuse) | . | |- more_socks_size = reuse->max_socks * 2U; | |- if (more_socks_size > U16_MAX && | | reuse->num_closed_socks) | | . | | |- RCU_INIT_POINTER(sk1->sk_reuseport_cb, NULL); | | `- __reuseport_detach_closed_sock(sk1, reuse) | | . | | `- reuseport_put_incoming_cpu(sk1, reuse) | | . | | | /* Read shutdown()ed sk1's sk_incoming_cpu | | | * without lock_sock(). | | | */ | | `- if (sk1->sk_incoming_cpu >= 0) | | . | | | /* decrement not-yet-incremented | | | * count, which is never incremented. | | | */ | | `- __reuseport_put_incoming_cpu(reuse); | | | `- spin_lock_bh(&reuseport_lock) | |- spin_lock_bh(&reuseport_lock) | |- reuse = rcu_dereference_protected(sk1->sk_reuseport_cb, ...) |- if (!reuse) | . | | /* Cannot increment reuse->incoming_cpu. */ | `- goto out; | `- spin_unlock_bh(&reuseport_lock) Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection") Fixes: c125e80b8868 ("soreuseport: fast reuseport TCP socket selection") Reported-by: Kazuho Oku Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/sock_reuseport.h | 2 + net/core/sock.c | 2 +- net/core/sock_reuseport.c | 94 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 92 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/sock_reuseport.h b/include/net/sock_reuseport.h index efc9085c6892..6ec140b0a61b 100644 --- a/include/net/sock_reuseport.h +++ b/include/net/sock_reuseport.h @@ -16,6 +16,7 @@ struct sock_reuseport { u16 max_socks; /* length of socks */ u16 num_socks; /* elements in socks */ u16 num_closed_socks; /* closed elements in socks */ + u16 incoming_cpu; /* The last synq overflow event timestamp of this * reuse->socks[] group.
*/ @@ -58,5 +59,6 @@ static inline bool reuseport_has_conns(struct sock *sk) } void reuseport_has_conns_set(struct sock *sk); +void reuseport_update_incoming_cpu(struct sock *sk, int val); #endif /* _SOCK_REUSEPORT_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 2786c1107e53..4571914a4aa8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1436,7 +1436,7 @@ set_sndbuf: break; } case SO_INCOMING_CPU: - WRITE_ONCE(sk->sk_incoming_cpu, val); + reuseport_update_incoming_cpu(sk, val); break; case SO_CNX_ADVICE: diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index fb90e1e00773..5a165286e4d8 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -37,6 +37,70 @@ void reuseport_has_conns_set(struct sock *sk) } EXPORT_SYMBOL(reuseport_has_conns_set); +static void __reuseport_get_incoming_cpu(struct sock_reuseport *reuse) +{ + /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ + WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu + 1); +} + +static void __reuseport_put_incoming_cpu(struct sock_reuseport *reuse) +{ + /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ + WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu - 1); +} + +static void reuseport_get_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) +{ + if (sk->sk_incoming_cpu >= 0) + __reuseport_get_incoming_cpu(reuse); +} + +static void reuseport_put_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) +{ + if (sk->sk_incoming_cpu >= 0) + __reuseport_put_incoming_cpu(reuse); +} + +void reuseport_update_incoming_cpu(struct sock *sk, int val) +{ + struct sock_reuseport *reuse; + int old_sk_incoming_cpu; + + if (unlikely(!rcu_access_pointer(sk->sk_reuseport_cb))) { + /* Paired with READ_ONCE() in sk_incoming_cpu_update() + * and compute_score(). + */ + WRITE_ONCE(sk->sk_incoming_cpu, val); + return; + } + + spin_lock_bh(&reuseport_lock); + + /* This must be done under reuseport_lock to avoid a race with + * reuseport_grow(), which accesses sk->sk_incoming_cpu without + * lock_sock() when detaching a shutdown()ed sk. + * + * Paired with READ_ONCE() in reuseport_select_sock_by_hash(). + */ + old_sk_incoming_cpu = sk->sk_incoming_cpu; + WRITE_ONCE(sk->sk_incoming_cpu, val); + + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + + /* reuseport_grow() has detached a closed sk.
*/ + if (!reuse) + goto out; + + if (old_sk_incoming_cpu < 0 && val >= 0) + __reuseport_get_incoming_cpu(reuse); + else if (old_sk_incoming_cpu >= 0 && val < 0) + __reuseport_put_incoming_cpu(reuse); + +out: + spin_unlock_bh(&reuseport_lock); +} + static int reuseport_sock_index(struct sock *sk, const struct sock_reuseport *reuse, bool closed) @@ -64,6 +128,7 @@ static void __reuseport_add_sock(struct sock *sk, /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */ smp_wmb(); reuse->num_socks++; + reuseport_get_incoming_cpu(sk, reuse); } static bool __reuseport_detach_sock(struct sock *sk, @@ -76,6 +141,7 @@ static bool __reuseport_detach_sock(struct sock *sk, reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; reuse->num_socks--; + reuseport_put_incoming_cpu(sk, reuse); return true; } @@ -86,6 +152,7 @@ static void __reuseport_add_closed_sock(struct sock *sk, reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk; /* paired with READ_ONCE() in inet_csk_bind_conflict() */ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1); + reuseport_get_incoming_cpu(sk, reuse); } static bool __reuseport_detach_closed_sock(struct sock *sk, @@ -99,6 +166,7 @@ static bool __reuseport_detach_closed_sock(struct sock *sk, reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; /* paired with READ_ONCE() in inet_csk_bind_conflict() */ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1); + reuseport_put_incoming_cpu(sk, reuse); return true; } @@ -166,6 +234,7 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) reuse->bind_inany = bind_inany; reuse->socks[0] = sk; reuse->num_socks = 1; + reuseport_get_incoming_cpu(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -209,6 +278,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) more_reuse->reuseport_id = reuse->reuseport_id; more_reuse->bind_inany = reuse->bind_inany; more_reuse->has_conns = reuse->has_conns; + more_reuse->incoming_cpu = reuse->incoming_cpu; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); @@ -458,18 +528,32 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse, u32 hash, u16 num_socks) { + struct sock *first_valid_sk = NULL; int i, j; i = j = reciprocal_scale(hash, num_socks); - while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { + do { + struct sock *sk = reuse->socks[i]; + + if (sk->sk_state != TCP_ESTABLISHED) { + /* Paired with WRITE_ONCE() in __reuseport_(get|put)_incoming_cpu(). */ + if (!READ_ONCE(reuse->incoming_cpu)) + return sk; + + /* Paired with WRITE_ONCE() in reuseport_update_incoming_cpu(). */ + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) + return sk; + + if (!first_valid_sk) + first_valid_sk = sk; + } + i++; if (i >= num_socks) i = 0; - if (i == j) - return NULL; - } + } while (i != j); - return reuse->socks[i]; + return first_valid_sk; } /** -- cgit v1.2.3 From ac1f8c049319847b1b4c6b387fdb2e3f7fb84ffc Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 28 Sep 2022 23:55:06 +0200 Subject: netfilter: nft_payload: move struct nft_payload_set definition where it belongs Not required to expose this header in nf_tables_core.h, move it to where it is used, ie. nft_payload. 
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 10 ---------- net/netfilter/nft_payload.c | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 1223af68cd9a..990c3767a350 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -66,16 +66,6 @@ struct nft_payload { u8 dreg; }; -struct nft_payload_set { - enum nft_payload_bases base:8; - u8 offset; - u8 len; - u8 sreg; - u8 csum_type; - u8 csum_offset; - u8 csum_flags; -}; - extern const struct nft_expr_ops nft_payload_fast_ops; extern const struct nft_expr_ops nft_bitwise_fast_ops; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 088244f9d838..07621d509a68 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -665,6 +665,16 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, return 0; } +struct nft_payload_set { + enum nft_payload_bases base:8; + u8 offset; + u8 len; + u8 sreg; + u8 csum_type; + u8 csum_offset; + u8 csum_flags; +}; + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) -- cgit v1.2.3 From e7a1caa67ce62765fe174cae08e537d542bb44f8 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Sat, 15 Oct 2022 00:20:50 +0200 Subject: netfilter: nf_tables: reduce nft_pktinfo by 8 bytes structure is reduced from 32 to 24 bytes. While at it, also check that iphdrlen is sane, this is guaranteed for NFPROTO_IPV4 but not for ingress or bridge, so add checks for this. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 4 ++-- include/net/netfilter/nf_tables_ipv4.h | 4 ++++ include/net/netfilter/nf_tables_ipv6.h | 6 +++--- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index cdb7db9b0e25..f6db510689a8 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -32,8 +32,8 @@ struct nft_pktinfo { u8 flags; u8 tprot; u16 fragoff; - unsigned int thoff; - unsigned int inneroff; + u16 thoff; + u16 inneroff; }; static inline struct sock *nft_sk(const struct nft_pktinfo *pkt) diff --git a/include/net/netfilter/nf_tables_ipv4.h b/include/net/netfilter/nf_tables_ipv4.h index c4a6147b0ef8..112708f7a6b4 100644 --- a/include/net/netfilter/nf_tables_ipv4.h +++ b/include/net/netfilter/nf_tables_ipv4.h @@ -35,6 +35,8 @@ static inline int __nft_set_pktinfo_ipv4_validate(struct nft_pktinfo *pkt) return -1; else if (len < thoff) return -1; + else if (thoff < sizeof(*iph)) + return -1; pkt->flags = NFT_PKTINFO_L4PROTO; pkt->tprot = iph->protocol; @@ -69,6 +71,8 @@ static inline int nft_set_pktinfo_ipv4_ingress(struct nft_pktinfo *pkt) return -1; } else if (len < thoff) { goto inhdr_error; + } else if (thoff < sizeof(*iph)) { + return -1; } pkt->flags = NFT_PKTINFO_L4PROTO; diff --git a/include/net/netfilter/nf_tables_ipv6.h b/include/net/netfilter/nf_tables_ipv6.h index ec7eaeaf4f04..467d59b9e533 100644 --- a/include/net/netfilter/nf_tables_ipv6.h +++ b/include/net/netfilter/nf_tables_ipv6.h @@ -13,7 +13,7 @@ static inline void nft_set_pktinfo_ipv6(struct nft_pktinfo *pkt) unsigned short frag_off; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); - if (protohdr < 0) { + if (protohdr < 0 || thoff > U16_MAX) { 
nft_set_pktinfo_unspec(pkt); return; } @@ -47,7 +47,7 @@ static inline int __nft_set_pktinfo_ipv6_validate(struct nft_pktinfo *pkt) return -1; protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); - if (protohdr < 0) + if (protohdr < 0 || thoff > U16_MAX) return -1; pkt->flags = NFT_PKTINFO_L4PROTO; @@ -93,7 +93,7 @@ static inline int nft_set_pktinfo_ipv6_ingress(struct nft_pktinfo *pkt) } protohdr = ipv6_find_hdr(pkt->skb, &thoff, -1, &frag_off, &flags); - if (protohdr < 0) + if (protohdr < 0 || thoff > U16_MAX) goto inhdr_error; pkt->flags = NFT_PKTINFO_L4PROTO; -- cgit v1.2.3 From d037abc2414b4539401e0e6aa278bedc4628ad69 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Fri, 21 Oct 2022 16:17:53 +0200 Subject: netfilter: nft_objref: make it builtin nft_objref is needed to reference named objects, it makes no sense to disable it. Before: text data bss dec filename 4014 424 0 4438 nft_objref.o 4174 1128 0 5302 nft_objref.ko 359351 15276 864 375491 nf_tables.ko After: text data bss dec filename 3815 408 0 4223 nft_objref.o 363161 15692 864 379717 nf_tables.ko Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables_core.h | 1 + net/netfilter/Kconfig | 6 ------ net/netfilter/Makefile | 4 ++-- net/netfilter/nf_tables_core.c | 1 + net/netfilter/nft_objref.c | 22 +--------------------- 5 files changed, 5 insertions(+), 29 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 990c3767a350..83d763631f81 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -18,6 +18,7 @@ extern struct nft_expr_type nft_meta_type; extern struct nft_expr_type nft_rt_type; extern struct nft_expr_type nft_exthdr_type; extern struct nft_expr_type nft_last_type; +extern struct nft_expr_type nft_objref_type; #ifdef CONFIG_NETWORK_SECMARK extern struct nft_object_type nft_secmark_obj_type; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 4b8d04640ff3..0846bd75b1da 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -568,12 +568,6 @@ config NFT_TUNNEL This option adds the "tunnel" expression that you can use to set tunneling policies. -config NFT_OBJREF - tristate "Netfilter nf_tables stateful object reference module" - help - This option adds the "objref" expression that allows you to refer to - stateful objects, such as counters and quotas. 
- config NFT_QUEUE depends on NETFILTER_NETLINK_QUEUE tristate "Netfilter nf_tables queue module" diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 0f060d100880..7a6b518ba2b4 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -86,7 +86,8 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \ - nft_counter.o nft_chain_route.o nf_tables_offload.o \ + nft_counter.o nft_objref.o \ + nft_chain_route.o nf_tables_offload.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ nft_set_pipapo.o @@ -104,7 +105,6 @@ obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o -obj-$(CONFIG_NFT_OBJREF) += nft_objref.o obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index cee3e4e905ec..6dcead50208c 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -340,6 +340,7 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_exthdr_type, &nft_last_type, &nft_counter_type, + &nft_objref_type, }; static struct nft_object_type *nft_basic_objects[] = { diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 5d8d91b3904d..74e0eea4abac 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -82,7 +82,6 @@ static void nft_objref_activate(const struct nft_ctx *ctx, obj->use++; } -static struct nft_expr_type nft_objref_type; static const struct nft_expr_ops nft_objref_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)), @@ -195,7 +194,6 @@ static void nft_objref_map_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } -static struct nft_expr_type nft_objref_type; static const struct nft_expr_ops nft_objref_map_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), @@ -233,28 +231,10 @@ static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = { [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 }, }; -static struct nft_expr_type nft_objref_type __read_mostly = { +struct nft_expr_type nft_objref_type __read_mostly = { .name = "objref", .select_ops = nft_objref_select_ops, .policy = nft_objref_policy, .maxattr = NFTA_OBJREF_MAX, .owner = THIS_MODULE, }; - -static int __init nft_objref_module_init(void) -{ - return nft_register_expr(&nft_objref_type); -} - -static void __exit nft_objref_module_exit(void) -{ - nft_unregister_expr(&nft_objref_type); -} - -module_init(nft_objref_module_init); -module_exit(nft_objref_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Pablo Neira Ayuso "); -MODULE_ALIAS_NFT_EXPR("objref"); -MODULE_DESCRIPTION("nftables stateful object reference module"); -- cgit v1.2.3 From 3a07327d10a09379315c844c63f27941f5081e0a Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 25 Oct 2022 13:48:15 +0200 Subject: netfilter: nft_inner: support for inner tunnel header matching This new expression allows you to match on the inner headers that are encapsulated by any of the existing tunneling protocols. 
This expression parses the inner packet to set the link, network and transport offsets, so the existing expressions (with a few updates) can be reused to match on the inner headers. The inner expression supports different tunnel combinations such as: - ethernet frame over IPv4/IPv6 packet, eg. VxLAN. - IPv4/IPv6 packet over IPv4/IPv6 packet, eg. IPIP. - IPv4/IPv6 packet over IPv4/IPv6 + transport header, eg. GRE. - transport header (ESP or SCTP) over transport header (usually UDP). The following fields are used to describe the tunnel protocol: - flags, which describe how to parse the inner headers: NFT_PAYLOAD_CTX_INNER_TUN, the tunnel provides its own header. NFT_PAYLOAD_CTX_INNER_ETHER, the ethernet frame is available as inner header. NFT_PAYLOAD_CTX_INNER_NH, the network header is available as inner header. NFT_PAYLOAD_CTX_INNER_TH, the transport header is available as inner header. For example, VxLAN sets all of these flags, while GRE only sets NFT_PAYLOAD_CTX_INNER_NH and NFT_PAYLOAD_CTX_INNER_TH, and ESP over UDP only sets NFT_PAYLOAD_CTX_INNER_TH. The tunnel description is composed of the following attributes: - header size: in case the tunnel comes with its own header, eg. VxLAN. - type: this provides a hint to userspace on how to delinearize the rule. This is useful for VxLAN and Geneve since they run over UDP and the transport header alone does not provide a hint. This is also useful in case hardware offload is ever supported. The type is not currently interpreted by the kernel. - expression: currently only payload is supported. A follow-up patch also adds inner meta support, which is required by autogenerated dependencies. The exthdr expression should be supported too at some point. There is a new inner_ops operation that needs to be set to allow an existing expression to be used from the inner expression. This patch adds a new NFT_PAYLOAD_TUN_HEADER base which allows matching on the tunnel header fields, eg. the vxlan vni. The payload expression is embedded into the nft_inner private area and this private data area is passed to the payload inner eval function via direct call.
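For illustration, the following is a minimal, standalone userspace sketch of the offset bookkeeping described above, assuming a VxLAN-style encapsulation (8-byte tunnel header over UDP) and no inner VLAN tag. All names, the simplified frame layout and the hardcoded header sizes are illustrative only; the kernel implementation instead walks the real headers via skb_header_pointer().

#include <stdio.h>
#include <stdint.h>

/* Illustrative flag values modeled on the flags described above. */
#define TUN_HDRSIZE (1 << 0)	/* tunnel provides its own header, e.g. VxLAN */
#define TUN_LL      (1 << 1)	/* inner ethernet header present */
#define TUN_NH      (1 << 2)	/* inner network header present */
#define TUN_TH      (1 << 3)	/* inner transport header present */

struct tun_offsets {
	uint16_t tunoff, lloff, nhoff, thoff;
};

/* Accumulate inner offsets: each flagged header advances the cursor,
 * which is the bookkeeping the inner expression performs before a
 * reused expression reads at base + offset.
 */
static void parse_inner(uint8_t flags, uint16_t tun_hdrsize,
			uint16_t encap_payload_off, uint16_t inner_ihl,
			struct tun_offsets *off)
{
	uint16_t cur = encap_payload_off;

	if (flags & TUN_HDRSIZE) {
		off->tunoff = cur;	/* NFT_PAYLOAD_TUN_HEADER base */
		cur += tun_hdrsize;
	}
	if (flags & TUN_LL) {
		off->lloff = cur;	/* inner link layer header */
		cur += 14;		/* ETH_HLEN; no VLAN in this sketch */
	}
	if (flags & TUN_NH) {
		off->nhoff = cur;	/* inner network header */
		cur += inner_ihl;	/* inner IPv4 header length in bytes */
	}
	if (flags & TUN_TH)
		off->thoff = cur;	/* inner transport header */
}

int main(void)
{
	struct tun_offsets off = { 0, 0, 0, 0 };

	/* VxLAN: outer UDP payload at offset 42, 8-byte tunnel header,
	 * 20-byte inner IPv4 header.
	 */
	parse_inner(TUN_HDRSIZE | TUN_LL | TUN_NH | TUN_TH, 8, 42, 20, &off);
	printf("tun=%u ll=%u nh=%u th=%u\n",
	       (unsigned int)off.tunoff, (unsigned int)off.lloff,
	       (unsigned int)off.nhoff, (unsigned int)off.thoff);
	/* prints: tun=42 ll=50 nh=64 th=84 */
	return 0;
}

A GRE-style tunnel would set only TUN_NH and TUN_TH in this sketch, and an ESP-over-UDP one only TUN_TH, matching the flag combinations listed above.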
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 5 + include/net/netfilter/nf_tables_core.h | 24 +++ include/uapi/linux/netfilter/nf_tables.h | 26 +++ net/netfilter/Makefile | 2 +- net/netfilter/nf_tables_api.c | 37 ++++ net/netfilter/nf_tables_core.c | 1 + net/netfilter/nft_inner.c | 336 +++++++++++++++++++++++++++++++ net/netfilter/nft_payload.c | 89 +++++++- 8 files changed, 518 insertions(+), 2 deletions(-) create mode 100644 net/netfilter/nft_inner.c (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index f6db510689a8..2dbfe7524a7e 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -375,6 +375,10 @@ static inline void *nft_expr_priv(const struct nft_expr *expr) return (void *)expr->data; } +struct nft_expr_info; + +int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, + struct nft_expr_info *info); int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, @@ -864,6 +868,7 @@ struct nft_expr_type { const struct nlattr * const tb[]); void (*release_ops)(const struct nft_expr_ops *ops); const struct nft_expr_ops *ops; + const struct nft_expr_ops *inner_ops; struct list_head list; const char *name; struct module *owner; diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index 83d763631f81..be2b2b5d0a52 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -19,6 +19,7 @@ extern struct nft_expr_type nft_rt_type; extern struct nft_expr_type nft_exthdr_type; extern struct nft_expr_type nft_last_type; extern struct nft_expr_type nft_objref_type; +extern struct nft_expr_type nft_inner_type; #ifdef CONFIG_NETWORK_SECMARK extern struct nft_object_type nft_secmark_obj_type; @@ -139,4 +140,27 @@ void nft_rt_get_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); + +enum { + NFT_PAYLOAD_CTX_INNER_TUN = (1 << 0), + NFT_PAYLOAD_CTX_INNER_LL = (1 << 1), + NFT_PAYLOAD_CTX_INNER_NH = (1 << 2), + NFT_PAYLOAD_CTX_INNER_TH = (1 << 3), +}; + +struct nft_inner_tun_ctx { + u16 inner_tunoff; + u16 inner_lloff; + u16 inner_nhoff; + u16 inner_thoff; + __be16 llproto; + u8 l4proto; + u8 flags; +}; + +int nft_payload_inner_offset(const struct nft_pktinfo *pkt); +void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx); + #endif /* _NET_NF_TABLES_CORE_H */ diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 466fd3f4447c..05a15dce8271 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -760,6 +760,7 @@ enum nft_payload_bases { NFT_PAYLOAD_NETWORK_HEADER, NFT_PAYLOAD_TRANSPORT_HEADER, NFT_PAYLOAD_INNER_HEADER, + NFT_PAYLOAD_TUN_HEADER, }; /** @@ -779,6 +780,31 @@ enum nft_payload_csum_flags { NFT_PAYLOAD_L4CSUM_PSEUDOHDR = (1 << 0), }; +enum nft_inner_type { + NFT_INNER_UNSPEC = 0, + NFT_INNER_VXLAN, +}; + +enum nft_inner_flags { + NFT_INNER_HDRSIZE = (1 << 0), + NFT_INNER_LL = (1 << 1), + NFT_INNER_NH = (1 << 2), + NFT_INNER_TH = (1 << 3), +}; +#define NFT_INNER_MASK (NFT_INNER_HDRSIZE | NFT_INNER_LL | \ + NFT_INNER_NH | 
NFT_INNER_TH) + +enum nft_inner_attributes { + NFTA_INNER_UNSPEC, + NFTA_INNER_NUM, + NFTA_INNER_TYPE, + NFTA_INNER_FLAGS, + NFTA_INNER_HDRSIZE, + NFTA_INNER_EXPR, + __NFTA_INNER_MAX +}; +#define NFTA_INNER_MAX (__NFTA_INNER_MAX - 1) + /** * enum nft_payload_attributes - nf_tables payload expression netlink attributes * diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 7a6b518ba2b4..1d4db1943936 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -86,7 +86,7 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \ - nft_counter.o nft_objref.o \ + nft_counter.o nft_objref.o nft_inner.o \ nft_chain_route.o nf_tables_offload.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ nft_set_pipapo.o diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 58d9cbc9ccdc..6b79f5e18f08 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2857,6 +2857,43 @@ err1: return err; } +int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, + struct nft_expr_info *info) +{ + struct nlattr *tb[NFTA_EXPR_MAX + 1]; + const struct nft_expr_type *type; + int err; + + err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla, + nft_expr_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_EXPR_DATA]) + return -EINVAL; + + type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); + if (IS_ERR(type)) + return PTR_ERR(type); + + if (!type->inner_ops) + return -EOPNOTSUPP; + + err = nla_parse_nested_deprecated(info->tb, type->maxattr, + tb[NFTA_EXPR_DATA], + type->policy, NULL); + if (err < 0) + goto err_nla_parse; + + info->attr = nla; + info->ops = type->inner_ops; + + return 0; + +err_nla_parse: + return err; +} + static int nf_tables_newexpr(const struct nft_ctx *ctx, const struct nft_expr_info *expr_info, struct nft_expr *expr) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index 6dcead50208c..709a736c301c 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -341,6 +341,7 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_last_type, &nft_counter_type, &nft_objref_type, + &nft_inner_type, }; static struct nft_object_type *nft_basic_objects[] = { diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c new file mode 100644 index 000000000000..1e4079b5b431 --- /dev/null +++ b/net/netfilter/nft_inner.c @@ -0,0 +1,336 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Pablo Neira Ayuso + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Same layout as nft_expr but it embeds the private expression data area. 
*/ +struct __nft_expr { + const struct nft_expr_ops *ops; + union { + struct nft_payload payload; + } __attribute__((aligned(__alignof__(u64)))); +}; + +enum { + NFT_INNER_EXPR_PAYLOAD, +}; + +struct nft_inner { + u8 flags; + u8 hdrsize; + u8 type; + u8 expr_type; + + struct __nft_expr expr; +}; + +static int nft_inner_parse_l2l3(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx, u32 off) +{ + __be16 llproto, outer_llproto; + u32 nhoff, thoff; + + if (priv->flags & NFT_INNER_LL) { + struct vlan_ethhdr *veth, _veth; + struct ethhdr *eth, _eth; + u32 hdrsize; + + eth = skb_header_pointer(pkt->skb, off, sizeof(_eth), &_eth); + if (!eth) + return -1; + + switch (eth->h_proto) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + llproto = eth->h_proto; + hdrsize = sizeof(_eth); + break; + case htons(ETH_P_8021Q): + veth = skb_header_pointer(pkt->skb, off, sizeof(_veth), &_veth); + if (!veth) + return -1; + + outer_llproto = veth->h_vlan_encapsulated_proto; + llproto = veth->h_vlan_proto; + hdrsize = sizeof(_veth); + break; + default: + return -1; + } + + ctx->inner_lloff = off; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_LL; + off += hdrsize; + } else { + struct iphdr *iph; + u32 _version; + + iph = skb_header_pointer(pkt->skb, off, sizeof(_version), &_version); + if (!iph) + return -1; + + switch (iph->version) { + case 4: + llproto = htons(ETH_P_IP); + break; + case 6: + llproto = htons(ETH_P_IPV6); + break; + default: + return -1; + } + } + + ctx->llproto = llproto; + if (llproto == htons(ETH_P_8021Q)) + llproto = outer_llproto; + + nhoff = off; + + switch (llproto) { + case htons(ETH_P_IP): { + struct iphdr *iph, _iph; + + iph = skb_header_pointer(pkt->skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return -1; + + if (iph->ihl < 5 || iph->version != 4) + return -1; + + ctx->inner_nhoff = nhoff; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; + + thoff = nhoff + (iph->ihl * 4); + if ((ntohs(iph->frag_off) & IP_OFFSET) == 0) { + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; + ctx->inner_thoff = thoff; + ctx->l4proto = iph->protocol; + } + } + break; + case htons(ETH_P_IPV6): { + struct ipv6hdr *ip6h, _ip6h; + int fh_flags = IP6_FH_F_AUTH; + unsigned short fragoff; + int l4proto; + + ip6h = skb_header_pointer(pkt->skb, nhoff, sizeof(_ip6h), &_ip6h); + if (!ip6h) + return -1; + + if (ip6h->version != 6) + return -1; + + ctx->inner_nhoff = nhoff; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; + + thoff = nhoff; + l4proto = ipv6_find_hdr(pkt->skb, &thoff, -1, &fragoff, &fh_flags); + if (l4proto < 0 || thoff > U16_MAX) + return -1; + + if (fragoff == 0) { + thoff = nhoff + sizeof(_ip6h); + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; + ctx->inner_thoff = thoff; + ctx->l4proto = l4proto; + } + } + break; + default: + return -1; + } + + return 0; +} + +static int nft_inner_parse_tunhdr(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx, u32 *off) +{ + if (pkt->tprot != IPPROTO_UDP && + pkt->tprot != IPPROTO_GRE) + return -1; + + ctx->inner_tunoff = *off; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; + *off += priv->hdrsize; + + return 0; +} + +static int nft_inner_parse(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + struct nft_inner_tun_ctx ctx = {}; + u32 off = pkt->inneroff; + + if (priv->flags & NFT_INNER_HDRSIZE && + nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0) + return -1; + + if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) { + if (nft_inner_parse_l2l3(priv, pkt, &ctx, off)
< 0) + return -1; + } else if (priv->flags & NFT_INNER_TH) { + ctx.inner_thoff = off; + ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH; + } + + *tun_ctx = ctx; + + return 0; +} + +static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_inner *priv = nft_expr_priv(expr); + struct nft_inner_tun_ctx tun_ctx = {}; + + if (nft_payload_inner_offset(pkt) < 0) + goto err; + + if (nft_inner_parse(priv, pkt, &tun_ctx) < 0) + goto err; + + switch (priv->expr_type) { + case NFT_INNER_EXPR_PAYLOAD: + nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); + break; + default: + WARN_ON_ONCE(1); + goto err; + } + return; +err: + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_inner_policy[NFTA_INNER_MAX + 1] = { + [NFTA_INNER_NUM] = { .type = NLA_U32 }, + [NFTA_INNER_FLAGS] = { .type = NLA_U32 }, + [NFTA_INNER_HDRSIZE] = { .type = NLA_U32 }, + [NFTA_INNER_TYPE] = { .type = NLA_U32 }, + [NFTA_INNER_EXPR] = { .type = NLA_NESTED }, +}; + +struct nft_expr_info { + const struct nft_expr_ops *ops; + const struct nlattr *attr; + struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; +}; + +static int nft_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_inner *priv = nft_expr_priv(expr); + u32 flags, hdrsize, type, num; + struct nft_expr_info expr_info; + int err; + + if (!tb[NFTA_INNER_FLAGS] || + !tb[NFTA_INNER_HDRSIZE] || + !tb[NFTA_INNER_TYPE] || + !tb[NFTA_INNER_EXPR]) + return -EINVAL; + + flags = ntohl(nla_get_be32(tb[NFTA_INNER_FLAGS])); + if (flags & ~NFT_INNER_MASK) + return -EOPNOTSUPP; + + num = ntohl(nla_get_be32(tb[NFTA_INNER_NUM])); + if (num != 0) + return -EOPNOTSUPP; + + hdrsize = ntohl(nla_get_be32(tb[NFTA_INNER_HDRSIZE])); + type = ntohl(nla_get_be32(tb[NFTA_INNER_TYPE])); + + if (type > U8_MAX) + return -EINVAL; + + if (flags & NFT_INNER_HDRSIZE) { + if (hdrsize == 0 || hdrsize > 64) + return -EOPNOTSUPP; + } + + priv->flags = flags; + priv->hdrsize = hdrsize; + priv->type = type; + + err = nft_expr_inner_parse(ctx, tb[NFTA_INNER_EXPR], &expr_info); + if (err < 0) + return err; + + priv->expr.ops = expr_info.ops; + + if (!strcmp(expr_info.ops->type->name, "payload")) + priv->expr_type = NFT_INNER_EXPR_PAYLOAD; + else + return -EINVAL; + + err = expr_info.ops->init(ctx, (struct nft_expr *)&priv->expr, + (const struct nlattr * const*)expr_info.tb); + if (err < 0) + return err; + + return 0; +} + +static int nft_inner_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_inner *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_INNER_NUM, htonl(0)) || + nla_put_be32(skb, NFTA_INNER_TYPE, htonl(priv->type)) || + nla_put_be32(skb, NFTA_INNER_FLAGS, htonl(priv->flags)) || + nla_put_be32(skb, NFTA_INNER_HDRSIZE, htonl(priv->hdrsize))) + goto nla_put_failure; + + if (nft_expr_dump(skb, NFTA_INNER_EXPR, + (struct nft_expr *)&priv->expr) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static const struct nft_expr_ops nft_inner_ops = { + .type = &nft_inner_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_inner)), + .eval = nft_inner_eval, + .init = nft_inner_init, + .dump = nft_inner_dump, +}; + +struct nft_expr_type nft_inner_type __read_mostly = { + .name = "inner", + .ops = &nft_inner_ops, + .policy = nft_inner_policy, + .maxattr = NFTA_INNER_MAX, + .owner = THIS_MODULE, +}; diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 
84b490d6cc75..9d2ac764a14c 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -144,7 +144,7 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt) return 0; } -static int nft_payload_inner_offset(const struct nft_pktinfo *pkt) +int nft_payload_inner_offset(const struct nft_pktinfo *pkt) { if (!(pkt->flags & NFT_PKTINFO_INNER) && __nft_payload_inner_offset((struct nft_pktinfo *)pkt) < 0) @@ -587,6 +587,92 @@ const struct nft_expr_ops nft_payload_fast_ops = { .offload = nft_payload_offload, }; +void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + const struct sk_buff *skb = pkt->skb; + u32 *dest = &regs->data[priv->dreg]; + int offset; + + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; + + switch (priv->base) { + case NFT_PAYLOAD_TUN_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TUN)) + goto err; + + offset = tun_ctx->inner_tunoff; + break; + case NFT_PAYLOAD_LL_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_LL)) + goto err; + + offset = tun_ctx->inner_lloff; + break; + case NFT_PAYLOAD_NETWORK_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_NH)) + goto err; + + offset = tun_ctx->inner_nhoff; + break; + case NFT_PAYLOAD_TRANSPORT_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH)) + goto err; + + offset = tun_ctx->inner_thoff; + break; + default: + WARN_ON_ONCE(1); + goto err; + } + offset += priv->offset; + + if (skb_copy_bits(skb, offset, dest, priv->len) < 0) + goto err; + + return; +err: + regs->verdict.code = NFT_BREAK; +} + +static int nft_payload_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_payload *priv = nft_expr_priv(expr); + u32 base; + + base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); + switch (base) { + case NFT_PAYLOAD_TUN_HEADER: + case NFT_PAYLOAD_LL_HEADER: + case NFT_PAYLOAD_NETWORK_HEADER: + case NFT_PAYLOAD_TRANSPORT_HEADER: + break; + default: + return -EOPNOTSUPP; + } + + priv->base = base; + priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); + priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + + return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); +} + +static const struct nft_expr_ops nft_payload_inner_ops = { + .type = &nft_payload_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), + .init = nft_payload_inner_init, + .dump = nft_payload_dump, + /* direct call to nft_payload_inner_eval(). */ +}; + static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) { *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum)); @@ -930,6 +1016,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, struct nft_expr_type nft_payload_type __read_mostly = { .name = "payload", .select_ops = nft_payload_select_ops, + .inner_ops = &nft_payload_inner_ops, .policy = nft_payload_policy, .maxattr = NFTA_PAYLOAD_MAX, .owner = THIS_MODULE, -- cgit v1.2.3 From 0e795b37ba044893107f887b037594645a6fc584 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Oct 2022 13:03:32 +0200 Subject: netfilter: nft_inner: add percpu inner context Add NFT_PKTINFO_INNER_FULL flag to annotate that inner offsets are available. Store nft_inner_tun_ctx object in percpu area to cache existing inner offsets for this skbuff.
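The caching contract here is simple: parse the inner headers once, mark the packet, and let every later inner expression on the same CPU reuse the cached offsets. Below is a hedged, single-threaded userspace sketch of that pattern, including the tunnel-type check this series applies before trusting the cache; all names are invented, and the kernel uses a DEFINE_PER_CPU area where this sketch uses a single static cache.

#include <stdio.h>
#include <stdint.h>

#define PKT_INNER_FULL (1 << 0)	/* inner offsets already cached */

struct pkt {
	uint8_t flags;
};

struct inner_cache {
	uint16_t type;	/* tunnel type the cache was filled for */
	uint16_t thoff;	/* cached inner transport offset */
};

/* One instance per CPU in the kernel; a single static one suffices
 * in this single-threaded sketch.
 */
static struct inner_cache cpu_cache;
static int parse_count;

static void expensive_parse(uint16_t type, struct inner_cache *c)
{
	parse_count++;		/* count how often we really parse */
	c->type = type;
	c->thoff = 84;		/* pretend we walked the headers */
}

static uint16_t inner_thoff(struct pkt *p, uint16_t type)
{
	/* Re-parse only if nothing is cached for this packet yet, or
	 * the cache was filled for a different tunnel type.
	 */
	if (!(p->flags & PKT_INNER_FULL) || cpu_cache.type != type) {
		expensive_parse(type, &cpu_cache);
		p->flags |= PKT_INNER_FULL;
	}
	return cpu_cache.thoff;
}

int main(void)
{
	struct pkt p = { 0 };

	inner_thoff(&p, 1);	/* parses */
	inner_thoff(&p, 1);	/* cache hit */
	inner_thoff(&p, 2);	/* different tunnel type: parses again */
	printf("parsed %d times\n", parse_count);	/* prints: parsed 2 times */
	return 0;
}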
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 1 + include/net/netfilter/nf_tables_core.h | 1 + net/netfilter/nft_inner.c | 26 ++++++++++++++++++++++---- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 2dbfe7524a7e..38e2b396e38a 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -24,6 +24,7 @@ struct module; enum { NFT_PKTINFO_L4PROTO = (1 << 0), NFT_PKTINFO_INNER = (1 << 1), + NFT_PKTINFO_INNER_FULL = (1 << 2), }; struct nft_pktinfo { diff --git a/include/net/netfilter/nf_tables_core.h b/include/net/netfilter/nf_tables_core.h index be2b2b5d0a52..3e825381ac5c 100644 --- a/include/net/netfilter/nf_tables_core.h +++ b/include/net/netfilter/nf_tables_core.h @@ -149,6 +149,7 @@ enum { }; struct nft_inner_tun_ctx { + u16 type; u16 inner_tunoff; u16 inner_lloff; u16 inner_nhoff; diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 1e4079b5b431..29f2eefe0357 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -21,6 +21,8 @@ #include #include +static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx); + /* Same layout as nft_expr but it embeds the private expression data area. */ struct __nft_expr { const struct nft_expr_ops *ops; @@ -180,7 +182,7 @@ static int nft_inner_parse_tunhdr(const struct nft_inner *priv, } static int nft_inner_parse(const struct nft_inner *priv, - const struct nft_pktinfo *pkt, + struct nft_pktinfo *pkt, struct nft_inner_tun_ctx *tun_ctx) { struct nft_inner_tun_ctx ctx = {}; @@ -199,25 +201,41 @@ static int nft_inner_parse(const struct nft_inner *priv, } *tun_ctx = ctx; + tun_ctx->type = priv->type; + pkt->flags |= NFT_PKTINFO_INNER_FULL; return 0; } +static bool nft_inner_parse_needed(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + const struct nft_inner_tun_ctx *tun_ctx) +{ + if (!(pkt->flags & NFT_PKTINFO_INNER_FULL)) + return true; + + if (priv->type != tun_ctx->type) + return true; + + return false; +} + static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) { + struct nft_inner_tun_ctx *tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); const struct nft_inner *priv = nft_expr_priv(expr); - struct nft_inner_tun_ctx tun_ctx = {}; if (nft_payload_inner_offset(pkt) < 0) goto err; - if (nft_inner_parse(priv, pkt, &tun_ctx) < 0) + if (nft_inner_parse_needed(priv, pkt, tun_ctx) && + nft_inner_parse(priv, (struct nft_pktinfo *)pkt, tun_ctx) < 0) goto err; switch (priv->expr_type) { case NFT_INNER_EXPR_PAYLOAD: - nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, &tun_ctx); + nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx); break; default: WARN_ON_ONCE(1); -- cgit v1.2.3 From a150d122b6bdb84df532057aa3b2faf8c6485792 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 17 Oct 2022 13:03:33 +0200 Subject: netfilter: nft_meta: add inner match support Add support for inner meta matching on: - NFT_META_PROTOCOL: to match on the ethertype; this can be used even when the tunnel protocol provides no link layer header, in which case nft_inner derives the ethertype from the IP header version field. - NFT_META_L4PROTO: to match on the layer 4 protocol. These meta expressions are usually autogenerated as dependencies by userspace nftables.
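The NFT_META_PROTOCOL case for header-less tunnels is worth making concrete: with no inner ethernet header to read, the ethertype is derived from the version nibble of the inner IP header. Here is a small standalone sketch of that inference, with an invented function name; the kernel does the equivalent inside nft_inner_parse_l2l3() via skb_header_pointer().

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>
#include <arpa/inet.h>	/* htons(), ntohs() */

#define ETH_P_IP	0x0800
#define ETH_P_IPV6	0x86DD

/* Derive the ethertype of a header-less inner packet (e.g. IPIP)
 * from the IP version nibble. Returns the ethertype in network byte
 * order, or 0 when the version is unrecognized.
 */
static uint16_t infer_llproto(const uint8_t *inner, size_t len)
{
	if (len < 1)
		return 0;

	switch (inner[0] >> 4) {	/* IP version is the top nibble */
	case 4:
		return htons(ETH_P_IP);
	case 6:
		return htons(ETH_P_IPV6);
	default:
		return 0;	/* unknown: no ethertype to report */
	}
}

int main(void)
{
	const uint8_t v4_pkt[] = { 0x45 };	/* IPv4, IHL = 5 */
	const uint8_t v6_pkt[] = { 0x60 };	/* IPv6 */

	printf("v4 -> 0x%04x\n", ntohs(infer_llproto(v4_pkt, sizeof(v4_pkt))));
	printf("v6 -> 0x%04x\n", ntohs(infer_llproto(v6_pkt, sizeof(v6_pkt))));
	/* prints 0x0800 and 0x86dd */
	return 0;
}

An inner meta protocol match can then compare against this value exactly as it would for a frame that carried a real ethernet header.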
Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nft_meta.h | 6 ++++ net/netfilter/nft_inner.c | 8 ++++++ net/netfilter/nft_meta.c | 62 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 76 insertions(+) (limited to 'include/net') diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index 9b51cc67de54..f3a5285a511c 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -46,4 +46,10 @@ int nft_meta_set_validate(const struct nft_ctx *ctx, bool nft_meta_get_reduce(struct nft_regs_track *track, const struct nft_expr *expr); + +struct nft_inner_tun_ctx; +void nft_meta_inner_eval(const struct nft_expr *expr, + struct nft_regs *regs, const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx); + #endif diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 29f2eefe0357..c43a2fe0ceb7 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -28,11 +29,13 @@ struct __nft_expr { const struct nft_expr_ops *ops; union { struct nft_payload payload; + struct nft_meta meta; } __attribute__((aligned(__alignof__(u64)))); }; enum { NFT_INNER_EXPR_PAYLOAD, + NFT_INNER_EXPR_META, }; struct nft_inner { @@ -237,6 +240,9 @@ static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, case NFT_INNER_EXPR_PAYLOAD: nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx); break; + case NFT_INNER_EXPR_META: + nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx); + break; default: WARN_ON_ONCE(1); goto err; @@ -306,6 +312,8 @@ static int nft_inner_init(const struct nft_ctx *ctx, if (!strcmp(expr_info.ops->type->name, "payload")) priv->expr_type = NFT_INNER_EXPR_PAYLOAD; + else if (!strcmp(expr_info.ops->type->name, "meta")) + priv->expr_type = NFT_INNER_EXPR_META; + else return -EINVAL; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 55d2d49c3425..8c39adeebb5c 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -831,9 +831,71 @@ nft_meta_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-EINVAL); } +static int nft_meta_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + unsigned int len; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_PROTOCOL: + len = sizeof(u16); + break; + case NFT_META_L4PROTO: + len = sizeof(u32); + break; + default: + return -EOPNOTSUPP; + } + priv->len = len; + + return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); +} + +void nft_meta_inner_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + u32 *dest = &regs->data[priv->dreg]; + + switch (priv->key) { + case NFT_META_PROTOCOL: + nft_reg_store16(dest, (__force u16)tun_ctx->llproto); + break; + case NFT_META_L4PROTO: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH)) + goto err; + + nft_reg_store8(dest, tun_ctx->l4proto); + break; + default: + WARN_ON_ONCE(1); + goto err; + } + return; + +err: + regs->verdict.code = NFT_BREAK; +} +EXPORT_SYMBOL_GPL(nft_meta_inner_eval); + +static const struct nft_expr_ops nft_meta_inner_ops = { + .type = &nft_meta_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .init =
nft_meta_inner_init, + .dump = nft_meta_get_dump, + /* direct call to nft_meta_inner_eval(). */ +}; + struct nft_expr_type nft_meta_type __read_mostly = { .name = "meta", .select_ops = nft_meta_select_ops, + .inner_ops = &nft_meta_inner_ops, .policy = nft_meta_policy, .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, -- cgit v1.2.3 From 28581b9c2c94cc912354eadc98c1146fdc7092e6 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Tue, 25 Oct 2022 13:53:00 +0300 Subject: bond: Disable TLS features indication Bond agnostically interacts with TLS device-offload requests via the .ndo_sk_get_lower_dev operation. Return value is true iff bond guarantees fixed mapping between the TLS connection and a lower netdev. Due to this nature, the bond TLS device offload features are not explicitly controllable in the bond layer. As of today, these are read-only values based on the evaluation of bond_sk_check(). However, this indication might be incorrect and misleading, when the feature bits are "fixed" by some dependency features. For example, NETIF_F_HW_TLS_TX/RX are forcefully cleared in case the corresponding checksum offload is disabled. But in fact the bond ability to still offload TLS connections to the lower device is not hurt. This means that these bits can not be trusted, and hence better become unused. This patch revives some old discussion [1] and proposes a much simpler solution: Clear the bond's TLS features bits. Everyone should stop reading them. [1] https://lore.kernel.org/netdev/20210526095747.22446-1-tariqt@nvidia.com/ Signed-off-by: Tariq Toukan Reviewed-by: Gal Pressman Acked-by: Jakub Kicinski Link: https://lore.kernel.org/r/20221025105300.4718-1-tariqt@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/bonding/bond_main.c | 13 +------------ drivers/net/bonding/bond_options.c | 18 ------------------ include/net/bonding.h | 4 ---- 3 files changed, 1 insertion(+), 34 deletions(-) (limited to 'include/net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index e84c49bf4d0c..1cd4e71916f8 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -307,7 +307,7 @@ netdev_tx_t bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, return dev_queue_xmit(skb); } -bool bond_sk_check(struct bonding *bond) +static bool bond_sk_check(struct bonding *bond) { switch (BOND_MODE(bond)) { case BOND_MODE_8023AD: @@ -1398,13 +1398,6 @@ static netdev_features_t bond_fix_features(struct net_device *dev, netdev_features_t mask; struct slave *slave; -#if IS_ENABLED(CONFIG_TLS_DEVICE) - if (bond_sk_check(bond)) - features |= BOND_TLS_FEATURES; - else - features &= ~BOND_TLS_FEATURES; -#endif - mask = features; features &= ~NETIF_F_ONE_FOR_ALL; @@ -5806,10 +5799,6 @@ void bond_setup(struct net_device *bond_dev) if (BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP) bond_dev->features |= BOND_XFRM_FEATURES; #endif /* CONFIG_XFRM_OFFLOAD */ -#if IS_ENABLED(CONFIG_TLS_DEVICE) - if (bond_sk_check(bond)) - bond_dev->features |= BOND_TLS_FEATURES; -#endif } /* Destroy a bonding device. 
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 3498db1c1b3c..f71d5517f829 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -842,19 +842,6 @@ static bool bond_set_xfrm_features(struct bonding *bond) return true; } -static bool bond_set_tls_features(struct bonding *bond) -{ - if (!IS_ENABLED(CONFIG_TLS_DEVICE)) - return false; - - if (bond_sk_check(bond)) - bond->dev->wanted_features |= BOND_TLS_FEATURES; - else - bond->dev->wanted_features &= ~BOND_TLS_FEATURES; - - return true; -} - static int bond_option_mode_set(struct bonding *bond, const struct bond_opt_value *newval) { @@ -885,7 +872,6 @@ static int bond_option_mode_set(struct bonding *bond, bool update = false; update |= bond_set_xfrm_features(bond); - update |= bond_set_tls_features(bond); if (update) netdev_update_features(bond->dev); @@ -1418,10 +1404,6 @@ static int bond_option_xmit_hash_policy_set(struct bonding *bond, newval->string, newval->value); bond->params.xmit_policy = newval->value; - if (bond->dev->reg_state == NETREG_REGISTERED) - if (bond_set_tls_features(bond)) - netdev_update_features(bond->dev); - return 0; } diff --git a/include/net/bonding.h b/include/net/bonding.h index e999f851738b..ea36ab7f9e72 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -92,8 +92,6 @@ #define BOND_XFRM_FEATURES (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM | \ NETIF_F_GSO_ESP) -#define BOND_TLS_FEATURES (NETIF_F_HW_TLS_TX | NETIF_F_HW_TLS_RX) - #ifdef CONFIG_NET_POLL_CONTROLLER extern atomic_t netpoll_block_tx; @@ -280,8 +278,6 @@ struct bond_vlan_tag { unsigned short vlan_id; }; -bool bond_sk_check(struct bonding *bond); - /** * Returns NULL if the net_device does not belong to any of the bond's slaves * -- cgit v1.2.3 From bd456f283b66704920fae8e655ebc769cb743420 Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:11 +0000 Subject: tcp: add sysctls for TCP PLB parameters PLB (Protective Load Balancing) is a host-based mechanism for load balancing across switch links. It leverages congestion signals (e.g. ECN) from the transport layer to randomly change the path of the connection experiencing congestion. PLB changes the path of the connection by changing the outgoing IPv6 flow label for IPv6 connections (implemented in Linux by calling sk_rethink_txhash()). Because of this implementation mechanism, PLB can currently only work for IPv6 traffic. For more information, see the SIGCOMM 2022 paper: https://doi.org/10.1145/3544216.3544226 This commit adds new sysctl knobs and sets their default values for TCP PLB. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 75 ++++++++++++++++++++++++++++++++++ include/net/netns/ipv4.h | 5 +++ net/ipv4/sysctl_net_ipv4.c | 43 +++++++++++++++++++ net/ipv4/tcp_ipv4.c | 8 ++++ 4 files changed, 131 insertions(+) (limited to 'include/net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index e7b3fa7bb3f7..815efc89ad73 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1069,6 +1069,81 @@ tcp_child_ehash_entries - INTEGER Default: 0 +tcp_plb_enabled - BOOLEAN + If set and the underlying congestion control (e.g. DCTCP) supports + and enables the PLB feature, TCP PLB (Protective Load Balancing) is + enabled.
PLB is described in the following paper: + https://doi.org/10.1145/3544216.3544226. Based on PLB parameters, + upon sensing sustained congestion, TCP triggers a change in + flow label field for outgoing IPv6 packets. A change in flow label + field potentially changes the path of outgoing packets for switches + that use ECMP/WCMP for routing. + + PLB changes the socket txhash, which results in a change in the IPv6 + Flow Label field, and is currently a no-op for IPv4 headers. It is possible + to apply PLB for IPv4 with other network header fields (e.g. TCP + or IPv4 options) or using encapsulation where the outer header is used + by switches to determine the next hop. In either case, further host + and switch side changes will be needed. + + When set, PLB assumes that a congestion signal (e.g. ECN) is made + available and used by the congestion control module to estimate a + congestion measure (e.g. ce_ratio). PLB needs a congestion measure to + make repathing decisions. + + Default: FALSE + +tcp_plb_idle_rehash_rounds - INTEGER + Number of consecutive congested rounds (RTT) seen after which + a rehash can be performed, given there are no packets in flight. + This is referred to as M in the PLB paper: + https://doi.org/10.1145/3544216.3544226. + + Possible Values: 0 - 31 + + Default: 3 + +tcp_plb_rehash_rounds - INTEGER + Number of consecutive congested rounds (RTT) seen after which + a forced rehash can be performed. Be careful when setting this + parameter, as a small value increases the risk of retransmissions. + This is referred to as N in the PLB paper: + https://doi.org/10.1145/3544216.3544226. + + Possible Values: 0 - 31 + + Default: 12 + +tcp_plb_suspend_rto_sec - INTEGER + Time, in seconds, to suspend PLB in the event of an RTO. In order to avoid + having PLB repath onto a connectivity "black hole", after an RTO a TCP + connection suspends PLB repathing for a random duration between 1x and + 2x of this parameter. Randomness is added to avoid concurrent rehashing + of multiple TCP connections. This should be set corresponding to the + amount of time it takes to repair a failed link. + + Possible Values: 0 - 255 + + Default: 60 + +tcp_plb_cong_thresh - INTEGER + Fraction of packets marked with congestion over a round (RTT) to + tag that round as congested. This is referred to as K in the PLB paper: + https://doi.org/10.1145/3544216.3544226. + + The 0-1 fraction range is mapped to the 0-256 range to avoid floating + point operations. For example, 128 means that if at least 50% of + the packets in a round were marked as congested then the round + will be tagged as congested. + + Setting the threshold to 0 means that PLB repaths every RTT regardless + of congestion. This is not intended behavior for PLB and should be + used only for experimentation purposes.
+ + Possible Values: 0 - 256 + + Default: 128 + UDP variables ============= diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1b8004679445..25f90bba4889 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -183,6 +183,11 @@ struct netns_ipv4 { unsigned long tfo_active_disable_stamp; u32 tcp_challenge_timestamp; u32 tcp_challenge_count; + u8 sysctl_tcp_plb_enabled; + u8 sysctl_tcp_plb_idle_rehash_rounds; + u8 sysctl_tcp_plb_rehash_rounds; + u8 sysctl_tcp_plb_suspend_rto_sec; + int sysctl_tcp_plb_cong_thresh; int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9b8a6db7a66b..0af28cedd071 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -40,6 +40,8 @@ static int one_day_secs = 24 * 3600; static u32 fib_multipath_hash_fields_all_mask __maybe_unused = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; +static int tcp_plb_max_rounds = 31; +static int tcp_plb_max_cong_thresh = 256; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -1384,6 +1386,47 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "tcp_plb_enabled", + .data = &init_net.ipv4.sysctl_tcp_plb_enabled, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "tcp_plb_idle_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_suspend_rto_sec", + .data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + }, + { + .procname = "tcp_plb_cong_thresh", + .data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_plb_max_cong_thresh, + }, { } }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 87d440f47a70..58b838b56c7f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3218,6 +3218,14 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Set default values for PLB */ + net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ + net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; + net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; + net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; + /* Default congestion threshold for PLB to mark a round is 50% */ + net->ipv4.sysctl_tcp_plb_cong_thresh = 128; + /* Reno is always built in */ if (!net_eq(net, &init_net) && bpf_try_module_get(init_net.ipv4.tcp_congestion_control, -- cgit v1.2.3 From 1a91bb7c3ebf95e908ec33220defbcda1ecc072f Mon Sep 17 00:00:00 2001 From: Mubashir Adnan Qureshi Date: Wed, 26 Oct 2022 13:51:12 +0000 Subject: tcp: add PLB functionality for TCP Congestion control algorithms track PLB state and cause the connection to trigger a path change when either of the 2 conditions is satisfied: - No packets are in flight and (# consecutive 
congested rounds >= sysctl_tcp_plb_idle_rehash_rounds) - (# consecutive congested rounds >= sysctl_tcp_plb_rehash_rounds) A round (RTT) is marked as congested when the congestion signal (ECN ce_ratio) over an RTT is greater than sysctl_tcp_plb_cong_thresh. In the event of an RTO, PLB (via tcp_write_timeout()) triggers a path change and disables congestion-triggered path changes for a random time between (sysctl_tcp_plb_suspend_rto_sec, 2*sysctl_tcp_plb_suspend_rto_sec) to avoid hopping onto the "connectivity blackhole". RTO-triggered path changes can still happen during this cool-off period. Signed-off-by: Mubashir Adnan Qureshi Signed-off-by: Yuchung Cheng Signed-off-by: Neal Cardwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/tcp.h | 28 ++++++++++++++ net/ipv4/Makefile | 2 +- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_plb.c | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 137 insertions(+), 2 deletions(-) create mode 100644 net/ipv4/tcp_plb.c (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 14d45661a84d..6b814e788f00 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2140,6 +2140,34 @@ extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, extern void tcp_rack_reo_timeout(struct sock *sk); extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); +/* tcp_plb.c */ + +/* + * Scaling factor for fractions in PLB. For example, tcp_plb_update_state + * expects cong_ratio which represents fraction of traffic that experienced + * congestion over a single RTT. In order to avoid floating point operations, + * this fraction should be mapped to (1 << TCP_PLB_SCALE) and passed in. + */ +#define TCP_PLB_SCALE 8 + +/* State for PLB (Protective Load Balancing) for a single TCP connection. */ +struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +}; + +static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +{ + plb->consec_cong_rounds = 0; + plb->pause_until = 0; +} +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio); +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb); +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb); + /* At how many usecs into the future should the RTO fire?
*/ static inline s64 tcp_rto_delta_us(const struct sock *sk) { diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index bbdd9c44f14e..af7d2cf490fb 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,7 +10,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ tcp_rate.o tcp_recovery.o tcp_ulp.o \ - tcp_offload.o datagram.o raw.o udp.o udplite.o \ + tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 58b838b56c7f..ebab9e8b184c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -3224,7 +3224,7 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; /* Default congestion threshold for PLB to mark a round is 50% */ - net->ipv4.sysctl_tcp_plb_cong_thresh = 128; + net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; /* Reno is always built in */ if (!net_eq(net, &init_net) && diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c new file mode 100644 index 000000000000..f4ced370acad --- /dev/null +++ b/net/ipv4/tcp_plb.c @@ -0,0 +1,107 @@ +/* Protective Load Balancing (PLB) + * + * PLB was designed to reduce link load imbalance across datacenter + * switches. PLB is a host-based optimization; it leverages congestion + * signals from the transport layer to randomly change the path of the + * connection experiencing sustained congestion. PLB prefers to repath + * after idle periods to minimize packet reordering. It repaths by + * changing the IPv6 Flow Label on the packets of a connection, which + * datacenter switches include as part of ECMP/WCMP hashing. + * + * PLB is described in detail in: + * + * Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu, + * Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson, + * David Wetherall, Abdul Kabbani: + * "PLB: Congestion Signals are Simple and Effective for + * Network Load Balancing" + * In ACM SIGCOMM 2022, Amsterdam, Netherlands. + * + */ + +#include + +/* Called once per round-trip to update PLB state for a connection. */ +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio) +{ + struct net *net = sock_net(sk); + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + if (cong_ratio >= 0) { + if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh)) + plb->consec_cong_rounds = 0; + else if (plb->consec_cong_rounds < + READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds)) + plb->consec_cong_rounds++; + } +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state); + +/* Check whether recent congestion has been persistent enough to warrant + * a load balancing decision that switches the connection to another path. + */ +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + u32 max_suspend; + bool forced_rehash = false, idle_rehash = false; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + forced_rehash = plb->consec_cong_rounds >= + READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds); + /* If sender goes idle then we check whether to rehash.
*/ + idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) && + !tcp_sk(sk)->packets_out && + plb->consec_cong_rounds >= + READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds); + + if (!forced_rehash && !idle_rehash) + return; + + /* Note that tcp_jiffies32 can wrap; we detect wraps by checking for + * cases where the max suspension end is before the actual suspension + * end. We clear pause_until to 0 to indicate there is no recent + * RTO event that constrains PLB rehashing. + */ + max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; + if (plb->pause_until && + (!before(tcp_jiffies32, plb->pause_until) || + before(tcp_jiffies32 + max_suspend, plb->pause_until))) + plb->pause_until = 0; + + if (plb->pause_until) + return; + + sk_rethink_txhash(sk); + plb->consec_cong_rounds = 0; +} +EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); + +/* Upon RTO, disallow load balancing for a while, to avoid having load + * balancing decisions switch traffic to a black-holed path that was + * previously avoided with a sk_rethink_txhash() call at RTO time. + */ +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + u32 pause; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; + pause += prandom_u32_max(pause); + plb->pause_until = tcp_jiffies32 + pause; + + /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call + * that may switch this connection to a path with completely different + * congestion characteristics. + */ + plb->consec_cong_rounds = 0; +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto); -- cgit v1.2.3 From 7354c9024f2835f6122ed9612e21ab379df050f9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 27 Oct 2022 14:21:07 -0700 Subject: netlink: hide validation union fields from kdoc Mark the validation fields as private, users shouldn't set them directly and they are too complicated to explain in a more succinct way (there's already a long explanation in the comment above). The strict_start_type field is set directly and has a dedicated comment so move that above the "private" section. Link: https://lore.kernel.org/r/20221027212107.2639255-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 4418b1981e31..7db13b3261fc 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -317,19 +317,10 @@ struct nla_policy { u8 validation_type; u16 len; union { - const u32 bitfield32_valid; - const u32 mask; - const char *reject_message; - const struct nla_policy *nested_policy; - struct netlink_range_validation *range; - struct netlink_range_validation_signed *range_signed; - struct { - s16 min, max; - u8 network_byte_order:1; - }; - int (*validate)(const struct nlattr *attr, - struct netlink_ext_ack *extack); - /* This entry is special, and used for the attribute at index 0 + /** + * @strict_start_type: first attribute to validate strictly + * + * This entry is special, and used for the attribute at index 0 * only, and specifies special data about the policy, namely it * specifies the "boundary type" where strict length validation * starts for any attribute types >= this value, also, strict @@ -348,6 +339,20 @@ struct nla_policy { * was added to enforce strict validation from thereon. 
*/ u16 strict_start_type; + + /* private: use NLA_POLICY_*() to set */ + const u32 bitfield32_valid; + const u32 mask; + const char *reject_message; + const struct nla_policy *nested_policy; + struct netlink_range_validation *range; + struct netlink_range_validation_signed *range_signed; + struct { + s16 min, max; + u8 network_byte_order:1; + }; + int (*validate)(const struct nlattr *attr, + struct netlink_ext_ack *extack); }; }; -- cgit v1.2.3 From fee9ac06647e59a69fb7aec58f25267c134264b4 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 27 Oct 2022 00:25:57 +0100 Subject: net: remove SOCK_SUPPORT_ZC from sockmap sockmap replaces ->sk_prot with its own callbacks, we should remove SOCK_SUPPORT_ZC as the new proto doesn't support msghdr::ubuf_info. Cc: # 6.0 Reported-by: Jakub Kicinski Fixes: e993ffe3da4bc ("net: flag sockets supporting msghdr originated zerocopy") Signed-off-by: Pavel Begunkov Signed-off-by: Jakub Kicinski --- include/net/sock.h | 7 +++++++ net/ipv4/tcp_bpf.c | 4 ++-- net/ipv4/udp_bpf.c | 4 ++-- net/unix/unix_bpf.c | 8 ++++---- 4 files changed, 15 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 22f8bab583dd..5db02546941c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1889,6 +1889,13 @@ void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); +static inline void sock_replace_proto(struct sock *sk, struct proto *proto) +{ + if (sk->sk_socket) + clear_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); + WRITE_ONCE(sk->sk_prot, proto); +} + struct sockcm_cookie { u64 transmit_time; u32 mark; diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index a1626afe87a1..c501c329b1db 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -607,7 +607,7 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) } else { sk->sk_write_space = psock->saved_write_space; /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, psock->sk_proto); + sock_replace_proto(sk, psock->sk_proto); } return 0; } @@ -620,7 +620,7 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) } /* Pairs with lockless read in sk_clone_lock() */ - WRITE_ONCE(sk->sk_prot, &tcp_bpf_prots[family][config]); + sock_replace_proto(sk, &tcp_bpf_prots[family][config]); return 0; } EXPORT_SYMBOL_GPL(tcp_bpf_update_proto); diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c index ff15918b7bdc..e5dc91d0e079 100644 --- a/net/ipv4/udp_bpf.c +++ b/net/ipv4/udp_bpf.c @@ -141,14 +141,14 @@ int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore) if (restore) { sk->sk_write_space = psock->saved_write_space; - WRITE_ONCE(sk->sk_prot, psock->sk_proto); + sock_replace_proto(sk, psock->sk_proto); return 0; } if (sk->sk_family == AF_INET6) udp_bpf_check_v6_needs_rebuild(psock->sk_proto); - WRITE_ONCE(sk->sk_prot, &udp_bpf_prots[family]); + sock_replace_proto(sk, &udp_bpf_prots[family]); return 0; } EXPORT_SYMBOL_GPL(udp_bpf_update_proto); diff --git a/net/unix/unix_bpf.c b/net/unix/unix_bpf.c index 7cf14c6b1725..e9bf15513961 100644 --- a/net/unix/unix_bpf.c +++ b/net/unix/unix_bpf.c @@ -145,12 +145,12 @@ int unix_dgram_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool re if (restore) { sk->sk_write_space = psock->saved_write_space; - WRITE_ONCE(sk->sk_prot, psock->sk_proto); + sock_replace_proto(sk, psock->sk_proto); return 0; } 
unix_dgram_bpf_check_needs_rebuild(psock->sk_proto); - WRITE_ONCE(sk->sk_prot, &unix_dgram_bpf_prot); + sock_replace_proto(sk, &unix_dgram_bpf_prot); return 0; } @@ -158,12 +158,12 @@ int unix_stream_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool r { if (restore) { sk->sk_write_space = psock->saved_write_space; - WRITE_ONCE(sk->sk_prot, psock->sk_proto); + sock_replace_proto(sk, psock->sk_proto); return 0; } unix_stream_bpf_check_needs_rebuild(psock->sk_proto); - WRITE_ONCE(sk->sk_prot, &unix_stream_bpf_prot); + sock_replace_proto(sk, &unix_stream_bpf_prot); return 0; } -- cgit v1.2.3 From 738136a0e3757a8534df3ad97d6ff6d7f429f6c1 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 27 Oct 2022 14:25:53 -0700 Subject: netlink: split up copies in the ack construction Clean up the use of unsafe_memcpy() by adding a flexible array at the end of netlink message header and splitting up the header and data copies. Reviewed-by: Kees Cook Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- include/net/netlink.h | 21 +++++++++++++++++++++ include/uapi/linux/netlink.h | 2 ++ net/netlink/af_netlink.c | 29 ++++++++++++++++++++--------- 3 files changed, 43 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 4418b1981e31..784b4688fc6f 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -931,6 +931,27 @@ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 se return __nlmsg_put(skb, portid, seq, type, payload, flags); } +/** + * nlmsg_append - Add more data to a nlmsg in a skb + * @skb: socket buffer to store message in + * @size: length of message payload + * + * Append data to an existing nlmsg, used when constructing a message + * with multiple fixed-format headers (which is rare). + * Returns NULL if the tailroom of the skb is insufficient to store + * the extra payload. 
+ */ +static inline void *nlmsg_append(struct sk_buff *skb, u32 size) +{ + if (unlikely(skb_tailroom(skb) < NLMSG_ALIGN(size))) + return NULL; + + if (NLMSG_ALIGN(size) - size) + memset(skb_tail_pointer(skb) + size, 0, + NLMSG_ALIGN(size) - size); + return __skb_put(skb, NLMSG_ALIGN(size)); +} + /** * nlmsg_put_answer - Add a new callback based netlink message to an skb * @skb: socket buffer to store message in diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index e2ae82e3f9f7..5da0da59bf01 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -48,6 +48,7 @@ struct sockaddr_nl { * @nlmsg_flags: Additional flags * @nlmsg_seq: Sequence number * @nlmsg_pid: Sending process port ID + * @nlmsg_data: Message payload */ struct nlmsghdr { __u32 nlmsg_len; @@ -55,6 +56,7 @@ struct nlmsghdr { __u16 nlmsg_flags; __u32 nlmsg_seq; __u32 nlmsg_pid; + __u8 nlmsg_data[]; }; /* Flags values */ diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f0c94d394ab1..b10d5e50b99d 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2499,19 +2499,24 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, flags |= NLM_F_ACK_TLVS; skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); - if (!skb) { - NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; - sk_error_report(NETLINK_CB(in_skb).sk); - return; - } + if (!skb) + goto err_bad_put; rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - NLMSG_ERROR, payload, flags); + NLMSG_ERROR, sizeof(*errmsg), flags); + if (!rep) + goto err_bad_put; errmsg = nlmsg_data(rep); errmsg->error = err; - unsafe_memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) - ? nlh->nlmsg_len : sizeof(*nlh), - /* Bounds checked by the skb layer. */); + errmsg->msg = *nlh; + + if (!(flags & NLM_F_CAPPED)) { + if (!nlmsg_append(skb, nlmsg_len(nlh))) + goto err_bad_put; + + memcpy(errmsg->msg.nlmsg_data, nlh->nlmsg_data, + nlmsg_len(nlh)); + } if (tlvlen) netlink_ack_tlv_fill(in_skb, skb, nlh, err, extack); @@ -2519,6 +2524,12 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, nlmsg_end(skb, rep); nlmsg_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid); + + return; + +err_bad_put: + NETLINK_CB(in_skb).sk->sk_err = ENOBUFS; + sk_error_report(NETLINK_CB(in_skb).sk); } EXPORT_SYMBOL(netlink_ack); -- cgit v1.2.3 From 8c2a535e089b2ab82cf50c876bd10c8ed33252c9 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 27 Oct 2022 20:52:59 -0700 Subject: net: geneve: fix array of flexible structures warnings New compilers don't like flexible array of flexible structs: include/net/geneve.h:62:34: warning: array of flexible structures Signed-off-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- include/net/geneve.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/geneve.h b/include/net/geneve.h index bced0b1d9fe4..5c96827a487e 100644 --- a/include/net/geneve.h +++ b/include/net/geneve.h @@ -59,7 +59,7 @@ struct genevehdr { __be16 proto_type; u8 vni[3]; u8 rsvd2; - struct geneve_opt options[]; + u8 options[]; }; static inline bool netif_is_geneve(const struct net_device *dev) -- cgit v1.2.3 From 1d997f1013079c05b642c739901e3584a3ae558d Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 28 Oct 2022 04:42:21 -0400 Subject: rtnetlink: pass netlink message header and portid to rtnl_configure_link() This patch passes the netlink message header and portid to rtnl_configure_link(). All the functions in this call chain need to add the parameters so we can use them in the last call, rtnl_notify(), and notify userspace about the new link info if the NLM_F_ECHO flag is set. - rtnl_configure_link() - __dev_notify_flags() - rtmsg_ifinfo() - rtmsg_ifinfo_event() - rtmsg_ifinfo_build_skb() - rtmsg_ifinfo_send() - rtnl_notify() Also move the __dev_notify_flags() declaration to net/core/dev.h, as Jakub suggested. Signed-off-by: Hangbin Liu Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- drivers/net/can/vxcan.c | 2 +- drivers/net/geneve.c | 2 +- drivers/net/veth.c | 2 +- drivers/net/vxlan/vxlan_core.c | 4 ++-- drivers/net/wwan/wwan_core.c | 2 +- include/linux/netdevice.h | 2 -- include/linux/rtnetlink.h | 9 +++++---- include/net/netlink.h | 11 +++++++++++ include/net/rtnetlink.h | 3 ++- net/core/dev.c | 25 +++++++++++++------------ net/core/dev.h | 4 ++++ net/core/rtnetlink.c | 35 +++++++++++++++++++---------------- net/ipv4/ip_gre.c | 2 +- 13 files changed, 61 insertions(+), 42 deletions(-) (limited to 'include/net') diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index 26a472d2ea58..4068d962203d 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -236,7 +236,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev, netif_carrier_off(peer); - err = rtnl_configure_link(peer, ifmp); + err = rtnl_configure_link(peer, ifmp, 0, NULL); if (err < 0) goto unregister_network_device; diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c index f393e454f45c..89ff7f8e8c7e 100644 --- a/drivers/net/geneve.c +++ b/drivers/net/geneve.c @@ -1907,7 +1907,7 @@ struct net_device *geneve_dev_create_fb(struct net *net, const char *name, if (err) goto err; - err = rtnl_configure_link(dev, NULL); + err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto err; diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 740506c44427..ac7c0653695f 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1773,7 +1773,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, veth_disable_gro(peer); netif_carrier_off(peer); - err = rtnl_configure_link(peer, ifmp); + err = rtnl_configure_link(peer, ifmp, 0, NULL); if (err < 0) goto err_configure_peer; diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 6ab669dcd1c6..92224b36787a 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -3794,7 +3794,7 @@ static int __vxlan_dev_create(struct net *net, struct net_device *dev, goto errout; } - err = rtnl_configure_link(dev, NULL); + err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto unlink; @@ -4416,7 +4416,7 @@ struct net_device *vxlan_dev_create(struct net *net, const char *name, return ERR_PTR(err); } - err =
rtnl_configure_link(dev, NULL); + err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) { LIST_HEAD(list_kill); diff --git a/drivers/net/wwan/wwan_core.c b/drivers/net/wwan/wwan_core.c index 62e9f7d6c9fe..d72ee18476d1 100644 --- a/drivers/net/wwan/wwan_core.c +++ b/drivers/net/wwan/wwan_core.c @@ -1058,7 +1058,7 @@ static void wwan_create_default_link(struct wwan_device *wwandev, goto unlock; } - rtnl_configure_link(dev, NULL); /* Link initialized, notify new link */ + rtnl_configure_link(dev, NULL, 0, NULL); /* Link initialized, notify new link */ unlock: rtnl_unlock(); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 99e58b773266..4b5052db978f 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3855,8 +3855,6 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); int dev_change_flags(struct net_device *dev, unsigned int flags, struct netlink_ext_ack *extack); -void __dev_notify_flags(struct net_device *, unsigned int old_flags, - unsigned int gchanges); int dev_set_alias(struct net_device *, const char *, size_t); int dev_get_alias(const struct net_device *, char *, size_t); int __dev_change_net_namespace(struct net_device *dev, struct net *net, diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h index ae2c6a3cec5d..92ad75549e9c 100644 --- a/include/linux/rtnetlink.h +++ b/include/linux/rtnetlink.h @@ -12,21 +12,22 @@ extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, - u32 group, struct nlmsghdr *nlh, gfp_t flags); + u32 group, const struct nlmsghdr *nlh, gfp_t flags); extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); -void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); +void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags, + u32 portid, const struct nlmsghdr *nlh); void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, gfp_t flags, int *new_nsid, - int new_ifindex); + int new_ifindex, u32 portid, u32 seq); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, - gfp_t flags); + gfp_t flags, u32 portid, const struct nlmsghdr *nlh); /* RTNL is used as a global lock for all changes to network configuration */ diff --git a/include/net/netlink.h b/include/net/netlink.h index 784b4688fc6f..464e2e026f7b 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -899,6 +899,17 @@ static inline int nlmsg_report(const struct nlmsghdr *nlh) return nlh ? !!(nlh->nlmsg_flags & NLM_F_ECHO) : 0; } +/** + * nlmsg_seq - return the seq number of netlink message + * @nlh: netlink message header + * + * Returns 0 if netlink message is NULL + */ +static inline u32 nlmsg_seq(const struct nlmsghdr *nlh) +{ + return nlh ? 
nlh->nlmsg_seq : 0; +} + /** * nlmsg_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index bf8bb3357825..cd94f65dc2a9 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -187,7 +187,8 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, struct nlattr *tb[], struct netlink_ext_ack *extack); int rtnl_delete_link(struct net_device *dev); -int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm); +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, + u32 portid, const struct nlmsghdr *nlh); int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, struct netlink_ext_ack *exterr); diff --git a/net/core/dev.c b/net/core/dev.c index cfb68db040a4..19e0db536022 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1333,7 +1333,7 @@ void netdev_state_change(struct net_device *dev) call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info); - rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL, 0, NULL); } } EXPORT_SYMBOL(netdev_state_change); @@ -1469,7 +1469,7 @@ int dev_open(struct net_device *dev, struct netlink_ext_ack *extack) if (ret < 0) return ret; - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_UP, dev); return ret; @@ -1541,7 +1541,7 @@ void dev_close_many(struct list_head *head, bool unlink) __dev_close_many(head); list_for_each_entry_safe(dev, tmp, head, close_list) { - rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP | IFF_RUNNING, GFP_KERNEL, 0, NULL); call_netdevice_notifiers(NETDEV_DOWN, dev); if (unlink) list_del_init(&dev->close_list); @@ -8351,7 +8351,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify) dev_change_rx_flags(dev, IFF_PROMISC); } if (notify) - __dev_notify_flags(dev, old_flags, IFF_PROMISC); + __dev_notify_flags(dev, old_flags, IFF_PROMISC, 0, NULL); return 0; } @@ -8406,7 +8406,7 @@ static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify) dev_set_rx_mode(dev); if (notify) __dev_notify_flags(dev, old_flags, - dev->gflags ^ old_gflags); + dev->gflags ^ old_gflags, 0, NULL); } return 0; } @@ -8569,12 +8569,13 @@ int __dev_change_flags(struct net_device *dev, unsigned int flags, } void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, - unsigned int gchanges) + unsigned int gchanges, u32 portid, + const struct nlmsghdr *nlh) { unsigned int changes = dev->flags ^ old_flags; if (gchanges) - rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC); + rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC, portid, nlh); if (changes & IFF_UP) { if (dev->flags & IFF_UP) @@ -8616,7 +8617,7 @@ int dev_change_flags(struct net_device *dev, unsigned int flags, return ret; changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags); - __dev_notify_flags(dev, old_flags, changes); + __dev_notify_flags(dev, old_flags, changes, 0, NULL); return ret; } EXPORT_SYMBOL(dev_change_flags); @@ -10101,7 +10102,7 @@ int register_netdevice(struct net_device *dev) */ if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); out: return ret; @@ -10849,7 +10850,7 @@ 
void unregister_netdevice_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL, NULL, 0); + GFP_KERNEL, NULL, 0, 0, 0); /* * Flush the unicast and multicast chains @@ -10864,7 +10865,7 @@ void unregister_netdevice_many(struct list_head *head) dev->netdev_ops->ndo_uninit(dev); if (skb) - rtmsg_ifinfo_send(skb, dev, GFP_KERNEL); + rtmsg_ifinfo_send(skb, dev, GFP_KERNEL, 0, NULL); /* Notifier chain MUST detach us all upper devices. */ WARN_ON(netdev_has_any_upper_dev(dev)); @@ -11042,7 +11043,7 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, * Prevent userspace races by waiting until the network * device is fully setup before sending notifications. */ - rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL); + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL, 0, NULL); synchronize_net(); err = 0; diff --git a/net/core/dev.h b/net/core/dev.h index cbb8a925175a..6b3c7302f570 100644 --- a/net/core/dev.h +++ b/net/core/dev.h @@ -88,6 +88,10 @@ int dev_change_carrier(struct net_device *dev, bool new_carrier); void __dev_set_rx_mode(struct net_device *dev); +void __dev_notify_flags(struct net_device *dev, unsigned int old_flags, + unsigned int gchanges, u32 portid, + const struct nlmsghdr *nlh); + static inline void netif_set_gso_max_size(struct net_device *dev, unsigned int size) { diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 74864dc46a7e..c9dd9730f3c6 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -760,7 +760,7 @@ int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid) EXPORT_SYMBOL(rtnl_unicast); void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, - struct nlmsghdr *nlh, gfp_t flags) + const struct nlmsghdr *nlh, gfp_t flags) { struct sock *rtnl = net->rtnl; @@ -3180,7 +3180,8 @@ out: return err; } -int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) +int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, + u32 portid, const struct nlmsghdr *nlh) { unsigned int old_flags; int err; @@ -3194,10 +3195,10 @@ int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) } if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { - __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags)); + __dev_notify_flags(dev, old_flags, (old_flags ^ dev->flags), portid, nlh); } else { dev->rtnl_link_state = RTNL_LINK_INITIALIZED; - __dev_notify_flags(dev, old_flags, ~0U); + __dev_notify_flags(dev, old_flags, ~0U, portid, nlh); } return 0; } @@ -3369,7 +3370,7 @@ static int rtnl_newlink_create(struct sk_buff *skb, struct ifinfomsg *ifm, goto out; } - err = rtnl_configure_link(dev, ifm); + err = rtnl_configure_link(dev, ifm, 0, NULL); if (err < 0) goto out_unregister; if (link_net) { @@ -3896,7 +3897,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, u32 event, gfp_t flags, int *new_nsid, - int new_ifindex) + int new_ifindex, u32 portid, u32 seq) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -3907,7 +3908,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, goto errout; err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), - type, 0, 0, change, 0, 0, event, + type, portid, seq, change, 0, 0, event, new_nsid, new_ifindex, -1, flags); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ @@ -3922,16 
+3923,18 @@ errout: return NULL; } -void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) +void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags, + u32 portid, const struct nlmsghdr *nlh) { struct net *net = dev_net(dev); - rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL, flags); + rtnl_notify(skb, net, portid, RTNLGRP_LINK, nlh, flags); } static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, - gfp_t flags, int *new_nsid, int new_ifindex) + gfp_t flags, int *new_nsid, int new_ifindex, + u32 portid, const struct nlmsghdr *nlh) { struct sk_buff *skb; @@ -3939,23 +3942,23 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, return; skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, - new_ifindex); + new_ifindex, portid, nlmsg_seq(nlh)); if (skb) - rtmsg_ifinfo_send(skb, dev, flags); + rtmsg_ifinfo_send(skb, dev, flags, portid, nlh); } void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, - gfp_t flags) + gfp_t flags, u32 portid, const struct nlmsghdr *nlh) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, - NULL, 0); + NULL, 0, portid, nlh); } void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, - new_nsid, new_ifindex); + new_nsid, new_ifindex, 0, NULL); } static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -6140,7 +6143,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_CHANGELOWERSTATE: case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), - GFP_KERNEL, NULL, 0); + GFP_KERNEL, NULL, 0, 0, NULL); break; default: break; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index f866d6282b2b..d8ee5238c395 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1665,7 +1665,7 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name, if (err) goto out; - err = rtnl_configure_link(dev, NULL); + err = rtnl_configure_link(dev, NULL, 0, NULL); if (err < 0) goto out; -- cgit v1.2.3 From f3a63cce1b4fbde7738395c5a2dea83f05de3407 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 28 Oct 2022 04:42:24 -0400 Subject: rtnetlink: Honour NLM_F_ECHO flag in rtnl_delete_link This patch uses the new helper unregister_netdevice_many_notify() for rtnl_delete_link(), so that the kernel can reply with a unicast notification carrying the deleted interface's info when userspace sets the NLM_F_ECHO flag. At the same time, the parameters of rtnl_delete_link() need to be updated, since we now need the nlmsghdr and portid info.
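For illustration, a minimal userspace sketch of such a request (not part of this patch; the helper name and error handling are made up) only needs the extra flag on the RTM_DELLINK message:

#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>

/* Hypothetical helper: @fd is an already-bound AF_NETLINK/NETLINK_ROUTE
 * socket and @ifindex the link to delete. With NLM_F_ECHO set, the
 * kernel unicasts the RTM_DELLINK notification back on this socket.
 */
static int dellink_with_echo(int fd, int ifindex)
{
	struct {
		struct nlmsghdr nlh;
		struct ifinfomsg ifm;
	} req;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
	req.nlh.nlmsg_type = RTM_DELLINK;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK | NLM_F_ECHO;
	req.ifm.ifi_family = AF_UNSPEC;
	req.ifm.ifi_index = ifindex;

	return send(fd, &req, req.nlh.nlmsg_len, 0) < 0 ? -1 : 0;
}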
Suggested-by: Guillaume Nault Signed-off-by: Hangbin Liu Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- include/net/rtnetlink.h | 2 +- net/core/rtnetlink.c | 7 ++++--- net/openvswitch/vport-geneve.c | 2 +- net/openvswitch/vport-gre.c | 2 +- net/openvswitch/vport-netdev.c | 2 +- net/openvswitch/vport-vxlan.c | 2 +- 6 files changed, 9 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index cd94f65dc2a9..d9076a7a430c 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -186,7 +186,7 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, const struct rtnl_link_ops *ops, struct nlattr *tb[], struct netlink_ext_ack *extack); -int rtnl_delete_link(struct net_device *dev); +int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh); int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm, u32 portid, const struct nlmsghdr *nlh); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 839ff8b7eadc..d2f27548fc0b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3110,7 +3110,7 @@ static int rtnl_group_dellink(const struct net *net, int group) return 0; } -int rtnl_delete_link(struct net_device *dev) +int rtnl_delete_link(struct net_device *dev, u32 portid, const struct nlmsghdr *nlh) { const struct rtnl_link_ops *ops; LIST_HEAD(list_kill); @@ -3120,7 +3120,7 @@ int rtnl_delete_link(struct net_device *dev) return -EOPNOTSUPP; ops->dellink(dev, &list_kill); - unregister_netdevice_many(&list_kill); + unregister_netdevice_many_notify(&list_kill, portid, nlh); return 0; } @@ -3130,6 +3130,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); + u32 portid = NETLINK_CB(skb).portid; struct net *tgt_net = net; struct net_device *dev = NULL; struct ifinfomsg *ifm; @@ -3171,7 +3172,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, goto out; } - err = rtnl_delete_link(dev); + err = rtnl_delete_link(dev, portid, nlh); out: if (netnsid >= 0) diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c index 89a8e1501809..b10e1602c6b1 100644 --- a/net/openvswitch/vport-geneve.c +++ b/net/openvswitch/vport-geneve.c @@ -91,7 +91,7 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms) err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { - rtnl_delete_link(dev); + rtnl_delete_link(dev, 0, NULL); rtnl_unlock(); ovs_vport_free(vport); goto error; diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c index e6b5e76a962a..4014c9b5eb79 100644 --- a/net/openvswitch/vport-gre.c +++ b/net/openvswitch/vport-gre.c @@ -57,7 +57,7 @@ static struct vport *gre_tnl_create(const struct vport_parms *parms) err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { - rtnl_delete_link(dev); + rtnl_delete_link(dev, 0, NULL); rtnl_unlock(); ovs_vport_free(vport); return ERR_PTR(err); diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c index 2f61d5bdce1a..903537a5da22 100644 --- a/net/openvswitch/vport-netdev.c +++ b/net/openvswitch/vport-netdev.c @@ -172,7 +172,7 @@ void ovs_netdev_tunnel_destroy(struct vport *vport) * if it's not already shutting down. 
*/ if (vport->dev->reg_state == NETREG_REGISTERED) - rtnl_delete_link(vport->dev); + rtnl_delete_link(vport->dev, 0, NULL); netdev_put(vport->dev, &vport->dev_tracker); vport->dev = NULL; rtnl_unlock(); diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 188e9c1360a1..0b881b043bcf 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -120,7 +120,7 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms) err = dev_change_flags(dev, dev->flags | IFF_UP, NULL); if (err < 0) { - rtnl_delete_link(dev); + rtnl_delete_link(dev, 0, NULL); rtnl_unlock(); ovs_vport_free(vport); goto error; -- cgit v1.2.3 From 0e84afe8ebfbb9eade3f4f6de4720887bf908e26 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 29 Oct 2022 15:45:16 +0000 Subject: net: dropreason: add SKB_CONSUMED reason This will allow us, in the future, to simply use: kfree_skb_reason(skb, reason); instead of repeating sequences like: if (dropped) kfree_skb_reason(skb, reason); else consume_skb(skb); For instance, the following patch in the series adds @reason to skb_release_data() and skb_release_all(), so that we can propagate a meaningful @reason whenever consume_skb()/kfree_skb() have to take care of a potential frag_list. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 2 ++ net/core/skbuff.c | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index c1cbcdbaf149..0bd18c14dae0 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -80,6 +80,8 @@ enum skb_drop_reason { * @SKB_NOT_DROPPED_YET: skb is not dropped yet (used for no-drop case) */ SKB_NOT_DROPPED_YET = 0, + /** @SKB_CONSUMED: packet has been consumed */ + SKB_CONSUMED, /** @SKB_DROP_REASON_NOT_SPECIFIED: drop reason is not specified */ SKB_DROP_REASON_NOT_SPECIFIED, /** @SKB_DROP_REASON_NO_SOCKET: socket not found */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 1d84a17eada5..7ce797cd121f 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -94,6 +94,7 @@ EXPORT_SYMBOL(sysctl_max_skb_frags); #undef FN #define FN(reason) [SKB_DROP_REASON_##reason] = #reason, const char * const drop_reasons[] = { + [SKB_CONSUMED] = "CONSUMED", DEFINE_DROP_REASON(FN, FN) }; EXPORT_SYMBOL(drop_reasons); @@ -894,7 +895,10 @@ kfree_skb_reason(struct sk_buff *skb, enum skb_drop_reason reason) DEBUG_NET_WARN_ON_ONCE(reason <= 0 || reason >= SKB_DROP_REASON_MAX); - trace_kfree_skb(skb, __builtin_return_address(0), reason); + if (reason == SKB_CONSUMED) + trace_consume_skb(skb); + else + trace_kfree_skb(skb, __builtin_return_address(0), reason); __kfree_skb(skb); } EXPORT_SYMBOL(kfree_skb_reason); -- cgit v1.2.3 From 4ecbb1c27c363686d11a241cd682a454a8454c2b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 29 Oct 2022 15:45:18 +0000 Subject: net: dropreason: add SKB_DROP_REASON_DUP_FRAG This is used to track when a duplicate segment received by one of the various reassembly units is dropped.
Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 3 +++ net/ipv4/ip_fragment.c | 13 +++++++++---- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- net/ipv6/reassembly.c | 13 +++++++++---- 4 files changed, 22 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 0bd18c14dae0..602d555a5f83 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -68,6 +68,7 @@ FN(IP_INADDRERRORS) \ FN(IP_INNOROUTES) \ FN(PKT_TOO_BIG) \ + FN(DUP_FRAG) \ FNe(MAX) /** @@ -300,6 +301,8 @@ enum skb_drop_reason { * MTU) */ SKB_DROP_REASON_PKT_TOO_BIG, + /** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */ + SKB_DROP_REASON_DUP_FRAG, /** * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be * used as a real 'reason' diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index fb153569889e..676bd8d25955 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -278,10 +278,14 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) struct net_device *dev; unsigned int fragsize; int err = -ENOENT; + SKB_DR(reason); u8 ecn; - if (qp->q.flags & INET_FRAG_COMPLETE) + /* If reassembly is already done, @skb must be a duplicate frag. */ + if (qp->q.flags & INET_FRAG_COMPLETE) { + SKB_DR_SET(reason, DUP_FRAG); goto err; + } if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && unlikely(ip_frag_too_far(qp)) && @@ -382,8 +386,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) insert_error: if (err == IPFRAG_DUP) { - kfree_skb(skb); - return -EINVAL; + SKB_DR_SET(reason, DUP_FRAG); + err = -EINVAL; + goto err; } err = -EINVAL; __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); @@ -391,7 +396,7 @@ discard_qp: inet_frag_kill(&qp->q); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: - kfree_skb(skb); + kfree_skb_reason(skb, reason); return err; } diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 38db0064d661..d13240f13607 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -253,7 +253,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, if (err) { if (err == IPFRAG_DUP) { /* No error for duplicates, pretend they got queued. */ - kfree_skb(skb); + kfree_skb_reason(skb, SKB_DROP_REASON_DUP_FRAG); return -EINPROGRESS; } goto insert_error; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index ff866f2a879e..5bc8a28e67f9 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -112,10 +112,14 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct sk_buff *prev_tail; struct net_device *dev; int err = -ENOENT; + SKB_DR(reason); u8 ecn; - if (fq->q.flags & INET_FRAG_COMPLETE) + /* If reassembly is already done, @skb must be a duplicate frag. 
*/ + if (fq->q.flags & INET_FRAG_COMPLETE) { + SKB_DR_SET(reason, DUP_FRAG); goto err; + } err = -EINVAL; offset = ntohs(fhdr->frag_off) & ~0x7; @@ -226,8 +230,9 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, insert_error: if (err == IPFRAG_DUP) { - kfree_skb(skb); - return -EINVAL; + SKB_DR_SET(reason, DUP_FRAG); + err = -EINVAL; + goto err; } err = -EINVAL; __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), @@ -237,7 +242,7 @@ discard_fq: __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); err: - kfree_skb(skb); + kfree_skb_reason(skb, reason); return err; } -- cgit v1.2.3 From 77adfd3a1d44c4730fd2af99b497e04ddc2b5837 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 29 Oct 2022 15:45:19 +0000 Subject: net: dropreason: add SKB_DROP_REASON_FRAG_REASM_TIMEOUT Used to track skbs freed after a timeout happened in a reassembly unit. Passing a @reason argument to inet_frag_rbtree_purge() allows using the correct consumed status for frags that have been successfully re-assembled. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 3 +++ include/net/inet_frag.h | 6 +++++- include/net/ipv6_frag.h | 3 ++- net/ipv4/inet_fragment.c | 14 ++++++++++---- net/ipv4/ip_fragment.c | 6 ++++-- 5 files changed, 24 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 602d555a5f83..1d45a74148c3 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -69,6 +69,7 @@ FN(IP_INNOROUTES) \ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ + FN(FRAG_REASM_TIMEOUT) \ FNe(MAX) /** @@ -303,6 +304,8 @@ enum skb_drop_reason { SKB_DROP_REASON_PKT_TOO_BIG, /** @SKB_DROP_REASON_DUP_FRAG: duplicate fragment */ SKB_DROP_REASON_DUP_FRAG, + /** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */ + SKB_DROP_REASON_FRAG_REASM_TIMEOUT, /** * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be * used as a real 'reason' diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 0b0876610553..b23ddec3cd5c 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -7,6 +7,7 @@ #include #include #include +#include <net/dropreason.h> /* Per netns frag queues directory */ struct fqdir { @@ -34,12 +35,14 @@ struct fqdir { * @INET_FRAG_LAST_IN: final fragment has arrived * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction * @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable + * @INET_FRAG_DROP: if skbs must be dropped (instead of being consumed) */ enum { INET_FRAG_FIRST_IN = BIT(0), INET_FRAG_LAST_IN = BIT(1), INET_FRAG_COMPLETE = BIT(2), INET_FRAG_HASH_DEAD = BIT(3), + INET_FRAG_DROP = BIT(4), }; struct frag_v4_compare_key { @@ -139,7 +142,8 @@ void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); /* Free all skbs in the queue; return the sum of their truesizes.
*/ -unsigned int inet_frag_rbtree_purge(struct rb_root *root); +unsigned int inet_frag_rbtree_purge(struct rb_root *root, + enum skb_drop_reason reason); static inline void inet_frag_put(struct inet_frag_queue *q) { diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 5052c66e22d2..7321ffe3a108 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -76,6 +76,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) if (fq->q.flags & INET_FRAG_COMPLETE) goto out; + fq->q.flags |= INET_FRAG_DROP; inet_frag_kill(&fq->q); dev = dev_get_by_index_rcu(net, fq->iif); @@ -101,7 +102,7 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) spin_unlock(&fq->q.lock); icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); - kfree_skb(head); + kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); goto out_rcu_unlock; out: diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index c9f9ac5013a7..7072fc0783ef 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -133,6 +133,7 @@ static void inet_frags_free_cb(void *ptr, void *arg) count = del_timer_sync(&fq->timer) ? 1 : 0; spin_lock_bh(&fq->lock); + fq->flags |= INET_FRAG_DROP; if (!(fq->flags & INET_FRAG_COMPLETE)) { fq->flags |= INET_FRAG_COMPLETE; count++; @@ -260,7 +261,8 @@ static void inet_frag_destroy_rcu(struct rcu_head *head) kmem_cache_free(f->frags_cachep, q); } -unsigned int inet_frag_rbtree_purge(struct rb_root *root) +unsigned int inet_frag_rbtree_purge(struct rb_root *root, + enum skb_drop_reason reason) { struct rb_node *p = rb_first(root); unsigned int sum = 0; @@ -274,7 +276,7 @@ unsigned int inet_frag_rbtree_purge(struct rb_root *root) struct sk_buff *next = FRAG_CB(skb)->next_frag; sum += skb->truesize; - kfree_skb(skb); + kfree_skb_reason(skb, reason); skb = next; } } @@ -284,17 +286,21 @@ EXPORT_SYMBOL(inet_frag_rbtree_purge); void inet_frag_destroy(struct inet_frag_queue *q) { - struct fqdir *fqdir; unsigned int sum, sum_truesize = 0; + enum skb_drop_reason reason; struct inet_frags *f; + struct fqdir *fqdir; WARN_ON(!(q->flags & INET_FRAG_COMPLETE)); + reason = (q->flags & INET_FRAG_DROP) ? + SKB_DROP_REASON_FRAG_REASM_TIMEOUT : + SKB_CONSUMED; WARN_ON(del_timer(&q->timer) != 0); /* Release all fragment data. 
*/ fqdir = q->fqdir; f = fqdir->f; - sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments); + sum_truesize = inet_frag_rbtree_purge(&q->rb_fragments, reason); sum = sum_truesize + f->qsize; call_rcu(&q->rcu, inet_frag_destroy_rcu); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 676bd8d25955..85e8113259c3 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -153,6 +153,7 @@ static void ip_expire(struct timer_list *t) if (qp->q.flags & INET_FRAG_COMPLETE) goto out; + qp->q.flags |= INET_FRAG_DROP; ipq_kill(qp); __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); __IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT); @@ -194,7 +195,7 @@ out: spin_unlock(&qp->q.lock); out_rcu_unlock: rcu_read_unlock(); - kfree_skb(head); + kfree_skb_reason(head, SKB_DROP_REASON_FRAG_REASM_TIMEOUT); ipq_put(qp); } @@ -254,7 +255,8 @@ static int ip_frag_reinit(struct ipq *qp) return -ETIMEDOUT; } - sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments); + sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments, + SKB_DROP_REASON_NOT_SPECIFIED); sub_frag_mem_limit(qp->q.fqdir, sum_truesize); qp->q.flags = 0; -- cgit v1.2.3 From 3bdfb04f13ebdd4ae50fc5dc595663874781e48c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 29 Oct 2022 15:45:20 +0000 Subject: net: dropreason: add SKB_DROP_REASON_FRAG_TOO_FAR The IPv4 reassembly unit can decide to drop frags based on the /proc/sys/net/ipv4/ipfrag_max_dist sysctl. Add a specific drop reason to track this weird case. Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/dropreason.h | 6 ++++++ net/ipv4/ip_fragment.c | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/dropreason.h b/include/net/dropreason.h index 1d45a74148c3..70539288f995 100644 --- a/include/net/dropreason.h +++ b/include/net/dropreason.h @@ -70,6 +70,7 @@ FN(PKT_TOO_BIG) \ FN(DUP_FRAG) \ FN(FRAG_REASM_TIMEOUT) \ + FN(FRAG_TOO_FAR) \ FNe(MAX) /** @@ -306,6 +307,11 @@ enum skb_drop_reason { SKB_DROP_REASON_DUP_FRAG, /** @SKB_DROP_REASON_FRAG_REASM_TIMEOUT: fragment reassembly timeout */ SKB_DROP_REASON_FRAG_REASM_TIMEOUT, + /** + * @SKB_DROP_REASON_FRAG_TOO_FAR: ipv4 fragment too far. + * (/proc/sys/net/ipv4/ipfrag_max_dist) + */ + SKB_DROP_REASON_FRAG_TOO_FAR, /** * @SKB_DROP_REASON_MAX: the maximum of drop reason, which shouldn't be * used as a real 'reason' diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 85e8113259c3..69c00ffdcf3e 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -256,7 +256,7 @@ static int ip_frag_reinit(struct ipq *qp) } sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments, - SKB_DROP_REASON_NOT_SPECIFIED); + SKB_DROP_REASON_FRAG_TOO_FAR); sub_frag_mem_limit(qp->q.fqdir, sum_truesize); qp->q.flags = 0; -- cgit v1.2.3 From ecaf75ffd5f5db320d8b1da0198eef5a5ce64a3f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 31 Oct 2022 13:34:07 +0100 Subject: netlink: introduce bigendian integer types Jakub reported that the addition of the "network_byte_order" member in struct nla_policy increases the size of the structure on 32-bit platforms. Instead of scraping the bit from elsewhere, Johannes suggested adding explicit NLA_BE types, so do this here. The NLA_POLICY_MAX_BE() macro is removed again; there is no need for it: NLA_POLICY_MAX(NLA_BE.., ..) will do the right thing. NLA_BE64 can be added later.
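For example, a policy entry for a 32-bit big-endian attribute bounded at 255 (mirroring the nft_payload conversion below; the attribute names here are made up) now reads:

/* Hypothetical policy: EXAMPLE_ATTR_OFFSET arrives in network byte
 * order and is range-checked against 255 after ntohl() conversion.
 */
static const struct nla_policy example_policy[EXAMPLE_ATTR_MAX + 1] = {
	[EXAMPLE_ATTR_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255),
};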
Fixes: 08724ef69907 ("netlink: introduce NLA_POLICY_MAX_BE") Reported-by: Jakub Kicinski Suggested-by: Johannes Berg Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20221031123407.9158-1-fw@strlen.de Signed-off-by: Jakub Kicinski --- include/net/netlink.h | 19 ++++++++++--------- lib/nlattr.c | 41 +++++++++++++++-------------------------- net/netfilter/nft_payload.c | 6 +++--- 3 files changed, 28 insertions(+), 38 deletions(-) (limited to 'include/net') diff --git a/include/net/netlink.h b/include/net/netlink.h index 7db13b3261fc..6bfa972f2fbf 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -181,6 +181,8 @@ enum { NLA_S64, NLA_BITFIELD32, NLA_REJECT, + NLA_BE16, + NLA_BE32, __NLA_TYPE_MAX, }; @@ -231,6 +233,7 @@ enum nla_policy_validation { * NLA_U32, NLA_U64, * NLA_S8, NLA_S16, * NLA_S32, NLA_S64, + * NLA_BE16, NLA_BE32, * NLA_MSECS Leaving the length field zero will verify the * given type fits, using it verifies minimum length * just like "All other" @@ -261,6 +264,8 @@ enum nla_policy_validation { * NLA_U16, * NLA_U32, * NLA_U64, + * NLA_BE16, + * NLA_BE32, * NLA_S8, * NLA_S16, * NLA_S32, @@ -349,7 +354,6 @@ struct nla_policy { struct netlink_range_validation_signed *range_signed; struct { s16 min, max; - u8 network_byte_order:1; }; int (*validate)(const struct nlattr *attr, struct netlink_ext_ack *extack); @@ -374,6 +378,8 @@ struct nla_policy { (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || tp == NLA_U64) #define __NLA_IS_SINT_TYPE(tp) \ (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64) +#define __NLA_IS_BEINT_TYPE(tp) \ + (tp == NLA_BE16 || tp == NLA_BE32) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_UINT_TYPE(tp) \ @@ -387,6 +393,7 @@ struct nla_policy { #define NLA_ENSURE_INT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ __NLA_IS_SINT_TYPE(tp) || \ + __NLA_IS_BEINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_NO_VALIDATION_PTR(tp) \ @@ -394,6 +401,8 @@ struct nla_policy { tp != NLA_REJECT && \ tp != NLA_NESTED && \ tp != NLA_NESTED_ARRAY) + tp) +#define NLA_ENSURE_BEINT_TYPE(tp) \ + (__NLA_ENSURE(__NLA_IS_BEINT_TYPE(tp)) + tp) #define NLA_POLICY_RANGE(tp, _min, _max) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ @@ -424,14 +433,6 @@ struct nla_policy { .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MAX, \ .max = _max, \ - .network_byte_order = 0, \ -} - -#define NLA_POLICY_MAX_BE(tp, _max) { \ - .type = NLA_ENSURE_UINT_TYPE(tp), \ - .validation_type = NLA_VALIDATE_MAX, \ - .max = _max, \ - .network_byte_order = 1, \ } #define NLA_POLICY_MASK(tp, _mask) { \ diff --git a/lib/nlattr.c b/lib/nlattr.c index 40f22b177d69..b67a53e29b8f 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -124,10 +124,12 @@ void nla_get_range_unsigned(const struct nla_policy *pt, range->max = U8_MAX; break; case NLA_U16: + case NLA_BE16: case NLA_BINARY: range->max = U16_MAX; break; case NLA_U32: + case NLA_BE32: range->max = U32_MAX; break; case NLA_U64: @@ -159,31 +161,6 @@ void nla_get_range_unsigned(const struct nla_policy *pt, } } -static u64 nla_get_attr_bo(const struct nla_policy *pt, - const struct nlattr *nla) -{ - switch (pt->type) { - case NLA_U16: - if (pt->network_byte_order) - return ntohs(nla_get_be16(nla)); - - return nla_get_u16(nla); - case NLA_U32: - if (pt->network_byte_order) - return ntohl(nla_get_be32(nla)); - - return nla_get_u32(nla); - case NLA_U64: - if (pt->network_byte_order) - return 
be64_to_cpu(nla_get_be64(nla)); - - return nla_get_u64(nla); - } - - WARN_ON_ONCE(1); - return 0; -} - static int nla_validate_range_unsigned(const struct nla_policy *pt, const struct nlattr *nla, struct netlink_ext_ack *extack, @@ -197,9 +174,13 @@ static int nla_validate_range_unsigned(const struct nla_policy *pt, value = nla_get_u8(nla); break; case NLA_U16: + value = nla_get_u16(nla); + break; case NLA_U32: + value = nla_get_u32(nla); + break; case NLA_U64: - value = nla_get_attr_bo(pt, nla); + value = nla_get_u64(nla); break; case NLA_MSECS: value = nla_get_u64(nla); @@ -207,6 +188,12 @@ static int nla_validate_range_unsigned(const struct nla_policy *pt, case NLA_BINARY: value = nla_len(nla); break; + case NLA_BE16: + value = ntohs(nla_get_be16(nla)); + break; + case NLA_BE32: + value = ntohl(nla_get_be32(nla)); + break; default: return -EINVAL; } @@ -334,6 +321,8 @@ static int nla_validate_int_range(const struct nla_policy *pt, case NLA_U64: case NLA_MSECS: case NLA_BINARY: + case NLA_BE16: + case NLA_BE32: return nla_validate_range_unsigned(pt, nla, extack, validate); case NLA_S8: case NLA_S16: diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 088244f9d838..4edd899aeb9b 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -173,10 +173,10 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX_BE(NLA_U32, 255), - [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX_BE(NLA_U32, 255), + [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), + [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX_BE(NLA_U32, 255), + [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX(NLA_BE32, 255), [NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 }, }; -- cgit v1.2.3 From 777fa87c7682228e155cf0892ba61cb2ab1fe3ae Mon Sep 17 00:00:00 2001 From: "Jiri Slaby (SUSE)" Date: Mon, 31 Oct 2022 12:44:09 +0100 Subject: bonding (gcc13): synchronize bond_{a,t}lb_xmit() types Both bond_alb_xmit() and bond_tlb_xmit() produce a valid warning with gcc-13: drivers/net/bonding/bond_alb.c:1409:13: error: conflicting types for 'bond_tlb_xmit' due to enum/integer mismatch; have 'netdev_tx_t(struct sk_buff *, struct net_device *)' ... include/net/bond_alb.h:160:5: note: previous declaration of 'bond_tlb_xmit' with type 'int(struct sk_buff *, struct net_device *)' drivers/net/bonding/bond_alb.c:1523:13: error: conflicting types for 'bond_alb_xmit' due to enum/integer mismatch; have 'netdev_tx_t(struct sk_buff *, struct net_device *)' ... include/net/bond_alb.h:159:5: note: previous declaration of 'bond_alb_xmit' with type 'int(struct sk_buff *, struct net_device *)' I.e. the return type of the declaration is int, while the definitions spell netdev_tx_t. Synchronize both of them to the latter. 
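For context, a simplified sketch of the type involved (condensed from include/linux/netdevice.h):

/* netdev_tx_t is an enum, not a plain int: */
enum netdev_tx {
	NETDEV_TX_OK	= 0x00,	/* driver took care of packet */
	NETDEV_TX_BUSY	= 0x10,	/* driver tx path was busy */
};
typedef enum netdev_tx netdev_tx_t;

gcc-13 treats an int declaration paired with a netdev_tx_t definition as a conflicting-types error rather than a silent enum/integer conversion, hence the warnings quoted above.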
Cc: Martin Liska Cc: Jay Vosburgh Cc: Veaceslav Falico Cc: Andy Gospodarek Signed-off-by: Jiri Slaby (SUSE) Link: https://lore.kernel.org/r/20221031114409.10417-1-jirislaby@kernel.org Signed-off-by: Jakub Kicinski --- include/net/bond_alb.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/bond_alb.h b/include/net/bond_alb.h index 191c36afa1f4..9dc082b2d543 100644 --- a/include/net/bond_alb.h +++ b/include/net/bond_alb.h @@ -156,8 +156,8 @@ int bond_alb_init_slave(struct bonding *bond, struct slave *slave); void bond_alb_deinit_slave(struct bonding *bond, struct slave *slave); void bond_alb_handle_link_change(struct bonding *bond, struct slave *slave, char link); void bond_alb_handle_active_change(struct bonding *bond, struct slave *new_slave); -int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev); -int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev); +netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev); +netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev); struct slave *bond_xmit_alb_slave_get(struct bonding *bond, struct sk_buff *skb); struct slave *bond_xmit_tlb_slave_get(struct bonding *bond, -- cgit v1.2.3 From 6182d5875c330a5a611687caa05f47752455720c Mon Sep 17 00:00:00 2001 From: Daniel Machon Date: Tue, 1 Nov 2022 10:48:30 +0100 Subject: net: dcb: add new apptrust attribute Add new apptrust extension attributes to the 8021Qaz APP managed object. Two new attributes, DCB_ATTR_DCB_APP_TRUST_TABLE and DCB_ATTR_DCB_APP_TRUST, have been added. Trusted selectors are passed in the nested attribute DCB_ATTR_DCB_APP_TRUST, in order of precedence. The new attributes are meant to allow drivers whose hardware supports the notion of trust to set whether a particular app selector is trusted - and in which order.
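A minimal driver-side sketch of the new hooks, assuming a hypothetical driver (struct foo_priv, its fields and foo_hw_apply_trust() are made-up names):

/* dcbnl validates the selectors and rejects duplicates before calling
 * this; the driver only records the precedence order and programs hw.
 * selectors[0] is the most-preferred (first trusted) selector.
 */
static int foo_dcbnl_setapptrust(struct net_device *dev, u8 *selectors,
				 int nselectors)
{
	struct foo_priv *priv = netdev_priv(dev);
	int i;

	for (i = 0; i < nselectors; i++)
		priv->trust_order[i] = selectors[i];
	priv->ntrusted = nselectors;

	return foo_hw_apply_trust(priv);
}

static int foo_dcbnl_getapptrust(struct net_device *dev, u8 *selectors,
				 int *nselectors)
{
	struct foo_priv *priv = netdev_priv(dev);

	memcpy(selectors, priv->trust_order, priv->ntrusted);
	*nselectors = priv->ntrusted;

	return 0;
}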
Signed-off-by: Daniel Machon Reviewed-by: Petr Machata Signed-off-by: Paolo Abeni --- include/net/dcbnl.h | 4 +++ include/uapi/linux/dcbnl.h | 2 ++ net/dcb/dcbnl.c | 76 ++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 80 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/dcbnl.h b/include/net/dcbnl.h index 2b2d86fb3131..8841ab6c2de7 100644 --- a/include/net/dcbnl.h +++ b/include/net/dcbnl.h @@ -109,6 +109,10 @@ struct dcbnl_rtnl_ops { /* buffer settings */ int (*dcbnl_getbuffer)(struct net_device *, struct dcbnl_buffer *); int (*dcbnl_setbuffer)(struct net_device *, struct dcbnl_buffer *); + + /* apptrust */ + int (*dcbnl_setapptrust)(struct net_device *, u8 *, int); + int (*dcbnl_getapptrust)(struct net_device *, u8 *, int *); }; #endif /* __NET_DCBNL_H__ */ diff --git a/include/uapi/linux/dcbnl.h b/include/uapi/linux/dcbnl.h index dc7ef96207ca..99047223ab26 100644 --- a/include/uapi/linux/dcbnl.h +++ b/include/uapi/linux/dcbnl.h @@ -410,6 +410,7 @@ enum dcbnl_attrs { * @DCB_ATTR_IEEE_PEER_ETS: peer ETS configuration - get only * @DCB_ATTR_IEEE_PEER_PFC: peer PFC configuration - get only * @DCB_ATTR_IEEE_PEER_APP: peer APP tlv - get only + * @DCB_ATTR_DCB_APP_TRUST_TABLE: selector trust table */ enum ieee_attrs { DCB_ATTR_IEEE_UNSPEC, @@ -423,6 +424,7 @@ enum ieee_attrs { DCB_ATTR_IEEE_QCN, DCB_ATTR_IEEE_QCN_STATS, DCB_ATTR_DCB_BUFFER, + DCB_ATTR_DCB_APP_TRUST_TABLE, __DCB_ATTR_IEEE_MAX }; #define DCB_ATTR_IEEE_MAX (__DCB_ATTR_IEEE_MAX - 1) diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 0c17bb28ae60..cec0632f96db 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -166,6 +166,7 @@ static const struct nla_policy dcbnl_ieee_policy[DCB_ATTR_IEEE_MAX + 1] = { [DCB_ATTR_IEEE_QCN] = {.len = sizeof(struct ieee_qcn)}, [DCB_ATTR_IEEE_QCN_STATS] = {.len = sizeof(struct ieee_qcn_stats)}, [DCB_ATTR_DCB_BUFFER] = {.len = sizeof(struct dcbnl_buffer)}, + [DCB_ATTR_DCB_APP_TRUST_TABLE] = {.type = NLA_NESTED}, }; /* DCB number of traffic classes nested attributes. */ @@ -1062,9 +1063,9 @@ nla_put_failure: /* Handle IEEE 802.1Qaz/802.1Qau/802.1Qbb GET commands. 
*/ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) { - struct nlattr *ieee, *app; - struct dcb_app_type *itr; const struct dcbnl_rtnl_ops *ops = netdev->dcbnl_ops; + struct nlattr *ieee, *app, *apptrust; + struct dcb_app_type *itr; int dcbx; int err; @@ -1166,6 +1167,30 @@ static int dcbnl_ieee_fill(struct sk_buff *skb, struct net_device *netdev) spin_unlock_bh(&dcb_lock); nla_nest_end(skb, app); + if (ops->dcbnl_getapptrust) { + u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0}; + int nselectors, i; + + apptrust = nla_nest_start(skb, DCB_ATTR_DCB_APP_TRUST_TABLE); + if (!apptrust) + return -EMSGSIZE; + + err = ops->dcbnl_getapptrust(netdev, selectors, &nselectors); + if (!err) { + for (i = 0; i < nselectors; i++) { + enum ieee_attrs_app type = + dcbnl_app_attr_type_get(selectors[i]); + err = nla_put_u8(skb, type, selectors[i]); + if (err) { + nla_nest_cancel(skb, apptrust); + return err; + } + } + } + + nla_nest_end(skb, apptrust); + } + /* get peer info if available */ if (ops->ieee_peer_getets) { struct ieee_ets ets; @@ -1554,6 +1579,53 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, } } + if (ieee[DCB_ATTR_DCB_APP_TRUST_TABLE]) { + u8 selectors[IEEE_8021QAZ_APP_SEL_MAX + 1] = {0}; + struct nlattr *attr; + int nselectors = 0; + int rem; + + if (!ops->dcbnl_setapptrust) { + err = -EOPNOTSUPP; + goto err; + } + + nla_for_each_nested(attr, ieee[DCB_ATTR_DCB_APP_TRUST_TABLE], + rem) { + enum ieee_attrs_app type = nla_type(attr); + u8 selector; + int i; + + if (!dcbnl_app_attr_type_validate(type) || + nla_len(attr) != 1 || + nselectors >= sizeof(selectors)) { + err = -EINVAL; + goto err; + } + + selector = nla_get_u8(attr); + + if (!dcbnl_app_selector_validate(type, selector)) { + err = -EINVAL; + goto err; + } + + /* Duplicate selector ? */ + for (i = 0; i < nselectors; i++) { + if (selectors[i] == selector) { + err = -EINVAL; + goto err; + } + } + + selectors[nselectors++] = selector; + } + + err = ops->dcbnl_setapptrust(netdev, selectors, nselectors); + if (err) + goto err; + } + err: err = nla_put_u8(skb, DCB_ATTR_IEEE, err); dcbnl_ieee_notify(netdev, RTM_SETDCB, DCB_CMD_IEEE_SET, seq, 0); -- cgit v1.2.3 From 3830c5719af66fac9849cf5fb04b03d4e4bb46ff Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:01:59 +0100 Subject: net: devlink: convert devlink port type-specific pointers to union Instead of storing type_dev as a void pointer, convert it to union and use it to store either struct net_device or struct ib_device pointer. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 13 ++++++++++--- net/core/devlink.c | 17 +++++++++++++---- 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index ba6b8b094943..6c55aabaedf1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -121,12 +121,19 @@ struct devlink_port { struct list_head region_list; struct devlink *devlink; unsigned int index; - spinlock_t type_lock; /* Protects type and type_dev - * pointer consistency. + spinlock_t type_lock; /* Protects type and type_eth/ib + * structures consistency. 
*/ enum devlink_port_type type; enum devlink_port_type desired_type; - void *type_dev; + union { + struct { + struct net_device *netdev; + } type_eth; + struct { + struct ib_device *ibdev; + } type_ib; + }; struct devlink_port_attrs attrs; u8 attrs_set:1, switch_port:1, diff --git a/net/core/devlink.c b/net/core/devlink.c index 0a16ad45520e..868d04c2164f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1303,7 +1303,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { struct net *net = devlink_net(devlink_port->devlink); - struct net_device *netdev = devlink_port->type_dev; + struct net_device *netdev = devlink_port->type_eth.netdev; if (netdev && net_eq(net, dev_net(netdev)) && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, @@ -1313,7 +1313,7 @@ static int devlink_nl_port_fill(struct sk_buff *msg, goto nla_put_failure_type_locked; } if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { - struct ib_device *ibdev = devlink_port->type_dev; + struct ib_device *ibdev = devlink_port->type_ib.ibdev; if (ibdev && nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME, @@ -10003,7 +10003,16 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, devlink_port_type_warn_cancel(devlink_port); spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; - devlink_port->type_dev = type_dev; + switch (type) { + case DEVLINK_PORT_TYPE_ETH: + devlink_port->type_eth.netdev = type_dev; + break; + case DEVLINK_PORT_TYPE_IB: + devlink_port->type_ib.ibdev = type_dev; + break; + default: + break; + } spin_unlock_bh(&devlink_port->type_lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } @@ -12016,7 +12025,7 @@ devlink_trap_report_metadata_set(struct devlink_trap_metadata *metadata, spin_lock(&in_devlink_port->type_lock); if (in_devlink_port->type == DEVLINK_PORT_TYPE_ETH) - metadata->input_dev = in_devlink_port->type_dev; + metadata->input_dev = in_devlink_port->type_eth.netdev; spin_unlock(&in_devlink_port->type_lock); } -- cgit v1.2.3 From c80965784dbf2fd624be654c1e73c24beada7441 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:05 +0100 Subject: net: devlink: remove netdev arg from devlink_port_type_eth_set() Since devlink_port_type_eth_set() should no longer be called by any driver with netdev pointer as it should rather use SET_NETDEV_DEVLINK_PORT, remove the netdev arg. 
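A minimal before/after sketch of the driver-side change (hypothetical "example" driver; struct example_priv and its devlink_port member are invented, SET_NETDEV_DEVLINK_PORT is the helper the message refers to):

/* Hypothetical driver init path. */
static int example_port_init(struct example_priv *priv,
			     struct net_device *netdev)
{
	/* before this patch:
	 *	devlink_port_type_eth_set(&priv->devlink_port, netdev);
	 * after it: declare the linkage up front; the Ethernet port type
	 * then follows from the netdevice notifier events, and the
	 * explicit call remains only for Ethernet ports without a netdev,
	 * as in the mlx4 hunk below.
	 */
	SET_NETDEV_DEVLINK_PORT(netdev, &priv->devlink_port);
	return register_netdev(netdev);
}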
A warning is also added to devlink_port_type_clear() for the Ethernet case. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx4/main.c | 2 +- include/net/devlink.h | 3 +-- net/core/devlink.c | 23 ++++++++++++++--------- 3 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c index d3fc86cd3c1d..3ae246391549 100644 --- a/drivers/net/ethernet/mellanox/mlx4/main.c +++ b/drivers/net/ethernet/mellanox/mlx4/main.c @@ -3043,7 +3043,7 @@ static int mlx4_init_port_info(struct mlx4_dev *dev, int port) */ if (!IS_ENABLED(CONFIG_MLX4_EN) && dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH) - devlink_port_type_eth_set(&info->devlink_port, NULL); + devlink_port_type_eth_set(&info->devlink_port); else if (!IS_ENABLED(CONFIG_MLX4_INFINIBAND) && dev->caps.port_type[port] == MLX4_PORT_TYPE_IB) devlink_port_type_ib_set(&info->devlink_port, NULL); diff --git a/include/net/devlink.h b/include/net/devlink.h index 6c55aabaedf1..b1582b32183a 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1582,8 +1582,7 @@ int devlink_port_register(struct devlink *devlink, unsigned int port_index); void devl_port_unregister(struct devlink_port *devlink_port); void devlink_port_unregister(struct devlink_port *devlink_port); -void devlink_port_type_eth_set(struct devlink_port *devlink_port, - struct net_device *netdev); +void devlink_port_type_eth_set(struct devlink_port *devlink_port); void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev); void devlink_port_type_clear(struct devlink_port *devlink_port); diff --git a/net/core/devlink.c b/net/core/devlink.c index 6f06c05c7b1a..70a374c828ae 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -10097,17 +10097,15 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, * devlink_port_type_eth_set - Set port type to Ethernet * * @devlink_port: devlink port - * @netdev: related netdevice + * + * If driver is calling this, most likely it is doing something wrong. */ -void devlink_port_type_eth_set(struct devlink_port *devlink_port, - struct net_device *netdev) +void devlink_port_type_eth_set(struct devlink_port *devlink_port) { - if (!netdev) - dev_warn(devlink_port->devlink->dev, - "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", - devlink_port->index); - - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, netdev, + dev_warn(devlink_port->devlink->dev, + "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", + devlink_port->index); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL, false); } EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); @@ -10130,9 +10128,16 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); * devlink_port_type_clear - Clear port type * * @devlink_port: devlink port + * + * If driver is calling this for clearing Ethernet type, most likely + * it is doing something wrong.
*/ void devlink_port_type_clear(struct devlink_port *devlink_port) { + if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) + dev_warn(devlink_port->devlink->dev, + "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n", + devlink_port->index); __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL, false); } -- cgit v1.2.3 From 31265c1e29eb28f17df50d04ee421b5b6369fefd Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:07 +0100 Subject: net: devlink: store a copy of netdevice ifindex and ifname to allow port_fill() without RTNL held To avoid the need to take the RTNL mutex in the port_fill() function, benefit from the previously introduced infrastructure that tracks netdevice notifier events. Store the ifindex and ifname upon register and name change events. Remove the rtnl_held bool propagated down to the port_fill() function as it is no longer needed. Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 2 ++ net/core/devlink.c | 68 ++++++++++++++++++++------------------------- 2 files changed, 29 insertions(+), 41 deletions(-) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index b1582b32183a..7befad57afd4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -129,6 +129,8 @@ struct devlink_port { union { struct { struct net_device *netdev; + int ifindex; + char ifname[IFNAMSIZ]; } type_eth; struct { struct ib_device *ibdev; diff --git a/net/core/devlink.c b/net/core/devlink.c index d948bb2fdd5f..38de3a1dff36 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1279,8 +1279,7 @@ out: static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink_port *devlink_port, enum devlink_command cmd, u32 portid, u32 seq, - int flags, struct netlink_ext_ack *extack, - bool rtnl_held) + int flags, struct netlink_ext_ack *extack) { struct devlink *devlink = devlink_port->devlink; void *hdr; @@ -1294,9 +1293,6 @@ static int devlink_nl_port_fill(struct sk_buff *msg, if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) goto nla_put_failure; - /* Hold rtnl lock while accessing port's netdev attributes.
*/ - if (!rtnl_held) - rtnl_lock(); spin_lock_bh(&devlink_port->type_lock); if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type)) goto nla_put_failure_type_locked; @@ -1305,13 +1301,11 @@ static int devlink_nl_port_fill(struct sk_buff *msg, devlink_port->desired_type)) goto nla_put_failure_type_locked; if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) { - struct net_device *netdev = devlink_port->type_eth.netdev; - - if (netdev && + if (devlink_port->type_eth.netdev && (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX, - netdev->ifindex) || + devlink_port->type_eth.ifindex) || nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME, - netdev->name))) + devlink_port->type_eth.ifname))) goto nla_put_failure_type_locked; } if (devlink_port->type == DEVLINK_PORT_TYPE_IB) { @@ -1323,8 +1317,6 @@ static int devlink_nl_port_fill(struct sk_buff *msg, goto nla_put_failure_type_locked; } spin_unlock_bh(&devlink_port->type_lock); - if (!rtnl_held) - rtnl_unlock(); if (devlink_nl_port_attrs_put(msg, devlink_port)) goto nla_put_failure; if (devlink_nl_port_function_attrs_put(msg, devlink_port, extack)) @@ -1339,15 +1331,13 @@ static int devlink_nl_port_fill(struct sk_buff *msg, nla_put_failure_type_locked: spin_unlock_bh(&devlink_port->type_lock); - if (!rtnl_held) - rtnl_unlock(); nla_put_failure: genlmsg_cancel(msg, hdr); return -EMSGSIZE; } -static void __devlink_port_notify(struct devlink_port *devlink_port, - enum devlink_command cmd, bool rtnl_held) +static void devlink_port_notify(struct devlink_port *devlink_port, + enum devlink_command cmd) { struct devlink *devlink = devlink_port->devlink; struct sk_buff *msg; @@ -1362,8 +1352,7 @@ static void __devlink_port_notify(struct devlink_port *devlink_port, if (!msg) return; - err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL, - rtnl_held); + err = devlink_nl_port_fill(msg, devlink_port, cmd, 0, 0, 0, NULL); if (err) { nlmsg_free(msg); return; @@ -1373,12 +1362,6 @@ static void __devlink_port_notify(struct devlink_port *devlink_port, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL); } -static void devlink_port_notify(struct devlink_port *devlink_port, - enum devlink_command cmd) -{ - __devlink_port_notify(devlink_port, cmd, false); -} - static void devlink_rate_notify(struct devlink_rate *devlink_rate, enum devlink_command cmd) { @@ -1542,7 +1525,7 @@ static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb, err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_PORT_NEW, info->snd_portid, info->snd_seq, 0, - info->extack, false); + info->extack); if (err) { nlmsg_free(msg); return err; @@ -1572,8 +1555,7 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, DEVLINK_CMD_NEW, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, - NLM_F_MULTI, cb->extack, - false); + NLM_F_MULTI, cb->extack); if (err) { devl_unlock(devlink); devlink_put(devlink); @@ -1785,8 +1767,7 @@ static int devlink_port_new_notify(struct devlink *devlink, } err = devlink_nl_port_fill(msg, devlink_port, DEVLINK_CMD_NEW, - info->snd_portid, info->snd_seq, 0, NULL, - false); + info->snd_portid, info->snd_seq, 0, NULL); if (err) goto out; @@ -10062,7 +10043,7 @@ static void devlink_port_type_netdev_checks(struct devlink_port *devlink_port, static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, - void *type_dev, bool rtnl_held) + void *type_dev) { struct net_device *netdev = type_dev; @@ -10081,6 +10062,13 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, switch (type) { case 
DEVLINK_PORT_TYPE_ETH: devlink_port->type_eth.netdev = netdev; + if (netdev) { + ASSERT_RTNL(); + devlink_port->type_eth.ifindex = netdev->ifindex; + BUILD_BUG_ON(sizeof(devlink_port->type_eth.ifname) != + sizeof(netdev->name)); + strcpy(devlink_port->type_eth.ifname, netdev->name); + } break; case DEVLINK_PORT_TYPE_IB: devlink_port->type_ib.ibdev = type_dev; @@ -10089,7 +10077,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, break; } spin_unlock_bh(&devlink_port->type_lock); - __devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW, rtnl_held); + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); } /** @@ -10104,8 +10092,7 @@ void devlink_port_type_eth_set(struct devlink_port *devlink_port) dev_warn(devlink_port->devlink->dev, "devlink port type for port %d set to Ethernet without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL, - false); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); @@ -10118,8 +10105,7 @@ EXPORT_SYMBOL_GPL(devlink_port_type_eth_set); void devlink_port_type_ib_set(struct devlink_port *devlink_port, struct ib_device *ibdev) { - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev, - false); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_IB, ibdev); } EXPORT_SYMBOL_GPL(devlink_port_type_ib_set); @@ -10137,8 +10123,7 @@ void devlink_port_type_clear(struct devlink_port *devlink_port) dev_warn(devlink_port->devlink->dev, "devlink port type for port %d cleared without a software interface reference, device type not supported by the kernel?\n", devlink_port->index); - __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL, - false); + __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL); } EXPORT_SYMBOL_GPL(devlink_port_type_clear); @@ -10161,16 +10146,17 @@ static int devlink_netdevice_event(struct notifier_block *nb, * netdevice register */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, - NULL, true); + NULL); break; case NETDEV_REGISTER: + case NETDEV_CHANGENAME: /* Set the netdev on top of previously set type. Note this * event happens also during net namespace change so here * we take into account netdev pointer appearing in this * namespace. */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, - netdev, true); + netdev); break; case NETDEV_UNREGISTER: /* Clear netdev pointer, but not the type. This event happens @@ -10178,14 +10164,14 @@ static int devlink_netdevice_event(struct notifier_block *nb, * pointer to netdev that is going to another net namespace. */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_ETH, - NULL, true); + NULL); break; case NETDEV_PRE_UNINIT: /* Clear the type and the netdev pointer. Happens one during * netdevice unregister. */ __devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, - NULL, true); + NULL); break; } -- cgit v1.2.3 From dca56c3038c34a3e5acfe0aadb1f2bc9d724ae79 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Nov 2022 17:02:11 +0100 Subject: net: expose devlink port over rtnetlink Expose devlink port handle related to netdev over rtnetlink. Introduce a new nested IFLA attribute to carry the info. Call into devlink code to fill-up the nest with existing devlink attributes that are used over devlink netlink. 
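In terms of wire format, the nest carries just the devlink port handle. Judging from devlink_nl_port_handle_fill()/_size() in the diff below, a filled attribute is expected to look roughly like this (the bus/device values are examples):

	IFLA_DEVLINK_PORT (nested)
	    DEVLINK_ATTR_BUS_NAME    (string, e.g. "pci")
	    DEVLINK_ATTR_DEV_NAME    (string, e.g. "0000:01:00.0")
	    DEVLINK_ATTR_PORT_INDEX  (u32)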
Signed-off-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 14 ++++++++++++++ include/uapi/linux/if_link.h | 2 ++ net/core/devlink.c | 18 ++++++++++++++++++ net/core/rtnetlink.c | 39 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 73 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 7befad57afd4..fa6e936af1a5 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1873,6 +1873,9 @@ int devlink_compat_phys_port_name_get(struct net_device *dev, int devlink_compat_switch_id_get(struct net_device *dev, struct netdev_phys_item_id *ppid); +int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port); +size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port); + #else static inline struct devlink *devlink_try_get(struct devlink *devlink) @@ -1909,6 +1912,17 @@ devlink_compat_switch_id_get(struct net_device *dev, return -EOPNOTSUPP; } +static inline int +devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) +{ + return 0; +} + +static inline size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) +{ + return 0; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index d92b3f79eba3..1021a7e47a86 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -372,6 +372,8 @@ enum { IFLA_TSO_MAX_SEGS, IFLA_ALLMULTI, /* Allmulti count: > 0 means acts ALLMULTI */ + IFLA_DEVLINK_PORT, + __IFLA_MAX }; diff --git a/net/core/devlink.c b/net/core/devlink.c index 3a454d0045e5..2dcf2bcc3527 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -880,6 +880,24 @@ nla_put_failure: return -EMSGSIZE; } +int devlink_nl_port_handle_fill(struct sk_buff *msg, struct devlink_port *devlink_port) +{ + if (devlink_nl_put_handle(msg, devlink_port->devlink)) + return -EMSGSIZE; + if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index)) + return -EMSGSIZE; + return 0; +} + +size_t devlink_nl_port_handle_size(struct devlink_port *devlink_port) +{ + struct devlink *devlink = devlink_port->devlink; + + return nla_total_size(strlen(devlink->dev->bus->name) + 1) /* DEVLINK_ATTR_BUS_NAME */ + + nla_total_size(strlen(dev_name(devlink->dev)) + 1) /* DEVLINK_ATTR_DEV_NAME */ + + nla_total_size(4); /* DEVLINK_ATTR_PORT_INDEX */ +} + struct devlink_reload_combination { enum devlink_reload_action action; enum devlink_reload_limit limit; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b64fffeb3844..64289bc98887 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "dev.h" @@ -1038,6 +1039,16 @@ static size_t rtnl_proto_down_size(const struct net_device *dev) return size; } +static size_t rtnl_devlink_port_size(const struct net_device *dev) +{ + size_t size = nla_total_size(0); /* nest IFLA_DEVLINK_PORT */ + + if (dev->devlink_port) + size += devlink_nl_port_handle_size(dev->devlink_port); + + return size; +} + static noinline size_t if_nlmsg_size(const struct net_device *dev, u32 ext_filter_mask) { @@ -1091,6 +1102,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_MAX_MTU */ + rtnl_prop_list_size(dev) + nla_total_size(MAX_ADDR_LEN) /* IFLA_PERM_ADDRESS */ + + rtnl_devlink_port_size(dev) + 0; } @@ -1728,6 +1740,30 @@ nla_put_failure: return -EMSGSIZE; } +static int rtnl_fill_devlink_port(struct sk_buff *skb, + const struct 
net_device *dev) +{ + struct nlattr *devlink_port_nest; + int ret; + + devlink_port_nest = nla_nest_start(skb, IFLA_DEVLINK_PORT); + if (!devlink_port_nest) + return -EMSGSIZE; + + if (dev->devlink_port) { + ret = devlink_nl_port_handle_fill(skb, dev->devlink_port); + if (ret < 0) + goto nest_cancel; + } + + nla_nest_end(skb, devlink_port_nest); + return 0; + +nest_cancel: + nla_nest_cancel(skb, devlink_port_nest); + return ret; +} + static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, @@ -1865,6 +1901,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, dev->dev.parent->bus->name)) goto nla_put_failure; + if (rtnl_fill_devlink_port(skb, dev)) + goto nla_put_failure; + nlmsg_end(skb, nlh); return 0; -- cgit v1.2.3 From 7c3eaa022261d79d273d73ac68ff02cbe2839c10 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:32 -0700 Subject: genetlink: move the private fields in struct genl_family Move the private fields down to form a "private section". Use the kdoc "private:" label comment thing to hide them from the main kdoc comment. Signed-off-by: Jakub Kicinski Reviewed-by: Johannes Berg Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- include/net/genetlink.h | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 9f97f73615b6..81180fc6526a 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -23,7 +23,6 @@ struct genl_info; /** * struct genl_family - generic netlink family - * @id: protocol family identifier (private) * @hdrsize: length of user specific header in bytes * @name: name of family * @version: protocol version @@ -43,8 +42,6 @@ struct genl_info; * @resv_start_op: first operation for which reserved fields of the header * can be validated and policies are required (see below); * new families should leave this field at zero - * @mcgrp_offset: starting number of multicast group IDs in this family - * (private) * @ops: the operations supported by this family * @n_ops: number of operations supported by this family * @small_ops: the small-struct operations supported by this family @@ -58,12 +55,10 @@ struct genl_info; * if policy is not provided core will reject all TLV attributes. */ struct genl_family { - int id; /* private */ unsigned int hdrsize; char name[GENL_NAMSIZ]; unsigned int version; unsigned int maxattr; - unsigned int mcgrp_offset; /* private */ u8 netnsok:1; u8 parallel_ops:1; u8 n_ops; @@ -81,6 +76,12 @@ struct genl_family { const struct genl_small_ops *small_ops; const struct genl_multicast_group *mcgrps; struct module *module; + +/* private: internal use only */ + /* protocol family identifier */ + int id; + /* starting number of multicast group IDs in this family */ + unsigned int mcgrp_offset; }; /** -- cgit v1.2.3 From 20b0b53aca436af9fece9428ca3ab7c7b9cf4583 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:33 -0700 Subject: genetlink: introduce split op representation We currently have two forms of operations - small ops and "full" ops (or just ops). The former does not have pointers for some of the less commonly used features (namely dump start/done and policy). The "full" ops, however, still don't contain all the necessary information. In particular the policy is per command ID, while do and dump often accept different attributes. 
It's also not possible to define different pre_doit and post_doit callbacks for different commands within the family. At the same time a lot of commands do not support dumping and therefore all the dump-related information is wasted space. Create a new command representation which can hold info about a do implementation or a dump implementation, but not both at the same time. Use this new representation on the command execution path (genl_family_rcv_msg) as we either run a do or a dump and don't have to create a "full" op there. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- include/net/genetlink.h | 60 ++++++++++++++++++++++++++++++++--- net/batman-adv/netlink.c | 6 ++-- net/core/devlink.c | 4 +-- net/core/drop_monitor.c | 4 +-- net/ieee802154/nl802154.c | 6 ++-- net/netlink/genetlink.c | 79 ++++++++++++++++++++++++++++++++++++++--------- net/wireless/nl80211.c | 6 ++-- 7 files changed, 136 insertions(+), 29 deletions(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 81180fc6526a..4be7989c451b 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -18,7 +18,7 @@ struct genl_multicast_group { u8 flags; }; -struct genl_ops; +struct genl_split_ops; struct genl_info; /** @@ -66,10 +66,10 @@ struct genl_family { u8 n_mcgrps; u8 resv_start_op; const struct nla_policy *policy; - int (*pre_doit)(const struct genl_ops *ops, + int (*pre_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); - void (*post_doit)(const struct genl_ops *ops, + void (*post_doit)(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info); const struct genl_ops * ops; @@ -182,6 +182,58 @@ struct genl_ops { u8 validate; }; +/** + * struct genl_split_ops - generic netlink operations (do/dump split version) + * @cmd: command identifier + * @internal_flags: flags used by the family + * @flags: GENL_* flags (%GENL_ADMIN_PERM or %GENL_UNS_ADMIN_PERM) + * @validate: validation flags from enum genl_validate_flags + * @policy: netlink policy (takes precedence over family policy) + * @maxattr: maximum number of attributes supported + * + * Do callbacks: + * @pre_doit: called before an operation's @doit callback, it may + * do additional, common, filtering and return an error + * @doit: standard command callback + * @post_doit: called after an operation's @doit callback, it may + * undo operations done by pre_doit, for example release locks + * + * Dump callbacks: + * @start: start callback for dumps + * @dumpit: callback for dumpers + * @done: completion callback for dumps + * + * Do callbacks can be used if %GENL_CMD_CAP_DO is set in @flags. + * Dump callbacks can be used if %GENL_CMD_CAP_DUMP is set in @flags. + * Exactly one of those flags must be set. 
+ */ +struct genl_split_ops { + union { + struct { + int (*pre_doit)(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info); + int (*doit)(struct sk_buff *skb, + struct genl_info *info); + void (*post_doit)(const struct genl_split_ops *ops, + struct sk_buff *skb, + struct genl_info *info); + }; + struct { + int (*start)(struct netlink_callback *cb); + int (*dumpit)(struct sk_buff *skb, + struct netlink_callback *cb); + int (*done)(struct netlink_callback *cb); + }; + }; + const struct nla_policy *policy; + unsigned int maxattr; + u8 cmd; + u8 internal_flags; + u8 flags; + u8 validate; +}; + /** * struct genl_dumpit_info - info that is available during dumpit op call * @family: generic netlink family - for internal genl code usage @@ -190,7 +242,7 @@ struct genl_ops { */ struct genl_dumpit_info { const struct genl_family *family; - struct genl_ops op; + struct genl_split_ops op; struct nlattr **attrs; }; diff --git a/net/batman-adv/netlink.c b/net/batman-adv/netlink.c index a5e4a4e976cf..ad5714f737be 100644 --- a/net/batman-adv/netlink.c +++ b/net/batman-adv/netlink.c @@ -1267,7 +1267,8 @@ batadv_get_vlan_from_info(struct batadv_priv *bat_priv, struct net *net, * * Return: 0 on success or negative error number in case of failure */ -static int batadv_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, +static int batadv_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { struct net *net = genl_info_net(info); @@ -1332,7 +1333,8 @@ err_put_softif: * @skb: Netlink message with request data * @info: receiver information */ -static void batadv_post_doit(const struct genl_ops *ops, struct sk_buff *skb, +static void batadv_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { struct batadv_hard_iface *hard_iface; diff --git a/net/core/devlink.c b/net/core/devlink.c index 2dcf2bcc3527..40fcdded57e6 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -770,7 +770,7 @@ devlink_region_snapshot_get_by_id(struct devlink_region *region, u32 id) #define DEVLINK_NL_FLAG_NEED_RATE_NODE BIT(3) #define DEVLINK_NL_FLAG_NEED_LINECARD BIT(4) -static int devlink_nl_pre_doit(const struct genl_ops *ops, +static int devlink_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct devlink_linecard *linecard; @@ -828,7 +828,7 @@ unlock: return err; } -static void devlink_nl_post_doit(const struct genl_ops *ops, +static void devlink_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct devlink_linecard *linecard; diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 11aa6e8a3098..5a782d1d8fd3 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1620,7 +1620,7 @@ static const struct genl_small_ops dropmon_ops[] = { }, }; -static int net_dm_nl_pre_doit(const struct genl_ops *ops, +static int net_dm_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { mutex_lock(&net_dm_mutex); @@ -1628,7 +1628,7 @@ static int net_dm_nl_pre_doit(const struct genl_ops *ops, return 0; } -static void net_dm_nl_post_doit(const struct genl_ops *ops, +static void net_dm_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb, struct genl_info *info) { mutex_unlock(&net_dm_mutex); diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 38c4f3cb010e..b33d1b5eda87 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ 
-2157,7 +2157,8 @@ static int nl802154_del_llsec_seclevel(struct sk_buff *skb, #define NL802154_FLAG_CHECK_NETDEV_UP 0x08 #define NL802154_FLAG_NEED_WPAN_DEV 0x10 -static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, +static int nl802154_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { struct cfg802154_registered_device *rdev; @@ -2219,7 +2220,8 @@ static int nl802154_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, return 0; } -static void nl802154_post_doit(const struct genl_ops *ops, struct sk_buff *skb, +static void nl802154_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { if (info->user_ptr[1]) { diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 0a7a856e9ce0..c66299740c05 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -189,6 +189,51 @@ static int genl_get_cmd(u32 cmd, const struct genl_family *family, return genl_get_cmd_small(cmd, family, op); } +static void +genl_cmd_full_to_split(struct genl_split_ops *op, + const struct genl_family *family, + const struct genl_ops *full, u8 flags) +{ + if (flags & GENL_CMD_CAP_DUMP) { + op->start = full->start; + op->dumpit = full->dumpit; + op->done = full->done; + } else { + op->pre_doit = family->pre_doit; + op->doit = full->doit; + op->post_doit = family->post_doit; + } + + op->policy = full->policy; + op->maxattr = full->maxattr; + + op->cmd = full->cmd; + op->internal_flags = full->internal_flags; + op->flags = full->flags; + op->validate = full->validate; + + /* Make sure flags include the GENL_CMD_CAP_DO / GENL_CMD_CAP_DUMP */ + op->flags |= flags; +} + +static int +genl_get_cmd_split(u32 cmd, u8 flags, const struct genl_family *family, + struct genl_split_ops *op) +{ + struct genl_ops full; + int err; + + err = genl_get_cmd(cmd, family, &full); + if (err) { + memset(op, 0, sizeof(*op)); + return err; + } + + genl_cmd_full_to_split(op, family, &full, flags); + + return 0; +} + static void genl_get_cmd_by_index(unsigned int i, const struct genl_family *family, struct genl_ops *op) @@ -544,7 +589,7 @@ static struct nlattr ** genl_family_rcv_msg_attrs_parse(const struct genl_family *family, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, - const struct genl_ops *ops, + const struct genl_split_ops *ops, int hdrlen, enum genl_validate_flags no_strict_flag) { @@ -580,18 +625,19 @@ struct genl_start_context { const struct genl_family *family; struct nlmsghdr *nlh; struct netlink_ext_ack *extack; - const struct genl_ops *ops; + const struct genl_split_ops *ops; int hdrlen; }; static int genl_start(struct netlink_callback *cb) { struct genl_start_context *ctx = cb->data; - const struct genl_ops *ops = ctx->ops; + const struct genl_split_ops *ops; struct genl_dumpit_info *info; struct nlattr **attrs = NULL; int rc = 0; + ops = ctx->ops; if (ops->validate & GENL_DONT_VALIDATE_DUMP) goto no_attrs; @@ -633,7 +679,7 @@ no_attrs: static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { - const struct genl_ops *ops = &genl_dumpit_info(cb)->op; + const struct genl_split_ops *ops = &genl_dumpit_info(cb)->op; int rc; genl_lock(); @@ -645,7 +691,7 @@ static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int genl_lock_done(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); - const struct genl_ops *ops = &info->op; + const struct genl_split_ops *ops = &info->op; int rc = 0; if (ops->done) { @@ -661,7 
+707,7 @@ static int genl_lock_done(struct netlink_callback *cb) static int genl_parallel_done(struct netlink_callback *cb) { const struct genl_dumpit_info *info = genl_dumpit_info(cb); - const struct genl_ops *ops = &info->op; + const struct genl_split_ops *ops = &info->op; int rc = 0; if (ops->done) @@ -675,7 +721,7 @@ static int genl_family_rcv_msg_dumpit(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, - const struct genl_ops *ops, + const struct genl_split_ops *ops, int hdrlen, struct net *net) { struct genl_start_context ctx; @@ -721,7 +767,7 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack, - const struct genl_ops *ops, + const struct genl_split_ops *ops, int hdrlen, struct net *net) { struct nlattr **attrbuf; @@ -747,16 +793,16 @@ static int genl_family_rcv_msg_doit(const struct genl_family *family, genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); - if (family->pre_doit) { - err = family->pre_doit(ops, skb, &info); + if (ops->pre_doit) { + err = ops->pre_doit(ops, skb, &info); if (err) goto out; } err = ops->doit(skb, &info); - if (family->post_doit) - family->post_doit(ops, skb, &info); + if (ops->post_doit) + ops->post_doit(ops, skb, &info); out: genl_family_rcv_msg_attrs_free(attrbuf); @@ -801,8 +847,9 @@ static int genl_family_rcv_msg(const struct genl_family *family, { struct net *net = sock_net(skb->sk); struct genlmsghdr *hdr = nlmsg_data(nlh); - struct genl_ops op; + struct genl_split_ops op; int hdrlen; + u8 flags; /* this family doesn't exist in this netns */ if (!family->netnsok && !net_eq(net, &init_net)) @@ -815,7 +862,9 @@ static int genl_family_rcv_msg(const struct genl_family *family, if (genl_header_check(family, nlh, hdr, extack)) return -EINVAL; - if (genl_get_cmd(hdr->cmd, family, &op)) + flags = (nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP ? + GENL_CMD_CAP_DUMP : GENL_CMD_CAP_DO; + if (genl_get_cmd_split(hdr->cmd, flags, family, &op)) return -EOPNOTSUPP; if ((op.flags & GENL_ADMIN_PERM) && @@ -826,7 +875,7 @@ static int genl_family_rcv_msg(const struct genl_family *family, !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) + if (flags & GENL_CMD_CAP_DUMP) return genl_family_rcv_msg_dumpit(family, skb, nlh, extack, &op, hdrlen, net); else diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 148f66edb015..1ad0326ff4dc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -16140,7 +16140,8 @@ static u32 nl80211_internal_flags[] = { #undef SELECTOR }; -static int nl80211_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, +static int nl80211_pre_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = NULL; @@ -16241,7 +16242,8 @@ out_unlock: return err; } -static void nl80211_post_doit(const struct genl_ops *ops, struct sk_buff *skb, +static void nl80211_post_doit(const struct genl_split_ops *ops, + struct sk_buff *skb, struct genl_info *info) { u32 internal_flags = nl80211_internal_flags[ops->internal_flags]; -- cgit v1.2.3 From b8fd60c36a44351f773432e24efd8bb92f8ba0c6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Nov 2022 12:13:42 -0700 Subject: genetlink: allow families to use split ops directly Let families hook in the new split ops.
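A minimal sketch of a family wired up this way (all "example" names are invented; the constraints - entries sorted by cmd, a GENL_CMD_CAP_DO entry before the GENL_CMD_CAP_DUMP entry of the same cmd, exactly one of the two flags per entry - are the ones enforced by genl_validate_ops() in this patch):

static const struct genl_split_ops example_split_ops[] = {
	{
		.cmd		= EXAMPLE_CMD_GET,
		.doit		= example_get_doit,
		.policy		= example_get_do_policy,
		.maxattr	= EXAMPLE_ATTR_MAX,
		.flags		= GENL_CMD_CAP_DO,
	},
	{
		.cmd		= EXAMPLE_CMD_GET,
		.dumpit		= example_get_dumpit,
		.policy		= example_get_dump_policy,
		.maxattr	= EXAMPLE_ATTR_MAX,
		.flags		= GENL_CMD_CAP_DUMP,
	},
};

static struct genl_family example_family = {
	.name		= "example",
	.version	= 1,
	.split_ops	= example_split_ops,
	.n_split_ops	= ARRAY_SIZE(example_split_ops),
	.module		= THIS_MODULE,
};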
They are more flexible and should not be much larger than full ops. Each split op is 40B while a full op is 48B. Devlink for example has 54 dos and 19 dumps, 2 of the dumps do not have a do -> 56 full commands = 2688B. Split ops would have taken 2920B, so 9% more space while allowing individual pre/post doit and per-type policies. Signed-off-by: Jakub Kicinski Reviewed-by: Jacob Keller Signed-off-by: David S. Miller --- include/net/genetlink.h | 5 ++ net/netlink/genetlink.c | 170 ++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 149 insertions(+), 26 deletions(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 4be7989c451b..d21210709f84 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -46,6 +46,9 @@ struct genl_info; * @n_ops: number of operations supported by this family * @small_ops: the small-struct operations supported by this family * @n_small_ops: number of small-struct operations supported by this family + * @split_ops: the split do/dump form of operation definition + * @n_split_ops: number of entries in @split_ops, note that with split do/dump + * ops the number of entries is not the same as the number of commands * * Attribute policies (the combination of @policy and @maxattr fields) * can be attached at the family level or at the operation level. @@ -63,6 +66,7 @@ struct genl_family { u8 netnsok:1; u8 parallel_ops:1; u8 n_ops; u8 n_small_ops; + u8 n_split_ops; u8 n_mcgrps; u8 resv_start_op; const struct nla_policy *policy; @@ -74,6 +78,7 @@ struct genl_family { struct genl_info *info); const struct genl_ops * ops; const struct genl_small_ops *small_ops; + const struct genl_split_ops *split_ops; const struct genl_multicast_group *mcgrps; struct module *module; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 0a4f1470f442..90b0feb5eb73 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -101,6 +101,17 @@ genl_op_fill_in_reject_policy(const struct genl_family *family, op->maxattr = 1; } +static void +genl_op_fill_in_reject_policy_split(const struct genl_family *family, + struct genl_split_ops *op) +{ + if (op->policy) + return; + + op->policy = genl_policy_reject_all; + op->maxattr = 1; +} + static const struct genl_family *genl_family_find_byid(unsigned int id) { return idr_find(&genl_fam_idr, id); @@ -118,6 +129,16 @@ static const struct genl_family *genl_family_find_byname(char *name) return NULL; } +struct genl_op_iter { + const struct genl_family *family; + struct genl_split_ops doit; + struct genl_split_ops dumpit; + int cmd_idx; + int entry_idx; + u32 cmd; + u8 flags; +}; + static void genl_op_from_full(const struct genl_family *family, unsigned int i, struct genl_ops *op) { @@ -176,6 +197,50 @@ static int genl_get_cmd_small(u32 cmd, const struct genl_family *family, return -ENOENT; } +static void genl_op_from_split(struct genl_op_iter *iter) +{ + const struct genl_family *family = iter->family; + int i, cnt = 0; + + i = iter->entry_idx - family->n_ops - family->n_small_ops; + + if (family->split_ops[i + cnt].flags & GENL_CMD_CAP_DO) { + iter->doit = family->split_ops[i + cnt]; + genl_op_fill_in_reject_policy_split(family, &iter->doit); + cnt++; + } else { + memset(&iter->doit, 0, sizeof(iter->doit)); + } + + if (i + cnt < family->n_split_ops && + family->split_ops[i + cnt].flags & GENL_CMD_CAP_DUMP) { + iter->dumpit = family->split_ops[i + cnt]; + genl_op_fill_in_reject_policy_split(family, &iter->dumpit); + cnt++; + } else { + memset(&iter->dumpit, 0, sizeof(iter->dumpit));
+ } + + WARN_ON(!cnt); + iter->entry_idx += cnt; +} + +static int +genl_get_cmd_split(u32 cmd, u8 flag, const struct genl_family *family, + struct genl_split_ops *op) +{ + int i; + + for (i = 0; i < family->n_split_ops; i++) + if (family->split_ops[i].cmd == cmd && + family->split_ops[i].flags & flag) { + *op = family->split_ops[i]; + return 0; + } + + return -ENOENT; +} + static int genl_cmd_full_to_split(struct genl_split_ops *op, const struct genl_family *family, @@ -227,50 +292,60 @@ genl_get_cmd(u32 cmd, u8 flags, const struct genl_family *family, err = genl_get_cmd_full(cmd, family, &full); if (err == -ENOENT) err = genl_get_cmd_small(cmd, family, &full); - if (err) { - memset(op, 0, sizeof(*op)); - return err; - } + /* Found one of legacy forms */ + if (err == 0) + return genl_cmd_full_to_split(op, family, &full, flags); - return genl_cmd_full_to_split(op, family, &full, flags); + err = genl_get_cmd_split(cmd, flags, family, op); + if (err) + memset(op, 0, sizeof(*op)); + return err; } -struct genl_op_iter { - const struct genl_family *family; - struct genl_split_ops doit; - struct genl_split_ops dumpit; - int i; - u32 cmd; - u8 flags; -}; - static bool genl_op_iter_init(const struct genl_family *family, struct genl_op_iter *iter) { iter->family = family; - iter->i = 0; + iter->cmd_idx = 0; + iter->entry_idx = 0; iter->flags = 0; - return iter->family->n_ops + iter->family->n_small_ops; + return iter->family->n_ops + + iter->family->n_small_ops + + iter->family->n_split_ops; } static bool genl_op_iter_next(struct genl_op_iter *iter) { const struct genl_family *family = iter->family; + bool legacy_op = true; struct genl_ops op; - if (iter->i < family->n_ops) - genl_op_from_full(family, iter->i, &op); - else if (iter->i < family->n_ops + family->n_small_ops) - genl_op_from_small(family, iter->i - family->n_ops, &op); - else + if (iter->entry_idx < family->n_ops) { + genl_op_from_full(family, iter->entry_idx, &op); + } else if (iter->entry_idx < family->n_ops + family->n_small_ops) { + genl_op_from_small(family, iter->entry_idx - family->n_ops, + &op); + } else if (iter->entry_idx < + family->n_ops + family->n_small_ops + family->n_split_ops) { + legacy_op = false; + /* updates entry_idx */ + genl_op_from_split(iter); + } else { return false; + } - iter->i++; + iter->cmd_idx++; - genl_cmd_full_to_split(&iter->doit, family, &op, GENL_CMD_CAP_DO); - genl_cmd_full_to_split(&iter->dumpit, family, &op, GENL_CMD_CAP_DUMP); + if (legacy_op) { + iter->entry_idx++; + + genl_cmd_full_to_split(&iter->doit, family, + &op, GENL_CMD_CAP_DO); + genl_cmd_full_to_split(&iter->dumpit, family, + &op, GENL_CMD_CAP_DUMP); + } iter->cmd = iter->doit.cmd | iter->dumpit.cmd; iter->flags = iter->doit.flags | iter->dumpit.flags; @@ -286,7 +361,7 @@ genl_op_iter_copy(struct genl_op_iter *dst, struct genl_op_iter *src) static unsigned int genl_op_iter_idx(struct genl_op_iter *iter) { - return iter->i; + return iter->cmd_idx; } static int genl_allocate_reserve_groups(int n_groups, int *first_id) @@ -454,12 +529,22 @@ static void genl_unregister_mc_groups(const struct genl_family *family) } } +static bool genl_split_op_check(const struct genl_split_ops *op) +{ + if (WARN_ON(hweight8(op->flags & (GENL_CMD_CAP_DO | + GENL_CMD_CAP_DUMP)) != 1)) + return true; + return false; +} + static int genl_validate_ops(const struct genl_family *family) { struct genl_op_iter i, j; + unsigned int s; if (WARN_ON(family->n_ops && !family->ops) || - WARN_ON(family->n_small_ops && !family->small_ops)) + WARN_ON(family->n_small_ops && 
!family->small_ops) || + WARN_ON(family->n_split_ops && !family->split_ops)) return -EINVAL; for (genl_op_iter_init(family, &i); genl_op_iter_next(&i); ) { @@ -477,6 +562,39 @@ static int genl_validate_ops(const struct genl_family *family) } } + if (family->n_split_ops) { + if (genl_split_op_check(&family->split_ops[0])) + return -EINVAL; + } + + for (s = 1; s < family->n_split_ops; s++) { + const struct genl_split_ops *a, *b; + + a = &family->split_ops[s - 1]; + b = &family->split_ops[s]; + + if (genl_split_op_check(b)) + return -EINVAL; + + /* Check sort order */ + if (a->cmd < b->cmd) + continue; + + if (a->internal_flags != b->internal_flags || + ((a->flags ^ b->flags) & ~(GENL_CMD_CAP_DO | + GENL_CMD_CAP_DUMP))) { + WARN_ON(1); + return -EINVAL; + } + + if ((a->flags & GENL_CMD_CAP_DO) && + (b->flags & GENL_CMD_CAP_DUMP)) + continue; + + WARN_ON(1); + return -EINVAL; + } + return 0; } -- cgit v1.2.3 From ca71277f36e0781db663aedeb5fc1e26e7c144c4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 6 Nov 2022 15:34:14 -0500 Subject: net: move the ct helper function to nf_conntrack_helper for ovs and tc Move ovs_ct_helper from openvswitch to nf_conntrack_helper and rename it to nf_ct_helper so that it can be used in TC act_ct in the next patch. Note that it also adds checks for the family and proto, since in TC act_ct packets are not guaranteed to have the correct family and proto. Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Paolo Abeni --- include/net/netfilter/nf_conntrack_helper.h | 3 ++ net/netfilter/nf_conntrack_helper.c | 69 +++++++++++++++++++++++++++++ net/openvswitch/conntrack.c | 61 +------------------- 3 files changed, 73 insertions(+), 60 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 9939c366f720..b6676249eeeb 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -115,6 +115,9 @@ struct nf_conn_help *nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp); int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags); +int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, u16 proto); + void nf_ct_helper_destroy(struct nf_conn *ct); static inline struct nf_conn_help *nfct_help(const struct nf_conn *ct) diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index ff737a76052e..88039eedadea 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -26,7 +26,9 @@ #include #include #include +#include #include +#include static DEFINE_MUTEX(nf_ct_helper_mutex); struct hlist_head *nf_ct_helper_hash __read_mostly; @@ -240,6 +242,73 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, } EXPORT_SYMBOL_GPL(__nf_ct_try_assign_helper); +/* 'skb' should already be pulled to nh_ofs.
*/ +int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, u16 proto) +{ + const struct nf_conntrack_helper *helper; + const struct nf_conn_help *help; + unsigned int protoff; + int err; + + if (ctinfo == IP_CT_RELATED_REPLY) + return NF_ACCEPT; + + help = nfct_help(ct); + if (!help) + return NF_ACCEPT; + + helper = rcu_dereference(help->helper); + if (!helper) + return NF_ACCEPT; + + if (helper->tuple.src.l3num != NFPROTO_UNSPEC && + helper->tuple.src.l3num != proto) + return NF_ACCEPT; + + switch (proto) { + case NFPROTO_IPV4: + protoff = ip_hdrlen(skb); + proto = ip_hdr(skb)->protocol; + break; + case NFPROTO_IPV6: { + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + __be16 frag_off; + int ofs; + + ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, + &frag_off); + if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { + pr_debug("proto header not found\n"); + return NF_ACCEPT; + } + protoff = ofs; + proto = nexthdr; + break; + } + default: + WARN_ONCE(1, "helper invoked on non-IP family!"); + return NF_DROP; + } + + if (helper->tuple.dst.protonum != proto) + return NF_ACCEPT; + + err = helper->help(skb, protoff, ct, ctinfo); + if (err != NF_ACCEPT) + return err; + + /* Adjust seqs after helper. This is needed due to some helpers (e.g., + * FTP with NAT) adusting the TCP payload size when mangling IP + * addresses and/or port numbers in the text-based control connection. + */ + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && + !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) + return NF_DROP; + return NF_ACCEPT; +} +EXPORT_SYMBOL_GPL(nf_ct_helper); + /* appropriate ct lock protecting must be taken by caller */ static int unhelp(struct nf_conn *ct, void *me) { diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index c7b10234cf7c..18f54fa38e8f 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -434,65 +434,6 @@ static int ovs_ct_set_labels(struct nf_conn *ct, struct sw_flow_key *key, return 0; } -/* 'skb' should already be pulled to nh_ofs. */ -static int ovs_ct_helper(struct sk_buff *skb, u16 proto) -{ - const struct nf_conntrack_helper *helper; - const struct nf_conn_help *help; - enum ip_conntrack_info ctinfo; - unsigned int protoff; - struct nf_conn *ct; - int err; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return NF_ACCEPT; - - help = nfct_help(ct); - if (!help) - return NF_ACCEPT; - - helper = rcu_dereference(help->helper); - if (!helper) - return NF_ACCEPT; - - switch (proto) { - case NFPROTO_IPV4: - protoff = ip_hdrlen(skb); - break; - case NFPROTO_IPV6: { - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - __be16 frag_off; - int ofs; - - ofs = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, - &frag_off); - if (ofs < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return NF_ACCEPT; - } - protoff = ofs; - break; - } - default: - WARN_ONCE(1, "helper invoked on non-IP family!"); - return NF_DROP; - } - - err = helper->help(skb, protoff, ct, ctinfo); - if (err != NF_ACCEPT) - return err; - - /* Adjust seqs after helper. This is needed due to some helpers (e.g., - * FTP with NAT) adusting the TCP payload size when mangling IP - * addresses and/or port numbers in the text-based control connection. - */ - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) - return NF_DROP; - return NF_ACCEPT; -} - /* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero * value if 'skb' is freed. 
*/ @@ -1038,7 +979,7 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, */ if ((nf_ct_is_confirmed(ct) ? !cached || add_helper : info->commit) && - ovs_ct_helper(skb, info->family) != NF_ACCEPT) { + nf_ct_helper(skb, ct, ctinfo, info->family) != NF_ACCEPT) { return -EINVAL; } -- cgit v1.2.3 From f96cba2eb923c025014fe74a50e104b7c5234feb Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 6 Nov 2022 15:34:15 -0500 Subject: net: move add ct helper function to nf_conntrack_helper for ovs and tc Move ovs_ct_add_helper from openvswitch to nf_conntrack_helper and rename as nf_ct_add_helper, so that it can be used in TC act_ct in the next patch. Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Paolo Abeni --- include/net/netfilter/nf_conntrack_helper.h | 2 ++ net/netfilter/nf_conntrack_helper.c | 31 ++++++++++++++++++++ net/openvswitch/conntrack.c | 44 ++++------------------------- 3 files changed, 38 insertions(+), 39 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index b6676249eeeb..f30b1694b690 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -117,6 +117,8 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, u16 proto); +int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, + u8 proto, bool nat, struct nf_conntrack_helper **hp); void nf_ct_helper_destroy(struct nf_conn *ct); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 88039eedadea..48ea6d0264b5 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -309,6 +309,37 @@ int nf_ct_helper(struct sk_buff *skb, struct nf_conn *ct, } EXPORT_SYMBOL_GPL(nf_ct_helper); +int nf_ct_add_helper(struct nf_conn *ct, const char *name, u8 family, + u8 proto, bool nat, struct nf_conntrack_helper **hp) +{ + struct nf_conntrack_helper *helper; + struct nf_conn_help *help; + int ret = 0; + + helper = nf_conntrack_helper_try_module_get(name, family, proto); + if (!helper) + return -EINVAL; + + help = nf_ct_helper_ext_add(ct, GFP_KERNEL); + if (!help) { + nf_conntrack_helper_put(helper); + return -ENOMEM; + } +#if IS_ENABLED(CONFIG_NF_NAT) + if (nat) { + ret = nf_nat_helper_try_module_get(name, family, proto); + if (ret) { + nf_conntrack_helper_put(helper); + return ret; + } + } +#endif + rcu_assign_pointer(help->helper, helper); + *hp = helper; + return ret; +} +EXPORT_SYMBOL_GPL(nf_ct_add_helper); + /* appropriate ct lock protecting must be taken by caller */ static int unhelp(struct nf_conn *ct, void *me) { diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 18f54fa38e8f..4348321856af 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1291,43 +1291,6 @@ int ovs_ct_clear(struct sk_buff *skb, struct sw_flow_key *key) return 0; } -static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name, - const struct sw_flow_key *key, bool log) -{ - struct nf_conntrack_helper *helper; - struct nf_conn_help *help; - int ret = 0; - - helper = nf_conntrack_helper_try_module_get(name, info->family, - key->ip.proto); - if (!helper) { - OVS_NLERR(log, "Unknown helper \"%s\"", name); - return -EINVAL; - } - - help = nf_ct_helper_ext_add(info->ct, GFP_KERNEL); - if (!help) { - nf_conntrack_helper_put(helper); - 
return -ENOMEM; - } - -#if IS_ENABLED(CONFIG_NF_NAT) - if (info->nat) { - ret = nf_nat_helper_try_module_get(name, info->family, - key->ip.proto); - if (ret) { - nf_conntrack_helper_put(helper); - OVS_NLERR(log, "Failed to load \"%s\" NAT helper, error: %d", - name, ret); - return ret; - } - } -#endif - rcu_assign_pointer(help->helper, helper); - info->helper = helper; - return ret; -} - #if IS_ENABLED(CONFIG_NF_NAT) static int parse_nat(const struct nlattr *attr, struct ovs_conntrack_info *info, bool log) @@ -1661,9 +1624,12 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr, } if (helper) { - err = ovs_ct_add_helper(&ct_info, helper, key, log); - if (err) + err = nf_ct_add_helper(ct_info.ct, helper, ct_info.family, + key->ip.proto, ct_info.nat, &ct_info.helper); + if (err) { + OVS_NLERR(log, "Failed to add %s helper %d", helper, err); goto err_free_ct; + } } err = ovs_nla_add_action(sfa, OVS_ACTION_ATTR_CT, &ct_info, -- cgit v1.2.3 From a21b06e7319129994f339ed47f512bbe57b77f5b Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 6 Nov 2022 15:34:17 -0500 Subject: net: sched: add helper support in act_ct Add helper support in act_ct for OVS actions=ct(alg=xxx) offloading, corresponding to commit cae3a2627520 ("openvswitch: Allow attaching helpers to ct action") in the OVS kernel part. The difference is that when adding TC actions, the family and proto cannot be obtained from the filter/match, so besides the helper name in tb[TCA_CT_HELPER_NAME], we also need to send the family in tb[TCA_CT_HELPER_FAMILY] and the proto in tb[TCA_CT_HELPER_PROTO] to the kernel. Acked-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Signed-off-by: Paolo Abeni --- include/net/tc_act/tc_ct.h | 1 + include/uapi/linux/tc_act/tc_ct.h | 3 ++ net/sched/act_ct.c | 89 +++++++++++++++++++++++++++++++++++---- 3 files changed, 85 insertions(+), 8 deletions(-) (limited to 'include/net') diff --git a/include/net/tc_act/tc_ct.h b/include/net/tc_act/tc_ct.h index 8250d6f0a462..b24ea2d9400b 100644 --- a/include/net/tc_act/tc_ct.h +++ b/include/net/tc_act/tc_ct.h @@ -10,6 +10,7 @@ #include struct tcf_ct_params { + struct nf_conntrack_helper *helper; struct nf_conn *tmpl; u16 zone; diff --git a/include/uapi/linux/tc_act/tc_ct.h b/include/uapi/linux/tc_act/tc_ct.h index 5fb1d7ac1027..6c5200f0ed38 100644 --- a/include/uapi/linux/tc_act/tc_ct.h +++ b/include/uapi/linux/tc_act/tc_ct.h @@ -22,6 +22,9 @@ enum { TCA_CT_NAT_PORT_MIN, /* be16 */ TCA_CT_NAT_PORT_MAX, /* be16 */ TCA_CT_PAD, + TCA_CT_HELPER_NAME, /* string */ + TCA_CT_HELPER_FAMILY, /* u8 */ + TCA_CT_HELPER_PROTO, /* u8 */ __TCA_CT_MAX }; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index 193a460a9d7f..da0b7f665277 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -33,6 +33,7 @@ #include #include #include +#include #include static struct workqueue_struct *act_ct_wq; @@ -655,7 +656,7 @@ struct tc_ct_action_net { /* Determine whether skb->_nfct is equal to the result of conntrack lookup.
*/ static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb, - u16 zone_id, bool force) + struct tcf_ct_params *p) { enum ip_conntrack_info ctinfo; struct nf_conn *ct; @@ -665,11 +666,19 @@ static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb, return false; if (!net_eq(net, read_pnet(&ct->ct_net))) goto drop_ct; - if (nf_ct_zone(ct)->id != zone_id) + if (nf_ct_zone(ct)->id != p->zone) goto drop_ct; + if (p->helper) { + struct nf_conn_help *help; + + help = nf_ct_ext_find(ct, NF_CT_EXT_HELPER); + if (help && rcu_access_pointer(help->helper) != p->helper) + goto drop_ct; + } /* Force conntrack entry direction. */ - if (force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { + if ((p->ct_action & TCA_CT_ACT_FORCE) && + CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) { if (nf_ct_is_confirmed(ct)) nf_ct_kill(ct); @@ -832,6 +841,13 @@ out_free: static void tcf_ct_params_free(struct tcf_ct_params *params) { + if (params->helper) { +#if IS_ENABLED(CONFIG_NF_NAT) + if (params->ct_action & TCA_CT_ACT_NAT) + nf_nat_helper_put(params->helper); +#endif + nf_conntrack_helper_put(params->helper); + } if (params->ct_ft) tcf_ct_flow_table_put(params->ct_ft); if (params->tmpl) @@ -1026,13 +1042,14 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, struct tcf_result *res) { struct net *net = dev_net(skb->dev); - bool cached, commit, clear, force; enum ip_conntrack_info ctinfo; struct tcf_ct *c = to_ct(a); struct nf_conn *tmpl = NULL; struct nf_hook_state state; + bool cached, commit, clear; int nh_ofs, err, retval; struct tcf_ct_params *p; + bool add_helper = false; bool skip_add = false; bool defrag = false; struct nf_conn *ct; @@ -1043,7 +1060,6 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, retval = READ_ONCE(c->tcf_action); commit = p->ct_action & TCA_CT_ACT_COMMIT; clear = p->ct_action & TCA_CT_ACT_CLEAR; - force = p->ct_action & TCA_CT_ACT_FORCE; tmpl = p->tmpl; tcf_lastuse_update(&c->tcf_tm); @@ -1086,7 +1102,7 @@ static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a, * actually run the packet through conntrack twice unless it's for a * different zone. */ - cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force); + cached = tcf_ct_skb_nfct_cached(net, skb, p); if (!cached) { if (tcf_ct_flow_table_lookup(p, skb, family)) { skip_add = true; @@ -1119,6 +1135,22 @@ do_nat: if (err != NF_ACCEPT) goto drop; + if (!nf_ct_is_confirmed(ct) && commit && p->helper && !nfct_help(ct)) { + err = __nf_ct_try_assign_helper(ct, p->tmpl, GFP_ATOMIC); + if (err) + goto drop; + add_helper = true; + if (p->ct_action & TCA_CT_ACT_NAT && !nfct_seqadj(ct)) { + if (!nfct_seqadj_ext_add(ct)) + goto drop; + } + } + + if (nf_ct_is_confirmed(ct) ? 
((!cached && !skip_add) || add_helper) : commit) { + if (nf_ct_helper(skb, ct, ctinfo, family) != NF_ACCEPT) + goto drop; + } + if (commit) { tcf_ct_act_set_mark(ct, p->mark, p->mark_mask); tcf_ct_act_set_labels(ct, p->labels, p->labels_mask); @@ -1167,6 +1199,9 @@ static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = { [TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)), [TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 }, [TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 }, + [TCA_CT_HELPER_NAME] = { .type = NLA_STRING, .len = NF_CT_HELPER_NAME_LEN }, + [TCA_CT_HELPER_FAMILY] = { .type = NLA_U8 }, + [TCA_CT_HELPER_PROTO] = { .type = NLA_U8 }, }; static int tcf_ct_fill_params_nat(struct tcf_ct_params *p, @@ -1256,8 +1291,9 @@ static int tcf_ct_fill_params(struct net *net, { struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id); struct nf_conntrack_zone zone; + int err, family, proto, len; struct nf_conn *tmpl; - int err; + char *name; p->zone = NF_CT_DEFAULT_ZONE_ID; @@ -1318,10 +1354,31 @@ static int tcf_ct_fill_params(struct net *net, NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template"); return -ENOMEM; } - __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); p->tmpl = tmpl; + if (tb[TCA_CT_HELPER_NAME]) { + name = nla_data(tb[TCA_CT_HELPER_NAME]); + len = nla_len(tb[TCA_CT_HELPER_NAME]); + if (len > 16 || name[len - 1] != '\0') { + NL_SET_ERR_MSG_MOD(extack, "Failed to parse helper name."); + err = -EINVAL; + goto err; + } + family = tb[TCA_CT_HELPER_FAMILY] ? nla_get_u8(tb[TCA_CT_HELPER_FAMILY]) : AF_INET; + proto = tb[TCA_CT_HELPER_PROTO] ? nla_get_u8(tb[TCA_CT_HELPER_PROTO]) : IPPROTO_TCP; + err = nf_ct_add_helper(tmpl, name, family, proto, + p->ct_action & TCA_CT_ACT_NAT, &p->helper); + if (err) { + NL_SET_ERR_MSG_MOD(extack, "Failed to add helper"); + goto err; + } + } + __set_bit(IPS_CONFIRMED_BIT, &tmpl->status); return 0; +err: + nf_ct_put(p->tmpl); + p->tmpl = NULL; + return err; } static int tcf_ct_init(struct net *net, struct nlattr *nla, @@ -1490,6 +1547,19 @@ static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p) return 0; } +static int tcf_ct_dump_helper(struct sk_buff *skb, struct nf_conntrack_helper *helper) +{ + if (!helper) + return 0; + + if (nla_put_string(skb, TCA_CT_HELPER_NAME, helper->name) || + nla_put_u8(skb, TCA_CT_HELPER_FAMILY, helper->tuple.src.l3num) || + nla_put_u8(skb, TCA_CT_HELPER_PROTO, helper->tuple.dst.protonum)) + return -1; + + return 0; +} + static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) { @@ -1542,6 +1612,9 @@ static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a, if (tcf_ct_dump_nat(skb, p)) goto nla_put_failure; + if (tcf_ct_dump_helper(skb, p->helper)) + goto nla_put_failure; + skip_dump: if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt)) goto nla_put_failure; -- cgit v1.2.3 From 42fb06b391ace2aec5cdb1ebb8ff668f0a34332f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Oct 2022 08:49:29 +0100 Subject: net: Change the udp encap_err_rcv to allow use of {ip,ipv6}_icmp_error() Change the udp encap_err_rcv signature to match ip_icmp_error() and ipv6_icmp_error() so that those can be used from the called function and export them. 
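As an illustration of the new calling convention, a tunnel's error handler now
receives the pre-parsed ICMP context directly. The sketch below is hypothetical
(the function name is made up), but the parameters are exactly what
__udp4_lib_err() and __udp6_lib_err() now pass through, and
ip_icmp_error()/ipv6_icmp_error() are the helpers this change exports:

static void example_encap_err_rcv(struct sock *sk, struct sk_buff *skb,
				  int err, __be16 port, u32 info, u8 *payload)
{
	/* The error code, remote port, info word and payload arrive
	 * pre-parsed, so the handler no longer has to dig the UDP header
	 * back out of the skb before queuing a standard error report.
	 */
	if (ip_hdr(skb)->version == IPVERSION)
		ip_icmp_error(sk, skb, err, port, info, payload);
	else
		ipv6_icmp_error(sk, skb, err, port, info, payload);
}
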
Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: netdev@vger.kernel.org --- include/linux/udp.h | 3 +- include/net/udp_tunnel.h | 4 +-- net/ipv4/ip_sockglue.c | 1 + net/ipv4/udp.c | 3 +- net/ipv6/datagram.c | 1 + net/ipv6/udp.c | 3 +- net/rxrpc/ar-internal.h | 2 +- net/rxrpc/peer_event.c | 71 ++++++++++++------------------------------------ 8 files changed, 28 insertions(+), 60 deletions(-) (limited to 'include/net') diff --git a/include/linux/udp.h b/include/linux/udp.h index 5cdba00a904a..dea57aa37df6 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -70,7 +70,8 @@ struct udp_sock { * For encapsulation sockets. */ int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); - void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); + void (*encap_err_rcv)(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload); int (*encap_err_lookup)(struct sock *sk, struct sk_buff *skb); void (*encap_destroy)(struct sock *sk); diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h index 72394f441dad..0ca9b7a11baf 100644 --- a/include/net/udp_tunnel.h +++ b/include/net/udp_tunnel.h @@ -68,8 +68,8 @@ typedef int (*udp_tunnel_encap_rcv_t)(struct sock *sk, struct sk_buff *skb); typedef int (*udp_tunnel_encap_err_lookup_t)(struct sock *sk, struct sk_buff *skb); typedef void (*udp_tunnel_encap_err_rcv_t)(struct sock *sk, - struct sk_buff *skb, - unsigned int udp_offset); + struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload); typedef void (*udp_tunnel_encap_destroy_t)(struct sock *sk); typedef struct sk_buff *(*udp_tunnel_gro_receive_t)(struct sock *sk, struct list_head *head, diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 5f16807d3235..9f92ae35bb01 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -433,6 +433,7 @@ void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, } kfree_skb(skb); } +EXPORT_SYMBOL_GPL(ip_icmp_error); void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 port, u32 info) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 89accc3c8bb3..b859d6c8298e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -784,7 +784,8 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) if (tunnel) { /* ...not for tunnels though: we don't have a sending socket */ if (udp_sk(sk)->encap_err_rcv) - udp_sk(sk)->encap_err_rcv(sk, skb, iph->ihl << 2); + udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, info, + (u8 *)(uh+1)); goto out; } if (!inet->recverr) { diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index df7e032ce87d..7c7155b48f17 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -334,6 +334,7 @@ void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, if (sock_queue_err_skb(sk, skb)) kfree_skb(skb); } +EXPORT_SYMBOL_GPL(ipv6_icmp_error); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info) { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 297f7cc06044..4bc3fc27ec78 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -631,7 +631,8 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, /* Tunnels don't have an application socket: don't pass errors back */ if (tunnel) { if (udp_sk(sk)->encap_err_rcv) - udp_sk(sk)->encap_err_rcv(sk, skb, offset); + udp_sk(sk)->encap_err_rcv(sk, skb, err, uh->dest, + ntohl(info), (u8 *)(uh+1)); goto out; } diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 
436a1e8d0abd..51270b2e49c3 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -998,7 +998,7 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); /* * peer_event.c */ -void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); +void rxrpc_encap_err_rcv(struct sock *, struct sk_buff *, int, __be16, u32, u8 *); void rxrpc_error_report(struct sock *); void rxrpc_peer_keepalive_worker(struct work_struct *); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index 32561e9567fe..d7d6d7aff985 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -29,20 +29,16 @@ static void rxrpc_distribute_error(struct rxrpc_peer *, int, */ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, struct sk_buff *skb, - unsigned int udp_offset, - unsigned int *info, + __be16 udp_port, struct sockaddr_rxrpc *srx) { struct iphdr *ip, *ip0 = ip_hdr(skb); struct icmphdr *icmp = icmp_hdr(skb); - struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); _enter("%u,%u,%u", ip0->protocol, icmp->type, icmp->code); switch (icmp->type) { case ICMP_DEST_UNREACH: - *info = ntohs(icmp->un.frag.mtu); - fallthrough; case ICMP_TIME_EXCEEDED: case ICMP_PARAMETERPROB: ip = (struct iphdr *)((void *)icmp + 8); @@ -63,7 +59,7 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, case AF_INET: srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; - srx->transport.sin.sin_port = udp->dest; + srx->transport.sin.sin_port = udp_port; memcpy(&srx->transport.sin.sin_addr, &ip->daddr, sizeof(struct in_addr)); break; @@ -72,7 +68,7 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, case AF_INET6: srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; - srx->transport.sin.sin_port = udp->dest; + srx->transport.sin.sin_port = udp_port; memcpy(&srx->transport.sin.sin_addr, &ip->daddr, sizeof(struct in_addr)); break; @@ -93,20 +89,16 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, */ static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, struct sk_buff *skb, - unsigned int udp_offset, - unsigned int *info, + __be16 udp_port, struct sockaddr_rxrpc *srx) { struct icmp6hdr *icmp = icmp6_hdr(skb); struct ipv6hdr *ip, *ip0 = ipv6_hdr(skb); - struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); _enter("%u,%u,%u", ip0->nexthdr, icmp->icmp6_type, icmp->icmp6_code); switch (icmp->icmp6_type) { case ICMPV6_DEST_UNREACH: - *info = ntohl(icmp->icmp6_mtu); - fallthrough; case ICMPV6_PKT_TOOBIG: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: @@ -129,13 +121,13 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, _net("Rx ICMP6 on v4 sock"); srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; - srx->transport.sin.sin_port = udp->dest; + srx->transport.sin.sin_port = udp_port; memcpy(&srx->transport.sin.sin_addr, &ip->daddr.s6_addr32[3], sizeof(struct in_addr)); break; case AF_INET6: _net("Rx ICMP6"); - srx->transport.sin.sin_port = udp->dest; + srx->transport.sin.sin_port = udp_port; memcpy(&srx->transport.sin6.sin6_addr, &ip->daddr, sizeof(struct in6_addr)); break; @@ -152,15 +144,13 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, /* * Handle an error received on the local endpoint as a tunnel. 
*/ -void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, - unsigned int udp_offset) +void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, int err, + __be16 port, u32 info, u8 *payload) { struct sock_extended_err ee; struct sockaddr_rxrpc srx; struct rxrpc_local *local; struct rxrpc_peer *peer; - unsigned int info = 0; - int err; u8 version = ip_hdr(skb)->version; u8 type = icmp_hdr(skb)->type; u8 code = icmp_hdr(skb)->code; @@ -176,13 +166,11 @@ void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, switch (ip_hdr(skb)->version) { case IPVERSION: - peer = rxrpc_lookup_peer_icmp_rcu(local, skb, udp_offset, - &info, &srx); + peer = rxrpc_lookup_peer_icmp_rcu(local, skb, port, &srx); break; #ifdef CONFIG_AF_RXRPC_IPV6 case 6: - peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, udp_offset, - &info, &srx); + peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, port, &srx); break; #endif default: @@ -201,34 +189,12 @@ void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, switch (version) { case IPVERSION: - switch (type) { - case ICMP_DEST_UNREACH: - switch (code) { - case ICMP_FRAG_NEEDED: - rxrpc_adjust_mtu(peer, info); - rcu_read_unlock(); - rxrpc_put_peer(peer); - return; - default: - break; - } - - err = EHOSTUNREACH; - if (code <= NR_ICMP_UNREACH) { - /* Might want to do something different with - * non-fatal errors - */ - //harderr = icmp_err_convert[code].fatal; - err = icmp_err_convert[code].errno; - } - break; - - case ICMP_TIME_EXCEEDED: - err = EHOSTUNREACH; - break; - default: - err = EPROTO; - break; + if (type == ICMP_DEST_UNREACH && + code == ICMP_FRAG_NEEDED) { + rxrpc_adjust_mtu(peer, info); + rcu_read_unlock(); + rxrpc_put_peer(peer); + return; } ee.ee_origin = SO_EE_ORIGIN_ICMP; @@ -239,16 +205,13 @@ void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, #ifdef CONFIG_AF_RXRPC_IPV6 case 6: - switch (type) { - case ICMPV6_PKT_TOOBIG: + if (type == ICMPV6_PKT_TOOBIG) { rxrpc_adjust_mtu(peer, info); rcu_read_unlock(); rxrpc_put_peer(peer); return; } - icmpv6_err_convert(type, code, &err); - if (err == EACCES) err = EHOSTUNREACH; -- cgit v1.2.3 From 27fabd02abf30a9df9899f92d467591c7eabb1ba Mon Sep 17 00:00:00 2001 From: "Hans J. Schultz" Date: Tue, 8 Nov 2022 11:47:08 +0100 Subject: bridge: switchdev: Allow device drivers to install locked FDB entries When the bridge is offloaded to hardware, FDB entries are learned and aged-out by the hardware. Some device drivers synchronize the hardware and software FDBs by generating switchdev events towards the bridge. When a port is locked, the hardware must not learn autonomously, as otherwise any host will blindly gain authorization. Instead, the hardware should generate events regarding hosts that are trying to gain authorization and their MAC addresses should be notified by the device driver as locked FDB entries towards the bridge driver. Allow device drivers to notify the bridge driver about such entries by extending the 'switchdev_notifier_fdb_info' structure with the 'locked' bit. The bit can only be set by device drivers and not by the bridge driver. Prevent a locked entry from being installed if MAB is not enabled on the bridge port. If an entry already exists in the bridge driver, reject the locked entry if the current entry does not have the "locked" flag set or if it points to a different port. The same semantics are implemented in the software data path. Signed-off-by: Hans J. 
Schultz Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Reviewed-by: Vladimir Oltean Acked-by: Nikolay Aleksandrov Signed-off-by: Jakub Kicinski --- include/net/switchdev.h | 1 + net/bridge/br.c | 3 ++- net/bridge/br_fdb.c | 22 ++++++++++++++++++++-- net/bridge/br_private.h | 2 +- net/bridge/br_switchdev.c | 4 ++++ 5 files changed, 28 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/include/net/switchdev.h b/include/net/switchdev.h index 7dcdc97c0bc3..ca0312b78294 100644 --- a/include/net/switchdev.h +++ b/include/net/switchdev.h @@ -248,6 +248,7 @@ struct switchdev_notifier_fdb_info { u16 vid; u8 added_by_user:1, is_local:1, + locked:1, offloaded:1; }; diff --git a/net/bridge/br.c b/net/bridge/br.c index 145999b8c355..4f5098d33a46 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -166,7 +166,8 @@ static int br_switchdev_event(struct notifier_block *unused, case SWITCHDEV_FDB_ADD_TO_BRIDGE: fdb_info = ptr; err = br_fdb_external_learn_add(br, p, fdb_info->addr, - fdb_info->vid, false); + fdb_info->vid, + fdb_info->locked, false); if (err) { err = notifier_from_errno(err); break; diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c index 3b83af4458b8..e69a872bfc1d 100644 --- a/net/bridge/br_fdb.c +++ b/net/bridge/br_fdb.c @@ -1139,7 +1139,7 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br, "FDB entry towards bridge must be permanent"); return -EINVAL; } - err = br_fdb_external_learn_add(br, p, addr, vid, true); + err = br_fdb_external_learn_add(br, p, addr, vid, false, true); } else { spin_lock_bh(&br->hash_lock); err = fdb_add_entry(br, p, addr, ndm, nlh_flags, vid, nfea_tb); @@ -1377,7 +1377,7 @@ void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p) } int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, - const unsigned char *addr, u16 vid, + const unsigned char *addr, u16 vid, bool locked, bool swdev_notify) { struct net_bridge_fdb_entry *fdb; @@ -1386,6 +1386,9 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, trace_br_fdb_external_learn_add(br, p, addr, vid); + if (locked && (!p || !(p->flags & BR_PORT_MAB))) + return -EINVAL; + spin_lock_bh(&br->hash_lock); fdb = br_fdb_find(br, addr, vid); @@ -1398,6 +1401,9 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, if (!p) flags |= BIT(BR_FDB_LOCAL); + if (locked) + flags |= BIT(BR_FDB_LOCKED); + fdb = fdb_create(br, p, addr, vid, flags); if (!fdb) { err = -ENOMEM; @@ -1405,6 +1411,13 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, } fdb_notify(br, fdb, RTM_NEWNEIGH, swdev_notify); } else { + if (locked && + (!test_bit(BR_FDB_LOCKED, &fdb->flags) || + READ_ONCE(fdb->dst) != p)) { + err = -EINVAL; + goto err_unlock; + } + fdb->updated = jiffies; if (READ_ONCE(fdb->dst) != p) { @@ -1421,6 +1434,11 @@ int br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, modified = true; } + if (locked != test_bit(BR_FDB_LOCKED, &fdb->flags)) { + change_bit(BR_FDB_LOCKED, &fdb->flags); + modified = true; + } + if (swdev_notify) set_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 4ce8b8e5ae0b..4c4fda930068 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -811,7 +811,7 @@ int br_fdb_sync_static(struct net_bridge *br, struct net_bridge_port *p); void br_fdb_unsync_static(struct net_bridge *br, struct net_bridge_port *p); int 
br_fdb_external_learn_add(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, - bool swdev_notify); + bool locked, bool swdev_notify); int br_fdb_external_learn_del(struct net_bridge *br, struct net_bridge_port *p, const unsigned char *addr, u16 vid, bool swdev_notify); diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c index 8f3d76c751dd..8a0abe35137d 100644 --- a/net/bridge/br_switchdev.c +++ b/net/bridge/br_switchdev.c @@ -136,6 +136,7 @@ static void br_switchdev_fdb_populate(struct net_bridge *br, item->added_by_user = test_bit(BR_FDB_ADDED_BY_USER, &fdb->flags); item->offloaded = test_bit(BR_FDB_OFFLOADED, &fdb->flags); item->is_local = test_bit(BR_FDB_LOCAL, &fdb->flags); + item->locked = false; item->info.dev = (!p || item->is_local) ? br->dev : p->dev; item->info.ctx = ctx; } @@ -146,6 +147,9 @@ br_switchdev_fdb_notify(struct net_bridge *br, { struct switchdev_notifier_fdb_info item; + if (test_bit(BR_FDB_LOCKED, &fdb->flags)) + return; + br_switchdev_fdb_populate(br, &item, fdb, NULL); switch (type) { -- cgit v1.2.3 From 2640a82bbc08393c846c7b55178079bb8ca31a8c Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 8 Nov 2022 11:47:10 +0100 Subject: devlink: Add packet traps for 802.1X operation Add packet traps for 802.1X operation. The "eapol" control trap is used to trap EAPOL packets and is required for the correct operation of the control plane. The "locked_port" drop trap can be enabled to gain visibility into packets that were dropped by the device due to the locked bridge port check. Signed-off-by: Ido Schimmel Reviewed-by: Petr Machata Signed-off-by: Petr Machata Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-trap.rst | 13 +++++++++++++ include/net/devlink.h | 9 +++++++++ net/core/devlink.c | 3 +++ 3 files changed, 25 insertions(+) (limited to 'include/net') diff --git a/Documentation/networking/devlink/devlink-trap.rst b/Documentation/networking/devlink/devlink-trap.rst index 90d1381b88de..2c14dfe69b3a 100644 --- a/Documentation/networking/devlink/devlink-trap.rst +++ b/Documentation/networking/devlink/devlink-trap.rst @@ -485,6 +485,16 @@ be added to the following table: - Traps incoming packets that the device decided to drop because the destination MAC is not configured in the MAC table and the interface is not in promiscuous mode + * - ``eapol`` + - ``control`` + - Traps "Extensible Authentication Protocol over LAN" (EAPOL) packets + specified in IEEE 802.1X + * - ``locked_port`` + - ``drop`` + - Traps packets that the device decided to drop because they failed the + locked bridge port check. That is, packets that were received via a + locked port and whose {SMAC, VID} does not correspond to an FDB entry + pointing to the port Driver-specific Packet Traps ============================ @@ -589,6 +599,9 @@ narrow. 
The description of these groups must be added to the following table: * - ``parser_error_drops`` - Contains packet traps for packets that were marked by the device during parsing as erroneous + * - ``eapol`` + - Contains packet traps for "Extensible Authentication Protocol over LAN" + (EAPOL) packets specified in IEEE 802.1X Packet Trap Policers ==================== diff --git a/include/net/devlink.h b/include/net/devlink.h index fa6e936af1a5..611a23a3deb2 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -894,6 +894,8 @@ enum devlink_trap_generic_id { DEVLINK_TRAP_GENERIC_ID_ESP_PARSING, DEVLINK_TRAP_GENERIC_ID_BLACKHOLE_NEXTHOP, DEVLINK_TRAP_GENERIC_ID_DMAC_FILTER, + DEVLINK_TRAP_GENERIC_ID_EAPOL, + DEVLINK_TRAP_GENERIC_ID_LOCKED_PORT, /* Add new generic trap IDs above */ __DEVLINK_TRAP_GENERIC_ID_MAX, @@ -930,6 +932,7 @@ enum devlink_trap_group_generic_id { DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_SAMPLE, DEVLINK_TRAP_GROUP_GENERIC_ID_ACL_TRAP, DEVLINK_TRAP_GROUP_GENERIC_ID_PARSER_ERROR_DROPS, + DEVLINK_TRAP_GROUP_GENERIC_ID_EAPOL, /* Add new generic trap group IDs above */ __DEVLINK_TRAP_GROUP_GENERIC_ID_MAX, @@ -1121,6 +1124,10 @@ enum devlink_trap_group_generic_id { "blackhole_nexthop" #define DEVLINK_TRAP_GENERIC_NAME_DMAC_FILTER \ "dmac_filter" +#define DEVLINK_TRAP_GENERIC_NAME_EAPOL \ + "eapol" +#define DEVLINK_TRAP_GENERIC_NAME_LOCKED_PORT \ + "locked_port" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_L2_DROPS \ "l2_drops" @@ -1174,6 +1181,8 @@ enum devlink_trap_group_generic_id { "acl_trap" #define DEVLINK_TRAP_GROUP_GENERIC_NAME_PARSER_ERROR_DROPS \ "parser_error_drops" +#define DEVLINK_TRAP_GROUP_GENERIC_NAME_EAPOL \ + "eapol" #define DEVLINK_TRAP_GENERIC(_type, _init_action, _id, _group_id, \ _metadata_cap) \ diff --git a/net/core/devlink.c b/net/core/devlink.c index ea0b319385fc..6bbe230c4ec5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -11734,6 +11734,8 @@ static const struct devlink_trap devlink_trap_generic[] = { DEVLINK_TRAP(ESP_PARSING, DROP), DEVLINK_TRAP(BLACKHOLE_NEXTHOP, DROP), DEVLINK_TRAP(DMAC_FILTER, DROP), + DEVLINK_TRAP(EAPOL, CONTROL), + DEVLINK_TRAP(LOCKED_PORT, DROP), }; #define DEVLINK_TRAP_GROUP(_id) \ @@ -11769,6 +11771,7 @@ static const struct devlink_trap_group devlink_trap_group_generic[] = { DEVLINK_TRAP_GROUP(ACL_SAMPLE), DEVLINK_TRAP_GROUP(ACL_TRAP), DEVLINK_TRAP_GROUP(PARSER_ERROR_DROPS), + DEVLINK_TRAP_GROUP(EAPOL), }; static int devlink_trap_generic_verify(const struct devlink_trap *trap) -- cgit v1.2.3 From fd325cd648f15eb9a8b32a68de3bafc72bcfe753 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 3 Nov 2022 12:16:25 -0700 Subject: net: mana: Move header files to a common location In preparation to add MANA RDMA driver, move all the required header files to a common location for use by both Ethernet and RDMA drivers. 
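With the headers under include/net/mana/, any in-tree consumer picks them up by
path rather than by a driver-relative include. A minimal, hypothetical consumer
would look like the sketch below (mana_gd_is_mana() is one of the inline
helpers the move makes visible outside the Ethernet driver):

#include <net/mana/gdma.h>
#include <net/mana/mana.h>

static bool example_dev_is_mana(struct gdma_dev *gd)
{
	/* Inline helpers from gdma.h are now reachable outside
	 * drivers/net/ethernet/microsoft/mana/, e.g. by the RDMA driver.
	 */
	return mana_gd_is_mana(gd);
}
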
Reviewed-by: Dexuan Cui Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-8-git-send-email-longli@linuxonhyperv.com Acked-by: Haiyang Zhang Signed-off-by: Leon Romanovsky --- MAINTAINERS | 1 + drivers/net/ethernet/microsoft/mana/gdma.h | 689 --------------------- drivers/net/ethernet/microsoft/mana/gdma_main.c | 2 +- drivers/net/ethernet/microsoft/mana/hw_channel.c | 4 +- drivers/net/ethernet/microsoft/mana/hw_channel.h | 195 ------ drivers/net/ethernet/microsoft/mana/mana.h | 650 ------------------- .../net/ethernet/microsoft/mana/mana_auxiliary.h | 10 - drivers/net/ethernet/microsoft/mana/mana_bpf.c | 2 +- drivers/net/ethernet/microsoft/mana/mana_en.c | 4 +- drivers/net/ethernet/microsoft/mana/mana_ethtool.c | 2 +- drivers/net/ethernet/microsoft/mana/shm_channel.c | 2 +- drivers/net/ethernet/microsoft/mana/shm_channel.h | 21 - include/net/mana/gdma.h | 689 +++++++++++++++++++++ include/net/mana/hw_channel.h | 195 ++++++ include/net/mana/mana.h | 650 +++++++++++++++++++ include/net/mana/mana_auxiliary.h | 10 + include/net/mana/shm_channel.h | 21 + 17 files changed, 1574 insertions(+), 1573 deletions(-) delete mode 100644 drivers/net/ethernet/microsoft/mana/gdma.h delete mode 100644 drivers/net/ethernet/microsoft/mana/hw_channel.h delete mode 100644 drivers/net/ethernet/microsoft/mana/mana.h delete mode 100644 drivers/net/ethernet/microsoft/mana/mana_auxiliary.h delete mode 100644 drivers/net/ethernet/microsoft/mana/shm_channel.h create mode 100644 include/net/mana/gdma.h create mode 100644 include/net/mana/hw_channel.h create mode 100644 include/net/mana/mana.h create mode 100644 include/net/mana/mana_auxiliary.h create mode 100644 include/net/mana/shm_channel.h (limited to 'include/net') diff --git a/MAINTAINERS b/MAINTAINERS index 379945f82a64..441a65d41eb4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9541,6 +9541,7 @@ F: include/asm-generic/hyperv-tlfs.h F: include/asm-generic/mshyperv.h F: include/clocksource/hyperv_timer.h F: include/linux/hyperv.h +F: include/net/mana F: include/uapi/linux/hyperv.h F: net/vmw_vsock/hyperv_transport.c F: tools/hv/ diff --git a/drivers/net/ethernet/microsoft/mana/gdma.h b/drivers/net/ethernet/microsoft/mana/gdma.h deleted file mode 100644 index 72eaec2470c0..000000000000 --- a/drivers/net/ethernet/microsoft/mana/gdma.h +++ /dev/null @@ -1,689 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* Copyright (c) 2021, Microsoft Corporation. */ - -#ifndef _GDMA_H -#define _GDMA_H - -#include -#include - -#include "shm_channel.h" - -/* Structures labeled with "HW DATA" are exchanged with the hardware. All of - * them are naturally aligned and hence don't need __packed. 
- */ - -enum gdma_request_type { - GDMA_VERIFY_VF_DRIVER_VERSION = 1, - GDMA_QUERY_MAX_RESOURCES = 2, - GDMA_LIST_DEVICES = 3, - GDMA_REGISTER_DEVICE = 4, - GDMA_DEREGISTER_DEVICE = 5, - GDMA_GENERATE_TEST_EQE = 10, - GDMA_CREATE_QUEUE = 12, - GDMA_DISABLE_QUEUE = 13, - GDMA_CREATE_DMA_REGION = 25, - GDMA_DMA_REGION_ADD_PAGES = 26, - GDMA_DESTROY_DMA_REGION = 27, -}; - -enum gdma_queue_type { - GDMA_INVALID_QUEUE, - GDMA_SQ, - GDMA_RQ, - GDMA_CQ, - GDMA_EQ, -}; - -enum gdma_work_request_flags { - GDMA_WR_NONE = 0, - GDMA_WR_OOB_IN_SGL = BIT(0), - GDMA_WR_PAD_BY_SGE0 = BIT(1), -}; - -enum gdma_eqe_type { - GDMA_EQE_COMPLETION = 3, - GDMA_EQE_TEST_EVENT = 64, - GDMA_EQE_HWC_INIT_EQ_ID_DB = 129, - GDMA_EQE_HWC_INIT_DATA = 130, - GDMA_EQE_HWC_INIT_DONE = 131, -}; - -enum { - GDMA_DEVICE_NONE = 0, - GDMA_DEVICE_HWC = 1, - GDMA_DEVICE_MANA = 2, -}; - -struct gdma_resource { - /* Protect the bitmap */ - spinlock_t lock; - - /* The bitmap size in bits. */ - u32 size; - - /* The bitmap tracks the resources. */ - unsigned long *map; -}; - -union gdma_doorbell_entry { - u64 as_uint64; - - struct { - u64 id : 24; - u64 reserved : 8; - u64 tail_ptr : 31; - u64 arm : 1; - } cq; - - struct { - u64 id : 24; - u64 wqe_cnt : 8; - u64 tail_ptr : 32; - } rq; - - struct { - u64 id : 24; - u64 reserved : 8; - u64 tail_ptr : 32; - } sq; - - struct { - u64 id : 16; - u64 reserved : 16; - u64 tail_ptr : 31; - u64 arm : 1; - } eq; -}; /* HW DATA */ - -struct gdma_msg_hdr { - u32 hdr_type; - u32 msg_type; - u16 msg_version; - u16 hwc_msg_id; - u32 msg_size; -}; /* HW DATA */ - -struct gdma_dev_id { - union { - struct { - u16 type; - u16 instance; - }; - - u32 as_uint32; - }; -}; /* HW DATA */ - -struct gdma_req_hdr { - struct gdma_msg_hdr req; - struct gdma_msg_hdr resp; /* The expected response */ - struct gdma_dev_id dev_id; - u32 activity_id; -}; /* HW DATA */ - -struct gdma_resp_hdr { - struct gdma_msg_hdr response; - struct gdma_dev_id dev_id; - u32 activity_id; - u32 status; - u32 reserved; -}; /* HW DATA */ - -struct gdma_general_req { - struct gdma_req_hdr hdr; -}; /* HW DATA */ - -#define GDMA_MESSAGE_V1 1 - -struct gdma_general_resp { - struct gdma_resp_hdr hdr; -}; /* HW DATA */ - -#define GDMA_STANDARD_HEADER_TYPE 0 - -static inline void mana_gd_init_req_hdr(struct gdma_req_hdr *hdr, u32 code, - u32 req_size, u32 resp_size) -{ - hdr->req.hdr_type = GDMA_STANDARD_HEADER_TYPE; - hdr->req.msg_type = code; - hdr->req.msg_version = GDMA_MESSAGE_V1; - hdr->req.msg_size = req_size; - - hdr->resp.hdr_type = GDMA_STANDARD_HEADER_TYPE; - hdr->resp.msg_type = code; - hdr->resp.msg_version = GDMA_MESSAGE_V1; - hdr->resp.msg_size = resp_size; -} - -/* The 16-byte struct is part of the GDMA work queue entry (WQE). 
*/ -struct gdma_sge { - u64 address; - u32 mem_key; - u32 size; -}; /* HW DATA */ - -struct gdma_wqe_request { - struct gdma_sge *sgl; - u32 num_sge; - - u32 inline_oob_size; - const void *inline_oob_data; - - u32 flags; - u32 client_data_unit; -}; - -enum gdma_page_type { - GDMA_PAGE_TYPE_4K, -}; - -#define GDMA_INVALID_DMA_REGION 0 - -struct gdma_mem_info { - struct device *dev; - - dma_addr_t dma_handle; - void *virt_addr; - u64 length; - - /* Allocated by the PF driver */ - u64 gdma_region; -}; - -#define REGISTER_ATB_MST_MKEY_LOWER_SIZE 8 - -struct gdma_dev { - struct gdma_context *gdma_context; - - struct gdma_dev_id dev_id; - - u32 pdid; - u32 doorbell; - u32 gpa_mkey; - - /* GDMA driver specific pointer */ - void *driver_data; - - struct auxiliary_device *adev; -}; - -#define MINIMUM_SUPPORTED_PAGE_SIZE PAGE_SIZE - -#define GDMA_CQE_SIZE 64 -#define GDMA_EQE_SIZE 16 -#define GDMA_MAX_SQE_SIZE 512 -#define GDMA_MAX_RQE_SIZE 256 - -#define GDMA_COMP_DATA_SIZE 0x3C - -#define GDMA_EVENT_DATA_SIZE 0xC - -/* The WQE size must be a multiple of the Basic Unit, which is 32 bytes. */ -#define GDMA_WQE_BU_SIZE 32 - -#define INVALID_PDID UINT_MAX -#define INVALID_DOORBELL UINT_MAX -#define INVALID_MEM_KEY UINT_MAX -#define INVALID_QUEUE_ID UINT_MAX -#define INVALID_PCI_MSIX_INDEX UINT_MAX - -struct gdma_comp { - u32 cqe_data[GDMA_COMP_DATA_SIZE / 4]; - u32 wq_num; - bool is_sq; -}; - -struct gdma_event { - u32 details[GDMA_EVENT_DATA_SIZE / 4]; - u8 type; -}; - -struct gdma_queue; - -struct mana_eq { - struct gdma_queue *eq; -}; - -typedef void gdma_eq_callback(void *context, struct gdma_queue *q, - struct gdma_event *e); - -typedef void gdma_cq_callback(void *context, struct gdma_queue *q); - -/* The 'head' is the producer index. For SQ/RQ, when the driver posts a WQE - * (Note: the WQE size must be a multiple of the 32-byte Basic Unit), the - * driver increases the 'head' in BUs rather than in bytes, and notifies - * the HW of the updated head. For EQ/CQ, the driver uses the 'head' to track - * the HW head, and increases the 'head' by 1 for every processed EQE/CQE. - * - * The 'tail' is the consumer index for SQ/RQ. After the CQE of the SQ/RQ is - * processed, the driver increases the 'tail' to indicate that WQEs have - * been consumed by the HW, so the driver can post new WQEs into the SQ/RQ. - * - * The driver doesn't use the 'tail' for EQ/CQ, because the driver ensures - * that the EQ/CQ is big enough so they can't overflow, and the driver uses - * the owner bits mechanism to detect if the queue has become empty. - */ -struct gdma_queue { - struct gdma_dev *gdma_dev; - - enum gdma_queue_type type; - u32 id; - - struct gdma_mem_info mem_info; - - void *queue_mem_ptr; - u32 queue_size; - - bool monitor_avl_buf; - - u32 head; - u32 tail; - - /* Extra fields specific to EQ/CQ. */ - union { - struct { - bool disable_needed; - - gdma_eq_callback *callback; - void *context; - - unsigned int msix_index; - - u32 log2_throttle_limit; - } eq; - - struct { - gdma_cq_callback *callback; - void *context; - - struct gdma_queue *parent; /* For CQ/EQ relationship */ - } cq; - }; -}; - -struct gdma_queue_spec { - enum gdma_queue_type type; - bool monitor_avl_buf; - unsigned int queue_size; - - /* Extra fields specific to EQ/CQ. 
*/ - union { - struct { - gdma_eq_callback *callback; - void *context; - - unsigned long log2_throttle_limit; - } eq; - - struct { - gdma_cq_callback *callback; - void *context; - - struct gdma_queue *parent_eq; - - } cq; - }; -}; - -struct gdma_irq_context { - void (*handler)(void *arg); - void *arg; -}; - -struct gdma_context { - struct device *dev; - - /* Per-vPort max number of queues */ - unsigned int max_num_queues; - unsigned int max_num_msix; - unsigned int num_msix_usable; - struct gdma_resource msix_resource; - struct gdma_irq_context *irq_contexts; - - /* This maps a CQ index to the queue structure. */ - unsigned int max_num_cqs; - struct gdma_queue **cq_table; - - /* Protect eq_test_event and test_event_eq_id */ - struct mutex eq_test_event_mutex; - struct completion eq_test_event; - u32 test_event_eq_id; - - bool is_pf; - phys_addr_t bar0_pa; - void __iomem *bar0_va; - void __iomem *shm_base; - void __iomem *db_page_base; - phys_addr_t phys_db_page_base; - u32 db_page_size; - - /* Shared memory chanenl (used to bootstrap HWC) */ - struct shm_channel shm_channel; - - /* Hardware communication channel (HWC) */ - struct gdma_dev hwc; - - /* Azure network adapter */ - struct gdma_dev mana; -}; - -#define MAX_NUM_GDMA_DEVICES 4 - -static inline bool mana_gd_is_mana(struct gdma_dev *gd) -{ - return gd->dev_id.type == GDMA_DEVICE_MANA; -} - -static inline bool mana_gd_is_hwc(struct gdma_dev *gd) -{ - return gd->dev_id.type == GDMA_DEVICE_HWC; -} - -u8 *mana_gd_get_wqe_ptr(const struct gdma_queue *wq, u32 wqe_offset); -u32 mana_gd_wq_avail_space(struct gdma_queue *wq); - -int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq); - -int mana_gd_create_hwc_queue(struct gdma_dev *gd, - const struct gdma_queue_spec *spec, - struct gdma_queue **queue_ptr); - -int mana_gd_create_mana_eq(struct gdma_dev *gd, - const struct gdma_queue_spec *spec, - struct gdma_queue **queue_ptr); - -int mana_gd_create_mana_wq_cq(struct gdma_dev *gd, - const struct gdma_queue_spec *spec, - struct gdma_queue **queue_ptr); - -void mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue); - -int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe); - -void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit); - -struct gdma_wqe { - u32 reserved :24; - u32 last_vbytes :8; - - union { - u32 flags; - - struct { - u32 num_sge :8; - u32 inline_oob_size_div4:3; - u32 client_oob_in_sgl :1; - u32 reserved1 :4; - u32 client_data_unit :14; - u32 reserved2 :2; - }; - }; -}; /* HW DATA */ - -#define INLINE_OOB_SMALL_SIZE 8 -#define INLINE_OOB_LARGE_SIZE 24 - -#define MAX_TX_WQE_SIZE 512 -#define MAX_RX_WQE_SIZE 256 - -struct gdma_cqe { - u32 cqe_data[GDMA_COMP_DATA_SIZE / 4]; - - union { - u32 as_uint32; - - struct { - u32 wq_num : 24; - u32 is_sq : 1; - u32 reserved : 4; - u32 owner_bits : 3; - }; - } cqe_info; -}; /* HW DATA */ - -#define GDMA_CQE_OWNER_BITS 3 - -#define GDMA_CQE_OWNER_MASK ((1 << GDMA_CQE_OWNER_BITS) - 1) - -#define SET_ARM_BIT 1 - -#define GDMA_EQE_OWNER_BITS 3 - -union gdma_eqe_info { - u32 as_uint32; - - struct { - u32 type : 8; - u32 reserved1 : 8; - u32 client_id : 2; - u32 reserved2 : 11; - u32 owner_bits : 3; - }; -}; /* HW DATA */ - -#define GDMA_EQE_OWNER_MASK ((1 << GDMA_EQE_OWNER_BITS) - 1) -#define INITIALIZED_OWNER_BIT(log2_num_entries) (1UL << (log2_num_entries)) - -struct gdma_eqe { - u32 details[GDMA_EVENT_DATA_SIZE / 4]; - u32 eqe_info; -}; /* HW DATA */ - -#define GDMA_REG_DB_PAGE_OFFSET 8 -#define GDMA_REG_DB_PAGE_SIZE 0x10 -#define 
GDMA_REG_SHM_OFFSET 0x18 - -#define GDMA_PF_REG_DB_PAGE_SIZE 0xD0 -#define GDMA_PF_REG_DB_PAGE_OFF 0xC8 -#define GDMA_PF_REG_SHM_OFF 0x70 - -#define GDMA_SRIOV_REG_CFG_BASE_OFF 0x108 - -#define MANA_PF_DEVICE_ID 0x00B9 -#define MANA_VF_DEVICE_ID 0x00BA - -struct gdma_posted_wqe_info { - u32 wqe_size_in_bu; -}; - -/* GDMA_GENERATE_TEST_EQE */ -struct gdma_generate_test_event_req { - struct gdma_req_hdr hdr; - u32 queue_index; -}; /* HW DATA */ - -/* GDMA_VERIFY_VF_DRIVER_VERSION */ -enum { - GDMA_PROTOCOL_V1 = 1, - GDMA_PROTOCOL_FIRST = GDMA_PROTOCOL_V1, - GDMA_PROTOCOL_LAST = GDMA_PROTOCOL_V1, -}; - -#define GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT BIT(0) - -#define GDMA_DRV_CAP_FLAGS1 GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT - -#define GDMA_DRV_CAP_FLAGS2 0 - -#define GDMA_DRV_CAP_FLAGS3 0 - -#define GDMA_DRV_CAP_FLAGS4 0 - -struct gdma_verify_ver_req { - struct gdma_req_hdr hdr; - - /* Mandatory fields required for protocol establishment */ - u64 protocol_ver_min; - u64 protocol_ver_max; - - /* Gdma Driver Capability Flags */ - u64 gd_drv_cap_flags1; - u64 gd_drv_cap_flags2; - u64 gd_drv_cap_flags3; - u64 gd_drv_cap_flags4; - - /* Advisory fields */ - u64 drv_ver; - u32 os_type; /* Linux = 0x10; Windows = 0x20; Other = 0x30 */ - u32 reserved; - u32 os_ver_major; - u32 os_ver_minor; - u32 os_ver_build; - u32 os_ver_platform; - u64 reserved_2; - u8 os_ver_str1[128]; - u8 os_ver_str2[128]; - u8 os_ver_str3[128]; - u8 os_ver_str4[128]; -}; /* HW DATA */ - -struct gdma_verify_ver_resp { - struct gdma_resp_hdr hdr; - u64 gdma_protocol_ver; - u64 pf_cap_flags1; - u64 pf_cap_flags2; - u64 pf_cap_flags3; - u64 pf_cap_flags4; -}; /* HW DATA */ - -/* GDMA_QUERY_MAX_RESOURCES */ -struct gdma_query_max_resources_resp { - struct gdma_resp_hdr hdr; - u32 status; - u32 max_sq; - u32 max_rq; - u32 max_cq; - u32 max_eq; - u32 max_db; - u32 max_mst; - u32 max_cq_mod_ctx; - u32 max_mod_cq; - u32 max_msix; -}; /* HW DATA */ - -/* GDMA_LIST_DEVICES */ -struct gdma_list_devices_resp { - struct gdma_resp_hdr hdr; - u32 num_of_devs; - u32 reserved; - struct gdma_dev_id devs[64]; -}; /* HW DATA */ - -/* GDMA_REGISTER_DEVICE */ -struct gdma_register_device_resp { - struct gdma_resp_hdr hdr; - u32 pdid; - u32 gpa_mkey; - u32 db_id; -}; /* HW DATA */ - -/* GDMA_CREATE_QUEUE */ -struct gdma_create_queue_req { - struct gdma_req_hdr hdr; - u32 type; - u32 reserved1; - u32 pdid; - u32 doolbell_id; - u64 gdma_region; - u32 reserved2; - u32 queue_size; - u32 log2_throttle_limit; - u32 eq_pci_msix_index; - u32 cq_mod_ctx_id; - u32 cq_parent_eq_id; - u8 rq_drop_on_overrun; - u8 rq_err_on_wqe_overflow; - u8 rq_chain_rec_wqes; - u8 sq_hw_db; - u32 reserved3; -}; /* HW DATA */ - -struct gdma_create_queue_resp { - struct gdma_resp_hdr hdr; - u32 queue_index; -}; /* HW DATA */ - -/* GDMA_DISABLE_QUEUE */ -struct gdma_disable_queue_req { - struct gdma_req_hdr hdr; - u32 type; - u32 queue_index; - u32 alloc_res_id_on_creation; -}; /* HW DATA */ - -/* GDMA_CREATE_DMA_REGION */ -struct gdma_create_dma_region_req { - struct gdma_req_hdr hdr; - - /* The total size of the DMA region */ - u64 length; - - /* The offset in the first page */ - u32 offset_in_page; - - /* enum gdma_page_type */ - u32 gdma_page_type; - - /* The total number of pages */ - u32 page_count; - - /* If page_addr_list_len is smaller than page_count, - * the remaining page addresses will be added via the - * message GDMA_DMA_REGION_ADD_PAGES. 
- */ - u32 page_addr_list_len; - u64 page_addr_list[]; -}; /* HW DATA */ - -struct gdma_create_dma_region_resp { - struct gdma_resp_hdr hdr; - u64 gdma_region; -}; /* HW DATA */ - -/* GDMA_DMA_REGION_ADD_PAGES */ -struct gdma_dma_region_add_pages_req { - struct gdma_req_hdr hdr; - - u64 gdma_region; - - u32 page_addr_list_len; - u32 reserved3; - - u64 page_addr_list[]; -}; /* HW DATA */ - -/* GDMA_DESTROY_DMA_REGION */ -struct gdma_destroy_dma_region_req { - struct gdma_req_hdr hdr; - - u64 gdma_region; -}; /* HW DATA */ - -int mana_gd_verify_vf_version(struct pci_dev *pdev); - -int mana_gd_register_device(struct gdma_dev *gd); -int mana_gd_deregister_device(struct gdma_dev *gd); - -int mana_gd_post_work_request(struct gdma_queue *wq, - const struct gdma_wqe_request *wqe_req, - struct gdma_posted_wqe_info *wqe_info); - -int mana_gd_post_and_ring(struct gdma_queue *queue, - const struct gdma_wqe_request *wqe, - struct gdma_posted_wqe_info *wqe_info); - -int mana_gd_alloc_res_map(u32 res_avail, struct gdma_resource *r); -void mana_gd_free_res_map(struct gdma_resource *r); - -void mana_gd_wq_ring_doorbell(struct gdma_context *gc, - struct gdma_queue *queue); - -int mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length, - struct gdma_mem_info *gmi); - -void mana_gd_free_memory(struct gdma_mem_info *gmi); - -int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, - u32 resp_len, void *resp); -#endif /* _GDMA_H */ diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index f0e22954d5c0..69795bc679e7 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -6,7 +6,7 @@ #include #include -#include "mana.h" +#include static u32 mana_gd_r32(struct gdma_context *g, u64 offset) { diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c index 543a5d5c304f..76829ab43d40 100644 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause /* Copyright (c) 2021, Microsoft Corporation. */ -#include "gdma.h" -#include "hw_channel.h" +#include +#include static int mana_hwc_get_msg_index(struct hw_channel_context *hwc, u16 *msg_id) { diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.h b/drivers/net/ethernet/microsoft/mana/hw_channel.h deleted file mode 100644 index 6a757a6e2732..000000000000 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.h +++ /dev/null @@ -1,195 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* Copyright (c) 2021, Microsoft Corporation. */ - -#ifndef _HW_CHANNEL_H -#define _HW_CHANNEL_H - -#define DEFAULT_LOG2_THROTTLING_FOR_ERROR_EQ 4 - -#define HW_CHANNEL_MAX_REQUEST_SIZE 0x1000 -#define HW_CHANNEL_MAX_RESPONSE_SIZE 0x1000 - -#define HW_CHANNEL_VF_BOOTSTRAP_QUEUE_DEPTH 1 - -#define HWC_INIT_DATA_CQID 1 -#define HWC_INIT_DATA_RQID 2 -#define HWC_INIT_DATA_SQID 3 -#define HWC_INIT_DATA_QUEUE_DEPTH 4 -#define HWC_INIT_DATA_MAX_REQUEST 5 -#define HWC_INIT_DATA_MAX_RESPONSE 6 -#define HWC_INIT_DATA_MAX_NUM_CQS 7 -#define HWC_INIT_DATA_PDID 8 -#define HWC_INIT_DATA_GPA_MKEY 9 -#define HWC_INIT_DATA_PF_DEST_RQ_ID 10 -#define HWC_INIT_DATA_PF_DEST_CQ_ID 11 - -/* Structures labeled with "HW DATA" are exchanged with the hardware. All of - * them are naturally aligned and hence don't need __packed. 
- */ - -union hwc_init_eq_id_db { - u32 as_uint32; - - struct { - u32 eq_id : 16; - u32 doorbell : 16; - }; -}; /* HW DATA */ - -union hwc_init_type_data { - u32 as_uint32; - - struct { - u32 value : 24; - u32 type : 8; - }; -}; /* HW DATA */ - -struct hwc_rx_oob { - u32 type : 6; - u32 eom : 1; - u32 som : 1; - u32 vendor_err : 8; - u32 reserved1 : 16; - - u32 src_virt_wq : 24; - u32 src_vfid : 8; - - u32 reserved2; - - union { - u32 wqe_addr_low; - u32 wqe_offset; - }; - - u32 wqe_addr_high; - - u32 client_data_unit : 14; - u32 reserved3 : 18; - - u32 tx_oob_data_size; - - u32 chunk_offset : 21; - u32 reserved4 : 11; -}; /* HW DATA */ - -struct hwc_tx_oob { - u32 reserved1; - - u32 reserved2; - - u32 vrq_id : 24; - u32 dest_vfid : 8; - - u32 vrcq_id : 24; - u32 reserved3 : 8; - - u32 vscq_id : 24; - u32 loopback : 1; - u32 lso_override: 1; - u32 dest_pf : 1; - u32 reserved4 : 5; - - u32 vsq_id : 24; - u32 reserved5 : 8; -}; /* HW DATA */ - -struct hwc_work_request { - void *buf_va; - void *buf_sge_addr; - u32 buf_len; - u32 msg_size; - - struct gdma_wqe_request wqe_req; - struct hwc_tx_oob tx_oob; - - struct gdma_sge sge; -}; - -/* hwc_dma_buf represents the array of in-flight WQEs. - * mem_info as know as the GDMA mapped memory is partitioned and used by - * in-flight WQEs. - * The number of WQEs is determined by the number of in-flight messages. - */ -struct hwc_dma_buf { - struct gdma_mem_info mem_info; - - u32 gpa_mkey; - - u32 num_reqs; - struct hwc_work_request reqs[]; -}; - -typedef void hwc_rx_event_handler_t(void *ctx, u32 gdma_rxq_id, - const struct hwc_rx_oob *rx_oob); - -typedef void hwc_tx_event_handler_t(void *ctx, u32 gdma_txq_id, - const struct hwc_rx_oob *rx_oob); - -struct hwc_cq { - struct hw_channel_context *hwc; - - struct gdma_queue *gdma_cq; - struct gdma_queue *gdma_eq; - struct gdma_comp *comp_buf; - u16 queue_depth; - - hwc_rx_event_handler_t *rx_event_handler; - void *rx_event_ctx; - - hwc_tx_event_handler_t *tx_event_handler; - void *tx_event_ctx; -}; - -struct hwc_wq { - struct hw_channel_context *hwc; - - struct gdma_queue *gdma_wq; - struct hwc_dma_buf *msg_buf; - u16 queue_depth; - - struct hwc_cq *hwc_cq; -}; - -struct hwc_caller_ctx { - struct completion comp_event; - void *output_buf; - u32 output_buflen; - - u32 error; /* Linux error code */ - u32 status_code; -}; - -struct hw_channel_context { - struct gdma_dev *gdma_dev; - struct device *dev; - - u16 num_inflight_msg; - u32 max_req_msg_size; - - u16 hwc_init_q_depth_max; - u32 hwc_init_max_req_msg_size; - u32 hwc_init_max_resp_msg_size; - - struct completion hwc_init_eqe_comp; - - struct hwc_wq *rxq; - struct hwc_wq *txq; - struct hwc_cq *cq; - - struct semaphore sema; - struct gdma_resource inflight_msg_res; - - u32 pf_dest_vrq_id; - u32 pf_dest_vrcq_id; - - struct hwc_caller_ctx *caller_ctx; -}; - -int mana_hwc_create_channel(struct gdma_context *gc); -void mana_hwc_destroy_channel(struct gdma_context *gc); - -int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len, - const void *req, u32 resp_len, void *resp); - -#endif /* _HW_CHANNEL_H */ diff --git a/drivers/net/ethernet/microsoft/mana/mana.h b/drivers/net/ethernet/microsoft/mana/mana.h deleted file mode 100644 index 6e9e86fb4c02..000000000000 --- a/drivers/net/ethernet/microsoft/mana/mana.h +++ /dev/null @@ -1,650 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* Copyright (c) 2021, Microsoft Corporation. 
*/ - -#ifndef _MANA_H -#define _MANA_H - -#include "gdma.h" -#include "hw_channel.h" - -/* Microsoft Azure Network Adapter (MANA)'s definitions - * - * Structures labeled with "HW DATA" are exchanged with the hardware. All of - * them are naturally aligned and hence don't need __packed. - */ - -/* MANA protocol version */ -#define MANA_MAJOR_VERSION 0 -#define MANA_MINOR_VERSION 1 -#define MANA_MICRO_VERSION 1 - -typedef u64 mana_handle_t; -#define INVALID_MANA_HANDLE ((mana_handle_t)-1) - -enum TRI_STATE { - TRI_STATE_UNKNOWN = -1, - TRI_STATE_FALSE = 0, - TRI_STATE_TRUE = 1 -}; - -/* Number of entries for hardware indirection table must be in power of 2 */ -#define MANA_INDIRECT_TABLE_SIZE 64 -#define MANA_INDIRECT_TABLE_MASK (MANA_INDIRECT_TABLE_SIZE - 1) - -/* The Toeplitz hash key's length in bytes: should be multiple of 8 */ -#define MANA_HASH_KEY_SIZE 40 - -#define COMP_ENTRY_SIZE 64 - -#define ADAPTER_MTU_SIZE 1500 -#define MAX_FRAME_SIZE (ADAPTER_MTU_SIZE + 14) - -#define RX_BUFFERS_PER_QUEUE 512 - -#define MAX_SEND_BUFFERS_PER_QUEUE 256 - -#define EQ_SIZE (8 * PAGE_SIZE) -#define LOG2_EQ_THROTTLE 3 - -#define MAX_PORTS_IN_MANA_DEV 256 - -struct mana_stats_rx { - u64 packets; - u64 bytes; - u64 xdp_drop; - u64 xdp_tx; - u64 xdp_redirect; - struct u64_stats_sync syncp; -}; - -struct mana_stats_tx { - u64 packets; - u64 bytes; - u64 xdp_xmit; - struct u64_stats_sync syncp; -}; - -struct mana_txq { - struct gdma_queue *gdma_sq; - - union { - u32 gdma_txq_id; - struct { - u32 reserved1 : 10; - u32 vsq_frame : 14; - u32 reserved2 : 8; - }; - }; - - u16 vp_offset; - - struct net_device *ndev; - - /* The SKBs are sent to the HW and we are waiting for the CQEs. */ - struct sk_buff_head pending_skbs; - struct netdev_queue *net_txq; - - atomic_t pending_sends; - - struct mana_stats_tx stats; -}; - -/* skb data and frags dma mappings */ -struct mana_skb_head { - dma_addr_t dma_handle[MAX_SKB_FRAGS + 1]; - - u32 size[MAX_SKB_FRAGS + 1]; -}; - -#define MANA_HEADROOM sizeof(struct mana_skb_head) - -enum mana_tx_pkt_format { - MANA_SHORT_PKT_FMT = 0, - MANA_LONG_PKT_FMT = 1, -}; - -struct mana_tx_short_oob { - u32 pkt_fmt : 2; - u32 is_outer_ipv4 : 1; - u32 is_outer_ipv6 : 1; - u32 comp_iphdr_csum : 1; - u32 comp_tcp_csum : 1; - u32 comp_udp_csum : 1; - u32 supress_txcqe_gen : 1; - u32 vcq_num : 24; - - u32 trans_off : 10; /* Transport header offset */ - u32 vsq_frame : 14; - u32 short_vp_offset : 8; -}; /* HW DATA */ - -struct mana_tx_long_oob { - u32 is_encap : 1; - u32 inner_is_ipv6 : 1; - u32 inner_tcp_opt : 1; - u32 inject_vlan_pri_tag : 1; - u32 reserved1 : 12; - u32 pcp : 3; /* 802.1Q */ - u32 dei : 1; /* 802.1Q */ - u32 vlan_id : 12; /* 802.1Q */ - - u32 inner_frame_offset : 10; - u32 inner_ip_rel_offset : 6; - u32 long_vp_offset : 12; - u32 reserved2 : 4; - - u32 reserved3; - u32 reserved4; -}; /* HW DATA */ - -struct mana_tx_oob { - struct mana_tx_short_oob s_oob; - struct mana_tx_long_oob l_oob; -}; /* HW DATA */ - -enum mana_cq_type { - MANA_CQ_TYPE_RX, - MANA_CQ_TYPE_TX, -}; - -enum mana_cqe_type { - CQE_INVALID = 0, - CQE_RX_OKAY = 1, - CQE_RX_COALESCED_4 = 2, - CQE_RX_OBJECT_FENCE = 3, - CQE_RX_TRUNCATED = 4, - - CQE_TX_OKAY = 32, - CQE_TX_SA_DROP = 33, - CQE_TX_MTU_DROP = 34, - CQE_TX_INVALID_OOB = 35, - CQE_TX_INVALID_ETH_TYPE = 36, - CQE_TX_HDR_PROCESSING_ERROR = 37, - CQE_TX_VF_DISABLED = 38, - CQE_TX_VPORT_IDX_OUT_OF_RANGE = 39, - CQE_TX_VPORT_DISABLED = 40, - CQE_TX_VLAN_TAGGING_VIOLATION = 41, -}; - -#define MANA_CQE_COMPLETION 1 - -struct mana_cqe_header { - u32 cqe_type : 
6; - u32 client_type : 2; - u32 vendor_err : 24; -}; /* HW DATA */ - -/* NDIS HASH Types */ -#define NDIS_HASH_IPV4 BIT(0) -#define NDIS_HASH_TCP_IPV4 BIT(1) -#define NDIS_HASH_UDP_IPV4 BIT(2) -#define NDIS_HASH_IPV6 BIT(3) -#define NDIS_HASH_TCP_IPV6 BIT(4) -#define NDIS_HASH_UDP_IPV6 BIT(5) -#define NDIS_HASH_IPV6_EX BIT(6) -#define NDIS_HASH_TCP_IPV6_EX BIT(7) -#define NDIS_HASH_UDP_IPV6_EX BIT(8) - -#define MANA_HASH_L3 (NDIS_HASH_IPV4 | NDIS_HASH_IPV6 | NDIS_HASH_IPV6_EX) -#define MANA_HASH_L4 \ - (NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4 | NDIS_HASH_TCP_IPV6 | \ - NDIS_HASH_UDP_IPV6 | NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX) - -struct mana_rxcomp_perpkt_info { - u32 pkt_len : 16; - u32 reserved1 : 16; - u32 reserved2; - u32 pkt_hash; -}; /* HW DATA */ - -#define MANA_RXCOMP_OOB_NUM_PPI 4 - -/* Receive completion OOB */ -struct mana_rxcomp_oob { - struct mana_cqe_header cqe_hdr; - - u32 rx_vlan_id : 12; - u32 rx_vlantag_present : 1; - u32 rx_outer_iphdr_csum_succeed : 1; - u32 rx_outer_iphdr_csum_fail : 1; - u32 reserved1 : 1; - u32 rx_hashtype : 9; - u32 rx_iphdr_csum_succeed : 1; - u32 rx_iphdr_csum_fail : 1; - u32 rx_tcp_csum_succeed : 1; - u32 rx_tcp_csum_fail : 1; - u32 rx_udp_csum_succeed : 1; - u32 rx_udp_csum_fail : 1; - u32 reserved2 : 1; - - struct mana_rxcomp_perpkt_info ppi[MANA_RXCOMP_OOB_NUM_PPI]; - - u32 rx_wqe_offset; -}; /* HW DATA */ - -struct mana_tx_comp_oob { - struct mana_cqe_header cqe_hdr; - - u32 tx_data_offset; - - u32 tx_sgl_offset : 5; - u32 tx_wqe_offset : 27; - - u32 reserved[12]; -}; /* HW DATA */ - -struct mana_rxq; - -#define CQE_POLLING_BUFFER 512 - -struct mana_cq { - struct gdma_queue *gdma_cq; - - /* Cache the CQ id (used to verify if each CQE comes to the right CQ. */ - u32 gdma_id; - - /* Type of the CQ: TX or RX */ - enum mana_cq_type type; - - /* Pointer to the mana_rxq that is pushing RX CQEs to the queue. - * Only and must be non-NULL if type is MANA_CQ_TYPE_RX. - */ - struct mana_rxq *rxq; - - /* Pointer to the mana_txq that is pushing TX CQEs to the queue. - * Only and must be non-NULL if type is MANA_CQ_TYPE_TX. - */ - struct mana_txq *txq; - - /* Buffer which the CQ handler can copy the CQE's into. */ - struct gdma_comp gdma_comp_buf[CQE_POLLING_BUFFER]; - - /* NAPI data */ - struct napi_struct napi; - int work_done; - int budget; -}; - -#define GDMA_MAX_RQE_SGES 15 - -struct mana_recv_buf_oob { - /* A valid GDMA work request representing the data buffer. */ - struct gdma_wqe_request wqe_req; - - void *buf_va; - dma_addr_t buf_dma_addr; - - /* SGL of the buffer going to be sent has part of the work request. */ - u32 num_sge; - struct gdma_sge sgl[GDMA_MAX_RQE_SGES]; - - /* Required to store the result of mana_gd_post_work_request. - * gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the - * work queue when the WQE is consumed. 
- */ - struct gdma_posted_wqe_info wqe_inf; -}; - -struct mana_rxq { - struct gdma_queue *gdma_rq; - /* Cache the gdma receive queue id */ - u32 gdma_id; - - /* Index of RQ in the vPort, not gdma receive queue id */ - u32 rxq_idx; - - u32 datasize; - - mana_handle_t rxobj; - - struct mana_cq rx_cq; - - struct completion fence_event; - - struct net_device *ndev; - - /* Total number of receive buffers to be allocated */ - u32 num_rx_buf; - - u32 buf_index; - - struct mana_stats_rx stats; - - struct bpf_prog __rcu *bpf_prog; - struct xdp_rxq_info xdp_rxq; - struct page *xdp_save_page; - bool xdp_flush; - int xdp_rc; /* XDP redirect return code */ - - /* MUST BE THE LAST MEMBER: - * Each receive buffer has an associated mana_recv_buf_oob. - */ - struct mana_recv_buf_oob rx_oobs[]; -}; - -struct mana_tx_qp { - struct mana_txq txq; - - struct mana_cq tx_cq; - - mana_handle_t tx_object; -}; - -struct mana_ethtool_stats { - u64 stop_queue; - u64 wake_queue; -}; - -struct mana_context { - struct gdma_dev *gdma_dev; - - u16 num_ports; - - struct mana_eq *eqs; - - struct net_device *ports[MAX_PORTS_IN_MANA_DEV]; -}; - -struct mana_port_context { - struct mana_context *ac; - struct net_device *ndev; - - u8 mac_addr[ETH_ALEN]; - - enum TRI_STATE rss_state; - - mana_handle_t default_rxobj; - bool tx_shortform_allowed; - u16 tx_vp_offset; - - struct mana_tx_qp *tx_qp; - - /* Indirection Table for RX & TX. The values are queue indexes */ - u32 indir_table[MANA_INDIRECT_TABLE_SIZE]; - - /* Indirection table containing RxObject Handles */ - mana_handle_t rxobj_table[MANA_INDIRECT_TABLE_SIZE]; - - /* Hash key used by the NIC */ - u8 hashkey[MANA_HASH_KEY_SIZE]; - - /* This points to an array of num_queues of RQ pointers. */ - struct mana_rxq **rxqs; - - struct bpf_prog *bpf_prog; - - /* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. 
*/ - unsigned int max_queues; - unsigned int num_queues; - - mana_handle_t port_handle; - mana_handle_t pf_filter_handle; - - /* Mutex for sharing access to vport_use_count */ - struct mutex vport_mutex; - int vport_use_count; - - u16 port_idx; - - bool port_is_up; - bool port_st_save; /* Saved port state */ - - struct mana_ethtool_stats eth_stats; -}; - -int mana_start_xmit(struct sk_buff *skb, struct net_device *ndev); -int mana_config_rss(struct mana_port_context *ac, enum TRI_STATE rx, - bool update_hash, bool update_tab); - -int mana_alloc_queues(struct net_device *ndev); -int mana_attach(struct net_device *ndev); -int mana_detach(struct net_device *ndev, bool from_close); - -int mana_probe(struct gdma_dev *gd, bool resuming); -void mana_remove(struct gdma_dev *gd, bool suspending); - -void mana_xdp_tx(struct sk_buff *skb, struct net_device *ndev); -int mana_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames, - u32 flags); -u32 mana_run_xdp(struct net_device *ndev, struct mana_rxq *rxq, - struct xdp_buff *xdp, void *buf_va, uint pkt_len); -struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); -void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); -int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); - -extern const struct ethtool_ops mana_ethtool_ops; - -struct mana_obj_spec { - u32 queue_index; - u64 gdma_region; - u32 queue_size; - u32 attached_eq; - u32 modr_ctx_id; -}; - -enum mana_command_code { - MANA_QUERY_DEV_CONFIG = 0x20001, - MANA_QUERY_GF_STAT = 0x20002, - MANA_CONFIG_VPORT_TX = 0x20003, - MANA_CREATE_WQ_OBJ = 0x20004, - MANA_DESTROY_WQ_OBJ = 0x20005, - MANA_FENCE_RQ = 0x20006, - MANA_CONFIG_VPORT_RX = 0x20007, - MANA_QUERY_VPORT_CONFIG = 0x20008, - - /* Privileged commands for the PF mode */ - MANA_REGISTER_FILTER = 0x28000, - MANA_DEREGISTER_FILTER = 0x28001, - MANA_REGISTER_HW_PORT = 0x28003, - MANA_DEREGISTER_HW_PORT = 0x28004, -}; - -/* Query Device Configuration */ -struct mana_query_device_cfg_req { - struct gdma_req_hdr hdr; - - /* MANA Nic Driver Capability flags */ - u64 mn_drv_cap_flags1; - u64 mn_drv_cap_flags2; - u64 mn_drv_cap_flags3; - u64 mn_drv_cap_flags4; - - u32 proto_major_ver; - u32 proto_minor_ver; - u32 proto_micro_ver; - - u32 reserved; -}; /* HW DATA */ - -struct mana_query_device_cfg_resp { - struct gdma_resp_hdr hdr; - - u64 pf_cap_flags1; - u64 pf_cap_flags2; - u64 pf_cap_flags3; - u64 pf_cap_flags4; - - u16 max_num_vports; - u16 reserved; - u32 max_num_eqs; -}; /* HW DATA */ - -/* Query vPort Configuration */ -struct mana_query_vport_cfg_req { - struct gdma_req_hdr hdr; - u32 vport_index; -}; /* HW DATA */ - -struct mana_query_vport_cfg_resp { - struct gdma_resp_hdr hdr; - u32 max_num_sq; - u32 max_num_rq; - u32 num_indirection_ent; - u32 reserved1; - u8 mac_addr[6]; - u8 reserved2[2]; - mana_handle_t vport; -}; /* HW DATA */ - -/* Configure vPort */ -struct mana_config_vport_req { - struct gdma_req_hdr hdr; - mana_handle_t vport; - u32 pdid; - u32 doorbell_pageid; -}; /* HW DATA */ - -struct mana_config_vport_resp { - struct gdma_resp_hdr hdr; - u16 tx_vport_offset; - u8 short_form_allowed; - u8 reserved; -}; /* HW DATA */ - -/* Create WQ Object */ -struct mana_create_wqobj_req { - struct gdma_req_hdr hdr; - mana_handle_t vport; - u32 wq_type; - u32 reserved; - u64 wq_gdma_region; - u64 cq_gdma_region; - u32 wq_size; - u32 cq_size; - u32 cq_moderation_ctx_id; - u32 cq_parent_qid; -}; /* HW DATA */ - -struct mana_create_wqobj_resp { - struct gdma_resp_hdr hdr; - u32 wq_id; - u32 cq_id; - 
mana_handle_t wq_obj; -}; /* HW DATA */ - -/* Destroy WQ Object */ -struct mana_destroy_wqobj_req { - struct gdma_req_hdr hdr; - u32 wq_type; - u32 reserved; - mana_handle_t wq_obj_handle; -}; /* HW DATA */ - -struct mana_destroy_wqobj_resp { - struct gdma_resp_hdr hdr; -}; /* HW DATA */ - -/* Fence RQ */ -struct mana_fence_rq_req { - struct gdma_req_hdr hdr; - mana_handle_t wq_obj_handle; -}; /* HW DATA */ - -struct mana_fence_rq_resp { - struct gdma_resp_hdr hdr; -}; /* HW DATA */ - -/* Configure vPort Rx Steering */ -struct mana_cfg_rx_steer_req { - struct gdma_req_hdr hdr; - mana_handle_t vport; - u16 num_indir_entries; - u16 indir_tab_offset; - u32 rx_enable; - u32 rss_enable; - u8 update_default_rxobj; - u8 update_hashkey; - u8 update_indir_tab; - u8 reserved; - mana_handle_t default_rxobj; - u8 hashkey[MANA_HASH_KEY_SIZE]; -}; /* HW DATA */ - -struct mana_cfg_rx_steer_resp { - struct gdma_resp_hdr hdr; -}; /* HW DATA */ - -/* Register HW vPort */ -struct mana_register_hw_vport_req { - struct gdma_req_hdr hdr; - u16 attached_gfid; - u8 is_pf_default_vport; - u8 reserved1; - u8 allow_all_ether_types; - u8 reserved2; - u8 reserved3; - u8 reserved4; -}; /* HW DATA */ - -struct mana_register_hw_vport_resp { - struct gdma_resp_hdr hdr; - mana_handle_t hw_vport_handle; -}; /* HW DATA */ - -/* Deregister HW vPort */ -struct mana_deregister_hw_vport_req { - struct gdma_req_hdr hdr; - mana_handle_t hw_vport_handle; -}; /* HW DATA */ - -struct mana_deregister_hw_vport_resp { - struct gdma_resp_hdr hdr; -}; /* HW DATA */ - -/* Register filter */ -struct mana_register_filter_req { - struct gdma_req_hdr hdr; - mana_handle_t vport; - u8 mac_addr[6]; - u8 reserved1; - u8 reserved2; - u8 reserved3; - u8 reserved4; - u16 reserved5; - u32 reserved6; - u32 reserved7; - u32 reserved8; -}; /* HW DATA */ - -struct mana_register_filter_resp { - struct gdma_resp_hdr hdr; - mana_handle_t filter_handle; -}; /* HW DATA */ - -/* Deregister filter */ -struct mana_deregister_filter_req { - struct gdma_req_hdr hdr; - mana_handle_t filter_handle; -}; /* HW DATA */ - -struct mana_deregister_filter_resp { - struct gdma_resp_hdr hdr; -}; /* HW DATA */ - -#define MANA_MAX_NUM_QUEUES 64 - -#define MANA_SHORT_VPORT_OFFSET_MAX ((1U << 8) - 1) - -struct mana_tx_package { - struct gdma_wqe_request wqe_req; - struct gdma_sge sgl_array[5]; - struct gdma_sge *sgl_ptr; - - struct mana_tx_oob tx_oob; - - struct gdma_posted_wqe_info wqe_info; -}; - -int mana_create_wq_obj(struct mana_port_context *apc, - mana_handle_t vport, - u32 wq_type, struct mana_obj_spec *wq_spec, - struct mana_obj_spec *cq_spec, - mana_handle_t *wq_obj); - -void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type, - mana_handle_t wq_obj); - -int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id, - u32 doorbell_pg_id); -void mana_uncfg_vport(struct mana_port_context *apc); -#endif /* _MANA_H */ diff --git a/drivers/net/ethernet/microsoft/mana/mana_auxiliary.h b/drivers/net/ethernet/microsoft/mana/mana_auxiliary.h deleted file mode 100644 index 373d59756846..000000000000 --- a/drivers/net/ethernet/microsoft/mana/mana_auxiliary.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* Copyright (c) 2022, Microsoft Corporation. 
*/ - -#include "mana.h" -#include - -struct mana_adev { - struct auxiliary_device adev; - struct gdma_dev *mdev; -}; diff --git a/drivers/net/ethernet/microsoft/mana/mana_bpf.c b/drivers/net/ethernet/microsoft/mana/mana_bpf.c index 421fd39ff3a8..3caea631229c 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_bpf.c +++ b/drivers/net/ethernet/microsoft/mana/mana_bpf.c @@ -8,7 +8,7 @@ #include #include -#include "mana.h" +#include void mana_xdp_tx(struct sk_buff *skb, struct net_device *ndev) { diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index b6303a43fa7c..ffa2a0e2c213 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -12,8 +12,8 @@ #include #include -#include "mana.h" -#include "mana_auxiliary.h" +#include +#include static DEFINE_IDA(mana_adev_ida); diff --git a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c index c530db76880f..6f98de6d7440 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_ethtool.c +++ b/drivers/net/ethernet/microsoft/mana/mana_ethtool.c @@ -5,7 +5,7 @@ #include #include -#include "mana.h" +#include static const struct { char name[ETH_GSTRING_LEN]; diff --git a/drivers/net/ethernet/microsoft/mana/shm_channel.c b/drivers/net/ethernet/microsoft/mana/shm_channel.c index da255da62176..5553af9c8085 100644 --- a/drivers/net/ethernet/microsoft/mana/shm_channel.c +++ b/drivers/net/ethernet/microsoft/mana/shm_channel.c @@ -6,7 +6,7 @@ #include #include -#include "shm_channel.h" +#include #define PAGE_FRAME_L48_WIDTH_BYTES 6 #define PAGE_FRAME_L48_WIDTH_BITS (PAGE_FRAME_L48_WIDTH_BYTES * 8) diff --git a/drivers/net/ethernet/microsoft/mana/shm_channel.h b/drivers/net/ethernet/microsoft/mana/shm_channel.h deleted file mode 100644 index 5199b41497ff..000000000000 --- a/drivers/net/ethernet/microsoft/mana/shm_channel.h +++ /dev/null @@ -1,21 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ -/* Copyright (c) 2021, Microsoft Corporation. */ - -#ifndef _SHM_CHANNEL_H -#define _SHM_CHANNEL_H - -struct shm_channel { - struct device *dev; - void __iomem *base; -}; - -void mana_smc_init(struct shm_channel *sc, struct device *dev, - void __iomem *base); - -int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, - u64 cq_addr, u64 rq_addr, u64 sq_addr, - u32 eq_msix_index); - -int mana_smc_teardown_hwc(struct shm_channel *sc, bool reset_vf); - -#endif /* _SHM_CHANNEL_H */ diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h new file mode 100644 index 000000000000..72eaec2470c0 --- /dev/null +++ b/include/net/mana/gdma.h @@ -0,0 +1,689 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2021, Microsoft Corporation. */ + +#ifndef _GDMA_H +#define _GDMA_H + +#include +#include + +#include "shm_channel.h" + +/* Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. 
+ */ + +enum gdma_request_type { + GDMA_VERIFY_VF_DRIVER_VERSION = 1, + GDMA_QUERY_MAX_RESOURCES = 2, + GDMA_LIST_DEVICES = 3, + GDMA_REGISTER_DEVICE = 4, + GDMA_DEREGISTER_DEVICE = 5, + GDMA_GENERATE_TEST_EQE = 10, + GDMA_CREATE_QUEUE = 12, + GDMA_DISABLE_QUEUE = 13, + GDMA_CREATE_DMA_REGION = 25, + GDMA_DMA_REGION_ADD_PAGES = 26, + GDMA_DESTROY_DMA_REGION = 27, +}; + +enum gdma_queue_type { + GDMA_INVALID_QUEUE, + GDMA_SQ, + GDMA_RQ, + GDMA_CQ, + GDMA_EQ, +}; + +enum gdma_work_request_flags { + GDMA_WR_NONE = 0, + GDMA_WR_OOB_IN_SGL = BIT(0), + GDMA_WR_PAD_BY_SGE0 = BIT(1), +}; + +enum gdma_eqe_type { + GDMA_EQE_COMPLETION = 3, + GDMA_EQE_TEST_EVENT = 64, + GDMA_EQE_HWC_INIT_EQ_ID_DB = 129, + GDMA_EQE_HWC_INIT_DATA = 130, + GDMA_EQE_HWC_INIT_DONE = 131, +}; + +enum { + GDMA_DEVICE_NONE = 0, + GDMA_DEVICE_HWC = 1, + GDMA_DEVICE_MANA = 2, +}; + +struct gdma_resource { + /* Protect the bitmap */ + spinlock_t lock; + + /* The bitmap size in bits. */ + u32 size; + + /* The bitmap tracks the resources. */ + unsigned long *map; +}; + +union gdma_doorbell_entry { + u64 as_uint64; + + struct { + u64 id : 24; + u64 reserved : 8; + u64 tail_ptr : 31; + u64 arm : 1; + } cq; + + struct { + u64 id : 24; + u64 wqe_cnt : 8; + u64 tail_ptr : 32; + } rq; + + struct { + u64 id : 24; + u64 reserved : 8; + u64 tail_ptr : 32; + } sq; + + struct { + u64 id : 16; + u64 reserved : 16; + u64 tail_ptr : 31; + u64 arm : 1; + } eq; +}; /* HW DATA */ + +struct gdma_msg_hdr { + u32 hdr_type; + u32 msg_type; + u16 msg_version; + u16 hwc_msg_id; + u32 msg_size; +}; /* HW DATA */ + +struct gdma_dev_id { + union { + struct { + u16 type; + u16 instance; + }; + + u32 as_uint32; + }; +}; /* HW DATA */ + +struct gdma_req_hdr { + struct gdma_msg_hdr req; + struct gdma_msg_hdr resp; /* The expected response */ + struct gdma_dev_id dev_id; + u32 activity_id; +}; /* HW DATA */ + +struct gdma_resp_hdr { + struct gdma_msg_hdr response; + struct gdma_dev_id dev_id; + u32 activity_id; + u32 status; + u32 reserved; +}; /* HW DATA */ + +struct gdma_general_req { + struct gdma_req_hdr hdr; +}; /* HW DATA */ + +#define GDMA_MESSAGE_V1 1 + +struct gdma_general_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +#define GDMA_STANDARD_HEADER_TYPE 0 + +static inline void mana_gd_init_req_hdr(struct gdma_req_hdr *hdr, u32 code, + u32 req_size, u32 resp_size) +{ + hdr->req.hdr_type = GDMA_STANDARD_HEADER_TYPE; + hdr->req.msg_type = code; + hdr->req.msg_version = GDMA_MESSAGE_V1; + hdr->req.msg_size = req_size; + + hdr->resp.hdr_type = GDMA_STANDARD_HEADER_TYPE; + hdr->resp.msg_type = code; + hdr->resp.msg_version = GDMA_MESSAGE_V1; + hdr->resp.msg_size = resp_size; +} + +/* The 16-byte struct is part of the GDMA work queue entry (WQE). 
*/ +struct gdma_sge { + u64 address; + u32 mem_key; + u32 size; +}; /* HW DATA */ + +struct gdma_wqe_request { + struct gdma_sge *sgl; + u32 num_sge; + + u32 inline_oob_size; + const void *inline_oob_data; + + u32 flags; + u32 client_data_unit; +}; + +enum gdma_page_type { + GDMA_PAGE_TYPE_4K, +}; + +#define GDMA_INVALID_DMA_REGION 0 + +struct gdma_mem_info { + struct device *dev; + + dma_addr_t dma_handle; + void *virt_addr; + u64 length; + + /* Allocated by the PF driver */ + u64 gdma_region; +}; + +#define REGISTER_ATB_MST_MKEY_LOWER_SIZE 8 + +struct gdma_dev { + struct gdma_context *gdma_context; + + struct gdma_dev_id dev_id; + + u32 pdid; + u32 doorbell; + u32 gpa_mkey; + + /* GDMA driver specific pointer */ + void *driver_data; + + struct auxiliary_device *adev; +}; + +#define MINIMUM_SUPPORTED_PAGE_SIZE PAGE_SIZE + +#define GDMA_CQE_SIZE 64 +#define GDMA_EQE_SIZE 16 +#define GDMA_MAX_SQE_SIZE 512 +#define GDMA_MAX_RQE_SIZE 256 + +#define GDMA_COMP_DATA_SIZE 0x3C + +#define GDMA_EVENT_DATA_SIZE 0xC + +/* The WQE size must be a multiple of the Basic Unit, which is 32 bytes. */ +#define GDMA_WQE_BU_SIZE 32 + +#define INVALID_PDID UINT_MAX +#define INVALID_DOORBELL UINT_MAX +#define INVALID_MEM_KEY UINT_MAX +#define INVALID_QUEUE_ID UINT_MAX +#define INVALID_PCI_MSIX_INDEX UINT_MAX + +struct gdma_comp { + u32 cqe_data[GDMA_COMP_DATA_SIZE / 4]; + u32 wq_num; + bool is_sq; +}; + +struct gdma_event { + u32 details[GDMA_EVENT_DATA_SIZE / 4]; + u8 type; +}; + +struct gdma_queue; + +struct mana_eq { + struct gdma_queue *eq; +}; + +typedef void gdma_eq_callback(void *context, struct gdma_queue *q, + struct gdma_event *e); + +typedef void gdma_cq_callback(void *context, struct gdma_queue *q); + +/* The 'head' is the producer index. For SQ/RQ, when the driver posts a WQE + * (Note: the WQE size must be a multiple of the 32-byte Basic Unit), the + * driver increases the 'head' in BUs rather than in bytes, and notifies + * the HW of the updated head. For EQ/CQ, the driver uses the 'head' to track + * the HW head, and increases the 'head' by 1 for every processed EQE/CQE. + * + * The 'tail' is the consumer index for SQ/RQ. After the CQE of the SQ/RQ is + * processed, the driver increases the 'tail' to indicate that WQEs have + * been consumed by the HW, so the driver can post new WQEs into the SQ/RQ. + * + * The driver doesn't use the 'tail' for EQ/CQ, because the driver ensures + * that the EQ/CQ is big enough so they can't overflow, and the driver uses + * the owner bits mechanism to detect if the queue has become empty. + */ +struct gdma_queue { + struct gdma_dev *gdma_dev; + + enum gdma_queue_type type; + u32 id; + + struct gdma_mem_info mem_info; + + void *queue_mem_ptr; + u32 queue_size; + + bool monitor_avl_buf; + + u32 head; + u32 tail; + + /* Extra fields specific to EQ/CQ. */ + union { + struct { + bool disable_needed; + + gdma_eq_callback *callback; + void *context; + + unsigned int msix_index; + + u32 log2_throttle_limit; + } eq; + + struct { + gdma_cq_callback *callback; + void *context; + + struct gdma_queue *parent; /* For CQ/EQ relationship */ + } cq; + }; +}; + +struct gdma_queue_spec { + enum gdma_queue_type type; + bool monitor_avl_buf; + unsigned int queue_size; + + /* Extra fields specific to EQ/CQ. 
*/ + union { + struct { + gdma_eq_callback *callback; + void *context; + + unsigned long log2_throttle_limit; + } eq; + + struct { + gdma_cq_callback *callback; + void *context; + + struct gdma_queue *parent_eq; + + } cq; + }; +}; + +struct gdma_irq_context { + void (*handler)(void *arg); + void *arg; +}; + +struct gdma_context { + struct device *dev; + + /* Per-vPort max number of queues */ + unsigned int max_num_queues; + unsigned int max_num_msix; + unsigned int num_msix_usable; + struct gdma_resource msix_resource; + struct gdma_irq_context *irq_contexts; + + /* This maps a CQ index to the queue structure. */ + unsigned int max_num_cqs; + struct gdma_queue **cq_table; + + /* Protect eq_test_event and test_event_eq_id */ + struct mutex eq_test_event_mutex; + struct completion eq_test_event; + u32 test_event_eq_id; + + bool is_pf; + phys_addr_t bar0_pa; + void __iomem *bar0_va; + void __iomem *shm_base; + void __iomem *db_page_base; + phys_addr_t phys_db_page_base; + u32 db_page_size; + + /* Shared memory chanenl (used to bootstrap HWC) */ + struct shm_channel shm_channel; + + /* Hardware communication channel (HWC) */ + struct gdma_dev hwc; + + /* Azure network adapter */ + struct gdma_dev mana; +}; + +#define MAX_NUM_GDMA_DEVICES 4 + +static inline bool mana_gd_is_mana(struct gdma_dev *gd) +{ + return gd->dev_id.type == GDMA_DEVICE_MANA; +} + +static inline bool mana_gd_is_hwc(struct gdma_dev *gd) +{ + return gd->dev_id.type == GDMA_DEVICE_HWC; +} + +u8 *mana_gd_get_wqe_ptr(const struct gdma_queue *wq, u32 wqe_offset); +u32 mana_gd_wq_avail_space(struct gdma_queue *wq); + +int mana_gd_test_eq(struct gdma_context *gc, struct gdma_queue *eq); + +int mana_gd_create_hwc_queue(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +int mana_gd_create_mana_eq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +int mana_gd_create_mana_wq_cq(struct gdma_dev *gd, + const struct gdma_queue_spec *spec, + struct gdma_queue **queue_ptr); + +void mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue); + +int mana_gd_poll_cq(struct gdma_queue *cq, struct gdma_comp *comp, int num_cqe); + +void mana_gd_ring_cq(struct gdma_queue *cq, u8 arm_bit); + +struct gdma_wqe { + u32 reserved :24; + u32 last_vbytes :8; + + union { + u32 flags; + + struct { + u32 num_sge :8; + u32 inline_oob_size_div4:3; + u32 client_oob_in_sgl :1; + u32 reserved1 :4; + u32 client_data_unit :14; + u32 reserved2 :2; + }; + }; +}; /* HW DATA */ + +#define INLINE_OOB_SMALL_SIZE 8 +#define INLINE_OOB_LARGE_SIZE 24 + +#define MAX_TX_WQE_SIZE 512 +#define MAX_RX_WQE_SIZE 256 + +struct gdma_cqe { + u32 cqe_data[GDMA_COMP_DATA_SIZE / 4]; + + union { + u32 as_uint32; + + struct { + u32 wq_num : 24; + u32 is_sq : 1; + u32 reserved : 4; + u32 owner_bits : 3; + }; + } cqe_info; +}; /* HW DATA */ + +#define GDMA_CQE_OWNER_BITS 3 + +#define GDMA_CQE_OWNER_MASK ((1 << GDMA_CQE_OWNER_BITS) - 1) + +#define SET_ARM_BIT 1 + +#define GDMA_EQE_OWNER_BITS 3 + +union gdma_eqe_info { + u32 as_uint32; + + struct { + u32 type : 8; + u32 reserved1 : 8; + u32 client_id : 2; + u32 reserved2 : 11; + u32 owner_bits : 3; + }; +}; /* HW DATA */ + +#define GDMA_EQE_OWNER_MASK ((1 << GDMA_EQE_OWNER_BITS) - 1) +#define INITIALIZED_OWNER_BIT(log2_num_entries) (1UL << (log2_num_entries)) + +struct gdma_eqe { + u32 details[GDMA_EVENT_DATA_SIZE / 4]; + u32 eqe_info; +}; /* HW DATA */ + +#define GDMA_REG_DB_PAGE_OFFSET 8 +#define GDMA_REG_DB_PAGE_SIZE 0x10 +#define 
GDMA_REG_SHM_OFFSET 0x18 + +#define GDMA_PF_REG_DB_PAGE_SIZE 0xD0 +#define GDMA_PF_REG_DB_PAGE_OFF 0xC8 +#define GDMA_PF_REG_SHM_OFF 0x70 + +#define GDMA_SRIOV_REG_CFG_BASE_OFF 0x108 + +#define MANA_PF_DEVICE_ID 0x00B9 +#define MANA_VF_DEVICE_ID 0x00BA + +struct gdma_posted_wqe_info { + u32 wqe_size_in_bu; +}; + +/* GDMA_GENERATE_TEST_EQE */ +struct gdma_generate_test_event_req { + struct gdma_req_hdr hdr; + u32 queue_index; +}; /* HW DATA */ + +/* GDMA_VERIFY_VF_DRIVER_VERSION */ +enum { + GDMA_PROTOCOL_V1 = 1, + GDMA_PROTOCOL_FIRST = GDMA_PROTOCOL_V1, + GDMA_PROTOCOL_LAST = GDMA_PROTOCOL_V1, +}; + +#define GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT BIT(0) + +#define GDMA_DRV_CAP_FLAGS1 GDMA_DRV_CAP_FLAG_1_EQ_SHARING_MULTI_VPORT + +#define GDMA_DRV_CAP_FLAGS2 0 + +#define GDMA_DRV_CAP_FLAGS3 0 + +#define GDMA_DRV_CAP_FLAGS4 0 + +struct gdma_verify_ver_req { + struct gdma_req_hdr hdr; + + /* Mandatory fields required for protocol establishment */ + u64 protocol_ver_min; + u64 protocol_ver_max; + + /* Gdma Driver Capability Flags */ + u64 gd_drv_cap_flags1; + u64 gd_drv_cap_flags2; + u64 gd_drv_cap_flags3; + u64 gd_drv_cap_flags4; + + /* Advisory fields */ + u64 drv_ver; + u32 os_type; /* Linux = 0x10; Windows = 0x20; Other = 0x30 */ + u32 reserved; + u32 os_ver_major; + u32 os_ver_minor; + u32 os_ver_build; + u32 os_ver_platform; + u64 reserved_2; + u8 os_ver_str1[128]; + u8 os_ver_str2[128]; + u8 os_ver_str3[128]; + u8 os_ver_str4[128]; +}; /* HW DATA */ + +struct gdma_verify_ver_resp { + struct gdma_resp_hdr hdr; + u64 gdma_protocol_ver; + u64 pf_cap_flags1; + u64 pf_cap_flags2; + u64 pf_cap_flags3; + u64 pf_cap_flags4; +}; /* HW DATA */ + +/* GDMA_QUERY_MAX_RESOURCES */ +struct gdma_query_max_resources_resp { + struct gdma_resp_hdr hdr; + u32 status; + u32 max_sq; + u32 max_rq; + u32 max_cq; + u32 max_eq; + u32 max_db; + u32 max_mst; + u32 max_cq_mod_ctx; + u32 max_mod_cq; + u32 max_msix; +}; /* HW DATA */ + +/* GDMA_LIST_DEVICES */ +struct gdma_list_devices_resp { + struct gdma_resp_hdr hdr; + u32 num_of_devs; + u32 reserved; + struct gdma_dev_id devs[64]; +}; /* HW DATA */ + +/* GDMA_REGISTER_DEVICE */ +struct gdma_register_device_resp { + struct gdma_resp_hdr hdr; + u32 pdid; + u32 gpa_mkey; + u32 db_id; +}; /* HW DATA */ + +/* GDMA_CREATE_QUEUE */ +struct gdma_create_queue_req { + struct gdma_req_hdr hdr; + u32 type; + u32 reserved1; + u32 pdid; + u32 doolbell_id; + u64 gdma_region; + u32 reserved2; + u32 queue_size; + u32 log2_throttle_limit; + u32 eq_pci_msix_index; + u32 cq_mod_ctx_id; + u32 cq_parent_eq_id; + u8 rq_drop_on_overrun; + u8 rq_err_on_wqe_overflow; + u8 rq_chain_rec_wqes; + u8 sq_hw_db; + u32 reserved3; +}; /* HW DATA */ + +struct gdma_create_queue_resp { + struct gdma_resp_hdr hdr; + u32 queue_index; +}; /* HW DATA */ + +/* GDMA_DISABLE_QUEUE */ +struct gdma_disable_queue_req { + struct gdma_req_hdr hdr; + u32 type; + u32 queue_index; + u32 alloc_res_id_on_creation; +}; /* HW DATA */ + +/* GDMA_CREATE_DMA_REGION */ +struct gdma_create_dma_region_req { + struct gdma_req_hdr hdr; + + /* The total size of the DMA region */ + u64 length; + + /* The offset in the first page */ + u32 offset_in_page; + + /* enum gdma_page_type */ + u32 gdma_page_type; + + /* The total number of pages */ + u32 page_count; + + /* If page_addr_list_len is smaller than page_count, + * the remaining page addresses will be added via the + * message GDMA_DMA_REGION_ADD_PAGES. 
+ */ + u32 page_addr_list_len; + u64 page_addr_list[]; +}; /* HW DATA */ + +struct gdma_create_dma_region_resp { + struct gdma_resp_hdr hdr; + u64 gdma_region; +}; /* HW DATA */ + +/* GDMA_DMA_REGION_ADD_PAGES */ +struct gdma_dma_region_add_pages_req { + struct gdma_req_hdr hdr; + + u64 gdma_region; + + u32 page_addr_list_len; + u32 reserved3; + + u64 page_addr_list[]; +}; /* HW DATA */ + +/* GDMA_DESTROY_DMA_REGION */ +struct gdma_destroy_dma_region_req { + struct gdma_req_hdr hdr; + + u64 gdma_region; +}; /* HW DATA */ + +int mana_gd_verify_vf_version(struct pci_dev *pdev); + +int mana_gd_register_device(struct gdma_dev *gd); +int mana_gd_deregister_device(struct gdma_dev *gd); + +int mana_gd_post_work_request(struct gdma_queue *wq, + const struct gdma_wqe_request *wqe_req, + struct gdma_posted_wqe_info *wqe_info); + +int mana_gd_post_and_ring(struct gdma_queue *queue, + const struct gdma_wqe_request *wqe, + struct gdma_posted_wqe_info *wqe_info); + +int mana_gd_alloc_res_map(u32 res_avail, struct gdma_resource *r); +void mana_gd_free_res_map(struct gdma_resource *r); + +void mana_gd_wq_ring_doorbell(struct gdma_context *gc, + struct gdma_queue *queue); + +int mana_gd_alloc_memory(struct gdma_context *gc, unsigned int length, + struct gdma_mem_info *gmi); + +void mana_gd_free_memory(struct gdma_mem_info *gmi); + +int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, + u32 resp_len, void *resp); +#endif /* _GDMA_H */ diff --git a/include/net/mana/hw_channel.h b/include/net/mana/hw_channel.h new file mode 100644 index 000000000000..6a757a6e2732 --- /dev/null +++ b/include/net/mana/hw_channel.h @@ -0,0 +1,195 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2021, Microsoft Corporation. */ + +#ifndef _HW_CHANNEL_H +#define _HW_CHANNEL_H + +#define DEFAULT_LOG2_THROTTLING_FOR_ERROR_EQ 4 + +#define HW_CHANNEL_MAX_REQUEST_SIZE 0x1000 +#define HW_CHANNEL_MAX_RESPONSE_SIZE 0x1000 + +#define HW_CHANNEL_VF_BOOTSTRAP_QUEUE_DEPTH 1 + +#define HWC_INIT_DATA_CQID 1 +#define HWC_INIT_DATA_RQID 2 +#define HWC_INIT_DATA_SQID 3 +#define HWC_INIT_DATA_QUEUE_DEPTH 4 +#define HWC_INIT_DATA_MAX_REQUEST 5 +#define HWC_INIT_DATA_MAX_RESPONSE 6 +#define HWC_INIT_DATA_MAX_NUM_CQS 7 +#define HWC_INIT_DATA_PDID 8 +#define HWC_INIT_DATA_GPA_MKEY 9 +#define HWC_INIT_DATA_PF_DEST_RQ_ID 10 +#define HWC_INIT_DATA_PF_DEST_CQ_ID 11 + +/* Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. 
+ */ + +union hwc_init_eq_id_db { + u32 as_uint32; + + struct { + u32 eq_id : 16; + u32 doorbell : 16; + }; +}; /* HW DATA */ + +union hwc_init_type_data { + u32 as_uint32; + + struct { + u32 value : 24; + u32 type : 8; + }; +}; /* HW DATA */ + +struct hwc_rx_oob { + u32 type : 6; + u32 eom : 1; + u32 som : 1; + u32 vendor_err : 8; + u32 reserved1 : 16; + + u32 src_virt_wq : 24; + u32 src_vfid : 8; + + u32 reserved2; + + union { + u32 wqe_addr_low; + u32 wqe_offset; + }; + + u32 wqe_addr_high; + + u32 client_data_unit : 14; + u32 reserved3 : 18; + + u32 tx_oob_data_size; + + u32 chunk_offset : 21; + u32 reserved4 : 11; +}; /* HW DATA */ + +struct hwc_tx_oob { + u32 reserved1; + + u32 reserved2; + + u32 vrq_id : 24; + u32 dest_vfid : 8; + + u32 vrcq_id : 24; + u32 reserved3 : 8; + + u32 vscq_id : 24; + u32 loopback : 1; + u32 lso_override: 1; + u32 dest_pf : 1; + u32 reserved4 : 5; + + u32 vsq_id : 24; + u32 reserved5 : 8; +}; /* HW DATA */ + +struct hwc_work_request { + void *buf_va; + void *buf_sge_addr; + u32 buf_len; + u32 msg_size; + + struct gdma_wqe_request wqe_req; + struct hwc_tx_oob tx_oob; + + struct gdma_sge sge; +}; + +/* hwc_dma_buf represents the array of in-flight WQEs. + * mem_info as know as the GDMA mapped memory is partitioned and used by + * in-flight WQEs. + * The number of WQEs is determined by the number of in-flight messages. + */ +struct hwc_dma_buf { + struct gdma_mem_info mem_info; + + u32 gpa_mkey; + + u32 num_reqs; + struct hwc_work_request reqs[]; +}; + +typedef void hwc_rx_event_handler_t(void *ctx, u32 gdma_rxq_id, + const struct hwc_rx_oob *rx_oob); + +typedef void hwc_tx_event_handler_t(void *ctx, u32 gdma_txq_id, + const struct hwc_rx_oob *rx_oob); + +struct hwc_cq { + struct hw_channel_context *hwc; + + struct gdma_queue *gdma_cq; + struct gdma_queue *gdma_eq; + struct gdma_comp *comp_buf; + u16 queue_depth; + + hwc_rx_event_handler_t *rx_event_handler; + void *rx_event_ctx; + + hwc_tx_event_handler_t *tx_event_handler; + void *tx_event_ctx; +}; + +struct hwc_wq { + struct hw_channel_context *hwc; + + struct gdma_queue *gdma_wq; + struct hwc_dma_buf *msg_buf; + u16 queue_depth; + + struct hwc_cq *hwc_cq; +}; + +struct hwc_caller_ctx { + struct completion comp_event; + void *output_buf; + u32 output_buflen; + + u32 error; /* Linux error code */ + u32 status_code; +}; + +struct hw_channel_context { + struct gdma_dev *gdma_dev; + struct device *dev; + + u16 num_inflight_msg; + u32 max_req_msg_size; + + u16 hwc_init_q_depth_max; + u32 hwc_init_max_req_msg_size; + u32 hwc_init_max_resp_msg_size; + + struct completion hwc_init_eqe_comp; + + struct hwc_wq *rxq; + struct hwc_wq *txq; + struct hwc_cq *cq; + + struct semaphore sema; + struct gdma_resource inflight_msg_res; + + u32 pf_dest_vrq_id; + u32 pf_dest_vrcq_id; + + struct hwc_caller_ctx *caller_ctx; +}; + +int mana_hwc_create_channel(struct gdma_context *gc); +void mana_hwc_destroy_channel(struct gdma_context *gc); + +int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len, + const void *req, u32 resp_len, void *resp); + +#endif /* _HW_CHANNEL_H */ diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h new file mode 100644 index 000000000000..6e9e86fb4c02 --- /dev/null +++ b/include/net/mana/mana.h @@ -0,0 +1,650 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2021, Microsoft Corporation. 
*/ + +#ifndef _MANA_H +#define _MANA_H + +#include "gdma.h" +#include "hw_channel.h" + +/* Microsoft Azure Network Adapter (MANA)'s definitions + * + * Structures labeled with "HW DATA" are exchanged with the hardware. All of + * them are naturally aligned and hence don't need __packed. + */ + +/* MANA protocol version */ +#define MANA_MAJOR_VERSION 0 +#define MANA_MINOR_VERSION 1 +#define MANA_MICRO_VERSION 1 + +typedef u64 mana_handle_t; +#define INVALID_MANA_HANDLE ((mana_handle_t)-1) + +enum TRI_STATE { + TRI_STATE_UNKNOWN = -1, + TRI_STATE_FALSE = 0, + TRI_STATE_TRUE = 1 +}; + +/* Number of entries for hardware indirection table must be in power of 2 */ +#define MANA_INDIRECT_TABLE_SIZE 64 +#define MANA_INDIRECT_TABLE_MASK (MANA_INDIRECT_TABLE_SIZE - 1) + +/* The Toeplitz hash key's length in bytes: should be multiple of 8 */ +#define MANA_HASH_KEY_SIZE 40 + +#define COMP_ENTRY_SIZE 64 + +#define ADAPTER_MTU_SIZE 1500 +#define MAX_FRAME_SIZE (ADAPTER_MTU_SIZE + 14) + +#define RX_BUFFERS_PER_QUEUE 512 + +#define MAX_SEND_BUFFERS_PER_QUEUE 256 + +#define EQ_SIZE (8 * PAGE_SIZE) +#define LOG2_EQ_THROTTLE 3 + +#define MAX_PORTS_IN_MANA_DEV 256 + +struct mana_stats_rx { + u64 packets; + u64 bytes; + u64 xdp_drop; + u64 xdp_tx; + u64 xdp_redirect; + struct u64_stats_sync syncp; +}; + +struct mana_stats_tx { + u64 packets; + u64 bytes; + u64 xdp_xmit; + struct u64_stats_sync syncp; +}; + +struct mana_txq { + struct gdma_queue *gdma_sq; + + union { + u32 gdma_txq_id; + struct { + u32 reserved1 : 10; + u32 vsq_frame : 14; + u32 reserved2 : 8; + }; + }; + + u16 vp_offset; + + struct net_device *ndev; + + /* The SKBs are sent to the HW and we are waiting for the CQEs. */ + struct sk_buff_head pending_skbs; + struct netdev_queue *net_txq; + + atomic_t pending_sends; + + struct mana_stats_tx stats; +}; + +/* skb data and frags dma mappings */ +struct mana_skb_head { + dma_addr_t dma_handle[MAX_SKB_FRAGS + 1]; + + u32 size[MAX_SKB_FRAGS + 1]; +}; + +#define MANA_HEADROOM sizeof(struct mana_skb_head) + +enum mana_tx_pkt_format { + MANA_SHORT_PKT_FMT = 0, + MANA_LONG_PKT_FMT = 1, +}; + +struct mana_tx_short_oob { + u32 pkt_fmt : 2; + u32 is_outer_ipv4 : 1; + u32 is_outer_ipv6 : 1; + u32 comp_iphdr_csum : 1; + u32 comp_tcp_csum : 1; + u32 comp_udp_csum : 1; + u32 supress_txcqe_gen : 1; + u32 vcq_num : 24; + + u32 trans_off : 10; /* Transport header offset */ + u32 vsq_frame : 14; + u32 short_vp_offset : 8; +}; /* HW DATA */ + +struct mana_tx_long_oob { + u32 is_encap : 1; + u32 inner_is_ipv6 : 1; + u32 inner_tcp_opt : 1; + u32 inject_vlan_pri_tag : 1; + u32 reserved1 : 12; + u32 pcp : 3; /* 802.1Q */ + u32 dei : 1; /* 802.1Q */ + u32 vlan_id : 12; /* 802.1Q */ + + u32 inner_frame_offset : 10; + u32 inner_ip_rel_offset : 6; + u32 long_vp_offset : 12; + u32 reserved2 : 4; + + u32 reserved3; + u32 reserved4; +}; /* HW DATA */ + +struct mana_tx_oob { + struct mana_tx_short_oob s_oob; + struct mana_tx_long_oob l_oob; +}; /* HW DATA */ + +enum mana_cq_type { + MANA_CQ_TYPE_RX, + MANA_CQ_TYPE_TX, +}; + +enum mana_cqe_type { + CQE_INVALID = 0, + CQE_RX_OKAY = 1, + CQE_RX_COALESCED_4 = 2, + CQE_RX_OBJECT_FENCE = 3, + CQE_RX_TRUNCATED = 4, + + CQE_TX_OKAY = 32, + CQE_TX_SA_DROP = 33, + CQE_TX_MTU_DROP = 34, + CQE_TX_INVALID_OOB = 35, + CQE_TX_INVALID_ETH_TYPE = 36, + CQE_TX_HDR_PROCESSING_ERROR = 37, + CQE_TX_VF_DISABLED = 38, + CQE_TX_VPORT_IDX_OUT_OF_RANGE = 39, + CQE_TX_VPORT_DISABLED = 40, + CQE_TX_VLAN_TAGGING_VIOLATION = 41, +}; + +#define MANA_CQE_COMPLETION 1 + +struct mana_cqe_header { + u32 cqe_type : 
6; + u32 client_type : 2; + u32 vendor_err : 24; +}; /* HW DATA */ + +/* NDIS HASH Types */ +#define NDIS_HASH_IPV4 BIT(0) +#define NDIS_HASH_TCP_IPV4 BIT(1) +#define NDIS_HASH_UDP_IPV4 BIT(2) +#define NDIS_HASH_IPV6 BIT(3) +#define NDIS_HASH_TCP_IPV6 BIT(4) +#define NDIS_HASH_UDP_IPV6 BIT(5) +#define NDIS_HASH_IPV6_EX BIT(6) +#define NDIS_HASH_TCP_IPV6_EX BIT(7) +#define NDIS_HASH_UDP_IPV6_EX BIT(8) + +#define MANA_HASH_L3 (NDIS_HASH_IPV4 | NDIS_HASH_IPV6 | NDIS_HASH_IPV6_EX) +#define MANA_HASH_L4 \ + (NDIS_HASH_TCP_IPV4 | NDIS_HASH_UDP_IPV4 | NDIS_HASH_TCP_IPV6 | \ + NDIS_HASH_UDP_IPV6 | NDIS_HASH_TCP_IPV6_EX | NDIS_HASH_UDP_IPV6_EX) + +struct mana_rxcomp_perpkt_info { + u32 pkt_len : 16; + u32 reserved1 : 16; + u32 reserved2; + u32 pkt_hash; +}; /* HW DATA */ + +#define MANA_RXCOMP_OOB_NUM_PPI 4 + +/* Receive completion OOB */ +struct mana_rxcomp_oob { + struct mana_cqe_header cqe_hdr; + + u32 rx_vlan_id : 12; + u32 rx_vlantag_present : 1; + u32 rx_outer_iphdr_csum_succeed : 1; + u32 rx_outer_iphdr_csum_fail : 1; + u32 reserved1 : 1; + u32 rx_hashtype : 9; + u32 rx_iphdr_csum_succeed : 1; + u32 rx_iphdr_csum_fail : 1; + u32 rx_tcp_csum_succeed : 1; + u32 rx_tcp_csum_fail : 1; + u32 rx_udp_csum_succeed : 1; + u32 rx_udp_csum_fail : 1; + u32 reserved2 : 1; + + struct mana_rxcomp_perpkt_info ppi[MANA_RXCOMP_OOB_NUM_PPI]; + + u32 rx_wqe_offset; +}; /* HW DATA */ + +struct mana_tx_comp_oob { + struct mana_cqe_header cqe_hdr; + + u32 tx_data_offset; + + u32 tx_sgl_offset : 5; + u32 tx_wqe_offset : 27; + + u32 reserved[12]; +}; /* HW DATA */ + +struct mana_rxq; + +#define CQE_POLLING_BUFFER 512 + +struct mana_cq { + struct gdma_queue *gdma_cq; + + /* Cache the CQ id (used to verify if each CQE comes to the right CQ. */ + u32 gdma_id; + + /* Type of the CQ: TX or RX */ + enum mana_cq_type type; + + /* Pointer to the mana_rxq that is pushing RX CQEs to the queue. + * Only and must be non-NULL if type is MANA_CQ_TYPE_RX. + */ + struct mana_rxq *rxq; + + /* Pointer to the mana_txq that is pushing TX CQEs to the queue. + * Only and must be non-NULL if type is MANA_CQ_TYPE_TX. + */ + struct mana_txq *txq; + + /* Buffer which the CQ handler can copy the CQE's into. */ + struct gdma_comp gdma_comp_buf[CQE_POLLING_BUFFER]; + + /* NAPI data */ + struct napi_struct napi; + int work_done; + int budget; +}; + +#define GDMA_MAX_RQE_SGES 15 + +struct mana_recv_buf_oob { + /* A valid GDMA work request representing the data buffer. */ + struct gdma_wqe_request wqe_req; + + void *buf_va; + dma_addr_t buf_dma_addr; + + /* SGL of the buffer going to be sent has part of the work request. */ + u32 num_sge; + struct gdma_sge sgl[GDMA_MAX_RQE_SGES]; + + /* Required to store the result of mana_gd_post_work_request. + * gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the + * work queue when the WQE is consumed. 
+ */ + struct gdma_posted_wqe_info wqe_inf; +}; + +struct mana_rxq { + struct gdma_queue *gdma_rq; + /* Cache the gdma receive queue id */ + u32 gdma_id; + + /* Index of RQ in the vPort, not gdma receive queue id */ + u32 rxq_idx; + + u32 datasize; + + mana_handle_t rxobj; + + struct mana_cq rx_cq; + + struct completion fence_event; + + struct net_device *ndev; + + /* Total number of receive buffers to be allocated */ + u32 num_rx_buf; + + u32 buf_index; + + struct mana_stats_rx stats; + + struct bpf_prog __rcu *bpf_prog; + struct xdp_rxq_info xdp_rxq; + struct page *xdp_save_page; + bool xdp_flush; + int xdp_rc; /* XDP redirect return code */ + + /* MUST BE THE LAST MEMBER: + * Each receive buffer has an associated mana_recv_buf_oob. + */ + struct mana_recv_buf_oob rx_oobs[]; +}; + +struct mana_tx_qp { + struct mana_txq txq; + + struct mana_cq tx_cq; + + mana_handle_t tx_object; +}; + +struct mana_ethtool_stats { + u64 stop_queue; + u64 wake_queue; +}; + +struct mana_context { + struct gdma_dev *gdma_dev; + + u16 num_ports; + + struct mana_eq *eqs; + + struct net_device *ports[MAX_PORTS_IN_MANA_DEV]; +}; + +struct mana_port_context { + struct mana_context *ac; + struct net_device *ndev; + + u8 mac_addr[ETH_ALEN]; + + enum TRI_STATE rss_state; + + mana_handle_t default_rxobj; + bool tx_shortform_allowed; + u16 tx_vp_offset; + + struct mana_tx_qp *tx_qp; + + /* Indirection Table for RX & TX. The values are queue indexes */ + u32 indir_table[MANA_INDIRECT_TABLE_SIZE]; + + /* Indirection table containing RxObject Handles */ + mana_handle_t rxobj_table[MANA_INDIRECT_TABLE_SIZE]; + + /* Hash key used by the NIC */ + u8 hashkey[MANA_HASH_KEY_SIZE]; + + /* This points to an array of num_queues of RQ pointers. */ + struct mana_rxq **rxqs; + + struct bpf_prog *bpf_prog; + + /* Create num_queues EQs, SQs, SQ-CQs, RQs and RQ-CQs, respectively. 
*/ + unsigned int max_queues; + unsigned int num_queues; + + mana_handle_t port_handle; + mana_handle_t pf_filter_handle; + + /* Mutex for sharing access to vport_use_count */ + struct mutex vport_mutex; + int vport_use_count; + + u16 port_idx; + + bool port_is_up; + bool port_st_save; /* Saved port state */ + + struct mana_ethtool_stats eth_stats; +}; + +int mana_start_xmit(struct sk_buff *skb, struct net_device *ndev); +int mana_config_rss(struct mana_port_context *ac, enum TRI_STATE rx, + bool update_hash, bool update_tab); + +int mana_alloc_queues(struct net_device *ndev); +int mana_attach(struct net_device *ndev); +int mana_detach(struct net_device *ndev, bool from_close); + +int mana_probe(struct gdma_dev *gd, bool resuming); +void mana_remove(struct gdma_dev *gd, bool suspending); + +void mana_xdp_tx(struct sk_buff *skb, struct net_device *ndev); +int mana_xdp_xmit(struct net_device *ndev, int n, struct xdp_frame **frames, + u32 flags); +u32 mana_run_xdp(struct net_device *ndev, struct mana_rxq *rxq, + struct xdp_buff *xdp, void *buf_va, uint pkt_len); +struct bpf_prog *mana_xdp_get(struct mana_port_context *apc); +void mana_chn_setxdp(struct mana_port_context *apc, struct bpf_prog *prog); +int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); + +extern const struct ethtool_ops mana_ethtool_ops; + +struct mana_obj_spec { + u32 queue_index; + u64 gdma_region; + u32 queue_size; + u32 attached_eq; + u32 modr_ctx_id; +}; + +enum mana_command_code { + MANA_QUERY_DEV_CONFIG = 0x20001, + MANA_QUERY_GF_STAT = 0x20002, + MANA_CONFIG_VPORT_TX = 0x20003, + MANA_CREATE_WQ_OBJ = 0x20004, + MANA_DESTROY_WQ_OBJ = 0x20005, + MANA_FENCE_RQ = 0x20006, + MANA_CONFIG_VPORT_RX = 0x20007, + MANA_QUERY_VPORT_CONFIG = 0x20008, + + /* Privileged commands for the PF mode */ + MANA_REGISTER_FILTER = 0x28000, + MANA_DEREGISTER_FILTER = 0x28001, + MANA_REGISTER_HW_PORT = 0x28003, + MANA_DEREGISTER_HW_PORT = 0x28004, +}; + +/* Query Device Configuration */ +struct mana_query_device_cfg_req { + struct gdma_req_hdr hdr; + + /* MANA Nic Driver Capability flags */ + u64 mn_drv_cap_flags1; + u64 mn_drv_cap_flags2; + u64 mn_drv_cap_flags3; + u64 mn_drv_cap_flags4; + + u32 proto_major_ver; + u32 proto_minor_ver; + u32 proto_micro_ver; + + u32 reserved; +}; /* HW DATA */ + +struct mana_query_device_cfg_resp { + struct gdma_resp_hdr hdr; + + u64 pf_cap_flags1; + u64 pf_cap_flags2; + u64 pf_cap_flags3; + u64 pf_cap_flags4; + + u16 max_num_vports; + u16 reserved; + u32 max_num_eqs; +}; /* HW DATA */ + +/* Query vPort Configuration */ +struct mana_query_vport_cfg_req { + struct gdma_req_hdr hdr; + u32 vport_index; +}; /* HW DATA */ + +struct mana_query_vport_cfg_resp { + struct gdma_resp_hdr hdr; + u32 max_num_sq; + u32 max_num_rq; + u32 num_indirection_ent; + u32 reserved1; + u8 mac_addr[6]; + u8 reserved2[2]; + mana_handle_t vport; +}; /* HW DATA */ + +/* Configure vPort */ +struct mana_config_vport_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + u32 pdid; + u32 doorbell_pageid; +}; /* HW DATA */ + +struct mana_config_vport_resp { + struct gdma_resp_hdr hdr; + u16 tx_vport_offset; + u8 short_form_allowed; + u8 reserved; +}; /* HW DATA */ + +/* Create WQ Object */ +struct mana_create_wqobj_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + u32 wq_type; + u32 reserved; + u64 wq_gdma_region; + u64 cq_gdma_region; + u32 wq_size; + u32 cq_size; + u32 cq_moderation_ctx_id; + u32 cq_parent_qid; +}; /* HW DATA */ + +struct mana_create_wqobj_resp { + struct gdma_resp_hdr hdr; + u32 wq_id; + u32 cq_id; + 
mana_handle_t wq_obj; +}; /* HW DATA */ + +/* Destroy WQ Object */ +struct mana_destroy_wqobj_req { + struct gdma_req_hdr hdr; + u32 wq_type; + u32 reserved; + mana_handle_t wq_obj_handle; +}; /* HW DATA */ + +struct mana_destroy_wqobj_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Fence RQ */ +struct mana_fence_rq_req { + struct gdma_req_hdr hdr; + mana_handle_t wq_obj_handle; +}; /* HW DATA */ + +struct mana_fence_rq_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Configure vPort Rx Steering */ +struct mana_cfg_rx_steer_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + u16 num_indir_entries; + u16 indir_tab_offset; + u32 rx_enable; + u32 rss_enable; + u8 update_default_rxobj; + u8 update_hashkey; + u8 update_indir_tab; + u8 reserved; + mana_handle_t default_rxobj; + u8 hashkey[MANA_HASH_KEY_SIZE]; +}; /* HW DATA */ + +struct mana_cfg_rx_steer_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Register HW vPort */ +struct mana_register_hw_vport_req { + struct gdma_req_hdr hdr; + u16 attached_gfid; + u8 is_pf_default_vport; + u8 reserved1; + u8 allow_all_ether_types; + u8 reserved2; + u8 reserved3; + u8 reserved4; +}; /* HW DATA */ + +struct mana_register_hw_vport_resp { + struct gdma_resp_hdr hdr; + mana_handle_t hw_vport_handle; +}; /* HW DATA */ + +/* Deregister HW vPort */ +struct mana_deregister_hw_vport_req { + struct gdma_req_hdr hdr; + mana_handle_t hw_vport_handle; +}; /* HW DATA */ + +struct mana_deregister_hw_vport_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +/* Register filter */ +struct mana_register_filter_req { + struct gdma_req_hdr hdr; + mana_handle_t vport; + u8 mac_addr[6]; + u8 reserved1; + u8 reserved2; + u8 reserved3; + u8 reserved4; + u16 reserved5; + u32 reserved6; + u32 reserved7; + u32 reserved8; +}; /* HW DATA */ + +struct mana_register_filter_resp { + struct gdma_resp_hdr hdr; + mana_handle_t filter_handle; +}; /* HW DATA */ + +/* Deregister filter */ +struct mana_deregister_filter_req { + struct gdma_req_hdr hdr; + mana_handle_t filter_handle; +}; /* HW DATA */ + +struct mana_deregister_filter_resp { + struct gdma_resp_hdr hdr; +}; /* HW DATA */ + +#define MANA_MAX_NUM_QUEUES 64 + +#define MANA_SHORT_VPORT_OFFSET_MAX ((1U << 8) - 1) + +struct mana_tx_package { + struct gdma_wqe_request wqe_req; + struct gdma_sge sgl_array[5]; + struct gdma_sge *sgl_ptr; + + struct mana_tx_oob tx_oob; + + struct gdma_posted_wqe_info wqe_info; +}; + +int mana_create_wq_obj(struct mana_port_context *apc, + mana_handle_t vport, + u32 wq_type, struct mana_obj_spec *wq_spec, + struct mana_obj_spec *cq_spec, + mana_handle_t *wq_obj); + +void mana_destroy_wq_obj(struct mana_port_context *apc, u32 wq_type, + mana_handle_t wq_obj); + +int mana_cfg_vport(struct mana_port_context *apc, u32 protection_dom_id, + u32 doorbell_pg_id); +void mana_uncfg_vport(struct mana_port_context *apc); +#endif /* _MANA_H */ diff --git a/include/net/mana/mana_auxiliary.h b/include/net/mana/mana_auxiliary.h new file mode 100644 index 000000000000..373d59756846 --- /dev/null +++ b/include/net/mana/mana_auxiliary.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright (c) 2022, Microsoft Corporation. 
*/ + +#include "mana.h" +#include <linux/auxiliary_bus.h> + +struct mana_adev { + struct auxiliary_device adev; + struct gdma_dev *mdev; +}; diff --git a/include/net/mana/shm_channel.h b/include/net/mana/shm_channel.h new file mode 100644 index 000000000000..5199b41497ff --- /dev/null +++ b/include/net/mana/shm_channel.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* Copyright (c) 2021, Microsoft Corporation. */ + +#ifndef _SHM_CHANNEL_H +#define _SHM_CHANNEL_H + +struct shm_channel { + struct device *dev; + void __iomem *base; +}; + +void mana_smc_init(struct shm_channel *sc, struct device *dev, + void __iomem *base); + +int mana_smc_setup_hwc(struct shm_channel *sc, bool reset_vf, u64 eq_addr, + u64 cq_addr, u64 rq_addr, u64 sq_addr, + u32 eq_msix_index); + +int mana_smc_teardown_hwc(struct shm_channel *sc, bool reset_vf); + +#endif /* _SHM_CHANNEL_H */ -- cgit v1.2.3 From aa56549792fb348892fbbae67f6f0c71bb750b65 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 3 Nov 2022 12:16:26 -0700 Subject: net: mana: Define max values for SGL entries The maximum number of SGL entries should be computed from the maximum WQE size for the intended queue type and the corresponding OOB data size. This guarantees the hardware queue can successfully queue requests up to the queue depth exposed to the upper layer. With GDMA_MAX_SQE_SIZE of 512, GDMA_MAX_RQE_SIZE of 256, INLINE_OOB_SMALL_SIZE of 8 and a 16-byte struct gdma_sge, the new macros evaluate to (512 - 16 - 8) / 16 = 30 and (256 - 16) / 16 = 15 entries, matching the literal limits they replace. Reviewed-by: Dexuan Cui Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-9-git-send-email-longli@linuxonhyperv.com Acked-by: Haiyang Zhang Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/microsoft/mana/mana_en.c | 2 +- include/net/mana/gdma.h | 7 +++++++ include/net/mana/mana.h | 4 +--- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index ffa2a0e2c213..f6bcd0cc6cda 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -189,7 +189,7 @@ int mana_start_xmit(struct sk_buff *skb, struct net_device *ndev) pkg.wqe_req.client_data_unit = 0; pkg.wqe_req.num_sge = 1 + skb_shinfo(skb)->nr_frags; - WARN_ON_ONCE(pkg.wqe_req.num_sge > 30); + WARN_ON_ONCE(pkg.wqe_req.num_sge > MAX_TX_WQE_SGL_ENTRIES); if (pkg.wqe_req.num_sge <= ARRAY_SIZE(pkg.sgl_array)) { pkg.wqe_req.sgl = pkg.sgl_array; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 72eaec2470c0..0b0c0cd0b6bd 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -427,6 +427,13 @@ struct gdma_wqe { #define MAX_TX_WQE_SIZE 512 #define MAX_RX_WQE_SIZE 256 +#define MAX_TX_WQE_SGL_ENTRIES ((GDMA_MAX_SQE_SIZE - \ + sizeof(struct gdma_sge) - INLINE_OOB_SMALL_SIZE) / \ + sizeof(struct gdma_sge)) + +#define MAX_RX_WQE_SGL_ENTRIES ((GDMA_MAX_RQE_SIZE - \ + sizeof(struct gdma_sge)) / sizeof(struct gdma_sge)) + struct gdma_cqe { u32 cqe_data[GDMA_COMP_DATA_SIZE / 4]; diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 6e9e86fb4c02..713a8f8cca9a 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -265,8 +265,6 @@ struct mana_cq { int budget; }; -#define GDMA_MAX_RQE_SGES 15 - struct mana_recv_buf_oob { /* A valid GDMA work request representing the data buffer. */ struct gdma_wqe_request wqe_req; @@ -276,7 +274,7 @@ struct mana_recv_buf_oob { /* SGL of the buffer going to be sent has part of the work request. */ u32 num_sge; - struct gdma_sge sgl[GDMA_MAX_RQE_SGES]; + struct gdma_sge sgl[MAX_RX_WQE_SGL_ENTRIES]; /* Required to store the result of mana_gd_post_work_request.
* gdma_posted_wqe_info.wqe_size_in_bu is required for progressing the -- cgit v1.2.3 From de372f2a9ca7ada2698ecac7df8f02407cd98fa0 Mon Sep 17 00:00:00 2001 From: Ajay Sharma Date: Thu, 3 Nov 2022 12:16:27 -0700 Subject: net: mana: Define and process GDMA response code GDMA_STATUS_MORE_ENTRIES When doing memory registration, the PF may respond with GDMA_STATUS_MORE_ENTRIES to indicate a follow-up request is needed. This is not an error and should be processed as expected. Signed-off-by: Ajay Sharma Reviewed-by: Dexuan Cui Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-10-git-send-email-longli@linuxonhyperv.com Acked-by: Haiyang Zhang Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/microsoft/mana/hw_channel.c | 2 +- include/net/mana/gdma.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/microsoft/mana/hw_channel.c b/drivers/net/ethernet/microsoft/mana/hw_channel.c index 76829ab43d40..9d1507eba5b9 100644 --- a/drivers/net/ethernet/microsoft/mana/hw_channel.c +++ b/drivers/net/ethernet/microsoft/mana/hw_channel.c @@ -836,7 +836,7 @@ int mana_hwc_send_request(struct hw_channel_context *hwc, u32 req_len, goto out; } - if (ctx->status_code) { + if (ctx->status_code && ctx->status_code != GDMA_STATUS_MORE_ENTRIES) { dev_err(hwc->dev, "HWC: Failed hw_channel req: 0x%x\n", ctx->status_code); err = -EPROTO; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 0b0c0cd0b6bd..a9b7930dfbf8 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -9,6 +9,8 @@ #include "shm_channel.h" +#define GDMA_STATUS_MORE_ENTRIES 0x00000105 + /* Structures labeled with "HW DATA" are exchanged with the hardware. All of * them are naturally aligned and hence don't need __packed. */ -- cgit v1.2.3 From f72ececfc197e9b0bbb5595294908a950cf444fa Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 3 Nov 2022 12:16:28 -0700 Subject: net: mana: Define data structures for allocating doorbell page from GDMA The RDMA device needs to allocate doorbell pages for each user context. Define the GDMA data structures for use by the RDMA driver.
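A note on the GDMA_STATUS_MORE_ENTRIES change above: the status comes into play when a DMA region's page array does not fit into a single hardware-channel message. The sketch below shows roughly how such a region could be created, using only the structures and helpers already declared in gdma.h at this point in the series; the function itself is illustrative and not part of these patches, and the follow-up chunks (sent via GDMA_DMA_REGION_ADD_PAGES) are omitted for brevity.

#include <linux/slab.h>
#include <net/mana/gdma.h>

/* Illustrative only: create a DMA region whose page list may exceed one
 * HWC message. kzalloc() zeroes offset_in_page and the reserved fields.
 */
static int example_create_dma_region(struct gdma_context *gc,
				     const u64 *pages, u32 page_count,
				     u32 pages_per_msg, u64 *region)
{
	struct gdma_create_dma_region_resp resp = {};
	struct gdma_create_dma_region_req *req;
	u32 first = page_count < pages_per_msg ? page_count : pages_per_msg;
	u32 req_size = struct_size(req, page_addr_list, first);
	int err;

	req = kzalloc(req_size, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	mana_gd_init_req_hdr(&req->hdr, GDMA_CREATE_DMA_REGION, req_size,
			     sizeof(resp));
	req->length = (u64)page_count * PAGE_SIZE;
	req->gdma_page_type = GDMA_PAGE_TYPE_4K;
	req->page_count = page_count;
	req->page_addr_list_len = first;
	memcpy(req->page_addr_list, pages, first * sizeof(u64));

	err = mana_gd_send_request(gc, req_size, req, sizeof(resp), &resp);
	kfree(req);
	if (err)
		return err;

	/* While chunks remain outstanding, GDMA_STATUS_MORE_ENTRIES is
	 * expected and is not a failure. Any other non-zero status is.
	 */
	if (resp.hdr.status && resp.hdr.status != GDMA_STATUS_MORE_ENTRIES)
		return -EPROTO;

	*region = resp.gdma_region;
	return 0;
}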
Reviewed-by: Dexuan Cui Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-11-git-send-email-longli@linuxonhyperv.com Acked-by: Haiyang Zhang Signed-off-by: Leon Romanovsky --- include/net/mana/gdma.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'include/net') diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index a9b7930dfbf8..055408a5baf3 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -24,11 +24,15 @@ enum gdma_request_type { GDMA_GENERATE_TEST_EQE = 10, GDMA_CREATE_QUEUE = 12, GDMA_DISABLE_QUEUE = 13, + GDMA_ALLOCATE_RESOURCE_RANGE = 22, + GDMA_DESTROY_RESOURCE_RANGE = 24, GDMA_CREATE_DMA_REGION = 25, GDMA_DMA_REGION_ADD_PAGES = 26, GDMA_DESTROY_DMA_REGION = 27, }; +#define GDMA_RESOURCE_DOORBELL_PAGE 27 + enum gdma_queue_type { GDMA_INVALID_QUEUE, GDMA_SQ, @@ -587,6 +591,26 @@ struct gdma_register_device_resp { u32 db_id; }; /* HW DATA */ +struct gdma_allocate_resource_range_req { + struct gdma_req_hdr hdr; + u32 resource_type; + u32 num_resources; + u32 alignment; + u32 allocated_resources; +}; + +struct gdma_allocate_resource_range_resp { + struct gdma_resp_hdr hdr; + u32 allocated_resources; +}; + +struct gdma_destroy_resource_range_req { + struct gdma_req_hdr hdr; + u32 resource_type; + u32 num_resources; + u32 allocated_resources; +}; + /* GDMA_CREATE_QUEUE */ struct gdma_create_queue_req { struct gdma_req_hdr hdr; -- cgit v1.2.3 From 28c66cfa45388af1126985d1114e0ed762eb2abd Mon Sep 17 00:00:00 2001 From: Ajay Sharma Date: Thu, 3 Nov 2022 12:16:29 -0700 Subject: net: mana: Define data structures for protection domain and memory registration The MANA hardware supports protection domains and memory registration for use in an RDMA environment. Add those definitions and expose them for use by the RDMA driver.
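On the doorbell-page patch above: GDMA_ALLOCATE_RESOURCE_RANGE with GDMA_RESOURCE_DOORBELL_PAGE is how the RDMA driver is expected to claim doorbell pages for user contexts. A minimal sketch of a caller follows; the helper name and the alignment value are assumptions for illustration, not code from this series.

#include <net/mana/gdma.h>

/* Illustrative only: ask the PF for a contiguous range of doorbell pages
 * and return the index of the first page in the range.
 */
static int example_alloc_doorbell_pages(struct gdma_context *gc,
					u32 num_pages, u32 *first_page)
{
	struct gdma_allocate_resource_range_resp resp = {};
	struct gdma_allocate_resource_range_req req = {};
	int err;

	mana_gd_init_req_hdr(&req.hdr, GDMA_ALLOCATE_RESOURCE_RANGE,
			     sizeof(req), sizeof(resp));
	req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE;
	req.num_resources = num_pages;
	req.alignment = 1;	/* assumed: no special alignment required */

	err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
	if (err || resp.hdr.status)
		return err ? err : -EPROTO;

	*first_page = resp.allocated_resources;
	return 0;
}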
Signed-off-by: Ajay Sharma Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-12-git-send-email-longli@linuxonhyperv.com Reviewed-by: Dexuan Cui Acked-by: Haiyang Zhang Signed-off-by: Leon Romanovsky --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 27 ++++-- drivers/net/ethernet/microsoft/mana/mana_en.c | 18 ++-- include/net/mana/gdma.h | 121 +++++++++++++++++++++++- 3 files changed, 143 insertions(+), 23 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 69795bc679e7..46a7d1e6ece9 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -198,7 +198,7 @@ static int mana_gd_create_hw_eq(struct gdma_context *gc, req.type = queue->type; req.pdid = queue->gdma_dev->pdid; req.doolbell_id = queue->gdma_dev->doorbell; - req.gdma_region = queue->mem_info.gdma_region; + req.gdma_region = queue->mem_info.dma_region_handle; req.queue_size = queue->queue_size; req.log2_throttle_limit = queue->eq.log2_throttle_limit; req.eq_pci_msix_index = queue->eq.msix_index; @@ -212,7 +212,7 @@ static int mana_gd_create_hw_eq(struct gdma_context *gc, queue->id = resp.queue_index; queue->eq.disable_needed = true; - queue->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; + queue->mem_info.dma_region_handle = GDMA_INVALID_DMA_REGION; return 0; } @@ -671,24 +671,30 @@ free_q: return err; } -static void mana_gd_destroy_dma_region(struct gdma_context *gc, u64 gdma_region) +int mana_gd_destroy_dma_region(struct gdma_context *gc, + gdma_obj_handle_t dma_region_handle) { struct gdma_destroy_dma_region_req req = {}; struct gdma_general_resp resp = {}; int err; - if (gdma_region == GDMA_INVALID_DMA_REGION) - return; + if (dma_region_handle == GDMA_INVALID_DMA_REGION) + return 0; mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_DMA_REGION, sizeof(req), sizeof(resp)); - req.gdma_region = gdma_region; + req.dma_region_handle = dma_region_handle; err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); - if (err || resp.hdr.status) + if (err || resp.hdr.status) { dev_err(gc->dev, "Failed to destroy DMA region: %d, 0x%x\n", err, resp.hdr.status); + return -EPROTO; + } + + return 0; } +EXPORT_SYMBOL_NS(mana_gd_destroy_dma_region, NET_MANA); static int mana_gd_create_dma_region(struct gdma_dev *gd, struct gdma_mem_info *gmi) @@ -733,14 +739,15 @@ static int mana_gd_create_dma_region(struct gdma_dev *gd, if (err) goto out; - if (resp.hdr.status || resp.gdma_region == GDMA_INVALID_DMA_REGION) { + if (resp.hdr.status || + resp.dma_region_handle == GDMA_INVALID_DMA_REGION) { dev_err(gc->dev, "Failed to create DMA region: 0x%x\n", resp.hdr.status); err = -EPROTO; goto out; } - gmi->gdma_region = resp.gdma_region; + gmi->dma_region_handle = resp.dma_region_handle; out: kfree(req); return err; @@ -863,7 +870,7 @@ void mana_gd_destroy_queue(struct gdma_context *gc, struct gdma_queue *queue) return; } - mana_gd_destroy_dma_region(gc, gmi->gdma_region); + mana_gd_destroy_dma_region(gc, gmi->dma_region_handle); mana_gd_free_memory(gmi); kfree(queue); } diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c index f6bcd0cc6cda..1c59502d34b5 100644 --- a/drivers/net/ethernet/microsoft/mana/mana_en.c +++ b/drivers/net/ethernet/microsoft/mana/mana_en.c @@ -1523,10 +1523,10 @@ static int mana_create_txq(struct mana_port_context *apc, memset(&wq_spec, 0, sizeof(wq_spec)); memset(&cq_spec, 0, 
sizeof(cq_spec)); - wq_spec.gdma_region = txq->gdma_sq->mem_info.gdma_region; + wq_spec.gdma_region = txq->gdma_sq->mem_info.dma_region_handle; wq_spec.queue_size = txq->gdma_sq->queue_size; - cq_spec.gdma_region = cq->gdma_cq->mem_info.gdma_region; + cq_spec.gdma_region = cq->gdma_cq->mem_info.dma_region_handle; cq_spec.queue_size = cq->gdma_cq->queue_size; cq_spec.modr_ctx_id = 0; cq_spec.attached_eq = cq->gdma_cq->cq.parent->id; @@ -1541,8 +1541,10 @@ static int mana_create_txq(struct mana_port_context *apc, txq->gdma_sq->id = wq_spec.queue_index; cq->gdma_cq->id = cq_spec.queue_index; - txq->gdma_sq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; - cq->gdma_cq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; + txq->gdma_sq->mem_info.dma_region_handle = + GDMA_INVALID_DMA_REGION; + cq->gdma_cq->mem_info.dma_region_handle = + GDMA_INVALID_DMA_REGION; txq->gdma_txq_id = txq->gdma_sq->id; @@ -1753,10 +1755,10 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, memset(&wq_spec, 0, sizeof(wq_spec)); memset(&cq_spec, 0, sizeof(cq_spec)); - wq_spec.gdma_region = rxq->gdma_rq->mem_info.gdma_region; + wq_spec.gdma_region = rxq->gdma_rq->mem_info.dma_region_handle; wq_spec.queue_size = rxq->gdma_rq->queue_size; - cq_spec.gdma_region = cq->gdma_cq->mem_info.gdma_region; + cq_spec.gdma_region = cq->gdma_cq->mem_info.dma_region_handle; cq_spec.queue_size = cq->gdma_cq->queue_size; cq_spec.modr_ctx_id = 0; cq_spec.attached_eq = cq->gdma_cq->cq.parent->id; @@ -1769,8 +1771,8 @@ static struct mana_rxq *mana_create_rxq(struct mana_port_context *apc, rxq->gdma_rq->id = wq_spec.queue_index; cq->gdma_cq->id = cq_spec.queue_index; - rxq->gdma_rq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; - cq->gdma_cq->mem_info.gdma_region = GDMA_INVALID_DMA_REGION; + rxq->gdma_rq->mem_info.dma_region_handle = GDMA_INVALID_DMA_REGION; + cq->gdma_cq->mem_info.dma_region_handle = GDMA_INVALID_DMA_REGION; rxq->gdma_id = rxq->gdma_rq->id; cq->gdma_id = cq->gdma_cq->id; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 055408a5baf3..221adc96340c 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -29,6 +29,10 @@ enum gdma_request_type { GDMA_CREATE_DMA_REGION = 25, GDMA_DMA_REGION_ADD_PAGES = 26, GDMA_DESTROY_DMA_REGION = 27, + GDMA_CREATE_PD = 29, + GDMA_DESTROY_PD = 30, + GDMA_CREATE_MR = 31, + GDMA_DESTROY_MR = 32, }; #define GDMA_RESOURCE_DOORBELL_PAGE 27 @@ -61,6 +65,8 @@ enum { GDMA_DEVICE_MANA = 2, }; +typedef u64 gdma_obj_handle_t; + struct gdma_resource { /* Protect the bitmap */ spinlock_t lock; @@ -194,7 +200,7 @@ struct gdma_mem_info { u64 length; /* Allocated by the PF driver */ - u64 gdma_region; + gdma_obj_handle_t dma_region_handle; }; #define REGISTER_ATB_MST_MKEY_LOWER_SIZE 8 @@ -618,7 +624,7 @@ struct gdma_create_queue_req { u32 reserved1; u32 pdid; u32 doolbell_id; - u64 gdma_region; + gdma_obj_handle_t gdma_region; u32 reserved2; u32 queue_size; u32 log2_throttle_limit; @@ -645,6 +651,28 @@ struct gdma_disable_queue_req { u32 alloc_res_id_on_creation; }; /* HW DATA */ +enum atb_page_size { + ATB_PAGE_SIZE_4K, + ATB_PAGE_SIZE_8K, + ATB_PAGE_SIZE_16K, + ATB_PAGE_SIZE_32K, + ATB_PAGE_SIZE_64K, + ATB_PAGE_SIZE_128K, + ATB_PAGE_SIZE_256K, + ATB_PAGE_SIZE_512K, + ATB_PAGE_SIZE_1M, + ATB_PAGE_SIZE_2M, + ATB_PAGE_SIZE_MAX, +}; + +enum gdma_mr_access_flags { + GDMA_ACCESS_FLAG_LOCAL_READ = BIT_ULL(0), + GDMA_ACCESS_FLAG_LOCAL_WRITE = BIT_ULL(1), + GDMA_ACCESS_FLAG_REMOTE_READ = BIT_ULL(2), + GDMA_ACCESS_FLAG_REMOTE_WRITE = BIT_ULL(3), + 
GDMA_ACCESS_FLAG_REMOTE_ATOMIC = BIT_ULL(4), +}; + /* GDMA_CREATE_DMA_REGION */ struct gdma_create_dma_region_req { struct gdma_req_hdr hdr; @@ -671,14 +699,14 @@ struct gdma_create_dma_region_req { struct gdma_create_dma_region_resp { struct gdma_resp_hdr hdr; - u64 gdma_region; + gdma_obj_handle_t dma_region_handle; }; /* HW DATA */ /* GDMA_DMA_REGION_ADD_PAGES */ struct gdma_dma_region_add_pages_req { struct gdma_req_hdr hdr; - u64 gdma_region; + gdma_obj_handle_t dma_region_handle; u32 page_addr_list_len; u32 reserved3; @@ -690,9 +718,88 @@ struct gdma_dma_region_add_pages_req { struct gdma_destroy_dma_region_req { struct gdma_req_hdr hdr; - u64 gdma_region; + gdma_obj_handle_t dma_region_handle; }; /* HW DATA */ +enum gdma_pd_flags { + GDMA_PD_FLAG_INVALID = 0, +}; + +struct gdma_create_pd_req { + struct gdma_req_hdr hdr; + enum gdma_pd_flags flags; + u32 reserved; +};/* HW DATA */ + +struct gdma_create_pd_resp { + struct gdma_resp_hdr hdr; + gdma_obj_handle_t pd_handle; + u32 pd_id; + u32 reserved; +};/* HW DATA */ + +struct gdma_destroy_pd_req { + struct gdma_req_hdr hdr; + gdma_obj_handle_t pd_handle; +};/* HW DATA */ + +struct gdma_destory_pd_resp { + struct gdma_resp_hdr hdr; +};/* HW DATA */ + +enum gdma_mr_type { + /* Guest Virtual Address - MRs of this type allow access + * to memory mapped by PTEs associated with this MR using a virtual + * address that is set up in the MST + */ + GDMA_MR_TYPE_GVA = 2, +}; + +struct gdma_create_mr_params { + gdma_obj_handle_t pd_handle; + enum gdma_mr_type mr_type; + union { + struct { + gdma_obj_handle_t dma_region_handle; + u64 virtual_address; + enum gdma_mr_access_flags access_flags; + } gva; + }; +}; + +struct gdma_create_mr_request { + struct gdma_req_hdr hdr; + gdma_obj_handle_t pd_handle; + enum gdma_mr_type mr_type; + u32 reserved_1; + + union { + struct { + gdma_obj_handle_t dma_region_handle; + u64 virtual_address; + enum gdma_mr_access_flags access_flags; + } gva; + + }; + u32 reserved_2; +};/* HW DATA */ + +struct gdma_create_mr_response { + struct gdma_resp_hdr hdr; + gdma_obj_handle_t mr_handle; + u32 lkey; + u32 rkey; +};/* HW DATA */ + +struct gdma_destroy_mr_request { + struct gdma_req_hdr hdr; + gdma_obj_handle_t mr_handle; +};/* HW DATA */ + +struct gdma_destroy_mr_response { + struct gdma_resp_hdr hdr; +};/* HW DATA */ + int mana_gd_verify_vf_version(struct pci_dev *pdev); int mana_gd_register_device(struct gdma_dev *gd); @@ -719,4 +826,8 @@ void mana_gd_free_memory(struct gdma_mem_info *gmi); int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, u32 resp_len, void *resp); + +int mana_gd_destroy_dma_region(struct gdma_context *gc, + gdma_obj_handle_t dma_region_handle); + #endif /* _GDMA_H */ -- cgit v1.2.3 From 0266a177631d4c6b963b5b12dd986a8c5abdbf06 Mon Sep 17 00:00:00 2001 From: Long Li Date: Thu, 3 Nov 2022 12:16:30 -0700 Subject: RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter Add a RDMA VF driver for Microsoft Azure Network Adapter (MANA). 
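The PD and MR definitions from the previous patch are consumed by the mana_ib driver added here. As a small sketch, a GVA-type registration over an existing DMA region might populate gdma_create_mr_params as below; the helper and the particular access flags are illustrative, while the request that actually carries these fields to the hardware is built in the mr.c file added by this patch.

#include <net/mana/gdma.h>

/* Illustrative only: describe a Guest Virtual Address (GVA) memory
 * registration over a previously created DMA region.
 */
static void example_fill_mr_params(struct gdma_create_mr_params *mr_params,
				   gdma_obj_handle_t pd_handle,
				   gdma_obj_handle_t dma_region_handle,
				   u64 iova)
{
	mr_params->pd_handle = pd_handle;
	mr_params->mr_type = GDMA_MR_TYPE_GVA;
	mr_params->gva.dma_region_handle = dma_region_handle;
	mr_params->gva.virtual_address = iova;
	mr_params->gva.access_flags = GDMA_ACCESS_FLAG_LOCAL_READ |
				      GDMA_ACCESS_FLAG_LOCAL_WRITE;
}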
Co-developed-by: Ajay Sharma Signed-off-by: Ajay Sharma Reviewed-by: Dexuan Cui Signed-off-by: Long Li Link: https://lore.kernel.org/r/1667502990-2559-13-git-send-email-longli@linuxonhyperv.com Signed-off-by: Leon Romanovsky --- MAINTAINERS | 9 + drivers/infiniband/Kconfig | 1 + drivers/infiniband/hw/Makefile | 1 + drivers/infiniband/hw/mana/Kconfig | 10 + drivers/infiniband/hw/mana/Makefile | 4 + drivers/infiniband/hw/mana/cq.c | 79 +++++ drivers/infiniband/hw/mana/device.c | 117 +++++++ drivers/infiniband/hw/mana/main.c | 521 ++++++++++++++++++++++++++++++++ drivers/infiniband/hw/mana/mana_ib.h | 162 ++++++++++ drivers/infiniband/hw/mana/mr.c | 198 ++++++++++++ drivers/infiniband/hw/mana/qp.c | 506 +++++++++++++++++++++++++++++++ drivers/infiniband/hw/mana/wq.c | 115 +++++++ include/net/mana/mana.h | 3 + include/uapi/rdma/ib_user_ioctl_verbs.h | 1 + include/uapi/rdma/mana-abi.h | 66 ++++ 15 files changed, 1793 insertions(+) create mode 100644 drivers/infiniband/hw/mana/Kconfig create mode 100644 drivers/infiniband/hw/mana/Makefile create mode 100644 drivers/infiniband/hw/mana/cq.c create mode 100644 drivers/infiniband/hw/mana/device.c create mode 100644 drivers/infiniband/hw/mana/main.c create mode 100644 drivers/infiniband/hw/mana/mana_ib.h create mode 100644 drivers/infiniband/hw/mana/mr.c create mode 100644 drivers/infiniband/hw/mana/qp.c create mode 100644 drivers/infiniband/hw/mana/wq.c create mode 100644 include/uapi/rdma/mana-abi.h (limited to 'include/net') diff --git a/MAINTAINERS b/MAINTAINERS index 441a65d41eb4..4db8e4e02c05 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13669,6 +13669,15 @@ F: drivers/scsi/smartpqi/smartpqi*.[ch] F: include/linux/cciss*.h F: include/uapi/linux/cciss*.h +MICROSOFT MANA RDMA DRIVER +M: Long Li +M: Ajay Sharma +L: linux-rdma@vger.kernel.org +S: Supported +F: drivers/infiniband/hw/mana/ +F: include/net/mana +F: include/uapi/rdma/mana-abi.h + MICROSOFT SURFACE AGGREGATOR TABLET-MODE SWITCH M: Maximilian Luz L: platform-driver-x86@vger.kernel.org diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index aa36ac618e72..ccc874478f0b 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -85,6 +85,7 @@ source "drivers/infiniband/hw/erdma/Kconfig" source "drivers/infiniband/hw/hfi1/Kconfig" source "drivers/infiniband/hw/hns/Kconfig" source "drivers/infiniband/hw/irdma/Kconfig" +source "drivers/infiniband/hw/mana/Kconfig" source "drivers/infiniband/hw/mlx4/Kconfig" source "drivers/infiniband/hw/mlx5/Kconfig" source "drivers/infiniband/hw/mthca/Kconfig" diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile index 6b3a88046125..1211f4317a9f 100644 --- a/drivers/infiniband/hw/Makefile +++ b/drivers/infiniband/hw/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_INFINIBAND_QIB) += qib/ obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/ obj-$(CONFIG_INFINIBAND_EFA) += efa/ obj-$(CONFIG_INFINIBAND_IRDMA) += irdma/ +obj-$(CONFIG_MANA_INFINIBAND) += mana/ obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/ obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/ obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/ diff --git a/drivers/infiniband/hw/mana/Kconfig b/drivers/infiniband/hw/mana/Kconfig new file mode 100644 index 000000000000..546640657bac --- /dev/null +++ b/drivers/infiniband/hw/mana/Kconfig @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0-only +config MANA_INFINIBAND + tristate "Microsoft Azure Network Adapter support" + depends on NETDEVICES && ETHERNET && PCI && MICROSOFT_MANA + help + This driver provides low-level RDMA support for Microsoft 
Azure + Network Adapter (MANA). MANA supports RDMA features that can be used + for workloads (e.g. DPDK, MPI etc) that uses RDMA verbs to directly + access hardware from user-mode processes in Microsoft Azure cloud + environment. diff --git a/drivers/infiniband/hw/mana/Makefile b/drivers/infiniband/hw/mana/Makefile new file mode 100644 index 000000000000..88655fe5e398 --- /dev/null +++ b/drivers/infiniband/hw/mana/Makefile @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-2.0-only +obj-$(CONFIG_MANA_INFINIBAND) += mana_ib.o + +mana_ib-y := device.o main.o wq.o qp.o cq.o mr.o diff --git a/drivers/infiniband/hw/mana/cq.c b/drivers/infiniband/hw/mana/cq.c new file mode 100644 index 000000000000..d141cab8a1e6 --- /dev/null +++ b/drivers/infiniband/hw/mana/cq.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct ib_device *ibdev = ibcq->device; + struct mana_ib_create_cq ucmd = {}; + struct mana_ib_dev *mdev; + int err; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + if (udata->inlen < sizeof(ucmd)) + return -EINVAL; + + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(ibdev, + "Failed to copy from udata for create cq, %d\n", err); + return err; + } + + if (attr->cqe > MAX_SEND_BUFFERS_PER_QUEUE) { + ibdev_dbg(ibdev, "CQE %d exceeding limit\n", attr->cqe); + return -EINVAL; + } + + cq->cqe = attr->cqe; + cq->umem = ib_umem_get(ibdev, ucmd.buf_addr, cq->cqe * COMP_ENTRY_SIZE, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(cq->umem)) { + err = PTR_ERR(cq->umem); + ibdev_dbg(ibdev, "Failed to get umem for create cq, err %d\n", + err); + return err; + } + + err = mana_ib_gd_create_dma_region(mdev, cq->umem, &cq->gdma_region); + if (err) { + ibdev_dbg(ibdev, + "Failed to create dma region for create cq, %d\n", + err); + goto err_release_umem; + } + + ibdev_dbg(ibdev, + "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", + err, cq->gdma_region); + + /* + * The CQ ID is not known at this time. The ID is generated at create_qp + */ + + return 0; + +err_release_umem: + ib_umem_release(cq->umem); + return err; +} + +int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata) +{ + struct mana_ib_cq *cq = container_of(ibcq, struct mana_ib_cq, ibcq); + struct ib_device *ibdev = ibcq->device; + struct mana_ib_dev *mdev; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + mana_ib_gd_destroy_dma_region(mdev, cq->gdma_region); + ib_umem_release(cq->umem); + + return 0; +} diff --git a/drivers/infiniband/hw/mana/device.c b/drivers/infiniband/hw/mana/device.c new file mode 100644 index 000000000000..d4541b8707e4 --- /dev/null +++ b/drivers/infiniband/hw/mana/device.c @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. 
+ */ + +#include "mana_ib.h" +#include + +MODULE_DESCRIPTION("Microsoft Azure Network Adapter IB driver"); +MODULE_LICENSE("GPL"); +MODULE_IMPORT_NS(NET_MANA); + +static const struct ib_device_ops mana_ib_dev_ops = { + .owner = THIS_MODULE, + .driver_id = RDMA_DRIVER_MANA, + .uverbs_abi_ver = MANA_IB_UVERBS_ABI_VERSION, + + .alloc_pd = mana_ib_alloc_pd, + .alloc_ucontext = mana_ib_alloc_ucontext, + .create_cq = mana_ib_create_cq, + .create_qp = mana_ib_create_qp, + .create_rwq_ind_table = mana_ib_create_rwq_ind_table, + .create_wq = mana_ib_create_wq, + .dealloc_pd = mana_ib_dealloc_pd, + .dealloc_ucontext = mana_ib_dealloc_ucontext, + .dereg_mr = mana_ib_dereg_mr, + .destroy_cq = mana_ib_destroy_cq, + .destroy_qp = mana_ib_destroy_qp, + .destroy_rwq_ind_table = mana_ib_destroy_rwq_ind_table, + .destroy_wq = mana_ib_destroy_wq, + .disassociate_ucontext = mana_ib_disassociate_ucontext, + .get_port_immutable = mana_ib_get_port_immutable, + .mmap = mana_ib_mmap, + .modify_qp = mana_ib_modify_qp, + .modify_wq = mana_ib_modify_wq, + .query_device = mana_ib_query_device, + .query_gid = mana_ib_query_gid, + .query_port = mana_ib_query_port, + .reg_user_mr = mana_ib_reg_user_mr, + + INIT_RDMA_OBJ_SIZE(ib_cq, mana_ib_cq, ibcq), + INIT_RDMA_OBJ_SIZE(ib_pd, mana_ib_pd, ibpd), + INIT_RDMA_OBJ_SIZE(ib_qp, mana_ib_qp, ibqp), + INIT_RDMA_OBJ_SIZE(ib_ucontext, mana_ib_ucontext, ibucontext), + INIT_RDMA_OBJ_SIZE(ib_rwq_ind_table, mana_ib_rwq_ind_table, + ib_ind_table), +}; + +static int mana_ib_probe(struct auxiliary_device *adev, + const struct auxiliary_device_id *id) +{ + struct mana_adev *madev = container_of(adev, struct mana_adev, adev); + struct gdma_dev *mdev = madev->mdev; + struct mana_context *mc; + struct mana_ib_dev *dev; + int ret; + + mc = mdev->driver_data; + + dev = ib_alloc_device(mana_ib_dev, ib_dev); + if (!dev) + return -ENOMEM; + + ib_set_device_ops(&dev->ib_dev, &mana_ib_dev_ops); + + dev->ib_dev.phys_port_cnt = mc->num_ports; + + ibdev_dbg(&dev->ib_dev, "mdev=%p id=%d num_ports=%d\n", mdev, + mdev->dev_id.as_uint32, dev->ib_dev.phys_port_cnt); + + dev->gdma_dev = mdev; + dev->ib_dev.node_type = RDMA_NODE_IB_CA; + + /* + * num_comp_vectors needs to set to the max MSIX index + * when interrupts and event queues are implemented + */ + dev->ib_dev.num_comp_vectors = 1; + dev->ib_dev.dev.parent = mdev->gdma_context->dev; + + ret = ib_register_device(&dev->ib_dev, "mana_%d", + mdev->gdma_context->dev); + if (ret) { + ib_dealloc_device(&dev->ib_dev); + return ret; + } + + dev_set_drvdata(&adev->dev, dev); + + return 0; +} + +static void mana_ib_remove(struct auxiliary_device *adev) +{ + struct mana_ib_dev *dev = dev_get_drvdata(&adev->dev); + + ib_unregister_device(&dev->ib_dev); + ib_dealloc_device(&dev->ib_dev); +} + +static const struct auxiliary_device_id mana_id_table[] = { + { + .name = "mana.rdma", + }, + {}, +}; + +MODULE_DEVICE_TABLE(auxiliary, mana_id_table); + +static struct auxiliary_driver mana_driver = { + .name = "rdma", + .probe = mana_ib_probe, + .remove = mana_ib_remove, + .id_table = mana_id_table, +}; + +module_auxiliary_driver(mana_driver); diff --git a/drivers/infiniband/hw/mana/main.c b/drivers/infiniband/hw/mana/main.c new file mode 100644 index 000000000000..8b3bc302d6f3 --- /dev/null +++ b/drivers/infiniband/hw/mana/main.c @@ -0,0 +1,521 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. 
+ */ + +#include "mana_ib.h" + +void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, + u32 port) +{ + struct gdma_dev *gd = dev->gdma_dev; + struct mana_port_context *mpc; + struct net_device *ndev; + struct mana_context *mc; + + mc = gd->driver_data; + ndev = mc->ports[port]; + mpc = netdev_priv(ndev); + + mutex_lock(&pd->vport_mutex); + + pd->vport_use_count--; + WARN_ON(pd->vport_use_count < 0); + + if (!pd->vport_use_count) + mana_uncfg_vport(mpc); + + mutex_unlock(&pd->vport_mutex); +} + +int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd, + u32 doorbell_id) +{ + struct gdma_dev *mdev = dev->gdma_dev; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + int err; + + mc = mdev->driver_data; + ndev = mc->ports[port]; + mpc = netdev_priv(ndev); + + mutex_lock(&pd->vport_mutex); + + pd->vport_use_count++; + if (pd->vport_use_count > 1) { + ibdev_dbg(&dev->ib_dev, + "Skip as this PD is already configured vport\n"); + mutex_unlock(&pd->vport_mutex); + return 0; + } + + err = mana_cfg_vport(mpc, pd->pdn, doorbell_id); + if (err) { + pd->vport_use_count--; + mutex_unlock(&pd->vport_mutex); + + ibdev_dbg(&dev->ib_dev, "Failed to configure vPort %d\n", err); + return err; + } + + mutex_unlock(&pd->vport_mutex); + + pd->tx_shortform_allowed = mpc->tx_shortform_allowed; + pd->tx_vp_offset = mpc->tx_vp_offset; + + ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n", + mpc->port_handle, pd->pdn, doorbell_id); + + return 0; +} + +int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct ib_device *ibdev = ibpd->device; + struct gdma_create_pd_resp resp = {}; + struct gdma_create_pd_req req = {}; + enum gdma_pd_flags flags = 0; + struct mana_ib_dev *dev; + struct gdma_dev *mdev; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + mdev = dev->gdma_dev; + + mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_PD, sizeof(req), + sizeof(resp)); + + req.flags = flags; + err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req, + sizeof(resp), &resp); + + if (err || resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, + "Failed to get pd_id err %d status %u\n", err, + resp.hdr.status); + if (!err) + err = -EPROTO; + + return err; + } + + pd->pd_handle = resp.pd_handle; + pd->pdn = resp.pd_id; + ibdev_dbg(&dev->ib_dev, "pd_handle 0x%llx pd_id %d\n", + pd->pd_handle, pd->pdn); + + mutex_init(&pd->vport_mutex); + pd->vport_use_count = 0; + return 0; +} + +int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct ib_device *ibdev = ibpd->device; + struct gdma_destory_pd_resp resp = {}; + struct gdma_destroy_pd_req req = {}; + struct mana_ib_dev *dev; + struct gdma_dev *mdev; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + mdev = dev->gdma_dev; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_PD, sizeof(req), + sizeof(resp)); + + req.pd_handle = pd->pd_handle; + err = mana_gd_send_request(mdev->gdma_context, sizeof(req), &req, + sizeof(resp), &resp); + + if (err || resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, + "Failed to destroy pd_handle 0x%llx err %d status %u", + pd->pd_handle, err, resp.hdr.status); + if (!err) + err = -EPROTO; + } + + return err; +} + +static int mana_gd_destroy_doorbell_page(struct gdma_context *gc, + int doorbell_page) +{ + struct gdma_destroy_resource_range_req req = {}; + 
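	/* The doorbell page index allocated at ucontext creation time is
	 * handed back to the PF as a one-entry resource range of type
	 * GDMA_RESOURCE_DOORBELL_PAGE.
	 */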
struct gdma_resp_hdr resp = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_RESOURCE_RANGE, + sizeof(req), sizeof(resp)); + + req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE; + req.num_resources = 1; + req.allocated_resources = doorbell_page; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.status) { + dev_err(gc->dev, + "Failed to destroy doorbell page: ret %d, 0x%x\n", + err, resp.status); + return err ?: -EPROTO; + } + + return 0; +} + +static int mana_gd_allocate_doorbell_page(struct gdma_context *gc, + int *doorbell_page) +{ + struct gdma_allocate_resource_range_req req = {}; + struct gdma_allocate_resource_range_resp resp = {}; + int err; + + mana_gd_init_req_hdr(&req.hdr, GDMA_ALLOCATE_RESOURCE_RANGE, + sizeof(req), sizeof(resp)); + + req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE; + req.num_resources = 1; + req.alignment = 1; + + /* Have GDMA start searching from 0 */ + req.allocated_resources = 0; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + dev_err(gc->dev, + "Failed to allocate doorbell page: ret %d, 0x%x\n", + err, resp.hdr.status); + return err ?: -EPROTO; + } + + *doorbell_page = resp.allocated_resources; + + return 0; +} + +int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext, + struct ib_udata *udata) +{ + struct mana_ib_ucontext *ucontext = + container_of(ibcontext, struct mana_ib_ucontext, ibucontext); + struct ib_device *ibdev = ibcontext->device; + struct mana_ib_dev *mdev; + struct gdma_context *gc; + struct gdma_dev *dev; + int doorbell_page; + int ret; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + dev = mdev->gdma_dev; + gc = dev->gdma_context; + + /* Allocate a doorbell page index */ + ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page); + if (ret) { + ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n", ret); + return ret; + } + + ibdev_dbg(ibdev, "Doorbell page allocated %d\n", doorbell_page); + + ucontext->doorbell = doorbell_page; + + return 0; +} + +void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) +{ + struct mana_ib_ucontext *mana_ucontext = + container_of(ibcontext, struct mana_ib_ucontext, ibucontext); + struct ib_device *ibdev = ibcontext->device; + struct mana_ib_dev *mdev; + struct gdma_context *gc; + int ret; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + gc = mdev->gdma_dev->gdma_context; + + ret = mana_gd_destroy_doorbell_page(gc, mana_ucontext->doorbell); + if (ret) + ibdev_dbg(ibdev, "Failed to destroy doorbell page %d\n", ret); +} + +static int +mana_ib_gd_first_dma_region(struct mana_ib_dev *dev, + struct gdma_context *gc, + struct gdma_create_dma_region_req *create_req, + size_t num_pages, mana_handle_t *gdma_region) +{ + struct gdma_create_dma_region_resp create_resp = {}; + unsigned int create_req_msg_size; + int err; + + create_req_msg_size = + struct_size(create_req, page_addr_list, num_pages); + create_req->page_addr_list_len = num_pages; + + err = mana_gd_send_request(gc, create_req_msg_size, create_req, + sizeof(create_resp), &create_resp); + if (err || create_resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, + "Failed to create DMA region: %d, 0x%x\n", + err, create_resp.hdr.status); + if (!err) + err = -EPROTO; + + return err; + } + + *gdma_region = create_resp.dma_region_handle; + ibdev_dbg(&dev->ib_dev, "Created DMA region handle 0x%llx\n", + *gdma_region); + + return 0; +} + +static int +mana_ib_gd_add_dma_region(struct mana_ib_dev *dev, struct gdma_context 
*gc, + struct gdma_dma_region_add_pages_req *add_req, + unsigned int num_pages, u32 expected_status) +{ + unsigned int add_req_msg_size = + struct_size(add_req, page_addr_list, num_pages); + struct gdma_general_resp add_resp = {}; + int err; + + mana_gd_init_req_hdr(&add_req->hdr, GDMA_DMA_REGION_ADD_PAGES, + add_req_msg_size, sizeof(add_resp)); + add_req->page_addr_list_len = num_pages; + + err = mana_gd_send_request(gc, add_req_msg_size, add_req, + sizeof(add_resp), &add_resp); + if (err || add_resp.hdr.status != expected_status) { + ibdev_dbg(&dev->ib_dev, + "Failed to create DMA region: %d, 0x%x\n", + err, add_resp.hdr.status); + + if (!err) + err = -EPROTO; + + return err; + } + + return 0; +} + +int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, + mana_handle_t *gdma_region) +{ + struct gdma_dma_region_add_pages_req *add_req = NULL; + size_t num_pages_processed = 0, num_pages_to_handle; + struct gdma_create_dma_region_req *create_req; + unsigned int create_req_msg_size; + struct hw_channel_context *hwc; + struct ib_block_iter biter; + size_t max_pgs_add_cmd = 0; + size_t max_pgs_create_cmd; + struct gdma_context *gc; + size_t num_pages_total; + struct gdma_dev *mdev; + unsigned long page_sz; + unsigned int tail = 0; + u64 *page_addr_list; + void *request_buf; + int err; + + mdev = dev->gdma_dev; + gc = mdev->gdma_context; + hwc = gc->hwc.driver_data; + + /* Hardware requires dma region to align to chosen page size */ + page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0); + if (!page_sz) { + ibdev_dbg(&dev->ib_dev, "failed to find page size.\n"); + return -ENOMEM; + } + num_pages_total = ib_umem_num_dma_blocks(umem, page_sz); + + max_pgs_create_cmd = + (hwc->max_req_msg_size - sizeof(*create_req)) / sizeof(u64); + num_pages_to_handle = + min_t(size_t, num_pages_total, max_pgs_create_cmd); + create_req_msg_size = + struct_size(create_req, page_addr_list, num_pages_to_handle); + + request_buf = kzalloc(hwc->max_req_msg_size, GFP_KERNEL); + if (!request_buf) + return -ENOMEM; + + create_req = request_buf; + mana_gd_init_req_hdr(&create_req->hdr, GDMA_CREATE_DMA_REGION, + create_req_msg_size, + sizeof(struct gdma_create_dma_region_resp)); + + create_req->length = umem->length; + create_req->offset_in_page = umem->address & (page_sz - 1); + create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT; + create_req->page_count = num_pages_total; + + ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n", + umem->length, num_pages_total); + + ibdev_dbg(&dev->ib_dev, "page_sz %lu offset_in_page %u\n", + page_sz, create_req->offset_in_page); + + ibdev_dbg(&dev->ib_dev, "num_pages_to_handle %lu, gdma_page_type %u", + num_pages_to_handle, create_req->gdma_page_type); + + page_addr_list = create_req->page_addr_list; + rdma_umem_for_each_dma_block(umem, &biter, page_sz) { + page_addr_list[tail++] = rdma_block_iter_dma_address(&biter); + if (tail < num_pages_to_handle) + continue; + + if (!num_pages_processed) { + /* First create message */ + err = mana_ib_gd_first_dma_region(dev, gc, create_req, + tail, gdma_region); + if (err) + goto out; + + max_pgs_add_cmd = (hwc->max_req_msg_size - + sizeof(*add_req)) / sizeof(u64); + + add_req = request_buf; + add_req->dma_region_handle = *gdma_region; + add_req->reserved3 = 0; + page_addr_list = add_req->page_addr_list; + } else { + /* Subsequent create messages */ + u32 expected_s = 0; + + if (num_pages_processed + num_pages_to_handle < + num_pages_total) + expected_s = GDMA_STATUS_MORE_ENTRIES; + + err = 
mana_ib_gd_add_dma_region(dev, gc, add_req, tail, + expected_s); + if (err) + break; + } + + num_pages_processed += tail; + tail = 0; + + /* The remaining pages to create */ + num_pages_to_handle = + min_t(size_t, + num_pages_total - num_pages_processed, + max_pgs_add_cmd); + } + + if (err) + mana_ib_gd_destroy_dma_region(dev, *gdma_region); + +out: + kfree(request_buf); + return err; +} + +int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, u64 gdma_region) +{ + struct gdma_dev *mdev = dev->gdma_dev; + struct gdma_context *gc; + + gc = mdev->gdma_context; + ibdev_dbg(&dev->ib_dev, "destroy dma region 0x%llx\n", gdma_region); + + return mana_gd_destroy_dma_region(gc, gdma_region); +} + +int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma) +{ + struct mana_ib_ucontext *mana_ucontext = + container_of(ibcontext, struct mana_ib_ucontext, ibucontext); + struct ib_device *ibdev = ibcontext->device; + struct mana_ib_dev *mdev; + struct gdma_context *gc; + phys_addr_t pfn; + pgprot_t prot; + int ret; + + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev); + gc = mdev->gdma_dev->gdma_context; + + if (vma->vm_pgoff != 0) { + ibdev_dbg(ibdev, "Unexpected vm_pgoff %lu\n", vma->vm_pgoff); + return -EINVAL; + } + + /* Map to the page indexed by ucontext->doorbell */ + pfn = (gc->phys_db_page_base + + gc->db_page_size * mana_ucontext->doorbell) >> + PAGE_SHIFT; + prot = pgprot_writecombine(vma->vm_page_prot); + + ret = rdma_user_mmap_io(ibcontext, vma, pfn, gc->db_page_size, prot, + NULL); + if (ret) + ibdev_dbg(ibdev, "can't rdma_user_mmap_io ret %d\n", ret); + else + ibdev_dbg(ibdev, "mapped I/O pfn 0x%llx page_size %u, ret %d\n", + pfn, gc->db_page_size, ret); + + return ret; +} + +int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable) +{ + /* + * This version only support RAW_PACKET + * other values need to be filled for other types + */ + immutable->core_cap_flags = RDMA_CORE_PORT_RAW_PACKET; + + return 0; +} + +int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw) +{ + props->max_qp = MANA_MAX_NUM_QUEUES; + props->max_qp_wr = MAX_SEND_BUFFERS_PER_QUEUE; + + /* + * max_cqe could be potentially much bigger. + * As this version of driver only support RAW QP, set it to the same + * value as max_qp_wr + */ + props->max_cqe = MAX_SEND_BUFFERS_PER_QUEUE; + + props->max_mr_size = MANA_IB_MAX_MR_SIZE; + props->max_mr = MANA_IB_MAX_MR; + props->max_send_sge = MAX_TX_WQE_SGL_ENTRIES; + props->max_recv_sge = MAX_RX_WQE_SGL_ENTRIES; + + return 0; +} + +int mana_ib_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props) +{ + /* This version doesn't return port properties */ + return 0; +} + +int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index, + union ib_gid *gid) +{ + /* This version doesn't return GID properties */ + return 0; +} + +void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext) +{ +} diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h new file mode 100644 index 000000000000..502cc8672eef --- /dev/null +++ b/drivers/infiniband/hw/mana/mana_ib.h @@ -0,0 +1,162 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2022 Microsoft Corporation. All rights reserved. 
+ */ + +#ifndef _MANA_IB_H_ +#define _MANA_IB_H_ + +#include +#include +#include +#include +#include + +#include + +#define PAGE_SZ_BM \ + (SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K | \ + SZ_512K | SZ_1M | SZ_2M) + +/* MANA doesn't have any limit for MR size */ +#define MANA_IB_MAX_MR_SIZE U64_MAX + +/* + * The hardware limit of number of MRs is greater than maximum number of MRs + * that can possibly represent in 24 bits + */ +#define MANA_IB_MAX_MR 0xFFFFFFu + +struct mana_ib_dev { + struct ib_device ib_dev; + struct gdma_dev *gdma_dev; +}; + +struct mana_ib_wq { + struct ib_wq ibwq; + struct ib_umem *umem; + int wqe; + u32 wq_buf_size; + u64 gdma_region; + u64 id; + mana_handle_t rx_object; +}; + +struct mana_ib_pd { + struct ib_pd ibpd; + u32 pdn; + mana_handle_t pd_handle; + + /* Mutex for sharing access to vport_use_count */ + struct mutex vport_mutex; + int vport_use_count; + + bool tx_shortform_allowed; + u32 tx_vp_offset; +}; + +struct mana_ib_mr { + struct ib_mr ibmr; + struct ib_umem *umem; + mana_handle_t mr_handle; +}; + +struct mana_ib_cq { + struct ib_cq ibcq; + struct ib_umem *umem; + int cqe; + u64 gdma_region; + u64 id; +}; + +struct mana_ib_qp { + struct ib_qp ibqp; + + /* Work queue info */ + struct ib_umem *sq_umem; + int sqe; + u64 sq_gdma_region; + u64 sq_id; + mana_handle_t tx_object; + + /* The port on the IB device, starting with 1 */ + u32 port; +}; + +struct mana_ib_ucontext { + struct ib_ucontext ibucontext; + u32 doorbell; +}; + +struct mana_ib_rwq_ind_table { + struct ib_rwq_ind_table ib_ind_table; +}; + +int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem, + mana_handle_t *gdma_region); + +int mana_ib_gd_destroy_dma_region(struct mana_ib_dev *dev, + mana_handle_t gdma_region); + +struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata); + +int mana_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata); + +int mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata); + +int mana_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata); + +int mana_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl); + +struct ib_mr *mana_ib_get_dma_mr(struct ib_pd *ibpd, int access_flags); + +struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, + u64 iova, int access_flags, + struct ib_udata *udata); + +int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata); + +int mana_ib_create_qp(struct ib_qp *qp, struct ib_qp_init_attr *qp_init_attr, + struct ib_udata *udata); + +int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata); + +int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata); + +int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port_id, + struct mana_ib_pd *pd, u32 doorbell_id); +void mana_ib_uncfg_vport(struct mana_ib_dev *dev, struct mana_ib_pd *pd, + u32 port); + +int mana_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr, + struct ib_udata *udata); + +int mana_ib_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata); + +int mana_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); +int mana_ib_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata); + +int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext, + struct ib_udata *udata); +void mana_ib_dealloc_ucontext(struct ib_ucontext *ibcontext); + 
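/*
 * Illustrative sketch: each verb entry point recovers its driver-private
 * wrapper from the embedded core IB object with container_of(), as the
 * implementation files do. A hypothetical helper (the example_ name is
 * not part of the driver) would be:
 */
static inline struct mana_ib_dev *example_mana_ib_dev(struct ib_device *ibdev)
{
	/* ib_dev is the struct ib_device embedded first in mana_ib_dev */
	return container_of(ibdev, struct mana_ib_dev, ib_dev);
}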
+int mana_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma); + +int mana_ib_get_port_immutable(struct ib_device *ibdev, u32 port_num, + struct ib_port_immutable *immutable); +int mana_ib_query_device(struct ib_device *ibdev, struct ib_device_attr *props, + struct ib_udata *uhw); +int mana_ib_query_port(struct ib_device *ibdev, u32 port, + struct ib_port_attr *props); +int mana_ib_query_gid(struct ib_device *ibdev, u32 port, int index, + union ib_gid *gid); + +void mana_ib_disassociate_ucontext(struct ib_ucontext *ibcontext); + +#endif diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c new file mode 100644 index 000000000000..a56236cdd9ee --- /dev/null +++ b/drivers/infiniband/hw/mana/mr.c @@ -0,0 +1,198 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#include "mana_ib.h" + +#define VALID_MR_FLAGS \ + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ) + +static enum gdma_mr_access_flags +mana_ib_verbs_to_gdma_access_flags(int access_flags) +{ + enum gdma_mr_access_flags flags = GDMA_ACCESS_FLAG_LOCAL_READ; + + if (access_flags & IB_ACCESS_LOCAL_WRITE) + flags |= GDMA_ACCESS_FLAG_LOCAL_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_WRITE) + flags |= GDMA_ACCESS_FLAG_REMOTE_WRITE; + + if (access_flags & IB_ACCESS_REMOTE_READ) + flags |= GDMA_ACCESS_FLAG_REMOTE_READ; + + return flags; +} + +static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, + struct gdma_create_mr_params *mr_params) +{ + struct gdma_create_mr_response resp = {}; + struct gdma_create_mr_request req = {}; + struct gdma_dev *mdev = dev->gdma_dev; + struct gdma_context *gc; + int err; + + gc = mdev->gdma_context; + + mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req), + sizeof(resp)); + req.pd_handle = mr_params->pd_handle; + req.mr_type = mr_params->mr_type; + + switch (mr_params->mr_type) { + case GDMA_MR_TYPE_GVA: + req.gva.dma_region_handle = mr_params->gva.dma_region_handle; + req.gva.virtual_address = mr_params->gva.virtual_address; + req.gva.access_flags = mr_params->gva.access_flags; + break; + + default: + ibdev_dbg(&dev->ib_dev, + "invalid param (GDMA_MR_TYPE) passed, type %d\n", + req.mr_type); + return -EINVAL; + } + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + + if (err || resp.hdr.status) { + ibdev_dbg(&dev->ib_dev, "Failed to create mr %d, %u", err, + resp.hdr.status); + if (!err) + err = -EPROTO; + + return err; + } + + mr->ibmr.lkey = resp.lkey; + mr->ibmr.rkey = resp.rkey; + mr->mr_handle = resp.mr_handle; + + return 0; +} + +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, + gdma_obj_handle_t mr_handle) +{ + struct gdma_destroy_mr_response resp = {}; + struct gdma_destroy_mr_request req = {}; + struct gdma_dev *mdev = dev->gdma_dev; + struct gdma_context *gc; + int err; + + gc = mdev->gdma_context; + + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_MR, sizeof(req), + sizeof(resp)); + + req.mr_handle = mr_handle; + + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp); + if (err || resp.hdr.status) { + dev_err(gc->dev, "Failed to destroy MR: %d, 0x%x\n", err, + resp.hdr.status); + if (!err) + err = -EPROTO; + return err; + } + + return 0; +} + +struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, + u64 iova, int access_flags, + struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct gdma_create_mr_params 
mr_params = {}; + struct ib_device *ibdev = ibpd->device; + gdma_obj_handle_t dma_region_handle; + struct mana_ib_dev *dev; + struct mana_ib_mr *mr; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + ibdev_dbg(ibdev, + "start 0x%llx, iova 0x%llx length 0x%llx access_flags 0x%x", + start, iova, length, access_flags); + + if (access_flags & ~VALID_MR_FLAGS) + return ERR_PTR(-EINVAL); + + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) + return ERR_PTR(-ENOMEM); + + mr->umem = ib_umem_get(ibdev, start, length, access_flags); + if (IS_ERR(mr->umem)) { + err = PTR_ERR(mr->umem); + ibdev_dbg(ibdev, + "Failed to get umem for register user-mr, %d\n", err); + goto err_free; + } + + err = mana_ib_gd_create_dma_region(dev, mr->umem, &dma_region_handle); + if (err) { + ibdev_dbg(ibdev, "Failed create dma region for user-mr, %d\n", + err); + goto err_umem; + } + + ibdev_dbg(ibdev, + "mana_ib_gd_create_dma_region ret %d gdma_region %llx\n", err, + dma_region_handle); + + mr_params.pd_handle = pd->pd_handle; + mr_params.mr_type = GDMA_MR_TYPE_GVA; + mr_params.gva.dma_region_handle = dma_region_handle; + mr_params.gva.virtual_address = iova; + mr_params.gva.access_flags = + mana_ib_verbs_to_gdma_access_flags(access_flags); + + err = mana_ib_gd_create_mr(dev, mr, &mr_params); + if (err) + goto err_dma_region; + + /* + * There is no need to keep track of dma_region_handle after MR is + * successfully created. The dma_region_handle is tracked in the PF + * as part of the lifecycle of this MR. + */ + + return &mr->ibmr; + +err_dma_region: + mana_gd_destroy_dma_region(dev->gdma_dev->gdma_context, + dma_region_handle); + +err_umem: + ib_umem_release(mr->umem); + +err_free: + kfree(mr); + return ERR_PTR(err); +} + +int mana_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata) +{ + struct mana_ib_mr *mr = container_of(ibmr, struct mana_ib_mr, ibmr); + struct ib_device *ibdev = ibmr->device; + struct mana_ib_dev *dev; + int err; + + dev = container_of(ibdev, struct mana_ib_dev, ib_dev); + + err = mana_ib_gd_destroy_mr(dev, mr->mr_handle); + if (err) + return err; + + if (mr->umem) + ib_umem_release(mr->umem); + + kfree(mr); + + return 0; +} diff --git a/drivers/infiniband/hw/mana/qp.c b/drivers/infiniband/hw/mana/qp.c new file mode 100644 index 000000000000..ea15ec77e321 --- /dev/null +++ b/drivers/infiniband/hw/mana/qp.c @@ -0,0 +1,506 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. 
+ */ + +#include "mana_ib.h" + +static int mana_ib_cfg_vport_steering(struct mana_ib_dev *dev, + struct net_device *ndev, + mana_handle_t default_rxobj, + mana_handle_t ind_table[], + u32 log_ind_tbl_size, u32 rx_hash_key_len, + u8 *rx_hash_key) +{ + struct mana_port_context *mpc = netdev_priv(ndev); + struct mana_cfg_rx_steer_req *req = NULL; + struct mana_cfg_rx_steer_resp resp = {}; + mana_handle_t *req_indir_tab; + struct gdma_context *gc; + struct gdma_dev *mdev; + u32 req_buf_size; + int i, err; + + mdev = dev->gdma_dev; + gc = mdev->gdma_context; + + req_buf_size = + sizeof(*req) + sizeof(mana_handle_t) * MANA_INDIRECT_TABLE_SIZE; + req = kzalloc(req_buf_size, GFP_KERNEL); + if (!req) + return -ENOMEM; + + mana_gd_init_req_hdr(&req->hdr, MANA_CONFIG_VPORT_RX, req_buf_size, + sizeof(resp)); + + req->vport = mpc->port_handle; + req->rx_enable = 1; + req->update_default_rxobj = 1; + req->default_rxobj = default_rxobj; + req->hdr.dev_id = mdev->dev_id; + + /* If there are more than 1 entries in indirection table, enable RSS */ + if (log_ind_tbl_size) + req->rss_enable = true; + + req->num_indir_entries = MANA_INDIRECT_TABLE_SIZE; + req->indir_tab_offset = sizeof(*req); + req->update_indir_tab = true; + + req_indir_tab = (mana_handle_t *)(req + 1); + /* The ind table passed to the hardware must have + * MANA_INDIRECT_TABLE_SIZE entries. Adjust the verb + * ind_table to MANA_INDIRECT_TABLE_SIZE if required + */ + ibdev_dbg(&dev->ib_dev, "ind table size %u\n", 1 << log_ind_tbl_size); + for (i = 0; i < MANA_INDIRECT_TABLE_SIZE; i++) { + req_indir_tab[i] = ind_table[i % (1 << log_ind_tbl_size)]; + ibdev_dbg(&dev->ib_dev, "index %u handle 0x%llx\n", i, + req_indir_tab[i]); + } + + req->update_hashkey = true; + if (rx_hash_key_len) + memcpy(req->hashkey, rx_hash_key, rx_hash_key_len); + else + netdev_rss_key_fill(req->hashkey, MANA_HASH_KEY_SIZE); + + ibdev_dbg(&dev->ib_dev, "vport handle %llu default_rxobj 0x%llx\n", + req->vport, default_rxobj); + + err = mana_gd_send_request(gc, req_buf_size, req, sizeof(resp), &resp); + if (err) { + netdev_err(ndev, "Failed to configure vPort RX: %d\n", err); + goto out; + } + + if (resp.hdr.status) { + netdev_err(ndev, "vPort RX configuration failed: 0x%x\n", + resp.hdr.status); + err = -EPROTO; + goto out; + } + + netdev_info(ndev, "Configured steering vPort %llu log_entries %u\n", + mpc->port_handle, log_ind_tbl_size); + +out: + kfree(req); + return err; +} + +static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_ib_dev *mdev = + container_of(pd->device, struct mana_ib_dev, ib_dev); + struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl; + struct mana_ib_create_qp_rss_resp resp = {}; + struct mana_ib_create_qp_rss ucmd = {}; + struct gdma_dev *gd = mdev->gdma_dev; + mana_handle_t *mana_ind_table; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct mana_ib_cq *cq; + struct mana_ib_wq *wq; + unsigned int ind_tbl_size; + struct ib_cq *ibcq; + struct ib_wq *ibwq; + int i = 0; + u32 port; + int ret; + + mc = gd->driver_data; + + if (!udata || udata->inlen < sizeof(ucmd)) + return -EINVAL; + + ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (ret) { + ibdev_dbg(&mdev->ib_dev, + "Failed copy from udata for create rss-qp, err %d\n", + ret); + return ret; + } + + if (attr->cap.max_recv_wr > MAX_SEND_BUFFERS_PER_QUEUE) { + 
ibdev_dbg(&mdev->ib_dev, + "Requested max_recv_wr %d exceeding limit\n", + attr->cap.max_recv_wr); + return -EINVAL; + } + + if (attr->cap.max_recv_sge > MAX_RX_WQE_SGL_ENTRIES) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_recv_sge %d exceeding limit\n", + attr->cap.max_recv_sge); + return -EINVAL; + } + + ind_tbl_size = 1 << ind_tbl->log_ind_tbl_size; + if (ind_tbl_size > MANA_INDIRECT_TABLE_SIZE) { + ibdev_dbg(&mdev->ib_dev, + "Indirect table size %d exceeding limit\n", + ind_tbl_size); + return -EINVAL; + } + + if (ucmd.rx_hash_function != MANA_IB_RX_HASH_FUNC_TOEPLITZ) { + ibdev_dbg(&mdev->ib_dev, + "RX Hash function is not supported, %d\n", + ucmd.rx_hash_function); + return -EINVAL; + } + + /* IB ports start with 1, MANA start with 0 */ + port = ucmd.port; + if (port < 1 || port > mc->num_ports) { + ibdev_dbg(&mdev->ib_dev, "Invalid port %u in creating qp\n", + port); + return -EINVAL; + } + ndev = mc->ports[port - 1]; + mpc = netdev_priv(ndev); + + ibdev_dbg(&mdev->ib_dev, "rx_hash_function %d port %d\n", + ucmd.rx_hash_function, port); + + mana_ind_table = kcalloc(ind_tbl_size, sizeof(mana_handle_t), + GFP_KERNEL); + if (!mana_ind_table) { + ret = -ENOMEM; + goto fail; + } + + qp->port = port; + + for (i = 0; i < ind_tbl_size; i++) { + struct mana_obj_spec wq_spec = {}; + struct mana_obj_spec cq_spec = {}; + + ibwq = ind_tbl->ind_tbl[i]; + wq = container_of(ibwq, struct mana_ib_wq, ibwq); + + ibcq = ibwq->cq; + cq = container_of(ibcq, struct mana_ib_cq, ibcq); + + wq_spec.gdma_region = wq->gdma_region; + wq_spec.queue_size = wq->wq_buf_size; + + cq_spec.gdma_region = cq->gdma_region; + cq_spec.queue_size = cq->cqe * COMP_ENTRY_SIZE; + cq_spec.modr_ctx_id = 0; + cq_spec.attached_eq = GDMA_CQ_NO_EQ; + + ret = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_RQ, + &wq_spec, &cq_spec, &wq->rx_object); + if (ret) + goto fail; + + /* The GDMA regions are now owned by the WQ object */ + wq->gdma_region = GDMA_INVALID_DMA_REGION; + cq->gdma_region = GDMA_INVALID_DMA_REGION; + + wq->id = wq_spec.queue_index; + cq->id = cq_spec.queue_index; + + ibdev_dbg(&mdev->ib_dev, + "ret %d rx_object 0x%llx wq id %llu cq id %llu\n", + ret, wq->rx_object, wq->id, cq->id); + + resp.entries[i].cqid = cq->id; + resp.entries[i].wqid = wq->id; + + mana_ind_table[i] = wq->rx_object; + } + resp.num_entries = i; + + ret = mana_ib_cfg_vport_steering(mdev, ndev, wq->rx_object, + mana_ind_table, + ind_tbl->log_ind_tbl_size, + ucmd.rx_hash_key_len, + ucmd.rx_hash_key); + if (ret) + goto fail; + + ret = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (ret) { + ibdev_dbg(&mdev->ib_dev, + "Failed to copy to udata create rss-qp, %d\n", + ret); + goto fail; + } + + kfree(mana_ind_table); + + return 0; + +fail: + while (i-- > 0) { + ibwq = ind_tbl->ind_tbl[i]; + wq = container_of(ibwq, struct mana_ib_wq, ibwq); + mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); + } + + kfree(mana_ind_table); + + return ret; +} + +static int mana_ib_create_qp_raw(struct ib_qp *ibqp, struct ib_pd *ibpd, + struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + struct mana_ib_dev *mdev = + container_of(ibpd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_cq *send_cq = + container_of(attr->send_cq, struct mana_ib_cq, ibcq); + struct mana_ib_ucontext *mana_ucontext = + rdma_udata_to_drv_context(udata, struct mana_ib_ucontext, + ibucontext); + struct mana_ib_create_qp_resp resp = {}; + 
struct gdma_dev *gd = mdev->gdma_dev; + struct mana_ib_create_qp ucmd = {}; + struct mana_obj_spec wq_spec = {}; + struct mana_obj_spec cq_spec = {}; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct ib_umem *umem; + int err; + u32 port; + + mc = gd->driver_data; + + if (!mana_ucontext || udata->inlen < sizeof(ucmd)) + return -EINVAL; + + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to copy from udata create qp-raw, %d\n", err); + return err; + } + + /* IB ports start with 1, MANA Ethernet ports start with 0 */ + port = ucmd.port; + if (ucmd.port > mc->num_ports) + return -EINVAL; + + if (attr->cap.max_send_wr > MAX_SEND_BUFFERS_PER_QUEUE) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_send_wr %d exceeding limit\n", + attr->cap.max_send_wr); + return -EINVAL; + } + + if (attr->cap.max_send_sge > MAX_TX_WQE_SGL_ENTRIES) { + ibdev_dbg(&mdev->ib_dev, + "Requested max_send_sge %d exceeding limit\n", + attr->cap.max_send_sge); + return -EINVAL; + } + + ndev = mc->ports[port - 1]; + mpc = netdev_priv(ndev); + ibdev_dbg(&mdev->ib_dev, "port %u ndev %p mpc %p\n", port, ndev, mpc); + + err = mana_ib_cfg_vport(mdev, port - 1, pd, mana_ucontext->doorbell); + if (err) + return -ENODEV; + + qp->port = port; + + ibdev_dbg(&mdev->ib_dev, "ucmd sq_buf_addr 0x%llx port %u\n", + ucmd.sq_buf_addr, ucmd.port); + + umem = ib_umem_get(ibpd->device, ucmd.sq_buf_addr, ucmd.sq_buf_size, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + ibdev_dbg(&mdev->ib_dev, + "Failed to get umem for create qp-raw, err %d\n", + err); + goto err_free_vport; + } + qp->sq_umem = umem; + + err = mana_ib_gd_create_dma_region(mdev, qp->sq_umem, + &qp->sq_gdma_region); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to create dma region for create qp-raw, %d\n", + err); + goto err_release_umem; + } + + ibdev_dbg(&mdev->ib_dev, + "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", + err, qp->sq_gdma_region); + + /* Create a WQ on the same port handle used by the Ethernet */ + wq_spec.gdma_region = qp->sq_gdma_region; + wq_spec.queue_size = ucmd.sq_buf_size; + + cq_spec.gdma_region = send_cq->gdma_region; + cq_spec.queue_size = send_cq->cqe * COMP_ENTRY_SIZE; + cq_spec.modr_ctx_id = 0; + cq_spec.attached_eq = GDMA_CQ_NO_EQ; + + err = mana_create_wq_obj(mpc, mpc->port_handle, GDMA_SQ, &wq_spec, + &cq_spec, &qp->tx_object); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to create wq for create raw-qp, err %d\n", + err); + goto err_destroy_dma_region; + } + + /* The GDMA regions are now owned by the WQ object */ + qp->sq_gdma_region = GDMA_INVALID_DMA_REGION; + send_cq->gdma_region = GDMA_INVALID_DMA_REGION; + + qp->sq_id = wq_spec.queue_index; + send_cq->id = cq_spec.queue_index; + + ibdev_dbg(&mdev->ib_dev, + "ret %d qp->tx_object 0x%llx sq id %llu cq id %llu\n", err, + qp->tx_object, qp->sq_id, send_cq->id); + + resp.sqid = qp->sq_id; + resp.cqid = send_cq->id; + resp.tx_vp_offset = pd->tx_vp_offset; + + err = ib_copy_to_udata(udata, &resp, sizeof(resp)); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed copy udata for create qp-raw, %d\n", + err); + goto err_destroy_wq_obj; + } + + return 0; + +err_destroy_wq_obj: + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); + +err_destroy_dma_region: + mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); + +err_release_umem: + ib_umem_release(umem); + +err_free_vport: + mana_ib_uncfg_vport(mdev, pd, port - 1); + + return err; +} + +int 
mana_ib_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attr, + struct ib_udata *udata) +{ + switch (attr->qp_type) { + case IB_QPT_RAW_PACKET: + /* When rwq_ind_tbl is used, it's for creating WQs for RSS */ + if (attr->rwq_ind_tbl) + return mana_ib_create_qp_rss(ibqp, ibqp->pd, attr, + udata); + + return mana_ib_create_qp_raw(ibqp, ibqp->pd, attr, udata); + default: + /* Creating QP other than IB_QPT_RAW_PACKET is not supported */ + ibdev_dbg(ibqp->device, "Creating QP type %u not supported\n", + attr->qp_type); + } + + return -EINVAL; +} + +int mana_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + /* modify_qp is not supported by this version of the driver */ + return -EOPNOTSUPP; +} + +static int mana_ib_destroy_qp_rss(struct mana_ib_qp *qp, + struct ib_rwq_ind_table *ind_tbl, + struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct gdma_dev *gd = mdev->gdma_dev; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct mana_ib_wq *wq; + struct ib_wq *ibwq; + int i; + + mc = gd->driver_data; + ndev = mc->ports[qp->port - 1]; + mpc = netdev_priv(ndev); + + for (i = 0; i < (1 << ind_tbl->log_ind_tbl_size); i++) { + ibwq = ind_tbl->ind_tbl[i]; + wq = container_of(ibwq, struct mana_ib_wq, ibwq); + ibdev_dbg(&mdev->ib_dev, "destroying wq->rx_object %llu\n", + wq->rx_object); + mana_destroy_wq_obj(mpc, GDMA_RQ, wq->rx_object); + } + + return 0; +} + +static int mana_ib_destroy_qp_raw(struct mana_ib_qp *qp, struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(qp->ibqp.device, struct mana_ib_dev, ib_dev); + struct gdma_dev *gd = mdev->gdma_dev; + struct ib_pd *ibpd = qp->ibqp.pd; + struct mana_port_context *mpc; + struct mana_context *mc; + struct net_device *ndev; + struct mana_ib_pd *pd; + + mc = gd->driver_data; + ndev = mc->ports[qp->port - 1]; + mpc = netdev_priv(ndev); + pd = container_of(ibpd, struct mana_ib_pd, ibpd); + + mana_destroy_wq_obj(mpc, GDMA_SQ, qp->tx_object); + + if (qp->sq_umem) { + mana_ib_gd_destroy_dma_region(mdev, qp->sq_gdma_region); + ib_umem_release(qp->sq_umem); + } + + mana_ib_uncfg_vport(mdev, pd, qp->port - 1); + + return 0; +} + +int mana_ib_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata) +{ + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp); + + switch (ibqp->qp_type) { + case IB_QPT_RAW_PACKET: + if (ibqp->rwq_ind_tbl) + return mana_ib_destroy_qp_rss(qp, ibqp->rwq_ind_tbl, + udata); + + return mana_ib_destroy_qp_raw(qp, udata); + + default: + ibdev_dbg(ibqp->device, "Unexpected QP type %u\n", + ibqp->qp_type); + } + + return -ENOENT; +} diff --git a/drivers/infiniband/hw/mana/wq.c b/drivers/infiniband/hw/mana/wq.c new file mode 100644 index 000000000000..372d361510e0 --- /dev/null +++ b/drivers/infiniband/hw/mana/wq.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. 
+ */ + +#include "mana_ib.h" + +struct ib_wq *mana_ib_create_wq(struct ib_pd *pd, + struct ib_wq_init_attr *init_attr, + struct ib_udata *udata) +{ + struct mana_ib_dev *mdev = + container_of(pd->device, struct mana_ib_dev, ib_dev); + struct mana_ib_create_wq ucmd = {}; + struct mana_ib_wq *wq; + struct ib_umem *umem; + int err; + + if (udata->inlen < sizeof(ucmd)) + return ERR_PTR(-EINVAL); + + err = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen)); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to copy from udata for create wq, %d\n", err); + return ERR_PTR(err); + } + + wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) + return ERR_PTR(-ENOMEM); + + ibdev_dbg(&mdev->ib_dev, "ucmd wq_buf_addr 0x%llx\n", ucmd.wq_buf_addr); + + umem = ib_umem_get(pd->device, ucmd.wq_buf_addr, ucmd.wq_buf_size, + IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(umem)) { + err = PTR_ERR(umem); + ibdev_dbg(&mdev->ib_dev, + "Failed to get umem for create wq, err %d\n", err); + goto err_free_wq; + } + + wq->umem = umem; + wq->wqe = init_attr->max_wr; + wq->wq_buf_size = ucmd.wq_buf_size; + wq->rx_object = INVALID_MANA_HANDLE; + + err = mana_ib_gd_create_dma_region(mdev, wq->umem, &wq->gdma_region); + if (err) { + ibdev_dbg(&mdev->ib_dev, + "Failed to create dma region for create wq, %d\n", + err); + goto err_release_umem; + } + + ibdev_dbg(&mdev->ib_dev, + "mana_ib_gd_create_dma_region ret %d gdma_region 0x%llx\n", + err, wq->gdma_region); + + /* WQ ID is returned at wq_create time, doesn't know the value yet */ + + return &wq->ibwq; + +err_release_umem: + ib_umem_release(umem); + +err_free_wq: + kfree(wq); + + return ERR_PTR(err); +} + +int mana_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr, + u32 wq_attr_mask, struct ib_udata *udata) +{ + /* modify_wq is not supported by this version of the driver */ + return -EOPNOTSUPP; +} + +int mana_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata) +{ + struct mana_ib_wq *wq = container_of(ibwq, struct mana_ib_wq, ibwq); + struct ib_device *ib_dev = ibwq->device; + struct mana_ib_dev *mdev; + + mdev = container_of(ib_dev, struct mana_ib_dev, ib_dev); + + mana_ib_gd_destroy_dma_region(mdev, wq->gdma_region); + ib_umem_release(wq->umem); + + kfree(wq); + + return 0; +} + +int mana_ib_create_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_table, + struct ib_rwq_ind_table_init_attr *init_attr, + struct ib_udata *udata) +{ + /* + * There is no additional data in ind_table to be maintained by this + * driver, do nothing + */ + return 0; +} + +int mana_ib_destroy_rwq_ind_table(struct ib_rwq_ind_table *ib_rwq_ind_tbl) +{ + /* + * There is no additional data in ind_table to be maintained by this + * driver, do nothing + */ + return 0; +} diff --git a/include/net/mana/mana.h b/include/net/mana/mana.h index 713a8f8cca9a..20212ffeefb9 100644 --- a/include/net/mana/mana.h +++ b/include/net/mana/mana.h @@ -412,6 +412,9 @@ int mana_bpf(struct net_device *ndev, struct netdev_bpf *bpf); extern const struct ethtool_ops mana_ethtool_ops; +/* A CQ can be created not associated with any EQ */ +#define GDMA_CQ_NO_EQ 0xffff + struct mana_obj_spec { u32 queue_index; u64 gdma_region; diff --git a/include/uapi/rdma/ib_user_ioctl_verbs.h b/include/uapi/rdma/ib_user_ioctl_verbs.h index 7dd56210226f..e0c25537fd2e 100644 --- a/include/uapi/rdma/ib_user_ioctl_verbs.h +++ b/include/uapi/rdma/ib_user_ioctl_verbs.h @@ -251,6 +251,7 @@ enum rdma_driver_id { RDMA_DRIVER_EFA, RDMA_DRIVER_SIW, RDMA_DRIVER_ERDMA, + RDMA_DRIVER_MANA, }; enum ib_uverbs_gid_type { diff --git 
a/include/uapi/rdma/mana-abi.h b/include/uapi/rdma/mana-abi.h new file mode 100644 index 000000000000..5fcb31b37fb9 --- /dev/null +++ b/include/uapi/rdma/mana-abi.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) */ +/* + * Copyright (c) 2022, Microsoft Corporation. All rights reserved. + */ + +#ifndef MANA_ABI_USER_H +#define MANA_ABI_USER_H + +#include +#include + +/* + * Increment this value if any changes that break userspace ABI + * compatibility are made. + */ + +#define MANA_IB_UVERBS_ABI_VERSION 1 + +struct mana_ib_create_cq { + __aligned_u64 buf_addr; +}; + +struct mana_ib_create_qp { + __aligned_u64 sq_buf_addr; + __u32 sq_buf_size; + __u32 port; +}; + +struct mana_ib_create_qp_resp { + __u32 sqid; + __u32 cqid; + __u32 tx_vp_offset; + __u32 reserved; +}; + +struct mana_ib_create_wq { + __aligned_u64 wq_buf_addr; + __u32 wq_buf_size; + __u32 reserved; +}; + +/* RX Hash function flags */ +enum mana_ib_rx_hash_function_flags { + MANA_IB_RX_HASH_FUNC_TOEPLITZ = 1 << 0, +}; + +struct mana_ib_create_qp_rss { + __aligned_u64 rx_hash_fields_mask; + __u8 rx_hash_function; + __u8 reserved[7]; + __u32 rx_hash_key_len; + __u8 rx_hash_key[40]; + __u32 port; +}; + +struct rss_resp_entry { + __u32 cqid; + __u32 wqid; +}; + +struct mana_ib_create_qp_rss_resp { + __aligned_u64 num_entries; + struct rss_resp_entry entries[64]; +}; + +#endif -- cgit v1.2.3 From 3574cfdca28543e2e8db649297cd6659ea8e4bb8 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 11 Nov 2022 11:55:29 +0200 Subject: RDMA/mana: Remove redefinition of basic u64 type gdma_obj_handle_t is no more than redefinition of basic u64 type. Remove such obfuscation. Link: https://lore.kernel.org/r/3c1e821279e6a165d058655d2343722d6650e776.1668160486.git.leonro@nvidia.com Acked-by: Long Li Signed-off-by: Leon Romanovsky --- drivers/infiniband/hw/mana/mr.c | 5 ++-- drivers/net/ethernet/microsoft/mana/gdma_main.c | 3 +-- include/net/mana/gdma.h | 31 +++++++++++-------------- 3 files changed, 17 insertions(+), 22 deletions(-) (limited to 'include/net') diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c index a56236cdd9ee..351207c60eb6 100644 --- a/drivers/infiniband/hw/mana/mr.c +++ b/drivers/infiniband/hw/mana/mr.c @@ -73,8 +73,7 @@ static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr, return 0; } -static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, - gdma_obj_handle_t mr_handle) +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, u64 mr_handle) { struct gdma_destroy_mr_response resp = {}; struct gdma_destroy_mr_request req = {}; @@ -108,9 +107,9 @@ struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 length, struct mana_ib_pd *pd = container_of(ibpd, struct mana_ib_pd, ibpd); struct gdma_create_mr_params mr_params = {}; struct ib_device *ibdev = ibpd->device; - gdma_obj_handle_t dma_region_handle; struct mana_ib_dev *dev; struct mana_ib_mr *mr; + u64 dma_region_handle; int err; dev = container_of(ibdev, struct mana_ib_dev, ib_dev); diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index 46a7d1e6ece9..69224ff8efb6 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -671,8 +671,7 @@ free_q: return err; } -int mana_gd_destroy_dma_region(struct gdma_context *gc, - gdma_obj_handle_t dma_region_handle) +int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle) { struct 
gdma_destroy_dma_region_req req = {}; struct gdma_general_resp resp = {}; diff --git a/include/net/mana/gdma.h b/include/net/mana/gdma.h index 221adc96340c..a9fdae14d24c 100644 --- a/include/net/mana/gdma.h +++ b/include/net/mana/gdma.h @@ -65,8 +65,6 @@ enum { GDMA_DEVICE_MANA = 2, }; -typedef u64 gdma_obj_handle_t; - struct gdma_resource { /* Protect the bitmap */ spinlock_t lock; @@ -200,7 +198,7 @@ struct gdma_mem_info { u64 length; /* Allocated by the PF driver */ - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; }; #define REGISTER_ATB_MST_MKEY_LOWER_SIZE 8 @@ -624,7 +622,7 @@ struct gdma_create_queue_req { u32 reserved1; u32 pdid; u32 doolbell_id; - gdma_obj_handle_t gdma_region; + u64 gdma_region; u32 reserved2; u32 queue_size; u32 log2_throttle_limit; @@ -699,14 +697,14 @@ struct gdma_create_dma_region_req { struct gdma_create_dma_region_resp { struct gdma_resp_hdr hdr; - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; }; /* HW DATA */ /* GDMA_DMA_REGION_ADD_PAGES */ struct gdma_dma_region_add_pages_req { struct gdma_req_hdr hdr; - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; u32 page_addr_list_len; u32 reserved3; @@ -718,7 +716,7 @@ struct gdma_dma_region_add_pages_req { struct gdma_destroy_dma_region_req { struct gdma_req_hdr hdr; - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; }; /* HW DATA */ enum gdma_pd_flags { @@ -733,14 +731,14 @@ struct gdma_create_pd_req { struct gdma_create_pd_resp { struct gdma_resp_hdr hdr; - gdma_obj_handle_t pd_handle; + u64 pd_handle; u32 pd_id; u32 reserved; };/* HW DATA */ struct gdma_destroy_pd_req { struct gdma_req_hdr hdr; - gdma_obj_handle_t pd_handle; + u64 pd_handle; };/* HW DATA */ struct gdma_destory_pd_resp { @@ -756,11 +754,11 @@ enum gdma_mr_type { }; struct gdma_create_mr_params { - gdma_obj_handle_t pd_handle; + u64 pd_handle; enum gdma_mr_type mr_type; union { struct { - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; @@ -769,13 +767,13 @@ struct gdma_create_mr_params { struct gdma_create_mr_request { struct gdma_req_hdr hdr; - gdma_obj_handle_t pd_handle; + u64 pd_handle; enum gdma_mr_type mr_type; u32 reserved_1; union { struct { - gdma_obj_handle_t dma_region_handle; + u64 dma_region_handle; u64 virtual_address; enum gdma_mr_access_flags access_flags; } gva; @@ -786,14 +784,14 @@ struct gdma_create_mr_request { struct gdma_create_mr_response { struct gdma_resp_hdr hdr; - gdma_obj_handle_t mr_handle; + u64 mr_handle; u32 lkey; u32 rkey; };/* HW DATA */ struct gdma_destroy_mr_request { struct gdma_req_hdr hdr; - gdma_obj_handle_t mr_handle; + u64 mr_handle; };/* HW DATA */ struct gdma_destroy_mr_response { @@ -827,7 +825,6 @@ void mana_gd_free_memory(struct gdma_mem_info *gmi); int mana_gd_send_request(struct gdma_context *gc, u32 req_len, const void *req, u32 resp_len, void *resp); -int mana_gd_destroy_dma_region(struct gdma_context *gc, - gdma_obj_handle_t dma_region_handle); +int mana_gd_destroy_dma_region(struct gdma_context *gc, u64 dma_region_handle); #endif /* _GDMA_H */ -- cgit v1.2.3 From 70ea86a0dfed10e00ee2666dadeb563bab00efea Mon Sep 17 00:00:00 2001 From: Steen Hegelund Date: Fri, 11 Nov 2022 14:05:14 +0100 Subject: net: flow_offload: add support for ARP frame matching This adds a new flow_rule_match_arp function that allows drivers to be able to dissect ARP frames. Signed-off-by: Steen Hegelund Signed-off-by: David S. 
Miller --- include/net/flow_offload.h | 6 ++++++ net/core/flow_offload.c | 7 +++++++ 2 files changed, 13 insertions(+) (limited to 'include/net') diff --git a/include/net/flow_offload.h b/include/net/flow_offload.h index 7a60bc6d72c9..0400a0ac8a29 100644 --- a/include/net/flow_offload.h +++ b/include/net/flow_offload.h @@ -32,6 +32,10 @@ struct flow_match_vlan { struct flow_dissector_key_vlan *key, *mask; }; +struct flow_match_arp { + struct flow_dissector_key_arp *key, *mask; +}; + struct flow_match_ipv4_addrs { struct flow_dissector_key_ipv4_addrs *key, *mask; }; @@ -98,6 +102,8 @@ void flow_rule_match_vlan(const struct flow_rule *rule, struct flow_match_vlan *out); void flow_rule_match_cvlan(const struct flow_rule *rule, struct flow_match_vlan *out); +void flow_rule_match_arp(const struct flow_rule *rule, + struct flow_match_arp *out); void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out); void flow_rule_match_ipv6_addrs(const struct flow_rule *rule, diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index abe423fd5736..acfc1f88ea79 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -97,6 +97,13 @@ void flow_rule_match_cvlan(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_cvlan); +void flow_rule_match_arp(const struct flow_rule *rule, + struct flow_match_arp *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ARP, out); +} +EXPORT_SYMBOL(flow_rule_match_arp); + void flow_rule_match_ipv4_addrs(const struct flow_rule *rule, struct flow_match_ipv4_addrs *out) { -- cgit v1.2.3 From 7d34aa3e03b6a56306296bd98b26c6a1710cd57b Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 14 Oct 2022 23:45:58 +0200 Subject: netfilter: nf_tables: Extend nft_expr_ops::dump callback parameters Add a 'reset' flag just like with nft_object_ops::dump. This will be useful to reset "anonymous stateful objects", e.g. simple rule counters. No functional change intended. 
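As an illustration of the new callback shape, here is a minimal hedged sketch of a stateful expression dump that honours the flag. Only the (skb, expr, reset) signature comes from this patch; the nft_example_priv struct, its atomic64_t counter and the NFTA_EXAMPLE_* attributes are hypothetical:

static int nft_example_dump(struct sk_buff *skb,
			    const struct nft_expr *expr, bool reset)
{
	struct nft_example_priv *priv = nft_expr_priv(expr);	/* hypothetical priv */
	u64 val = atomic64_read(&priv->counter);

	/* Emit the current value first, then clear it when the dumper
	 * asked for a reset (wired up by a later patch in this series).
	 */
	if (reset)
		atomic64_set(&priv->counter, 0);

	if (nla_put_be64(skb, NFTA_EXAMPLE_VALUE, cpu_to_be64(val),
			 NFTA_EXAMPLE_PAD))
		return -1;

	return 0;
}

A callback that simply ignores the flag keeps its current behaviour, which is why the conversion below is mechanical for most expressions.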
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 3 ++- include/net/netfilter/nft_fib.h | 2 +- include/net/netfilter/nft_meta.h | 4 ++-- include/net/netfilter/nft_reject.h | 3 ++- net/ipv4/netfilter/nft_dup_ipv4.c | 3 ++- net/ipv6/netfilter/nft_dup_ipv6.c | 3 ++- net/netfilter/nf_tables_api.c | 2 +- net/netfilter/nft_bitwise.c | 6 ++++-- net/netfilter/nft_byteorder.c | 3 ++- net/netfilter/nft_cmp.c | 9 ++++++--- net/netfilter/nft_compat.c | 9 ++++++--- net/netfilter/nft_connlimit.c | 3 ++- net/netfilter/nft_counter.c | 3 ++- net/netfilter/nft_ct.c | 6 ++++-- net/netfilter/nft_dup_netdev.c | 3 ++- net/netfilter/nft_dynset.c | 3 ++- net/netfilter/nft_exthdr.c | 9 ++++++--- net/netfilter/nft_fib.c | 2 +- net/netfilter/nft_flow_offload.c | 3 ++- net/netfilter/nft_fwd_netdev.c | 6 ++++-- net/netfilter/nft_hash.c | 4 ++-- net/netfilter/nft_immediate.c | 3 ++- net/netfilter/nft_inner.c | 3 ++- net/netfilter/nft_last.c | 3 ++- net/netfilter/nft_limit.c | 5 +++-- net/netfilter/nft_log.c | 3 ++- net/netfilter/nft_lookup.c | 3 ++- net/netfilter/nft_masq.c | 3 ++- net/netfilter/nft_meta.c | 5 +++-- net/netfilter/nft_nat.c | 3 ++- net/netfilter/nft_numgen.c | 6 ++++-- net/netfilter/nft_objref.c | 6 ++++-- net/netfilter/nft_osf.c | 3 ++- net/netfilter/nft_payload.c | 6 ++++-- net/netfilter/nft_queue.c | 6 ++++-- net/netfilter/nft_quota.c | 3 ++- net/netfilter/nft_range.c | 3 ++- net/netfilter/nft_redir.c | 3 ++- net/netfilter/nft_reject.c | 3 ++- net/netfilter/nft_rt.c | 2 +- net/netfilter/nft_socket.c | 2 +- net/netfilter/nft_synproxy.c | 3 ++- net/netfilter/nft_tproxy.c | 2 +- net/netfilter/nft_tunnel.c | 2 +- net/netfilter/nft_xfrm.c | 2 +- 45 files changed, 110 insertions(+), 62 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 38e2b396e38a..c557a57fb0f1 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -927,7 +927,8 @@ struct nft_expr_ops { void (*destroy_clone)(const struct nft_ctx *ctx, const struct nft_expr *expr); int (*dump)(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, + bool reset); int (*validate)(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data); diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index eed099eae672..167640b843ef 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -18,7 +18,7 @@ nft_fib_is_loopback(const struct sk_buff *skb, const struct net_device *in) return skb->pkt_type == PACKET_LOOPBACK || in->flags & IFF_LOOPBACK; } -int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr); +int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset); int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, diff --git a/include/net/netfilter/nft_meta.h b/include/net/netfilter/nft_meta.h index f3a5285a511c..ba1238f12a48 100644 --- a/include/net/netfilter/nft_meta.h +++ b/include/net/netfilter/nft_meta.h @@ -24,10 +24,10 @@ int nft_meta_set_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]); int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); int nft_meta_set_dump(struct sk_buff *skb, - const struct nft_expr *expr); + const struct nft_expr *expr, bool 
reset); void nft_meta_get_eval(const struct nft_expr *expr, struct nft_regs *regs, diff --git a/include/net/netfilter/nft_reject.h b/include/net/netfilter/nft_reject.h index 56b123a42220..6d9ba62efd75 100644 --- a/include/net/netfilter/nft_reject.h +++ b/include/net/netfilter/nft_reject.h @@ -22,7 +22,8 @@ int nft_reject_init(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nlattr * const tb[]); -int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr); +int nft_reject_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset); int nft_reject_icmp_code(u8 code); int nft_reject_icmpv6_code(u8 code); diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index 0bcd6aee6000..a522c3a3be52 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -52,7 +52,8 @@ static int nft_dup_ipv4_init(const struct nft_ctx *ctx, return err; } -static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dup_ipv4_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_dup_ipv4 *priv = nft_expr_priv(expr); diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index 70a405b4006f..c82f3fdd4a65 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -50,7 +50,8 @@ static int nft_dup_ipv6_init(const struct nft_ctx *ctx, return err; } -static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dup_ipv6_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_dup_ipv6 *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 62da204eed41..741a0e386406 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2769,7 +2769,7 @@ static int nf_tables_fill_expr_info(struct sk_buff *skb, NFTA_EXPR_DATA); if (data == NULL) goto nla_put_failure; - if (expr->ops->dump(skb, expr) < 0) + if (expr->ops->dump(skb, expr, false) < 0) goto nla_put_failure; nla_nest_end(skb, data); } diff --git a/net/netfilter/nft_bitwise.c b/net/netfilter/nft_bitwise.c index e6e402b247d0..84eae7cabc67 100644 --- a/net/netfilter/nft_bitwise.c +++ b/net/netfilter/nft_bitwise.c @@ -232,7 +232,8 @@ static int nft_bitwise_dump_shift(struct sk_buff *skb, return 0; } -static int nft_bitwise_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_bitwise_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_bitwise *priv = nft_expr_priv(expr); int err = 0; @@ -393,7 +394,8 @@ static int nft_bitwise_fast_init(const struct nft_ctx *ctx, } static int -nft_bitwise_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +nft_bitwise_fast_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_bitwise_fast_expr *priv = nft_expr_priv(expr); struct nft_data data; diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c index f952a80275a8..b66647a5a171 100644 --- a/net/netfilter/nft_byteorder.c +++ b/net/netfilter/nft_byteorder.c @@ -148,7 +148,8 @@ static int nft_byteorder_init(const struct nft_ctx *ctx, priv->len); } -static int nft_byteorder_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_byteorder_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_byteorder *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_cmp.c 
b/net/netfilter/nft_cmp.c index 963cf831799c..6eb21a4f5698 100644 --- a/net/netfilter/nft_cmp.c +++ b/net/netfilter/nft_cmp.c @@ -92,7 +92,8 @@ static int nft_cmp_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return 0; } -static int nft_cmp_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_cmp_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_cmp_expr *priv = nft_expr_priv(expr); @@ -253,7 +254,8 @@ static int nft_cmp_fast_offload(struct nft_offload_ctx *ctx, return __nft_cmp_offload(ctx, flow, &cmp); } -static int nft_cmp_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_cmp_fast_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_cmp_fast_expr *priv = nft_expr_priv(expr); enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ; @@ -347,7 +349,8 @@ static int nft_cmp16_fast_offload(struct nft_offload_ctx *ctx, return __nft_cmp_offload(ctx, flow, &cmp); } -static int nft_cmp16_fast_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_cmp16_fast_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_cmp16_fast_expr *priv = nft_expr_priv(expr); enum nft_cmp_ops op = priv->inv ? NFT_CMP_NEQ : NFT_CMP_EQ; diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index c16172427622..5284cd2ad532 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -324,7 +324,8 @@ static int nft_extension_dump_info(struct sk_buff *skb, int attr, return 0; } -static int nft_target_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_target_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct xt_target *target = expr->ops->data; void *info = nft_expr_priv(expr); @@ -572,12 +573,14 @@ nla_put_failure: return -1; } -static int nft_match_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_match_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { return __nft_match_dump(skb, expr, nft_expr_priv(expr)); } -static int nft_match_large_dump(struct sk_buff *skb, const struct nft_expr *e) +static int nft_match_large_dump(struct sk_buff *skb, + const struct nft_expr *e, bool reset) { struct nft_xt_match_priv *priv = nft_expr_priv(e); diff --git a/net/netfilter/nft_connlimit.c b/net/netfilter/nft_connlimit.c index d657f999a11b..de9d1980df69 100644 --- a/net/netfilter/nft_connlimit.c +++ b/net/netfilter/nft_connlimit.c @@ -185,7 +185,8 @@ static void nft_connlimit_eval(const struct nft_expr *expr, nft_connlimit_do_eval(priv, regs, pkt, NULL); } -static int nft_connlimit_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_connlimit_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_connlimit *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index f4d3573e8782..06482fb9c145 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -201,7 +201,8 @@ void nft_counter_eval(const struct nft_expr *expr, struct nft_regs *regs, nft_counter_do_eval(priv, regs, pkt); } -static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_counter_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index a3f01f209a53..a0696d7ea10c 100644 --- 
a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -641,7 +641,8 @@ static void nft_ct_set_destroy(const struct nft_ctx *ctx, nf_ct_netns_put(ctx->net, ctx->family); } -static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ct_get_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); @@ -703,7 +704,8 @@ static bool nft_ct_get_reduce(struct nft_regs_track *track, return nft_expr_reduce_bitwise(track, expr); } -static int nft_ct_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ct_set_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ct *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c index 63507402716d..e5739a59ebf1 100644 --- a/net/netfilter/nft_dup_netdev.c +++ b/net/netfilter/nft_dup_netdev.c @@ -44,7 +44,8 @@ static int nft_dup_netdev_init(const struct nft_ctx *ctx, sizeof(int)); } -static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dup_netdev_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_dup_netdev *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 6983e6ddeef9..01c61e090639 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -357,7 +357,8 @@ static void nft_dynset_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } -static int nft_dynset_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_dynset_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_dynset *priv = nft_expr_priv(expr); u32 flags = priv->invert ? 
NFT_DYNSET_F_INV : 0; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index a67ea9c3ae57..ed929d0d37ce 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -576,7 +576,8 @@ nla_put_failure: return -1; } -static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_exthdr_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); @@ -586,7 +587,8 @@ static int nft_exthdr_dump(struct sk_buff *skb, const struct nft_expr *expr) return nft_exthdr_dump_common(skb, priv); } -static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_exthdr_dump_set(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); @@ -596,7 +598,8 @@ static int nft_exthdr_dump_set(struct sk_buff *skb, const struct nft_expr *expr) return nft_exthdr_dump_common(skb, priv); } -static int nft_exthdr_dump_strip(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_exthdr_dump_strip(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_exthdr *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 1f12d7ade606..6e049fd48760 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -118,7 +118,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, } EXPORT_SYMBOL_GPL(nft_fib_init); -int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr, bool reset) { const struct nft_fib *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c index a25c88bc8b75..e860d8fe0e5e 100644 --- a/net/netfilter/nft_flow_offload.c +++ b/net/netfilter/nft_flow_offload.c @@ -433,7 +433,8 @@ static void nft_flow_offload_destroy(const struct nft_ctx *ctx, nf_ct_netns_put(ctx->net, ctx->family); } -static int nft_flow_offload_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_flow_offload_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_flow_offload *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c index 7c5876dc9ff2..7b9d4d1bd17c 100644 --- a/net/netfilter/nft_fwd_netdev.c +++ b/net/netfilter/nft_fwd_netdev.c @@ -56,7 +56,8 @@ static int nft_fwd_netdev_init(const struct nft_ctx *ctx, sizeof(int)); } -static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_fwd_netdev_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_fwd_netdev *priv = nft_expr_priv(expr); @@ -186,7 +187,8 @@ static int nft_fwd_neigh_init(const struct nft_ctx *ctx, addr_len); } -static int nft_fwd_neigh_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_fwd_neigh_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_fwd_neigh *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index e5631e88b285..ee8d487b69c0 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -139,7 +139,7 @@ static int nft_symhash_init(const struct nft_ctx *ctx, } static int nft_jhash_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_jhash *priv = nft_expr_priv(expr); @@ -176,7 +176,7 @@ 
static bool nft_jhash_reduce(struct nft_regs_track *track, } static int nft_symhash_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_symhash *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 5f28b21abc7d..c9d2f7c29f53 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -147,7 +147,8 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, } } -static int nft_immediate_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_immediate_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_immediate_expr *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 809f0d0787ec..6d96b826db4e 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -347,7 +347,8 @@ static int nft_inner_init(const struct nft_ctx *ctx, return 0; } -static int nft_inner_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_inner_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_inner *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_last.c b/net/netfilter/nft_last.c index bb15a55dad5c..7f2bda6641bd 100644 --- a/net/netfilter/nft_last.c +++ b/net/netfilter/nft_last.c @@ -65,7 +65,8 @@ static void nft_last_eval(const struct nft_expr *expr, WRITE_ONCE(last->set, 1); } -static int nft_last_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_last_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_last_priv *priv = nft_expr_priv(expr); struct nft_last *last = priv->last; diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index 981addb2d051..145dc62c6247 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -193,7 +193,8 @@ static int nft_limit_pkts_init(const struct nft_ctx *ctx, return 0; } -static int nft_limit_pkts_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_limit_pkts_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_limit_priv_pkts *priv = nft_expr_priv(expr); @@ -251,7 +252,7 @@ static int nft_limit_bytes_init(const struct nft_ctx *ctx, } static int nft_limit_bytes_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_limit_priv *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 0e13c003f0c1..5defe6e4fd98 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -241,7 +241,8 @@ static void nft_log_destroy(const struct nft_ctx *ctx, nf_logger_put(ctx->family, li->type); } -static int nft_log_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_log_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_log *priv = nft_expr_priv(expr); const struct nf_loginfo *li = &priv->loginfo; diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index dfae12759c7c..cae5a6724163 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -178,7 +178,8 @@ static void nft_lookup_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } -static int nft_lookup_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_lookup_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_lookup 
*priv = nft_expr_priv(expr); u32 flags = priv->invert ? NFT_LOOKUP_F_INV : 0; diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 2a0adc497bbb..e55e455275c4 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -73,7 +73,8 @@ static int nft_masq_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } -static int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_masq_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_masq *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 8c39adeebb5c..e384e0de7a54 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -669,7 +669,7 @@ int nft_meta_set_init(const struct nft_ctx *ctx, EXPORT_SYMBOL_GPL(nft_meta_set_init); int nft_meta_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_meta *priv = nft_expr_priv(expr); @@ -684,7 +684,8 @@ nla_put_failure: } EXPORT_SYMBOL_GPL(nft_meta_get_dump); -int nft_meta_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_meta_set_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_meta *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index e5fd6995e4bf..047999150390 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -255,7 +255,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return nf_ct_netns_get(ctx->net, family); } -static int nft_nat_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_nat_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_nat *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index 45d3dc9e96f2..7d29db7c2ac0 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -112,7 +112,8 @@ nla_put_failure: return -1; } -static int nft_ng_inc_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ng_inc_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ng_inc *priv = nft_expr_priv(expr); @@ -168,7 +169,8 @@ static int nft_ng_random_init(const struct nft_ctx *ctx, NULL, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_ng_random_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_ng_random_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_ng_random *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 74e0eea4abac..7b01aa2ef653 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -47,7 +47,8 @@ static int nft_objref_init(const struct nft_ctx *ctx, return 0; } -static int nft_objref_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_objref_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_object *obj = nft_objref_priv(expr); @@ -155,7 +156,8 @@ static int nft_objref_map_init(const struct nft_ctx *ctx, return 0; } -static int nft_objref_map_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_objref_map_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_objref_map *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index adacf95b6e2b..70820c66b591 100644 
--- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -92,7 +92,8 @@ static int nft_osf_init(const struct nft_ctx *ctx, return 0; } -static int nft_osf_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_osf_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_osf *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index 53e64d8aa01f..336ac668cae3 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -231,7 +231,8 @@ static int nft_payload_init(const struct nft_ctx *ctx, priv->len); } -static int nft_payload_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_payload_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_payload *priv = nft_expr_priv(expr); @@ -919,7 +920,8 @@ static int nft_payload_set_init(const struct nft_ctx *ctx, priv->len); } -static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_payload_set_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_payload_set *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_queue.c b/net/netfilter/nft_queue.c index da29e92c03e2..b2b8127c8d43 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -152,7 +152,8 @@ static int nft_queue_sreg_init(const struct nft_ctx *ctx, return 0; } -static int nft_queue_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_queue_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_queue *priv = nft_expr_priv(expr); @@ -168,7 +169,8 @@ nla_put_failure: } static int -nft_queue_sreg_dump(struct sk_buff *skb, const struct nft_expr *expr) +nft_queue_sreg_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_queue *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index e6b0df68feea..b1a1217bca4c 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -217,7 +217,8 @@ static int nft_quota_init(const struct nft_ctx *ctx, return nft_quota_do_init(tb, priv); } -static int nft_quota_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_quota_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_quota *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_range.c b/net/netfilter/nft_range.c index 832f0d725a9e..0566d6aaf1e5 100644 --- a/net/netfilter/nft_range.c +++ b/net/netfilter/nft_range.c @@ -111,7 +111,8 @@ err1: return err; } -static int nft_range_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_range_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_range_expr *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 5086adfe731c..5f7739987559 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -75,7 +75,8 @@ static int nft_redir_init(const struct nft_ctx *ctx, return nf_ct_netns_get(ctx->net, ctx->family); } -static int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_redir_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_redir *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index 927ff8459bd9..f2addc844dd2 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c 
@@ -69,7 +69,8 @@ int nft_reject_init(const struct nft_ctx *ctx, } EXPORT_SYMBOL_GPL(nft_reject_init); -int nft_reject_dump(struct sk_buff *skb, const struct nft_expr *expr) +int nft_reject_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { const struct nft_reject *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c index 71931ec91721..5990fdd7b3cc 100644 --- a/net/netfilter/nft_rt.c +++ b/net/netfilter/nft_rt.c @@ -146,7 +146,7 @@ static int nft_rt_get_init(const struct nft_ctx *ctx, } static int nft_rt_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_rt *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index 49a5348a6a14..85f8df87efda 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -199,7 +199,7 @@ static int nft_socket_init(const struct nft_ctx *ctx, } static int nft_socket_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_socket *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c index 6cf9a04fbfe2..13da882669a4 100644 --- a/net/netfilter/nft_synproxy.c +++ b/net/netfilter/nft_synproxy.c @@ -272,7 +272,8 @@ static void nft_synproxy_destroy(const struct nft_ctx *ctx, nft_synproxy_do_destroy(ctx); } -static int nft_synproxy_dump(struct sk_buff *skb, const struct nft_expr *expr) +static int nft_synproxy_dump(struct sk_buff *skb, + const struct nft_expr *expr, bool reset) { struct nft_synproxy *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c index 62da25ad264b..ea83f661417e 100644 --- a/net/netfilter/nft_tproxy.c +++ b/net/netfilter/nft_tproxy.c @@ -294,7 +294,7 @@ static void nft_tproxy_destroy(const struct nft_ctx *ctx, } static int nft_tproxy_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_tproxy *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_tunnel.c b/net/netfilter/nft_tunnel.c index 983ade4be3b3..b059aa541798 100644 --- a/net/netfilter/nft_tunnel.c +++ b/net/netfilter/nft_tunnel.c @@ -108,7 +108,7 @@ static int nft_tunnel_get_init(const struct nft_ctx *ctx, } static int nft_tunnel_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_tunnel *priv = nft_expr_priv(expr); diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c index 1c5343c936a8..c88fd078a9ae 100644 --- a/net/netfilter/nft_xfrm.c +++ b/net/netfilter/nft_xfrm.c @@ -212,7 +212,7 @@ static void nft_xfrm_get_eval(const struct nft_expr *expr, } static int nft_xfrm_get_dump(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { const struct nft_xfrm *priv = nft_expr_priv(expr); -- cgit v1.2.3 From 8daa8fde3fc3f069ff0b5c87079a5c1df7743113 Mon Sep 17 00:00:00 2001 From: Phil Sutter Date: Fri, 14 Oct 2022 23:45:59 +0200 Subject: netfilter: nf_tables: Introduce NFT_MSG_GETRULE_RESET Analogous to NFT_MSG_GETOBJ_RESET, but for rules: reset stateful expressions like counters or quotas. These two are the only consumers so far; adjust their 'dump' callbacks to respect the parameter introduced earlier.
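To show the intended use from userspace, a hedged libmnl-based sketch of building such a request. build_getrule_reset is a made-up helper; buffer sizing, table/chain attributes, send/receive and error handling are all elided, and apart from the message type this is a plain NFT_MSG_GETRULE dump:

#include <stdint.h>
#include <sys/socket.h>
#include <libmnl/libmnl.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>

/* Dump all rules and reset their stateful expressions in one request. */
static struct nlmsghdr *build_getrule_reset(char *buf, uint32_t seq)
{
	struct nlmsghdr *nlh = mnl_nlmsg_put_header(buf);
	struct nfgenmsg *nfg;

	nlh->nlmsg_type = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_GETRULE_RESET;
	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
	nlh->nlmsg_seq = seq;

	nfg = mnl_nlmsg_put_extra_header(nlh, sizeof(*nfg));
	nfg->nfgen_family = AF_UNSPEC;	/* all families */
	nfg->version = NFNETLINK_V0;
	nfg->res_id = 0;

	return nlh;
}

On the kernel side, nf_tables_getrule() and the dump path map this message type to reset = true, as the diff below shows.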
Signed-off-by: Phil Sutter Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_tables.h | 2 +- include/uapi/linux/netfilter/nf_tables.h | 2 ++ net/netfilter/nf_tables_api.c | 49 +++++++++++++++++++++----------- net/netfilter/nft_counter.c | 2 +- net/netfilter/nft_dynset.c | 4 +-- net/netfilter/nft_inner.c | 2 +- net/netfilter/nft_quota.c | 2 +- 7 files changed, 41 insertions(+), 22 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index c557a57fb0f1..e69ce23566ea 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -383,7 +383,7 @@ int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src); void nft_expr_destroy(const struct nft_ctx *ctx, struct nft_expr *expr); int nft_expr_dump(struct sk_buff *skb, unsigned int attr, - const struct nft_expr *expr); + const struct nft_expr *expr, bool reset); bool nft_expr_reduce_bitwise(struct nft_regs_track *track, const struct nft_expr *expr); diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index e4b739d57480..cfa844da1ce6 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -97,6 +97,7 @@ enum nft_verdicts { * @NFT_MSG_NEWFLOWTABLE: add new flow table (enum nft_flowtable_attributes) * @NFT_MSG_GETFLOWTABLE: get flow table (enum nft_flowtable_attributes) * @NFT_MSG_DELFLOWTABLE: delete flow table (enum nft_flowtable_attributes) + * @NFT_MSG_GETRULE_RESET: get rules and reset stateful expressions (enum nft_obj_attributes) */ enum nf_tables_msg_types { NFT_MSG_NEWTABLE, @@ -124,6 +125,7 @@ enum nf_tables_msg_types { NFT_MSG_NEWFLOWTABLE, NFT_MSG_GETFLOWTABLE, NFT_MSG_DELFLOWTABLE, + NFT_MSG_GETRULE_RESET, NFT_MSG_MAX, }; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 741a0e386406..80e613405f6f 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2759,7 +2759,7 @@ static const struct nla_policy nft_expr_policy[NFTA_EXPR_MAX + 1] = { }; static int nf_tables_fill_expr_info(struct sk_buff *skb, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { if (nla_put_string(skb, NFTA_EXPR_NAME, expr->ops->type->name)) goto nla_put_failure; @@ -2769,7 +2769,7 @@ static int nf_tables_fill_expr_info(struct sk_buff *skb, NFTA_EXPR_DATA); if (data == NULL) goto nla_put_failure; - if (expr->ops->dump(skb, expr, false) < 0) + if (expr->ops->dump(skb, expr, reset) < 0) goto nla_put_failure; nla_nest_end(skb, data); } @@ -2781,14 +2781,14 @@ nla_put_failure: }; int nft_expr_dump(struct sk_buff *skb, unsigned int attr, - const struct nft_expr *expr) + const struct nft_expr *expr, bool reset) { struct nlattr *nest; nest = nla_nest_start_noflag(skb, attr); if (!nest) goto nla_put_failure; - if (nf_tables_fill_expr_info(skb, expr) < 0) + if (nf_tables_fill_expr_info(skb, expr, reset) < 0) goto nla_put_failure; nla_nest_end(skb, nest); return 0; @@ -3034,7 +3034,8 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, u32 flags, int family, const struct nft_table *table, const struct nft_chain *chain, - const struct nft_rule *rule, u64 handle) + const struct nft_rule *rule, u64 handle, + bool reset) { struct nlmsghdr *nlh; const struct nft_expr *expr, *next; @@ -3067,7 +3068,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, if (list == NULL) goto 
nla_put_failure; nft_rule_for_each_expr(expr, next, rule) { - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, reset) < 0) goto nla_put_failure; } nla_nest_end(skb, list); @@ -3118,7 +3119,7 @@ static void nf_tables_rule_notify(const struct nft_ctx *ctx, err = nf_tables_fill_rule_info(skb, ctx->net, ctx->portid, ctx->seq, event, flags, ctx->family, ctx->table, - ctx->chain, rule, handle); + ctx->chain, rule, handle, false); if (err < 0) { kfree_skb(skb); goto err; @@ -3139,7 +3140,8 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, unsigned int *idx, struct netlink_callback *cb, const struct nft_table *table, - const struct nft_chain *chain) + const struct nft_chain *chain, + bool reset) { struct net *net = sock_net(skb->sk); const struct nft_rule *rule, *prule; @@ -3166,7 +3168,7 @@ static int __nf_tables_dump_rules(struct sk_buff *skb, NFT_MSG_NEWRULE, NLM_F_MULTI | NLM_F_APPEND, table->family, - table, chain, rule, handle) < 0) + table, chain, rule, handle, reset) < 0) return 1; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -3189,6 +3191,10 @@ static int nf_tables_dump_rules(struct sk_buff *skb, struct net *net = sock_net(skb->sk); int family = nfmsg->nfgen_family; struct nftables_pernet *nft_net; + bool reset = false; + + if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) + reset = true; rcu_read_lock(); nft_net = nft_pernet(net); @@ -3213,14 +3219,15 @@ static int nf_tables_dump_rules(struct sk_buff *skb, if (!nft_is_active(net, chain)) continue; __nf_tables_dump_rules(skb, &idx, - cb, table, chain); + cb, table, chain, reset); break; } goto done; } list_for_each_entry_rcu(chain, &table->chains, list) { - if (__nf_tables_dump_rules(skb, &idx, cb, table, chain)) + if (__nf_tables_dump_rules(skb, &idx, + cb, table, chain, reset)) goto done; } @@ -3291,6 +3298,7 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, struct net *net = info->net; struct nft_table *table; struct sk_buff *skb2; + bool reset = false; int err; if (info->nlh->nlmsg_flags & NLM_F_DUMP) { @@ -3327,9 +3335,12 @@ static int nf_tables_getrule(struct sk_buff *skb, const struct nfnl_info *info, if (!skb2) return -ENOMEM; + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_GETRULE_RESET) + reset = true; + err = nf_tables_fill_rule_info(skb2, net, NETLINK_CB(skb).portid, info->nlh->nlmsg_seq, NFT_MSG_NEWRULE, 0, - family, table, chain, rule, 0); + family, table, chain, rule, 0, reset); if (err < 0) goto err_fill_rule_info; @@ -4104,7 +4115,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, if (set->num_exprs == 1) { nest = nla_nest_start_noflag(skb, NFTA_SET_EXPR); - if (nf_tables_fill_expr_info(skb, set->exprs[0]) < 0) + if (nf_tables_fill_expr_info(skb, set->exprs[0], false) < 0) goto nla_put_failure; nla_nest_end(skb, nest); @@ -4115,7 +4126,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, for (i = 0; i < set->num_exprs; i++) { if (nft_expr_dump(skb, NFTA_LIST_ELEM, - set->exprs[i]) < 0) + set->exprs[i], false) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -4946,7 +4957,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, if (num_exprs == 1) { expr = nft_setelem_expr_at(elem_expr, 0); - if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr) < 0) + if (nft_expr_dump(skb, NFTA_SET_ELEM_EXPR, expr, false) < 0) return -1; return 0; @@ -4957,7 +4968,7 @@ static int nft_set_elem_expr_dump(struct sk_buff *skb, nft_setelem_expr_foreach(expr, 
elem_expr, size) { expr = nft_setelem_expr_at(elem_expr, size); - if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr) < 0) + if (nft_expr_dump(skb, NFTA_LIST_ELEM, expr, false) < 0) goto nla_put_failure; } nla_nest_end(skb, nest); @@ -8311,6 +8322,12 @@ static const struct nfnl_callback nf_tables_cb[NFT_MSG_MAX] = { .attr_count = NFTA_RULE_MAX, .policy = nft_rule_policy, }, + [NFT_MSG_GETRULE_RESET] = { + .call = nf_tables_getrule, + .type = NFNL_CB_RCU, + .attr_count = NFTA_RULE_MAX, + .policy = nft_rule_policy, + }, [NFT_MSG_DELRULE] = { .call = nf_tables_delrule, .type = NFNL_CB_BATCH, diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 06482fb9c145..dccc68a5135a 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -206,7 +206,7 @@ static int nft_counter_dump(struct sk_buff *skb, { struct nft_counter_percpu_priv *priv = nft_expr_priv(expr); - return nft_counter_do_dump(skb, priv, false); + return nft_counter_do_dump(skb, priv, reset); } static int nft_counter_init(const struct nft_ctx *ctx, diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 01c61e090639..274579b1696e 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -380,7 +380,7 @@ static int nft_dynset_dump(struct sk_buff *skb, if (priv->set->num_exprs == 0) { if (priv->num_exprs == 1) { if (nft_expr_dump(skb, NFTA_DYNSET_EXPR, - priv->expr_array[0])) + priv->expr_array[0], reset)) goto nla_put_failure; } else if (priv->num_exprs > 1) { struct nlattr *nest; @@ -391,7 +391,7 @@ static int nft_dynset_dump(struct sk_buff *skb, for (i = 0; i < priv->num_exprs; i++) { if (nft_expr_dump(skb, NFTA_LIST_ELEM, - priv->expr_array[i])) + priv->expr_array[i], reset)) goto nla_put_failure; } nla_nest_end(skb, nest); diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c index 6d96b826db4e..28e2873ba24e 100644 --- a/net/netfilter/nft_inner.c +++ b/net/netfilter/nft_inner.c @@ -359,7 +359,7 @@ static int nft_inner_dump(struct sk_buff *skb, goto nla_put_failure; if (nft_expr_dump(skb, NFTA_INNER_EXPR, - (struct nft_expr *)&priv->expr) < 0) + (struct nft_expr *)&priv->expr, reset) < 0) goto nla_put_failure; return 0; diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index b1a1217bca4c..123578e28917 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -222,7 +222,7 @@ static int nft_quota_dump(struct sk_buff *skb, { struct nft_quota *priv = nft_expr_priv(expr); - return nft_quota_do_dump(skb, priv, false); + return nft_quota_do_dump(skb, priv, reset); } static void nft_quota_destroy(const struct nft_ctx *ctx, -- cgit v1.2.3 From 53d04b9811107633f25be02a5d981a6070d09e6e Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 14 Nov 2022 19:07:30 +0200 Subject: net: dsa: remove phylink_validate() method No DSA driver uses a custom link mode validation procedure anymore. So remove this DSA operation and let phylink determine what is supported based on config->mac_capabilities (if provided by the driver). Leave a comment explaining why the remaining code was kept and that there is more work to do.
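As an illustration of the driver side after this change, a hedged sketch of the phylink_get_caps implementation for a made-up 'foo' switch; the capability and interface values are purely illustrative:

static void foo_phylink_get_caps(struct dsa_switch *ds, int port,
				 struct phylink_config *config)
{
	/* With .phylink_validate gone, phylink derives the supported
	 * link modes itself from these MAC capabilities.
	 */
	config->mac_capabilities = MAC_SYM_PAUSE | MAC_ASYM_PAUSE |
				   MAC_10 | MAC_100 | MAC_1000FD;

	__set_bit(PHY_INTERFACE_MODE_RGMII, config->supported_interfaces);
}

A driver that fills in neither field skips validation entirely and keeps advertising everything its PHY supports, which is exactly what the comment added in the diff below warns about.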
Signed-off-by: Vladimir Oltean Reviewed-by: Russell King (Oracle) Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 3 --- net/dsa/port.c | 18 ++++++++---------- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index ee369670e20e..dde364688739 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -880,9 +880,6 @@ struct dsa_switch_ops { */ void (*phylink_get_caps)(struct dsa_switch *ds, int port, struct phylink_config *config); - void (*phylink_validate)(struct dsa_switch *ds, int port, - unsigned long *supported, - struct phylink_link_state *state); struct phylink_pcs *(*phylink_mac_select_pcs)(struct dsa_switch *ds, int port, phy_interface_t iface); diff --git a/net/dsa/port.c b/net/dsa/port.c index 208168276995..48c9eaa74aee 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -1536,16 +1536,14 @@ static void dsa_port_phylink_validate(struct phylink_config *config, unsigned long *supported, struct phylink_link_state *state) { - struct dsa_port *dp = container_of(config, struct dsa_port, pl_config); - struct dsa_switch *ds = dp->ds; - - if (!ds->ops->phylink_validate) { - if (config->mac_capabilities) - phylink_generic_validate(config, supported, state); - return; - } - - ds->ops->phylink_validate(ds, dp->index, supported, state); + /* Skip call for drivers which don't yet set mac_capabilities, + * since validating in that case would mean their PHY will advertise + * nothing. In turn, skipping validation makes them advertise + * everything that the PHY supports, so those drivers should be + * converted ASAP. + */ + if (config->mac_capabilities) + phylink_generic_validate(config, supported, state); } static void dsa_port_phylink_mac_pcs_get_state(struct phylink_config *config, -- cgit v1.2.3 From 02ae6a7034d7b2e3d89e33d73da10a1f156789a0 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 8 Nov 2022 14:23:55 -0600 Subject: wifi: cfg80211: Avoid clashing function prototypes When built with Control Flow Integrity, function prototypes between caller and function declaration must match. These mismatches are visible at compile time with the new -Wcast-function-type-strict in Clang[1]. 
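Concretely, the iw_handler typedef fixes the third parameter as union iwreq_data *; a minimal hedged sketch of a conforming handler, with a hypothetical foo_ name and return value (the real conversions follow below):

static int foo_wext_giwmode(struct net_device *dev,
			    struct iw_request_info *info,
			    union iwreq_data *wrqu, char *extra)
{
	/* Select the union member inside the body instead of casting
	 * the function pointer to a narrower prototype.
	 */
	__u32 *mode = &wrqu->mode;

	*mode = IW_MODE_INFRA;
	return 0;
}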
Fix a total of 73 warnings like these: drivers/net/wireless/intersil/orinoco/wext.c:1379:27: warning: cast from 'int (*)(struct net_device *, struct iw_request_info *, struct iw_param *, char *)' to 'iw_handler' (aka 'int (*)(struct net_device *, struct iw_request_info *, union iwreq_data *, char *)') converts to incompatible function type [-Wcast-function-type-strict] IW_HANDLER(SIOCGIWPOWER, (iw_handler)orinoco_ioctl_getpower), ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../net/wireless/wext-compat.c:1607:33: warning: cast from 'int (*)(struct net_device *, struct iw_request_info *, struct iw_point *, char *)' to 'iw_handler' (aka 'int (*)(struct net_device *, struct iw_request_info *, union iwreq_data *, char *)') converts to incompatible function type [-Wcast-function-type-strict] [IW_IOCTL_IDX(SIOCSIWGENIE)] = (iw_handler) cfg80211_wext_siwgenie, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ../drivers/net/wireless/intersil/orinoco/wext.c:1390:27: error: incompatible function pointer types initializing 'const iw_handler' (aka 'int (*const)(struct net_device *, struct iw_request_info *, union iwreq_data *, char *)') with an expression of type 'int (struct net_device *, struct iw_request_info *, struct iw_param *, char *)' [-Wincompatible-function-pointer-types] IW_HANDLER(SIOCGIWRETRY, cfg80211_wext_giwretry), ^~~~~~~~~~~~~~~~~~~~~~ The cfg80211 Wireless Extension handler callbacks (iw_handler) use a union for the data argument. Actually use the union and perform explicit member selection in the function body instead of having a function prototype mismatch. There are no resulting binary differences before/after changes. These changes were made partly manually and partly with the help of Coccinelle. Link: https://github.com/KSPP/linux/issues/234 Link: https://reviews.llvm.org/D134831 [1] Signed-off-by: Gustavo A. R. 
Silva Reviewed-by: Kees Cook Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/a68822bf8dd587988131bb6a295280cb4293f05d.1667934775.git.gustavoars@kernel.org --- drivers/net/wireless/intel/ipw2x00/ipw2200.c | 2 +- drivers/net/wireless/intersil/orinoco/wext.c | 22 ++-- include/net/cfg80211-wext.h | 20 +-- net/wireless/scan.c | 3 +- net/wireless/wext-compat.c | 180 ++++++++++++--------------- net/wireless/wext-compat.h | 8 +- net/wireless/wext-sme.c | 5 +- 7 files changed, 113 insertions(+), 127 deletions(-) (limited to 'include/net') diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2200.c b/drivers/net/wireless/intel/ipw2x00/ipw2200.c index 79d5c09757d4..ca802af8cddc 100644 --- a/drivers/net/wireless/intel/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/intel/ipw2x00/ipw2200.c @@ -9856,7 +9856,7 @@ static int ipw_wx_sw_reset(struct net_device *dev, /* Rebase the WE IOCTLs to zero for the handler array */ static iw_handler ipw_wx_handlers[] = { - IW_HANDLER(SIOCGIWNAME, (iw_handler)cfg80211_wext_giwname), + IW_HANDLER(SIOCGIWNAME, cfg80211_wext_giwname), IW_HANDLER(SIOCSIWFREQ, ipw_wx_set_freq), IW_HANDLER(SIOCGIWFREQ, ipw_wx_get_freq), IW_HANDLER(SIOCSIWMODE, ipw_wx_set_mode), diff --git a/drivers/net/wireless/intersil/orinoco/wext.c b/drivers/net/wireless/intersil/orinoco/wext.c index b8eb5d60192f..dea1ff044342 100644 --- a/drivers/net/wireless/intersil/orinoco/wext.c +++ b/drivers/net/wireless/intersil/orinoco/wext.c @@ -1363,31 +1363,31 @@ static const struct iw_priv_args orinoco_privtab[] = { static const iw_handler orinoco_handler[] = { IW_HANDLER(SIOCSIWCOMMIT, orinoco_ioctl_commit), - IW_HANDLER(SIOCGIWNAME, (iw_handler)cfg80211_wext_giwname), + IW_HANDLER(SIOCGIWNAME, cfg80211_wext_giwname), IW_HANDLER(SIOCSIWFREQ, orinoco_ioctl_setfreq), IW_HANDLER(SIOCGIWFREQ, orinoco_ioctl_getfreq), - IW_HANDLER(SIOCSIWMODE, (iw_handler)cfg80211_wext_siwmode), - IW_HANDLER(SIOCGIWMODE, (iw_handler)cfg80211_wext_giwmode), + IW_HANDLER(SIOCSIWMODE, cfg80211_wext_siwmode), + IW_HANDLER(SIOCGIWMODE, cfg80211_wext_giwmode), IW_HANDLER(SIOCSIWSENS, orinoco_ioctl_setsens), IW_HANDLER(SIOCGIWSENS, orinoco_ioctl_getsens), - IW_HANDLER(SIOCGIWRANGE, (iw_handler)cfg80211_wext_giwrange), + IW_HANDLER(SIOCGIWRANGE, cfg80211_wext_giwrange), IW_HANDLER(SIOCSIWSPY, iw_handler_set_spy), IW_HANDLER(SIOCGIWSPY, iw_handler_get_spy), IW_HANDLER(SIOCSIWTHRSPY, iw_handler_set_thrspy), IW_HANDLER(SIOCGIWTHRSPY, iw_handler_get_thrspy), IW_HANDLER(SIOCSIWAP, orinoco_ioctl_setwap), IW_HANDLER(SIOCGIWAP, orinoco_ioctl_getwap), - IW_HANDLER(SIOCSIWSCAN, (iw_handler)cfg80211_wext_siwscan), - IW_HANDLER(SIOCGIWSCAN, (iw_handler)cfg80211_wext_giwscan), + IW_HANDLER(SIOCSIWSCAN, cfg80211_wext_siwscan), + IW_HANDLER(SIOCGIWSCAN, cfg80211_wext_giwscan), IW_HANDLER(SIOCSIWESSID, orinoco_ioctl_setessid), IW_HANDLER(SIOCGIWESSID, orinoco_ioctl_getessid), IW_HANDLER(SIOCSIWRATE, orinoco_ioctl_setrate), IW_HANDLER(SIOCGIWRATE, orinoco_ioctl_getrate), - IW_HANDLER(SIOCSIWRTS, (iw_handler)cfg80211_wext_siwrts), - IW_HANDLER(SIOCGIWRTS, (iw_handler)cfg80211_wext_giwrts), - IW_HANDLER(SIOCSIWFRAG, (iw_handler)cfg80211_wext_siwfrag), - IW_HANDLER(SIOCGIWFRAG, (iw_handler)cfg80211_wext_giwfrag), - IW_HANDLER(SIOCGIWRETRY, (iw_handler)cfg80211_wext_giwretry), + IW_HANDLER(SIOCSIWRTS, cfg80211_wext_siwrts), + IW_HANDLER(SIOCGIWRTS, cfg80211_wext_giwrts), + IW_HANDLER(SIOCSIWFRAG, cfg80211_wext_siwfrag), + IW_HANDLER(SIOCGIWFRAG, cfg80211_wext_giwfrag), + IW_HANDLER(SIOCGIWRETRY, cfg80211_wext_giwretry), IW_HANDLER(SIOCSIWENCODE, 
orinoco_ioctl_setiwencode), IW_HANDLER(SIOCGIWENCODE, orinoco_ioctl_getiwencode), IW_HANDLER(SIOCSIWPOWER, orinoco_ioctl_setpower), diff --git a/include/net/cfg80211-wext.h b/include/net/cfg80211-wext.h index ad77caf2ffde..0ee36d97e068 100644 --- a/include/net/cfg80211-wext.h +++ b/include/net/cfg80211-wext.h @@ -19,34 +19,34 @@ */ int cfg80211_wext_giwname(struct net_device *dev, struct iw_request_info *info, - char *name, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, - u32 *mode, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, - u32 *mode, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwscan(struct net_device *dev, struct iw_request_info *info, union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwscan(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwrange(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwrts(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rts, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwrts(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rts, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwfrag(struct net_device *dev, struct iw_request_info *info, - struct iw_param *frag, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwfrag(struct net_device *dev, struct iw_request_info *info, - struct iw_param *frag, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_giwretry(struct net_device *dev, struct iw_request_info *info, - struct iw_param *retry, char *extra); + union iwreq_data *wrqu, char *extra); #endif /* __NET_CFG80211_WEXT_H */ diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 806a5f1330ff..853619bc0f1a 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -3229,8 +3229,9 @@ static int ieee80211_scan_results(struct cfg80211_registered_device *rdev, int cfg80211_wext_giwscan(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_point *data = &wrqu->data; struct cfg80211_registered_device *rdev; int res; diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index ddf340bfa07a..8a24dfca75af 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -25,16 +25,17 @@ int cfg80211_wext_giwname(struct net_device *dev, struct iw_request_info *info, - char *name, char *extra) + union iwreq_data *wrqu, char *extra) { - strcpy(name, "IEEE 802.11"); + strcpy(wrqu->name, "IEEE 802.11"); return 0; } EXPORT_WEXT_HANDLER(cfg80211_wext_giwname); int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, - u32 *mode, char *extra) + union iwreq_data *wrqu, char *extra) { + __u32 *mode = &wrqu->mode; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; struct vif_params vifparams; @@ -71,8 +72,9 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, EXPORT_WEXT_HANDLER(cfg80211_wext_siwmode); int cfg80211_wext_giwmode(struct net_device *dev, struct iw_request_info *info, - u32 *mode, char *extra) + union 
iwreq_data *wrqu, char *extra) { + __u32 *mode = &wrqu->mode; struct wireless_dev *wdev = dev->ieee80211_ptr; if (!wdev) @@ -108,8 +110,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_giwmode); int cfg80211_wext_giwrange(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_range *range = (struct iw_range *) extra; enum nl80211_band band; @@ -251,8 +254,9 @@ int cfg80211_wext_freq(struct iw_freq *freq) int cfg80211_wext_siwrts(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rts, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *rts = &wrqu->rts; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 orts = wdev->wiphy->rts_threshold; @@ -281,8 +285,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_siwrts); int cfg80211_wext_giwrts(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rts, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *rts = &wrqu->rts; struct wireless_dev *wdev = dev->ieee80211_ptr; rts->value = wdev->wiphy->rts_threshold; @@ -295,8 +300,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_giwrts); int cfg80211_wext_siwfrag(struct net_device *dev, struct iw_request_info *info, - struct iw_param *frag, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *frag = &wrqu->frag; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 ofrag = wdev->wiphy->frag_threshold; @@ -325,8 +331,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_siwfrag); int cfg80211_wext_giwfrag(struct net_device *dev, struct iw_request_info *info, - struct iw_param *frag, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *frag = &wrqu->frag; struct wireless_dev *wdev = dev->ieee80211_ptr; frag->value = wdev->wiphy->frag_threshold; @@ -339,8 +346,9 @@ EXPORT_WEXT_HANDLER(cfg80211_wext_giwfrag); static int cfg80211_wext_siwretry(struct net_device *dev, struct iw_request_info *info, - struct iw_param *retry, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *retry = &wrqu->retry; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u32 changed = 0; @@ -378,8 +386,9 @@ static int cfg80211_wext_siwretry(struct net_device *dev, int cfg80211_wext_giwretry(struct net_device *dev, struct iw_request_info *info, - struct iw_param *retry, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *retry = &wrqu->retry; struct wireless_dev *wdev = dev->ieee80211_ptr; retry->disabled = 0; @@ -588,8 +597,9 @@ static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev, static int cfg80211_wext_siwencode(struct net_device *dev, struct iw_request_info *info, - struct iw_point *erq, char *keybuf) + union iwreq_data *wrqu, char *keybuf) { + struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int idx, err; @@ -664,8 +674,9 @@ out: static int cfg80211_wext_siwencodeext(struct net_device *dev, struct iw_request_info *info, - struct iw_point *erq, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; struct 
cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct iw_encode_ext *ext = (struct iw_encode_ext *) extra; @@ -767,8 +778,9 @@ static int cfg80211_wext_siwencodeext(struct net_device *dev, static int cfg80211_wext_giwencode(struct net_device *dev, struct iw_request_info *info, - struct iw_point *erq, char *keybuf) + union iwreq_data *wrqu, char *keybuf) { + struct iw_point *erq = &wrqu->encoding; struct wireless_dev *wdev = dev->ieee80211_ptr; int idx; @@ -804,8 +816,9 @@ static int cfg80211_wext_giwencode(struct net_device *dev, static int cfg80211_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, - struct iw_freq *wextfreq, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_freq *wextfreq = &wrqu->freq; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_chan_def chandef = { @@ -870,8 +883,9 @@ static int cfg80211_wext_siwfreq(struct net_device *dev, static int cfg80211_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, - struct iw_freq *freq, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_freq *freq = &wrqu->freq; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_chan_def chandef = {}; @@ -1147,8 +1161,9 @@ static int cfg80211_set_key_mgt(struct wireless_dev *wdev, u32 key_mgt) static int cfg80211_wext_siwauth(struct net_device *dev, struct iw_request_info *info, - struct iw_param *data, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *data = &wrqu->param; struct wireless_dev *wdev = dev->ieee80211_ptr; if (wdev->iftype != NL80211_IFTYPE_STATION) @@ -1180,7 +1195,7 @@ static int cfg80211_wext_siwauth(struct net_device *dev, static int cfg80211_wext_giwauth(struct net_device *dev, struct iw_request_info *info, - struct iw_param *data, char *extra) + union iwreq_data *wrqu, char *extra) { /* XXX: what do we need? 
*/ @@ -1189,8 +1204,9 @@ static int cfg80211_wext_giwauth(struct net_device *dev, static int cfg80211_wext_siwpower(struct net_device *dev, struct iw_request_info *info, - struct iw_param *wrq, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *wrq = &wrqu->power; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); bool ps; @@ -1238,8 +1254,9 @@ static int cfg80211_wext_siwpower(struct net_device *dev, static int cfg80211_wext_giwpower(struct net_device *dev, struct iw_request_info *info, - struct iw_param *wrq, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *wrq = &wrqu->power; struct wireless_dev *wdev = dev->ieee80211_ptr; wrq->disabled = !wdev->ps; @@ -1249,8 +1266,9 @@ static int cfg80211_wext_giwpower(struct net_device *dev, static int cfg80211_wext_siwrate(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rate, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *rate = &wrqu->bitrate; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_bitrate_mask mask; @@ -1307,8 +1325,9 @@ static int cfg80211_wext_siwrate(struct net_device *dev, static int cfg80211_wext_giwrate(struct net_device *dev, struct iw_request_info *info, - struct iw_param *rate, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_param *rate = &wrqu->bitrate; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct station_info sinfo = {}; @@ -1430,8 +1449,9 @@ static struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) static int cfg80211_wext_siwap(struct net_device *dev, struct iw_request_info *info, - struct sockaddr *ap_addr, char *extra) + union iwreq_data *wrqu, char *extra) { + struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; @@ -1455,8 +1475,9 @@ static int cfg80211_wext_siwap(struct net_device *dev, static int cfg80211_wext_giwap(struct net_device *dev, struct iw_request_info *info, - struct sockaddr *ap_addr, char *extra) + union iwreq_data *wrqu, char *extra) { + struct sockaddr *ap_addr = &wrqu->ap_addr; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; @@ -1480,8 +1501,9 @@ static int cfg80211_wext_giwap(struct net_device *dev, static int cfg80211_wext_siwessid(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *ssid) + union iwreq_data *wrqu, char *ssid) { + struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; @@ -1505,8 +1527,9 @@ static int cfg80211_wext_siwessid(struct net_device *dev, static int cfg80211_wext_giwessid(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *ssid) + union iwreq_data *wrqu, char *ssid) { + struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); int ret; @@ -1533,7 +1556,7 @@ static int cfg80211_wext_giwessid(struct net_device *dev, static int cfg80211_wext_siwpmksa(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra) + union iwreq_data 
*wrqu, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -1584,78 +1607,39 @@ static int cfg80211_wext_siwpmksa(struct net_device *dev, return ret; } -#define DEFINE_WEXT_COMPAT_STUB(func, type) \ - static int __ ## func(struct net_device *dev, \ - struct iw_request_info *info, \ - union iwreq_data *wrqu, \ - char *extra) \ - { \ - return func(dev, info, (type *)wrqu, extra); \ - } - -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwname, char) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwfreq, struct iw_freq) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwfreq, struct iw_freq) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwmode, u32) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwmode, u32) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrange, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwap, struct sockaddr) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwap, struct sockaddr) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwmlme, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwscan, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwessid, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwessid, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwrate, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrate, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwrts, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwrts, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwfrag, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwfrag, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwretry, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwretry, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwencode, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwencode, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwpower, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwpower, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwgenie, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_giwauth, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwauth, struct iw_param) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwencodeext, struct iw_point) -DEFINE_WEXT_COMPAT_STUB(cfg80211_wext_siwpmksa, struct iw_point) - static const iw_handler cfg80211_handlers[] = { - [IW_IOCTL_IDX(SIOCGIWNAME)] = __cfg80211_wext_giwname, - [IW_IOCTL_IDX(SIOCSIWFREQ)] = __cfg80211_wext_siwfreq, - [IW_IOCTL_IDX(SIOCGIWFREQ)] = __cfg80211_wext_giwfreq, - [IW_IOCTL_IDX(SIOCSIWMODE)] = __cfg80211_wext_siwmode, - [IW_IOCTL_IDX(SIOCGIWMODE)] = __cfg80211_wext_giwmode, - [IW_IOCTL_IDX(SIOCGIWRANGE)] = __cfg80211_wext_giwrange, - [IW_IOCTL_IDX(SIOCSIWAP)] = __cfg80211_wext_siwap, - [IW_IOCTL_IDX(SIOCGIWAP)] = __cfg80211_wext_giwap, - [IW_IOCTL_IDX(SIOCSIWMLME)] = __cfg80211_wext_siwmlme, - [IW_IOCTL_IDX(SIOCSIWSCAN)] = cfg80211_wext_siwscan, - [IW_IOCTL_IDX(SIOCGIWSCAN)] = __cfg80211_wext_giwscan, - [IW_IOCTL_IDX(SIOCSIWESSID)] = __cfg80211_wext_siwessid, - [IW_IOCTL_IDX(SIOCGIWESSID)] = __cfg80211_wext_giwessid, - [IW_IOCTL_IDX(SIOCSIWRATE)] = __cfg80211_wext_siwrate, - [IW_IOCTL_IDX(SIOCGIWRATE)] = __cfg80211_wext_giwrate, - [IW_IOCTL_IDX(SIOCSIWRTS)] = __cfg80211_wext_siwrts, - [IW_IOCTL_IDX(SIOCGIWRTS)] = __cfg80211_wext_giwrts, - [IW_IOCTL_IDX(SIOCSIWFRAG)] = __cfg80211_wext_siwfrag, - [IW_IOCTL_IDX(SIOCGIWFRAG)] = __cfg80211_wext_giwfrag, - [IW_IOCTL_IDX(SIOCSIWTXPOW)] = cfg80211_wext_siwtxpower, - [IW_IOCTL_IDX(SIOCGIWTXPOW)] = 
cfg80211_wext_giwtxpower, - [IW_IOCTL_IDX(SIOCSIWRETRY)] = __cfg80211_wext_siwretry, - [IW_IOCTL_IDX(SIOCGIWRETRY)] = __cfg80211_wext_giwretry, - [IW_IOCTL_IDX(SIOCSIWENCODE)] = __cfg80211_wext_siwencode, - [IW_IOCTL_IDX(SIOCGIWENCODE)] = __cfg80211_wext_giwencode, - [IW_IOCTL_IDX(SIOCSIWPOWER)] = __cfg80211_wext_siwpower, - [IW_IOCTL_IDX(SIOCGIWPOWER)] = __cfg80211_wext_giwpower, - [IW_IOCTL_IDX(SIOCSIWGENIE)] = __cfg80211_wext_siwgenie, - [IW_IOCTL_IDX(SIOCSIWAUTH)] = __cfg80211_wext_siwauth, - [IW_IOCTL_IDX(SIOCGIWAUTH)] = __cfg80211_wext_giwauth, - [IW_IOCTL_IDX(SIOCSIWENCODEEXT)]= __cfg80211_wext_siwencodeext, - [IW_IOCTL_IDX(SIOCSIWPMKSA)] = __cfg80211_wext_siwpmksa, + IW_HANDLER(SIOCGIWNAME, cfg80211_wext_giwname), + IW_HANDLER(SIOCSIWFREQ, cfg80211_wext_siwfreq), + IW_HANDLER(SIOCGIWFREQ, cfg80211_wext_giwfreq), + IW_HANDLER(SIOCSIWMODE, cfg80211_wext_siwmode), + IW_HANDLER(SIOCGIWMODE, cfg80211_wext_giwmode), + IW_HANDLER(SIOCGIWRANGE, cfg80211_wext_giwrange), + IW_HANDLER(SIOCSIWAP, cfg80211_wext_siwap), + IW_HANDLER(SIOCGIWAP, cfg80211_wext_giwap), + IW_HANDLER(SIOCSIWMLME, cfg80211_wext_siwmlme), + IW_HANDLER(SIOCSIWSCAN, cfg80211_wext_siwscan), + IW_HANDLER(SIOCGIWSCAN, cfg80211_wext_giwscan), + IW_HANDLER(SIOCSIWESSID, cfg80211_wext_siwessid), + IW_HANDLER(SIOCGIWESSID, cfg80211_wext_giwessid), + IW_HANDLER(SIOCSIWRATE, cfg80211_wext_siwrate), + IW_HANDLER(SIOCGIWRATE, cfg80211_wext_giwrate), + IW_HANDLER(SIOCSIWRTS, cfg80211_wext_siwrts), + IW_HANDLER(SIOCGIWRTS, cfg80211_wext_giwrts), + IW_HANDLER(SIOCSIWFRAG, cfg80211_wext_siwfrag), + IW_HANDLER(SIOCGIWFRAG, cfg80211_wext_giwfrag), + IW_HANDLER(SIOCSIWTXPOW, cfg80211_wext_siwtxpower), + IW_HANDLER(SIOCGIWTXPOW, cfg80211_wext_giwtxpower), + IW_HANDLER(SIOCSIWRETRY, cfg80211_wext_siwretry), + IW_HANDLER(SIOCGIWRETRY, cfg80211_wext_giwretry), + IW_HANDLER(SIOCSIWENCODE, cfg80211_wext_siwencode), + IW_HANDLER(SIOCGIWENCODE, cfg80211_wext_giwencode), + IW_HANDLER(SIOCSIWPOWER, cfg80211_wext_siwpower), + IW_HANDLER(SIOCGIWPOWER, cfg80211_wext_giwpower), + IW_HANDLER(SIOCSIWGENIE, cfg80211_wext_siwgenie), + IW_HANDLER(SIOCSIWAUTH, cfg80211_wext_siwauth), + IW_HANDLER(SIOCGIWAUTH, cfg80211_wext_giwauth), + IW_HANDLER(SIOCSIWENCODEEXT, cfg80211_wext_siwencodeext), + IW_HANDLER(SIOCSIWPMKSA, cfg80211_wext_siwpmksa), }; const struct iw_handler_def cfg80211_wext_handler = { diff --git a/net/wireless/wext-compat.h b/net/wireless/wext-compat.h index 8d3cc1552e2f..c02eb789e676 100644 --- a/net/wireless/wext-compat.h +++ b/net/wireless/wext-compat.h @@ -13,7 +13,7 @@ int cfg80211_ibss_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, - struct iw_freq *freq, char *extra); + struct iw_freq *wextfreq, char *extra); int cfg80211_ibss_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra); @@ -32,7 +32,7 @@ int cfg80211_ibss_wext_giwessid(struct net_device *dev, int cfg80211_mgd_wext_siwfreq(struct net_device *dev, struct iw_request_info *info, - struct iw_freq *freq, char *extra); + struct iw_freq *wextfreq, char *extra); int cfg80211_mgd_wext_giwfreq(struct net_device *dev, struct iw_request_info *info, struct iw_freq *freq, char *extra); @@ -51,10 +51,10 @@ int cfg80211_mgd_wext_giwessid(struct net_device *dev, int cfg80211_wext_siwmlme(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_siwgenie(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char 
*extra); + union iwreq_data *wrqu, char *extra); int cfg80211_wext_freq(struct iw_freq *freq); diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 68f45afc352d..191c6d98c700 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -324,8 +324,9 @@ int cfg80211_mgd_wext_giwap(struct net_device *dev, int cfg80211_wext_siwgenie(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra) + union iwreq_data *wrqu, char *extra) { + struct iw_point *data = &wrqu->data; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); u8 *ie = extra; @@ -374,7 +375,7 @@ int cfg80211_wext_siwgenie(struct net_device *dev, int cfg80211_wext_siwmlme(struct net_device *dev, struct iw_request_info *info, - struct iw_point *data, char *extra) + union iwreq_data *wrqu, char *extra) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct iw_mlme *mlme = (struct iw_mlme *)extra; -- cgit v1.2.3 From 67fb43308f4b354f13aabcc66dd5d99bfbb7e838 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Nov 2022 13:57:54 -0800 Subject: udp: Set NULL to sk->sk_prot->h.udp_table. We will soon introduce an optional per-netns hash table for UDP. This means we cannot use the global sk->sk_prot->h.udp_table to fetch a UDP hash table. Instead, set NULL to sk->sk_prot->h.udp_table for UDP and get a proper table from net->ipv4.udp_table. Note that we still need sk->sk_prot->h.udp_table for UDP LITE. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 1 + net/ipv4/udp.c | 15 +++++++++++---- net/ipv6/udp.c | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 25f90bba4889..e4cc4d3cacc4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -43,6 +43,7 @@ struct tcp_fastopen_context; struct netns_ipv4 { struct inet_timewait_death_row tcp_death_row; + struct udp_table *udp_table; #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index a34de263e9ce..6206c27a1659 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -131,6 +131,11 @@ EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); #define MAX_UDP_PORTS 65536 #define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) +static struct udp_table *udp_get_table_prot(struct sock *sk) +{ + return sk->sk_prot->h.udp_table ? 
: sock_net(sk)->ipv4.udp_table; +} + static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, unsigned long *bitmap, @@ -232,7 +237,7 @@ static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot) int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr) { - struct udp_table *udptable = sk->sk_prot->h.udp_table; + struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; struct net *net = sock_net(sk); int error = 1; @@ -1999,7 +2004,7 @@ EXPORT_SYMBOL(udp_disconnect); void udp_lib_unhash(struct sock *sk) { if (sk_hashed(sk)) { - struct udp_table *udptable = sk->sk_prot->h.udp_table; + struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2; hslot = udp_hashslot(udptable, sock_net(sk), @@ -2030,7 +2035,7 @@ EXPORT_SYMBOL(udp_lib_unhash); void udp_lib_rehash(struct sock *sk, u16 newhash) { if (sk_hashed(sk)) { - struct udp_table *udptable = sk->sk_prot->h.udp_table; + struct udp_table *udptable = udp_get_table_prot(sk); struct udp_hslot *hslot, *hslot2, *nhslot2; hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash); @@ -2967,7 +2972,7 @@ struct proto udp_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp_sock), - .h.udp_table = &udp_table, + .h.udp_table = NULL, .diag_destroy = udp_abort, }; EXPORT_SYMBOL(udp_prot); @@ -3280,6 +3285,8 @@ EXPORT_SYMBOL(udp_flow_hashrnd); static int __net_init udp_sysctl_init(struct net *net) { + net->ipv4.udp_table = &udp_table; + net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE; net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE; diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 727de67e4c90..bbd6dc398f3b 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -1774,7 +1774,7 @@ struct proto udpv6_prot = { .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_udp_wmem_min), .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_udp_rmem_min), .obj_size = sizeof(struct udp6_sock), - .h.udp_table = &udp_table, + .h.udp_table = NULL, .diag_destroy = udp_abort, }; -- cgit v1.2.3 From 9804985bf27f8fbcf0d96c7435b5ad94a2a6ea20 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 14 Nov 2022 13:57:57 -0800 Subject: udp: Introduce optional per-netns hash table. The maximum hash table size is 64K due to the nature of the protocol. [0] It's smaller than TCP's, so a performance drop occurs with fewer sockets. On an EC2 c5.24xlarge instance (192 GiB memory), after running iperf3 in different netns, creating 32Mi sockets without data transfer in the root netns causes a regression for the iperf3 connection.

	uhash_entries	sockets	length	Gbps
	64K	      1	      1	 5.69
	1Mi	     16	     16	 5.27
	2Mi	     32	     32	 4.90
	4Mi	     64	     64	 4.09
	8Mi	    128	    128	 2.96
	16Mi	    256	    256	 2.06
	32Mi	    512	    512	 1.12

The per-netns hash table breaks the lengthy lists into shorter ones. It is useful on a multi-tenant system with thousands of netns. With smaller hash tables, we can look up sockets faster, isolate noisy neighbours, and reduce lock contention. The max size of the per-netns table is 64K as well. This is because the possible hash range by udp_hashfn() always fits in 64K within the same netns, and we cannot make full use of buckets beyond 64K:

	/* 0 < num < 64K -> X < hash < X + 64K */
	(num + net_hash_mix(net)) & mask;

Also, the min size is 128. We use a bitmap to search for an available port in udp_lib_get_port().
To keep the bitmap on the stack and not fire the CONFIG_FRAME_WARN error at build time, we round up the table size to 128. The sysctl usage is the same as with TCP:

	$ dmesg | cut -d ' ' -f 6- | grep "UDP hash"
	UDP hash table entries: 65536 (order: 9, 2097152 bytes, vmalloc)

	# sysctl net.ipv4.udp_hash_entries
	net.ipv4.udp_hash_entries = 65536  # can be changed by uhash_entries

	# sysctl net.ipv4.udp_child_hash_entries
	net.ipv4.udp_child_hash_entries = 0  # disabled by default

	# ip netns add test1
	# ip netns exec test1 sysctl net.ipv4.udp_hash_entries
	net.ipv4.udp_hash_entries = -65536  # share the global table

	# sysctl -w net.ipv4.udp_child_hash_entries=100
	net.ipv4.udp_child_hash_entries = 100

	# ip netns add test2
	# ip netns exec test2 sysctl net.ipv4.udp_hash_entries
	net.ipv4.udp_hash_entries = 128  # own a per-netns table with 2^n buckets

We could optimise the hash table lookup/iteration further by removing the netns comparison for the per-netns one in the future. Also, we could optimise the sparse udp_hslot layout by putting it in udp_table. [0]: https://lore.kernel.org/netdev/4ACC2815.7010101@gmail.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.rst | 27 +++++++++ include/linux/udp.h | 2 + include/net/netns/ipv4.h | 2 + net/ipv4/sysctl_net_ipv4.c | 40 +++++++++++++ net/ipv4/udp.c | 101 +++++++++++++++++++++++++++++++-- 5 files changed, 166 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 815efc89ad73..727b25cc7ec4 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -1177,6 +1177,33 @@ udp_rmem_min - INTEGER udp_wmem_min - INTEGER UDP does not have tx memory accounting and this tunable has no effect. +udp_hash_entries - INTEGER + Show the number of hash buckets for UDP sockets in the current + networking namespace. + + A negative value means the networking namespace does not own its + hash buckets and shares the initial networking namespace's one. + +udp_child_hash_entries - INTEGER + Control the number of hash buckets for UDP sockets in the child + networking namespace, which must be set before clone() or unshare(). + + If the value is not 0, the kernel uses a value rounded up to 2^n + as the actual hash bucket size. 0 is a special value, meaning + the child networking namespace will share the initial networking + namespace's hash buckets. + + Note that the child will use the global one in case the kernel + fails to allocate enough memory. In addition, the global hash + buckets are spread over available NUMA nodes, but the allocation + of the child hash table depends on the current process's NUMA + policy, which could result in performance differences. + + Possible values: 0, 2^n (n: 7 (128) - 16 (64K)) + + Default: 0 + + RAW variables ============= diff --git a/include/linux/udp.h b/include/linux/udp.h index dea57aa37df6..a2892e151644 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -23,7 +23,9 @@ static inline struct udphdr *udp_hdr(const struct sk_buff *skb) return (struct udphdr *)skb_transport_header(skb); } +#define UDP_HTABLE_SIZE_MIN_PERNET 128 #define UDP_HTABLE_SIZE_MIN (CONFIG_BASE_SMALL ?
128 : 256) +#define UDP_HTABLE_SIZE_MAX 65536 static inline u32 udp_hashfn(const struct net *net, u32 num, u32 mask) { diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index e4cc4d3cacc4..db762e35aca9 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -208,6 +208,8 @@ struct netns_ipv4 { atomic_t dev_addr_genid; + unsigned int sysctl_udp_child_hash_entries; + #ifdef CONFIG_SYSCTL unsigned long *sysctl_local_reserved_ports; int sysctl_ip_prot_sock; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 0af28cedd071..0d0cc4ef2b85 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -40,6 +40,7 @@ static int one_day_secs = 24 * 3600; static u32 fib_multipath_hash_fields_all_mask __maybe_unused = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; +static unsigned int udp_child_hash_entries_max = UDP_HTABLE_SIZE_MAX; static int tcp_plb_max_rounds = 31; static int tcp_plb_max_cong_thresh = 256; @@ -402,12 +403,36 @@ static int proc_tcp_ehash_entries(struct ctl_table *table, int write, if (!net_eq(net, &init_net) && !hinfo->pernet) tcp_ehash_entries *= -1; + memset(&tbl, 0, sizeof(tbl)); tbl.data = &tcp_ehash_entries; tbl.maxlen = sizeof(int); return proc_dointvec(&tbl, write, buffer, lenp, ppos); } +static int proc_udp_hash_entries(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_udp_child_hash_entries); + int udp_hash_entries; + struct ctl_table tbl; + + udp_hash_entries = net->ipv4.udp_table->mask + 1; + + /* A negative number indicates that the child netns + * shares the global udp_table. + */ + if (!net_eq(net, &init_net) && net->ipv4.udp_table == &udp_table) + udp_hash_entries *= -1; + + memset(&tbl, 0, sizeof(tbl)); + tbl.data = &udp_hash_entries; + tbl.maxlen = sizeof(int); + + return proc_dointvec(&tbl, write, buffer, lenp, ppos); +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, void *buffer, size_t *lenp, @@ -1361,6 +1386,21 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = &tcp_child_ehash_entries_max, }, + { + .procname = "udp_hash_entries", + .data = &init_net.ipv4.sysctl_udp_child_hash_entries, + .mode = 0444, + .proc_handler = proc_udp_hash_entries, + }, + { + .procname = "udp_child_hash_entries", + .data = &init_net.ipv4.sysctl_udp_child_hash_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &udp_child_hash_entries_max, + }, { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 37e79158d145..1fb7d1ed1cb1 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -129,7 +129,7 @@ DEFINE_PER_CPU(int, udp_memory_per_cpu_fw_alloc); EXPORT_PER_CPU_SYMBOL_GPL(udp_memory_per_cpu_fw_alloc); #define MAX_UDP_PORTS 65536 -#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN) +#define PORTS_PER_CHAIN (MAX_UDP_PORTS / UDP_HTABLE_SIZE_MIN_PERNET) static struct udp_table *udp_get_table_prot(struct sock *sk) { @@ -3277,7 +3277,7 @@ void __init udp_table_init(struct udp_table *table, const char *name) &table->log, &table->mask, UDP_HTABLE_SIZE_MIN, - 64 * 1024); + UDP_HTABLE_SIZE_MAX); table->hash2 = table->hash + (table->mask + 1); for (i = 0; i <= table->mask; i++) { @@ -3302,22 +3302,111 @@ u32 
udp_flow_hashrnd(void) } EXPORT_SYMBOL(udp_flow_hashrnd); -static int __net_init udp_sysctl_init(struct net *net) +static void __net_init udp_sysctl_init(struct net *net) { - net->ipv4.udp_table = &udp_table; - net->ipv4.sysctl_udp_rmem_min = PAGE_SIZE; net->ipv4.sysctl_udp_wmem_min = PAGE_SIZE; #ifdef CONFIG_NET_L3_MASTER_DEV net->ipv4.sysctl_udp_l3mdev_accept = 0; #endif +} + +static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_entries) +{ + struct udp_table *udptable; + int i; + + udptable = kmalloc(sizeof(*udptable), GFP_KERNEL); + if (!udptable) + goto out; + + udptable->hash = vmalloc_huge(hash_entries * 2 * sizeof(struct udp_hslot), + GFP_KERNEL_ACCOUNT); + if (!udptable->hash) + goto free_table; + + udptable->hash2 = udptable->hash + hash_entries; + udptable->mask = hash_entries - 1; + udptable->log = ilog2(hash_entries); + + for (i = 0; i < hash_entries; i++) { + INIT_HLIST_HEAD(&udptable->hash[i].head); + udptable->hash[i].count = 0; + spin_lock_init(&udptable->hash[i].lock); + + INIT_HLIST_HEAD(&udptable->hash2[i].head); + udptable->hash2[i].count = 0; + spin_lock_init(&udptable->hash2[i].lock); + } + + return udptable; + +free_table: + kfree(udptable); +out: + return NULL; +} + +static void __net_exit udp_pernet_table_free(struct net *net) +{ + struct udp_table *udptable = net->ipv4.udp_table; + + if (udptable == &udp_table) + return; + + kvfree(udptable->hash); + kfree(udptable); +} + +static void __net_init udp_set_table(struct net *net) +{ + struct udp_table *udptable; + unsigned int hash_entries; + struct net *old_net; + + if (net_eq(net, &init_net)) + goto fallback; + + old_net = current->nsproxy->net_ns; + hash_entries = READ_ONCE(old_net->ipv4.sysctl_udp_child_hash_entries); + if (!hash_entries) + goto fallback; + + /* Set min to keep the bitmap on stack in udp_lib_get_port() */ + if (hash_entries < UDP_HTABLE_SIZE_MIN_PERNET) + hash_entries = UDP_HTABLE_SIZE_MIN_PERNET; + else + hash_entries = roundup_pow_of_two(hash_entries); + + udptable = udp_pernet_table_alloc(hash_entries); + if (udptable) { + net->ipv4.udp_table = udptable; + } else { + pr_warn("Failed to allocate UDP hash table (entries: %u) " + "for a netns, fallback to the global one\n", + hash_entries); +fallback: + net->ipv4.udp_table = &udp_table; + } +} + +static int __net_init udp_pernet_init(struct net *net) +{ + udp_sysctl_init(net); + udp_set_table(net); return 0; } +static void __net_exit udp_pernet_exit(struct net *net) +{ + udp_pernet_table_free(net); +} + static struct pernet_operations __net_initdata udp_sysctl_ops = { - .init = udp_sysctl_init, + .init = udp_pernet_init, + .exit = udp_pernet_exit, }; #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS) -- cgit v1.2.3 From 6c1c5097781f563b70a81683ea6fdac21637573b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Nov 2022 08:53:55 +0000 Subject: net: add atomic_long_t to net_device_stats fields Long standing KCSAN issues are caused by data-race around some dev->stats changes. Most performance critical paths already use per-cpu variables, or per-queue ones. It is reasonable (and more correct) to use atomic operations for the slow paths. This patch adds an union for each field of net_device_stats, so that we can convert paths that are not yet protected by a spinlock or a mutex. netdev_stats_to_stats64() no longer has an #if BITS_PER_LONG==64 Note that the memcpy() we were using on 64bit arches had no provision to avoid load-tearing, while atomic_long_read() is providing the needed protection at no cost. 
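As a rough sketch of how one slow-path call site could be converted with the new macros (hypothetical driver code, not part of this patch; the function name and drop condition are made up):

	#include <linux/netdevice.h>
	#include <linux/skbuff.h>

	/* Drop oversized frames and count them without load/store tearing. */
	static netdev_tx_t toy_xmit(struct sk_buff *skb, struct net_device *dev)
	{
		if (skb->len > dev->mtu + dev->hard_header_len) {
			/* was: dev->stats.tx_dropped++; which KCSAN can flag */
			DEV_STATS_INC(dev, tx_dropped);
			dev_kfree_skb_any(skb);
			return NETDEV_TX_OK;
		}
		/* a real driver would hand the skb to hardware here and use
		 * per-cpu or per-queue counters on the success path
		 */
		dev_kfree_skb_any(skb);
		return NETDEV_TX_OK;
	}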
Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/netdevice.h | 58 ++++++++++++++++++++++++++++------------------- include/net/dst.h | 5 ++-- net/core/dev.c | 14 +++--------- 3 files changed, 40 insertions(+), 37 deletions(-) (limited to 'include/net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 02a2318da7c7..23b3903b0678 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -171,31 +171,38 @@ static inline bool dev_xmit_complete(int rc) * (unsigned long) so they can be read and written atomically. */ +#define NET_DEV_STAT(FIELD) \ + union { \ + unsigned long FIELD; \ + atomic_long_t __##FIELD; \ + } + struct net_device_stats { - unsigned long rx_packets; - unsigned long tx_packets; - unsigned long rx_bytes; - unsigned long tx_bytes; - unsigned long rx_errors; - unsigned long tx_errors; - unsigned long rx_dropped; - unsigned long tx_dropped; - unsigned long multicast; - unsigned long collisions; - unsigned long rx_length_errors; - unsigned long rx_over_errors; - unsigned long rx_crc_errors; - unsigned long rx_frame_errors; - unsigned long rx_fifo_errors; - unsigned long rx_missed_errors; - unsigned long tx_aborted_errors; - unsigned long tx_carrier_errors; - unsigned long tx_fifo_errors; - unsigned long tx_heartbeat_errors; - unsigned long tx_window_errors; - unsigned long rx_compressed; - unsigned long tx_compressed; + NET_DEV_STAT(rx_packets); + NET_DEV_STAT(tx_packets); + NET_DEV_STAT(rx_bytes); + NET_DEV_STAT(tx_bytes); + NET_DEV_STAT(rx_errors); + NET_DEV_STAT(tx_errors); + NET_DEV_STAT(rx_dropped); + NET_DEV_STAT(tx_dropped); + NET_DEV_STAT(multicast); + NET_DEV_STAT(collisions); + NET_DEV_STAT(rx_length_errors); + NET_DEV_STAT(rx_over_errors); + NET_DEV_STAT(rx_crc_errors); + NET_DEV_STAT(rx_frame_errors); + NET_DEV_STAT(rx_fifo_errors); + NET_DEV_STAT(rx_missed_errors); + NET_DEV_STAT(tx_aborted_errors); + NET_DEV_STAT(tx_carrier_errors); + NET_DEV_STAT(tx_fifo_errors); + NET_DEV_STAT(tx_heartbeat_errors); + NET_DEV_STAT(tx_window_errors); + NET_DEV_STAT(rx_compressed); + NET_DEV_STAT(tx_compressed); }; +#undef NET_DEV_STAT /* per-cpu stats, allocated on demand. * Try to fit them in a single cache line, for dev_get_stats() sake. @@ -5171,4 +5178,9 @@ extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; extern struct net_device *blackhole_netdev; +/* Note: Avoid these macros in fast path, prefer per-cpu or per-queue counters. 
*/ +#define DEV_STATS_INC(DEV, FIELD) atomic_long_inc(&(DEV)->stats.__##FIELD) +#define DEV_STATS_ADD(DEV, FIELD, VAL) \ + atomic_long_add((VAL), &(DEV)->stats.__##FIELD) + #endif /* _LINUX_NETDEVICE_H */ diff --git a/include/net/dst.h b/include/net/dst.h index 00b479ce6b99..d67fda89cd0f 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -356,9 +356,8 @@ static inline void __skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, static inline void skb_tunnel_rx(struct sk_buff *skb, struct net_device *dev, struct net *net) { - /* TODO : stats should be SMP safe */ - dev->stats.rx_packets++; - dev->stats.rx_bytes += skb->len; + DEV_STATS_INC(dev, rx_packets); + DEV_STATS_ADD(dev, rx_bytes, skb->len); __skb_tunnel_rx(skb, dev, net); } diff --git a/net/core/dev.c b/net/core/dev.c index fb943dad9651..d0fb4af9a126 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10369,24 +10369,16 @@ void netdev_run_todo(void) void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64, const struct net_device_stats *netdev_stats) { -#if BITS_PER_LONG == 64 - BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats)); - memcpy(stats64, netdev_stats, sizeof(*netdev_stats)); - /* zero out counters that only exist in rtnl_link_stats64 */ - memset((char *)stats64 + sizeof(*netdev_stats), 0, - sizeof(*stats64) - sizeof(*netdev_stats)); -#else - size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long); - const unsigned long *src = (const unsigned long *)netdev_stats; + size_t i, n = sizeof(*netdev_stats) / sizeof(atomic_long_t); + const atomic_long_t *src = (atomic_long_t *)netdev_stats; u64 *dst = (u64 *)stats64; BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64)); for (i = 0; i < n; i++) - dst[i] = src[i]; + dst[i] = atomic_long_read(&src[i]); /* zero out counters that only exist in rtnl_link_stats64 */ memset((char *)stats64 + n * sizeof(u64), 0, sizeof(*stats64) - n * sizeof(u64)); -#endif } EXPORT_SYMBOL(netdev_stats_to_stats64); -- cgit v1.2.3 From b68777d54fac21fc833ec26ea1a2a84f975ab035 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 14 Nov 2022 20:16:19 +0100 Subject: l2tp: Serialize access to sk_user_data with sk_callback_lock sk->sk_user_data has multiple users, which are not compatible with each other. Writers must synchronize by grabbing the sk->sk_callback_lock. l2tp currently fails to grab the lock when modifying the underlying tunnel socket fields. Fix it by adding appropriate locking. We err on the side of safety and grab the sk_callback_lock also inside the sk_destruct callback overridden by l2tp, even though there should be no refs allowing access to the sock at the time when sk_destruct gets called. v4: - serialize write to sk_user_data in l2tp sk_destruct v3: - switch from sock lock to sk_callback_lock - document write-protection for sk_user_data v2: - update Fixes to point to origin of the bug - use real names in Reported/Tested-by tags Cc: Tom Parkin Fixes: 3557baabf280 ("[L2TP]: PPP over L2TP driver core") Reported-by: Haowei Yan Signed-off-by: Jakub Sitnicki Signed-off-by: David S. 
Miller --- include/net/sock.h | 2 +- net/l2tp/l2tp_core.c | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index 5db02546941c..e0517ecc6531 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -323,7 +323,7 @@ struct sk_filter; * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals - * @sk_user_data: RPC layer private data + * @sk_user_data: RPC layer private data. Write-protected by @sk_callback_lock. * @sk_frag: cached page frag * @sk_peek_off: current peek_offset value * @sk_send_head: front of stuff to transmit diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index 7499c51b1850..754fdda8a5f5 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -1150,8 +1150,10 @@ static void l2tp_tunnel_destruct(struct sock *sk) } /* Remove hooks into tunnel socket */ + write_lock_bh(&sk->sk_callback_lock); sk->sk_destruct = tunnel->old_sk_destruct; sk->sk_user_data = NULL; + write_unlock_bh(&sk->sk_callback_lock); /* Call the original destructor */ if (sk->sk_destruct) @@ -1469,16 +1471,18 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, sock = sockfd_lookup(tunnel->fd, &ret); if (!sock) goto err; - - ret = l2tp_validate_socket(sock->sk, net, tunnel->encap); - if (ret < 0) - goto err_sock; } + sk = sock->sk; + write_lock(&sk->sk_callback_lock); + + ret = l2tp_validate_socket(sk, net, tunnel->encap); + if (ret < 0) + goto err_sock; + tunnel->l2tp_net = net; pn = l2tp_pernet(net); - sk = sock->sk; sock_hold(sk); tunnel->sock = sk; @@ -1504,7 +1508,7 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, setup_udp_tunnel_sock(net, sock, &udp_cfg); } else { - sk->sk_user_data = tunnel; + rcu_assign_sk_user_data(sk, tunnel); } tunnel->old_sk_destruct = sk->sk_destruct; @@ -1518,6 +1522,7 @@ int l2tp_tunnel_register(struct l2tp_tunnel *tunnel, struct net *net, if (tunnel->fd >= 0) sockfd_put(sock); + write_unlock(&sk->sk_callback_lock); return 0; err_sock: @@ -1525,6 +1530,8 @@ err_sock: sock_release(sock); else sockfd_put(sock); + + write_unlock(&sk->sk_callback_lock); err: return ret; } -- cgit v1.2.3 From 58e0be1ef6118c5352b56a4d06e974c5599993a5 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 15 Nov 2022 22:24:00 +0800 Subject: net: use struct_group to copy ip/ipv6 header addresses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kernel test robot reported warnings when building the bonding module with make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/net/bonding/: from ../drivers/net/bonding/bond_main.c:35: In function ‘fortify_memcpy_chk’, inlined from ‘iph_to_flow_copy_v4addrs’ at ../include/net/ip.h:566:2, inlined from ‘bond_flow_ip’ at ../drivers/net/bonding/bond_main.c:3984:3: ../include/linux/fortify-string.h:413:25: warning: call to ‘__read_overflow2_field’ declared with attribute warning: detected read beyond size of field (2nd parameter); maybe use struct_group()?
[-Wattribute-warning] 413 | __read_overflow2_field(q_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ In function ‘fortify_memcpy_chk’, inlined from ‘iph_to_flow_copy_v6addrs’ at ../include/net/ipv6.h:900:2, inlined from ‘bond_flow_ip’ at ../drivers/net/bonding/bond_main.c:3994:3: ../include/linux/fortify-string.h:413:25: warning: call to ‘__read_overflow2_field’ declared with attribute warning: detected read beyond size of field (2nd parameter); maybe use struct_group()? [-Wattribute-warning] 413 | __read_overflow2_field(q_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This is because we try to copy the whole ip/ip6 address to the flow_key, while we only point to the ip/ip6 saddr. Note that since these are UAPI headers, __struct_group() is used to avoid the compiler warnings. Reported-by: kernel test robot Fixes: c3f8324188fa ("net: Add full IPv6 addresses to flow_keys") Signed-off-by: Hangbin Liu Link: https://lore.kernel.org/r/20221115142400.1204786-1-liuhangbin@gmail.com Signed-off-by: Paolo Abeni --- include/net/ip.h | 2 +- include/net/ipv6.h | 2 +- include/uapi/linux/ip.h | 6 ++++-- include/uapi/linux/ipv6.h | 6 ++++-- 4 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/include/net/ip.h b/include/net/ip.h index 038097c2a152..144bdfbb25af 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -563,7 +563,7 @@ static inline void iph_to_flow_copy_v4addrs(struct flow_keys *flow, BUILD_BUG_ON(offsetof(typeof(flow->addrs), v4addrs.dst) != offsetof(typeof(flow->addrs), v4addrs.src) + sizeof(flow->addrs.v4addrs.src)); - memcpy(&flow->addrs.v4addrs, &iph->saddr, sizeof(flow->addrs.v4addrs)); + memcpy(&flow->addrs.v4addrs, &iph->addrs, sizeof(flow->addrs.v4addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; } diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 37943ba3a73c..d383c895592a 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -897,7 +897,7 @@ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) != offsetof(typeof(flow->addrs), v6addrs.src) + sizeof(flow->addrs.v6addrs.src)); - memcpy(&flow->addrs.v6addrs, &iph->saddr, sizeof(flow->addrs.v6addrs)); + memcpy(&flow->addrs.v6addrs, &iph->addrs, sizeof(flow->addrs.v6addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } diff --git a/include/uapi/linux/ip.h b/include/uapi/linux/ip.h index 961ec16a26b8..874a92349bf5 100644 --- a/include/uapi/linux/ip.h +++ b/include/uapi/linux/ip.h @@ -100,8 +100,10 @@ struct iphdr { __u8 ttl; __u8 protocol; __sum16 check; - __be32 saddr; - __be32 daddr; + __struct_group(/* no tag */, addrs, /* no attrs */, + __be32 saddr; + __be32 daddr; + ); /*The options start here.
*/ }; diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 03cdbe798fe3..81f4243bebb1 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -130,8 +130,10 @@ struct ipv6hdr { __u8 nexthdr; __u8 hop_limit; - struct in6_addr saddr; - struct in6_addr daddr; + __struct_group(/* no tag */, addrs, /* no attrs */, + struct in6_addr saddr; + struct in6_addr daddr; + ); }; -- cgit v1.2.3 From 9999f85ba34651726018e0f50d4afdf6c8cc8096 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 15 Nov 2022 03:18:42 +0200 Subject: net: dsa: stop exposing tag proto module helpers to the world The DSA tagging protocol driver macros are in the public include/net/dsa.h probably because that's also where the DSA_TAG_PROTO_*_VALUE macros are (MODULE_ALIAS_DSA_TAG_DRIVER hinges on those macro definitions). But there is no reason to expose these helpers to . That header is shared between switch drivers (drivers/net/dsa/), tagging protocol drivers (net/dsa/tag_*.c), the DSA core (net/dsa/ sans tag_*.c), and the rest of the world (DSA master drivers, network stack, etc). Too much exposure. On the other hand, net/dsa/dsa_priv.h is included only by the DSA core and by DSA tagging protocol drivers (or IOW, "friend" modules). Also a bit too much exposure - I've contemplated creating a new header which is only included by tagging protocol drivers, but completely separating a new dsa_tag_proto.h from dsa_priv.h is not immediately trivial - for example dsa_slave_to_port() is used both from the fast path and from the control path. So for now, move these definitions to dsa_priv.h which at least hides them from the world. Signed-off-by: Vladimir Oltean Tested-by: Michael Walle Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 70 ------------------------------------------------------ net/dsa/dsa_priv.h | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 70 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index dde364688739..82da44561f4c 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -118,10 +118,6 @@ struct dsa_netdevice_ops { int cmd); }; -#define DSA_TAG_DRIVER_ALIAS "dsa_tag-" -#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ - MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) - struct dsa_lag { struct net_device *dev; unsigned int id; @@ -1400,70 +1396,4 @@ static inline bool dsa_slave_dev_check(const struct net_device *dev) netdev_tx_t dsa_enqueue_skb(struct sk_buff *skb, struct net_device *dev); void dsa_port_phylink_mac_change(struct dsa_switch *ds, int port, bool up); -struct dsa_tag_driver { - const struct dsa_device_ops *ops; - struct list_head list; - struct module *owner; -}; - -void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], - unsigned int count, - struct module *owner); -void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], - unsigned int count); - -#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \ -static int __init dsa_tag_driver_module_init(void) \ -{ \ - dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \ - THIS_MODULE); \ - return 0; \ -} \ -module_init(dsa_tag_driver_module_init); \ - \ -static void __exit dsa_tag_driver_module_exit(void) \ -{ \ - dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \ -} \ -module_exit(dsa_tag_driver_module_exit) - -/** - * module_dsa_tag_drivers() - Helper macro for registering DSA tag - * drivers - * @__ops_array: Array of tag 
driver structures - * - * Helper macro for DSA tag drivers which do not do anything special - * in module init/exit. Each module may only use this macro once, and - * calling it replaces module_init() and module_exit(). - */ -#define module_dsa_tag_drivers(__ops_array) \ -dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array)) - -#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops - -/* Create a static structure we can build a linked list of dsa_tag - * drivers - */ -#define DSA_TAG_DRIVER(__ops) \ -static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \ - .ops = &__ops, \ -} - -/** - * module_dsa_tag_driver() - Helper macro for registering a single DSA tag - * driver - * @__ops: Single tag driver structures - * - * Helper macro for DSA tag drivers which do not do anything special - * in module init/exit. Each module may only use this macro once, and - * calling it replaces module_init() and module_exit(). - */ -#define module_dsa_tag_driver(__ops) \ -DSA_TAG_DRIVER(__ops); \ - \ -static struct dsa_tag_driver *dsa_tag_driver_array[] = { \ - &DSA_TAG_DRIVER_NAME(__ops) \ -}; \ -module_dsa_tag_drivers(dsa_tag_driver_array) #endif - diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 71e9707d11d4..23d2dfdbc1ab 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -17,6 +17,76 @@ #define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG +#define DSA_TAG_DRIVER_ALIAS "dsa_tag-" +#define MODULE_ALIAS_DSA_TAG_DRIVER(__proto) \ + MODULE_ALIAS(DSA_TAG_DRIVER_ALIAS __stringify(__proto##_VALUE)) + +struct dsa_tag_driver { + const struct dsa_device_ops *ops; + struct list_head list; + struct module *owner; +}; + +void dsa_tag_drivers_register(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count, + struct module *owner); +void dsa_tag_drivers_unregister(struct dsa_tag_driver *dsa_tag_driver_array[], + unsigned int count); + +#define dsa_tag_driver_module_drivers(__dsa_tag_drivers_array, __count) \ +static int __init dsa_tag_driver_module_init(void) \ +{ \ + dsa_tag_drivers_register(__dsa_tag_drivers_array, __count, \ + THIS_MODULE); \ + return 0; \ +} \ +module_init(dsa_tag_driver_module_init); \ + \ +static void __exit dsa_tag_driver_module_exit(void) \ +{ \ + dsa_tag_drivers_unregister(__dsa_tag_drivers_array, __count); \ +} \ +module_exit(dsa_tag_driver_module_exit) + +/** + * module_dsa_tag_drivers() - Helper macro for registering DSA tag + * drivers + * @__ops_array: Array of tag driver structures + * + * Helper macro for DSA tag drivers which do not do anything special + * in module init/exit. Each module may only use this macro once, and + * calling it replaces module_init() and module_exit(). + */ +#define module_dsa_tag_drivers(__ops_array) \ +dsa_tag_driver_module_drivers(__ops_array, ARRAY_SIZE(__ops_array)) + +#define DSA_TAG_DRIVER_NAME(__ops) dsa_tag_driver ## _ ## __ops + +/* Create a static structure we can build a linked list of dsa_tag + * drivers + */ +#define DSA_TAG_DRIVER(__ops) \ +static struct dsa_tag_driver DSA_TAG_DRIVER_NAME(__ops) = { \ + .ops = &__ops, \ +} + +/** + * module_dsa_tag_driver() - Helper macro for registering a single DSA tag + * driver + * @__ops: Single tag driver structures + * + * Helper macro for DSA tag drivers which do not do anything special + * in module init/exit. Each module may only use this macro once, and + * calling it replaces module_init() and module_exit(). 
+ */ +#define module_dsa_tag_driver(__ops) \ +DSA_TAG_DRIVER(__ops); \ + \ +static struct dsa_tag_driver *dsa_tag_driver_array[] = { \ + &DSA_TAG_DRIVER_NAME(__ops) \ +}; \ +module_dsa_tag_drivers(dsa_tag_driver_array) + enum { DSA_NOTIFIER_AGEING_TIME, DSA_NOTIFIER_BRIDGE_JOIN, -- cgit v1.2.3 From cd502236835b678738810ecd501c85a3a7a11150 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:15 +0100 Subject: devlink: Introduce new attribute 'tx_priority' to devlink-rate To fully utilize the QoS offload capabilities of the Intel 100G card, a new attribute, 'tx_priority', needs to be introduced. This attribute allows for the use of a strict priority arbiter among siblings. This arbitration scheme attempts to schedule nodes based on their priority as long as the nodes remain within their bandwidth limit. Introduce a new attribute in devlink-rate that allows for the configuration of strict priority. The new attribute is optional. Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 6 ++++++ include/uapi/linux/devlink.h | 1 + net/core/devlink.c | 31 +++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 611a23a3deb2..90d59d673cb1 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -114,6 +114,8 @@ struct devlink_rate { refcount_t refcnt; }; }; + + u32 tx_priority; }; struct devlink_port { @@ -1511,10 +1513,14 @@ struct devlink_ops { u64 tx_share, struct netlink_ext_ack *extack); int (*rate_leaf_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack); + int (*rate_leaf_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack); + int (*rate_node_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_priority, struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 2f24b53a87a5..1a9214d35ef5 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -607,6 +607,7 @@ enum devlink_attr { DEVLINK_ATTR_SELFTESTS, /* nested */ + DEVLINK_ATTR_RATE_TX_PRIORITY, /* u32 */ /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 7f789bbcbbd7..bf6d3a3c28bb 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1203,6 +1203,9 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, devlink_rate->tx_max, DEVLINK_ATTR_PAD)) goto nla_put_failure; + if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, + devlink_rate->tx_priority)) + goto nla_put_failure; if (devlink_rate->parent) if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, devlink_rate->parent->name)) @@ -1936,6 +1939,7 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, { struct nlattr *nla_parent, **attrs = info->attrs; int err = -EOPNOTSUPP; u32 priority; u64 rate; if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { @@ -1964,6 +1968,20 @@ static int
devlink_nl_rate_set(struct devlink_rate *devlink_rate, devlink_rate->tx_max = rate; } + if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]) { + priority = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_PRIORITY]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_priority_set(devlink_rate, devlink_rate->priv, + priority, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_priority_set(devlink_rate, devlink_rate->priv, + priority, info->extack); + + if (err) + return err; + devlink_rate->tx_priority = priority; + } + nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; if (nla_parent) { err = devlink_nl_rate_parent_node_set(devlink_rate, info, @@ -1995,6 +2013,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the leafs"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_leaf_tx_priority_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], + "TX priority set isn't supported for the leafs"); + return false; + } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes"); @@ -2009,6 +2033,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, NL_SET_ERR_MSG_MOD(info->extack, "Parent set isn't supported for the nodes"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TX_PRIORITY] && !ops->rate_node_tx_priority_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_PRIORITY], + "TX priority set isn't supported for the nodes"); + return false; + } } else { WARN(1, "Unknown type of rate object"); return false; @@ -9187,6 +9217,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_LINECARD_INDEX] = { .type = NLA_U32 }, [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, + [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, }; static const struct genl_small_ops devlink_nl_ops[] = { -- cgit v1.2.3 From 6e2d7e84fcfee62d70e62aa9e469d10b8b6a7dc7 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:16 +0100 Subject: devlink: Introduce new attribute 'tx_weight' to devlink-rate To fully utilize the QoS offload capabilities of the Intel 100G card, a new attribute, 'tx_weight', needs to be introduced. This attribute allows for the use of a Weighted Fair Queuing arbitration scheme among siblings. This arbitration scheme can be used simultaneously with the strict priority. Introduce a new attribute in devlink-rate that allows for the configuration of Weighted Fair Queuing. The new attribute is optional.
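A driver advertising these knobs implements the matching ops; here is a minimal sketch of the node weight callback, assuming a made-up driver type, range check, and firmware helper (none of them part of this patch):

	#include <net/devlink.h>

	/* Hypothetical driver-private scheduler element. */
	struct toy_sched_node {
		u16 hw_id;
	};

	/* Stand-in for the real firmware/hardware programming call. */
	static int toy_hw_set_wfq_weight(u16 hw_id, u32 weight)
	{
		return 0;
	}

	static int toy_rate_node_tx_weight_set(struct devlink_rate *devlink_rate,
					       void *priv, u32 tx_weight,
					       struct netlink_ext_ack *extack)
	{
		struct toy_sched_node *node = priv;

		if (!tx_weight || tx_weight > 200) {
			NL_SET_ERR_MSG_MOD(extack, "TX weight must be in range 1-200");
			return -EINVAL;
		}
		return toy_hw_set_wfq_weight(node->hw_id, tx_weight);
	}

	static const struct devlink_ops toy_devlink_ops = {
		.rate_node_tx_weight_set = toy_rate_node_tx_weight_set,
		/* plus the leaf variant and the tx_priority callbacks */
	};

On the user side, with the corresponding iproute2 support in place, the attribute would be driven with something like "devlink port function rate set pci/0000:5e:00.0/node_custom tx_weight 5" (syntax taken from the matching iproute2 patches, shown for orientation only).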
Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 5 +++++ include/uapi/linux/devlink.h | 2 ++ net/core/devlink.c | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 90d59d673cb1..366b23d3f973 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -116,6 +116,7 @@ struct devlink_rate { }; u32 tx_priority; + u32 tx_weight; }; struct devlink_port { @@ -1515,12 +1516,16 @@ struct devlink_ops { u64 tx_max, struct netlink_ext_ack *extack); int (*rate_leaf_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_priority, struct netlink_ext_ack *extack); + int (*rate_leaf_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_weight, struct netlink_ext_ack *extack); int (*rate_node_tx_share_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_share, struct netlink_ext_ack *extack); int (*rate_node_tx_max_set)(struct devlink_rate *devlink_rate, void *priv, u64 tx_max, struct netlink_ext_ack *extack); int (*rate_node_tx_priority_set)(struct devlink_rate *devlink_rate, void *priv, u32 tx_priority, struct netlink_ext_ack *extack); + int (*rate_node_tx_weight_set)(struct devlink_rate *devlink_rate, void *priv, + u32 tx_weight, struct netlink_ext_ack *extack); int (*rate_node_new)(struct devlink_rate *rate_node, void **priv, struct netlink_ext_ack *extack); int (*rate_node_del)(struct devlink_rate *rate_node, void *priv, diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 1a9214d35ef5..498d0d5d0957 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -608,6 +608,8 @@ enum devlink_attr { DEVLINK_ATTR_SELFTESTS, /* nested */ DEVLINK_ATTR_RATE_TX_PRIORITY, /* u32 */ + DEVLINK_ATTR_RATE_TX_WEIGHT, /* u32 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index bf6d3a3c28bb..525bdf426163 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1206,6 +1206,11 @@ static int devlink_nl_rate_fill(struct sk_buff *msg, if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_PRIORITY, devlink_rate->tx_priority)) goto nla_put_failure; + + if (nla_put_u32(msg, DEVLINK_ATTR_RATE_TX_WEIGHT, + devlink_rate->tx_weight)) + goto nla_put_failure; + if (devlink_rate->parent) if (nla_put_string(msg, DEVLINK_ATTR_RATE_PARENT_NODE_NAME, devlink_rate->parent->name)) @@ -1940,6 +1945,7 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, struct nlattr *nla_parent, **attrs = info->attrs; int err = -EOPNOTSUPP; u32 priority; + u32 weight; u64 rate; if (attrs[DEVLINK_ATTR_RATE_TX_SHARE]) { @@ -1982,6 +1988,20 @@ static int devlink_nl_rate_set(struct devlink_rate *devlink_rate, devlink_rate->tx_priority = priority; } + if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]) { + weight = nla_get_u32(attrs[DEVLINK_ATTR_RATE_TX_WEIGHT]); + if (devlink_rate_is_leaf(devlink_rate)) + err = ops->rate_leaf_tx_weight_set(devlink_rate, devlink_rate->priv, + weight, info->extack); + else if (devlink_rate_is_node(devlink_rate)) + err = ops->rate_node_tx_weight_set(devlink_rate, devlink_rate->priv, + weight, info->extack); + + if (err) + return err; + devlink_rate->tx_weight = weight; + } + nla_parent = attrs[DEVLINK_ATTR_RATE_PARENT_NODE_NAME]; if (nla_parent) { err = devlink_nl_rate_parent_node_set(devlink_rate, info, @@ -2019,6 +2039,12 @@ static bool 
devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX priority set isn't supported for the leafs"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_leaf_tx_weight_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], + "TX weight set isn't supported for the leafs"); + return false; + } } else if (type == DEVLINK_RATE_TYPE_NODE) { if (attrs[DEVLINK_ATTR_RATE_TX_SHARE] && !ops->rate_node_tx_share_set) { NL_SET_ERR_MSG_MOD(info->extack, "TX share set isn't supported for the nodes"); @@ -2039,6 +2065,12 @@ static bool devlink_rate_set_ops_supported(const struct devlink_ops *ops, "TX priority set isn't supported for the nodes"); return false; } + if (attrs[DEVLINK_ATTR_RATE_TX_WEIGHT] && !ops->rate_node_tx_weight_set) { + NL_SET_ERR_MSG_ATTR(info->extack, + attrs[DEVLINK_ATTR_RATE_TX_WEIGHT], + "TX weight set isn't supported for the nodes"); + return false; + } } else { WARN(1, "Unknown type of rate object"); return false; @@ -9218,6 +9250,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_LINECARD_TYPE] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, + [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 }, }; static const struct genl_small_ops devlink_nl_ops[] = { -- cgit v1.2.3 From caba177d7f4d7693a9157ece8c9a30944c949e34 Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:17 +0100 Subject: devlink: Enable creation of the devlink-rate nodes from the driver The Intel 100G card's internal firmware hierarchy for Hierarchical QoS is very rigid and can't be easily removed. This requires the ability to export the default hierarchy to allow the user to modify it. Currently the driver is only able to create the 'leaf' nodes, which usually represent the vport. This is not enough for the HQoS implemented in Intel hardware. Introduce a new function, devl_rate_node_create(), that allows creation of the devlink-rate nodes from the driver.
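As a usage sketch (hypothetical foo_* names; per the devl_ prefix convention, the caller is assumed to hold the devlink instance lock), a driver could export a small default hierarchy like this:

static int foo_export_default_hierarchy(struct devlink *devlink,
					void *root_priv, void *tc_priv)
{
	struct devlink_rate *root, *tc_node;

	/* Top-level node of the exported hierarchy, no parent. */
	root = devl_rate_node_create(devlink, root_priv, "node_root", NULL);
	if (IS_ERR(root))
		return PTR_ERR(root);

	/* Child node attached directly under the root. */
	tc_node = devl_rate_node_create(devlink, tc_priv, "node_tc0", root);
	if (IS_ERR(tc_node))
		return PTR_ERR(tc_node);

	return 0;
}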
Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 3 +++ net/core/devlink.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) (limited to 'include/net') diff --git a/include/net/devlink.h b/include/net/devlink.h index 366b23d3f973..339a2ed02d36 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1618,6 +1618,9 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external); int devl_rate_leaf_create(struct devlink_port *port, void *priv); +struct devlink_rate * +devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, + struct devlink_rate *parent); void devl_rate_leaf_destroy(struct devlink_port *devlink_port); void devl_rate_nodes_destroy(struct devlink *devlink); void devlink_port_linecard_set(struct devlink_port *devlink_port, diff --git a/net/core/devlink.c b/net/core/devlink.c index 525bdf426163..3dfee7cd9929 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -10384,6 +10384,51 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); +/** + * devl_rate_node_create - create devlink rate node + * @devlink: devlink instance + * @priv: driver private data + * @node_name: name of the resulting node + * @parent: parent devlink_rate struct + * + * Create devlink rate object of type node + */ +struct devlink_rate * +devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, + struct devlink_rate *parent) +{ + struct devlink_rate *rate_node; + + rate_node = devlink_rate_node_get_by_name(devlink, node_name); + if (!IS_ERR(rate_node)) + return ERR_PTR(-EEXIST); + + rate_node = kzalloc(sizeof(*rate_node), GFP_KERNEL); + if (!rate_node) + return ERR_PTR(-ENOMEM); + + if (parent) { + rate_node->parent = parent; + refcount_inc(&rate_node->parent->refcnt); + } + + rate_node->type = DEVLINK_RATE_TYPE_NODE; + rate_node->devlink = devlink; + rate_node->priv = priv; + + rate_node->name = kstrdup(node_name, GFP_KERNEL); + if (!rate_node->name) { + kfree(rate_node); + return ERR_PTR(-ENOMEM); + } + + refcount_set(&rate_node->refcnt, 1); + list_add(&rate_node->list, &devlink->rate_list); + devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW); + return rate_node; +} +EXPORT_SYMBOL_GPL(devl_rate_node_create); + /** * devl_rate_leaf_create - create devlink rate leaf * @devlink_port: devlink port object to create rate object on -- cgit v1.2.3 From f2fc15e271f2d17f2bee2c5a3b3e50252a7ba91f Mon Sep 17 00:00:00 2001 From: Michal Wilczynski Date: Tue, 15 Nov 2022 11:48:19 +0100 Subject: devlink: Allow to set up parent in devl_rate_leaf_create() Currently the driver is able to create leaf nodes for the devlink-rate, but is unable to set a parent for them. This wasn't an issue before the possibility to export the hierarchy from the driver. After adding the export feature, in order for the driver to supply a correct hierarchy, it's necessary for it to be able to supply a parent to devl_rate_leaf_create(). Introduce a new parameter 'parent' in devl_rate_leaf_create().
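The mlx5 and netdevsim hunks below pass NULL to keep their current flat layout; a driver exporting a hierarchy could instead attach the leaf directly under one of its nodes. A sketch with hypothetical names:

/* Attach a port's rate leaf under an already-exported node, instead of
 * creating it orphaned and reparenting it from userspace later.
 */
static int foo_register_vf_rate_leaf(struct devlink_port *dl_port,
				     void *vf_priv,
				     struct devlink_rate *parent_node)
{
	return devl_rate_leaf_create(dl_port, vf_priv, parent_node);
}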
Signed-off-by: Michal Wilczynski Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c | 4 ++-- drivers/net/netdevsim/dev.c | 2 +- include/net/devlink.h | 4 +++- net/core/devlink.c | 9 ++++++++- 4 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c index 9bc7be95db54..084a910bb4e7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/devlink_port.c @@ -91,7 +91,7 @@ int mlx5_esw_offloads_devlink_port_register(struct mlx5_eswitch *esw, u16 vport_ if (err) goto reg_err; - err = devl_rate_leaf_create(dl_port, vport); + err = devl_rate_leaf_create(dl_port, vport, NULL); if (err) goto rate_err; @@ -160,7 +160,7 @@ int mlx5_esw_devlink_sf_port_register(struct mlx5_eswitch *esw, struct devlink_p if (err) return err; - err = devl_rate_leaf_create(dl_port, vport); + err = devl_rate_leaf_create(dl_port, vport, NULL); if (err) goto rate_err; diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index 705872eb7564..e14686594a71 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -1401,7 +1401,7 @@ static int __nsim_dev_port_add(struct nsim_dev *nsim_dev, enum nsim_dev_port_typ if (nsim_dev_port_is_vf(nsim_dev_port)) { err = devl_rate_leaf_create(&nsim_dev_port->devlink_port, - nsim_dev_port); + nsim_dev_port, NULL); if (err) goto err_nsim_destroy; } diff --git a/include/net/devlink.h b/include/net/devlink.h index 339a2ed02d36..074a79b8933f 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1617,10 +1617,12 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, u16 pf, u32 sf, bool external); -int devl_rate_leaf_create(struct devlink_port *port, void *priv); struct devlink_rate * devl_rate_node_create(struct devlink *devlink, void *priv, char *node_name, struct devlink_rate *parent); +int +devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, + struct devlink_rate *parent); void devl_rate_leaf_destroy(struct devlink_port *devlink_port); void devl_rate_nodes_destroy(struct devlink *devlink); void devlink_port_linecard_set(struct devlink_port *devlink_port, diff --git a/net/core/devlink.c b/net/core/devlink.c index 61d431578f5f..d93bc95cd7cb 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -10435,10 +10435,12 @@ EXPORT_SYMBOL_GPL(devl_rate_node_create); * devl_rate_leaf_create - create devlink rate leaf * @devlink_port: devlink port object to create rate object on * @priv: driver private data + * @parent: parent devlink_rate struct * * Create devlink rate object of type leaf on provided @devlink_port. 
*/ -int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv) +int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv, + struct devlink_rate *parent) { struct devlink *devlink = devlink_port->devlink; struct devlink_rate *devlink_rate; @@ -10452,6 +10454,11 @@ int devl_rate_leaf_create(struct devlink_port *devlink_port, void *priv) if (!devlink_rate) return -ENOMEM; + if (parent) { + devlink_rate->parent = parent; + refcount_inc(&devlink_rate->parent->refcnt); + } + devlink_rate->type = DEVLINK_RATE_TYPE_LEAF; devlink_rate->devlink = devlink; devlink_rate->devlink_port = devlink_port; -- cgit v1.2.3 From b78c4162823dddc621649edae704b14c5973298c Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 15 Nov 2022 10:39:53 -0500 Subject: sctp: change to include linux/sctp.h in net/sctp/checksum.h Currently "net/sctp/checksum.h", which includes "net/sctp/sctp.h", is included in quite a few places in netfilter, openvswitch and net/sched. It's not necessary to include "net/sctp/sctp.h" if a module does not depend on SCTP; "linux/sctp.h" is the right one to include. Signed-off-by: Xin Long Reviewed-by: Saeed Mahameed Link: https://lore.kernel.org/r/ca7ea96d62a26732f0491153c3979dc1c0d8d34a.1668526793.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sctp/checksum.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/checksum.h b/include/net/sctp/checksum.h index 5a9bb09f32b6..f514a0aa849e 100644 --- a/include/net/sctp/checksum.h +++ b/include/net/sctp/checksum.h @@ -24,7 +24,7 @@ #define __sctp_checksum_h__ #include -#include <net/sctp/sctp.h> +#include <linux/sctp.h> #include #include -- cgit v1.2.3 From 647541ea06a7bbbcf941e501c726b3e26328c102 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Tue, 15 Nov 2022 10:40:21 -0500 Subject: sctp: move SCTP_PAD4 and SCTP_TRUNC4 to linux/sctp.h Move these two macros from net/sctp/sctp.h to linux/sctp.h, so that it will be enough to include only linux/sctp.h in nft_exthdr.c and xt_sctp.c. A module should not include "net/sctp/sctp.h" if it does not depend on the SCTP module. Signed-off-by: Xin Long Reviewed-by: Saeed Mahameed Link: https://lore.kernel.org/r/ef6468a687f36da06f575c2131cd4612f6b7be88.1668526821.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- include/linux/sctp.h | 5 +++++ include/net/sctp/sctp.h | 5 ----- net/netfilter/nft_exthdr.c | 1 - net/netfilter/xt_sctp.c | 1 - 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/linux/sctp.h b/include/linux/sctp.h index a86e852507b3..358dc08e0831 100644 --- a/include/linux/sctp.h +++ b/include/linux/sctp.h @@ -820,4 +820,9 @@ struct sctp_new_encap_port_hdr { __be16 new_port; }; +/* Round an int up to the next multiple of 4. */ +#define SCTP_PAD4(s) (((s)+3)&~3) +/* Truncate to the previous multiple of 4. */ +#define SCTP_TRUNC4(s) ((s)&~3) + #endif /* __LINUX_SCTP_H__ */ diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index a04999ee99b0..01d904b34cf0 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -67,11 +67,6 @@ #define SCTP_PROTOSW_FLAG INET_PROTOSW_PERMANENT #endif -/* Round an int up to the next multiple of 4. */ -#define SCTP_PAD4(s) (((s)+3)&~3) -/* Truncate to the previous multiple of 4. */ -#define SCTP_TRUNC4(s) ((s)&~3) - /* * Function declarations.
*/ diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index ed929d0d37ce..a54a7f772cec 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -13,7 +13,6 @@ #include #include #include -#include <net/sctp/sctp.h> #include struct nft_exthdr { diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index 680015ba7cb6..e8961094a282 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -4,7 +4,6 @@ #include #include #include -#include <net/sctp/sctp.h> #include #include -- cgit v1.2.3 From 8207f253a097fe15c93d85ac15ebb73c5e39e1e1 Mon Sep 17 00:00:00 2001 From: Thomas Zeitlhofer Date: Tue, 15 Nov 2022 23:09:41 +0100 Subject: net: neigh: decrement the family specific qlen Commit 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device") introduced the length counter qlen in struct neigh_parms. There are separate neigh_parms instances for IPv4/ARP and IPv6/ND, and while the family-specific qlen is incremented in pneigh_enqueue(), the mentioned commit always decrements the IPv4/ARP-specific qlen, regardless of the currently processed family, in pneigh_queue_purge() and neigh_proxy_process(). As a result, with IPv6/ND, the family-specific qlen is only incremented (and never decremented) until it exceeds PROXY_QLEN, and then, according to the check in pneigh_enqueue(), neighbor solicitations are not answered anymore. As an example, this is noted when using the subnet-router anycast address to access a Linux router. After a certain amount of time (in the observed case, qlen exceeded PROXY_QLEN after two days), the Linux router stops answering neighbor solicitations for its subnet-router anycast address and effectively becomes unreachable. Another result with IPv6/ND is that the IPv4/ARP-specific qlen is decremented more often than incremented. This leads to negative qlen values, as a signed integer has been used for the length counter qlen, and potentially to an integer overflow. Fix this by introducing the helper function neigh_parms_qlen_dec(), which decrements the family-specific qlen. Thereby, make use of the existing helper function neigh_get_dev_parms_rcu(), whose definition therefore needs to be placed earlier in neighbour.c. Take the family member from struct neigh_table to determine the currently processed family and appropriately call neigh_parms_qlen_dec() from pneigh_queue_purge() and neigh_proxy_process(). Additionally, use an unsigned integer for the length counter qlen. Fixes: 0ff4eb3d5ebb ("neighbour: make proxy_queue.qlen limit per-device") Signed-off-by: Thomas Zeitlhofer Signed-off-by: David S.
Miller --- include/net/neighbour.h | 2 +- net/core/neighbour.c | 58 +++++++++++++++++++++++++------------------------ 2 files changed, 31 insertions(+), 29 deletions(-) (limited to 'include/net') diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 20745cf7ae1a..2f2a6023fb0e 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -83,7 +83,7 @@ struct neigh_parms { struct rcu_head rcu_head; int reachable_time; - int qlen; + u32 qlen; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX); }; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a77a85e357e0..952a54763358 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -307,7 +307,31 @@ static int neigh_del_timer(struct neighbour *n) return 0; } -static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) +static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, + int family) +{ + switch (family) { + case AF_INET: + return __in_dev_arp_parms_get_rcu(dev); + case AF_INET6: + return __in6_dev_nd_parms_get_rcu(dev); + } + return NULL; +} + +static void neigh_parms_qlen_dec(struct net_device *dev, int family) +{ + struct neigh_parms *p; + + rcu_read_lock(); + p = neigh_get_dev_parms_rcu(dev, family); + if (p) + p->qlen--; + rcu_read_unlock(); +} + +static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net, + int family) { struct sk_buff_head tmp; unsigned long flags; @@ -321,13 +345,7 @@ static void pneigh_queue_purge(struct sk_buff_head *list, struct net *net) struct net_device *dev = skb->dev; if (net == NULL || net_eq(dev_net(dev), net)) { - struct in_device *in_dev; - - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (in_dev) - in_dev->arp_parms->qlen--; - rcu_read_unlock(); + neigh_parms_qlen_dec(dev, family); __skb_unlink(skb, list); __skb_queue_tail(&tmp, skb); } @@ -409,7 +427,8 @@ static int __neigh_ifdown(struct neigh_table *tbl, struct net_device *dev, write_lock_bh(&tbl->lock); neigh_flush_dev(tbl, dev, skip_perm); pneigh_ifdown_and_unlock(tbl, dev); - pneigh_queue_purge(&tbl->proxy_queue, dev ? dev_net(dev) : NULL); + pneigh_queue_purge(&tbl->proxy_queue, dev ? 
dev_net(dev) : NULL, + tbl->family); if (skb_queue_empty_lockless(&tbl->proxy_queue)) del_timer_sync(&tbl->proxy_timer); return 0; @@ -1621,13 +1640,8 @@ static void neigh_proxy_process(struct timer_list *t) if (tdif <= 0) { struct net_device *dev = skb->dev; - struct in_device *in_dev; - rcu_read_lock(); - in_dev = __in_dev_get_rcu(dev); - if (in_dev) - in_dev->arp_parms->qlen--; - rcu_read_unlock(); + neigh_parms_qlen_dec(dev, tbl->family); __skb_unlink(skb, &tbl->proxy_queue); if (tbl->proxy_redo && netif_running(dev)) { @@ -1821,7 +1835,7 @@ int neigh_table_clear(int index, struct neigh_table *tbl) cancel_delayed_work_sync(&tbl->managed_work); cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); - pneigh_queue_purge(&tbl->proxy_queue, NULL); + pneigh_queue_purge(&tbl->proxy_queue, NULL, tbl->family); neigh_ifdown(tbl, NULL); if (atomic_read(&tbl->entries)) pr_crit("neighbour leakage\n"); @@ -3539,18 +3553,6 @@ static int proc_unres_qlen(struct ctl_table *ctl, int write, return ret; } -static struct neigh_parms *neigh_get_dev_parms_rcu(struct net_device *dev, - int family) -{ - switch (family) { - case AF_INET: - return __in_dev_arp_parms_get_rcu(dev); - case AF_INET6: - return __in6_dev_nd_parms_get_rcu(dev); - } - return NULL; -} - static void neigh_copy_dflt_parms(struct net *net, struct neigh_parms *p, int index) { -- cgit v1.2.3 From 33e93ed2209d5971043bed41dd194bc583b57ef3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 16 Nov 2022 15:01:19 -0500 Subject: sctp: add skb_sdif in struct sctp_af Add skb_sdif function in struct sctp_af to get the enslaved device for both ipv4 and ipv6 when adding SCTP VRF support in sctp_rcv in the next patch. Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 1 + net/sctp/ipv6.c | 8 +++++++- net/sctp/protocol.c | 6 ++++++ 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 350f250b0dc7..7b4884c63b26 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -477,6 +477,7 @@ struct sctp_af { int (*available) (union sctp_addr *, struct sctp_sock *); int (*skb_iif) (const struct sk_buff *sk); + int (*skb_sdif)(const struct sk_buff *sk); int (*is_ce) (const struct sk_buff *sk); void (*seq_dump_addr)(struct seq_file *seq, union sctp_addr *addr); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index e6274cdbdf6c..097bd60ce964 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -842,7 +842,12 @@ static int sctp_v6_addr_to_user(struct sctp_sock *sp, union sctp_addr *addr) /* Where did this skb come from? */ static int sctp_v6_skb_iif(const struct sk_buff *skb) { - return IP6CB(skb)->iif; + return inet6_iif(skb); +} + +static int sctp_v6_skb_sdif(const struct sk_buff *skb) +{ + return inet6_sdif(skb); } /* Was this packet marked by Explicit Congestion Notification? 
*/ @@ -1142,6 +1147,7 @@ static struct sctp_af sctp_af_inet6 = { .is_any = sctp_v6_is_any, .available = sctp_v6_available, .skb_iif = sctp_v6_skb_iif, + .skb_sdif = sctp_v6_skb_sdif, .is_ce = sctp_v6_is_ce, .seq_dump_addr = sctp_v6_seq_dump_addr, .ecn_capable = sctp_v6_ecn_capable, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index dbfe7d1000c2..a18cf0471a8d 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -567,6 +567,11 @@ static int sctp_v4_skb_iif(const struct sk_buff *skb) return inet_iif(skb); } +static int sctp_v4_skb_sdif(const struct sk_buff *skb) +{ + return inet_sdif(skb); +} + /* Was this packet marked by Explicit Congestion Notification? */ static int sctp_v4_is_ce(const struct sk_buff *skb) { @@ -1185,6 +1190,7 @@ static struct sctp_af sctp_af_inet = { .available = sctp_v4_available, .scope = sctp_v4_scope, .skb_iif = sctp_v4_skb_iif, + .skb_sdif = sctp_v4_skb_sdif, .is_ce = sctp_v4_is_ce, .seq_dump_addr = sctp_v4_seq_dump_addr, .ecn_capable = sctp_v4_ecn_capable, -- cgit v1.2.3 From 0af03170637f47fb5cc6501d4b2dcbf1c14772a9 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 16 Nov 2022 15:01:20 -0500 Subject: sctp: add dif and sdif check in asoc and ep lookup This patch first adds a pernet global l3mdev_accept to decide if it accepts packets from an l3mdev when an SCTP socket doesn't bind to any interface. It's set to 1 to avoid any possible incompatibility issues, and in the next patch, a sysctl will be introduced to allow changing it. Then, similar to inet/udp_sk_bound_dev_eq(), sctp_sk_bound_dev_eq() is added to check whether either dif or sdif is equal to sk_bound_dev_if, and to check that sdif is 0 or l3mdev_accept is 1 if sk_bound_dev_if is not set. This function is used to match an association or an endpoint, namely called by sctp_addrs_lookup_transport() and sctp_endpoint_is_match(). All functions that need updating are: sctp_rcv(): asoc: __sctp_rcv_lookup() __sctp_lookup_association() -> sctp_addrs_lookup_transport() __sctp_rcv_lookup_harder() __sctp_rcv_init_lookup() __sctp_lookup_association() -> sctp_addrs_lookup_transport() __sctp_rcv_walk_lookup() __sctp_rcv_asconf_lookup() __sctp_lookup_association() -> sctp_addrs_lookup_transport() ep: __sctp_rcv_lookup_endpoint() -> sctp_endpoint_is_match() sctp_connect(): sctp_endpoint_is_peeled_off() __sctp_lookup_association() sctp_has_association() sctp_lookup_association() __sctp_lookup_association() -> sctp_addrs_lookup_transport() sctp_diag_dump_one(): sctp_transport_lookup_process() -> sctp_addrs_lookup_transport() Signed-off-by: Xin Long Signed-off-by: David S. Miller --- include/net/netns/sctp.h | 4 ++ include/net/sctp/sctp.h | 6 ++- include/net/sctp/structs.h | 8 ++-- net/sctp/diag.c | 3 +- net/sctp/endpointola.c | 13 ++++-- net/sctp/input.c | 108 +++++++++++++++++++++++++-------------------- net/sctp/protocol.c | 4 ++ net/sctp/socket.c | 4 +- 8 files changed, 89 insertions(+), 61 deletions(-) (limited to 'include/net') diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h index a681147aecd8..7eff3d981b89 100644 --- a/include/net/netns/sctp.h +++ b/include/net/netns/sctp.h @@ -175,6 +175,10 @@ struct netns_sctp { /* Threshold for autoclose timeout, in seconds.
*/ unsigned long max_autoclose; + +#ifdef CONFIG_NET_L3_MASTER_DEV + int l3mdev_accept; +#endif }; #endif /* __NETNS_SCTP_H__ */ diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 01d904b34cf0..c335dd01a597 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -109,7 +109,7 @@ struct sctp_transport *sctp_transport_get_idx(struct net *net, struct rhashtable_iter *iter, int pos); int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr, void *p); + const union sctp_addr *paddr, void *p, int dif); int sctp_transport_traverse_process(sctp_callback_t cb, sctp_callback_t cb_done, struct net *net, int *pos, void *p); int sctp_for_each_endpoint(int (*cb)(struct sctp_endpoint *, void *), void *p); @@ -157,10 +157,12 @@ void sctp_unhash_transport(struct sctp_transport *t); struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr); + const union sctp_addr *paddr, + int dif, int sdif); struct sctp_transport *sctp_epaddr_lookup_transport( const struct sctp_endpoint *ep, const union sctp_addr *paddr); +bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif); /* * sctp/proc.c diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 7b4884c63b26..afa3781e3ca2 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1379,10 +1379,12 @@ struct sctp_association *sctp_endpoint_lookup_assoc( struct sctp_transport **); bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr); -struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *, - struct net *, const union sctp_addr *); +struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, + struct net *net, + const union sctp_addr *laddr, + int dif, int sdif); bool sctp_has_association(struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr); + const union sctp_addr *paddr, int dif, int sdif); int sctp_verify_init(struct net *net, const struct sctp_endpoint *ep, const struct sctp_association *asoc, diff --git a/net/sctp/diag.c b/net/sctp/diag.c index d9c6d8f30f09..a557009e9832 100644 --- a/net/sctp/diag.c +++ b/net/sctp/diag.c @@ -426,6 +426,7 @@ static int sctp_diag_dump_one(struct netlink_callback *cb, struct net *net = sock_net(skb->sk); const struct nlmsghdr *nlh = cb->nlh; union sctp_addr laddr, paddr; + int dif = req->id.idiag_if; struct sctp_comm_param commp = { .skb = skb, .r = req, @@ -454,7 +455,7 @@ static int sctp_diag_dump_one(struct netlink_callback *cb, } return sctp_transport_lookup_process(sctp_sock_dump_one, - net, &laddr, &paddr, &commp); + net, &laddr, &paddr, &commp, dif); } static void sctp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c index efffde7f2328..7e77b450697c 100644 --- a/net/sctp/endpointola.c +++ b/net/sctp/endpointola.c @@ -246,12 +246,15 @@ void sctp_endpoint_put(struct sctp_endpoint *ep) /* Is this the endpoint we are looking for? 
*/ struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, struct net *net, - const union sctp_addr *laddr) + const union sctp_addr *laddr, + int dif, int sdif) { + int bound_dev_if = READ_ONCE(ep->base.sk->sk_bound_dev_if); struct sctp_endpoint *retval = NULL; - if ((htons(ep->base.bind_addr.port) == laddr->v4.sin_port) && - net_eq(ep->base.net, net)) { + if (net_eq(ep->base.net, net) && + sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) && + (htons(ep->base.bind_addr.port) == laddr->v4.sin_port)) { if (sctp_bind_addr_match(&ep->base.bind_addr, laddr, sctp_sk(ep->base.sk))) retval = ep; @@ -298,6 +301,7 @@ out: bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, const union sctp_addr *paddr) { + int bound_dev_if = READ_ONCE(ep->base.sk->sk_bound_dev_if); struct sctp_sockaddr_entry *addr; struct net *net = ep->base.net; struct sctp_bind_addr *bp; @@ -307,7 +311,8 @@ bool sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, * so the address_list can not change. */ list_for_each_entry(addr, &bp->address_list, list) { - if (sctp_has_association(net, &addr->a, paddr)) + if (sctp_has_association(net, &addr->a, paddr, + bound_dev_if, bound_dev_if)) return true; } diff --git a/net/sctp/input.c b/net/sctp/input.c index 4f43afa8678f..bf70371301ff 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -50,16 +50,19 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, - struct sctp_transport **transportp); + struct sctp_transport **transportp, + int dif, int sdif); static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, - const union sctp_addr *daddr); + const union sctp_addr *daddr, + int dif, int sdif); static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, - struct sctp_transport **pt); + struct sctp_transport **pt, + int dif, int sdif); static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb); @@ -92,11 +95,11 @@ int sctp_rcv(struct sk_buff *skb) struct sctp_chunk *chunk; union sctp_addr src; union sctp_addr dest; - int bound_dev_if; int family; struct sctp_af *af; struct net *net = dev_net(skb->dev); bool is_gso = skb_is_gso(skb) && skb_is_gso_sctp(skb); + int dif, sdif; if (skb->pkt_type != PACKET_HOST) goto discard_it; @@ -141,6 +144,8 @@ int sctp_rcv(struct sk_buff *skb) /* Initialize local addresses for lookups. */ af->from_skb(&src, skb, 1); af->from_skb(&dest, skb, 0); + dif = af->skb_iif(skb); + sdif = af->skb_sdif(skb); /* If the packet is to or from a non-unicast address, * silently discard the packet. @@ -157,35 +162,15 @@ int sctp_rcv(struct sk_buff *skb) !af->addr_valid(&dest, NULL, skb)) goto discard_it; - asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport); + asoc = __sctp_rcv_lookup(net, skb, &src, &dest, &transport, dif, sdif); if (!asoc) - ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src); + ep = __sctp_rcv_lookup_endpoint(net, skb, &dest, &src, dif, sdif); /* Retrieve the common input handling substructure. */ rcvr = asoc ? 
&asoc->base : &ep->base; sk = rcvr->sk; - /* - * If a frame arrives on an interface and the receiving socket is - * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB - */ - bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); - if (bound_dev_if && (bound_dev_if != af->skb_iif(skb))) { - if (transport) { - sctp_transport_put(transport); - asoc = NULL; - transport = NULL; - } else { - sctp_endpoint_put(ep); - ep = NULL; - } - sk = net->sctp.ctl_sock; - ep = sctp_sk(sk)->ep; - sctp_endpoint_hold(ep); - rcvr = &ep->base; - } - /* * RFC 2960, 8.4 - Handle "Out of the blue" Packets. * An SCTP packet is called an "out of the blue" (OOTB) @@ -485,6 +470,8 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, struct sctp_association *asoc; struct sctp_transport *transport = NULL; __u32 vtag = ntohl(sctphdr->vtag); + int sdif = inet_sdif(skb); + int dif = inet_iif(skb); *app = NULL; *tpp = NULL; @@ -500,7 +487,7 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, /* Look for an association that matches the incoming ICMP error * packet. */ - asoc = __sctp_lookup_association(net, &saddr, &daddr, &transport); + asoc = __sctp_lookup_association(net, &saddr, &daddr, &transport, dif, sdif); if (!asoc) return NULL; @@ -850,7 +837,8 @@ static inline __u32 sctp_hashfn(const struct net *net, __be16 lport, static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, - const union sctp_addr *paddr) + const union sctp_addr *paddr, + int dif, int sdif) { struct sctp_hashbucket *head; struct sctp_endpoint *ep; @@ -863,7 +851,7 @@ static struct sctp_endpoint *__sctp_rcv_lookup_endpoint( head = &sctp_ep_hashtable[hash]; read_lock(&head->lock); sctp_for_each_hentry(ep, &head->chain) { - if (sctp_endpoint_is_match(ep, net, laddr)) + if (sctp_endpoint_is_match(ep, net, laddr, dif, sdif)) goto hit; } @@ -990,14 +978,26 @@ void sctp_unhash_transport(struct sctp_transport *t) sctp_hash_params); } +bool sctp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) +{ + bool l3mdev_accept = true; + +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + l3mdev_accept = !!READ_ONCE(net->sctp.l3mdev_accept); +#endif + return inet_bound_dev_eq(l3mdev_accept, bound_dev_if, dif, sdif); +} + /* return a transport with holding it */ struct sctp_transport *sctp_addrs_lookup_transport( struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr) + const union sctp_addr *paddr, + int dif, int sdif) { struct rhlist_head *tmp, *list; struct sctp_transport *t; + int bound_dev_if; struct sctp_hash_cmp_arg arg = { .paddr = paddr, .net = net, @@ -1011,7 +1011,9 @@ struct sctp_transport *sctp_addrs_lookup_transport( if (!sctp_transport_hold(t)) continue; - if (sctp_bind_addr_match(&t->asoc->base.bind_addr, + bound_dev_if = READ_ONCE(t->asoc->base.sk->sk_bound_dev_if); + if (sctp_sk_bound_dev_eq(net, bound_dev_if, dif, sdif) && + sctp_bind_addr_match(&t->asoc->base.bind_addr, laddr, sctp_sk(t->asoc->base.sk))) return t; sctp_transport_put(t); @@ -1048,12 +1050,13 @@ static struct sctp_association *__sctp_lookup_association( struct net *net, const union sctp_addr *local, const union sctp_addr *peer, - struct sctp_transport **pt) + struct sctp_transport **pt, + int dif, int sdif) { struct sctp_transport *t; struct sctp_association *asoc = NULL; - t = sctp_addrs_lookup_transport(net, local, peer); + t = sctp_addrs_lookup_transport(net, local, peer, dif, sdif); if (!t) goto out; @@ -1069,12 
+1072,13 @@ static struct sctp_association *sctp_lookup_association(struct net *net, const union sctp_addr *laddr, const union sctp_addr *paddr, - struct sctp_transport **transportp) + struct sctp_transport **transportp, + int dif, int sdif) { struct sctp_association *asoc; rcu_read_lock(); - asoc = __sctp_lookup_association(net, laddr, paddr, transportp); + asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); rcu_read_unlock(); return asoc; @@ -1083,11 +1087,12 @@ struct sctp_association *sctp_lookup_association(struct net *net, /* Is there an association matching the given local and peer addresses? */ bool sctp_has_association(struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr) + const union sctp_addr *paddr, + int dif, int sdif) { struct sctp_transport *transport; - if (sctp_lookup_association(net, laddr, paddr, &transport)) { + if (sctp_lookup_association(net, laddr, paddr, &transport, dif, sdif)) { sctp_transport_put(transport); return true; } @@ -1115,7 +1120,8 @@ bool sctp_has_association(struct net *net, */ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, struct sk_buff *skb, - const union sctp_addr *laddr, struct sctp_transport **transportp) + const union sctp_addr *laddr, struct sctp_transport **transportp, + int dif, int sdif) { struct sctp_association *asoc; union sctp_addr addr; @@ -1154,7 +1160,7 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, if (!af->from_addr_param(paddr, params.addr, sh->source, 0)) continue; - asoc = __sctp_lookup_association(net, laddr, paddr, transportp); + asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) return asoc; } @@ -1181,7 +1187,8 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( struct sctp_chunkhdr *ch, const union sctp_addr *laddr, __be16 peer_port, - struct sctp_transport **transportp) + struct sctp_transport **transportp, + int dif, int sdif) { struct sctp_addip_chunk *asconf = (struct sctp_addip_chunk *)ch; struct sctp_af *af; @@ -1201,7 +1208,7 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( if (!af->from_addr_param(&paddr, param, peer_port, 0)) return NULL; - return __sctp_lookup_association(net, laddr, &paddr, transportp); + return __sctp_lookup_association(net, laddr, &paddr, transportp, dif, sdif); } @@ -1217,7 +1224,8 @@ static struct sctp_association *__sctp_rcv_asconf_lookup( static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, - struct sctp_transport **transportp) + struct sctp_transport **transportp, + int dif, int sdif) { struct sctp_association *asoc = NULL; struct sctp_chunkhdr *ch; @@ -1260,7 +1268,7 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, asoc = __sctp_rcv_asconf_lookup( net, ch, laddr, sctp_hdr(skb)->source, - transportp); + transportp, dif, sdif); break; default: break; @@ -1285,7 +1293,8 @@ static struct sctp_association *__sctp_rcv_walk_lookup(struct net *net, static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, struct sk_buff *skb, const union sctp_addr *laddr, - struct sctp_transport **transportp) + struct sctp_transport **transportp, + int dif, int sdif) { struct sctp_chunkhdr *ch; @@ -1309,9 +1318,9 @@ static struct sctp_association *__sctp_rcv_lookup_harder(struct net *net, /* If this is INIT/INIT-ACK look inside the chunk too. 
*/ if (ch->type == SCTP_CID_INIT || ch->type == SCTP_CID_INIT_ACK) - return __sctp_rcv_init_lookup(net, skb, laddr, transportp); + return __sctp_rcv_init_lookup(net, skb, laddr, transportp, dif, sdif); - return __sctp_rcv_walk_lookup(net, skb, laddr, transportp); + return __sctp_rcv_walk_lookup(net, skb, laddr, transportp, dif, sdif); } /* Lookup an association for an inbound skb. */ @@ -1319,11 +1328,12 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net, struct sk_buff *skb, const union sctp_addr *paddr, const union sctp_addr *laddr, - struct sctp_transport **transportp) + struct sctp_transport **transportp, + int dif, int sdif) { struct sctp_association *asoc; - asoc = __sctp_lookup_association(net, laddr, paddr, transportp); + asoc = __sctp_lookup_association(net, laddr, paddr, transportp, dif, sdif); if (asoc) goto out; @@ -1331,7 +1341,7 @@ static struct sctp_association *__sctp_rcv_lookup(struct net *net, * SCTP Implementors Guide, 2.18 Handling of address * parameters within the INIT or INIT-ACK. */ - asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp); + asoc = __sctp_rcv_lookup_harder(net, skb, laddr, transportp, dif, sdif); if (asoc) goto out; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index a18cf0471a8d..909a89a1cff4 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1394,6 +1394,10 @@ static int __net_init sctp_defaults_init(struct net *net) /* Initialize maximum autoclose timeout. */ net->sctp.max_autoclose = INT_MAX / HZ; +#ifdef CONFIG_NET_L3_MASTER_DEV + net->sctp.l3mdev_accept = 1; +#endif + status = sctp_sysctl_net_register(net); if (status) goto err_sysctl_register; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 4306164238ef..5acbdf0d38f3 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5315,14 +5315,14 @@ EXPORT_SYMBOL_GPL(sctp_for_each_endpoint); int sctp_transport_lookup_process(sctp_callback_t cb, struct net *net, const union sctp_addr *laddr, - const union sctp_addr *paddr, void *p) + const union sctp_addr *paddr, void *p, int dif) { struct sctp_transport *transport; struct sctp_endpoint *ep; int err = -ENOENT; rcu_read_lock(); - transport = sctp_addrs_lookup_transport(net, laddr, paddr); + transport = sctp_addrs_lookup_transport(net, laddr, paddr, dif, dif); if (!transport) { rcu_read_unlock(); return err; -- cgit v1.2.3 From ab0377803dafc58f1e22296708c1c28e309414d6 Mon Sep 17 00:00:00 2001 From: Schspa Shi Date: Wed, 16 Nov 2022 19:45:11 +0800 Subject: mrp: introduce active flags to prevent UAF when applicant uninit The caller of del_timer_sync() must prevent the timer from being restarted. Without this synchronization, there is a small probability that the cancellation will not succeed.
And syzbot reported the following crash: ================================================================== BUG: KASAN: use-after-free in hlist_add_head include/linux/list.h:929 [inline] BUG: KASAN: use-after-free in enqueue_timer+0x18/0xa4 kernel/time/timer.c:605 Write at addr f9ff000024df6058 by task syz-fuzzer/2256 Pointer tag: [f9], memory tag: [fe] CPU: 1 PID: 2256 Comm: syz-fuzzer Not tainted 6.1.0-rc5-syzkaller-00008-ge01d50cbd6ee #0 Hardware name: linux,dummy-virt (DT) Call trace: dump_backtrace.part.0+0xe0/0xf0 arch/arm64/kernel/stacktrace.c:156 dump_backtrace arch/arm64/kernel/stacktrace.c:162 [inline] show_stack+0x18/0x40 arch/arm64/kernel/stacktrace.c:163 __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x68/0x84 lib/dump_stack.c:106 print_address_description mm/kasan/report.c:284 [inline] print_report+0x1a8/0x4a0 mm/kasan/report.c:395 kasan_report+0x94/0xb4 mm/kasan/report.c:495 __do_kernel_fault+0x164/0x1e0 arch/arm64/mm/fault.c:320 do_bad_area arch/arm64/mm/fault.c:473 [inline] do_tag_check_fault+0x78/0x8c arch/arm64/mm/fault.c:749 do_mem_abort+0x44/0x94 arch/arm64/mm/fault.c:825 el1_abort+0x40/0x60 arch/arm64/kernel/entry-common.c:367 el1h_64_sync_handler+0xd8/0xe4 arch/arm64/kernel/entry-common.c:427 el1h_64_sync+0x64/0x68 arch/arm64/kernel/entry.S:576 hlist_add_head include/linux/list.h:929 [inline] enqueue_timer+0x18/0xa4 kernel/time/timer.c:605 mod_timer+0x14/0x20 kernel/time/timer.c:1161 mrp_periodic_timer_arm net/802/mrp.c:614 [inline] mrp_periodic_timer+0xa0/0xc0 net/802/mrp.c:627 call_timer_fn.constprop.0+0x24/0x80 kernel/time/timer.c:1474 expire_timers+0x98/0xc4 kernel/time/timer.c:1519 To fix it, we can introduce a new active flag to make sure the timer will not restart. Reported-by: syzbot+6fd64001c20aa99e34a4@syzkaller.appspotmail.com Signed-off-by: Schspa Shi Signed-off-by: David S.
Miller --- include/net/mrp.h | 1 + net/802/mrp.c | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/mrp.h b/include/net/mrp.h index 92cd3fb6cf9d..b28915ffea28 100644 --- a/include/net/mrp.h +++ b/include/net/mrp.h @@ -124,6 +124,7 @@ struct mrp_applicant { struct sk_buff *pdu; struct rb_root mad; struct rcu_head rcu; + bool active; }; struct mrp_port { diff --git a/net/802/mrp.c b/net/802/mrp.c index 155f74d8b14f..6c927d4b35f0 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -606,7 +606,10 @@ static void mrp_join_timer(struct timer_list *t) spin_unlock(&app->lock); mrp_queue_xmit(app); - mrp_join_timer_arm(app); + spin_lock(&app->lock); + if (likely(app->active)) + mrp_join_timer_arm(app); + spin_unlock(&app->lock); } static void mrp_periodic_timer_arm(struct mrp_applicant *app) @@ -620,11 +623,12 @@ static void mrp_periodic_timer(struct timer_list *t) struct mrp_applicant *app = from_timer(app, t, periodic_timer); spin_lock(&app->lock); - mrp_mad_event(app, MRP_EVENT_PERIODIC); - mrp_pdu_queue(app); + if (likely(app->active)) { + mrp_mad_event(app, MRP_EVENT_PERIODIC); + mrp_pdu_queue(app); + mrp_periodic_timer_arm(app); + } spin_unlock(&app->lock); - - mrp_periodic_timer_arm(app); } static int mrp_pdu_parse_end_mark(struct sk_buff *skb, int *offset) @@ -872,6 +876,7 @@ int mrp_init_applicant(struct net_device *dev, struct mrp_application *appl) app->dev = dev; app->app = appl; app->mad = RB_ROOT; + app->active = true; spin_lock_init(&app->lock); skb_queue_head_init(&app->queue); rcu_assign_pointer(dev->mrp_port->applicants[appl->type], app); @@ -900,6 +905,9 @@ void mrp_uninit_applicant(struct net_device *dev, struct mrp_application *appl) RCU_INIT_POINTER(port->applicants[appl->type], NULL); + spin_lock_bh(&app->lock); + app->active = false; + spin_unlock_bh(&app->lock); /* Delete timer and generate a final TX event to flush out * all pending messages before the applicant is gone. */ -- cgit v1.2.3 From a3400e8746b626531099e4d9fd8eac41be066683 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 18 Nov 2022 10:46:08 -0800 Subject: mptcp: more detailed error reporting on endpoint creation Endpoint creation can fail for a number of reasons; in case of failure append the error number to the extended ack message, using a newly introduced generic helper. Additionally let mptcp_pm_nl_append_new_local_addr() report different error reasons. Reviewed-by: Mat Martineau Signed-off-by: Paolo Abeni Signed-off-by: Mat Martineau Signed-off-by: David S. Miller --- include/net/genetlink.h | 3 +++ net/mptcp/pm_netlink.c | 24 +++++++++++++----------- 2 files changed, 16 insertions(+), 11 deletions(-) (limited to 'include/net') diff --git a/include/net/genetlink.h b/include/net/genetlink.h index d21210709f84..ed4622dd4828 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -125,6 +125,9 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net) #define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg) +#define GENL_SET_ERR_MSG_FMT(info, msg, args...) 
\ + NL_SET_ERR_MSG_FMT((info)->extack, msg, ##args) + /* Report that a root attribute is missing */ #define GENL_REQ_ATTR_CHECK(info, attr) ({ \ struct genl_info *__info = (info); \ diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index fdf2ee29f762..d66fbd558263 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -912,10 +912,14 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, */ if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) pernet->next_id = 1; - if (pernet->addrs >= MPTCP_PM_ADDR_MAX) + if (pernet->addrs >= MPTCP_PM_ADDR_MAX) { + ret = -ERANGE; goto out; - if (test_bit(entry->addr.id, pernet->id_bitmap)) + } + if (test_bit(entry->addr.id, pernet->id_bitmap)) { + ret = -EBUSY; goto out; + } /* do not insert duplicate address, differentiate on port only * singled addresses @@ -929,8 +933,10 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, * endpoint is an implicit one and the user-space * did not provide an endpoint id */ - if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) + if (!(cur->flags & MPTCP_PM_ADDR_FLAG_IMPLICIT)) { + ret = -EEXIST; goto out; + } if (entry->addr.id) goto out; @@ -1016,16 +1022,12 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, addrlen = sizeof(struct sockaddr_in6); #endif err = kernel_bind(ssock, (struct sockaddr *)&addr, addrlen); - if (err) { - pr_warn("kernel_bind error, err=%d", err); + if (err) return err; - } err = kernel_listen(ssock, backlog); - if (err) { - pr_warn("kernel_listen error, err=%d", err); + if (err) return err; - } return 0; } @@ -1329,13 +1331,13 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) if (entry->addr.port) { ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry); if (ret) { - GENL_SET_ERR_MSG(info, "create listen socket error"); + GENL_SET_ERR_MSG_FMT(info, "create listen socket error: %d", ret); goto out_free; } } ret = mptcp_pm_nl_append_new_local_addr(pernet, entry); if (ret < 0) { - GENL_SET_ERR_MSG(info, "too many addresses or duplicate one"); + GENL_SET_ERR_MSG_FMT(info, "too many addresses or duplicate one: %d", ret); goto out_free; } -- cgit v1.2.3 From 8c5dae4c1a49489499e6708c7dd284370ca36287 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:13 -0800 Subject: dccp/tcp: Update saddr under bhash's lock. When we call connect() for a socket bound to a wildcard address, we update saddr locklessly. However, it could result in a data race; another thread iterating over bhash might see a corrupted address. Let's update saddr under the bhash bucket's lock. 
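Condensed from the inet_bhash2_update_saddr() hunk below (error handling and bucket lookup omitted), the resulting locking looks roughly like this; the bhash bucket lock is held across the whole rehash so bhash walkers never observe a half-updated address:

spin_lock_bh(&head->lock);		/* bhash bucket: excludes concurrent walkers */
spin_lock(&head2->lock);		/* old bhash2 (addr, port) bucket */
__sk_del_bind2_node(sk);		/* unlink sk from the stale bucket */
spin_unlock(&head2->lock);

inet_update_saddr(sk, saddr, family);	/* safe: bhash walkers are excluded */

spin_lock(&head2->lock);		/* bucket matching the new saddr */
sk_add_bind2_node(sk, &tb2->owners);
spin_unlock(&head2->lock);
spin_unlock_bh(&head->lock);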
Fixes: 3df80d9320bc ("[DCCP]: Introduce DCCPv6") Fixes: 7c657876b63c ("[DCCP]: Initial implementation") Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 2 +- net/dccp/ipv4.c | 22 ++++----------------- net/dccp/ipv6.c | 23 ++++------------------ net/ipv4/af_inet.c | 11 +---------- net/ipv4/inet_hashtables.c | 45 +++++++++++++++++++++++++++++++++++++------ net/ipv4/tcp_ipv4.c | 20 ++++--------------- net/ipv6/tcp_ipv6.c | 19 +++--------------- 7 files changed, 56 insertions(+), 86 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 3af1e927247d..ba06e8b52264 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -281,7 +281,7 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in * sk_v6_rcv_saddr (ipv6) changes after it has been binded. The socket's * rcv_saddr field should already have been updated when this is called. */ -int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk); +int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family); void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 40640c26680e..95e376e3b911 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -45,11 +45,10 @@ static unsigned int dccp_v4_pernet_id __read_mostly; int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { const struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; - __be32 daddr, nexthop, prev_sk_rcv_saddr; struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); __be16 orig_sport, orig_dport; + __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; @@ -91,26 +90,13 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) daddr = fl4->daddr; if (inet->inet_saddr == 0) { - if (inet_csk(sk)->icsk_bind2_hash) { - prev_addr_hashbucket = - inet_bhashfn_portaddr(&dccp_hashinfo, sk, - sock_net(sk), - inet->inet_num); - prev_sk_rcv_saddr = sk->sk_rcv_saddr; - } - inet->inet_saddr = fl4->saddr; - } - - sk_rcv_saddr_set(sk, inet->inet_saddr); - - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); if (err) { - inet->inet_saddr = 0; - sk_rcv_saddr_set(sk, prev_sk_rcv_saddr); ip_rt_put(rt); return err; } + } else { + sk_rcv_saddr_set(sk, inet->inet_saddr); } inet->inet_dport = usin->sin_port; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 626166cb6d7e..94c101ed57a9 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -934,26 +934,11 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, } if (saddr == NULL) { - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; - struct in6_addr prev_v6_rcv_saddr; - - if (icsk->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(&dccp_hashinfo, - sk, sock_net(sk), - inet->inet_num); - prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - } - saddr = &fl6.saddr; - sk->sk_v6_rcv_saddr = *saddr; - - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); - if (err) { - sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr; - goto failure; - } - } + + err = inet_bhash2_update_saddr(sk, 
saddr, AF_INET6); + if (err) + goto failure; } /* set the source address */ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4728087c42a5..0da679411330 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1230,7 +1230,6 @@ EXPORT_SYMBOL(inet_unregister_protosw); static int inet_sk_reselect_saddr(struct sock *sk) { - struct inet_bind_hashbucket *prev_addr_hashbucket; struct inet_sock *inet = inet_sk(sk); __be32 old_saddr = inet->inet_saddr; __be32 daddr = inet->inet_daddr; @@ -1260,16 +1259,8 @@ static int inet_sk_reselect_saddr(struct sock *sk) return 0; } - prev_addr_hashbucket = - inet_bhashfn_portaddr(tcp_or_dccp_get_hashinfo(sk), sk, - sock_net(sk), inet->inet_num); - - inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; - - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &new_saddr, AF_INET); if (err) { - inet->inet_saddr = old_saddr; - inet->inet_rcv_saddr = old_saddr; ip_rt_put(rt); return err; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index d745f962745e..18ef370af113 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -858,14 +858,34 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)]; } -int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk) +static void inet_update_saddr(struct sock *sk, void *saddr, int family) +{ + if (family == AF_INET) { + inet_sk(sk)->inet_saddr = *(__be32 *)saddr; + sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr); + } +#if IS_ENABLED(CONFIG_IPV6) + else { + sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr; + } +#endif +} + +int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); - struct inet_bind_hashbucket *head2; int port = inet_sk(sk)->inet_num; struct net *net = sock_net(sk); + int bhash; + + if (!inet_csk(sk)->icsk_bind2_hash) { + /* Not bind()ed before. */ + inet_update_saddr(sk, saddr, family); + return 0; + } /* Allocate a bind2 bucket ahead of time to avoid permanently putting * the bhash2 table in an inconsistent state if a new tb2 bucket @@ -875,14 +895,25 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc if (!new_tb2) return -ENOMEM; + bhash = inet_bhashfn(net, port, hinfo->bhash_size); + head = &hinfo->bhash[bhash]; head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); - spin_lock_bh(&prev_saddr->lock); + /* If we change saddr locklessly, another thread + * iterating over bhash might see corrupted address. 
+ */ + spin_lock_bh(&head->lock); + + spin_lock(&head2->lock); __sk_del_bind2_node(sk); inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); - spin_unlock_bh(&prev_saddr->lock); + spin_unlock(&head2->lock); + + inet_update_saddr(sk, saddr, family); - spin_lock_bh(&head2->lock); + head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); + + spin_lock(&head2->lock); tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk); if (!tb2) { tb2 = new_tb2; @@ -890,7 +921,9 @@ int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct soc } sk_add_bind2_node(sk, &tb2->owners); inet_csk(sk)->icsk_bind2_hash = tb2; - spin_unlock_bh(&head2->lock); + spin_unlock(&head2->lock); + + spin_unlock_bh(&head->lock); if (tb2 != new_tb2) kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6a3a732b584d..23dd7e9df2d5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -199,15 +199,14 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; struct inet_timewait_death_row *tcp_death_row; - __be32 daddr, nexthop, prev_sk_rcv_saddr; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct ip_options_rcu *inet_opt; struct net *net = sock_net(sk); __be16 orig_sport, orig_dport; + __be32 daddr, nexthop; struct flowi4 *fl4; struct rtable *rt; int err; @@ -251,24 +250,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!inet->inet_saddr) { - if (inet_csk(sk)->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo, - sk, net, inet->inet_num); - prev_sk_rcv_saddr = sk->sk_rcv_saddr; - } - inet->inet_saddr = fl4->saddr; - } - - sk_rcv_saddr_set(sk, inet->inet_saddr); - - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); + err = inet_bhash2_update_saddr(sk, &fl4->saddr, AF_INET); if (err) { - inet->inet_saddr = 0; - sk_rcv_saddr_set(sk, prev_sk_rcv_saddr); ip_rt_put(rt); return err; } + } else { + sk_rcv_saddr_set(sk, inet->inet_saddr); } if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 81b396e5cf79..2f3ca3190d26 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -292,24 +292,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; if (!saddr) { - struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; - struct in6_addr prev_v6_rcv_saddr; - - if (icsk->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo, - sk, net, inet->inet_num); - prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr; - } saddr = &fl6.saddr; - sk->sk_v6_rcv_saddr = *saddr; - if (prev_addr_hashbucket) { - err = inet_bhash2_update_saddr(prev_addr_hashbucket, sk); - if (err) { - sk->sk_v6_rcv_saddr = prev_v6_rcv_saddr; - goto failure; - } - } + err = inet_bhash2_update_saddr(sk, saddr, AF_INET6); + if (err) + goto failure; } /* set the source address */ -- cgit v1.2.3 From e0833d1fedb02f038b526ae7dde178a076f56545 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 18 Nov 2022 17:49:14 -0800 Subject: dccp/tcp: Fixup bhash2 bucket 
when connect() fails. If a socket bound to a wildcard address fails to connect(), we only reset saddr and keep the port. Then, we have to fix up the bhash2 bucket; otherwise, the bucket has an inconsistent address in the list. Also, listen() for such a socket will fire the WARN_ON() in inet_csk_get_port(). [0] Note that when a system runs out of memory, we give up fixing the bucket and unlink sk from bhash and bhash2 by inet_put_port(). [0]: WARNING: CPU: 0 PID: 207 at net/ipv4/inet_connection_sock.c:548 inet_csk_get_port (net/ipv4/inet_connection_sock.c:548 (discriminator 1)) Modules linked in: CPU: 0 PID: 207 Comm: bhash2_prev_rep Not tainted 6.1.0-rc3-00799-gc8421681c845 #63 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-1.amzn2022.0.1 04/01/2014 RIP: 0010:inet_csk_get_port (net/ipv4/inet_connection_sock.c:548 (discriminator 1)) Code: 74 a7 eb 93 48 8b 54 24 18 0f b7 cb 4c 89 e6 4c 89 ff e8 48 b2 ff ff 49 8b 87 18 04 00 00 e9 32 ff ff ff 0f 0b e9 34 ff ff ff <0f> 0b e9 42 ff ff ff 41 8b 7f 50 41 8b 4f 54 89 fe 81 f6 00 00 ff RSP: 0018:ffffc900003d7e50 EFLAGS: 00010202 RAX: ffff8881047fb500 RBX: 0000000000004e20 RCX: 0000000000000000 RDX: 000000000000000a RSI: 00000000fffffe00 RDI: 00000000ffffffff RBP: ffffffff8324dc00 R08: 0000000000000001 R09: 0000000000000001 R10: 0000000000000001 R11: 0000000000000001 R12: 0000000000000000 R13: 0000000000000001 R14: 0000000000004e20 R15: ffff8881054e1280 FS: 00007f8ac04dc740(0000) GS:ffff88842fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000020001540 CR3: 00000001055fa003 CR4: 0000000000770ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: inet_csk_listen_start (net/ipv4/inet_connection_sock.c:1205) inet_listen (net/ipv4/af_inet.c:228) __sys_listen (net/socket.c:1810) __x64_sys_listen (net/socket.c:1819 net/socket.c:1817 net/socket.c:1817) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) RIP: 0033:0x7f8ac051de5d Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 93 af 1b 00 f7 d8 64 89 01 48 RSP: 002b:00007ffc1c177248 EFLAGS: 00000206 ORIG_RAX: 0000000000000032 RAX: ffffffffffffffda RBX: 0000000020001550 RCX: 00007f8ac051de5d RDX: ffffffffffffff80 RSI: 0000000000000000 RDI: 0000000000000004 RBP: 00007ffc1c177270 R08: 0000000000000018 R09: 0000000000000007 R10: 0000000020001540 R11: 0000000000000206 R12: 00007ffc1c177388 R13: 0000000000401169 R14: 0000000000403e18 R15: 00007f8ac0723000 Fixes: 28044fc1d495 ("net: Add a bhash2 table hashed by port and address") Reported-by: syzbot Reported-by: Mat Martineau Signed-off-by: Kuniyuki Iwashima Acked-by: Joanne Koong Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 1 + net/dccp/ipv4.c | 3 +-- net/dccp/ipv6.c | 3 +-- net/dccp/proto.c | 3 +-- net/ipv4/inet_hashtables.c | 38 ++++++++++++++++++++++++++++++++++---- net/ipv4/tcp.c | 3 +-- net/ipv4/tcp_ipv4.c | 3 +-- net/ipv6/tcp_ipv6.c | 3 +-- 8 files changed, 41 insertions(+), 16 deletions(-) (limited to 'include/net') diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index ba06e8b52264..69174093078f 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -282,6 +282,7 @@ 
inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in * rcv_saddr field should already have been updated when this is called. */ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family); +void inet_bhash2_reset_saddr(struct sock *sk); void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, struct inet_bind2_bucket *tb2, unsigned short port); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 95e376e3b911..b780827f5e0a 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -143,8 +143,7 @@ failure: * This unhashes the socket and releases the local port, if necessary. */ dccp_set_state(sk, DCCP_CLOSED); - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 94c101ed57a9..602f3432d80b 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -970,8 +970,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: dccp_set_state(sk, DCCP_CLOSED); - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); __sk_dst_reset(sk); failure: inet->inet_dport = 0; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index c548ca3e9b0e..85e35c5e8890 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -279,8 +279,7 @@ int dccp_disconnect(struct sock *sk, int flags) inet->inet_dport = 0; - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 18ef370af113..3cec471a2cd2 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -871,7 +871,7 @@ static void inet_update_saddr(struct sock *sk, void *saddr, int family) #endif } -int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) +static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset) { struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_bind_hashbucket *head, *head2; @@ -883,7 +883,11 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) if (!inet_csk(sk)->icsk_bind2_hash) { /* Not bind()ed before. */ - inet_update_saddr(sk, saddr, family); + if (reset) + inet_reset_saddr(sk); + else + inet_update_saddr(sk, saddr, family); + return 0; } @@ -892,8 +896,19 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) * allocation fails. */ new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC); - if (!new_tb2) + if (!new_tb2) { + if (reset) { + /* The (INADDR_ANY, port) bucket might have already + * been freed, then we cannot fixup icsk_bind2_hash, + * so we give up and unlink sk from bhash/bhash2 not + * to leave inconsistency in bhash2. 
+ */ + inet_put_port(sk); + inet_reset_saddr(sk); + } + return -ENOMEM; + } bhash = inet_bhashfn(net, port, hinfo->bhash_size); head = &hinfo->bhash[bhash]; @@ -909,7 +924,10 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash); spin_unlock(&head2->lock); - inet_update_saddr(sk, saddr, family); + if (reset) + inet_reset_saddr(sk); + else + inet_update_saddr(sk, saddr, family); head2 = inet_bhashfn_portaddr(hinfo, sk, net, port); @@ -930,8 +948,20 @@ int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) return 0; } + +int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family) +{ + return __inet_bhash2_update_saddr(sk, saddr, family, false); +} EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr); +void inet_bhash2_reset_saddr(struct sock *sk) +{ + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + __inet_bhash2_update_saddr(sk, NULL, 0, true); +} +EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr); + /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm * Note that we use 32bit integers (vs RFC 'short integers') * because 2^16 is not a multiple of num_ephemeral and this diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 54836a6b81d6..4f2205756cfe 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3114,8 +3114,7 @@ int tcp_disconnect(struct sock *sk, int flags) inet->inet_dport = 0; - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); sk->sk_shutdown = 0; sock_reset_flag(sk, SOCK_DONE); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 23dd7e9df2d5..da46357f501b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -331,8 +331,7 @@ failure: * if necessary. */ tcp_set_state(sk, TCP_CLOSE); - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); ip_rt_put(rt); sk->sk_route_caps = 0; inet->inet_dport = 0; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2f3ca3190d26..f0548dbcabd2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -346,8 +346,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, late_failure: tcp_set_state(sk, TCP_CLOSE); - if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) - inet_reset_saddr(sk); + inet_bhash2_reset_saddr(sk); failure: inet->inet_dport = 0; sk->sk_route_caps = 0; -- cgit v1.2.3 From c5fb8ead3283955dc68671f853017b181f96fdc1 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:39 +0200 Subject: net: dsa: unexport dsa_dev_to_net_device() dsa.o and dsa2.o are linked into the same dsa_core.o, there is no reason to export this symbol when its only caller is local. 
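As an illustration of the pattern (with hypothetical foo_* names, not the actual DSA code): once every caller of a helper is linked into the same object file, its declaration can move into a module-private header and the EXPORT_SYMBOL_GPL() marker can simply be dropped, because the symbol now resolves at link time inside foo_core.o:

/* foo_priv.h - seen only by files linked into foo_core.o */
struct net_device *foo_dev_to_net_device(struct device *dev);

/* foo.c */
#include <linux/device.h>
#include <linux/netdevice.h>
#include "foo_priv.h"

struct net_device *foo_dev_to_net_device(struct device *dev)
{
	/* Placeholder body for the sketch; the real helper first
	 * validates that @dev really is backed by a net_device.
	 */
	return dev ? to_net_dev(dev) : NULL;
}
/* EXPORT_SYMBOL_GPL(foo_dev_to_net_device); -- removed: code outside
 * foo_core.o can no longer resolve the symbol.
 */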
Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/net/dsa.h | 2 -- net/dsa/dsa.c | 1 - net/dsa/dsa_priv.h | 2 ++ 3 files changed, 2 insertions(+), 3 deletions(-) (limited to 'include/net') diff --git a/include/net/dsa.h b/include/net/dsa.h index 82da44561f4c..d5bfcb63d4c2 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1285,8 +1285,6 @@ struct dsa_switch_driver { const struct dsa_switch_ops *ops; }; -struct net_device *dsa_dev_to_net_device(struct device *dev); - bool dsa_fdb_present_in_other_db(struct dsa_switch *ds, int port, const unsigned char *addr, u16 vid, struct dsa_db db); diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index 4afd3edbd64d..07158c7560b5 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -182,7 +182,6 @@ struct net_device *dsa_dev_to_net_device(struct device *dev) return NULL; } -EXPORT_SYMBOL_GPL(dsa_dev_to_net_device); /* Determine if we should defer delivery of skb until we have a rx timestamp. * diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 24e0ea218a35..b60987e8d931 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -247,6 +247,8 @@ const struct dsa_device_ops *dsa_tag_driver_get_by_id(int tag_protocol); const struct dsa_device_ops *dsa_tag_driver_get_by_name(const char *name); void dsa_tag_driver_put(const struct dsa_device_ops *ops); +struct net_device *dsa_dev_to_net_device(struct device *dev); + bool dsa_db_equal(const struct dsa_db *a, const struct dsa_db *b); bool dsa_schedule_work(struct work_struct *work); -- cgit v1.2.3 From 19d05ea712ecbbb67d302664da5ec58b37b9aece Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 21 Nov 2022 15:55:54 +0200 Subject: net: dsa: move tag_8021q headers to their proper place tag_8021q definitions are all over the place. Some are exported to linux/dsa/8021q.h (visible by DSA core, taggers, switch drivers and everyone else), and some are in dsa_priv.h. Move the structures that don't need external visibility into tag_8021q.c, and the ones which don't need the world or switch drivers to see them into tag_8021q.h. We also have the tag_8021q.h inclusion from switch.c, which is basically the entire reason why tag_8021q.c was built into DSA in commit 8b6e638b4be2 ("net: dsa: build tag_8021q.c as part of DSA core"). I still don't know how to better deal with that, so leave it alone. 
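The move works because include/net/dsa.h now only needs the forward declaration "struct dsa_8021q_context;": a header that merely passes pointers around never needs the full definition. Below is a minimal sketch of that opaque-type idiom, using hypothetical foo_* names and plain userspace C allocators rather than kernel ones:

/* foo.h - public: the type is opaque, only pointers cross this boundary */
struct foo_ctx;
struct foo_ctx *foo_ctx_create(void);
void foo_ctx_destroy(struct foo_ctx *ctx);

/* foo.c - private: the layout can change without touching any user */
#include <stdlib.h>
#include "foo.h"

struct foo_ctx {
	int refcount;
};

struct foo_ctx *foo_ctx_create(void)
{
	struct foo_ctx *ctx = calloc(1, sizeof(*ctx));

	if (ctx)
		ctx->refcount = 1;
	return ctx;
}

void foo_ctx_destroy(struct foo_ctx *ctx)
{
	free(ctx);
}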
Signed-off-by: Vladimir Oltean Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- include/linux/dsa/8021q.h | 31 +------------------------------ include/net/dsa.h | 1 + net/dsa/dsa_priv.h | 8 -------- net/dsa/port.c | 1 + net/dsa/switch.c | 1 + net/dsa/tag_8021q.c | 15 +++++++++++++++ net/dsa/tag_8021q.h | 27 +++++++++++++++++++++++++++ net/dsa/tag_ocelot_8021q.c | 1 + net/dsa/tag_sja1105.c | 1 + 9 files changed, 48 insertions(+), 38 deletions(-) create mode 100644 net/dsa/tag_8021q.h (limited to 'include/net') diff --git a/include/linux/dsa/8021q.h b/include/linux/dsa/8021q.h index 3ed117e299ec..f3664ee12170 100644 --- a/include/linux/dsa/8021q.h +++ b/include/linux/dsa/8021q.h @@ -5,28 +5,8 @@ #ifndef _NET_DSA_8021Q_H #define _NET_DSA_8021Q_H -#include -#include #include - -struct dsa_switch; -struct dsa_port; -struct sk_buff; -struct net_device; - -struct dsa_tag_8021q_vlan { - struct list_head list; - int port; - u16 vid; - refcount_t refcount; -}; - -struct dsa_8021q_context { - struct dsa_switch *ds; - struct list_head vlans; - /* EtherType of RX VID, used for filtering on master interface */ - __be16 proto; -}; +#include int dsa_tag_8021q_register(struct dsa_switch *ds, __be16 proto); @@ -38,15 +18,6 @@ int dsa_tag_8021q_bridge_join(struct dsa_switch *ds, int port, void dsa_tag_8021q_bridge_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge); -struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, - u16 tpid, u16 tci); - -void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, - int *vbid); - -struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, - int vbid); - u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num); u16 dsa_tag_8021q_standalone_vid(const struct dsa_port *dp); diff --git a/include/net/dsa.h b/include/net/dsa.h index d5bfcb63d4c2..96086289aa9b 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -22,6 +22,7 @@ #include #include +struct dsa_8021q_context; struct tc_action; struct phy_device; struct fixed_phy_status; diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index aa685d2309e0..265659954ffd 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -13,15 +13,7 @@ #define DSA_MAX_NUM_OFFLOADING_BRIDGES BITS_PER_LONG -struct dsa_notifier_tag_8021q_vlan_info; - /* netlink.c */ extern struct rtnl_link_ops dsa_link_ops __read_mostly; -/* tag_8021q.c */ -int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, - struct dsa_notifier_tag_8021q_vlan_info *info); -int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, - struct dsa_notifier_tag_8021q_vlan_info *info); - #endif diff --git a/net/dsa/port.c b/net/dsa/port.c index e6d5c05b41b4..67ad1adec2a2 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -16,6 +16,7 @@ #include "port.h" #include "slave.h" #include "switch.h" +#include "tag_8021q.h" /** * dsa_port_notify - Notify the switching fabric of changes to a port diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 4420af0081af..e53cc0c3c933 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -17,6 +17,7 @@ #include "port.h" #include "slave.h" #include "switch.h" +#include "tag_8021q.h" static unsigned int dsa_switch_fastest_ageing_time(struct dsa_switch *ds, unsigned int ageing_time) diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index abd994dc76d5..ac2eb933106e 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -11,6 +11,7 @@ #include "port.h" #include "switch.h" #include "tag.h" +#include "tag_8021q.h" /* Binary structure of the fake 
12-bit VID field (when the TPID is * ETH_P_DSA_8021Q): @@ -63,6 +64,20 @@ #define DSA_8021Q_PORT(x) (((x) << DSA_8021Q_PORT_SHIFT) & \ DSA_8021Q_PORT_MASK) +struct dsa_tag_8021q_vlan { + struct list_head list; + int port; + u16 vid; + refcount_t refcount; +}; + +struct dsa_8021q_context { + struct dsa_switch *ds; + struct list_head vlans; + /* EtherType of RX VID, used for filtering on master interface */ + __be16 proto; +}; + u16 dsa_tag_8021q_bridge_vid(unsigned int bridge_num) { /* The VBID value of 0 is reserved for precise TX, but it is also diff --git a/net/dsa/tag_8021q.h b/net/dsa/tag_8021q.h new file mode 100644 index 000000000000..b75cbaa028ef --- /dev/null +++ b/net/dsa/tag_8021q.h @@ -0,0 +1,27 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ + +#ifndef __DSA_TAG_8021Q_H +#define __DSA_TAG_8021Q_H + +#include + +#include "switch.h" + +struct sk_buff; +struct net_device; + +struct sk_buff *dsa_8021q_xmit(struct sk_buff *skb, struct net_device *netdev, + u16 tpid, u16 tci); + +void dsa_8021q_rcv(struct sk_buff *skb, int *source_port, int *switch_id, + int *vbid); + +struct net_device *dsa_tag_8021q_find_port_by_vbid(struct net_device *master, + int vbid); + +int dsa_switch_tag_8021q_vlan_add(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info); +int dsa_switch_tag_8021q_vlan_del(struct dsa_switch *ds, + struct dsa_notifier_tag_8021q_vlan_info *info); + +#endif diff --git a/net/dsa/tag_ocelot_8021q.c b/net/dsa/tag_ocelot_8021q.c index 7f0c2d71e89b..1f0b8c20eba5 100644 --- a/net/dsa/tag_ocelot_8021q.c +++ b/net/dsa/tag_ocelot_8021q.c @@ -12,6 +12,7 @@ #include #include "tag.h" +#include "tag_8021q.h" #define OCELOT_8021Q_NAME "ocelot-8021q" diff --git a/net/dsa/tag_sja1105.c b/net/dsa/tag_sja1105.c index 8f581617e15c..f14f51b41491 100644 --- a/net/dsa/tag_sja1105.c +++ b/net/dsa/tag_sja1105.c @@ -7,6 +7,7 @@ #include #include "tag.h" +#include "tag_8021q.h" #define SJA1105_NAME "sja1105" #define SJA1110_NAME "sja1110" -- cgit v1.2.3 From bd12240337f43522b99c43f8976af34c712b5f57 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 24 Nov 2022 15:43:42 +0100 Subject: xfrm: add extack to xfrm_do_migrate Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 3 ++- net/key/af_key.c | 2 +- net/xfrm/xfrm_policy.c | 28 ++++++++++++++++++++-------- net/xfrm/xfrm_user.c | 16 +++++++++++----- 4 files changed, 34 insertions(+), 15 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index dbc81f5eb553..576566bd0be9 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1703,7 +1703,8 @@ struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap, u32 if_id); + struct xfrm_encap_tmpl *encap, u32 if_id, + struct netlink_ext_ack *extack); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport); diff --git a/net/key/af_key.c b/net/key/af_key.c index c85df5b958d2..7f4ff5fe2257 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -2626,7 +2626,7 @@ static int pfkey_migrate(struct sock *sk, struct sk_buff *skb, } return xfrm_migrate(&sel, dir, XFRM_POLICY_TYPE_MAIN, m, i, - kma ? &k : NULL, net, NULL, 0); + kma ? 
&k : NULL, net, NULL, 0, NULL); out: return err; diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index a049f91d4446..9b9e2765363d 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -4333,7 +4333,8 @@ static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tm /* update endpoint address(es) of template(s) */ static int xfrm_policy_migrate(struct xfrm_policy *pol, - struct xfrm_migrate *m, int num_migrate) + struct xfrm_migrate *m, int num_migrate, + struct netlink_ext_ack *extack) { struct xfrm_migrate *mp; int i, j, n = 0; @@ -4341,6 +4342,7 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, write_lock_bh(&pol->lock); if (unlikely(pol->walk.dead)) { /* target policy has been deleted */ + NL_SET_ERR_MSG(extack, "Target policy not found"); write_unlock_bh(&pol->lock); return -ENOENT; } @@ -4372,17 +4374,22 @@ static int xfrm_policy_migrate(struct xfrm_policy *pol, return 0; } -static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) +static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate, + struct netlink_ext_ack *extack) { int i, j; - if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) + if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) { + NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)"); return -EINVAL; + } for (i = 0; i < num_migrate; i++) { if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) || - xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) + xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) { + NL_SET_ERR_MSG(extack, "Addresses in the MIGRATE attribute's list cannot be null"); return -EINVAL; + } /* check if there is any duplicated entry */ for (j = i + 1; j < num_migrate; j++) { @@ -4393,8 +4400,10 @@ static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) m[i].proto == m[j].proto && m[i].mode == m[j].mode && m[i].reqid == m[j].reqid && - m[i].old_family == m[j].old_family) + m[i].old_family == m[j].old_family) { + NL_SET_ERR_MSG(extack, "Entries in the MIGRATE attribute's list must be unique"); return -EINVAL; + } } } @@ -4404,7 +4413,8 @@ static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap, u32 if_id) + struct xfrm_encap_tmpl *encap, u32 if_id, + struct netlink_ext_ack *extack) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; @@ -4414,11 +4424,12 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *mp; /* Stage 0 - sanity checks */ - err = xfrm_migrate_check(m, num_migrate); + err = xfrm_migrate_check(m, num_migrate, extack); if (err < 0) goto out; if (dir >= XFRM_POLICY_MAX) { + NL_SET_ERR_MSG(extack, "Invalid policy direction"); err = -EINVAL; goto out; } @@ -4426,6 +4437,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, /* Stage 1 - find policy */ pol = xfrm_migrate_policy_find(sel, dir, type, net, if_id); if (!pol) { + NL_SET_ERR_MSG(extack, "Target policy not found"); err = -ENOENT; goto out; } @@ -4447,7 +4459,7 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* Stage 3 - update policy */ - err = xfrm_policy_migrate(pol, m, num_migrate); + err = xfrm_policy_migrate(pol, m, num_migrate, extack); if (err < 0) goto restore_state; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 
13607df4f30d..c5d6a92d73cb 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -2687,7 +2687,8 @@ nomem: #ifdef CONFIG_XFRM_MIGRATE static int copy_from_user_migrate(struct xfrm_migrate *ma, struct xfrm_kmaddress *k, - struct nlattr **attrs, int *num) + struct nlattr **attrs, int *num, + struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_MIGRATE]; struct xfrm_user_migrate *um; @@ -2706,8 +2707,10 @@ static int copy_from_user_migrate(struct xfrm_migrate *ma, um = nla_data(rt); num_migrate = nla_len(rt) / sizeof(*um); - if (num_migrate <= 0 || num_migrate > XFRM_MAX_DEPTH) + if (num_migrate <= 0 || num_migrate > XFRM_MAX_DEPTH) { + NL_SET_ERR_MSG(extack, "Invalid number of SAs to migrate, must be 0 < num <= XFRM_MAX_DEPTH (6)"); return -EINVAL; + } for (i = 0; i < num_migrate; i++, um++, ma++) { memcpy(&ma->old_daddr, &um->old_daddr, sizeof(ma->old_daddr)); @@ -2740,8 +2743,10 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_encap_tmpl *encap = NULL; u32 if_id = 0; - if (!attrs[XFRMA_MIGRATE]) + if (!attrs[XFRMA_MIGRATE]) { + NL_SET_ERR_MSG(extack, "Missing required MIGRATE attribute"); return -EINVAL; + } kmp = attrs[XFRMA_KMADDRESS] ? &km : NULL; @@ -2749,7 +2754,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, if (err) return err; - err = copy_from_user_migrate(m, kmp, attrs, &n); + err = copy_from_user_migrate(m, kmp, attrs, &n, extack); if (err) return err; @@ -2766,7 +2771,8 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, if (attrs[XFRMA_IF_ID]) if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, if_id); + err = xfrm_migrate(&pi->sel, pi->dir, type, m, n, kmp, net, encap, + if_id, extack); kfree(encap); -- cgit v1.2.3 From c2dad11e0466a27d40041845cf63cdfb4fbd991f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Thu, 24 Nov 2022 15:43:43 +0100 Subject: xfrm: add extack to xfrm_alloc_userspi Signed-off-by: Sabrina Dubroca Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 5 +++-- net/key/af_key.c | 4 ++-- net/xfrm/xfrm_state.c | 21 ++++++++++++++++----- net/xfrm/xfrm_user.c | 8 +++++--- 4 files changed, 26 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 576566bd0be9..e0cc6791c001 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1681,8 +1681,9 @@ struct xfrm_policy *xfrm_policy_byid(struct net *net, int xfrm_policy_flush(struct net *net, u8 type, bool task_valid); void xfrm_policy_hash_rebuild(struct net *net); u32 xfrm_get_acqseq(void); -int verify_spi_info(u8 proto, u32 min, u32 max); -int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi); +int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack); +int xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi, + struct netlink_ext_ack *extack); struct xfrm_state *xfrm_find_acq(struct net *net, const struct xfrm_mark *mark, u8 mode, u32 reqid, u32 if_id, u8 proto, const xfrm_address_t *daddr, diff --git a/net/key/af_key.c b/net/key/af_key.c index 7f4ff5fe2257..e1d2155605aa 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1377,13 +1377,13 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ max_spi = range->sadb_spirange_max; } - err = verify_spi_info(x->id.proto, min_spi, max_spi); + err = verify_spi_info(x->id.proto, min_spi, max_spi, NULL); if (err) { xfrm_state_put(x); return err; } - err = 
xfrm_alloc_spi(x, min_spi, max_spi); + err = xfrm_alloc_spi(x, min_spi, max_spi, NULL); resp_skb = err ? ERR_PTR(err) : pfkey_xfrm_state2msg(x); if (IS_ERR(resp_skb)) { diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 81df34b3da6e..d0ae17e3bb38 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2017,7 +2017,7 @@ u32 xfrm_get_acqseq(void) } EXPORT_SYMBOL(xfrm_get_acqseq); -int verify_spi_info(u8 proto, u32 min, u32 max) +int verify_spi_info(u8 proto, u32 min, u32 max, struct netlink_ext_ack *extack) { switch (proto) { case IPPROTO_AH: @@ -2026,22 +2026,28 @@ int verify_spi_info(u8 proto, u32 min, u32 max) case IPPROTO_COMP: /* IPCOMP spi is 16-bits. */ - if (max >= 0x10000) + if (max >= 0x10000) { + NL_SET_ERR_MSG(extack, "IPCOMP SPI must be <= 65535"); return -EINVAL; + } break; default: + NL_SET_ERR_MSG(extack, "Invalid protocol, must be one of AH, ESP, IPCOMP"); return -EINVAL; } - if (min > max) + if (min > max) { + NL_SET_ERR_MSG(extack, "Invalid SPI range: min > max"); return -EINVAL; + } return 0; } EXPORT_SYMBOL(verify_spi_info); -int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) +int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high, + struct netlink_ext_ack *extack) { struct net *net = xs_net(x); unsigned int h; @@ -2053,8 +2059,10 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) u32 mark = x->mark.v & x->mark.m; spin_lock_bh(&x->lock); - if (x->km.state == XFRM_STATE_DEAD) + if (x->km.state == XFRM_STATE_DEAD) { + NL_SET_ERR_MSG(extack, "Target ACQUIRE is in DEAD state"); goto unlock; + } err = 0; if (x->id.spi) @@ -2065,6 +2073,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) if (minspi == maxspi) { x0 = xfrm_state_lookup(net, mark, &x->id.daddr, minspi, x->id.proto, x->props.family); if (x0) { + NL_SET_ERR_MSG(extack, "Requested SPI is already in use"); xfrm_state_put(x0); goto unlock; } @@ -2089,6 +2098,8 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) spin_unlock_bh(&net->xfrm.xfrm_state_lock); err = 0; + } else { + NL_SET_ERR_MSG(extack, "No SPI available in the requested range"); } unlock: diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c5d6a92d73cb..5c280e04e02c 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1523,7 +1523,7 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, u32 if_id = 0; p = nlmsg_data(nlh); - err = verify_spi_info(p->info.id.proto, p->min, p->max); + err = verify_spi_info(p->info.id.proto, p->min, p->max, extack); if (err) goto out_noput; @@ -1551,10 +1551,12 @@ static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, &p->info.saddr, 1, family); err = -ENOENT; - if (!x) + if (!x) { + NL_SET_ERR_MSG(extack, "Target ACQUIRE not found"); goto out_noput; + } - err = xfrm_alloc_spi(x, p->min, p->max); + err = xfrm_alloc_spi(x, p->min, p->max, extack); if (err) goto out; -- cgit v1.2.3 From 51147284eb7d685a689a5d1b7772faec278a2338 Mon Sep 17 00:00:00 2001 From: Miquel Raynal Date: Tue, 29 Nov 2022 14:55:34 +0100 Subject: ieee802154: Advertize coordinators discovery Let's introduce the basics for advertizing discovered PANs and coordinators, which is: - A new "scan" netlink message group. - A couple of netlink command/attribute. - The main netlink helper to send a netlink message with all the necessary information to forward the main information to the user. Two netlink attributes are proactively added to support future UWB complex channels, but are not actually used yet. 
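To give an idea of the intended flow, here is a hedged sketch (not part of this patch) of how a MAC-layer receive path could fill a coordinator descriptor from a parsed beacon and hand it to the new helper; the function name and the way the field values are obtained are hypothetical:

static int mac802154_report_coord(struct wpan_phy *wpan_phy,
				  struct wpan_dev *wpan_dev,
				  const struct ieee802154_addr *src,
				  u16 superframe_spec, u8 lqi,
				  bool gts_permit)
{
	struct ieee802154_coord_desc desc = {
		.addr = *src,		/* PAN ID + coordinator address */
		.page = wpan_phy->current_page,
		.channel = wpan_phy->current_channel,
		.superframe_spec = superframe_spec,
		.link_quality = lqi,
		.gts_permit = gts_permit,
	};

	/* Multicasts a NL802154_CMD_SCAN_EVENT to the new "scan" group */
	return nl802154_scan_event(wpan_phy, wpan_dev, &desc);
}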
Co-developed-by: David Girault Signed-off-by: David Girault Signed-off-by: Miquel Raynal Acked-by: Alexander Aring Link: https://lore.kernel.org/r/20221129135535.532513-2-miquel.raynal@bootlin.com Signed-off-by: Stefan Schmidt --- include/net/cfg802154.h | 18 ++++++++ include/net/nl802154.h | 43 +++++++++++++++++++ net/ieee802154/nl802154.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++ net/ieee802154/nl802154.h | 2 + 4 files changed, 166 insertions(+) (limited to 'include/net') diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index e1481f9cf049..d09c393d229f 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -260,6 +260,24 @@ struct ieee802154_addr { }; }; +/** + * struct ieee802154_coord_desc - Coordinator descriptor + * @addr: PAN ID and coordinator address + * @page: page this coordinator is using + * @channel: channel this coordinator is using + * @superframe_spec: SuperFrame specification as received + * @link_quality: link quality indicator at which the beacon was received + * @gts_permit: the coordinator accepts GTS requests + */ +struct ieee802154_coord_desc { + struct ieee802154_addr addr; + u8 page; + u8 channel; + u16 superframe_spec; + u8 link_quality; + bool gts_permit; +}; + struct ieee802154_llsec_key_id { u8 mode; u8 id; diff --git a/include/net/nl802154.h b/include/net/nl802154.h index f5850b569c52..b79a89d5207c 100644 --- a/include/net/nl802154.h +++ b/include/net/nl802154.h @@ -72,6 +72,8 @@ enum nl802154_commands { NL802154_CMD_NEW_SEC_LEVEL, NL802154_CMD_DEL_SEC_LEVEL, + NL802154_CMD_SCAN_EVENT, + /* add new commands above here */ /* used to define NL802154_CMD_MAX below */ @@ -131,6 +133,8 @@ enum nl802154_attrs { NL802154_ATTR_PID, NL802154_ATTR_NETNS_FD, + NL802154_ATTR_COORDINATOR, + /* add attributes here, update the policy in nl802154.c */ #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL @@ -216,6 +220,45 @@ enum nl802154_wpan_phy_capability_attr { NL802154_CAP_ATTR_MAX = __NL802154_CAP_ATTR_AFTER_LAST - 1 }; +/** + * enum nl802154_coord - Netlink attributes for a coord + * + * @__NL802154_COORD_INVALID: invalid + * @NL802154_COORD_PANID: PANID of the coordinator (2 bytes) + * @NL802154_COORD_ADDR: coordinator address, (8 bytes or 2 bytes) + * @NL802154_COORD_CHANNEL: channel number, related to @NL802154_COORD_PAGE (u8) + * @NL802154_COORD_PAGE: channel page, related to @NL802154_COORD_CHANNEL (u8) + * @NL802154_COORD_PREAMBLE_CODE: Preamble code used when the beacon was received, + * this is PHY dependent and optional (u8) + * @NL802154_COORD_MEAN_PRF: Mean PRF used when the beacon was received, + * this is PHY dependent and optional (u8) + * @NL802154_COORD_SUPERFRAME_SPEC: superframe specification of the PAN (u16) + * @NL802154_COORD_LINK_QUALITY: signal quality of beacon in unspecified units, + * scaled to 0..255 (u8) + * @NL802154_COORD_GTS_PERMIT: set to true if GTS is permitted on this PAN + * @NL802154_COORD_PAYLOAD_DATA: binary data containing the raw data from the + * frame payload, (only if beacon or probe response had data) + * @NL802154_COORD_PAD: attribute used for padding for 64-bit alignment + * @NL802154_COORD_MAX: highest coordinator attribute + */ +enum nl802154_coord { + __NL802154_COORD_INVALID, + NL802154_COORD_PANID, + NL802154_COORD_ADDR, + NL802154_COORD_CHANNEL, + NL802154_COORD_PAGE, + NL802154_COORD_PREAMBLE_CODE, + NL802154_COORD_MEAN_PRF, + NL802154_COORD_SUPERFRAME_SPEC, + NL802154_COORD_LINK_QUALITY, + NL802154_COORD_GTS_PERMIT, + NL802154_COORD_PAYLOAD_DATA, + NL802154_COORD_PAD, + + /* keep 
last */ + NL802154_COORD_MAX, +}; + /** * enum nl802154_cca_modes - cca modes * diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index 38c4f3cb010e..80dc73182785 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -26,10 +26,12 @@ static struct genl_family nl802154_fam; /* multicast groups */ enum nl802154_multicast_groups { NL802154_MCGRP_CONFIG, + NL802154_MCGRP_SCAN, }; static const struct genl_multicast_group nl802154_mcgrps[] = { [NL802154_MCGRP_CONFIG] = { .name = "config", }, + [NL802154_MCGRP_SCAN] = { .name = "scan", }, }; /* returns ERR_PTR values */ @@ -216,6 +218,9 @@ static const struct nla_policy nl802154_policy[NL802154_ATTR_MAX+1] = { [NL802154_ATTR_PID] = { .type = NLA_U32 }, [NL802154_ATTR_NETNS_FD] = { .type = NLA_U32 }, + + [NL802154_ATTR_COORDINATOR] = { .type = NLA_NESTED }, + #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL [NL802154_ATTR_SEC_ENABLED] = { .type = NLA_U8, }, [NL802154_ATTR_SEC_OUT_LEVEL] = { .type = NLA_U32, }, @@ -1281,6 +1286,104 @@ static int nl802154_wpan_phy_netns(struct sk_buff *skb, struct genl_info *info) return err; } +static int nl802154_prep_scan_event_msg(struct sk_buff *msg, + struct cfg802154_registered_device *rdev, + struct wpan_dev *wpan_dev, + u32 portid, u32 seq, int flags, u8 cmd, + struct ieee802154_coord_desc *desc) +{ + struct nlattr *nla; + void *hdr; + + hdr = nl802154hdr_put(msg, portid, seq, flags, cmd); + if (!hdr) + return -ENOBUFS; + + if (nla_put_u32(msg, NL802154_ATTR_WPAN_PHY, rdev->wpan_phy_idx)) + goto nla_put_failure; + + if (wpan_dev->netdev && + nla_put_u32(msg, NL802154_ATTR_IFINDEX, wpan_dev->netdev->ifindex)) + goto nla_put_failure; + + if (nla_put_u64_64bit(msg, NL802154_ATTR_WPAN_DEV, + wpan_dev_id(wpan_dev), NL802154_ATTR_PAD)) + goto nla_put_failure; + + nla = nla_nest_start_noflag(msg, NL802154_ATTR_COORDINATOR); + if (!nla) + goto nla_put_failure; + + if (nla_put(msg, NL802154_COORD_PANID, IEEE802154_PAN_ID_LEN, + &desc->addr.pan_id)) + goto nla_put_failure; + + if (desc->addr.mode == IEEE802154_ADDR_SHORT) { + if (nla_put(msg, NL802154_COORD_ADDR, + IEEE802154_SHORT_ADDR_LEN, + &desc->addr.short_addr)) + goto nla_put_failure; + } else { + if (nla_put(msg, NL802154_COORD_ADDR, + IEEE802154_EXTENDED_ADDR_LEN, + &desc->addr.extended_addr)) + goto nla_put_failure; + } + + if (nla_put_u8(msg, NL802154_COORD_CHANNEL, desc->channel)) + goto nla_put_failure; + + if (nla_put_u8(msg, NL802154_COORD_PAGE, desc->page)) + goto nla_put_failure; + + if (nla_put_u16(msg, NL802154_COORD_SUPERFRAME_SPEC, + desc->superframe_spec)) + goto nla_put_failure; + + if (nla_put_u8(msg, NL802154_COORD_LINK_QUALITY, desc->link_quality)) + goto nla_put_failure; + + if (desc->gts_permit && nla_put_flag(msg, NL802154_COORD_GTS_PERMIT)) + goto nla_put_failure; + + /* TODO: NL802154_COORD_PAYLOAD_DATA if any */ + + nla_nest_end(msg, nla); + + genlmsg_end(msg, hdr); + + return 0; + + nla_put_failure: + genlmsg_cancel(msg, hdr); + + return -EMSGSIZE; +} + +int nl802154_scan_event(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + struct ieee802154_coord_desc *desc) +{ + struct cfg802154_registered_device *rdev = wpan_phy_to_rdev(wpan_phy); + struct sk_buff *msg; + int ret; + + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); + if (!msg) + return -ENOMEM; + + ret = nl802154_prep_scan_event_msg(msg, rdev, wpan_dev, 0, 0, 0, + NL802154_CMD_SCAN_EVENT, + desc); + if (ret < 0) { + nlmsg_free(msg); + return ret; + } + + return genlmsg_multicast_netns(&nl802154_fam, wpan_phy_net(wpan_phy), + msg, 0, 
NL802154_MCGRP_SCAN, GFP_ATOMIC); +} +EXPORT_SYMBOL_GPL(nl802154_scan_event); + #ifdef CONFIG_IEEE802154_NL802154_EXPERIMENTAL static const struct nla_policy nl802154_dev_addr_policy[NL802154_DEV_ADDR_ATTR_MAX + 1] = { [NL802154_DEV_ADDR_ATTR_PAN_ID] = { .type = NLA_U16 }, diff --git a/net/ieee802154/nl802154.h b/net/ieee802154/nl802154.h index 8c4b6d08954c..89b805500032 100644 --- a/net/ieee802154/nl802154.h +++ b/net/ieee802154/nl802154.h @@ -4,5 +4,7 @@ int nl802154_init(void); void nl802154_exit(void); +int nl802154_scan_event(struct wpan_phy *wpan_phy, struct wpan_dev *wpan_dev, + struct ieee802154_coord_desc *desc); #endif /* __IEEE802154_NL802154_H */ -- cgit v1.2.3 From 9ed7bfc79542119ac0a9e1ce8a2a5285e43433e9 Mon Sep 17 00:00:00 2001 From: Zhengchao Shao Date: Sat, 26 Nov 2022 11:17:20 +0800 Subject: sctp: fix memory leak in sctp_stream_outq_migrate() When sctp_stream_outq_migrate() is called to release stream out resources, the memory pointed to by prio_head in stream out is not released. The memory leak information is as follows: unreferenced object 0xffff88801fe79f80 (size 64): comm "sctp_repo", pid 7957, jiffies 4294951704 (age 36.480s) hex dump (first 32 bytes): 80 9f e7 1f 80 88 ff ff 80 9f e7 1f 80 88 ff ff ................ 90 9f e7 1f 80 88 ff ff 90 9f e7 1f 80 88 ff ff ................ backtrace: [] kmalloc_trace+0x26/0x60 [] sctp_sched_prio_set+0x4cc/0x770 [] sctp_stream_init_ext+0xd2/0x1b0 [] sctp_sendmsg_to_asoc+0x1614/0x1a30 [] sctp_sendmsg+0xda1/0x1ef0 [] inet_sendmsg+0x9d/0xe0 [] sock_sendmsg+0xd3/0x120 [] __sys_sendto+0x23a/0x340 [] __x64_sys_sendto+0xe1/0x1b0 [] do_syscall_64+0x39/0xb0 [] entry_SYSCALL_64_after_hwframe+0x63/0xcd Link: https://syzkaller.appspot.com/bug?extid=29c402e56c4760763cc0 Fixes: 637784ade221 ("sctp: introduce priority based stream scheduler") Reported-by: syzbot+29c402e56c4760763cc0@syzkaller.appspotmail.com Signed-off-by: Zhengchao Shao Reviewed-by: Xin Long Link: https://lore.kernel.org/r/20221126031720.378562-1-shaozhengchao@huawei.com Signed-off-by: Jakub Kicinski --- include/net/sctp/stream_sched.h | 2 ++ net/sctp/stream.c | 25 ++++++++++++++++++------- net/sctp/stream_sched.c | 5 +++++ net/sctp/stream_sched_prio.c | 19 +++++++++++++++++++ net/sctp/stream_sched_rr.c | 5 +++++ 5 files changed, 49 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h index 01a70b27e026..65058faea4db 100644 --- a/include/net/sctp/stream_sched.h +++ b/include/net/sctp/stream_sched.h @@ -26,6 +26,8 @@ struct sctp_sched_ops { int (*init)(struct sctp_stream *stream); /* Init a stream */ int (*init_sid)(struct sctp_stream *stream, __u16 sid, gfp_t gfp); + /* free a stream */ + void (*free_sid)(struct sctp_stream *stream, __u16 sid); /* Frees the entire thing */ void (*free)(struct sctp_stream *stream); diff --git a/net/sctp/stream.c b/net/sctp/stream.c index ef9fceadef8d..ee6514af830f 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -52,6 +52,19 @@ static void sctp_stream_shrink_out(struct sctp_stream *stream, __u16 outcnt) } } +static void sctp_stream_free_ext(struct sctp_stream *stream, __u16 sid) +{ + struct sctp_sched_ops *sched; + + if (!SCTP_SO(stream, sid)->ext) + return; + + sched = sctp_sched_ops_from_stream(stream); + sched->free_sid(stream, sid); + kfree(SCTP_SO(stream, sid)->ext); + SCTP_SO(stream, sid)->ext = NULL; +} + /* Migrates chunks from stream queues to new stream queues if needed, * but not across associations.
Also, removes those chunks to streams * higher than the new max. @@ -70,16 +83,14 @@ static void sctp_stream_outq_migrate(struct sctp_stream *stream, * sctp_stream_update will swap ->out pointers. */ for (i = 0; i < outcnt; i++) { - kfree(SCTP_SO(new, i)->ext); + sctp_stream_free_ext(new, i); SCTP_SO(new, i)->ext = SCTP_SO(stream, i)->ext; SCTP_SO(stream, i)->ext = NULL; } } - for (i = outcnt; i < stream->outcnt; i++) { - kfree(SCTP_SO(stream, i)->ext); - SCTP_SO(stream, i)->ext = NULL; - } + for (i = outcnt; i < stream->outcnt; i++) + sctp_stream_free_ext(stream, i); } static int sctp_stream_alloc_out(struct sctp_stream *stream, __u16 outcnt, @@ -174,9 +185,9 @@ void sctp_stream_free(struct sctp_stream *stream) struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); int i; - sched->free(stream); + sched->unsched_all(stream); for (i = 0; i < stream->outcnt; i++) - kfree(SCTP_SO(stream, i)->ext); + sctp_stream_free_ext(stream, i); genradix_free(&stream->out); genradix_free(&stream->in); } diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 1ad565ed5627..7c8f9d89e16a 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -46,6 +46,10 @@ static int sctp_sched_fcfs_init_sid(struct sctp_stream *stream, __u16 sid, return 0; } +static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid) +{ +} + static void sctp_sched_fcfs_free(struct sctp_stream *stream) { } @@ -96,6 +100,7 @@ static struct sctp_sched_ops sctp_sched_fcfs = { .get = sctp_sched_fcfs_get, .init = sctp_sched_fcfs_init, .init_sid = sctp_sched_fcfs_init_sid, + .free_sid = sctp_sched_fcfs_free_sid, .free = sctp_sched_fcfs_free, .enqueue = sctp_sched_fcfs_enqueue, .dequeue = sctp_sched_fcfs_dequeue, diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c index 80b5a2c4cbc7..4fc9f2923ed1 100644 --- a/net/sctp/stream_sched_prio.c +++ b/net/sctp/stream_sched_prio.c @@ -204,6 +204,24 @@ static int sctp_sched_prio_init_sid(struct sctp_stream *stream, __u16 sid, return sctp_sched_prio_set(stream, sid, 0, gfp); } +static void sctp_sched_prio_free_sid(struct sctp_stream *stream, __u16 sid) +{ + struct sctp_stream_priorities *prio = SCTP_SO(stream, sid)->ext->prio_head; + int i; + + if (!prio) + return; + + SCTP_SO(stream, sid)->ext->prio_head = NULL; + for (i = 0; i < stream->outcnt; i++) { + if (SCTP_SO(stream, i)->ext && + SCTP_SO(stream, i)->ext->prio_head == prio) + return; + } + + kfree(prio); +} + static void sctp_sched_prio_free(struct sctp_stream *stream) { struct sctp_stream_priorities *prio, *n; @@ -323,6 +341,7 @@ static struct sctp_sched_ops sctp_sched_prio = { .get = sctp_sched_prio_get, .init = sctp_sched_prio_init, .init_sid = sctp_sched_prio_init_sid, + .free_sid = sctp_sched_prio_free_sid, .free = sctp_sched_prio_free, .enqueue = sctp_sched_prio_enqueue, .dequeue = sctp_sched_prio_dequeue, diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c index ff425aed62c7..cc444fe0d67c 100644 --- a/net/sctp/stream_sched_rr.c +++ b/net/sctp/stream_sched_rr.c @@ -90,6 +90,10 @@ static int sctp_sched_rr_init_sid(struct sctp_stream *stream, __u16 sid, return 0; } +static void sctp_sched_rr_free_sid(struct sctp_stream *stream, __u16 sid) +{ +} + static void sctp_sched_rr_free(struct sctp_stream *stream) { sctp_sched_rr_unsched_all(stream); @@ -177,6 +181,7 @@ static struct sctp_sched_ops sctp_sched_rr = { .get = sctp_sched_rr_get, .init = sctp_sched_rr_init, .init_sid = sctp_sched_rr_init_sid, + .free_sid = sctp_sched_rr_free_sid, .free = 
sctp_sched_rr_free, .enqueue = sctp_sched_rr_enqueue, .dequeue = sctp_sched_rr_dequeue, -- cgit v1.2.3 From a70e483460d58e64504dd679fd127e9549385c86 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 9 Nov 2022 12:21:58 +0100 Subject: netfilter: conntrack: merge ipv4+ipv6 confirm functions No need to have distinct functions. After merge, ipv6 can avoid protooff computation if the connection neither needs sequence adjustment nor helper invocation -- this is the normal case. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_core.h | 3 +- net/bridge/netfilter/nf_conntrack_bridge.c | 32 +------- net/netfilter/nf_conntrack_proto.c | 124 +++++++++++++---------------- 3 files changed, 57 insertions(+), 102 deletions(-) (limited to 'include/net') diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index b2b9de70d9f4..71d1269fe4d4 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -71,8 +71,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) return ret; } -unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo); +unsigned int nf_confirm(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); void print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple, const struct nf_conntrack_l4proto *proto); diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c index 73242962be5d..5c5dd437f1c2 100644 --- a/net/bridge/netfilter/nf_conntrack_bridge.c +++ b/net/bridge/netfilter/nf_conntrack_bridge.c @@ -366,42 +366,12 @@ static int nf_ct_bridge_refrag_post(struct net *net, struct sock *sk, return br_dev_queue_push_xmit(net, sk, skb); } -static unsigned int nf_ct_bridge_confirm(struct sk_buff *skb) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - int protoff; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - - switch (skb->protocol) { - case htons(ETH_P_IP): - protoff = skb_network_offset(skb) + ip_hdrlen(skb); - break; - case htons(ETH_P_IPV6): { - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - __be16 frag_off; - - protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) - return nf_conntrack_confirm(skb); - } - break; - default: - return NF_ACCEPT; - } - return nf_confirm(skb, protoff, ct, ctinfo); -} - static unsigned int nf_ct_bridge_post(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { int ret; - ret = nf_ct_bridge_confirm(skb); + ret = nf_confirm(priv, skb, state); if (ret != NF_ACCEPT) return ret; diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 895b09cbd7cf..99323fb12d0f 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -121,17 +121,61 @@ const struct nf_conntrack_l4proto *nf_ct_l4proto_find(u8 l4proto) }; EXPORT_SYMBOL_GPL(nf_ct_l4proto_find); -unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, - struct nf_conn *ct, enum ip_conntrack_info ctinfo) +static bool in_vrf_postrouting(const struct nf_hook_state *state) +{ +#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) + if (state->hook == NF_INET_POST_ROUTING && + netif_is_l3_master(state->out)) + return true; +#endif + return false; +} + +unsigned int nf_confirm(void *priv, + struct sk_buff 
*skb, + const struct nf_hook_state *state) { const struct nf_conn_help *help; + enum ip_conntrack_info ctinfo; + unsigned int protoff; + struct nf_conn *ct; + bool seqadj_needed; + __be16 frag_off; + u8 pnum; + + ct = nf_ct_get(skb, &ctinfo); + if (!ct || in_vrf_postrouting(state)) + return NF_ACCEPT; help = nfct_help(ct); + + seqadj_needed = test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && !nf_is_loopback_packet(skb); + if (!help && !seqadj_needed) + return nf_conntrack_confirm(skb); + + /* helper->help() do not expect ICMP packets */ + if (ctinfo == IP_CT_RELATED_REPLY) + return nf_conntrack_confirm(skb); + + switch (nf_ct_l3num(ct)) { + case NFPROTO_IPV4: + protoff = skb_network_offset(skb) + ip_hdrlen(skb); + break; + case NFPROTO_IPV6: + pnum = ipv6_hdr(skb)->nexthdr; + protoff = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &pnum, &frag_off); + if (protoff < 0 || (frag_off & htons(~0x7)) != 0) + return nf_conntrack_confirm(skb); + break; + default: + return nf_conntrack_confirm(skb); + } + if (help) { const struct nf_conntrack_helper *helper; int ret; - /* rcu_read_lock()ed by nf_hook_thresh */ + /* rcu_read_lock()ed by nf_hook */ helper = rcu_dereference(help->helper); if (helper) { ret = helper->help(skb, @@ -142,12 +186,10 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, } } - if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) && - !nf_is_loopback_packet(skb)) { - if (!nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { - NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); - return NF_DROP; - } + if (seqadj_needed && + !nf_ct_seq_adjust(skb, ct, ctinfo, protoff)) { + NF_CT_STAT_INC_ATOMIC(nf_ct_net(ct), drop); + return NF_DROP; } /* We've seen it coming out the other side: confirm it */ @@ -155,35 +197,6 @@ unsigned int nf_confirm(struct sk_buff *skb, unsigned int protoff, } EXPORT_SYMBOL_GPL(nf_confirm); -static bool in_vrf_postrouting(const struct nf_hook_state *state) -{ -#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) - if (state->hook == NF_INET_POST_ROUTING && - netif_is_l3_master(state->out)) - return true; -#endif - return false; -} - -static unsigned int ipv4_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - enum ip_conntrack_info ctinfo; - struct nf_conn *ct; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - - if (in_vrf_postrouting(state)) - return NF_ACCEPT; - - return nf_confirm(skb, - skb_network_offset(skb) + ip_hdrlen(skb), - ct, ctinfo); -} - static unsigned int ipv4_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -230,13 +243,13 @@ static const struct nf_hook_ops ipv4_conntrack_ops[] = { .priority = NF_IP_PRI_CONNTRACK, }, { - .hook = ipv4_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, }, { - .hook = ipv4_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV4, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP_PRI_CONNTRACK_CONFIRM, @@ -373,33 +386,6 @@ static struct nf_sockopt_ops so_getorigdst6 = { .owner = THIS_MODULE, }; -static unsigned int ipv6_confirm(void *priv, - struct sk_buff *skb, - const struct nf_hook_state *state) -{ - struct nf_conn *ct; - enum ip_conntrack_info ctinfo; - unsigned char pnum = ipv6_hdr(skb)->nexthdr; - __be16 frag_off; - int protoff; - - ct = nf_ct_get(skb, &ctinfo); - if (!ct || ctinfo == IP_CT_RELATED_REPLY) - return nf_conntrack_confirm(skb); - - if (in_vrf_postrouting(state)) - return NF_ACCEPT; - - protoff = ipv6_skip_exthdr(skb, 
sizeof(struct ipv6hdr), &pnum, - &frag_off); - if (protoff < 0 || (frag_off & htons(~0x7)) != 0) { - pr_debug("proto header not found\n"); - return nf_conntrack_confirm(skb); - } - - return nf_confirm(skb, protoff, ct, ctinfo); -} - static unsigned int ipv6_conntrack_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) @@ -428,13 +414,13 @@ static const struct nf_hook_ops ipv6_conntrack_ops[] = { .priority = NF_IP6_PRI_CONNTRACK, }, { - .hook = ipv6_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_POST_ROUTING, .priority = NF_IP6_PRI_LAST, }, { - .hook = ipv6_confirm, + .hook = nf_confirm, .pf = NFPROTO_IPV6, .hooknum = NF_INET_LOCAL_IN, .priority = NF_IP6_PRI_LAST - 1, -- cgit v1.2.3 From a351d6087bf7d3d8440d58d3bf244ec64b89394a Mon Sep 17 00:00:00 2001 From: Pengcheng Yang Date: Tue, 29 Nov 2022 18:40:39 +0800 Subject: bpf, sockmap: Fix missing BPF_F_INGRESS flag when using apply_bytes When redirecting, we use sk_msg_to_ingress() to get the BPF_F_INGRESS flag from the msg->flags. If apply_bytes is used and it is larger than the current data being processed, sk_psock_msg_verdict() will not be called when sendmsg() is called again. At this time, the msg->flags is 0, and we lost the BPF_F_INGRESS flag. So we need to save the BPF_F_INGRESS flag in sk_psock and use it when redirection. Fixes: 8934ce2fd081 ("bpf: sockmap redirect ingress support") Signed-off-by: Pengcheng Yang Signed-off-by: Daniel Borkmann Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/1669718441-2654-3-git-send-email-yangpc@wangsu.com --- include/linux/skmsg.h | 1 + include/net/tcp.h | 4 ++-- net/core/skmsg.c | 9 ++++++--- net/ipv4/tcp_bpf.c | 11 ++++++----- net/tls/tls_sw.c | 6 ++++-- 5 files changed, 19 insertions(+), 12 deletions(-) (limited to 'include/net') diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h index 70d6cb94e580..84f787416a54 100644 --- a/include/linux/skmsg.h +++ b/include/linux/skmsg.h @@ -82,6 +82,7 @@ struct sk_psock { u32 apply_bytes; u32 cork_bytes; u32 eval; + bool redir_ingress; /* undefined if sk_redir is null */ struct sk_msg *cork; struct sk_psock_progs progs; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6b814e788f00..b87e7381bddf 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2319,8 +2319,8 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #endif /* CONFIG_BPF_SYSCALL */ -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, - int flags); +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, + struct sk_msg *msg, u32 bytes, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #if !defined(CONFIG_BPF_SYSCALL) || !defined(CONFIG_NET_SOCK_MSG) diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e6b9ced3eda8..53d0251788aa 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -886,13 +886,16 @@ int sk_psock_msg_verdict(struct sock *sk, struct sk_psock *psock, ret = sk_psock_map_verd(ret, msg->sk_redir); psock->apply_bytes = msg->apply_bytes; if (ret == __SK_REDIRECT) { - if (psock->sk_redir) + if (psock->sk_redir) { sock_put(psock->sk_redir); - psock->sk_redir = msg->sk_redir; - if (!psock->sk_redir) { + psock->sk_redir = NULL; + } + if (!msg->sk_redir) { ret = __SK_DROP; goto out; } + psock->redir_ingress = sk_msg_to_ingress(msg); + psock->sk_redir = msg->sk_redir; sock_hold(psock->sk_redir); } out: diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c index 
f3e868f4cd9e..275c5ca9e04d 100644 --- a/net/ipv4/tcp_bpf.c +++ b/net/ipv4/tcp_bpf.c @@ -131,10 +131,9 @@ static int tcp_bpf_push_locked(struct sock *sk, struct sk_msg *msg, return ret; } -int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, - u32 bytes, int flags) +int tcp_bpf_sendmsg_redir(struct sock *sk, bool ingress, + struct sk_msg *msg, u32 bytes, int flags) { - bool ingress = sk_msg_to_ingress(msg); struct sk_psock *psock = sk_psock_get(sk); int ret; @@ -276,7 +275,7 @@ msg_bytes_ready: static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock, struct sk_msg *msg, int *copied, int flags) { - bool cork = false, enospc = sk_msg_full(msg); + bool cork = false, enospc = sk_msg_full(msg), redir_ingress; struct sock *sk_redir; u32 tosend, origsize, sent, delta = 0; u32 eval; @@ -322,6 +321,7 @@ more_data: sk_msg_apply_bytes(psock, tosend); break; case __SK_REDIRECT: + redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; sk_msg_apply_bytes(psock, tosend); if (!psock->apply_bytes) { @@ -338,7 +338,8 @@ more_data: release_sock(sk); origsize = msg->sg.size; - ret = tcp_bpf_sendmsg_redir(sk_redir, msg, tosend, flags); + ret = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, + msg, tosend, flags); sent = origsize - msg->sg.size; if (eval == __SK_REDIRECT) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 264cf367e265..9ed978634125 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -792,7 +792,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk, struct sk_psock *psock; struct sock *sk_redir; struct tls_rec *rec; - bool enospc, policy; + bool enospc, policy, redir_ingress; int err = 0, send; u32 delta = 0; @@ -837,6 +837,7 @@ more_data: } break; case __SK_REDIRECT: + redir_ingress = psock->redir_ingress; sk_redir = psock->sk_redir; memcpy(&msg_redir, msg, sizeof(*msg)); if (msg->apply_bytes < send) @@ -846,7 +847,8 @@ more_data: sk_msg_return_zero(sk, msg, send); msg->sg.size -= send; release_sock(sk); - err = tcp_bpf_sendmsg_redir(sk_redir, &msg_redir, send, flags); + err = tcp_bpf_sendmsg_redir(sk_redir, redir_ingress, + &msg_redir, send, flags); lock_sock(sk); if (err < 0) { *copied -= sk_msg_free_nocharge(sk, &msg_redir); -- cgit v1.2.3 From af6397c9ee2b42988c912dcad2fca1f43d5c1c99 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Mon, 28 Nov 2022 12:36:44 -0800 Subject: devlink: support directly reading from region memory To read from a region, user space must currently request a new snapshot of the region and then read from that snapshot. This can sometimes be overkill if user space only reads a tiny portion. They first create the snapshot, then request a read, then destroy the snapshot. For regions which have a single underlying "contents", it makes sense to allow supporting direct reading of the region data. Extend the DEVLINK_CMD_REGION_READ to allow direct reading from a region if requested via the new DEVLINK_ATTR_REGION_DIRECT. If this attribute is set, then perform a direct read instead of using a snapshot. Direct read is mutually exclusive with DEVLINK_ATTR_REGION_SNAPSHOT_ID, and care is taken to ensure that we reject commands which provide incorrect attributes. Regions must enable support for direct read by implementing the .read() callback function. If a region does not support such direct reads, a suitable extended error message is reported. 
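For a driver wishing to opt in, here is a hedged sketch of the new callback and how it is wired into the region ops; the foo_* driver, its backing buffer, and the bounds check are hypothetical:

struct foo_priv {
	u8 *fw_health_buf;	/* live, device-backed contents (hypothetical) */
	u32 fw_health_len;
};

static int foo_region_direct_read(struct devlink *devlink,
				  const struct devlink_region_ops *ops,
				  struct netlink_ext_ack *extack,
				  u64 offset, u32 size, u8 *data)
{
	struct foo_priv *priv = devlink_priv(devlink);

	if (offset + size > priv->fw_health_len) {
		NL_SET_ERR_MSG(extack, "Read past end of region");
		return -ERANGE;
	}

	/* Copy the requested window of the live contents; the core
	 * calls this once per chunk, so reads are not atomic.
	 */
	memcpy(data, priv->fw_health_buf + offset, size);
	return 0;
}

static const struct devlink_region_ops foo_region_ops = {
	.name = "fw-health",
	.read = foo_region_direct_read,	/* opts in to DEVLINK_ATTR_REGION_DIRECT */
};

With this in place, a request such as "devlink region read ... address 16 length 16" without a snapshot id is served chunk by chunk straight from the live buffer.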
Signed-off-by: Jacob Keller Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- .../networking/devlink/devlink-region.rst | 13 ++++ include/net/devlink.h | 16 +++++ include/uapi/linux/devlink.h | 2 + net/core/devlink.c | 80 +++++++++++++++++----- 4 files changed, 94 insertions(+), 17 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/devlink/devlink-region.rst b/Documentation/networking/devlink/devlink-region.rst index f06dca9a1eb6..9232cd7da301 100644 --- a/Documentation/networking/devlink/devlink-region.rst +++ b/Documentation/networking/devlink/devlink-region.rst @@ -31,6 +31,15 @@ in its ``devlink_region_ops`` structure. If snapshot id is not set in the ``DEVLINK_CMD_REGION_NEW`` request kernel will allocate one and send the snapshot information to user space. +Regions may optionally allow directly reading from their contents without a +snapshot. Direct read requests are not atomic. In particular a read request +of size 256 bytes or larger will be split into multiple chunks. If atomic +access is required, use a snapshot. A driver wishing to enable this for a +region should implement the ``.read`` callback in the ``devlink_region_ops`` +structure. User space can request a direct read by using the +``DEVLINK_ATTR_REGION_DIRECT`` attribute instead of specifying a snapshot +id. + example usage ------------- @@ -65,6 +74,10 @@ example usage $ devlink region read pci/0000:00:05.0/fw-health snapshot 1 address 0 length 16 0000000000000000 0014 95dc 0014 9514 0035 1670 0034 db30 + # Read from the region without a snapshot + $ devlink region read pci/0000:00:05.0/fw-health address 16 length 16 + 0000000000000010 0000 0000 ffff ff04 0029 8c00 0028 8cc8 + As regions are likely very device or driver specific, no generic regions are defined. See the driver-specific documentation files for information on the specific regions a driver supports. diff --git a/include/net/devlink.h b/include/net/devlink.h index 074a79b8933f..02528f736f65 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -650,6 +650,10 @@ struct devlink_info_req; * the data variable must be updated to point to the snapshot data. * The function will be called while the devlink instance lock is * held. + * @read: callback to directly read a portion of the region. On success, + * the data pointer will be updated with the contents of the + * requested portion of the region. The function will be called + * while the devlink instance lock is held. * @priv: Pointer to driver private data for the region operation */ struct devlink_region_ops { @@ -659,6 +663,10 @@ struct devlink_region_ops { const struct devlink_region_ops *ops, struct netlink_ext_ack *extack, u8 **data); + int (*read)(struct devlink *devlink, + const struct devlink_region_ops *ops, + struct netlink_ext_ack *extack, + u64 offset, u32 size, u8 *data); void *priv; }; @@ -670,6 +678,10 @@ struct devlink_region_ops { * the data variable must be updated to point to the snapshot data. * The function will be called while the devlink instance lock is * held. + * @read: callback to directly read a portion of the region. On success, + * the data pointer will be updated with the contents of the + * requested portion of the region. The function will be called + * while the devlink instance lock is held. 
* @priv: Pointer to driver private data for the region operation */ struct devlink_port_region_ops { @@ -679,6 +691,10 @@ struct devlink_port_region_ops { const struct devlink_port_region_ops *ops, struct netlink_ext_ack *extack, u8 **data); + int (*read)(struct devlink_port *port, + const struct devlink_port_region_ops *ops, + struct netlink_ext_ack *extack, + u64 offset, u32 size, u8 *data); void *priv; }; diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 498d0d5d0957..70191d96af89 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -610,6 +610,8 @@ enum devlink_attr { DEVLINK_ATTR_RATE_TX_PRIORITY, /* u32 */ DEVLINK_ATTR_RATE_TX_WEIGHT, /* u32 */ + DEVLINK_ATTR_REGION_DIRECT, /* flag */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, diff --git a/net/core/devlink.c b/net/core/devlink.c index 6c05cfaa571d..298041a44aa8 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6515,6 +6515,26 @@ devlink_region_snapshot_fill(void *cb_priv, u8 *chunk, u32 chunk_size, return 0; } +static int +devlink_region_port_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, + u64 curr_offset, struct netlink_ext_ack *extack) +{ + struct devlink_region *region = cb_priv; + + return region->port_ops->read(region->port, region->port_ops, extack, + curr_offset, chunk_size, chunk); +} + +static int +devlink_region_direct_fill(void *cb_priv, u8 *chunk, u32 chunk_size, + u64 curr_offset, struct netlink_ext_ack *extack) +{ + struct devlink_region *region = cb_priv; + + return region->ops->read(region->devlink, region->ops, extack, + curr_offset, chunk_size, chunk); +} + static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, struct netlink_callback *cb) { @@ -6523,12 +6543,12 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, u64 ret_offset, start_offset, end_offset = U64_MAX; struct nlattr **attrs = info->attrs; struct devlink_port *port = NULL; - struct devlink_snapshot *snapshot; + devlink_chunk_fill_t *region_cb; struct devlink_region *region; const char *region_name; struct devlink *devlink; unsigned int index; - u32 snapshot_id; + void *region_cb_priv; void *hdr; int err; @@ -6546,12 +6566,6 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto out_unlock; } - if (!attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]) { - NL_SET_ERR_MSG(cb->extack, "No snapshot id provided"); - err = -EINVAL; - goto out_unlock; - } - if (info->attrs[DEVLINK_ATTR_PORT_INDEX]) { index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]); @@ -6577,12 +6591,43 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, } snapshot_attr = attrs[DEVLINK_ATTR_REGION_SNAPSHOT_ID]; - snapshot_id = nla_get_u32(snapshot_attr); - snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); - if (!snapshot) { - NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist"); - err = -EINVAL; - goto out_unlock; + if (!snapshot_attr) { + if (!nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { + NL_SET_ERR_MSG(cb->extack, "No snapshot id provided"); + err = -EINVAL; + goto out_unlock; + } + + if (!region->ops->read) { + NL_SET_ERR_MSG(cb->extack, "Requested region does not support direct read"); + err = -EOPNOTSUPP; + goto out_unlock; + } + + if (port) + region_cb = &devlink_region_port_direct_fill; + else + region_cb = &devlink_region_direct_fill; + region_cb_priv = region; + } else { + struct devlink_snapshot *snapshot; + u32 snapshot_id; + + if 
(nla_get_flag(attrs[DEVLINK_ATTR_REGION_DIRECT])) { + NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Direct region read does not use snapshot"); + err = -EINVAL; + goto out_unlock; + } + + snapshot_id = nla_get_u32(snapshot_attr); + snapshot = devlink_region_snapshot_get_by_id(region, snapshot_id); + if (!snapshot) { + NL_SET_ERR_MSG_ATTR(cb->extack, snapshot_attr, "Requested snapshot does not exist"); + err = -EINVAL; + goto out_unlock; + } + region_cb = &devlink_region_snapshot_fill; + region_cb_priv = snapshot; } if (attrs[DEVLINK_ATTR_REGION_CHUNK_ADDR] && @@ -6633,9 +6678,9 @@ static int devlink_nl_cmd_region_read_dumpit(struct sk_buff *skb, goto nla_put_failure; } - err = devlink_nl_region_read_fill(skb, &devlink_region_snapshot_fill, - snapshot, start_offset, end_offset, - &ret_offset, cb->extack); + err = devlink_nl_region_read_fill(skb, region_cb, region_cb_priv, + start_offset, end_offset, &ret_offset, + cb->extack); if (err && err != -EMSGSIZE) goto nla_put_failure; @@ -9280,6 +9325,7 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SELFTESTS] = { .type = NLA_NESTED }, [DEVLINK_ATTR_RATE_TX_PRIORITY] = { .type = NLA_U32 }, [DEVLINK_ATTR_RATE_TX_WEIGHT] = { .type = NLA_U32 }, + [DEVLINK_ATTR_REGION_DIRECT] = { .type = NLA_FLAG }, }; static const struct genl_small_ops devlink_nl_ops[] = { -- cgit v1.2.3 From 226bf980550627c88549b112ac6c8fb40873afb4 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Tue, 29 Nov 2022 18:51:38 +0900 Subject: net: devlink: let the core report the driver name instead of the drivers The driver name is available in device_driver::name. Right now, drivers still have to report this piece of information themselves in their devlink_ops::info_get callback function. To factor this code out into the core, make devlink_nl_info_fill() add the driver name attribute itself. Now that the core sets the driver name attribute, drivers are not supposed to call devlink_info_driver_name_put() anymore. Remove devlink_info_driver_name_put() and clean up all the drivers that used this function in their callback.
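To illustrate the effect (sketch only; the foo driver and its fw_ver field are made up), a minimal info_get callback now only reports the device-specific versions:

static int foo_devlink_info_get(struct devlink *devlink,
				struct devlink_info_req *req,
				struct netlink_ext_ack *extack)
{
	struct foo_dev *fdev = devlink_priv(devlink);

	/* No devlink_info_driver_name_put() call anymore: the core
	 * now reports device_driver::name by itself.
	 */
	return devlink_info_version_running_put(req,
						DEVLINK_INFO_VERSION_GENERIC_FW,
						fdev->fw_ver);
}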
Signed-off-by: Vincent Mailhol Tested-by: Ido Schimmel # mlxsw Reviewed-by: Jacob Keller Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- .../crypto/marvell/octeontx2/otx2_cpt_devlink.c | 4 ---- drivers/net/dsa/hirschmann/hellcreek.c | 5 ----- drivers/net/dsa/mv88e6xxx/devlink.c | 5 ----- drivers/net/dsa/sja1105/sja1105_devlink.c | 12 +++------- drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c | 4 ---- .../ethernet/freescale/dpaa2/dpaa2-eth-devlink.c | 11 +-------- .../net/ethernet/fungible/funeth/funeth_devlink.c | 2 +- .../ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c | 5 ----- .../hisilicon/hns3/hns3vf/hclgevf_devlink.c | 5 ----- drivers/net/ethernet/intel/ice/ice_devlink.c | 6 ----- .../ethernet/marvell/octeontx2/af/rvu_devlink.c | 2 +- .../ethernet/marvell/octeontx2/nic/otx2_devlink.c | 9 +------- .../ethernet/marvell/prestera/prestera_devlink.c | 5 ----- drivers/net/ethernet/mellanox/mlx5/core/devlink.c | 4 ---- drivers/net/ethernet/mellanox/mlxsw/core.c | 5 ----- drivers/net/ethernet/netronome/nfp/nfp_devlink.c | 4 ---- .../net/ethernet/pensando/ionic/ionic_devlink.c | 4 ---- drivers/net/ethernet/qlogic/qed/qed_devlink.c | 4 ---- drivers/net/netdevsim/dev.c | 3 --- drivers/ptp/ptp_ocp.c | 4 ---- include/net/devlink.h | 2 -- net/core/devlink.c | 26 +++++++++++++++------- 22 files changed, 25 insertions(+), 106 deletions(-) (limited to 'include/net') diff --git a/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c b/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c index 7503f6b18ac5..a2aba0b0d68a 100644 --- a/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c +++ b/drivers/crypto/marvell/octeontx2/otx2_cpt_devlink.c @@ -76,10 +76,6 @@ static int otx2_cpt_devlink_info_get(struct devlink *dl, struct otx2_cptpf_dev *cptpf = cpt_dl->cptpf; int err; - err = devlink_info_driver_name_put(req, "rvu_cptpf"); - if (err) - return err; - err = otx2_cpt_dl_info_firmware_version_put(req, cptpf->eng_grps.grp, "fw.ae", OTX2_CPT_AE_TYPES); if (err) diff --git a/drivers/net/dsa/hirschmann/hellcreek.c b/drivers/net/dsa/hirschmann/hellcreek.c index 951f7935c872..595a548bb0a8 100644 --- a/drivers/net/dsa/hirschmann/hellcreek.c +++ b/drivers/net/dsa/hirschmann/hellcreek.c @@ -1176,11 +1176,6 @@ static int hellcreek_devlink_info_get(struct dsa_switch *ds, struct netlink_ext_ack *extack) { struct hellcreek *hellcreek = ds->priv; - int ret; - - ret = devlink_info_driver_name_put(req, "hellcreek"); - if (ret) - return ret; return devlink_info_version_fixed_put(req, DEVLINK_INFO_VERSION_GENERIC_ASIC_ID, diff --git a/drivers/net/dsa/mv88e6xxx/devlink.c b/drivers/net/dsa/mv88e6xxx/devlink.c index 1266eabee086..a08dab75e0c0 100644 --- a/drivers/net/dsa/mv88e6xxx/devlink.c +++ b/drivers/net/dsa/mv88e6xxx/devlink.c @@ -821,11 +821,6 @@ int mv88e6xxx_devlink_info_get(struct dsa_switch *ds, struct netlink_ext_ack *extack) { struct mv88e6xxx_chip *chip = ds->priv; - int err; - - err = devlink_info_driver_name_put(req, "mv88e6xxx"); - if (err) - return err; return devlink_info_version_fixed_put(req, DEVLINK_INFO_VERSION_GENERIC_ASIC_ID, diff --git a/drivers/net/dsa/sja1105/sja1105_devlink.c b/drivers/net/dsa/sja1105/sja1105_devlink.c index 10c6fea1227f..da532614f34a 100644 --- a/drivers/net/dsa/sja1105/sja1105_devlink.c +++ b/drivers/net/dsa/sja1105/sja1105_devlink.c @@ -120,16 +120,10 @@ int sja1105_devlink_info_get(struct dsa_switch *ds, struct netlink_ext_ack *extack) { struct sja1105_private *priv = ds->priv; - int rc; - - rc = devlink_info_driver_name_put(req, "sja1105"); - if (rc) - return rc; - rc 
= devlink_info_version_fixed_put(req, - DEVLINK_INFO_VERSION_GENERIC_ASIC_ID, - priv->info->name); - return rc; + return devlink_info_version_fixed_put(req, + DEVLINK_INFO_VERSION_GENERIC_ASIC_ID, + priv->info->name); } int sja1105_devlink_setup(struct dsa_switch *ds) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 8a6f788f6294..26913dc816d3 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -892,10 +892,6 @@ static int bnxt_dl_info_get(struct devlink *dl, struct devlink_info_req *req, u32 ver = 0; int rc; - rc = devlink_info_driver_name_put(req, DRV_MODULE_NAME); - if (rc) - return rc; - if (BNXT_PF(bp) && (bp->flags & BNXT_FLAG_DSN_VALID)) { sprintf(buf, "%02X-%02X-%02X-%02X-%02X-%02X-%02X-%02X", bp->dsn[7], bp->dsn[6], bp->dsn[5], bp->dsn[4], diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c index 5c6dd3029e2f..76f808d38066 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth-devlink.c @@ -37,18 +37,9 @@ static int dpaa2_eth_dl_info_get(struct devlink *devlink, struct dpaa2_eth_devlink_priv *dl_priv = devlink_priv(devlink); struct dpaa2_eth_priv *priv = dl_priv->dpaa2_priv; char buf[10]; - int err; - - err = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (err) - return err; scnprintf(buf, 10, "%d.%d", priv->dpni_ver_major, priv->dpni_ver_minor); - err = devlink_info_version_running_put(req, "dpni", buf); - if (err) - return err; - - return 0; + return devlink_info_version_running_put(req, "dpni", buf); } static struct dpaa2_eth_trap_item * diff --git a/drivers/net/ethernet/fungible/funeth/funeth_devlink.c b/drivers/net/ethernet/fungible/funeth/funeth_devlink.c index d50c222948b4..6668375edff6 100644 --- a/drivers/net/ethernet/fungible/funeth/funeth_devlink.c +++ b/drivers/net/ethernet/fungible/funeth/funeth_devlink.c @@ -6,7 +6,7 @@ static int fun_dl_info_get(struct devlink *dl, struct devlink_info_req *req, struct netlink_ext_ack *extack) { - return devlink_info_driver_name_put(req, KBUILD_MODNAME); + return 0; } static const struct devlink_ops fun_dl_ops = { diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c index 4c441e6a5082..3d3b69605423 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_devlink.c @@ -13,11 +13,6 @@ static int hclge_devlink_info_get(struct devlink *devlink, struct hclge_devlink_priv *priv = devlink_priv(devlink); char version_str[HCLGE_DEVLINK_FW_STRING_LEN]; struct hclge_dev *hdev = priv->hdev; - int ret; - - ret = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (ret) - return ret; snprintf(version_str, sizeof(version_str), "%lu.%lu.%lu.%lu", hnae3_get_field(hdev->fw_version, HNAE3_FW_VERSION_BYTE3_MASK, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_devlink.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_devlink.c index fdc19868b818..a6c3c5e8f0ab 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_devlink.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_devlink.c @@ -13,11 +13,6 @@ static int hclgevf_devlink_info_get(struct devlink *devlink, struct hclgevf_devlink_priv *priv = devlink_priv(devlink); char version_str[HCLGEVF_DEVLINK_FW_STRING_LEN]; struct 
hclgevf_dev *hdev = priv->hdev; - int ret; - - ret = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (ret) - return ret; snprintf(version_str, sizeof(version_str), "%lu.%lu.%lu.%lu", hnae3_get_field(hdev->fw_version, HNAE3_FW_VERSION_BYTE3_MASK, diff --git a/drivers/net/ethernet/intel/ice/ice_devlink.c b/drivers/net/ethernet/intel/ice/ice_devlink.c index 946d64e577c9..8286e47b4bae 100644 --- a/drivers/net/ethernet/intel/ice/ice_devlink.c +++ b/drivers/net/ethernet/intel/ice/ice_devlink.c @@ -311,12 +311,6 @@ static int ice_devlink_info_get(struct devlink *devlink, } } - err = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (err) { - NL_SET_ERR_MSG_MOD(extack, "Unable to set driver name"); - goto out_free_ctx; - } - ice_info_get_dsn(pf, ctx); err = devlink_info_serial_number_put(req, ctx->buf); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c index 88dee589cb21..f15439d26d21 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_devlink.c @@ -1550,7 +1550,7 @@ static int rvu_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode, static int rvu_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, struct netlink_ext_ack *extack) { - return devlink_info_driver_name_put(req, DRV_NAME); + return 0; } static const struct devlink_ops rvu_devlink_ops = { diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c index 777a27047c8e..5cc6416cf1a6 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_devlink.c @@ -77,18 +77,11 @@ static const struct devlink_param otx2_dl_params[] = { otx2_dl_mcam_count_validate), }; -/* Devlink OPs */ static int otx2_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, struct netlink_ext_ack *extack) { - struct otx2_devlink *otx2_dl = devlink_priv(devlink); - struct otx2_nic *pfvf = otx2_dl->pfvf; - - if (is_otx2_vf(pfvf->pcifunc)) - return devlink_info_driver_name_put(req, "rvu_nicvf"); - - return devlink_info_driver_name_put(req, "rvu_nicpf"); + return 0; } static const struct devlink_ops otx2_devlink_ops = { diff --git a/drivers/net/ethernet/marvell/prestera/prestera_devlink.c b/drivers/net/ethernet/marvell/prestera/prestera_devlink.c index 84ad05c9f12d..2a4c9df4eb79 100644 --- a/drivers/net/ethernet/marvell/prestera/prestera_devlink.c +++ b/drivers/net/ethernet/marvell/prestera/prestera_devlink.c @@ -355,11 +355,6 @@ static int prestera_dl_info_get(struct devlink *dl, { struct prestera_switch *sw = devlink_priv(dl); char buf[16]; - int err; - - err = devlink_info_driver_name_put(req, PRESTERA_DRV_NAME); - if (err) - return err; snprintf(buf, sizeof(buf), "%d.%d.%d", sw->dev->fw_rev.maj, diff --git a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c index cc2ae427dcb0..751bc4a9edcf 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/devlink.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/devlink.c @@ -46,10 +46,6 @@ mlx5_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, u32 running_fw, stored_fw; int err; - err = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (err) - return err; - err = devlink_info_version_fixed_put(req, "fw.psid", dev->board_id); if (err) return err; diff --git a/drivers/net/ethernet/mellanox/mlxsw/core.c 
b/drivers/net/ethernet/mellanox/mlxsw/core.c index a83f6bc30072..a0a06e2eff82 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/core.c +++ b/drivers/net/ethernet/mellanox/mlxsw/core.c @@ -1459,11 +1459,6 @@ mlxsw_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, char buf[32]; int err; - err = devlink_info_driver_name_put(req, - mlxsw_core->bus_info->device_kind); - if (err) - return err; - mlxsw_reg_mgir_pack(mgir_pl); err = mlxsw_reg_query(mlxsw_core, MLXSW_REG(mgir), mgir_pl); if (err) diff --git a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c index 784f23602a8a..bf6bae557158 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_devlink.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_devlink.c @@ -239,10 +239,6 @@ nfp_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, char *buf = NULL; int err; - err = devlink_info_driver_name_put(req, "nfp"); - if (err) - return err; - vendor = nfp_hwinfo_lookup(pf->hwinfo, "assembly.vendor"); part = nfp_hwinfo_lookup(pf->hwinfo, "assembly.partno"); sn = nfp_hwinfo_lookup(pf->hwinfo, "assembly.serial"); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c index 567f778433e2..e6ff757895ab 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_devlink.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_devlink.c @@ -26,10 +26,6 @@ static int ionic_dl_info_get(struct devlink *dl, struct devlink_info_req *req, char buf[16]; int err = 0; - err = devlink_info_driver_name_put(req, IONIC_DRV_NAME); - if (err) - return err; - err = devlink_info_version_running_put(req, DEVLINK_INFO_VERSION_GENERIC_FW, idev->dev_info.fw_version); diff --git a/drivers/net/ethernet/qlogic/qed/qed_devlink.c b/drivers/net/ethernet/qlogic/qed/qed_devlink.c index 6bb4e165b592..922c47797af6 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_devlink.c +++ b/drivers/net/ethernet/qlogic/qed/qed_devlink.c @@ -162,10 +162,6 @@ static int qed_devlink_info_get(struct devlink *devlink, dev_info = &cdev->common_dev_info; - err = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (err) - return err; - memcpy(buf, cdev->hwfns[0].hw_info.part_num, sizeof(cdev->hwfns[0].hw_info.part_num)); buf[sizeof(cdev->hwfns[0].hw_info.part_num)] = 0; diff --git a/drivers/net/netdevsim/dev.c b/drivers/net/netdevsim/dev.c index e14686594a71..b962fc8e1397 100644 --- a/drivers/net/netdevsim/dev.c +++ b/drivers/net/netdevsim/dev.c @@ -994,9 +994,6 @@ static int nsim_dev_info_get(struct devlink *devlink, { int err; - err = devlink_info_driver_name_put(req, DRV_NAME); - if (err) - return err; err = devlink_info_version_stored_put_ext(req, "fw.mgmt", "10.20.30", DEVLINK_INFO_VERSION_TYPE_COMPONENT); if (err) diff --git a/drivers/ptp/ptp_ocp.c b/drivers/ptp/ptp_ocp.c index 154d58cbd9ce..4bbaccd543ad 100644 --- a/drivers/ptp/ptp_ocp.c +++ b/drivers/ptp/ptp_ocp.c @@ -1647,10 +1647,6 @@ ptp_ocp_devlink_info_get(struct devlink *devlink, struct devlink_info_req *req, char buf[32]; int err; - err = devlink_info_driver_name_put(req, KBUILD_MODNAME); - if (err) - return err; - fw_image = bp->fw_loader ? 
"loader" : "fw"; sprintf(buf, "%d.%d", bp->fw_tag, bp->fw_version); err = devlink_info_version_running_put(req, fw_image, buf); diff --git a/include/net/devlink.h b/include/net/devlink.h index 02528f736f65..5f6eca5e4a40 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1762,8 +1762,6 @@ int devlink_region_snapshot_create(struct devlink_region *region, u8 *data, u32 snapshot_id); int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn); -int devlink_info_driver_name_put(struct devlink_info_req *req, - const char *name); int devlink_info_board_serial_number_put(struct devlink_info_req *req, const char *bsn); diff --git a/net/core/devlink.c b/net/core/devlink.c index 298041a44aa8..60eb0f46520f 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -6707,14 +6707,6 @@ out_unlock: return err; } -int devlink_info_driver_name_put(struct devlink_info_req *req, const char *name) -{ - if (!req->msg) - return 0; - return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, name); -} -EXPORT_SYMBOL_GPL(devlink_info_driver_name_put); - int devlink_info_serial_number_put(struct devlink_info_req *req, const char *sn) { if (!req->msg) @@ -6823,11 +6815,25 @@ int devlink_info_version_running_put_ext(struct devlink_info_req *req, } EXPORT_SYMBOL_GPL(devlink_info_version_running_put_ext); +static int devlink_nl_driver_info_get(struct device_driver *drv, + struct devlink_info_req *req) +{ + if (!drv) + return 0; + + if (drv->name[0]) + return nla_put_string(req->msg, DEVLINK_ATTR_INFO_DRIVER_NAME, + drv->name); + + return 0; +} + static int devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, enum devlink_command cmd, u32 portid, u32 seq, int flags, struct netlink_ext_ack *extack) { + struct device *dev = devlink_to_dev(devlink); struct devlink_info_req req = {}; void *hdr; int err; @@ -6845,6 +6851,10 @@ devlink_nl_info_fill(struct sk_buff *msg, struct devlink *devlink, if (err) goto err_cancel_msg; + err = devlink_nl_driver_info_get(dev->driver, &req); + if (err) + goto err_cancel_msg; + genlmsg_end(msg, hdr); return 0; -- cgit v1.2.3 From c25b7a7a565e5eeb2459b37583eea67942057511 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 29 Nov 2022 15:06:44 +0100 Subject: inet: ping: use hlist_nulls rcu iterator during lookup ping_lookup() does not acquire the table spinlock, so iteration should use hlist_nulls_for_each_entry_rcu(). Spotted during code review. 
Fixes: dbca1596bbb0 ("ping: convert to RCU lookups, get rid of rwlock") Cc: Eric Dumazet Signed-off-by: Florian Westphal Link: https://lore.kernel.org/r/20221129140644.28525-1-fw@strlen.de Signed-off-by: Paolo Abeni --- .clang-format | 1 + include/net/ping.h | 3 --- net/ipv4/ping.c | 7 ++++++- 3 files changed, 7 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/.clang-format b/.clang-format index 1247d54f9e49..8d01225bfcb7 100644 --- a/.clang-format +++ b/.clang-format @@ -535,6 +535,7 @@ ForEachMacros: - 'perf_hpp_list__for_each_sort_list_safe' - 'perf_pmu__for_each_hybrid_pmu' - 'ping_portaddr_for_each_entry' + - 'ping_portaddr_for_each_entry_rcu' - 'plist_for_each' - 'plist_for_each_continue' - 'plist_for_each_entry' diff --git a/include/net/ping.h b/include/net/ping.h index e4ff3911cbf5..9233ad3de0ad 100644 --- a/include/net/ping.h +++ b/include/net/ping.h @@ -16,9 +16,6 @@ #define PING_HTABLE_SIZE 64 #define PING_HTABLE_MASK (PING_HTABLE_SIZE-1) -#define ping_portaddr_for_each_entry(__sk, node, list) \ - hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) - /* * gid_t is either uint or ushort. We want to pass it to * proc_dointvec_minmax(), so it must not be larger than MAX_INT diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index bde333b24837..04b4ec07bb06 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -49,6 +49,11 @@ #include #endif +#define ping_portaddr_for_each_entry(__sk, node, list) \ + hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) +#define ping_portaddr_for_each_entry_rcu(__sk, node, list) \ + hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) + struct ping_table { struct hlist_nulls_head hash[PING_HTABLE_SIZE]; spinlock_t lock; @@ -192,7 +197,7 @@ static struct sock *ping_lookup(struct net *net, struct sk_buff *skb, u16 ident) return NULL; } - ping_portaddr_for_each_entry(sk, hnode, hslot) { + ping_portaddr_for_each_entry_rcu(sk, hnode, hslot) { isk = inet_sk(sk); pr_debug("iterate\n"); -- cgit v1.2.3 From 4d371d6e3746d84056f150fdaa66c85f65f60124 Mon Sep 17 00:00:00 2001 From: Philipp Hortmann Date: Mon, 14 Nov 2022 21:01:35 +0100 Subject: wifi: cfg80211: Correct example of ieee80211_iface_limit Correct wrong closing bracket. Signed-off-by: Philipp Hortmann Link: https://lore.kernel.org/r/20221114200135.GA100176@matrix-ESPRIMO-P710 Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 11a370e64143..03d4f4deadae 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -4742,7 +4742,7 @@ struct ieee80211_iface_limit { * * struct ieee80211_iface_limit limits1[] = { * { .max = 1, .types = BIT(NL80211_IFTYPE_STATION), }, - * { .max = 1, .types = BIT(NL80211_IFTYPE_AP}, }, + * { .max = 1, .types = BIT(NL80211_IFTYPE_AP), }, * }; * struct ieee80211_iface_combination combination1 = { * .limits = limits1, -- cgit v1.2.3 From 75bfdbf2fca372e2709bcaa43e8cf1147766ae96 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 3 Nov 2022 22:27:52 +0000 Subject: rxrpc: Implement an in-kernel rxperf server for testing purposes Implement an in-kernel rxperf server to allow kernel-based rxrpc services to be tested directly, unlike with AFS where they're accessed by the fileserver when the latter decides it wants to. This is implemented as a module that, if loaded, opens UDP port 7009 (afs3-rmtsys) and listens on it for incoming calls. 
Calls can be generated using the rxperf command shipped with OpenAFS, for example. Changes ======= ver #2) - Use min_t() instead of min(). Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: Jakub Kicinski --- include/net/af_rxrpc.h | 1 + net/rxrpc/Kconfig | 7 + net/rxrpc/Makefile | 3 + net/rxrpc/rxperf.c | 619 +++++++++++++++++++++++++++++++++++++++++++++++++ net/rxrpc/server_key.c | 25 ++ 5 files changed, 655 insertions(+) create mode 100644 net/rxrpc/rxperf.c (limited to 'include/net') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index b69ca695935c..dc033f08191e 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -71,5 +71,6 @@ void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, unsigned long); int rxrpc_sock_set_min_security_level(struct sock *sk, unsigned int val); +int rxrpc_sock_set_security_keyring(struct sock *, struct key *); #endif /* _NET_RXRPC_H */ diff --git a/net/rxrpc/Kconfig b/net/rxrpc/Kconfig index accd35c05577..7ae023b37a83 100644 --- a/net/rxrpc/Kconfig +++ b/net/rxrpc/Kconfig @@ -58,4 +58,11 @@ config RXKAD See Documentation/networking/rxrpc.rst. +config RXPERF + tristate "RxRPC test service" + help + Provide an rxperf service tester. This listens on UDP port 7009 for + incoming calls from the rxperf program (an example of which can be + found in OpenAFS). + endif diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile index fdeba488fc6e..79687477d93c 100644 --- a/net/rxrpc/Makefile +++ b/net/rxrpc/Makefile @@ -36,3 +36,6 @@ rxrpc-y := \ rxrpc-$(CONFIG_PROC_FS) += proc.o rxrpc-$(CONFIG_RXKAD) += rxkad.o rxrpc-$(CONFIG_SYSCTL) += sysctl.o + + +obj-$(CONFIG_RXPERF) += rxperf.o diff --git a/net/rxrpc/rxperf.c b/net/rxrpc/rxperf.c new file mode 100644 index 000000000000..66f5eea291ff --- /dev/null +++ b/net/rxrpc/rxperf.c @@ -0,0 +1,619 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* In-kernel rxperf server for testing purposes. + * + * Copyright (C) 2022 Red Hat, Inc. All Rights Reserved. 
+ * Written by David Howells (dhowells@redhat.com) + */ + +#define pr_fmt(fmt) "rxperf: " fmt +#include +#include +#include +#include + +MODULE_DESCRIPTION("rxperf test server (afs)"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +#define RXPERF_PORT 7009 +#define RX_PERF_SERVICE 147 +#define RX_PERF_VERSION 3 +#define RX_PERF_SEND 0 +#define RX_PERF_RECV 1 +#define RX_PERF_RPC 3 +#define RX_PERF_FILE 4 +#define RX_PERF_MAGIC_COOKIE 0x4711 + +struct rxperf_proto_params { + __be32 version; + __be32 type; + __be32 rsize; + __be32 wsize; +} __packed; + +static const u8 rxperf_magic_cookie[] = { 0x00, 0x00, 0x47, 0x11 }; +static const u8 secret[8] = { 0xa7, 0x83, 0x8a, 0xcb, 0xc7, 0x83, 0xec, 0x94 }; + +enum rxperf_call_state { + RXPERF_CALL_SV_AWAIT_PARAMS, /* Server: Awaiting parameter block */ + RXPERF_CALL_SV_AWAIT_REQUEST, /* Server: Awaiting request data */ + RXPERF_CALL_SV_REPLYING, /* Server: Replying */ + RXPERF_CALL_SV_AWAIT_ACK, /* Server: Awaiting final ACK */ + RXPERF_CALL_COMPLETE, /* Completed or failed */ +}; + +struct rxperf_call { + struct rxrpc_call *rxcall; + struct iov_iter iter; + struct kvec kvec[1]; + struct work_struct work; + const char *type; + size_t iov_len; + size_t req_len; /* Size of request blob */ + size_t reply_len; /* Size of reply blob */ + unsigned int debug_id; + unsigned int operation_id; + struct rxperf_proto_params params; + __be32 tmp[2]; + s32 abort_code; + enum rxperf_call_state state; + short error; + unsigned short unmarshal; + u16 service_id; + int (*deliver)(struct rxperf_call *call); + void (*processor)(struct work_struct *work); +}; + +static struct socket *rxperf_socket; +static struct key *rxperf_sec_keyring; /* Ring of security/crypto keys */ +static struct workqueue_struct *rxperf_workqueue; + +static void rxperf_deliver_to_call(struct work_struct *work); +static int rxperf_deliver_param_block(struct rxperf_call *call); +static int rxperf_deliver_request(struct rxperf_call *call); +static int rxperf_process_call(struct rxperf_call *call); +static void rxperf_charge_preallocation(struct work_struct *work); + +static DECLARE_WORK(rxperf_charge_preallocation_work, + rxperf_charge_preallocation); + +static inline void rxperf_set_call_state(struct rxperf_call *call, + enum rxperf_call_state to) +{ + call->state = to; +} + +static inline void rxperf_set_call_complete(struct rxperf_call *call, + int error, s32 remote_abort) +{ + if (call->state != RXPERF_CALL_COMPLETE) { + call->abort_code = remote_abort; + call->error = error; + call->state = RXPERF_CALL_COMPLETE; + } +} + +static void rxperf_rx_discard_new_call(struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + kfree((struct rxperf_call *)user_call_ID); +} + +static void rxperf_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long user_call_ID) +{ + queue_work(rxperf_workqueue, &rxperf_charge_preallocation_work); +} + +static void rxperf_queue_call_work(struct rxperf_call *call) +{ + queue_work(rxperf_workqueue, &call->work); +} + +static void rxperf_notify_rx(struct sock *sk, struct rxrpc_call *rxcall, + unsigned long call_user_ID) +{ + struct rxperf_call *call = (struct rxperf_call *)call_user_ID; + + if (call->state != RXPERF_CALL_COMPLETE) + rxperf_queue_call_work(call); +} + +static void rxperf_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID) +{ + struct rxperf_call *call = (struct rxperf_call *)user_call_ID; + + call->rxcall = rxcall; +} + +static void rxperf_notify_end_reply_tx(struct sock *sock, + struct rxrpc_call *rxcall, + 
unsigned long call_user_ID) +{ + rxperf_set_call_state((struct rxperf_call *)call_user_ID, + RXPERF_CALL_SV_AWAIT_ACK); +} + +/* + * Charge the incoming call preallocation. + */ +static void rxperf_charge_preallocation(struct work_struct *work) +{ + struct rxperf_call *call; + + for (;;) { + call = kzalloc(sizeof(*call), GFP_KERNEL); + if (!call) + break; + + call->type = "unset"; + call->debug_id = atomic_inc_return(&rxrpc_debug_id); + call->deliver = rxperf_deliver_param_block; + call->state = RXPERF_CALL_SV_AWAIT_PARAMS; + call->service_id = RX_PERF_SERVICE; + call->iov_len = sizeof(call->params); + call->kvec[0].iov_len = sizeof(call->params); + call->kvec[0].iov_base = &call->params; + iov_iter_kvec(&call->iter, READ, call->kvec, 1, call->iov_len); + INIT_WORK(&call->work, rxperf_deliver_to_call); + + if (rxrpc_kernel_charge_accept(rxperf_socket, + rxperf_notify_rx, + rxperf_rx_attach, + (unsigned long)call, + GFP_KERNEL, + call->debug_id) < 0) + break; + call = NULL; + } + + kfree(call); +} + +/* + * Open an rxrpc socket and bind it to be a server for callback notifications + * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT + */ +static int rxperf_open_socket(void) +{ + struct sockaddr_rxrpc srx; + struct socket *socket; + int ret; + + ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET6, + &socket); + if (ret < 0) + goto error_1; + + socket->sk->sk_allocation = GFP_NOFS; + + /* bind the callback manager's address to make this a server socket */ + memset(&srx, 0, sizeof(srx)); + srx.srx_family = AF_RXRPC; + srx.srx_service = RX_PERF_SERVICE; + srx.transport_type = SOCK_DGRAM; + srx.transport_len = sizeof(srx.transport.sin6); + srx.transport.sin6.sin6_family = AF_INET6; + srx.transport.sin6.sin6_port = htons(RXPERF_PORT); + + ret = rxrpc_sock_set_min_security_level(socket->sk, + RXRPC_SECURITY_ENCRYPT); + if (ret < 0) + goto error_2; + + ret = rxrpc_sock_set_security_keyring(socket->sk, rxperf_sec_keyring); + + ret = kernel_bind(socket, (struct sockaddr *)&srx, sizeof(srx)); + if (ret < 0) + goto error_2; + + rxrpc_kernel_new_call_notification(socket, rxperf_rx_new_call, + rxperf_rx_discard_new_call); + + ret = kernel_listen(socket, INT_MAX); + if (ret < 0) + goto error_2; + + rxperf_socket = socket; + rxperf_charge_preallocation(&rxperf_charge_preallocation_work); + return 0; + +error_2: + sock_release(socket); +error_1: + pr_err("Can't set up rxperf socket: %d\n", ret); + return ret; +} + +/* + * close the rxrpc socket rxperf was using + */ +static void rxperf_close_socket(void) +{ + kernel_listen(rxperf_socket, 0); + kernel_sock_shutdown(rxperf_socket, SHUT_RDWR); + flush_workqueue(rxperf_workqueue); + sock_release(rxperf_socket); +} + +/* + * Log remote abort codes that indicate that we have a protocol disagreement + * with the server. 
+ */ +static void rxperf_log_error(struct rxperf_call *call, s32 remote_abort) +{ + static int max = 0; + const char *msg; + int m; + + switch (remote_abort) { + case RX_EOF: msg = "unexpected EOF"; break; + case RXGEN_CC_MARSHAL: msg = "client marshalling"; break; + case RXGEN_CC_UNMARSHAL: msg = "client unmarshalling"; break; + case RXGEN_SS_MARSHAL: msg = "server marshalling"; break; + case RXGEN_SS_UNMARSHAL: msg = "server unmarshalling"; break; + case RXGEN_DECODE: msg = "opcode decode"; break; + case RXGEN_SS_XDRFREE: msg = "server XDR cleanup"; break; + case RXGEN_CC_XDRFREE: msg = "client XDR cleanup"; break; + case -32: msg = "insufficient data"; break; + default: + return; + } + + m = max; + if (m < 3) { + max = m + 1; + pr_info("Peer reported %s failure on %s\n", msg, call->type); + } +} + +/* + * deliver messages to a call + */ +static void rxperf_deliver_to_call(struct work_struct *work) +{ + struct rxperf_call *call = container_of(work, struct rxperf_call, work); + enum rxperf_call_state state; + u32 abort_code, remote_abort = 0; + int ret; + + if (call->state == RXPERF_CALL_COMPLETE) + return; + + while (state = call->state, + state == RXPERF_CALL_SV_AWAIT_PARAMS || + state == RXPERF_CALL_SV_AWAIT_REQUEST || + state == RXPERF_CALL_SV_AWAIT_ACK + ) { + if (state == RXPERF_CALL_SV_AWAIT_ACK) { + if (!rxrpc_kernel_check_life(rxperf_socket, call->rxcall)) + goto call_complete; + return; + } + + ret = call->deliver(call); + if (ret == 0) + ret = rxperf_process_call(call); + + switch (ret) { + case 0: + continue; + case -EINPROGRESS: + case -EAGAIN: + return; + case -ECONNABORTED: + rxperf_log_error(call, call->abort_code); + goto call_complete; + case -EOPNOTSUPP: + abort_code = RXGEN_OPCODE; + rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, + abort_code, ret, "GOP"); + goto call_complete; + case -ENOTSUPP: + abort_code = RX_USER_ABORT; + rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, + abort_code, ret, "GUA"); + goto call_complete; + case -EIO: + pr_err("Call %u in bad state %u\n", + call->debug_id, call->state); + fallthrough; + case -ENODATA: + case -EBADMSG: + case -EMSGSIZE: + case -ENOMEM: + case -EFAULT: + rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, + RXGEN_SS_UNMARSHAL, ret, "GUM"); + goto call_complete; + default: + rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, + RX_CALL_DEAD, ret, "GER"); + goto call_complete; + } + } + +call_complete: + rxperf_set_call_complete(call, ret, remote_abort); + /* The call may have been requeued */ + rxrpc_kernel_end_call(rxperf_socket, call->rxcall); + cancel_work(&call->work); + kfree(call); +} + +/* + * Extract a piece of data from the received data socket buffers. + */ +static int rxperf_extract_data(struct rxperf_call *call, bool want_more) +{ + u32 remote_abort = 0; + int ret; + + ret = rxrpc_kernel_recv_data(rxperf_socket, call->rxcall, &call->iter, + &call->iov_len, want_more, &remote_abort, + &call->service_id); + pr_debug("Extract i=%zu l=%zu m=%u ret=%d\n", + iov_iter_count(&call->iter), call->iov_len, want_more, ret); + if (ret == 0 || ret == -EAGAIN) + return ret; + + if (ret == 1) { + switch (call->state) { + case RXPERF_CALL_SV_AWAIT_REQUEST: + rxperf_set_call_state(call, RXPERF_CALL_SV_REPLYING); + break; + case RXPERF_CALL_COMPLETE: + pr_debug("premature completion %d", call->error); + return call->error; + default: + break; + } + return 0; + } + + rxperf_set_call_complete(call, ret, remote_abort); + return ret; +} + +/* + * Grab the operation ID from an incoming manager call. 
+ */ +static int rxperf_deliver_param_block(struct rxperf_call *call) +{ + u32 version; + int ret; + + /* Extract the parameter block */ + ret = rxperf_extract_data(call, true); + if (ret < 0) + return ret; + + version = ntohl(call->params.version); + call->operation_id = ntohl(call->params.type); + call->deliver = rxperf_deliver_request; + + if (version != RX_PERF_VERSION) { + pr_info("Version mismatch %x\n", version); + return -ENOTSUPP; + } + + switch (call->operation_id) { + case RX_PERF_SEND: + call->type = "send"; + call->reply_len = 0; + call->iov_len = 4; /* Expect req size */ + break; + case RX_PERF_RECV: + call->type = "recv"; + call->req_len = 0; + call->iov_len = 4; /* Expect reply size */ + break; + case RX_PERF_RPC: + call->type = "rpc"; + call->iov_len = 8; /* Expect req size and reply size */ + break; + case RX_PERF_FILE: + call->type = "file"; + fallthrough; + default: + return -EOPNOTSUPP; + } + + rxperf_set_call_state(call, RXPERF_CALL_SV_AWAIT_REQUEST); + return call->deliver(call); +} + +/* + * Deliver the request data. + */ +static int rxperf_deliver_request(struct rxperf_call *call) +{ + int ret; + + switch (call->unmarshal) { + case 0: + call->kvec[0].iov_len = call->iov_len; + call->kvec[0].iov_base = call->tmp; + iov_iter_kvec(&call->iter, READ, call->kvec, 1, call->iov_len); + call->unmarshal++; + fallthrough; + case 1: + ret = rxperf_extract_data(call, true); + if (ret < 0) + return ret; + + switch (call->operation_id) { + case RX_PERF_SEND: + call->type = "send"; + call->req_len = ntohl(call->tmp[0]); + call->reply_len = 0; + break; + case RX_PERF_RECV: + call->type = "recv"; + call->req_len = 0; + call->reply_len = ntohl(call->tmp[0]); + break; + case RX_PERF_RPC: + call->type = "rpc"; + call->req_len = ntohl(call->tmp[0]); + call->reply_len = ntohl(call->tmp[1]); + break; + default: + pr_info("Can't parse extra params\n"); + return -EIO; + } + + pr_debug("CALL op=%s rq=%zx rp=%zx\n", + call->type, call->req_len, call->reply_len); + + call->iov_len = call->req_len; + iov_iter_discard(&call->iter, READ, call->req_len); + call->unmarshal++; + fallthrough; + case 2: + ret = rxperf_extract_data(call, false); + if (ret < 0) + return ret; + call->unmarshal++; + fallthrough; + default: + return 0; + } +} + +/* + * Process a call for which we've received the request. + */ +static int rxperf_process_call(struct rxperf_call *call) +{ + struct msghdr msg = {}; + struct bio_vec bv[1]; + struct kvec iov[1]; + ssize_t n; + size_t reply_len = call->reply_len, len; + + rxrpc_kernel_set_tx_length(rxperf_socket, call->rxcall, + reply_len + sizeof(rxperf_magic_cookie)); + + while (reply_len > 0) { + len = min_t(size_t, reply_len, PAGE_SIZE); + bv[0].bv_page = ZERO_PAGE(0); + bv[0].bv_offset = 0; + bv[0].bv_len = len; + iov_iter_bvec(&msg.msg_iter, WRITE, bv, 1, len); + msg.msg_flags = MSG_MORE; + n = rxrpc_kernel_send_data(rxperf_socket, call->rxcall, &msg, + len, rxperf_notify_end_reply_tx); + if (n < 0) + return n; + if (n == 0) + return -EIO; + reply_len -= n; + } + + len = sizeof(rxperf_magic_cookie); + iov[0].iov_base = (void *)rxperf_magic_cookie; + iov[0].iov_len = len; + iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, len); + msg.msg_flags = 0; + n = rxrpc_kernel_send_data(rxperf_socket, call->rxcall, &msg, len, + rxperf_notify_end_reply_tx); + if (n >= 0) + return 0; /* Success */ + + if (n == -ENOMEM) + rxrpc_kernel_abort_call(rxperf_socket, call->rxcall, + RXGEN_SS_MARSHAL, -ENOMEM, "GOM"); + return n; +} + +/* + * Add a key to the security keyring. 
+ */ +static int rxperf_add_key(struct key *keyring) +{ + key_ref_t kref; + int ret; + + kref = key_create_or_update(make_key_ref(keyring, true), + "rxrpc_s", + __stringify(RX_PERF_SERVICE) ":2", + secret, + sizeof(secret), + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH + | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + + if (IS_ERR(kref)) { + pr_err("Can't allocate rxperf server key: %ld\n", PTR_ERR(kref)); + return PTR_ERR(kref); + } + + ret = key_link(keyring, key_ref_to_ptr(kref)); + if (ret < 0) + pr_err("Can't link rxperf server key: %d\n", ret); + key_ref_put(kref); + return ret; +} + +/* + * Initialise the rxperf server. + */ +static int __init rxperf_init(void) +{ + struct key *keyring; + int ret = -ENOMEM; + + pr_info("Server registering\n"); + + rxperf_workqueue = alloc_workqueue("rxperf", 0, 0); + if (!rxperf_workqueue) + goto error_workqueue; + + keyring = keyring_alloc("rxperf_server", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | + KEY_POS_WRITE | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH | + KEY_USR_WRITE | + KEY_OTH_VIEW | KEY_OTH_READ | KEY_OTH_SEARCH, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(keyring)) { + pr_err("Can't allocate rxperf server keyring: %ld\n", + PTR_ERR(keyring)); + goto error_keyring; + } + rxperf_sec_keyring = keyring; + ret = rxperf_add_key(keyring); + if (ret < 0) + goto error_key; + + ret = rxperf_open_socket(); + if (ret < 0) + goto error_socket; + return 0; + +error_socket: +error_key: + key_put(rxperf_sec_keyring); +error_keyring: + destroy_workqueue(rxperf_workqueue); + rcu_barrier(); +error_workqueue: + pr_err("Failed to register: %d\n", ret); + return ret; +} +late_initcall(rxperf_init); /* Must be called after net/ to create socket */ + +static void __exit rxperf_exit(void) +{ + pr_info("Server unregistering.\n"); + + rxperf_close_socket(); + key_put(rxperf_sec_keyring); + destroy_workqueue(rxperf_workqueue); + rcu_barrier(); +} +module_exit(rxperf_exit); + diff --git a/net/rxrpc/server_key.c b/net/rxrpc/server_key.c index ee269e0e6ee8..e51940589ee5 100644 --- a/net/rxrpc/server_key.c +++ b/net/rxrpc/server_key.c @@ -144,3 +144,28 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, sockptr_t optval, int optlen) _leave(" = 0 [key %x]", key->serial); return 0; } + +/** + * rxrpc_sock_set_security_keyring - Set the security keyring for a kernel service + * @sk: The socket to set the keyring on + * @keyring: The keyring to set + * + * Set the server security keyring on an rxrpc socket. This is used to provide + * the encryption keys for a kernel service. + */ +int rxrpc_sock_set_security_keyring(struct sock *sk, struct key *keyring) +{ + struct rxrpc_sock *rx = rxrpc_sk(sk); + int ret = 0; + + lock_sock(sk); + if (rx->securities) + ret = -EINVAL; + else if (rx->sk.sk_state != RXRPC_UNBOUND) + ret = -EISCONN; + else + rx->securities = key_get(keyring); + release_sock(sk); + return ret; +} +EXPORT_SYMBOL(rxrpc_sock_set_security_keyring); -- cgit v1.2.3 From 2ed83ed2be1b2395f11a95f0fec2b87ed71aebd8 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 17 Nov 2022 10:04:15 +0000 Subject: rxrpc: Remove decl for rxrpc_kernel_call_is_complete() rxrpc_kernel_call_is_complete() has been removed, so remove its declaration too. 
Signed-off-by: David Howells cc: Marc Dionne cc: linux-afs@lists.infradead.org --- include/net/af_rxrpc.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index dc033f08191e..d5a5ae926380 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -66,7 +66,6 @@ int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, void rxrpc_kernel_set_tx_length(struct socket *, struct rxrpc_call *, s64); bool rxrpc_kernel_check_life(const struct socket *, const struct rxrpc_call *); u32 rxrpc_kernel_get_epoch(struct socket *, struct rxrpc_call *); -bool rxrpc_kernel_call_is_complete(struct rxrpc_call *); void rxrpc_kernel_set_max_life(struct socket *, struct rxrpc_call *, unsigned long); -- cgit v1.2.3 From 7d360f6061db01830adfdb1eaa3977b19db0c30b Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Mon, 10 Oct 2022 11:43:38 +0200 Subject: wifi: mac80211: add support for restricting netdev features per vif This can be used to selectively disable feature flags for checksum offload, scatter/gather or GSO by changing vif->netdev_features. Removing features from vif->netdev_features does not affect the netdev features themselves, but instead fixes up skbs in the tx path so that the offloads are not needed in the driver. Aside from making it easier to deal with vif type based hardware limitations, this also makes it possible to optimize performance on hardware without native GSO support by declaring GSO support in hw->netdev_features and removing it from vif->netdev_features. This allows mac80211 to handle GSO segmentation after the sta lookup, but before itxq enqueue, thus reducing the number of unnecessary sta lookups, as well as some other per-packet processing. Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20221010094338.78070-1-nbd@nbd.name Signed-off-by: Johannes Berg --- include/net/fq_impl.h | 16 +-- include/net/mac80211.h | 5 + net/mac80211/iface.c | 1 + net/mac80211/tx.c | 283 ++++++++++++++++++++++++++++++++----------------- 4 files changed, 202 insertions(+), 103 deletions(-) (limited to 'include/net') diff --git a/include/net/fq_impl.h b/include/net/fq_impl.h index 524b510f1c68..9467e33dfb36 100644 --- a/include/net/fq_impl.h +++ b/include/net/fq_impl.h @@ -200,6 +200,7 @@ static void fq_tin_enqueue(struct fq *fq, fq_skb_free_t free_func) { struct fq_flow *flow; + struct sk_buff *next; bool oom; lockdep_assert_held(&fq->lock); @@ -214,11 +215,15 @@ static void fq_tin_enqueue(struct fq *fq, } flow->tin = tin; - flow->backlog += skb->len; - tin->backlog_bytes += skb->len; - tin->backlog_packets++; - fq->memory_usage += skb->truesize; - fq->backlog++; + skb_list_walk_safe(skb, skb, next) { + skb_mark_not_on_list(skb); + flow->backlog += skb->len; + tin->backlog_bytes += skb->len; + tin->backlog_packets++; + fq->memory_usage += skb->truesize; + fq->backlog++; + __skb_queue_tail(&flow->queue, skb); + } if (list_empty(&flow->flowchain)) { flow->deficit = fq->quantum; @@ -226,7 +231,6 @@ static void fq_tin_enqueue(struct fq *fq, &tin->new_flows); } - __skb_queue_tail(&flow->queue, skb); oom = (fq->memory_usage > fq->memory_limit); while (fq->backlog > fq->limit || oom) { flow = fq_find_fattest_flow(fq); diff --git a/include/net/mac80211.h b/include/net/mac80211.h index 721c450a9ccd..689da327ce2e 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -1807,6 +1807,10 @@ struct ieee80211_vif_cfg { * @addr: address of this interface * @p2p: indicates whether this AP or STA interface is 
a p2p * interface, i.e. a GO or p2p-sta respectively + * @netdev_features: tx netdev features supported by the hardware for this + * vif. mac80211 initializes this to hw->netdev_features, and the driver + * can mask out specific tx features. mac80211 will handle software fixup + * for masked offloads (GSO, CSUM) * @driver_flags: flags/capabilities the driver has for this interface, * these need to be set (or cleared) when the interface is added * or, if supported by the driver, the interface type is changed @@ -1848,6 +1852,7 @@ struct ieee80211_vif { struct ieee80211_txq *txq; + netdev_features_t netdev_features; u32 driver_flags; u32 offload_flags; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 7b2843df3813..d49a5906a943 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -2178,6 +2178,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ndev->hw_features |= ndev->features & MAC80211_SUPPORTED_FEATURES_TX; + sdata->vif.netdev_features = local->hw.netdev_features; netdev_set_default_ethtool_ops(ndev, &ieee80211_ethtool_ops); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 774e62860c45..2171cd1ca807 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1355,7 +1355,11 @@ static struct txq_info *ieee80211_get_txq(struct ieee80211_local *local, static void ieee80211_set_skb_enqueue_time(struct sk_buff *skb) { - IEEE80211_SKB_CB(skb)->control.enqueue_time = codel_get_time(); + struct sk_buff *next; + codel_time_t now = codel_get_time(); + + skb_list_walk_safe(skb, skb, next) + IEEE80211_SKB_CB(skb)->control.enqueue_time = now; } static u32 codel_skb_len_func(const struct sk_buff *skb) @@ -3578,55 +3582,79 @@ ieee80211_xmit_fast_finish(struct ieee80211_sub_if_data *sdata, return TX_CONTINUE; } -static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, - struct ieee80211_fast_tx *fast_tx, - struct sk_buff *skb) +static netdev_features_t +ieee80211_sdata_netdev_features(struct ieee80211_sub_if_data *sdata) { - struct ieee80211_local *local = sdata->local; - u16 ethertype = (skb->data[12] << 8) | skb->data[13]; - int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2); - int hw_headroom = sdata->local->hw.extra_tx_headroom; - struct ethhdr eth; - struct ieee80211_tx_info *info; - struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; - struct ieee80211_tx_data tx; - ieee80211_tx_result r; - struct tid_ampdu_tx *tid_tx = NULL; - u8 tid = IEEE80211_NUM_TIDS; + if (sdata->vif.type != NL80211_IFTYPE_AP_VLAN) + return sdata->vif.netdev_features; - /* control port protocol needs a lot of special handling */ - if (cpu_to_be16(ethertype) == sdata->control_port_protocol) - return false; + if (!sdata->bss) + return 0; - /* only RFC 1042 SNAP */ - if (ethertype < ETH_P_802_3_MIN) - return false; + sdata = container_of(sdata->bss, struct ieee80211_sub_if_data, u.ap); + return sdata->vif.netdev_features; +} - /* don't handle TX status request here either */ - if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) - return false; +static struct sk_buff * +ieee80211_tx_skb_fixup(struct sk_buff *skb, netdev_features_t features) +{ + if (skb_is_gso(skb)) { + struct sk_buff *segs; - if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { - tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; - tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); - if (tid_tx) { - if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) - return false; - if (tid_tx->timeout) - 
tid_tx->last_tx = jiffies; - } + segs = skb_gso_segment(skb, features); + if (!segs) + return skb; + if (IS_ERR(segs)) + goto free; + + consume_skb(skb); + return segs; } - /* after this point (skb is modified) we cannot return false */ + if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) + goto free; + + if (skb->ip_summed == CHECKSUM_PARTIAL) { + int ofs = skb_checksum_start_offset(skb); + + if (skb->encapsulation) + skb_set_inner_transport_header(skb, ofs); + else + skb_set_transport_header(skb, ofs); + + if (skb_csum_hwoffload_help(skb, features)) + goto free; + } + + skb_mark_not_on_list(skb); + return skb; + +free: + kfree_skb(skb); + return NULL; +} + +static void __ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee80211_fast_tx *fast_tx, + struct sk_buff *skb, u8 tid, bool ampdu) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; + struct ieee80211_tx_info *info; + struct ieee80211_tx_data tx; + ieee80211_tx_result r; + int hw_headroom = sdata->local->hw.extra_tx_headroom; + int extra_head = fast_tx->hdr_len - (ETH_HLEN - 2); + struct ethhdr eth; skb = skb_share_check(skb, GFP_ATOMIC); if (unlikely(!skb)) - return true; + return; if ((hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) && ieee80211_amsdu_aggregate(sdata, sta, fast_tx, skb)) - return true; + return; /* will not be crypto-handled beyond what we do here, so use false * as the may-encrypt argument for the resize to not account for @@ -3635,10 +3663,8 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, if (unlikely(ieee80211_skb_resize(sdata, skb, max_t(int, extra_head + hw_headroom - skb_headroom(skb), 0), - ENCRYPT_NO))) { - kfree_skb(skb); - return true; - } + ENCRYPT_NO))) + goto free; memcpy(ð, skb->data, ETH_HLEN - 2); hdr = skb_push(skb, extra_head); @@ -3652,7 +3678,7 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, info->control.vif = &sdata->vif; info->flags = IEEE80211_TX_CTL_FIRST_FRAGMENT | IEEE80211_TX_CTL_DONTFRAG | - (tid_tx ? IEEE80211_TX_CTL_AMPDU : 0); + (ampdu ? 
IEEE80211_TX_CTL_AMPDU : 0); info->control.flags = IEEE80211_TX_CTRL_FAST_XMIT | u32_encode_bits(IEEE80211_LINK_UNSPECIFIED, IEEE80211_TX_CTRL_MLO_LINK); @@ -3676,16 +3702,14 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, tx.key = fast_tx->key; if (ieee80211_queue_skb(local, sdata, sta, skb)) - return true; + return; tx.skb = skb; r = ieee80211_xmit_fast_finish(sdata, sta, fast_tx->pn_offs, fast_tx->key, &tx); tx.skb = NULL; - if (r == TX_DROP) { - kfree_skb(skb); - return true; - } + if (r == TX_DROP) + goto free; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, @@ -3693,6 +3717,56 @@ static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, __skb_queue_tail(&tx.skbs, skb); ieee80211_tx_frags(local, &sdata->vif, sta, &tx.skbs, false); + return; + +free: + kfree_skb(skb); +} + +static bool ieee80211_xmit_fast(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, + struct ieee80211_fast_tx *fast_tx, + struct sk_buff *skb) +{ + u16 ethertype = (skb->data[12] << 8) | skb->data[13]; + struct ieee80211_hdr *hdr = (void *)fast_tx->hdr; + struct tid_ampdu_tx *tid_tx = NULL; + struct sk_buff *next; + u8 tid = IEEE80211_NUM_TIDS; + + /* control port protocol needs a lot of special handling */ + if (cpu_to_be16(ethertype) == sdata->control_port_protocol) + return false; + + /* only RFC 1042 SNAP */ + if (ethertype < ETH_P_802_3_MIN) + return false; + + /* don't handle TX status request here either */ + if (skb->sk && skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS) + return false; + + if (hdr->frame_control & cpu_to_le16(IEEE80211_STYPE_QOS_DATA)) { + tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; + tid_tx = rcu_dereference(sta->ampdu_mlme.tid_tx[tid]); + if (tid_tx) { + if (!test_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state)) + return false; + if (tid_tx->timeout) + tid_tx->last_tx = jiffies; + } + } + + /* after this point (skb is modified) we cannot return false */ + skb = ieee80211_tx_skb_fixup(skb, ieee80211_sdata_netdev_features(sdata)); + if (!skb) + return true; + + skb_list_walk_safe(skb, skb, next) { + skb_mark_not_on_list(skb); + __ieee80211_xmit_fast(sdata, sta, fast_tx, skb, tid, tid_tx); + } + return true; } @@ -4192,31 +4266,14 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, goto out; } - if (skb_is_gso(skb)) { - struct sk_buff *segs; - - segs = skb_gso_segment(skb, 0); - if (IS_ERR(segs)) { - goto out_free; - } else if (segs) { - consume_skb(skb); - skb = segs; - } - } else { - /* we cannot process non-linear frames on this path */ - if (skb_linearize(skb)) - goto out_free; - - /* the frame could be fragmented, software-encrypted, and other - * things so we cannot really handle checksum offload with it - - * fix it up in software before we handle anything else. - */ - if (skb->ip_summed == CHECKSUM_PARTIAL) { - skb_set_transport_header(skb, - skb_checksum_start_offset(skb)); - if (skb_checksum_help(skb)) - goto out_free; - } + /* the frame could be fragmented, software-encrypted, and other + * things so we cannot really handle checksum or GSO offload. + * fix it up in software before we handle anything else. 
+ */ + skb = ieee80211_tx_skb_fixup(skb, 0); + if (!skb) { + len = 0; + goto out; } skb_list_walk_safe(skb, skb, next) { @@ -4434,9 +4491,11 @@ normal: return NETDEV_TX_OK; } -static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb, struct sta_info *sta, - bool txpending) + + +static bool __ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, struct sta_info *sta, + bool txpending) { struct ieee80211_local *local = sdata->local; struct ieee80211_tx_control control = {}; @@ -4445,14 +4504,6 @@ static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, unsigned long flags; int q = info->hw_queue; - if (sta) - sk_pacing_shift_update(skb->sk, local->hw.tx_sk_pacing_shift); - - ieee80211_tpt_led_trig_tx(local, skb->len); - - if (ieee80211_queue_skb(local, sdata, sta, skb)) - return true; - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); if (local->queue_stop_reasons[q] || @@ -4479,6 +4530,26 @@ static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, return true; } +static bool ieee80211_tx_8023(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, struct sta_info *sta, + bool txpending) +{ + struct ieee80211_local *local = sdata->local; + struct sk_buff *next; + bool ret = true; + + if (ieee80211_queue_skb(local, sdata, sta, skb)) + return true; + + skb_list_walk_safe(skb, skb, next) { + skb_mark_not_on_list(skb); + if (!__ieee80211_tx_8023(sdata, skb, sta, txpending)) + ret = false; + } + + return ret; +} + static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, struct net_device *dev, struct sta_info *sta, struct ieee80211_key *key, struct sk_buff *skb) @@ -4486,9 +4557,13 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_info *info; struct ieee80211_local *local = sdata->local; struct tid_ampdu_tx *tid_tx; + struct sk_buff *seg, *next; + unsigned int skbs = 0, len = 0; + u16 queue; u8 tid; - skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, sta, skb)); + queue = ieee80211_select_queue(sdata, sta, skb); + skb_set_queue_mapping(skb, queue); if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning)) && test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) @@ -4498,9 +4573,6 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, if (unlikely(!skb)) return; - info = IEEE80211_SKB_CB(skb); - memset(info, 0, sizeof(*info)); - ieee80211_aggr_check(sdata, sta, skb); tid = skb->priority & IEEE80211_QOS_CTL_TAG1D_MASK; @@ -4514,22 +4586,20 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, return; } - info->flags |= IEEE80211_TX_CTL_AMPDU; if (tid_tx->timeout) tid_tx->last_tx = jiffies; } - if (unlikely(skb->sk && - skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) - info->ack_frame_id = ieee80211_store_ack_skb(local, skb, - &info->flags, NULL); - - info->hw_queue = sdata->vif.hw_queue[skb_get_queue_mapping(skb)]; + skb = ieee80211_tx_skb_fixup(skb, ieee80211_sdata_netdev_features(sdata)); + if (!skb) + return; - dev_sw_netstats_tx_add(dev, 1, skb->len); + info = IEEE80211_SKB_CB(skb); + memset(info, 0, sizeof(*info)); + if (tid_tx) + info->flags |= IEEE80211_TX_CTL_AMPDU; - sta->deflink.tx_stats.bytes[skb_get_queue_mapping(skb)] += skb->len; - sta->deflink.tx_stats.packets[skb_get_queue_mapping(skb)]++; + info->hw_queue = sdata->vif.hw_queue[queue]; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, @@ -4541,6 +4611,24 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, 
if (key) info->control.hw_key = &key->conf; + skb_list_walk_safe(skb, seg, next) { + skbs++; + len += seg->len; + if (seg != skb) + memcpy(IEEE80211_SKB_CB(seg), info, sizeof(*info)); + } + + if (unlikely(skb->sk && + skb_shinfo(skb)->tx_flags & SKBTX_WIFI_STATUS)) + info->ack_frame_id = ieee80211_store_ack_skb(local, skb, + &info->flags, NULL); + + dev_sw_netstats_tx_add(dev, skbs, len); + sta->deflink.tx_stats.packets[queue] += skbs; + sta->deflink.tx_stats.bytes[queue] += len; + + ieee80211_tpt_led_trig_tx(local, len); + ieee80211_tx_8023(sdata, skb, sta, false); return; @@ -4582,6 +4670,7 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) goto skip_offload; + sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift); ieee80211_8023_xmit(sdata, dev, sta, key, skb); goto out;
-- cgit v1.2.3
From 459837b522f7dff3b6681f534d8fff4eca19b7d1 Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Wed, 23 Nov 2022 17:38:57 +0000 Subject: net/tcp: Disable TCP-MD5 static key on tcp_md5sig_info destruction
To do that, separate two scenarios: - where it's the first MD5 key on the system, which means that enabling of the static key may need to sleep; - copying of an existing key from a listening socket to the request socket upon receiving a signed TCP segment, where the static key was already enabled (when the key was added to the listening socket). Now the static branch for TCP-MD5 lives until: - the last tcp_md5sig_info is destroyed - the last socket in time-wait state with an MD5 key is closed. This means that after all sockets with TCP-MD5 keys are gone, the system regains the performance of a disabled md5-key static branch. While at it, provide a static_key_fast_inc() helper that increments the reference counter atomically (without grabbing cpus_read_lock() on CONFIG_JUMP_LABEL=y). This is needed to add a new user for a static_key when the caller controls the lifetime of another user.
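To make the two scenarios concrete, here is a minimal sketch (not part of the patch; the wrapper names are hypothetical and error handling is trimmed) of how the two enable paths differ, following the calls used in the diff below:

    /* First MD5 key in the system: process context, may sleep
     * while the static branch is patched in.
     */
    static int tcp_md5_enable_first(void)
    {
            if (!static_branch_inc(&tcp_md5_needed.key))
                    return -EUSERS; /* refcount saturated */
            return 0;
    }

    /* Copying a key to a request socket in softirq context: the
     * branch is already enabled, only its refcount is bumped,
     * atomically.
     */
    static int tcp_md5_enable_copy(void)
    {
            if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key))
                    return -EUSERS;
            return 0;
    }

    /* Release side, e.g. from tcp_v4_destroy_sock():
     *         static_branch_slow_dec_deferred(&tcp_md5_needed);
     */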
Signed-off-by: Dmitry Safonov Acked-by: Jakub Kicinski Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 10 +++++-- net/ipv4/tcp.c | 5 +--- net/ipv4/tcp_ipv4.c | 71 +++++++++++++++++++++++++++++++++++++++--------- net/ipv4/tcp_minisocks.c | 16 ++++++++--- net/ipv4/tcp_output.c | 4 +-- net/ipv6/tcp_ipv6.c | 10 +++---- 6 files changed, 84 insertions(+), 32 deletions(-) (limited to 'include/net') diff --git a/include/net/tcp.h b/include/net/tcp.h index 6b814e788f00..f925377066fe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1675,7 +1675,11 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags, - const u8 *newkey, u8 newkeylen, gfp_t gfp); + const u8 *newkey, u8 newkeylen); +int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, + int family, u8 prefixlen, int l3index, + struct tcp_md5sig_key *key); + int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, @@ -1683,7 +1687,7 @@ struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, #ifdef CONFIG_TCP_MD5SIG #include -extern struct static_key_false tcp_md5_needed; +extern struct static_key_false_deferred tcp_md5_needed; struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family); @@ -1691,7 +1695,7 @@ static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { - if (!static_branch_unlikely(&tcp_md5_needed)) + if (!static_branch_unlikely(&tcp_md5_needed.key)) return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 24602a5184b0..001947136b0a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4464,11 +4464,8 @@ bool tcp_alloc_md5sig_pool(void) if (unlikely(!READ_ONCE(tcp_md5sig_pool_populated))) { mutex_lock(&tcp_md5sig_mutex); - if (!tcp_md5sig_pool_populated) { + if (!tcp_md5sig_pool_populated) __tcp_alloc_md5sig_pool(); - if (tcp_md5sig_pool_populated) - static_branch_inc(&tcp_md5_needed); - } mutex_unlock(&tcp_md5sig_mutex); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c72e53835397..5d83a332f1dd 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1053,7 +1053,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) * We need to maintain these in the sk structure. 
*/ -DEFINE_STATIC_KEY_FALSE(tcp_md5_needed); +DEFINE_STATIC_KEY_DEFERRED_FALSE(tcp_md5_needed, HZ); EXPORT_SYMBOL(tcp_md5_needed); static bool better_md5_match(struct tcp_md5sig_key *old, struct tcp_md5sig_key *new) @@ -1166,9 +1166,6 @@ static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) struct tcp_sock *tp = tcp_sk(sk); struct tcp_md5sig_info *md5sig; - if (rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) - return 0; - md5sig = kmalloc(sizeof(*md5sig), gfp); if (!md5sig) return -ENOMEM; @@ -1180,9 +1177,9 @@ static int tcp_md5sig_info_add(struct sock *sk, gfp_t gfp) } /* This can be called on a newly created socket, from other files */ -int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, - int family, u8 prefixlen, int l3index, u8 flags, - const u8 *newkey, u8 newkeylen, gfp_t gfp) +static int __tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, + int family, u8 prefixlen, int l3index, u8 flags, + const u8 *newkey, u8 newkeylen, gfp_t gfp) { /* Add Key to the list */ struct tcp_md5sig_key *key; @@ -1209,9 +1206,6 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, return 0; } - if (tcp_md5sig_info_add(sk, gfp)) - return -ENOMEM; - md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); @@ -1235,8 +1229,59 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, hlist_add_head_rcu(&key->node, &md5sig->head); return 0; } + +int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, + int family, u8 prefixlen, int l3index, u8 flags, + const u8 *newkey, u8 newkeylen) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { + if (tcp_md5sig_info_add(sk, GFP_KERNEL)) + return -ENOMEM; + + if (!static_branch_inc(&tcp_md5_needed.key)) { + struct tcp_md5sig_info *md5sig; + + md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); + rcu_assign_pointer(tp->md5sig_info, NULL); + kfree_rcu(md5sig); + return -EUSERS; + } + } + + return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, flags, + newkey, newkeylen, GFP_KERNEL); +} EXPORT_SYMBOL(tcp_md5_do_add); +int tcp_md5_key_copy(struct sock *sk, const union tcp_md5_addr *addr, + int family, u8 prefixlen, int l3index, + struct tcp_md5sig_key *key) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk))) { + if (tcp_md5sig_info_add(sk, sk_gfp_mask(sk, GFP_ATOMIC))) + return -ENOMEM; + + if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { + struct tcp_md5sig_info *md5sig; + + md5sig = rcu_dereference_protected(tp->md5sig_info, lockdep_sock_is_held(sk)); + net_warn_ratelimited("Too many TCP-MD5 keys in the system\n"); + rcu_assign_pointer(tp->md5sig_info, NULL); + kfree_rcu(md5sig); + return -EUSERS; + } + } + + return __tcp_md5_do_add(sk, addr, family, prefixlen, l3index, + key->flags, key->key, key->keylen, + sk_gfp_mask(sk, GFP_ATOMIC)); +} +EXPORT_SYMBOL(tcp_md5_key_copy); + int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, u8 flags) { @@ -1323,7 +1368,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, int optname, return -EINVAL; return tcp_md5_do_add(sk, addr, AF_INET, prefixlen, l3index, flags, - cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); + cmd.tcpm_key, cmd.tcpm_keylen); } static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp, @@ -1580,8 +1625,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct 
sock *sk, struct sk_buff *skb, * memory, then we end up not copying the key * across. Shucks. */ - tcp_md5_do_add(newsk, addr, AF_INET, 32, l3index, key->flags, - key->key, key->keylen, GFP_ATOMIC); + tcp_md5_key_copy(newsk, addr, AF_INET, 32, l3index, key); sk_gso_disable(newsk); } #endif @@ -2273,6 +2317,7 @@ void tcp_v4_destroy_sock(struct sock *sk) tcp_clear_md5_list(sk); kfree_rcu(rcu_dereference_protected(tp->md5sig_info, 1), rcu); tp->md5sig_info = NULL; + static_branch_slow_dec_deferred(&tcp_md5_needed); } #endif diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index c375f603a16c..6908812d50d3 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -291,13 +291,19 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) */ do { tcptw->tw_md5_key = NULL; - if (static_branch_unlikely(&tcp_md5_needed)) { + if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_md5sig_key *key; key = tp->af_specific->md5_lookup(sk, sk); if (key) { tcptw->tw_md5_key = kmemdup(key, sizeof(*key), GFP_ATOMIC); - BUG_ON(tcptw->tw_md5_key && !tcp_alloc_md5sig_pool()); + if (!tcptw->tw_md5_key) + break; + BUG_ON(!tcp_alloc_md5sig_pool()); + if (!static_key_fast_inc_not_disabled(&tcp_md5_needed.key.key)) { + kfree(tcptw->tw_md5_key); + tcptw->tw_md5_key = NULL; + } } } } while (0); @@ -337,11 +343,13 @@ EXPORT_SYMBOL(tcp_time_wait); void tcp_twsk_destructor(struct sock *sk) { #ifdef CONFIG_TCP_MD5SIG - if (static_branch_unlikely(&tcp_md5_needed)) { + if (static_branch_unlikely(&tcp_md5_needed.key)) { struct tcp_timewait_sock *twsk = tcp_twsk(sk); - if (twsk->tw_md5_key) + if (twsk->tw_md5_key) { kfree_rcu(twsk->tw_md5_key, rcu); + static_branch_slow_dec_deferred(&tcp_md5_needed); + } } #endif } diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 894410dc9293..71d01cf3c13e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -766,7 +766,7 @@ static unsigned int tcp_syn_options(struct sock *sk, struct sk_buff *skb, *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (static_branch_unlikely(&tcp_md5_needed) && + if (static_branch_unlikely(&tcp_md5_needed.key) && rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { @@ -922,7 +922,7 @@ static unsigned int tcp_established_options(struct sock *sk, struct sk_buff *skb *md5 = NULL; #ifdef CONFIG_TCP_MD5SIG - if (static_branch_unlikely(&tcp_md5_needed) && + if (static_branch_unlikely(&tcp_md5_needed.key) && rcu_access_pointer(tp->md5sig_info)) { *md5 = tp->af_specific->md5_lookup(sk, sk); if (*md5) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index f52b6f271a24..80df917ced5f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -665,12 +665,11 @@ static int tcp_v6_parse_md5_keys(struct sock *sk, int optname, if (ipv6_addr_v4mapped(&sin6->sin6_addr)) return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr.s6_addr32[3], AF_INET, prefixlen, l3index, flags, - cmd.tcpm_key, cmd.tcpm_keylen, - GFP_KERNEL); + cmd.tcpm_key, cmd.tcpm_keylen); return tcp_md5_do_add(sk, (union tcp_md5_addr *)&sin6->sin6_addr, AF_INET6, prefixlen, l3index, flags, - cmd.tcpm_key, cmd.tcpm_keylen, GFP_KERNEL); + cmd.tcpm_key, cmd.tcpm_keylen); } static int tcp_v6_md5_hash_headers(struct tcp_md5sig_pool *hp, @@ -1370,9 +1369,8 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * * memory, then we end up not copying the key * across. Shucks. 
*/ - tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, - AF_INET6, 128, l3index, key->flags, key->key, key->keylen, - sk_gfp_mask(sk, GFP_ATOMIC)); + tcp_md5_key_copy(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr, + AF_INET6, 128, l3index, key); } #endif -- cgit v1.2.3 From 7d802c8098c50fb7dcf5dfcb6466482e1f2b15e4 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Wed, 30 Nov 2022 18:04:31 -0500 Subject: sctp: delete free member from struct sctp_sched_ops After commit 9ed7bfc79542 ("sctp: fix memory leak in sctp_stream_outq_migrate()"), sctp_sched_set_sched() is the only place calling sched->free(), and it can actually be replaced by sched->free_sid() on each stream, and yet there's already a loop to traverse all streams in sctp_sched_set_sched(). This patch adds a function sctp_sched_free_sched() where it calls sched->free_sid() for each stream to replace sched->free() calls in sctp_sched_set_sched() and then deletes the unused free member from struct sctp_sched_ops. Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Link: https://lore.kernel.org/r/e10aac150aca2686cb0bd0570299ec716da5a5c0.1669849471.git.lucien.xin@gmail.com Signed-off-by: Jakub Kicinski --- include/net/sctp/stream_sched.h | 2 -- net/sctp/stream_sched.c | 38 ++++++++++++++++++++------------------ net/sctp/stream_sched_prio.c | 27 --------------------------- net/sctp/stream_sched_rr.c | 6 ------ 4 files changed, 20 insertions(+), 53 deletions(-) (limited to 'include/net') diff --git a/include/net/sctp/stream_sched.h b/include/net/sctp/stream_sched.h index 65058faea4db..fa00dc20a0d7 100644 --- a/include/net/sctp/stream_sched.h +++ b/include/net/sctp/stream_sched.h @@ -28,8 +28,6 @@ struct sctp_sched_ops { int (*init_sid)(struct sctp_stream *stream, __u16 sid, gfp_t gfp); /* free a stream */ void (*free_sid)(struct sctp_stream *stream, __u16 sid); - /* Frees the entire thing */ - void (*free)(struct sctp_stream *stream); /* Enqueue a chunk */ void (*enqueue)(struct sctp_outq *q, struct sctp_datamsg *msg); diff --git a/net/sctp/stream_sched.c b/net/sctp/stream_sched.c index 7c8f9d89e16a..330067002deb 100644 --- a/net/sctp/stream_sched.c +++ b/net/sctp/stream_sched.c @@ -50,10 +50,6 @@ static void sctp_sched_fcfs_free_sid(struct sctp_stream *stream, __u16 sid) { } -static void sctp_sched_fcfs_free(struct sctp_stream *stream) -{ -} - static void sctp_sched_fcfs_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { @@ -101,7 +97,6 @@ static struct sctp_sched_ops sctp_sched_fcfs = { .init = sctp_sched_fcfs_init, .init_sid = sctp_sched_fcfs_init_sid, .free_sid = sctp_sched_fcfs_free_sid, - .free = sctp_sched_fcfs_free, .enqueue = sctp_sched_fcfs_enqueue, .dequeue = sctp_sched_fcfs_dequeue, .dequeue_done = sctp_sched_fcfs_dequeue_done, @@ -131,6 +126,23 @@ void sctp_sched_ops_init(void) sctp_sched_ops_rr_init(); } +static void sctp_sched_free_sched(struct sctp_stream *stream) +{ + struct sctp_sched_ops *sched = sctp_sched_ops_from_stream(stream); + struct sctp_stream_out_ext *soute; + int i; + + sched->unsched_all(stream); + for (i = 0; i < stream->outcnt; i++) { + soute = SCTP_SO(stream, i)->ext; + if (!soute) + continue; + sched->free_sid(stream, i); + /* Give the next scheduler a clean slate. 
*/ + memset_after(soute, 0, outq); + } +} + int sctp_sched_set_sched(struct sctp_association *asoc, enum sctp_sched_type sched) { @@ -146,18 +158,8 @@ int sctp_sched_set_sched(struct sctp_association *asoc, if (sched > SCTP_SS_MAX) return -EINVAL; - if (old) { - old->free(&asoc->stream); - - /* Give the next scheduler a clean slate. */ - for (i = 0; i < asoc->stream.outcnt; i++) { - struct sctp_stream_out_ext *ext = SCTP_SO(&asoc->stream, i)->ext; - - if (!ext) - continue; - memset_after(ext, 0, outq); - } - } + if (old) + sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = n; n->init(&asoc->stream); @@ -181,7 +183,7 @@ int sctp_sched_set_sched(struct sctp_association *asoc, return ret; err: - n->free(&asoc->stream); + sctp_sched_free_sched(&asoc->stream); asoc->outqueue.sched = &sctp_sched_fcfs; /* Always safe */ return ret; diff --git a/net/sctp/stream_sched_prio.c b/net/sctp/stream_sched_prio.c index 4fc9f2923ed1..42d4800f263d 100644 --- a/net/sctp/stream_sched_prio.c +++ b/net/sctp/stream_sched_prio.c @@ -222,32 +222,6 @@ static void sctp_sched_prio_free_sid(struct sctp_stream *stream, __u16 sid) kfree(prio); } -static void sctp_sched_prio_free(struct sctp_stream *stream) -{ - struct sctp_stream_priorities *prio, *n; - LIST_HEAD(list); - int i; - - /* As we don't keep a list of priorities, to avoid multiple - * frees we have to do it in 3 steps: - * 1. unsched everyone, so the lists are free to use in 2. - * 2. build the list of the priorities - * 3. free the list - */ - sctp_sched_prio_unsched_all(stream); - for (i = 0; i < stream->outcnt; i++) { - if (!SCTP_SO(stream, i)->ext) - continue; - prio = SCTP_SO(stream, i)->ext->prio_head; - if (prio && list_empty(&prio->prio_sched)) - list_add(&prio->prio_sched, &list); - } - list_for_each_entry_safe(prio, n, &list, prio_sched) { - list_del_init(&prio->prio_sched); - kfree(prio); - } -} - static void sctp_sched_prio_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { @@ -342,7 +316,6 @@ static struct sctp_sched_ops sctp_sched_prio = { .init = sctp_sched_prio_init, .init_sid = sctp_sched_prio_init_sid, .free_sid = sctp_sched_prio_free_sid, - .free = sctp_sched_prio_free, .enqueue = sctp_sched_prio_enqueue, .dequeue = sctp_sched_prio_dequeue, .dequeue_done = sctp_sched_prio_dequeue_done, diff --git a/net/sctp/stream_sched_rr.c b/net/sctp/stream_sched_rr.c index cc444fe0d67c..1f235e7f643a 100644 --- a/net/sctp/stream_sched_rr.c +++ b/net/sctp/stream_sched_rr.c @@ -94,11 +94,6 @@ static void sctp_sched_rr_free_sid(struct sctp_stream *stream, __u16 sid) { } -static void sctp_sched_rr_free(struct sctp_stream *stream) -{ - sctp_sched_rr_unsched_all(stream); -} - static void sctp_sched_rr_enqueue(struct sctp_outq *q, struct sctp_datamsg *msg) { @@ -182,7 +177,6 @@ static struct sctp_sched_ops sctp_sched_rr = { .init = sctp_sched_rr_init, .init_sid = sctp_sched_rr_init_sid, .free_sid = sctp_sched_rr_free_sid, - .free = sctp_sched_rr_free, .enqueue = sctp_sched_rr_enqueue, .dequeue = sctp_sched_rr_dequeue, .dequeue_done = sctp_sched_rr_dequeue_done, -- cgit v1.2.3 From 42d7731e3e7409f9444ff44e30c025958f1b14f0 Mon Sep 17 00:00:00 2001 From: Ismael Ferreras Morezuelas Date: Sat, 29 Oct 2022 22:24:52 +0200 Subject: Bluetooth: btusb: Fix CSR clones again by re-adding ERR_DATA_REPORTING quirk A patch series by a Qualcomm engineer essentially removed my quirk/workaround because they thought it was unnecessary. 
It wasn't, and it broke everything again: https://patchwork.kernel.org/project/netdevbpf/list/?series=661703&archive=both&state=* He argues that the quirk is not necessary because the code should instead check whether the dongle reports the command as supported. The problem is that these Chinese CSR clones claim that it works: = New Index: 00:00:00:00:00:00 (Primary,USB,hci0) = Open Index: 00:00:00:00:00:00 < HCI Command: Read Local Version Information (0x04|0x0001) plen 0 > HCI Event: Command Complete (0x0e) plen 12 > [hci0] 11.276039 Read Local Version Information (0x04|0x0001) ncmd 1 Status: Success (0x00) HCI version: Bluetooth 5.0 (0x09) - Revision 2064 (0x0810) LMP version: Bluetooth 5.0 (0x09) - Subversion 8978 (0x2312) Manufacturer: Cambridge Silicon Radio (10) ... < HCI Command: Read Local Supported Features (0x04|0x0003) plen 0 > HCI Event: Command Complete (0x0e) plen 68 > [hci0] 11.668030 Read Local Supported Commands (0x04|0x0002) ncmd 1 Status: Success (0x00) Commands: 163 entries ... Read Default Erroneous Data Reporting (Octet 18 - Bit 2) Write Default Erroneous Data Reporting (Octet 18 - Bit 3) ... ... < HCI Command: Read Default Erroneous Data Reporting (0x03|0x005a) plen 0 = Close Index: 00:1A:7D:DA:71:XX So bring it back wholesale.
Fixes: 63b1a7dd38bf ("Bluetooth: hci_sync: Remove HCI_QUIRK_BROKEN_ERR_DATA_REPORTING") Fixes: e168f6900877 ("Bluetooth: btusb: Remove HCI_QUIRK_BROKEN_ERR_DATA_REPORTING for fake CSR") Fixes: 766ae2422b43 ("Bluetooth: hci_sync: Check LMP feature bit instead of quirk") Cc: stable@vger.kernel.org Cc: Zijun Hu Cc: Luiz Augusto von Dentz Cc: Hans de Goede Tested-by: Ismael Ferreras Morezuelas Signed-off-by: Ismael Ferreras Morezuelas Reviewed-by: Hans de Goede Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 1 + include/net/bluetooth/hci.h | 11 +++++++++++ net/bluetooth/hci_sync.c | 9 +++++++-- 3 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 271963805a38..59d947dec36e 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -2118,6 +2118,7 @@ static int btusb_setup_csr(struct hci_dev *hdev) * without these the controller will lock up. */ set_bit(HCI_QUIRK_BROKEN_STORED_LINK_KEY, &hdev->quirks); + set_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks); set_bit(HCI_QUIRK_BROKEN_FILTER_CLEAR_ALL, &hdev->quirks); set_bit(HCI_QUIRK_NO_SUSPEND_NOTIFIER, &hdev->quirks); diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index e004ba04a9ae..0fe789f6a653 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -228,6 +228,17 @@ enum { */ HCI_QUIRK_VALID_LE_STATES, + /* When this quirk is set, then erroneous data reporting + * is ignored. This is mainly due to the fact that the HCI + * Read Default Erroneous Data Reporting command is advertised, + * but not supported; these controllers often reply with unknown + * command and tend to lock up randomly. Needing a hard reset. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, + /* * When this quirk is set, then the hci_suspend_notifier is not * registered.
This is intended for devices which drop completely diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 76c3107c9f91..a91145e167f2 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -3780,7 +3780,8 @@ static int hci_read_page_scan_activity_sync(struct hci_dev *hdev) static int hci_read_def_err_data_reporting_sync(struct hci_dev *hdev) { if (!(hdev->commands[18] & 0x04) || - !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING)) + !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || + test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_READ_DEF_ERR_DATA_REPORTING, @@ -4298,7 +4299,8 @@ static int hci_set_err_data_report_sync(struct hci_dev *hdev) bool enabled = hci_dev_test_flag(hdev, HCI_WIDEBAND_SPEECH_ENABLED); if (!(hdev->commands[18] & 0x08) || - !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING)) + !(hdev->features[0][6] & LMP_ERR_DATA_REPORTING) || + test_bit(HCI_QUIRK_BROKEN_ERR_DATA_REPORTING, &hdev->quirks)) return 0; if (enabled == hdev->err_data_reporting) @@ -4457,6 +4459,9 @@ static const struct { HCI_QUIRK_BROKEN(STORED_LINK_KEY, "HCI Delete Stored Link Key command is advertised, " "but not supported."), + HCI_QUIRK_BROKEN(ERR_DATA_REPORTING, + "HCI Read Default Erroneous Data Reporting command is " + "advertised, but not supported."), HCI_QUIRK_BROKEN(READ_TRANSMIT_POWER, "HCI Read Transmit Power Level command is advertised, " "but not supported."),
-- cgit v1.2.3
From 93df7d56f15e217009323c0fbb5213ab7a14520b Mon Sep 17 00:00:00 2001 From: Chethan T N Date: Tue, 22 Nov 2022 14:32:56 +0530 Subject: Bluetooth: Remove codec id field in vendor codec definition
As per the specification, a vendor codec is identified by its company id and vendor codec id; there is no separate codec id field. BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 4, Part E page 2127
Fixes: 9ae664028a9e ("Bluetooth: Add support for Read Local Supported Codecs V2") Signed-off-by: Chethan T N Signed-off-by: Kiran K Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 0fe789f6a653..684f1cd28730 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -1435,7 +1435,6 @@ struct hci_std_codecs_v2 { } __packed; struct hci_vnd_codec_v2 { - __u8 id; __le16 cid; __le16 vid; __u8 transport;
-- cgit v1.2.3
From d14f28b8c1de668bab863bf5892a49c824cb110d Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:27 +0200 Subject: xfrm: add new packet offload flag
In the next patches, the xfrm core code will be extended to support a new type of offload - packet offload. In that mode, both policy and state should be specially configured in order to perform the whole offloaded data path. Full offload takes care of encryption, decryption, encapsulation and other operations with headers. As this mode is new for XFRM policy flow, we can "start fresh" with flag bits and release the first and second bits for future use.
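As a hedged sketch of the userspace side (the surrounding netlink plumbing is omitted and the device name is an arbitrary example), requesting the new mode through the existing XFRMA_OFFLOAD_DEV attribute would look roughly like:

    #include <net/if.h>
    #include <linux/xfrm.h>

    static void fill_offload_req(struct xfrm_user_offload *xuo, bool inbound)
    {
            xuo->ifindex = if_nametoindex("eth0"); /* arbitrary example */
            xuo->flags = XFRM_OFFLOAD_PACKET;      /* request packet offload */
            if (inbound)
                    xuo->flags |= XFRM_OFFLOAD_INBOUND;
            /* leaving XFRM_OFFLOAD_PACKET unset keeps crypto offload */
    }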
Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 7 +++++++ include/uapi/linux/xfrm.h | 6 ++++++ net/xfrm/xfrm_device.c | 3 +++ net/xfrm/xfrm_user.c | 2 ++ 4 files changed, 18 insertions(+) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e0cc6791c001..b39d24fa2ef0 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -131,12 +131,19 @@ enum { XFRM_DEV_OFFLOAD_OUT, }; +enum { + XFRM_DEV_OFFLOAD_UNSPECIFIED, + XFRM_DEV_OFFLOAD_CRYPTO, + XFRM_DEV_OFFLOAD_PACKET, +}; + struct xfrm_dev_offload { struct net_device *dev; netdevice_tracker dev_tracker; struct net_device *real_dev; unsigned long offload_handle; u8 dir : 2; + u8 type : 2; }; struct xfrm_mode { diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 4f84ea7ee14c..23543c33fee8 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -519,6 +519,12 @@ struct xfrm_user_offload { */ #define XFRM_OFFLOAD_IPV6 1 #define XFRM_OFFLOAD_INBOUND 2 +/* Two bits above are relevant for state path only, while + * offload is used for both policy and state flows. + * + * In policy offload mode, they are free and can be safely reused. + */ +#define XFRM_OFFLOAD_PACKET 4 struct xfrm_userpolicy_default { #define XFRM_USERPOLICY_UNSPEC 0 diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 21269e8f2db4..3b0c1ca8d4bb 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -291,12 +291,15 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, else xso->dir = XFRM_DEV_OFFLOAD_OUT; + xso->type = XFRM_DEV_OFFLOAD_CRYPTO; + err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { xso->dev = NULL; xso->dir = 0; xso->real_dev = NULL; netdev_put(dev, &xso->dev_tracker); + xso->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; if (err != -EOPNOTSUPP) { NL_SET_ERR_MSG(extack, "Device failed to offload this state"); diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0eb4696661c8..c3b8c1532718 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -956,6 +956,8 @@ static int copy_user_offload(struct xfrm_dev_offload *xso, struct sk_buff *skb) xuo->ifindex = xso->dev->ifindex; if (xso->dir == XFRM_DEV_OFFLOAD_IN) xuo->flags = XFRM_OFFLOAD_INBOUND; + if (xso->type == XFRM_DEV_OFFLOAD_PACKET) + xuo->flags |= XFRM_OFFLOAD_PACKET; return 0; }
-- cgit v1.2.3
From 919e43fad5163a8ceb39826ecdee897a9f799351 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:29 +0200 Subject: xfrm: add an interface to offload policy
Extend the netlink interface to add and delete XFRM policies on the device. This functionality is a first step towards implementing a packet IPsec offload solution.
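To sketch what this asks of drivers (all foo_* names are hypothetical; only the new policy hooks plus the pre-existing state hooks are shown), a driver advertising policy offload would fill in the new xfrmdev_ops callbacks:

    static const struct xfrmdev_ops foo_xfrmdev_ops = {
            .xdo_dev_state_add     = foo_xfrm_add_state,
            .xdo_dev_state_delete  = foo_xfrm_del_state,
            .xdo_dev_offload_ok    = foo_xfrm_offload_ok,
            .xdo_dev_policy_add    = foo_xfrm_add_policy,
            .xdo_dev_policy_delete = foo_xfrm_del_policy,
            .xdo_dev_policy_free   = foo_xfrm_free_policy,
    };

    /* in the driver probe path: netdev->xfrmdev_ops = &foo_xfrmdev_ops; */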
Signed-off-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 3 +++ include/net/xfrm.h | 45 +++++++++++++++++++++++++++++++ net/xfrm/xfrm_device.c | 67 ++++++++++++++++++++++++++++++++++++++++++++- net/xfrm/xfrm_policy.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_user.c | 18 +++++++++++++ 5 files changed, 201 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5aa35c58c342..4096e3fe8e4a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1040,6 +1040,9 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + int (*xdo_dev_policy_add) (struct xfrm_policy *x); + void (*xdo_dev_policy_delete) (struct xfrm_policy *x); + void (*xdo_dev_policy_free) (struct xfrm_policy *x); }; #endif diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b39d24fa2ef0..6fea34cbdf48 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -129,6 +129,7 @@ struct xfrm_state_walk { enum { XFRM_DEV_OFFLOAD_IN = 1, XFRM_DEV_OFFLOAD_OUT, + XFRM_DEV_OFFLOAD_FWD, }; enum { @@ -541,6 +542,8 @@ struct xfrm_policy { struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; struct hlist_node bydst_inexact_list; struct rcu_head rcu; + + struct xfrm_dev_offload xdo; }; static inline struct net *xp_net(const struct xfrm_policy *xp) @@ -1585,6 +1588,8 @@ struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid, bool sync); int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); +int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, + bool task_valid); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); @@ -1899,6 +1904,9 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo, struct netlink_ext_ack *extack); +int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); static inline void xfrm_dev_state_advance_esn(struct xfrm_state *x) @@ -1947,6 +1955,28 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) netdev_put(dev, &xso->dev_tracker); } } + +static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) +{ + struct xfrm_dev_offload *xdo = &x->xdo; + struct net_device *dev = xdo->dev; + + if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_policy_delete) + dev->xfrmdev_ops->xdo_dev_policy_delete(x); +} + +static inline void xfrm_dev_policy_free(struct xfrm_policy *x) +{ + struct xfrm_dev_offload *xdo = &x->xdo; + struct net_device *dev = xdo->dev; + + if (dev && dev->xfrmdev_ops) { + if (dev->xfrmdev_ops->xdo_dev_policy_free) + dev->xfrmdev_ops->xdo_dev_policy_free(x); + xdo->dev = NULL; + netdev_put(dev, &xdo->dev_tracker); + } +} #else static inline void xfrm_dev_resume(struct sk_buff *skb) { @@ -1974,6 +2004,21 @@ static inline void xfrm_dev_state_free(struct xfrm_state *x) { } +static inline int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + 
struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack) +{ + return 0; +} + +static inline void xfrm_dev_policy_delete(struct xfrm_policy *x) +{ +} + +static inline void xfrm_dev_policy_free(struct xfrm_policy *x) +{ +} + static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { return false; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 3184b2c394b6..04ae510dcc66 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -325,6 +325,69 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, } EXPORT_SYMBOL_GPL(xfrm_dev_state_add); +int xfrm_dev_policy_add(struct net *net, struct xfrm_policy *xp, + struct xfrm_user_offload *xuo, u8 dir, + struct netlink_ext_ack *extack) +{ + struct xfrm_dev_offload *xdo = &xp->xdo; + struct net_device *dev; + int err; + + if (!xuo->flags || xuo->flags & ~XFRM_OFFLOAD_PACKET) { + /* We support only packet offload mode and it means + * that user must set XFRM_OFFLOAD_PACKET bit. + */ + NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); + return -EINVAL; + } + + dev = dev_get_by_index(net, xuo->ifindex); + if (!dev) + return -EINVAL; + + if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_policy_add) { + xdo->dev = NULL; + dev_put(dev); + NL_SET_ERR_MSG(extack, "Policy offload is not supported"); + return -EINVAL; + } + + xdo->dev = dev; + netdev_tracker_alloc(dev, &xdo->dev_tracker, GFP_ATOMIC); + xdo->real_dev = dev; + xdo->type = XFRM_DEV_OFFLOAD_PACKET; + switch (dir) { + case XFRM_POLICY_IN: + xdo->dir = XFRM_DEV_OFFLOAD_IN; + break; + case XFRM_POLICY_OUT: + xdo->dir = XFRM_DEV_OFFLOAD_OUT; + break; + case XFRM_POLICY_FWD: + xdo->dir = XFRM_DEV_OFFLOAD_FWD; + break; + default: + xdo->dev = NULL; + dev_put(dev); + NL_SET_ERR_MSG(extack, "Unrecognized oflload direction"); + return -EINVAL; + } + + err = dev->xfrmdev_ops->xdo_dev_policy_add(xp); + if (err) { + xdo->dev = NULL; + xdo->real_dev = NULL; + xdo->type = XFRM_DEV_OFFLOAD_UNSPECIFIED; + xdo->dir = 0; + netdev_put(dev, &xdo->dev_tracker); + NL_SET_ERR_MSG(extack, "Device failed to offload this policy"); + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(xfrm_dev_policy_add); + bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { int mtu; @@ -427,8 +490,10 @@ static int xfrm_api_check(struct net_device *dev) static int xfrm_dev_down(struct net_device *dev) { - if (dev->features & NETIF_F_HW_ESP) + if (dev->features & NETIF_F_HW_ESP) { xfrm_dev_state_flush(dev_net(dev), dev, true); + xfrm_dev_policy_flush(dev_net(dev), dev, true); + } return NOTIFY_DONE; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 9b9e2765363d..8b8760907563 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -425,6 +425,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy) if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer)) BUG(); + xfrm_dev_policy_free(policy); call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); @@ -1769,12 +1770,41 @@ xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) } return err; } + +static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, + struct net_device *dev, + bool task_valid) +{ + struct xfrm_policy *pol; + int err = 0; + + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + if (pol->walk.dead || + xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || + pol->xdo.dev != dev) + continue; + + err = security_xfrm_policy_delete(pol->security); + if (err) 
{ + xfrm_audit_policy_delete(pol, 0, task_valid); + return err; + } + } + return err; +} #else static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { return 0; } + +static inline int xfrm_dev_policy_flush_secctx_check(struct net *net, + struct net_device *dev, + bool task_valid) +{ + return 0; +} #endif int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) @@ -1814,6 +1844,44 @@ out: } EXPORT_SYMBOL(xfrm_policy_flush); +int xfrm_dev_policy_flush(struct net *net, struct net_device *dev, + bool task_valid) +{ + int dir, err = 0, cnt = 0; + struct xfrm_policy *pol; + + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + + err = xfrm_dev_policy_flush_secctx_check(net, dev, task_valid); + if (err) + goto out; + +again: + list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { + dir = xfrm_policy_id2dir(pol->index); + if (pol->walk.dead || + dir >= XFRM_POLICY_MAX || + pol->xdo.dev != dev) + continue; + + __xfrm_policy_unlink(pol, dir); + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + cnt++; + xfrm_audit_policy_delete(pol, 1, task_valid); + xfrm_policy_kill(pol); + spin_lock_bh(&net->xfrm.xfrm_policy_lock); + goto again; + } + if (cnt) + __xfrm_policy_inexact_flush(net); + else + err = -ESRCH; +out: + spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + return err; +} +EXPORT_SYMBOL(xfrm_dev_policy_flush); + int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *data) @@ -2245,6 +2313,7 @@ int xfrm_policy_delete(struct xfrm_policy *pol, int dir) pol = __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { + xfrm_dev_policy_delete(pol); xfrm_policy_kill(pol); return 0; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index c3b8c1532718..cf5172d4ce68 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1892,6 +1892,15 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, if (attrs[XFRMA_IF_ID]) xp->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); + /* configure the hardware if offload is requested */ + if (attrs[XFRMA_OFFLOAD_DEV]) { + err = xfrm_dev_policy_add(net, xp, + nla_data(attrs[XFRMA_OFFLOAD_DEV]), + p->dir, extack); + if (err) + goto error; + } + return xp; error: *errp = err; @@ -1931,6 +1940,7 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, xfrm_audit_policy_add(xp, err ? 
0 : 1, true); if (err) { + xfrm_dev_policy_delete(xp); security_xfrm_policy_free(xp->security); kfree(xp); return err; @@ -2043,6 +2053,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3381,6 +3393,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3499,6 +3513,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) { nlmsg_cancel(skb, nlh); return err; @@ -3582,6 +3598,8 @@ static int xfrm_notify_policy(struct xfrm_policy *xp, int dir, const struct km_e err = xfrm_mark_put(skb, &xp->mark); if (!err) err = xfrm_if_id_put(skb, xp->if_id); + if (!err && xp->xdo.dev) + err = copy_user_offload(&xp->xdo, skb); if (err) goto out_free_skb;
-- cgit v1.2.3
From 5958372ddf628fe6f4c3e49425734ad32fcfb13c Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:31 +0200 Subject: xfrm: add RX datapath protection for IPsec packet offload mode
Traffic received by a device with IPsec packet offload enabled should be forwarded to the stack only after decryption, with packet headers and trailers removed. Such packets are expected to be seen as normal (non-XFRM) ones, while unsupported packets should be dropped by the HW.
Reviewed-by: Raed Salem Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/net/xfrm.h | 55 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 32 insertions(+), 23 deletions(-) (limited to 'include/net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6fea34cbdf48..b6ee14991a32 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1102,6 +1102,29 @@ xfrm_state_addr_cmp(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, un return !0; } +#ifdef CONFIG_XFRM +static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb) +{ + struct sec_path *sp = skb_sec_path(skb); + + return sp->xvec[sp->len - 1]; +} +#endif + +static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) +{ +#ifdef CONFIG_XFRM + struct sec_path *sp = skb_sec_path(skb); + + if (!sp || !sp->olen || sp->len != sp->olen) + return NULL; + + return &sp->ovec[sp->olen - 1]; +#else + return NULL; +#endif +} + #ifdef CONFIG_XFRM int __xfrm_policy_check(struct sock *, int dir, struct sk_buff *skb, unsigned short family); @@ -1133,10 +1156,19 @@ static inline int __xfrm_policy_check2(struct sock *sk, int dir, { struct net *net = dev_net(skb->dev); int ndir = dir | (reverse ?
XFRM_POLICY_MASK + 1 : 0); + struct xfrm_offload *xo = xfrm_offload(skb); + struct xfrm_state *x; if (sk && sk->sk_policy[XFRM_POLICY_IN]) return __xfrm_policy_check(sk, ndir, skb, family); + if (xo) { + x = xfrm_input_state(skb); + if (x->xso.type == XFRM_DEV_OFFLOAD_PACKET) + return (xo->flags & CRYPTO_DONE) && + (xo->status & CRYPTO_SUCCESS); + } + return __xfrm_check_nopolicy(net, skb, dir) || __xfrm_check_dev_nopolicy(skb, dir, family) || __xfrm_policy_check(sk, ndir, skb, family); @@ -1872,29 +1904,6 @@ static inline void xfrm_states_delete(struct xfrm_state **states, int n) } #endif -#ifdef CONFIG_XFRM -static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb) -{ - struct sec_path *sp = skb_sec_path(skb); - - return sp->xvec[sp->len - 1]; -} -#endif - -static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) -{ -#ifdef CONFIG_XFRM - struct sec_path *sp = skb_sec_path(skb); - - if (!sp || !sp->olen || sp->len != sp->olen) - return NULL; - - return &sp->ovec[sp->olen - 1]; -#else - return NULL; -#endif -} - void __init xfrm_dev_init(void); #ifdef CONFIG_XFRM_OFFLOAD
-- cgit v1.2.3
From f3da86dc2c8c9004445cfbb15ac086773622d853 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Fri, 2 Dec 2022 20:41:33 +0200 Subject: xfrm: add support to HW update soft and hard limits
In both RX and TX, traffic that performs the IPsec packet offload transformation is accounted by the HW. This is needed to properly handle hard limits that require dropping the packet. It means that the XFRM core needs to update its internal counters with the ones accounted by the HW, so new callbacks are introduced in this patch. When a soft or hard limit occurs, the driver should call xfrm_state_check_expire(), which will perform key rekeying exactly as done by the XFRM core.
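A hypothetical driver-side sketch (foo_hw_read_sa_counters() is made up) of the new callback: the driver folds the HW-accounted values into x->curlft so the core's existing expiry logic can act on them:

    static void foo_xdo_dev_state_update_curlft(struct xfrm_state *x)
    {
            u64 bytes, packets;

            foo_hw_read_sa_counters(x, &bytes, &packets);
            x->curlft.bytes = bytes;
            x->curlft.packets = packets;
    }

    /* and from the driver's soft/hard limit event handler:
     *         xfrm_state_check_expire(x);
     * which rekeys exactly as the core timer path would.
     */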
Signed-off-by: Leon Romanovsky Signed-off-by: Steffen Klassert --- include/linux/netdevice.h | 1 + include/net/xfrm.h | 17 +++++++++++++++ net/xfrm/xfrm_state.c | 4 ++++ 3 files changed, 22 insertions(+) (limited to 'include/net') diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 4096e3fe8e4a..29ae964e3b89 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1040,6 +1040,7 @@ struct xfrmdev_ops { bool (*xdo_dev_offload_ok) (struct sk_buff *skb, struct xfrm_state *x); void (*xdo_dev_state_advance_esn) (struct xfrm_state *x); + void (*xdo_dev_state_update_curlft) (struct xfrm_state *x); int (*xdo_dev_policy_add) (struct xfrm_policy *x); void (*xdo_dev_policy_delete) (struct xfrm_policy *x); void (*xdo_dev_policy_free) (struct xfrm_policy *x); }; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index b6ee14991a32..5413cdd5ad62 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1571,6 +1571,23 @@ struct xfrm_state *xfrm_stateonly_find(struct net *net, u32 mark, u32 if_id, struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, __be32 spi, unsigned short family); int xfrm_state_check_expire(struct xfrm_state *x); +#ifdef CONFIG_XFRM_OFFLOAD +static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) +{ + struct xfrm_dev_offload *xdo = &x->xso; + struct net_device *dev = xdo->dev; + + if (x->xso.type != XFRM_DEV_OFFLOAD_PACKET) + return; + + if (dev && dev->xfrmdev_ops && + dev->xfrmdev_ops->xdo_dev_state_update_curlft) + dev->xfrmdev_ops->xdo_dev_state_update_curlft(x); + +} +#else +static inline void xfrm_dev_state_update_curlft(struct xfrm_state *x) {} +#endif void xfrm_state_insert(struct xfrm_state *x); int xfrm_state_add(struct xfrm_state *x); int xfrm_state_update(struct xfrm_state *x); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 2a190e85da80..cc1d0ea42672 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -570,6 +570,8 @@ static enum hrtimer_restart xfrm_timer_handler(struct hrtimer *me) int err = 0; spin_lock(&x->lock); + xfrm_dev_state_update_curlft(x); + if (x->km.state == XFRM_STATE_DEAD) goto out; if (x->km.state == XFRM_STATE_EXPIRED) @@ -1936,6 +1938,8 @@ EXPORT_SYMBOL(xfrm_state_update); int xfrm_state_check_expire(struct xfrm_state *x) { + xfrm_dev_state_update_curlft(x); + if (!x->curlft.use_time) x->curlft.use_time = ktime_get_real_seconds();
-- cgit v1.2.3
From 8e4c2eee1e15c1206c26f6b28b05fe9711a427c6 Mon Sep 17 00:00:00 2001 From: Christian Schoenebeck Date: Tue, 22 Nov 2022 20:20:22 +0100 Subject: net/9p: distinguish zero-copy requests
Add a boolean `zc` member to struct p9_fcall to distinguish zero-copy messages (not using the linear `sdata` buffer for message payload) from regular messages (which do copy message payload to `sdata` before being further processed). This new member is appended to the end of the structure to avoid inserting huge padding in the generated layout.
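A minimal sketch of what the flag enables on the transport side (assuming a struct p9_req_t *req in scope, as in the existing transports):

    static bool p9_req_is_zc(const struct p9_req_t *req)
    {
            /* both directions are flagged together in the client code,
             * so checking the T-message is sufficient
             */
            return req->tc.zc;
    }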
Link: https://lkml.kernel.org/r/8f2a5c12a446c3b544da64e0b1550e1fb2d6f972.1669144861.git.linux_oss@crudebyte.com Signed-off-by: Christian Schoenebeck Tested-by: Stefano Stabellini Signed-off-by: Dominique Martinet --- include/net/9p/9p.h | 2 ++ net/9p/client.c | 6 ++++++ 2 files changed, 8 insertions(+) (limited to 'include/net') diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h index 13abe013af21..429adf6be29c 100644 --- a/include/net/9p/9p.h +++ b/include/net/9p/9p.h @@ -531,6 +531,7 @@ struct p9_rstatfs { * @offset: used by marshalling routines to track current position in buffer * @capacity: used by marshalling routines to track total malloc'd capacity * @sdata: payload + * @zc: whether zero-copy is used * * &p9_fcall represents the structure for all 9P RPC * transactions. Requests are packaged into fcalls, and reponses @@ -549,6 +550,7 @@ struct p9_fcall { struct kmem_cache *cache; u8 *sdata; + bool zc; }; int p9_errstr2errno(char *errstr, int len); diff --git a/net/9p/client.c b/net/9p/client.c index b554f8357f96..a2b4a965a5a9 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -685,6 +685,9 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) if (IS_ERR(req)) return req; + req->tc.zc = false; + req->rc.zc = false; + if (signal_pending(current)) { sigpending = 1; clear_thread_flag(TIF_SIGPENDING); @@ -783,6 +786,9 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, if (IS_ERR(req)) return req; + req->tc.zc = true; + req->rc.zc = true; + if (signal_pending(current)) { sigpending = 1; clear_thread_flag(TIF_SIGPENDING); -- cgit v1.2.3 From 94151f5aa9667c562281abeaaa5e89b9d5c17729 Mon Sep 17 00:00:00 2001 From: Eyal Birger Date: Sat, 3 Dec 2022 10:46:57 +0200 Subject: xfrm: interface: Add unstable helpers for setting/getting XFRM metadata from TC-BPF This change adds xfrm metadata helpers using the unstable kfunc call interface for the TC-BPF hooks. This allows steering traffic towards different IPsec connections based on logic implemented in bpf programs. This object is built based on the availability of BTF debug info. When setting the xfrm metadata, percpu metadata dsts are used in order to avoid allocating a metadata dst per packet. In order to guarantee safe module unload, the percpu dsts are allocated on first use and never freed. The percpu pointer is stored in net/core/filter.c so that it can be reused on module reload. The metadata percpu dsts take ownership of the original skb dsts so that they may be used as part of the xfrm transmission logic - e.g. for MTU calculations. 
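For example, a SCHED_CLS program could steer flows towards an IPsec connection roughly as below (a sketch only: the if_id value is arbitrary and the kfunc declaration is abbreviated rather than taken from vmlinux.h):

    #include <linux/bpf.h>
    #include <linux/pkt_cls.h>
    #include <bpf/bpf_helpers.h>

    struct bpf_xfrm_info {
            __u32 if_id;
            int link;
    };

    int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx,
                              const struct bpf_xfrm_info *from) __ksym;

    SEC("tc")
    int steer_to_ipsec(struct __sk_buff *skb)
    {
            struct bpf_xfrm_info info = {
                    .if_id = 42,    /* arbitrary xfrm interface if_id */
            };

            if (bpf_skb_set_xfrm_info(skb, &info))
                    return TC_ACT_SHOT;
            return TC_ACT_OK;       /* policy/state lookup uses if_id 42 */
    }

    char LICENSE[] SEC("license") = "GPL";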
Signed-off-by: Eyal Birger Link: https://lore.kernel.org/r/20221203084659.1837829-3-eyal.birger@gmail.com Signed-off-by: Martin KaFai Lau --- include/net/dst_metadata.h | 1 + include/net/xfrm.h | 17 ++++++ net/core/dst.c | 8 ++- net/core/filter.c | 9 ++++ net/xfrm/Makefile | 6 +++ net/xfrm/xfrm_interface_bpf.c | 115 +++++++++++++++++++++++++++++++++++++++++ net/xfrm/xfrm_interface_core.c | 14 +++++ 7 files changed, 168 insertions(+), 2 deletions(-) create mode 100644 net/xfrm/xfrm_interface_bpf.c (limited to 'include/net') diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h index a454cf4327fe..1b7fae4c6b24 100644 --- a/include/net/dst_metadata.h +++ b/include/net/dst_metadata.h @@ -26,6 +26,7 @@ struct macsec_info { struct xfrm_md_info { u32 if_id; int link; + struct dst_entry *dst_orig; }; struct metadata_dst { diff --git a/include/net/xfrm.h b/include/net/xfrm.h index e0cc6791c001..3707e6b34e67 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2086,4 +2086,21 @@ static inline bool xfrm6_local_dontfrag(const struct sock *sk) return false; } #endif + +#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +extern struct metadata_dst __percpu *xfrm_bpf_md_dst; + +int register_xfrm_interface_bpf(void); + +#else + +static inline int register_xfrm_interface_bpf(void) +{ + return 0; +} + +#endif + #endif /* _NET_XFRM_H */ diff --git a/net/core/dst.c b/net/core/dst.c index bc9c9be4e080..bb14a0392388 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -316,6 +316,8 @@ void metadata_dst_free(struct metadata_dst *md_dst) if (md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&md_dst->u.tun_info.dst_cache); #endif + if (md_dst->type == METADATA_XFRM) + dst_release(md_dst->u.xfrm_info.dst_orig); kfree(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free); @@ -340,16 +342,18 @@ EXPORT_SYMBOL_GPL(metadata_dst_alloc_percpu); void metadata_dst_free_percpu(struct metadata_dst __percpu *md_dst) { -#ifdef CONFIG_DST_CACHE int cpu; for_each_possible_cpu(cpu) { struct metadata_dst *one_md_dst = per_cpu_ptr(md_dst, cpu); +#ifdef CONFIG_DST_CACHE if (one_md_dst->type == METADATA_IP_TUNNEL) dst_cache_destroy(&one_md_dst->u.tun_info.dst_cache); - } #endif + if (one_md_dst->type == METADATA_XFRM) + dst_release(one_md_dst->u.xfrm_info.dst_orig); + } free_percpu(md_dst); } EXPORT_SYMBOL_GPL(metadata_dst_free_percpu); diff --git a/net/core/filter.c b/net/core/filter.c index 8607136b6e2c..929358677183 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5631,6 +5631,15 @@ static const struct bpf_func_proto bpf_bind_proto = { }; #ifdef CONFIG_XFRM + +#if (IS_BUILTIN(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF)) || \ + (IS_MODULE(CONFIG_XFRM_INTERFACE) && IS_ENABLED(CONFIG_DEBUG_INFO_BTF_MODULES)) + +struct metadata_dst __percpu *xfrm_bpf_md_dst; +EXPORT_SYMBOL_GPL(xfrm_bpf_md_dst); + +#endif + BPF_CALL_5(bpf_skb_get_xfrm_state, struct sk_buff *, skb, u32, index, struct bpf_xfrm_state *, to, u32, size, u64, flags) { diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index 08a2870fdd36..cd47f88921f5 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -5,6 +5,12 @@ xfrm_interface-$(CONFIG_XFRM_INTERFACE) += xfrm_interface_core.o +ifeq ($(CONFIG_XFRM_INTERFACE),m) +xfrm_interface-$(CONFIG_DEBUG_INFO_BTF_MODULES) += xfrm_interface_bpf.o +else ifeq ($(CONFIG_XFRM_INTERFACE),y) +xfrm_interface-$(CONFIG_DEBUG_INFO_BTF) += xfrm_interface_bpf.o +endif + 
obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ xfrm_input.o xfrm_output.o \ xfrm_sysctl.o xfrm_replay.o xfrm_device.o diff --git a/net/xfrm/xfrm_interface_bpf.c b/net/xfrm/xfrm_interface_bpf.c new file mode 100644 index 000000000000..1ef2162cebcf --- /dev/null +++ b/net/xfrm/xfrm_interface_bpf.c @@ -0,0 +1,115 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Unstable XFRM Helpers for TC-BPF hook + * + * These are called from SCHED_CLS BPF programs. Note that it is + * allowed to break compatibility for these functions since the interface they + * are exposed through to BPF programs is explicitly unstable. + */ + +#include +#include + +#include +#include + +/* bpf_xfrm_info - XFRM metadata information + * + * Members: + * @if_id - XFRM if_id: + * Transmit: if_id to be used in policy and state lookups + * Receive: if_id of the state matched for the incoming packet + * @link - Underlying device ifindex: + * Transmit: used as the underlying device in VRF routing + * Receive: the device on which the packet had been received + */ +struct bpf_xfrm_info { + u32 if_id; + int link; +}; + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in xfrm_interface BTF"); + +/* bpf_skb_get_xfrm_info - Get XFRM metadata + * + * Parameters: + * @skb_ctx - Pointer to ctx (__sk_buff) in TC program + * Cannot be NULL + * @to - Pointer to memory to which the metadata will be copied + * Cannot be NULL + */ +__used noinline +int bpf_skb_get_xfrm_info(struct __sk_buff *skb_ctx, struct bpf_xfrm_info *to) +{ + struct sk_buff *skb = (struct sk_buff *)skb_ctx; + struct xfrm_md_info *info; + + info = skb_xfrm_md_info(skb); + if (!info) + return -EINVAL; + + to->if_id = info->if_id; + to->link = info->link; + return 0; +} + +/* bpf_skb_get_xfrm_info - Set XFRM metadata + * + * Parameters: + * @skb_ctx - Pointer to ctx (__sk_buff) in TC program + * Cannot be NULL + * @from - Pointer to memory from which the metadata will be copied + * Cannot be NULL + */ +__used noinline +int bpf_skb_set_xfrm_info(struct __sk_buff *skb_ctx, + const struct bpf_xfrm_info *from) +{ + struct sk_buff *skb = (struct sk_buff *)skb_ctx; + struct metadata_dst *md_dst; + struct xfrm_md_info *info; + + if (unlikely(skb_metadata_dst(skb))) + return -EINVAL; + + if (!xfrm_bpf_md_dst) { + struct metadata_dst __percpu *tmp; + + tmp = metadata_dst_alloc_percpu(0, METADATA_XFRM, GFP_ATOMIC); + if (!tmp) + return -ENOMEM; + if (cmpxchg(&xfrm_bpf_md_dst, NULL, tmp)) + metadata_dst_free_percpu(tmp); + } + md_dst = this_cpu_ptr(xfrm_bpf_md_dst); + + info = &md_dst->u.xfrm_info; + + info->if_id = from->if_id; + info->link = from->link; + skb_dst_force(skb); + info->dst_orig = skb_dst(skb); + + dst_hold((struct dst_entry *)md_dst); + skb_dst_set(skb, (struct dst_entry *)md_dst); + return 0; +} + +__diag_pop() + +BTF_SET8_START(xfrm_ifc_kfunc_set) +BTF_ID_FLAGS(func, bpf_skb_get_xfrm_info) +BTF_ID_FLAGS(func, bpf_skb_set_xfrm_info) +BTF_SET8_END(xfrm_ifc_kfunc_set) + +static const struct btf_kfunc_id_set xfrm_interface_kfunc_set = { + .owner = THIS_MODULE, + .set = &xfrm_ifc_kfunc_set, +}; + +int __init register_xfrm_interface_bpf(void) +{ + return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, + &xfrm_interface_kfunc_set); +} diff --git a/net/xfrm/xfrm_interface_core.c b/net/xfrm/xfrm_interface_core.c index 5a67b120c4db..1f99dc469027 100644 --- a/net/xfrm/xfrm_interface_core.c +++ b/net/xfrm/xfrm_interface_core.c @@ -396,6 +396,14 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device 
*dev, struct flowi *fl) if_id = md_info->if_id; fl->flowi_oif = md_info->link; + if (md_info->dst_orig) { + struct dst_entry *tmp_dst = dst; + + dst = md_info->dst_orig; + skb_dst_set(skb, dst); + md_info->dst_orig = NULL; + dst_release(tmp_dst); + } } else { if_id = xi->p.if_id; } @@ -1162,12 +1170,18 @@ static int __init xfrmi_init(void) if (err < 0) goto rtnl_link_failed; + err = register_xfrm_interface_bpf(); + if (err < 0) + goto kfunc_failed; + lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); xfrm_if_register_cb(&xfrm_if_cb); return err; +kfunc_failed: + rtnl_link_unregister(&xfrmi_link_ops); rtnl_link_failed: xfrmi6_fini(); xfrmi6_failed:
-- cgit v1.2.3
From e9b4aeed56699b469206d05e706ddf2db95700a9 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 3 Dec 2022 17:51:04 +0100 Subject: net: xsk: Don't include <linux/rculist.h>
There is no need to include <linux/rculist.h> here. Prefer the less invasive <linux/types.h> which is needed for 'hlist_head'.
Signed-off-by: Christophe JAILLET Acked-by: John Fastabend Link: https://lore.kernel.org/r/88d6a1d88764cca328610854f890a9ca1f4b029e.1670086246.git.christophe.jaillet@wanadoo.fr Signed-off-by: Alexei Starovoitov --- include/net/netns/xdp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/netns/xdp.h b/include/net/netns/xdp.h index e5734261ba0a..21a4f25a187a 100644 --- a/include/net/netns/xdp.h +++ b/include/net/netns/xdp.h @@ -2,8 +2,8 @@ #ifndef __NETNS_XDP_H__ #define __NETNS_XDP_H__ -#include <linux/rculist.h> #include <linux/mutex.h> +#include <linux/types.h> struct netns_xdp { struct mutex lock;
-- cgit v1.2.3
From da65e9ff3bf614d2836e38e1d405c7073e6ba3b7 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 20:51:15 +0200 Subject: devlink: Expose port function commands to control RoCE
Expose port function commands to enable / disable RoCE; this is used to control the port's RoCE device capabilities. When RoCE is disabled for a function of the port, the function cannot create any RoCE-specific resources (e.g. GID table). It also saves system memory utilization. For example, disabling RoCE for a VF/SF saves 1 Mbyte of system memory per function. Example of a PCI VF port which supports function configuration: Set the RoCE capability of the VF's port function. $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 roce enable $ devlink port function set pci/0000:06:00.0/2 roce disable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 roce disable
Signed-off-by: Shay Drory Reviewed-by: Jiri Pirko Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-port.rst | 34 ++++++- include/net/devlink.h | 18 ++++ include/uapi/linux/devlink.h | 10 ++ net/core/devlink.c | 113 ++++++++++++++++++++++ 4 files changed, 174 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst index 2c637f4aae8e..c3302d23e480 100644 --- a/Documentation/networking/devlink/devlink-port.rst +++ b/Documentation/networking/devlink/devlink-port.rst @@ -110,7 +110,7 @@ devlink ports for both the controllers. Function configuration ====================== -A user can configure the function attribute before enumerating the PCI +Users can configure one or more function attributes before enumerating the PCI
Usually this means the user should configure the function attribute before a bus-specific device for the function is created. However, when SRIOV is enabled, virtual function devices are created on the PCI bus. @@ -122,6 +122,9 @@ A user may set the hardware address of the function using `devlink port function set hw_addr` command. For Ethernet port function this means a MAC address. +Users may also set the RoCE capability of the function using +`devlink port function set roce` command. + Function attributes =================== @@ -162,6 +165,35 @@ device created for the PCI VF/SF. function: hw_addr 00:00:00:00:88:88 +RoCE capability setup +--------------------- +Not all PCI VFs/SFs require RoCE capability. + +When RoCE capability is disabled, it saves system memory per PCI VF/SF. + +When a user disables the RoCE capability for a VF/SF, user applications cannot send or +receive any RoCE packets through this VF/SF, and the RoCE GID table for this PCI +function will be empty. + +When the RoCE capability is disabled in the device using the port function attribute, +the VF/SF driver cannot override it. + +- Get RoCE capability of the VF device:: + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 roce enable + +- Set RoCE capability of the VF device:: + + $ devlink port function set pci/0000:06:00.0/2 roce disable + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 roce disable + Subfunction ============ diff --git a/include/net/devlink.h b/include/net/devlink.h index 5f6eca5e4a40..ce4c65d2f2e7 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1451,6 +1451,24 @@ struct devlink_ops { int (*port_function_hw_addr_set)(struct devlink_port *port, const u8 *hw_addr, int hw_addr_len, struct netlink_ext_ack *extack); + /** + * @port_fn_roce_get: Port function's roce get function. + * + * Query RoCE state of a function managed by the devlink port. + * Return -EOPNOTSUPP if port function RoCE handling is not supported. + */ + int (*port_fn_roce_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + /** + * @port_fn_roce_set: Port function's roce set function. + * + * Enable/Disable the RoCE state of a function managed by the devlink + * port. + * Return -EOPNOTSUPP if port function RoCE handling is not supported.
+ */ + int (*port_fn_roce_set)(struct devlink_port *devlink_port, + bool enable, struct netlink_ext_ack *extack); /** * port_new() - Add a new port function of a specified flavor * @devlink: Devlink instance diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 70191d96af89..6cc2925bd478 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -658,11 +658,21 @@ enum devlink_resource_unit { DEVLINK_RESOURCE_UNIT_ENTRY, }; +enum devlink_port_fn_attr_cap { + DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT, + + /* Add new caps above */ + __DEVLINK_PORT_FN_ATTR_CAPS_MAX, +}; + +#define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT) + enum devlink_port_function_attr { DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR, /* binary */ DEVLINK_PORT_FN_ATTR_STATE, /* u8 */ DEVLINK_PORT_FN_ATTR_OPSTATE, /* u8 */ + DEVLINK_PORT_FN_ATTR_CAPS, /* bitfield32 */ __DEVLINK_PORT_FUNCTION_ATTR_MAX, DEVLINK_PORT_FUNCTION_ATTR_MAX = __DEVLINK_PORT_FUNCTION_ATTR_MAX - 1 diff --git a/net/core/devlink.c b/net/core/devlink.c index 035249c5dd17..8c0ad52431c5 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -195,11 +195,16 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwmsg); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_hwerr); EXPORT_TRACEPOINT_SYMBOL_GPL(devlink_trap_report); +#define DEVLINK_PORT_FN_CAPS_VALID_MASK \ + (_BITUL(__DEVLINK_PORT_FN_ATTR_CAPS_MAX) - 1) + static const struct nla_policy devlink_function_nl_policy[DEVLINK_PORT_FUNCTION_ATTR_MAX + 1] = { [DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] = { .type = NLA_BINARY }, [DEVLINK_PORT_FN_ATTR_STATE] = NLA_POLICY_RANGE(NLA_U8, DEVLINK_PORT_FN_STATE_INACTIVE, DEVLINK_PORT_FN_STATE_ACTIVE), + [DEVLINK_PORT_FN_ATTR_CAPS] = + NLA_POLICY_BITFIELD32(DEVLINK_PORT_FN_CAPS_VALID_MASK), }; static const struct nla_policy devlink_selftest_nl_policy[DEVLINK_ATTR_SELFTEST_ID_MAX + 1] = { @@ -680,6 +685,60 @@ devlink_sb_tc_index_get_from_attrs(struct devlink_sb *devlink_sb, return 0; } +static void devlink_port_fn_cap_fill(struct nla_bitfield32 *caps, + u32 cap, bool is_enable) +{ + caps->selector |= cap; + if (is_enable) + caps->value |= cap; +} + +static int devlink_port_fn_roce_fill(const struct devlink_ops *ops, + struct devlink_port *devlink_port, + struct nla_bitfield32 *caps, + struct netlink_ext_ack *extack) +{ + bool is_enable; + int err; + + if (!ops->port_fn_roce_get) + return 0; + + err = ops->port_fn_roce_get(devlink_port, &is_enable, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_ROCE, is_enable); + return 0; +} + +static int devlink_port_fn_caps_fill(const struct devlink_ops *ops, + struct devlink_port *devlink_port, + struct sk_buff *msg, + struct netlink_ext_ack *extack, + bool *msg_updated) +{ + struct nla_bitfield32 caps = {}; + int err; + + err = devlink_port_fn_roce_fill(ops, devlink_port, &caps, extack); + if (err) + return err; + + if (!caps.selector) + return 0; + err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, + caps.selector); + if (err) + return err; + + *msg_updated = true; + return 0; +} + static int devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, struct genl_info *info, @@ -1263,6 +1322,35 @@ static int devlink_port_fn_state_fill(const struct devlink_ops *ops, return 0; } +static int +devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, + struct netlink_ext_ack *extack) +{ + const struct devlink_ops *ops = devlink_port->devlink->ops; + 
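+ /* ops->port_fn_roce_set is guaranteed to be non-NULL here: requests + * carrying the RoCE capability bit are rejected earlier, in + * devlink_port_function_validate(), when the driver does not implement + * the callback. + */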
+ return ops->port_fn_roce_set(devlink_port, enable, extack); +} + +static int devlink_port_fn_caps_set(struct devlink_port *devlink_port, + const struct nlattr *attr, + struct netlink_ext_ack *extack) +{ + struct nla_bitfield32 caps; + u32 caps_value; + int err; + + caps = nla_get_bitfield32(attr); + caps_value = caps.value & caps.selector; + if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE) { + err = devlink_port_fn_roce_set(devlink_port, + caps_value & DEVLINK_PORT_FN_CAP_ROCE, + extack); + if (err) + return err; + } + return 0; +} + static int devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *port, struct netlink_ext_ack *extack) @@ -1281,6 +1369,10 @@ devlink_nl_port_function_attrs_put(struct sk_buff *msg, struct devlink_port *por &msg_updated); if (err) goto out; + err = devlink_port_fn_caps_fill(ops, port, msg, extack, + &msg_updated); + if (err) + goto out; err = devlink_port_fn_state_fill(ops, port, msg, extack, &msg_updated); out: if (err || !msg_updated) @@ -1653,6 +1745,7 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port, struct netlink_ext_ack *extack) { const struct devlink_ops *ops = devlink_port->devlink->ops; + struct nlattr *attr; if (tb[DEVLINK_PORT_FUNCTION_ATTR_HW_ADDR] && !ops->port_function_hw_addr_set) { @@ -1665,6 +1758,18 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port, "Function does not support state setting"); return -EOPNOTSUPP; } + attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; + if (attr) { + struct nla_bitfield32 caps; + + caps = nla_get_bitfield32(attr); + if (caps.selector & DEVLINK_PORT_FN_CAP_ROCE && + !ops->port_fn_roce_set) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Port doesn't support RoCE function attribute"); + return -EOPNOTSUPP; + } + } return 0; } @@ -1692,6 +1797,14 @@ static int devlink_port_function_set(struct devlink_port *port, if (err) return err; } + + attr = tb[DEVLINK_PORT_FN_ATTR_CAPS]; + if (attr) { + err = devlink_port_fn_caps_set(port, attr, extack); + if (err) + return err; + } + /* Keep this as the last function attribute set, so that when * multiple port function attributes are set along with state, * those can be applied first before activating the state. -- cgit v1.2.3 From a8ce7b26a51efc4d7753b23d639ae092878a6193 Mon Sep 17 00:00:00 2001 From: Shay Drory Date: Tue, 6 Dec 2022 20:51:18 +0200 Subject: devlink: Expose port function commands to control migratable Expose port function commands to enable / disable the migratable capability; this is used to set the port function as migratable. Live migration (LM) is the process of transferring a live virtual machine from one physical host to another without disrupting its normal operation. In order for a VM to be able to perform LM, all the VM components must be able to perform migration, i.e. be migratable. In order for a VF to be migratable, the VF must be bound to a VFIO driver with migration support. When the migratable capability is enabled for a function of the port, the device is making the necessary preparations for the function to be migratable, which might include disabling features which cannot be migrated. Example of LM with migratable function configuration: Set migratable of the VF's port function.
$ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable disable $ devlink port function set pci/0000:06:00.0/2 migratable enable $ devlink port show pci/0000:06:00.0/2 pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 function: hw_addr 00:00:00:00:00:00 migratable enable Bind VF to VFIO driver with migration support: $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/unbind $ echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.0/driver_override $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/bind Attach VF to the VM. Start the VM. Perform LM. Signed-off-by: Shay Drory Reviewed-by: Jiri Pirko Acked-by: Shannon Nelson Signed-off-by: Jakub Kicinski --- Documentation/networking/devlink/devlink-port.rst | 46 +++++++++++++++++++ include/net/devlink.h | 21 +++++++++ include/uapi/linux/devlink.h | 3 ++ net/core/devlink.c | 55 +++++++++++++++++++++++ 4 files changed, 125 insertions(+) (limited to 'include/net') diff --git a/Documentation/networking/devlink/devlink-port.rst b/Documentation/networking/devlink/devlink-port.rst index c3302d23e480..3da590953ce8 100644 --- a/Documentation/networking/devlink/devlink-port.rst +++ b/Documentation/networking/devlink/devlink-port.rst @@ -125,6 +125,9 @@ this means a MAC address. Users may also set the RoCE capability of the function using `devlink port function set roce` command. +Users may also set the function as migratable using +`devlink port function set migratable` command. + Function attributes =================== @@ -194,6 +197,49 @@ VF/SF driver cannot override it. function: hw_addr 00:00:00:00:00:00 roce disable +Migratable capability setup +--------------------------- +Live migration is the process of transferring a live virtual machine +from one physical host to another without disrupting its normal +operation. + +Users who want PCI VFs to be able to perform live migration need to +explicitly enable the VF migratable capability. + +When a user enables the migratable capability for a VF, and the hypervisor (HV) +binds the VF to a VFIO driver with migration support, the user can migrate the +VM with this VF from one HV to a different one. + +However, when the migratable capability is enabled, the device will disable +features which cannot be migrated. Thus the migratable capability can impose +limitations on a VF, so the decision is left to the user. + +Example of LM with migratable function configuration: +- Get migratable capability of the VF device:: + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 migratable disable + +- Set migratable capability of the VF device:: + + $ devlink port function set pci/0000:06:00.0/2 migratable enable + + $ devlink port show pci/0000:06:00.0/2 + pci/0000:06:00.0/2: type eth netdev enp6s0pf0vf1 flavour pcivf pfnum 0 vfnum 1 + function: + hw_addr 00:00:00:00:00:00 migratable enable + +- Bind VF to VFIO driver with migration support:: + + $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/unbind + $ echo mlx5_vfio_pci > /sys/bus/pci/devices/0000:08:00.0/driver_override + $ echo > /sys/bus/pci/devices/0000:08:00.0/driver/bind + +Attach VF to the VM. +Start the VM. +Perform live migration.
+ Subfunction ============ diff --git a/include/net/devlink.h b/include/net/devlink.h index ce4c65d2f2e7..0f376a28b9c4 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -1469,6 +1469,27 @@ struct devlink_ops { */ int (*port_fn_roce_set)(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack); + /** + * @port_fn_migratable_get: Port function's migratable get function. + * + * Query migratable state of a function managed by the devlink port. + * Return -EOPNOTSUPP if port function migratable handling is not + * supported. + */ + int (*port_fn_migratable_get)(struct devlink_port *devlink_port, + bool *is_enable, + struct netlink_ext_ack *extack); + /** + * @port_fn_migratable_set: Port function's migratable set function. + * + * Enable/Disable migratable state of a function managed by the devlink + * port. + * Return -EOPNOTSUPP if port function migratable handling is not + * supported. + */ + int (*port_fn_migratable_set)(struct devlink_port *devlink_port, + bool enable, + struct netlink_ext_ack *extack); /** * port_new() - Add a new port function of a specified flavor * @devlink: Devlink instance diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 6cc2925bd478..3782d4219ac9 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -660,12 +660,15 @@ enum devlink_resource_unit { enum devlink_port_fn_attr_cap { DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT, + DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT, /* Add new caps above */ __DEVLINK_PORT_FN_ATTR_CAPS_MAX, }; #define DEVLINK_PORT_FN_CAP_ROCE _BITUL(DEVLINK_PORT_FN_ATTR_CAP_ROCE_BIT) +#define DEVLINK_PORT_FN_CAP_MIGRATABLE \ + _BITUL(DEVLINK_PORT_FN_ATTR_CAP_MIGRATABLE_BIT) enum devlink_port_function_attr { DEVLINK_PORT_FUNCTION_ATTR_UNSPEC, diff --git a/net/core/devlink.c b/net/core/devlink.c index 8c0ad52431c5..ab40ebcb4aea 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -715,6 +715,29 @@ static int devlink_port_fn_roce_fill(const struct devlink_ops *ops, return 0; } +static int devlink_port_fn_migratable_fill(const struct devlink_ops *ops, + struct devlink_port *devlink_port, + struct nla_bitfield32 *caps, + struct netlink_ext_ack *extack) +{ + bool is_enable; + int err; + + if (!ops->port_fn_migratable_get || + devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) + return 0; + + err = ops->port_fn_migratable_get(devlink_port, &is_enable, extack); + if (err) { + if (err == -EOPNOTSUPP) + return 0; + return err; + } + + devlink_port_fn_cap_fill(caps, DEVLINK_PORT_FN_CAP_MIGRATABLE, is_enable); + return 0; +} + static int devlink_port_fn_caps_fill(const struct devlink_ops *ops, struct devlink_port *devlink_port, struct sk_buff *msg, @@ -728,6 +751,10 @@ static int devlink_port_fn_caps_fill(const struct devlink_ops *ops, if (err) return err; + err = devlink_port_fn_migratable_fill(ops, devlink_port, &caps, extack); + if (err) + return err; + if (!caps.selector) return 0; err = nla_put_bitfield32(msg, DEVLINK_PORT_FN_ATTR_CAPS, caps.value, @@ -1322,6 +1349,15 @@ static int devlink_port_fn_state_fill(const struct devlink_ops *ops, return 0; } +static int +devlink_port_fn_mig_set(struct devlink_port *devlink_port, bool enable, + struct netlink_ext_ack *extack) +{ + const struct devlink_ops *ops = devlink_port->devlink->ops; + + return ops->port_fn_migratable_set(devlink_port, enable, extack); +} + static int devlink_port_fn_roce_set(struct devlink_port *devlink_port, bool enable, struct netlink_ext_ack *extack) @@ -1348,6 +1384,13 @@ static int 
devlink_port_fn_caps_set(struct devlink_port *devlink_port, if (err) return err; } + if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { + err = devlink_port_fn_mig_set(devlink_port, caps_value & + DEVLINK_PORT_FN_CAP_MIGRATABLE, + extack); + if (err) + return err; + } return 0; } @@ -1769,6 +1812,18 @@ static int devlink_port_function_validate(struct devlink_port *devlink_port, "Port doesn't support RoCE function attribute"); return -EOPNOTSUPP; } + if (caps.selector & DEVLINK_PORT_FN_CAP_MIGRATABLE) { + if (!ops->port_fn_migratable_set) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "Port doesn't support migratable function attribute"); + return -EOPNOTSUPP; + } + if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PCI_VF) { + NL_SET_ERR_MSG_ATTR(extack, attr, + "migratable function attribute supported for VFs only"); + return -EOPNOTSUPP; + } + } } return 0; } -- cgit v1.2.3 From b534dc46c8ae0165b1b2509be24dbea4fa9c4011 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Wed, 7 Dec 2022 09:37:01 -0500 Subject: net_tstamp: add SOF_TIMESTAMPING_OPT_ID_TCP Add an option to initialize SOF_TIMESTAMPING_OPT_ID for TCP from write_seq sockets instead of snd_una. This should have been the behavior from the start. Because processes may now exist that rely on the established behavior, do not change behavior of the existing option, but add the right behavior with a new flag. It is encouraged to always set SOF_TIMESTAMPING_OPT_ID_TCP on stream sockets along with the existing SOF_TIMESTAMPING_OPT_ID. Intuitively the contract is that the counter is zero after the setsockopt, so that the next write N results in a notification for the last byte N - 1. On idle sockets snd_una == write_seq and this holds for both. But on sockets with data in transmission, snd_una records the unacked offset in the stream. This depends on the ACK response from the peer. A process cannot learn this in a race free manner (ioctl SIOCOUTQ is one racy approach). write_seq records the offset at the last byte written by the process. This is a better starting point. It matches the intuitive contract in all circumstances, unaffected by external behavior. The new timestamp flag necessitates increasing sk_tsflags to 32 bits. Move the field in struct sock to avoid growing the socket (for some common CONFIG variants). The UAPI interface so_timestamping.flags is already int, so 32 bits wide. Reported-by: Sotirios Delimanolis Signed-off-by: Willem de Bruijn Link: https://lore.kernel.org/r/20221207143701.29861-1-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski --- Documentation/networking/timestamping.rst | 32 ++++++++++++++++++++++++++++++- include/net/sock.h | 6 +++--- include/uapi/linux/net_tstamp.h | 3 ++- net/core/sock.c | 9 ++++++++- net/ethtool/common.c | 1 + 5 files changed, 45 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/timestamping.rst b/Documentation/networking/timestamping.rst index be4eb1242057..f17c01834a12 100644 --- a/Documentation/networking/timestamping.rst +++ b/Documentation/networking/timestamping.rst @@ -179,7 +179,8 @@ SOF_TIMESTAMPING_OPT_ID: identifier and returns that along with the timestamp. The identifier is derived from a per-socket u32 counter (that wraps). For datagram sockets, the counter increments with each sent packet. For stream - sockets, it increments with every byte. + sockets, it increments with every byte. For stream sockets, also set + SOF_TIMESTAMPING_OPT_ID_TCP, see the section below. The counter starts at zero. 
It is initialized the first time that the socket option is enabled. It is reset each time the option is @@ -192,6 +193,35 @@ SOF_TIMESTAMPING_OPT_ID: among all possibly concurrently outstanding timestamp requests for that socket. +SOF_TIMESTAMPING_OPT_ID_TCP: + Pass this modifier along with SOF_TIMESTAMPING_OPT_ID for new TCP + timestamping applications. SOF_TIMESTAMPING_OPT_ID defines how the + counter increments for stream sockets, but its starting point is + not entirely trivial. This option fixes that. + + For stream sockets, if SOF_TIMESTAMPING_OPT_ID is set, this should + always be set too. On datagram sockets the option has no effect. + + A reasonable expectation is that the counter is reset to zero with + the system call, so that a subsequent write() of N bytes generates + a timestamp with counter N-1. SOF_TIMESTAMPING_OPT_ID_TCP + implements this behavior under all conditions. + + SOF_TIMESTAMPING_OPT_ID without modifier often reports the same, + especially when the socket option is set when no data is in + transmission. If data is being transmitted, it may be off by the + length of the output queue (SIOCOUTQ). + + The difference is due to being based on snd_una versus write_seq. + snd_una is the offset in the stream acknowledged by the peer. This + depends on factors outside of process control, such as network RTT. + write_seq is the last byte written by the process. This offset is + not affected by external inputs. + + The difference is subtle and unlikely to be noticed when configured + at initial socket creation, when no data is queued or sent. But + SOF_TIMESTAMPING_OPT_ID_TCP behavior is more robust regardless of + when the socket option is set. SOF_TIMESTAMPING_OPT_CMSG: Support recv() cmsg for all timestamped packets. Control messages diff --git a/include/net/sock.h b/include/net/sock.h index 6d207e7c4ad0..ecea3dcc2217 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -503,10 +503,10 @@ struct sock { #if BITS_PER_LONG==32 seqlock_t sk_stamp_seq; #endif - u16 sk_tsflags; - u8 sk_shutdown; atomic_t sk_tskey; atomic_t sk_zckey; + u32 sk_tsflags; + u8 sk_shutdown; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, @@ -1899,7 +1899,7 @@ static inline void sock_replace_proto(struct sock *sk, struct proto *proto) struct sockcm_cookie { u64 transmit_time; u32 mark; - u16 tsflags; + u32 tsflags; }; static inline void sockcm_init(struct sockcm_cookie *sockc, diff --git a/include/uapi/linux/net_tstamp.h b/include/uapi/linux/net_tstamp.h index 55501e5e7ac8..a2c66b3d7f0f 100644 --- a/include/uapi/linux/net_tstamp.h +++ b/include/uapi/linux/net_tstamp.h @@ -31,8 +31,9 @@ enum { SOF_TIMESTAMPING_OPT_PKTINFO = (1<<13), SOF_TIMESTAMPING_OPT_TX_SWHW = (1<<14), SOF_TIMESTAMPING_BIND_PHC = (1 << 15), + SOF_TIMESTAMPING_OPT_ID_TCP = (1 << 16), - SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_BIND_PHC, + SOF_TIMESTAMPING_LAST = SOF_TIMESTAMPING_OPT_ID_TCP, SOF_TIMESTAMPING_MASK = (SOF_TIMESTAMPING_LAST - 1) | SOF_TIMESTAMPING_LAST }; diff --git a/net/core/sock.c b/net/core/sock.c index 4571914a4aa8..b0ab841e0aed 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -901,13 +901,20 @@ int sock_set_timestamping(struct sock *sk, int optname, if (val & ~SOF_TIMESTAMPING_MASK) return -EINVAL; + if (val & SOF_TIMESTAMPING_OPT_ID_TCP && + !(val & SOF_TIMESTAMPING_OPT_ID)) + return -EINVAL; + if (val & SOF_TIMESTAMPING_OPT_ID && !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { if (sk_is_tcp(sk)) { if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) return -EINVAL; - atomic_set(&sk->sk_tskey, 
tcp_sk(sk)->snd_una); + if (val & SOF_TIMESTAMPING_OPT_ID_TCP) + atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); + else + atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); } else { atomic_set(&sk->sk_tskey, 0); } diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 21cfe8557205..6f399afc2ff2 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -417,6 +417,7 @@ const char sof_timestamping_names[][ETH_GSTRING_LEN] = { [const_ilog2(SOF_TIMESTAMPING_OPT_PKTINFO)] = "option-pktinfo", [const_ilog2(SOF_TIMESTAMPING_OPT_TX_SWHW)] = "option-tx-swhw", [const_ilog2(SOF_TIMESTAMPING_BIND_PHC)] = "bind-phc", + [const_ilog2(SOF_TIMESTAMPING_OPT_ID_TCP)] = "option-id-tcp", }; static_assert(ARRAY_SIZE(sof_timestamping_names) == __SOF_TIMESTAMPING_CNT); -- cgit v1.2.3 From 2a7d228f1ae78b6eabef5f18bd1a8d2280555628 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 6 Dec 2022 10:55:10 -0300 Subject: net/sched: move struct action_ops definition out of ifdef The type definition should be visible even in configurations not using CONFIG_NET_CLS_ACT. Signed-off-by: Pedro Tammela Reviewed-by: Jamal Hadi Salim Reviewed-by: Victor Nogueira Signed-off-by: David S. Miller --- include/net/act_api.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/net') diff --git a/include/net/act_api.h b/include/net/act_api.h index c94ea1a306e0..2a6f443f0ef6 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -101,11 +101,6 @@ static inline enum flow_action_hw_stats tc_act_hw_stats(u8 hw_stats) return hw_stats; } -#ifdef CONFIG_NET_CLS_ACT - -#define ACT_P_CREATED 1 -#define ACT_P_DELETED 1 - typedef void (*tc_action_priv_destructor)(void *priv); struct tc_action_ops { @@ -140,6 +135,11 @@ struct tc_action_ops { struct netlink_ext_ack *extack); }; +#ifdef CONFIG_NET_CLS_ACT + +#define ACT_P_CREATED 1 +#define ACT_P_DELETED 1 + struct tc_action_net { struct tcf_idrinfo *idrinfo; const struct tc_action_ops *ops; -- cgit v1.2.3 From 7f0e810220e2d985338ecdd907c1598404db251d Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 6 Dec 2022 10:55:11 -0300 Subject: net/sched: add retpoline wrapper for tc On kernels using retpoline as a spectrev2 mitigation, optimize actions and filters that are compiled as built-ins into a direct call. On subsequent patches we expose the classifiers and actions functions and wire up the wrapper into tc. Signed-off-by: Pedro Tammela Reviewed-by: Jamal Hadi Salim Reviewed-by: Victor Nogueira Signed-off-by: David S. 
Miller --- include/net/tc_wrapper.h | 251 +++++++++++++++++++++++++++++++++++++++++++++++ net/sched/sch_api.c | 5 + 2 files changed, 256 insertions(+) create mode 100644 include/net/tc_wrapper.h (limited to 'include/net') diff --git a/include/net/tc_wrapper.h b/include/net/tc_wrapper.h new file mode 100644 index 000000000000..ceed2fc089ff --- /dev/null +++ b/include/net/tc_wrapper.h @@ -0,0 +1,251 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __NET_TC_WRAPPER_H +#define __NET_TC_WRAPPER_H + +#include + +#if IS_ENABLED(CONFIG_RETPOLINE) + +#include +#include +#include + +#define TC_INDIRECT_SCOPE + +extern struct static_key_false tc_skip_wrapper; + +/* TC Actions */ +#ifdef CONFIG_NET_CLS_ACT + +#define TC_INDIRECT_ACTION_DECLARE(fname) \ + INDIRECT_CALLABLE_DECLARE(int fname(struct sk_buff *skb, \ + const struct tc_action *a, \ + struct tcf_result *res)) + +TC_INDIRECT_ACTION_DECLARE(tcf_bpf_act); +TC_INDIRECT_ACTION_DECLARE(tcf_connmark_act); +TC_INDIRECT_ACTION_DECLARE(tcf_csum_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ct_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ctinfo_act); +TC_INDIRECT_ACTION_DECLARE(tcf_gact_act); +TC_INDIRECT_ACTION_DECLARE(tcf_gate_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ife_act); +TC_INDIRECT_ACTION_DECLARE(tcf_ipt_act); +TC_INDIRECT_ACTION_DECLARE(tcf_mirred_act); +TC_INDIRECT_ACTION_DECLARE(tcf_mpls_act); +TC_INDIRECT_ACTION_DECLARE(tcf_nat_act); +TC_INDIRECT_ACTION_DECLARE(tcf_pedit_act); +TC_INDIRECT_ACTION_DECLARE(tcf_police_act); +TC_INDIRECT_ACTION_DECLARE(tcf_sample_act); +TC_INDIRECT_ACTION_DECLARE(tcf_simp_act); +TC_INDIRECT_ACTION_DECLARE(tcf_skbedit_act); +TC_INDIRECT_ACTION_DECLARE(tcf_skbmod_act); +TC_INDIRECT_ACTION_DECLARE(tcf_vlan_act); +TC_INDIRECT_ACTION_DECLARE(tunnel_key_act); + +static inline int tc_act(struct sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + if (static_branch_likely(&tc_skip_wrapper)) + goto skip; + +#if IS_BUILTIN(CONFIG_NET_ACT_GACT) + if (a->ops->act == tcf_gact_act) + return tcf_gact_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_MIRRED) + if (a->ops->act == tcf_mirred_act) + return tcf_mirred_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_PEDIT) + if (a->ops->act == tcf_pedit_act) + return tcf_pedit_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SKBEDIT) + if (a->ops->act == tcf_skbedit_act) + return tcf_skbedit_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SKBMOD) + if (a->ops->act == tcf_skbmod_act) + return tcf_skbmod_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_POLICE) + if (a->ops->act == tcf_police_act) + return tcf_police_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_BPF) + if (a->ops->act == tcf_bpf_act) + return tcf_bpf_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CONNMARK) + if (a->ops->act == tcf_connmark_act) + return tcf_connmark_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CSUM) + if (a->ops->act == tcf_csum_act) + return tcf_csum_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CT) + if (a->ops->act == tcf_ct_act) + return tcf_ct_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_CTINFO) + if (a->ops->act == tcf_ctinfo_act) + return tcf_ctinfo_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_GATE) + if (a->ops->act == tcf_gate_act) + return tcf_gate_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_MPLS) + if (a->ops->act == tcf_mpls_act) + return tcf_mpls_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_NAT) + if (a->ops->act == tcf_nat_act) + return 
tcf_nat_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_TUNNEL_KEY) + if (a->ops->act == tunnel_key_act) + return tunnel_key_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_VLAN) + if (a->ops->act == tcf_vlan_act) + return tcf_vlan_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_IFE) + if (a->ops->act == tcf_ife_act) + return tcf_ife_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_IPT) + if (a->ops->act == tcf_ipt_act) + return tcf_ipt_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SIMP) + if (a->ops->act == tcf_simp_act) + return tcf_simp_act(skb, a, res); +#endif +#if IS_BUILTIN(CONFIG_NET_ACT_SAMPLE) + if (a->ops->act == tcf_sample_act) + return tcf_sample_act(skb, a, res); +#endif + +skip: + return a->ops->act(skb, a, res); +} + +#endif /* CONFIG_NET_CLS_ACT */ + +/* TC Filters */ +#ifdef CONFIG_NET_CLS + +#define TC_INDIRECT_FILTER_DECLARE(fname) \ + INDIRECT_CALLABLE_DECLARE(int fname(struct sk_buff *skb, \ + const struct tcf_proto *tp, \ + struct tcf_result *res)) + +TC_INDIRECT_FILTER_DECLARE(basic_classify); +TC_INDIRECT_FILTER_DECLARE(cls_bpf_classify); +TC_INDIRECT_FILTER_DECLARE(cls_cgroup_classify); +TC_INDIRECT_FILTER_DECLARE(fl_classify); +TC_INDIRECT_FILTER_DECLARE(flow_classify); +TC_INDIRECT_FILTER_DECLARE(fw_classify); +TC_INDIRECT_FILTER_DECLARE(mall_classify); +TC_INDIRECT_FILTER_DECLARE(route4_classify); +TC_INDIRECT_FILTER_DECLARE(rsvp_classify); +TC_INDIRECT_FILTER_DECLARE(rsvp6_classify); +TC_INDIRECT_FILTER_DECLARE(tcindex_classify); +TC_INDIRECT_FILTER_DECLARE(u32_classify); + +static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + if (static_branch_likely(&tc_skip_wrapper)) + goto skip; + +#if IS_BUILTIN(CONFIG_NET_CLS_BPF) + if (tp->classify == cls_bpf_classify) + return cls_bpf_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_U32) + if (tp->classify == u32_classify) + return u32_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_FLOWER) + if (tp->classify == fl_classify) + return fl_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_FW) + if (tp->classify == fw_classify) + return fw_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_MATCHALL) + if (tp->classify == mall_classify) + return mall_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_BASIC) + if (tp->classify == basic_classify) + return basic_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_CGROUP) + if (tp->classify == cls_cgroup_classify) + return cls_cgroup_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_FLOW) + if (tp->classify == flow_classify) + return flow_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_ROUTE4) + if (tp->classify == route4_classify) + return route4_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_RSVP) + if (tp->classify == rsvp_classify) + return rsvp_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_RSVP6) + if (tp->classify == rsvp6_classify) + return rsvp6_classify(skb, tp, res); +#endif +#if IS_BUILTIN(CONFIG_NET_CLS_TCINDEX) + if (tp->classify == tcindex_classify) + return tcindex_classify(skb, tp, res); +#endif + +skip: + return tp->classify(skb, tp, res); +} + +static inline void tc_wrapper_init(void) +{ +#ifdef CONFIG_X86 + if (!cpu_feature_enabled(X86_FEATURE_RETPOLINE)) + static_branch_enable(&tc_skip_wrapper); +#endif +} + +#endif /* CONFIG_NET_CLS */ + +#else + +#define TC_INDIRECT_SCOPE static + +static inline int tc_act(struct 
sk_buff *skb, const struct tc_action *a, + struct tcf_result *res) +{ + return a->ops->act(skb, a, res); +} + +static inline int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp, + struct tcf_result *res) +{ + return tp->classify(skb, tp, res); +} + +static inline void tc_wrapper_init(void) +{ +} + +#endif + +#endif /* __NET_TC_WRAPPER_H */ diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 4a27dfb1ba0f..2317db02c764 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -2273,6 +2274,8 @@ static struct pernet_operations psched_net_ops = { .exit = psched_net_exit, }; +DEFINE_STATIC_KEY_FALSE(tc_skip_wrapper); + static int __init pktsched_init(void) { int err; @@ -2300,6 +2303,8 @@ static int __init pktsched_init(void) rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, 0); + tc_wrapper_init(); + return 0; } -- cgit v1.2.3 From 5df7d714d8cbcce7642936cc0f6532f0c4c3d197 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:45:59 +0200 Subject: ipvs: add rcu protection to stats In preparation to using RCU locking for the list with estimators, make sure the struct ip_vs_stats are released after RCU grace period by using RCU callbacks. This affects ipvs->tot_stats where we can not use RCU callbacks for ipvs, so we use allocated struct ip_vs_stats_rcu. For services and dests we force RCU callbacks for all cases. Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 8 +++++- net/netfilter/ipvs/ip_vs_core.c | 10 +++++-- net/netfilter/ipvs/ip_vs_ctl.c | 64 +++++++++++++++++++++++++++-------------- 3 files changed, 57 insertions(+), 25 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index ff1804a0c469..bd8ae137e43b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -405,6 +405,11 @@ struct ip_vs_stats { struct ip_vs_kstats kstats0; /* reset values */ }; +struct ip_vs_stats_rcu { + struct ip_vs_stats s; + struct rcu_head rcu_head; +}; + struct dst_entry; struct iphdr; struct ip_vs_conn; @@ -688,6 +693,7 @@ struct ip_vs_dest { union nf_inet_addr vaddr; /* virtual IP address */ __u32 vfwmark; /* firewall mark of service */ + struct rcu_head rcu_head; struct list_head t_list; /* in dest_trash */ unsigned int in_rs_table:1; /* we are in rs_table */ }; @@ -869,7 +875,7 @@ struct netns_ipvs { atomic_t conn_count; /* connection counter */ /* ip_vs_ctl */ - struct ip_vs_stats tot_stats; /* Statistics & est. */ + struct ip_vs_stats_rcu *tot_stats; /* Statistics & est. 
*/ int num_services; /* no of virtual services */ int num_services6; /* IPv6 virtual services */ diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 51ad557a525b..fcdaef1fcccf 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -143,7 +143,7 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->cnt.inbytes += skb->len; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); s->cnt.inpkts++; s->cnt.inbytes += skb->len; @@ -179,7 +179,7 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s->cnt.outbytes += skb->len; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); s->cnt.outpkts++; s->cnt.outbytes += skb->len; @@ -208,7 +208,7 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) s->cnt.conns++; u64_stats_update_end(&s->syncp); - s = this_cpu_ptr(ipvs->tot_stats.cpustats); + s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); s->cnt.conns++; u64_stats_update_end(&s->syncp); @@ -2448,6 +2448,10 @@ static void __exit ip_vs_cleanup(void) ip_vs_conn_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); + /* common rcu_barrier() used by: + * - ip_vs_control_cleanup() + */ + rcu_barrier(); pr_info("ipvs unloaded.\n"); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 4d62059a6021..9016b641ae52 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -483,17 +483,14 @@ static void ip_vs_service_rcu_free(struct rcu_head *head) ip_vs_service_free(svc); } -static void __ip_vs_svc_put(struct ip_vs_service *svc, bool do_delay) +static void __ip_vs_svc_put(struct ip_vs_service *svc) { if (atomic_dec_and_test(&svc->refcnt)) { IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n", svc->fwmark, IP_VS_DBG_ADDR(svc->af, &svc->addr), ntohs(svc->port)); - if (do_delay) - call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); - else - ip_vs_service_free(svc); + call_rcu(&svc->rcu_head, ip_vs_service_rcu_free); } } @@ -780,14 +777,22 @@ out: return dest; } +static void ip_vs_dest_rcu_free(struct rcu_head *head) +{ + struct ip_vs_dest *dest; + + dest = container_of(head, struct ip_vs_dest, rcu_head); + free_percpu(dest->stats.cpustats); + ip_vs_dest_put_and_free(dest); +} + static void ip_vs_dest_free(struct ip_vs_dest *dest) { struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1); __ip_vs_dst_cache_reset(dest); - __ip_vs_svc_put(svc, false); - free_percpu(dest->stats.cpustats); - ip_vs_dest_put_and_free(dest); + __ip_vs_svc_put(svc); + call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free); } /* @@ -811,6 +816,16 @@ static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs) } } +static void ip_vs_stats_rcu_free(struct rcu_head *head) +{ + struct ip_vs_stats_rcu *rs = container_of(head, + struct ip_vs_stats_rcu, + rcu_head); + + free_percpu(rs->s.cpustats); + kfree(rs); +} + static void ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { @@ -923,7 +938,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, if (old_svc != svc) { ip_vs_zero_stats(&dest->stats); __ip_vs_bind_svc(dest, svc); - __ip_vs_svc_put(old_svc, true); + __ip_vs_svc_put(old_svc); } } @@ -1571,7 +1586,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) /* * Free the 
service if nobody refers to it */ - __ip_vs_svc_put(svc, true); + __ip_vs_svc_put(svc); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -1761,7 +1776,7 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs) } } - ip_vs_zero_stats(&ipvs->tot_stats); + ip_vs_zero_stats(&ipvs->tot_stats->s); return 0; } @@ -2255,7 +2270,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) seq_puts(seq, " Conns Packets Packets Bytes Bytes\n"); - ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); + ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", (unsigned long long)show.conns, (unsigned long long)show.inpkts, @@ -2279,7 +2294,7 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) { struct net *net = seq_file_single_net(seq); - struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats; + struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s; struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats; struct ip_vs_kstats kstats; int i; @@ -4107,7 +4122,6 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) kfree(tbl); return -ENOMEM; } - ip_vs_start_estimator(ipvs, &ipvs->tot_stats); ipvs->sysctl_tbl = tbl; /* Schedule defense work */ INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); @@ -4118,6 +4132,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, expire_nodest_conn_handler); + ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); return 0; } @@ -4129,7 +4144,7 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) cancel_delayed_work_sync(&ipvs->defense_work); cancel_work_sync(&ipvs->defense_work.work); unregister_net_sysctl_table(ipvs->sysctl_hdr); - ip_vs_stop_estimator(ipvs, &ipvs->tot_stats); + ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); @@ -4165,17 +4180,20 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) atomic_set(&ipvs->conn_out_counter, 0); /* procfs stats */ - ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!ipvs->tot_stats.cpustats) + ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); + if (!ipvs->tot_stats) return -ENOMEM; + ipvs->tot_stats->s.cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!ipvs->tot_stats->s.cpustats) + goto err_tot_stats; for_each_possible_cpu(i) { struct ip_vs_cpu_stats *ipvs_tot_stats; - ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats.cpustats, i); + ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats->s.cpustats, i); u64_stats_init(&ipvs_tot_stats->syncp); } - spin_lock_init(&ipvs->tot_stats.lock); + spin_lock_init(&ipvs->tot_stats->s.lock); #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, @@ -4207,7 +4225,10 @@ err_stats: err_vs: #endif - free_percpu(ipvs->tot_stats.cpustats); + free_percpu(ipvs->tot_stats->s.cpustats); + +err_tot_stats: + kfree(ipvs->tot_stats); return -ENOMEM; } @@ -4220,7 +4241,7 @@ void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); remove_proc_entry("ip_vs", ipvs->net->proc_net); #endif - free_percpu(ipvs->tot_stats.cpustats); + call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free); } int __init ip_vs_register_nl_ioctl(void) @@ -4280,5 +4301,6 @@ void ip_vs_control_cleanup(void) { EnterFunction(2); 
unregister_netdevice_notifier(&ip_vs_dst_notifier); + /* relying on common rcu_barrier() in ip_vs_cleanup() */ LeaveFunction(2); } -- cgit v1.2.3 From de39afb3d811ba2c028de8662adafedb4899327b Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:00 +0200 Subject: ipvs: use common functions for stats allocation Move alloc_percpu/free_percpu logic in new functions Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 5 +++ net/netfilter/ipvs/ip_vs_ctl.c | 96 ++++++++++++++++++++++++------------------ 2 files changed, 60 insertions(+), 41 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index bd8ae137e43b..e5582c01a4a3 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -410,6 +410,11 @@ struct ip_vs_stats_rcu { struct rcu_head rcu_head; }; +int ip_vs_stats_init_alloc(struct ip_vs_stats *s); +struct ip_vs_stats *ip_vs_stats_alloc(void); +void ip_vs_stats_release(struct ip_vs_stats *stats); +void ip_vs_stats_free(struct ip_vs_stats *stats); + struct dst_entry; struct iphdr; struct ip_vs_conn; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 9016b641ae52..ec6db864ac36 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -471,7 +471,7 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) static void ip_vs_service_free(struct ip_vs_service *svc) { - free_percpu(svc->stats.cpustats); + ip_vs_stats_release(&svc->stats); kfree(svc); } @@ -782,7 +782,7 @@ static void ip_vs_dest_rcu_free(struct rcu_head *head) struct ip_vs_dest *dest; dest = container_of(head, struct ip_vs_dest, rcu_head); - free_percpu(dest->stats.cpustats); + ip_vs_stats_release(&dest->stats); ip_vs_dest_put_and_free(dest); } @@ -822,7 +822,7 @@ static void ip_vs_stats_rcu_free(struct rcu_head *head) struct ip_vs_stats_rcu, rcu_head); - free_percpu(rs->s.cpustats); + ip_vs_stats_release(&rs->s); kfree(rs); } @@ -879,6 +879,47 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) spin_unlock_bh(&stats->lock); } +/* Allocate fields after kzalloc */ +int ip_vs_stats_init_alloc(struct ip_vs_stats *s) +{ + int i; + + spin_lock_init(&s->lock); + s->cpustats = alloc_percpu(struct ip_vs_cpu_stats); + if (!s->cpustats) + return -ENOMEM; + + for_each_possible_cpu(i) { + struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i); + + u64_stats_init(&cs->syncp); + } + return 0; +} + +struct ip_vs_stats *ip_vs_stats_alloc(void) +{ + struct ip_vs_stats *s = kzalloc(sizeof(*s), GFP_KERNEL); + + if (s && ip_vs_stats_init_alloc(s) >= 0) + return s; + kfree(s); + return NULL; +} + +void ip_vs_stats_release(struct ip_vs_stats *stats) +{ + free_percpu(stats->cpustats); +} + +void ip_vs_stats_free(struct ip_vs_stats *stats) +{ + if (stats) { + ip_vs_stats_release(stats); + kfree(stats); + } +} + /* * Update a destination in the given service */ @@ -978,14 +1019,13 @@ static int ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) { struct ip_vs_dest *dest; - unsigned int atype, i; + unsigned int atype; + int ret; EnterFunction(2); #ifdef CONFIG_IP_VS_IPV6 if (udest->af == AF_INET6) { - int ret; - atype = ipv6_addr_type(&udest->addr.in6); if ((!(atype & IPV6_ADDR_UNICAST) || atype & IPV6_ADDR_LINKLOCAL) && @@ -1007,16 +1047,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) if (dest == NULL) return -ENOMEM; - dest->stats.cpustats = 
alloc_percpu(struct ip_vs_cpu_stats); - if (!dest->stats.cpustats) + ret = ip_vs_stats_init_alloc(&dest->stats); + if (ret < 0) goto err_alloc; - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ip_vs_dest_stats; - ip_vs_dest_stats = per_cpu_ptr(dest->stats.cpustats, i); - u64_stats_init(&ip_vs_dest_stats->syncp); - } - dest->af = udest->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; @@ -1032,7 +1066,6 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); - spin_lock_init(&dest->stats.lock); __ip_vs_update_dest(svc, dest, udest, 1); LeaveFunction(2); @@ -1040,7 +1073,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) err_alloc: kfree(dest); - return -ENOMEM; + return ret; } @@ -1299,7 +1332,7 @@ static int ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, struct ip_vs_service **svc_p) { - int ret = 0, i; + int ret = 0; struct ip_vs_scheduler *sched = NULL; struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; @@ -1359,18 +1392,9 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ret = -ENOMEM; goto out_err; } - svc->stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!svc->stats.cpustats) { - ret = -ENOMEM; + ret = ip_vs_stats_init_alloc(&svc->stats); + if (ret < 0) goto out_err; - } - - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ip_vs_stats; - ip_vs_stats = per_cpu_ptr(svc->stats.cpustats, i); - u64_stats_init(&ip_vs_stats->syncp); - } - /* I'm the first user of the service */ atomic_set(&svc->refcnt, 0); @@ -1387,7 +1411,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, INIT_LIST_HEAD(&svc->destinations); spin_lock_init(&svc->sched_lock); - spin_lock_init(&svc->stats.lock); /* Bind the scheduler */ if (sched) { @@ -4166,7 +4189,7 @@ static struct notifier_block ip_vs_dst_notifier = { int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { - int i, idx; + int idx; /* Initialize rs_table */ for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++) @@ -4183,18 +4206,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); if (!ipvs->tot_stats) return -ENOMEM; - ipvs->tot_stats->s.cpustats = alloc_percpu(struct ip_vs_cpu_stats); - if (!ipvs->tot_stats->s.cpustats) + if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) goto err_tot_stats; - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *ipvs_tot_stats; - ipvs_tot_stats = per_cpu_ptr(ipvs->tot_stats->s.cpustats, i); - u64_stats_init(&ipvs_tot_stats->syncp); - } - - spin_lock_init(&ipvs->tot_stats->s.lock); - #ifdef CONFIG_PROC_FS if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter))) @@ -4225,7 +4239,7 @@ err_stats: err_vs: #endif - free_percpu(ipvs->tot_stats->s.cpustats); + ip_vs_stats_release(&ipvs->tot_stats->s); err_tot_stats: kfree(ipvs->tot_stats); -- cgit v1.2.3 From 1dbd8d9a82e3f26b9d063292d47ece673f48fce2 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:01 +0200 Subject: ipvs: use u64_stats_t for the per-cpu counters Use the provided u64_stats_t type to avoid load/store tearing. 
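For reference, the resulting access pattern (a minimal sketch distilled from the hunks below, with locals abbreviated) pairs the u64_stats_t accessors with the existing syncp sequence counter; on 64-bit hosts the syncp compiles away, while on 32-bit hosts it keeps readers from observing torn 64-bit values:

	struct ip_vs_cpu_stats *s = this_cpu_ptr(stats->cpustats);
	unsigned int start;
	u64 inpkts;

	/* writer side: mark the update window around the 64-bit counters */
	u64_stats_update_begin(&s->syncp);
	u64_stats_inc(&s->cnt.inpkts);
	u64_stats_add(&s->cnt.inbytes, skb->len);
	u64_stats_update_end(&s->syncp);

	/* reader side: lockless retry loop instead of blocking the writer */
	do {
		start = u64_stats_fetch_begin(&s->syncp);
		inpkts = u64_stats_read(&s->cnt.inpkts);
	} while (u64_stats_fetch_retry(&s->syncp, start));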
Fixes: 316580b69d0a ("u64_stats: provide u64_stats_t type") Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Tested-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 10 +++++----- net/netfilter/ipvs/ip_vs_core.c | 30 +++++++++++++++--------------- net/netfilter/ipvs/ip_vs_ctl.c | 10 +++++----- net/netfilter/ipvs/ip_vs_est.c | 20 ++++++++++---------- 4 files changed, 35 insertions(+), 35 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index e5582c01a4a3..a4d44138c2a8 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -351,11 +351,11 @@ struct ip_vs_seq { /* counters per cpu */ struct ip_vs_counters { - __u64 conns; /* connections scheduled */ - __u64 inpkts; /* incoming packets */ - __u64 outpkts; /* outgoing packets */ - __u64 inbytes; /* incoming bytes */ - __u64 outbytes; /* outgoing bytes */ + u64_stats_t conns; /* connections scheduled */ + u64_stats_t inpkts; /* incoming packets */ + u64_stats_t outpkts; /* outgoing packets */ + u64_stats_t inbytes; /* incoming bytes */ + u64_stats_t outbytes; /* outgoing bytes */ }; /* Stats per cpu */ struct ip_vs_cpu_stats { diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index fcdaef1fcccf..2fcc26507d69 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -132,21 +132,21 @@ ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.inpkts++; - s->cnt.inbytes += skb->len; + u64_stats_inc(&s->cnt.inpkts); + u64_stats_add(&s->cnt.inbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); @@ -168,21 +168,21 @@ ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) s = this_cpu_ptr(dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); svc = rcu_dereference(dest->svc); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.outpkts++; - s->cnt.outbytes += skb->len; + u64_stats_inc(&s->cnt.outpkts); + u64_stats_add(&s->cnt.outbytes, skb->len); u64_stats_update_end(&s->syncp); local_bh_enable(); @@ -200,17 +200,17 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) s = this_cpu_ptr(cp->dest->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); s = this_cpu_ptr(svc->stats.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); s = 
this_cpu_ptr(ipvs->tot_stats->s.cpustats); u64_stats_update_begin(&s->syncp); - s->cnt.conns++; + u64_stats_inc(&s->cnt.conns); u64_stats_update_end(&s->syncp); local_bh_enable(); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ec6db864ac36..5f9cc2e7ba71 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2335,11 +2335,11 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) do { start = u64_stats_fetch_begin(&u->syncp); - conns = u->cnt.conns; - inpkts = u->cnt.inpkts; - outpkts = u->cnt.outpkts; - inbytes = u->cnt.inbytes; - outbytes = u->cnt.outbytes; + conns = u64_stats_read(&u->cnt.conns); + inpkts = u64_stats_read(&u->cnt.inpkts); + outpkts = u64_stats_read(&u->cnt.outpkts); + inbytes = u64_stats_read(&u->cnt.inbytes); + outbytes = u64_stats_read(&u->cnt.outbytes); } while (u64_stats_fetch_retry(&u->syncp, start)); seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 9a1a7af6a186..f53150d82a92 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -67,11 +67,11 @@ static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, if (add) { do { start = u64_stats_fetch_begin(&s->syncp); - conns = s->cnt.conns; - inpkts = s->cnt.inpkts; - outpkts = s->cnt.outpkts; - inbytes = s->cnt.inbytes; - outbytes = s->cnt.outbytes; + conns = u64_stats_read(&s->cnt.conns); + inpkts = u64_stats_read(&s->cnt.inpkts); + outpkts = u64_stats_read(&s->cnt.outpkts); + inbytes = u64_stats_read(&s->cnt.inbytes); + outbytes = u64_stats_read(&s->cnt.outbytes); } while (u64_stats_fetch_retry(&s->syncp, start)); sum->conns += conns; sum->inpkts += inpkts; @@ -82,11 +82,11 @@ static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, add = true; do { start = u64_stats_fetch_begin(&s->syncp); - sum->conns = s->cnt.conns; - sum->inpkts = s->cnt.inpkts; - sum->outpkts = s->cnt.outpkts; - sum->inbytes = s->cnt.inbytes; - sum->outbytes = s->cnt.outbytes; + sum->conns = u64_stats_read(&s->cnt.conns); + sum->inpkts = u64_stats_read(&s->cnt.inpkts); + sum->outpkts = u64_stats_read(&s->cnt.outpkts); + sum->inbytes = u64_stats_read(&s->cnt.inbytes); + sum->outbytes = u64_stats_read(&s->cnt.outbytes); } while (u64_stats_fetch_retry(&s->syncp, start)); } } -- cgit v1.2.3 From 705dd34440812735ece298eb5bc153fde9544d42 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:02 +0200 Subject: ipvs: use kthreads for stats estimation Estimating all entries in single list in timer context by single CPU causes large latency with multiple IPVS rules as reported in [1], [2], [3]. Spread the estimator structures in multiple chains and use kthread(s) for the estimation. The chains are processed in multiple (50) timer ticks to ensure the 2-second interval between estimations with some accuracy. Every chain is processed under RCU lock. Every kthread works over its own data structure and all such contexts are attached to array. The contexts can be preserved while the kthread tasks are stopped or restarted. When estimators are removed, unused kthread contexts are released and the slots in array are left empty. First kthread determines parameters to use, eg. maximum number of estimators to process per kthread based on chain's length (chain_max), allowing sub-100us cond_resched rate and estimation taking up to 1/8 of the CPU capacity to avoid any problems if chain_max is not correctly calculated. 
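As a quick sanity check of the constants introduced below (worked example):

	IPVS_EST_TICK         = (2 * HZ) / IPVS_EST_NTICKS = 2s / 50 = a 40ms tick (for any HZ)
	IPVS_EST_CHAIN_FACTOR = ALIGN_DOWN(2 * 1000 * 10 / 8 / 50, 8) = ALIGN_DOWN(50, 8) = 48

so at most 48 chain loads of ~100us each, i.e. 4.8ms (12%) of every 40ms tick, are budgeted for estimation, matching the "48=4.8ms of 40ms tick (12% CPU usage)" note in the header.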
chain_max is calculated taking into account factors such as CPU speed and memory/cache speed where the cache_factor (4) is selected from real tests with current generation of CPU/NUMA configurations to correct the difference in CPU usage between cached (during calc phase) and non-cached (working) state of the estimated per-cpu data. First kthread also plays the role of distributor of added estimators to all kthreads, keeping low the time to add estimators. The optimization is based on the fact that newly added estimator should be estimated after 2 seconds, so we have the time to offload the adding to chain from controlling process to kthread 0. The allocated kthread context may grow from 1 to 50 allocated structures for timer ticks which saves memory for setups with small number of estimators. We also add delayed work est_reload_work that will make sure the kthread tasks are properly started/stopped. ip_vs_start_estimator() is changed to report errors which allows to safely store the estimators in allocated structures. Many thanks to Jiri Wiesner for his valuable comments and for spending a lot of time reviewing and testing the changes on different platforms with 48-256 CPUs and 1-8 NUMA nodes under different cpufreq governors. [1] Report from Yunhong Jiang: https://lore.kernel.org/netdev/D25792C1-1B89-45DE-9F10-EC350DC04ADC@gmail.com/ [2] https://marc.info/?l=linux-virtual-server&m=159679809118027&w=2 [3] Report from Dust: https://archive.linuxvirtualserver.org/html/lvs-devel/2020-12/msg00000.html Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Tested-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- include/net/ip_vs.h | 88 ++++- net/netfilter/ipvs/ip_vs_ctl.c | 126 ++++-- net/netfilter/ipvs/ip_vs_est.c | 876 +++++++++++++++++++++++++++++++++++++---- 3 files changed, 990 insertions(+), 100 deletions(-) (limited to 'include/net') diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index a4d44138c2a8..04960dc6228f 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -42,6 +42,8 @@ static inline struct netns_ipvs *net_ipvs(struct net* net) /* Connections' size value needed by ip_vs_ctl.c */ extern int ip_vs_conn_tab_size; +extern struct mutex __ip_vs_mutex; + struct ip_vs_iphdr { int hdr_flags; /* ipvs flags */ __u32 off; /* Where IP or IPv4 header starts */ @@ -365,7 +367,7 @@ struct ip_vs_cpu_stats { /* IPVS statistics objects */ struct ip_vs_estimator { - struct list_head list; + struct hlist_node list; u64 last_inbytes; u64 last_outbytes; @@ -378,6 +380,10 @@ struct ip_vs_estimator { u64 outpps; u64 inbps; u64 outbps; + + s32 ktid:16, /* kthread ID, -1=temp list */ + ktrow:8, /* row/tick ID for kthread */ + ktcid:8; /* chain ID for kthread tick */ }; /* @@ -415,6 +421,66 @@ struct ip_vs_stats *ip_vs_stats_alloc(void); void ip_vs_stats_release(struct ip_vs_stats *stats); void ip_vs_stats_free(struct ip_vs_stats *stats); +/* Process estimators in multiple timer ticks (20/50/100, see ktrow) */ +#define IPVS_EST_NTICKS 50 +/* Estimation uses a 2-second period containing ticks (in jiffies) */ +#define IPVS_EST_TICK ((2 * HZ) / IPVS_EST_NTICKS) + +/* Limit of CPU load per kthread (8 for 12.5%), ratio of CPU capacity (1/C). + * Value of 4 and above ensures kthreads will take work without exceeding + * the CPU capacity under different circumstances. 
+ */ +#define IPVS_EST_LOAD_DIVISOR 8 + +/* Kthreads should not have work that exceeds the CPU load above 50% */ +#define IPVS_EST_CPU_KTHREADS (IPVS_EST_LOAD_DIVISOR / 2) + +/* Desired number of chains per timer tick (chain load factor in 100us units), + * 48=4.8ms of 40ms tick (12% CPU usage): + * 2 sec * 1000 ms in sec * 10 (100us in ms) / 8 (12.5%) / 50 + */ +#define IPVS_EST_CHAIN_FACTOR \ + ALIGN_DOWN(2 * 1000 * 10 / IPVS_EST_LOAD_DIVISOR / IPVS_EST_NTICKS, 8) + +/* Compiled number of chains per tick + * The defines should match cond_resched_rcu + */ +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) +#define IPVS_EST_TICK_CHAINS IPVS_EST_CHAIN_FACTOR +#else +#define IPVS_EST_TICK_CHAINS 1 +#endif + +#if IPVS_EST_NTICKS > 127 +#error Too many timer ticks for ktrow +#endif + +/* Multiple chains processed in same tick */ +struct ip_vs_est_tick_data { + struct hlist_head chains[IPVS_EST_TICK_CHAINS]; + DECLARE_BITMAP(present, IPVS_EST_TICK_CHAINS); + DECLARE_BITMAP(full, IPVS_EST_TICK_CHAINS); + int chain_len[IPVS_EST_TICK_CHAINS]; +}; + +/* Context for estimation kthread */ +struct ip_vs_est_kt_data { + struct netns_ipvs *ipvs; + struct task_struct *task; /* task if running */ + struct ip_vs_est_tick_data __rcu *ticks[IPVS_EST_NTICKS]; + DECLARE_BITMAP(avail, IPVS_EST_NTICKS); /* tick has space for ests */ + unsigned long est_timer; /* estimation timer (jiffies) */ + struct ip_vs_stats *calc_stats; /* Used for calculation */ + int tick_len[IPVS_EST_NTICKS]; /* est count */ + int id; /* ktid per netns */ + int chain_max; /* max ests per tick chain */ + int tick_max; /* max ests per tick */ + int est_count; /* attached ests to kthread */ + int est_max_count; /* max ests per kthread */ + int add_row; /* row for new ests */ + int est_row; /* estimated row */ +}; + struct dst_entry; struct iphdr; struct ip_vs_conn; @@ -953,9 +1019,17 @@ struct netns_ipvs { struct ctl_table_header *lblcr_ctl_header; struct ctl_table *lblcr_ctl_table; /* ip_vs_est */ - struct list_head est_list; /* estimator list */ - spinlock_t est_lock; - struct timer_list est_timer; /* Estimation timer */ + struct delayed_work est_reload_work;/* Reload kthread tasks */ + struct mutex est_mutex; /* protect kthread tasks */ + struct hlist_head est_temp_list; /* Ests during calc phase */ + struct ip_vs_est_kt_data **est_kt_arr; /* Array of kthread data ptrs */ + unsigned long est_max_threads;/* Hard limit of kthreads */ + int est_calc_phase; /* Calculation phase */ + int est_chain_max; /* Calculated chain_max */ + int est_kt_count; /* Allocated ptrs */ + int est_add_ktid; /* ktid where to add ests */ + atomic_t est_genid; /* kthreads reload genid */ + atomic_t est_genid_done; /* applied genid */ /* ip_vs_sync */ spinlock_t sync_lock; struct ipvs_master_sync_state *ms; @@ -1486,10 +1560,14 @@ int stop_sync_thread(struct netns_ipvs *ipvs, int state); void ip_vs_sync_conn(struct netns_ipvs *ipvs, struct ip_vs_conn *cp, int pkts); /* IPVS rate estimator prototypes (from ip_vs_est.c) */ -void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); +int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats); void ip_vs_zero_estimator(struct ip_vs_stats *stats); void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats); +void ip_vs_est_reload_start(struct netns_ipvs *ipvs); +int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd); +void 
ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5f9cc2e7ba71..c41a5392edc9 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -49,8 +49,7 @@ MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME); -/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ -static DEFINE_MUTEX(__ip_vs_mutex); +DEFINE_MUTEX(__ip_vs_mutex); /* Serialize configuration with sockopt/netlink */ /* sysctl variables */ @@ -241,6 +240,47 @@ static void defense_work_handler(struct work_struct *work) } #endif +static void est_reload_work_handler(struct work_struct *work) +{ + struct netns_ipvs *ipvs = + container_of(work, struct netns_ipvs, est_reload_work.work); + int genid_done = atomic_read(&ipvs->est_genid_done); + unsigned long delay = HZ / 10; /* repeat startups after failure */ + bool repeat = false; + int genid; + int id; + + mutex_lock(&ipvs->est_mutex); + genid = atomic_read(&ipvs->est_genid); + for (id = 0; id < ipvs->est_kt_count; id++) { + struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id]; + + /* netns clean up started, abort delayed work */ + if (!ipvs->enable) + goto unlock; + if (!kd) + continue; + /* New config ? Stop kthread tasks */ + if (genid != genid_done) + ip_vs_est_kthread_stop(kd); + if (!kd->task) { + /* Do not start kthreads above 0 in calc phase */ + if ((!id || !ipvs->est_calc_phase) && + ip_vs_est_kthread_start(ipvs, kd) < 0) + repeat = true; + } + } + + atomic_set(&ipvs->est_genid_done, genid); + + if (repeat) + queue_delayed_work(system_long_wq, &ipvs->est_reload_work, + delay); + +unlock: + mutex_unlock(&ipvs->est_mutex); +} + int ip_vs_use_count_inc(void) { @@ -831,7 +871,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) { #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c - spin_lock_bh(&src->lock); + spin_lock(&src->lock); IP_VS_SHOW_STATS_COUNTER(conns); IP_VS_SHOW_STATS_COUNTER(inpkts); @@ -841,7 +881,7 @@ ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src) ip_vs_read_estimator(dst, src); - spin_unlock_bh(&src->lock); + spin_unlock(&src->lock); } static void @@ -862,7 +902,7 @@ ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src) static void ip_vs_zero_stats(struct ip_vs_stats *stats) { - spin_lock_bh(&stats->lock); + spin_lock(&stats->lock); /* get current counters as zero point, rates are zeroed */ @@ -876,7 +916,7 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) ip_vs_zero_estimator(stats); - spin_unlock_bh(&stats->lock); + spin_unlock(&stats->lock); } /* Allocate fields after kzalloc */ @@ -998,7 +1038,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, spin_unlock_bh(&dest->dst_lock); if (add) { - ip_vs_start_estimator(svc->ipvs, &dest->stats); list_add_rcu(&dest->n_list, &svc->destinations); svc->num_dests++; sched = rcu_dereference_protected(svc->scheduler, 1); @@ -1051,6 +1090,10 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) if (ret < 0) goto err_alloc; + ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + if (ret < 0) + goto err_stats; + dest->af = udest->af; dest->protocol = svc->protocol; dest->vaddr = svc->addr; @@ -1071,6 +1114,9 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) LeaveFunction(2); return 0; +err_stats: + 
ip_vs_stats_release(&dest->stats); + err_alloc: kfree(dest); return ret; @@ -1135,14 +1181,18 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); + ret = ip_vs_start_estimator(svc->ipvs, &dest->stats); + if (ret < 0) + goto err; __ip_vs_update_dest(svc, dest, udest, 1); - ret = 0; } else { /* * Allocate and initialize the dest structure */ ret = ip_vs_new_dest(svc, udest); } + +err: LeaveFunction(2); return ret; @@ -1420,6 +1470,10 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, sched = NULL; } + ret = ip_vs_start_estimator(ipvs, &svc->stats); + if (ret < 0) + goto out_err; + /* Bind the ct retriever */ RCU_INIT_POINTER(svc->pe, pe); pe = NULL; @@ -1432,8 +1486,6 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, if (svc->pe && svc->pe->conn_out) atomic_inc(&ipvs->conn_out_counter); - ip_vs_start_estimator(ipvs, &svc->stats); - /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) ipvs->num_services++; @@ -1444,8 +1496,15 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ip_vs_svc_hash(svc); *svc_p = svc; - /* Now there is a service - full throttle */ - ipvs->enable = 1; + + if (!ipvs->enable) { + /* Now there is a service - full throttle */ + ipvs->enable = 1; + + /* Start estimation for first time */ + ip_vs_est_reload_start(ipvs); + } + return 0; @@ -4065,13 +4124,16 @@ static void ip_vs_genl_unregister(void) static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { struct net *net = ipvs->net; - int idx; struct ctl_table *tbl; + int idx, ret; atomic_set(&ipvs->dropentry, 0); spin_lock_init(&ipvs->dropentry_lock); spin_lock_init(&ipvs->droppacket_lock); spin_lock_init(&ipvs->securetcp_lock); + INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); + INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, + expire_nodest_conn_handler); if (!net_eq(net, &init_net)) { tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); @@ -4139,24 +4201,27 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].mode = 0444; #endif + ret = -ENOMEM; ipvs->sysctl_hdr = register_net_sysctl(net, "net/ipv4/vs", tbl); - if (ipvs->sysctl_hdr == NULL) { - if (!net_eq(net, &init_net)) - kfree(tbl); - return -ENOMEM; - } + if (!ipvs->sysctl_hdr) + goto err; ipvs->sysctl_tbl = tbl; + + ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); + if (ret < 0) + goto err; + /* Schedule defense work */ - INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); queue_delayed_work(system_long_wq, &ipvs->defense_work, DEFENSE_TIMER_PERIOD); - /* Init delayed work for expiring no dest conn */ - INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, - expire_nodest_conn_handler); - - ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s); return 0; + +err: + unregister_net_sysctl_table(ipvs->sysctl_hdr); + if (!net_eq(net, &init_net)) + kfree(tbl); + return ret; } static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) @@ -4189,6 +4254,7 @@ static struct notifier_block ip_vs_dst_notifier = { int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) { + int ret = -ENOMEM; int idx; /* Initialize rs_table */ @@ -4202,10 +4268,12 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) atomic_set(&ipvs->nullsvc_counter, 0); atomic_set(&ipvs->conn_out_counter, 0); + INIT_DELAYED_WORK(&ipvs->est_reload_work, 
est_reload_work_handler); + /* procfs stats */ ipvs->tot_stats = kzalloc(sizeof(*ipvs->tot_stats), GFP_KERNEL); if (!ipvs->tot_stats) - return -ENOMEM; + goto out; if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0) goto err_tot_stats; @@ -4222,7 +4290,8 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) goto err_percpu; #endif - if (ip_vs_control_net_init_sysctl(ipvs)) + ret = ip_vs_control_net_init_sysctl(ipvs); + if (ret < 0) goto err; return 0; @@ -4243,13 +4312,16 @@ err_vs: err_tot_stats: kfree(ipvs->tot_stats); - return -ENOMEM; + +out: + return ret; } void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs) { ip_vs_trash_cleanup(ipvs); ip_vs_control_net_cleanup_sysctl(ipvs); + cancel_delayed_work_sync(&ipvs->est_reload_work); #ifdef CONFIG_PROC_FS remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net); remove_proc_entry("ip_vs_stats", ipvs->net->proc_net); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index f53150d82a92..2fb6c097437c 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -30,9 +30,6 @@ long interval, it is easy to implement a user level daemon which periodically reads those statistical counters and measure rate. - Currently, the measurement is activated by slow timer handler. Hope - this measurement will not introduce too much load. - We measure rate during the last 8 seconds every 2 seconds: avgrate = avgrate*(1-W) + rate*W @@ -47,68 +44,76 @@ to 32-bit values for conns, packets, bps, cps and pps. * A lot of code is taken from net/core/gen_estimator.c - */ - -/* - * Make a summary from each cpu + KEY POINTS: + - cpustats counters are updated per-cpu in SoftIRQ context with BH disabled + - kthreads read the cpustats to update the estimators (svcs, dests, total) + - the states of estimators can be read (get stats) or modified (zero stats) + from processes + + KTHREADS: + - estimators are added initially to est_temp_list and later kthread 0 + distributes them to one or many kthreads for estimation + - kthread contexts are created and attached to array + - the kthread tasks are started when first service is added, before that + the total stats are not estimated + - the kthread context holds lists with estimators (chains) which are + processed every 2 seconds + - as estimators can be added dynamically and in bursts, we try to spread + them to multiple chains which are estimated at different time + - on start, kthread 0 enters calculation phase to determine the chain limits + and the limit of estimators per kthread + - est_add_ktid: ktid where to add new ests, can point to empty slot where + we should add kt data */ -static void ip_vs_read_cpu_stats(struct ip_vs_kstats *sum, - struct ip_vs_cpu_stats __percpu *stats) -{ - int i; - bool add = false; - - for_each_possible_cpu(i) { - struct ip_vs_cpu_stats *s = per_cpu_ptr(stats, i); - unsigned int start; - u64 conns, inpkts, outpkts, inbytes, outbytes; - if (add) { - do { - start = u64_stats_fetch_begin(&s->syncp); - conns = u64_stats_read(&s->cnt.conns); - inpkts = u64_stats_read(&s->cnt.inpkts); - outpkts = u64_stats_read(&s->cnt.outpkts); - inbytes = u64_stats_read(&s->cnt.inbytes); - outbytes = u64_stats_read(&s->cnt.outbytes); - } while (u64_stats_fetch_retry(&s->syncp, start)); - sum->conns += conns; - sum->inpkts += inpkts; - sum->outpkts += outpkts; - sum->inbytes += inbytes; - sum->outbytes += outbytes; - } else { - add = true; - do { - start = u64_stats_fetch_begin(&s->syncp); - sum->conns = u64_stats_read(&s->cnt.conns); 
- sum->inpkts = u64_stats_read(&s->cnt.inpkts); - sum->outpkts = u64_stats_read(&s->cnt.outpkts); - sum->inbytes = u64_stats_read(&s->cnt.inbytes); - sum->outbytes = u64_stats_read(&s->cnt.outbytes); - } while (u64_stats_fetch_retry(&s->syncp, start)); - } - } -} +static struct lock_class_key __ipvs_est_key; +static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs); +static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs); -static void estimation_timer(struct timer_list *t) +static void ip_vs_chain_estimation(struct hlist_head *chain) { struct ip_vs_estimator *e; + struct ip_vs_cpu_stats *c; struct ip_vs_stats *s; u64 rate; - struct netns_ipvs *ipvs = from_timer(ipvs, t, est_timer); - if (!sysctl_run_estimation(ipvs)) - goto skip; + hlist_for_each_entry_rcu(e, chain, list) { + u64 conns, inpkts, outpkts, inbytes, outbytes; + u64 kconns = 0, kinpkts = 0, koutpkts = 0; + u64 kinbytes = 0, koutbytes = 0; + unsigned int start; + int i; + + if (kthread_should_stop()) + break; - spin_lock(&ipvs->est_lock); - list_for_each_entry(e, &ipvs->est_list, list) { s = container_of(e, struct ip_vs_stats, est); + for_each_possible_cpu(i) { + c = per_cpu_ptr(s->cpustats, i); + do { + start = u64_stats_fetch_begin(&c->syncp); + conns = u64_stats_read(&c->cnt.conns); + inpkts = u64_stats_read(&c->cnt.inpkts); + outpkts = u64_stats_read(&c->cnt.outpkts); + inbytes = u64_stats_read(&c->cnt.inbytes); + outbytes = u64_stats_read(&c->cnt.outbytes); + } while (u64_stats_fetch_retry(&c->syncp, start)); + kconns += conns; + kinpkts += inpkts; + koutpkts += outpkts; + kinbytes += inbytes; + koutbytes += outbytes; + } spin_lock(&s->lock); - ip_vs_read_cpu_stats(&s->kstats, s->cpustats); + + s->kstats.conns = kconns; + s->kstats.inpkts = kinpkts; + s->kstats.outpkts = koutpkts; + s->kstats.inbytes = kinbytes; + s->kstats.outbytes = koutbytes; /* scaled by 2^10, but divided 2 seconds */ rate = (s->kstats.conns - e->last_conns) << 9; @@ -133,30 +138,754 @@ static void estimation_timer(struct timer_list *t) e->outbps += ((s64)rate - (s64)e->outbps) >> 2; spin_unlock(&s->lock); } - spin_unlock(&ipvs->est_lock); +} -skip: - mod_timer(&ipvs->est_timer, jiffies + 2*HZ); +static void ip_vs_tick_estimation(struct ip_vs_est_kt_data *kd, int row) +{ + struct ip_vs_est_tick_data *td; + int cid; + + rcu_read_lock(); + td = rcu_dereference(kd->ticks[row]); + if (!td) + goto out; + for_each_set_bit(cid, td->present, IPVS_EST_TICK_CHAINS) { + if (kthread_should_stop()) + break; + ip_vs_chain_estimation(&td->chains[cid]); + cond_resched_rcu(); + td = rcu_dereference(kd->ticks[row]); + if (!td) + break; + } + +out: + rcu_read_unlock(); } -void ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) +static int ip_vs_estimation_kthread(void *data) { - struct ip_vs_estimator *est = &stats->est; + struct ip_vs_est_kt_data *kd = data; + struct netns_ipvs *ipvs = kd->ipvs; + int row = kd->est_row; + unsigned long now; + int id = kd->id; + long gap; + + if (id > 0) { + if (!ipvs->est_chain_max) + return 0; + } else { + if (!ipvs->est_chain_max) { + ipvs->est_calc_phase = 1; + /* commit est_calc_phase before reading est_genid */ + smp_mb(); + } + + /* kthread 0 will handle the calc phase */ + if (ipvs->est_calc_phase) + ip_vs_est_calc_phase(ipvs); + } + + while (1) { + if (!id && !hlist_empty(&ipvs->est_temp_list)) + ip_vs_est_drain_temp_list(ipvs); + set_current_state(TASK_IDLE); + if (kthread_should_stop()) + break; + + /* before estimation, check if we should sleep */ + now = jiffies; + gap = kd->est_timer - now; + 
if (gap > 0) { + if (gap > IPVS_EST_TICK) { + kd->est_timer = now - IPVS_EST_TICK; + gap = IPVS_EST_TICK; + } + schedule_timeout(gap); + } else { + __set_current_state(TASK_RUNNING); + if (gap < -8 * IPVS_EST_TICK) + kd->est_timer = now; + } + + if (sysctl_run_estimation(ipvs) && kd->tick_len[row]) + ip_vs_tick_estimation(kd, row); + + row++; + if (row >= IPVS_EST_NTICKS) + row = 0; + WRITE_ONCE(kd->est_row, row); + kd->est_timer += IPVS_EST_TICK; + } + __set_current_state(TASK_RUNNING); + + return 0; +} + +/* Schedule stop/start for kthread tasks */ +void ip_vs_est_reload_start(struct netns_ipvs *ipvs) +{ + /* Ignore reloads before first service is added */ + if (!ipvs->enable) + return; + /* Bump the kthread configuration genid */ + atomic_inc(&ipvs->est_genid); + queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); +} + +/* Start kthread task with current configuration */ +int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd) +{ + unsigned long now; + int ret = 0; + long gap; + + lockdep_assert_held(&ipvs->est_mutex); + + if (kd->task) + goto out; + now = jiffies; + gap = kd->est_timer - now; + /* Sync est_timer if task is starting later */ + if (abs(gap) > 4 * IPVS_EST_TICK) + kd->est_timer = now; + kd->task = kthread_create(ip_vs_estimation_kthread, kd, "ipvs-e:%d:%d", + ipvs->gen, kd->id); + if (IS_ERR(kd->task)) { + ret = PTR_ERR(kd->task); + kd->task = NULL; + goto out; + } + + pr_info("starting estimator thread %d...\n", kd->id); + wake_up_process(kd->task); + +out: + return ret; +} + +void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd) +{ + if (kd->task) { + pr_info("stopping estimator thread %d...\n", kd->id); + kthread_stop(kd->task); + kd->task = NULL; + } +} + +/* Apply parameters to kthread */ +static void ip_vs_est_set_params(struct netns_ipvs *ipvs, + struct ip_vs_est_kt_data *kd) +{ + kd->chain_max = ipvs->est_chain_max; + /* We are using single chain on RCU preemption */ + if (IPVS_EST_TICK_CHAINS == 1) + kd->chain_max *= IPVS_EST_CHAIN_FACTOR; + kd->tick_max = IPVS_EST_TICK_CHAINS * kd->chain_max; + kd->est_max_count = IPVS_EST_NTICKS * kd->tick_max; +} + +/* Create and start estimation kthread in a free or new array slot */ +static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) +{ + struct ip_vs_est_kt_data *kd = NULL; + int id = ipvs->est_kt_count; + int ret = -ENOMEM; + void *arr = NULL; + int i; + + if ((unsigned long)ipvs->est_kt_count >= ipvs->est_max_threads && + ipvs->enable && ipvs->est_max_threads) + return -EINVAL; + + mutex_lock(&ipvs->est_mutex); + + for (i = 0; i < id; i++) { + if (!ipvs->est_kt_arr[i]) + break; + } + if (i >= id) { + arr = krealloc_array(ipvs->est_kt_arr, id + 1, + sizeof(struct ip_vs_est_kt_data *), + GFP_KERNEL); + if (!arr) + goto out; + ipvs->est_kt_arr = arr; + } else { + id = i; + } + + kd = kzalloc(sizeof(*kd), GFP_KERNEL); + if (!kd) + goto out; + kd->ipvs = ipvs; + bitmap_fill(kd->avail, IPVS_EST_NTICKS); + kd->est_timer = jiffies; + kd->id = id; + ip_vs_est_set_params(ipvs, kd); + + /* Pre-allocate stats used in calc phase */ + if (!id && !kd->calc_stats) { + kd->calc_stats = ip_vs_stats_alloc(); + if (!kd->calc_stats) + goto out; + } + + /* Start kthread tasks only when services are present */ + if (ipvs->enable) { + ret = ip_vs_est_kthread_start(ipvs, kd); + if (ret < 0) + goto out; + } + + if (arr) + ipvs->est_kt_count++; + ipvs->est_kt_arr[id] = kd; + kd = NULL; + /* Use most recent kthread for new ests */ + ipvs->est_add_ktid = id; + ret = 0; + +out: + 
mutex_unlock(&ipvs->est_mutex); + if (kd) { + ip_vs_stats_free(kd->calc_stats); + kfree(kd); + } + + return ret; +} + +/* Select ktid where to add new ests: available, unused or new slot */ +static void ip_vs_est_update_ktid(struct netns_ipvs *ipvs) +{ + int ktid, best = ipvs->est_kt_count; + struct ip_vs_est_kt_data *kd; + + for (ktid = 0; ktid < ipvs->est_kt_count; ktid++) { + kd = ipvs->est_kt_arr[ktid]; + if (kd) { + if (kd->est_count < kd->est_max_count) { + best = ktid; + break; + } + } else if (ktid < best) { + best = ktid; + } + } + ipvs->est_add_ktid = best; +} + +/* Add estimator to current kthread (est_add_ktid) */ +static int ip_vs_enqueue_estimator(struct netns_ipvs *ipvs, + struct ip_vs_estimator *est) +{ + struct ip_vs_est_kt_data *kd = NULL; + struct ip_vs_est_tick_data *td; + int ktid, row, crow, cid, ret; + int delay = est->ktrow; + + BUILD_BUG_ON_MSG(IPVS_EST_TICK_CHAINS > 127, + "Too many chains for ktcid"); + + if (ipvs->est_add_ktid < ipvs->est_kt_count) { + kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; + if (kd) + goto add_est; + } + + ret = ip_vs_est_add_kthread(ipvs); + if (ret < 0) + goto out; + kd = ipvs->est_kt_arr[ipvs->est_add_ktid]; + +add_est: + ktid = kd->id; + /* For small number of estimators prefer to use few ticks, + * otherwise try to add into the last estimated row. + * est_row and add_row point after the row we should use + */ + if (kd->est_count >= 2 * kd->tick_max || delay < IPVS_EST_NTICKS - 1) + crow = READ_ONCE(kd->est_row); + else + crow = kd->add_row; + crow += delay; + if (crow >= IPVS_EST_NTICKS) + crow -= IPVS_EST_NTICKS; + /* Assume initial delay ? */ + if (delay >= IPVS_EST_NTICKS - 1) { + /* Preserve initial delay or decrease it if no space in tick */ + row = crow; + if (crow < IPVS_EST_NTICKS - 1) { + crow++; + row = find_last_bit(kd->avail, crow); + } + if (row >= crow) + row = find_last_bit(kd->avail, IPVS_EST_NTICKS); + } else { + /* Preserve delay or increase it if no space in tick */ + row = IPVS_EST_NTICKS; + if (crow > 0) + row = find_next_bit(kd->avail, IPVS_EST_NTICKS, crow); + if (row >= IPVS_EST_NTICKS) + row = find_first_bit(kd->avail, IPVS_EST_NTICKS); + } + + td = rcu_dereference_protected(kd->ticks[row], 1); + if (!td) { + td = kzalloc(sizeof(*td), GFP_KERNEL); + if (!td) { + ret = -ENOMEM; + goto out; + } + rcu_assign_pointer(kd->ticks[row], td); + } + + cid = find_first_zero_bit(td->full, IPVS_EST_TICK_CHAINS); + + kd->est_count++; + kd->tick_len[row]++; + if (!td->chain_len[cid]) + __set_bit(cid, td->present); + td->chain_len[cid]++; + est->ktid = ktid; + est->ktrow = row; + est->ktcid = cid; + hlist_add_head_rcu(&est->list, &td->chains[cid]); + + if (td->chain_len[cid] >= kd->chain_max) { + __set_bit(cid, td->full); + if (kd->tick_len[row] >= kd->tick_max) + __clear_bit(row, kd->avail); + } + + /* Update est_add_ktid to point to first available/empty kt slot */ + if (kd->est_count == kd->est_max_count) + ip_vs_est_update_ktid(ipvs); + + ret = 0; + +out: + return ret; +} - INIT_LIST_HEAD(&est->list); +/* Start estimation for stats */ +int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est = &stats->est; + int ret; + + if (!ipvs->est_max_threads && ipvs->enable) + ipvs->est_max_threads = IPVS_EST_CPU_KTHREADS * + num_possible_cpus(); + + est->ktid = -1; + est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ + + /* We prefer this code to be short, kthread 0 will requeue the + * estimator to available chain. 
If tasks are disabled, we + * will not allocate much memory, just for kt 0. + */ + ret = 0; + if (!ipvs->est_kt_count || !ipvs->est_kt_arr[0]) + ret = ip_vs_est_add_kthread(ipvs); + if (ret >= 0) + hlist_add_head(&est->list, &ipvs->est_temp_list); + else + INIT_HLIST_NODE(&est->list); + return ret; +} - spin_lock_bh(&ipvs->est_lock); - list_add(&est->list, &ipvs->est_list); - spin_unlock_bh(&ipvs->est_lock); +static void ip_vs_est_kthread_destroy(struct ip_vs_est_kt_data *kd) +{ + if (kd) { + if (kd->task) { + pr_info("stop unused estimator thread %d...\n", kd->id); + kthread_stop(kd->task); + } + ip_vs_stats_free(kd->calc_stats); + kfree(kd); + } } +/* Unlink estimator from chain */ void ip_vs_stop_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) { struct ip_vs_estimator *est = &stats->est; + struct ip_vs_est_tick_data *td; + struct ip_vs_est_kt_data *kd; + int ktid = est->ktid; + int row = est->ktrow; + int cid = est->ktcid; + + /* Failed to add to chain ? */ + if (hlist_unhashed(&est->list)) + return; + + /* On return, estimator can be freed, dequeue it now */ + + /* In est_temp_list ? */ + if (ktid < 0) { + hlist_del(&est->list); + goto end_kt0; + } + + hlist_del_rcu(&est->list); + kd = ipvs->est_kt_arr[ktid]; + td = rcu_dereference_protected(kd->ticks[row], 1); + __clear_bit(cid, td->full); + td->chain_len[cid]--; + if (!td->chain_len[cid]) + __clear_bit(cid, td->present); + kd->tick_len[row]--; + __set_bit(row, kd->avail); + if (!kd->tick_len[row]) { + RCU_INIT_POINTER(kd->ticks[row], NULL); + kfree_rcu(td); + } + kd->est_count--; + if (kd->est_count) { + /* This kt slot can become available just now, prefer it */ + if (ktid < ipvs->est_add_ktid) + ipvs->est_add_ktid = ktid; + return; + } - spin_lock_bh(&ipvs->est_lock); - list_del(&est->list); - spin_unlock_bh(&ipvs->est_lock); + if (ktid > 0) { + mutex_lock(&ipvs->est_mutex); + ip_vs_est_kthread_destroy(kd); + ipvs->est_kt_arr[ktid] = NULL; + if (ktid == ipvs->est_kt_count - 1) { + ipvs->est_kt_count--; + while (ipvs->est_kt_count > 1 && + !ipvs->est_kt_arr[ipvs->est_kt_count - 1]) + ipvs->est_kt_count--; + } + mutex_unlock(&ipvs->est_mutex); + + /* This slot is now empty, prefer another available kt slot */ + if (ktid == ipvs->est_add_ktid) + ip_vs_est_update_ktid(ipvs); + } + +end_kt0: + /* kt 0 is freed after all other kthreads and chains are empty */ + if (ipvs->est_kt_count == 1 && hlist_empty(&ipvs->est_temp_list)) { + kd = ipvs->est_kt_arr[0]; + if (!kd || !kd->est_count) { + mutex_lock(&ipvs->est_mutex); + if (kd) { + ip_vs_est_kthread_destroy(kd); + ipvs->est_kt_arr[0] = NULL; + } + ipvs->est_kt_count--; + mutex_unlock(&ipvs->est_mutex); + ipvs->est_add_ktid = 0; + } + } +} + +/* Register all ests from est_temp_list to kthreads */ +static void ip_vs_est_drain_temp_list(struct netns_ipvs *ipvs) +{ + struct ip_vs_estimator *est; + + while (1) { + int max = 16; + + mutex_lock(&__ip_vs_mutex); + + while (max-- > 0) { + est = hlist_entry_safe(ipvs->est_temp_list.first, + struct ip_vs_estimator, list); + if (est) { + if (kthread_should_stop()) + goto unlock; + hlist_del_init(&est->list); + if (ip_vs_enqueue_estimator(ipvs, est) >= 0) + continue; + est->ktid = -1; + hlist_add_head(&est->list, + &ipvs->est_temp_list); + /* Abort, some entries will not be estimated + * until next attempt + */ + } + goto unlock; + } + mutex_unlock(&__ip_vs_mutex); + cond_resched(); + } + +unlock: + mutex_unlock(&__ip_vs_mutex); +} + +/* Calculate limits for all kthreads */ +static int ip_vs_est_calc_limits(struct netns_ipvs *ipvs, int 
*chain_max) +{ + DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + struct ip_vs_est_kt_data *kd; + struct hlist_head chain; + struct ip_vs_stats *s; + int cache_factor = 4; + int i, loops, ntest; + s32 min_est = 0; + ktime_t t1, t2; + s64 diff, val; + int max = 8; + int ret = 1; + + INIT_HLIST_HEAD(&chain); + mutex_lock(&__ip_vs_mutex); + kd = ipvs->est_kt_arr[0]; + mutex_unlock(&__ip_vs_mutex); + s = kd ? kd->calc_stats : NULL; + if (!s) + goto out; + hlist_add_head(&s->est.list, &chain); + + loops = 1; + /* Get best result from many tests */ + for (ntest = 0; ntest < 12; ntest++) { + if (!(ntest & 3)) { + /* Wait for cpufreq frequency transition */ + wait_event_idle_timeout(wq, kthread_should_stop(), + HZ / 50); + if (!ipvs->enable || kthread_should_stop()) + goto stop; + } + + local_bh_disable(); + rcu_read_lock(); + + /* Put stats in cache */ + ip_vs_chain_estimation(&chain); + + t1 = ktime_get(); + for (i = loops * cache_factor; i > 0; i--) + ip_vs_chain_estimation(&chain); + t2 = ktime_get(); + + rcu_read_unlock(); + local_bh_enable(); + + if (!ipvs->enable || kthread_should_stop()) + goto stop; + cond_resched(); + + diff = ktime_to_ns(ktime_sub(t2, t1)); + if (diff <= 1 * NSEC_PER_USEC) { + /* Do more loops on low time resolution */ + loops *= 2; + continue; + } + if (diff >= NSEC_PER_SEC) + continue; + val = diff; + do_div(val, loops); + if (!min_est || val < min_est) { + min_est = val; + /* goal: 95usec per chain */ + val = 95 * NSEC_PER_USEC; + if (val >= min_est) { + do_div(val, min_est); + max = (int)val; + } else { + max = 1; + } + } + } + +out: + if (s) + hlist_del_init(&s->est.list); + *chain_max = max; + return ret; + +stop: + ret = 0; + goto out; +} + +/* Calculate the parameters and apply them in context of kt #0 + * ECP: est_calc_phase + * ECM: est_chain_max + * ECP ECM Insert Chain enable Description + * --------------------------------------------------------------------------- + * 0 0 est_temp_list 0 create kt #0 context + * 0 0 est_temp_list 0->1 service added, start kthread #0 task + * 0->1 0 est_temp_list 1 kt task #0 started, enters calc phase + * 1 0 est_temp_list 1 kt #0: determine est_chain_max, + * stop tasks, move ests to est_temp_list + * and free kd for kthreads 1..last + * 1->0 0->N kt chains 1 ests can go to kthreads + * 0 N kt chains 1 drain est_temp_list, create new kthread + * contexts, start tasks, estimate + */ +static void ip_vs_est_calc_phase(struct netns_ipvs *ipvs) +{ + int genid = atomic_read(&ipvs->est_genid); + struct ip_vs_est_tick_data *td; + struct ip_vs_est_kt_data *kd; + struct ip_vs_estimator *est; + struct ip_vs_stats *stats; + int id, row, cid, delay; + bool last, last_td; + int chain_max; + int step; + + if (!ip_vs_est_calc_limits(ipvs, &chain_max)) + return; + + mutex_lock(&__ip_vs_mutex); + + /* Stop all other tasks, so that we can immediately move the + * estimators to est_temp_list without RCU grace period + */ + mutex_lock(&ipvs->est_mutex); + for (id = 1; id < ipvs->est_kt_count; id++) { + /* netns clean up started, abort */ + if (!ipvs->enable) + goto unlock2; + kd = ipvs->est_kt_arr[id]; + if (!kd) + continue; + ip_vs_est_kthread_stop(kd); + } + mutex_unlock(&ipvs->est_mutex); + + /* Move all estimators to est_temp_list but carefully, + * all estimators and kthread data can be released while + * we reschedule. Even for kthread 0. 
+ */ + step = 0; + + /* Order entries in est_temp_list in ascending delay, so now + * walk delay(desc), id(desc), cid(asc) + */ + delay = IPVS_EST_NTICKS; + +next_delay: + delay--; + if (delay < 0) + goto end_dequeue; + +last_kt: + /* Destroy contexts backwards */ + id = ipvs->est_kt_count; + +next_kt: + if (!ipvs->enable || kthread_should_stop()) + goto unlock; + id--; + if (id < 0) + goto next_delay; + kd = ipvs->est_kt_arr[id]; + if (!kd) + goto next_kt; + /* kt 0 can exist with empty chains */ + if (!id && kd->est_count <= 1) + goto next_delay; + + row = kd->est_row + delay; + if (row >= IPVS_EST_NTICKS) + row -= IPVS_EST_NTICKS; + td = rcu_dereference_protected(kd->ticks[row], 1); + if (!td) + goto next_kt; + + cid = 0; + +walk_chain: + if (kthread_should_stop()) + goto unlock; + step++; + if (!(step & 63)) { + /* Give chance estimators to be added (to est_temp_list) + * and deleted (releasing kthread contexts) + */ + mutex_unlock(&__ip_vs_mutex); + cond_resched(); + mutex_lock(&__ip_vs_mutex); + + /* Current kt released ? */ + if (id >= ipvs->est_kt_count) + goto last_kt; + if (kd != ipvs->est_kt_arr[id]) + goto next_kt; + /* Current td released ? */ + if (td != rcu_dereference_protected(kd->ticks[row], 1)) + goto next_kt; + /* No fatal changes on the current kd and td */ + } + est = hlist_entry_safe(td->chains[cid].first, struct ip_vs_estimator, + list); + if (!est) { + cid++; + if (cid >= IPVS_EST_TICK_CHAINS) + goto next_kt; + goto walk_chain; + } + /* We can cheat and increase est_count to protect kt 0 context + * from release but we prefer to keep the last estimator + */ + last = kd->est_count <= 1; + /* Do not free kt #0 data */ + if (!id && last) + goto next_delay; + last_td = kd->tick_len[row] <= 1; + stats = container_of(est, struct ip_vs_stats, est); + ip_vs_stop_estimator(ipvs, stats); + /* Tasks are stopped, move without RCU grace period */ + est->ktid = -1; + est->ktrow = row - kd->est_row; + if (est->ktrow < 0) + est->ktrow += IPVS_EST_NTICKS; + hlist_add_head(&est->list, &ipvs->est_temp_list); + /* kd freed ? */ + if (last) + goto next_kt; + /* td freed ? */ + if (last_td) + goto next_kt; + goto walk_chain; + +end_dequeue: + /* All estimators removed while calculating ? 
*/ + if (!ipvs->est_kt_count) + goto unlock; + kd = ipvs->est_kt_arr[0]; + if (!kd) + goto unlock; + kd->add_row = kd->est_row; + ipvs->est_chain_max = chain_max; + ip_vs_est_set_params(ipvs, kd); + + pr_info("using max %d ests per chain, %d per kthread\n", + kd->chain_max, kd->est_max_count); + + /* Try to keep tot_stats in kt0, enqueue it early */ + if (ipvs->tot_stats && !hlist_unhashed(&ipvs->tot_stats->s.est.list) && + ipvs->tot_stats->s.est.ktid == -1) { + hlist_del(&ipvs->tot_stats->s.est.list); + hlist_add_head(&ipvs->tot_stats->s.est.list, + &ipvs->est_temp_list); + } + + mutex_lock(&ipvs->est_mutex); + + /* We completed the calc phase, new calc phase not requested */ + if (genid == atomic_read(&ipvs->est_genid)) + ipvs->est_calc_phase = 0; + +unlock2: + mutex_unlock(&ipvs->est_mutex); + +unlock: + mutex_unlock(&__ip_vs_mutex); } void ip_vs_zero_estimator(struct ip_vs_stats *stats) @@ -191,14 +920,25 @@ void ip_vs_read_estimator(struct ip_vs_kstats *dst, struct ip_vs_stats *stats) int __net_init ip_vs_estimator_net_init(struct netns_ipvs *ipvs) { - INIT_LIST_HEAD(&ipvs->est_list); - spin_lock_init(&ipvs->est_lock); - timer_setup(&ipvs->est_timer, estimation_timer, 0); - mod_timer(&ipvs->est_timer, jiffies + 2 * HZ); + INIT_HLIST_HEAD(&ipvs->est_temp_list); + ipvs->est_kt_arr = NULL; + ipvs->est_max_threads = 0; + ipvs->est_calc_phase = 0; + ipvs->est_chain_max = 0; + ipvs->est_kt_count = 0; + ipvs->est_add_ktid = 0; + atomic_set(&ipvs->est_genid, 0); + atomic_set(&ipvs->est_genid_done, 0); + __mutex_init(&ipvs->est_mutex, "ipvs->est_mutex", &__ipvs_est_key); return 0; } void __net_exit ip_vs_estimator_net_cleanup(struct netns_ipvs *ipvs) { - del_timer_sync(&ipvs->est_timer); + int i; + + for (i = 0; i < ipvs->est_kt_count; i++) + ip_vs_est_kthread_destroy(ipvs->est_kt_arr[i]); + kfree(ipvs->est_kt_arr); + mutex_destroy(&ipvs->est_mutex); } -- cgit v1.2.3 From f0be83d5421718ead31707b6ece34cf77d411c00 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:03 +0200 Subject: ipvs: add est_cpulist and est_nice sysctl vars Allow the kthreads for stats to be configured for specific cpulist (isolation) and niceness (scheduling priority). Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/ipvs-sysctl.rst | 20 +++++ include/net/ip_vs.h | 58 +++++++++++++ net/netfilter/ipvs/ip_vs_ctl.c | 143 ++++++++++++++++++++++++++++++- net/netfilter/ipvs/ip_vs_est.c | 12 ++- 4 files changed, 229 insertions(+), 4 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst index 387fda80f05f..1b778705d706 100644 --- a/Documentation/networking/ipvs-sysctl.rst +++ b/Documentation/networking/ipvs-sysctl.rst @@ -129,6 +129,26 @@ drop_packet - INTEGER threshold. When the mode 3 is set, the always mode drop rate is controlled by the /proc/sys/net/ipv4/vs/am_droprate. +est_cpulist - CPULIST + Allowed CPUs for estimation kthreads + + Syntax: standard cpulist format + empty list - stop kthread tasks and estimation + default - the system's housekeeping CPUs for kthreads + + Example: + "all": all possible CPUs + "0-N": all possible CPUs, N denotes last CPU number + "0,1-N:1/2": first and all CPUs with odd number + "": empty list + +est_nice - INTEGER + default 0 + Valid range: -20 (more favorable) .. 
19 (less favorable) + + Niceness value to use for the estimation kthreads (scheduling + priority) + expire_nodest_conn - BOOLEAN - 0 - disabled (default) - not 0 - enabled diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 04960dc6228f..dc51b5497cf7 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -29,6 +29,7 @@ #include #endif #include /* Netw namespace */ +#include #define IP_VS_HDR_INVERSE 1 #define IP_VS_HDR_ICMP 2 @@ -365,6 +366,9 @@ struct ip_vs_cpu_stats { struct u64_stats_sync syncp; }; +/* Default nice for estimator kthreads */ +#define IPVS_EST_NICE 0 + /* IPVS statistics objects */ struct ip_vs_estimator { struct hlist_node list; @@ -1009,6 +1013,12 @@ struct netns_ipvs { int sysctl_schedule_icmp; int sysctl_ignore_tunneled; int sysctl_run_estimation; +#ifdef CONFIG_SYSCTL + cpumask_var_t sysctl_est_cpulist; /* kthread cpumask */ + int est_cpulist_valid; /* cpulist set */ + int sysctl_est_nice; /* kthread nice */ + int est_stopped; /* stop tasks */ +#endif /* ip_vs_lblc */ int sysctl_lblc_expiration; @@ -1162,6 +1172,19 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return ipvs->sysctl_run_estimation; } +static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +{ + if (ipvs->est_cpulist_valid) + return ipvs->sysctl_est_cpulist; + else + return housekeeping_cpumask(HK_TYPE_KTHREAD); +} + +static inline int sysctl_est_nice(struct netns_ipvs *ipvs) +{ + return ipvs->sysctl_est_nice; +} + #else static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs) @@ -1259,6 +1282,16 @@ static inline int sysctl_run_estimation(struct netns_ipvs *ipvs) return 1; } +static inline const struct cpumask *sysctl_est_cpulist(struct netns_ipvs *ipvs) +{ + return housekeeping_cpumask(HK_TYPE_KTHREAD); +} + +static inline int sysctl_est_nice(struct netns_ipvs *ipvs) +{ + return IPVS_EST_NICE; +} + #endif /* IPVS core functions @@ -1569,6 +1602,31 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, struct ip_vs_est_kt_data *kd); void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); +static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + ipvs->est_stopped = ipvs->est_cpulist_valid && + cpumask_empty(sysctl_est_cpulist(ipvs)); +#endif +} + +static inline bool ip_vs_est_stopped(struct netns_ipvs *ipvs) +{ +#ifdef CONFIG_SYSCTL + return ipvs->est_stopped; +#else + return false; +#endif +} + +static inline int ip_vs_est_max_threads(struct netns_ipvs *ipvs) +{ + unsigned int limit = IPVS_EST_CPU_KTHREADS * + cpumask_weight(sysctl_est_cpulist(ipvs)); + + return max(1U, limit); +} + /* Various IPVS packet transmitters (from ip_vs_xmit.c) */ int ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_iphdr *iph); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c41a5392edc9..38df3ee655ed 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -263,7 +263,7 @@ static void est_reload_work_handler(struct work_struct *work) /* New config ? 
Stop kthread tasks */ if (genid != genid_done) ip_vs_est_kthread_stop(kd); - if (!kd->task) { + if (!kd->task && !ip_vs_est_stopped(ipvs)) { /* Do not start kthreads above 0 in calc phase */ if ((!id || !ipvs->est_calc_phase) && ip_vs_est_kthread_start(ipvs, kd) < 0) @@ -1940,6 +1940,122 @@ proc_do_sync_ports(struct ctl_table *table, int write, return rc; } +static int ipvs_proc_est_cpumask_set(struct ctl_table *table, void *buffer) +{ + struct netns_ipvs *ipvs = table->extra2; + cpumask_var_t *valp = table->data; + cpumask_var_t newmask; + int ret; + + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) + return -ENOMEM; + + ret = cpulist_parse(buffer, newmask); + if (ret) + goto out; + + mutex_lock(&ipvs->est_mutex); + + if (!ipvs->est_cpulist_valid) { + if (!zalloc_cpumask_var(valp, GFP_KERNEL)) { + ret = -ENOMEM; + goto unlock; + } + ipvs->est_cpulist_valid = 1; + } + cpumask_and(newmask, newmask, ¤t->cpus_mask); + cpumask_copy(*valp, newmask); + /* est_max_threads may depend on cpulist size */ + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); + ipvs->est_calc_phase = 1; + ip_vs_est_reload_start(ipvs); + +unlock: + mutex_unlock(&ipvs->est_mutex); + +out: + free_cpumask_var(newmask); + return ret; +} + +static int ipvs_proc_est_cpumask_get(struct ctl_table *table, void *buffer, + size_t size) +{ + struct netns_ipvs *ipvs = table->extra2; + cpumask_var_t *valp = table->data; + struct cpumask *mask; + int ret; + + mutex_lock(&ipvs->est_mutex); + + if (ipvs->est_cpulist_valid) + mask = *valp; + else + mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD); + ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask)); + + mutex_unlock(&ipvs->est_mutex); + + return ret; +} + +static int ipvs_proc_est_cpulist(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + /* Ignore both read and write(append) if *ppos not 0 */ + if (*ppos || !*lenp) { + *lenp = 0; + return 0; + } + if (write) { + /* proc_sys_call_handler() appends terminator */ + ret = ipvs_proc_est_cpumask_set(table, buffer); + if (ret >= 0) + *ppos += *lenp; + } else { + /* proc_sys_call_handler() allocates 1 byte for terminator */ + ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1); + if (ret >= 0) { + *lenp = ret; + *ppos += *lenp; + ret = 0; + } + } + return ret; +} + +static int ipvs_proc_est_nice(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + if (val < MIN_NICE || val > MAX_NICE) { + ret = -EINVAL; + } else { + mutex_lock(&ipvs->est_mutex); + if (*valp != val) { + *valp = val; + ip_vs_est_reload_start(ipvs); + } + mutex_unlock(&ipvs->est_mutex); + } + } + return ret; +} + /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without @@ -2116,6 +2232,18 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "est_cpulist", + .maxlen = NR_CPUS, /* unused */ + .mode = 0644, + .proc_handler = ipvs_proc_est_cpulist, + }, + { + .procname = "est_nice", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ipvs_proc_est_nice, + }, #ifdef CONFIG_IP_VS_DEBUG { .procname = "debug_level", @@ -4134,6 +4262,7 @@ static int __net_init 
ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler); INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work, expire_nodest_conn_handler); + ipvs->est_stopped = 0; if (!net_eq(net, &init_net)) { tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL); @@ -4195,6 +4324,15 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; ipvs->sysctl_run_estimation = 1; tbl[idx++].data = &ipvs->sysctl_run_estimation; + + ipvs->est_cpulist_valid = 0; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_est_cpulist; + + ipvs->sysctl_est_nice = IPVS_EST_NICE; + tbl[idx].extra2 = ipvs; + tbl[idx++].data = &ipvs->sysctl_est_nice; + #ifdef CONFIG_IP_VS_DEBUG /* Global sysctls must be ro in non-init netns */ if (!net_eq(net, &init_net)) @@ -4234,6 +4372,9 @@ static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) unregister_net_sysctl_table(ipvs->sysctl_hdr); ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s); + if (ipvs->est_cpulist_valid) + free_cpumask_var(ipvs->sysctl_est_cpulist); + if (!net_eq(net, &init_net)) kfree(ipvs->sysctl_tbl); } diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 2fb6c097437c..e0f5f5da5b6d 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -57,6 +57,9 @@ - kthread contexts are created and attached to array - the kthread tasks are started when first service is added, before that the total stats are not estimated + - when configuration (cpulist/nice) is changed, the tasks are restarted + by work (est_reload_work) + - kthread tasks are stopped while the cpulist is empty - the kthread context holds lists with estimators (chains) which are processed every 2 seconds - as estimators can be added dynamically and in bursts, we try to spread @@ -229,6 +232,7 @@ void ip_vs_est_reload_start(struct netns_ipvs *ipvs) /* Ignore reloads before first service is added */ if (!ipvs->enable) return; + ip_vs_est_stopped_recalc(ipvs); /* Bump the kthread configuration genid */ atomic_inc(&ipvs->est_genid); queue_delayed_work(system_long_wq, &ipvs->est_reload_work, 0); @@ -259,6 +263,9 @@ int ip_vs_est_kthread_start(struct netns_ipvs *ipvs, goto out; } + set_user_nice(kd->task, sysctl_est_nice(ipvs)); + set_cpus_allowed_ptr(kd->task, sysctl_est_cpulist(ipvs)); + pr_info("starting estimator thread %d...\n", kd->id); wake_up_process(kd->task); @@ -334,7 +341,7 @@ static int ip_vs_est_add_kthread(struct netns_ipvs *ipvs) } /* Start kthread tasks only when services are present */ - if (ipvs->enable) { + if (ipvs->enable && !ip_vs_est_stopped(ipvs)) { ret = ip_vs_est_kthread_start(ipvs, kd); if (ret < 0) goto out; @@ -478,8 +485,7 @@ int ip_vs_start_estimator(struct netns_ipvs *ipvs, struct ip_vs_stats *stats) int ret; if (!ipvs->est_max_threads && ipvs->enable) - ipvs->est_max_threads = IPVS_EST_CPU_KTHREADS * - num_possible_cpus(); + ipvs->est_max_threads = ip_vs_est_max_threads(ipvs); est->ktid = -1; est->ktrow = IPVS_EST_NTICKS - 1; /* Initial delay */ -- cgit v1.2.3 From 144361c1949f227df9244302da02c258a363b674 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 22 Nov 2022 18:46:04 +0200 Subject: ipvs: run_estimation should control the kthread tasks Change the run_estimation flag to start/stop the kthread tasks. 
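In effect, run_estimation now feeds the same stop condition as an empty est_cpulist, and est_reload_work stops or restarts the tasks accordingly. A minimal sketch of the combined predicate, mirroring the ip_vs_est_stopped_recalc() hunk below (the standalone helper name here is hypothetical, for illustration only):

/* Tasks should be stopped when estimation is disabled or when an
 * explicitly configured cpulist is empty (sketch, not the patch itself).
 */
static inline bool ipvs_est_tasks_stopped(struct netns_ipvs *ipvs)
{
	return !sysctl_run_estimation(ipvs) ||
	       (ipvs->est_cpulist_valid &&
		cpumask_empty(sysctl_est_cpulist(ipvs)));
}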
Signed-off-by: Julian Anastasov Cc: yunhong-cgl jiang Cc: "dust.li" Reviewed-by: Jiri Wiesner Signed-off-by: Pablo Neira Ayuso --- Documentation/networking/ipvs-sysctl.rst | 4 ++-- include/net/ip_vs.h | 6 ++++-- net/netfilter/ipvs/ip_vs_ctl.c | 29 ++++++++++++++++++++++++++++- net/netfilter/ipvs/ip_vs_est.c | 2 +- 4 files changed, 35 insertions(+), 6 deletions(-) (limited to 'include/net') diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst index 1b778705d706..3fb5fa142eef 100644 --- a/Documentation/networking/ipvs-sysctl.rst +++ b/Documentation/networking/ipvs-sysctl.rst @@ -324,8 +324,8 @@ run_estimation - BOOLEAN 0 - disabled not 0 - enabled (default) - If disabled, the estimation will be stop, and you can't see - any update on speed estimation data. + If disabled, the estimation will be suspended and kthread tasks + stopped. You can always re-enable estimation by setting this value to 1. But be careful, the first estimation after re-enable is not diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index dc51b5497cf7..c6c61100d244 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1605,8 +1605,10 @@ void ip_vs_est_kthread_stop(struct ip_vs_est_kt_data *kd); static inline void ip_vs_est_stopped_recalc(struct netns_ipvs *ipvs) { #ifdef CONFIG_SYSCTL - ipvs->est_stopped = ipvs->est_cpulist_valid && - cpumask_empty(sysctl_est_cpulist(ipvs)); + /* Stop tasks while cpulist is empty or if disabled with flag */ + ipvs->est_stopped = !sysctl_run_estimation(ipvs) || + (ipvs->est_cpulist_valid && + cpumask_empty(sysctl_est_cpulist(ipvs))); #endif } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 38df3ee655ed..c9f598505642 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2056,6 +2056,32 @@ static int ipvs_proc_est_nice(struct ctl_table *table, int write, return ret; } +static int ipvs_proc_run_estimation(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct netns_ipvs *ipvs = table->extra2; + int *valp = table->data; + int val = *valp; + int ret; + + struct ctl_table tmp_table = { + .data = &val, + .maxlen = sizeof(int), + .mode = table->mode, + }; + + ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos); + if (write && ret >= 0) { + mutex_lock(&ipvs->est_mutex); + if (*valp != val) { + *valp = val; + ip_vs_est_reload_start(ipvs); + } + mutex_unlock(&ipvs->est_mutex); + } + return ret; +} + /* * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) * Do not change order or insert new entries without @@ -2230,7 +2256,7 @@ static struct ctl_table vs_vars[] = { .procname = "run_estimation", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = ipvs_proc_run_estimation, }, { .procname = "est_cpulist", @@ -4323,6 +4349,7 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) tbl[idx++].data = &ipvs->sysctl_schedule_icmp; tbl[idx++].data = &ipvs->sysctl_ignore_tunneled; ipvs->sysctl_run_estimation = 1; + tbl[idx].extra2 = ipvs; tbl[idx++].data = &ipvs->sysctl_run_estimation; ipvs->est_cpulist_valid = 0; diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index e0f5f5da5b6d..df56073bb282 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -212,7 +212,7 @@ static int ip_vs_estimation_kthread(void *data) kd->est_timer = now; } - if (sysctl_run_estimation(ipvs) && kd->tick_len[row]) + if (kd->tick_len[row]) 
ip_vs_tick_estimation(kd, row); row++; -- cgit v1.2.3 From ebddb1404900657b7f03a56ee4c34a9d218c4030 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Thu, 8 Dec 2022 11:56:12 -0500 Subject: net: move the nat function to nf_nat_ovs for ovs and tc There are two NAT functions that are nearly the same in the OVS and TC code: (ovs_)ct_nat_execute() and ovs_ct_nat/tcf_ct_act_nat(). This patch creates nf_nat_ovs.c under netfilter, moves them there, and exports nf_ct_nat() so that it can be shared by both OVS and TC. The NAT (type) check and NAT flag update remain in OVS's and TC's own code, as these parts differ between the two. Note that the OVS NAT function was using skb->protocol to get the proto, as OVS already skips vlans in key_extract(), while TC does not and has to call skb_protocol() to get the proto. So in nf_ct_nat_execute() we keep using skb_protocol(), which works for both the OVS and TC conntrack paths. Signed-off-by: Xin Long Acked-by: Aaron Conole Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller --- include/net/netfilter/nf_nat.h | 4 ++ net/netfilter/Kconfig | 3 + net/netfilter/Makefile | 1 + net/netfilter/nf_nat_ovs.c | 135 ++++++++++++++++++++++++++++++++++++++++ net/openvswitch/Kconfig | 1 + net/openvswitch/conntrack.c | 137 +++-------------------------------------- net/sched/Kconfig | 1 + net/sched/act_ct.c | 136 ++++------------------------------------ 8 files changed, 166 insertions(+), 252 deletions(-) create mode 100644 net/netfilter/nf_nat_ovs.c (limited to 'include/net') diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index e9eb01e99d2f..9877f064548a 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -104,6 +104,10 @@ unsigned int nf_nat_inet_fn(void *priv, struct sk_buff *skb, const struct nf_hook_state *state); +int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int *action, + const struct nf_nat_range2 *range, bool commit); + static inline int nf_nat_initialized(const struct nf_conn *ct, enum nf_nat_manip_type manip) { diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 0846bd75b1da..f71b41c7ce2f 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -459,6 +459,9 @@ config NF_NAT_REDIRECT config NF_NAT_MASQUERADE bool +config NF_NAT_OVS + bool + config NETFILTER_SYNPROXY tristate diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 1d4db1943936..3754eb06fb41 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_NF_LOG_SYSLOG) += nf_log_syslog.o obj-$(CONFIG_NF_NAT) += nf_nat.o nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o nf_nat-$(CONFIG_NF_NAT_MASQUERADE) += nf_nat_masquerade.o +nf_nat-$(CONFIG_NF_NAT_OVS) += nf_nat_ovs.o ifeq ($(CONFIG_NF_NAT),m) nf_nat-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_nat_bpf.o diff --git a/net/netfilter/nf_nat_ovs.c b/net/netfilter/nf_nat_ovs.c new file mode 100644 index 000000000000..551abd2da614 --- /dev/null +++ b/net/netfilter/nf_nat_ovs.c @@ -0,0 +1,135 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Support nat functions for openvswitch and used by OVS and TC conntrack. */ + +#include + +/* Modelled after nf_nat_ipv[46]_fn(). + * range is only used for new, uninitialized NAT state. + * Returns either NF_ACCEPT or NF_DROP.
+ */ +static int nf_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int *action, + const struct nf_nat_range2 *range, + enum nf_nat_manip_type maniptype) +{ + __be16 proto = skb_protocol(skb, true); + int hooknum, err = NF_ACCEPT; + + /* See HOOK2MANIP(). */ + if (maniptype == NF_NAT_MANIP_SRC) + hooknum = NF_INET_LOCAL_IN; /* Source NAT */ + else + hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED_REPLY: + if (proto == htons(ETH_P_IP) && + ip_hdr(skb)->protocol == IPPROTO_ICMP) { + if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, + hooknum)) + err = NF_DROP; + goto out; + } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) { + __be16 frag_off; + u8 nexthdr = ipv6_hdr(skb)->nexthdr; + int hdrlen = ipv6_skip_exthdr(skb, + sizeof(struct ipv6hdr), + &nexthdr, &frag_off); + + if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { + if (!nf_nat_icmpv6_reply_translation(skb, ct, + ctinfo, + hooknum, + hdrlen)) + err = NF_DROP; + goto out; + } + } + /* Non-ICMP, fall thru to initialize if needed. */ + fallthrough; + case IP_CT_NEW: + /* Seen it before? This can happen for loopback, retrans, + * or local packets. + */ + if (!nf_nat_initialized(ct, maniptype)) { + /* Initialize according to the NAT action. */ + err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) + /* Action is set up to establish a new + * mapping. + */ + ? nf_nat_setup_info(ct, range, maniptype) + : nf_nat_alloc_null_binding(ct, hooknum); + if (err != NF_ACCEPT) + goto out; + } + break; + + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED_REPLY: + break; + + default: + err = NF_DROP; + goto out; + } + + err = nf_nat_packet(ct, ctinfo, hooknum, skb); + if (err == NF_ACCEPT) + *action |= BIT(maniptype); +out: + return err; +} + +int nf_ct_nat(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, int *action, + const struct nf_nat_range2 *range, bool commit) +{ + enum nf_nat_manip_type maniptype; + int err, ct_action = *action; + + *action = 0; + + /* Add NAT extension if not confirmed yet. */ + if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) + return NF_DROP; /* Can't NAT. */ + + if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) && + (ctinfo != IP_CT_RELATED || commit)) { + /* NAT an established or related connection like before. */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) + /* This is the REPLY direction for a connection + * for which NAT was applied in the forward + * direction. Do the reverse NAT. + */ + maniptype = ct->status & IPS_SRC_NAT + ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; + else + maniptype = ct->status & IPS_SRC_NAT + ? 
NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; + } else if (ct_action & BIT(NF_NAT_MANIP_SRC)) { + maniptype = NF_NAT_MANIP_SRC; + } else if (ct_action & BIT(NF_NAT_MANIP_DST)) { + maniptype = NF_NAT_MANIP_DST; + } else { + return NF_ACCEPT; + } + + err = nf_ct_nat_execute(skb, ct, ctinfo, action, range, maniptype); + if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { + if (ct->status & IPS_SRC_NAT) { + if (maniptype == NF_NAT_MANIP_SRC) + maniptype = NF_NAT_MANIP_DST; + else + maniptype = NF_NAT_MANIP_SRC; + + err = nf_ct_nat_execute(skb, ct, ctinfo, action, range, + maniptype); + } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + err = nf_ct_nat_execute(skb, ct, ctinfo, action, NULL, + NF_NAT_MANIP_SRC); + } + } + return err; +} +EXPORT_SYMBOL_GPL(nf_ct_nat); diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig index 15bd287f5cbd..747d537a3f06 100644 --- a/net/openvswitch/Kconfig +++ b/net/openvswitch/Kconfig @@ -15,6 +15,7 @@ config OPENVSWITCH select NET_MPLS_GSO select DST_CACHE select NET_NSH + select NF_NAT_OVS if NF_NAT help Open vSwitch is a multilayer Ethernet switch targeted at virtualized environments. In addition to supporting a variety of features diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 58c9f0edc3c4..c8b137649ca4 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -726,144 +726,27 @@ static void ovs_nat_update_key(struct sw_flow_key *key, } } -/* Modelled after nf_nat_ipv[46]_fn(). - * range is only used for new, uninitialized NAT state. - * Returns either NF_ACCEPT or NF_DROP. - */ -static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype, struct sw_flow_key *key) -{ - int hooknum, err = NF_ACCEPT; - - /* See HOOK2MANIP(). */ - if (maniptype == NF_NAT_MANIP_SRC) - hooknum = NF_INET_LOCAL_IN; /* Source NAT */ - else - hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - if (IS_ENABLED(CONFIG_NF_NAT) && - skb->protocol == htons(ETH_P_IP) && - ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - hooknum)) - err = NF_DROP; - goto out; - } else if (IS_ENABLED(CONFIG_IPV6) && - skb->protocol == htons(ETH_P_IPV6)) { - __be16 frag_off; - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - int hdrlen = ipv6_skip_exthdr(skb, - sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - - if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { - if (!nf_nat_icmpv6_reply_translation(skb, ct, - ctinfo, - hooknum, - hdrlen)) - err = NF_DROP; - goto out; - } - } - /* Non-ICMP, fall thru to initialize if needed. */ - fallthrough; - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - /* Initialize according to the NAT action. */ - err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) - /* Action is set up to establish a new - * mapping. - */ - ? nf_nat_setup_info(ct, range, maniptype) - : nf_nat_alloc_null_binding(ct, hooknum); - if (err != NF_ACCEPT) - goto out; - } - break; - - case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED_REPLY: - break; - - default: - err = NF_DROP; - goto out; - } - - err = nf_nat_packet(ct, ctinfo, hooknum, skb); -out: - /* Update the flow key if NAT successful. 
*/ - if (err == NF_ACCEPT) - ovs_nat_update_key(key, skb, maniptype); - - return err; -} - /* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, const struct ovs_conntrack_info *info, struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo) { - enum nf_nat_manip_type maniptype; - int err; + int err, action = 0; if (!(info->nat & OVS_CT_NAT)) return NF_ACCEPT; + if (info->nat & OVS_CT_SRC_NAT) + action |= BIT(NF_NAT_MANIP_SRC); + if (info->nat & OVS_CT_DST_NAT) + action |= BIT(NF_NAT_MANIP_DST); - /* Add NAT extension if not confirmed yet. */ - if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) - return NF_DROP; /* Can't NAT. */ + err = nf_ct_nat(skb, ct, ctinfo, &action, &info->range, info->commit); - /* Determine NAT type. - * Check if the NAT type can be deduced from the tracked connection. - * Make sure new expected connections (IP_CT_RELATED) are NATted only - * when committing. - */ - if (ctinfo != IP_CT_NEW && ct->status & IPS_NAT_MASK && - (ctinfo != IP_CT_RELATED || info->commit)) { - /* NAT an established or related connection like before. */ - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) - /* This is the REPLY direction for a connection - * for which NAT was applied in the forward - * direction. Do the reverse NAT. - */ - maniptype = ct->status & IPS_SRC_NAT - ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; - else - maniptype = ct->status & IPS_SRC_NAT - ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; - } else if (info->nat & OVS_CT_SRC_NAT) { - maniptype = NF_NAT_MANIP_SRC; - } else if (info->nat & OVS_CT_DST_NAT) { - maniptype = NF_NAT_MANIP_DST; - } else { - return NF_ACCEPT; /* Connection is not NATed. */ - } - err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype, key); - - if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { - if (ct->status & IPS_SRC_NAT) { - if (maniptype == NF_NAT_MANIP_SRC) - maniptype = NF_NAT_MANIP_DST; - else - maniptype = NF_NAT_MANIP_SRC; - - err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, - maniptype, key); - } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - err = ovs_ct_nat_execute(skb, ct, ctinfo, NULL, - NF_NAT_MANIP_SRC, key); - } - } + if (action & BIT(NF_NAT_MANIP_SRC)) + ovs_nat_update_key(key, skb, NF_NAT_MANIP_SRC); + if (action & BIT(NF_NAT_MANIP_DST)) + ovs_nat_update_key(key, skb, NF_NAT_MANIP_DST); return err; } diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 4662a6ce8a7e..777d6b50505c 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -977,6 +977,7 @@ config NET_ACT_TUNNEL_KEY config NET_ACT_CT tristate "connection tracking tc action" depends on NET_CLS_ACT && NF_CONNTRACK && (!NF_NAT || NF_NAT) && NF_FLOW_TABLE + select NF_NAT_OVS if NF_NAT help Say Y here to allow sending the packets to conntrack module. diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index eac4e07eb56b..0ca2bb8ed026 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -864,90 +864,6 @@ static void tcf_ct_params_free_rcu(struct rcu_head *head) tcf_ct_params_free(params); } -#if IS_ENABLED(CONFIG_NF_NAT) -/* Modelled after nf_nat_ipv[46]_fn(). - * range is only used for new, uninitialized NAT state. - * Returns either NF_ACCEPT or NF_DROP. - */ -static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - const struct nf_nat_range2 *range, - enum nf_nat_manip_type maniptype) -{ - __be16 proto = skb_protocol(skb, true); - int hooknum, err = NF_ACCEPT; - - /* See HOOK2MANIP(). 
*/ - if (maniptype == NF_NAT_MANIP_SRC) - hooknum = NF_INET_LOCAL_IN; /* Source NAT */ - else - hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */ - - switch (ctinfo) { - case IP_CT_RELATED: - case IP_CT_RELATED_REPLY: - if (proto == htons(ETH_P_IP) && - ip_hdr(skb)->protocol == IPPROTO_ICMP) { - if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo, - hooknum)) - err = NF_DROP; - goto out; - } else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) { - __be16 frag_off; - u8 nexthdr = ipv6_hdr(skb)->nexthdr; - int hdrlen = ipv6_skip_exthdr(skb, - sizeof(struct ipv6hdr), - &nexthdr, &frag_off); - - if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) { - if (!nf_nat_icmpv6_reply_translation(skb, ct, - ctinfo, - hooknum, - hdrlen)) - err = NF_DROP; - goto out; - } - } - /* Non-ICMP, fall thru to initialize if needed. */ - fallthrough; - case IP_CT_NEW: - /* Seen it before? This can happen for loopback, retrans, - * or local packets. - */ - if (!nf_nat_initialized(ct, maniptype)) { - /* Initialize according to the NAT action. */ - err = (range && range->flags & NF_NAT_RANGE_MAP_IPS) - /* Action is set up to establish a new - * mapping. - */ - ? nf_nat_setup_info(ct, range, maniptype) - : nf_nat_alloc_null_binding(ct, hooknum); - if (err != NF_ACCEPT) - goto out; - } - break; - - case IP_CT_ESTABLISHED: - case IP_CT_ESTABLISHED_REPLY: - break; - - default: - err = NF_DROP; - goto out; - } - - err = nf_nat_packet(ct, ctinfo, hooknum, skb); -out: - if (err == NF_ACCEPT) { - if (maniptype == NF_NAT_MANIP_SRC) - tc_skb_cb(skb)->post_ct_snat = 1; - if (maniptype == NF_NAT_MANIP_DST) - tc_skb_cb(skb)->post_ct_dnat = 1; - } - return err; -} -#endif /* CONFIG_NF_NAT */ - static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask) { #if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) @@ -987,52 +903,22 @@ static int tcf_ct_act_nat(struct sk_buff *skb, bool commit) { #if IS_ENABLED(CONFIG_NF_NAT) - int err; - enum nf_nat_manip_type maniptype; + int err, action = 0; if (!(ct_action & TCA_CT_ACT_NAT)) return NF_ACCEPT; + if (ct_action & TCA_CT_ACT_NAT_SRC) + action |= BIT(NF_NAT_MANIP_SRC); + if (ct_action & TCA_CT_ACT_NAT_DST) + action |= BIT(NF_NAT_MANIP_DST); - /* Add NAT extension if not confirmed yet. */ - if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) - return NF_DROP; /* Can't NAT. */ - - if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) && - (ctinfo != IP_CT_RELATED || commit)) { - /* NAT an established or related connection like before. */ - if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) - /* This is the REPLY direction for a connection - * for which NAT was applied in the forward - * direction. Do the reverse NAT. - */ - maniptype = ct->status & IPS_SRC_NAT - ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC; - else - maniptype = ct->status & IPS_SRC_NAT - ? 
NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST; - } else if (ct_action & TCA_CT_ACT_NAT_SRC) { - maniptype = NF_NAT_MANIP_SRC; - } else if (ct_action & TCA_CT_ACT_NAT_DST) { - maniptype = NF_NAT_MANIP_DST; - } else { - return NF_ACCEPT; - } + err = nf_ct_nat(skb, ct, ctinfo, &action, range, commit); + + if (action & BIT(NF_NAT_MANIP_SRC)) + tc_skb_cb(skb)->post_ct_snat = 1; + if (action & BIT(NF_NAT_MANIP_DST)) + tc_skb_cb(skb)->post_ct_dnat = 1; - err = ct_nat_execute(skb, ct, ctinfo, range, maniptype); - if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) { - if (ct->status & IPS_SRC_NAT) { - if (maniptype == NF_NAT_MANIP_SRC) - maniptype = NF_NAT_MANIP_DST; - else - maniptype = NF_NAT_MANIP_SRC; - - err = ct_nat_execute(skb, ct, ctinfo, range, - maniptype); - } else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { - err = ct_nat_execute(skb, ct, ctinfo, NULL, - NF_NAT_MANIP_SRC); - } - } return err; #else return NF_ACCEPT; -- cgit v1.2.3 From 01d80532295cd359dc43e6bd71860d5515f84372 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Thu, 1 Dec 2022 02:46:55 +0900 Subject: net: devlink: add DEVLINK_INFO_VERSION_GENERIC_FW_BOOTLOADER As discussed in [1], abbreviating the bootloader to "bl" might not be well understood. Instead, a bootloader technically being a firmware, name it "fw.bootloader". Add a new macro to devlink.h to formalize this new info attribute name and update the documentation. [1] https://lore.kernel.org/netdev/20221128142723.2f826d20@kernel.org/ Suggested-by: Jakub Kicinski Signed-off-by: Vincent Mailhol Link: https://lore.kernel.org/all/20221130174658.29282-5-mailhol.vincent@wanadoo.fr Signed-off-by: Marc Kleine-Budde --- Documentation/networking/devlink/devlink-info.rst | 5 +++++ include/net/devlink.h | 2 ++ 2 files changed, 7 insertions(+) (limited to 'include/net') diff --git a/Documentation/networking/devlink/devlink-info.rst b/Documentation/networking/devlink/devlink-info.rst index 7572bf6de5c1..1242b0e6826b 100644 --- a/Documentation/networking/devlink/devlink-info.rst +++ b/Documentation/networking/devlink/devlink-info.rst @@ -198,6 +198,11 @@ fw.bundle_id Unique identifier of the entire firmware bundle. +fw.bootloader +------------- + +Version of the bootloader. + Future work =========== diff --git a/include/net/devlink.h b/include/net/devlink.h index 0f376a28b9c4..6a2e4f21779f 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -621,6 +621,8 @@ enum devlink_param_generic_id { #define DEVLINK_INFO_VERSION_GENERIC_FW_ROCE "fw.roce" /* Firmware bundle identifier */ #define DEVLINK_INFO_VERSION_GENERIC_FW_BUNDLE_ID "fw.bundle_id" +/* Bootloader */ +#define DEVLINK_INFO_VERSION_GENERIC_FW_BOOTLOADER "fw.bootloader" /** * struct devlink_flash_update_params - Flash Update parameters -- cgit v1.2.3 From 3fff88186f047627bb128d65155f42517f8e448f Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Fri, 9 Dec 2022 16:28:08 -0800 Subject: mptcp: remove MPTCP 'ifdef' in TCP SYN cookies To ease the maintenance, it is often recommended to avoid having #ifdef preprocessor conditions. Here the section related to CONFIG_MPTCP was quite short but the next commit needs to add more code around. It is then cleaner to move specific MPTCP code to functions located in net/mptcp directory. Now that mptcp_subflow_request_sock_ops structure can be static, it can also be marked as "read only after init". 
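As a rough, self-contained illustration of the pattern this commit applies (toy names only; CONFIG_FEATURE, struct request and feature_alloc() are stand-ins, not kernel symbols): the header provides the real declaration when the feature is compiled in and a static inline stub when it is not, so call sites compile cleanly without any #ifdef:

    #include <stdio.h>
    #include <stddef.h>

    struct request { int cookie; };

    #ifdef CONFIG_FEATURE
    struct request *feature_alloc(int cookie); /* real implementation lives elsewhere */
    #else
    /* Stub compiled when the feature is off: callers need no #ifdef. */
    static inline struct request *feature_alloc(int cookie)
    {
        (void)cookie;
        return NULL;
    }
    #endif

    int main(void)
    {
        struct request *req = feature_alloc(42);

        /* With CONFIG_FEATURE unset the stub returns NULL, mirroring how
         * cookie_tcp_reqsk_alloc() only takes the MPTCP allocation path
         * when sk_is_mptcp() is true.
         */
        printf("req = %p\n", (void *)req);
        return 0;
    }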
Suggested-by: Paolo Abeni Reviewed-by: Mat Martineau Cc: stable@vger.kernel.org Signed-off-by: Matthieu Baerts Signed-off-by: Mat Martineau Signed-off-by: Jakub Kicinski --- include/net/mptcp.h | 12 ++++++++++-- net/ipv4/syncookies.c | 7 +++---- net/mptcp/subflow.c | 12 +++++++++++- 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/mptcp.h b/include/net/mptcp.h index 412479ebf5ad..3c5c68618fcc 100644 --- a/include/net/mptcp.h +++ b/include/net/mptcp.h @@ -97,8 +97,6 @@ struct mptcp_out_options { }; #ifdef CONFIG_MPTCP -extern struct request_sock_ops mptcp_subflow_request_sock_ops; - void mptcp_init(void); static inline bool sk_is_mptcp(const struct sock *sk) @@ -188,6 +186,9 @@ void mptcp_seq_show(struct seq_file *seq); int mptcp_subflow_init_cookie_req(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); +struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener); __be32 mptcp_get_reset_option(const struct sk_buff *skb); @@ -274,6 +275,13 @@ static inline int mptcp_subflow_init_cookie_req(struct request_sock *req, return 0; /* TCP fallback */ } +static inline struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener) +{ + return NULL; +} + static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); } #endif /* CONFIG_MPTCP */ diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 942d2dfa1115..26fb97d1d4d9 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -288,12 +288,11 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct tcp_request_sock *treq; struct request_sock *req; -#ifdef CONFIG_MPTCP if (sk_is_mptcp(sk)) - ops = &mptcp_subflow_request_sock_ops; -#endif + req = mptcp_subflow_reqsk_alloc(ops, sk, false); + else + req = inet_reqsk_alloc(ops, sk, false); - req = inet_reqsk_alloc(ops, sk, false); if (!req) return NULL; diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index 2159b5f9988f..3f670f2d5c5c 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -529,7 +529,7 @@ static int subflow_v6_rebuild_header(struct sock *sk) } #endif -struct request_sock_ops mptcp_subflow_request_sock_ops; +static struct request_sock_ops mptcp_subflow_request_sock_ops __ro_after_init; static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init; static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb) @@ -582,6 +582,16 @@ drop: } #endif +struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops, + struct sock *sk_listener, + bool attach_listener) +{ + ops = &mptcp_subflow_request_sock_ops; + + return inet_reqsk_alloc(ops, sk_listener, attach_listener); +} +EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc); + /* validate hmac received in third ACK */ static bool subflow_hmac_valid(const struct request_sock *req, const struct mptcp_options_received *mp_opt) -- cgit v1.2.3 From c9209b269afd29c55b8be49916725ed469f8c5d9 Mon Sep 17 00:00:00 2001 From: Archie Pusaka Date: Thu, 6 Oct 2022 17:09:31 +0800 Subject: Bluetooth: btusb: Introduce generic USB reset On cmd_timeout with no reset_gpio, reset the USB port as a last resort. This patch changes the behavior of btusb_intel_cmd_timeout and btusb_rtl_cmd_timeout. 
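The resulting control flow, reduced to a hedged stand-alone sketch (struct dev and the function names below are simplified stand-ins for struct hci_dev, hdev->reset and btusb_reset(), not the driver's actual types):

    #include <stdio.h>

    struct dev {
        void (*vendor_reset)(struct dev *d); /* analogous to hdev->reset */
    };

    static void generic_usb_reset(struct dev *d)
    {
        (void)d;
        /* stands in for usb_autopm_get_interface() + usb_queue_reset_device() */
        printf("queueing USB port reset\n");
    }

    /* Mirrors btusb_reset(): prefer the vendor hook, else reset the port. */
    static void reset_after_timeout(struct dev *d)
    {
        if (d->vendor_reset) {
            d->vendor_reset(d);
            return;
        }
        generic_usb_reset(d);
    }

    int main(void)
    {
        struct dev d = { .vendor_reset = NULL };

        reset_after_timeout(&d); /* no vendor hook: generic reset is the last resort */
        return 0;
    }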
Signed-off-by: Archie Pusaka Reviewed-by: Abhishek Pandit-Subedi Reviewed-by: Ying Hsu Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 35 +++++++++++++++++++++++++---------- include/net/bluetooth/hci_core.h | 1 + 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'include/net') diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 3b4ae7905fd4..a9e43425c35f 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -696,6 +696,28 @@ struct btusb_data { unsigned cmd_timeout_cnt; }; +static void btusb_reset(struct hci_dev *hdev) +{ + struct btusb_data *data; + int err; + + if (hdev->reset) { + hdev->reset(hdev); + return; + } + + data = hci_get_drvdata(hdev); + /* This is not an unbalanced PM reference since the device will reset */ + err = usb_autopm_get_interface(data->intf); + if (err) { + bt_dev_err(hdev, "Failed usb_autopm_get_interface: %d", err); + return; + } + + bt_dev_err(hdev, "Resetting usb device."); + usb_queue_reset_device(data->intf); +} + static void btusb_intel_cmd_timeout(struct hci_dev *hdev) { struct btusb_data *data = hci_get_drvdata(hdev); @@ -705,7 +727,7 @@ static void btusb_intel_cmd_timeout(struct hci_dev *hdev) return; if (!reset_gpio) { - bt_dev_err(hdev, "No way to reset. Ignoring and continuing"); + btusb_reset(hdev); return; } @@ -736,7 +758,7 @@ static void btusb_rtl_cmd_timeout(struct hci_dev *hdev) return; if (!reset_gpio) { - bt_dev_err(hdev, "No gpio to reset Realtek device, ignoring"); + btusb_reset(hdev); return; } @@ -761,7 +783,6 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev) { struct btusb_data *data = hci_get_drvdata(hdev); struct gpio_desc *reset_gpio = data->reset_gpio; - int err; if (++data->cmd_timeout_cnt < 5) return; @@ -787,13 +808,7 @@ static void btusb_qca_cmd_timeout(struct hci_dev *hdev) return; } - bt_dev_err(hdev, "Multiple cmd timeouts seen. Resetting usb device."); - /* This is not an unbalanced PM reference since the device will reset */ - err = usb_autopm_get_interface(data->intf); - if (!err) - usb_queue_reset_device(data->intf); - else - bt_dev_err(hdev, "Failed usb_autopm_get_interface with %d", err); + btusb_reset(hdev); } static inline void btusb_free_frags(struct btusb_data *data) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index c54bc71254af..55a40f5606c3 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -659,6 +659,7 @@ struct hci_dev { int (*set_diag)(struct hci_dev *hdev, bool enable); int (*set_bdaddr)(struct hci_dev *hdev, const bdaddr_t *bdaddr); void (*cmd_timeout)(struct hci_dev *hdev); + void (*reset)(struct hci_dev *hdev); bool (*wakeup)(struct hci_dev *hdev); int (*set_quality_report)(struct hci_dev *hdev, bool enable); int (*get_data_path_id)(struct hci_dev *hdev, __u8 *data_path); -- cgit v1.2.3 From ad38e55e1c89384aecee1bb0425bf1bf21ec86fd Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Fri, 4 Nov 2022 22:13:00 +0100 Subject: Bluetooth: hci_event: Ignore reserved bits in LE Extended Adv Report Broadcom controllers present on Apple Silicon devices use the upper 8 bits of the event type in the LE Extended Advertising Report for the channel on which the frame has been received. These bits are reserved according to the Bluetooth spec anyway such that we can just drop them to ensure that the advertising results are parsed correctly. 
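Worked through on the trace excerpt that follows: the raw event type 0x2513 carries the channel in its upper byte (0x25 is decimal 37) while the defined event-type flags live in the low bits, so masking with the new LE_EXT_ADV_EVT_TYPE_MASK recovers the legacy properties:

    0x2513 & 0x007f = 0x0013  /* Connectable | Scannable | legacy PDUs */

which matches its Props: 0x0013 decode.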
The following excerpt from a btmon trace shows a report received on channel 37 by these controllers: > HCI Event: LE Meta Event (0x3e) plen 55 LE Extended Advertising Report (0x0d) Num reports: 1 Entry 0 Event type: 0x2513 Props: 0x0013 Connectable Scannable Use legacy advertising PDUs Data status: Complete Reserved (0x2500) Legacy PDU Type: Reserved (0x2513) Address type: Public (0x00) Address: XX:XX:XX:XX:XX:XX (Shenzhen Jingxun Software [...]) Primary PHY: LE 1M Secondary PHY: No packets SID: no ADI field (0xff) TX power: 127 dBm RSSI: -76 dBm (0xb4) Periodic advertising interval: 0.00 msec (0x0000) Direct address type: Public (0x00) Direct address: 00:00:00:00:00:00 (OUI 00-00-00) Data length: 0x1d [...] Flags: 0x18 Simultaneous LE and BR/EDR (Controller) Simultaneous LE and BR/EDR (Host) Company: Harman International Industries, Inc. (87) Data: [...] Service Data (UUID 0xfddf): Name (complete): JBL Flip 5 Signed-off-by: Sven Peter Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 1 + net/bluetooth/hci_event.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 684f1cd28730..a035ff6055da 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -2590,6 +2590,7 @@ struct hci_ev_le_conn_complete { #define LE_EXT_ADV_DIRECT_IND 0x0004 #define LE_EXT_ADV_SCAN_RSP 0x0008 #define LE_EXT_ADV_LEGACY_PDU 0x0010 +#define LE_EXT_ADV_EVT_TYPE_MASK 0x007f #define ADDR_LE_DEV_PUBLIC 0x00 #define ADDR_LE_DEV_RANDOM 0x01 diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index faca701bce2a..ade2628aae0d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -6494,7 +6494,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, info->length)) break; - evt_type = __le16_to_cpu(info->type); + evt_type = __le16_to_cpu(info->type) & LE_EXT_ADV_EVT_TYPE_MASK; legacy_evt_type = ext_evt_type_to_legacy(hdev, evt_type); if (legacy_evt_type != LE_ADV_INVALID) { process_adv_report(hdev, legacy_evt_type, &info->bdaddr, -- cgit v1.2.3 From 392fca352c7a95e2828d49e7500e26d0c87ca265 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Fri, 4 Nov 2022 22:13:01 +0100 Subject: Bluetooth: Add quirk to disable extended scanning Broadcom 4377 controllers found in Apple x86 Macs with the T2 chip claim to support extended scanning when querying supported states, < HCI Command: LE Read Supported St.. (0x08|0x001c) plen 0 > HCI Event: Command Complete (0x0e) plen 12 LE Read Supported States (0x08|0x001c) ncmd 1 Status: Success (0x00) States: 0x000003ffffffffff [...] LE Set Extended Scan Parameters (Octet 37 - Bit 5) LE Set Extended Scan Enable (Octet 37 - Bit 6) [...] , but then fail to actually implement the extended scanning: < HCI Command: LE Set Extended Sca.. 
(0x08|0x0041) plen 8 Own address type: Random (0x01) Filter policy: Accept all advertisement (0x00) PHYs: 0x01 Entry 0: LE 1M Type: Active (0x01) Interval: 11.250 msec (0x0012) Window: 11.250 msec (0x0012) > HCI Event: Command Complete (0x0e) plen 4 LE Set Extended Scan Parameters (0x08|0x0041) ncmd 1 Status: Unknown HCI Command (0x01) Signed-off-by: Sven Peter Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 10 ++++++++++ include/net/bluetooth/hci_core.h | 4 +++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index a035ff6055da..4cf6bc363e4f 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -274,6 +274,16 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_ENHANCED_SETUP_SYNC_CONN, + + /* + * When this quirk is set, the HCI_OP_LE_SET_EXT_SCAN_ENABLE command is + * disabled. This is required for some Broadcom controllers which + * erroneously claim to support extended scanning. + * + * This quirk can be set before hci_register_dev is called or + * during the hdev->setup vendor callback. + */ + HCI_QUIRK_BROKEN_EXT_SCAN, }; /* HCI device flags */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 55a40f5606c3..1113d74e1f9f 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1690,7 +1690,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn); /* Use ext scanning if set ext scan param and ext scan enable is supported */ #define use_ext_scan(dev) (((dev)->commands[37] & 0x20) && \ - ((dev)->commands[37] & 0x40)) + ((dev)->commands[37] & 0x40) && \ + !test_bit(HCI_QUIRK_BROKEN_EXT_SCAN, &(dev)->quirks)) + /* Use ext create connection if command is supported */ #define use_ext_conn(dev) ((dev)->commands[37] & 0x80) -- cgit v1.2.3 From ffcb0a445ec2d5753751437706aa0a7ea8351099 Mon Sep 17 00:00:00 2001 From: Sven Peter Date: Fri, 4 Nov 2022 22:13:02 +0100 Subject: Bluetooth: Add quirk to disable MWS Transport Configuration Broadcom 4378/4387 controllers found in Apple Silicon Macs claim to support getting MWS Transport Layer Configuration, < HCI Command: Read Local Supported... (0x04|0x0002) plen 0 > HCI Event: Command Complete (0x0e) plen 68 Read Local Supported Commands (0x04|0x0002) ncmd 1 Status: Success (0x00) [...] Get MWS Transport Layer Configuration (Octet 30 - Bit 3)] [...] , but then don't actually allow the required command: > HCI Event: Command Complete (0x0e) plen 15 Get MWS Transport Layer Configuration (0x05|0x000c) ncmd 1 Status: Command Disallowed (0x0c) Number of transports: 0 Baud rate list: 0 entries 00 00 00 00 00 00 00 00 00 00 Signed-off-by: Sven Peter Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci.h | 10 ++++++++++ include/net/bluetooth/hci_core.h | 3 +++ net/bluetooth/hci_sync.c | 2 +- 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/net') diff --git a/include/net/bluetooth/hci.h b/include/net/bluetooth/hci.h index 4cf6bc363e4f..8d773b042c85 100644 --- a/include/net/bluetooth/hci.h +++ b/include/net/bluetooth/hci.h @@ -284,6 +284,16 @@ enum { * during the hdev->setup vendor callback. */ HCI_QUIRK_BROKEN_EXT_SCAN, + + /* + * When this quirk is set, the HCI_OP_GET_MWS_TRANSPORT_CONFIG command is + * disabled. This is required for some Broadcom controllers which + * erroneously claim to support MWS Transport Layer Configuration. 
+ * + * This quirk can be set before hci_register_dev is called or + during the hdev->setup vendor callback. + */ + HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, }; /* HCI device flags */ diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 1113d74e1f9f..7254edfba4c9 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -1720,6 +1720,9 @@ void hci_conn_del_sysfs(struct hci_conn *conn); ((dev)->le_features[3] & HCI_LE_CIS_PERIPHERAL) #define bis_capable(dev) ((dev)->le_features[3] & HCI_LE_ISO_BROADCASTER) +#define mws_transport_config_capable(dev) (((dev)->commands[30] & 0x08) && \ + (!test_bit(HCI_QUIRK_BROKEN_MWS_TRANSPORT_CONFIG, &(dev)->quirks))) + /* ----- HCI protocols ----- */ #define HCI_PROTO_DEFER 0x01 diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 4debe205d4af..9e2d7e4b850c 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -4279,7 +4279,7 @@ static int hci_read_local_pairing_opts_sync(struct hci_dev *hdev) /* Get MWS transport configuration if the HCI command is supported */ static int hci_get_mws_transport_config_sync(struct hci_dev *hdev) { - if (!(hdev->commands[30] & 0x08)) + if (!mws_transport_config_capable(hdev)) return 0; return __hci_cmd_sync_status(hdev, HCI_OP_GET_MWS_TRANSPORT_CONFIG, -- cgit v1.2.3 From d7b061b80ee6f91aa0b89daa3069802d7ea4c57f Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Mon, 12 Dec 2022 11:24:26 +0800 Subject: net: tso: inline tso_count_descs() tso_count_descs() is a small function doing a simple calculation, and it is used in the fast path, so inline it to reduce call overhead. Signed-off-by: Yunsheng Lin Link: https://lore.kernel.org/r/20221212032426.16050-1-linyunsheng@huawei.com Signed-off-by: Jakub Kicinski --- include/net/tso.h | 8 +++++++- net/core/tso.c | 8 -------- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include/net') diff --git a/include/net/tso.h b/include/net/tso.h index 62c98a9c60f1..e7e157ae0526 100644 --- a/include/net/tso.h +++ b/include/net/tso.h @@ -2,6 +2,7 @@ #ifndef _TSO_H #define _TSO_H +#include #include #define TSO_HEADER_SIZE 256 @@ -16,7 +17,12 @@ struct tso_t { u32 tcp_seq; }; -int tso_count_descs(const struct sk_buff *skb); +/* Calculate the worst case buffer count */ +static inline int tso_count_descs(const struct sk_buff *skb) +{ + return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; +} + void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, int size, bool is_last); void tso_build_data(const struct sk_buff *skb, struct tso_t *tso, int size); diff --git a/net/core/tso.c b/net/core/tso.c index 4148f6d48953..e00796e3b146 100644 --- a/net/core/tso.c +++ b/net/core/tso.c @@ -5,14 +5,6 @@ #include #include -/* Calculate expected number of TX descriptors */ -int tso_count_descs(const struct sk_buff *skb) -{ - /* The Marvell Way */ - return skb_shinfo(skb)->gso_segs * 2 + skb_shinfo(skb)->nr_frags; -} -EXPORT_SYMBOL(tso_count_descs); - void tso_build_hdr(const struct sk_buff *skb, char *hdr, struct tso_t *tso, int size, bool is_last) { -- cgit v1.2.3 From 89300468e2b2ec216c7827ba04ac45c129794403 Mon Sep 17 00:00:00 2001 From: Coco Li Date: Sat, 10 Dec 2022 04:16:45 +0000 Subject: IPv6/GRO: generic helper to remove temporary HBH/jumbo header in driver IPv6/TCP and GRO stacks can build big TCP packets with an added temporary Hop By Hop header. If GSO is not involved, then the temporary header needs to be removed in the driver.
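The removal itself is a fixed-size shift of the headers in front of the Hop-by-Hop header; here is a toy, self-contained sketch of that move (the byte counts are invented for illustration; the real helper below operates on skb header offsets):

    #include <stdio.h>
    #include <string.h>

    /* Toy layout: [eth "EE"][ipv6 "III"][hbh "HH"][payload "PPP"] */
    int main(void)
    {
        char pkt[] = "EEIIIHHPPP";
        const int pre = 5; /* bytes in front of HBH: eth + ipv6 */
        const int hbh = 2; /* length of the temporary HBH header */

        /* Slide eth + ipv6 forward over the HBH header, then treat
         * pkt + hbh as the new packet start; the kernel helper does the
         * equivalent with memmove() plus __skb_pull() and adjusts the
         * mac/network header offsets.
         */
        memmove(pkt + hbh, pkt, pre);
        printf("%s\n", pkt + hbh); /* prints "EEIIIPPP" */
        return 0;
    }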
This patch provides a generic helper for drivers that need to modify their headers in place. Tested: Compiled and ran with ethtool -K eth1 tso off Could send Big TCP packets Signed-off-by: Coco Li Link: https://lore.kernel.org/r/20221210041646.3587757-1-lixiaoyan@google.com Signed-off-by: Jakub Kicinski --- include/net/ipv6.h | 33 +++++++++++++++++++++++++++++++++ net/ipv6/ip6_offload.c | 27 ++++----------------------- 2 files changed, 37 insertions(+), 23 deletions(-) (limited to 'include/net') diff --git a/include/net/ipv6.h b/include/net/ipv6.h index d383c895592a..03f3af02a9a6 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -500,6 +500,39 @@ static inline int ipv6_has_hopopt_jumbo(const struct sk_buff *skb) return jhdr->nexthdr; } +/* Return 0 if HBH header is successfully removed + * Or if HBH removal is unnecessary (packet is not big TCP) + * Return error to indicate dropping the packet + */ +static inline int ipv6_hopopt_jumbo_remove(struct sk_buff *skb) +{ + const int hophdr_len = sizeof(struct hop_jumbo_hdr); + int nexthdr = ipv6_has_hopopt_jumbo(skb); + struct ipv6hdr *h6; + + if (!nexthdr) + return 0; + + if (skb_cow_head(skb, 0)) + return -1; + + /* Remove the HBH header. + * Layout: [Ethernet header][IPv6 header][HBH][L4 Header] + */ + memmove(skb_mac_header(skb) + hophdr_len, skb_mac_header(skb), + skb_network_header(skb) - skb_mac_header(skb) + + sizeof(struct ipv6hdr)); + + __skb_pull(skb, hophdr_len); + skb->network_header += hophdr_len; + skb->mac_header += hophdr_len; + + h6 = ipv6_hdr(skb); + h6->nexthdr = nexthdr; + + return 0; +} + static inline bool ipv6_accept_ra(struct inet6_dev *idev) { /* If forwarding is enabled, RA are not accepted unless the special diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index 3ee345672849..00dc2e3b0184 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -77,7 +77,7 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, struct sk_buff *segs = ERR_PTR(-EINVAL); struct ipv6hdr *ipv6h; const struct net_offload *ops; - int proto, nexthdr; + int proto, err; struct frag_hdr *fptr; unsigned int payload_len; u8 *prevhdr; @@ -87,28 +87,9 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb, bool gso_partial; skb_reset_network_header(skb); - nexthdr = ipv6_has_hopopt_jumbo(skb); - if (nexthdr) { - const int hophdr_len = sizeof(struct hop_jumbo_hdr); - int err; - - err = skb_cow_head(skb, 0); - if (err < 0) - return ERR_PTR(err); - - /* remove the HBH header. - * Layout: [Ethernet header][IPv6 header][HBH][TCP header] - */ - memmove(skb_mac_header(skb) + hophdr_len, - skb_mac_header(skb), - ETH_HLEN + sizeof(struct ipv6hdr)); - skb->data += hophdr_len; - skb->len -= hophdr_len; - skb->network_header += hophdr_len; - skb->mac_header += hophdr_len; - ipv6h = (struct ipv6hdr *)skb->data; - ipv6h->nexthdr = nexthdr; - } + err = ipv6_hopopt_jumbo_remove(skb); + if (err) + return ERR_PTR(err); nhoff = skb_network_header(skb) - skb_mac_header(skb); if (unlikely(!pskb_may_pull(skb, sizeof(*ipv6h)))) goto out; -- cgit v1.2.3 From fb87bd47516d9a26b6d549231aa743b20fd4a569 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Fri, 16 Dec 2022 07:45:26 -0500 Subject: net: Introduce sk_use_task_frag in struct sock. Sockets that can be used while recursing into memory reclaim, like those used by network block devices and file systems, mustn't use current->task_frag: if the current process is already using it, then the inner memory reclaim call would corrupt the task_frag structure. 
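The corruption scenario is reentrancy within a single task; sketched as a hedged, illustrative call chain (not a literal backtrace):

    tcp_sendmsg_locked()                 /* fills current->task_frag */
      -> sk_page_frag() -> &current->task_frag
        -> page allocation enters direct reclaim
          -> writeback to a network block device / filesystem socket
            -> tcp_sendmsg_locked()      /* same task */
              -> sk_page_frag() -> &current->task_frag  /* reused mid-update */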
To avoid this, sk_page_frag() uses ->sk_allocation to detect sockets that mustn't use current->task_frag, assuming that those used during memory reclaim had their allocation constraints reflected in ->sk_allocation. This unfortunately doesn't cover all cases: in an attempt to remove all usage of GFP_NOFS and GFP_NOIO, sunrpc stopped setting these flags in ->sk_allocation, and used memalloc_nofs critical sections instead. This breaks the sk_page_frag() heuristic since the allocation constraints are now stored in current->flags, which sk_page_frag() can't read without risking triggering a cache miss and slowing down TCP's fast path. This patch creates a new field in struct sock, named sk_use_task_frag, which sockets with memory reclaim constraints can set to false if they can't safely use current->task_frag. In such cases, sk_page_frag() now always returns the socket's page_frag (->sk_frag). The first user is sunrpc, which needs to avoid using current->task_frag but can keep ->sk_allocation set to GFP_KERNEL otherwise. Eventually, it might be possible to simplify sk_page_frag() by only testing ->sk_use_task_frag and avoid relying on the ->sk_allocation heuristic entirely (assuming other sockets will set ->sk_use_task_frag according to their constraints in the future). The new ->sk_use_task_frag field is placed in a hole in struct sock and belongs to a cache line shared with ->sk_shutdown. Therefore it should be hot and shouldn't have negative performance impacts on TCP's fast path (sk_shutdown is tested just before the while() loop in tcp_sendmsg_locked()). Link: https://lore.kernel.org/netdev/b4d8cb09c913d3e34f853736f3f5628abfd7f4b6.1656699567.git.gnault@redhat.com/ Signed-off-by: Guillaume Nault Reviewed-by: Benjamin Coddington Signed-off-by: Jakub Kicinski --- include/net/sock.h | 11 +++++++++-- net/core/sock.c | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index ecea3dcc2217..fefe1f4abf19 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -318,6 +318,9 @@ struct sk_filter; * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING flags + * @sk_use_task_frag: allow sk_page_frag() to use current->task_frag. + * Sockets that can be used under memory reclaim should + * set this to false. * @sk_bind_phc: SO_TIMESTAMPING bind PHC index of PTP virtual clock * for timestamping * @sk_tskey: counter to disambiguate concurrent tstamp requests @@ -512,6 +515,7 @@ struct sock { u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; + bool sk_use_task_frag; struct socket *sk_socket; void *sk_user_data; @@ -2561,14 +2565,17 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk) * socket operations and end up recursing into sk_page_frag() * while it's already in use: explicitly avoid task page_frag * usage if the caller is potentially doing any of them. - * This assumes that page fault handlers use the GFP_NOFS flags. + * This assumes that page fault handlers use the GFP_NOFS flags or + * explicitly disable sk_use_task_frag. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. 
*/ static inline struct page_frag *sk_page_frag(struct sock *sk) { - if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) == + if (sk->sk_use_task_frag && + (sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | + __GFP_FS)) == (__GFP_DIRECT_RECLAIM | __GFP_FS)) return &current->task_frag; diff --git a/net/core/sock.c b/net/core/sock.c index d2587d8712db..f954d5893e79 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3390,6 +3390,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); sk->sk_state = TCP_CLOSE; + sk->sk_use_task_frag = true; sk_set_socket(sk, sock); sock_set_flag(sk, SOCK_ZAPPED); -- cgit v1.2.3 From 08f65892c5ee15806dce7259e06c384b8cd768d7 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 16 Dec 2022 07:45:28 -0500 Subject: net: simplify sk_page_frag Now that in-kernel socket users that may recurse during reclaim have been converted to sk_use_task_frag = false, we can have sk_page_frag() simply check that value. Signed-off-by: Benjamin Coddington Reviewed-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- include/net/sock.h | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'include/net') diff --git a/include/net/sock.h b/include/net/sock.h index fefe1f4abf19..dcd72e6285b2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -2564,19 +2564,14 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk) * Both direct reclaim and page faults can nest inside other * socket operations and end up recursing into sk_page_frag() * while it's already in use: explicitly avoid task page_frag - * usage if the caller is potentially doing any of them. - * This assumes that page fault handlers use the GFP_NOFS flags or - * explicitly disable sk_use_task_frag. + * when users disable sk_use_task_frag. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { - if (sk->sk_use_task_frag && - (sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | - __GFP_FS)) == - (__GFP_DIRECT_RECLAIM | __GFP_FS)) + if (sk->sk_use_task_frag) return &current->task_frag; return &sk->sk_frag; -- cgit v1.2.3
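Taken together, an in-kernel socket user that can run under memory reclaim now opts out once at socket-creation time. A minimal sketch of the intended usage (the actual sunrpc call site is not part of these hunks, so the placement is illustrative; sock_create_kern() and the field itself are real):

    struct socket *sock;
    int err;

    err = sock_create_kern(net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
    if (err < 0)
        return err;

    /* This socket may transmit from reclaim context:
     * never borrow current->task_frag for it.
     */
    sock->sk->sk_use_task_frag = false;

All other sockets keep the default set in sock_init_data() and continue to use the per-task page_frag on the TCP fast path.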